-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparallel_cuda_c.py
149 lines (126 loc) · 4.98 KB
/
parallel_cuda_c.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""
CUDA PARALLEL PROGRAMMING: parallel_cuda_c.py
* Purpose: Python interface for performing matrix operations using CUDA C/C++
* @author Prabhsimran Singh
* @version 2.2 15/10/18
* Build shared object library using:
nvcc -Xcompiler -fPIC -shared -o lib/cuda_mat_ops.so ops/matrix_ops.cu
"""
import ctypes
import numpy as np
from ctypes import POINTER, c_double, c_int
# Load the compiled CUDA shared object lib/cuda_mat_ops.so (see the nvcc build
# command in the module docstring). The path is relative to the current working
# directory, so the script has to be started from the directory containing lib/.
# The library is opened with the RTLD_GLOBAL mode flag.
dll = ctypes.CDLL('./lib/cuda_mat_ops.so', mode=ctypes.RTLD_GLOBAL)
# get the required functions exposed by CUDA C/C++ API
def get_cuda_device_info(dll):
    """Return the ``cuda_device_info`` entry point of ``dll``.

    No ``argtypes`` are declared for it; the attribute is handed back exactly
    as the library object exposes it.

    Args:
        dll: Loaded library object exposing a ``cuda_device_info`` attribute.

    Returns:
        The ``cuda_device_info`` attribute of ``dll``.
    """
    return dll.cuda_device_info
def get_cuda_matmul(dll):
    """Return the ``cuda_matmul`` entry point of ``dll`` with its signature set.

    The declared argument types are three ``double*`` buffers followed by
    three ``int`` values.

    Args:
        dll: Loaded library object exposing a ``cuda_matmul`` attribute.

    Returns:
        The same ``cuda_matmul`` object, after ``argtypes`` has been assigned.
    """
    double_ptr = POINTER(c_double)
    matmul = dll.cuda_matmul
    matmul.argtypes = [double_ptr, double_ptr, double_ptr, c_int, c_int, c_int]
    return matmul
def get_cuda_matsum(dll):
    """Return the ``cuda_matsum`` entry point of ``dll`` with its signature set.

    The declared argument types are three ``double*`` buffers followed by
    two ``int`` values.

    Args:
        dll: Loaded library object exposing a ``cuda_matsum`` attribute.

    Returns:
        The same ``cuda_matsum`` object, after ``argtypes`` has been assigned.
    """
    double_ptr = POINTER(c_double)
    matsum = dll.cuda_matsum
    matsum.argtypes = [double_ptr, double_ptr, double_ptr, c_int, c_int]
    return matsum
def get_cuda_matprod(dll):
    """Return the ``cuda_matprod`` entry point of ``dll`` with its signature set.

    The declared argument types are three ``double*`` buffers followed by
    two ``int`` values.

    Args:
        dll: Loaded library object exposing a ``cuda_matprod`` attribute.

    Returns:
        The same ``cuda_matprod`` object, after ``argtypes`` has been assigned.
    """
    double_ptr = POINTER(c_double)
    matprod = dll.cuda_matprod
    matprod.argtypes = [double_ptr, double_ptr, double_ptr, c_int, c_int]
    return matprod
def get_cuda_sum(dll):
    """Return the ``cuda_sum`` entry point of ``dll`` with its signature set.

    The declared argument types are a ``double*`` buffer, a ``double``
    scalar, a second ``double*`` buffer and two ``int`` values.

    Args:
        dll: Loaded library object exposing a ``cuda_sum`` attribute.

    Returns:
        The same ``cuda_sum`` object, after ``argtypes`` has been assigned.
    """
    double_ptr = POINTER(c_double)
    scalar_sum = dll.cuda_sum
    scalar_sum.argtypes = [double_ptr, c_double, double_ptr, c_int, c_int]
    return scalar_sum
def get_cuda_prod(dll):
    """Return the ``cuda_prod`` entry point of ``dll`` with its signature set.

    The declared argument types are a ``double*`` buffer, a ``double``
    scalar, a second ``double*`` buffer and two ``int`` values.

    Args:
        dll: Loaded library object exposing a ``cuda_prod`` attribute.

    Returns:
        The same ``cuda_prod`` object, after ``argtypes`` has been assigned.
    """
    double_ptr = POINTER(c_double)
    scalar_prod = dll.cuda_prod
    scalar_prod.argtypes = [double_ptr, c_double, double_ptr, c_int, c_int]
    return scalar_prod
def get_cuda_maximum(dll):
    """Return the ``cuda_maximum`` entry point of ``dll`` with its signature set.

    The declared argument types are a ``double*`` buffer, a ``double``
    scalar, a second ``double*`` buffer and two ``int`` values.

    Args:
        dll: Loaded library object exposing a ``cuda_maximum`` attribute.

    Returns:
        The same ``cuda_maximum`` object, after ``argtypes`` has been assigned.
    """
    double_ptr = POINTER(c_double)
    scalar_max = dll.cuda_maximum
    scalar_max.argtypes = [double_ptr, c_double, double_ptr, c_int, c_int]
    return scalar_max
# Resolve every native entry point once, at import time, and keep the configured
# ctypes function objects in module-level names. The double-underscore prefix is
# not name-mangled here (mangling only applies inside class bodies), so the
# wrapper functions below refer to these names directly.
__cuda_device_info = get_cuda_device_info(dll)
__cuda_matmul = get_cuda_matmul(dll)
__cuda_matsum = get_cuda_matsum(dll)
__cuda_matprod = get_cuda_matprod(dll)
__cuda_sum = get_cuda_sum(dll)
__cuda_prod = get_cuda_prod(dll)
__cuda_maximum = get_cuda_maximum(dll)
# convenient python wrappers for cuda functions
def cuda_device_info():
    """Call the native ``cuda_device_info`` routine with no arguments.

    Whatever the native call returns is discarded, so this wrapper always
    returns ``None``.
    NOTE(review): presumably the routine reports the properties of the
    available CUDA device(s) -- confirm against ops/matrix_ops.cu.
    """
    __cuda_device_info()
def cuda_matmul(a, b, c, m, n, k):
    """Run the native matrix multiplication, writing the result into ``c``.

    The module's self-test calls this with ``a`` holding an ``m x n`` matrix,
    ``b`` an ``n x k`` matrix and ``c`` an ``m x k`` result, each flattened in
    row-major order.

    Args:
        a: Left operand. Converted to a C-contiguous float64 array when it is
            not one already (which may copy).
        b: Right operand, converted the same way.
        c: Output ``numpy.ndarray``, filled in place by the native routine.
        m: Row count of ``a`` and ``c``.
        n: Column count of ``a`` / row count of ``b``.
        k: Column count of ``b`` and ``c``.

    Raises:
        TypeError: If ``c`` is not a writeable, C-contiguous float64 array.
        ValueError: If ``a``, ``b`` or ``c`` holds fewer elements than the
            given dimensions require.
    """
    # The native side only sees a raw double*; any other dtype or a strided
    # view would be misread, so inputs are normalised first.
    a = np.ascontiguousarray(a, dtype=np.float64)
    b = np.ascontiguousarray(b, dtype=np.float64)
    # The output is written in place and therefore cannot be converted via a
    # copy -- it has to be directly usable as a double buffer.
    if c.dtype != np.float64 or not c.flags.c_contiguous or not c.flags.writeable:
        raise TypeError("c must be a writeable, C-contiguous float64 array")
    # Undersized buffers would let the native code run past the allocation.
    if a.size < m * n or b.size < n * k or c.size < m * k:
        raise ValueError(
            "a, b and c must hold at least m*n, n*k and m*k elements respectively"
        )
    double_ptr = POINTER(c_double)
    __cuda_matmul(
        a.ctypes.data_as(double_ptr),
        b.ctypes.data_as(double_ptr),
        c.ctypes.data_as(double_ptr),
        m, n, k,
    )
def cuda_matsum(a, b, c, m, n):
    """Run the native matrix-sum routine, writing the result into ``c``.

    The module's self-test checks the output against ``a + b`` using flat
    arrays of ``m * n`` elements.

    Args:
        a: First operand. Converted to a C-contiguous float64 array when it
            is not one already (which may copy).
        b: Second operand, converted the same way.
        c: Output ``numpy.ndarray``, filled in place by the native routine.
        m: First dimension forwarded to the native routine.
        n: Second dimension forwarded to the native routine.

    Raises:
        TypeError: If ``c`` is not a writeable, C-contiguous float64 array.
        ValueError: If ``a``, ``b`` or ``c`` holds fewer than ``m * n``
            elements.
    """
    # The native side only sees a raw double*; any other dtype or a strided
    # view would be misread, so inputs are normalised first.
    a = np.ascontiguousarray(a, dtype=np.float64)
    b = np.ascontiguousarray(b, dtype=np.float64)
    # The output is written in place and therefore cannot be converted via a
    # copy -- it has to be directly usable as a double buffer.
    if c.dtype != np.float64 or not c.flags.c_contiguous or not c.flags.writeable:
        raise TypeError("c must be a writeable, C-contiguous float64 array")
    # Undersized buffers would let the native code run past the allocation.
    if min(a.size, b.size, c.size) < m * n:
        raise ValueError("a, b and c must each hold at least m*n elements")
    double_ptr = POINTER(c_double)
    __cuda_matsum(
        a.ctypes.data_as(double_ptr),
        b.ctypes.data_as(double_ptr),
        c.ctypes.data_as(double_ptr),
        m, n,
    )
def cuda_matprod(a, b, c, m, n):
    """Run the native element-wise matrix product, writing into ``c``.

    The module's self-test checks the output against ``a * b`` using flat
    arrays of ``m * n`` elements.

    Args:
        a: First operand. Converted to a C-contiguous float64 array when it
            is not one already (which may copy).
        b: Second operand, converted the same way.
        c: Output ``numpy.ndarray``, filled in place by the native routine.
        m: First dimension forwarded to the native routine.
        n: Second dimension forwarded to the native routine.

    Raises:
        TypeError: If ``c`` is not a writeable, C-contiguous float64 array.
        ValueError: If ``a``, ``b`` or ``c`` holds fewer than ``m * n``
            elements.
    """
    # The native side only sees a raw double*; any other dtype or a strided
    # view would be misread, so inputs are normalised first.
    a = np.ascontiguousarray(a, dtype=np.float64)
    b = np.ascontiguousarray(b, dtype=np.float64)
    # The output is written in place and therefore cannot be converted via a
    # copy -- it has to be directly usable as a double buffer.
    if c.dtype != np.float64 or not c.flags.c_contiguous or not c.flags.writeable:
        raise TypeError("c must be a writeable, C-contiguous float64 array")
    # Undersized buffers would let the native code run past the allocation.
    if min(a.size, b.size, c.size) < m * n:
        raise ValueError("a, b and c must each hold at least m*n elements")
    double_ptr = POINTER(c_double)
    __cuda_matprod(
        a.ctypes.data_as(double_ptr),
        b.ctypes.data_as(double_ptr),
        c.ctypes.data_as(double_ptr),
        m, n,
    )
def cuda_sum(a, b, c, m, n):
    """Run the native array-plus-scalar routine, writing the result into ``c``.

    The module's self-test checks the output against ``a + b`` using a flat
    array of ``m * n`` elements and a Python number for ``b``.

    Args:
        a: Array operand. Converted to a C-contiguous float64 array when it
            is not one already (which may copy).
        b: Scalar operand, passed to the native routine as a C ``double``.
        c: Output ``numpy.ndarray``, filled in place by the native routine.
        m: First dimension forwarded to the native routine.
        n: Second dimension forwarded to the native routine.

    Raises:
        TypeError: If ``c`` is not a writeable, C-contiguous float64 array.
        ValueError: If ``a`` or ``c`` holds fewer than ``m * n`` elements.
    """
    # The native side only sees a raw double*; any other dtype or a strided
    # view would be misread, so the input is normalised first.
    a = np.ascontiguousarray(a, dtype=np.float64)
    # The output is written in place and therefore cannot be converted via a
    # copy -- it has to be directly usable as a double buffer.
    if c.dtype != np.float64 or not c.flags.c_contiguous or not c.flags.writeable:
        raise TypeError("c must be a writeable, C-contiguous float64 array")
    # Undersized buffers would let the native code run past the allocation.
    if min(a.size, c.size) < m * n:
        raise ValueError("a and c must each hold at least m*n elements")
    double_ptr = POINTER(c_double)
    __cuda_sum(
        a.ctypes.data_as(double_ptr),
        ctypes.c_double(b),
        c.ctypes.data_as(double_ptr),
        m, n,
    )
def cuda_prod(a, b, c, m, n):
    """Run the native array-times-scalar routine, writing the result into ``c``.

    The module's self-test checks the output against ``a * b`` using a flat
    array of ``m * n`` elements and a Python number for ``b``.

    Args:
        a: Array operand. Converted to a C-contiguous float64 array when it
            is not one already (which may copy).
        b: Scalar operand, passed to the native routine as a C ``double``.
        c: Output ``numpy.ndarray``, filled in place by the native routine.
        m: First dimension forwarded to the native routine.
        n: Second dimension forwarded to the native routine.

    Raises:
        TypeError: If ``c`` is not a writeable, C-contiguous float64 array.
        ValueError: If ``a`` or ``c`` holds fewer than ``m * n`` elements.
    """
    # The native side only sees a raw double*; any other dtype or a strided
    # view would be misread, so the input is normalised first.
    a = np.ascontiguousarray(a, dtype=np.float64)
    # The output is written in place and therefore cannot be converted via a
    # copy -- it has to be directly usable as a double buffer.
    if c.dtype != np.float64 or not c.flags.c_contiguous or not c.flags.writeable:
        raise TypeError("c must be a writeable, C-contiguous float64 array")
    # Undersized buffers would let the native code run past the allocation.
    if min(a.size, c.size) < m * n:
        raise ValueError("a and c must each hold at least m*n elements")
    double_ptr = POINTER(c_double)
    __cuda_prod(
        a.ctypes.data_as(double_ptr),
        ctypes.c_double(b),
        c.ctypes.data_as(double_ptr),
        m, n,
    )
def cuda_maximum(a, b, c, m, n):
    """Run the native array-vs-scalar maximum routine, writing into ``c``.

    The module's self-test checks the output against ``np.maximum(b, a)``
    using a flat array of ``m * n`` elements and a Python number for ``b``.

    Args:
        a: Array operand. Converted to a C-contiguous float64 array when it
            is not one already (which may copy).
        b: Scalar operand, passed to the native routine as a C ``double``.
        c: Output ``numpy.ndarray``, filled in place by the native routine.
        m: First dimension forwarded to the native routine.
        n: Second dimension forwarded to the native routine.

    Raises:
        TypeError: If ``c`` is not a writeable, C-contiguous float64 array.
        ValueError: If ``a`` or ``c`` holds fewer than ``m * n`` elements.
    """
    # The native side only sees a raw double*; any other dtype or a strided
    # view would be misread, so the input is normalised first.
    a = np.ascontiguousarray(a, dtype=np.float64)
    # The output is written in place and therefore cannot be converted via a
    # copy -- it has to be directly usable as a double buffer.
    if c.dtype != np.float64 or not c.flags.c_contiguous or not c.flags.writeable:
        raise TypeError("c must be a writeable, C-contiguous float64 array")
    # Undersized buffers would let the native code run past the allocation.
    if min(a.size, c.size) < m * n:
        raise ValueError("a and c must each hold at least m*n elements")
    double_ptr = POINTER(c_double)
    __cuda_maximum(
        a.ctypes.data_as(double_ptr),
        ctypes.c_double(b),
        c.ctypes.data_as(double_ptr),
        m, n,
    )
def get_test_params():
    """Build the fixtures used by the basic self-checks.

    Returns:
        tuple: ``(a, b, c, size)`` where ``size`` is 16, ``a`` and ``b`` are
        two separate flat float64 arrays of ``size * size`` elements all
        equal to 3.0, and ``c`` is a zero-filled float64 array of the same
        length that serves as the output buffer.
    """
    size = 16
    count = size * size
    a = np.full(count, 3.0)
    b = np.full(count, 3.0)
    c = np.zeros(count)
    return a, b, c, size
if __name__ == '__main__':
    # Self-test: needs the compiled shared object and a usable CUDA device.
    cuda_device_info()

    # basic checks for all ops: constant inputs of 3.0 (16 x 16), with one
    # output buffer that every op overwrites in turn
    a, b, c, size = get_test_params()
    cuda_matmul(a, b, c, size, size, size)
    assert np.all(c == 144.0), "Matrix dot-product operation is buggy"
    basic_cases = (
        (cuda_matsum, b, 6.0, "Matrix sum operation is buggy"),
        (cuda_matprod, b, 9.0, "Matrix product operation is buggy"),
        (cuda_sum, 5.0, 8.0, "Element-wise sum operation is buggy"),
        (cuda_prod, 2.5, 7.5, "Element-wise product operation is buggy"),
        (cuda_maximum, 4.0, 4.0, "Element-wise max operation is buggy"),
    )
    for op, operand, expected, message in basic_cases:
        op(a, operand, c, size, size)
        assert np.all(c == expected), message

    # robust check for matmul: random non-square operands compared to np.dot
    rows, inner, cols = 205, 510, 340
    a = np.random.randn(rows, inner)
    b = np.random.randn(inner, cols)
    c = np.zeros(rows * cols)
    cuda_matmul(a.flatten(), b.flatten(), c, rows, inner, cols)
    actual_dot = np.dot(a, b)
    c = c.reshape(rows, cols)
    assert np.allclose(actual_dot, c), "Matrix dot-product operation is buggy"

    # robust checks for other ops: random data compared to the NumPy result;
    # each reference is evaluated only after the op has run
    rows, cols = 100, 200
    a = np.random.randn(rows * cols)
    b = np.random.randn(rows * cols)
    c = np.zeros_like(a)
    robust_cases = (
        (cuda_matsum, b, lambda: a + b, "Matrix sum operation is buggy"),
        (cuda_matprod, b, lambda: a * b, "Matrix product operation is buggy"),
        (cuda_sum, 5.3, lambda: a + 5.3, "Element-wise sum operation is buggy"),
        (cuda_prod, 6, lambda: a * 6, "Element-wise product operation is buggy"),
        (cuda_maximum, 0, lambda: np.maximum(0, a), "Element-wise max operation is buggy"),
    )
    for op, operand, reference, message in robust_cases:
        op(a, operand, c, rows, cols)
        assert np.all(reference() == c), message

    print('Passed all tests!')