# cuda_init.py
import os

import numpy as np
from numba import cuda

from library import (num_of_orders, num_of_terms, nmToterm, map_x, map_y,
                     polynomial_dir, blockSize)
# kernel executed on the GPU: each thread sums the coefficient-weighted
# polynomial terms for its own subset of pixels via a grid-stride loop
@cuda.jit  # an explicit signature would be void(int32, float32[:], float32[:], float32[:, :])
def cuda_add(num_of_terms, coefficients, d_polynomial, d_result):
    # global index of this thread across the whole grid
    index = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    # total number of threads working in parallel
    stride = cuda.blockDim.x * cuda.gridDim.x
    for i in range(index, map_x * map_y, stride):
        # pixel coordinates; this decomposition assumes a square map (map_x == map_y)
        m = i % map_x
        n = i // map_x
        for j in range(num_of_terms):
            d_result[n, m] += coefficients[j] * d_polynomial[j * map_x * map_y + i]
        # shift by 0.15 so that zero phase is not at the center, which avoids
        # a white-to-black transition in the middle of the map
        d_result[n, m] += 0.15
        # wrapping the phase into [0, 1)
        d_result[n, m] %= 1
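# A minimal NumPy reference of the same accumulation, handy for sanity-checking
# the kernel output on small maps. This helper is a sketch, not part of the
# original module: it assumes the flattened term layout built below and a
# square map (map_x == map_y), matching the kernel's indexing.
def cpu_add_reference(coefficients, flat_polynomial):
    result = np.zeros((map_x, map_y), np.float32)
    for j in range(num_of_terms):
        term = flat_polynomial[j * map_x * map_y:(j + 1) * map_x * map_y]
        result += coefficients[j] * term.reshape(map_x, map_y)
    return (result + 0.15) % 1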
# host-side buffer for the stacked polynomial terms
h_polynomial = np.zeros((num_of_terms, map_x, map_y), np.float32)
# host-side buffer for the addition result
h_result = np.zeros((map_x, map_y), np.float32)
# loading the precomputed polynomial terms from disk
for n in range(1, num_of_orders + 1):
    for m in range(-n, n + 1, 2):  # m = -n, -n+2, ..., n
        term = nmToterm(n, m) - 1
        filename = os.path.join(polynomial_dir, 'n={}_m={}.npy'.format(n, m))
        h_polynomial[term] = np.load(filename)
# flattening so the kernel can index any term with a single linear offset
h_polynomial = h_polynomial.ravel()
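# For example, with num_of_orders >= 2 the loop above loads one 2-D array per
# (n, m) term from polynomial_dir:
#   n=1_m=-1.npy, n=1_m=1.npy, n=2_m=-2.npy, n=2_m=0.npy, n=2_m=2.npy, ...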
# copying the polynomial terms from host to device
d_polynomial = cuda.to_device(h_polynomial)
# freeing the host copy; the terms now live in device memory only
del h_polynomial
# total number of blocks needed to cover every pixel of one addition
numBlocks = (map_x * map_y + blockSize - 1) // blockSize
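# As a worked example with hypothetical sizes: a 512 x 512 map and
# blockSize = 256 give numBlocks = (262144 + 255) // 256 = 1024, i.e. exactly
# one thread per pixel on the first pass of the grid-stride loop.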
def add(coefficients):
    """Sum the polynomial terms weighted by coefficients on the GPU and return
    the wrapped phase map as a host array of shape (map_x, map_y)."""
    global h_result
    # the kernel accumulates, so start from a freshly zeroed device buffer
    d_result = cuda.to_device(np.zeros((map_x, map_y), dtype=np.float32))
    # ensure float32 so Numba copies the coefficients to the device as expected
    coefficients = np.asarray(coefficients, dtype=np.float32)
    cuda_add[numBlocks, blockSize](num_of_terms, coefficients, d_polynomial, d_result)
    # copy_to_host blocks until the kernel has finished
    h_result = d_result.copy_to_host()
    return h_result
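# Illustrative usage sketch: run one addition with placeholder random
# coefficients (requires a CUDA-capable GPU; the values are not physically
# meaningful, and num_of_terms comes from library, as above).
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    coeffs = rng.uniform(-1.0, 1.0, num_of_terms).astype(np.float32)
    phase_map = add(coeffs)
    print('phase map shape:', phase_map.shape)
    print('phase range: [{:.3f}, {:.3f}]'.format(phase_map.min(), phase_map.max()))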