-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgpu_mmu.cu
97 lines (82 loc) · 2.97 KB
/
gpu_mmu.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#include </usr/local/cuda/include/cuda.h>
#include"stdio.h"
#include"k.h"
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <cublas_v2.h>
#include <curand.h>
#include <time.h>
#include <sys/time.h>
#define uS_PER_SEC 1000000
#define uS_PER_mS 1000
#define N 1000
#define M 1000
// Export the function we will load into kdb+
extern "C" K gpu_mmf(K A, K rA, K cA, K B, K rB, K cB);
// Multiply the arrays A and B on GPU and save the result in C
// C(m,n) = A(m,k) * B(k,n)
// m= nr_rows_A
// k= nr_cols_A
// n= nr_cols_B
void gpu_blas_mmul(const double *A, const double *B, double *C, const int m, const int k, const int n) {
int lda=k,ldb=n,ldc=m;
timeval t1, t2;
const double alf = 1;
const double bet = 0;
const double *alpha = &alf;
const double *beta = &bet;
// Create a handle for CUBLAS
cublasHandle_t handle;
cublasCreate(&handle);
// start timer
gettimeofday(&t1, NULL);
// Do the actual multiplication
// CUBLAS_OP_T means input is row major, CUBLAS_OP_N means input is column major
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
gettimeofday(&t2, NULL);
float et2 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
printf("GPU time = %fms\n", et2);
// Destroy the handle
cublasDestroy(handle);
}
K gpu_mmf(K A, K rA, K cA, K B, K rB, K cB) {
// Allocate 3 arrays on CPU
int nr_rows_A = rA->n;
int nr_cols_A = cA->n;
int nr_rows_B = rB->n;
int nr_cols_B = cB->n;
int nr_rows_C = nr_rows_A;
int nr_cols_C = nr_cols_B;
// allocate memory, host arrays
K C = ktn(KF,(nr_rows_C*nr_cols_C));
double *h_A = (double *)malloc(nr_rows_A * nr_cols_A * sizeof(double));
double *h_B = (double *)malloc(nr_rows_B * nr_cols_B * sizeof(double));
double *h_C = (double *)malloc(nr_rows_C * nr_cols_C * sizeof(double));
// Allocate 3 arrays on GPU, device arrays
double *d_A, *d_B, *d_C;
double *host_memoryA = (double*) &(kF(A)[0]);
double *host_memoryB = (double*) &(kF(B)[0]);
double *host_memoryC = (double*) &(kF(C)[0]);
size_t sizeA = nr_rows_A * nr_cols_A * sizeof(double);
size_t sizeB = nr_rows_B * nr_cols_B * sizeof(double);
cudaMalloc((void **)&d_A, sizeA);
cudaMalloc((void **)&d_B, sizeB);
cudaMalloc(&d_C,nr_rows_C * nr_cols_C * sizeof(double));
// If you already have useful values in A and B you can copy them in GPU:
cudaMemcpy(d_A, host_memoryA, sizeA, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, host_memoryB, sizeB, cudaMemcpyHostToDevice);
// Multiply A and B on GPU
gpu_blas_mmul(d_A, d_B, d_C, nr_rows_A, nr_cols_A, nr_cols_B);
// Copy the result on host memory
cudaMemcpy(host_memoryC,d_C,nr_rows_C * nr_cols_C * sizeof(double),cudaMemcpyDeviceToHost);
//Free GPU memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// Free CPU memory
free(h_A);
free(h_B);
free(h_C);
R(C);
}