-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMatrixMultiply.cu
94 lines (77 loc) · 2.95 KB
/
MatrixMultiply.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
// This program calculates matrix multiplication (SGEMM) using cuBLAS
// By: Nick from CoffeeBeforeArch
#include <cublas_v2.h>
#include <curand.h>
#include <cassert>
#include <cmath>
#include <ctime>
#include <iostream>
#include <vector>
#include <stdlib.h>
#include "./MatrixMultiply.h"
// Computes result = featureM * featureN^T in single precision with cuBLAS.
//
// All three host buffers are read/written as dense row-major arrays:
//   featureM : count_m x size    (input)
//   featureN : count_n x size    (input)
//   result   : count_m x count_n (output)
// so that result[i * count_n + j] = sum_k featureM[i * size + k] * featureN[j * size + k].
//
// gpu_id is passed to cudaSetDevice(); the calling thread's current device is
// left set to gpu_id afterwards. Device buffers and the cuBLAS handle are
// created and released on every call.
//
// Degenerate inputs are handled on the host without touching the GPU:
//   - any negative dimension                         -> returns false
//   - count_m == 0 or count_n == 0 (empty result)    -> returns true
//   - size == 0 (empty inner dimension)              -> result zero-filled, returns true
//   - a required pointer is null                     -> returns false
//
// Returns true only if every CUDA and cuBLAS call succeeded and result holds
// the product; returns false (after printing a diagnostic) otherwise.
bool MatrixMultiply(float * featureM, float * featureN, float * result,
                    int count_m, int count_n, int size, int gpu_id) {
    // Every local is declared before the first `goto out` so that no jump
    // bypasses an initialization.
    float *dev_featureM = 0;
    float *dev_featureN = 0;
    float *dev_result = 0;
    const float alpha = 1.0f, beta = 0.0f;
    cublasHandle_t handle = 0;
    bool handleCreated = false;  // cublasDestroy() must only see a created handle
    bool ok = false;
    cudaError_t cudaStatus = cudaSuccess;
    cublasStatus_t blasStatus = CUBLAS_STATUS_SUCCESS;
    size_t bytesM = 0;
    size_t bytesN = 0;
    size_t bytesResult = 0;

    if (count_m < 0 || count_n < 0 || size < 0) {
        printf("%s, line %d, negative matrix dimension!\n", __func__, __LINE__);
        return false;
    }
    if (count_m == 0 || count_n == 0) {
        return true;  // empty result matrix: nothing to compute or write
    }
    if (!result) {
        printf("%s, line %d, result pointer is null!\n", __func__, __LINE__);
        return false;
    }
    if (size == 0) {
        // Empty inner dimension: every dot product is an empty sum. cuBLAS
        // would reject lda == 0, so produce the zero matrix on the host.
        const size_t total = static_cast<size_t>(count_m) * static_cast<size_t>(count_n);
        for (size_t i = 0; i < total; ++i) {
            result[i] = 0.0f;
        }
        return true;
    }
    if (!featureM || !featureN) {
        printf("%s, line %d, input pointer is null!\n", __func__, __LINE__);
        return false;
    }

    // Widen to size_t before multiplying; int * int could overflow.
    bytesM = static_cast<size_t>(count_m) * static_cast<size_t>(size) * sizeof(float);
    bytesN = static_cast<size_t>(count_n) * static_cast<size_t>(size) * sizeof(float);
    bytesResult = static_cast<size_t>(count_m) * static_cast<size_t>(count_n) * sizeof(float);

    cudaStatus = cudaSetDevice(gpu_id);
    if (cudaStatus != cudaSuccess) {
        printf("cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
        goto out;
    }

    blasStatus = cublasCreate(&handle);
    if (blasStatus != CUBLAS_STATUS_SUCCESS) {
        printf("%s, line %d, cublasCreate failed (status %d)!\n",
               __func__, __LINE__, static_cast<int>(blasStatus));
        goto out;
    }
    handleCreated = true;

    cudaStatus = cudaMalloc((void**)&dev_featureM, bytesM);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMalloc failed: %s\n", __func__, __LINE__,
               cudaGetErrorString(cudaStatus));
        goto out;
    }
    cudaStatus = cudaMalloc((void**)&dev_featureN, bytesN);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMalloc failed: %s\n", __func__, __LINE__,
               cudaGetErrorString(cudaStatus));
        goto out;
    }
    cudaStatus = cudaMalloc((void**)&dev_result, bytesResult);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMalloc failed: %s\n", __func__, __LINE__,
               cudaGetErrorString(cudaStatus));
        goto out;
    }

    cudaStatus = cudaMemcpy(dev_featureM, featureM, bytesM, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMemcpy failed: %s\n", __func__, __LINE__,
               cudaGetErrorString(cudaStatus));
        goto out;
    }
    cudaStatus = cudaMemcpy(dev_featureN, featureN, bytesN, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMemcpy failed: %s\n", __func__, __LINE__,
               cudaGetErrorString(cudaStatus));
        goto out;
    }

    /*
      cuBLAS treats every matrix as column-major:
        C(m,n) = alpha * op(A)(m,k) * op(B)(k,n) + beta * C(m,n)
        cublasSgemm(handle, transa, transb, m, n, k,
                    &alpha, A, lda, B, ldb, &beta, C, ldc);

      A row-major (rows x cols) buffer is the same memory as a column-major
      (cols x rows) matrix with leading dimension `cols`. Hence:
        dev_featureN is seen as size x count_n; CUBLAS_OP_T makes it count_n x size
        dev_featureM is seen as size x count_m (no transpose)
        dev_result   is written as column-major count_n x count_m,
                     i.e. row-major count_m x count_n = featureM * featureN^T
    */
    blasStatus = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, count_n, count_m, size,
                             &alpha, dev_featureN, size, dev_featureM, size,
                             &beta, dev_result, count_n);
    if (blasStatus != CUBLAS_STATUS_SUCCESS) {
        printf("%s, line %d, cublasSgemm failed (status %d)!\n",
               __func__, __LINE__, static_cast<int>(blasStatus));
        goto out;
    }

    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() replaces
    // it. Check the result so asynchronous GEMM failures are reported here
    // rather than being attributed to the copy below.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaDeviceSynchronize failed: %s\n", __func__, __LINE__,
               cudaGetErrorString(cudaStatus));
        goto out;
    }

    cudaStatus = cudaMemcpy(result, dev_result, bytesResult, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        printf("%s, line %d, cudaMemcpy failed: %s\n", __func__, __LINE__,
               cudaGetErrorString(cudaStatus));
        goto out;
    }

    ok = true;

out:
    if (dev_featureM) cudaFree(dev_featureM);
    if (dev_featureN) cudaFree(dev_featureN);
    if (dev_result) cudaFree(dev_result);
    if (handleCreated) cublasDestroy(handle);
    return ok;
}