-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfloydwarshall.cu
104 lines (85 loc) · 3.01 KB
/
floydwarshall.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#include<stdio.h>
#include<stdlib.h>
#include<unistd.h>
#include"k.h"
#include<math.h>
#include </usr/local/cuda/include/cuda.h>
#define CMCPYHTD cudaMemcpyHostToDevice
#define CMCPYDTH cudaMemcpyDeviceToHost
#define BLOCK_WIDTH 16
// Note: I've adapted this from https://github.com/MTB90
// He has also done a "block version" which performs up to 4xfaster
// for very large matrices. Perhaps this could be adapted too one day
// I have mostly stripped out a lot of the code, cut out the preceding
// matrix stuff, and tried to simplify things
extern "C" K gpu_floydwarshall(K matrix);
/** Warm-up kernel: launched once so later kernels don't pay driver/context
 *  initialisation cost.
 *
 * @param reps dummy bound; threads at or beyond it bail out immediately.
 *             No thread does any real work — the launch itself is the point.
 */
__global__ void wake_gpu_kernel(int reps)
{
    const I tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid >= reps)
        return;
}
/** One Floyd-Warshall relaxation phase on the GPU.
 *
 * Each thread owns one cell (row, col) of the distance matrix and relaxes
 * the path row -> col against the path row -> u -> col. Expects a 2D launch
 * with enough threads to cover an n x n grid; out-of-range threads exit early.
 *
 * @param u intermediate vertex whose paths are relaxed this phase
 * @param n number of vertices in the graph G:=(V,E), n := |V(G)|
 * @param d flat n*n matrix of shortest paths, updated in place
 */
__global__ void fw_kernel(const unsigned int u, const unsigned int n, int * const d)
{
    I row = blockDim.y * blockIdx.y + threadIdx.y;
    I col = blockDim.x * blockIdx.x + threadIdx.x;
    if (row >= n || col >= n)
        return;
    // Candidate distance routed through the intermediate vertex u.
    I viaU = d[row * n + u] + d[u * n + col];
    if (d[row * n + col] > viaU)
        d[row * n + col] = viaU;
}
/** Runs all-pairs shortest paths (Floyd-Warshall) on the GPU.
 *
 * @param matrix K int vector holding a flat n*n adjacency/distance matrix;
 *               its payload is overwritten in place with the shortest paths.
 * @return the same K object, reference count bumped via r1.
 *
 * CUDA failures are reported on stderr and the matrix is returned unmodified
 * (or partially modified) rather than crashing the q process.
 */
K gpu_floydwarshall(K matrix)
{
    // Vertex count: the input is a flat n*n matrix, so n = sqrt(length).
    unsigned int n = (unsigned int) sqrt((double) matrix->n);
    I *dev_d = 0;
    cudaStream_t cpyStream;
    cudaError_t err;
    // Empty matrix: nothing to relax. This guard also prevents (n - 1)
    // underflowing to UINT_MAX in the unsigned grid-size arithmetic below.
    if (n == 0) R r1(matrix);
    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaSetDevice(0);
    // One thread per matrix cell: BLOCK_WIDTH x BLOCK_WIDTH blocks, with
    // enough blocks to cover n in each dimension (ceiling division).
    dim3 dimGrid((n - 1) / BLOCK_WIDTH + 1, (n - 1) / BLOCK_WIDTH + 1);
    dim3 dimBlock(BLOCK_WIDTH, BLOCK_WIDTH);
    // Dedicated stream for the host-to-device copy.
    cudaStreamCreate(&cpyStream);
    // Allocate GPU buffer for the matrix of shortest paths d.
    err = cudaMalloc((void**)&dev_d, (size_t) n * n * sizeof(int));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "gpu_floydwarshall: cudaMalloc failed: %s\n", cudaGetErrorString(err));
        cudaStreamDestroy(cpyStream);
        R r1(matrix);
    }
    // Wake up gpu so the first relaxation kernel doesn't pay init cost.
    wake_gpu_kernel<<<1, dimBlock>>>(32);
    // kI(matrix) is the K object's int payload; the result is copied back
    // over it below, so the caller's matrix is updated in place.
    I *host_memoryd = (int*)&(kI(matrix)[0]);
    cudaMemcpyAsync(dev_d, host_memoryd, (size_t) n * n * sizeof(int), CMCPYHTD, cpyStream);
    // Wait for the copy (and the wake-up kernel) before relaxing.
    cudaDeviceSynchronize();
    // Prefer larger L1 cache over shared memory: fw_kernel uses no shared mem.
    cudaFuncSetCacheConfig(fw_kernel, cudaFuncCachePreferL1);
    // One launch per intermediate vertex u. Launches on the default stream
    // are serialized, which provides the ordering the algorithm requires.
    for (unsigned int u = 0; u < n; ++u)
    {
        fw_kernel<<<dimGrid, dimBlock>>>(u, n, dev_d);
    }
    // Surface any launch-configuration errors from the loop above.
    err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "gpu_floydwarshall: kernel launch failed: %s\n", cudaGetErrorString(err));
    // Blocking copy on the default stream implicitly waits for the kernels,
    // then writes the shortest paths back into the K object's storage.
    err = cudaMemcpy(host_memoryd, dev_d, (size_t) n * n * sizeof(int), CMCPYDTH);
    if (err != cudaSuccess)
        fprintf(stderr, "gpu_floydwarshall: copy-back failed: %s\n", cudaGetErrorString(err));
    // Release device memory and the copy stream.
    cudaFree(dev_d);
    cudaStreamDestroy(cpyStream);
    R r1(matrix);
}