-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathmatmul_gpu.mlir
66 lines (52 loc) · 2.12 KB
/
matmul_gpu.mlir
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
// Layout map for 1-D memrefs: index d0 maps to linear position d0.
#strided1D = (d0) -> (d0)
// Layout map for 2-D memrefs: index (d0, d1) maps to linear position
// d0 * s0 + d1, where the symbol s0 is the stride of the first dimension and
// the second dimension has unit stride.
#strided2D = (d0, d1)[s0] -> (d0 * s0 + d1)
// Allocates a byte buffer large enough for %s f32 elements, fills every
// element with %f, and returns the raw byte buffer.
//
// Arguments:
//   %s - number of f32 elements the buffer must hold.
//   %f - value written to every f32 element.
// Returns the underlying memref<?xi8> of 4 * %s bytes; callers create their
// own typed view over it.
func @alloc_filled_f32(%s: index, %f: f32) -> memref<?xi8> {
  // The buffer is allocated as i8, so scale the element count by 4 bytes.
  %c4 = constant 4 : index
  %s4 = muli %s, %c4 : index
  %buf = alloc(%s4) {alignment = 256} : memref<?xi8>
  // Pass the buffer to the externally defined @gpu_alloc hook before filling it.
  call @gpu_alloc(%buf) : (memref<?xi8>) -> ()
  // View operands are [offset][sizes], the same order used by the views in
  // @matmul: no dynamic offset, one dynamic size of %s elements.
  %fp32_view = view %buf[][%s] : memref<?xi8> to memref<?xf32, #strided1D>
  linalg.fill(%fp32_view, %f) : memref<?xf32, #strided1D>, f32
  return %buf : memref<?xi8>
}
// Builds three f32 matrices A (m x k, filled with 2.0), B (k x n, filled with
// 1.0) and C (m x n, filled with 10.0), runs linalg.matmul on them, and returns
// the value @gpu_load2d yields for C at indices (6, 7).
func @matmul() -> f32 {
  // Indices of the result element that is read back.
  %c6 = constant 6 : index
  %c7 = constant 7 : index
  // Matrix dimensions.
  %m = constant 1024 : index
  %k = constant 1024 : index
  %n = constant 1024 : index
  // Element counts are derived from the dimensions so the allocation sizes
  // stay in sync with the view shapes if a dimension is changed.
  %mk = muli %m, %k : index
  %kn = muli %k, %n : index
  %mn = muli %m, %n : index
  // Fill values for A, B and C respectively.
  %f2 = constant 2.0e+0 : f32
  %f1 = constant 1.0e+0 : f32
  %f10 = constant 10.0e+0 : f32
  // Allocate and fill the backing byte buffers.
  %bA = call @alloc_filled_f32(%mk, %f2) : (index, f32) -> memref<?xi8>
  %bB = call @alloc_filled_f32(%kn, %f1) : (index, f32) -> memref<?xi8>
  %bC = call @alloc_filled_f32(%mn, %f10) : (index, f32) -> memref<?xi8>
  // Reinterpret each byte buffer as a 2-D f32 memref (no offset, two dynamic sizes).
  %A = view %bA[][%m, %k] : memref<?xi8> to memref<?x?xf32, #strided2D>
  %B = view %bB[][%k, %n] : memref<?xi8> to memref<?x?xf32, #strided2D>
  %C = view %bC[][%m, %n] : memref<?xi8> to memref<?x?xf32, #strided2D>
  linalg.matmul(%A, %B, %C) : memref<?x?xf32, #strided2D>,
                              memref<?x?xf32, #strided2D>,
                              memref<?x?xf32, #strided2D>
  // Read one element of C back through the external @gpu_load2d hook.
  %res = call @gpu_load2d(%C, %c6, %c7) : (memref<?x?xf32, #strided2D>, index, index) -> f32
  // Release each buffer through the @gpu_dealloc hook first, then free it
  // with dealloc.
  call @gpu_dealloc(%bC) : (memref<?xi8>) -> ()
  call @gpu_dealloc(%bB) : (memref<?xi8>) -> ()
  call @gpu_dealloc(%bA) : (memref<?xi8>) -> ()
  dealloc %bA : memref<?xi8>
  dealloc %bB : memref<?xi8>
  dealloc %bC : memref<?xi8>
  return %res : f32
}
// External hooks: declared without bodies, so their definitions must be
// supplied from outside this file.

// Called by @alloc_filled_f32 on each freshly allocated byte buffer, before
// the buffer is filled.
// NOTE(review): presumably makes the buffer usable from the GPU — confirm
// against the runtime library that defines it.
func @gpu_alloc(memref<?xi8>)
// Called by @matmul on each byte buffer just before the matching dealloc.
// NOTE(review): presumably undoes @gpu_alloc — confirm against the runtime.
func @gpu_dealloc(memref<?xi8>)
// Called by @matmul with a 2-D f32 memref and two indices; returns an f32.
// NOTE(review): presumably reads the element at (index0, index1) — confirm.
func @gpu_load2d(memref<?x?xf32, #strided2D>, index, index) -> f32