Skip to content

Commit 179e4e9

Browse files
Add field-last benchmark script
wip wip
1 parent 4704699 commit 179e4e9

File tree

2 files changed

+344
-0
lines changed

2 files changed

+344
-0
lines changed

.buildkite/pipeline.yml

+10
Original file line numberDiff line numberDiff line change
@@ -1294,6 +1294,16 @@ steps:
12941294
agents:
12951295
slurm_gpus: 1
12961296

1297+
- label: "Perf: benchmark scripts benchmark_field_last"
1298+
key: benchmark_field_last
1299+
command:
1300+
- "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'"
1301+
- "julia --color=yes --project=.buildkite benchmarks/scripts/benchmark_field_last.jl"
1302+
env:
1303+
CLIMACOMMS_DEVICE: "CUDA"
1304+
agents:
1305+
slurm_gpus: 1
1306+
12971307
- group: "Perf: Operators"
12981308
steps:
12991309

+334
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,334 @@
1+
#=
2+
julia --project=.buildkite
3+
using Revise; include(joinpath("benchmarks", "scripts", "benchmark_field_last.jl"))
4+
5+
# Info
6+
7+
# Benchmark results:
8+
9+
Clima A100:
10+
```
11+
Kernel `add3(x1, x2, x3) = x1+x2+x3` and `n_reads_writes=4`:
12+
[ Info: ArrayType = CuArray
13+
Problem size: (63, 4, 4, 5400, 1), float_type = Float32, device_bandwidth_GBs=2039
14+
┌─────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
15+
│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │
16+
├─────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
17+
│ FLD.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 72 microseconds, 899 nanoseconds │ 54.568 │ 1112.64 │ 4 │ 100 │
18+
│ FLD.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 56 microseconds, 259 nanoseconds │ 70.708 │ 1441.74 │ 4 │ 100 │
19+
│ FLD.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 56 microseconds, 515 nanoseconds │ 70.3877 │ 1435.21 │ 4 │ 100 │
20+
│ FLD.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 67 microseconds, 462 nanoseconds │ 58.9663 │ 1202.32 │ 4 │ 100 │
21+
└─────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
22+
23+
Kernel `add3(x1, x2, x3) = x1+x2+x3` and `n_reads_writes=4`:
24+
[ Info: ArrayType = CuArray
25+
Problem size: (63, 4, 4, 5400, 1), float_type = Float64, device_bandwidth_GBs=2039
26+
┌─────────────────────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
27+
│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │
28+
├─────────────────────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
29+
│ FLD.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 106 microseconds, 783 nanoseconds │ 74.5051 │ 1519.16 │ 4 │ 100 │
30+
│ FLD.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 102 microseconds, 472 nanoseconds │ 77.6396 │ 1583.07 │ 4 │ 100 │
31+
│ FLD.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 102 microseconds, 523 nanoseconds │ 77.6008 │ 1582.28 │ 4 │ 100 │
32+
│ FLD.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 106 microseconds, 834 nanoseconds │ 74.4694 │ 1518.43 │ 4 │ 100 │
33+
└─────────────────────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
34+
35+
Kernel `add3(x1, x2, x3) = x1` and `n_reads_writes=2`:
36+
[ Info: ArrayType = CuArray
37+
Problem size: (63, 4, 4, 5400, 1), float_type = Float32, device_bandwidth_GBs=2039
38+
┌─────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
39+
│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │
40+
├─────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
41+
│ FLD.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 61 microseconds, 185 nanoseconds │ 32.5079 │ 662.837 │ 2 │ 100 │
42+
│ FLD.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 31 microseconds, 376 nanoseconds │ 63.3926 │ 1292.57 │ 2 │ 100 │
43+
│ FLD.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 31 microseconds, 120 nanoseconds │ 63.9141 │ 1303.21 │ 2 │ 100 │
44+
│ FLD.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 44 microseconds, 53 nanoseconds │ 45.1499 │ 920.607 │ 2 │ 100 │
45+
└─────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
46+
```
47+
48+
# CPU (Mac M1)
49+
```
50+
[ Info: ArrayType = identity
51+
Problem size: (63, 4, 4, 5400, 1), float_type = Float32, device_bandwidth_GBs=2039
52+
┌─────────────────────────────────────────────────────────────────────┬───────────────────────────────────┬──────────┬─────────────┬────────────────┬────────┐
53+
│ funcs │ time per call (CPU) │ bw % │ achieved bw │ n-reads/writes │ n-reps │
54+
├─────────────────────────────────────────────────────────────────────┼───────────────────────────────────┼──────────┼─────────────┼────────────────┼────────┤
55+
│ FLD.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100) │ 16 milliseconds, 494 microseconds │ 0.241171 │ 4.91747 │ 4 │ 100 │
56+
│ FLD.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100) │ 783 microseconds, 256 nanoseconds │ 5.07871 │ 103.555 │ 4 │ 100 │
57+
│ FLD.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 790 microseconds, 894 nanoseconds │ 5.02966 │ 102.555 │ 4 │ 100 │
58+
│ FLD.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100) │ 12 milliseconds, 522 microseconds │ 0.317663 │ 6.47714 │ 4 │ 100 │
59+
└─────────────────────────────────────────────────────────────────────┴───────────────────────────────────┴──────────┴─────────────┴────────────────┴────────┘
60+
```
61+
62+
=#
63+
64+
#! format: off
65+
module BenchmarkFieldLastIndex
66+
67+
using CUDA
68+
include("benchmark_utils.jl")
69+
70+
"""
    const_linear_index(us::UniversalSizesStatic, I, field_index)

Return `I` shifted by `field_index` whole single-field slabs, where one slab
holds `get_Nv(us) * get_Nij(us) * get_Nij(us) * get_Nh(us)` entries.
"""
@inline function const_linear_index(us::UniversalSizesStatic, I, field_index)
    slab_dims = (get_Nv(us), get_Nij(us), get_Nij(us), get_Nh(us), 1)
    slab_length = prod(slab_dims)
    return I + slab_length * field_index
end
75+
76+
"""
    const_linear_index_reference(us::UniversalSizesStatic, I, field_index)

Reference computation of the shifted linear index using Base index types.

`I` is converted to a Cartesian index over a single field of extent
`(get_Nv(us), get_Nij(us), get_Nij(us), get_Nh(us), 1)`, moved `field_index`
steps along the fifth dimension, and converted back to a linear index over an
array whose fifth extent is `field_index + 1`.
"""
@inline function const_linear_index_reference(us::UniversalSizesStatic, I, field_index)
    spatial = (get_Nv(us), get_Nij(us), get_Nij(us), get_Nh(us))
    single_field = CartesianIndices((spatial..., 1))
    stacked_fields = LinearIndices((spatial..., field_index + 1))
    field_shift = CartesianIndex((0, 0, 0, 0, field_index))
    return stacked_fields[single_field[I] + field_shift]
end
81+
82+
"""
    add3(x1, x2, x3)

Return `x1 + x2 + x3`.

This is the pointwise operation applied by every benchmark variant in this
module. All three inputs are read so that the memory traffic matches the
`n_reads_writes = 4` (three reads plus one write) that each driver passes to
`push_info`. If this is changed to read fewer inputs, the `n_reads_writes`
values in the drivers must be changed to match.
"""
add3(x1, x2, x3) = x1 + x2 + x3
84+
85+
"""
    aos_cart_offset!(X, Y, us; nreps = 1, bm = nothing, n_trials = 30)

Time `Y[c] = add3(X[c], X[c + d], X[c + 2d])`, where `c` is a Cartesian index
into the first field and `d = CartesianIndex((0, 0, 0, 0, 1))` steps along the
fifth dimension.

When `Y isa Array` the sweep runs as a host loop timed with `Base.@elapsed`;
otherwise `aos_cart_offset_kernel!` is compiled with `CUDA.@cuda launch = false`
and timed with `CUDA.@elapsed`. Each of the `n_trials` trials performs `nreps`
back-to-back sweeps; the fastest trial divided by `nreps` is forwarded to
`push_info` together with `bm`.

Returns `nothing`.
"""
function aos_cart_offset!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
    fastest = Inf
    if Y isa Array
        first_field = CartesianIndices((get_Nv(us), get_Nij(us), get_Nij(us), get_Nh(us), 1))
        for _ in 1:n_trials
            elapsed = Base.@elapsed begin
                for _ in 1:nreps
                    @inbounds @simd for I in 1:get_N(us)
                        c1 = first_field[I]
                        c2 = c1 + CartesianIndex((0, 0, 0, 0, 1))
                        c3 = c1 + CartesianIndex((0, 0, 0, 0, 2))
                        Y[c1] = add3(X[c1], X[c2], X[c3])
                    end
                end
            end
            fastest = min(fastest, elapsed)
        end
    else
        kernel = CUDA.@cuda always_inline = true launch = false aos_cart_offset_kernel!(X,Y,us)
        max_threads = CUDA.launch_configuration(kernel.fun).threads
        npoints = get_N(us)
        threads = min(npoints, max_threads)
        blocks = cld(npoints, threads)
        for _ in 1:n_trials
            # Several launches per timing window reduce variance / impact of launch latency.
            elapsed = CUDA.@elapsed begin
                for _ in 1:nreps
                    kernel(X,Y,us; threads, blocks)
                end
            end
            fastest = min(fastest, elapsed)
        end
    end
    push_info(
        bm;
        kernel_time_s = fastest / nreps,
        nreps,
        caller = @caller_name(@__FILE__),
        problem_size = size(us),
        n_reads_writes = 4,
    )
    return nothing
end
120+
"""
    aos_cart_offset_kernel!(X, Y, us)

CUDA kernel used by `aos_cart_offset!`. Each thread derives a global index `I`
from its block and thread ids, converts it to a Cartesian index `CI1` into the
first field, and writes `Y[CI1] = add3(X[CI1], X[CI2], X[CI3])`, where `CI2`
and `CI3` are `CI1` moved one and two steps along the fifth dimension.

Returns `nothing`.
"""
function aos_cart_offset_kernel!(X, Y, us)
    @inbounds begin
        I = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
        # The launch uses `cld(get_N(us), threads)` blocks, so the last block can
        # contain threads past the final point; those must not touch memory.
        if I <= get_N(us)
            n = (get_Nv(us), get_Nij(us), get_Nij(us), get_Nh(us), 1)
            CI1 = CartesianIndices(map(x -> Base.OneTo(x), n))[I]
            CI2 = CI1 + CartesianIndex((0, 0, 0, 0, 1))
            CI3 = CI1 + CartesianIndex((0, 0, 0, 0, 2))
            Y[CI1] = add3(X[CI1], X[CI2], X[CI3])
        end
    end
    return nothing
end
133+
134+
"""
    aos_lin_offset!(X, Y, us; nreps = 1, bm = nothing, n_trials = 30)

Time `Y[l0] = add3(X[l0], X[l1], X[l2])`, where `lk` is the linear index
returned by `const_linear_index(us, I, k)`.

When `Y isa Array` the sweep runs as a host loop timed with `Base.@elapsed`;
otherwise `aos_lin_offset_kernel!` is compiled with `CUDA.@cuda launch = false`
and timed with `CUDA.@elapsed`. Each of the `n_trials` trials performs `nreps`
back-to-back sweeps; the fastest trial divided by `nreps` is forwarded to
`push_info` together with `bm`.

Returns `nothing`.
"""
function aos_lin_offset!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
    fastest = Inf
    if Y isa Array
        for _ in 1:n_trials
            elapsed = Base.@elapsed begin
                for _ in 1:nreps
                    @inbounds @simd for I in 1:get_N(us)
                        dst = const_linear_index(us, I, 0)
                        src1 = const_linear_index(us, I, 0)
                        src2 = const_linear_index(us, I, 1)
                        src3 = const_linear_index(us, I, 2)
                        Y[dst] = add3(X[src1], X[src2], X[src3])
                    end
                end
            end
            fastest = min(fastest, elapsed)
        end
    else
        kernel = CUDA.@cuda always_inline = true launch = false aos_lin_offset_kernel!(X,Y,us)
        max_threads = CUDA.launch_configuration(kernel.fun).threads
        npoints = get_N(us)
        threads = min(npoints, max_threads)
        blocks = cld(npoints, threads)
        for _ in 1:n_trials
            elapsed = CUDA.@elapsed begin
                for _ in 1:nreps
                    kernel(X,Y,us; threads, blocks)
                end
            end
            fastest = min(fastest, elapsed)
        end
    end
    push_info(
        bm;
        kernel_time_s = fastest / nreps,
        nreps,
        caller = @caller_name(@__FILE__),
        problem_size = size(us),
        n_reads_writes = 4,
    )
    return nothing
end
169+
"""
    aos_lin_offset_kernel!(X, Y, us)

CUDA kernel used by `aos_lin_offset!`. Each thread derives a global index `I`
from its block and thread ids and writes
`Y[LY1] = add3(X[LX1], X[LX2], X[LX3])`, with every index obtained from
`const_linear_index(us, I, k)` for `k = 0, 0, 1, 2` respectively.

Returns `nothing`.
"""
function aos_lin_offset_kernel!(X, Y, us)
    @inbounds begin
        I = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
        # The launch uses `cld(get_N(us), threads)` blocks, so the last block can
        # contain threads past the final point; those must not touch memory.
        if I <= get_N(us)
            LY1 = const_linear_index(us, I, 0)
            LX1 = const_linear_index(us, I, 0)
            LX2 = const_linear_index(us, I, 1)
            LX3 = const_linear_index(us, I, 2)
            Y[LY1] = add3(X[LX1], X[LX2], X[LX3])
        end
    end
    return nothing
end
182+
183+
"""
    soa_cart_index!(X, Y, us; nreps = 1, bm = nothing, n_trials = 30)

Time `y1[c] = add3(x1[c], x2[c], x3[c])` with `c` a four-dimensional Cartesian
index, where `(x1, x2, x3) = X` and `(y1,) = Y` are tuples of separate arrays.

When `first(Y) isa Array` the sweep runs as a host loop timed with
`Base.@elapsed`; otherwise `soa_cart_index_kernel!` is compiled with
`CUDA.@cuda launch = false` and timed with `CUDA.@elapsed`. Each of the
`n_trials` trials performs `nreps` back-to-back sweeps; the fastest trial
divided by `nreps` is forwarded to `push_info` together with `bm`.

Returns `nothing`.
"""
function soa_cart_index!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
    fastest = Inf
    if !(first(Y) isa Array)
        kernel = CUDA.@cuda always_inline = true launch = false soa_cart_index_kernel!(X,Y,us)
        max_threads = CUDA.launch_configuration(kernel.fun).threads
        npoints = get_N(us)
        threads = min(npoints, max_threads)
        blocks = cld(npoints, threads)
        for _ in 1:n_trials
            # Several launches per timing window reduce variance / impact of launch latency.
            elapsed = CUDA.@elapsed begin
                for _ in 1:nreps
                    kernel(X,Y,us; threads, blocks)
                end
            end
            fastest = min(fastest, elapsed)
        end
    else
        points = CartesianIndices((get_Nv(us), get_Nij(us), get_Nij(us), get_Nh(us)))
        for _ in 1:n_trials
            elapsed = Base.@elapsed begin
                for _ in 1:nreps
                    (dst,) = Y
                    (src1, src2, src3) = X
                    @inbounds @simd for I in 1:get_N(us)
                        dst[points[I]] = add3(src1[points[I]], src2[points[I]], src3[points[I]])
                    end
                end
            end
            fastest = min(fastest, elapsed)
        end
    end
    push_info(
        bm;
        kernel_time_s = fastest / nreps,
        nreps,
        caller = @caller_name(@__FILE__),
        problem_size = size(us),
        n_reads_writes = 4,
    )
    return nothing
end
216+
"""
    soa_cart_index_kernel!(X, Y, us)

CUDA kernel used by `soa_cart_index!`. Each thread derives a global index `I`
from its block and thread ids, maps it through a four-dimensional
`CartesianIndices`, and writes `y1[CI[I]] = add3(x1[CI[I]], x2[CI[I]], x3[CI[I]])`,
where `(x1, x2, x3) = X` and `(y1,) = Y`.

Returns `nothing`.
"""
function soa_cart_index_kernel!(X, Y, us)
    @inbounds begin
        I = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
        # The launch uses `cld(get_N(us), threads)` blocks, so the last block can
        # contain threads past the final point; those must not touch memory.
        if I <= get_N(us)
            CI = CartesianIndices((get_Nv(us), get_Nij(us), get_Nij(us), get_Nh(us)))
            (y1,) = Y
            (x1, x2, x3) = X
            y1[CI[I]] = add3(x1[CI[I]], x2[CI[I]], x3[CI[I]])
        end
    end
    return nothing
end
228+
229+
"""
    soa_linear_index!(X, Y, us; nreps = 1, bm = nothing, n_trials = 30)

Time `y1[I] = add3(x1[I], x2[I], x3[I])` with `I` a linear index, where
`(x1, x2, x3) = X` and `(y1,) = Y` are tuples of separate arrays.

When `first(Y) isa Array` the sweep runs as a host loop timed with
`Base.@elapsed`; otherwise `soa_linear_index_kernel!` is compiled with
`CUDA.@cuda launch = false` and timed with `CUDA.@elapsed`. Each of the
`n_trials` trials performs `nreps` back-to-back sweeps; the fastest trial
divided by `nreps` is forwarded to `push_info` together with `bm`.

Returns `nothing`.
"""
function soa_linear_index!(X, Y, us; nreps = 1, bm=nothing, n_trials = 30)
    fastest = Inf
    if !(first(Y) isa Array)
        kernel = CUDA.@cuda always_inline = true launch = false soa_linear_index_kernel!(X,Y,us)
        max_threads = CUDA.launch_configuration(kernel.fun).threads
        npoints = get_N(us)
        threads = min(npoints, max_threads)
        blocks = cld(npoints, threads)
        for _ in 1:n_trials
            # Several launches per timing window reduce variance / impact of launch latency.
            elapsed = CUDA.@elapsed begin
                for _ in 1:nreps
                    kernel(X,Y,us; threads, blocks)
                end
            end
            fastest = min(fastest, elapsed)
        end
    else
        for _ in 1:n_trials
            elapsed = Base.@elapsed begin
                for _ in 1:nreps
                    (dst,) = Y
                    (src1, src2, src3) = X
                    @inbounds @simd for I in 1:get_N(us)
                        dst[I] = add3(src1[I], src2[I], src3[I])
                    end
                end
            end
            fastest = min(fastest, elapsed)
        end
    end
    push_info(
        bm;
        kernel_time_s = fastest / nreps,
        nreps,
        caller = @caller_name(@__FILE__),
        problem_size = size(us),
        n_reads_writes = 4,
    )
    return nothing
end
261+
"""
    soa_linear_index_kernel!(X, Y, us)

CUDA kernel used by `soa_linear_index!`. Each thread derives a global index `I`
from its block and thread ids and writes `y1[I] = add3(x1[I], x2[I], x3[I])`,
where `(x1, x2, x3) = X` and `(y1,) = Y`.

Returns `nothing`.
"""
function soa_linear_index_kernel!(X, Y, us)
    @inbounds begin
        I = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
        # The launch uses `cld(get_N(us), threads)` blocks, so the last block can
        # contain threads past the final point; those must not touch memory.
        if I <= get_N(us)
            (y1,) = Y
            (x1, x2, x3) = X
            y1[I] = add3(x1[I], x2[I], x3[I])
        end
    end
    return nothing
end
272+
273+
end # module
274+
275+
import .BenchmarkFieldLastIndex as FLD
276+
277+
"""
    fill_with_rand!(arr)

Overwrite every element of `arr` with a fresh `rand(eltype(arr))` sample and
return `arr`.

The samples are drawn into a host array of the same size, converted with
`typeof(arr)(...)`, and then broadcast-assigned into `arr`.
"""
function fill_with_rand!(arr)
    host_samples = rand(eltype(arr), size(arr))
    arr .= typeof(arr)(host_samples)
    return arr
end
283+
284+
using CUDA
285+
using Test
286+
@testset "Field last dim benchmark" begin
    # The last (field) entry of `problem_size` is 1 to avoid double counting reads/writes.
    bm = FLD.Benchmark(;problem_size=(63,4,4,5400,1), float_type=Float32)
    ArrayType = CUDA.CuArray;
    # ArrayType = Base.identity;
    arr(float_type, problem_size, T) = T(zeros(float_type, problem_size...))

    s = (63,4,4,5400,3);  # AoS input: three fields stacked along the last dimension
    sY = (63,4,4,5400,1); # AoS output: a single field
    st = (63,4,4,5400);   # one SoA field
    us = FLD.UniversalSizesStatic(s[1], s[2], s[end-1]);

    X_aos = arr(bm.float_type, s, ArrayType);
    Y_aos = arr(bm.float_type, sY, ArrayType);
    X_aos_ref = arr(bm.float_type, s, ArrayType);
    Y_aos_ref = arr(bm.float_type, sY, ArrayType);
    X_soa = ntuple(_ -> arr(bm.float_type, st, ArrayType), 3);
    Y_soa = ntuple(_ -> arr(bm.float_type, st, ArrayType), 1);

    # Give every layout the same random data so the variants can be compared.
    fill_with_rand!(X_aos)
    fill_with_rand!(Y_aos)
    X_aos_ref .= X_aos
    Y_aos_ref .= Y_aos
    for i in 1:3; X_soa[i] .= X_aos[:,:,:,:, i]; end
    for i in 1:1; Y_soa[i] .= Y_aos[:,:,:,:, i]; end
    @info "ArrayType = $ArrayType"

    # One pass of each variant without a `bm` (it defaults to `nothing`);
    # the results are compared below.
    FLD.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; n_trials = 1, nreps = 1)
    FLD.aos_lin_offset!(X_aos, Y_aos, us; n_trials = 1, nreps = 1)
    FLD.soa_linear_index!(X_soa, Y_soa, us; n_trials = 1, nreps = 1)

    # Inputs must still match across layouts and all outputs must agree.
    @test all(X_aos .== X_aos_ref)
    @test all(Y_aos .== Y_aos_ref)
    for i in 1:3; @test all(X_soa[i] .== X_aos_ref[:,:,:,:,i]); end
    for i in 1:1; @test all(Y_soa[i] .== Y_aos_ref[:,:,:,:,i]); end

    FLD.soa_cart_index!(X_soa, Y_soa, us; n_trials = 1, nreps = 1)

    for i in 1:3; @test all(X_soa[i] .== X_aos_ref[:,:,:,:,i]); end
    for i in 1:1; @test all(Y_soa[i] .== Y_aos_ref[:,:,:,:,i]); end

    # Timed runs: results are recorded in `bm` and tabulated at the end.
    FLD.aos_cart_offset!(X_aos_ref, Y_aos_ref, us; bm, nreps = 100)
    FLD.aos_lin_offset!(X_aos, Y_aos, us; bm, nreps = 100)
    FLD.soa_linear_index!(X_soa, Y_soa, us; bm, nreps = 100)
    FLD.soa_cart_index!(X_soa, Y_soa, us; bm, nreps = 100)

    FLD.tabulate_benchmark(bm)
end
333+
334+
# #! format: on

0 commit comments

Comments
 (0)