Skip to content

Commit cacbb5c

Browse files
Drop field dimension to demo thermo benchmark
Rename TupleOfArrays to FieldArrays Apply formatter wip Pass some unit tests, cleanup
1 parent 05ae9c0 commit cacbb5c

18 files changed

+993
-373
lines changed

benchmarks/scripts/thermo_bench_bw.jl

+14-12
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ using BenchmarkTools
150150
import .TestUtilities as TU;
151151

152152
using Test
153-
@testset "Thermo state" begin
153+
# @testset "Thermo state" begin
154154
FT = Float32
155155
bm = TBB.Benchmark(;problem_size=(63,4,4,1,5400), float_type=FT)
156156
device = ClimaComms.device()
@@ -175,7 +175,8 @@ using Test
175175
)
176176
x = fill((; ts = zero(TBB.PhaseEquil{FT}), nt_core...), cspace)
177177
xv = fill((; ts = nt_ts, nt_core...), cspace)
178-
(_, Nij, _, Nv, Nh) = size(Fields.field_values(x.ts))
178+
fv_ts = Fields.field_values(x.ts)
179+
(_, Nij, _, Nv, Nh) = size(fv_ts)
179180
us = TBB.UniversalSizesStatic(Nv, Nij, Nh)
180181
function to_vec(ξ)
181182
pns = propertynames(ξ)
@@ -186,7 +187,7 @@ using Test
186187
end
187188
return (; zip(propertynames(ξ), dl_vals)...)
188189
end
189-
x_vec = to_vec(xv)
190+
# x_vec = to_vec(xv)
190191

191192
x_aos = fill((; ρ_read = FT(0), ρ_write = FT(0)), cspace)
192193
x_soa = (;
@@ -199,20 +200,21 @@ using Test
199200
@. x_aos.ρ_write = 7
200201
TBB.singlefield_bc!(x_soa, us; nreps=1, n_trials = 1)
201202
TBB.singlefield_bc!(x_aos, us; nreps=1, n_trials = 1)
202-
203+
203204
TBB.thermo_func_bc!(x, us; nreps=1, n_trials = 1)
204-
TBB.thermo_func_sol!(x_vec, us; nreps=1, n_trials = 1)
205+
# TBB.thermo_func_sol!(x_vec, us; nreps=1, n_trials = 1)
205206

206-
rc = Fields.rcompare(x_vec, to_vec(x))
207-
rc || Fields.@rprint_diff(x_vec, to_vec(x)) # test correctness (should print nothing)
208-
@test rc # test correctness
207+
# rc = Fields.rcompare(x_vec, to_vec(x))
208+
# rc || Fields.@rprint_diff(x_vec, to_vec(x)) # test correctness (should print nothing)
209+
# @test rc # test correctness
209210

210-
TBB.singlefield_bc!(x_soa, us; nreps=100, bm)
211-
TBB.singlefield_bc!(x_aos, us; nreps=100, bm)
211+
# TBB.singlefield_bc!(x_soa, us; nreps=100, bm)
212+
# TBB.singlefield_bc!(x_aos, us; nreps=100, bm)
212213
TBB.thermo_func_bc!(x, us; nreps=100, bm)
213-
TBB.thermo_func_sol!(x_vec, us; nreps=100, bm)
214+
@info "Success!"
215+
# TBB.thermo_func_sol!(x_vec, us; nreps=100, bm)
214216

215217
TBB.tabulate_benchmark(bm)
216218

217-
end
219+
# end
218220
#! format: on

ext/ClimaCoreCUDAExt.jl

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import ClimaCore.Utilities: cart_ind, linear_ind
1717
import ClimaCore.RecursiveApply:
1818
⊠, ⊞, ⊟, radd, rmul, rsub, rdiv, rmap, rzero, rmin, rmax
1919
import ClimaCore.DataLayouts: get_N, get_Nv, get_Nij, get_Nij, get_Nh
20+
import ClimaCore.DataLayouts: universal_size, UniversalSize
2021

2122
include(joinpath("cuda", "cuda_utils.jl"))
2223
include(joinpath("cuda", "data_layouts.jl"))

ext/cuda/data_layouts.jl

+18
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@ import CUDA
1313
# Map any `CuArray` type with element type `T` and buffer type `B` to the
# corresponding `CuArray` type with the dimension parameter left free.
function parent_array_type(
    ::Type{<:CUDA.CuArray{T, N, B} where {N}},
) where {T, B}
    return CUDA.CuArray{T, N, B} where {N}
end
1515

16+
# Variant that pins the dimension count: given a `CuArray` type with element
# type `T` and buffer type `B`, plus `Val(ND)`, return `CuArray{T, ND, B}`.
function parent_array_type(
    ::Type{<:CUDA.CuArray{T, N, B} where {N}},
    ::Val{ND},
) where {T, B, ND}
    return CUDA.CuArray{T, ND, B}
end
20+
1621
# Ensure that both parent array types have the same memory buffer type.
1722
promote_parent_array_type(
1823
::Type{CUDA.CuArray{T1, N, B} where {N}},
@@ -53,3 +58,16 @@ function Adapt.adapt_structure(
5358
end,
5459
)
5560
end
61+
62+
import Adapt
63+
import CUDA
64+
# Adapt a `NonExtrudedBroadcasted` for use inside a CUDA kernel by adapting
# each of its three fields (`f`, `args`, `axes`) and rebuilding the object with
# the same `Style` parameter.
function Adapt.adapt_structure(
    to::CUDA.KernelAdaptor,
    bc::DataLayouts.NonExtrudedBroadcasted{Style},
) where {Style}
    # The function field goes through `adapt_f`; the remaining fields use the
    # generic `Adapt.adapt`.
    f′ = adapt_f(to, bc.f)
    args′ = Adapt.adapt(to, bc.args)
    axes′ = Adapt.adapt(to, bc.axes)
    return DataLayouts.NonExtrudedBroadcasted{Style}(f′, args′, axes′)
end

ext/cuda/data_layouts_copyto.jl

+54-12
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import ClimaCore.DataLayouts:
2+
to_non_extruded_broadcasted, has_uniform_datalayouts
13
# Select the `ToCUDA` dispatch tag whenever the argument is a `CUDA.CuArray`.
DataLayouts._device_dispatch(x::CUDA.CuArray) = ToCUDA()
24

35
function knl_copyto!(dest, src)
@@ -15,36 +17,76 @@ function knl_copyto!(dest, src)
1517
return nothing
1618
end
1719

20+
# Kernel body: copy one element of `src` into `dest` per thread.
#
# - `dest`: destination; must support `size(dest)` and `dest[I] = ...`.
# - `src`:  source (e.g. a broadcasted object); must support `src[I]`.
# - `us`:   size descriptor; `get_N(us)` is the number of elements to copy.
#
# Returns `nothing`.
function knl_copyto_field_array!(dest, src, us)
    @inbounds begin
        tidx = thread_index()
        # Threads whose index exceeds the element count do nothing.
        if tidx ≤ get_N(us)
            # Convert the flat thread index into an index into `dest`.
            I = kernel_indexes(tidx, size(dest))
            dest[I] = src[I]
        end
    end
    return nothing
end
31+
1832
function Base.copyto!(
1933
dest::IJFH{S, Nij, Nh},
2034
bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh},
2135
::ToCUDA,
2236
) where {S, Nij, Nh}
37+
us = DataLayouts.UniversalSize(dest)
2338
if Nh > 0
2439
auto_launch!(
25-
knl_copyto!,
26-
(dest, bc);
27-
threads_s = (Nij, Nij),
28-
blocks_s = (Nh, 1),
40+
knl_copyto_field_array!,
41+
(dest, bc, us),
42+
prod(DataLayouts.universal_size(us));
43+
auto = true,
2944
)
3045
end
3146
return dest
3247
end
3348

49+
# Kernel body: copy `bc` into `dest` using the flat thread index directly as
# the index into both (`dest[tidx] = bc[tidx]`).
#
# - `dest`: destination data layout.
# - `bc`:   source; must support indexing with the same flat index as `dest`.
# - `us`:   size descriptor; `get_N(us)` is the number of elements to copy.
#
# Returns `nothing`.
function knl_copyto_linear!(dest::AbstractData, bc, us)
    @inbounds begin
        tidx = thread_index()
        # Threads whose index exceeds the element count do nothing.
        if tidx ≤ get_N(us)
            dest[tidx] = bc[tidx]
        end
    end
    return nothing
end
58+
59+
# Kernel body specialized for `DataF` destinations, which are written through
# the zero-argument accessor `dest[]`.
#
# - `dest`: `DataF` destination.
# - `bc`:   source; indexed with the flat thread index.
# - `us`:   accepted so the signature matches the `AbstractData` method; it is
#           not read here.
#
# Returns `nothing`.
function knl_copyto_linear!(dest::DataF, bc, us)
    # `tidx` was previously referenced without being defined in this method;
    # obtain it the same way the sibling kernels do.
    # NOTE(review): presumably launched with a single active thread, so that
    # `tidx == 1` — confirm against the launch configuration.
    tidx = thread_index()
    @inbounds dest[] = bc[tidx]
    return nothing
end
63+
64+
# Kernel body: copy `src` into `dest` one element per thread, converting the
# flat thread index into an index into `dest` via `kernel_indexes`.
#
# - `dest`: destination; must support `size(dest)` and `dest[I] = ...`.
# - `src`:  source (e.g. a broadcasted object); must support `src[I]`.
# - `us`:   size descriptor; `get_N(us)` is the number of elements to copy.
#
# Returns `nothing`.
function knl_copyto_cart!(dest, src, us)
    @inbounds begin
        tidx = thread_index()
        # Threads whose index exceeds the element count do nothing.
        if tidx ≤ get_N(us)
            I = kernel_indexes(tidx, size(dest))
            dest[I] = src[I]
        end
    end
    return nothing
end
75+
3476
function Base.copyto!(
3577
dest::VIJFH{S, Nv, Nij, Nh},
3678
bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh},
3779
::ToCUDA,
3880
) where {S, Nv, Nij, Nh}
3981
if Nv > 0 && Nh > 0
40-
Nv_per_block = min(Nv, fld(256, Nij * Nij))
41-
Nv_blocks = cld(Nv, Nv_per_block)
42-
auto_launch!(
43-
knl_copyto!,
44-
(dest, bc);
45-
threads_s = (Nij, Nij, Nv_per_block),
46-
blocks_s = (Nh, Nv_blocks),
47-
)
82+
us = DataLayouts.UniversalSize(dest)
83+
n = prod(DataLayouts.universal_size(us))
84+
if has_uniform_datalayouts(bc)
85+
bc′ = to_non_extruded_broadcasted(bc)
86+
auto_launch!(knl_copyto_linear!, (dest, bc′, us), n; auto = true)
87+
else
88+
auto_launch!(knl_copyto_cart!, (dest, bc, us), n; auto = true)
89+
end
4890
end
4991
return dest
5092
end

ext/cuda/data_layouts_fill.jl

+8-8
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
function knl_fill_flat!(dest::AbstractData, val, us)
2-
@inbounds begin
3-
tidx = thread_index()
4-
if tidx ≤ get_N(us)
5-
n = size(dest)
6-
I = kernel_indexes(tidx, n)
7-
@inbounds dest[I] = val
8-
end
9-
end
2+
# @inbounds begin
3+
# tidx = thread_index()
4+
# if tidx ≤ get_N(us)
5+
# n = size(dest)
6+
# I = kernel_indexes(tidx, n)
7+
# @inbounds dest[I] = val
8+
# end
9+
# end
1010
return nothing
1111
end
1212

ext/cuda/topologies_dss.jl

+13-10
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,9 @@ function dss_load_perimeter_data_kernel!(
4848
if gidx ≤ prod(sizep)
4949
(level, p, fidx, elem) = cart_ind(sizep, gidx).I
5050
(ip, jp) = perimeter[p]
51-
data_idx = linear_ind(sized, (level, ip, jp, fidx, elem))
52-
pperimeter_data[level, p, fidx, elem] = pdata[data_idx]
51+
data_idx = linear_ind(sized, (level, ip, jp, elem))
52+
pperimeter_data.arrays[fidx][level, p, elem] =
53+
pdata.arrays[fidx][data_idx]
5354
end
5455
return nothing
5556
end
@@ -89,7 +90,8 @@ function dss_unload_perimeter_data_kernel!(
8990
(level, p, fidx, elem) = cart_ind(sizep, gidx).I
9091
(ip, jp) = perimeter[p]
9192
data_idx = linear_ind(sized, (level, ip, jp, fidx, elem))
92-
pdata[data_idx] = pperimeter_data[level, p, fidx, elem]
93+
pdata.arrays[fidx][data_idx] =
94+
pperimeter_data.arrays[fidx][level, p, elem]
9395
end
9496
return nothing
9597
end
@@ -148,12 +150,12 @@ function dss_local_kernel!(
148150
for idx in st:(en - 1)
149151
(lidx, vert) = local_vertices[idx]
150152
ip = perimeter_vertex_node_index(vert)
151-
sum_data += pperimeter_data[level, ip, fidx, lidx]
153+
sum_data += pperimeter_data.arrays[fidx][level, ip, lidx]
152154
end
153155
for idx in st:(en - 1)
154156
(lidx, vert) = local_vertices[idx]
155157
ip = perimeter_vertex_node_index(vert)
156-
pperimeter_data[level, ip, fidx, lidx] = sum_data
158+
pperimeter_data.arrays[fidx][level, ip, lidx] = sum_data
157159
end
158160
elseif gidx ≤ nlevels * nfidx * (nlocalvertices + nlocalfaces) # interior faces
159161
nfacedof = div(nperimeter - 4, 4)
@@ -169,10 +171,10 @@ function dss_local_kernel!(
169171
ip1 = inc1 == 1 ? first1 + i - 1 : first1 - i + 1
170172
ip2 = inc2 == 1 ? first2 + i - 1 : first2 - i + 1
171173
val =
172-
pperimeter_data[level, ip1, fidx, lidx1] +
173-
pperimeter_data[level, ip2, fidx, lidx2]
174-
pperimeter_data[level, ip1, fidx, lidx1] = val
175-
pperimeter_data[level, ip2, fidx, lidx2] = val
174+
pperimeter_data.arrays[fidx][level, ip1, lidx1] +
175+
pperimeter_data.arrays[fidx][level, ip2, lidx2]
176+
pperimeter_data.arrays[fidx][level, ip1, lidx1] = val
177+
pperimeter_data.arrays[fidx][level, ip2, lidx2] = val
176178
end
177179
end
178180

@@ -683,7 +685,8 @@ function load_from_recv_buffer_kernel!(
683685
lidx = recv_buf_idx[irecv, 1]
684686
ip = recv_buf_idx[irecv, 2]
685687
idx = level + ((fidx - 1) + (irecv - 1) * nfid) * nlevels
686-
CUDA.@atomic pperimeter_data[level, ip, fidx, lidx] += recv_data[idx]
688+
CUDA.@atomic pperimeter_data.arrays[fidx][level, ip, lidx] +=
689+
recv_data[idx]
687690
end
688691
return nothing
689692
end

0 commit comments

Comments
 (0)