
Commit 226bec3

Use prescribed thread-block configurations
1 parent add3b1b commit 226bec3

14 files changed: +372 −252 lines

ext/ClimaCoreCUDAExt.jl  (+1)

@@ -17,6 +17,7 @@ import ClimaCore.Utilities: cart_ind, linear_ind
 import ClimaCore.RecursiveApply:
     ⊞, ⊠, ⊟, radd, rmul, rsub, rdiv, rmap, rzero, rmin, rmax
 import ClimaCore.DataLayouts: get_N, get_Nv, get_Nij, get_Nij, get_Nh
+import ClimaCore.DataLayouts: UniversalSize

 include(joinpath("cuda", "cuda_utils.jl"))
 include(joinpath("cuda", "data_layouts.jl"))

ext/cuda/cuda_utils.jl  (+6)

@@ -90,6 +90,12 @@ function auto_launch!(
     return nothing
 end

+function threads_via_occupancy(f!::F!, args) where {F!}
+    kernel = CUDA.@cuda always_inline = true launch = false f!(args...)
+    config = CUDA.launch_configuration(kernel.fun)
+    return config.threads
+end
+
 """
     thread_index()

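For reference, here is a minimal stand-alone sketch of the CUDA.jl occupancy query that threads_via_occupancy wraps. The saxpy! kernel and its arguments are illustrative only and not part of this commit:

using CUDA

function saxpy!(y, a, x)
    i = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
    if i <= length(y)
        @inbounds y[i] += a * x[i]
    end
    return nothing
end

x = CUDA.ones(Float32, 10_000)
y = CUDA.zeros(Float32, 10_000)
kernel = CUDA.@cuda launch = false saxpy!(y, 2.0f0, x)   # compile without launching
config = CUDA.launch_configuration(kernel.fun)           # occupancy-derived (blocks, threads)
config.threads                                           # the per-launch thread budget used above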
ext/cuda/data_layouts.jl  (+1)

@@ -29,6 +29,7 @@ include("data_layouts_fill.jl")
 include("data_layouts_copyto.jl")
 include("data_layouts_fused_copyto.jl")
 include("data_layouts_mapreduce.jl")
+include("data_layouts_threadblock.jl")

 adapt_f(to, f::F) where {F} = Adapt.adapt(to, f)
 adapt_f(to, ::Type{F}) where {F} = (x...) -> F(x...)

ext/cuda/data_layouts_copyto.jl  (+17 −90)

@@ -1,111 +1,38 @@
 DataLayouts._device_dispatch(x::CUDA.CuArray) = ToCUDA()

-function knl_copyto!(dest, src)
-
-    i = CUDA.threadIdx().x
-    j = CUDA.threadIdx().y
-
-    h = CUDA.blockIdx().x
-    v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z
-
-    if v <= size(dest, 4)
-        I = CartesianIndex((i, j, 1, v, h))
+function knl_copyto!(dest, src, us)
+    I = universal_index(dest)
+    if is_valid_index(dest, I, us)
         @inbounds dest[I] = src[I]
     end
     return nothing
 end

-function Base.copyto!(
-    dest::IJFH{S, Nij, Nh},
-    bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh},
-    ::ToCUDA,
-) where {S, Nij, Nh}
-    if Nh > 0
-        auto_launch!(
-            knl_copyto!,
-            (dest, bc);
-            threads_s = (Nij, Nij),
-            blocks_s = (Nh, 1),
-        )
-    end
-    return dest
-end
-
-function Base.copyto!(
-    dest::VIJFH{S, Nv, Nij, Nh},
-    bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh},
-    ::ToCUDA,
-) where {S, Nv, Nij, Nh}
-    if Nv > 0 && Nh > 0
-        Nv_per_block = min(Nv, fld(256, Nij * Nij))
-        Nv_blocks = cld(Nv, Nv_per_block)
-        auto_launch!(
-            knl_copyto!,
-            (dest, bc);
-            threads_s = (Nij, Nij, Nv_per_block),
-            blocks_s = (Nh, Nv_blocks),
-        )
-    end
-    return dest
-end
-
-function Base.copyto!(
-    dest::VF{S, Nv},
-    bc::DataLayouts.BroadcastedUnionVF{S, Nv},
-    ::ToCUDA,
-) where {S, Nv}
-    if Nv > 0
-        auto_launch!(
-            knl_copyto!,
-            (dest, bc);
-            threads_s = (1, 1),
-            blocks_s = (1, Nv),
-        )
-    end
-    return dest
-end
-
-function Base.copyto!(
-    dest::DataF{S},
-    bc::DataLayouts.BroadcastedUnionDataF{S},
-    ::ToCUDA,
-) where {S}
-    auto_launch!(knl_copyto!, (dest, bc); threads_s = (1, 1), blocks_s = (1, 1))
-    return dest
-end
-
-import ClimaCore.DataLayouts: isascalar
-function knl_copyto_flat!(dest::AbstractData, bc, us)
-    @inbounds begin
-        tidx = thread_index()
-        if tidx ≤ get_N(us)
-            n = size(dest)
-            I = kernel_indexes(tidx, n)
-            dest[I] = bc[I]
-        end
-    end
-    return nothing
-end
-
 function cuda_copyto!(dest::AbstractData, bc)
     (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
     us = DataLayouts.UniversalSize(dest)
     if Nv > 0 && Nh > 0
-        nitems = prod(DataLayouts.universal_size(dest))
-        auto_launch!(knl_copyto_flat!, (dest, bc, us), nitems; auto = true)
+        args = (dest, bc, us)
+        threads = threads_via_occupancy(knl_copyto!, args)
+        n_max_threads = min(threads, get_N(us))
+        p = partition(dest, n_max_threads)
+        auto_launch!(
+            knl_copyto!,
+            args;
+            threads_s = p.threads,
+            blocks_s = p.blocks,
+        )
     end
     return dest
 end

-# TODO: can we use CUDA's luanch configuration for all data layouts?
-# Currently, it seems to have a slight performance degradation.
 #! format: off
-# Base.copyto!(dest::IJFH{S, Nij}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh}, ::ToCUDA) where {S, Nij, Nh} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::IJFH{S, Nij}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh}, ::ToCUDA) where {S, Nij, Nh} = cuda_copyto!(dest, bc)
 Base.copyto!(dest::IFH{S, Ni, Nh}, bc::DataLayouts.BroadcastedUnionIFH{S, Ni, Nh}, ::ToCUDA) where {S, Ni, Nh} = cuda_copyto!(dest, bc)
 Base.copyto!(dest::IJF{S, Nij}, bc::DataLayouts.BroadcastedUnionIJF{S, Nij}, ::ToCUDA) where {S, Nij} = cuda_copyto!(dest, bc)
 Base.copyto!(dest::IF{S, Ni}, bc::DataLayouts.BroadcastedUnionIF{S, Ni}, ::ToCUDA) where {S, Ni} = cuda_copyto!(dest, bc)
 Base.copyto!(dest::VIFH{S, Nv, Ni, Nh}, bc::DataLayouts.BroadcastedUnionVIFH{S, Nv, Ni, Nh}, ::ToCUDA) where {S, Nv, Ni, Nh} = cuda_copyto!(dest, bc)
-# Base.copyto!(dest::VIJFH{S, Nv, Nij, Nh}, bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh}, ::ToCUDA) where {S, Nv, Nij, Nh} = cuda_copyto!(dest, bc)
-# Base.copyto!(dest::VF{S, Nv}, bc::DataLayouts.BroadcastedUnionVF{S, Nv}, ::ToCUDA) where {S, Nv} = cuda_copyto!(dest, bc)
-# Base.copyto!(dest::DataF{S}, bc::DataLayouts.BroadcastedUnionDataF{S}, ::ToCUDA) where {S} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::VIJFH{S, Nv, Nij, Nh}, bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh}, ::ToCUDA) where {S, Nv, Nij, Nh} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::VF{S, Nv}, bc::DataLayouts.BroadcastedUnionVF{S, Nv}, ::ToCUDA) where {S, Nv} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::DataF{S}, bc::DataLayouts.BroadcastedUnionDataF{S}, ::ToCUDA) where {S} = cuda_copyto!(dest, bc)
 #! format: on

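The helpers universal_index, is_valid_index, and partition used above come from the newly included ext/cuda/data_layouts_threadblock.jl, which is not shown in this excerpt. As a rough illustration only (not ClimaCore's actual implementation), a partition for a VIJFH-style (i, j, level, element) iteration space could cap the number of vertical levels per block by the occupancy-derived thread budget, much as the removed code did with a hard-coded 256:

# Hypothetical sketch of a prescribed thread/block partition for a VIJFH-like layout.
function sketch_partition_vijfh(Nij, Nv, Nh, n_max_threads)
    # vertical levels that fit in one block given Nij × Nij horizontal threads (at least 1)
    Nv_per_block = max(1, min(Nv, fld(n_max_threads, Nij * Nij)))
    Nv_blocks = cld(Nv, Nv_per_block)   # blocks needed to cover the vertical extent
    return (; threads = (Nij, Nij, Nv_per_block), blocks = (Nh, Nv_blocks))
end

sketch_partition_vijfh(4, 63, 5400, 256)  # -> (threads = (4, 4, 16), blocks = (5400, 4))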
ext/cuda/data_layouts_fill.jl  (+16 −21)

@@ -1,32 +1,27 @@
-function knl_fill_flat!(dest::AbstractData, val, us)
-    @inbounds begin
-        tidx = thread_index()
-        if tidx ≤ get_N(us)
-            n = size(dest)
-            I = kernel_indexes(tidx, n)
-            @inbounds dest[I] = val
-        end
+function knl_fill!(dest, val, us)
+    I = universal_index(dest)
+    if is_valid_index(dest, I, us)
+        @inbounds dest[I] = val
     end
     return nothing
 end

-function cuda_fill!(dest::AbstractData, val)
+function cuda_fill!(dest::AbstractData, bc)
     (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
     us = DataLayouts.UniversalSize(dest)
     if Nv > 0 && Nh > 0
-        nitems = prod(DataLayouts.universal_size(dest))
-        auto_launch!(knl_fill_flat!, (dest, val, us), nitems; auto = true)
+        args = (dest, bc, us)
+        threads = threads_via_occupancy(knl_fill!, args)
+        n_max_threads = min(threads, get_N(us))
+        p = partition(dest, n_max_threads)
+        auto_launch!(
+            knl_fill!,
+            args;
+            threads_s = p.threads,
+            blocks_s = p.blocks,
+        )
     end
     return dest
 end

-#! format: off
-Base.fill!(dest::IJFH{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::IFH{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::IJF{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::IF{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::VIFH{<:Any, <:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::VIJFH{<:Any, <:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::VF{<:Any, <:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-Base.fill!(dest::DataF{<:Any}, val, ::ToCUDA) = cuda_fill!(dest, val)
-#! format: on
+Base.fill!(dest::AbstractData, val, ::ToCUDA) = cuda_fill!(dest, val)

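cuda_fill! follows the same launch flow as cuda_copyto!: query the occupancy limit, cap it by the item count, and let partition choose a block shape whose grid may over-cover the data, relying on the is_valid_index guard inside the kernel. A plain CUDA.jl analogue of that guarded fill, with hypothetical names and a flat 1-D index instead of ClimaCore's universal index:

using CUDA

function fill_guarded!(x, val, n)
    i = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
    if i <= n                       # analogue of is_valid_index: skip over-covered threads
        @inbounds x[i] = val
    end
    return nothing
end

x = CUDA.zeros(Float32, 1_000)
kernel = CUDA.@cuda launch = false fill_guarded!(x, 1.0f0, length(x))
threads = min(length(x), CUDA.launch_configuration(kernel.fun).threads)
blocks = cld(length(x), threads)    # last block may run past n; the kernel guard handles the tail
kernel(x, 1.0f0, length(x); threads, blocks)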
ext/cuda/data_layouts_fused_copyto.jl  (+27 −74)

@@ -1,106 +1,59 @@
 Base.@propagate_inbounds function rcopyto_at!(
     pair::Pair{<:AbstractData, <:Any},
     I,
-    v,
+    us,
 )
     dest, bc = pair.first, pair.second
-    if 1 ≤ v <= size(dest, 4)
+    if is_valid_index(dest, I, us)
         dest[I] = isascalar(bc) ? bc[] : bc[I]
     end
     return nothing
 end
-Base.@propagate_inbounds function rcopyto_at!(pair::Pair{<:DataF, <:Any}, I, v)
+Base.@propagate_inbounds function rcopyto_at!(pair::Pair{<:DataF, <:Any}, I, us)
     dest, bc = pair.first, pair.second
-    if 1 ≤ v <= size(dest, 4)
+    if is_valid_index(dest, I, us)
         bcI = isascalar(bc) ? bc[] : bc[I]
         dest[] = bcI
     end
     return nothing
 end
-Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I, v)
-    rcopyto_at!(first(pairs), I, v)
-    rcopyto_at!(Base.tail(pairs), I, v)
+Base.@propagate_inbounds function rcopyto_at!(pairs::Tuple, I, us)
+    rcopyto_at!(first(pairs), I, us)
+    rcopyto_at!(Base.tail(pairs), I, us)
 end
-Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, I, v) =
-    rcopyto_at!(first(pairs), I, v)
-@inline rcopyto_at!(pairs::Tuple{}, I, v) = nothing
-
-function knl_fused_copyto!(fmbc::FusedMultiBroadcast)
+Base.@propagate_inbounds rcopyto_at!(pairs::Tuple{<:Any}, I, us) =
+    rcopyto_at!(first(pairs), I, us)
+@inline rcopyto_at!(pairs::Tuple{}, I, us) = nothing

+function knl_fused_copyto!(fmbc::FusedMultiBroadcast, dest1, us)
     @inbounds begin
-        i = CUDA.threadIdx().x
-        j = CUDA.threadIdx().y
-
-        h = CUDA.blockIdx().x
-        v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z
-        (; pairs) = fmbc
-        I = CartesianIndex((i, j, 1, v, h))
-        rcopyto_at!(pairs, I, v)
+        I = universal_index(dest1)
+        if is_valid_index(dest1, I, us)
+            (; pairs) = fmbc
+            rcopyto_at!(pairs, I, us)
+        end
     end
     return nothing
 end

 function fused_copyto!(
     fmbc::FusedMultiBroadcast,
-    dest1::VIJFH{S, Nv, Nij, Nh},
+    dest1::DataLayouts.AbstractData,
     ::ToCUDA,
-) where {S, Nv, Nij, Nh}
-    if Nv > 0 && Nh > 0
-        Nv_per_block = min(Nv, fld(256, Nij * Nij))
-        Nv_blocks = cld(Nv, Nv_per_block)
-        auto_launch!(
-            knl_fused_copyto!,
-            (fmbc,);
-            threads_s = (Nij, Nij, Nv_per_block),
-            blocks_s = (Nh, Nv_blocks),
-        )
-    end
-    return nothing
-end
-
-function fused_copyto!(
-    fmbc::FusedMultiBroadcast,
-    dest1::IJFH{S, Nij},
-    ::ToCUDA,
-) where {S, Nij}
-    _, _, _, _, Nh = size(dest1)
-    if Nh > 0
-        auto_launch!(
-            knl_fused_copyto!,
-            (fmbc,);
-            threads_s = (Nij, Nij),
-            blocks_s = (Nh, 1),
-        )
-    end
-    return nothing
-end
-function fused_copyto!(
-    fmbc::FusedMultiBroadcast,
-    dest1::VF{S, Nv},
-    ::ToCUDA,
-) where {S, Nv}
-    _, _, _, _, Nh = size(dest1)
+)
+    (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest1)
     if Nv > 0 && Nh > 0
+        us = DataLayouts.UniversalSize(dest1)
+        args = (fmbc, dest1, us)
+        threads = threads_via_occupancy(knl_fused_copyto!, args)
+        n_max_threads = min(threads, get_N(us))
+        p = partition(dest1, n_max_threads)
         auto_launch!(
             knl_fused_copyto!,
-            (fmbc,);
-            threads_s = (1, 1),
-            blocks_s = (Nh, Nv),
+            args;
+            threads_s = p.threads,
+            blocks_s = p.blocks,
         )
     end
     return nothing
 end
-
-function fused_copyto!(
-    fmbc::FusedMultiBroadcast,
-    dest1::DataF{S},
-    ::ToCUDA,
-) where {S}
-    auto_launch!(
-        knl_fused_copyto!,
-        (fmbc,);
-        threads_s = (1, 1),
-        blocks_s = (1, 1),
-    )
-    return nothing
-end

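A side note on rcopyto_at!: it walks the tuple of (destination => broadcasted) pairs by recursion rather than a loop, which keeps the fused kernel type-stable and allocation-free even when the pairs have heterogeneous types. A minimal stand-alone analogue of that recursion, using hypothetical names and plain arrays rather than ClimaCore data layouts:

# Peel the head pair, recurse on the tail; the empty tuple terminates the recursion.
@inline visit_at!(pairs::Tuple, I) =
    (visit_at!(first(pairs), I); visit_at!(Base.tail(pairs), I))
@inline visit_at!(::Tuple{}, I) = nothing
@inline visit_at!(pair::Pair, I) = (pair.first[I] = pair.second[I]; nothing)

a, b = zeros(3), zeros(3)
visit_at!((a => fill(1.0, 3), b => fill(2.0, 3)), 2)  # sets a[2] = 1.0 and b[2] = 2.0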