Skip to content

Commit c450698

Browse files
Add HF datalayouts
Fix some bugs Widen benchmarks, fix CPU-GPU dispatch Fix code loading Specialize in gpu pointwise kernels Fix adapt call Ensure has_uniform_datalayouts for cuda copyto Dont limit recursion in get_struct_linear Bump allocation limit, add mem to benchmark stencil job Specialize HF for fused kernels Simplify n_dofs Add some docs and change HorizontalLayout to horizontal_layout_type Apply formatter Revert n_ndofs simplification Add some unit tests, and fix empty field edge cases Add some comments, use todata for some broadcasted objects Forward todata, extend onExtrudedBroadcasted Apply formatter Use more non-specific DimensionMismatch error Add a new unit test Add more unit tests with edge case fixes Fix more edge cases
1 parent 7297f5d commit c450698

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+2471
-292
lines changed

.buildkite/pipeline.yml

+31
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,10 @@ steps:
9090
key: unit_data_cartesian_field_index
9191
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_cartesian_field_index.jl"
9292

93+
- label: "Unit: non_extruded_broadcast"
94+
key: unit_non_extruded_broadcast
95+
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_non_extruded_broadcast.jl"
96+
9397
- label: "Unit: mapreduce"
9498
key: unit_data_mapreduce
9599
command: "julia --color=yes --check-bounds=yes --project=.buildkite test/DataLayouts/unit_mapreduce.jl"
@@ -1352,6 +1356,8 @@ steps:
13521356
- label: "Perf: FD operator stencil benchmarks"
13531357
key: "perf_fd_ops"
13541358
command: "julia --color=yes --project=.buildkite test/Operators/finitedifference/benchmark_stencils.jl"
1359+
agents:
1360+
slurm_mem: 20GB
13551361

13561362
- label: "Perf: GPU FD operator stencil benchmarks"
13571363
key: "gpu_perf_fd_ops"
@@ -1601,6 +1607,20 @@ steps:
16011607
agents:
16021608
slurm_gpus: 1
16031609

1610+
- label: ":computer: Float 32 3D sphere baroclinic wave (ρe) HF datalayout GPU"
1611+
key: "gpu_baroclinic_wave_rho_e_float32_hf"
1612+
command:
1613+
- "julia --color=yes --project=.buildkite examples/hybrid/driver.jl"
1614+
artifact_paths:
1615+
- "examples/hybrid/sphere/output/baroclinic_wave_rhoe_hf/Float32/*"
1616+
env:
1617+
TEST_NAME: "sphere/baroclinic_wave_rhoe_hf"
1618+
FLOAT_TYPE: "Float32"
1619+
horizontal_layout_type: "IJHF"
1620+
CLIMACOMMS_DEVICE: "CUDA"
1621+
agents:
1622+
slurm_gpus: 1
1623+
16041624
- label: ":computer: 3D Box limiters advection slotted spheres"
16051625
key: "cpu_box_advection_limiter_slotted_spheres"
16061626
command:
@@ -1870,6 +1890,17 @@ steps:
18701890
TEST_NAME: "sphere/baroclinic_wave_rhoe"
18711891
FLOAT_TYPE: "Float64"
18721892

1893+
- label: ":computer: Float 64 3D sphere baroclinic wave (ρe) HF datalayout"
1894+
key: "cpu_baroclinic_wave_rho_e_float64_hf"
1895+
command:
1896+
- "julia --color=yes --project=.buildkite examples/hybrid/driver.jl"
1897+
artifact_paths:
1898+
- "examples/hybrid/sphere/output/baroclinic_wave_rhoe_hf/Float64/*"
1899+
env:
1900+
TEST_NAME: "sphere/baroclinic_wave_rhoe_hf"
1901+
FLOAT_TYPE: "Float64"
1902+
horizontal_layout_type: "IJHF"
1903+
18731904
- label: ":computer: 3D sphere baroclinic wave (ρe)"
18741905
key: "cpu_baroclinic_wave_rho_e"
18751906
command:

NEWS.md

+5-1
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,15 @@ ClimaCore.jl Release Notes
44
main
55
-------
66

7-
- Fixed world-age issue on Julia 1.11 issue [Julia#54780](https://github.com/JuliaLang/julia/issues/54780), PR [#2034](https://github.com/CliMA/ClimaCore.jl/pull/2034).
7+
- We've added new datalayouts: `VIJHF`,`IJHF`,`IHF`,`VIHF`, to explore their performance compared to our existing datalayouts: `VIJFH`,`IJFH`,`IFH`,`VIFH`. PR [#2053](https://github.com/CliMA/ClimaCore.jl/pull/2053), PR [#2055](https://github.com/CliMA/ClimaCore.jl/pull/2055).
8+
- We've refactored some modules to use less internals. PR [#2053](https://github.com/CliMA/ClimaCore.jl/pull/2053), PR [#2052](https://github.com/CliMA/ClimaCore.jl/pull/2052), [#2051](https://github.com/CliMA/ClimaCore.jl/pull/2051), [#2049](https://github.com/CliMA/ClimaCore.jl/pull/2049).
9+
- Some work was done in an attempt to reduce specializations and compile time. PR [#2042](https://github.com/CliMA/ClimaCore.jl/pull/2042), [#2041](https://github.com/CliMA/ClimaCore.jl/pull/2041).
810

911
v0.14.19
1012
-------
1113

14+
- Fixed world-age issue on Julia 1.11 issue [Julia#54780](https://github.com/JuliaLang/julia/issues/54780), PR [#2034](https://github.com/CliMA/ClimaCore.jl/pull/2034).
15+
1216
### ![][badge-🐛bugfix] Fix undefined behavior in `DataLayout`s
1317

1418
PR [#2034](https://github.com/CliMA/ClimaCore.jl/pull/2034) fixes some undefined

docs/src/api.md

+4
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ DataLayouts.IFH
3232
DataLayouts.IJFH
3333
DataLayouts.VIFH
3434
DataLayouts.VIJFH
35+
DataLayouts.IHF
36+
DataLayouts.IJHF
37+
DataLayouts.VIHF
38+
DataLayouts.VIJHF
3539
```
3640

3741
## Geometry

examples/common_spaces.jl

+12-2
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,19 @@ function make_horizontal_space(
3535
mesh,
3636
npoly,
3737
context::ClimaComms.SingletonCommsContext,
38+
horizontal_layout_type = DataLayouts.IJFH,
3839
)
3940
quad = Quadratures.GLL{npoly + 1}()
4041
if mesh isa Meshes.AbstractMesh1D
4142
topology = Topologies.IntervalTopology(ClimaComms.device(context), mesh)
4243
space = Spaces.SpectralElementSpace1D(topology, quad)
4344
elseif mesh isa Meshes.AbstractMesh2D
4445
topology = Topologies.Topology2D(context, mesh)
45-
space = Spaces.SpectralElementSpace2D(topology, quad)
46+
space = Spaces.SpectralElementSpace2D(
47+
topology,
48+
quad;
49+
horizontal_layout_type,
50+
)
4651
end
4752
return space
4853
end
@@ -51,13 +56,18 @@ function make_horizontal_space(
5156
mesh,
5257
npoly,
5358
comms_ctx::ClimaComms.MPICommsContext,
59+
horizontal_layout_type = DataLayouts.IJFH,
5460
)
5561
quad = Quadratures.GLL{npoly + 1}()
5662
if mesh isa Meshes.AbstractMesh1D
5763
error("Distributed mode does not work with 1D horizontal spaces.")
5864
elseif mesh isa Meshes.AbstractMesh2D
5965
topology = Topologies.Topology2D(comms_ctx, mesh)
60-
space = Spaces.SpectralElementSpace2D(topology, quad)
66+
space = Spaces.SpectralElementSpace2D(
67+
topology,
68+
quad;
69+
horizontal_layout_type,
70+
)
6171
end
6272
return space
6373
end

examples/hybrid/driver.jl

+16-2
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ ClimaComms.@import_required_backends
3030
import SciMLBase
3131
const comms_ctx = ClimaComms.context()
3232
is_distributed = comms_ctx isa ClimaComms.MPICommsContext
33+
using ClimaCore: DataLayouts
3334

3435
using Logging
3536

@@ -91,7 +92,17 @@ if haskey(ENV, "RESTART_FILE")
9192
ᶠlocal_geometry = Fields.local_geometry_field(Y.f)
9293
else
9394
t_start = FT(0)
94-
h_space = make_horizontal_space(horizontal_mesh, npoly, comms_ctx)
95+
horizontal_layout_types = Dict()
96+
horizontal_layout_types["IJFH"] = DataLayouts.IJFH
97+
horizontal_layout_types["IJHF"] = DataLayouts.IJHF
98+
horizontal_layout_type =
99+
horizontal_layout_types[get(ENV, "horizontal_layout_type", "IJFH")]
100+
h_space = make_horizontal_space(
101+
horizontal_mesh,
102+
npoly,
103+
comms_ctx,
104+
horizontal_layout_type,
105+
)
95106
center_space, face_space =
96107
make_hybrid_spaces(h_space, z_max, z_elem; z_stretch)
97108
ᶜlocal_geometry = Fields.local_geometry_field(center_space)
@@ -231,5 +242,8 @@ end
231242
if !is_distributed || ClimaComms.iamroot(comms_ctx)
232243
println("Walltime = $walltime seconds")
233244
ENV["GKSwstype"] = "nul" # avoid displaying plots
234-
postprocessing(sol, output_dir)
245+
# https://github.com/CliMA/ClimaCore.jl/issues/2058
246+
if !(Fields.field_values(sol.u[1].c) isa DataLayouts.VIJHF)
247+
postprocessing(sol, output_dir)
248+
end
235249
end
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
using ClimaCorePlots, Plots
2+
using ClimaCore.DataLayouts
3+
4+
include("baroclinic_wave_utilities.jl")
5+
6+
const sponge = false
7+
8+
# Variables required for driver.jl (modify as needed)
9+
horizontal_mesh = cubed_sphere_mesh(; radius = R, h_elem = 4)
10+
npoly = 4
11+
z_max = FT(30e3)
12+
z_elem = 10
13+
t_end = FT(60 * 60 * 24 * 10)
14+
dt = FT(400)
15+
dt_save_to_sol = FT(60 * 60 * 24)
16+
dt_save_to_disk = FT(0) # 0 means don't save to disk
17+
ode_algorithm = CTS.SSP333
18+
jacobian_flags = (; ∂ᶜ𝔼ₜ∂ᶠ𝕄_mode = :no_∂ᶜp∂ᶜK, ∂ᶠ𝕄ₜ∂ᶜρ_mode = :exact)
19+
20+
additional_cache(ᶜlocal_geometry, ᶠlocal_geometry, dt) = merge(
21+
hyperdiffusion_cache(ᶜlocal_geometry, ᶠlocal_geometry; κ₄ = FT(2e17)),
22+
sponge ? rayleigh_sponge_cache(ᶜlocal_geometry, ᶠlocal_geometry, dt) : (;),
23+
)
24+
function additional_tendency!(Yₜ, Y, p, t)
25+
hyperdiffusion_tendency!(Yₜ, Y, p, t)
26+
sponge && rayleigh_sponge_tendency!(Yₜ, Y, p, t)
27+
end
28+
29+
center_initial_condition(local_geometry) =
30+
center_initial_condition(local_geometry, Val(:ρe))
31+
function postprocessing(sol, output_dir)
32+
@info "L₂ norm of ρe at t = $(sol.t[1]): $(norm(sol.u[1].c.ρe))"
33+
@info "L₂ norm of ρe at t = $(sol.t[end]): $(norm(sol.u[end].c.ρe))"
34+
35+
anim = Plots.@animate for Y in sol.u
36+
ᶜv = Geometry.UVVector.(Y.c.uₕ).components.data.:2
37+
Plots.plot(ᶜv, level = 3, clim = (-6, 6))
38+
end
39+
Plots.mp4(anim, joinpath(output_dir, "v.mp4"), fps = 5)
40+
end

ext/cuda/data_layouts.jl

+19-16
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11

22
import ClimaCore.DataLayouts: AbstractData
33
import ClimaCore.DataLayouts: FusedMultiBroadcast
4-
import ClimaCore.DataLayouts: IJKFVH, IJFH, VIJFH, VIFH, IFH, IJF, IF, VF, DataF
4+
import ClimaCore.DataLayouts:
5+
IJKFVH, IJFH, IJHF, VIJFH, VIJHF, VIFH, VIHF, IFH, IHF, IJF, IF, VF, DataF
56
import ClimaCore.DataLayouts: IJFHStyle, VIJFHStyle, VFStyle, DataFStyle
7+
import ClimaCore.DataLayouts: IJHFStyle, VIJHFStyle
68
import ClimaCore.DataLayouts: promote_parent_array_type
79
import ClimaCore.DataLayouts: parent_array_type
810
import ClimaCore.DataLayouts: isascalar
@@ -34,23 +36,24 @@ include("data_layouts_threadblock.jl")
3436
adapt_f(to, f::F) where {F} = Adapt.adapt(to, f)
3537
adapt_f(to, ::Type{F}) where {F} = (x...) -> F(x...)
3638

39+
function Adapt.adapt_structure(
40+
to::CUDA.KernelAdaptor,
41+
bc::DataLayouts.NonExtrudedBroadcasted{Style},
42+
) where {Style}
43+
DataLayouts.NonExtrudedBroadcasted{Style}(
44+
adapt_f(to, bc.f),
45+
Adapt.adapt(to, bc.args),
46+
Adapt.adapt(to, bc.axes),
47+
)
48+
end
49+
3750
function Adapt.adapt_structure(
3851
to::CUDA.KernelAdaptor,
3952
fmbc::FusedMultiBroadcast,
4053
)
41-
FusedMultiBroadcast(
42-
map(fmbc.pairs) do pair
43-
dest = pair.first
44-
bc = pair.second
45-
Pair(
46-
Adapt.adapt(to, dest),
47-
Base.Broadcast.Broadcasted(
48-
bc.style,
49-
adapt_f(to, bc.f),
50-
Adapt.adapt(to, bc.args),
51-
Adapt.adapt(to, bc.axes),
52-
),
53-
)
54-
end,
55-
)
54+
FusedMultiBroadcast(map(fmbc.pairs) do pair
55+
dest = pair.first
56+
bc = pair.second
57+
Pair(Adapt.adapt(to, dest), Adapt.adapt(to, bc))
58+
end)
5659
end

ext/cuda/data_layouts_copyto.jl

+67-15
Original file line numberDiff line numberDiff line change
@@ -8,22 +8,74 @@ function knl_copyto!(dest, src, us)
88
return nothing
99
end
1010

11-
function Base.copyto!(dest::AbstractData, bc, ::ToCUDA)
12-
(_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
13-
us = DataLayouts.UniversalSize(dest)
14-
if Nv > 0 && Nh > 0
15-
args = (dest, bc, us)
16-
threads = threads_via_occupancy(knl_copyto!, args)
17-
n_max_threads = min(threads, get_N(us))
18-
p = partition(dest, n_max_threads)
19-
auto_launch!(
20-
knl_copyto!,
21-
args;
22-
threads_s = p.threads,
23-
blocks_s = p.blocks,
24-
)
11+
function knl_copyto_linear!(dest, src, us)
12+
i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
13+
if linear_is_valid_index(i, us)
14+
@inbounds dest[i] = src[i]
15+
end
16+
return nothing
17+
end
18+
19+
if VERSION ≥ v"1.11.0-beta"
20+
# https://github.com/JuliaLang/julia/issues/56295
21+
# Julia 1.11's Base.Broadcast currently requires
22+
# multiple integer indexing, wheras Julia 1.10 did not.
23+
# This means that we cannot reserve linear indexing to
24+
# special-case fixes for https://github.com/JuliaLang/julia/issues/28126
25+
# (including the GPU-variant related issue resolution efforts:
26+
# JuliaGPU/GPUArrays.jl#454, JuliaGPU/GPUArrays.jl#464).
27+
function Base.copyto!(dest::AbstractData, bc, ::ToCUDA)
28+
(_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
29+
us = DataLayouts.UniversalSize(dest)
30+
if Nv > 0 && Nh > 0
31+
args = (dest, bc, us)
32+
threads = threads_via_occupancy(knl_copyto!, args)
33+
n_max_threads = min(threads, get_N(us))
34+
p = partition(dest, n_max_threads)
35+
auto_launch!(
36+
knl_copyto!,
37+
args;
38+
threads_s = p.threads,
39+
blocks_s = p.blocks,
40+
)
41+
end
42+
return dest
43+
end
44+
else
45+
function Base.copyto!(dest::AbstractData, bc, ::ToCUDA)
46+
(_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
47+
us = DataLayouts.UniversalSize(dest)
48+
if Nv > 0 && Nh > 0
49+
if DataLayouts.has_uniform_datalayouts(bc) &&
50+
dest isa DataLayouts.EndsWithField
51+
bc′ = Base.Broadcast.instantiate(
52+
DataLayouts.to_non_extruded_broadcasted(bc),
53+
)
54+
args = (dest, bc′, us)
55+
threads = threads_via_occupancy(knl_copyto_linear!, args)
56+
n_max_threads = min(threads, get_N(us))
57+
p = linear_partition(prod(size(dest)), n_max_threads)
58+
auto_launch!(
59+
knl_copyto_linear!,
60+
args;
61+
threads_s = p.threads,
62+
blocks_s = p.blocks,
63+
)
64+
else
65+
args = (dest, bc, us)
66+
threads = threads_via_occupancy(knl_copyto!, args)
67+
n_max_threads = min(threads, get_N(us))
68+
p = partition(dest, n_max_threads)
69+
auto_launch!(
70+
knl_copyto!,
71+
args;
72+
threads_s = p.threads,
73+
blocks_s = p.blocks,
74+
)
75+
end
76+
end
77+
return dest
2578
end
26-
return dest
2779
end
2880

2981
# broadcasting scalar assignment

ext/cuda/data_layouts_fill.jl

+30-10
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,40 @@ function knl_fill!(dest, val, us)
66
return nothing
77
end
88

9+
function knl_fill_linear!(dest, val, us)
10+
i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
11+
if linear_is_valid_index(i, us)
12+
@inbounds dest[i] = val
13+
end
14+
return nothing
15+
end
16+
917
function Base.fill!(dest::AbstractData, bc, ::ToCUDA)
1018
(_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
1119
us = DataLayouts.UniversalSize(dest)
20+
args = (dest, bc, us)
1221
if Nv > 0 && Nh > 0
13-
args = (dest, bc, us)
14-
threads = threads_via_occupancy(knl_fill!, args)
15-
n_max_threads = min(threads, get_N(us))
16-
p = partition(dest, n_max_threads)
17-
auto_launch!(
18-
knl_fill!,
19-
args;
20-
threads_s = p.threads,
21-
blocks_s = p.blocks,
22-
)
22+
if !(VERSION ≥ v"1.11.0-beta") && dest isa DataLayouts.EndsWithField
23+
threads = threads_via_occupancy(knl_fill_linear!, args)
24+
n_max_threads = min(threads, get_N(us))
25+
p = linear_partition(prod(size(dest)), n_max_threads)
26+
auto_launch!(
27+
knl_fill_linear!,
28+
args;
29+
threads_s = p.threads,
30+
blocks_s = p.blocks,
31+
)
32+
else
33+
threads = threads_via_occupancy(knl_fill!, args)
34+
n_max_threads = min(threads, get_N(us))
35+
p = partition(dest, n_max_threads)
36+
auto_launch!(
37+
knl_fill!,
38+
args;
39+
threads_s = p.threads,
40+
blocks_s = p.blocks,
41+
)
42+
end
2343
end
2444
return dest
2545
end

0 commit comments

Comments
 (0)