Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Drop field dimension to demo thermo benchmark #1929

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 14 additions & 12 deletions benchmarks/scripts/thermo_bench_bw.jl
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ using BenchmarkTools
import .TestUtilities as TU;

using Test
@testset "Thermo state" begin
# @testset "Thermo state" begin
FT = Float32
bm = TBB.Benchmark(;problem_size=(63,4,4,1,5400), float_type=FT)
device = ClimaComms.device()
Expand All @@ -175,7 +175,8 @@ using Test
)
x = fill((; ts = zero(TBB.PhaseEquil{FT}), nt_core...), cspace)
xv = fill((; ts = nt_ts, nt_core...), cspace)
(_, Nij, _, Nv, Nh) = size(Fields.field_values(x.ts))
fv_ts = Fields.field_values(x.ts)
(_, Nij, _, Nv, Nh) = size(fv_ts)
us = TBB.UniversalSizesStatic(Nv, Nij, Nh)
function to_vec(ξ)
pns = propertynames(ξ)
Expand All @@ -186,7 +187,7 @@ using Test
end
return (; zip(propertynames(ξ), dl_vals)...)
end
x_vec = to_vec(xv)
# x_vec = to_vec(xv)

x_aos = fill((; ρ_read = FT(0), ρ_write = FT(0)), cspace)
x_soa = (;
Expand All @@ -199,20 +200,21 @@ using Test
@. x_aos.ρ_write = 7
TBB.singlefield_bc!(x_soa, us; nreps=1, n_trials = 1)
TBB.singlefield_bc!(x_aos, us; nreps=1, n_trials = 1)

TBB.thermo_func_bc!(x, us; nreps=1, n_trials = 1)
TBB.thermo_func_sol!(x_vec, us; nreps=1, n_trials = 1)
# TBB.thermo_func_sol!(x_vec, us; nreps=1, n_trials = 1)

rc = Fields.rcompare(x_vec, to_vec(x))
rc || Fields.@rprint_diff(x_vec, to_vec(x)) # test correctness (should print nothing)
@test rc # test correctness
# rc = Fields.rcompare(x_vec, to_vec(x))
# rc || Fields.@rprint_diff(x_vec, to_vec(x)) # test correctness (should print nothing)
# @test rc # test correctness

TBB.singlefield_bc!(x_soa, us; nreps=100, bm)
TBB.singlefield_bc!(x_aos, us; nreps=100, bm)
# TBB.singlefield_bc!(x_soa, us; nreps=100, bm)
# TBB.singlefield_bc!(x_aos, us; nreps=100, bm)
TBB.thermo_func_bc!(x, us; nreps=100, bm)
TBB.thermo_func_sol!(x_vec, us; nreps=100, bm)
@info "Success!"
# TBB.thermo_func_sol!(x_vec, us; nreps=100, bm)

TBB.tabulate_benchmark(bm)

end
# end
#! format: on
1 change: 1 addition & 0 deletions ext/ClimaCoreCUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import ClimaCore.Utilities: cart_ind, linear_ind
import ClimaCore.RecursiveApply:
⊠, ⊞, ⊟, radd, rmul, rsub, rdiv, rmap, rzero, rmin, rmax
import ClimaCore.DataLayouts: get_N, get_Nv, get_Nij, get_Nij, get_Nh
import ClimaCore.DataLayouts: universal_size, UniversalSize

include(joinpath("cuda", "cuda_utils.jl"))
include(joinpath("cuda", "data_layouts.jl"))
Expand Down
24 changes: 24 additions & 0 deletions ext/cuda/data_layouts.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,17 @@ import CUDA
# Map a `CuArray` type with element type `T` and buffer parameter `B` to the
# rank-agnostic family `CuArray{T, N, B} where {N}`.
function parent_array_type(
    ::Type{<:CUDA.CuArray{T, N, B} where {N}},
) where {T, B}
    return CUDA.CuArray{T, N, B} where {N}
end

# Can we remove this?
# parent_array_type(
# ::Type{<:CUDA.CuArray{T, N, B} where {N}},
# ::Val{ND},
# ) where {T, B, ND} = CUDA.CuArray{T, ND, B}

# Concrete `CuArray` parent type for element type `T` and buffer parameter `B`,
# with the array rank taken from `ndims(as)`.
function parent_array_type(
    ::Type{<:CUDA.CuArray{T, N, B} where {N}},
    as::ArraySize,
) where {T, B}
    rank = ndims(as)
    return CUDA.CuArray{T, rank, B}
end

# Ensure that both parent array types have the same memory buffer type.
promote_parent_array_type(
::Type{CUDA.CuArray{T1, N, B} where {N}},
Expand Down Expand Up @@ -53,3 +64,16 @@ function Adapt.adapt_structure(
end,
)
end

import Adapt
import CUDA
# Adapt a `NonExtrudedBroadcasted` under the `CUDA.KernelAdaptor`: its function,
# arguments and axes are adapted one by one (in that order) and the object is
# rebuilt with the same `Style` parameter.
function Adapt.adapt_structure(
    to::CUDA.KernelAdaptor,
    bc::DataLayouts.NonExtrudedBroadcasted{Style},
) where {Style}
    adapted_f = adapt_f(to, bc.f)
    adapted_args = Adapt.adapt(to, bc.args)
    adapted_axes = Adapt.adapt(to, bc.axes)
    return DataLayouts.NonExtrudedBroadcasted{Style}(
        adapted_f,
        adapted_args,
        adapted_axes,
    )
end
66 changes: 54 additions & 12 deletions ext/cuda/data_layouts_copyto.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import ClimaCore.DataLayouts:
to_non_extruded_broadcasted, has_uniform_datalayouts
# A `CuArray` argument selects the `ToCUDA` dispatch tag.
function DataLayouts._device_dispatch(x::CUDA.CuArray)
    return ToCUDA()
end

function knl_copyto!(dest, src)
Expand All @@ -15,36 +17,76 @@ function knl_copyto!(dest, src)
return nothing
end

# Copy kernel: the thread whose index (from `thread_index()`) is within
# `get_N(us)` converts that index to an index into `dest` via `kernel_indexes`
# and copies the matching entry of `src`. Threads past `get_N(us)` do nothing.
function knl_copyto_field_array!(dest, src, us)
    @inbounds begin
        thread_id = thread_index()
        if thread_id ≤ get_N(us)
            idx = kernel_indexes(thread_id, size(dest))
            dest[idx] = src[idx]
        end
    end
    return nothing
end

# `copyto!` for an `IJFH` destination dispatched on the `ToCUDA` tag: when
# `Nh > 0` a copy kernel is launched through `auto_launch!`; `dest` is returned
# in every case.
#
# NOTE(review): the `auto_launch!` call below carries two argument lists back to
# back (one for `knl_copyto!` with explicit `threads_s`/`blocks_s`, one for
# `knl_copyto_field_array!` with `auto = true`). This looks like the removed and
# added sides of a diff shown together — confirm which set is intended.
function Base.copyto!(
dest::IJFH{S, Nij, Nh},
bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh},
::ToCUDA,
) where {S, Nij, Nh}
us = DataLayouts.UniversalSize(dest)
if Nh > 0
auto_launch!(
# First argument list: `knl_copyto!` with an explicit thread/block shape.
knl_copyto!,
(dest, bc);
threads_s = (Nij, Nij),
blocks_s = (Nh, 1),
# Second argument list: `knl_copyto_field_array!` with the item count
# `prod(universal_size(us))` and `auto = true`.
knl_copyto_field_array!,
(dest, bc, us),
prod(DataLayouts.universal_size(us));
auto = true,
)
end
return dest
end

# Linear-index copy kernel: the thread with index `i` (from `thread_index()`)
# writes `bc[i]` into `dest[i]`, provided `i` does not exceed `get_N(us)`.
function knl_copyto_linear!(dest::AbstractData, bc, us)
    @inbounds begin
        i = thread_index()
        i ≤ get_N(us) && (dest[i] = bc[i])
    end
    return nothing
end

# Linear-index copy kernel for a `DataF` destination, which is written through
# `dest[]`.
#
# `tidx` was previously read without ever being defined in this method; it is
# now obtained from `thread_index()` and guarded by `get_N(us)`, exactly as in
# the `AbstractData` method of this kernel.
function knl_copyto_linear!(dest::DataF, bc, us)
    @inbounds begin
        tidx = thread_index()
        if tidx ≤ get_N(us)
            dest[] = bc[tidx]
        end
    end
    return nothing
end

# Copy kernel using `kernel_indexes`: each thread whose index is within
# `get_N(us)` maps that index (together with `size(dest)`) to an index `I` and
# performs `dest[I] = src[I]`. Always returns `nothing`.
function knl_copyto_cart!(dest, src, us)
    @inbounds begin
        t = thread_index()
        if t ≤ get_N(us)
            dims = size(dest)
            pos = kernel_indexes(t, dims)
            dest[pos] = src[pos]
        end
    end
    return nothing
end

# `copyto!` for a `VIJFH` destination dispatched on the `ToCUDA` tag. Nothing is
# launched unless both `Nv > 0` and `Nh > 0`; `dest` is returned in every case.
#
# NOTE(review): the guarded body below contains two launch sequences back to
# back — a `knl_copyto!` launch with an explicit thread/block shape, followed by
# a `knl_copyto_linear!` / `knl_copyto_cart!` launch with `auto = true`. This
# looks like the removed and added sides of a diff shown together — confirm
# which one is intended.
function Base.copyto!(
dest::VIJFH{S, Nv, Nij, Nh},
bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh},
::ToCUDA,
) where {S, Nv, Nij, Nh}
if Nv > 0 && Nh > 0
# First sequence: cap `Nij * Nij * Nv_per_block` at 256 and derive the block
# count over `Nv` from it.
Nv_per_block = min(Nv, fld(256, Nij * Nij))
Nv_blocks = cld(Nv, Nv_per_block)
auto_launch!(
knl_copyto!,
(dest, bc);
threads_s = (Nij, Nij, Nv_per_block),
blocks_s = (Nh, Nv_blocks),
)
# Second sequence: item count taken from the universal size of `dest`.
us = DataLayouts.UniversalSize(dest)
n = prod(DataLayouts.universal_size(us))
# When `has_uniform_datalayouts(bc)` holds, the broadcast is converted with
# `to_non_extruded_broadcasted` and copied by `knl_copyto_linear!`; otherwise
# `knl_copyto_cart!` is used on `bc` as-is.
if has_uniform_datalayouts(bc)
bc′ = to_non_extruded_broadcasted(bc)
auto_launch!(knl_copyto_linear!, (dest, bc′, us), n; auto = true)
else
auto_launch!(knl_copyto_cart!, (dest, bc, us), n; auto = true)
end
end
return dest
end
Expand Down
16 changes: 8 additions & 8 deletions ext/cuda/data_layouts_fill.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Fill kernel: the thread with index `tidx` (from `thread_index()`) converts
# that index to an index into `dest` via `kernel_indexes` and stores `val`
# there. Threads whose index exceeds `get_N(us)` do nothing. Always returns
# `nothing`.
#
# The comparison operator in the guard was missing (`if tidx get_N(us)`), which
# does not parse; it is restored to `≤`, matching the commented copy below.
function knl_fill_flat!(dest::AbstractData, val, us)
    @inbounds begin
        tidx = thread_index()
        if tidx ≤ get_N(us)
            n = size(dest)
            I = kernel_indexes(tidx, n)
            @inbounds dest[I] = val
        end
    end
    # Commented-out copy of the body above, kept as found:
    # @inbounds begin
    #     tidx = thread_index()
    #     if tidx ≤ get_N(us)
    #         n = size(dest)
    #         I = kernel_indexes(tidx, n)
    #         @inbounds dest[I] = val
    #     end
    # end
    return nothing
end

Expand Down
47 changes: 24 additions & 23 deletions ext/cuda/topologies_dss.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ function dss_load_perimeter_data_kernel!(
if gidx ≤ prod(sizep)
(level, p, fidx, elem) = cart_ind(sizep, gidx).I
(ip, jp) = perimeter[p]
data_idx = linear_ind(sized, (level, ip, jp, fidx, elem))
pperimeter_data[level, p, fidx, elem] = pdata[data_idx]
data_idx = linear_ind(sized, (level, ip, jp, elem))
pperimeter_data.arrays[fidx][level, p, elem] =
pdata.arrays[fidx][data_idx]
end
return nothing
end
Expand Down Expand Up @@ -89,7 +90,8 @@ function dss_unload_perimeter_data_kernel!(
(level, p, fidx, elem) = cart_ind(sizep, gidx).I
(ip, jp) = perimeter[p]
data_idx = linear_ind(sized, (level, ip, jp, fidx, elem))
pdata[data_idx] = pperimeter_data[level, p, fidx, elem]
pdata.arrays[fidx][data_idx] =
pperimeter_data.arrays[fidx][level, p, elem]
end
return nothing
end
Expand Down Expand Up @@ -148,12 +150,12 @@ function dss_local_kernel!(
for idx in st:(en - 1)
(lidx, vert) = local_vertices[idx]
ip = perimeter_vertex_node_index(vert)
sum_data += pperimeter_data[level, ip, fidx, lidx]
sum_data += pperimeter_data.arrays[fidx][level, ip, lidx]
end
for idx in st:(en - 1)
(lidx, vert) = local_vertices[idx]
ip = perimeter_vertex_node_index(vert)
pperimeter_data[level, ip, fidx, lidx] = sum_data
pperimeter_data.arrays[fidx][level, ip, lidx] = sum_data
end
elseif gidx ≤ nlevels * nfidx * (nlocalvertices + nlocalfaces) # interior faces
nfacedof = div(nperimeter - 4, 4)
Expand All @@ -169,10 +171,10 @@ function dss_local_kernel!(
ip1 = inc1 == 1 ? first1 + i - 1 : first1 - i + 1
ip2 = inc2 == 1 ? first2 + i - 1 : first2 - i + 1
val =
pperimeter_data[level, ip1, fidx, lidx1] +
pperimeter_data[level, ip2, fidx, lidx2]
pperimeter_data[level, ip1, fidx, lidx1] = val
pperimeter_data[level, ip2, fidx, lidx2] = val
pperimeter_data.arrays[fidx][level, ip1, lidx1] +
pperimeter_data.arrays[fidx][level, ip2, lidx2]
pperimeter_data.arrays[fidx][level, ip1, lidx1] = val
pperimeter_data.arrays[fidx][level, ip2, lidx2] = val
end
end

Expand Down Expand Up @@ -254,7 +256,7 @@ function dss_transform_kernel!(
if gidx ≤ nlevels * nperimeter * nlocalelems
sizet = (nlevels, nperimeter, nlocalelems)
sizet_data = (nlevels, Nq, Nq, nfid, nelems)
sizet_wt = (Nq, Nq, 1, nelems)
sizet_wt = (Nq, Nq, nelems)
sizet_metric = (nlevels, Nq, Nq, nmetric, nelems)

(level, p, localelemno) = cart_ind(sizet, gidx).I
Expand All @@ -267,26 +269,24 @@ function dss_transform_kernel!(
pperimeter_data[level, p, fidx, elem] = pdata[data_idx] * weight
end
for fidx in covariant12fidx
data_idx1 = linear_ind(sizet_data, (level, ip, jp, fidx, elem))
data_idx2 = linear_ind(sizet_data, (level, ip, jp, fidx + 1, elem))
(idx11, idx12, idx21, idx22) =
Topologies._get_idx_metric(sizet_metric, (level, ip, jp, elem))
data_idx = linear_ind(sizet_data, (level, ip, jp, elem))
(idx11, idx12, idx21, idx22) = (1,2,3,4)
# Topologies._get_idx_metric(sizet_metric, (level, ip, jp, elem))
pperimeter_data[level, p, fidx, elem] =
(
p∂ξ∂x[idx11] * pdata[data_idx1] +
p∂ξ∂x[idx12] * pdata[data_idx2]
p∂ξ∂x.arrays[idx11][data_idx] * pdata.arrays[fidx][data_idx] +
p∂ξ∂x.arrays[idx12][data_idx] * pdata.arrays[fidx+1][data_idx]
) * weight
pperimeter_data[level, p, fidx + 1, elem] =
(
p∂ξ∂x[idx21] * pdata[data_idx1] +
p∂ξ∂x[idx22] * pdata[data_idx2]
p∂ξ∂x.arrays[idx21][data_idx] * pdata.arrays[fidx][data_idx] +
p∂ξ∂x.arrays[idx22][data_idx] * pdata.arrays[fidx+1][data_idx]
) * weight
end
for fidx in contravariant12fidx
data_idx1 = linear_ind(sizet_data, (level, ip, jp, fidx, elem))
data_idx2 = linear_ind(sizet_data, (level, ip, jp, fidx + 1, elem))
(idx11, idx12, idx21, idx22) =
Topologies._get_idx_metric(sizet_metric, (level, ip, jp, elem))
data_idx = linear_ind(sizet_data, (level, ip, jp, elem))
(idx11, idx12, idx21, idx22) = (1,2,3,4)
# Topologies._get_idx_metric(sizet_metric, (level, ip, jp, elem))
pperimeter_data[level, p, fidx, elem] =
(
p∂x∂ξ[idx11] * pdata[data_idx1] +
Expand Down Expand Up @@ -683,7 +683,8 @@ function load_from_recv_buffer_kernel!(
lidx = recv_buf_idx[irecv, 1]
ip = recv_buf_idx[irecv, 2]
idx = level + ((fidx - 1) + (irecv - 1) * nfid) * nlevels
CUDA.@atomic pperimeter_data[level, ip, fidx, lidx] += recv_data[idx]
CUDA.@atomic pperimeter_data.arrays[fidx][level, ip, lidx] +=
recv_data[idx]
end
return nothing
end
Expand Down
Loading
Loading