Skip to content

Commit 1c512be

Browse files
FieldVectors, add benchmarks, improve perf
1 parent b9714ab commit 1c512be

File tree

7 files changed

+349
-44
lines changed

7 files changed

+349
-44
lines changed

.buildkite/pipeline.yml

+2
Original file line numberDiff line numberDiff line change
@@ -486,6 +486,7 @@ steps:
486486
key: unit_field
487487
command:
488488
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/unit_field.jl"
489+
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/benchmark_fieldvectors.jl"
489490
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/benchmark_field_multi_broadcast_fusion.jl"
490491
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/convergence_field_integrals.jl"
491492
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/inference_repro.jl"
@@ -495,6 +496,7 @@ steps:
495496
command:
496497
- "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'"
497498
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/unit_field.jl"
499+
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/benchmark_fieldvectors.jl"
498500
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/benchmark_field_multi_broadcast_fusion.jl"
499501
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/convergence_field_integrals.jl"
500502
- "julia --color=yes --check-bounds=yes --project=.buildkite test/Fields/inference_repro.jl"

ext/cuda/data_layouts_copyto.jl

+90
Original file line numberDiff line numberDiff line change
@@ -96,3 +96,93 @@ function Base.copyto!(
9696
@inbounds bc0 = bc[]
9797
fill!(dest, bc0)
9898
end
99+
100+
# For field-vector operations
101+
function DataLayouts.copyto_per_field!(
102+
array::AbstractArray,
103+
bc::Union{AbstractArray, Base.Broadcast.Broadcasted},
104+
::ToCUDA,
105+
)
106+
bc′ = DataLayouts.to_non_extruded_broadcasted(bc)
107+
# All field variables are treated separately, so
108+
# we can parallelize across the field index, and
109+
# leverage linear indexing:
110+
nitems = prod(size(array))
111+
N = prod(size(array))
112+
args = (array, bc′, N)
113+
threads = threads_via_occupancy(copyto_per_field_kernel!, args)
114+
n_max_threads = min(threads, nitems)
115+
p = linear_partition(nitems, n_max_threads)
116+
auto_launch!(
117+
copyto_per_field_kernel!,
118+
args;
119+
threads_s = p.threads,
120+
blocks_s = p.blocks,
121+
)
122+
return array
123+
end
124+
function copyto_per_field_kernel!(array, bc, N)
125+
i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
126+
if 1 i N
127+
@inbounds array[i] = bc[i]
128+
end
129+
return nothing
130+
end
131+
132+
# Need 2 methods here to avoid unbound arguments:
133+
function DataLayouts.copyto_per_field_scalar!(
134+
array::AbstractArray,
135+
bc::Base.Broadcast.Broadcasted{Style},
136+
::ToCUDA,
137+
) where {
138+
Style <:
139+
Union{Base.Broadcast.AbstractArrayStyle{0}, Base.Broadcast.Style{Tuple}},
140+
}
141+
bc′ = DataLayouts.to_non_extruded_broadcasted(bc)
142+
# All field variables are treated separately, so
143+
# we can parallelize across the field index, and
144+
# leverage linear indexing:
145+
nitems = prod(size(array))
146+
N = prod(size(array))
147+
args = (array, bc′, N)
148+
threads = threads_via_occupancy(copyto_per_field_kernel_0D!, args)
149+
n_max_threads = min(threads, nitems)
150+
p = linear_partition(nitems, n_max_threads)
151+
auto_launch!(
152+
copyto_per_field_kernel_0D!,
153+
args;
154+
threads_s = p.threads,
155+
blocks_s = p.blocks,
156+
)
157+
return array
158+
end
159+
function DataLayouts.copyto_per_field_scalar!(
160+
array::AbstractArray,
161+
bc::Real,
162+
::ToCUDA,
163+
)
164+
bc′ = DataLayouts.to_non_extruded_broadcasted(bc)
165+
# All field variables are treated separately, so
166+
# we can parallelize across the field index, and
167+
# leverage linear indexing:
168+
nitems = prod(size(array))
169+
N = prod(size(array))
170+
args = (array, bc′, N)
171+
threads = threads_via_occupancy(copyto_per_field_kernel_0D!, args)
172+
n_max_threads = min(threads, nitems)
173+
p = linear_partition(nitems, n_max_threads)
174+
auto_launch!(
175+
copyto_per_field_kernel_0D!,
176+
args;
177+
threads_s = p.threads,
178+
blocks_s = p.blocks,
179+
)
180+
return array
181+
end
182+
function copyto_per_field_kernel_0D!(array, bc, N)
183+
i = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
184+
if 1 i N
185+
@inbounds array[i] = bc[]
186+
end
187+
return nothing
188+
end

src/DataLayouts/copyto.jl

+48
Original file line numberDiff line numberDiff line change
@@ -180,3 +180,51 @@ function Base.copyto!(
180180
end
181181
return dest
182182
end
183+
184+
function copyto_per_field!(
185+
array::AbstractArray,
186+
bc::Union{AbstractArray, Base.Broadcast.Broadcasted},
187+
::ToCPU,
188+
)
189+
bc′ = to_non_extruded_broadcasted(bc)
190+
# All field variables are treated separately, so
191+
# we can parallelize across the field index, and
192+
# leverage linear indexing:
193+
N = prod(size(array))
194+
@inbounds @simd for I in 1:N
195+
array[I] = bc′[I]
196+
end
197+
return array
198+
end
199+
200+
# Need 2 methods here to avoid unbound arguments:
201+
function copyto_per_field_scalar!(array::AbstractArray, bc::Real, ::ToCPU)
202+
bc′ = to_non_extruded_broadcasted(bc)
203+
# All field variables are treated separately, so
204+
# we can parallelize across the field index, and
205+
# leverage linear indexing:
206+
N = prod(size(array))
207+
@inbounds @simd for I in 1:N
208+
array[I] = bc′[]
209+
end
210+
return array
211+
end
212+
213+
function copyto_per_field_scalar!(
214+
array::AbstractArray,
215+
bc::Base.Broadcast.Broadcasted{Style},
216+
::ToCPU,
217+
) where {
218+
Style <:
219+
Union{Base.Broadcast.AbstractArrayStyle{0}, Base.Broadcast.Style{Tuple}},
220+
}
221+
bc′ = to_non_extruded_broadcasted(bc)
222+
# All field variables are treated separately, so
223+
# we can parallelize across the field index, and
224+
# leverage linear indexing:
225+
N = prod(size(array))
226+
@inbounds @simd for I in 1:N
227+
array[I] = bc′[]
228+
end
229+
return array
230+
end

src/Fields/Fields.jl

+5-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,11 @@ import ..DataLayouts:
1010
FusedMultiBroadcast,
1111
@fused_direct,
1212
isascalar,
13-
check_fused_broadcast_axes
13+
check_fused_broadcast_axes,
14+
ToCPU,
15+
ToCUDA,
16+
copyto_per_field!,
17+
copyto_per_field_scalar!
1418
import ..Domains
1519
import ..Topologies
1620
import ..Quadratures

src/Fields/fieldvector.jl

+65-43
Original file line numberDiff line numberDiff line change
@@ -175,15 +175,6 @@ function Base.similar(
175175
error("Cannot construct FieldVector")
176176
end
177177

178-
@inline function Base.copyto!(dest::FV, src::FV) where {FV <: FieldVector}
179-
for symb in propertynames(dest)
180-
pd = parent(getproperty(dest, symb))
181-
ps = parent(getproperty(src, symb))
182-
copyto!(pd, ps)
183-
end
184-
return dest
185-
end
186-
187178
"""
188179
Spaces.create_dss_buffer(fv::FieldVector)
189180
@@ -218,32 +209,6 @@ function Spaces.weighted_dss!(
218209
Spaces.weighted_dss!(pairs...)
219210
end
220211

221-
222-
# Recursively call transform_bc_args() on broadcast arguments in a way that is statically reducible by the optimizer
223-
# see Base.Broadcast.preprocess_args
224-
@inline transform_bc_args(args::Tuple, inds...) = (
225-
transform_broadcasted(args[1], inds...),
226-
transform_bc_args(Base.tail(args), inds...)...,
227-
)
228-
@inline transform_bc_args(args::Tuple{Any}, inds...) =
229-
(transform_broadcasted(args[1], inds...),)
230-
@inline transform_bc_args(args::Tuple{}, inds...) = ()
231-
232-
@inline function transform_broadcasted(
233-
bc::Base.Broadcast.Broadcasted{FieldVectorStyle},
234-
symb,
235-
axes,
236-
)
237-
Base.Broadcast.Broadcasted(
238-
bc.f,
239-
transform_bc_args(bc.args, symb, axes),
240-
axes,
241-
)
242-
end
243-
@inline transform_broadcasted(fv::FieldVector, symb, axes) =
244-
parent(getfield(_values(fv), symb))
245-
@inline transform_broadcasted(x, symb, axes) = x
246-
247212
@inline function first_fieldvector_in_bc(args::Tuple, rargs...)
248213
x1 = first_fieldvector_in_bc(args[1], rargs...)
249214
x1 isa FieldVector && return x1
@@ -330,26 +295,83 @@ function Base.Broadcast.instantiate(
330295
return Base.Broadcast.Broadcasted{FieldVectorStyle}(bc.f, bc.args, axes)
331296
end
332297

333-
@inline function Base.copyto!(
334-
dest::FieldVector,
298+
# Recursively call transform_bc_args() on broadcast arguments in a way that is statically reducible by the optimizer
299+
# see Base.Broadcast.preprocess_args
300+
@inline transform_bc_args(args::Tuple, inds...) = (
301+
transform_broadcasted(args[1], inds...),
302+
transform_bc_args(Base.tail(args), inds...)...,
303+
)
304+
@inline transform_bc_args(args::Tuple{Any}, inds...) =
305+
(transform_broadcasted(args[1], inds...),)
306+
@inline transform_bc_args(args::Tuple{}, inds...) = ()
307+
308+
@inline function transform_broadcasted(
335309
bc::Base.Broadcast.Broadcasted{FieldVectorStyle},
310+
symb,
311+
axes,
312+
)
313+
Base.Broadcast.Broadcasted(
314+
bc.f,
315+
transform_bc_args(bc.args, symb, axes),
316+
axes,
317+
)
318+
end
319+
@inline transform_broadcasted(fv::FieldVector, symb, axes) =
320+
parent(getfield(_values(fv), symb))
321+
@inline transform_broadcasted(x, symb, axes) = x
322+
323+
@inline Base.copyto!(
324+
dest::FieldVector,
325+
bc::Union{FieldVector, Base.Broadcast.Broadcasted{FieldVectorStyle}},
326+
) = copyto_per_field!(dest, bc)
327+
328+
@inline function copyto_per_field!(
329+
dest::FieldVector,
330+
bc::Union{FieldVector, Base.Broadcast.Broadcasted{FieldVectorStyle}},
336331
)
337332
map(propertynames(dest)) do symb
338333
Base.@_inline_meta
339-
p = parent(getfield(_values(dest), symb))
340-
copyto!(p, transform_broadcasted(bc, symb, axes(p)))
334+
array = parent(getfield(_values(dest), symb))
335+
bct = transform_broadcasted(bc, symb, axes(array))
336+
if array isa FieldVector # recurse
337+
copyto_per_field!(array, bct)
338+
else
339+
copyto_per_field!(
340+
array,
341+
Base.Broadcast.instantiate(bct),
342+
DataLayouts.device_dispatch(array),
343+
)
344+
end
341345
end
342346
return dest
343347
end
344348

345-
@inline function Base.copyto!(
349+
@inline Base.copyto!(
350+
dest::FieldVector,
351+
bc::Base.Broadcast.Broadcasted{<:Base.Broadcast.Style{Tuple}},
352+
) = copyto_per_field_scalar!(dest, bc)
353+
354+
@inline Base.copyto!(
346355
dest::FieldVector,
347356
bc::Base.Broadcast.Broadcasted{<:Base.Broadcast.AbstractArrayStyle{0}},
348-
)
357+
) = copyto_per_field_scalar!(dest, bc)
358+
359+
@inline Base.copyto!(dest::FieldVector, bc::Real) =
360+
copyto_per_field_scalar!(dest, bc)
361+
362+
@inline function copyto_per_field_scalar!(dest::FieldVector, bc)
349363
map(propertynames(dest)) do symb
350364
Base.@_inline_meta
351-
p = parent(getfield(_values(dest), symb))
352-
copyto!(p, bc)
365+
array = parent((getfield(_values(dest), symb)))
366+
if array isa FieldVector # recurse
367+
copyto_per_field_scalar!(array, bc)
368+
else
369+
copyto_per_field_scalar!(
370+
array,
371+
Base.Broadcast.instantiate(bc),
372+
DataLayouts.device_dispatch(array),
373+
)
374+
end
353375
nothing
354376
end
355377
return dest

0 commit comments

Comments
 (0)