From 8c57f337fc0b82091bb574bf3d4e6483347b81b8 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 28 Jan 2025 15:48:57 +0100 Subject: [PATCH] Forbid divergent execution of work-group barriers --- src/KernelAbstractions.jl | 10 +++++++- src/macros.jl | 54 ++++++++++++++++++++++++++++----------- 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index b82dadc5..430718a3 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -284,6 +284,9 @@ end After a `@synchronize` statement all read and writes to global and local memory from each thread in the workgroup are visible in from all other threads in the workgroup. + +!!! note + `@synchronize()` must be encountered by all workitems of a work-group executing the kernel or by none at all. """ macro synchronize() return quote @@ -301,10 +304,15 @@ workgroup. `cond` is not allowed to have any visible sideffects. # Platform differences - `GPU`: This synchronization will only occur if the `cond` evaluates. - `CPU`: This synchronization will always occur. + +!!! warning + This variant of the `@synchronize` macro violates the requirement that `@synchronize` must be encountered + by all workitems of a work-group executing the kernel or by none at all. 
+ Since v`0.9.34` this version of the macro is deprecated and lowers to `@synchronize()`; `cond` is ignored. """ macro synchronize(cond) return quote - $(esc(cond)) && $__synchronize() + $__synchronize() end end diff --git a/src/macros.jl b/src/macros.jl index 02b93ed7..e23822b0 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -86,22 +86,24 @@ function transform_gpu!(def, constargs, force_inbounds) end end pushfirst!(def[:args], :__ctx__) - body = def[:body] + new_stmts = Expr[] + body = MacroTools.flatten(def[:body]) + stmts = body.args + push!(new_stmts, Expr(:aliasscope)) + push!(new_stmts, :(__active_lane__ = $__validindex(__ctx__))) if force_inbounds - body = quote - @inbounds $(body) - end + push!(new_stmts, Expr(:inbounds, true)) end - body = quote - if $__validindex(__ctx__) - $(body) - end - return nothing + append!(new_stmts, split(emit_gpu, body.args)) + if force_inbounds + push!(new_stmts, Expr(:inbounds, :pop)) end + push!(new_stmts, Expr(:popaliasscope)) + push!(new_stmts, :(return nothing)) def[:body] = Expr( :let, Expr(:block, let_constargs...), - body, + Expr(:block, new_stmts...), ) return end @@ -127,7 +129,7 @@ function transform_cpu!(def, constargs, force_inbounds) if force_inbounds push!(new_stmts, Expr(:inbounds, true)) end - append!(new_stmts, split(body.args)) + append!(new_stmts, split(emit_cpu, body.args)) if force_inbounds push!(new_stmts, Expr(:inbounds, :pop)) end @@ -147,6 +149,7 @@ struct WorkgroupLoop allocations::Vector{Any} private_allocations::Vector{Any} private::Set{Symbol} + terminated_in_sync::Bool end is_sync(expr) = @capture(expr, @synchronize() | @synchronize(a_)) @@ -167,6 +170,7 @@ end # TODO proper handling of LineInfo function split( + emit, stmts, indicies = Any[], private = Set{Symbol}(), ) @@ -182,7 +186,7 @@ function split( for stmt in stmts has_sync = find_sync(stmt) if has_sync - loop = WorkgroupLoop(deepcopy(indicies), current, allocations, private_allocations, deepcopy(private)) + loop = WorkgroupLoop(deepcopy(indicies), current, 
allocations, private_allocations, deepcopy(private), is_sync(stmt)) push!(new_stmts, emit(loop)) allocations = Any[] private_allocations = Any[] @@ -197,7 +201,7 @@ function split( function recurse(expr::Expr) expr = unblock(expr) if is_scope_construct(expr) && any(find_sync, expr.args) - new_args = unblock(split(expr.args, deepcopy(indicies), deepcopy(private))) + new_args = unblock(split(emit, expr.args, deepcopy(indicies), deepcopy(private))) return Expr(expr.head, new_args...) else return Expr(expr.head, map(recurse, expr.args)...) @@ -240,13 +244,13 @@ function split( # everything since the last `@synchronize` if !isempty(current) - loop = WorkgroupLoop(deepcopy(indicies), current, allocations, private_allocations, deepcopy(private)) + loop = WorkgroupLoop(deepcopy(indicies), current, allocations, private_allocations, deepcopy(private), false) push!(new_stmts, emit(loop)) end return new_stmts end -function emit(loop) +function emit_cpu(loop) idx = gensym(:I) for stmt in loop.indicies # splice index into the i = @index(Cartesian, $idx) @@ -300,3 +304,23 @@ function emit(loop) return unblock(Expr(:block, stmts...)) end + +function emit_gpu(loop) + stmts = Any[] + + body = Expr(:block, loop.stmts...) + loopexpr = quote + if __active_lane__ + $(loop.indicies...) + $(loop.allocations...) + $(loop.private_allocations...) + $(unblock(body)) + end + end + push!(stmts, loopexpr) + if loop.terminated_in_sync + push!(stmts, :($__synchronize())) + end + + return unblock(Expr(:block, stmts...)) +end