Skip to content

Commit

Permalink
Forbid divergent execution of work-group barriers
Browse files Browse the repository at this point in the history
  • Loading branch information
vchuravy committed Feb 11, 2025
1 parent b435bb2 commit 8c57f33
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 16 deletions.
10 changes: 9 additions & 1 deletion src/KernelAbstractions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,9 @@ end
After a `@synchronize` statement all reads and writes to global and local memory
from each thread in the workgroup are visible to all other threads in the
workgroup.
!!! note
    `@synchronize()` must be encountered by all workitems of a work-group executing the kernel or by none at all.
"""
macro synchronize()
return quote
Expand All @@ -301,10 +304,15 @@ workgroup. `cond` is not allowed to have any visible sideffects.
# Platform differences
- `GPU`: This synchronization will only occur if the `cond` evaluates.
- `CPU`: This synchronization will always occur.
!!! warning
    This variant of the `@synchronize` macro violates the requirement that `@synchronize` must be encountered
    by all workitems of a work-group executing the kernel or by none at all.
    Since v`0.9.34` this version of the macro is deprecated and lowers to `@synchronize()`.
"""
macro synchronize(cond)
    # `cond` is accepted only for backward compatibility and is deliberately not
    # spliced into the expansion. Guarding the barrier with it would let some
    # workitems of a work-group skip the barrier while others wait on it, and
    # emitting a guarded barrier followed by an unguarded one would synchronize
    # twice whenever `cond` holds. The expansion is therefore exactly the one
    # produced by the zero-argument form: a single unconditional barrier.
    return quote
        $__synchronize()
    end
end

Expand Down
54 changes: 39 additions & 15 deletions src/macros.jl
Original file line number Diff line number Diff line change
Expand Up @@ -86,22 +86,24 @@ function transform_gpu!(def, constargs, force_inbounds)
end
end
pushfirst!(def[:args], :__ctx__)
body = def[:body]
new_stmts = Expr[]
body = MacroTools.flatten(def[:body])
stmts = body.args
push!(new_stmts, Expr(:aliasscope))
push!(new_stmts, :(__active_lane__ = $__validindex(__ctx__)))
if force_inbounds
body = quote
@inbounds $(body)
end
push!(new_stmts, Expr(:inbounds, true))
end
body = quote
if $__validindex(__ctx__)
$(body)
end
return nothing
append!(new_stmts, split(emit_gpu, body.args))
if force_inbounds
push!(new_stmts, Expr(:inbounds, :pop))
end
push!(new_stmts, Expr(:popaliasscope))
push!(new_stmts, :(return nothing))
def[:body] = Expr(
:let,
Expr(:block, let_constargs...),
body,
Expr(:block, new_stmts...),
)
return
end
Expand All @@ -127,7 +129,7 @@ function transform_cpu!(def, constargs, force_inbounds)
if force_inbounds
push!(new_stmts, Expr(:inbounds, true))
end
append!(new_stmts, split(body.args))
append!(new_stmts, split(emit_cpu, body.args))
if force_inbounds
push!(new_stmts, Expr(:inbounds, :pop))
end
Expand All @@ -147,6 +149,7 @@ struct WorkgroupLoop
allocations::Vector{Any}
private_allocations::Vector{Any}
private::Set{Symbol}
terminated_in_sync::Bool
end

"""
    is_sync(expr)

Return `true` when `expr` matches a `@synchronize` macro call, either the
zero-argument form or the form taking a single argument, and `false` otherwise.
"""
function is_sync(expr)
    return @capture(expr, @synchronize() | @synchronize(a_))
end
Expand All @@ -167,6 +170,7 @@ end

# TODO proper handling of LineInfo
function split(
emit,
stmts,
indicies = Any[], private = Set{Symbol}(),
)
Expand All @@ -182,7 +186,7 @@ function split(
for stmt in stmts
has_sync = find_sync(stmt)
if has_sync
loop = WorkgroupLoop(deepcopy(indicies), current, allocations, private_allocations, deepcopy(private))
loop = WorkgroupLoop(deepcopy(indicies), current, allocations, private_allocations, deepcopy(private), is_sync(stmt))
push!(new_stmts, emit(loop))
allocations = Any[]
private_allocations = Any[]
Expand All @@ -197,7 +201,7 @@ function split(
function recurse(expr::Expr)
expr = unblock(expr)
if is_scope_construct(expr) && any(find_sync, expr.args)
new_args = unblock(split(expr.args, deepcopy(indicies), deepcopy(private)))
new_args = unblock(split(emit, expr.args, deepcopy(indicies), deepcopy(private)))
return Expr(expr.head, new_args...)
else
return Expr(expr.head, map(recurse, expr.args)...)
Expand Down Expand Up @@ -240,13 +244,13 @@ function split(

# everything since the last `@synchronize`
if !isempty(current)
loop = WorkgroupLoop(deepcopy(indicies), current, allocations, private_allocations, deepcopy(private))
loop = WorkgroupLoop(deepcopy(indicies), current, allocations, private_allocations, deepcopy(private), false)
push!(new_stmts, emit(loop))
end
return new_stmts
end

function emit(loop)
function emit_cpu(loop)
idx = gensym(:I)
for stmt in loop.indicies
# splice index into the i = @index(Cartesian, $idx)
Expand Down Expand Up @@ -300,3 +304,23 @@ function emit(loop)

return unblock(Expr(:block, stmts...))
end

"""
    emit_gpu(loop)

Build the expression for one section of a kernel body described by `loop`.

The section's index statements, allocations, private allocations and body
statements are placed, in that order, inside an `if __active_lane__` guard.
When `loop.terminated_in_sync` is set, a call to `__synchronize` is appended
*after* the guard rather than inside it, so the barrier is not conditional on
`__active_lane__`.

Returns the resulting block passed through `unblock`.
"""
function emit_gpu(loop)
    section_body = unblock(Expr(:block, loop.stmts...))

    # Only the section's work is guarded by the lane predicate.
    guarded = quote
        if __active_lane__
            $(loop.indicies...)
            $(loop.allocations...)
            $(loop.private_allocations...)
            $(section_body)
        end
    end

    # The barrier, when requested, sits outside the guard.
    pieces = if loop.terminated_in_sync
        Any[guarded, :($__synchronize())]
    else
        Any[guarded]
    end

    return unblock(Expr(:block, pieces...))
end

0 comments on commit 8c57f33

Please sign in to comment.