From af03bceb8f0bb3e9b510d42f8a33f984db1a7225 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 5 Feb 2025 16:14:00 +0100 Subject: [PATCH] fix formatting --- src/pocl/backend.jl | 4 +- src/pocl/compiler/compilation.jl | 20 +- src/pocl/compiler/execution.jl | 66 ++++--- src/pocl/compiler/reflection.jl | 13 +- src/pocl/device/array.jl | 110 ++++++----- src/pocl/nanoOpenCL.jl | 315 ++++++++++++++++++------------- src/pocl/pocl.jl | 18 +- 7 files changed, 312 insertions(+), 234 deletions(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 2a5bb45b..5aaa6a2a 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -25,13 +25,13 @@ KA.allocate(::POCLBackend, ::Type{T}, dims::Tuple) where {T} = Array{T}(undef, d function KA.zeros(backend::POCLBackend, ::Type{T}, dims::Tuple) where {T} arr = KA.allocate(backend, T, dims) - kernel = init_kernel(backend) + kernel = KA.init_kernel(backend) kernel(arr, zero, T, ndrange = length(arr)) return arr end function KA.ones(backend::POCLBackend, ::Type{T}, dims::Tuple) where {T} arr = KA.allocate(backend, T, dims) - kernel = init_kernel(backend) + kernel = KA.init_kernel(backend) kernel(arr, one, T; ndrange = length(arr)) return arr end diff --git a/src/pocl/compiler/compilation.jl b/src/pocl/compiler/compilation.jl index b79f5f66..375e0508 100644 --- a/src/pocl/compiler/compilation.jl +++ b/src/pocl/compiler/compilation.jl @@ -2,18 +2,20 @@ struct OpenCLCompilerParams <: AbstractCompilerParams end const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams} -const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget,OpenCLCompilerParams} +const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget, OpenCLCompilerParams} -GPUCompiler.runtime_module(::CompilerJob{<:Any,OpenCLCompilerParams}) = POCL +GPUCompiler.runtime_module(::CompilerJob{<:Any, OpenCLCompilerParams}) = POCL GPUCompiler.method_table(::OpenCLCompilerJob) = method_table # filter out OpenCL built-ins # TODO: eagerly lower these using the translator API GPUCompiler.isintrinsic(job::OpenCLCompilerJob, fn::String) = - invoke(GPUCompiler.isintrinsic, - Tuple{CompilerJob{SPIRVCompilerTarget}, typeof(fn)}, - job, fn) || + invoke( + GPUCompiler.isintrinsic, + Tuple{CompilerJob{SPIRVCompilerTarget}, typeof(fn)}, + job, fn +) || in(fn, opencl_builtins) @@ -42,14 +44,14 @@ function compiler_config(dev::cl.Device; kwargs...) end return config end -@noinline function _compiler_config(dev; kernel=true, name=nothing, always_inline=false, kwargs...) +@noinline function _compiler_config(dev; kernel = true, name = nothing, always_inline = false, kwargs...) supports_fp16 = "cl_khr_fp16" in dev.extensions supports_fp64 = "cl_khr_fp64" in dev.extensions # create GPUCompiler objects target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, kwargs...) params = OpenCLCompilerParams() - CompilerConfig(target, params; kernel, name, always_inline) + return CompilerConfig(target, params; kernel, name, always_inline) end # compile to executable machine code @@ -59,7 +61,7 @@ function compile(@nospecialize(job::CompilerJob)) GPUCompiler.compile(:obj, job) end - (;obj, entry=LLVM.name(meta.entry)) + return (; obj, entry = LLVM.name(meta.entry)) end # link into an executable kernel @@ -70,5 +72,5 @@ function link(@nospecialize(job::CompilerJob), compiled) error("Your device does not support SPIR-V, which is currently required for native execution.") end cl.build!(prog) - cl.Kernel(prog, compiled.entry) + return cl.Kernel(prog, compiled.entry) end diff --git a/src/pocl/compiler/execution.jl b/src/pocl/compiler/execution.jl index c4d2a70e..dc47cb30 100644 --- a/src/pocl/compiler/execution.jl +++ b/src/pocl/compiler/execution.jl @@ -9,7 +9,7 @@ const LAUNCH_KWARGS = [:global_size, :local_size, :queue] macro opencl(ex...) call = ex[end] - kwargs = map(ex[1:end-1]) do kwarg + kwargs = map(ex[1:(end - 1)]) do kwarg if kwarg isa Symbol :($kwarg = $kwarg) elseif Meta.isexpr(kwarg, :(=)) @@ -31,14 +31,14 @@ macro opencl(ex...) macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs = split_kwargs(kwargs, MACRO_KWARGS, COMPILER_KWARGS, LAUNCH_KWARGS) if !isempty(other_kwargs) - key,val = first(other_kwargs).args + key, val = first(other_kwargs).args throw(ArgumentError("Unsupported keyword argument '$key'")) end # handle keyword arguments that influence the macro's behavior launch = true for kwarg in macro_kwargs - key,val = kwarg.args + key, val = kwarg.args if key == :launch isa(val, Bool) || throw(ArgumentError("`launch` keyword argument to @opencl should be a constant value")) launch = val::Bool @@ -56,7 +56,8 @@ macro opencl(ex...) # convert the arguments, call the compiler and launch the kernel # while keeping the original arguments alive - push!(code.args, + push!( + code.args, quote $f_var = $f GC.@preserve $(vars...) $f_var begin @@ -69,13 +70,16 @@ macro opencl(ex...) end $kernel end - end) + end + ) - return esc(quote - let - $code + return esc( + quote + let + $code + end end - end) + ) end @@ -101,7 +105,7 @@ end # Base.RefValue isn't GPU compatible, so provide a compatible alternative # TODO: port improvements from CUDA.jl struct CLRefValue{T} <: Ref{T} - x::T + x::T end Base.getindex(r::CLRefValue) = r.x Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue) = CLRefValue(adapt(to, r[])) @@ -109,13 +113,15 @@ Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue) = CLRefValue(adapt(to # broadcast sometimes passes a ref(type), resulting in a GPU-incompatible DataType box. # avoid that by using a special kind of ref that knows about the boxed type. struct CLRefType{T} <: Ref{DataType} end -Base.getindex(r::CLRefType{T}) where T = T -Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue{<:Union{DataType,Type}}) = +Base.getindex(r::CLRefType{T}) where {T} = T +Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue{<:Union{DataType, Type}}) = CLRefType{r[]}() # case where type is the function being broadcasted -Adapt.adapt_structure(to::KernelAdaptor, - bc::Broadcast.Broadcasted{Style, <:Any, Type{T}}) where {Style, T} = +Adapt.adapt_structure( + to::KernelAdaptor, + bc::Broadcast.Broadcasted{Style, <:Any, Type{T}} +) where {Style, T} = Broadcast.Broadcasted{Style}((x...) -> T(x...), adapt(to, bc.args), bc.axes) """ @@ -131,29 +137,30 @@ register methods for the the `OpenCL.KernelAdaptor` type. The `pointers` argument is used to collect pointers to indirect SVM buffers, which need to be registered with OpenCL before invoking the kernel. """ -function clconvert(arg, pointers::Vector{Ptr{Cvoid}}=Ptr{Cvoid}[]) - adapt(KernelAdaptor(pointers), arg) +function clconvert(arg, pointers::Vector{Ptr{Cvoid}} = Ptr{Cvoid}[]) + return adapt(KernelAdaptor(pointers), arg) end - ## abstract kernel functionality -abstract type AbstractKernel{F,TT} end +abstract type AbstractKernel{F, TT} end -@inline @generated function (kernel::AbstractKernel{F,TT})(args...; - call_kwargs...) where {F,TT} +@inline @generated function (kernel::AbstractKernel{F, TT})( + args...; + call_kwargs... + ) where {F, TT} sig = Tuple{F, TT.parameters...} # Base.signature_type with a function type - args = (:(kernel.f), (:( clconvert(args[$i], svm_pointers) ) for i in 1:length(args))...) + args = (:(kernel.f), (:(clconvert(args[$i], svm_pointers)) for i in 1:length(args))...) # filter out ghost arguments that shouldn't be passed predicate = dt -> GPUCompiler.isghosttype(dt) || Core.Compiler.isconstType(dt) to_pass = map(!predicate, sig.parameters) - call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]] - call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]] + call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]] + call_args = Union{Expr, Symbol}[x[1] for x in zip(args, to_pass) if x[2]] # replace non-isbits arguments (they should be unused, or compilation would have failed) - for (i,dt) in enumerate(call_t) + for (i, dt) in enumerate(call_t) if !isbitstype(dt) call_t[i] = Ptr{Any} call_args[i] = :C_NULL @@ -163,17 +170,16 @@ abstract type AbstractKernel{F,TT} end # finalize types call_tt = Base.to_tuple_type(call_t) - quote + return quote svm_pointers = Ptr{Cvoid}[] $cl.clcall(kernel.fun, $call_tt, $(call_args...); svm_pointers, call_kwargs...) end end - ## host-side kernels -struct HostKernel{F,TT} <: AbstractKernel{F,TT} +struct HostKernel{F, TT} <: AbstractKernel{F, TT} f::F fun::cl.Kernel end @@ -183,7 +189,7 @@ end const clfunction_lock = ReentrantLock() -function clfunction(f::F, tt::TT=Tuple{}; kwargs...) where {F,TT} +function clfunction(f::F, tt::TT = Tuple{}; kwargs...) where {F, TT} ctx = context() dev = device() @@ -200,10 +206,10 @@ function clfunction(f::F, tt::TT=Tuple{}; kwargs...) where {F,TT} kernel = get(_kernel_instances, h, nothing) if kernel === nothing # create the kernel state object - kernel = HostKernel{F,tt}(f, fun) + kernel = HostKernel{F, tt}(f, fun) _kernel_instances[h] = kernel end - return kernel::HostKernel{F,tt} + return kernel::HostKernel{F, tt} end end diff --git a/src/pocl/compiler/reflection.jl b/src/pocl/compiler/reflection.jl index a0c9c5eb..55dd8ba6 100644 --- a/src/pocl/compiler/reflection.jl +++ b/src/pocl/compiler/reflection.jl @@ -24,13 +24,15 @@ for method in (:code_typed, :code_warntype, :code_llvm, :code_native) args = method == :code_typed ? (:job,) : (:io, :job) @eval begin - function $method(io::IO, @nospecialize(func), @nospecialize(types); - kernel::Bool=false, kwargs...) + function $method( + io::IO, @nospecialize(func), @nospecialize(types); + kernel::Bool = false, kwargs... + ) compiler_kwargs, kwargs = split_kwargs_runtime(kwargs, COMPILER_KWARGS) source = methodinstance(typeof(func), Base.to_tuple_type(types)) config = compiler_config(device(); kernel, compiler_kwargs...) job = CompilerJob(source, config) - GPUCompiler.$method($(args...); kwargs...) + return GPUCompiler.$method($(args...); kwargs...) end $method(@nospecialize(func), @nospecialize(types); kwargs...) = $method(stdout, func, types; kwargs...) @@ -38,13 +40,12 @@ for method in (:code_typed, :code_warntype, :code_llvm, :code_native) end - # # @device_code_* functions # export @device_code_lowered, @device_code_typed, @device_code_warntype, @device_code_llvm, - @device_code_native, @device_code + @device_code_native, @device_code # forward to GPUCompiler @eval $(Symbol("@device_code_lowered")) = $(getfield(GPUCompiler, Symbol("@device_code_lowered"))) @@ -70,5 +71,5 @@ function return_type(@nospecialize(func), @nospecialize(tt)) job = CompilerJob(source, config) interp = GPUCompiler.get_interpreter(job) sig = Base.signature_type(func, tt) - Core.Compiler.return_type(interp, sig) + return Core.Compiler.return_type(interp, sig) end diff --git a/src/pocl/device/array.jl b/src/pocl/device/array.jl index f9726222..63757911 100644 --- a/src/pocl/device/array.jl +++ b/src/pocl/device/array.jl @@ -8,8 +8,8 @@ export CLDeviceArray, CLDeviceVector, CLDeviceMatrix, CLLocalArray # NOTE: we can't support the typical `tuple or series of integer` style construction, # because we're currently requiring a trailing pointer argument. -struct CLDeviceArray{T,N,A} <: DenseArray{T,N} - ptr::LLVMPtr{T,A} +struct CLDeviceArray{T, N, A} <: DenseArray{T, N} + ptr::LLVMPtr{T, A} maxsize::Int dims::Dims{N} @@ -17,27 +17,29 @@ struct CLDeviceArray{T,N,A} <: DenseArray{T,N} # inner constructors, fully parameterized, exact types (ie. Int not <:Integer) # TODO: deprecate; put `ptr` first like oneArray - CLDeviceArray{T,N,A}(dims::Dims{N}, ptr::LLVMPtr{T,A}, - maxsize::Int=prod(dims)*sizeof(T)) where {T,A,N} = + CLDeviceArray{T, N, A}( + dims::Dims{N}, ptr::LLVMPtr{T, A}, + maxsize::Int = prod(dims) * sizeof(T) + ) where {T, A, N} = new(ptr, maxsize, dims, prod(dims)) end -const CLDeviceVector = CLDeviceArray{T,1,A} where {T,A} -const CLDeviceMatrix = CLDeviceArray{T,2,A} where {T,A} +const CLDeviceVector = CLDeviceArray{T, 1, A} where {T, A} +const CLDeviceMatrix = CLDeviceArray{T, 2, A} where {T, A} # outer constructors, non-parameterized -CLDeviceArray(dims::NTuple{N,<:Integer}, p::LLVMPtr{T,A}) where {T,A,N} = CLDeviceArray{T,N,A}(dims, p) -CLDeviceArray(len::Integer, p::LLVMPtr{T,A}) where {T,A} = CLDeviceVector{T,A}((len,), p) +CLDeviceArray(dims::NTuple{N, <:Integer}, p::LLVMPtr{T, A}) where {T, A, N} = CLDeviceArray{T, N, A}(dims, p) +CLDeviceArray(len::Integer, p::LLVMPtr{T, A}) where {T, A} = CLDeviceVector{T, A}((len,), p) # outer constructors, partially parameterized -CLDeviceArray{T}(dims::NTuple{N,<:Integer}, p::LLVMPtr{T,A}) where {T,A,N} = CLDeviceArray{T,N,A}(dims, p) -CLDeviceArray{T}(len::Integer, p::LLVMPtr{T,A}) where {T,A} = CLDeviceVector{T,A}((len,), p) -CLDeviceArray{T,N}(dims::NTuple{N,<:Integer}, p::LLVMPtr{T,A}) where {T,A,N} = CLDeviceArray{T,N,A}(dims, p) -CLDeviceVector{T}(len::Integer, p::LLVMPtr{T,A}) where {T,A} = CLDeviceVector{T,A}((len,), p) +CLDeviceArray{T}(dims::NTuple{N, <:Integer}, p::LLVMPtr{T, A}) where {T, A, N} = CLDeviceArray{T, N, A}(dims, p) +CLDeviceArray{T}(len::Integer, p::LLVMPtr{T, A}) where {T, A} = CLDeviceVector{T, A}((len,), p) +CLDeviceArray{T, N}(dims::NTuple{N, <:Integer}, p::LLVMPtr{T, A}) where {T, A, N} = CLDeviceArray{T, N, A}(dims, p) +CLDeviceVector{T}(len::Integer, p::LLVMPtr{T, A}) where {T, A} = CLDeviceVector{T, A}((len,), p) # outer constructors, fully parameterized -CLDeviceArray{T,N,A}(dims::NTuple{N,<:Integer}, p::LLVMPtr{T,A}) where {T,A,N} = CLDeviceArray{T,N,A}(Int.(dims), p) -CLDeviceVector{T,A}(len::Integer, p::LLVMPtr{T,A}) where {T,A} = CLDeviceVector{T,A}((Int(len),), p) +CLDeviceArray{T, N, A}(dims::NTuple{N, <:Integer}, p::LLVMPtr{T, A}) where {T, A, N} = CLDeviceArray{T, N, A}(Int.(dims), p) +CLDeviceVector{T, A}(len::Integer, p::LLVMPtr{T, A}) where {T, A} = CLDeviceVector{T, A}((Int(len),), p) ## array interface @@ -50,19 +52,19 @@ Base.sizeof(x::CLDeviceArray) = Base.elsize(x) * length(x) # we store the array length too; computing prod(size) is expensive Base.length(g::CLDeviceArray) = g.len -Base.pointer(x::CLDeviceArray{T,<:Any,A}) where {T,A} = Base.unsafe_convert(LLVMPtr{T,A}, x) -@inline function Base.pointer(x::CLDeviceArray{T,<:Any,A}, i::Integer) where {T,A} - Base.unsafe_convert(LLVMPtr{T,A}, x) + Base._memory_offset(x, i) +Base.pointer(x::CLDeviceArray{T, <:Any, A}) where {T, A} = Base.unsafe_convert(LLVMPtr{T, A}, x) +@inline function Base.pointer(x::CLDeviceArray{T, <:Any, A}, i::Integer) where {T, A} + return Base.unsafe_convert(LLVMPtr{T, A}, x) + Base._memory_offset(x, i) end -typetagdata(a::CLDeviceArray{<:Any,<:Any,A}, i=1) where {A} = - reinterpret(LLVMPtr{UInt8,A}, a.ptr + a.maxsize) + i - one(i) +typetagdata(a::CLDeviceArray{<:Any, <:Any, A}, i = 1) where {A} = + reinterpret(LLVMPtr{UInt8, A}, a.ptr + a.maxsize) + i - one(i) ## conversions -Base.unsafe_convert(::Type{LLVMPtr{T,A}}, x::CLDeviceArray{T,<:Any,A}) where {T,A} = - x.ptr +Base.unsafe_convert(::Type{LLVMPtr{T, A}}, x::CLDeviceArray{T, <:Any, A}) where {T, A} = + x.ptr ## indexing intrinsics @@ -72,7 +74,7 @@ Base.unsafe_convert(::Type{LLVMPtr{T,A}}, x::CLDeviceArray{T,<:Any,A}) where {T, # (cfr. shared memory and its wider-than-datatype alignment) @generated function alignment(::CLDeviceArray{T}) where {T} - if Base.isbitsunion(T) + return if Base.isbitsunion(T) _, sz, al = Base.uniontype_layout(T) al else @@ -91,10 +93,10 @@ end @inline function arrayref_bits(A::CLDeviceArray{T}, index::Integer) where {T} align = alignment(A) - unsafe_load(pointer(A), index, Val(align)) + return unsafe_load(pointer(A), index, Val(align)) end -@inline @generated function arrayref_union(A::CLDeviceArray{T,<:Any,AS}, index::Integer) where {T,AS} +@inline @generated function arrayref_union(A::CLDeviceArray{T, <:Any, AS}, index::Integer) where {T, AS} typs = Base.uniontypes(T) # generate code that conditionally loads a value based on the selector value. @@ -102,8 +104,8 @@ end ex = :(Base.llvmcall("unreachable", $T, Tuple{})) for (sel, typ) in Iterators.reverse(enumerate(typs)) ex = quote - if selector == $(sel-1) - ptr = reinterpret(LLVMPtr{$typ,AS}, data_ptr) + if selector == $(sel - 1) + ptr = reinterpret(LLVMPtr{$typ, AS}, data_ptr) unsafe_load(ptr, 1, Val(align)) else $ex @@ -111,7 +113,7 @@ end end end - quote + return quote selector_ptr = typetagdata(A, index) selector = unsafe_load(selector_ptr) @@ -134,21 +136,21 @@ end @inline function arrayset_bits(A::CLDeviceArray{T}, x::T, index::Integer) where {T} align = alignment(A) - unsafe_store!(pointer(A), x, index, Val(align)) + return unsafe_store!(pointer(A), x, index, Val(align)) end -@inline @generated function arrayset_union(A::CLDeviceArray{T,<:Any,AS}, x::T, index::Integer) where {T,AS} +@inline @generated function arrayset_union(A::CLDeviceArray{T, <:Any, AS}, x::T, index::Integer) where {T, AS} typs = Base.uniontypes(T) sel = findfirst(isequal(x), typs) - quote + return quote selector_ptr = typetagdata(A, index) - unsafe_store!(selector_ptr, $(UInt8(sel-1))) + unsafe_store!(selector_ptr, $(UInt8(sel - 1))) align = alignment(A) data_ptr = pointer(A, index) - unsafe_store!(reinterpret(LLVMPtr{$x,AS}, data_ptr), x, 1, Val(align)) + unsafe_store!(reinterpret(LLVMPtr{$x, AS}, data_ptr), x, 1, Val(align)) return end end @@ -167,7 +169,7 @@ Base.IndexStyle(::Type{<:CLDeviceArray}) = Base.IndexLinear() Base.@propagate_inbounds Base.getindex(A::CLDeviceArray{T}, i1::Integer) where {T} = arrayref(A, i1) Base.@propagate_inbounds Base.setindex!(A::CLDeviceArray{T}, x, i1::Integer) where {T} = - arrayset(A, convert(T,x)::T, i1) + arrayset(A, convert(T, x)::T, i1) # preserve the specific integer type when indexing device arrays, # to avoid extending 32-bit hardware indices to 64-bit. @@ -175,11 +177,15 @@ Base.to_index(::CLDeviceArray, i::Integer) = i # Base doesn't like Integer indices, so we need our own ND get and setindex! routines. # See also: https://github.com/JuliaLang/julia/pull/42289 -Base.@propagate_inbounds Base.getindex(A::CLDeviceArray, - I::Union{Integer, CartesianIndex}...) = +Base.@propagate_inbounds Base.getindex( + A::CLDeviceArray, + I::Union{Integer, CartesianIndex}... +) = A[Base._to_linear_index(A, to_indices(A, I)...)] -Base.@propagate_inbounds Base.setindex!(A::CLDeviceArray, x, - I::Union{Integer, CartesianIndex}...) = +Base.@propagate_inbounds Base.setindex!( + A::CLDeviceArray, x, + I::Union{Integer, CartesianIndex}... +) = A[Base._to_linear_index(A, to_indices(A, I)...)] = x @@ -196,8 +202,8 @@ This API can only be used on devices with compute capability 3.5 or higher. !!! warning Experimental API. Subject to change without deprecation. """ -struct Const{T,N,AS} <: DenseArray{T,N} - a::CLDeviceArray{T,N,AS} +struct Const{T, N, AS} <: DenseArray{T, N} + a::CLDeviceArray{T, N, AS} end Base.Experimental.Const(A::CLDeviceArray) = Const(A) @@ -216,26 +222,26 @@ Base.show(io::IO, a::CLDeviceArray) = Base.show(io::IO, mime::MIME"text/plain", a::CLDeviceArray) = show(io, a) -@inline function Base.iterate(A::CLDeviceArray, i=1) - if (i % UInt) - 1 < length(A) +@inline function Base.iterate(A::CLDeviceArray, i = 1) + return if (i % UInt) - 1 < length(A) (@inbounds A[i], i + 1) else nothing end end -function Base.reinterpret(::Type{T}, a::CLDeviceArray{S,N,A}) where {T,S,N,A} - err = _reinterpret_exception(T, a) - err === nothing || throw(err) +function Base.reinterpret(::Type{T}, a::CLDeviceArray{S, N, A}) where {T, S, N, A} + err = _reinterpret_exception(T, a) + err === nothing || throw(err) - if sizeof(T) == sizeof(S) # fast case - return CLDeviceArray{T,N,A}(size(a), reinterpret(LLVMPtr{T,A}, a.ptr), a.maxsize) - end + if sizeof(T) == sizeof(S) # fast case + return CLDeviceArray{T, N, A}(size(a), reinterpret(LLVMPtr{T, A}, a.ptr), a.maxsize) + end - isize = size(a) - size1 = div(isize[1]*sizeof(S), sizeof(T)) - osize = tuple(size1, Base.tail(isize)...) - return CLDeviceArray{T,N,A}(osize, reinterpret(LLVMPtr{T,A}, a.ptr), a.maxsize) + isize = size(a) + size1 = div(isize[1] * sizeof(S), sizeof(T)) + osize = tuple(size1, Base.tail(isize)...) + return CLDeviceArray{T, N, A}(osize, reinterpret(LLVMPtr{T, A}, a.ptr), a.maxsize) end @@ -248,5 +254,5 @@ end # NOTE: this relies on const-prop to forward the literal length to the generator. # maybe we should include the size in the type, like StaticArrays does? ptr = emit_localmemory(T, Val(len)) - CLDeviceArray(dims, ptr) + return CLDeviceArray(dims, ptr) end diff --git a/src/pocl/nanoOpenCL.jl b/src/pocl/nanoOpenCL.jl index 3f9b4091..79da6bff 100644 --- a/src/pocl/nanoOpenCL.jl +++ b/src/pocl/nanoOpenCL.jl @@ -489,37 +489,53 @@ const cl_device_svm_capabilities = cl_bitfield const cl_command_queue_properties = cl_bitfield @checked function clGetPlatformIDs(num_entries, platforms, num_platforms) - @ccall libopencl.clGetPlatformIDs(num_entries::cl_uint, platforms::Ptr{cl_platform_id}, - num_platforms::Ptr{cl_uint})::cl_int + @ccall libopencl.clGetPlatformIDs( + num_entries::cl_uint, platforms::Ptr{cl_platform_id}, + num_platforms::Ptr{cl_uint} + )::cl_int end -@checked function clGetPlatformInfo(platform, param_name, param_value_size, param_value, - param_value_size_ret) - @ccall libopencl.clGetPlatformInfo(platform::cl_platform_id, - param_name::cl_platform_info, - param_value_size::Csize_t, param_value::Ptr{Cvoid}, - param_value_size_ret::Ptr{Csize_t})::cl_int +@checked function clGetPlatformInfo( + platform, param_name, param_value_size, param_value, + param_value_size_ret + ) + @ccall libopencl.clGetPlatformInfo( + platform::cl_platform_id, + param_name::cl_platform_info, + param_value_size::Csize_t, param_value::Ptr{Cvoid}, + param_value_size_ret::Ptr{Csize_t} + )::cl_int end @checked function clGetDeviceIDs(platform, device_type, num_entries, devices, num_devices) - @ccall libopencl.clGetDeviceIDs(platform::cl_platform_id, device_type::cl_device_type, - num_entries::cl_uint, devices::Ptr{cl_device_id}, - num_devices::Ptr{cl_uint})::cl_int + @ccall libopencl.clGetDeviceIDs( + platform::cl_platform_id, device_type::cl_device_type, + num_entries::cl_uint, devices::Ptr{cl_device_id}, + num_devices::Ptr{cl_uint} + )::cl_int end -@checked function clGetDeviceInfo(device, param_name, param_value_size, param_value, - param_value_size_ret) - @ccall libopencl.clGetDeviceInfo(device::cl_device_id, param_name::cl_device_info, - param_value_size::Csize_t, param_value::Ptr{Cvoid}, - param_value_size_ret::Ptr{Csize_t})::cl_int +@checked function clGetDeviceInfo( + device, param_name, param_value_size, param_value, + param_value_size_ret + ) + @ccall libopencl.clGetDeviceInfo( + device::cl_device_id, param_name::cl_device_info, + param_value_size::Csize_t, param_value::Ptr{Cvoid}, + param_value_size_ret::Ptr{Csize_t} + )::cl_int end -function clCreateContext(properties, num_devices, devices, pfn_notify, user_data, - errcode_ret) - @ccall libopencl.clCreateContext(properties::Ptr{cl_context_properties}, - num_devices::cl_uint, devices::Ptr{cl_device_id}, - pfn_notify::Ptr{Cvoid}, user_data::Ptr{Cvoid}, - errcode_ret::Ptr{cl_int})::cl_context +function clCreateContext( + properties, num_devices, devices, pfn_notify, user_data, + errcode_ret + ) + return @ccall libopencl.clCreateContext( + properties::Ptr{cl_context_properties}, + num_devices::cl_uint, devices::Ptr{cl_device_id}, + pfn_notify::Ptr{Cvoid}, user_data::Ptr{Cvoid}, + errcode_ret::Ptr{cl_int} + )::cl_context end @checked function clReleaseContext(context) @@ -527,41 +543,57 @@ end end function clCreateProgramWithIL(context, il, length, errcode_ret) - @ccall libopencl.clCreateProgramWithIL(context::cl_context, il::Ptr{Cvoid}, - length::Csize_t, - errcode_ret::Ptr{cl_int})::cl_program + return @ccall libopencl.clCreateProgramWithIL( + context::cl_context, il::Ptr{Cvoid}, + length::Csize_t, + errcode_ret::Ptr{cl_int} + )::cl_program end @checked function clReleaseProgram(program) @ccall libopencl.clReleaseProgram(program::cl_program)::cl_int end -@checked function clBuildProgram(program, num_devices, device_list, options, pfn_notify, - user_data) - @ccall libopencl.clBuildProgram(program::cl_program, num_devices::cl_uint, - device_list::Ptr{cl_device_id}, options::Ptr{Cchar}, - pfn_notify::Ptr{Cvoid}, user_data::Ptr{Cvoid})::cl_int +@checked function clBuildProgram( + program, num_devices, device_list, options, pfn_notify, + user_data + ) + @ccall libopencl.clBuildProgram( + program::cl_program, num_devices::cl_uint, + device_list::Ptr{cl_device_id}, options::Ptr{Cchar}, + pfn_notify::Ptr{Cvoid}, user_data::Ptr{Cvoid} + )::cl_int end -@checked function clGetProgramInfo(program, param_name, param_value_size, param_value, - param_value_size_ret) - @ccall libopencl.clGetProgramInfo(program::cl_program, param_name::cl_program_info, - param_value_size::Csize_t, param_value::Ptr{Cvoid}, - param_value_size_ret::Ptr{Csize_t})::cl_int +@checked function clGetProgramInfo( + program, param_name, param_value_size, param_value, + param_value_size_ret + ) + @ccall libopencl.clGetProgramInfo( + program::cl_program, param_name::cl_program_info, + param_value_size::Csize_t, param_value::Ptr{Cvoid}, + param_value_size_ret::Ptr{Csize_t} + )::cl_int end -@checked function clGetProgramBuildInfo(program, device, param_name, param_value_size, - param_value, param_value_size_ret) - @ccall libopencl.clGetProgramBuildInfo(program::cl_program, device::cl_device_id, - param_name::cl_program_build_info, - param_value_size::Csize_t, - param_value::Ptr{Cvoid}, - param_value_size_ret::Ptr{Csize_t})::cl_int +@checked function clGetProgramBuildInfo( + program, device, param_name, param_value_size, + param_value, param_value_size_ret + ) + @ccall libopencl.clGetProgramBuildInfo( + program::cl_program, device::cl_device_id, + param_name::cl_program_build_info, + param_value_size::Csize_t, + param_value::Ptr{Cvoid}, + param_value_size_ret::Ptr{Csize_t} + )::cl_int end function clCreateKernel(program, kernel_name, errcode_ret) - @ccall libopencl.clCreateKernel(program::cl_program, kernel_name::Ptr{Cchar}, - errcode_ret::Ptr{cl_int})::cl_kernel + return @ccall libopencl.clCreateKernel( + program::cl_program, kernel_name::Ptr{Cchar}, + errcode_ret::Ptr{cl_int} + )::cl_kernel end @checked function clReleaseKernel(kernel) @@ -569,42 +601,56 @@ end end @checked function clSetKernelArg(kernel, arg_index, arg_size, arg_value) - @ccall libopencl.clSetKernelArg(kernel::cl_kernel, arg_index::cl_uint, - arg_size::Csize_t, arg_value::Ptr{Cvoid})::cl_int + @ccall libopencl.clSetKernelArg( + kernel::cl_kernel, arg_index::cl_uint, + arg_size::Csize_t, arg_value::Ptr{Cvoid} + )::cl_int end @checked function clSetKernelArgSVMPointer(kernel, arg_index, arg_value) - @ccall libopencl.clSetKernelArgSVMPointer(kernel::cl_kernel, arg_index::cl_uint, - arg_value::Ptr{Cvoid})::cl_int + @ccall libopencl.clSetKernelArgSVMPointer( + kernel::cl_kernel, arg_index::cl_uint, + arg_value::Ptr{Cvoid} + )::cl_int end -@checked function clGetKernelWorkGroupInfo(kernel, device, param_name, param_value_size, - param_value, param_value_size_ret) - @ccall libopencl.clGetKernelWorkGroupInfo(kernel::cl_kernel, device::cl_device_id, - param_name::cl_kernel_work_group_info, - param_value_size::Csize_t, - param_value::Ptr{Cvoid}, - param_value_size_ret::Ptr{Csize_t})::cl_int +@checked function clGetKernelWorkGroupInfo( + kernel, device, param_name, param_value_size, + param_value, param_value_size_ret + ) + @ccall libopencl.clGetKernelWorkGroupInfo( + kernel::cl_kernel, device::cl_device_id, + param_name::cl_kernel_work_group_info, + param_value_size::Csize_t, + param_value::Ptr{Cvoid}, + param_value_size_ret::Ptr{Csize_t} + )::cl_int end -@checked function clEnqueueNDRangeKernel(command_queue, kernel, work_dim, - global_work_offset, global_work_size, - local_work_size, num_events_in_wait_list, - event_wait_list, event) - @ccall libopencl.clEnqueueNDRangeKernel(command_queue::cl_command_queue, - kernel::cl_kernel, work_dim::cl_uint, - global_work_offset::Ptr{Csize_t}, - global_work_size::Ptr{Csize_t}, - local_work_size::Ptr{Csize_t}, - num_events_in_wait_list::cl_uint, - event_wait_list::Ptr{cl_event}, - event::Ptr{cl_event})::cl_int +@checked function clEnqueueNDRangeKernel( + command_queue, kernel, work_dim, + global_work_offset, global_work_size, + local_work_size, num_events_in_wait_list, + event_wait_list, event + ) + @ccall libopencl.clEnqueueNDRangeKernel( + command_queue::cl_command_queue, + kernel::cl_kernel, work_dim::cl_uint, + global_work_offset::Ptr{Csize_t}, + global_work_size::Ptr{Csize_t}, + local_work_size::Ptr{Csize_t}, + num_events_in_wait_list::cl_uint, + event_wait_list::Ptr{cl_event}, + event::Ptr{cl_event} + )::cl_int end function clCreateCommandQueue(context, device, properties, errcode_ret) - @ccall libopencl.clCreateCommandQueue(context::cl_context, device::cl_device_id, - properties::cl_command_queue_properties, - errcode_ret::Ptr{cl_int})::cl_command_queue + return @ccall libopencl.clCreateCommandQueue( + context::cl_context, device::cl_device_id, + properties::cl_command_queue_properties, + errcode_ret::Ptr{cl_int} + )::cl_command_queue end @checked function clReleaseCommandQueue(command_queue) @@ -626,7 +672,7 @@ const initialized = Ref{Bool}(false) if is_high_integrity_level() @warn """Running at high integrity level, preventing OpenCL.jl from loading drivers from JLLs. - Only system drivers will be available. To enable JLL drivers, do not run Julia as an administrator.""" + Only system drivers will be available. To enable JLL drivers, do not run Julia as an administrator.""" end end @@ -635,17 +681,18 @@ const initialized = Ref{Bool}(false) ocd_filenames *= ":" * ENV["OCL_ICD_FILENAMES"] end - withenv("OCL_ICD_FILENAMES"=>ocd_filenames) do + return withenv("OCL_ICD_FILENAMES" => ocd_filenames) do num_platforms = Ref{Cuint}() @ccall libopencl.clGetPlatformIDs( 0::cl_uint, C_NULL::Ptr{cl_platform_id}, - num_platforms::Ptr{cl_uint})::cl_int + num_platforms::Ptr{cl_uint} + )::cl_int if num_platforms[] == 0 && isempty(OpenCL_jll.drivers) @error """No OpenCL drivers available, either system-wide or provided by a JLL. - Please install a system-wide OpenCL driver, or load one together with OpenCL.jl, - e.g., by doing `using OpenCL, pocl_jll`.""" + Please install a system-wide OpenCL driver, or load one together with OpenCL.jl, + e.g., by doing `using OpenCL, pocl_jll`.""" end end end @@ -691,7 +738,7 @@ function Base.getproperty(p::Platform, s::Symbol) error("Could not parse OpenCL version string: $str") end return strip(m["vendor"]) - elseif s === :opencl_version + elseif s === :opencl_version str = get_string(CL_PLATFORM_VERSION) m = match(version_re, str) if m === nothing @@ -703,7 +750,7 @@ function Base.getproperty(p::Platform, s::Symbol) elseif s === :vendor return get_string(CL_PLATFORM_VENDOR) end - + if s == :extensions size = Ref{Csize_t}() clGetPlatformInfo(p, CL_PLATFORM_EXTENSIONS, 0, C_NULL, size) @@ -868,17 +915,17 @@ devices(p::Platform) = devices(p, CL_DEVICE_TYPE_ALL) end if s == :max_image2d_shape - width = Ref{Csize_t}() + width = Ref{Csize_t}() height = Ref{Csize_t}() - clGetDeviceInfo(d, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(Csize_t), width, C_NULL) + clGetDeviceInfo(d, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(Csize_t), width, C_NULL) clGetDeviceInfo(d, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(Csize_t), height, C_NULL) return (width[], height[]) end if s == :max_image3d_shape - width = Ref{Csize_t}() + width = Ref{Csize_t}() height = Ref{Csize_t}() - depth = Ref{Csize_t}() + depth = Ref{Csize_t}() clGetDeviceInfo(d, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(Csize_t), width, C_NULL) clGetDeviceInfo(d, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(Csize_t), height, C_NULL) clGetDeviceInfo(d, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(Csize_t), depth, C_NULL) @@ -905,7 +952,8 @@ function Context(device::Device) err_code = Ref{Cint}() ctx_id = clCreateContext( - C_NULL, 1, device_id, C_NULL, C_NULL, err_code) + C_NULL, 1, device_id, C_NULL, C_NULL, err_code + ) if err_code[] != CL_SUCCESS throw(CLError(err_code[])) end @@ -930,11 +978,11 @@ function Program(il, ctx) if err_code[] != CL_SUCCESS throw(CLError(err_code[])) end - Program(program_id) + return Program(program_id) end #TODO: build callback... -function build!(p::Program; options="") +function build!(p::Program; options = "") opts = String(options) ndevices = 0 device_ids = C_NULL @@ -950,7 +998,7 @@ function build!(p::Program; options="") if p.source !== nothing println(io) println(io, "Source code:") - for (i,line) in enumerate(split(p.source, "\n")) + for (i, line) in enumerate(split(p.source, "\n")) println(io, @sprintf("%s%-2d: %s", " ", i, line)) end end @@ -1019,7 +1067,7 @@ function Base.getproperty(p::Program, s::Symbol) elseif s == :context ctx = Ref{cl_context}() clGetProgramInfo(p, CL_PROGRAM_CONTEXT, sizeof(cl_context), ctx, C_NULL) - return Context(ctx[], retain=true) + return Context(ctx[], retain = true) elseif s == :build_status status_dict = Dict{Device, cl_build_status}() for device in p.devices @@ -1074,7 +1122,7 @@ struct LocalMem{T} nbytes::Csize_t end -function LocalMem(::Type{T}, len::Integer) where T +function LocalMem(::Type{T}, len::Integer) where {T} @assert len > 0 nbytes = sizeof(T) * len return LocalMem{T}(convert(Csize_t, nbytes)) @@ -1091,7 +1139,7 @@ Base.unsafe_convert(::Type{Ptr{T}}, l::LocalMem{T}) where {T} = l function set_arg!(k::Kernel, idx::Integer, arg::Nothing) @assert idx > 0 - clSetKernelArg(k, cl_uint(idx-1), sizeof(cl_mem), C_NULL) + clSetKernelArg(k, cl_uint(idx - 1), sizeof(cl_mem), C_NULL) return k end @@ -1102,7 +1150,7 @@ end # return k # end ## when passing with `clcall`, which has pre-converted the buffer -function set_arg!(k::Kernel, idx::Integer, arg::Union{Ptr,Core.LLVMPtr}) +function set_arg!(k::Kernel, idx::Integer, arg::Union{Ptr, Core.LLVMPtr}) arg = reinterpret(Ptr{Cvoid}, arg) if arg != C_NULL # XXX: this assumes that the receiving argument is pointer-typed, which is not the @@ -1110,7 +1158,7 @@ function set_arg!(k::Kernel, idx::Integer, arg::Union{Ptr,Core.LLVMPtr}) # `Core.LLVMPtr`, which _is_ pointer-valued. We retain this handling for `Ptr` # for users passing pointers to OpenCL C, and because `Ptr` is pointer-valued # starting with Julia 1.12. - clSetKernelArgSVMPointer(k, cl_uint(idx-1), arg) + clSetKernelArgSVMPointer(k, cl_uint(idx - 1), arg) end return k end @@ -1123,24 +1171,26 @@ end # end function set_arg!(k::Kernel, idx::Integer, arg::LocalMem) - clSetKernelArg(k, cl_uint(idx-1), arg.nbytes, C_NULL) + clSetKernelArg(k, cl_uint(idx - 1), arg.nbytes, C_NULL) return k end -function set_arg!(k::Kernel, idx::Integer, arg::T) where T +function set_arg!(k::Kernel, idx::Integer, arg::T) where {T} ref = Ref(arg) tsize = sizeof(ref) err = unchecked_clSetKernelArg(k, cl_uint(idx - 1), tsize, ref) if err == CL_INVALID_ARG_SIZE - error("""Mismatch between Julia and OpenCL type for kernel argument $idx. - - Possible reasons: - - OpenCL does not support empty types. - - Vectors of length 3 (e.g., `float3`) are packed as 4-element vectors; - consider padding your tuples. - - The alignment of fields in your struct may not match the OpenCL layout. - Make sure your Julia definition matches the OpenCL layout, e.g., by - using `__attribute__((packed))` in your OpenCL struct definition.""") + error( + """Mismatch between Julia and OpenCL type for kernel argument $idx. + + Possible reasons: + - OpenCL does not support empty types. + - Vectors of length 3 (e.g., `float3`) are packed as 4-element vectors; + consider padding your tuples. + - The alignment of fields in your struct may not match the OpenCL layout. + Make sure your Julia definition matches the OpenCL layout, e.g., by + using `__attribute__((packed))` in your OpenCL struct definition.""" + ) elseif err != CL_SUCCESS throw(CLError(err)) end @@ -1151,12 +1201,15 @@ function set_args!(k::Kernel, args...) for (i, a) in enumerate(args) set_arg!(k, i, a) end + return end -function enqueue_kernel(k::Kernel, global_work_size, local_work_size=nothing; - global_work_offset=nothing) +function enqueue_kernel( + k::Kernel, global_work_size, local_work_size = nothing; + global_work_offset = nothing + ) max_work_dim = device().max_work_item_dims - work_dim = length(global_work_size) + work_dim = length(global_work_size) if work_dim > max_work_dim throw(ArgumentError("global_work_size has max dim of $max_work_dim")) end @@ -1201,20 +1254,26 @@ function enqueue_kernel(k::Kernel, global_work_size, local_work_size=nothing; wait_event_ids = C_NULL ret_event = C_NULL - clEnqueueNDRangeKernel(queue(), k, cl_uint(work_dim), goffset, gsize, lsize, - n_events, wait_event_ids, ret_event) + clEnqueueNDRangeKernel( + queue(), k, cl_uint(work_dim), goffset, gsize, lsize, + n_events, wait_event_ids, ret_event + ) return nothing end -function call(k::Kernel, args...; global_size=(1,), local_size=nothing, - global_work_offset=nothing, - svm_pointers::Vector{Ptr{Cvoid}}=Ptr{Cvoid}[]) +function call( + k::Kernel, args...; global_size = (1,), local_size = nothing, + global_work_offset = nothing, + svm_pointers::Vector{Ptr{Cvoid}} = Ptr{Cvoid}[] + ) set_args!(k, args...) if !isempty(svm_pointers) - clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_SVM_PTRS, - sizeof(svm_pointers), svm_pointers) + clSetKernelExecInfo( + k, CL_KERNEL_EXEC_INFO_SVM_PTRS, + sizeof(svm_pointers), svm_pointers + ) end - enqueue_kernel(k, global_size, local_size; global_work_offset) + return enqueue_kernel(k, global_size, local_size; global_work_offset) end # convert the argument values to match the kernel's signature (specified by the user) @@ -1233,23 +1292,27 @@ end push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(types[i]), $(converted_args[i])))) end - append!(ex.args, (quote - GC.@preserve $(converted_args...) begin - f($(arg_ptrs...)) - end - end).args) + append!( + ex.args, ( + quote + GC.@preserve $(converted_args...) begin + f($(arg_ptrs...)) + end + end + ).args + ) return ex end -clcall(f::F, types::Tuple, args::Vararg{Any,N}; kwargs...) where {N,F} = +clcall(f::F, types::Tuple, args::Vararg{Any, N}; kwargs...) where {N, F} = clcall(f, _to_tuple_type(types), args...; kwargs...) -function clcall(k::Kernel, types::Type{T}, args::Vararg{Any,N}; kwargs...) where {T,N} - call_closure = function (converted_args::Vararg{Any,N}) - call(k, converted_args...; kwargs...) +function clcall(k::Kernel, types::Type{T}, args::Vararg{Any, N}; kwargs...) where {T, N} + call_closure = function (converted_args::Vararg{Any, N}) + return call(k, converted_args...; kwargs...) end - convert_arguments(call_closure, types, args...) + return convert_arguments(call_closure, types, args...) end struct KernelWorkGroupInfo @@ -1268,7 +1331,7 @@ function Base.getproperty(ki::KernelWorkGroupInfo, s::Symbol) return result[] end - if s == :size + return if s == :size Int(get(CL_KERNEL_WORK_GROUP_SIZE, Csize_t)) elseif s == :compile_size Int.(get(CL_KERNEL_COMPILE_WORK_GROUP_SIZE, NTuple{3, Csize_t})) @@ -1289,7 +1352,7 @@ mutable struct CmdQueue function CmdQueue(q_id::cl_command_queue) q = new(q_id) finalizer(q) do _ - clReleaseCommandQueue(q) + clReleaseCommandQueue(q) end return q end @@ -1298,7 +1361,7 @@ end Base.unsafe_convert(::Type{cl_command_queue}, q::CmdQueue) = q.id function CmdQueue() - flags=cl_command_queue_properties(0) + flags = cl_command_queue_properties(0) err_code = Ref{Cint}() queue_id = clCreateCommandQueue(context(), device(), flags, err_code) if err_code[] != CL_SUCCESS @@ -1315,4 +1378,4 @@ function finish(q::CmdQueue) return q end -end \ No newline at end of file +end diff --git a/src/pocl/pocl.jl b/src/pocl/pocl.jl index a5c68b7b..9c995daf 100644 --- a/src/pocl/pocl.jl +++ b/src/pocl/pocl.jl @@ -2,15 +2,15 @@ module POCL function platform end function device end -function context end -function queue end +function context end +function queue end include("nanoOpenCL.jl") import .nanoOpenCL as cl function platform() - get!(task_local_storage(), :POCLPlatform) do + return get!(task_local_storage(), :POCLPlatform) do for p in cl.platforms() if p.vendor == "The pocl project" return p @@ -21,7 +21,7 @@ function platform() end function device() - get!(task_local_storage(), :POCLDevice) do + return get!(task_local_storage(), :POCLDevice) do p = platform() return cl.default_device(p) end::cl.Device @@ -29,13 +29,13 @@ end # TODO: add a device context dict function context() - get!(task_local_storage(), :POCLContext) do + return get!(task_local_storage(), :POCLContext) do cl.Context(device()) end::cl.Context end function queue() - get!(task_local_storage(), :POCLQueue) do + return get!(task_local_storage(), :POCLQueue) do cl.CmdQueue() end::cl.CmdQueue end @@ -58,8 +58,8 @@ include("device/array.jl") include("device/quirks.jl") include("device/runtime.jl") -function Adapt.adapt_storage(to::KernelAdaptor, xs::Array{T,N}) where {T,N} - CLDeviceArray{T,N,AS.Global}(size(xs), reinterpret(LLVMPtr{T,AS.Global}, pointer(xs))) +function Adapt.adapt_storage(to::KernelAdaptor, xs::Array{T, N}) where {T, N} + return CLDeviceArray{T, N, AS.Global}(size(xs), reinterpret(LLVMPtr{T, AS.Global}, pointer(xs))) end include("backend.jl") @@ -70,4 +70,4 @@ import KernelAbstractions as KA Adapt.adapt_storage(::POCLBackend, a::Array) = a -end \ No newline at end of file +end