1
+ import ClimaCore. DataLayouts:
2
+ to_non_extruded_broadcasted, has_uniform_datalayouts
1
3
# Arrays of type `CUDA.CuArray` are dispatched to the `ToCUDA` device tag.
function DataLayouts._device_dispatch(x::CUDA.CuArray)
    return ToCUDA()
end
2
4
3
# NOTE(review): every line of this kernel carries a `-` diff marker, so it appears to be
# deleted by this change; it is annotated here for reference only.
# Kernel body: each thread builds one Cartesian index from its thread/block coordinates
# and copies `src[I]` into `dest[I]`. Always returns `nothing`.
- function knl_copyto! (dest, src)
4
-
5
# i and j are the thread's x and y coordinates.
- i = CUDA. threadIdx (). x
6
- j = CUDA. threadIdx (). y
7
-
8
# h is the block's x coordinate; v combines the block's y coordinate (scaled by the
# block's z extent) with the thread's z coordinate.
- h = CUDA. blockIdx (). x
9
- v = CUDA. blockDim (). z * (CUDA. blockIdx (). y - 1 ) + CUDA. threadIdx (). z
10
-
11
# Threads whose v exceeds size(dest, 4) write nothing.
- if v <= size (dest, 4 )
12
# The third index component is fixed at 1.
- I = CartesianIndex ((i, j, 1 , v, h))
13
- @inbounds dest[I] = src[I]
14
- end
15
- return nothing
16
- end
17
-
18
# NOTE(review): every line of this method carries a `-` diff marker, so it appears to be
# deleted by this change (a one-line IJFH method forwarding to `cuda_copyto!` is present
# further down in the file).
# ToCUDA `copyto!` for IJFH data: launches `knl_copyto!` on `(dest, bc)` and returns `dest`.
- function Base. copyto! (
19
- dest:: IJFH{S, Nij, Nh} ,
20
- bc:: DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh} ,
21
- :: ToCUDA ,
22
- ) where {S, Nij, Nh}
23
# Nothing is launched unless Nh is positive.
- if Nh > 0
24
# Launch shape passed to auto_launch!: threads_s = (Nij, Nij), blocks_s = (Nh, 1)
# (presumably per-block thread dimensions and grid dimensions — confirm in auto_launch!).
- auto_launch! (
25
- knl_copyto!,
26
- (dest, bc),
27
- dest;
28
- threads_s = (Nij, Nij),
29
- blocks_s = (Nh, 1 ),
30
- )
31
- end
32
- return dest
33
- end
34
-
35
# NOTE(review): every line of this method carries a `-` diff marker, so it appears to be
# deleted by this change (a one-line VIJFH method forwarding to `cuda_copyto!` is present
# further down in the file).
# ToCUDA `copyto!` for VIJFH data: launches `knl_copyto!` on `(dest, bc)` and returns `dest`.
- function Base. copyto! (
36
- dest:: VIJFH{S, Nv, Nij, Nh} ,
37
- bc:: DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh} ,
38
- :: ToCUDA ,
39
- ) where {S, Nv, Nij, Nh}
40
# Nothing is launched unless both Nv and Nh are positive.
- if Nv > 0 && Nh > 0
41
# Cap Nij * Nij * Nv_per_block at 256, then split Nv into that many-sized chunks.
# NOTE(review): if Nij * Nij were larger than 256, fld(...) would be 0 and the cld below
# would divide by zero — confirm Nij is always small enough.
- Nv_per_block = min (Nv, fld (256 , Nij * Nij))
42
- Nv_blocks = cld (Nv, Nv_per_block)
43
- auto_launch! (
44
- knl_copyto!,
45
- (dest, bc),
46
- dest;
47
- threads_s = (Nij, Nij, Nv_per_block),
48
- blocks_s = (Nh, Nv_blocks),
49
- )
50
- end
51
- return dest
52
- end
53
-
54
# NOTE(review): every line of this method carries a `-` diff marker, so it appears to be
# deleted by this change (a one-line VF method forwarding to `cuda_copyto!` is present
# further down in the file).
# ToCUDA `copyto!` for VF data: launches `knl_copyto!` on `(dest, bc)` and returns `dest`.
- function Base. copyto! (
55
- dest:: VF{S, Nv} ,
56
- bc:: DataLayouts.BroadcastedUnionVF{S, Nv} ,
57
- :: ToCUDA ,
58
- ) where {S, Nv}
59
# Nothing is launched unless Nv is positive.
- if Nv > 0
60
# Launch shape passed to auto_launch!: threads_s = (1, 1), blocks_s = (1, Nv).
- auto_launch! (
61
- knl_copyto!,
62
- (dest, bc),
63
- dest;
64
- threads_s = (1 , 1 ),
65
- blocks_s = (1 , Nv),
66
- )
67
- end
68
- return dest
69
- end
70
-
71
# NOTE(review): every line of this method carries a `-` diff marker, so it appears to be
# deleted by this change (a one-line DataF method forwarding to `cuda_copyto!` is present
# further down in the file).
# ToCUDA `copyto!` for DataF data: always launches `knl_copyto!` on `(dest, bc)` (there is
# no size guard in this method) and returns `dest`.
- function Base. copyto! (
72
- dest:: DataF{S} ,
73
- bc:: DataLayouts.BroadcastedUnionDataF{S} ,
74
- :: ToCUDA ,
75
- ) where {S}
76
# Launch shape passed to auto_launch!: threads_s = (1, 1), blocks_s = (1, 1).
- auto_launch! (
77
- knl_copyto!,
78
- (dest, bc),
79
- dest;
80
- threads_s = (1 , 1 ),
81
- blocks_s = (1 , 1 ),
82
- )
83
- return dest
84
- end
85
-
86
5
import ClimaCore. DataLayouts: isascalar
87
- function knl_copyto_flat ! (dest:: AbstractData , bc, us)
6
+ function knl_copyto_cart ! (dest:: AbstractData , bc, us)
88
7
@inbounds begin
89
8
tidx = thread_index ()
90
9
if tidx ≤ get_N (us)
@@ -96,24 +15,38 @@ function knl_copyto_flat!(dest::AbstractData, bc, us)
96
15
return nothing
97
16
end
98
17
18
"""
    knl_copyto_linear!(dest::AbstractData, bc, us)

Kernel body that copies a single element of `bc` into `dest`, using the value of
`thread_index()` as the index into both. Threads whose index is greater than
`get_N(us)` write nothing. Always returns `nothing`.
"""
function knl_copyto_linear!(dest::AbstractData, bc, us)
    @inbounds begin
        n = thread_index()
        # Surplus threads (index past `get_N(us)`) fall through without writing.
        n ≤ get_N(us) && (dest[n] = bc[n])
    end
    return nothing
end
27
+
99
28
"""
    cuda_copyto!(dest::AbstractData, bc)

Evaluate the broadcast `bc` into `dest` by launching a kernel through `auto_launch!`,
and return `dest`.

Nothing is launched unless the last two entries of `DataLayouts.universal_size(dest)`
(`Nv` and `Nh`) are both positive. Otherwise the kernel is chosen as follows:

- when `has_uniform_datalayouts(bc)` holds, `bc` is first converted with
  `to_non_extruded_broadcasted` and copied by `knl_copyto_linear!`;
- otherwise `bc` is passed unchanged to `knl_copyto_cart!`.
"""
function cuda_copyto!(dest::AbstractData, bc)
    (_, _, Nv, Nh) = DataLayouts.universal_size(dest)
    # Empty layout: skip the kernel launch entirely.
    (Nv > 0 && Nh > 0) || return dest
    us = DataLayouts.UniversalSize(dest)
    if has_uniform_datalayouts(bc)
        bc′ = to_non_extruded_broadcasted(bc)
        auto_launch!(knl_copyto_linear!, (dest, bc′, us), dest; auto = true)
    else
        auto_launch!(knl_copyto_cart!, (dest, bc, us), dest; auto = true)
    end
    return dest
end
107
40
108
41
# TODO: can we use CUDA's launch configuration for all data layouts?
109
42
# Currently, it seems to have a slight performance degradation.
110
43
#! format: off
111
# Each layout below routes its ToCUDA `copyto!` to the shared `cuda_copyto!` launcher.
# NOTE(review): the IJFH method types `dest` as `IJFH{S, Nij}` (no `Nh`), unlike the
# `IFH{S, Ni, Nh}` method; `Nh` is bound only through `bc` — confirm this is intended.
Base.copyto!(dest::IJFH{S, Nij}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh}, ::ToCUDA) where {S, Nij, Nh} = cuda_copyto!(dest, bc)
Base.copyto!(dest::IFH{S, Ni, Nh}, bc::DataLayouts.BroadcastedUnionIFH{S, Ni, Nh}, ::ToCUDA) where {S, Ni, Nh} = cuda_copyto!(dest, bc)
Base.copyto!(dest::IJF{S, Nij}, bc::DataLayouts.BroadcastedUnionIJF{S, Nij}, ::ToCUDA) where {S, Nij} = cuda_copyto!(dest, bc)
Base.copyto!(dest::IF{S, Ni}, bc::DataLayouts.BroadcastedUnionIF{S, Ni}, ::ToCUDA) where {S, Ni} = cuda_copyto!(dest, bc)
Base.copyto!(dest::VIFH{S, Nv, Ni, Nh}, bc::DataLayouts.BroadcastedUnionVIFH{S, Nv, Ni, Nh}, ::ToCUDA) where {S, Nv, Ni, Nh} = cuda_copyto!(dest, bc)
Base.copyto!(dest::VIJFH{S, Nv, Nij, Nh}, bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh}, ::ToCUDA) where {S, Nv, Nij, Nh} = cuda_copyto!(dest, bc)
Base.copyto!(dest::VF{S, Nv}, bc::DataLayouts.BroadcastedUnionVF{S, Nv}, ::ToCUDA) where {S, Nv} = cuda_copyto!(dest, bc)
Base.copyto!(dest::DataF{S}, bc::DataLayouts.BroadcastedUnionDataF{S}, ::ToCUDA) where {S} = cuda_copyto!(dest, bc)
119
52
#! format: on
0 commit comments