DataLayouts._device_dispatch(x::CUDA.CuArray) = ToCUDA()

-function knl_copyto!(dest, src)
-
-    i = CUDA.threadIdx().x
-    j = CUDA.threadIdx().y
-
-    h = CUDA.blockIdx().x
-    v = CUDA.blockDim().z * (CUDA.blockIdx().y - 1) + CUDA.threadIdx().z
-
-    if v <= size(dest, 4)
-        I = CartesianIndex((i, j, 1, v, h))
+function knl_copyto!(dest, src, us)
+    I = universal_index(dest)
+    if is_valid_index(dest, I, us)
        @inbounds dest[I] = src[I]
    end
    return nothing
end

-function Base.copyto!(
-    dest::IJFH{S, Nij, Nh},
-    bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh},
-    ::ToCUDA,
-) where {S, Nij, Nh}
-    if Nh > 0
-        auto_launch!(
-            knl_copyto!,
-            (dest, bc);
-            threads_s = (Nij, Nij),
-            blocks_s = (Nh, 1),
-        )
-    end
-    return dest
-end
-
-function Base.copyto!(
-    dest::VIJFH{S, Nv, Nij, Nh},
-    bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh},
-    ::ToCUDA,
-) where {S, Nv, Nij, Nh}
-    if Nv > 0 && Nh > 0
-        Nv_per_block = min(Nv, fld(256, Nij * Nij))
-        Nv_blocks = cld(Nv, Nv_per_block)
-        auto_launch!(
-            knl_copyto!,
-            (dest, bc);
-            threads_s = (Nij, Nij, Nv_per_block),
-            blocks_s = (Nh, Nv_blocks),
-        )
-    end
-    return dest
-end
-
-function Base.copyto!(
-    dest::VF{S, Nv},
-    bc::DataLayouts.BroadcastedUnionVF{S, Nv},
-    ::ToCUDA,
-) where {S, Nv}
-    if Nv > 0
-        auto_launch!(
-            knl_copyto!,
-            (dest, bc);
-            threads_s = (1, 1),
-            blocks_s = (1, Nv),
-        )
-    end
-    return dest
-end
-
-function Base.copyto!(
-    dest::DataF{S},
-    bc::DataLayouts.BroadcastedUnionDataF{S},
-    ::ToCUDA,
-) where {S}
-    auto_launch!(knl_copyto!, (dest, bc); threads_s = (1, 1), blocks_s = (1, 1))
-    return dest
-end
-
-import ClimaCore.DataLayouts: isascalar
-function knl_copyto_flat!(dest::AbstractData, bc, us)
-    @inbounds begin
-        tidx = thread_index()
-        if tidx ≤ get_N(us)
-            n = size(dest)
-            I = kernel_indexes(tidx, n)
-            dest[I] = bc[I]
-        end
-    end
-    return nothing
-end
-
function cuda_copyto!(dest::AbstractData, bc)
    (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
    us = DataLayouts.UniversalSize(dest)
    if Nv > 0 && Nh > 0
-        nitems = prod(DataLayouts.universal_size(dest))
-        auto_launch!(knl_copyto_flat!, (dest, bc, us), nitems; auto = true)
+        args = (dest, bc, us)
+        threads = threads_via_occupancy(knl_copyto!, args)
+        n_max_threads = min(threads, get_N(us))
+        p = partition(dest, n_max_threads)
+        auto_launch!(
+            knl_copyto!,
+            args;
+            threads_s = p.threads,
+            blocks_s = p.blocks,
+        )
    end
    return dest
end

-# TODO: can we use CUDA's luanch configuration for all data layouts?
-# Currently, it seems to have a slight performance degradation.
#! format: off
-# Base.copyto!(dest::IJFH{S, Nij}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh}, ::ToCUDA) where {S, Nij, Nh} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::IJFH{S, Nij}, bc::DataLayouts.BroadcastedUnionIJFH{S, Nij, Nh}, ::ToCUDA) where {S, Nij, Nh} = cuda_copyto!(dest, bc)
Base.copyto!(dest::IFH{S, Ni, Nh}, bc::DataLayouts.BroadcastedUnionIFH{S, Ni, Nh}, ::ToCUDA) where {S, Ni, Nh} = cuda_copyto!(dest, bc)
Base.copyto!(dest::IJF{S, Nij}, bc::DataLayouts.BroadcastedUnionIJF{S, Nij}, ::ToCUDA) where {S, Nij} = cuda_copyto!(dest, bc)
Base.copyto!(dest::IF{S, Ni}, bc::DataLayouts.BroadcastedUnionIF{S, Ni}, ::ToCUDA) where {S, Ni} = cuda_copyto!(dest, bc)
Base.copyto!(dest::VIFH{S, Nv, Ni, Nh}, bc::DataLayouts.BroadcastedUnionVIFH{S, Nv, Ni, Nh}, ::ToCUDA) where {S, Nv, Ni, Nh} = cuda_copyto!(dest, bc)
-# Base.copyto!(dest::VIJFH{S, Nv, Nij, Nh}, bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh}, ::ToCUDA) where {S, Nv, Nij, Nh} = cuda_copyto!(dest, bc)
-# Base.copyto!(dest::VF{S, Nv}, bc::DataLayouts.BroadcastedUnionVF{S, Nv}, ::ToCUDA) where {S, Nv} = cuda_copyto!(dest, bc)
-# Base.copyto!(dest::DataF{S}, bc::DataLayouts.BroadcastedUnionDataF{S}, ::ToCUDA) where {S} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::VIJFH{S, Nv, Nij, Nh}, bc::DataLayouts.BroadcastedUnionVIJFH{S, Nv, Nij, Nh}, ::ToCUDA) where {S, Nv, Nij, Nh} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::VF{S, Nv}, bc::DataLayouts.BroadcastedUnionVF{S, Nv}, ::ToCUDA) where {S, Nv} = cuda_copyto!(dest, bc)
+Base.copyto!(dest::DataF{S}, bc::DataLayouts.BroadcastedUnionDataF{S}, ::ToCUDA) where {S} = cuda_copyto!(dest, bc)
#! format: on
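
For context, the change replaces the hand-tuned per-layout `threads_s`/`blocks_s` shapes with a launch configuration derived from kernel occupancy, so a single generic `knl_copyto!` can serve every data layout. The helpers `threads_via_occupancy`, `partition`, `universal_index`, `is_valid_index`, and `auto_launch!` are internals of the ClimaCore CUDA extension; the standalone sketch below illustrates the same pattern with plain CUDA.jl, using a flat linear index plus an explicit bounds check in place of `universal_index`/`is_valid_index`. The kernel and function names in the sketch are illustrative only, not part of the package.

    using CUDA

    function knl_copy_flat!(dest, src, N)
        # Flat thread index with a validity check, analogous to the
        # universal_index / is_valid_index pair in the kernel above.
        i = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
        if i <= N
            @inbounds dest[i] = src[i]
        end
        return nothing
    end

    function occupancy_copy!(dest::CuArray, src::CuArray)
        N = length(dest)
        N > 0 || return dest
        # Compile without launching, then query the occupancy-based thread limit.
        kernel = @cuda launch = false knl_copy_flat!(dest, src, N)
        config = CUDA.launch_configuration(kernel.fun)
        threads = min(N, config.threads)
        blocks = cld(N, threads)
        kernel(dest, src, N; threads, blocks)
        return dest
    end

    # Usage:
    # x = CUDA.rand(Float32, 10_000)
    # y = similar(x)
    # occupancy_copy!(y, x)

The point of the occupancy query is that the thread count adapts to the compiled kernel's register and shared-memory footprint, which is why one generic kernel and launch path can replace the per-layout `Base.copyto!` methods removed above.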
|