diff --git a/ext/cuda/cuda_utils.jl b/ext/cuda/cuda_utils.jl
index d9ee184d78..15ee90ce34 100644
--- a/ext/cuda/cuda_utils.jl
+++ b/ext/cuda/cuda_utils.jl
@@ -3,11 +3,6 @@ import ClimaCore.Fields
 import ClimaCore.DataLayouts
 import ClimaCore.DataLayouts: empty_kernel_stats
 
-get_n_items(field::Fields.Field) = get_n_items(Fields.field_values(field))
-get_n_items(data::DataLayouts.AbstractData) = get_n_items(size(data))
-get_n_items(arr::AbstractArray) = get_n_items(size(parent(arr)))
-get_n_items(tup::Tuple) = prod(tup)
-
 const reported_stats = Dict()
 # Call via ClimaCore.DataLayouts.empty_kernel_stats()
 empty_kernel_stats(::ClimaComms.CUDADevice) = empty!(reported_stats)
@@ -37,7 +32,7 @@ to benchmark compare against auto-determined threads/blocks (if `auto=false`).
 function auto_launch!(
     f!::F!,
     args,
-    data;
+    nitems::Union{Integer, Nothing} = nothing;
     auto = false,
     threads_s = nothing,
     blocks_s = nothing,
@@ -45,7 +40,7 @@ function auto_launch!(
     caller = :unknown,
 ) where {F!}
     if auto
-        nitems = get_n_items(data)
+        isnothing(nitems) && throw(ArgumentError("nitems is required when auto = true"))
         if nitems ≥ 0
             kernel = CUDA.@cuda always_inline = true launch = false f!(args...)
             config = CUDA.launch_configuration(kernel.fun)
@@ -64,7 +59,7 @@ function auto_launch!(
         # CUDA.registers(kernel) > 50 || return nothing # for debugging
         # occursin("single_field_solve_kernel", string(nameof(F!))) || return nothing
         if !haskey(reported_stats, key)
-            nitems = get_n_items(data)
+            isnothing(nitems) && throw(ArgumentError("nitems is required for kernel stats"))
             kernel = CUDA.@cuda always_inline = true launch = false f!(args...)
             config = CUDA.launch_configuration(kernel.fun)
             threads = min(nitems, config.threads)
diff --git a/ext/cuda/data_layouts_copyto.jl b/ext/cuda/data_layouts_copyto.jl
index d32b6aee54..82cf46d88c 100644
--- a/ext/cuda/data_layouts_copyto.jl
+++ b/ext/cuda/data_layouts_copyto.jl
@@ -23,8 +23,7 @@ function Base.copyto!(
     if Nh > 0
         auto_launch!(
             knl_copyto!,
-            (dest, bc),
-            dest;
+            (dest, bc);
             threads_s = (Nij, Nij),
             blocks_s = (Nh, 1),
         )
@@ -42,8 +41,7 @@ function Base.copyto!(
         Nv_blocks = cld(Nv, Nv_per_block)
         auto_launch!(
             knl_copyto!,
-            (dest, bc),
-            dest;
+            (dest, bc);
             threads_s = (Nij, Nij, Nv_per_block),
             blocks_s = (Nh, Nv_blocks),
         )
@@ -59,8 +57,7 @@ function Base.copyto!(
     if Nv > 0
         auto_launch!(
             knl_copyto!,
-            (dest, bc),
-            dest;
+            (dest, bc);
             threads_s = (1, 1),
             blocks_s = (1, Nv),
         )
@@ -73,13 +70,7 @@ function Base.copyto!(
     bc::DataLayouts.BroadcastedUnionDataF{S},
     ::ToCUDA,
 ) where {S}
-    auto_launch!(
-        knl_copyto!,
-        (dest, bc),
-        dest;
-        threads_s = (1, 1),
-        blocks_s = (1, 1),
-    )
+    auto_launch!(knl_copyto!, (dest, bc); threads_s = (1, 1), blocks_s = (1, 1))
     return dest
 end
 
@@ -100,7 +91,8 @@ function cuda_copyto!(dest::AbstractData, bc)
     (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
     us = DataLayouts.UniversalSize(dest)
     if Nv > 0 && Nh > 0
-        auto_launch!(knl_copyto_flat!, (dest, bc, us), dest; auto = true)
+        nitems = prod(DataLayouts.universal_size(dest))
+        auto_launch!(knl_copyto_flat!, (dest, bc, us), nitems; auto = true)
     end
     return dest
 end
diff --git a/ext/cuda/data_layouts_fill.jl b/ext/cuda/data_layouts_fill.jl
index 087d5f2a84..cac5bdf526 100644
--- a/ext/cuda/data_layouts_fill.jl
+++ b/ext/cuda/data_layouts_fill.jl
@@ -14,7 +14,8 @@ function cuda_fill!(dest::AbstractData, val)
     (_, _, Nv, _, Nh) = DataLayouts.universal_size(dest)
     us = DataLayouts.UniversalSize(dest)
     if Nv > 0 && Nh > 0
-        auto_launch!(knl_fill_flat!, (dest, val, us), dest; auto = true)
+        nitems = prod(DataLayouts.universal_size(dest))
+        auto_launch!(knl_fill_flat!, (dest, val, us), nitems; auto = true)
     end
     return dest
 end
diff --git a/ext/cuda/data_layouts_fused_copyto.jl b/ext/cuda/data_layouts_fused_copyto.jl
index a566e69a5f..0b1d1126d1 100644
--- a/ext/cuda/data_layouts_fused_copyto.jl
+++ b/ext/cuda/data_layouts_fused_copyto.jl
@@ -50,8 +50,7 @@ function fused_copyto!(
         Nv_blocks = cld(Nv, Nv_per_block)
         auto_launch!(
             knl_fused_copyto!,
-            (fmbc,),
-            dest1;
+            (fmbc,);
             threads_s = (Nij, Nij, Nv_per_block),
             blocks_s = (Nh, Nv_blocks),
         )
@@ -68,8 +67,7 @@ function fused_copyto!(
     if Nh > 0
         auto_launch!(
             knl_fused_copyto!,
-            (fmbc,),
-            dest1;
+            (fmbc,);
             threads_s = (Nij, Nij),
             blocks_s = (Nh, 1),
         )
@@ -85,8 +83,7 @@ function fused_copyto!(
     if Nv > 0 && Nh > 0
         auto_launch!(
             knl_fused_copyto!,
-            (fmbc,),
-            dest1;
+            (fmbc,);
             threads_s = (1, 1),
             blocks_s = (Nh, Nv),
         )
@@ -101,8 +98,7 @@ function fused_copyto!(
 ) where {S}
     auto_launch!(
         knl_fused_copyto!,
-        (fmbc,),
-        dest1;
+        (fmbc,);
         threads_s = (1, 1),
         blocks_s = (1, 1),
     )
diff --git a/ext/cuda/data_layouts_mapreduce.jl b/ext/cuda/data_layouts_mapreduce.jl
index 5d63cf365d..18435eade6 100644
--- a/ext/cuda/data_layouts_mapreduce.jl
+++ b/ext/cuda/data_layouts_mapreduce.jl
@@ -28,7 +28,7 @@ function mapreduce_cuda(
     pdata = parent(data)
     T = eltype(pdata)
     (Ni, Nj, Nk, Nv, Nh) = size(data)
-    Nf = div(length(pdata), prod(size(data))) # length of field dimension
+    Nf = DataLayouts.ncomponents(data) # length of field dimension
     pwt = parent(weighted_jacobian)
 
     nitems = Nv * Ni * Nj * Nk * Nh
diff --git a/ext/cuda/limiters.jl b/ext/cuda/limiters.jl
index 7511c279f4..a7dd6e393a 100644
--- a/ext/cuda/limiters.jl
+++ b/ext/cuda/limiters.jl
@@ -21,23 +21,15 @@ function compute_element_bounds!(
     ρ,
     ::ClimaComms.CUDADevice,
 )
-    S = size(Fields.field_values(ρ))
-    (Ni, Nj, _, Nv, Nh) = S
+    ρ_values = Fields.field_values(Operators.strip_space(ρ, axes(ρ)))
+    ρq_values = Fields.field_values(Operators.strip_space(ρq, axes(ρq)))
+    (_, _, _, Nv, Nh) = DataLayouts.universal_size(ρ_values)
     nthreads, nblocks = config_threadblock(Nv, Nh)
 
-    args = (
-        limiter,
-        Fields.field_values(Operators.strip_space(ρq, axes(ρq))),
-        Fields.field_values(Operators.strip_space(ρ, axes(ρ))),
-        Nv,
-        Nh,
-        Val(Ni),
-        Val(Nj),
-    )
+    args = (limiter, ρq_values, ρ_values)
     auto_launch!(
         compute_element_bounds_kernel!,
-        args,
-        ρ;
+        args;
         threads_s = nthreads,
         blocks_s = nblocks,
     )
@@ -45,15 +37,8 @@ function compute_element_bounds!(
 end
 
 
-function compute_element_bounds_kernel!(
-    limiter,
-    ρq,
-    ρ,
-    Nv,
-    Nh,
-    ::Val{Ni},
-    ::Val{Nj},
-) where {Ni, Nj}
+function compute_element_bounds_kernel!(limiter, ρq, ρ)
+    (Ni, Nj, _, Nv, Nh) = DataLayouts.universal_size(ρ)
     n = (Nv, Nh)
     tidx = thread_index()
     @inbounds if valid_range(tidx, prod(n))
@@ -88,21 +73,18 @@ function compute_neighbor_bounds_local!(
     ::ClimaComms.CUDADevice,
 )
     topology = Spaces.topology(axes(ρ))
-    Ni, Nj, _, Nv, Nh = size(Fields.field_values(ρ))
+    us = DataLayouts.UniversalSize(Fields.field_values(ρ))
+    (_, _, _, Nv, Nh) = DataLayouts.universal_size(us)
     nthreads, nblocks = config_threadblock(Nv, Nh)
     args = (
         limiter,
         topology.local_neighbor_elem,
         topology.local_neighbor_elem_offset,
-        Nv,
-        Nh,
-        Val(Ni),
-        Val(Nj),
+        us,
     )
     auto_launch!(
         compute_neighbor_bounds_local_kernel!,
-        args,
-        ρ;
+        args;
         threads_s = nthreads,
         blocks_s = nblocks,
     )
@@ -112,12 +94,9 @@ function compute_neighbor_bounds_local_kernel!(
     limiter,
     local_neighbor_elem,
     local_neighbor_elem_offset,
-    Nv,
-    Nh,
-    ::Val{Ni},
-    ::Val{Nj},
-) where {Ni, Nj}
-
+    us::DataLayouts.UniversalSize,
+)
+    (_, _, _, Nv, Nh) = DataLayouts.universal_size(us)
     n = (Nv, Nh)
     tidx = thread_index()
     @inbounds if valid_range(tidx, prod(n))
@@ -147,9 +126,10 @@ function apply_limiter!(
     ::ClimaComms.CUDADevice,
 )
     ρq_data = Fields.field_values(ρq)
-    (Ni, Nj, _, Nv, Nh) = size(ρq_data)
-    Nf = DataLayouts.ncomponents(ρq_data)
+    us = DataLayouts.UniversalSize(ρq_data)
+    (Ni, Nj, _, Nv, Nh) = DataLayouts.universal_size(us)
     maxiter = Ni * Nj
+    Nf = DataLayouts.ncomponents(ρq_data)
     WJ = Spaces.local_geometry_data(axes(ρq)).WJ
     nthreads, nblocks = config_threadblock(Nv, Nh)
     args = (
@@ -157,17 +137,13 @@ function apply_limiter!(
         Fields.field_values(Operators.strip_space(ρq, axes(ρq))),
         Fields.field_values(Operators.strip_space(ρ, axes(ρ))),
         WJ,
-        Nv,
-        Nh,
+        us,
         Val(Nf),
-        Val(Ni),
-        Val(Nj),
         Val(maxiter),
     )
     auto_launch!(
         apply_limiter_kernel!,
-        args,
-        ρ;
+        args;
         threads_s = nthreads,
         blocks_s = nblocks,
     )
@@ -179,15 +155,13 @@ function apply_limiter_kernel!(
     ρq_data,
     ρ_data,
     WJ_data,
-    Nv,
-    Nh,
+    us::DataLayouts.UniversalSize,
     ::Val{Nf},
-    ::Val{Ni},
-    ::Val{Nj},
     ::Val{maxiter},
-) where {Nf, Ni, Nj, maxiter}
+) where {Nf, maxiter}
     (; q_bounds_nbr, rtol) = limiter
     converged = true
+    (Ni, Nj, _, Nv, Nh) = DataLayouts.universal_size(us)
     n = (Nv, Nh)
     tidx = thread_index()
     @inbounds if valid_range(tidx, prod(n))
diff --git a/ext/cuda/matrix_fields_multiple_field_solve.jl b/ext/cuda/matrix_fields_multiple_field_solve.jl
index afd5c4adb8..d4150da5bc 100644
--- a/ext/cuda/matrix_fields_multiple_field_solve.jl
+++ b/ext/cuda/matrix_fields_multiple_field_solve.jl
@@ -38,8 +38,7 @@ NVTX.@annotate function multiple_field_solve!(
 
     auto_launch!(
         multiple_field_solve_kernel!,
-        args,
-        x1;
+        args;
         threads_s = nthreads,
         blocks_s = nblocks,
         always_inline = true,
diff --git a/ext/cuda/matrix_fields_single_field_solve.jl b/ext/cuda/matrix_fields_single_field_solve.jl
index c1149a1e8a..67e520a823 100644
--- a/ext/cuda/matrix_fields_single_field_solve.jl
+++ b/ext/cuda/matrix_fields_single_field_solve.jl
@@ -21,8 +21,7 @@ function single_field_solve!(device::ClimaComms.CUDADevice, cache, x, A, b)
     args = (device, cache, x, A, b)
     auto_launch!(
         single_field_solve_kernel!,
-        args,
-        x;
+        args;
         threads_s = nthreads,
         blocks_s = nblocks,
     )
diff --git a/ext/cuda/operators_finite_difference.jl b/ext/cuda/operators_finite_difference.jl
index 980e5a813a..1cb8407136 100644
--- a/ext/cuda/operators_finite_difference.jl
+++ b/ext/cuda/operators_finite_difference.jl
@@ -36,8 +36,7 @@ function Base.copyto!(
         (strip_space(out, space), strip_space(bc, space), axes(out), bounds, us)
     auto_launch!(
         copyto_stencil_kernel!,
-        args,
-        out;
+        args;
         threads_s = (nthreads,),
         blocks_s = (nblocks,),
     )
diff --git a/ext/cuda/operators_integral.jl b/ext/cuda/operators_integral.jl
index 651a010a47..cf5e5d2ac8 100644
--- a/ext/cuda/operators_integral.jl
+++ b/ext/cuda/operators_integral.jl
@@ -29,7 +29,7 @@ function column_reduce_device!(
         init,
         space,
     )
-    auto_launch!(bycolumn_kernel!, args, (); threads_s, blocks_s)
+    auto_launch!(bycolumn_kernel!, args; threads_s, blocks_s)
 end
 
 function column_accumulate_device!(
@@ -52,7 +52,7 @@ function column_accumulate_device!(
         init,
         space,
     )
-    auto_launch!(bycolumn_kernel!, args, (); threads_s, blocks_s)
+    auto_launch!(bycolumn_kernel!, args; threads_s, blocks_s)
 end
 
 bycolumn_kernel!(
diff --git a/ext/cuda/operators_spectral_element.jl b/ext/cuda/operators_spectral_element.jl
index d99400ba97..47c6cb1c82 100644
--- a/ext/cuda/operators_spectral_element.jl
+++ b/ext/cuda/operators_spectral_element.jl
@@ -51,8 +51,7 @@ function Base.copyto!(
     )
     auto_launch!(
         copyto_spectral_kernel!,
-        args,
-        out;
+        args;
         threads_s = (Nq, Nq, Nvthreads),
         blocks_s = (Nh, Nvblocks),
     )
diff --git a/ext/cuda/operators_thomas_algorithm.jl b/ext/cuda/operators_thomas_algorithm.jl
index 9d416dfc7c..83546518ab 100644
--- a/ext/cuda/operators_thomas_algorithm.jl
+++ b/ext/cuda/operators_thomas_algorithm.jl
@@ -10,8 +10,7 @@ function column_thomas_solve!(::ClimaComms.CUDADevice, A, b)
     args = (A, b)
     auto_launch!(
         thomas_algorithm_kernel!,
-        args,
-        size(Fields.field_values(A));
+        args;
         threads_s = nthreads,
         blocks_s = nblocks,
     )
diff --git a/ext/cuda/remapping_distributed.jl b/ext/cuda/remapping_distributed.jl
index f5fc20183b..70246c47d4 100644
--- a/ext/cuda/remapping_distributed.jl
+++ b/ext/cuda/remapping_distributed.jl
@@ -29,8 +29,7 @@ function _set_interpolated_values_device!(
     )
     auto_launch!(
         set_interpolated_values_kernel!,
-        args,
-        out;
+        args;
         threads_s = (nthreads),
         blocks_s = (nblocks),
     )
@@ -163,8 +162,7 @@ function _set_interpolated_values_device!(
     )
     auto_launch!(
         set_interpolated_values_kernel!,
-        args,
-        out;
+        args;
         threads_s = (nthreads),
         blocks_s = (nblocks),
     )
diff --git a/ext/cuda/remapping_interpolate_array.jl b/ext/cuda/remapping_interpolate_array.jl
index 88d3234913..d96862c679 100644
--- a/ext/cuda/remapping_interpolate_array.jl
+++ b/ext/cuda/remapping_interpolate_array.jl
@@ -22,8 +22,7 @@ function interpolate_slab!(
     args = (output_cuarray, field, cuslab_indices, cuweights)
     auto_launch!(
         interpolate_slab_kernel!,
-        args,
-        output_cuarray;
+        args;
         threads_s = (nthreads),
         blocks_s = (nblocks),
     )
@@ -107,8 +106,7 @@ function interpolate_slab_level!(
     args = (output_cuarray, field, cuvidx_ref_coordinates, h, Is)
     auto_launch!(
         interpolate_slab_level_kernel!,
-        args,
-        out;
+        args;
         threads_s = (nthreads),
         blocks_s = (nblocks),
     )
diff --git a/ext/cuda/topologies_dss.jl b/ext/cuda/topologies_dss.jl
index ac4a08b722..062917cde2 100644
--- a/ext/cuda/topologies_dss.jl
+++ b/ext/cuda/topologies_dss.jl
@@ -20,16 +20,13 @@ function Topologies.dss_load_perimeter_data!(
     data::Union{DataLayouts.IJFH, DataLayouts.VIJFH},
     perimeter::Topologies.Perimeter2D,
 )
-    pperimeter_data = parent(dss_buffer.perimeter_data)
-    pdata = parent(data)
-    (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data)
-    nitems = nlevels * nperimeter * nfid * nelems
+    (; perimeter_data) = dss_buffer
+    nitems = prod(DataLayouts.farray_size(perimeter_data))
     nthreads, nblocks = _configure_threadblock(nitems)
-    args = (pperimeter_data, pdata, perimeter)
+    args = (perimeter_data, data, perimeter)
     auto_launch!(
         dss_load_perimeter_data_kernel!,
-        args,
-        pperimeter_data;
+        args;
         threads_s = (nthreads),
         blocks_s = (nblocks),
     )
@@ -37,13 +34,16 @@ function Topologies.dss_load_perimeter_data!(
 end
 
 function dss_load_perimeter_data_kernel!(
-    pperimeter_data::AbstractArray{FT, 4},
-    pdata::Union{AbstractArray{FT, 4}, AbstractArray{FT, 5}},
+    perimeter_data::DataLayouts.AbstractData,
+    data::Union{DataLayouts.IJFH, DataLayouts.VIJFH},
     perimeter::Topologies.Perimeter2D{Nq},
-) where {FT <: AbstractFloat, Nq}
-    gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-    (nlevels, _, nfidx, nelems) = sizep = size(pperimeter_data) # size of perimeter data array
+) where {Nq}
+    gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+    (nlevels, _, nfidx, nelems) =
+        sizep = DataLayouts.farray_size(perimeter_data) # size of perimeter data array
     sized = (nlevels, Nq, Nq, nfidx, nelems) # size of data
+    pperimeter_data = parent(perimeter_data)
+    pdata = parent(data)
 
     if gidx ≤ prod(sizep)
         (level, p, fidx, elem) = cart_ind(sizep, gidx).I
@@ -60,16 +60,13 @@ function Topologies.dss_unload_perimeter_data!(
     dss_buffer::Topologies.DSSBuffer,
     perimeter,
 )
-    pperimeter_data = parent(dss_buffer.perimeter_data)
-    pdata = parent(data)
-    (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data)
-    nitems = nlevels * nperimeter * nfid * nelems
+    (; perimeter_data) = dss_buffer
+    nitems = prod(DataLayouts.farray_size(perimeter_data))
     nthreads, nblocks = _configure_threadblock(nitems)
-    args = (pdata, pperimeter_data, perimeter)
+    args = (data, perimeter_data, perimeter)
     auto_launch!(
         dss_unload_perimeter_data_kernel!,
-        args,
-        pdata;
+        args;
         threads_s = (nthreads),
         blocks_s = (nblocks),
     )
@@ -77,13 +74,16 @@ function Topologies.dss_unload_perimeter_data!(
 end
 
 function dss_unload_perimeter_data_kernel!(
-    pdata::Union{AbstractArray{FT, 4}, AbstractArray{FT, 5}},
-    pperimeter_data::AbstractArray{FT, 4},
+    data::Union{DataLayouts.IJFH, DataLayouts.VIJFH},
+    perimeter_data::DataLayouts.AbstractData,
     perimeter::Topologies.Perimeter2D{Nq},
-) where {FT <: AbstractFloat, Nq}
-    gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-    (nlevels, nperimeter, nfidx, nelems) = sizep = size(pperimeter_data) # size of perimeter data array
+) where {Nq}
+    gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+    (nlevels, nperimeter, nfidx, nelems) =
+        sizep = DataLayouts.farray_size(perimeter_data) # size of perimeter data array
     sized = (nlevels, Nq, Nq, nfidx, nelems) # size of data
+    pperimeter_data = parent(perimeter_data)
+    pdata = parent(data)
 
     if gidx ≤ prod(sizep)
         (level, p, fidx, elem) = cart_ind(sizep, gidx).I
@@ -103,13 +103,13 @@ function Topologies.dss_local!(
     nlocalvertices = length(topology.local_vertex_offset) - 1
     nlocalfaces = length(topology.interior_faces)
     if (nlocalvertices + nlocalfaces) > 0
-        pperimeter_data = parent(perimeter_data)
-        (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data)
+        (nlevels, nperimeter, nfid, nelems) =
+            DataLayouts.farray_size(perimeter_data)
 
         nitems = nlevels * nfid * (nlocalfaces + nlocalvertices)
         nthreads, nblocks = _configure_threadblock(nitems)
         args = (
-            pperimeter_data,
+            perimeter_data,
             topology.local_vertices,
             topology.local_vertex_offset,
             topology.interior_faces,
@@ -117,8 +117,7 @@ function Topologies.dss_local!(
         )
         auto_launch!(
             dss_local_kernel!,
-            args,
-            pperimeter_data;
+            args;
             threads_s = (nthreads),
             blocks_s = (nblocks),
         )
@@ -127,16 +126,19 @@ function Topologies.dss_local!(
 end
 
 function dss_local_kernel!(
-    pperimeter_data::AbstractArray{FT, 4},
+    perimeter_data::DataLayouts.VIFH,
     local_vertices::AbstractVector{Tuple{Int, Int}},
     local_vertex_offset::AbstractVector{Int},
     interior_faces::AbstractVector{Tuple{Int, Int, Int, Int, Bool}},
     perimeter::Topologies.Perimeter2D{Nq},
-) where {FT <: AbstractFloat, Nq}
-    gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
+) where {Nq}
+    # Linear index of this thread across all blocks of the launch.
+    gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
     nlocalvertices = length(local_vertex_offset) - 1
     nlocalfaces = length(interior_faces)
-    (nlevels, nperimeter, nfidx, _) = size(pperimeter_data)
+    pperimeter_data = parent(perimeter_data)
+    FT = eltype(pperimeter_data)
+    (nlevels, nperimeter, nfidx, _) = DataLayouts.farray_size(perimeter_data)
     if gidx ≤ nlevels * nfidx * nlocalvertices # local vertices
         sizev = (nlevels, nfidx, nlocalvertices)
         (level, fidx, vertexid) = cart_ind(sizev, gidx).I
@@ -200,15 +202,15 @@ function Topologies.dss_transform!(
         p∂ξ∂x = parent(∂ξ∂x)
         pperimeter_data = parent(perimeter_data)
         nmetric = cld(length(p∂ξ∂x), prod(size(∂ξ∂x)))
-        (nlevels, nperimeter, _, _) = size(pperimeter_data)
+        (nlevels, nperimeter, _, _) = DataLayouts.array_size(perimeter_data)
         nitems = nlevels * nperimeter * nlocalelems
         nthreads, nblocks = _configure_threadblock(nitems)
         args = (
-            pperimeter_data,
+            perimeter_data,
             pdata,
             p∂ξ∂x,
             p∂x∂ξ,
-            nmetric,
+            Val(nmetric),
             pweight,
             perimeter,
             scalarfidx,
@@ -217,11 +219,11 @@ function Topologies.dss_transform!(
             covariant123fidx,
             contravariant123fidx,
             localelems,
+            Val(nlocalelems),
         )
         auto_launch!(
             dss_transform_kernel!,
-            args,
-            pperimeter_data;
+            args;
             threads_s = (nthreads),
             blocks_s = (nblocks),
         )
@@ -230,11 +232,11 @@ function Topologies.dss_transform!(
 end
 
 function dss_transform_kernel!(
-    pperimeter_data::AbstractArray{FT, 4},
+    perimeter_data::DataLayouts.VIFH,
     pdata::Union{AbstractArray{FT, 4}, AbstractArray{FT, 5}},
     p∂ξ∂x::Union{AbstractArray{FT, 4}, AbstractArray{FT, 5}},
     p∂x∂ξ::Union{AbstractArray{FT, 4}, AbstractArray{FT, 5}},
-    nmetric::Int,
+    ::Val{nmetric},
     pweight::AbstractArray{FT, 4},
     perimeter::Topologies.Perimeter2D{Nq},
     scalarfidx::AbstractVector{Int},
@@ -243,10 +245,12 @@ function dss_transform_kernel!(
     covariant123fidx::AbstractVector{Int},
     contravariant123fidx::AbstractVector{Int},
     localelems::AbstractVector{Int},
-) where {FT <: AbstractFloat, Nq}
-    gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-    (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data)
-    nlocalelems = length(localelems)
+    ::Val{nlocalelems},
+) where {FT <: AbstractFloat, Nq, nmetric, nlocalelems}
+    pperimeter_data = parent(perimeter_data)
+    gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+    (nlevels, nperimeter, nfid, nelems) =
+        DataLayouts.farray_size(perimeter_data)
     if gidx ≤ nlevels * nperimeter * nlocalelems
         sizet = (nlevels, nperimeter, nlocalelems)
         sizet_data = (nlevels, Nq, Nq, nfid, nelems)
@@ -393,12 +397,11 @@ function Topologies.dss_untransform!(
         p∂x∂ξ = parent(∂x∂ξ)
         p∂ξ∂x = parent(∂ξ∂x)
         nmetric = cld(length(p∂ξ∂x), prod(size(∂ξ∂x)))
-        pperimeter_data = parent(perimeter_data)
-        (nlevels, nperimeter, _, _) = size(pperimeter_data)
+        (nlevels, nperimeter, _, _) = DataLayouts.array_size(perimeter_data)
         nitems = nlevels * nperimeter * nlocalelems
         nthreads, nblocks = _configure_threadblock(nitems)
         args = (
-            pperimeter_data,
+            perimeter_data,
             pdata,
             p∂ξ∂x,
             p∂x∂ξ,
@@ -410,11 +413,11 @@ function Topologies.dss_untransform!(
             covariant123fidx,
             contravariant123fidx,
             localelems,
+            Val(nlocalelems),
         )
         auto_launch!(
             dss_untransform_kernel!,
-            args,
-            pperimeter_data;
+            args;
             threads_s = (nthreads),
             blocks_s = (nblocks),
         )
@@ -423,7 +426,7 @@ function Topologies.dss_untransform!(
 end
 
 function dss_untransform_kernel!(
-    pperimeter_data::AbstractArray{FT, 4},
+    perimeter_data::DataLayouts.VIFH,
     pdata::Union{AbstractArray{FT, 4}, AbstractArray{FT, 5}},
     p∂ξ∂x::Union{AbstractArray{FT, 4}, AbstractArray{FT, 5}},
     p∂x∂ξ::Union{AbstractArray{FT, 4}, AbstractArray{FT, 5}},
@@ -435,10 +438,12 @@ function dss_untransform_kernel!(
     covariant123fidx::AbstractVector{Int},
     contravariant123fidx::AbstractVector{Int},
     localelems::AbstractVector{Int},
-) where {FT <: AbstractFloat, Nq}
-    gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-    (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data)
-    nlocalelems = length(localelems)
+    ::Val{nlocalelems},
+) where {FT <: AbstractFloat, Nq, nlocalelems}
+    gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+    (nlevels, nperimeter, nfid, nelems) =
+        DataLayouts.farray_size(perimeter_data)
+    pperimeter_data = parent(perimeter_data)
     if gidx ≤ nlevels * nperimeter * nlocalelems
         sizet = (nlevels, nperimeter, nlocalelems)
         sizet_data = (nlevels, Nq, Nq, nfid, nelems)
@@ -533,21 +538,20 @@ function Topologies.dss_local_ghost!(
 )
     nghostvertices = length(topology.ghost_vertex_offset) - 1
     if nghostvertices > 0
-        pperimeter_data = parent(perimeter_data)
-        (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data)
+        (nlevels, nperimeter, nfid, nelems) =
+            DataLayouts.farray_size(perimeter_data)
         max_threads = 256
         nitems = nlevels * nfid * nghostvertices
         nthreads, nblocks = _configure_threadblock(nitems)
         args = (
-            pperimeter_data,
+            perimeter_data,
             topology.ghost_vertices,
             topology.ghost_vertex_offset,
             perimeter,
         )
         auto_launch!(
             dss_local_ghost_kernel!,
-            args,
-            pperimeter_data;
+            args;
             threads_s = (nthreads),
             blocks_s = (nblocks),
         )
@@ -556,13 +560,15 @@ function Topologies.dss_local_ghost!(
 end
 
 function dss_local_ghost_kernel!(
-    pperimeter_data::AbstractArray{FT, 4},
+    perimeter_data::DataLayouts.VIFH,
     ghost_vertices,
     ghost_vertex_offset,
     perimeter::Topologies.Perimeter2D{Nq},
-) where {FT <: AbstractFloat, Nq}
-    gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-    (nlevels, nperimeter, nfidx, _) = size(pperimeter_data)
+) where {Nq}
+    gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+    pperimeter_data = parent(perimeter_data)
+    FT = eltype(pperimeter_data)
+    (nlevels, nperimeter, nfidx, _) = DataLayouts.farray_size(perimeter_data)
     nghostvertices = length(ghost_vertex_offset) - 1
     if gidx ≤ nlevels * nfidx * nghostvertices
         sizev = (nlevels, nfidx, nghostvertices)
@@ -594,17 +600,16 @@ function Topologies.fill_send_buffer!(
     synchronize = true,
 )
     (; perimeter_data, send_buf_idx, send_data) = dss_buffer
-    pperimeter_data = parent(perimeter_data)
-    (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data)
+    (nlevels, nperimeter, nfid, nelems) =
+        DataLayouts.farray_size(perimeter_data)
     nsend = size(send_buf_idx, 1)
     if nsend > 0
         nitems = nsend * nlevels * nfid
         nthreads, nblocks = _configure_threadblock(nitems)
-        args = (send_data, send_buf_idx, pperimeter_data)
+        args = (send_data, send_buf_idx, perimeter_data, Val(nsend))
         auto_launch!(
             fill_send_buffer_kernel!,
-            args,
-            pperimeter_data;
+            args;
             threads_s = (nthreads),
             blocks_s = (nblocks),
         )
@@ -618,11 +623,12 @@ end
 function fill_send_buffer_kernel!(
     send_data::AbstractArray{FT, 1},
     send_buf_idx::AbstractArray{I, 2},
-    pperimeter_data::AbstractArray{FT, 4},
-) where {FT <: AbstractFloat, I <: Int}
-    gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-    (nlevels, _, nfid, nelems) = size(pperimeter_data)
-    nsend = size(send_buf_idx, 1)
+    perimeter_data::DataLayouts.AbstractData,
+    ::Val{nsend},
+) where {FT <: AbstractFloat, I <: Int, nsend}
+    gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+    (nlevels, _, nfid, nelems) = DataLayouts.farray_size(perimeter_data)
+    pperimeter_data = parent(perimeter_data)
     #sizet = (nsend, nlevels, nfid)
     sizet = (nlevels, nfid, nsend)
     #if gidx ≤ nsend * nlevels * nfid
@@ -642,17 +648,16 @@ function Topologies.load_from_recv_buffer!(
     dss_buffer::Topologies.DSSBuffer,
 )
     (; perimeter_data, recv_buf_idx, recv_data) = dss_buffer
-    pperimeter_data = parent(perimeter_data)
-    (nlevels, nperimeter, nfid, nelems) = size(pperimeter_data)
+    (nlevels, nperimeter, nfid, nelems) =
+        DataLayouts.farray_size(perimeter_data)
     nrecv = size(recv_buf_idx, 1)
     if nrecv > 0
         nitems = nrecv * nlevels * nfid
         nthreads, nblocks = _configure_threadblock(nitems)
-        args = (pperimeter_data, recv_data, recv_buf_idx)
+        args = (perimeter_data, recv_data, recv_buf_idx, Val(nrecv))
         auto_launch!(
             load_from_recv_buffer_kernel!,
-            args,
-            pperimeter_data;
+            args;
             threads_s = (nthreads),
             blocks_s = (nblocks),
         )
@@ -661,13 +666,14 @@ function Topologies.load_from_recv_buffer!(
 end
 
 function load_from_recv_buffer_kernel!(
-    pperimeter_data::AbstractArray{FT, 4},
+    perimeter_data::DataLayouts.AbstractData,
     recv_data::AbstractArray{FT, 1},
     recv_buf_idx::AbstractArray{I, 2},
-) where {FT <: AbstractFloat, I <: Int}
-    gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-    nlevels, _, nfid, nelems = size(pperimeter_data)
-    nrecv = size(recv_buf_idx, 1)
+    ::Val{nrecv},
+) where {FT <: AbstractFloat, I <: Int, nrecv}
+    gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+    pperimeter_data = parent(perimeter_data)
+    (nlevels, _, nfid, nelems) = DataLayouts.farray_size(perimeter_data)
     #sizet = (nrecv, nlevels, nfid)
     sizet = (nlevels, nfid, nrecv)
     #if gidx ≤ nrecv * nlevels * nfid
@@ -691,12 +697,11 @@ function Topologies.dss_ghost!(
 )
     nghostvertices = length(topology.ghost_vertex_offset) - 1
     if nghostvertices > 0
-        pperimeter_data = parent(perimeter_data)
-        nlevels, _, nfidx, _ = size(pperimeter_data)
+        (nlevels, _, nfidx, _) = DataLayouts.farray_size(perimeter_data)
         nitems = nlevels * nfidx * nghostvertices
         nthreads, nblocks = _configure_threadblock(nitems)
         args = (
-            pperimeter_data,
+            perimeter_data,
             topology.ghost_vertices,
             topology.ghost_vertex_offset,
             topology.repr_ghost_vertex,
@@ -704,8 +709,7 @@ function Topologies.dss_ghost!(
         )
         auto_launch!(
             dss_ghost_kernel!,
-            args,
-            pperimeter_data;
+            args;
             threads_s = (nthreads),
             blocks_s = (nblocks),
         )
@@ -714,14 +718,16 @@ function Topologies.dss_ghost!(
 end
 
 function dss_ghost_kernel!(
-    pperimeter_data::AbstractArray{FT, 4},
+    perimeter_data::DataLayouts.AbstractData,
     ghost_vertices,
     ghost_vertex_offset,
     repr_ghost_vertex,
     perimeter::Topologies.Perimeter2D{Nq},
-) where {FT <: AbstractFloat, Nq}
-    gidx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
-    nlevels, _, nfidx, _ = size(pperimeter_data)
+) where {Nq}
+    pperimeter_data = parent(perimeter_data)
+    FT = eltype(pperimeter_data)
+    gidx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+    (nlevels, _, nfidx, _) = DataLayouts.farray_size(perimeter_data)
     nghostvertices = length(ghost_vertex_offset) - 1
 
     if gidx ≤ nlevels * nfidx * nghostvertices
diff --git a/src/DataLayouts/DataLayouts.jl b/src/DataLayouts/DataLayouts.jl
index 6c8a759e92..08f85eb71a 100644
--- a/src/DataLayouts/DataLayouts.jl
+++ b/src/DataLayouts/DataLayouts.jl
@@ -72,14 +72,14 @@ end
 @inline array_length(data::AbstractData) = prod(size(parent(data)))
 
 """
-    (Ni, Nj, Nv, _, Nh) = universal_size(data::AbstractData)
+    (Ni, Nj, _, Nv, Nh) = universal_size(data::AbstractData)
 
 A tuple of compile-time known type parameters,
 corresponding to `UniversalSize`. The field dimension
 is excluded and is returned as 1.
 """
 @inline universal_size(::UniversalSize{Ni, Nj, Nv, Nh}) where {Ni, Nj, Nv, Nh} =
-    (Ni, Nj, Nv, 1, Nh)
+    (Ni, Nj, 1, Nv, Nh)
 
 """
     get_N(::AbstractData)
@@ -116,8 +116,6 @@ Statically returns `Nh`.
 @inline get_Nv(data::AbstractData) = get_Nv(UniversalSize(data))
 @inline get_N(data::AbstractData) = get_N(UniversalSize(data))
 
-@inline universal_size(data::AbstractData) = universal_size(UniversalSize(data))
-
 function Base.show(io::IO, data::AbstractData)
     indent_width = 2
     (rows, cols) = displaysize(io)
@@ -1276,6 +1274,51 @@ type parameters.
 @inline union_all(::Type{<:IH1JH2}) = IH1JH2
 @inline union_all(::Type{<:IV1JH2}) = IV1JH2
 
+"""
+    array_size(data::AbstractData, [dim])
+    array_size(::Type{<:AbstractData}, [dim])
+
+This is an internal function, please do not use outside of ClimaCore.
+
+Returns the size of the backing array, with the field dimension set to 1.
+
+This function is helpful for writing generic
+code, when reconstructing new datalayouts with new
+type parameters.
+"""
+@inline array_size(data::AbstractData, i::Integer) = array_size(data)[i]
+@inline array_size(::IJKFVH{S, Nij, Nk, Nv, Nh}) where {S, Nij, Nk, Nv, Nh} = (Nij, Nij, Nk, 1, Nv, Nh)
+@inline array_size(::IJFH{S, Nij, Nh}) where {S, Nij, Nh} = (Nij, Nij, 1, Nh)
+@inline array_size(::IFH{S, Ni, Nh}) where {S, Ni, Nh} = (Ni, 1, Nh)
+@inline array_size(::DataF{S}) where {S} = (1,)
+@inline array_size(::IJF{S, Nij}) where {S, Nij} = (Nij, Nij, 1)
+@inline array_size(::IF{S, Ni}) where {S, Ni} = (Ni, 1)
+@inline array_size(::VF{S, Nv}) where {S, Nv} = (Nv, 1)
+@inline array_size(::VIJFH{S, Nv, Nij, Nh}) where {S, Nv, Nij, Nh} = (Nv, Nij, Nij, 1, Nh)
+@inline array_size(::VIFH{S, Nv, Ni, Nh}) where {S, Nv, Ni, Nh} = (Nv, Ni, 1, Nh)
+
+"""
+    farray_size(data::AbstractData, [dim])
+
+This is an internal function, please do not use outside of ClimaCore.
+
+Returns the size of the backing array, including the field dimension.
+
+This function is helpful for writing generic
+code, when reconstructing new datalayouts with new
+type parameters.
+"""
+@inline farray_size(data::AbstractData, i::Integer) = farray_size(data)[i]
+@inline farray_size(data::IJKFVH{S, Nij, Nk, Nv, Nh}) where {S, Nij, Nk, Nv, Nh} = (Nij, Nij, Nk, ncomponents(data), Nv, Nh)
+@inline farray_size(data::IJFH{S, Nij, Nh}) where {S, Nij, Nh} = (Nij, Nij, ncomponents(data), Nh)
+@inline farray_size(data::IFH{S, Ni, Nh}) where {S, Ni, Nh} = (Ni, ncomponents(data), Nh)
+@inline farray_size(data::DataF{S}) where {S} = (ncomponents(data),)
+@inline farray_size(data::IJF{S, Nij}) where {S, Nij} = (Nij, Nij, ncomponents(data))
+@inline farray_size(data::IF{S, Ni}) where {S, Ni} = (Ni, ncomponents(data))
+@inline farray_size(data::VF{S, Nv}) where {S, Nv} = (Nv, ncomponents(data))
+@inline farray_size(data::VIJFH{S, Nv, Nij, Nh}) where {S, Nv, Nij, Nh} = (Nv, Nij, Nij, ncomponents(data), Nh)
+@inline farray_size(data::VIFH{S, Nv, Ni, Nh}) where {S, Nv, Ni, Nh} = (Nv, Ni, ncomponents(data), Nh)
+
 # Keep in sync with definition(s) in libs.
 @inline slab_index(i, j) = CartesianIndex(i, j, 1, 1, 1)
 @inline slab_index(i) = CartesianIndex(i, 1, 1, 1, 1)
diff --git a/src/Topologies/dss.jl b/src/Topologies/dss.jl
index 4432ffc0b6..31056ed87f 100644
--- a/src/Topologies/dss.jl
+++ b/src/Topologies/dss.jl
@@ -66,9 +66,7 @@ function create_dss_buffer(
     convert_to_array = DA isa Array ? false : true
     (_, _, _, Nv, Nh) = Base.size(data)
     Np = length(perimeter)
-    Nf =
-        length(parent(data)) == 0 ? 0 :
-        cld(length(parent(data)), (Nij * Nij * Nv * Nh))
+    Nf = DataLayouts.ncomponents(data)
     nfacedof = Nij - 2
     T = eltype(parent(data))
     TS = _transformed_type(data, local_geometry, local_weights, DA) # extract transformed type
@@ -941,7 +939,7 @@ function fill_send_buffer!(
 )
     (; perimeter_data, send_buf_idx, send_data) = dss_buffer
     (Np, _, _, Nv, nelems) = size(perimeter_data)
-    Nf = cld(length(parent(perimeter_data)), (Nv * Np * nelems))
+    Nf = DataLayouts.ncomponents(perimeter_data)
     pdata = parent(perimeter_data)
     nsend = size(send_buf_idx, 1)
     ctr = 1
@@ -970,7 +968,7 @@ function load_from_recv_buffer!(
 )
     (; perimeter_data, recv_buf_idx, recv_data) = dss_buffer
     (Np, _, _, Nv, nelems) = size(perimeter_data)
-    Nf = cld(length(parent(perimeter_data)), (Nv * Np * nelems))
+    Nf = DataLayouts.ncomponents(perimeter_data)
     pdata = parent(perimeter_data)
     nrecv = size(recv_buf_idx, 1)
     ctr = 1
diff --git a/src/Topologies/dss_transform.jl b/src/Topologies/dss_transform.jl
index 1fec9d587a..16403b67d0 100644
--- a/src/Topologies/dss_transform.jl
+++ b/src/Topologies/dss_transform.jl
@@ -285,7 +285,7 @@ function create_ghost_buffer(
         )
         k = stride(parent(send_data), 4)
     else
-        Nv, _, _, Nf, _ = size(parent(data))
+        Nv, _, _, Nf, _ = DataLayouts.farray_size(data)
         send_data =
             DataLayouts.VIJFH{S, Nv, Nij, Topologies.nsendelems(topology)}(
                 similar(
diff --git a/test/DataLayouts/unit_copyto.jl b/test/DataLayouts/unit_copyto.jl
index 0b304a4f81..1cf917fd1b 100644
--- a/test/DataLayouts/unit_copyto.jl
+++ b/test/DataLayouts/unit_copyto.jl
@@ -17,7 +17,7 @@ function test_copyto_float!(data)
     rand_data = DataLayouts.rebuild(data, similar(parent(data)))
     ArrayType = ClimaComms.array_type(ClimaComms.device())
     parent(rand_data) .=
-        ArrayType(rand(eltype(parent(data)), size(parent(data))))
+        ArrayType(rand(eltype(parent(data)), DataLayouts.farray_size(data)))
     Base.copyto!(data, rand_data) # test copyto!(::AbstractData, ::AbstractData)
     @test all(parent(data) .== parent(rand_data))
     Base.copyto!(data, Base.Broadcast.broadcasted(+, rand_data, 1)) # test copyto!(::AbstractData, ::Broadcasted)
@@ -30,7 +30,7 @@ function test_copyto!(data)
     rand_data = DataLayouts.rebuild(data, similar(parent(data)))
     ArrayType = ClimaComms.array_type(ClimaComms.device())
     parent(rand_data) .=
-        ArrayType(rand(eltype(parent(data)), size(parent(data))))
+        ArrayType(rand(eltype(parent(data)), DataLayouts.farray_size(data)))
     Base.copyto!(data, rand_data) # test copyto!(::AbstractData, ::AbstractData)
     @test all(parent(data.:1) .== parent(rand_data.:1))
     @test all(parent(data.:2) .== parent(rand_data.:2))
@@ -98,7 +98,7 @@ end
         SubArray(
             parent(data),
             ntuple(
-                i -> Base.Slice(Base.OneTo(size(parent(data), i))),
+                i -> Base.Slice(Base.OneTo(DataLayouts.farray_size(data, i))),
                 ndims(data),
             ),
         ),
diff --git a/test/DataLayouts/unit_fill.jl b/test/DataLayouts/unit_fill.jl
index fc803c0015..f8af1f022c 100644
--- a/test/DataLayouts/unit_fill.jl
+++ b/test/DataLayouts/unit_fill.jl
@@ -73,7 +73,10 @@ end
         data,
         SubArray(
             parent(data),
-            ntuple(i -> Base.OneTo(size(parent(data), i)), ndims(data)),
+            ntuple(
+                i -> Base.OneTo(DataLayouts.farray_size(data, i)),
+                ndims(data),
+            ),
         ),
     )
     FT = Float64
@@ -119,7 +122,10 @@ end
             data,
             SubArray(
                 parent(rdata),
-                ntuple(i -> Base.OneTo(size(parent(rdata), i)), ndims(rdata)),
+                ntuple(
+                    i -> Base.OneTo(DataLayouts.farray_size(rdata, i)),
+                    ndims(rdata),
+                ),
             ),
         )
         rarray = parent(parent(newdata))
diff --git a/test/DataLayouts/unit_mapreduce.jl b/test/DataLayouts/unit_mapreduce.jl
index 2da4547521..dcbf0a99a0 100644
--- a/test/DataLayouts/unit_mapreduce.jl
+++ b/test/DataLayouts/unit_mapreduce.jl
@@ -24,7 +24,8 @@ function test_mapreduce_1!(context, data)
     Random.seed!(1234)
     device = ClimaComms.device(context)
     ArrayType = ClimaComms.array_type(device)
-    rand_data = ArrayType(rand(eltype(parent(data)), size(parent(data))))
+    rand_data =
+        ArrayType(rand(eltype(parent(data)), DataLayouts.farray_size(data)))
     parent(data) .= rand_data
     if device isa ClimaComms.CUDADevice
         @test wrapper(context, identity, min, data) == minimum(parent(data))
@@ -40,7 +41,8 @@ function test_mapreduce_2!(context, data)
     Random.seed!(1234)
     device = ClimaComms.device(context)
     ArrayType = ClimaComms.array_type(device)
-    rand_data = ArrayType(rand(eltype(parent(data)), size(parent(data))))
+    rand_data =
+        ArrayType(rand(eltype(parent(data)), DataLayouts.farray_size(data)))
     parent(data) .= rand_data
     # mapreduce orders tuples lexicographically:
     #    minimum(((2,3), (1,4))) # (1, 4)
@@ -116,7 +118,10 @@ end
         data,
         SubArray(
             parent(data),
-            ntuple(i -> Base.OneTo(size(parent(data), i)), ndims(data)),
+            ntuple(
+                i -> Base.OneTo(DataLayouts.farray_size(data, i)),
+                ndims(data),
+            ),
         ),
     )
     FT = Float64
diff --git a/test/Operators/spectralelement/benchmark_utils.jl b/test/Operators/spectralelement/benchmark_utils.jl
index 4415f42cfe..fb82644196 100644
--- a/test/Operators/spectralelement/benchmark_utils.jl
+++ b/test/Operators/spectralelement/benchmark_utils.jl
@@ -6,6 +6,7 @@ using LinearAlgebra: ×
 import PrettyTables
 import LinearAlgebra as LA
 import OrderedCollections
+import ClimaCore.DataLayouts
 import ClimaCore.Operators as Operators
 import ClimaCore.Domains as Domains
 import ClimaCore.Meshes as Meshes
@@ -229,7 +230,7 @@ function setup_kernel_args(ARGS::Vector{String} = ARGS)
     f_comp2_buffer = Spaces.create_dss_buffer(f_comp2)
     f = @. Geometry.Contravariant3Vector(Geometry.WVector(ϕ))
 
-    s = size(parent(ϕ))
+    s = DataLayouts.farray_size(Fields.field_values(ϕ))
     ArrayType = ClimaComms.array_type(device)
     ϕ_arr = ArrayType(fill(FT(1), s))
     ψ_arr = ArrayType(fill(FT(2), s))
diff --git a/test/Spaces/distributed_cuda/ddss2.jl b/test/Spaces/distributed_cuda/ddss2.jl
index d138034d57..32e5d53a56 100644
--- a/test/Spaces/distributed_cuda/ddss2.jl
+++ b/test/Spaces/distributed_cuda/ddss2.jl
@@ -108,7 +108,7 @@ pid, nprocs = ClimaComms.init(context)
     end
 #! format: on
     p = @allocated Spaces.weighted_dss!(y0, dss_buffer)
-    iamroot && @test p ≤ 8064
+    iamroot && @test p ≤ 8832
 
     #testing weighted dss on a vector field
     init_vectorstate(local_geometry, p) = Geometry.Covariant12Vector(1.0, -1.0)
diff --git a/test/Spaces/distributed_cuda/ddss4.jl b/test/Spaces/distributed_cuda/ddss4.jl
index d129a6f263..d127bdbf79 100644
--- a/test/Spaces/distributed_cuda/ddss4.jl
+++ b/test/Spaces/distributed_cuda/ddss4.jl
@@ -100,7 +100,7 @@ pid, nprocs = ClimaComms.init(context)
     end
     p = @allocated Spaces.weighted_dss!(y0, dss_buffer)
     if pid == 1
-        @test p ≤ 7008
+        @test p ≤ 7776
     end
 
 end
diff --git a/test/Spaces/opt_spaces.jl b/test/Spaces/opt_spaces.jl
index a6de4e6f81..c2d66c8876 100644
--- a/test/Spaces/opt_spaces.jl
+++ b/test/Spaces/opt_spaces.jl
@@ -35,19 +35,19 @@ end
     if ClimaComms.device(context) isa ClimaComms.CUDADevice
         test_n_failures(86,   TU.PointSpace, context)
         test_n_failures(144,  TU.SpectralElementSpace1D, context)
-        test_n_failures(1120, TU.SpectralElementSpace2D, context)
+        test_n_failures(1141, TU.SpectralElementSpace2D, context)
         test_n_failures(123,  TU.ColumnCenterFiniteDifferenceSpace, context)
         test_n_failures(123,  TU.ColumnFaceFiniteDifferenceSpace, context)
-        test_n_failures(1126, TU.SphereSpectralElementSpace, context)
+        test_n_failures(1131, TU.SphereSpectralElementSpace, context)
         test_n_failures(1139, TU.CenterExtrudedFiniteDifferenceSpace, context)
         test_n_failures(1139, TU.FaceExtrudedFiniteDifferenceSpace, context)
     else
         test_n_failures(0,    TU.PointSpace, context)
         test_n_failures(137,  TU.SpectralElementSpace1D, context)
-        test_n_failures(308,  TU.SpectralElementSpace2D, context)
+        test_n_failures(310,  TU.SpectralElementSpace2D, context)
         test_n_failures(118,  TU.ColumnCenterFiniteDifferenceSpace, context)
         test_n_failures(118,  TU.ColumnFaceFiniteDifferenceSpace, context)
-        test_n_failures(314,  TU.SphereSpectralElementSpace, context)
+        test_n_failures(316,  TU.SphereSpectralElementSpace, context)
         test_n_failures(321,  TU.CenterExtrudedFiniteDifferenceSpace, context)
         test_n_failures(321,  TU.FaceExtrudedFiniteDifferenceSpace, context)
 
diff --git a/test/Spaces/unit_spaces.jl b/test/Spaces/unit_spaces.jl
index 2b1596a9ea..3a78fb429c 100644
--- a/test/Spaces/unit_spaces.jl
+++ b/test/Spaces/unit_spaces.jl
@@ -52,8 +52,7 @@ on_gpu = ClimaComms.device() isa ClimaComms.CUDADevice
     coord_data = Spaces.coordinates_data(space)
     @test eltype(coord_data) == Geometry.XPoint{Float64}
 
-    array = parent(Spaces.coordinates_data(space))
-    @test size(array) == (4, 1, 1)
+    @test DataLayouts.farray_size(Spaces.coordinates_data(space)) == (4, 1, 1)
     coord_slab = slab(Spaces.coordinates_data(space), 1)
     @test coord_slab[slab_index(1)] == Geometry.XPoint{FT}(-3)
     @test coord_slab[slab_index(4)] == Geometry.XPoint{FT}(5)
@@ -112,17 +111,18 @@ on_gpu || @testset "extruded (2d 1×3) finite difference space" begin
     # Extrusion
     f_space = Spaces.ExtrudedFiniteDifferenceSpace(hspace, vert_face_space)
     c_space = Spaces.CenterExtrudedFiniteDifferenceSpace(f_space)
-    array = parent(Spaces.coordinates_data(c_space))
+    s = DataLayouts.farray_size(Spaces.coordinates_data(c_space))
     z = Fields.coordinate_field(c_space).z
-    @test size(array) == (10, 4, 2, 5) # 10V, 4I, 2F(x,z), 5H
+    @test s == (10, 4, 2, 5) # 10V, 4I, 2F(x,z), 5H
     @test Spaces.local_geometry_type(typeof(f_space)) <: Geometry.LocalGeometry
     @test Spaces.local_geometry_type(typeof(c_space)) <: Geometry.LocalGeometry
 
     # Define test col index
     colidx = Fields.ColumnIndex{1}((4,), 5)
+    z_values = Fields.field_values(z[colidx])
     # Here valid `colidx` are `Fields.ColumnIndex{1}((1:4,), 1:5)`
-    @test size(parent(z[colidx])) == (10, 1)
-    @test Fields.field_values(z[colidx]) isa DataLayouts.VF
+    @test DataLayouts.farray_size(z_values) == (10, 1)
+    @test z_values isa DataLayouts.VF
     @test Spaces.column(z, 1, 1, 1) isa Fields.Field
     @test_throws BoundsError Spaces.column(z, 1, 2, 1)
     @test Spaces.column(z, 1, 2) isa Fields.Field
@@ -214,8 +214,7 @@ end
       quadrature: 4-point Gauss-Legendre-Lobatto quadrature"""
 
     coord_data = Spaces.coordinates_data(space)
-    array = parent(coord_data)
-    @test size(array) == (4, 4, 2, 1)
+    @test DataLayouts.farray_size(coord_data) == (4, 4, 2, 1)
     coord_slab = slab(coord_data, 1)
     @test coord_slab[slab_index(1, 1)] ≈ Geometry.XYPoint{FT}(-3.0, -2.0)
     @test coord_slab[slab_index(4, 1)] ≈ Geometry.XYPoint{FT}(5.0, -2.0)