Skip to content

Commit

Permalink
Merge branch 'branch-25.04' into realloc-policy-fix
Browse files Browse the repository at this point in the history
  • Loading branch information
shrshi authored Feb 12, 2025
2 parents 128b2c3 + 57dc53c commit fbdf4b8
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 21 deletions.
11 changes: 8 additions & 3 deletions cpp/include/cudf/utilities/span.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,11 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent
* @param idx the index of the element to access
* @return A reference to the idx-th element of the sequence, i.e., `data()[idx]`
*/
constexpr typename base::reference operator[](size_type idx) const { return this->_data[idx]; }
constexpr typename base::reference operator[](typename base::size_type idx) const
{
  // Reject index types narrower than size_t at compile time: a narrow index
  // could silently truncate when addressing a large span.
  static_assert(sizeof(idx) >= sizeof(size_t), "index type must not be smaller than size_t");
  // Return a reference to the idx-th element, i.e. data()[idx].
  return *(this->_data + idx);
}

// not noexcept due to undefined behavior when size = 0
/**
Expand Down Expand Up @@ -402,8 +406,9 @@ struct device_span : public cudf::detail::span_base<T, Extent, device_span<T, Ex
* @param idx the index of the element to access
* @return A reference to the idx-th element of the sequence, i.e., `data()[idx]`
*/
__device__ constexpr typename base::reference operator[](size_type idx) const
__device__ constexpr typename base::reference operator[](typename base::size_type idx) const
{
  // Reject index types narrower than size_t at compile time: a narrow index
  // could silently truncate when addressing a large device span.
  static_assert(sizeof(idx) >= sizeof(size_t), "index type must not be smaller than size_t");
  // Return a reference to the idx-th element, i.e. data()[idx].
  return *(this->_data + idx);
}

Expand Down Expand Up @@ -512,7 +517,7 @@ class base_2dspan {
* @param row the index of the element to access
* @return A reference to the row-th element of the sequence, i.e., `data()[row]`
*/
CUDF_HOST_DEVICE constexpr RowType<T, dynamic_extent> operator[](size_t row) const
CUDF_HOST_DEVICE constexpr RowType<T, dynamic_extent> operator[](std::size_t row) const
{
  // Each row occupies `_size.second` contiguous elements of the flat buffer;
  // slice out the row-th such window.
  auto const row_width = _size.second;
  return _flat.subspan(row * row_width, row_width);
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/groupby/hash/compute_aggregations.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ rmm::device_uvector<cudf::size_type> compute_aggregations(
// for shared memory aggregations
auto const size = cudf::type_dispatcher<cudf::dispatch_storage_type>(request.values.type(),
size_of_functor{});
return static_cast<size_type>(data_buffer_size) >= (size * GROUPBY_CARDINALITY_THRESHOLD);
return data_buffer_size >= (size * GROUPBY_CARDINALITY_THRESHOLD);
});

// Performs naive global memory aggregations when the workload is not compatible with shared
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/groupby/hash/compute_shared_memory_aggs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ __device__ void initialize_shmem_aggregations(cooperative_groups::thread_block c
idx);
}
}
block.sync();
}

__device__ void compute_pre_aggregrations(cudf::size_type col_start,
Expand Down Expand Up @@ -174,7 +173,6 @@ __device__ void compute_final_aggregations(cooperative_groups::thread_block cons
idx);
}
}
block.sync();
}

/* Takes the local_mapping_index and global_mapping_index to compute
Expand Down Expand Up @@ -213,6 +211,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows,
block.sync();

while (col_end < num_cols) {
block.sync();
if (block.thread_rank() == 0) {
calculate_columns_to_aggregate(col_start,
col_end,
Expand All @@ -234,6 +233,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows,
shmem_agg_mask_offsets,
cardinality,
d_agg_kinds);
block.sync();

compute_pre_aggregrations(col_start,
col_end,
Expand Down Expand Up @@ -263,7 +263,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows,
}
} // namespace

std::size_t get_available_shared_memory_size(cudf::size_type grid_size)
size_type get_available_shared_memory_size(cudf::size_type grid_size)
{
auto const active_blocks_per_sm =
cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors());
Expand All @@ -276,7 +276,7 @@ std::size_t get_available_shared_memory_size(cudf::size_type grid_size)
}

void compute_shared_memory_aggs(cudf::size_type grid_size,
std::size_t available_shmem_size,
size_type available_shmem_size,
cudf::size_type num_input_rows,
bitmask_type const* row_bitmask,
bool skip_rows_with_nulls,
Expand Down
10 changes: 5 additions & 5 deletions cpp/src/groupby/hash/compute_shared_memory_aggs.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -22,15 +22,15 @@
#include <rmm/cuda_stream_view.hpp>

namespace cudf::groupby::detail::hash {
std::size_t get_available_shared_memory_size(cudf::size_type grid_size);
size_type get_available_shared_memory_size(cudf::size_type grid_size);

std::size_t constexpr compute_shmem_offsets_size(cudf::size_type num_cols)
size_type constexpr compute_shmem_offsets_size(cudf::size_type num_cols)
{
return sizeof(cudf::size_type) * num_cols;
return static_cast<size_type>(sizeof(cudf::size_type) * num_cols);
}

void compute_shared_memory_aggs(cudf::size_type grid_size,
std::size_t available_shmem_size,
cudf::size_type available_shmem_size,
cudf::size_type num_input_rows,
bitmask_type const* row_bitmask,
bool skip_rows_with_nulls,
Expand Down
21 changes: 13 additions & 8 deletions cpp/src/groupby/hash/shared_memory_aggregator.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,6 +25,11 @@
#include <cuda/std/type_traits>

namespace cudf::groupby::detail::hash {
__device__ constexpr void set_mask(bool* mask)
{
  // Cheap non-atomic pre-check: only issue the atomic when the flag still
  // reads false, reducing contention when many threads target the same slot.
  bool const already_set = *mask;
  if (!already_set) { cudf::detail::atomic_max(mask, true); }
}

template <typename Source, cudf::aggregation::Kind k, typename Enable = void>
struct update_target_element_shmem {
__device__ void operator()(
Expand Down Expand Up @@ -52,7 +57,7 @@ struct update_target_element_shmem<
cudf::detail::atomic_min(&target_casted[target_index],
static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));

if (!target_mask[target_index]) { target_mask[target_index] = true; }
set_mask(target_mask + target_index);
}
};

Expand All @@ -74,7 +79,7 @@ struct update_target_element_shmem<
cudf::detail::atomic_max(&target_casted[target_index],
static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));

if (!target_mask[target_index]) { target_mask[target_index] = true; }
set_mask(target_mask + target_index);
}
};

Expand All @@ -97,7 +102,7 @@ struct update_target_element_shmem<
cudf::detail::atomic_add(&target_casted[target_index],
static_cast<DeviceTarget>(source.element<DeviceSource>(source_index)));

if (!target_mask[target_index]) { target_mask[target_index] = true; }
set_mask(target_mask + target_index);
}
};

Expand All @@ -117,7 +122,7 @@ struct update_target_element_shmem<
auto value = static_cast<Target>(source.element<Source>(source_index));
cudf::detail::atomic_add(&target_casted[target_index], value * value);

if (!target_mask[target_index]) { target_mask[target_index] = true; }
set_mask(target_mask + target_index);
}
};

Expand All @@ -137,7 +142,7 @@ struct update_target_element_shmem<
cudf::detail::atomic_mul(&target_casted[target_index],
static_cast<Target>(source.element<Source>(source_index)));

if (!target_mask[target_index]) { target_mask[target_index] = true; }
set_mask(target_mask + target_index);
}
};

Expand Down Expand Up @@ -202,7 +207,7 @@ struct update_target_element_shmem<
}
}

if (!target_mask[target_index]) { target_mask[target_index] = true; }
set_mask(target_mask + target_index);
}
};

Expand All @@ -228,7 +233,7 @@ struct update_target_element_shmem<
}
}

if (!target_mask[target_index]) { target_mask[target_index] = true; }
set_mask(target_mask + target_index);
}
};

Expand Down

0 comments on commit fbdf4b8

Please sign in to comment.