Skip to content

Commit

Permalink
Use C++17 RNG rather than cuRAND for host RNG
Browse files Browse the repository at this point in the history
  • Loading branch information
cliffburdick committed Jan 31, 2025
1 parent 877b4ec commit 0bbc06b
Showing 1 changed file with 22 additions and 91 deletions.
113 changes: 22 additions & 91 deletions include/matx/generators/random.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include <cuda/std/complex>
#include <curand_kernel.h>
#include <type_traits>
#include <random>

namespace matx {

Expand Down Expand Up @@ -162,55 +163,7 @@ __MATX_INLINE__ __MATX_DEVICE__ void get_random(cuda::std::complex<double> &val,
}
};

/**
* Generates random numbers
*
* @tparam
* Type of random number
*
* Generate random numbers based on a size and seed. Uses the Philox 4x32
* generator with 10 rounds.
*/
template <typename T> class [[deprecated("Use random() operator instead of randomGenerator_t")]] randomGenerator_t {
private:
index_t total_threads_;
bool init_;
curandStatePhilox4_32_10_t *states_;
uint64_t seed_;

public:
randomGenerator_t() = delete;

/**
* Constructs a random number generator
*
* This call will allocate memory sufficiently large enough to store state of
* the RNG
*
* @param total_threads
* Number of random values to generate
* @param seed
* Seed for the RNG
*/
__MATX_INLINE__ randomGenerator_t(index_t total_threads, uint64_t seed)
: total_threads_(total_threads)
{
#ifdef __CUDACC__
matxAlloc((void **)&states_,
total_threads_ * sizeof(curandStatePhilox4_32_10_t),
MATX_DEVICE_MEMORY);

int threads = 128;
int blocks = static_cast<int>((total_threads_ + threads - 1) / threads);
curand_setup_kernel<<<blocks, threads>>>(states_, seed, total_threads);
#endif
};

/**
* Destroy the RNG and free all memory
*/
__MATX_INLINE__ ~randomGenerator_t() { matxFree(states_); }
};

namespace detail {

Expand All @@ -237,6 +190,7 @@ namespace detail {
index_t total_size_;
mutable curandStatePhilox4_32_10_t *states_;
uint64_t seed_;
mutable std::mt19937 rng_;
mutable bool init_ = false;
mutable bool device_;

Expand Down Expand Up @@ -321,13 +275,7 @@ namespace detail {
}
else if constexpr (is_host_executor_v<Executor>) {
if (!init_) {
[[maybe_unused]] curandStatus_t ret;

ret = curandCreateGeneratorHost(&gen_, CURAND_RNG_PSEUDO_MT19937);
MATX_ASSERT_STR_EXP(ret, CURAND_STATUS_SUCCESS, matxCudaError, "Failed to create random number generator");

ret = curandSetPseudoRandomGeneratorSeed(gen_, seed_);
MATX_ASSERT_STR_EXP(ret, CURAND_STATUS_SUCCESS, matxCudaError, "Error setting random seed");
rng_.seed(seed_);

// In the future we may allocate a buffer, but for now we generate a single number at a time
// matxAlloc((void **)&val, total_size_ * sizeof(T), MATX_HOST_MEMORY, stream);
Expand Down Expand Up @@ -410,44 +358,32 @@ namespace detail {
std::is_same_v<T, cuda::std::complex<double>>
)
{

using inner_type = typename inner_op_type_t<T>::type;

if (fParams_.dist_ == UNIFORM) {
if constexpr (std::is_same_v<T, float>) {
curandGenerateUniform(gen_, &val, 1);
std::uniform_real_distribution<inner_type> uniform_real_dist{0.0, 1.0};

if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
val = uniform_real_dist(rng_);
}
else if constexpr (std::is_same_v<T, double>) {
curandGenerateUniformDouble(gen_, &val, 1);
}
else if constexpr (std::is_same_v<T, cuda::std::complex<float>>) {
float *tmp = reinterpret_cast<float *>(&val);
curandGenerateUniform(gen_, &tmp[0], 1);
curandGenerateUniform(gen_, &tmp[1], 1);
}
else if constexpr (std::is_same_v<T, cuda::std::complex<double>>) {
double *tmp = reinterpret_cast<double *>(&val);
curandGenerateUniformDouble(gen_, &tmp[0], 1);
curandGenerateUniformDouble(gen_, &tmp[1], 1);
else if constexpr ( std::is_same_v<T, cuda::std::complex<float>> ||
std::is_same_v<T, cuda::std::complex<double>>) {
val = {uniform_real_dist(rng_), uniform_real_dist(rng_)};
}

val = fParams_.alpha_ * val + fParams_.beta_;
}
else if (fParams_.dist_ == NORMAL) {
if constexpr (std::is_same_v<T, float>) {
curandGenerateNormal(gen_, &val, 1, fParams_.beta_, fParams_.alpha_);
}
else if constexpr (std::is_same_v<T, double>) {
curandGenerateNormalDouble(gen_, &val, 1, fParams_.beta_, fParams_.alpha_);
}
else if constexpr (std::is_same_v<T, cuda::std::complex<float>>) {
float *tmp = reinterpret_cast<float *>(&val);
curandGenerateNormal(gen_, &tmp[0], 1, fParams_.beta_, fParams_.alpha_);
curandGenerateNormal(gen_, &tmp[1], 1, fParams_.beta_, fParams_.alpha_);
std::normal_distribution<inner_type> normal_dist(0.0, 1.0);

if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
val = normal_dist(rng_);
}
else if constexpr (std::is_same_v<T, cuda::std::complex<double>>) {
double *tmp = reinterpret_cast<double *>(&val);
curandGenerateNormalDouble(gen_, &tmp[0], 1, fParams_.beta_, fParams_.alpha_);
curandGenerateNormalDouble(gen_, &tmp[1], 1, fParams_.beta_, fParams_.alpha_);
else if constexpr (std::is_same_v<T, cuda::std::complex<float>> || std::is_same_v<T, cuda::std::complex<double>>) {
val = {normal_dist(rng_), normal_dist(rng_)};
}

val = fParams_.alpha_ * val + fParams_.beta_;
}
else {
val = 0;
Expand All @@ -460,13 +396,8 @@ namespace detail {
std::is_same_v<T, int64_t>
)
{
float fScale;
curandGenerateUniform(gen_, &fScale, 1);

// Scale to the provided min and max range
double fMax = static_cast<double>(iParams_.max_);
double fMin = static_cast<double>(iParams_.min_);
val = static_cast<T>(fScale * (fMax - fMin) + fMin);
std::uniform_int_distribution<T> uniform_int_dist(iParams_.min_, iParams_.max_);
val = uniform_int_dist(rng_);
}
#endif

Expand Down

0 comments on commit 0bbc06b

Please sign in to comment.