Remove umpire and move device cuda code to a single source code file (
koparasy authored Jun 12, 2024
1 parent f3cc10f commit 531534a
Showing 17 changed files with 824 additions and 447 deletions.
src/AMSlib/AMS.cpp: 3 additions & 1 deletion

@@ -390,6 +390,7 @@ class AMSWrap
       pattern = std::string("<PID>");
       id = getpid();
     }
+
     // Combine hostname and pid
     std::ostringstream combined;
     combined << "." << hostname << "." << id;
@@ -665,7 +666,8 @@ const char *AMSGetAllocatorName(AMSResourceType device)
 void AMSSetAllocator(AMSResourceType resource, const char *alloc_name)
 {
   auto &rm = ams::ResourceManager::getInstance();
-  rm.setAllocator(std::string(alloc_name), resource);
+  std::string alloc(alloc_name);
+  rm.setAllocator(alloc, resource);
 }
 
 AMSCAbstrModel AMSRegisterAbstractModel(const char *domain_name,
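A note on the AMSSetAllocator change: constructing a named std::string instead of passing a temporary is the kind of edit forced by a by-reference signature. A minimal sketch of that reading, with the signature below assumed for illustration only (the real declaration lives in the resource manager headers):

#include <string>

enum AMSResourceType { AMS_HOST, AMS_DEVICE };  // simplified stand-in for the AMS.h enum

// Assumed signature: a non-const lvalue reference rejects temporaries.
void setAllocator(std::string& alloc_name, AMSResourceType resource);

void example(AMSResourceType resource, const char* alloc_name)
{
  // setAllocator(std::string(alloc_name), resource);  // ill-formed: an
  //                                                   // rvalue cannot bind
  //                                                   // to std::string&
  std::string alloc(alloc_name);  // a named lvalue binds fine
  setAllocator(alloc, resource);
}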
src/AMSlib/CMakeLists.txt: 24 additions & 16 deletions

@@ -8,27 +8,35 @@ file(GLOB_RECURSE MINIAPP_INCLUDES "*.hpp")
 #set global library path to link with tests if necessary
 set(LIBRARY_OUTPUT_PATH ${AMS_LIB_OUT_PATH})
 set(AMS_LIB_SRC ${MINIAPP_INCLUDES} AMS.cpp wf/resource_manager.cpp wf/debug.cpp wf/basedb.cpp wf/logger.cpp)
 
+if (WITH_CUDA)
+  list(APPEND AMS_LIB_SRC wf/cuda/utilities.cpp)
+  message(WARNING "FILES ARE ${AMS_LIB_SRC}")
+endif()
+
 
 # two targets: a shared lib and an exec
 add_library(AMS ${AMS_LIB_SRC} ${MINIAPP_INCLUDES})
 
 # ------------------------------------------------------------------------------
 if (WITH_CUDA)
-  set_target_properties(AMS PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH})
-
-#  if (BUILD_SHARED_LIBS)
-#    set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-#  else()
-#    set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-#    set_target_properties(AMS PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-#  endif()
-
-  set_source_files_properties(AMS.cpp PROPERTIES LANGUAGE CUDA)
-  set_source_files_properties(AMS.cpp PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH})
-  set_source_files_properties(AMS.cpp PROPERTIES COMPILE_FLAGS "--expt-extended-lambda")
-
-  if (WITH_PERFFLOWASPECT)
-    set_property(SOURCE AMS.cpp APPEND_STRING PROPERTY COMPILE_FLAGS " -Xcompiler=-Xclang -Xcompiler=-load -Xcompiler=-Xclang -Xcompiler=${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so")
-    set_source_files_properties(wf/resource_manager.cpp COMPILE_FLAGS "-Xclang -load -Xclang ${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so")
-
+  set_target_properties(AMS PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH})
+
+#  if (BUILD_SHARED_LIBS)
+#    set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+#  else()
+#    set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+#    set_target_properties(AMS PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+#  endif()
+
+  set_source_files_properties(wf/cuda/utilities.cpp PROPERTIES LANGUAGE CUDA)
+  set_source_files_properties(wf/cuda/utilities.cpp PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH})
+  set_source_files_properties(wf/cuda/utilities.cpp PROPERTIES COMPILE_FLAGS "--expt-extended-lambda")
+
+  if (WITH_PERFFLOWASPECT)
+    set_property(SOURCE AMS.cpp APPEND_STRING PROPERTY COMPILE_FLAGS " -Xcompiler=-Xclang -Xcompiler=-load -Xcompiler=-Xclang -Xcompiler=${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so")
+    set_source_files_properties(wf/resource_manager.cpp COMPILE_FLAGS "-Xclang -load -Xclang ${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so")
   endif()
 endif()
 
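The net effect of the CMake change is that wf/cuda/utilities.cpp becomes the one translation unit compiled as CUDA, while AMS.cpp reverts to plain C++. A sketch of the pattern this enables, using a hypothetical fill kernel and wrapper (the names fillKernel and deviceFill are illustrative, not from the commit):

// wf/cuda/utilities.cpp (sketch): the only file built with LANGUAGE CUDA.
// Kernels and <<<...>>> launches live here; other files call plain host
// functions declared in a normal C++ header such as wf/device.hpp.
#include <cstddef>
#include <cuda_runtime.h>

namespace ams {

__global__ void fillKernel(double* data, size_t n, double value)
{
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = value;
}

// Host-side entry point; this is what a .cpp caller links against.
void deviceFill(double* data, size_t n, double value)
{
  constexpr int threads = 256;
  int blocks = static_cast<int>((n + threads - 1) / threads);
  fillKernel<<<blocks, threads>>>(data, n, value);
  cudaDeviceSynchronize();
}

}  // namespace ams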
src/AMSlib/ml/hdcache.hpp: 4 additions & 1 deletion

@@ -17,6 +17,7 @@
 #include <stdexcept>
 #include <string>
 #include <type_traits>
+#include <unordered_map>
 #include <vector>
 
 #ifdef __ENABLE_FAISS__
@@ -29,6 +30,8 @@
 #include <faiss/gpu/GpuCloner.h>
 #include <faiss/gpu/GpuIndexIVFPQ.h>
 #include <faiss/gpu/StandardGpuResources.h>
+
+#include "wf/device.hpp"
 #endif
 #endif

@@ -366,7 +369,7 @@ class HDCache
     _evaluate(ndata, data, is_acceptable);
 
     if (cache_location == AMSResourceType::AMS_DEVICE) {
-      deviceCheckErrors(__FILE__, __LINE__);
+      ams::deviceCheckErrors(__FILE__, __LINE__);
     }
 
     DBG(UQModule, "Done with evalution of uq")
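deviceCheckErrors now lives in the ams namespace, declared in wf/device.hpp per the new include above. A sketch of the usual shape of such a file/line error check, assumed rather than copied from the commit:

#include <cstdio>
#include <cuda_runtime.h>

namespace ams {

// Typical shape for a file/line CUDA error check; the committed version
// may differ in how it reports and whether it aborts.
inline void deviceCheckErrors(const char* file, int line)
{
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
    std::fprintf(stderr, "CUDA error %s at %s:%d\n",
                 cudaGetErrorString(err), file, line);
}

}  // namespace ams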
src/AMSlib/ml/random_uq.hpp: 2 additions & 19 deletions

@@ -11,34 +11,17 @@

#include "AMS.h"
#include "wf/debug.h"
#include "wf/device.hpp"
#include "wf/utils.hpp"

static size_t round_up(size_t num, size_t denom)
{
return (num + denom - 1) / denom;
}

class RandomUQ
{
public:
PERFFASPECT()
inline void evaluate(const size_t ndata, bool *is_acceptable)
{
if (resourceLocation == AMSResourceType::AMS_DEVICE) {
#ifdef __ENABLE_CUDA__
//TODO: Move all of this code on device.cpp and provide better logic regarding
// number of threads
size_t threads = 256;
size_t blocks = round_up(ndata, threads);
random_uq_device<<<blocks, threads>>>(seed,
is_acceptable,
ndata,
threshold);
seed = seed + 1;
#else
THROW(std::runtime_error,
"Random-uq is not configured to use device allocations");
#endif
ams::device_random_uq(seed, is_acceptable, ndata, threshold);
} else {
random_uq_host(is_acceptable, ndata, threshold);
}
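The deleted launch logic shows what ams::device_random_uq has to do; a reconstruction from those lines follows. The kernel signature and the seed handling are assumptions (the old call site bumped the seed after each launch, so the wrapper here takes it by reference):

#include <cstddef>
#include <cuda_runtime.h>

// Kernel assumed to have moved into wf/cuda/utilities.cpp unchanged.
__global__ void random_uq_device(int seed, bool* is_acceptable,
                                 size_t ndata, double threshold);

namespace ams {

void device_random_uq(int& seed, bool* is_acceptable, size_t ndata,
                      double threshold)
{
  constexpr size_t threads = 256;
  size_t blocks = (ndata + threads - 1) / threads;  // the old round_up()
  random_uq_device<<<blocks, threads>>>(seed, is_acceptable, ndata, threshold);
  seed += 1;  // preserved from the deleted call site
}

}  // namespace ams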
src/AMSlib/ml/surrogate.hpp: 11 additions & 50 deletions

@@ -15,7 +15,7 @@
 #include <unordered_map>
 
 #include "AMS.h"
-#include "wf/cuda/utilities.cuh"
+#include "wf/device.hpp"
 
 #ifdef __ENABLE_TORCH__
 #include <ATen/core/interned_strings.h>
@@ -100,43 +100,12 @@ class SurrogateModel
   {
     // Transpose to get continuous memory and
     // perform single memcpy.
+    auto& rm = ams::ResourceManager::getInstance();
     tensor = tensor.transpose(1, 0);
-    if (model_resource == AMSResourceType::AMS_HOST) {
-      for (long j = 0; j < numCols; j++) {
-        auto tmp = tensor[j].contiguous();
-        TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
-        HtoHMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows);
-      }
-    } else {
-      for (long j = 0; j < numCols; j++) {
-        auto tmp = tensor[j].contiguous();
-        TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
-        DtoDMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows);
-      }
-    }
-  }
-
-  PERFFASPECT()
-  inline void tensorToHostArray(at::Tensor tensor,
-                                long numRows,
-                                long numCols,
-                                TypeInValue** array)
-  {
-    // Transpose to get continuous memory and
-    // perform single memcpy.
-    tensor = tensor.transpose(1, 0);
-    if (model_resource == AMSResourceType::AMS_HOST) {
-      for (long j = 0; j < numCols; j++) {
-        auto tmp = tensor[j].contiguous();
-        TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
-        HtoHMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows);
-      }
-    } else {
-      for (long j = 0; j < numCols; j++) {
-        auto tmp = tensor[j].contiguous();
-        TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
-        DtoHMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows);
-      }
-    }
+    for (long j = 0; j < numCols; j++) {
+      auto tmp = tensor[j].contiguous();
+      TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
+      rm.copy(ptr, model_resource, array[j], model_resource, numRows);
+    }
   }
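Collapsing tensorToArray and tensorToHostArray into one function works because ResourceManager::copy can dispatch on the resource types of both pointers; a single loop then covers the HtoH, DtoD, and DtoH cases. A sketch of such a dispatch, hypothetical rather than the committed implementation (the element-count argument matches the rm.copy call above):

#include <cstring>
#include <cuda_runtime.h>

enum AMSResourceType { AMS_HOST, AMS_DEVICE };  // simplified stand-in

// Hypothetical sketch of ResourceManager::copy: pick the memcpy flavor
// from where the source and destination actually live.
template <typename T>
void copy(T* src, AMSResourceType src_res,
          T* dest, AMSResourceType dest_res, size_t nelems)
{
  const size_t bytes = nelems * sizeof(T);
  if (src_res == AMS_HOST && dest_res == AMS_HOST)
    std::memcpy(dest, src, bytes);                           // HtoH
  else if (src_res == AMS_HOST)
    cudaMemcpy(dest, src, bytes, cudaMemcpyHostToDevice);    // HtoD
  else if (dest_res == AMS_HOST)
    cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToHost);    // DtoH
  else
    cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice);  // DtoD
}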

@@ -216,13 +185,9 @@ class SurrogateModel
     if (model_resource == AMSResourceType::AMS_DEVICE) {
 #ifdef __ENABLE_CUDA__
       DBG(Surrogate, "Compute mean delta uq predicates on device\n");
-      constexpr int block_size = 256;
-      int grid_size = divup(nrows, block_size);
-      computeDeltaUQMeanPredicatesKernel<<<grid_size, block_size>>>(
-          outputs_stdev, predicates, nrows, ncols, threshold);
-      // TODO: use combined routine when it lands.
-      cudaDeviceSynchronize();
-      CUDACHECKERROR();
+      ams::Device::computeDeltaUQMeanPredicatesDevice(
+          outputs_stdev, predicates, nrows, ncols, threshold);
 #else
       THROW(std::runtime_error,
             "Expected CUDA is enabled when model data are on DEVICE");
@@ -235,13 +200,9 @@ class SurrogateModel
     if (model_resource == AMSResourceType::AMS_DEVICE) {
 #ifdef __ENABLE_CUDA__
       DBG(Surrogate, "Compute max delta uq predicates on device\n");
-      constexpr int block_size = 256;
-      int grid_size = divup(nrows, block_size);
-      computeDeltaUQMaxPredicatesKernel<<<grid_size, block_size>>>(
-          outputs_stdev, predicates, nrows, ncols, threshold);
-      // TODO: use combined routine when it lands.
-      cudaDeviceSynchronize();
-      CUDACHECKERROR();
+      ams::Device::computeDeltaUQMaxPredicatesDevice(
+          outputs_stdev, predicates, nrows, ncols, threshold);
 #else
       THROW(std::runtime_error,
             "Expected CUDA is enabled when model data are on DEVICE");
@@ -306,7 +267,7 @@ class SurrogateModel
     }
 
     if (is_device()) {
-      deviceCheckErrors(__FILE__, __LINE__);
+      ams::deviceCheckErrors(__FILE__, __LINE__);
     }
 
     DBG(Surrogate,
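Both predicate branches now delegate to ams::Device. Reconstructing from the deleted lines, the mean variant plausibly wraps the old launch one-for-one (the max variant would be identical apart from the kernel). A sketch only; the kernel is assumed to have moved into wf/cuda/utilities.cpp unchanged:

#include <cstddef>
#include <cuda_runtime.h>

template <typename T>
__global__ void computeDeltaUQMeanPredicatesKernel(
    const T* outputs_stdev, bool* predicates,
    size_t nrows, size_t ncols, double threshold);

namespace ams {
namespace Device {

template <typename T>
void computeDeltaUQMeanPredicatesDevice(const T* outputs_stdev,
                                        bool* predicates,
                                        size_t nrows,
                                        size_t ncols,
                                        double threshold)
{
  constexpr int block_size = 256;
  int grid_size = static_cast<int>((nrows + block_size - 1) / block_size);
  computeDeltaUQMeanPredicatesKernel<<<grid_size, block_size>>>(
      outputs_stdev, predicates, nrows, ncols, threshold);
  // The old call sites synchronized and checked errors right after launch.
  cudaDeviceSynchronize();
}

// As templates in a single CUDA translation unit, these would need
// explicit instantiations for each supported TypeInValue.

}  // namespace Device
}  // namespace ams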
src/AMSlib/wf/basedb.hpp: 3 additions & 17 deletions

@@ -1962,14 +1962,8 @@ class RMQPublisherHandler final : public AMQP::LibEventHandler
                 msg_id)
     auto& msg = *it;
     auto& rm = ams::ResourceManager::getInstance();
-    try {
-      rm.deallocate(msg.data(), AMSResourceType::AMS_HOST);
-    } catch (const umpire::util::Exception& e) {
-      FATAL(RMQPublisherHandler,
-            "Failed to deallocate #%d (%p)",
-            msg.id(),
-            msg.data());
-    }
+    rm.deallocate(msg.data(), AMSResourceType::AMS_HOST);
 
     DBG(RMQPublisherHandler, "Deallocated msg #%d (%p)", msg.id(), msg.data())
     buf.erase(it);
   }
@@ -1984,14 +1978,7 @@ class RMQPublisherHandler final : public AMQP::LibEventHandler
     auto& rm = ams::ResourceManager::getInstance();
     for (auto& dp : buffer) {
       DBG(RMQPublisherHandler, "deallocate msg #%d (%p)", dp.id(), dp.data())
-      try {
-        rm.deallocate(dp.data(), AMSResourceType::AMS_HOST);
-      } catch (const umpire::util::Exception& e) {
-        FATAL(RMQPublisherHandler,
-              "Failed to deallocate msg #%d (%p)",
-              dp.id(),
-              dp.data());
-      }
+      rm.deallocate(dp.data(), AMSResourceType::AMS_HOST);
     }
     buffer.clear();
   }
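With Umpire removed, rm.deallocate can no longer throw umpire::util::Exception, which is why the try/catch and FATAL scaffolding around every deallocation is dropped rather than moved. A hypothetical sketch of what a post-Umpire deallocate might reduce to (the committed implementation in wf/resource_manager.cpp may well differ):

#include <cstdlib>
#include <cuda_runtime.h>

enum AMSResourceType { AMS_HOST, AMS_DEVICE };  // simplified stand-in

// Hypothetical post-Umpire deallocation: plain allocators instead of a
// pooling library, so there is no Umpire exception type left to catch.
void deallocate(void* ptr, AMSResourceType resource)
{
  if (resource == AMS_DEVICE)
    cudaFree(ptr);
  else
    std::free(ptr);
}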
@@ -2377,7 +2364,6 @@ class RMQInterface
   }
 };
 
-
 /* A class that provides a BaseDB interface to AMS workflow.
  * When storing data it pushes the data to the RMQ server asynchronously
  */