Remove umpire and move device cuda code to a single source code file (
koparasy authored Jun 12, 2024
1 parent f3cc10f commit 531534a
Showing 17 changed files with 824 additions and 447 deletions.
src/AMSlib/AMS.cpp: 3 additions & 1 deletion

@@ -390,6 +390,7 @@ class AMSWrap
       pattern = std::string("<PID>");
       id = getpid();
     }
+
     // Combine hostname and pid
     std::ostringstream combined;
     combined << "." << hostname << "." << id;
@@ -665,7 +666,8 @@ const char *AMSGetAllocatorName(AMSResourceType device)
 void AMSSetAllocator(AMSResourceType resource, const char *alloc_name)
 {
   auto &rm = ams::ResourceManager::getInstance();
-  rm.setAllocator(std::string(alloc_name), resource);
+  std::string alloc(alloc_name);
+  rm.setAllocator(alloc, resource);
 }
 
 AMSCAbstrModel AMSRegisterAbstractModel(const char *domain_name,
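A note on the AMSSetAllocator change: constructing a named std::string instead of passing a temporary is the kind of edit forced by a by-reference signature. A minimal sketch of that reading, with the signature below assumed for illustration only (the real declaration lives in the resource manager headers):

#include <string>

enum AMSResourceType { AMS_HOST, AMS_DEVICE };  // simplified stand-in for the AMS.h enum

// Assumed signature: a non-const lvalue reference rejects temporaries.
void setAllocator(std::string& alloc_name, AMSResourceType resource);

void example(AMSResourceType resource, const char* alloc_name)
{
  // setAllocator(std::string(alloc_name), resource);  // ill-formed: an
  //                                                   // rvalue cannot bind
  //                                                   // to std::string&
  std::string alloc(alloc_name);  // a named lvalue binds fine
  setAllocator(alloc, resource);
}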
src/AMSlib/CMakeLists.txt: 24 additions & 16 deletions

@@ -8,27 +8,35 @@ file(GLOB_RECURSE MINIAPP_INCLUDES "*.hpp")
 #set global library path to link with tests if necessary
 set(LIBRARY_OUTPUT_PATH ${AMS_LIB_OUT_PATH})
 set(AMS_LIB_SRC ${MINIAPP_INCLUDES} AMS.cpp wf/resource_manager.cpp wf/debug.cpp wf/basedb.cpp wf/logger.cpp)
 
+if (WITH_CUDA)
+  list(APPEND AMS_LIB_SRC wf/cuda/utilities.cpp)
+  message(WARNING "FILES ARE ${AMS_LIB_SRC}")
+endif()
+
 
 # two targets: a shared lib and an exec
 add_library(AMS ${AMS_LIB_SRC} ${MINIAPP_INCLUDES})
 
 # ------------------------------------------------------------------------------
 if (WITH_CUDA)
-  set_target_properties(AMS PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH})
-
-#  if (BUILD_SHARED_LIBS)
-#    set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-#  else()
-#    set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-#    set_target_properties(AMS PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-#  endif()
-
-  set_source_files_properties(AMS.cpp PROPERTIES LANGUAGE CUDA)
-  set_source_files_properties(AMS.cpp PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH})
-  set_source_files_properties(AMS.cpp PROPERTIES COMPILE_FLAGS "--expt-extended-lambda")
-
-  if (WITH_PERFFLOWASPECT)
-    set_property(SOURCE AMS.cpp APPEND_STRING PROPERTY COMPILE_FLAGS " -Xcompiler=-Xclang -Xcompiler=-load -Xcompiler=-Xclang -Xcompiler=${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so")
-    set_source_files_properties(wf/resource_manager.cpp COMPILE_FLAGS "-Xclang -load -Xclang ${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so")
-
+  set_target_properties(AMS PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH})
+
+#  if (BUILD_SHARED_LIBS)
+#    set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+#  else()
+#    set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+#    set_target_properties(AMS PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+#  endif()
+
+  set_source_files_properties(wf/cuda/utilities.cpp PROPERTIES LANGUAGE CUDA)
+  set_source_files_properties(wf/cuda/utilities.cpp PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH})
+  set_source_files_properties(wf/cuda/utilities.cpp PROPERTIES COMPILE_FLAGS "--expt-extended-lambda")
+
+  if (WITH_PERFFLOWASPECT)
+    set_property(SOURCE AMS.cpp APPEND_STRING PROPERTY COMPILE_FLAGS " -Xcompiler=-Xclang -Xcompiler=-load -Xcompiler=-Xclang -Xcompiler=${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so")
+    set_source_files_properties(wf/resource_manager.cpp COMPILE_FLAGS "-Xclang -load -Xclang ${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so")
   endif()
 endif()
 
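The net effect of the CMake change is that wf/cuda/utilities.cpp becomes the one translation unit compiled as CUDA, while AMS.cpp reverts to plain C++. A sketch of the pattern this enables, using a hypothetical fill kernel and wrapper (the names fillKernel and deviceFill are illustrative, not from the commit):

// wf/cuda/utilities.cpp (sketch): the only file built with LANGUAGE CUDA.
// Kernels and <<<...>>> launches live here; other files call plain host
// functions declared in a normal C++ header such as wf/device.hpp.
#include <cstddef>
#include <cuda_runtime.h>

namespace ams {

__global__ void fillKernel(double* data, size_t n, double value)
{
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = value;
}

// Host-side entry point; this is what a .cpp caller links against.
void deviceFill(double* data, size_t n, double value)
{
  constexpr int threads = 256;
  int blocks = static_cast<int>((n + threads - 1) / threads);
  fillKernel<<<blocks, threads>>>(data, n, value);
  cudaDeviceSynchronize();
}

}  // namespace ams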
src/AMSlib/ml/hdcache.hpp: 4 additions & 1 deletion

@@ -17,6 +17,7 @@
 #include <stdexcept>
 #include <string>
 #include <type_traits>
+#include <unordered_map>
 #include <vector>
 
 #ifdef __ENABLE_FAISS__
@@ -29,6 +30,8 @@
 #include <faiss/gpu/GpuCloner.h>
 #include <faiss/gpu/GpuIndexIVFPQ.h>
 #include <faiss/gpu/StandardGpuResources.h>
+
+#include "wf/device.hpp"
 #endif
 #endif

@@ -366,7 +369,7 @@ class HDCache
     _evaluate(ndata, data, is_acceptable);
 
     if (cache_location == AMSResourceType::AMS_DEVICE) {
-      deviceCheckErrors(__FILE__, __LINE__);
+      ams::deviceCheckErrors(__FILE__, __LINE__);
     }
 
     DBG(UQModule, "Done with evalution of uq")
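deviceCheckErrors now lives in the ams namespace, declared in wf/device.hpp per the new include above. A sketch of the usual shape of such a file/line error check, assumed rather than copied from the commit:

#include <cstdio>
#include <cuda_runtime.h>

namespace ams {

// Typical shape for a file/line CUDA error check; the committed version
// may differ in how it reports and whether it aborts.
inline void deviceCheckErrors(const char* file, int line)
{
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
    std::fprintf(stderr, "CUDA error %s at %s:%d\n",
                 cudaGetErrorString(err), file, line);
}

}  // namespace ams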
src/AMSlib/ml/random_uq.hpp: 2 additions & 19 deletions

@@ -11,34 +11,17 @@

#include "AMS.h"
#include "wf/debug.h"
#include "wf/device.hpp"
#include "wf/utils.hpp"

static size_t round_up(size_t num, size_t denom)
{
return (num + denom - 1) / denom;
}

class RandomUQ
{
public:
PERFFASPECT()
inline void evaluate(const size_t ndata, bool *is_acceptable)
{
if (resourceLocation == AMSResourceType::AMS_DEVICE) {
#ifdef __ENABLE_CUDA__
//TODO: Move all of this code on device.cpp and provide better logic regarding
// number of threads
size_t threads = 256;
size_t blocks = round_up(ndata, threads);
random_uq_device<<<blocks, threads>>>(seed,
is_acceptable,
ndata,
threshold);
seed = seed + 1;
#else
THROW(std::runtime_error,
"Random-uq is not configured to use device allocations");
#endif
ams::device_random_uq(seed, is_acceptable, ndata, threshold);
} else {
random_uq_host(is_acceptable, ndata, threshold);
}
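The deleted launch logic shows what ams::device_random_uq has to do; a reconstruction from those lines follows. The kernel signature and the seed handling are assumptions (the old call site bumped the seed after each launch, so the wrapper here takes it by reference):

#include <cstddef>
#include <cuda_runtime.h>

// Kernel assumed to have moved into wf/cuda/utilities.cpp unchanged.
__global__ void random_uq_device(int seed, bool* is_acceptable,
                                 size_t ndata, double threshold);

namespace ams {

void device_random_uq(int& seed, bool* is_acceptable, size_t ndata,
                      double threshold)
{
  constexpr size_t threads = 256;
  size_t blocks = (ndata + threads - 1) / threads;  // the old round_up()
  random_uq_device<<<blocks, threads>>>(seed, is_acceptable, ndata, threshold);
  seed += 1;  // preserved from the deleted call site
}

}  // namespace ams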
src/AMSlib/ml/surrogate.hpp: 11 additions & 50 deletions

@@ -15,7 +15,7 @@
 #include <unordered_map>
 
 #include "AMS.h"
-#include "wf/cuda/utilities.cuh"
+#include "wf/device.hpp"
 
 #ifdef __ENABLE_TORCH__
 #include <ATen/core/interned_strings.h>
@@ -100,43 +100,12 @@ class SurrogateModel
   {
     // Transpose to get continuous memory and
     // perform single memcpy.
+    auto& rm = ams::ResourceManager::getInstance();
     tensor = tensor.transpose(1, 0);
-    if (model_resource == AMSResourceType::AMS_HOST) {
-      for (long j = 0; j < numCols; j++) {
-        auto tmp = tensor[j].contiguous();
-        TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
-        HtoHMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows);
-      }
-    } else {
-      for (long j = 0; j < numCols; j++) {
-        auto tmp = tensor[j].contiguous();
-        TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
-        DtoDMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows);
-      }
-    }
-  }
-
-  PERFFASPECT()
-  inline void tensorToHostArray(at::Tensor tensor,
-                                long numRows,
-                                long numCols,
-                                TypeInValue** array)
-  {
-    // Transpose to get continuous memory and
-    // perform single memcpy.
-    tensor = tensor.transpose(1, 0);
-    if (model_resource == AMSResourceType::AMS_HOST) {
-      for (long j = 0; j < numCols; j++) {
-        auto tmp = tensor[j].contiguous();
-        TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
-        HtoHMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows);
-      }
-    } else {
-      for (long j = 0; j < numCols; j++) {
-        auto tmp = tensor[j].contiguous();
-        TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
-        DtoHMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows);
-      }
-    }
+    for (long j = 0; j < numCols; j++) {
+      auto tmp = tensor[j].contiguous();
+      TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
+      rm.copy(ptr, model_resource, array[j], model_resource, numRows);
+    }
   }
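Collapsing tensorToArray and tensorToHostArray into one function works because ResourceManager::copy can dispatch on the resource types of both pointers; a single loop then covers the HtoH, DtoD, and DtoH cases. A sketch of such a dispatch, hypothetical rather than the committed implementation (the element-count argument matches the rm.copy call above):

#include <cstring>
#include <cuda_runtime.h>

enum AMSResourceType { AMS_HOST, AMS_DEVICE };  // simplified stand-in

// Hypothetical sketch of ResourceManager::copy: pick the memcpy flavor
// from where the source and destination actually live.
template <typename T>
void copy(T* src, AMSResourceType src_res,
          T* dest, AMSResourceType dest_res, size_t nelems)
{
  const size_t bytes = nelems * sizeof(T);
  if (src_res == AMS_HOST && dest_res == AMS_HOST)
    std::memcpy(dest, src, bytes);                           // HtoH
  else if (src_res == AMS_HOST)
    cudaMemcpy(dest, src, bytes, cudaMemcpyHostToDevice);    // HtoD
  else if (dest_res == AMS_HOST)
    cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToHost);    // DtoH
  else
    cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice);  // DtoD
}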

@@ -216,13 +185,9 @@ class SurrogateModel
     if (model_resource == AMSResourceType::AMS_DEVICE) {
 #ifdef __ENABLE_CUDA__
       DBG(Surrogate, "Compute mean delta uq predicates on device\n");
-      constexpr int block_size = 256;
-      int grid_size = divup(nrows, block_size);
-      computeDeltaUQMeanPredicatesKernel<<<grid_size, block_size>>>(
-          outputs_stdev, predicates, nrows, ncols, threshold);
-      // TODO: use combined routine when it lands.
-      cudaDeviceSynchronize();
-      CUDACHECKERROR();
+      ams::Device::computeDeltaUQMeanPredicatesDevice(
+          outputs_stdev, predicates, nrows, ncols, threshold);
 #else
       THROW(std::runtime_error,
             "Expected CUDA is enabled when model data are on DEVICE");
@@ -235,13 +200,9 @@ class SurrogateModel
     if (model_resource == AMSResourceType::AMS_DEVICE) {
 #ifdef __ENABLE_CUDA__
       DBG(Surrogate, "Compute max delta uq predicates on device\n");
-      constexpr int block_size = 256;
-      int grid_size = divup(nrows, block_size);
-      computeDeltaUQMaxPredicatesKernel<<<grid_size, block_size>>>(
-          outputs_stdev, predicates, nrows, ncols, threshold);
-      // TODO: use combined routine when it lands.
-      cudaDeviceSynchronize();
-      CUDACHECKERROR();
+      ams::Device::computeDeltaUQMaxPredicatesDevice(
+          outputs_stdev, predicates, nrows, ncols, threshold);
 #else
       THROW(std::runtime_error,
             "Expected CUDA is enabled when model data are on DEVICE");
@@ -306,7 +267,7 @@ class SurrogateModel
     }
 
     if (is_device()) {
-      deviceCheckErrors(__FILE__, __LINE__);
+      ams::deviceCheckErrors(__FILE__, __LINE__);
     }
 
     DBG(Surrogate,
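Both predicate branches now delegate to ams::Device. Reconstructing from the deleted lines, the mean variant plausibly wraps the old launch one-for-one (the max variant would be identical apart from the kernel). A sketch only; the kernel is assumed to have moved into wf/cuda/utilities.cpp unchanged:

#include <cstddef>
#include <cuda_runtime.h>

template <typename T>
__global__ void computeDeltaUQMeanPredicatesKernel(
    const T* outputs_stdev, bool* predicates,
    size_t nrows, size_t ncols, double threshold);

namespace ams {
namespace Device {

template <typename T>
void computeDeltaUQMeanPredicatesDevice(const T* outputs_stdev,
                                        bool* predicates,
                                        size_t nrows,
                                        size_t ncols,
                                        double threshold)
{
  constexpr int block_size = 256;
  int grid_size = static_cast<int>((nrows + block_size - 1) / block_size);
  computeDeltaUQMeanPredicatesKernel<<<grid_size, block_size>>>(
      outputs_stdev, predicates, nrows, ncols, threshold);
  // The old call sites synchronized and checked errors right after launch.
  cudaDeviceSynchronize();
}

// As templates in a single CUDA translation unit, these would need
// explicit instantiations for each supported TypeInValue.

}  // namespace Device
}  // namespace ams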
src/AMSlib/wf/basedb.hpp: 3 additions & 17 deletions

@@ -1962,14 +1962,8 @@ class RMQPublisherHandler final : public AMQP::LibEventHandler
                 msg_id)
     auto& msg = *it;
     auto& rm = ams::ResourceManager::getInstance();
-    try {
-      rm.deallocate(msg.data(), AMSResourceType::AMS_HOST);
-    } catch (const umpire::util::Exception& e) {
-      FATAL(RMQPublisherHandler,
-            "Failed to deallocate #%d (%p)",
-            msg.id(),
-            msg.data());
-    }
+    rm.deallocate(msg.data(), AMSResourceType::AMS_HOST);
 
     DBG(RMQPublisherHandler, "Deallocated msg #%d (%p)", msg.id(), msg.data())
     buf.erase(it);
   }
@@ -1984,14 +1978,7 @@ class RMQPublisherHandler final : public AMQP::LibEventHandler
     auto& rm = ams::ResourceManager::getInstance();
     for (auto& dp : buffer) {
       DBG(RMQPublisherHandler, "deallocate msg #%d (%p)", dp.id(), dp.data())
-      try {
-        rm.deallocate(dp.data(), AMSResourceType::AMS_HOST);
-      } catch (const umpire::util::Exception& e) {
-        FATAL(RMQPublisherHandler,
-              "Failed to deallocate msg #%d (%p)",
-              dp.id(),
-              dp.data());
-      }
+      rm.deallocate(dp.data(), AMSResourceType::AMS_HOST);
     }
     buffer.clear();
   }
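With Umpire removed, rm.deallocate can no longer throw umpire::util::Exception, which is why the try/catch and FATAL scaffolding around every deallocation is dropped rather than moved. A hypothetical sketch of what a post-Umpire deallocate might reduce to (the committed implementation in wf/resource_manager.cpp may well differ):

#include <cstdlib>
#include <cuda_runtime.h>

enum AMSResourceType { AMS_HOST, AMS_DEVICE };  // simplified stand-in

// Hypothetical post-Umpire deallocation: plain allocators instead of a
// pooling library, so there is no Umpire exception type left to catch.
void deallocate(void* ptr, AMSResourceType resource)
{
  if (resource == AMS_DEVICE)
    cudaFree(ptr);
  else
    std::free(ptr);
}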
@@ -2377,7 +2364,6 @@ class RMQInterface
   }
 };
 
-
 /* A class that provides a BaseDB interface to AMS workflow.
  * When storing data it pushes the data to the RMQ server asynchronously
  */