Skip to content

Commit

Permalink
Fix bug of DeltaUQ when running on CUDA. Data are not copied correctl…
Browse files Browse the repository at this point in the history
…y. Now they are with a sub-optimal mechanism
  • Loading branch information
koparasy committed Mar 22, 2024
1 parent 3e4b410 commit d783ae4
Show file tree
Hide file tree
Showing 6 changed files with 123 additions and 5 deletions.
2 changes: 2 additions & 0 deletions src/AMSlib/ml/surrogate.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,8 @@ class SurrogateModel
else
_load<TypeInValue>(new_path, "cuda");
}

/// @brief Accessor for the `model_resource` member of this surrogate model.
/// @return The stored AMSResourceType value, returned by value.
AMSResourceType getModelResource() const
{
  return model_resource;
}
};

template <typename T>
Expand Down
36 changes: 31 additions & 5 deletions src/AMSlib/ml/uq.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ class UQ

if (uqPolicy == AMSUQPolicy::RandomUQ)
randomUQ = std::make_unique<RandomUQ>(resourceLocation, threshold);

DBG(UQ, "UQ Model is of type %d", uqPolicy)
}

PERFFASPECT()
Expand All @@ -73,29 +75,48 @@ class UQ
{
if ((uqPolicy == AMSUQPolicy::DeltaUQ_Mean) ||
(uqPolicy == AMSUQPolicy::DeltaUQ_Max)) {

auto &rm = ams::ResourceManager::getInstance();

CALIPER(CALI_MARK_BEGIN("DELTAUQ");)
const size_t ndims = outputs.size();
std::vector<FPTypeValue *> outputs_stdev(ndims);
// TODO: Enable device-side allocation and predicate calculation.
auto &rm = ams::ResourceManager::getInstance();
for (int dim = 0; dim < ndims; ++dim)
outputs_stdev[dim] =
rm.allocate<FPTypeValue>(totalElements, AMSResourceType::HOST);

CALIPER(CALI_MARK_BEGIN("SURROGATE");)
DBG(Workflow,
"Model exists, I am calling DeltaUQ surrogate (for all data)");
DBG(UQ,
"Model exists, I am calling DeltaUQ surrogate [%ld %ld] -> (mu:[%ld "
"%ld], std:[%ld %ld])",
totalElements,
inputs.size(),
totalElements,
outputs.size(),
totalElements,
inputs.size());
surrogate->evaluate(totalElements, inputs, outputs, outputs_stdev);
CALIPER(CALI_MARK_END("SURROGATE");)

// FIXME: This is sub-optimal. We copy all the data from the GPU to the
// CPU, compute the predicate there, and then copy the computed predicate
// back to the device. We should avoid this unnecessary back and forth.
bool *predicate = p_ml_acceptable;
if (surrogate->getModelResource() == AMSResourceType::DEVICE) {
predicate = rm.allocate<bool>(totalElements, AMSResourceType::HOST);
rm.copy(p_ml_acceptable, predicate);
}


if (uqPolicy == AMSUQPolicy::DeltaUQ_Mean) {
for (size_t i = 0; i < totalElements; ++i) {
// Use double for increased precision, range in the calculation
double mean = 0.0;
for (size_t dim = 0; dim < ndims; ++dim)
mean += outputs_stdev[dim][i];
mean /= ndims;
p_ml_acceptable[i] = (mean < threshold);
predicate[i] = (mean < threshold);
}
} else if (uqPolicy == AMSUQPolicy::DeltaUQ_Max) {
for (size_t i = 0; i < totalElements; ++i) {
Expand All @@ -106,12 +127,17 @@ class UQ
break;
}

p_ml_acceptable[i] = is_acceptable;
predicate[i] = is_acceptable;
}
} else {
THROW(std::runtime_error, "Invalid UQ policy");
}

if (surrogate->getModelResource() == AMSResourceType::DEVICE) {
rm.copy(predicate, p_ml_acceptable);
rm.deallocate(predicate, AMSResourceType::HOST);
}

for (int dim = 0; dim < ndims; ++dim)
rm.deallocate(outputs_stdev[dim], AMSResourceType::HOST);
CALIPER(CALI_MARK_END("DELTAUQ");)
Expand Down
13 changes: 13 additions & 0 deletions tests/AMSlib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,19 @@ if (WITH_TORCH)
add_test(NAME AMSExampleSingleRandomUQ::HOST COMMAND ams_example --precision single --uqtype random -S ${CMAKE_CURRENT_SOURCE_DIR}/debug_model.pt -e 100)
add_test(NAME AMSExampleDoubleRandomUQ::HOST COMMAND ams_example --precision double --uqtype random -S ${CMAKE_CURRENT_SOURCE_DIR}/debug_model.pt -e 100)

# UQ Tests
# Builds ams_delta_uq_test from ams_uq_model.cpp and registers DeltaUQ runs.
# Positional arguments, in the order they are read by main() in ams_uq_model.cpp:
#   <use_device> <model_path> <data_type> <num_inputs> <num_outputs> <uq_policy> <threshold>
# use_device == 1 selects AMSResourceType::DEVICE; any other value keeps HOST.
# NOTE(review): uq_policy values 3 and 4 presumably correspond to DeltaUQ_Mean and
# DeltaUQ_Max (matching the test names) -- confirm against the AMSUQPolicy enum.
BUILD_TEST(ams_delta_uq_test ams_uq_model.cpp)
if (WITH_TORCH)
# NOTE(review): the ::HOST runs below load the same torch.duq.cuda model file as the
# ::DEVICE runs; see the TODO at the end of this section about adding a cpu model.
add_test(NAME AMSDeltaUQDoubleMean::HOST COMMAND ams_delta_uq_test 0 ${CMAKE_CURRENT_SOURCE_DIR}/torch.duq.cuda "double" 8 9 3 0.0)
add_test(NAME AMSDeltaUQDoubleMax::HOST COMMAND ams_delta_uq_test 0 ${CMAKE_CURRENT_SOURCE_DIR}/torch.duq.cuda "double" 8 9 4 0.0)

# Device-side runs are only registered when CUDA support is enabled.
if (WITH_CUDA)
add_test(NAME AMSDeltaUQDoubleMean::DEVICE COMMAND ams_delta_uq_test 1 ${CMAKE_CURRENT_SOURCE_DIR}/torch.duq.cuda "double" 8 9 3 0.0)
add_test(NAME AMSDeltaUQDoubleMax::DEVICE COMMAND ams_delta_uq_test 1 ${CMAKE_CURRENT_SOURCE_DIR}/torch.duq.cuda "double" 8 9 4 0.0)
endif()
endif()
#TODO Add tests with cpu model

BUILD_TEST(ams_update_model ams_update_model.cpp)
ADDTEST(ams_update_model AMSUpdateModelDouble "double" ${CMAKE_CURRENT_SOURCE_DIR}/ConstantZeroModel_cpu.pt ${CMAKE_CURRENT_SOURCE_DIR}/ConstantOneModel_cpu.pt)
endif()
Expand Down
77 changes: 77 additions & 0 deletions tests/AMSlib/ams_uq_model.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#include <AMS.h>

#include <cstring>
#include <iostream>
#include <ml/surrogate.hpp>
#include <ml/uq.hpp>
#include <umpire/ResourceManager.hpp>
#include <umpire/Umpire.hpp>
#include <vector>
#include <wf/resource_manager.hpp>

#define SIZE (32L * 1024L + 3L)

/// @brief Smoke-test driver: allocates scratch buffers, runs one UQ
///        evaluation over SIZE elements, and releases the buffers again.
///
/// The buffers are handed to `evaluate` exactly as returned by the resource
/// manager; this function does not fill them and does not inspect the results.
///
/// @param model        UQ object whose `evaluate` is invoked.
/// @param resource     Resource type used for every allocation/deallocation.
/// @param num_inputs   Number of input buffers to allocate.
/// @param num_outputs  Number of output buffers to allocate.
template <typename T>
void model(UQ<T> &model,
           AMSResourceType resource,
           int num_inputs,
           int num_outputs)
{
  auto &manager = ams::ResourceManager::getInstance();

  // One SIZE-element buffer per input dimension.
  std::vector<const T *> inputs;
  for (int remaining = num_inputs; remaining > 0; --remaining) {
    const T *buffer = manager.allocate<T>(SIZE, resource);
    inputs.push_back(buffer);
  }

  // One SIZE-element buffer per output dimension.
  std::vector<T *> outputs;
  for (int remaining = num_outputs; remaining > 0; --remaining) {
    T *buffer = manager.allocate<T>(SIZE, resource);
    outputs.push_back(buffer);
  }

  // Per-element predicate buffer passed as the last argument of evaluate.
  bool *predicates = manager.allocate<bool>(SIZE, resource);

  std::cout << "We are calling evaluate\n";
  model.evaluate(SIZE, inputs, outputs, predicates);

  // Release everything in the same order the original allocation lists hold.
  for (const T *in_buffer : inputs) {
    manager.deallocate(const_cast<T *>(in_buffer), resource);
  }

  for (T *out_buffer : outputs) {
    manager.deallocate(out_buffer, resource);
  }

  manager.deallocate(predicates, resource);
}


/// @brief Entry point of the DeltaUQ smoke test.
///
/// Expected command line (all seven arguments are mandatory):
///   <use_device> <model_path> <data_type> <num_inputs> <num_outputs>
///   <uq_policy> <threshold>
///
/// - use_device: 1 selects AMSResourceType::DEVICE, anything else HOST.
/// - data_type:  "double" or "single".
/// - uq_policy:  integer that is cast to AMSUQPolicy.
///
/// @return 0 on success, 1 when the arguments are missing or the data type
///         is not recognized.
int main(int argc, char *argv[])
{
  using namespace ams;

  // Every argv[1..7] is dereferenced below; bail out early instead of
  // reading past the end of argv when the test is invoked incorrectly.
  if (argc < 8) {
    std::cerr << "Usage: " << (argc > 0 ? argv[0] : "ams_delta_uq_test")
              << " <use_device> <model_path> <data_type> <num_inputs>"
                 " <num_outputs> <uq_policy> <threshold>\n";
    return 1;
  }

  auto &ams_rm = ResourceManager::getInstance();
  int use_device = std::atoi(argv[1]);
  char *model_path = argv[2];
  char *data_type = argv[3];
  int num_inputs = std::atoi(argv[4]);
  int num_outputs = std::atoi(argv[5]);
  const AMSUQPolicy uq_policy = static_cast<AMSUQPolicy>(std::atoi(argv[6]));
  float threshold = std::atof(argv[7]);

  std::cout << "Executing on device " << use_device << "\n";

  AMSResourceType resource = AMSResourceType::HOST;
  if (use_device == 1) {
    resource = AMSResourceType::DEVICE;
  }

  ams_rm.init();


  if (std::strcmp("double", data_type) == 0) {
    UQ<double> UQModel(resource, uq_policy, nullptr, -1, model_path, threshold);
    model(UQModel, resource, num_inputs, num_outputs);
  } else if (std::strcmp("single", data_type) == 0) {
    UQ<float> UQModel(resource, uq_policy, nullptr, -1, model_path, threshold);
    model(UQModel, resource, num_inputs, num_outputs);
  } else {
    // An unknown precision must fail the test rather than exit 0 without
    // having evaluated anything.
    std::cerr << "Unsupported data type '" << data_type
              << "' (expected \"double\" or \"single\")\n";
    return 1;
  }

  return 0;
}
Binary file added tests/AMSlib/torch.duq
Binary file not shown.
Binary file added tests/AMSlib/torch.duq.cuda
Binary file not shown.

0 comments on commit d783ae4

Please sign in to comment.