Skip to content

Commit

Permalink
[SYCLomatic] Fix missing migration of cudaStream_t arg for 8 CUB APIs (
Browse files Browse the repository at this point in the history
…#258)

Signed-off-by: Wang, Yihan <yihan.wang@intel.com>
  • Loading branch information
yihanwg authored Oct 13, 2022
1 parent 2546278 commit d43ce1a
Show file tree
Hide file tree
Showing 15 changed files with 620 additions and 520 deletions.
431 changes: 304 additions & 127 deletions clang/lib/DPCT/APINamesCUB.inc

Large diffs are not rendered by default.

8 changes: 3 additions & 5 deletions clang/lib/DPCT/CUBAPIMigration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,9 @@ auto parentStmt = []() {
};
} // namespace

/// Check if expression is one of NULL(0)/nullptr/__null
static bool isNullPointerConstant(const Expr *E) {
if (!E)
return false;
return E->isNullPointerConstant(DpctGlobalInfo::getContext(),
static bool isNullPointerConstant(const clang::Expr *E) {
assert(E && "Expr can not be nullptr");
return E->isNullPointerConstant(clang::dpct::DpctGlobalInfo::getContext(),
Expr::NPC_ValueDependentIsNull) !=
Expr::NPCK_NotNull;
}
Expand Down
25 changes: 25 additions & 0 deletions clang/lib/DPCT/CallExprRewriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2606,6 +2606,21 @@ class CheckArgCount {
bool operator()(const CallExpr *C) { return C->getNumArgs() == Count; }
};

class CheckArgCountGreaterThan {
unsigned Count;
public:
CheckArgCountGreaterThan(unsigned I) : Count(I) {}
bool operator()(const CallExpr *C) {
unsigned DefaultArgNum = 0;
llvm::ArrayRef<const Expr *> Args(C->getArgs(), C->getNumArgs());
for (const Expr *Arg : Args) {
if (Arg->isDefaultArgument())
++DefaultArgNum;
}
return C->getNumArgs() - DefaultArgNum > Count;
}
};

class CheckBaseType {
std::string TypeName;

Expand Down Expand Up @@ -2664,6 +2679,16 @@ class CheckArgIsConstantIntWithValue {
}
};

class CheckArgIsDefaultCudaStream {
unsigned ArgIndex;

public:
CheckArgIsDefaultCudaStream(unsigned ArgIndex) : ArgIndex(ArgIndex) {}
bool operator()(const CallExpr *C) const {
return isDefaultStream(C->getArg(ArgIndex));
}
};

class CheckIsPtr {
unsigned Idx;

Expand Down
1 change: 1 addition & 0 deletions clang/lib/DPCT/Utility.h
Original file line number Diff line number Diff line change
Expand Up @@ -571,4 +571,5 @@ std::string getArgTypeStr(const clang::CallExpr *CE, unsigned int Idx);
std::string getFunctionName(const clang::FunctionDecl *Node);
std::string getFunctionName(const clang::UnresolvedLookupExpr *Node);
std::string getFunctionName(const clang::FunctionTemplateDecl *Node);

#endif // DPCT_UTILITY_H
80 changes: 40 additions & 40 deletions clang/test/dpct/cub/devicelevel/device_encode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,28 +14,8 @@

#define N 8

// CHECK: void test_1() {
// CHECK: dpct::device_ext &dev_ct1 = dpct::get_current_device();
// CHECK: sycl::queue &q_ct1 = dev_ct1.default_queue();
// CHECK: int h_in[N] = {0, 2, 2, 9, 5, 5, 5, 8};
// CHECK: int h_unique[N] = {0};
// CHECK: int h_counts[N] = {0};
// CHECK: int *d_in = nullptr;
// CHECK: int *d_unique = nullptr;
// CHECK: int *d_counts = nullptr;
// CHECK: int *d_selected_num = nullptr;
// CHECK: int h_selected_num = 0;
// CHECK: d_in = (int *)sycl::malloc_device(sizeof(h_in), q_ct1);
// CHECK: d_unique = (int *)sycl::malloc_device(sizeof(h_unique), q_ct1);
// CHECK: d_counts = (int *)sycl::malloc_device(sizeof(h_counts), q_ct1);
// CHECK: d_selected_num = sycl::malloc_device<int>(1, q_ct1);
// CHECK: q_ct1.memcpy((void *)d_in, (void *)h_in, sizeof(h_in)).wait();
// CHECK: DPCT1026:{{.*}}
// CHECK: q_ct1.fill(d_selected_num, std::distance(d_unique, oneapi::dpl::reduce_by_segment(oneapi::dpl::execution::device_policy(q_ct1), d_in, d_in + N, dpct::device_vector<size_t>(N, 1).begin(), d_unique, d_counts).first), 1).wait();
// CHECK: q_ct1.memcpy((void *)&h_selected_num, (void *)d_selected_num, sizeof(int)){{.*}};
// CHECK: q_ct1.memcpy((void *)h_unique, (void *)d_unique, h_selected_num * sizeof(int)){{.*}};
// CHECK: q_ct1.memcpy((void *)h_counts, (void *)d_counts, h_selected_num * sizeof(int)).wait();
// CHECK: }
void test_1() {
int h_in[N] = {0, 2, 2, 9, 5, 5, 5, 8};
int h_unique[N] = {0};
Expand Down Expand Up @@ -72,29 +52,9 @@ void test_1() {
cudaFree(d_selected_num);
}

// CHECK: void test_2() {
// CHECK: dpct::device_ext &dev_ct1 = dpct::get_current_device();
// CHECK: sycl::queue &q_ct1 = dev_ct1.default_queue();
// CHECK: int h_in[N] = {0, 2, 2, 9, 5, 5, 5, 8};
// CHECK: int h_unique[N] = {0};
// CHECK: int h_counts[N] = {0};
// CHECK: int *d_in = nullptr;
// CHECK: int *d_unique = nullptr;
// CHECK: int *d_counts = nullptr;
// CHECK: int *d_selected_num = nullptr;
// CHECK: int h_selected_num = 0;
// CHECK: d_in = (int *)sycl::malloc_device(sizeof(h_in), q_ct1);
// CHECK: d_unique = (int *)sycl::malloc_device(sizeof(h_unique), q_ct1);
// CHECK: d_counts = (int *)sycl::malloc_device(sizeof(h_counts), q_ct1);
// CHECK: d_selected_num = sycl::malloc_device<int>(1, q_ct1);
// CHECK: q_ct1.memcpy((void *)d_in, (void *)h_in, sizeof(h_in)).wait();
// CHECK: DPCT1027:{{.*}}
// CHECK: 0, 0;
// CHECK: q_ct1.fill(d_selected_num, std::distance(d_unique, oneapi::dpl::reduce_by_segment(oneapi::dpl::execution::device_policy(q_ct1), d_in, d_in + N, dpct::device_vector<size_t>(N, 1).begin(), d_unique, d_counts).first), 1).wait();
// CHECK: q_ct1.memcpy((void *)&h_selected_num, (void *)d_selected_num, sizeof(int)){{.*}};
// CHECK: q_ct1.memcpy((void *)h_unique, (void *)d_unique, h_selected_num * sizeof(int)){{.*}};
// CHECK: q_ct1.memcpy((void *)h_counts, (void *)d_counts, h_selected_num * sizeof(int)).wait();
// CHECK: }
void test_2() {
int h_in[N] = {0, 2, 2, 9, 5, 5, 5, 8};
int h_unique[N] = {0};
Expand Down Expand Up @@ -130,3 +90,43 @@ void test_2() {
cudaFree(d_temp);
cudaFree(d_selected_num);
}

// CHECK: dpct::queue_ptr stream = (dpct::queue_ptr)(void *)(uintptr_t)5;
// CHECK: DPCT1026:{{.*}}
// CHECK: stream->fill(d_selected_num, std::distance(d_unique, oneapi::dpl::reduce_by_segment(oneapi::dpl::execution::device_policy(*stream), d_in, d_in + N, dpct::device_vector<size_t>(N, 1).begin(), d_unique, d_counts).first), 1).wait();
void test_3() {
int h_in[N] = {0, 2, 2, 9, 5, 5, 5, 8};
int h_unique[N] = {0};
int h_counts[N] = {0};
int *d_in = nullptr;
int *d_temp = nullptr;
int *d_unique = nullptr;
int *d_counts = nullptr;
int *d_selected_num = nullptr;
int h_selected_num = 0;
size_t d_temp_size = 0;
cudaMalloc((void **)&d_in, sizeof(h_in));
cudaMalloc((void **)&d_unique, sizeof(h_unique));
cudaMalloc((void **)&d_counts, sizeof(h_counts));
cudaMalloc((void **)&d_selected_num, sizeof(int));
cudaMemcpy((void *)d_in, (void *)h_in, sizeof(h_in), cudaMemcpyHostToDevice);
cudaStream_t stream = (cudaStream_t)(void *)(uintptr_t)5;
cub::DeviceRunLengthEncode::Encode(nullptr, d_temp_size, d_in, d_unique, d_counts, d_selected_num, N, stream);
cudaMalloc((void **)&d_temp, d_temp_size);
cub::DeviceRunLengthEncode::Encode(d_temp, d_temp_size, d_in, d_unique, d_counts, d_selected_num, N, stream);
cudaMemcpy((void *)&h_selected_num, (void *)d_selected_num, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy((void *)h_unique, (void *)d_unique, h_selected_num * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy((void *)h_counts, (void *)d_counts, h_selected_num * sizeof(int), cudaMemcpyDeviceToHost);
printf("%d\n", h_selected_num);
for (int i = 0; i < h_selected_num; ++i)
printf("%d ", h_unique[i]);
printf("\n");
for (int i = 0; i < h_selected_num; ++i)
printf("%d ", h_counts[i]);
printf("\n");
cudaFree(d_in);
cudaFree(d_unique);
cudaFree(d_counts);
cudaFree(d_temp);
cudaFree(d_selected_num);
}
52 changes: 24 additions & 28 deletions clang/test/dpct/cub/devicelevel/device_exclusive_scan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,8 @@ struct CustomSum {
}
};

// CHECK: void test_1() {
// CHECK: dpct::device_ext &dev_ct1 = dpct::get_current_device();
// CHECK: sycl::queue &q_ct1 = dev_ct1.default_queue();
// CHECK: int *device_in = nullptr;
// CHECK: int *device_out = nullptr;
// CHECK: int host_in[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
// CHECK: int host_out[10];
// CHECK: device_in = sycl::malloc_device<int>(N, q_ct1);
// CHECK: device_out = sycl::malloc_device<int>(N, q_ct1);
// CHECK: q_ct1.memcpy(device_in, (void *)host_in, sizeof(host_in)).wait();
// CHECK: DPCT1026:{{.*}}
// CHECK: oneapi::dpl::exclusive_scan(oneapi::dpl::execution::device_policy(q_ct1), device_in, device_in + N, device_out, 0, op);
// CHECK: q_ct1.memcpy((void *)host_out, (void *)device_out, sizeof(host_out)).wait();
// CHECK: sycl::free(device_in, q_ct1);
// CHECK: sycl::free(device_out, q_ct1);
// CHECK: }
void test_1() {
int *device_in = nullptr;
int *device_out = nullptr;
Expand All @@ -54,23 +40,9 @@ void test_1() {
cudaFree(device_tmp);
}

// CHECK: void test_2() {
// CHECK: dpct::device_ext &dev_ct1 = dpct::get_current_device();
// CHECK: sycl::queue &q_ct1 = dev_ct1.default_queue();
// CHECK: int *device_in = nullptr;
// CHECK: int *device_out = nullptr;
// CHECK: int host_in[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
// CHECK: int host_out[10];
// CHECK: device_in = sycl::malloc_device<int>(N, q_ct1);
// CHECK: device_out = sycl::malloc_device<int>(N, q_ct1);
// CHECK: q_ct1.memcpy(device_in, (void *)host_in, sizeof(host_in)).wait();
// CHECK: DPCT1027:{{.*}}
// CHECK: 0, 0;
// CHECK: oneapi::dpl::exclusive_scan(oneapi::dpl::execution::device_policy(q_ct1), device_in, device_in + N, device_out, 0, op);
// CHECK: q_ct1.memcpy((void *)host_out, (void *)device_out, sizeof(host_out)).wait();
// CHECK: sycl::free(device_in, q_ct1);
// CHECK: sycl::free(device_out, q_ct1);
// CHECK: }
void test_2() {
int *device_in = nullptr;
int *device_out = nullptr;
Expand All @@ -90,3 +62,27 @@ void test_2() {
cudaFree(device_out);
cudaFree(device_tmp);
}

// CHECK: dpct::queue_ptr stream = (dpct::queue_ptr)(void *)(uintptr_t)5;
// CHECK: DPCT1026:{{.*}}
// CHECK: oneapi::dpl::exclusive_scan(oneapi::dpl::execution::device_policy(*stream), device_in, device_in + N, device_out, 0, op);
void test_3() {
int *device_in = nullptr;
int *device_out = nullptr;
int *device_tmp = nullptr;
size_t n_device_tmp = 0;
int host_in[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
int host_out[10];
CustomSum op;
cudaMalloc((void **)&device_in, N * sizeof(int));
cudaMalloc((void **)&device_out, N * sizeof(int));
cudaMemcpy(device_in, (void *)host_in, sizeof(host_in), cudaMemcpyHostToDevice);
cudaStream_t stream = (cudaStream_t)(void *)(uintptr_t)5;
cub::DeviceScan::ExclusiveScan(device_tmp, n_device_tmp, device_in, device_out, op, 0, N, stream);
cudaMalloc((void **)&device_tmp, n_device_tmp);
cub::DeviceScan::ExclusiveScan((void *)device_tmp, n_device_tmp, device_in, device_out, op, 0, N, stream);
cudaMemcpy((void *)host_out, (void *)device_out, sizeof(host_out), cudaMemcpyDeviceToHost);
cudaFree(device_in);
cudaFree(device_out);
cudaFree(device_tmp);
}
60 changes: 27 additions & 33 deletions clang/test/dpct/cub/devicelevel/device_exclusive_sum.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,8 @@
#include <cub/cub.cuh>
#include <stdio.h>

// CHECK:void test_1() {
// CHECK:dpct::device_ext &dev_ct1 = dpct::get_current_device();
// CHECK:sycl::queue &q_ct1 = dev_ct1.default_queue();
// CHECK:int n = 10;
// CHECK:int *device_in = nullptr;
// CHECK:int *device_out = nullptr;
// CHECK:int host_in[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
// CHECK:int host_out[10];
// CHECK:device_in = sycl::malloc_device<int>(n, q_ct1);
// CHECK:device_out = sycl::malloc_device<int>(n, q_ct1);
// CHECK:q_ct1.memcpy(device_in, (void *)host_in, sizeof(host_in)).wait();
// CHECK:DPCT1026:{{.*}}
// CHECK:oneapi::dpl::exclusive_scan(oneapi::dpl::execution::device_policy(q_ct1), device_in, device_in + n, device_out, typename std::iterator_traits<decltype(device_in)>::value_type{});
// CHECK:q_ct1.memcpy((void *)host_out, (void *)device_out, sizeof(host_out)).wait();
// CHECK:sycl::free(device_in, q_ct1);
// CHECK:sycl::free(device_out, q_ct1);
// CHECK:}
void test_1() {
int n = 10;
int *device_in = nullptr;
Expand All @@ -46,24 +31,9 @@ void test_1() {
cudaFree(device_tmp);
}

// CHECK:void test_2() {
// CHECK:dpct::device_ext &dev_ct1 = dpct::get_current_device();
// CHECK:sycl::queue &q_ct1 = dev_ct1.default_queue();
// CHECK:int n = 10;
// CHECK:int *device_in = nullptr;
// CHECK:int *device_out = nullptr;
// CHECK:int host_in[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
// CHECK:int host_out[10];
// CHECK:device_in = sycl::malloc_device<int>(n, q_ct1);
// CHECK:device_out = sycl::malloc_device<int>(n, q_ct1);
// CHECK:q_ct1.memcpy(device_in, (void *)host_in, sizeof(host_in)).wait();
// CHECK:DPCT1027:{{.*}}
// CHECK:0, 0;
// CHECK:oneapi::dpl::exclusive_scan(oneapi::dpl::execution::device_policy(q_ct1), device_in, device_in + n, device_out, typename std::iterator_traits<decltype(device_in)>::value_type{});
// CHECK:q_ct1.memcpy((void *)host_out, (void *)device_out, sizeof(host_out)).wait();
// CHECK:sycl::free(device_in, q_ct1);
// CHECK:sycl::free(device_out, q_ct1);
// CHECK:}
// CHECK: DPCT1027:{{.*}}
// CHECK: 0, 0;
// CHECK: oneapi::dpl::exclusive_scan(oneapi::dpl::execution::device_policy(q_ct1), device_in, device_in + n, device_out, typename std::iterator_traits<decltype(device_in)>::value_type{});
void test_2() {
int n = 10;
int *device_in = nullptr;
Expand All @@ -83,3 +53,27 @@ void test_2() {
cudaFree(device_out);
cudaFree(device_tmp);
}

// CHECK: dpct::queue_ptr stream = (dpct::queue_ptr)(void *)(uintptr_t)5;
// CHECK: DPCT1026:{{.*}}
// CHECK: oneapi::dpl::exclusive_scan(oneapi::dpl::execution::device_policy(*stream), device_in, device_in + n, device_out, typename std::iterator_traits<decltype(device_in)>::value_type{});
void test_3() {
int n = 10;
int *device_in = nullptr;
int *device_out = nullptr;
int *device_tmp = nullptr;
size_t n_device_tmp = 0;
int host_in[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
int host_out[10];
cudaMalloc((void **)&device_in, n * sizeof(int));
cudaMalloc((void **)&device_out, n * sizeof(int));
cudaMemcpy(device_in, (void *)host_in, sizeof(host_in), cudaMemcpyHostToDevice);
cudaStream_t stream = (cudaStream_t)(void *)(uintptr_t)5;
cub::DeviceScan::ExclusiveSum(device_tmp, n_device_tmp, device_in, device_out, n, stream);
cudaMalloc((void **)&device_tmp, n_device_tmp);
cub::DeviceScan::ExclusiveSum((void *)device_tmp, n_device_tmp, device_in, device_out, n, stream);
cudaMemcpy((void *)host_out, (void *)device_out, sizeof(host_out), cudaMemcpyDeviceToHost);
cudaFree(device_in);
cudaFree(device_out);
cudaFree(device_tmp);
}
52 changes: 24 additions & 28 deletions clang/test/dpct/cub/devicelevel/device_inclusive_scan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,8 @@ struct CustomSum {
}
};

// CHECK: void test_1() {
// CHECK: dpct::device_ext &dev_ct1 = dpct::get_current_device();
// CHECK: sycl::queue &q_ct1 = dev_ct1.default_queue();
// CHECK: int *device_in = nullptr;
// CHECK: int *device_out = nullptr;
// CHECK: int host_in[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
// CHECK: int host_out[10];
// CHECK: device_in = sycl::malloc_device<int>(N, q_ct1);
// CHECK: device_out = sycl::malloc_device<int>(N, q_ct1);
// CHECK: q_ct1.memcpy(device_in, (void *)host_in, sizeof(host_in)).wait();
// CHECK: DPCT1026:{{.*}}
// CHECK: oneapi::dpl::inclusive_scan(oneapi::dpl::execution::device_policy(q_ct1), device_in, device_in + N, device_out, op);
// CHECK: q_ct1.memcpy((void *)host_out, (void *)device_out, sizeof(host_out)).wait();
// CHECK: sycl::free(device_in, q_ct1);
// CHECK: sycl::free(device_out, q_ct1);
// CHECK: }
void test_1() {
int *device_in = nullptr;
int *device_out = nullptr;
Expand All @@ -54,23 +40,9 @@ void test_1() {
cudaFree(device_tmp);
}

// CHECK: void test_2() {
// CHECK: dpct::device_ext &dev_ct1 = dpct::get_current_device();
// CHECK: sycl::queue &q_ct1 = dev_ct1.default_queue();
// CHECK: int *device_in = nullptr;
// CHECK: int *device_out = nullptr;
// CHECK: int host_in[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
// CHECK: int host_out[10];
// CHECK: device_in = sycl::malloc_device<int>(N, q_ct1);
// CHECK: device_out = sycl::malloc_device<int>(N, q_ct1);
// CHECK: q_ct1.memcpy(device_in, (void *)host_in, sizeof(host_in)).wait();
// CHECK: DPCT1027:{{.*}}
// CHECK: 0, 0;
// CHECK: oneapi::dpl::inclusive_scan(oneapi::dpl::execution::device_policy(q_ct1), device_in, device_in + N, device_out, op);
// CHECK: q_ct1.memcpy((void *)host_out, (void *)device_out, sizeof(host_out)).wait();
// CHECK: sycl::free(device_in, q_ct1);
// CHECK: sycl::free(device_out, q_ct1);
// CHECK: }
void test_2() {
int *device_in = nullptr;
int *device_out = nullptr;
Expand All @@ -90,3 +62,27 @@ void test_2() {
cudaFree(device_out);
cudaFree(device_tmp);
}

// CHECK: dpct::queue_ptr stream = (dpct::queue_ptr)(void *)(uintptr_t)5;
// CHECK: DPCT1026:{{.*}}
// CHECK: oneapi::dpl::inclusive_scan(oneapi::dpl::execution::device_policy(*stream), device_in, device_in + N, device_out, op);
void test_3() {
int *device_in = nullptr;
int *device_out = nullptr;
int *device_tmp = nullptr;
size_t n_device_tmp = 0;
int host_in[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
int host_out[10];
CustomSum op;
cudaMalloc((void **)&device_in, N * sizeof(int));
cudaMalloc((void **)&device_out, N * sizeof(int));
cudaMemcpy(device_in, (void *)host_in, sizeof(host_in), cudaMemcpyHostToDevice);
cudaStream_t stream = (cudaStream_t)(void *)(uintptr_t)5;
cub::DeviceScan::InclusiveScan(device_tmp, n_device_tmp, device_in, device_out, op, N, stream);
cudaMalloc((void **)&device_tmp, n_device_tmp);
cub::DeviceScan::InclusiveScan((void *)device_tmp, n_device_tmp, device_in, device_out, op, N, stream);
cudaMemcpy((void *)host_out, (void *)device_out, sizeof(host_out), cudaMemcpyDeviceToHost);
cudaFree(device_in);
cudaFree(device_out);
cudaFree(device_tmp);
}
Loading

0 comments on commit d43ce1a

Please sign in to comment.