diff --git a/clang/include/clang/DPCT/DPCTOptions.inc b/clang/include/clang/DPCT/DPCTOptions.inc index 08a6d9359705..4c4ec568be94 100644 --- a/clang/include/clang/DPCT/DPCTOptions.inc +++ b/clang/include/clang/DPCT/DPCTOptions.inc @@ -800,6 +800,14 @@ DPCT_ENUM_OPTION( "Experimental extension that allows map an address range onto " "multiple allocations of physical memory.", false), + DPCT_OPTION_ENUM_VALUE( + "in_order_queue_events", + int(ExperimentalFeatures::Exp_InOrderQueueEvents), + "Experimental extension that allows get the event from the " + "last command submission into the queue and set an external " + "event as an implicit dependence on the next command submitted to " + "the queue.", + false), DPCT_OPTION_ENUM_VALUE( "non-stdandard-sycl-builtins", int(ExperimentalFeatures::Exp_NonStandardSYCLBuiltins), diff --git a/clang/lib/DPCT/AnalysisInfo.cpp b/clang/lib/DPCT/AnalysisInfo.cpp index 458724d38b1a..ca99f60685b6 100644 --- a/clang/lib/DPCT/AnalysisInfo.cpp +++ b/clang/lib/DPCT/AnalysisInfo.cpp @@ -5721,6 +5721,13 @@ void KernelCallExpr::printSubmit(KernelPrinter &Printer) { } printStreamBase(Printer); + if (isDefaultStream()) { + SubmitStmts.DefaultStreamFlag = true; + } + if (DpctGlobalInfo::useExpInOrderQueueEvents() && + (DpctGlobalInfo::getUsmLevel() == UsmLevel::UL_Restricted)) { + SubmitStmts.ImplicitSyncFlag = true; + } if (SubmitStmts.empty()) { printParallelFor(Printer, false); } else { @@ -6519,10 +6526,20 @@ KernelPrinter &KernelCallExpr::SubmitStmtsList::print(KernelPrinter &Printer) { printList(Printer, NdRangeList, "ranges to define ND iteration space for the kernel"); printList(Printer, CommandGroupList, "helper variables defined"); + if (ImplicitSyncFlag) { + if (DefaultStreamFlag) { + Printer.line("cgh.depends_on(dpct::get_current_device().get_in_order_" + "queues_last_events());"); + } else { + Printer.line("cgh.depends_on(dpct::get_default_queue().ext_oneapi_get_" + "last_event());"); + } + Printer.newLine(); + } return Printer; } bool KernelCallExpr::SubmitStmtsList::empty() const noexcept { - return CommandGroupList.empty() && NdRangeList.empty() && + return !ImplicitSyncFlag && CommandGroupList.empty() && NdRangeList.empty() && AccessorList.empty() && PtrList.empty() && MemoryList.empty() && RangeList.empty() && TextureList.empty() && SamplerList.empty() && StreamList.empty() && SyncList.empty(); diff --git a/clang/lib/DPCT/AnalysisInfo.h b/clang/lib/DPCT/AnalysisInfo.h index 9d3170c90533..1d71fbf5dab9 100644 --- a/clang/lib/DPCT/AnalysisInfo.h +++ b/clang/lib/DPCT/AnalysisInfo.h @@ -1333,6 +1333,9 @@ class DpctGlobalInfo { static bool useExpVirtualMemory() { return getUsingExperimental(); } + static bool useExpInOrderQueueEvents() { + return getUsingExperimental(); + } static bool useExpNonStandardSYCLBuiltins() { return getUsingExperimental< ExperimentalFeatures::Exp_NonStandardSYCLBuiltins>(); @@ -2933,7 +2936,8 @@ class KernelCallExpr : public CallFunctionExpr { StmtList SamplerList; StmtList NdRangeList; StmtList CommandGroupList; - + bool ImplicitSyncFlag = false; + bool DefaultStreamFlag = false; KernelPrinter &print(KernelPrinter &Printer); bool empty() const noexcept; diff --git a/clang/lib/DPCT/CommandOption/ValidateArguments.h b/clang/lib/DPCT/CommandOption/ValidateArguments.h index f07144c172b6..b1bbc7cec06c 100644 --- a/clang/lib/DPCT/CommandOption/ValidateArguments.h +++ b/clang/lib/DPCT/CommandOption/ValidateArguments.h @@ -97,6 +97,7 @@ enum class ExperimentalFeatures : unsigned int { Exp_NonUniformGroups, Exp_DeviceGlobal, Exp_VirtualMemory, + Exp_InOrderQueueEvents, Exp_ExperimentalFeaturesEnumSize, Exp_NonStandardSYCLBuiltins, Exp_All diff --git a/clang/runtime/dpct-rt/include/dpct/device.hpp b/clang/runtime/dpct-rt/include/dpct/device.hpp index 78ad6d2bee96..ce48707e9f62 100644 --- a/clang/runtime/dpct-rt/include/dpct/device.hpp +++ b/clang/runtime/dpct-rt/include/dpct/device.hpp @@ -582,6 +582,21 @@ class device_ext : public sycl::device { lock.lock(); } + std::vector get_in_order_queues_last_events() { + std::unique_lock lock(m_mutex); + std::vector last_events; + std::vector> current_queues(_queues); + lock.unlock(); + for (const auto &q : current_queues) { + if (q->is_in_order()) { + last_events.push_back(q->ext_oneapi_get_last_event()); + } + } + // Guard the destruct of current_queues to make sure the ref count is safe. + lock.lock(); + return last_events; + } + sycl::queue *create_queue(bool enable_exception_handler = false) { #ifdef DPCT_USM_LEVEL_NONE return create_out_of_order_queue(enable_exception_handler); diff --git a/clang/test/dpct/kernel_implicit_sync.cu b/clang/test/dpct/kernel_implicit_sync.cu new file mode 100644 index 000000000000..20b6894533cb --- /dev/null +++ b/clang/test/dpct/kernel_implicit_sync.cu @@ -0,0 +1,40 @@ +// RUN: dpct --format-range=none --use-experimental-features=in_order_queue_events -out-root %T/kernel_implicit_sync %s --cuda-include-path="%cuda-path/include" -- -x cuda --cuda-host-only +// RUN: FileCheck %s --match-full-lines --input-file %T/kernel_implicit_sync/kernel_implicit_sync.dp.cpp +// RUN: %if build_lit %{icpx -c -fsycl %T/kernel_implicit_sync/kernel_implicit_sync.dp.cpp -o %T/kernel_implicit_sync/kernel_implicit_sync.dp.o %} +#include + +__global__ void kernel(int *a){ + +} + +int main() { + int *a, *b; + cudaStream_t s1; + cudaStreamCreate(&s1); + cudaMallocManaged(&a, 100); + cudaMallocManaged(&b, 100); + +// CHECK: q_ct1.submit( +// CHECK: [&](sycl::handler &cgh) { +// CHECK: cgh.depends_on(dpct::get_current_device().get_in_order_queues_last_events()); +// CHECK: cgh.parallel_for( +// CHECK: sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), +// CHECK: [=](sycl::nd_item<3> item_ct1) { +// CHECK: kernel(a); +// CHECK: }); +// CHECK: }); + kernel<<<1,1>>>(a); + +// CHECK: s1->submit( +// CHECK: [&](sycl::handler &cgh) { +// CHECK: cgh.depends_on(dpct::get_default_queue().ext_oneapi_get_last_event()); +// CHECK: cgh.parallel_for( +// CHECK: sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)), +// CHECK: [=](sycl::nd_item<3> item_ct1) { +// CHECK: kernel(b); +// CHECK: }); +// CHECK: }); + kernel<<<1, 1, 0, s1>>>(b); + + return 0; +}