
Change usage of DEFAULT_CPU TUNING_TARGET to DEFAULT (#515)
* Add skipping of broken tests for the default config on AMD and NVIDIA

* Add CMake warning for disabled tests

* Change TUNING_TARGET from DEFAULT_CPU to DEFAULT

* Update README
s-Nick authored May 10, 2024
1 parent 30ed01a commit 5b80c99
Showing 14 changed files with 71 additions and 37 deletions.
9 changes: 8 additions & 1 deletion README.md
@@ -416,6 +416,13 @@ advisable for NVIDIA and **mandatory for AMD** to provide the specific device
architecture through `-DDPCPP_SYCL_ARCH=<arch>`, e.g., `<arch>` can be `sm_80`
for NVIDIA or `gfx908` for AMD.

It is possible to use the `DEFAULT` target even for AMD and NVIDIA GPUs, but
defining `-DDPCPP_SYCL_TARGET` and `-DDPCPP_SYCL_ARCH` is mandatory. The rules
mentioned above also apply in this case.
Using `DEFAULT` as the target will speed up compilation at the expense of
runtime performance. Additionally, some operators will be disabled.
For full compatibility and best performance, set the `TUNING_TARGET` appropriately.
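
For illustration, a configure invocation for an NVIDIA device using the
`DEFAULT` target might look like the sketch below; the Ninja generator, the
`nvptx64-nvidia-cuda` target triple, and the `sm_80` architecture are
assumptions to adjust for your toolchain and device:

```bash
# Sketch of a DEFAULT-target build for an NVIDIA GPU (adjust the target triple
# and architecture to match your device).
cmake -GNinja ../ \
  -DSYCL_COMPILER=dpcpp \
  -DTUNING_TARGET=DEFAULT \
  -DDPCPP_SYCL_TARGET=nvptx64-nvidia-cuda \
  -DDPCPP_SYCL_ARCH=sm_80
```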

#### DPC++ Compiler Support

As DPCPP SYCL compiler the project is fully compatible with `icpx` provided by
@@ -487,7 +494,7 @@ Some of the supported options are:
| `BLAS_ENABLE_TESTING` | `ON`/`OFF` | Set it to `OFF` to avoid building the tests (`ON` is the default value) |
| `BLAS_ENABLE_BENCHMARK` | `ON`/`OFF` | Set it to `OFF` to avoid building the benchmarks (`ON` is the default value) |
| `SYCL_COMPILER` | name | Used to determine which SYCL implementation to use. By default, the first implementation found is used. Supported values are: `dpcpp`, `adaptivecpp` and `computecpp`*(deprecated)*. |
| `TUNING_TARGET` | name | By default, this flag is set to `DEFAULT_CPU` to restrict any device specific compiler optimizations. Use this flag to tune the code for a target (**highly recommended** for performance). The supported targets are: `INTEL_GPU`, `NVIDIA_GPU`, `AMD_GPU` |
| `TUNING_TARGET` | name | By default, this flag is set to `DEFAULT` to restrict any device specific compiler optimizations. Use this flag to tune the code for a target (**highly recommended** for performance). The supported targets are: `INTEL_GPU`, `NVIDIA_GPU`, `AMD_GPU` |
| `CMAKE_PREFIX_PATH` | path | List of paths to check when searching for dependencies |
| `CMAKE_INSTALL_PREFIX` | path | Specify the install location, used when invoking `ninja install` |
| `BUILD_SHARED_LIBS` | `ON`/`OFF` | Build as shared library (`ON` by default) |
8 changes: 4 additions & 4 deletions cmake/CmakeFunctionHelper.cmake
@@ -98,11 +98,11 @@ function(set_target_compile_def in_target)
elseif(${TUNING_TARGET} STREQUAL "NVIDIA_GPU")
target_compile_definitions(${in_target} PUBLIC NVIDIA_GPU=1)
else()
if(NOT ${TUNING_TARGET} STREQUAL "DEFAULT_CPU")
message(STATUS "${TUNING_TARGET} not supported. Switching to DEFAULT_CPU instead.")
set(TUNING_TARGET "DEFAULT_CPU")
if(NOT ${TUNING_TARGET} STREQUAL "DEFAULT")
message(STATUS "${TUNING_TARGET} not supported. Switching to DEFAULT instead.")
set(TUNING_TARGET "DEFAULT")
endif()
target_compile_definitions(${in_target} PUBLIC DEFAULT_CPU=1)
target_compile_definitions(${in_target} PUBLIC DEFAULT=1)
endif()
message(STATUS "Adding ${TUNING_TARGET} backend to target ${in_target}")
#setting tall skinny support
2 changes: 1 addition & 1 deletion cmake/Modules/ConfigurePORTBLAS.cmake
@@ -56,7 +56,7 @@ if(NAIVE_GEMM)
endif()

# the TUNING_TARGET variable defines the platform for which the sycl library is tuned
SET(TUNING_TARGET "DEFAULT_CPU" CACHE STRING "Default Platform 'DEFAULT_CPU'")
SET(TUNING_TARGET "DEFAULT" CACHE STRING "Default Platform 'DEFAULT'")
message(STATUS "${TUNING_TARGET} is chosen as a tuning target")

if(DEFINED TARGET)
2 changes: 1 addition & 1 deletion cmake/Modules/SYCL.cmake
@@ -97,7 +97,7 @@ elseif(is_adaptivecpp)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
get_target_property(SYCL_INCLUDE_DIRS AdaptiveCpp::acpp-rt INTERFACE_INCLUDE_DIRECTORIES)
set(HIP_BENCH_UNSUPPORTED_TARGETS "INTEL_GPU" "DEFAULT_CPU")
set(HIP_BENCH_UNSUPPORTED_TARGETS "INTEL_GPU" "DEFAULT")
if((${BLAS_ENABLE_BENCHMARK}) AND (${TUNING_TARGET} IN_LIST HIP_BENCH_UNSUPPORTED_TARGETS))
message(STATUS "Benchmarks are not supported when targetting OpenCL/LevelZero backend
devices. portBLAS Benchmarks are disabled.")
4 changes: 2 additions & 2 deletions doc/Gemm.md
@@ -172,7 +172,7 @@ This cmake variable causes a corresponding define for the selected platform to b
#elif defined POWER_VR
#include "interface/blas3/backend/power_vr.hpp"
#else
#include "interface/blas3/backend/default_cpu.hpp"
#include "interface/blas3/backend/default.hpp"
#endif
```

@@ -307,7 +307,7 @@ The relevant parameters are:
- Vector size, the number of elements to use in vectorized loads/stores.
- Batch type, whether to use strided (most `GEMM` kernels) or the interleaved `GEMM` for batched calls.

For an example of a backend target header and some of the ways that configurations are selected let's look at `src/interface/blas3/backend/default_cpu.hpp` :
For an example of a backend target header and some of the ways that configurations are selected let's look at `src/interface/blas3/backend/default.hpp` :

```c++
template <bool _t_a, bool _t_b, bool is_beta_zero, typename sb_handle_t,
2 changes: 1 addition & 1 deletion src/interface/blas1/backend/backend.hpp
@@ -29,6 +29,6 @@
#elif NVIDIA_GPU
#include "interface/blas1/backend/nvidia_gpu.hpp"
#else
#include "interface/blas1/backend/default_cpu.hpp"
#include "interface/blas1/backend/default.hpp"
#endif

src/interface/blas1/backend/default_cpu.hpp → default.hpp
@@ -22,8 +22,8 @@
* @filename defaul_cpu.hpp
*
**************************************************************************/
#ifndef PORTBLAS_ASUM_DEFAULT_CPU_BACKEND_HPP
#define PORTBLAS_ASUM_DEFAULT_CPU_BACKEND_HPP
#ifndef PORTBLAS_ASUM_DEFAULT_BACKEND_HPP
#define PORTBLAS_ASUM_DEFAULT_BACKEND_HPP
#include "interface/blas1_interface.h"

namespace blas {
2 changes: 1 addition & 1 deletion src/interface/blas2/backend/backend.hpp
@@ -31,5 +31,5 @@
#elif NVIDIA_GPU
#include "interface/blas2/backend/nvidia_gpu.hpp"
#else
#include "interface/blas2/backend/default_cpu.hpp"
#include "interface/blas2/backend/default.hpp"
#endif
src/interface/blas2/backend/default_cpu.hpp → default.hpp
@@ -19,11 +19,11 @@
*
* portBLAS: BLAS implementation using SYCL
*
* @filename default_cpu.hpp
* @filename default.hpp
*
**************************************************************************/
#ifndef PORTBLAS_GEMV_DEFAULT_CPU_BACKEND_HPP
#define PORTBLAS_GEMV_DEFAULT_CPU_BACKEND_HPP
#ifndef PORTBLAS_GEMV_DEFAULT_BACKEND_HPP
#define PORTBLAS_GEMV_DEFAULT_BACKEND_HPP
#include "interface/blas2_interface.h"

namespace blas {
2 changes: 1 addition & 1 deletion src/interface/blas3/backend/backend.hpp
@@ -31,5 +31,5 @@
#elif defined NVIDIA_GPU
#include "interface/blas3/backend/nvidia_gpu.hpp"
#else
#include "interface/blas3/backend/default_cpu.hpp"
#include "interface/blas3/backend/default.hpp"
#endif
src/interface/blas3/backend/default_cpu.hpp → default.hpp
@@ -19,11 +19,11 @@
*
* portBLAS: BLAS implementation using SYCL
*
* @filename default_cpu.hpp
* @filename default.hpp
*
**************************************************************************/
#ifndef PORTBLAS_GEMM_DEFAULT_CPU_BACKEND_HPP
#define PORTBLAS_GEMM_DEFAULT_CPU_BACKEND_HPP
#ifndef PORTBLAS_GEMM_DEFAULT_BACKEND_HPP
#define PORTBLAS_GEMM_DEFAULT_BACKEND_HPP
#include "interface/gemm_launcher.h"

namespace blas {
2 changes: 1 addition & 1 deletion src/interface/extension/backend/backend.hpp
@@ -29,5 +29,5 @@
#elif defined NVIDIA_GPU
#include "interface/extension/backend/nvidia_gpu.hpp"
#else
#include "interface/extension/backend/default_cpu.hpp"
#include "interface/extension/backend/default.hpp"
#endif
src/interface/extension/backend/default_cpu.hpp → default.hpp
@@ -19,11 +19,11 @@
*
* portBLAS: BLAS implementation using SYCL
*
* @filename default_cpu.hpp
* @filename default.hpp
*
**************************************************************************/
#ifndef PORTBLAS_TRANSPOSE_DEFAULT_CPU_BACKEND_HPP
#define PORTBLAS_TRANSPOSE_DEFAULT_CPU_BACKEND_HPP
#ifndef PORTBLAS_TRANSPOSE_DEFAULT_BACKEND_HPP
#define PORTBLAS_TRANSPOSE_DEFAULT_BACKEND_HPP
#include "interface/extension_interface.h"

namespace blas {
53 changes: 40 additions & 13 deletions test/unittest/CMakeLists.txt
@@ -73,18 +73,20 @@ set(SYCL_UNITTEST_SRCS
${PORTBLAS_UNITTEST}/buffers/sycl_buffer_test.cpp
)

# Skip these tests for AdaptiveCpp for SPIRV/OpenCL targets
# that use SYCL 2020 features like group reduction or hang
# during execution (https://github.com/AdaptiveCpp/AdaptiveCpp/issues/1309)
set(ADAPTIVE_CPP_SKIP
${PORTBLAS_UNITTEST}/blas1/blas1_asum_test.cpp
${PORTBLAS_UNITTEST}/blas1/blas1_sdsdot_test.cpp
${PORTBLAS_UNITTEST}/blas1/blas1_nrm2_test.cpp
${PORTBLAS_UNITTEST}/blas1/blas1_dot_test.cpp
${PORTBLAS_UNITTEST}/blas1/blas1_rot_test.cpp
# Hang during execution (without failing)
${PORTBLAS_UNITTEST}/blas3/blas3_trsm_test.cpp
)
if(is_adaptivecpp)
# Skip these tests for AdaptiveCpp for SPIRV/OpenCL targets
# that use SYCL 2020 features like group reduction or hang
# during execution (https://github.com/AdaptiveCpp/AdaptiveCpp/issues/1309)
set(TESTS_TO_SKIP
${PORTBLAS_UNITTEST}/blas1/blas1_asum_test.cpp
${PORTBLAS_UNITTEST}/blas1/blas1_sdsdot_test.cpp
${PORTBLAS_UNITTEST}/blas1/blas1_nrm2_test.cpp
${PORTBLAS_UNITTEST}/blas1/blas1_dot_test.cpp
${PORTBLAS_UNITTEST}/blas1/blas1_rot_test.cpp
# Hang during execution (without failing)
${PORTBLAS_UNITTEST}/blas3/blas3_trsm_test.cpp
)
endif()

if(${BLAS_ENABLE_EXTENSIONS})
list(APPEND SYCL_UNITTEST_SRCS ${PORTBLAS_UNITTEST}/extension/transpose_test.cpp
@@ -101,6 +103,31 @@ if(is_dpcpp)
)
endif()

if(is_dpcpp AND ${TUNING_TARGET} STREQUAL "DEFAULT")
if (${DPCPP_SYCL_TARGET} MATCHES "nvidia")
set(TESTS_TO_SKIP
${PORTBLAS_UNITTEST}/blas1/blas1_iamax_test.cpp
${PORTBLAS_UNITTEST}/blas1/blas1_iamin_test.cpp
${PORTBLAS_UNITTEST}/blas2/blas2_tbsv_test.cpp
${PORTBLAS_UNITTEST}/blas2/blas2_tpsv_test.cpp
${PORTBLAS_UNITTEST}/blas2/blas2_trsv_test.cpp
${PORTBLAS_UNITTEST}/blas3/blas3_trsm_test.cpp
)
message(WARNING "Targetting NVIDIA hardware with DEFAULT TUNING_TARGET.
Disabling tests for following operators: iamax, iamin, trsv, tbsv, tpsv, trsm.")
elseif (${DPCPP_SYCL_TARGET} MATCHES "amd")
set(TESTS_TO_SKIP
${PORTBLAS_UNITTEST}/blas1/blas1_iamax_test.cpp
${PORTBLAS_UNITTEST}/blas1/blas1_iamin_test.cpp
${PORTBLAS_UNITTEST}/blas2/blas2_tbsv_test.cpp
${PORTBLAS_UNITTEST}/blas2/blas2_tpsv_test.cpp
${PORTBLAS_UNITTEST}/blas2/blas2_trsv_test.cpp
)
message(WARNING "Targetting AMD hardware with DEFAULT TUNING_TARGET.
Disabling tests for following operators: iamax, iamin, tbsv, tpsv, trsv.")
endif()
endif()

if(GEMM_TALL_SKINNY_SUPPORT)
list(APPEND SYCL_UNITTEST_SRCS ${PORTBLAS_UNITTEST}/blas3/blas3_gemm_tall_skinny_test.cpp)
endif()
Expand All @@ -112,7 +139,7 @@ set(HALF_DATA_OPS "blas1_axpy_test"
)

foreach(blas_test ${SYCL_UNITTEST_SRCS})
if(is_adaptivecpp AND ${blas_test} IN_LIST ADAPTIVE_CPP_SKIP)
if(${blas_test} IN_LIST TESTS_TO_SKIP)
continue()
endif()
get_filename_component(test_exec ${blas_test} NAME_WE)
