diff --git a/sycl/include/CL/sycl/half_type.hpp b/sycl/include/CL/sycl/half_type.hpp index 9d171d7c17867..dbcaa2883ca3c 100644 --- a/sycl/include/CL/sycl/half_type.hpp +++ b/sycl/include/CL/sycl/half_type.hpp @@ -35,6 +35,19 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { + +namespace ext { +namespace intel { +namespace experimental { +namespace esimd { +namespace detail { +class WrapperElementTypeProxy; +} // namespace detail +} // namespace esimd +} // namespace experimental +} // namespace intel +} // namespace ext + namespace detail { inline __SYCL_CONSTEXPR_HALF uint16_t float2Half(const float &Val) { @@ -255,6 +268,9 @@ class __SYCL_EXPORT half_v2 { // Initialize underlying data constexpr explicit half_v2(uint16_t x) : Buf(x) {} + friend class sycl::ext::intel::experimental::esimd::detail:: + WrapperElementTypeProxy; + private: uint16_t Buf; }; @@ -391,6 +407,9 @@ class half { template friend struct std::hash; + friend class sycl::ext::intel::experimental::esimd::detail:: + WrapperElementTypeProxy; + private: StorageT Data; }; diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp index 8cacaf63e330c..dff4d9dc559f1 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp @@ -29,6 +29,9 @@ #define ESIMD_REGISTER(n) __attribute__((register_num(n))) #define __ESIMD_API ESIMD_NODEBUG ESIMD_INLINE + +#define __ESIMD_UNSUPPORTED_ON_HOST + #else // __SYCL_DEVICE_ONLY__ #define SYCL_ESIMD_KERNEL #define SYCL_ESIMD_FUNCTION @@ -41,6 +44,9 @@ #define ESIMD_REGISTER(n) #define __ESIMD_API ESIMD_INLINE + +#define __ESIMD_UNSUPPORTED_ON_HOST throw cl::sycl::feature_not_supported() + #endif // __SYCL_DEVICE_ONLY__ // Mark a function being noinline diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/elem_type_traits.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/elem_type_traits.hpp new file mode 100644 index 0000000000000..c2e7aed5fad2e --- /dev/null +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/elem_type_traits.hpp @@ -0,0 +1,708 @@ +//==------------ - elem_type_traits.hpp - DPC++ Explicit SIMD API ----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This header provides basic infrastructure to support non-standard C++ types +// as simd element types. This non-standard element types are usually structs or +// classes (example: sycl::half). +// Terms: +// - "wrapper type" - a non-standard element type +// - "raw type" - the real types used to represent real storage type of the data +// bits wrapped by the corresponding wrapper structure/class +// By design, user program never uses the raw types, so they are not exposed at +// user level. +// +// The main reasons why the infrastructure is needed are: +// - attempt to create a clang vector with wrapper element type +// vector_type_t will result in compilation error +// - C++ operations on WrapperT are usually supported by the Intel GPU hardware +// (which is the main reason of supporting them in ESIMD) and need to be +// mapped to efficient hardware code sequences. 
+//
+// To make a wrapper type appear as a first-class element type, the following
+// major components must be available/implemented for the type:
+// 1) Storage ("raw") type must be defined. The raw type must be bit-castable to
+// the wrapper type and thus must have the same bit size and alignment
+// requirements.
+// 2) "Nearest enclosing" standard C++ type must be defined. This is a standard
+// C++ type which can represent values of the wrapper type. The enclosing type
+// can be used as a fall-back type for default implementations of operations
+// on the wrapper type.
+// 3) Type conversion intrinsics between the bit representation of a wrapper
+// type value and the equivalent enclosing C++ type value.
+// 4) The above three are enough to emulate any wrapper type, as all operations
+// can be performed on the enclosing type values, converting from raw to
+// enclosing before the operation and converting back from enclosing to raw
+// after the operation. But this would be inefficient in some cases - when the
+// enclosing C++ type does not match the raw type, as H/W usually supports
+// many operations directly on the raw type (which is the bit representation of
+// the wrapper type). So mapping to efficient H/W operations must be defined.
+// For example, for the SYCL half type, mapping primitive operations efficiently
+// to Intel GPU hardware is as easy as "unwrapping" the sycl::half value, which
+// yields "_Float16", natively supported by the device compiler and hardware,
+// and then using standard C++ operations such as '+' on _Float16 values. For
+// other types like bfloat16 this will require mapping to appropriate
+// intrinsics.
+// 5) The type must be marked as a wrapper type explicitly, for the API to
+// behave correctly.
+// Important note: some of these components might have different definitions for
+// the same wrapper type depending on host vs device compilation. E.g. for SYCL
+// half the raw type is uint16_t on host and _Float16 on device.
+//
+// - The mechanism to define components 1) and 2) for a new wrapper type is to
+// provide a specialization of the `element_type_traits` structure for this
+// type.
+// - Component 3) is provided via implementing specializations of the following
+// intrinsics:
+// * __esimd_wrapper_type_bitcast_to/__esimd_wrapper_type_bitcast_from (should
+// not be necessary with C++20, where there is a standard bitcast operation)
+// to bitcast between the raw and the wrapper types.
+// * __esimd_convertvector_to/__esimd_convertvector_from to type-convert
+// between clang vectors of the wrapper type (bit-represented with the raw
+// type) and clang vectors of the enclosing std type values.
+// - Component 4) is provided via:
+// * (primitive operations) Specializations of the
+// __esimd_binary_op
+// __esimd_unary_op
+// __esimd_cmp_op
+// __esimd_vector_binary_op
+// __esimd_vector_unary_op
+// __esimd_vector_cmp_op
+// intrinsics. If the `use_native_cpp_ops` element type trait is true, then
+// implementing those intrinsics is not necessary and std C++ operations
+// will be used.
+// * (math operations) Overloading std math functions for the new wrapper
+// type.
+// - Component 5) is provided via adding the new type to the list of types in
+// the `is_wrapper_elem_type_v` meta function.
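[Editor's note] To make the recipe above concrete, here is a minimal sketch of what a specialization for a hypothetical 8-bit wrapper type could look like. The type `my_fp8`, its raw storage choice, and the trait values are illustrative assumptions, not part of this patch:

```cpp
// Hypothetical wrapper type, shown only to illustrate components 1), 2) and 5).
struct my_fp8 {
  uint8_t Bits; // bit-compatible with the raw type chosen below
};

// Components 1) and 2): raw storage type and nearest enclosing std C++ type.
template <> struct element_type_traits<my_fp8> {
  using RawT = uint8_t;        // same bit size/alignment as my_fp8
  using EnclosingCppT = float; // can represent every my_fp8 value
  // Component 4): '+' etc. on uint8_t do not emulate fp8 math, so the
  // __esimd_* intrinsic specializations would have to be provided as well.
  static inline constexpr bool use_native_cpp_ops = false;
};

// Component 5): my_fp8 would also have to be added to is_wrapper_elem_type_v,
// and component 3) would supply its bitcast/convertvector intrinsics.
```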
+//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#include + +__SYCL_INLINE_NAMESPACE(cl) { +namespace __SEIEED { + +// Primitive C++ operations supported by simd objects and templated upon by some +// of the functions/classes. + +enum class BinOp { + add, + sub, + mul, + div, + rem, + shl, + shr, + bit_or, + bit_and, + bit_xor, + log_or, + log_and +}; + +enum class CmpOp { lt, lte, gte, gt, eq, ne }; + +enum class UnaryOp { minus, plus, bit_not, log_not }; + +// If given type is a special "wrapper" element type. +template +static inline constexpr bool is_wrapper_elem_type_v = + std::is_same_v; + +template +static inline constexpr bool is_valid_simd_elem_type_v = + (is_vectorizable_v || is_wrapper_elem_type_v); + +struct invalid_raw_element_type; + +// Default (unusable) definition of the element type traits. +template struct element_type_traits { + // The raw element type of the underlying clang vector used as a + // storage. + using RawT = invalid_raw_element_type; + // A starndard C++ type which this one can be converted to/from. + // The conversions are usually H/W-supported, and the C++ type can + // represent the entire range of values of this type. + using EnclosingCppT = void; + // Whether a value or clang vector value the raw element type can be used + // directly as operand to std C++ operations. + static inline constexpr bool use_native_cpp_ops = true; +}; + +// Element type traits specialization for C++ standard element type. +template +struct element_type_traits>> { + using RawT = T; + using EnclosingCppT = T; + static inline constexpr bool use_native_cpp_ops = true; +}; + +// --- Type conversions + +// Low-level conversion functions to and from a wrapper element type. +// Must be implemented for each supported +// . + +// These are default implementations for wrapper types with native cpp +// operations support for their corresponding raw type. +template +ESIMD_INLINE vector_type_t<__raw_t, N> +__esimd_convertvector_to(vector_type_t Val) +#ifdef __SYCL_DEVICE_ONLY__ + ; // needs to be implemented for WrapperTy's for which + // element_type_traits::use_native_cpp_ops is false. +#else +{ + // TODO implement for host + throw sycl::feature_not_supported(); +} +#endif // __SYCL_DEVICE_ONLY__ + +template +ESIMD_INLINE vector_type_t +__esimd_convertvector_from(vector_type_t<__raw_t, N> Val) +#ifdef __SYCL_DEVICE_ONLY__ + ; // needs to be implemented for WrapperTy's for which + // element_type_traits::use_native_cpp_ops is false. +#else +{ + // TODO implement for host + throw sycl::feature_not_supported(); +} +#endif // __SYCL_DEVICE_ONLY__ + +// TODO should be replaced by std::bit_cast once C++20 is supported. 
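[Editor's note] As the TODO above says, the two bitcast hooks declared just below are essentially a pre-C++20 stand-in for `std::bit_cast`. A hedged sketch of what they reduce to once C++20 is available; the function names here are illustrative and not part of the patch:

```cpp
#include <bit>     // C++20
#include <cstdint>

// Illustrative only: with C++20 a raw <-> wrapper bitcast needs no intrinsic,
// provided the two types have the same size and are trivially copyable.
template <class WrapperTy, class RawTy>
WrapperTy bits_to_wrapper(RawTy Val) {
  static_assert(sizeof(WrapperTy) == sizeof(RawTy));
  return std::bit_cast<WrapperTy>(Val);
}

template <class WrapperTy, class RawTy>
RawTy wrapper_to_bits(WrapperTy Val) {
  static_assert(sizeof(WrapperTy) == sizeof(RawTy));
  return std::bit_cast<RawTy>(Val);
}
```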
+template +WrapperTy __esimd_wrapper_type_bitcast_to(__raw_t Val); +template +__raw_t __esimd_wrapper_type_bitcast_from(WrapperTy Val); + +template struct wrapper_type_converter { + using RawTy = __raw_t; + + template + ESIMD_INLINE static vector_type_t + to_vector(vector_type_t Val) { + if constexpr (element_type_traits::use_native_cpp_ops) { + return __builtin_convertvector(Val, vector_type_t); + } else { + return __esimd_convertvector_to(Val); + } + } + + template + ESIMD_INLINE static vector_type_t + from_vector(vector_type_t Val) { + if constexpr (element_type_traits::use_native_cpp_ops) { + return __builtin_convertvector(Val, vector_type_t); + } else { + return __esimd_convertvector_from(Val); + } + } +}; + +// Converts a raw representation of a simd vector with element type +// SrcWrapperTy to a raw representation of a simd vector with element type +// DstWrapperTy. +template , N>, + class SrcRawVecTy = vector_type_t<__raw_t, N>> +ESIMD_INLINE DstRawVecTy convert_vector(SrcRawVecTy Val) { + if constexpr (std::is_same_v) { + return Val; + } else if constexpr (!is_wrapper_elem_type_v && + !is_wrapper_elem_type_v) { + return __builtin_convertvector(Val, DstRawVecTy); + } else { + // The chain of conversions (some can be no-op if types match): + // SrcRawVecTy (of SrcWrapperTy) + // | step A [wrapper_type_converter]::from_vector + // v + // SrcStdT + // | step B [__builtin_convertvector] + // v + // DstStdT + // | step C [wrapper_type_converter]::to_vector + // v + // DstRawVecTy (of DstWrapperTy) + // + using DstStdT = typename element_type_traits::EnclosingCppT; + using SrcStdT = typename element_type_traits::EnclosingCppT; + using SrcConv = wrapper_type_converter; + using DstConv = wrapper_type_converter; + using DstStdVecT = vector_type_t; + using SrcStdVecT = vector_type_t; + SrcStdVecT TmpSrcVal; + + if constexpr (std::is_same_v) { + TmpSrcVal = std::move(Val); + } else { + TmpSrcVal = SrcConv::template from_vector(Val); // step A + } + if constexpr (std::is_same_v) { + return TmpSrcVal; + } else { + DstStdVecT TmpDstVal; + + if constexpr (std::is_same_v) { + TmpDstVal = std::move(TmpSrcVal); + } else { + TmpDstVal = __builtin_convertvector(TmpSrcVal, DstStdVecT); // step B + } + if constexpr (std::is_same_v) { + return TmpDstVal; + } else { + return DstConv::template to_vector(TmpDstVal); // step C + } + } + } +} + +template ESIMD_INLINE __raw_t bitcast_to_raw_type(Ty Val) { + if constexpr (!is_wrapper_elem_type_v) { + return Val; + } else { + return __esimd_wrapper_type_bitcast_from(Val); + } +} + +template ESIMD_INLINE Ty bitcast_to_wrapper_type(__raw_t Val) { + if constexpr (!is_wrapper_elem_type_v) { + return Val; + } else { + return __esimd_wrapper_type_bitcast_to(Val); + } +} + +// Converts a scalar value from given source type to destination type. Both +// types can be non-std element types, in which case additional non-C++ +// conversions happen if the types are different. +// NOTE: this is not symmetric with convert_vector, which inputs and outputs +// raw (storage) vector types. 
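[Editor's note] A short sketch of the asymmetry the note above describes, using host-side sycl::half (raw type uint16_t, enclosing type float). It assumes the detail-namespace helpers from this header are in scope, and since the host conversion intrinsics are still TODO in this patch, it only illustrates the shapes of the two APIs:

```cpp
// convert_scalar takes and returns wrapper/std values directly...
inline float half_to_float_sketch(sycl::half H) {
  return convert_scalar<float>(H); // sycl::half in, float out
}

// ...whereas convert_vector takes and returns raw (storage) clang vectors.
inline vector_type_t<float, 4>
half_bits_to_floats_sketch(vector_type_t<uint16_t, 4> RawHalves) {
  // RawHalves are the bits backing a simd<sycl::half, 4> on host.
  return convert_vector<float, sycl::half, 4>(RawHalves);
}
```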
+template , + class SrcRawTy = __raw_t> +ESIMD_INLINE DstWrapperTy convert_scalar(SrcWrapperTy Val) { + if constexpr (std::is_same_v) { + return Val; + } else if constexpr (!is_wrapper_elem_type_v && + !is_wrapper_elem_type_v) { + return static_cast(Val); + } else { + vector_type_t V0 = bitcast_to_raw_type(Val); + vector_type_t V1 = + convert_vector(V0); + return bitcast_to_wrapper_type(V1[0]); + } +} + +template T binary_op_default_impl(T X, T Y) { + T Res{}; + if constexpr (Op == BinOp::add) + Res = X + Y; + else if constexpr (Op == BinOp::sub) + Res = X - Y; + else if constexpr (Op == BinOp::mul) + Res = X * Y; + else if constexpr (Op == BinOp::div) + Res = X / Y; + else if constexpr (Op == BinOp::rem) + Res = X % Y; + else if constexpr (Op == BinOp::shl) + Res = X << Y; + else if constexpr (Op == BinOp::shr) + Res = X >> Y; + else if constexpr (Op == BinOp::bit_or) + Res = X | Y; + else if constexpr (Op == BinOp::bit_and) + Res = X & Y; + else if constexpr (Op == BinOp::bit_xor) + Res = X ^ Y; + else if constexpr (Op == BinOp::log_or) + Res = X || Y; + else if constexpr (Op == BinOp::log_and) + Res = X && Y; + return Res; +} + +template auto comparison_op_default_impl(T X, T Y) { + decltype(X < Y) Res{}; + if constexpr (Op == CmpOp::lt) + Res = X < Y; + else if constexpr (Op == CmpOp::lte) + Res = X <= Y; + else if constexpr (Op == CmpOp::eq) + Res = X == Y; + else if constexpr (Op == CmpOp::ne) + Res = X != Y; + else if constexpr (Op == CmpOp::gte) + Res = X >= Y; + else if constexpr (Op == CmpOp::gt) + Res = X > Y; + return Res; +} + +template auto unary_op_default_impl(T X) { + if constexpr (Op == UnaryOp::minus) + return -X; + else if constexpr (Op == UnaryOp::plus) + return +X; + else if constexpr (Op == UnaryOp::bit_not) + return ~X; + else if constexpr (Op == UnaryOp::log_not) + return !X; +} + +template struct __hlp { + using RawElemT = __raw_t; + using RawVecT = vector_type_t; + using BinopT = decltype(std::declval() + std::declval()); + using CmpT = decltype(std::declval() < std::declval()); +}; + +template using __re_t = typename Hlp::RawElemT; +template using __rv_t = typename Hlp::RawVecT; +template using __cmp_t = typename Hlp::CmpT; + +// --- Scalar versions of binary operations + +template ESIMD_INLINE T __esimd_binary_op(T X, T Y); + +template >> +ESIMD_INLINE T binary_op_default(T X, T Y) { + static_assert(element_type_traits::use_native_cpp_ops); + using T1 = __raw_t; + T1 X1 = bitcast_to_raw_type(X); + T1 Y1 = bitcast_to_raw_type(Y); + T1 Res = binary_op_default_impl(X1, Y1); + return bitcast_to_wrapper_type(Res); +} + +// Default (inefficient) implementation of a scalar binary operation, which +// involves conversion to an std C++ type, performing the op and converting +// back. 
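[Editor's note] To make the comment above concrete: on host, where `use_native_cpp_ops` is false for sycl::half, `binary_op<BinOp::add, sycl::half>(A, B)` unfolds into the fallback defined next. A hand-expanded sketch (my own expansion, assuming the helpers from this header are in scope; not code from the patch):

```cpp
// Roughly what the generic __esimd_binary_op fallback does for
// binary_op<BinOp::add, sycl::half>(A, B) on host.
inline sycl::half add_halves_sketch(sycl::half A, sycl::half B) {
  float A1 = convert_scalar<float>(A);                      // wrapper -> enclosing type
  float B1 = convert_scalar<float>(B);
  float R1 = binary_op_default<BinOp::add, float>(A1, B1);  // plain float '+'
  return convert_scalar<sycl::half>(R1);                    // enclosing -> wrapper
}
```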
+template ESIMD_INLINE T __esimd_binary_op(T X, T Y) { + using T1 = typename element_type_traits::EnclosingCppT; + T1 X1 = convert_scalar(X); + T1 Y1 = convert_scalar(Y); + return convert_scalar(binary_op_default(X1, Y1)); +} + +template >> +ESIMD_INLINE T binary_op(T X, T Y) { + if constexpr (element_type_traits::use_native_cpp_ops) { + return binary_op_default(X, Y); + } else { + return __esimd_binary_op(X, Y); + } +} + +// --- Vector versions of binary operations + +template >> +ESIMD_INLINE RawVecT vector_binary_op_default(RawVecT X, RawVecT Y) { + static_assert(element_type_traits::use_native_cpp_ops); + return binary_op_default_impl(X, Y); +} + +// Default (inefficient) implementation of a vector binary operation, which +// involves conversion to an std C++ type, performing the op and converting +// back. +template >> +ESIMD_INLINE RawVecT __esimd_vector_binary_op(RawVecT X, RawVecT Y) { + using T1 = typename element_type_traits::EnclosingCppT; + using VecT1 = vector_type_t; + VecT1 X1 = convert_vector(X); + VecT1 Y1 = convert_vector(Y); + return convert_vector( + vector_binary_op_default(X1, Y1)); +} + +template >> +ESIMD_INLINE RawVecT vector_binary_op(RawVecT X, RawVecT Y) { + if constexpr (element_type_traits::use_native_cpp_ops) { + return vector_binary_op_default(X, Y); + } else { + return __esimd_vector_binary_op(X, Y); + } +} + +// --- Scalar versions of unary operations + +template ESIMD_INLINE T __esimd_unary_op(T X); + +template >> +ESIMD_INLINE T unary_op_default(T X) { + static_assert(element_type_traits::use_native_cpp_ops); + using T1 = __raw_t; + T1 X1 = bitcast_to_raw_type(X); + T1 Res = unary_op_default_impl(X1); + return bitcast_to_wrapper_type(Res); +} + +// Default (inefficient) implementation of a scalar unary operation, which +// involves conversion to an std C++ type, performing the op and converting +// back. +template ESIMD_INLINE T __esimd_unary_op(T X) { + using T1 = typename element_type_traits::EnclosingCppT; + T1 X1 = convert_scalar(X); + return convert_scalar(unary_op_default(X1)); +} + +template >> +ESIMD_INLINE T unary_op(T X) { + if constexpr (element_type_traits::use_native_cpp_ops) { + return unary_op_default(X); + } else { + return __esimd_unary_op(X); + } +} + +// --- Vector versions of unary operations + +template >> +ESIMD_INLINE RawVecT vector_unary_op_default(RawVecT X) { + static_assert(element_type_traits::use_native_cpp_ops); + return unary_op_default_impl(X); +} + +// Default (inefficient) implementation of a vector unary operation, which +// involves conversion to an std C++ type, performing the op and converting +// back. +template >> +ESIMD_INLINE RawVecT __esimd_vector_unary_op(RawVecT X) { + using T1 = typename element_type_traits::EnclosingCppT; + using VecT1 = vector_type_t; + VecT1 X1 = convert_vector(X); + return convert_vector(vector_unary_op_default(X1)); +} + +template >> +ESIMD_INLINE RawVecT vector_unary_op(RawVecT X) { + if constexpr (element_type_traits::use_native_cpp_ops) { + return vector_unary_op_default(X); + } else { + return __esimd_vector_unary_op(X); + } +} + +// --- Vector versions of comparison operations + +template , + class RetT = __cmp_t, class RawVecT = __rv_t> +ESIMD_INLINE RetT vector_comparison_op_default(RawVecT X, RawVecT Y) { + static_assert(element_type_traits::use_native_cpp_ops); + return comparison_op_default_impl(X, Y); +} + +// Default (inefficient) implementation of a vector comparison operation, which +// involves conversion to an std C++ type, performing the op and converting +// back. 
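[Editor's note] Before the comparison counterpart below, here is how the vector dispatch above plays out for sycl::half. This is a hand-inlined view of `vector_binary_op<BinOp::mul, sycl::half, N>`, assuming the detail-namespace helpers from this header; it is a sketch, not part of the patch:

```cpp
template <int N>
vector_type_t<__raw_t<sycl::half>, N>
half_mul_sketch(vector_type_t<__raw_t<sycl::half>, N> X,
                vector_type_t<__raw_t<sycl::half>, N> Y) {
#ifdef __SYCL_DEVICE_ONLY__
  // Device: raw type is _Float16 and use_native_cpp_ops is true, so the
  // native path is just the clang vector multiply.
  return X * Y;
#else
  // Host: raw type is uint16_t and use_native_cpp_ops is false, so the
  // fallback converts to float lanes, multiplies, and converts back.
  auto X1 = convert_vector<float, sycl::half, N>(X);
  auto Y1 = convert_vector<float, sycl::half, N>(Y);
  return convert_vector<sycl::half, float, N>(X1 * Y1);
#endif
}
```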
+template , + class RetT = __cmp_t, class RawVecT = __rv_t> +ESIMD_INLINE RetT __esimd_vector_comparison_op(RawVecT X, RawVecT Y) { + using T1 = typename element_type_traits::EnclosingCppT; + using VecT1 = vector_type_t; + VecT1 X1 = convert_vector(X); + VecT1 Y1 = convert_vector(Y); + return convert_vector, T1, N>( + vector_comparison_op_default(X1, Y1)); +} + +template , + class RetT = __cmp_t, class RawVecT = __rv_t> +ESIMD_INLINE RetT vector_comparison_op(RawVecT X, RawVecT Y) { + if constexpr (element_type_traits::use_native_cpp_ops) { + return vector_comparison_op_default(X, Y); + } else { + return __esimd_vector_comparison_op(X, Y); + } +} + +// Proxy class to access bit representation of a wrapper type both on host and +// device. +// TODO add this functionality to sycl type implementation? With C++20, +// std::bit_cast should be a good replacement. +class WrapperElementTypeProxy { +public: + template + static inline __raw_t bitcast_from_half(T Val) { +#ifdef __SYCL_DEVICE_ONLY__ + return Val.Data; +#else + return Val.Data.Buf; +#endif // __SYCL_DEVICE_ONLY__ + } + + template + static inline T bitcast_to_half(__raw_t Bits) { +#ifndef __SYCL_DEVICE_ONLY__ + return sycl::half{Bits}; +#else + sycl::half Res; + Res.Data = Bits; + return Res; +#endif // __SYCL_DEVICE_ONLY__ + } +}; + +// "Generic" version of std::is_floating_point_v which returns "true" also for +// the wrapper floating-point types such as sycl::half. +template +static inline constexpr bool is_generic_floating_point_v = + std::is_floating_point_v::EnclosingCppT>; + +// @{ +// Get computation type of a binary operator given its operand types: +// - if both types are arithmetic - return CPP's "common real type" of the +// computation (matches C++) +// - if both types are simd types, they must be of the same length N, +// and the returned type is simd, where N is the "common real type" of +// the element type of the operands (diverges from clang) +// - otherwise, one type is simd and another is arithmetic - the simd type is +// returned (matches clang) + +struct invalid_computation_type; + +template struct computation_type { + using type = invalid_computation_type; +}; + +template +struct computation_type && + is_valid_simd_elem_type_v>> { +private: + template using tr = element_type_traits; + template + using native_t = + std::conditional_t::use_native_cpp_ops, typename tr::RawT, + typename tr::EnclosingCppT>; + static inline constexpr bool is_wr1 = is_wrapper_elem_type_v; + static inline constexpr bool is_wr2 = is_wrapper_elem_type_v; + static inline constexpr bool is_fp1 = is_generic_floating_point_v; + static inline constexpr bool is_fp2 = is_generic_floating_point_v; + +public: + using type = std::conditional_t< + !is_wr1 && !is_wr2, + // T1 and T2 are both std C++ types - use std C++ type promotion + decltype(std::declval() + std::declval()), + std::conditional_t< + std::is_same_v, + // Types are the same wrapper type - return any + T1, + std::conditional_t will + // yield sycl::half) + std::conditional_t, + // both are either floating point or integral - + // return result of C++ promotion of the native + // types + decltype(std::declval>() + + std::declval>())>>>; +}; + +template +struct computation_type< + T1, T2, + std::enable_if_t || is_simd_like_type_v>> { +private: + using Ty1 = element_type_t; + using Ty2 = element_type_t; + using EltTy = typename computation_type::type; + static constexpr int N1 = is_simd_like_type_v ? T1::length : 0; + static constexpr int N2 = is_simd_like_type_v ? 
T2::length : 0; + static_assert((N1 == N2) || ((N1 & N2) == 0), "size mismatch"); + static constexpr int N = N1 ? N1 : N2; + +public: + using type = simd; +}; + +template +using computation_type_t = + typename computation_type, remove_cvref_t>::type; + +// @} + +//////////////////////////////////////////////////////////////////////////////// +// sycl::half traits +//////////////////////////////////////////////////////////////////////////////// + +template +struct element_type_traits>> { + // Can't use sycl::detail::half_impl::StorageT as RawT for both host and + // device as it still maps to struct on/ host (even though the struct is a + // trivial wrapper around uint16_t), and for ESIMD we need a type which can be + // an element of clang vector. +#ifdef __SYCL_DEVICE_ONLY__ + using RawT = sycl::detail::half_impl::StorageT; + // On device, _Float16 is native Cpp type, so it is the enclosing C++ type + using EnclosingCppT = RawT; + // On device, operations on half are translated to operations on _Float16, + // which is natively supported by the device compiler + static inline constexpr bool use_native_cpp_ops = true; +#else + using RawT = uint16_t; + using EnclosingCppT = float; + // On host, we can't use native Cpp '+', '-' etc. over uint16_t to emulate the + // operations on half type. + static inline constexpr bool use_native_cpp_ops = false; +#endif // __SYCL_DEVICE_ONLY__ +}; + +using half_raw = __raw_t; + +template <> +ESIMD_INLINE sycl::half +__esimd_wrapper_type_bitcast_to(half_raw Val) { + return WrapperElementTypeProxy::bitcast_to_half(Val); +} + +template <> +ESIMD_INLINE half_raw +__esimd_wrapper_type_bitcast_from(sycl::half Val) { + return WrapperElementTypeProxy::bitcast_from_half(Val); +} + +template <> +struct is_esimd_arithmetic_type<__raw_t, void> : std::true_type {}; + +// Misc +inline std::ostream &operator<<(std::ostream &O, sycl::half const &rhs) { + O << static_cast(rhs); + return O; +} + +inline std::istream &operator>>(std::istream &I, sycl::half &rhs) { + float ValFloat = 0.0f; + I >> ValFloat; + rhs = ValFloat; + return I; +} + +// The only other place which needs to be updated to support a new type is +// the is_wrapper_elem_type_v meta function. + +//////////////////////////////////////////////////////////////////////////////// +// sycl::bfloat16 traits +//////////////////////////////////////////////////////////////////////////////// +// TODO + +} // namespace __SEIEED +} // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/intrin.hpp index bb8e6d5843ab4..5bbe1b6519715 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/intrin.hpp @@ -139,12 +139,15 @@ namespace experimental { namespace esimd { namespace detail { +template using __st = __raw_t; + /// read from a basic region of a vector, return a vector template -__SEIEED::vector_type_t ESIMD_INLINE -readRegion(const __SEIEED::vector_type_t &Base, RTy Region) { - using ElemTy = typename RTy::element_type; - auto Base1 = bitcast(Base); +__SEIEED::vector_type_t<__st, RTy::length> + ESIMD_INLINE readRegion(const __SEIEED::vector_type_t<__st, BN> &Base, + RTy Region) { + using ElemTy = __st; + auto Base1 = bitcast, BN>(Base); constexpr int Bytes = BN * sizeof(BT); if constexpr (Bytes == RTy::Size_in_bytes) // This is a no-op format. 
@@ -163,14 +166,14 @@ readRegion(const __SEIEED::vector_type_t &Base, RTy Region) { /// read from a nested region of a vector, return a vector template -ESIMD_INLINE __SEIEED::vector_type_t -readRegion(const __SEIEED::vector_type_t &Base, +ESIMD_INLINE __SEIEED::vector_type_t<__st, T::length> +readRegion(const __SEIEED::vector_type_t<__st, BN> &Base, std::pair Region) { // parent-region type using PaTy = typename shape_type::type; constexpr int BN1 = PaTy::length; using BT1 = typename PaTy::element_type; - using ElemTy = typename T::element_type; + using ElemTy = __st; // Recursively read the base auto Base1 = readRegion(Base, Region.second); if constexpr (!T::Is_2D || BN1 * sizeof(BT1) == T::Size_in_bytes) @@ -178,7 +181,7 @@ readRegion(const __SEIEED::vector_type_t &Base, return readRegion(Base1, Region.first); else { static_assert(T::Is_2D); - static_assert(std::is_same::value); + static_assert(std::is_same>::value); // To read a 2D region, we need the parent region // Read full rows with non-trivial vertical and horizontal stride = 1. constexpr int M = T::Size_y * PaTy::Size_x; diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/operators.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/operators.hpp index e19707515d1d1..0cd1efe73d927 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/operators.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/operators.hpp @@ -10,6 +10,7 @@ #pragma once +#include #include #include #include @@ -68,20 +69,21 @@ namespace __SEIEED { // ========= simd_obj_impl bitwise logic and arithmetic operators -#define __ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(BINOP, COND) \ +#define __ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(BINOP, BINOP_ID, COND) \ \ /* simd_obj_impl BINOP simd_obj_impl */ \ template class SimdT, \ class SimdTx = SimdT, class = std::enable_if_t> \ inline auto operator BINOP( \ - const __SEIEED::simd_obj_impl> &LHS, \ - const __SEIEED::simd_obj_impl> &RHS) { \ + const __SEIEED::simd_obj_impl<__raw_t, N, SimdT> &LHS, \ + const __SEIEED::simd_obj_impl<__raw_t, N, SimdT> &RHS) { \ if constexpr (__SEIEED::is_simd_type_v>) { \ - using SimdPromotedT = \ - __SEIEED::computation_type_t, SimdT>; \ - using VecT = typename SimdPromotedT::vector_type; \ - return SimdPromotedT(__SEIEED::convert(LHS.data()) \ - BINOP __SEIEED::convert(RHS.data())); \ + using PromotedT = __SEIEED::computation_type_t; \ + /* vector_binary_op returns SimdT::raw_vector_type */ \ + SimdT Res = vector_binary_op( \ + __SEIEED::convert_vector(LHS.data()), \ + __SEIEED::convert_vector(RHS.data())); \ + return Res; \ } else { \ /* for SimdT=simd_mask_impl T1 and T2 are both equal to \ * simd_mask_elem_type */ \ @@ -93,7 +95,8 @@ namespace __SEIEED { template class SimdT1, class T2, \ class SimdTx = SimdT1, class = std::enable_if_t> \ inline auto operator BINOP( \ - const __SEIEED::simd_obj_impl> &LHS, T2 RHS) { \ + const __SEIEED::simd_obj_impl<__raw_t, N1, SimdT1> &LHS, \ + T2 RHS) { \ if constexpr (__SEIEED::is_simd_type_v>) { \ /* convert the SCALAR to vector type and reuse the basic operation over \ * simd objects */ \ @@ -109,7 +112,8 @@ namespace __SEIEED { template class SimdT2, \ class SimdTx = SimdT2, class = std::enable_if_t> \ inline auto operator BINOP( \ - T1 LHS, const __SEIEED::simd_obj_impl> &RHS) { \ + T1 LHS, \ + const __SEIEED::simd_obj_impl<__raw_t, N2, SimdT2> &RHS) { \ if constexpr (__SEIEED::is_simd_type_v>) { \ /* convert the SCALAR to vector type and reuse the basic operation over \ * simd objects */ \ @@ -122,27 +126,28 @@ 
namespace __SEIEED { #define __ESIMD_BITWISE_OP_FILTER \ std::is_integral_v &&std::is_integral_v -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(^, __ESIMD_BITWISE_OP_FILTER) -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(|, __ESIMD_BITWISE_OP_FILTER) -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(&, __ESIMD_BITWISE_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(^, BinOp::bit_xor, __ESIMD_BITWISE_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(|, BinOp::bit_or, __ESIMD_BITWISE_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(&, BinOp::bit_and, __ESIMD_BITWISE_OP_FILTER) #undef __ESIMD_BITWISE_OP_FILTER #define __ESIMD_SHIFT_OP_FILTER \ std::is_integral_v &&std::is_integral_v \ &&__SEIEED::is_simd_type_v -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(%, __ESIMD_SHIFT_OP_FILTER) -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(<<, __ESIMD_SHIFT_OP_FILTER) -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(>>, __ESIMD_SHIFT_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(%, BinOp::rem, __ESIMD_SHIFT_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(<<, BinOp::shl, __ESIMD_SHIFT_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(>>, BinOp::shr, __ESIMD_SHIFT_OP_FILTER) #undef __ESIMD_SHIFT_OP_FILTER #define __ESIMD_ARITH_OP_FILTER \ - __SEIEED::is_vectorizable_v &&__SEIEED::is_vectorizable_v \ - &&__SEIEED::is_simd_type_v - -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(+, __ESIMD_ARITH_OP_FILTER) -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(-, __ESIMD_ARITH_OP_FILTER) -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(*, __ESIMD_ARITH_OP_FILTER) -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(/, __ESIMD_ARITH_OP_FILTER) + __SEIEED::is_valid_simd_elem_type_v \ + &&__SEIEED::is_valid_simd_elem_type_v \ + &&__SEIEED::is_simd_type_v + +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(+, BinOp::add, __ESIMD_ARITH_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(-, BinOp::sub, __ESIMD_ARITH_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(*, BinOp::mul, __ESIMD_ARITH_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(/, BinOp::div, __ESIMD_ARITH_OP_FILTER) #undef __ESIMD_ARITH_OP_FILTER #undef __ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP @@ -151,24 +156,28 @@ __ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(/, __ESIMD_ARITH_OP_FILTER) // Both simd and simd_mask will match simd_obj_impl argument when resolving // operator overloads. 
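[Editor's note] The practical effect of widening the arithmetic filter from `is_vectorizable_v` to `is_valid_simd_elem_type_v` is that simd objects with wrapper element types now support these operators. A user-level sketch, assuming the ESIMD headers are included and `using namespace sycl::ext::intel::experimental::esimd;` is in effect:

```cpp
// Arithmetic on simd objects with a wrapper element type (sycl::half) now
// resolves through the BinOp-aware operator macro above.
simd<sycl::half, 8> fma_sketch(simd<sycl::half, 8> A, simd<sycl::half, 8> B,
                               simd<sycl::half, 8> C) SYCL_ESIMD_FUNCTION {
  // On device this lowers to native _Float16 vector ops; on host it takes the
  // float-emulation path from elem_type_traits.hpp.
  return A * B + C;
}
```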
-#define __ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(CMPOP, COND) \ +#define __ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(CMPOP, CMPOP_ID, COND) \ \ /* simd_obj_impl CMPOP simd_obj_impl */ \ template class SimdT, \ class SimdTx = SimdT, class = std::enable_if_t> \ inline __SEIEE::simd_mask operator CMPOP( \ - const __SEIEED::simd_obj_impl> &LHS, \ - const __SEIEED::simd_obj_impl> &RHS) { \ - using MaskVecT = typename __SEIEE::simd_mask::vector_type; \ + const __SEIEED::simd_obj_impl<__raw_t, N, SimdT> &LHS, \ + const __SEIEED::simd_obj_impl<__raw_t, N, SimdT> &RHS) { \ + using MaskVecT = typename __SEIEE::simd_mask::raw_vector_type; \ \ - if constexpr (__SEIEED::is_simd_type_v>) { \ - using PromSimdT = \ - __SEIEED::computation_type_t, SimdT>; \ - using PromVecT = typename PromSimdT::vector_type; \ - auto ResVec = __SEIEED::convert(LHS.data()) \ - CMPOP __SEIEED::convert(RHS.data()); \ - return __SEIEE::simd_mask(__SEIEED::convert(ResVec) & \ - MaskVecT(1)); \ + if constexpr (is_simd_type_v>) { \ + using PromotedT = computation_type_t; \ + /* vector_comparison_op returns vector_type_t, where Ti is \ + * integer type */ \ + /* of the same bit size as PromotedT */ \ + auto Res = vector_comparison_op( \ + __SEIEED::convert_vector(LHS.data()), \ + __SEIEED::convert_vector(RHS.data())); \ + using ResElemT = element_type_t; \ + return __SEIEE::simd_mask( \ + __SEIEED::convert_vector(Res) & \ + MaskVecT(1)); \ } else { \ /* this is comparison of masks, don't perform type promotion */ \ auto ResVec = LHS.data() CMPOP RHS.data(); \ @@ -180,49 +189,57 @@ __ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(/, __ESIMD_ARITH_OP_FILTER) /* simd_obj_impl CMPOP SCALAR */ \ template class SimdT1, class T2, \ class SimdTx = SimdT1, \ - class = std::enable_if_t<__SEIEED::is_vectorizable_v && COND>> \ + class = std::enable_if_t< \ + __SEIEED::is_valid_simd_elem_type_v && COND>> \ inline __SEIEE::simd_mask operator CMPOP( \ - const __SEIEED::simd_obj_impl> &LHS, T2 RHS) { \ + const __SEIEED::simd_obj_impl<__raw_t, N1, SimdT1> &LHS, \ + T2 RHS) { \ if constexpr (__SEIEED::is_simd_type_v>) \ /* simd case */ \ return LHS CMPOP SimdT1(RHS); \ else \ /* simd_mask case - element type is fixed */ \ - return LHS CMPOP SimdT1((T1)RHS); \ + return LHS CMPOP SimdT1(convert_scalar(RHS)); \ } \ \ /* SCALAR CMPOP simd_obj_impl */ \ template class SimdT2, \ class SimdTx = SimdT2, \ - class = std::enable_if_t<__SEIEED::is_vectorizable_v && COND>> \ + class = std::enable_if_t< \ + __SEIEED::is_valid_simd_elem_type_v && COND>> \ inline __SEIEE::simd_mask operator CMPOP( \ - T1 LHS, const __SEIEED::simd_obj_impl> &RHS) { \ + T1 LHS, \ + const __SEIEED::simd_obj_impl<__raw_t, N2, SimdT2> &RHS) { \ if constexpr (__SEIEED::is_simd_type_v>) \ /* simd case */ \ return SimdT2(LHS) CMPOP RHS; \ else \ /* simd_mask case - element type is fixed */ \ - return SimdT2((T2)LHS) CMPOP RHS; \ + return SimdT2(convert_scalar(LHS)) CMPOP RHS; \ } // Equality comparison is defined for all simd_obj_impl subclasses. -__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(==, true) -__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(!=, true) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(==, CmpOp::eq, true) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(!=, CmpOp::ne, true) // Relational operators are defined only for the simd type. 
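[Editor's note] At user level the comparison rewrite keeps the familiar contract: comparing wrapper-element simd objects still yields a `simd_mask`. A hedged sketch of what this enables (same namespace assumptions as above; not a test from the patch):

```cpp
// Relational operators on simd<sycl::half, N>; each lane of M is 0 or 1.
simd<sycl::half, 16> clamp_sketch(simd<sycl::half, 16> X, sycl::half Limit) {
  simd<sycl::half, 16> Y(Limit);  // broadcast constructor
  simd_mask<16> M = X < Y;        // lowered via vector_comparison_op<CmpOp::lt, ...>
  simd<sycl::half, 16> Res;
  Res.merge(X, Y, M);             // take X where M is set, Y elsewhere
  return Res;
}
```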
-__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(<, __SEIEED::is_simd_type_v) -__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(>, __SEIEED::is_simd_type_v) -__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(<=, __SEIEED::is_simd_type_v) -__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(>=, __SEIEED::is_simd_type_v) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(<, CmpOp::lt, __SEIEED::is_simd_type_v) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(>, CmpOp::gt, __SEIEED::is_simd_type_v) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(<=, CmpOp::lte, + __SEIEED::is_simd_type_v) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(>=, CmpOp::gte, + __SEIEED::is_simd_type_v) // Logical operators are defined only for the simd_mask type -__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(&&, __SEIEED::is_simd_mask_type_v) -__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(||, __SEIEED::is_simd_mask_type_v) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(&&, BinOp::log_and, + __SEIEED::is_simd_mask_type_v) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(||, BinOp::log_or, + __SEIEED::is_simd_mask_type_v) #undef __ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP } // namespace __SEIEED -namespace __SEIEE { +namespace __SEIEED { //////////////////////////////////////////////////////////////////////////////// // simd_view global operators //////////////////////////////////////////////////////////////////////////////// @@ -320,8 +337,8 @@ __ESIMD_DEF_SIMD_VIEW_BIN_OP(>>, __ESIMD_SHIFT_OP_FILTER) #undef __ESIMD_SHIFT_OP_FILTER #define __ESIMD_ARITH_OP_FILTER \ - __SEIEED::is_simd_type_v &&__SEIEED::is_vectorizable_v \ - &&__SEIEED::is_vectorizable_v + __SEIEED::is_simd_type_v &&__SEIEED::is_valid_simd_elem_type_v \ + &&__SEIEED::is_valid_simd_elem_type_v __ESIMD_DEF_SIMD_VIEW_BIN_OP(+, __ESIMD_ARITH_OP_FILTER) __ESIMD_DEF_SIMD_VIEW_BIN_OP(-, __ESIMD_ARITH_OP_FILTER) @@ -364,32 +381,33 @@ __ESIMD_DEF_SIMD_VIEW_BIN_OP(||, __SEIEED::is_simd_mask_type_v) } \ \ /* simd_view CMPOP simd_obj_impl */ \ - template ::length == N2) && \ (__SEIEED::is_simd_type_v == \ __SEIEED::is_simd_type_v)&&COND>> \ inline __SEIEE::simd_mask operator CMPOP( \ const __SEIEE::simd_view &LHS, \ - const __SEIEED::simd_obj_impl &RHS) { \ + const __SEIEED::simd_obj_impl &RHS) { \ return LHS.read() CMPOP SimdT2(RHS.data()); \ } \ \ /* simd_obj_impl CMPOP simd_view */ \ - template ::length == N1) && \ (__SEIEED::is_simd_type_v == \ __SEIEED::is_simd_type_v)&&COND>> \ inline __SEIEE::simd_mask operator CMPOP( \ - const __SEIEED::simd_obj_impl &LHS, \ + const __SEIEED::simd_obj_impl &LHS, \ const __SEIEE::simd_view &RHS) { \ return SimdT1(LHS.data()) CMPOP RHS.read(); \ } \ \ /* simd_view CMPOP SCALAR */ \ template && COND>> \ + class = std::enable_if_t< \ + __SEIEED::is_valid_simd_elem_type_v && COND>> \ inline auto operator CMPOP(const __SEIEE::simd_view &LHS, \ T2 RHS) { \ return LHS.read() CMPOP RHS; \ @@ -397,7 +415,8 @@ __ESIMD_DEF_SIMD_VIEW_BIN_OP(||, __SEIEED::is_simd_mask_type_v) \ /* SCALAR CMPOP simd_view */ \ template && COND>> \ + class = std::enable_if_t< \ + __SEIEED::is_valid_simd_elem_type_v && COND>> \ inline auto operator CMPOP( \ T1 LHS, const __SEIEE::simd_view &RHS) { \ return LHS CMPOP RHS.read(); \ @@ -415,4 +434,4 @@ __ESIMD_DEF_SIMD_VIEW_CMP_OP(>=, __SEIEED::is_simd_type_v) #undef __ESIMD_DEF_SIMD_VIEW_CMP_OP -} // namespace __SEIEE +} // namespace __SEIEED diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp index 806cd413827f8..bed505ac88b94 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp +++ 
b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp @@ -34,9 +34,10 @@ class simd_mask_impl using base_type = detail::simd_obj_impl>; public: + using raw_element_type = T; using element_type = T; - using vector_type = typename base_type::vector_type; - static_assert(std::is_same_v> && + using raw_vector_type = typename base_type::raw_vector_type; + static_assert(std::is_same_v> && "mask impl type mismatch"); simd_mask_impl() = default; @@ -48,7 +49,7 @@ class simd_mask_impl /// Implicit conversion constructor from a raw vector object. // TODO this should be made inaccessible from user code. - simd_mask_impl(const vector_type &Val) : base_type(Val) {} + simd_mask_impl(const raw_vector_type &Val) : base_type(Val) {} /// Initializer list constructor. __SYCL_DEPRECATED("use constructor from array, e.g: simd_mask<3> x({0,1,1});") @@ -56,7 +57,7 @@ class simd_mask_impl /// Construct from an array. To allow e.g. simd_mask m({1,0,0,1,...}). template > - simd_mask_impl(const element_type(&&Arr)[N1]) { + simd_mask_impl(const raw_element_type (&&Arr)[N1]) { base_type::template init_from_array(std::move(Arr)); } diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp index 7c908c57935a3..35a3c8fa91657 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp @@ -10,10 +10,11 @@ #pragma once +#include #include #include #include -#include +#include #include __SYCL_INLINE_NAMESPACE(cl) { @@ -29,7 +30,7 @@ namespace esimd { /// element_aligned_tag type. Flag of this type should be used in load and store /// operations when memory address is aligned by simd object's element type. struct element_aligned_tag { - template ::type> + template > static constexpr unsigned alignment = alignof(ET); }; @@ -86,7 +87,7 @@ namespace detail { /// template arguments are needed, template aliases can be used /// (simd_mask_type). /// -/// \tparam Ty the element type +/// \tparam RawTy raw (storage) element type /// \tparam N number of elements /// \tparam Derived - a class derived from this one; this class and its /// derivatives must follow the 'curiously recurring template' pattern. @@ -94,24 +95,29 @@ namespace detail { /// types.hpp, used to disable invalid specializations. /// /// \ingroup sycl_esimd -template class simd_obj_impl { +/// +template +class simd_obj_impl { template friend class simd_view; template friend class simd; template friend class simd_mask_impl; + using element_type = simd_like_obj_element_type_t; + using Ty = element_type; + public: /// The underlying builtin data type. - using vector_type = vector_type_t; + using raw_vector_type = vector_type_t; /// The element type of this simd_obj_impl object. - using element_type = Ty; + using raw_element_type = RawTy; /// The number of elements in this simd_obj_impl object. static constexpr int length = N; protected: template > - void init_from_array(const Ty(&&Arr)[N1]) noexcept { + void init_from_array(const RawTy (&&Arr)[N1]) noexcept { for (auto I = 0; I < N; ++I) { M_data[I] = Arr[I]; } @@ -133,20 +139,15 @@ template class simd_obj_impl { } /// Implicit conversion constructor from another \c simd_obj_impl object. - template - simd_obj_impl( - const simd_obj_impl, - SFINAE> &other) { + template + simd_obj_impl(const simd_obj_impl &other) { __esimd_dbg_print(simd_obj_impl(const simd_obj_impl... 
> &other)); - if constexpr (std::is_same_v) - set(other.data()); - else - set(__builtin_convertvector(other.data(), vector_type)); + set(convert_vector, N>(other.data())); } /// Implicit conversion constructor from a raw vector object. - simd_obj_impl(const vector_type &Val) { - __esimd_dbg_print(simd_obj_impl(const vector_type &Val)); + simd_obj_impl(const raw_vector_type &Val) { + __esimd_dbg_print(simd_obj_impl(const raw_vector_type &Val)); set(Val); } @@ -158,8 +159,8 @@ template class simd_obj_impl { /// following will compile: /// simd x = {1, 2, 3, 4}; __SYCL_DEPRECATED("use constructor from array, e.g: simd x({1,2,3});") - simd_obj_impl(std::initializer_list Ilist) noexcept { - __esimd_dbg_print(simd_obj_impl(std::initializer_list Ilist)); + simd_obj_impl(std::initializer_list Ilist) noexcept { + __esimd_dbg_print(simd_obj_impl(std::initializer_list Ilist)); int i = 0; for (auto It = Ilist.begin(); It != Ilist.end() && i < N; ++It) { M_data[i++] = *It; @@ -171,21 +172,23 @@ template class simd_obj_impl { __esimd_dbg_print(simd_obj_impl(Ty Val, Ty Step)); #pragma unroll for (int i = 0; i < N; ++i) { - M_data[i] = Val; - Val += Step; + M_data[i] = bitcast_to_raw_type(Val); + Val = binary_op(Val, Step); } } /// Broadcast constructor - simd_obj_impl(Ty Val) noexcept { - __esimd_dbg_print(simd_obj_impl(Ty Val)); - M_data = Val; + template >> + simd_obj_impl(T1 Val) noexcept { + __esimd_dbg_print(simd_obj_impl(T1 Val)); + M_data = bitcast_to_raw_type(detail::convert_scalar(Val)); } /// Construct from an array. To allow e.g. simd_mask_type m({1,0,0,1,...}). template > - simd_obj_impl(const Ty(&&Arr)[N1]) noexcept { - __esimd_dbg_print(simd_obj_impl(const Ty(&&Arr)[N1])); + simd_obj_impl(const RawTy (&&Arr)[N1]) noexcept { + __esimd_dbg_print(simd_obj_impl(const RawTy(&&Arr)[N1])); init_from_array(std::move(Arr)); } @@ -212,9 +215,10 @@ template class simd_obj_impl { /// @} // Load the object's value from array. - template std::enable_if_t copy_from(const Ty(&&Arr)[N1]) { - __esimd_dbg_print(copy_from(const Ty(&&Arr)[N1])); - vector_type Tmp; + template + std::enable_if_t copy_from(const RawTy (&&Arr)[N1]) { + __esimd_dbg_print(copy_from(const RawTy(&&Arr)[N1])); + raw_vector_type Tmp; for (auto I = 0; I < N; ++I) { Tmp[I] = Arr[I]; } @@ -222,8 +226,8 @@ template class simd_obj_impl { } // Store the object's value to array. - template std::enable_if_t copy_to(Ty(&&Arr)[N1]) const { - __esimd_dbg_print(copy_to(Ty(&&Arr)[N1])); + template std::enable_if_t copy_to(RawTy (&&Arr)[N1]) const { + __esimd_dbg_print(copy_to(RawTy(&&Arr)[N1])); for (auto I = 0; I < N; ++I) { Arr[I] = data()[I]; } @@ -231,30 +235,31 @@ template class simd_obj_impl { /// @{ /// Conversion operators. - explicit operator const vector_type &() const & { - __esimd_dbg_print(explicit operator const vector_type &() const &); + explicit operator const raw_vector_type &() const & { + __esimd_dbg_print(explicit operator const raw_vector_type &() const &); return M_data; } - explicit operator vector_type &() & { - __esimd_dbg_print(explicit operator vector_type &() &); + explicit operator raw_vector_type &() & { + __esimd_dbg_print(explicit operator raw_vector_type &() &); return M_data; } - /// Explicit conversion for simd_obj_impl into T. + /// Type conversion into a scalar: + /// simd_obj_impl> to Ty. 
template > operator Ty() const { - __esimd_dbg_print(explicit operator Ty()); - return data()[0]; + __esimd_dbg_print(operator Ty()); + return bitcast_to_wrapper_type(data()[0]); } /// @} - vector_type data() const { - __esimd_dbg_print(vector_type data()); + raw_vector_type data() const { + __esimd_dbg_print(raw_vector_type data()); #ifndef __SYCL_DEVICE_ONLY__ return M_data; #else - return __esimd_vload(&M_data); + return __esimd_vload(&M_data); #endif } @@ -269,8 +274,8 @@ template class simd_obj_impl { /// Whole region update with predicates. void merge(const Derived &Val, const simd_mask_type &Mask) { - set(__esimd_wrregion(data(), Val.data(), 0, - Mask.data())); + set(__esimd_wrregion(data(), Val.data(), 0, + Mask.data())); } void merge(const Derived &Val1, Derived Val2, const simd_mask_type &Mask) { @@ -280,7 +285,7 @@ template class simd_obj_impl { /// View this simd_obj_impl object in a different element type. template auto bit_cast_view() &[[clang::lifetimebound]] { - using TopRegionTy = compute_format_type_t; + using TopRegionTy = compute_format_type_t; using RetTy = simd_view; return RetTy{cast_this_to_derived(), TopRegionTy{0}}; } @@ -294,8 +299,7 @@ template class simd_obj_impl { /// View as a 2-dimensional simd_view. template auto bit_cast_view() &[[clang::lifetimebound]] { - using TopRegionTy = - compute_format_type_2d_t; + using TopRegionTy = compute_format_type_2d_t; using RetTy = simd_view; return RetTy{cast_this_to_derived(), TopRegionTy{0, 0}}; } @@ -332,16 +336,16 @@ template class simd_obj_impl { static_assert(Size > 1 || Stride == 1, "Stride must be 1 in single-element region"); Derived &&Val = std::move(cast_this_to_derived()); - return __esimd_rdregion(Val.data(), - Offset); + return __esimd_rdregion(Val.data(), + Offset); } /// Read single element, return value only (not reference). - Ty operator[](int i) const { return data()[i]; } + Ty operator[](int i) const { return bitcast_to_wrapper_type(data()[i]); } /// Read single element, return value only (not reference). __SYCL_DEPRECATED("use operator[] form.") - Ty operator()(int i) const { return data()[i]; } + Ty operator()(int i) const { return bitcast_to_wrapper_type(data()[i]); } /// Return writable view of a single element. 
simd_view> operator[](int i) @@ -360,14 +364,14 @@ template class simd_obj_impl { template resize_a_simd_type_t iselect(const simd &Indices) { - vector_type_t Offsets = Indices.data() * sizeof(Ty); - return __esimd_rdindirect(data(), Offsets); + vector_type_t Offsets = Indices.data() * sizeof(RawTy); + return __esimd_rdindirect(data(), Offsets); } // TODO ESIMD_EXPERIMENTAL /// update single element void iupdate(ushort Index, Ty V) { auto Val = data(); - Val[Index] = V; + Val[Index] = bitcast_to_raw_type(V); set(Val); } @@ -377,9 +381,9 @@ template class simd_obj_impl { void iupdate(const simd &Indices, const resize_a_simd_type_t &Val, const simd_mask_type &Mask) { - vector_type_t Offsets = Indices.data() * sizeof(Ty); - set(__esimd_wrindirect(data(), Val.data(), Offsets, - Mask.data())); + vector_type_t Offsets = Indices.data() * sizeof(RawTy); + set(__esimd_wrindirect(data(), Val.data(), Offsets, + Mask.data())); } /// \name Replicate @@ -454,8 +458,8 @@ template class simd_obj_impl { template resize_a_simd_type_t replicate_vs_w_hs(uint16_t Offset) const { - return __esimd_rdregion(data(), - Offset * sizeof(Ty)); + return __esimd_rdregion( + data(), Offset * sizeof(RawTy)); } ///@} @@ -479,19 +483,18 @@ template class simd_obj_impl { /// Write a simd_obj_impl-vector into a basic region of a simd_obj_impl /// object. - template - ESIMD_INLINE void writeRegion( - RTy Region, - const vector_type_t &Val) { - using ElemTy = typename RTy::element_type; - if constexpr (N * sizeof(Ty) == RTy::length * sizeof(ElemTy)) + template > + ESIMD_INLINE void writeRegion(RTy Region, + const vector_type_t &Val) { + + if constexpr (N * sizeof(RawTy) == RTy::length * sizeof(ElemTy)) // update the entire vector - set(bitcast(Val)); + set(bitcast(Val)); else { static_assert(!RTy::Is_2D); // If element type differs, do bitcast conversion first. - auto Base = bitcast(data()); - constexpr int BN = (N * sizeof(Ty)) / sizeof(ElemTy); + auto Base = bitcast(data()); + constexpr int BN = (N * sizeof(RawTy)) / sizeof(ElemTy); // Access the region information. constexpr int M = RTy::Size_x; constexpr int Stride = RTy::Stride_x; @@ -501,27 +504,26 @@ template class simd_obj_impl { auto Merged = __esimd_wrregion(Base, Val, Offset); // Convert back to the original element type, if needed. - set(bitcast(Merged)); + set(bitcast(Merged)); } } /// Write a simd_obj_impl-vector into a nested region of a simd_obj_impl /// object. - template - ESIMD_INLINE void - writeRegion(std::pair Region, - const vector_type_t &Val) { + template > + ESIMD_INLINE void writeRegion(std::pair Region, + const vector_type_t &Val) { // parent-region type using PaTy = typename shape_type::type; - using ElemTy = typename TR::element_type; - using BT = typename PaTy::element_type; + using BT = __raw_t; constexpr int BN = PaTy::length; if constexpr (PaTy::Size_in_bytes == TR::Size_in_bytes) { writeRegion(Region.second, bitcast(Val)); } else { // Recursively read the base - auto Base = readRegion(data(), Region.second); + auto Base = readRegion(data(), Region.second); // If element type differs, do bitcast conversion first. auto Base1 = bitcast(Base); constexpr int BN1 = PaTy::Size_in_bytes / sizeof(ElemTy); @@ -574,9 +576,10 @@ template class simd_obj_impl { /// global address space, otherwise behavior is undefined. /// @param flags for the copy operation. If the template parameter Flags is /// is element_aligned_tag, \p addr must be aligned by alignof(T). 
If Flags is - /// vector_aligned_tag, \p addr must be aligned by simd_obj_impl's vector_type - /// alignment. If Flags is overaligned_tag, \p addr must be aligned by N. - /// Program not meeting alignment requirements results in undefined behavior. + /// vector_aligned_tag, \p addr must be aligned by simd_obj_impl's + /// raw_vector_type alignment. If Flags is overaligned_tag, \p addr must be + /// aligned by N. Program not meeting alignment requirements results in + /// undefined behavior. template >> ESIMD_INLINE void copy_from(const Ty *addr, Flags = {}) SYCL_ESIMD_FUNCTION; @@ -589,9 +592,10 @@ template class simd_obj_impl { /// @param offset offset to copy from (in bytes). /// @param flags for the copy operation. If the template parameter Flags is /// is element_aligned_tag, offset must be aligned by alignof(T). If Flags is - /// vector_aligned_tag, offset must be aligned by simd_obj_impl's vector_type - /// alignment. If Flags is overaligned_tag, offset must be aligned by N. - /// Program not meeting alignment requirements results in undefined behavior. + /// vector_aligned_tag, offset must be aligned by simd_obj_impl's + /// raw_vector_type alignment. If Flags is overaligned_tag, offset must be + /// aligned by N. Program not meeting alignment requirements results in + /// undefined behavior. template >> @@ -604,9 +608,10 @@ template class simd_obj_impl { /// global address space, otherwise behavior is undefined. /// @param flags for the copy operation. If the template parameter Flags is /// is element_aligned_tag, \p addr must be aligned by alignof(T). If Flags is - /// vector_aligned_tag, \p addr must be aligned by simd_obj_impl's vector_type - /// alignment. If Flags is overaligned_tag, \p addr must be aligned by N. - /// Program not meeting alignment requirements results in undefined behavior. + /// vector_aligned_tag, \p addr must be aligned by simd_obj_impl's + /// raw_vector_type alignment. If Flags is overaligned_tag, \p addr must be + /// aligned by N. Program not meeting alignment requirements results in + /// undefined behavior. template >> ESIMD_INLINE void copy_to(Ty *addr, Flags = {}) const SYCL_ESIMD_FUNCTION; @@ -618,9 +623,10 @@ template class simd_obj_impl { /// @param offset offset to copy from. /// @param flags for the copy operation. If the template parameter Flags is /// is element_aligned_tag, offset must be aligned by alignof(T). If Flags is - /// vector_aligned_tag, offset must be aligned by simd_obj_impl's vector_type - /// alignment. If Flags is overaligned_tag, offset must be aligned by N. - /// Program not meeting alignment requirements results in undefined behavior. + /// vector_aligned_tag, offset must be aligned by simd_obj_impl's + /// raw_vector_type alignment. If Flags is overaligned_tag, offset must be + /// aligned by N. Program not meeting alignment requirements results in + /// undefined behavior. template >> @@ -630,21 +636,22 @@ template class simd_obj_impl { /// @} // Memory operations + // Unary operations. + /// Bitwise inversion, available in all subclasses. template >> Derived operator~() const { - return Derived(~data()); + return Derived{ + detail::vector_unary_op(data())}; } /// Unary logical negation operator, available in all subclasses. - /// Similarly to C++, where !x returns bool, !simd returns as simd_mask, where + /// Similarly to C++, where !x returns bool, !simd returns a simd_mask, where /// each element is a result of comparision with zero. 
+ /// No need to implement via detail::vector_unary_op template >> simd_mask_type operator!() const { - using MaskVecT = typename simd_mask_type::vector_type; - auto R = data() == vector_type(0); - return simd_mask_type{__builtin_convertvector(R, MaskVecT) & - MaskVecT(1)}; + return *this == 0; } #define __ESIMD_DEF_SIMD_OBJ_IMPL_OPASSIGN(BINOP, OPASSIGN, COND) \ @@ -656,7 +663,9 @@ template class simd_obj_impl { Derived &operator OPASSIGN( \ const __SEIEED::simd_obj_impl &RHS) { \ auto Res = *this BINOP RHS; \ - set(__SEIEED::convert(Res.data())); \ + using ResT = decltype(Res); \ + set(__SEIEED::convert_vector(Res.data())); \ return cast_this_to_derived(); \ } \ \ @@ -670,7 +679,9 @@ template class simd_obj_impl { Derived &operator OPASSIGN( \ const __SEIEE::simd_view &RHS) { \ auto Res = *this BINOP RHS.read(); \ - set(__SEIEED::convert(Res.data())); \ + using ResT = decltype(Res); \ + set(__SEIEED::convert_vector(Res.data())); \ return cast_this_to_derived(); \ } \ \ @@ -681,7 +692,7 @@ template class simd_obj_impl { using RHSVecT = __SEIEED::construct_a_simd_type_t; \ return *this OPASSIGN RHSVecT(RHS); \ } else { \ - return *this OPASSIGN Derived((Ty)RHS); \ + return *this OPASSIGN Derived((RawTy)RHS); \ } \ } @@ -720,14 +731,14 @@ template class simd_obj_impl { private: // The underlying data for this vector. - vector_type M_data; + raw_vector_type M_data; protected: - void set(const vector_type &Val) { + void set(const raw_vector_type &Val) { #ifndef __SYCL_DEVICE_ONLY__ M_data = Val; #else - __esimd_vstore(&M_data, Val); + __esimd_vstore(&M_data, Val); #endif } }; @@ -736,8 +747,10 @@ template class simd_obj_impl { template template -void simd_obj_impl::copy_from(const T *Addr, - Flags) SYCL_ESIMD_FUNCTION { +void simd_obj_impl::copy_from( + const simd_obj_impl::element_type *Addr, + Flags) SYCL_ESIMD_FUNCTION { + using UT = simd_obj_impl::element_type; constexpr unsigned Size = sizeof(T) * N; constexpr unsigned Align = Flags::template alignment; @@ -751,14 +764,14 @@ void simd_obj_impl::copy_from(const T *Addr, constexpr unsigned BlockN = BlockSize / sizeof(T); ForHelper::unroll([BlockN, Addr, this](unsigned Block) { select(Block * BlockN) = - block_load(Addr + (Block * BlockN), Flags{}); + block_load(Addr + (Block * BlockN), Flags{}); }); } if constexpr (RemSize > 0) { constexpr unsigned RemN = RemSize / sizeof(T); constexpr unsigned BlockN = BlockSize / sizeof(T); select(NumBlocks * BlockN) = - block_load(Addr + (NumBlocks * BlockN), Flags{}); + block_load(Addr + (NumBlocks * BlockN), Flags{}); } } else if constexpr (sizeof(T) == 8) { simd BC(reinterpret_cast(Addr), Flags{}); @@ -769,7 +782,7 @@ void simd_obj_impl::copy_from(const T *Addr, simd Offsets(0u, sizeof(T)); ForHelper::unroll([Addr, &Offsets, this](unsigned Block) { select(Block * ChunkSize) = - gather(Addr + (Block * ChunkSize), Offsets); + gather(Addr + (Block * ChunkSize), Offsets); }); } constexpr unsigned RemN = N % ChunkSize; @@ -779,14 +792,14 @@ void simd_obj_impl::copy_from(const T *Addr, } else if constexpr (RemN == 8 || RemN == 16) { simd Offsets(0u, sizeof(T)); select(NumChunks * ChunkSize) = - gather(Addr + (NumChunks * ChunkSize), Offsets); + gather(Addr + (NumChunks * ChunkSize), Offsets); } else { constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 
16 : 32; simd_mask_type Pred(0); Pred.template select() = 1; simd Offsets(0u, sizeof(T)); - simd Vals = - gather(Addr + (NumChunks * ChunkSize), Offsets, Pred); + simd Vals = + gather(Addr + (NumChunks * ChunkSize), Offsets, Pred); select(NumChunks * ChunkSize) = Vals.template select(); } @@ -800,6 +813,8 @@ ESIMD_INLINE EnableIfAccessor simd_obj_impl::copy_from(AccessorT acc, uint32_t offset, Flags) SYCL_ESIMD_FUNCTION { + using UT = simd_obj_impl::element_type; + static_assert(sizeof(UT) == sizeof(T)); constexpr unsigned Size = sizeof(T) * N; constexpr unsigned Align = Flags::template alignment; @@ -813,7 +828,7 @@ simd_obj_impl::copy_from(AccessorT acc, uint32_t offset, constexpr unsigned BlockN = BlockSize / sizeof(T); ForHelper::unroll([BlockN, acc, offset, this](unsigned Block) { select(Block * BlockN) = - block_load( + block_load( acc, offset + (Block * BlockSize), Flags{}); }); } @@ -821,7 +836,7 @@ simd_obj_impl::copy_from(AccessorT acc, uint32_t offset, constexpr unsigned RemN = RemSize / sizeof(T); constexpr unsigned BlockN = BlockSize / sizeof(T); select(NumBlocks * BlockN) = - block_load( + block_load( acc, offset + (NumBlocks * BlockSize), Flags{}); } } else if constexpr (sizeof(T) == 8) { @@ -834,7 +849,7 @@ simd_obj_impl::copy_from(AccessorT acc, uint32_t offset, ForHelper::unroll( [acc, offset, &Offsets, this](unsigned Block) { select(Block * ChunkSize) = - gather( + gather( acc, Offsets, offset + (Block * ChunkSize * sizeof(T))); }); } @@ -842,14 +857,14 @@ simd_obj_impl::copy_from(AccessorT acc, uint32_t offset, if constexpr (RemN > 0) { if constexpr (RemN == 1 || RemN == 8 || RemN == 16) { simd Offsets(0u, sizeof(T)); - select(NumChunks * ChunkSize) = gather( + select(NumChunks * ChunkSize) = gather( acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T))); } else { constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 
16 : 32; simd_mask_type Pred(0); Pred.template select() = 1; simd Offsets(0u, sizeof(T)); - simd Vals = gather( + simd Vals = gather( acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)), Pred); select(NumChunks * ChunkSize) = Vals.template select(); @@ -860,8 +875,10 @@ simd_obj_impl::copy_from(AccessorT acc, uint32_t offset, template template -void simd_obj_impl::copy_to(T *addr, - Flags) const SYCL_ESIMD_FUNCTION { +void simd_obj_impl::copy_to( + simd_obj_impl::element_type *Addr, + Flags) const SYCL_ESIMD_FUNCTION { + using UT = simd_obj_impl::element_type; constexpr unsigned Size = sizeof(T) * N; constexpr unsigned Align = Flags::template alignment; @@ -869,52 +886,52 @@ void simd_obj_impl::copy_to(T *addr, constexpr unsigned NumBlocks = Size / BlockSize; constexpr unsigned RemSize = Size % BlockSize; - simd Tmp = data(); + simd Tmp{data()}; if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 && detail::isPowerOf2(RemSize / OperandSize::OWORD)) { if constexpr (NumBlocks > 0) { constexpr unsigned BlockN = BlockSize / sizeof(T); - ForHelper::unroll([BlockN, addr, &Tmp](unsigned Block) { - block_store(addr + (Block * BlockN), - Tmp.template select(Block * BlockN)); + ForHelper::unroll([BlockN, Addr, &Tmp](unsigned Block) { + block_store(Addr + (Block * BlockN), + Tmp.template select(Block * BlockN)); }); } if constexpr (RemSize > 0) { constexpr unsigned RemN = RemSize / sizeof(T); constexpr unsigned BlockN = BlockSize / sizeof(T); - block_store(addr + (NumBlocks * BlockN), - Tmp.template select(NumBlocks * BlockN)); + block_store(Addr + (NumBlocks * BlockN), + Tmp.template select(NumBlocks * BlockN)); } } else if constexpr (sizeof(T) == 8) { simd BC = Tmp.template bit_cast_view(); - BC.copy_to(reinterpret_cast(addr), Flags{}); + BC.copy_to(reinterpret_cast(Addr), Flags{}); } else { constexpr unsigned NumChunks = N / ChunkSize; if constexpr (NumChunks > 0) { simd Offsets(0u, sizeof(T)); - ForHelper::unroll([addr, &Offsets, &Tmp](unsigned Block) { - scatter( - addr + (Block * ChunkSize), Offsets, + ForHelper::unroll([Addr, &Offsets, &Tmp](unsigned Block) { + scatter( + Addr + (Block * ChunkSize), Offsets, Tmp.template select(Block * ChunkSize)); }); } constexpr unsigned RemN = N % ChunkSize; if constexpr (RemN > 0) { if constexpr (RemN == 1) { - addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize]; + Addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize]; } else if constexpr (RemN == 8 || RemN == 16) { simd Offsets(0u, sizeof(T)); - scatter(addr + (NumChunks * ChunkSize), Offsets, - Tmp.template select(NumChunks * ChunkSize)); + scatter(Addr + (NumChunks * ChunkSize), Offsets, + Tmp.template select(NumChunks * ChunkSize)); } else { constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 
16 : 32; simd_mask_type Pred(0); Pred.template select() = 1; - simd Vals(0); + simd Vals; Vals.template select() = Tmp.template select(NumChunks * ChunkSize); simd Offsets(0u, sizeof(T)); - scatter(addr + (NumChunks * ChunkSize), Offsets, Vals, Pred); + scatter(Addr + (NumChunks * ChunkSize), Offsets, Vals, Pred); } } } @@ -926,6 +943,7 @@ ESIMD_INLINE EnableIfAccessor simd_obj_impl::copy_to(AccessorT acc, uint32_t offset, Flags) const SYCL_ESIMD_FUNCTION { + using UT = simd_obj_impl::element_type; constexpr unsigned Size = sizeof(T) * N; constexpr unsigned Align = Flags::template alignment; @@ -933,13 +951,14 @@ simd_obj_impl::copy_to(AccessorT acc, uint32_t offset, constexpr unsigned NumBlocks = Size / BlockSize; constexpr unsigned RemSize = Size % BlockSize; - simd Tmp = data(); + simd Tmp{data()}; + if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 && detail::isPowerOf2(RemSize / OperandSize::OWORD)) { if constexpr (NumBlocks > 0) { constexpr unsigned BlockN = BlockSize / sizeof(T); ForHelper::unroll([BlockN, acc, offset, &Tmp](unsigned Block) { - block_store( + block_store( acc, offset + (Block * BlockSize), Tmp.template select(Block * BlockN)); }); @@ -947,7 +966,7 @@ simd_obj_impl::copy_to(AccessorT acc, uint32_t offset, if constexpr (RemSize > 0) { constexpr unsigned RemN = RemSize / sizeof(T); constexpr unsigned BlockN = BlockSize / sizeof(T); - block_store( + block_store( acc, offset + (NumBlocks * BlockSize), Tmp.template select(NumBlocks * BlockN)); } @@ -960,7 +979,7 @@ simd_obj_impl::copy_to(AccessorT acc, uint32_t offset, simd Offsets(0u, sizeof(T)); ForHelper::unroll([acc, offset, &Offsets, &Tmp](unsigned Block) { - scatter( + scatter( acc, Offsets, Tmp.template select(Block * ChunkSize), offset + (Block * ChunkSize * sizeof(T))); }); @@ -969,20 +988,20 @@ simd_obj_impl::copy_to(AccessorT acc, uint32_t offset, if constexpr (RemN > 0) { if constexpr (RemN == 1 || RemN == 8 || RemN == 16) { simd Offsets(0u, sizeof(T)); - scatter( + scatter( acc, Offsets, Tmp.template select(NumChunks * ChunkSize), offset + (NumChunks * ChunkSize * sizeof(T))); } else { constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32; simd_mask_type Pred(0); Pred.template select() = 1; - simd Vals(0); + simd Vals; Vals.template select() = Tmp.template select(NumChunks * ChunkSize); simd Offsets(0u, sizeof(T)); - scatter(acc, Offsets, Vals, - offset + (NumChunks * ChunkSize * sizeof(T)), - Pred); + scatter(acc, Offsets, Vals, + offset + (NumChunks * ChunkSize * sizeof(T)), + Pred); } } } diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_view_impl.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_view_impl.hpp index d87f64e464909..d4be9820773e2 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_view_impl.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_view_impl.hpp @@ -11,7 +11,7 @@ #pragma once #include -#include +#include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { @@ -51,12 +51,13 @@ class simd_view_impl { /// The element type of this class, which could be different from the element /// type of the base object type. using element_type = typename ShapeTy::element_type; + using raw_element_type = __raw_t; /// The simd type if reading the object. using value_type = get_simd_t; /// The underlying builtin vector type backing the value read from the object. 
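// NOTE (editor's illustration, not part of this patch): the element_type vs.
// raw_element_type/raw_vector_type split introduced above, seen from user code.
// The raw type is an implementation detail (different on host and device for
// sycl::half) and is never spelled by the user; only wrapper values cross the
// API surface. Assumes the ESIMD header and namespace are in scope.
void wrapper_vs_raw_example() SYCL_ESIMD_FUNCTION { // hypothetical helper
  simd<sycl::half, 8> h(0, 1);
  static_assert(std::is_same_v<decltype(h)::element_type, sycl::half>, "");
  sycl::half x = h[3]; // read back as the wrapper type; conversion is internal
  (void)x;
}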
- using vector_type = vector_type_t; + using raw_vector_type = vector_type_t<__raw_t, length>; private: Derived &cast_this_to_derived() { return reinterpret_cast(*this); } @@ -83,7 +84,7 @@ class simd_view_impl { if constexpr (std::is_same_v) return read(); else - return convert(read()); + return convert_vector(read().data()); } /// Implicit conversion to simd_mask_impl type, if element type is compatible. @@ -117,7 +118,7 @@ class simd_view_impl { return value_type{readRegion(M_base.data(), M_region)}; } - typename value_type::vector_type data() const { return read().data(); } + typename value_type::raw_vector_type data() const { return read().data(); } /// Write to this object. Derived &write(const value_type &Val) { @@ -259,7 +260,8 @@ class simd_view_impl { #undef __ESIMD_SHIFT_OP_FILTER #define __ESIMD_ARITH_OP_FILTER \ - is_vectorizable_v &&is_vectorizable_v &&is_simd_type_v + is_valid_simd_elem_type_v &&is_valid_simd_elem_type_v \ + &&is_simd_type_v __ESIMD_DEF_SIMD_VIEW_IMPL_OPASSIGN(+, +=, __ESIMD_ARITH_OP_FILTER) __ESIMD_DEF_SIMD_VIEW_IMPL_OPASSIGN(-, -=, __ESIMD_ARITH_OP_FILTER) @@ -286,7 +288,7 @@ class simd_view_impl { template >> auto operator!() { - using MaskVecT = typename simd_mask_type::vector_type; + using MaskVecT = typename simd_mask_type::raw_vector_type; auto V = read().data() == 0; return simd_mask_type{__builtin_convertvector(V, MaskVecT) & MaskVecT(1)}; @@ -313,12 +315,13 @@ class simd_view_impl { is_simd_type_v)&&(length == SimdT::length)>> Derived &operator=(const simd_obj_impl &Other) { - return write(convert(reinterpret_cast(Other))); + return write(convert_vector( + Other.data())); } - template >> + template >> Derived &operator=(T1 RHS) { - return write(value_type((element_type)RHS)); + return write(value_type(convert_scalar(RHS))); } /// @} diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/type_format.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/type_format.hpp new file mode 100644 index 0000000000000..0af9a1a6718f2 --- /dev/null +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/type_format.hpp @@ -0,0 +1,84 @@ +//==-------------- types.hpp - DPC++ Explicit SIMD API ---------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Meta-functions to compute compile-time element type of a simd_view resulting +// from format operations. +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +__SYCL_INLINE_NAMESPACE(cl) { +namespace __SEIEED { + +template struct compute_format_type; + +template struct compute_format_type_impl { + static constexpr int Size = sizeof(Ty) * N / sizeof(EltTy); + static constexpr int Stride = 1; + using type = region1d_t; +}; + +template class SimdT> +struct compute_format_type, EltTy> + : compute_format_type_impl {}; + +template +struct compute_format_type, EltTy> { + using ShapeTy = typename shape_type::type; + static constexpr int Size = ShapeTy::Size_in_bytes / sizeof(EltTy); + static constexpr int Stride = 1; + using type = region1d_t; +}; + +template +using compute_format_type_t = typename compute_format_type::type; + +// Compute the simd_view type of a 2D format operation. 
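// NOTE (editor's illustration, not part of this patch): what these
// compute_format_type meta-functions drive at the user level - bit_cast_view
// retypes the same bits, so a 1D view has Size = sizeof(Ty) * N / sizeof(EltTy)
// elements (mirrors test_simd_format in sycl/test/esimd/simd.cpp below).
void bit_cast_view_example() SYCL_ESIMD_FUNCTION { // hypothetical helper
  simd<float, 8> v(0, 1);
  auto bytes = v.bit_cast_view<char>();       // 1D view: 8 * 4 / 1 = 32 elements
  auto mat   = v.bit_cast_view<char, 4, 8>(); // 2D view: 4 rows x 8 columns
  static_assert(decltype(bytes)::length == 32, "");
  // decltype(mat)::getSizeX() == 4 and getSizeY() == 8, cf. the test below.
}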
+template +struct compute_format_type_2d; + +template +struct compute_format_type_2d_impl { + static constexpr int Prod = sizeof(Ty) * N / sizeof(EltTy); + static_assert(Prod == Width * Height, "size mismatch"); + + static constexpr int SizeX = Width; + static constexpr int StrideX = 1; + static constexpr int SizeY = Height; + static constexpr int StrideY = 1; + using type = region2d_t; +}; + +template class SimdT> +struct compute_format_type_2d, EltTy, Height, Width> + : compute_format_type_2d_impl {}; + +template +struct compute_format_type_2d, EltTy, Height, + Width> { + using ShapeTy = typename shape_type::type; + static constexpr int Prod = ShapeTy::Size_in_bytes / sizeof(EltTy); + static_assert(Prod == Width * Height, "size mismatch"); + + static constexpr int SizeX = Width; + static constexpr int StrideX = 1; + static constexpr int SizeY = Height; + static constexpr int StrideY = 1; + using type = region2d_t; +}; + +template +using compute_format_type_2d_t = + typename compute_format_type_2d::type; + +} // namespace __SEIEED +} // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/types.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/types.hpp index 9d82cccbf1aaa..2bd5a4defb676 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/types.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/types.hpp @@ -24,6 +24,10 @@ #include +#define __SEIEED sycl::ext::intel::experimental::esimd::detail +#define __SEIEE sycl::ext::intel::experimental::esimd +#define __SEIEEED sycl::ext::intel::experimental::esimd::emu::detail + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace ext { @@ -37,9 +41,19 @@ template class simd_view; namespace detail { +namespace csd = cl::sycl::detail; + +template +using uint_type_t = std::conditional_t< + N == 1, uint8_t, + std::conditional_t< + N == 2, uint16_t, + std::conditional_t>>>; + // forward declarations of major internal simd classes template class simd_mask_impl; -template +template class simd_obj_impl; // @{ @@ -72,100 +86,6 @@ static inline constexpr bool is_clang_vector_type_v = // @} -// @{ -// Checks if given type T derives from simd_obj_impl or is equal to it. -template -struct is_simd_obj_impl_derivative : public std::false_type { - using element_type = invalid_element_type; -}; - -// Specialization for the simd_obj_impl type itself. -template -struct is_simd_obj_impl_derivative> - : public std::true_type { - using element_type = ElT; -}; - -// Specialization for all other types. -template class Derived> -struct is_simd_obj_impl_derivative> - : public std::conditional_t< - std::is_base_of_v>, - Derived>, - std::true_type, std::false_type> { - using element_type = std::conditional_t< - std::is_base_of_v>, - Derived>, - ElT, void>; -}; - -// Convenience shortcut. -template -inline constexpr bool is_simd_obj_impl_derivative_v = - is_simd_obj_impl_derivative::value; -// @} - -// @{ -// "Resizes" given simd type \c T to given number of elements \c N. -template struct resize_a_simd_type; - -// Specialization for the simd_obj_impl type. -template class SimdT> -struct resize_a_simd_type>, Ndst> { - using type = simd_obj_impl>; -}; - -// Specialization for the simd_obj_impl type derivatives. -template class SimdT> -struct resize_a_simd_type, Ndst> { - using type = SimdT; -}; - -// Convenience shortcut. 
-template -using resize_a_simd_type_t = typename resize_a_simd_type::type; -// @} - -// @{ -// Converts element type of given simd type \c SimdT to -// given scalar type \c DstElemT. -template struct convert_simd_elem_type; - -// Specialization for the simd_obj_impl type. -template class SimdT> -struct convert_simd_elem_type>, - DstElemT> { - using type = simd_obj_impl>; -}; - -// Specialization for the simd_obj_impl type derivatives. -template class SimdT> -struct convert_simd_elem_type, DstElemT> { - using type = SimdT; -}; - -// Convenience shortcut. -template -using convert_simd_elem_type_t = - typename convert_simd_elem_type::type; - -// @} - -// Constructs a simd type with the same template type as in \c SimdT, and -// given element type and number. -template -using construct_a_simd_type_t = - convert_simd_elem_type_t, ElT>; - -// @} - -namespace csd = cl::sycl::detail; -using half = cl::sycl::detail::half_impl::StorageT; - template using remove_cvref_t = csd::remove_cv_t>; @@ -198,8 +118,15 @@ struct is_vectorizable : std::conditional_t, template static inline constexpr bool is_vectorizable_v = is_vectorizable::value; -// vector_type, using clang vector type extension. -template struct vector_type { +template +static inline constexpr bool is_esimd_scalar_v = + cl::sycl::detail::is_arithmetic::value; + +template +using is_esimd_scalar = typename std::bool_constant>; + +// raw_vector_type, using clang vector type extension. +template struct raw_vector_type { static_assert(!std::is_const::value, "const element type not supported"); static_assert(is_vectorizable_v, "element type not supported"); static_assert(N > 0, "zero-element vector not supported"); @@ -209,85 +136,96 @@ template struct vector_type { }; template -using vector_type_t = typename vector_type::type; +using vector_type_t = typename raw_vector_type::type; -// must match simd_mask::element_type -template -using simd_mask_storage_t = vector_type_t; +// @{ +// Checks if given type T derives from simd_obj_impl or is equal to it. +template +struct is_simd_obj_impl_derivative : public std::false_type {}; -// Compute the simd_view type of a 1D format operation. -template struct compute_format_type; +// Specialization for the simd_obj_impl type itself. +template +struct is_simd_obj_impl_derivative> + : public std::true_type {}; -template struct compute_format_type_impl { - static constexpr int Size = sizeof(Ty) * N / sizeof(EltTy); - static constexpr int Stride = 1; - using type = region1d_t; -}; +template struct element_type_traits; +template +using __raw_t = typename __SEIEED::element_type_traits::RawT; -template class SimdT> -struct compute_format_type, EltTy> - : compute_format_type_impl {}; - -template -struct compute_format_type, EltTy> - : compute_format_type_impl {}; - -template -struct compute_format_type, EltTy> { - using ShapeTy = typename shape_type::type; - static constexpr int Size = ShapeTy::Size_in_bytes / sizeof(EltTy); - static constexpr int Stride = 1; - using type = region1d_t; +// Specialization for all other types. +template class Derived> +struct is_simd_obj_impl_derivative> + : public std::conditional_t< + std::is_base_of_v, N, Derived>, + Derived>, + std::true_type, std::false_type> {}; + +// Convenience shortcut. +template +inline constexpr bool is_simd_obj_impl_derivative_v = + is_simd_obj_impl_derivative::value; +// @} + +// @{ +// "Resizes" given simd type \c T to given number of elements \c N. +template struct resize_a_simd_type; + +// Specialization for the simd_obj_impl type. 
+template class SimdT> +struct resize_a_simd_type, Nsrc, SimdT>, + Ndst> { + using type = simd_obj_impl<__raw_t, Ndst, SimdT>; }; -template -using compute_format_type_t = typename compute_format_type::type; +// Specialization for the simd_obj_impl type derivatives. +template class SimdT> +struct resize_a_simd_type, Ndst> { + using type = SimdT; +}; -// Compute the simd_view type of a 2D format operation. -template -struct compute_format_type_2d; +// Convenience shortcut. +template +using resize_a_simd_type_t = typename resize_a_simd_type::type; +// @} -template -struct compute_format_type_2d_impl { - static constexpr int Prod = sizeof(Ty) * N / sizeof(EltTy); - static_assert(Prod == Width * Height, "size mismatch"); +// @{ +// Converts element type of given simd type \c SimdT to +// given scalar type \c NewElemT. +template struct convert_simd_elem_type; - static constexpr int SizeX = Width; - static constexpr int StrideX = 1; - static constexpr int SizeY = Height; - static constexpr int StrideY = 1; - using type = region2d_t; +// Specialization for the simd_obj_impl type. +template class SimdT> +struct convert_simd_elem_type< + simd_obj_impl<__raw_t, N, SimdT>, NewElemT> { + using type = simd_obj_impl<__raw_t, N, SimdT>; }; -template class SimdT> -struct compute_format_type_2d, EltTy, Height, Width> - : compute_format_type_2d_impl {}; - -template -struct compute_format_type_2d, EltTy, Height, Width> - : compute_format_type_2d_impl {}; - -template -struct compute_format_type_2d, EltTy, Height, - Width> { - using ShapeTy = typename shape_type::type; - static constexpr int Prod = ShapeTy::Size_in_bytes / sizeof(EltTy); - static_assert(Prod == Width * Height, "size mismatch"); - - static constexpr int SizeX = Width; - static constexpr int StrideX = 1; - static constexpr int SizeY = Height; - static constexpr int StrideY = 1; - using type = region2d_t; +struct convert_simd_elem_type, NewElemT> { + using type = SimdT; }; -template -using compute_format_type_2d_t = - typename compute_format_type_2d::type; +// Convenience shortcut. +template +using convert_simd_elem_type_t = + typename convert_simd_elem_type::type; + +// @} + +// Constructs a simd type with the same template type as in \c SimdT, and +// given element type and number. +template +using construct_a_simd_type_t = + convert_simd_elem_type_t, T>; + +// @} + +// must match simd_mask::element_type +template +using simd_mask_storage_t = vector_type_t; // @{ // Checks if given type is a view of any simd type (simd or simd_mask). 
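// NOTE (editor's illustration, not part of this patch): behavior of the detail
// meta-functions above. They are internal helpers, shown only to illustrate
// that both preserve the concrete simd template while changing the length or
// the element type; construct_a_simd_type_t combines the two steps.
static_assert(std::is_same_v<detail::resize_a_simd_type_t<simd<float, 8>, 16>,
                             simd<float, 16>>, "");
static_assert(std::is_same_v<detail::convert_simd_elem_type_t<simd<float, 8>, int>,
                             simd<int, 8>>, "");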
@@ -371,56 +309,37 @@ struct element_type>> { using type = typename is_clang_vector_type::element_type; }; -// @} - -// @{ -// Get computation type of a binary operator given its operand types: -// - if both types are arithmetic - return CPP's "common real type" of the -// computation (matches C++) -// - if both types are simd types, they must be of the same length N, -// and the returned type is simd, where N is the "common real type" of -// the element type of the operands (diverges from clang) -// - otherwise, one type is simd and another is arithmetic - the simd type is -// returned (matches clang) - -struct invalid_computation_type; - -template struct computation_type { - using type = invalid_computation_type; -}; +template using element_type_t = typename element_type::type; -template -struct computation_type< - T1, T2, std::enable_if_t && is_vectorizable_v>> { - using type = decltype(std::declval() + std::declval()); +// Determine element type of simd_obj_impl's Derived type w/o having to have +// complete instantiation of the Derived type (is required by element_type_t, +// hence can't be used here). +template struct simd_like_obj_info; +template struct simd_like_obj_info> { + using type = T; + static inline constexpr int length = N; }; - -template -struct computation_type< - T1, T2, - std::enable_if_t && is_simd_like_type_v>> { -private: - using Ty1 = typename element_type::type; - using Ty2 = typename element_type::type; - using EltTy = typename computation_type::type; - static constexpr int N1 = T1::length; - static constexpr int N2 = T2::length; - static_assert(N1 == N2, "size mismatch"); - -public: - using type = simd; +template struct simd_like_obj_info> { + using type = simd_mask_elem_type; // equals T + static inline constexpr int length = N; }; -template -using computation_type_t = - typename computation_type, remove_cvref_t>::type; +template +using simd_like_obj_element_type_t = typename simd_like_obj_info::type; +template +static inline constexpr int simd_like_obj_length = + simd_like_obj_info::length; // @} template std::enable_if_t && is_clang_vector_type_v, To> -convert(From Val) { - return __builtin_convertvector(Val, To); + ESIMD_INLINE convert(From Val) { + if constexpr (std::is_same_v) { + return Val; + } else { + return __builtin_convertvector(Val, To); + } } /// Base case for checking if a type U is one of the types. @@ -462,18 +381,6 @@ bitcast(vector_type_t Val) { return reinterpret_cast(Val); } -inline std::ostream &operator<<(std::ostream &O, half const &rhs) { - O << static_cast(rhs); - return O; -} - -inline std::istream &operator>>(std::istream &I, half &rhs) { - float ValFloat = 0.0f; - I >> ValFloat; - rhs = ValFloat; - return I; -} - } // namespace detail // Alias for backward compatibility. 
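// NOTE (editor's illustration, not part of this patch): uint_type_t<N>, defined
// near the top of this header, selects an unsigned integer type by byte size;
// the sub-dword gather/scatter paths in memory.hpp use it to reinterpret small
// wrapper types before promotion. The 4- and 8-byte cases are assumed to map to
// uint32_t / uint64_t. Requires <cstdint>.
static_assert(std::is_same_v<detail::uint_type_t<1>, uint8_t>, "");
static_assert(std::is_same_v<detail::uint_type_t<2>, uint16_t>, "");
static_assert(std::is_same_v<detail::uint_type_t<4>, uint32_t>, ""); // assumed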
diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/util.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/util.hpp index 12d25e1dbd9af..5481d100c92a7 100755 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/util.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/util.hpp @@ -15,10 +15,6 @@ #include -#define __SEIEED sycl::ext::intel::experimental::esimd::detail -#define __SEIEE sycl::ext::intel::experimental::esimd -#define __SEIEEED sycl::ext::intel::experimental::esimd::emu::detail - #ifdef __SYCL_DEVICE_ONLY__ #define __ESIMD_INTRIN SYCL_EXTERNAL SYCL_ESIMD_FUNCTION #else @@ -92,10 +88,6 @@ template struct is_esimd_vector : public std::false_type {}; template struct is_esimd_vector> : public std::true_type {}; -template -using is_esimd_scalar = - typename std::bool_constant::value>; - template using is_hw_int_type = typename std::bool_constant && (sizeof(T) == N)>; @@ -119,7 +111,7 @@ using is_fp_or_dword_type = /// Convert types into vector types template struct simd_type { using type = simd; }; -template struct simd_type> { +template struct simd_type> { using type = simd; }; diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index 4b3a74db9e3c5..5c30ff6df7a83 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -137,13 +137,14 @@ __ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc) { // /// Flat-address gather. /// \ingroup sycl_esimd -template +template > __ESIMD_API std::enable_if_t<((n == 8 || n == 16 || n == 32) && (ElemsPerAddr == 1 || ElemsPerAddr == 2 || ElemsPerAddr == 4)), - simd> -gather(const T *p, simd offsets, simd_mask pred = 1) { + simd> +gather(const Tx *p, simd offsets, simd_mask pred = 1) { detail::IfNotNone::warn(); simd offsets_i = convert(offsets); simd addrs(reinterpret_cast(p)); @@ -177,11 +178,12 @@ gather(const T *p, simd offsets, simd_mask pred = 1) { /// Flat-address scatter. /// \ingroup sycl_esimd -template +template > __ESIMD_API std::enable_if_t<((n == 8 || n == 16 || n == 32) && (ElemsPerAddr == 1 || ElemsPerAddr == 2 || ElemsPerAddr == 4))> -scatter(T *p, simd offsets, simd vals, +scatter(Tx *p, simd offsets, simd vals, simd_mask pred = 1) { simd offsets_i = convert(offsets); simd addrs(reinterpret_cast(p)); @@ -231,10 +233,11 @@ __ESIMD_API std::enable_if_t<((n == 8 || n == 16 || n == 32) && /// Flat-address block-load. /// \ingroup sycl_esimd -template , typename = std::enable_if_t>> -__ESIMD_API simd block_load(const T *addr, Flags = {}) { +__ESIMD_API simd block_load(const Tx *addr, Flags = {}) { detail::IfNotNone::warn(); constexpr unsigned Sz = sizeof(T) * n; static_assert(Sz >= detail::OperandSize::OWORD, @@ -257,10 +260,12 @@ __ESIMD_API simd block_load(const T *addr, Flags = {}) { /// Accessor-based block-load. /// \ingroup sycl_esimd -template >> -__ESIMD_API simd block_load(AccessorTy acc, uint32_t offset, Flags = {}) { + typename = std::enable_if_t>, + class T = detail::__raw_t> +__ESIMD_API simd block_load(AccessorTy acc, uint32_t offset, + Flags = {}) { constexpr unsigned Sz = sizeof(T) * n; static_assert(Sz >= detail::OperandSize::OWORD, "block size must be at least 1 oword"); @@ -295,9 +300,9 @@ __ESIMD_API simd block_load(AccessorTy acc, uint32_t offset, Flags = {}) { /// Flat-address block-store. /// \ingroup sycl_esimd // TODO the above note about cache hints applies to this API as well. 
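// NOTE (editor's illustration, not part of this patch): with the Tx /
// detail::__raw_t redirection above, the USM memory API accepts wrapper element
// types such as sycl::half - the public signatures keep the user-visible type
// while the intrinsics operate on the raw type. `src`/`dst` are hypothetical,
// suitably aligned sycl::half* USM pointers; offsets are byte offsets.
void wrapper_memory_example(const sycl::half *src, sycl::half *dst) SYCL_ESIMD_FUNCTION {
  simd<uint32_t, 16> byte_offs(0, sizeof(sycl::half));             // 0, 2, 4, ...
  simd<sycl::half, 16> g = gather<sycl::half, 16>(src, byte_offs); // element loads
  simd<sycl::half, 16> b = block_load<sycl::half, 16>(src);        // contiguous load
  block_store<sycl::half, 16>(dst, b);
  scatter<sycl::half, 16>(dst, byte_offs, g);
}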
-template -__ESIMD_API void block_store(T *p, simd vals) { +template > +__ESIMD_API void block_store(Tx *p, simd vals) { detail::IfNotNone::warn(); constexpr unsigned Sz = sizeof(T) * n; static_assert(Sz >= detail::OperandSize::OWORD, @@ -315,8 +320,10 @@ __ESIMD_API void block_store(T *p, simd vals) { /// Accessor-based block-store. /// \ingroup sycl_esimd -template -__ESIMD_API void block_store(AccessorTy acc, uint32_t offset, simd vals) { +template > +__ESIMD_API void block_store(AccessorTy acc, uint32_t offset, + simd vals) { constexpr unsigned Sz = sizeof(T) * n; static_assert(Sz >= detail::OperandSize::OWORD, "block size must be at least 1 oword"); @@ -352,12 +359,14 @@ ESIMD_INLINE const auto si = __ESIMD_GET_SURF_HANDLE(acc); if constexpr (sizeof(T) < 4) { - static_assert(std::is_integral::value, - "only integral 1- & 2-byte types are supported"); + using Tint = std::conditional_t, T, + detail::uint_type_t>; + using Treal = __raw_t; + simd vals_int = bitcast(std::move(vals).data()); using PromoT = - typename sycl::detail::conditional_t::value, int32_t, - uint32_t>; - const simd promo_vals = convert(vals); + typename sycl::detail::conditional_t::value, + int32_t, uint32_t>; + const simd promo_vals = convert(std::move(vals_int)); __esimd_scatter_scaled( pred.data(), si, glob_offset, offsets.data(), promo_vals.data()); } else { @@ -380,16 +389,25 @@ gather_impl(AccessorTy acc, simd offsets, uint32_t glob_offset, const auto si = get_surface_index(acc); if constexpr (sizeof(T) < 4) { - static_assert(std::is_integral::value, + using Tint = std::conditional_t, T, + detail::uint_type_t>; + using Treal = __raw_t; + static_assert(std::is_integral::value, "only integral 1- & 2-byte types are supported"); using PromoT = - typename sycl::detail::conditional_t::value, int32_t, - uint32_t>; + typename sycl::detail::conditional_t::value, + int32_t, uint32_t>; const simd promo_vals = __esimd_gather_masked_scaled2(si, glob_offset, offsets.data(), pred.data()); - return convert(promo_vals); + auto Res = convert(promo_vals); + + if constexpr (!std::is_same_v) { + return detail::bitcast(Res.data()); + } else { + return Res; + } } else { return __esimd_gather_masked_scaled2(si, glob_offset, offsets.data(), @@ -537,10 +555,11 @@ __ESIMD_API void scalar_store1(AccessorTy acc, uint32_t offset, T val) { /// @param offsets byte-offsets within the \p buffer to be gathered. /// @param pred predication control used for masking lanes. /// \ingroup sycl_esimd -template +template > __ESIMD_API std::enable_if_t<(N == 16 || N == 32) && (sizeof(T) == 4), - simd> -gather_rgba(const T *p, simd offsets, simd_mask pred = 1) { + simd> +gather_rgba(const Tx *p, simd offsets, simd_mask pred = 1) { simd offsets_i = convert(offsets); simd addrs(reinterpret_cast(p)); @@ -576,10 +595,11 @@ __ESIMD_API std::enable_if_t< /// @param offsets byte-offsets within the \p buffer to be written. /// @param pred predication control used for masking lanes. 
/// \ingroup sycl_esimd -template +template > __ESIMD_API std::enable_if_t<(N == 16 || N == 32) && (sizeof(T) == 4)> -scatter_rgba(T *p, simd offsets, - simd vals, +scatter_rgba(Tx *p, simd offsets, + simd vals, simd_mask pred = 1) { simd offsets_i = convert(offsets); simd addrs(reinterpret_cast(p)); @@ -656,8 +676,8 @@ constexpr bool check_atomic() { static_assert(NumSrc == 1, "One source operand is expected"); return false; } - if constexpr (!is_type()) { - static_assert((is_type()), + if constexpr (!is_type()) { + static_assert((is_type()), "Type F or HF is expected"); return false; } @@ -676,9 +696,8 @@ constexpr bool check_atomic() { "Type UW, UD or UQ is expected"); return false; } - if constexpr (Op == atomic_op::fcmpwr && - !is_type()) { - static_assert((is_type()), + if constexpr (Op == atomic_op::fcmpwr && !is_type()) { + static_assert((is_type()), "Type F or HF is expected"); return false; } @@ -699,9 +718,9 @@ constexpr bool check_atomic() { /// USM address atomic update, version with no source operands: \c inc and \c /// dec. \ingroup sycl_esimd -template -__ESIMD_API std::enable_if_t(), simd> -atomic_update(T *p, simd offset, simd_mask pred) { +template > +__ESIMD_API std::enable_if_t(), simd> +atomic_update(Tx *p, simd offset, simd_mask pred) { simd vAddr(reinterpret_cast(p)); simd offset_i1 = convert(offset); vAddr += offset_i1; @@ -721,9 +740,9 @@ __ESIMD_API std::enable_if_t(), /// USM address atomic update, version with one source operand: e.g. \c add, \c /// sub. \ingroup sycl_esimd -template -__ESIMD_API std::enable_if_t(), simd> -atomic_update(T *p, simd offset, simd src0, +template > +__ESIMD_API std::enable_if_t(), simd> +atomic_update(Tx *p, simd offset, simd src0, simd_mask pred) { simd vAddr(reinterpret_cast(p)); simd offset_i1 = convert(offset); @@ -745,10 +764,10 @@ __ESIMD_API std::enable_if_t(), /// USM address atomic update, version with two source operands: e.g. \c /// cmpxchg. \ingroup sycl_esimd -template -__ESIMD_API std::enable_if_t(), simd> -atomic_update(T *p, simd offset, simd src0, simd src1, - simd_mask pred) { +template > +__ESIMD_API std::enable_if_t(), simd> +atomic_update(Tx *p, simd offset, simd src0, + simd src1, simd_mask pred) { simd vAddr(reinterpret_cast(p)); simd offset_i1 = convert(offset); vAddr += offset_i1; @@ -972,7 +991,7 @@ __ESIMD_API simd slm_block_load(uint32_t offset) { "block size must be at most 16 owords"); const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); - return __esimd_oword_ld(si, offset >> 4); + return __esimd_oword_ld, n>(si, offset >> 4); } /// SLM block-store. @@ -987,15 +1006,14 @@ __ESIMD_API void slm_block_store(uint32_t offset, simd vals) { "block must be 1, 2, 4 or 8 owords long"); static_assert(Sz <= 8 * detail::OperandSize::OWORD, "block size must be at most 8 owords"); - const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); // offset in genx.oword.st is in owords - __esimd_oword_st(si, offset >> 4, vals.data()); + __esimd_oword_st, n>(si, offset >> 4, vals.data()); } /// SLM atomic update operation, no source operands: \c inc and \c dec. -template -__ESIMD_API std::enable_if_t(), simd> +template > +__ESIMD_API std::enable_if_t(), simd> slm_atomic_update(simd offsets, simd_mask pred) { const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); return __esimd_dword_atomic0(pred.data(), si, offsets.data()); @@ -1010,9 +1028,9 @@ __ESIMD_API std::enable_if_t(), } /// SLM atomic update operation, one source operand: e.g. \c add, \c sub. 
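// NOTE (editor's illustration, not part of this patch): the three flat
// atomic_update forms above (zero, one and two source operands), following the
// usage in sycl/test/esimd/flat_atomic.cpp. `ptr` is a hypothetical uint32_t*
// USM pointer; offsets are byte offsets (assumption) and the trailing `1` is an
// all-enabled simd_mask.
void atomic_update_example(uint32_t *ptr) SYCL_ESIMD_FUNCTION { // hypothetical helper
  simd<unsigned, 8> offs(0, sizeof(uint32_t));
  simd<uint32_t, 8> v1(0, 1);
  auto old0 = atomic_update<atomic_op::inc>(ptr, offs, 1);               // no source
  auto old1 = atomic_update<atomic_op::add>(ptr, offs, v1, 1);           // one source
  auto old2 = atomic_update<atomic_op::cmpxchg>(ptr, offs, v1, old1, 1); // two sources
  (void)old0; (void)old2;
}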
-template -__ESIMD_API std::enable_if_t(), simd> -slm_atomic_update(simd offsets, simd src0, +template > +__ESIMD_API std::enable_if_t(), simd> +slm_atomic_update(simd offsets, simd src0, simd_mask pred) { const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); return __esimd_dword_atomic1(pred.data(), si, offsets.data(), @@ -1029,9 +1047,9 @@ __ESIMD_API std::enable_if_t(), } /// SLM atomic, two source operands. -template -__ESIMD_API std::enable_if_t(), simd> -slm_atomic_update(simd offsets, simd src0, simd src1, +template > +__ESIMD_API std::enable_if_t(), simd> +slm_atomic_update(simd offsets, simd src0, simd src1, simd_mask pred) { const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); return __esimd_dword_atomic2(pred.data(), si, offsets.data(), diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/simd.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/simd.hpp index 1a1bd8098a2e2..26803ad39f4b3 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/simd.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/simd.hpp @@ -39,15 +39,16 @@ namespace esimd { /// /// \ingroup sycl_esimd template -class simd - : public detail::simd_obj_impl< - Ty, N, simd, std::enable_if_t>> { - using base_type = detail::simd_obj_impl>; +class simd : public detail::simd_obj_impl< + detail::__raw_t, N, simd, + std::enable_if_t>> { + using base_type = detail::simd_obj_impl, N, simd>; public: using base_type::base_type; - using element_type = typename base_type::element_type; - using vector_type = typename base_type::vector_type; + using element_type = Ty; + using raw_element_type = typename base_type::raw_element_type; + using raw_vector_type = typename base_type::raw_vector_type; static constexpr int length = N; // Implicit conversion constructor from another simd object of the same @@ -56,24 +57,25 @@ class simd class = std::enable_if_t<__SEIEED::is_simd_type_v && (length == SimdT::length)>> simd(const SimdT &RHS) - : base_type(__builtin_convertvector(RHS.data(), vector_type)) { + : base_type(detail::convert_vector, N>( + RHS.data())) { __esimd_dbg_print(simd(const SimdT &RHS)); } // Broadcast constructor with conversion. template >> - simd(T1 Val) : base_type((Ty)Val) { + class = std::enable_if_t>> + simd(T1 Val) : base_type(Val) { __esimd_dbg_print(simd(T1 Val)); } - /// Explicit conversion for simd_obj_impl into T. + /// Type conversion for simd into T. 
template >> + class = sycl::detail::enable_if_t< + (T::length == 1) && detail::is_valid_simd_elem_type_v>> operator To() const { - __esimd_dbg_print(explicit operator To()); - return (To)base_type::data()[0]; + __esimd_dbg_print(operator To()); + return detail::convert_scalar(base_type::data()[0]); } /// @{ @@ -101,15 +103,16 @@ class simd } /// @} -#define __ESIMD_DEF_SIMD_ARITH_UNARY_OP(ARITH_UNARY_OP) \ +#define __ESIMD_DEF_SIMD_ARITH_UNARY_OP(ARITH_UNARY_OP, ID) \ template simd operator ARITH_UNARY_OP() const { \ static_assert(!std::is_unsigned_v, \ #ARITH_UNARY_OP "doesn't apply to unsigned types"); \ - return simd(ARITH_UNARY_OP(base_type::data())); \ + return simd{detail::vector_unary_op( \ + base_type::data())}; \ } - __ESIMD_DEF_SIMD_ARITH_UNARY_OP(-) - __ESIMD_DEF_SIMD_ARITH_UNARY_OP(+) + __ESIMD_DEF_SIMD_ARITH_UNARY_OP(-, minus) + __ESIMD_DEF_SIMD_ARITH_UNARY_OP(+, plus) #undef __ESIMD_DEF_SIMD_ARITH_UNARY_OP }; @@ -120,7 +123,8 @@ ESIMD_INLINE simd convert(const simd &val) { if constexpr (std::is_same_v) return val; else - return __builtin_convertvector(val.data(), detail::vector_type_t); + return detail::convert_vector(val.data()); + ; } #undef __ESIMD_DEF_RELOP diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/simd_view.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/simd_view.hpp index 9000f8fee80f7..ac19f1339a9bf 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/simd_view.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/simd_view.hpp @@ -58,7 +58,8 @@ class simd_view : public detail::simd_view_impl { using value_type = get_simd_t; /// The underlying builtin value type - using vector_type = detail::vector_type_t; + using raw_vector_type = + detail::vector_type_t, length>; protected: /// @{ @@ -93,13 +94,15 @@ class simd_view : public detail::simd_view_impl { } \ \ /* simd_view RELOP SCALAR */ \ - template >> \ + template >> \ ESIMD_INLINE friend bool operator RELOP(const simd_view &X, T1 Y) { \ return (element_type)X RELOP Y; \ } \ \ /* SCALAR RELOP simd_view */ \ - template >> \ + template >> \ ESIMD_INLINE friend bool operator RELOP(T1 X, const simd_view &Y) { \ return X RELOP(element_type) Y; \ } @@ -116,21 +119,22 @@ class simd_view : public detail::simd_view_impl { /// bool b = v[0] > v[1] && v[2] < 42; /// /// \ingroup sycl_esimd -template -class simd_view> - : public detail::simd_view_impl> { +template +class simd_view> + : public detail::simd_view_impl> { template friend class detail::simd_obj_impl; template friend class detail::simd_view_impl; public: - using RegionTy = region1d_scalar_t; + using RegionTy = region1d_scalar_t; using BaseClass = detail::simd_view_impl; using ShapeTy = typename shape_type::type; static constexpr int length = ShapeTy::Size_x * ShapeTy::Size_y; static_assert(1 == length, "length of this view is not equal to 1"); + static_assert(std::is_same_v); /// The element type of this class, which could be different from the element /// type of the base object type. 
- using element_type = typename ShapeTy::element_type; + using element_type = ViewedElemT; using base_type = BaseTy; template using get_simd_t = typename BaseClass::template get_simd_t; @@ -146,8 +150,8 @@ class simd_view> simd_view(BaseTy &Base) : BaseClass(Base) {} operator element_type() const { - const auto v = BaseClass::read(); - return v[0]; + const auto v = BaseClass::read().data(); + return detail::bitcast_to_wrapper_type(std::move(v)[0]); } using BaseClass::operator--; @@ -170,22 +174,23 @@ class simd_view> /// simd v = 1; /// auto v1 = v.select<2, 1>(0); /// auto v2 = v1[0]; // simd_view of a nested region for a single element -template -class simd_view, NestedRegion>> +template +class simd_view, NestedRegion>> : public detail::simd_view_impl< - BaseTy, std::pair, NestedRegion>> { + BaseTy, std::pair, NestedRegion>> { template friend class simd; template friend class detail::simd_view_impl; public: - using RegionTy = std::pair, NestedRegion>; + using RegionTy = std::pair, NestedRegion>; using BaseClass = detail::simd_view_impl; using ShapeTy = typename shape_type::type; static constexpr int length = ShapeTy::Size_x * ShapeTy::Size_y; static_assert(1 == length, "length of this view is not equal to 1"); + static_assert(std::is_same_v); /// The element type of this class, which could be different from the element /// type of the base object type. - using element_type = T; + using element_type = ViewedElemT; private: simd_view(BaseTy &Base, RegionTy Region) : BaseClass(Base, Region) {} @@ -196,7 +201,7 @@ class simd_view, NestedRegion>> operator element_type() const { const auto v = BaseClass::read(); - return v[0]; + return detail::convert_scalar(v[0]); } __ESIMD_DEF_SCALAR_SIMD_VIEW_RELOP(>) diff --git a/sycl/test/esimd/flat_atomic.cpp b/sycl/test/esimd/flat_atomic.cpp index dcda882214641..fc4961fd55f1e 100644 --- a/sycl/test/esimd/flat_atomic.cpp +++ b/sycl/test/esimd/flat_atomic.cpp @@ -27,12 +27,14 @@ void kernel1(uint32_t *ptr) SYCL_ESIMD_FUNCTION { flat_atomic(ptr, offsets, v1, 1); } -void kernel2(uint32_t *ptr) SYCL_ESIMD_FUNCTION { +template void kernel2(T *ptr) SYCL_ESIMD_FUNCTION { simd offsets(0, 1); - simd v1(0, 1); + simd v1(0, 1); - atomic_update(ptr, offsets, v1, v1, 1); + atomic_update(ptr, offsets, v1, v1, 1); // deprecated form: - flat_atomic(ptr, offsets, v1, - v1, 1); + flat_atomic(ptr, offsets, v1, v1, + 1); } + +template void kernel2(uint32_t *) SYCL_ESIMD_FUNCTION; diff --git a/sycl/test/esimd/intrins_trans.cpp b/sycl/test/esimd/intrins_trans.cpp index 974f4b74c1c8e..4663bf44c4bc0 100644 --- a/sycl/test/esimd/intrins_trans.cpp +++ b/sycl/test/esimd/intrins_trans.cpp @@ -179,9 +179,9 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd foo() { // level of testing strength // 2. Test cases above should be refactored not to use user-level APIs like // gather and use __esimd* calls instead. 
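// NOTE (editor's illustration, not part of this patch): what the scalar
// simd_view specializations above provide for wrapper element types; the
// comparison expression is the one quoted in the class comment. Assumes the
// ESIMD header and namespace are in scope.
void scalar_view_example() SYCL_ESIMD_FUNCTION { // hypothetical helper
  simd<sycl::half, 8> v(0, 1);
  sycl::half x = v[3];               // operator element_type(): raw bits -> wrapper value
  bool b = v[0] > v[1] && v[2] < 42; // scalar RELOPs against views and plain scalars
  (void)x; (void)b;
}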
-template using vec = typename simd::vector_type; +template using vec = typename simd::raw_vector_type; -template using mask = typename simd_mask::vector_type; +template using mask = typename simd_mask::raw_vector_type; SYCL_EXTERNAL void use(const vec &x) SYCL_ESIMD_FUNCTION; SYCL_EXTERNAL void use(const vec &x) SYCL_ESIMD_FUNCTION; diff --git a/sycl/test/esimd/simd.cpp b/sycl/test/esimd/simd.cpp index a38b732751312..0c9afd2c4fa05 100644 --- a/sycl/test/esimd/simd.cpp +++ b/sycl/test/esimd/simd.cpp @@ -7,103 +7,135 @@ using namespace sycl::ext::intel::experimental::esimd; -bool test_simd_ctors() SYCL_ESIMD_FUNCTION { - simd v0 = 1; - simd v1(v0); - simd v2(simd(0, 1)); - const simd v3{0, 2, 4, 6, 1, 3, 5, 7}; +template bool test_simd_ctors() SYCL_ESIMD_FUNCTION { + simd v0 = 1; + simd v1(v0); + simd v2(simd(0, 1)); + const simd v3{0, 2, 4, 6, 1, 3, 5, 7}; return v0[0] + v1[1] + v2[2] + v3[3] == 1 + 1 + 2 + 6; } -void test_simd_class_traits() SYCL_ESIMD_FUNCTION { - static_assert(std::is_default_constructible>::value, +template bool test_simd_ctors() SYCL_ESIMD_FUNCTION; +template bool test_simd_ctors() SYCL_ESIMD_FUNCTION; + +template void test_simd_class_traits() SYCL_ESIMD_FUNCTION { + static_assert(std::is_default_constructible>::value, "type trait mismatch"); - static_assert(std::is_trivially_default_constructible>::value, + static_assert(std::is_trivially_default_constructible>::value, "type trait mismatch"); - static_assert(std::is_copy_constructible>::value, + static_assert(std::is_copy_constructible>::value, "type trait mismatch"); - static_assert(!std::is_trivially_copy_constructible>::value, + static_assert(!std::is_trivially_copy_constructible>::value, "type trait mismatch"); - static_assert(std::is_move_constructible>::value, + static_assert(std::is_move_constructible>::value, "type trait mismatch"); - static_assert(!std::is_trivially_move_constructible>::value, + static_assert(!std::is_trivially_move_constructible>::value, "type trait mismatch"); - static_assert(std::is_copy_assignable>::value, + static_assert(std::is_copy_assignable>::value, "type trait mismatch"); - static_assert(std::is_trivially_copy_assignable>::value, + static_assert(std::is_trivially_copy_assignable>::value, "type trait mismatch"); - static_assert(std::is_move_assignable>::value, + static_assert(std::is_move_assignable>::value, "type trait mismatch"); - static_assert(std::is_trivially_move_assignable>::value, + static_assert(std::is_trivially_move_assignable>::value, "type trait mismatch"); } +template void test_simd_class_traits() SYCL_ESIMD_FUNCTION; +template void test_simd_class_traits() SYCL_ESIMD_FUNCTION; + void test_conversion() SYCL_ESIMD_FUNCTION { simd v = 3; simd f = v; simd c = f; - simd c1 = f.select<16, 1>(0); - c.select<32, 1>(0) = f; + simd h = c; + simd c1 = h.template select<16, 1>(0); + c.template select<32, 1>(0) = f; + h.template select<7, 1>(3) = + v.template select<22, 1>(0).template select<7, 3>(1); f = v + static_cast>(c); } -bool test_1d_select() SYCL_ESIMD_FUNCTION { - simd v = 0; - v.select<8, 1>(0) = 1; - v.select<8, 1>(8) = 2; - v.select<8, 1>(16) = 3; - v.select<8, 1>(24) = 4; +template bool test_1d_select() SYCL_ESIMD_FUNCTION { + simd v = 0; + v.template select<8, 1>(0) = 1; + v.template select<8, 1>(8) = 2; + v.template select<8, 1>(16) = 3; + v.template select<8, 1>(24) = 4; return v[0] + v[8] + v[16] + v[24] == (1 + 2 + 3 + 4); } +template bool test_1d_select() SYCL_ESIMD_FUNCTION; +template bool test_1d_select() SYCL_ESIMD_FUNCTION; + +template bool 
test_simd_format() SYCL_ESIMD_FUNCTION { - simd v{0, 1, 2, 3, 4, 5, 6, 7}; - auto ref1 = v.bit_cast_view(); - auto ref2 = v.bit_cast_view(); - auto ref3 = v.bit_cast_view(); + simd v{0, 1, 2, 3, 4, 5, 6, 7}; + auto ref1 = v.template bit_cast_view(); + auto ref2 = v.template bit_cast_view(); + auto ref3 = v.template bit_cast_view(); return (decltype(ref1)::length == 32) && (decltype(ref2)::length == 8) && (decltype(ref3)::getSizeX() == 4) && (decltype(ref3)::getSizeY() == 8); } -bool test_simd_select(int a) SYCL_ESIMD_FUNCTION { +template bool test_simd_format() SYCL_ESIMD_FUNCTION; +template bool +test_simd_format() SYCL_ESIMD_FUNCTION; + +template bool test_simd_select(T1 a) SYCL_ESIMD_FUNCTION { { - simd f = a; - simd c1 = 2; - c1.select<16, 1>(0) = f.select<16, 1>(0); - c1.select<16, 1>(0).select<16, 1>(0) = f.select<16, 1>(0).select<16, 1>(0); + simd f = a; + simd c1 = 2; + c1.template select<16, 1>(0) = f.template select<16, 1>(0); + c1.template select<16, 1>(0).template select<16, 1>(0) = + f.template select<16, 1>(0).template select<16, 1>(0); } { - simd v(0, 1); - auto ref0 = v.select<4, 2>(1); // r{1, 3, 5, 7} - auto ref1 = v.bit_cast_view(); // 0,1,2,3; - // 4,5,6,7; - // 8,9,10,11; - // 12,13,14,15 - auto ref2 = ref1.select<2, 1, 2, 2>(0, 1); - return ref0[0] == 1 && decltype(ref2)::getSizeX() == 2 && - decltype(ref2)::getStrideY() == 1; + simd v(0, 1); + auto ref0 = v.template select<4, 2>(1); // r{1, 3, 5, 7} + auto ref1 = v.template bit_cast_view(); // 0,1,2,3; + // 4,5,6,7; + // 8,9,10,11; + // 12,13,14,15 + auto ref2 = ref1.template select<2, 1, 2, 2>(0, 1); + return (ref0[0] == 1) && (decltype(ref2)::getSizeX() == 2) && + (decltype(ref2)::getStrideY() == 1); } + return false; } -bool test_2d_offset() SYCL_ESIMD_FUNCTION { - simd v = 0; - auto ref = v.bit_cast_view(); - return ref.select<2, 2, 2, 2>(2, 1).getOffsetX() == 1 && - ref.select<2, 2, 2, 2>(2, 1).getOffsetY() == 2; +template bool test_simd_select(float) SYCL_ESIMD_FUNCTION; +template bool + test_simd_select(uint64_t) SYCL_ESIMD_FUNCTION; + +template bool test_2d_offset() SYCL_ESIMD_FUNCTION { + simd v = 0; + auto ref = v.template bit_cast_view(); + return ref.template select<2, 2, 2, 2>(2, 1).getOffsetX() == 1 && + ref.template select<2, 2, 2, 2>(2, 1).getOffsetY() == 2; } +template bool test_2d_offset() SYCL_ESIMD_FUNCTION; +template bool test_2d_offset() SYCL_ESIMD_FUNCTION; + +template bool test_simd_bin_op_promotion() SYCL_ESIMD_FUNCTION { - simd v0 = std::numeric_limits::max(); - simd v1 = 1; - simd v2 = v0 + v1; + simd v0 = std::numeric_limits::max(); + simd v1 = 1; + simd v2 = v0 + v1; return v2[0] == 32768; } -bool test_simd_bin_ops() SYCL_ESIMD_FUNCTION { - simd v0 = 1; - simd v1 = 2; +template bool test_simd_bin_op_promotion() SYCL_ESIMD_FUNCTION; +template bool +test_simd_bin_op_promotion() SYCL_ESIMD_FUNCTION; + +template bool test_simd_bin_ops() SYCL_ESIMD_FUNCTION { + simd v0 = 1; + simd v1 = 2; v0 += v1; - v0 %= v1; + if constexpr (std::is_integral_v) + v0 %= v1; v0 = 2 - v0; v0 -= v1; v0 -= 2; @@ -114,101 +146,142 @@ bool test_simd_bin_ops() SYCL_ESIMD_FUNCTION { return v0[0] == 1; } -bool test_simd_unary_ops() SYCL_ESIMD_FUNCTION { - simd v0 = 1; - simd v1 = 2; - v0 <<= v1; +template bool test_simd_bin_ops() SYCL_ESIMD_FUNCTION; +template bool test_simd_bin_ops() SYCL_ESIMD_FUNCTION; + +template bool test_simd_unary_ops() SYCL_ESIMD_FUNCTION { + simd v0 = 1; + simd v1 = 2; + if constexpr (std::is_integral_v) + v0 <<= v1; v1 = -v0; - v0 = ~v1; + if constexpr (std::is_integral_v) + v0 = ~v1; return 
v1[0] == 1; } -bool test_nested_1d_select() SYCL_ESIMD_FUNCTION { - simd r0(0, 1); +template bool test_simd_unary_ops() SYCL_ESIMD_FUNCTION; +template bool test_simd_unary_ops() SYCL_ESIMD_FUNCTION; + +template bool test_nested_1d_select() SYCL_ESIMD_FUNCTION { + simd r0(0, 1); - auto r1 = r0.select<4, 2>(0); - auto r2 = r1.select<2, 2>(0); - auto r3 = r2.select<1, 0>(1); + auto r1 = r0.template select<4, 2>(0); + auto r2 = r1.template select<2, 2>(0); + auto r3 = r2.template select<1, 0>(1); r3 = 37; return r0[4] == 37; } -bool test_format_1d_read() SYCL_ESIMD_FUNCTION { - simd r = 0x0FF00F0F; - auto rl = r.bit_cast_view(); - auto rl2 = rl.select<8, 2>(0); // 0F0F - auto rh = r.bit_cast_view(); - auto rh2 = rh.select<8, 2>(1); // 0FF0 +template bool test_nested_1d_select() SYCL_ESIMD_FUNCTION; +template bool test_nested_1d_select() SYCL_ESIMD_FUNCTION; + +template bool test_format_1d_read() SYCL_ESIMD_FUNCTION { + simd r = 0x0FF00F0F; + auto rl = r.template bit_cast_view(); + auto rl2 = rl.template select<8, 2>(0); // 0F0F + auto rh = r.template bit_cast_view(); + auto rh2 = rh.template select<8, 2>(1); // 0FF0 return rl2[0] == 0x0F0F && rh2[0] == 0x0FF0; } -bool test_format_1d_write() SYCL_ESIMD_FUNCTION { - simd r; - auto rl = r.bit_cast_view(); - auto rl2 = rl.select<8, 2>(0); - auto rh = r.bit_cast_view(); - auto rh2 = rh.select<8, 2>(1); +template bool test_format_1d_read() SYCL_ESIMD_FUNCTION; +template bool test_format_1d_read() SYCL_ESIMD_FUNCTION; + +template bool test_format_1d_write() SYCL_ESIMD_FUNCTION { + simd r; + auto rl = r.template bit_cast_view(); + auto rl2 = rl.template select<8, 2>(0); + auto rh = r.template bit_cast_view(); + auto rh2 = rh.template select<8, 2>(1); rh2 = 0x0F, rl2 = 0xF0; return r[0] == 0x0FF0; } +template bool test_format_1d_write() SYCL_ESIMD_FUNCTION; +template bool test_format_1d_write() SYCL_ESIMD_FUNCTION; + +template bool test_format_1d_read_write_nested() SYCL_ESIMD_FUNCTION { - simd v = 0; - auto r1 = v.bit_cast_view(); - auto r11 = r1.select<8, 1>(0); - auto r12 = r11.bit_cast_view(); - auto r2 = v.bit_cast_view(); - auto r21 = r2.select<8, 1>(8); - auto r22 = r21.bit_cast_view(); + simd v = 0; + auto r1 = v.template bit_cast_view(); + auto r11 = r1.template select<8, 1>(0); + auto r12 = r11.template bit_cast_view(); + auto r2 = v.template bit_cast_view(); + auto r21 = r2.template select<8, 1>(8); + auto r22 = r21.template bit_cast_view(); r12 += 1, r22 += 2; return v[0] == 1 && v[4] == 2; } -bool test_format_2d_read() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - auto r1 = v0.bit_cast_view(); - simd v1 = r1.select<1, 0, 4, 1>(1, 0).read(); // second row +template bool +test_format_1d_read_write_nested() SYCL_ESIMD_FUNCTION; +template bool +test_format_1d_read_write_nested() SYCL_ESIMD_FUNCTION; + +template bool test_format_2d_read() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + auto r1 = v0.template bit_cast_view(); + simd v1 = r1.template select<1, 0, 4, 1>(1, 0).read(); // second row return v1[0] == 4; } -bool test_format_2d_write() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - auto r1 = v0.bit_cast_view(); - r1.select<1, 0, 4, 1>(1, 0) = 37; +template bool test_format_2d_read() SYCL_ESIMD_FUNCTION; +template bool test_format_2d_read() SYCL_ESIMD_FUNCTION; + +template bool test_format_2d_write() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + auto r1 = v0.template bit_cast_view(); + r1.template select<1, 0, 4, 1>(1, 0) = 37; return v0[4] == 37; } -bool test_select_rvalue() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - v0.select<4, 2>(1).select<2, 2>(0) = 37; 
+template bool test_format_2d_write() SYCL_ESIMD_FUNCTION; +template bool test_format_2d_write() SYCL_ESIMD_FUNCTION; + +template bool test_select_rvalue() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + v0.template select<4, 2>(1).template select<2, 2>(0) = 37; return v0[5] == 37; } -auto test_format_1d_write_rvalue() SYCL_ESIMD_FUNCTION { - simd v0 = 0x0F0F0F0F; - v0.bit_cast_view().select<8, 2>(0) = 0x0E0E; +template bool test_select_rvalue() SYCL_ESIMD_FUNCTION; +template bool test_select_rvalue() SYCL_ESIMD_FUNCTION; + +template bool test_format_1d_write_rvalue() SYCL_ESIMD_FUNCTION { + simd v0 = 0x0F0F0F0F; + v0.template bit_cast_view().template select<8, 2>(0) = 0x0E0E; return v0[2] == 0x0E0E0E0E; } -bool test_format_2d_write_rvalue() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - v0.bit_cast_view().select<1, 0, 4, 1>(0, 0) = 37; +template bool test_format_1d_write_rvalue() SYCL_ESIMD_FUNCTION; +template bool test_format_1d_write_rvalue() SYCL_ESIMD_FUNCTION; + +template bool test_format_2d_write_rvalue() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + v0.template bit_cast_view().template select<1, 0, 4, 1>(0, 0) = 37; return v0[3] == 37; } -auto test_format_2d_read_rvalue() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - auto r1 = v0.bit_cast_view() - .select<1, 0, 4, 1>(1, 0) - .bit_cast_view() - .select<2, 2>(1); +template bool test_format_2d_write_rvalue() SYCL_ESIMD_FUNCTION; +template bool test_format_2d_write_rvalue() SYCL_ESIMD_FUNCTION; + +template bool test_format_2d_read_rvalue() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + auto r1 = v0.template bit_cast_view() + .template select<1, 0, 4, 1>(1, 0) + .template bit_cast_view() + .template select<2, 2>(1); return r1[0] == 5; } -bool test_row_read_write() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - auto m = v0.bit_cast_view(); +template bool test_format_2d_read_rvalue() SYCL_ESIMD_FUNCTION; +template bool test_format_2d_read_rvalue() SYCL_ESIMD_FUNCTION; + +template bool test_row_read_write() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + auto m = v0.template bit_cast_view(); auto r0 = m.row(0); // 0 1 2 3 auto r1 = m.row(1); // 4 5 6 7 @@ -221,9 +294,12 @@ bool test_row_read_write() SYCL_ESIMD_FUNCTION { return r0[0] == 8 && r1[0] == 16; } -bool test_column_read_write() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - auto m = v0.bit_cast_view(); +template bool test_row_read_write() SYCL_ESIMD_FUNCTION; +template bool test_row_read_write() SYCL_ESIMD_FUNCTION; + +template bool test_column_read_write() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + auto m = v0.template bit_cast_view(); auto c0 = m.column(0); // 0 2 auto c1 = m.column(1); // 1 3 @@ -234,44 +310,62 @@ bool test_column_read_write() SYCL_ESIMD_FUNCTION { return v0[0] == 1 && v0[3] == 4; } -bool test_replicate() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - auto v0_rep = v0.replicate<1>(); +template bool test_column_read_write() SYCL_ESIMD_FUNCTION; +template bool test_column_read_write() SYCL_ESIMD_FUNCTION; + +template bool test_replicate() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + auto v0_rep = v0.template replicate<1>(); return v0[0] == v0_rep[0] && v0[7] == v0_rep[7]; } -bool test_replicate1() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - auto v0_rep = v0.replicate_w<4, 2>(2); +template bool test_replicate() SYCL_ESIMD_FUNCTION; +template bool test_replicate() SYCL_ESIMD_FUNCTION; + +template bool test_replicate1() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + auto v0_rep = v0.template replicate_w<4, 2>(2); return v0[2] == v0_rep[2] && v0[3] == v0_rep[5]; } -bool test_replicate2() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - auto 
v0_rep = v0.replicate_vs_w<2, 4, 2>(1); +template bool test_replicate1() SYCL_ESIMD_FUNCTION; +template bool test_replicate1() SYCL_ESIMD_FUNCTION; + +template bool test_replicate2() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + auto v0_rep = v0.template replicate_vs_w<2, 4, 2>(1); return v0_rep[0] == v0[1] && v0_rep[1] == v0[2] && v0_rep[2] == v0[5]; } -bool test_replicate3() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - auto v0_rep = v0.replicate_vs_w_hs<2, 4, 2, 2>(1); +template bool test_replicate2() SYCL_ESIMD_FUNCTION; +template bool test_replicate2() SYCL_ESIMD_FUNCTION; + +template bool test_replicate3() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + auto v0_rep = v0.template replicate_vs_w_hs<2, 4, 2, 2>(1); return v0_rep[0] == v0[1] && v0_rep[1] == v0[3] && v0_rep[2] == v0[5]; } -bool test_simd_iselect() SYCL_ESIMD_FUNCTION { - simd v(0, 1); - simd a(0, 2); +template bool test_replicate3() SYCL_ESIMD_FUNCTION; +template bool test_replicate3() SYCL_ESIMD_FUNCTION; + +template bool test_simd_iselect() SYCL_ESIMD_FUNCTION { + simd v(0, 1); + simd a(0, 2); auto data = v.iselect(a); data += 16; - v.iupdate(a, data, simd_mask<8>(1)); - auto ref = v.select<8, 2>(0); + v.template iupdate(a, data, simd_mask<8>(1)); + auto ref = v.template select<8, 2>(0); return ref[0] == 16 && ref[14] == 32; } +template bool test_simd_iselect() SYCL_ESIMD_FUNCTION; +template bool test_simd_iselect() SYCL_ESIMD_FUNCTION; + void test_simd_binop_honor_int_promo() SYCL_ESIMD_FUNCTION { simd a; simd b;