diff --git a/sycl/include/CL/sycl/half_type.hpp b/sycl/include/CL/sycl/half_type.hpp index 9d171d7c17867..dbcaa2883ca3c 100644 --- a/sycl/include/CL/sycl/half_type.hpp +++ b/sycl/include/CL/sycl/half_type.hpp @@ -35,6 +35,19 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { + +namespace ext { +namespace intel { +namespace experimental { +namespace esimd { +namespace detail { +class WrapperElementTypeProxy; +} // namespace detail +} // namespace esimd +} // namespace experimental +} // namespace intel +} // namespace ext + namespace detail { inline __SYCL_CONSTEXPR_HALF uint16_t float2Half(const float &Val) { @@ -255,6 +268,9 @@ class __SYCL_EXPORT half_v2 { // Initialize underlying data constexpr explicit half_v2(uint16_t x) : Buf(x) {} + friend class sycl::ext::intel::experimental::esimd::detail:: + WrapperElementTypeProxy; + private: uint16_t Buf; }; @@ -391,6 +407,9 @@ class half { template friend struct std::hash; + friend class sycl::ext::intel::experimental::esimd::detail:: + WrapperElementTypeProxy; + private: StorageT Data; }; diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp index 8cacaf63e330c..dff4d9dc559f1 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp @@ -29,6 +29,9 @@ #define ESIMD_REGISTER(n) __attribute__((register_num(n))) #define __ESIMD_API ESIMD_NODEBUG ESIMD_INLINE + +#define __ESIMD_UNSUPPORTED_ON_HOST + #else // __SYCL_DEVICE_ONLY__ #define SYCL_ESIMD_KERNEL #define SYCL_ESIMD_FUNCTION @@ -41,6 +44,9 @@ #define ESIMD_REGISTER(n) #define __ESIMD_API ESIMD_INLINE + +#define __ESIMD_UNSUPPORTED_ON_HOST throw cl::sycl::feature_not_supported() + #endif // __SYCL_DEVICE_ONLY__ // Mark a function being noinline diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/elem_type_traits.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/elem_type_traits.hpp new file mode 100644 index 0000000000000..c2e7aed5fad2e --- /dev/null +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/elem_type_traits.hpp @@ -0,0 +1,708 @@ +//==------------ - elem_type_traits.hpp - DPC++ Explicit SIMD API ----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This header provides basic infrastructure to support non-standard C++ types +// as simd element types. This non-standard element types are usually structs or +// classes (example: sycl::half). +// Terms: +// - "wrapper type" - a non-standard element type +// - "raw type" - the real types used to represent real storage type of the data +// bits wrapped by the corresponding wrapper structure/class +// By design, user program never uses the raw types, so they are not exposed at +// user level. +// +// The main reasons why the infrastructure is needed are: +// - attempt to create a clang vector with wrapper element type +// vector_type_t will result in compilation error +// - C++ operations on WrapperT are usually supported by the Intel GPU hardware +// (which is the main reason of supporting them in ESIMD) and need to be +// mapped to efficient hardware code sequences. 
+//
+// To make a wrapper type appear as a first-class element type, the following
+// major components must be available/implemented for the type:
+// 1) Storage ("raw") type must be defined. The raw type must be bit-castable to
+// the wrapper type and thus must have the same bit size and alignment
+// requirements.
+// 2) "Nearest enclosing" standard C++ type must be defined. This is a standard
+// C++ type which can represent values of the wrapper type. The enclosing type
+// can be used as a fall-back type for default implementations of operations
+// on the wrapper type.
+// 3) Type conversion intrinsics between the bit representation of a wrapper
+// type value and the equivalent enclosing C++ type value.
+// 4) The above three are enough to emulate any wrapper type, as all operations
+// can be performed on the enclosing type values, converting from raw to
+// enclosing before the operation and converting back from enclosing to raw
+// after the operation. But this would be inefficient in some cases - when the
+// enclosing C++ type does not match the raw type, as H/W usually supports
+// many operations directly on the raw type (which is the bit representation of
+// the wrapper type). So mapping to efficient H/W operations must be defined.
+// For example, for the SYCL half type, mapping primitive operations efficiently
+// to Intel GPU hardware is as easy as "unwrapping" the sycl::half value, which
+// yields "_Float16", natively supported by the device compiler and hardware,
+// and then using standard C++ operations such as '+' on _Float16 values. For
+// other types like bfloat16 this will require mapping to appropriate
+// intrinsics.
+// 5) The type must be marked as a wrapper type explicitly, for the API to
+// behave correctly.
+// Important note: some of these components might have different definitions for
+// the same wrapper type depending on host vs device compilation. E.g. for SYCL
+// half the raw type is uint16_t on host and _Float16 on device.
+//
+// - The mechanism to define components 1) and 2) for a new wrapper type is to
+// provide a specialization of the `element_type_traits` structure for this
+// type.
+// - Component 3) is provided via implementing specializations of the following
+// intrinsics:
+// * __esimd_wrapper_type_bitcast_to/__esimd_wrapper_type_bitcast_from (should
+// not be necessary with C++20, where there is a standard bitcast operation)
+// to bitcast between the raw and the wrapper types.
+// * __esimd_convertvector_to/__esimd_convertvector_from to type-convert
+// between clang vectors of the wrapper type (bit-represented with the raw
+// type) and clang vectors of the enclosing std type values.
+// - Component 4) is provided via:
+// * (primitive operations) Specializations of the
+// __esimd_binary_op
+// __esimd_unary_op
+// __esimd_cmp_op
+// __esimd_vector_binary_op
+// __esimd_vector_unary_op
+// __esimd_vector_cmp_op
+// intrinsics. If the `use_native_cpp_ops` element type trait is true, then
+// implementing those intrinsics is not necessary and std C++ operations
+// will be used.
+// * (math operations) Overloading std math functions for the new wrapper
+// type.
+// - Component 5) is provided via adding the new type to the list of types in
+// the `is_wrapper_elem_type_v` meta function.
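[Editor's note] To make the recipe above concrete, here is a minimal sketch of what a specialization for a hypothetical 8-bit wrapper type could look like. The type `my_fp8`, its raw storage choice, and the trait values are illustrative assumptions, not part of this patch:

```cpp
// Hypothetical wrapper type, shown only to illustrate components 1), 2) and 5).
struct my_fp8 {
  uint8_t Bits; // bit-compatible with the raw type chosen below
};

// Components 1) and 2): raw storage type and nearest enclosing std C++ type.
template <> struct element_type_traits<my_fp8> {
  using RawT = uint8_t;        // same bit size/alignment as my_fp8
  using EnclosingCppT = float; // can represent every my_fp8 value
  // Component 4): '+' etc. on uint8_t do not emulate fp8 math, so the
  // __esimd_* intrinsic specializations would have to be provided as well.
  static inline constexpr bool use_native_cpp_ops = false;
};

// Component 5): my_fp8 would also have to be added to is_wrapper_elem_type_v,
// and component 3) would supply its bitcast/convertvector intrinsics.
```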
+//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#include + +__SYCL_INLINE_NAMESPACE(cl) { +namespace __SEIEED { + +// Primitive C++ operations supported by simd objects and templated upon by some +// of the functions/classes. + +enum class BinOp { + add, + sub, + mul, + div, + rem, + shl, + shr, + bit_or, + bit_and, + bit_xor, + log_or, + log_and +}; + +enum class CmpOp { lt, lte, gte, gt, eq, ne }; + +enum class UnaryOp { minus, plus, bit_not, log_not }; + +// If given type is a special "wrapper" element type. +template +static inline constexpr bool is_wrapper_elem_type_v = + std::is_same_v; + +template +static inline constexpr bool is_valid_simd_elem_type_v = + (is_vectorizable_v || is_wrapper_elem_type_v); + +struct invalid_raw_element_type; + +// Default (unusable) definition of the element type traits. +template struct element_type_traits { + // The raw element type of the underlying clang vector used as a + // storage. + using RawT = invalid_raw_element_type; + // A starndard C++ type which this one can be converted to/from. + // The conversions are usually H/W-supported, and the C++ type can + // represent the entire range of values of this type. + using EnclosingCppT = void; + // Whether a value or clang vector value the raw element type can be used + // directly as operand to std C++ operations. + static inline constexpr bool use_native_cpp_ops = true; +}; + +// Element type traits specialization for C++ standard element type. +template +struct element_type_traits>> { + using RawT = T; + using EnclosingCppT = T; + static inline constexpr bool use_native_cpp_ops = true; +}; + +// --- Type conversions + +// Low-level conversion functions to and from a wrapper element type. +// Must be implemented for each supported +// . + +// These are default implementations for wrapper types with native cpp +// operations support for their corresponding raw type. +template +ESIMD_INLINE vector_type_t<__raw_t, N> +__esimd_convertvector_to(vector_type_t Val) +#ifdef __SYCL_DEVICE_ONLY__ + ; // needs to be implemented for WrapperTy's for which + // element_type_traits::use_native_cpp_ops is false. +#else +{ + // TODO implement for host + throw sycl::feature_not_supported(); +} +#endif // __SYCL_DEVICE_ONLY__ + +template +ESIMD_INLINE vector_type_t +__esimd_convertvector_from(vector_type_t<__raw_t, N> Val) +#ifdef __SYCL_DEVICE_ONLY__ + ; // needs to be implemented for WrapperTy's for which + // element_type_traits::use_native_cpp_ops is false. +#else +{ + // TODO implement for host + throw sycl::feature_not_supported(); +} +#endif // __SYCL_DEVICE_ONLY__ + +// TODO should be replaced by std::bit_cast once C++20 is supported. 
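[Editor's note] As the TODO above says, the two bitcast hooks declared just below are essentially a pre-C++20 stand-in for `std::bit_cast`. A hedged sketch of what they reduce to once C++20 is available; the function names here are illustrative and not part of the patch:

```cpp
#include <bit>     // C++20
#include <cstdint>

// Illustrative only: with C++20 a raw <-> wrapper bitcast needs no intrinsic,
// provided the two types have the same size and are trivially copyable.
template <class WrapperTy, class RawTy>
WrapperTy bits_to_wrapper(RawTy Val) {
  static_assert(sizeof(WrapperTy) == sizeof(RawTy));
  return std::bit_cast<WrapperTy>(Val);
}

template <class WrapperTy, class RawTy>
RawTy wrapper_to_bits(WrapperTy Val) {
  static_assert(sizeof(WrapperTy) == sizeof(RawTy));
  return std::bit_cast<RawTy>(Val);
}
```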
+template +WrapperTy __esimd_wrapper_type_bitcast_to(__raw_t Val); +template +__raw_t __esimd_wrapper_type_bitcast_from(WrapperTy Val); + +template struct wrapper_type_converter { + using RawTy = __raw_t; + + template + ESIMD_INLINE static vector_type_t + to_vector(vector_type_t Val) { + if constexpr (element_type_traits::use_native_cpp_ops) { + return __builtin_convertvector(Val, vector_type_t); + } else { + return __esimd_convertvector_to(Val); + } + } + + template + ESIMD_INLINE static vector_type_t + from_vector(vector_type_t Val) { + if constexpr (element_type_traits::use_native_cpp_ops) { + return __builtin_convertvector(Val, vector_type_t); + } else { + return __esimd_convertvector_from(Val); + } + } +}; + +// Converts a raw representation of a simd vector with element type +// SrcWrapperTy to a raw representation of a simd vector with element type +// DstWrapperTy. +template , N>, + class SrcRawVecTy = vector_type_t<__raw_t, N>> +ESIMD_INLINE DstRawVecTy convert_vector(SrcRawVecTy Val) { + if constexpr (std::is_same_v) { + return Val; + } else if constexpr (!is_wrapper_elem_type_v && + !is_wrapper_elem_type_v) { + return __builtin_convertvector(Val, DstRawVecTy); + } else { + // The chain of conversions (some can be no-op if types match): + // SrcRawVecTy (of SrcWrapperTy) + // | step A [wrapper_type_converter]::from_vector + // v + // SrcStdT + // | step B [__builtin_convertvector] + // v + // DstStdT + // | step C [wrapper_type_converter]::to_vector + // v + // DstRawVecTy (of DstWrapperTy) + // + using DstStdT = typename element_type_traits::EnclosingCppT; + using SrcStdT = typename element_type_traits::EnclosingCppT; + using SrcConv = wrapper_type_converter; + using DstConv = wrapper_type_converter; + using DstStdVecT = vector_type_t; + using SrcStdVecT = vector_type_t; + SrcStdVecT TmpSrcVal; + + if constexpr (std::is_same_v) { + TmpSrcVal = std::move(Val); + } else { + TmpSrcVal = SrcConv::template from_vector(Val); // step A + } + if constexpr (std::is_same_v) { + return TmpSrcVal; + } else { + DstStdVecT TmpDstVal; + + if constexpr (std::is_same_v) { + TmpDstVal = std::move(TmpSrcVal); + } else { + TmpDstVal = __builtin_convertvector(TmpSrcVal, DstStdVecT); // step B + } + if constexpr (std::is_same_v) { + return TmpDstVal; + } else { + return DstConv::template to_vector(TmpDstVal); // step C + } + } + } +} + +template ESIMD_INLINE __raw_t bitcast_to_raw_type(Ty Val) { + if constexpr (!is_wrapper_elem_type_v) { + return Val; + } else { + return __esimd_wrapper_type_bitcast_from(Val); + } +} + +template ESIMD_INLINE Ty bitcast_to_wrapper_type(__raw_t Val) { + if constexpr (!is_wrapper_elem_type_v) { + return Val; + } else { + return __esimd_wrapper_type_bitcast_to(Val); + } +} + +// Converts a scalar value from given source type to destination type. Both +// types can be non-std element types, in which case additional non-C++ +// conversions happen if the types are different. +// NOTE: this is not symmetric with convert_vector, which inputs and outputs +// raw (storage) vector types. 
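[Editor's note] A short sketch of the asymmetry the note above describes, using host-side sycl::half (raw type uint16_t, enclosing type float). It assumes the detail-namespace helpers from this header are in scope, and since the host conversion intrinsics are still TODO in this patch, it only illustrates the shapes of the two APIs:

```cpp
// convert_scalar takes and returns wrapper/std values directly...
inline float half_to_float_sketch(sycl::half H) {
  return convert_scalar<float>(H); // sycl::half in, float out
}

// ...whereas convert_vector takes and returns raw (storage) clang vectors.
inline vector_type_t<float, 4>
half_bits_to_floats_sketch(vector_type_t<uint16_t, 4> RawHalves) {
  // RawHalves are the bits backing a simd<sycl::half, 4> on host.
  return convert_vector<float, sycl::half, 4>(RawHalves);
}
```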
+template , + class SrcRawTy = __raw_t> +ESIMD_INLINE DstWrapperTy convert_scalar(SrcWrapperTy Val) { + if constexpr (std::is_same_v) { + return Val; + } else if constexpr (!is_wrapper_elem_type_v && + !is_wrapper_elem_type_v) { + return static_cast(Val); + } else { + vector_type_t V0 = bitcast_to_raw_type(Val); + vector_type_t V1 = + convert_vector(V0); + return bitcast_to_wrapper_type(V1[0]); + } +} + +template T binary_op_default_impl(T X, T Y) { + T Res{}; + if constexpr (Op == BinOp::add) + Res = X + Y; + else if constexpr (Op == BinOp::sub) + Res = X - Y; + else if constexpr (Op == BinOp::mul) + Res = X * Y; + else if constexpr (Op == BinOp::div) + Res = X / Y; + else if constexpr (Op == BinOp::rem) + Res = X % Y; + else if constexpr (Op == BinOp::shl) + Res = X << Y; + else if constexpr (Op == BinOp::shr) + Res = X >> Y; + else if constexpr (Op == BinOp::bit_or) + Res = X | Y; + else if constexpr (Op == BinOp::bit_and) + Res = X & Y; + else if constexpr (Op == BinOp::bit_xor) + Res = X ^ Y; + else if constexpr (Op == BinOp::log_or) + Res = X || Y; + else if constexpr (Op == BinOp::log_and) + Res = X && Y; + return Res; +} + +template auto comparison_op_default_impl(T X, T Y) { + decltype(X < Y) Res{}; + if constexpr (Op == CmpOp::lt) + Res = X < Y; + else if constexpr (Op == CmpOp::lte) + Res = X <= Y; + else if constexpr (Op == CmpOp::eq) + Res = X == Y; + else if constexpr (Op == CmpOp::ne) + Res = X != Y; + else if constexpr (Op == CmpOp::gte) + Res = X >= Y; + else if constexpr (Op == CmpOp::gt) + Res = X > Y; + return Res; +} + +template auto unary_op_default_impl(T X) { + if constexpr (Op == UnaryOp::minus) + return -X; + else if constexpr (Op == UnaryOp::plus) + return +X; + else if constexpr (Op == UnaryOp::bit_not) + return ~X; + else if constexpr (Op == UnaryOp::log_not) + return !X; +} + +template struct __hlp { + using RawElemT = __raw_t; + using RawVecT = vector_type_t; + using BinopT = decltype(std::declval() + std::declval()); + using CmpT = decltype(std::declval() < std::declval()); +}; + +template using __re_t = typename Hlp::RawElemT; +template using __rv_t = typename Hlp::RawVecT; +template using __cmp_t = typename Hlp::CmpT; + +// --- Scalar versions of binary operations + +template ESIMD_INLINE T __esimd_binary_op(T X, T Y); + +template >> +ESIMD_INLINE T binary_op_default(T X, T Y) { + static_assert(element_type_traits::use_native_cpp_ops); + using T1 = __raw_t; + T1 X1 = bitcast_to_raw_type(X); + T1 Y1 = bitcast_to_raw_type(Y); + T1 Res = binary_op_default_impl(X1, Y1); + return bitcast_to_wrapper_type(Res); +} + +// Default (inefficient) implementation of a scalar binary operation, which +// involves conversion to an std C++ type, performing the op and converting +// back. 
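[Editor's note] To make the comment above concrete: on host, where `use_native_cpp_ops` is false for sycl::half, `binary_op<BinOp::add, sycl::half>(A, B)` unfolds into the fallback defined next. A hand-expanded sketch (my own expansion, assuming the helpers from this header are in scope; not code from the patch):

```cpp
// Roughly what the generic __esimd_binary_op fallback does for
// binary_op<BinOp::add, sycl::half>(A, B) on host.
inline sycl::half add_halves_sketch(sycl::half A, sycl::half B) {
  float A1 = convert_scalar<float>(A);                      // wrapper -> enclosing type
  float B1 = convert_scalar<float>(B);
  float R1 = binary_op_default<BinOp::add, float>(A1, B1);  // plain float '+'
  return convert_scalar<sycl::half>(R1);                    // enclosing -> wrapper
}
```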
+template ESIMD_INLINE T __esimd_binary_op(T X, T Y) { + using T1 = typename element_type_traits::EnclosingCppT; + T1 X1 = convert_scalar(X); + T1 Y1 = convert_scalar(Y); + return convert_scalar(binary_op_default(X1, Y1)); +} + +template >> +ESIMD_INLINE T binary_op(T X, T Y) { + if constexpr (element_type_traits::use_native_cpp_ops) { + return binary_op_default(X, Y); + } else { + return __esimd_binary_op(X, Y); + } +} + +// --- Vector versions of binary operations + +template >> +ESIMD_INLINE RawVecT vector_binary_op_default(RawVecT X, RawVecT Y) { + static_assert(element_type_traits::use_native_cpp_ops); + return binary_op_default_impl(X, Y); +} + +// Default (inefficient) implementation of a vector binary operation, which +// involves conversion to an std C++ type, performing the op and converting +// back. +template >> +ESIMD_INLINE RawVecT __esimd_vector_binary_op(RawVecT X, RawVecT Y) { + using T1 = typename element_type_traits::EnclosingCppT; + using VecT1 = vector_type_t; + VecT1 X1 = convert_vector(X); + VecT1 Y1 = convert_vector(Y); + return convert_vector( + vector_binary_op_default(X1, Y1)); +} + +template >> +ESIMD_INLINE RawVecT vector_binary_op(RawVecT X, RawVecT Y) { + if constexpr (element_type_traits::use_native_cpp_ops) { + return vector_binary_op_default(X, Y); + } else { + return __esimd_vector_binary_op(X, Y); + } +} + +// --- Scalar versions of unary operations + +template ESIMD_INLINE T __esimd_unary_op(T X); + +template >> +ESIMD_INLINE T unary_op_default(T X) { + static_assert(element_type_traits::use_native_cpp_ops); + using T1 = __raw_t; + T1 X1 = bitcast_to_raw_type(X); + T1 Res = unary_op_default_impl(X1); + return bitcast_to_wrapper_type(Res); +} + +// Default (inefficient) implementation of a scalar unary operation, which +// involves conversion to an std C++ type, performing the op and converting +// back. +template ESIMD_INLINE T __esimd_unary_op(T X) { + using T1 = typename element_type_traits::EnclosingCppT; + T1 X1 = convert_scalar(X); + return convert_scalar(unary_op_default(X1)); +} + +template >> +ESIMD_INLINE T unary_op(T X) { + if constexpr (element_type_traits::use_native_cpp_ops) { + return unary_op_default(X); + } else { + return __esimd_unary_op(X); + } +} + +// --- Vector versions of unary operations + +template >> +ESIMD_INLINE RawVecT vector_unary_op_default(RawVecT X) { + static_assert(element_type_traits::use_native_cpp_ops); + return unary_op_default_impl(X); +} + +// Default (inefficient) implementation of a vector unary operation, which +// involves conversion to an std C++ type, performing the op and converting +// back. +template >> +ESIMD_INLINE RawVecT __esimd_vector_unary_op(RawVecT X) { + using T1 = typename element_type_traits::EnclosingCppT; + using VecT1 = vector_type_t; + VecT1 X1 = convert_vector(X); + return convert_vector(vector_unary_op_default(X1)); +} + +template >> +ESIMD_INLINE RawVecT vector_unary_op(RawVecT X) { + if constexpr (element_type_traits::use_native_cpp_ops) { + return vector_unary_op_default(X); + } else { + return __esimd_vector_unary_op(X); + } +} + +// --- Vector versions of comparison operations + +template , + class RetT = __cmp_t, class RawVecT = __rv_t> +ESIMD_INLINE RetT vector_comparison_op_default(RawVecT X, RawVecT Y) { + static_assert(element_type_traits::use_native_cpp_ops); + return comparison_op_default_impl(X, Y); +} + +// Default (inefficient) implementation of a vector comparison operation, which +// involves conversion to an std C++ type, performing the op and converting +// back. 
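[Editor's note] Before the comparison counterpart below, here is how the vector dispatch above plays out for sycl::half. This is a hand-inlined view of `vector_binary_op<BinOp::mul, sycl::half, N>`, assuming the detail-namespace helpers from this header; it is a sketch, not part of the patch:

```cpp
template <int N>
vector_type_t<__raw_t<sycl::half>, N>
half_mul_sketch(vector_type_t<__raw_t<sycl::half>, N> X,
                vector_type_t<__raw_t<sycl::half>, N> Y) {
#ifdef __SYCL_DEVICE_ONLY__
  // Device: raw type is _Float16 and use_native_cpp_ops is true, so the
  // native path is just the clang vector multiply.
  return X * Y;
#else
  // Host: raw type is uint16_t and use_native_cpp_ops is false, so the
  // fallback converts to float lanes, multiplies, and converts back.
  auto X1 = convert_vector<float, sycl::half, N>(X);
  auto Y1 = convert_vector<float, sycl::half, N>(Y);
  return convert_vector<sycl::half, float, N>(X1 * Y1);
#endif
}
```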
+template , + class RetT = __cmp_t, class RawVecT = __rv_t> +ESIMD_INLINE RetT __esimd_vector_comparison_op(RawVecT X, RawVecT Y) { + using T1 = typename element_type_traits::EnclosingCppT; + using VecT1 = vector_type_t; + VecT1 X1 = convert_vector(X); + VecT1 Y1 = convert_vector(Y); + return convert_vector, T1, N>( + vector_comparison_op_default(X1, Y1)); +} + +template , + class RetT = __cmp_t, class RawVecT = __rv_t> +ESIMD_INLINE RetT vector_comparison_op(RawVecT X, RawVecT Y) { + if constexpr (element_type_traits::use_native_cpp_ops) { + return vector_comparison_op_default(X, Y); + } else { + return __esimd_vector_comparison_op(X, Y); + } +} + +// Proxy class to access bit representation of a wrapper type both on host and +// device. +// TODO add this functionality to sycl type implementation? With C++20, +// std::bit_cast should be a good replacement. +class WrapperElementTypeProxy { +public: + template + static inline __raw_t bitcast_from_half(T Val) { +#ifdef __SYCL_DEVICE_ONLY__ + return Val.Data; +#else + return Val.Data.Buf; +#endif // __SYCL_DEVICE_ONLY__ + } + + template + static inline T bitcast_to_half(__raw_t Bits) { +#ifndef __SYCL_DEVICE_ONLY__ + return sycl::half{Bits}; +#else + sycl::half Res; + Res.Data = Bits; + return Res; +#endif // __SYCL_DEVICE_ONLY__ + } +}; + +// "Generic" version of std::is_floating_point_v which returns "true" also for +// the wrapper floating-point types such as sycl::half. +template +static inline constexpr bool is_generic_floating_point_v = + std::is_floating_point_v::EnclosingCppT>; + +// @{ +// Get computation type of a binary operator given its operand types: +// - if both types are arithmetic - return CPP's "common real type" of the +// computation (matches C++) +// - if both types are simd types, they must be of the same length N, +// and the returned type is simd, where N is the "common real type" of +// the element type of the operands (diverges from clang) +// - otherwise, one type is simd and another is arithmetic - the simd type is +// returned (matches clang) + +struct invalid_computation_type; + +template struct computation_type { + using type = invalid_computation_type; +}; + +template +struct computation_type && + is_valid_simd_elem_type_v>> { +private: + template using tr = element_type_traits; + template + using native_t = + std::conditional_t::use_native_cpp_ops, typename tr::RawT, + typename tr::EnclosingCppT>; + static inline constexpr bool is_wr1 = is_wrapper_elem_type_v; + static inline constexpr bool is_wr2 = is_wrapper_elem_type_v; + static inline constexpr bool is_fp1 = is_generic_floating_point_v; + static inline constexpr bool is_fp2 = is_generic_floating_point_v; + +public: + using type = std::conditional_t< + !is_wr1 && !is_wr2, + // T1 and T2 are both std C++ types - use std C++ type promotion + decltype(std::declval() + std::declval()), + std::conditional_t< + std::is_same_v, + // Types are the same wrapper type - return any + T1, + std::conditional_t will + // yield sycl::half) + std::conditional_t, + // both are either floating point or integral - + // return result of C++ promotion of the native + // types + decltype(std::declval>() + + std::declval>())>>>; +}; + +template +struct computation_type< + T1, T2, + std::enable_if_t || is_simd_like_type_v>> { +private: + using Ty1 = element_type_t; + using Ty2 = element_type_t; + using EltTy = typename computation_type::type; + static constexpr int N1 = is_simd_like_type_v ? T1::length : 0; + static constexpr int N2 = is_simd_like_type_v ? 
T2::length : 0; + static_assert((N1 == N2) || ((N1 & N2) == 0), "size mismatch"); + static constexpr int N = N1 ? N1 : N2; + +public: + using type = simd; +}; + +template +using computation_type_t = + typename computation_type, remove_cvref_t>::type; + +// @} + +//////////////////////////////////////////////////////////////////////////////// +// sycl::half traits +//////////////////////////////////////////////////////////////////////////////// + +template +struct element_type_traits>> { + // Can't use sycl::detail::half_impl::StorageT as RawT for both host and + // device as it still maps to struct on/ host (even though the struct is a + // trivial wrapper around uint16_t), and for ESIMD we need a type which can be + // an element of clang vector. +#ifdef __SYCL_DEVICE_ONLY__ + using RawT = sycl::detail::half_impl::StorageT; + // On device, _Float16 is native Cpp type, so it is the enclosing C++ type + using EnclosingCppT = RawT; + // On device, operations on half are translated to operations on _Float16, + // which is natively supported by the device compiler + static inline constexpr bool use_native_cpp_ops = true; +#else + using RawT = uint16_t; + using EnclosingCppT = float; + // On host, we can't use native Cpp '+', '-' etc. over uint16_t to emulate the + // operations on half type. + static inline constexpr bool use_native_cpp_ops = false; +#endif // __SYCL_DEVICE_ONLY__ +}; + +using half_raw = __raw_t; + +template <> +ESIMD_INLINE sycl::half +__esimd_wrapper_type_bitcast_to(half_raw Val) { + return WrapperElementTypeProxy::bitcast_to_half(Val); +} + +template <> +ESIMD_INLINE half_raw +__esimd_wrapper_type_bitcast_from(sycl::half Val) { + return WrapperElementTypeProxy::bitcast_from_half(Val); +} + +template <> +struct is_esimd_arithmetic_type<__raw_t, void> : std::true_type {}; + +// Misc +inline std::ostream &operator<<(std::ostream &O, sycl::half const &rhs) { + O << static_cast(rhs); + return O; +} + +inline std::istream &operator>>(std::istream &I, sycl::half &rhs) { + float ValFloat = 0.0f; + I >> ValFloat; + rhs = ValFloat; + return I; +} + +// The only other place which needs to be updated to support a new type is +// the is_wrapper_elem_type_v meta function. + +//////////////////////////////////////////////////////////////////////////////// +// sycl::bfloat16 traits +//////////////////////////////////////////////////////////////////////////////// +// TODO + +} // namespace __SEIEED +} // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/intrin.hpp index bb8e6d5843ab4..5bbe1b6519715 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/intrin.hpp @@ -139,12 +139,15 @@ namespace experimental { namespace esimd { namespace detail { +template using __st = __raw_t; + /// read from a basic region of a vector, return a vector template -__SEIEED::vector_type_t ESIMD_INLINE -readRegion(const __SEIEED::vector_type_t &Base, RTy Region) { - using ElemTy = typename RTy::element_type; - auto Base1 = bitcast(Base); +__SEIEED::vector_type_t<__st, RTy::length> + ESIMD_INLINE readRegion(const __SEIEED::vector_type_t<__st, BN> &Base, + RTy Region) { + using ElemTy = __st; + auto Base1 = bitcast, BN>(Base); constexpr int Bytes = BN * sizeof(BT); if constexpr (Bytes == RTy::Size_in_bytes) // This is a no-op format. 
@@ -163,14 +166,14 @@ readRegion(const __SEIEED::vector_type_t &Base, RTy Region) { /// read from a nested region of a vector, return a vector template -ESIMD_INLINE __SEIEED::vector_type_t -readRegion(const __SEIEED::vector_type_t &Base, +ESIMD_INLINE __SEIEED::vector_type_t<__st, T::length> +readRegion(const __SEIEED::vector_type_t<__st, BN> &Base, std::pair Region) { // parent-region type using PaTy = typename shape_type::type; constexpr int BN1 = PaTy::length; using BT1 = typename PaTy::element_type; - using ElemTy = typename T::element_type; + using ElemTy = __st; // Recursively read the base auto Base1 = readRegion(Base, Region.second); if constexpr (!T::Is_2D || BN1 * sizeof(BT1) == T::Size_in_bytes) @@ -178,7 +181,7 @@ readRegion(const __SEIEED::vector_type_t &Base, return readRegion(Base1, Region.first); else { static_assert(T::Is_2D); - static_assert(std::is_same::value); + static_assert(std::is_same>::value); // To read a 2D region, we need the parent region // Read full rows with non-trivial vertical and horizontal stride = 1. constexpr int M = T::Size_y * PaTy::Size_x; diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/operators.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/operators.hpp index e19707515d1d1..0cd1efe73d927 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/operators.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/operators.hpp @@ -10,6 +10,7 @@ #pragma once +#include #include #include #include @@ -68,20 +69,21 @@ namespace __SEIEED { // ========= simd_obj_impl bitwise logic and arithmetic operators -#define __ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(BINOP, COND) \ +#define __ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(BINOP, BINOP_ID, COND) \ \ /* simd_obj_impl BINOP simd_obj_impl */ \ template class SimdT, \ class SimdTx = SimdT, class = std::enable_if_t> \ inline auto operator BINOP( \ - const __SEIEED::simd_obj_impl> &LHS, \ - const __SEIEED::simd_obj_impl> &RHS) { \ + const __SEIEED::simd_obj_impl<__raw_t, N, SimdT> &LHS, \ + const __SEIEED::simd_obj_impl<__raw_t, N, SimdT> &RHS) { \ if constexpr (__SEIEED::is_simd_type_v>) { \ - using SimdPromotedT = \ - __SEIEED::computation_type_t, SimdT>; \ - using VecT = typename SimdPromotedT::vector_type; \ - return SimdPromotedT(__SEIEED::convert(LHS.data()) \ - BINOP __SEIEED::convert(RHS.data())); \ + using PromotedT = __SEIEED::computation_type_t; \ + /* vector_binary_op returns SimdT::raw_vector_type */ \ + SimdT Res = vector_binary_op( \ + __SEIEED::convert_vector(LHS.data()), \ + __SEIEED::convert_vector(RHS.data())); \ + return Res; \ } else { \ /* for SimdT=simd_mask_impl T1 and T2 are both equal to \ * simd_mask_elem_type */ \ @@ -93,7 +95,8 @@ namespace __SEIEED { template class SimdT1, class T2, \ class SimdTx = SimdT1, class = std::enable_if_t> \ inline auto operator BINOP( \ - const __SEIEED::simd_obj_impl> &LHS, T2 RHS) { \ + const __SEIEED::simd_obj_impl<__raw_t, N1, SimdT1> &LHS, \ + T2 RHS) { \ if constexpr (__SEIEED::is_simd_type_v>) { \ /* convert the SCALAR to vector type and reuse the basic operation over \ * simd objects */ \ @@ -109,7 +112,8 @@ namespace __SEIEED { template class SimdT2, \ class SimdTx = SimdT2, class = std::enable_if_t> \ inline auto operator BINOP( \ - T1 LHS, const __SEIEED::simd_obj_impl> &RHS) { \ + T1 LHS, \ + const __SEIEED::simd_obj_impl<__raw_t, N2, SimdT2> &RHS) { \ if constexpr (__SEIEED::is_simd_type_v>) { \ /* convert the SCALAR to vector type and reuse the basic operation over \ * simd objects */ \ @@ -122,27 +126,28 @@ 
namespace __SEIEED { #define __ESIMD_BITWISE_OP_FILTER \ std::is_integral_v &&std::is_integral_v -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(^, __ESIMD_BITWISE_OP_FILTER) -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(|, __ESIMD_BITWISE_OP_FILTER) -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(&, __ESIMD_BITWISE_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(^, BinOp::bit_xor, __ESIMD_BITWISE_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(|, BinOp::bit_or, __ESIMD_BITWISE_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(&, BinOp::bit_and, __ESIMD_BITWISE_OP_FILTER) #undef __ESIMD_BITWISE_OP_FILTER #define __ESIMD_SHIFT_OP_FILTER \ std::is_integral_v &&std::is_integral_v \ &&__SEIEED::is_simd_type_v -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(%, __ESIMD_SHIFT_OP_FILTER) -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(<<, __ESIMD_SHIFT_OP_FILTER) -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(>>, __ESIMD_SHIFT_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(%, BinOp::rem, __ESIMD_SHIFT_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(<<, BinOp::shl, __ESIMD_SHIFT_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(>>, BinOp::shr, __ESIMD_SHIFT_OP_FILTER) #undef __ESIMD_SHIFT_OP_FILTER #define __ESIMD_ARITH_OP_FILTER \ - __SEIEED::is_vectorizable_v &&__SEIEED::is_vectorizable_v \ - &&__SEIEED::is_simd_type_v - -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(+, __ESIMD_ARITH_OP_FILTER) -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(-, __ESIMD_ARITH_OP_FILTER) -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(*, __ESIMD_ARITH_OP_FILTER) -__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(/, __ESIMD_ARITH_OP_FILTER) + __SEIEED::is_valid_simd_elem_type_v \ + &&__SEIEED::is_valid_simd_elem_type_v \ + &&__SEIEED::is_simd_type_v + +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(+, BinOp::add, __ESIMD_ARITH_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(-, BinOp::sub, __ESIMD_ARITH_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(*, BinOp::mul, __ESIMD_ARITH_OP_FILTER) +__ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(/, BinOp::div, __ESIMD_ARITH_OP_FILTER) #undef __ESIMD_ARITH_OP_FILTER #undef __ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP @@ -151,24 +156,28 @@ __ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(/, __ESIMD_ARITH_OP_FILTER) // Both simd and simd_mask will match simd_obj_impl argument when resolving // operator overloads. 
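[Editor's note] The practical effect of widening the arithmetic filter from `is_vectorizable_v` to `is_valid_simd_elem_type_v` is that simd objects with wrapper element types now support these operators. A user-level sketch, assuming the ESIMD headers are included and `using namespace sycl::ext::intel::experimental::esimd;` is in effect:

```cpp
// Arithmetic on simd objects with a wrapper element type (sycl::half) now
// resolves through the BinOp-aware operator macro above.
simd<sycl::half, 8> fma_sketch(simd<sycl::half, 8> A, simd<sycl::half, 8> B,
                               simd<sycl::half, 8> C) SYCL_ESIMD_FUNCTION {
  // On device this lowers to native _Float16 vector ops; on host it takes the
  // float-emulation path from elem_type_traits.hpp.
  return A * B + C;
}
```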
-#define __ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(CMPOP, COND) \ +#define __ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(CMPOP, CMPOP_ID, COND) \ \ /* simd_obj_impl CMPOP simd_obj_impl */ \ template class SimdT, \ class SimdTx = SimdT, class = std::enable_if_t> \ inline __SEIEE::simd_mask operator CMPOP( \ - const __SEIEED::simd_obj_impl> &LHS, \ - const __SEIEED::simd_obj_impl> &RHS) { \ - using MaskVecT = typename __SEIEE::simd_mask::vector_type; \ + const __SEIEED::simd_obj_impl<__raw_t, N, SimdT> &LHS, \ + const __SEIEED::simd_obj_impl<__raw_t, N, SimdT> &RHS) { \ + using MaskVecT = typename __SEIEE::simd_mask::raw_vector_type; \ \ - if constexpr (__SEIEED::is_simd_type_v>) { \ - using PromSimdT = \ - __SEIEED::computation_type_t, SimdT>; \ - using PromVecT = typename PromSimdT::vector_type; \ - auto ResVec = __SEIEED::convert(LHS.data()) \ - CMPOP __SEIEED::convert(RHS.data()); \ - return __SEIEE::simd_mask(__SEIEED::convert(ResVec) & \ - MaskVecT(1)); \ + if constexpr (is_simd_type_v>) { \ + using PromotedT = computation_type_t; \ + /* vector_comparison_op returns vector_type_t, where Ti is \ + * integer type */ \ + /* of the same bit size as PromotedT */ \ + auto Res = vector_comparison_op( \ + __SEIEED::convert_vector(LHS.data()), \ + __SEIEED::convert_vector(RHS.data())); \ + using ResElemT = element_type_t; \ + return __SEIEE::simd_mask( \ + __SEIEED::convert_vector(Res) & \ + MaskVecT(1)); \ } else { \ /* this is comparison of masks, don't perform type promotion */ \ auto ResVec = LHS.data() CMPOP RHS.data(); \ @@ -180,49 +189,57 @@ __ESIMD_DEF_SIMD_OBJ_IMPL_BIN_OP(/, __ESIMD_ARITH_OP_FILTER) /* simd_obj_impl CMPOP SCALAR */ \ template class SimdT1, class T2, \ class SimdTx = SimdT1, \ - class = std::enable_if_t<__SEIEED::is_vectorizable_v && COND>> \ + class = std::enable_if_t< \ + __SEIEED::is_valid_simd_elem_type_v && COND>> \ inline __SEIEE::simd_mask operator CMPOP( \ - const __SEIEED::simd_obj_impl> &LHS, T2 RHS) { \ + const __SEIEED::simd_obj_impl<__raw_t, N1, SimdT1> &LHS, \ + T2 RHS) { \ if constexpr (__SEIEED::is_simd_type_v>) \ /* simd case */ \ return LHS CMPOP SimdT1(RHS); \ else \ /* simd_mask case - element type is fixed */ \ - return LHS CMPOP SimdT1((T1)RHS); \ + return LHS CMPOP SimdT1(convert_scalar(RHS)); \ } \ \ /* SCALAR CMPOP simd_obj_impl */ \ template class SimdT2, \ class SimdTx = SimdT2, \ - class = std::enable_if_t<__SEIEED::is_vectorizable_v && COND>> \ + class = std::enable_if_t< \ + __SEIEED::is_valid_simd_elem_type_v && COND>> \ inline __SEIEE::simd_mask operator CMPOP( \ - T1 LHS, const __SEIEED::simd_obj_impl> &RHS) { \ + T1 LHS, \ + const __SEIEED::simd_obj_impl<__raw_t, N2, SimdT2> &RHS) { \ if constexpr (__SEIEED::is_simd_type_v>) \ /* simd case */ \ return SimdT2(LHS) CMPOP RHS; \ else \ /* simd_mask case - element type is fixed */ \ - return SimdT2((T2)LHS) CMPOP RHS; \ + return SimdT2(convert_scalar(LHS)) CMPOP RHS; \ } // Equality comparison is defined for all simd_obj_impl subclasses. -__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(==, true) -__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(!=, true) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(==, CmpOp::eq, true) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(!=, CmpOp::ne, true) // Relational operators are defined only for the simd type. 
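[Editor's note] At user level the comparison rewrite keeps the familiar contract: comparing wrapper-element simd objects still yields a `simd_mask`. A hedged sketch of what this enables (same namespace assumptions as above; not a test from the patch):

```cpp
// Relational operators on simd<sycl::half, N>; each lane of M is 0 or 1.
simd<sycl::half, 16> clamp_sketch(simd<sycl::half, 16> X, sycl::half Limit) {
  simd<sycl::half, 16> Y(Limit);  // broadcast constructor
  simd_mask<16> M = X < Y;        // lowered via vector_comparison_op<CmpOp::lt, ...>
  simd<sycl::half, 16> Res;
  Res.merge(X, Y, M);             // take X where M is set, Y elsewhere
  return Res;
}
```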
-__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(<, __SEIEED::is_simd_type_v) -__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(>, __SEIEED::is_simd_type_v) -__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(<=, __SEIEED::is_simd_type_v) -__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(>=, __SEIEED::is_simd_type_v) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(<, CmpOp::lt, __SEIEED::is_simd_type_v) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(>, CmpOp::gt, __SEIEED::is_simd_type_v) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(<=, CmpOp::lte, + __SEIEED::is_simd_type_v) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(>=, CmpOp::gte, + __SEIEED::is_simd_type_v) // Logical operators are defined only for the simd_mask type -__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(&&, __SEIEED::is_simd_mask_type_v) -__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(||, __SEIEED::is_simd_mask_type_v) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(&&, BinOp::log_and, + __SEIEED::is_simd_mask_type_v) +__ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP(||, BinOp::log_or, + __SEIEED::is_simd_mask_type_v) #undef __ESIMD_DEF_SIMD_OBJ_IMPL_CMP_OP } // namespace __SEIEED -namespace __SEIEE { +namespace __SEIEED { //////////////////////////////////////////////////////////////////////////////// // simd_view global operators //////////////////////////////////////////////////////////////////////////////// @@ -320,8 +337,8 @@ __ESIMD_DEF_SIMD_VIEW_BIN_OP(>>, __ESIMD_SHIFT_OP_FILTER) #undef __ESIMD_SHIFT_OP_FILTER #define __ESIMD_ARITH_OP_FILTER \ - __SEIEED::is_simd_type_v &&__SEIEED::is_vectorizable_v \ - &&__SEIEED::is_vectorizable_v + __SEIEED::is_simd_type_v &&__SEIEED::is_valid_simd_elem_type_v \ + &&__SEIEED::is_valid_simd_elem_type_v __ESIMD_DEF_SIMD_VIEW_BIN_OP(+, __ESIMD_ARITH_OP_FILTER) __ESIMD_DEF_SIMD_VIEW_BIN_OP(-, __ESIMD_ARITH_OP_FILTER) @@ -364,32 +381,33 @@ __ESIMD_DEF_SIMD_VIEW_BIN_OP(||, __SEIEED::is_simd_mask_type_v) } \ \ /* simd_view CMPOP simd_obj_impl */ \ - template ::length == N2) && \ (__SEIEED::is_simd_type_v == \ __SEIEED::is_simd_type_v)&&COND>> \ inline __SEIEE::simd_mask operator CMPOP( \ const __SEIEE::simd_view &LHS, \ - const __SEIEED::simd_obj_impl &RHS) { \ + const __SEIEED::simd_obj_impl &RHS) { \ return LHS.read() CMPOP SimdT2(RHS.data()); \ } \ \ /* simd_obj_impl CMPOP simd_view */ \ - template ::length == N1) && \ (__SEIEED::is_simd_type_v == \ __SEIEED::is_simd_type_v)&&COND>> \ inline __SEIEE::simd_mask operator CMPOP( \ - const __SEIEED::simd_obj_impl &LHS, \ + const __SEIEED::simd_obj_impl &LHS, \ const __SEIEE::simd_view &RHS) { \ return SimdT1(LHS.data()) CMPOP RHS.read(); \ } \ \ /* simd_view CMPOP SCALAR */ \ template && COND>> \ + class = std::enable_if_t< \ + __SEIEED::is_valid_simd_elem_type_v && COND>> \ inline auto operator CMPOP(const __SEIEE::simd_view &LHS, \ T2 RHS) { \ return LHS.read() CMPOP RHS; \ @@ -397,7 +415,8 @@ __ESIMD_DEF_SIMD_VIEW_BIN_OP(||, __SEIEED::is_simd_mask_type_v) \ /* SCALAR CMPOP simd_view */ \ template && COND>> \ + class = std::enable_if_t< \ + __SEIEED::is_valid_simd_elem_type_v && COND>> \ inline auto operator CMPOP( \ T1 LHS, const __SEIEE::simd_view &RHS) { \ return LHS CMPOP RHS.read(); \ @@ -415,4 +434,4 @@ __ESIMD_DEF_SIMD_VIEW_CMP_OP(>=, __SEIEED::is_simd_type_v) #undef __ESIMD_DEF_SIMD_VIEW_CMP_OP -} // namespace __SEIEE +} // namespace __SEIEED diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp index 806cd413827f8..bed505ac88b94 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp +++ 
b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp @@ -34,9 +34,10 @@ class simd_mask_impl using base_type = detail::simd_obj_impl>; public: + using raw_element_type = T; using element_type = T; - using vector_type = typename base_type::vector_type; - static_assert(std::is_same_v> && + using raw_vector_type = typename base_type::raw_vector_type; + static_assert(std::is_same_v> && "mask impl type mismatch"); simd_mask_impl() = default; @@ -48,7 +49,7 @@ class simd_mask_impl /// Implicit conversion constructor from a raw vector object. // TODO this should be made inaccessible from user code. - simd_mask_impl(const vector_type &Val) : base_type(Val) {} + simd_mask_impl(const raw_vector_type &Val) : base_type(Val) {} /// Initializer list constructor. __SYCL_DEPRECATED("use constructor from array, e.g: simd_mask<3> x({0,1,1});") @@ -56,7 +57,7 @@ class simd_mask_impl /// Construct from an array. To allow e.g. simd_mask m({1,0,0,1,...}). template > - simd_mask_impl(const element_type(&&Arr)[N1]) { + simd_mask_impl(const raw_element_type (&&Arr)[N1]) { base_type::template init_from_array(std::move(Arr)); } diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp index 7c908c57935a3..35a3c8fa91657 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp @@ -10,10 +10,11 @@ #pragma once +#include #include #include #include -#include +#include #include __SYCL_INLINE_NAMESPACE(cl) { @@ -29,7 +30,7 @@ namespace esimd { /// element_aligned_tag type. Flag of this type should be used in load and store /// operations when memory address is aligned by simd object's element type. struct element_aligned_tag { - template ::type> + template > static constexpr unsigned alignment = alignof(ET); }; @@ -86,7 +87,7 @@ namespace detail { /// template arguments are needed, template aliases can be used /// (simd_mask_type). /// -/// \tparam Ty the element type +/// \tparam RawTy raw (storage) element type /// \tparam N number of elements /// \tparam Derived - a class derived from this one; this class and its /// derivatives must follow the 'curiously recurring template' pattern. @@ -94,24 +95,29 @@ namespace detail { /// types.hpp, used to disable invalid specializations. /// /// \ingroup sycl_esimd -template class simd_obj_impl { +/// +template +class simd_obj_impl { template friend class simd_view; template friend class simd; template friend class simd_mask_impl; + using element_type = simd_like_obj_element_type_t; + using Ty = element_type; + public: /// The underlying builtin data type. - using vector_type = vector_type_t; + using raw_vector_type = vector_type_t; /// The element type of this simd_obj_impl object. - using element_type = Ty; + using raw_element_type = RawTy; /// The number of elements in this simd_obj_impl object. static constexpr int length = N; protected: template > - void init_from_array(const Ty(&&Arr)[N1]) noexcept { + void init_from_array(const RawTy (&&Arr)[N1]) noexcept { for (auto I = 0; I < N; ++I) { M_data[I] = Arr[I]; } @@ -133,20 +139,15 @@ template class simd_obj_impl { } /// Implicit conversion constructor from another \c simd_obj_impl object. - template - simd_obj_impl( - const simd_obj_impl, - SFINAE> &other) { + template + simd_obj_impl(const simd_obj_impl &other) { __esimd_dbg_print(simd_obj_impl(const simd_obj_impl... 
> &other)); - if constexpr (std::is_same_v) - set(other.data()); - else - set(__builtin_convertvector(other.data(), vector_type)); + set(convert_vector, N>(other.data())); } /// Implicit conversion constructor from a raw vector object. - simd_obj_impl(const vector_type &Val) { - __esimd_dbg_print(simd_obj_impl(const vector_type &Val)); + simd_obj_impl(const raw_vector_type &Val) { + __esimd_dbg_print(simd_obj_impl(const raw_vector_type &Val)); set(Val); } @@ -158,8 +159,8 @@ template class simd_obj_impl { /// following will compile: /// simd x = {1, 2, 3, 4}; __SYCL_DEPRECATED("use constructor from array, e.g: simd x({1,2,3});") - simd_obj_impl(std::initializer_list Ilist) noexcept { - __esimd_dbg_print(simd_obj_impl(std::initializer_list Ilist)); + simd_obj_impl(std::initializer_list Ilist) noexcept { + __esimd_dbg_print(simd_obj_impl(std::initializer_list Ilist)); int i = 0; for (auto It = Ilist.begin(); It != Ilist.end() && i < N; ++It) { M_data[i++] = *It; @@ -171,21 +172,23 @@ template class simd_obj_impl { __esimd_dbg_print(simd_obj_impl(Ty Val, Ty Step)); #pragma unroll for (int i = 0; i < N; ++i) { - M_data[i] = Val; - Val += Step; + M_data[i] = bitcast_to_raw_type(Val); + Val = binary_op(Val, Step); } } /// Broadcast constructor - simd_obj_impl(Ty Val) noexcept { - __esimd_dbg_print(simd_obj_impl(Ty Val)); - M_data = Val; + template >> + simd_obj_impl(T1 Val) noexcept { + __esimd_dbg_print(simd_obj_impl(T1 Val)); + M_data = bitcast_to_raw_type(detail::convert_scalar(Val)); } /// Construct from an array. To allow e.g. simd_mask_type m({1,0,0,1,...}). template > - simd_obj_impl(const Ty(&&Arr)[N1]) noexcept { - __esimd_dbg_print(simd_obj_impl(const Ty(&&Arr)[N1])); + simd_obj_impl(const RawTy (&&Arr)[N1]) noexcept { + __esimd_dbg_print(simd_obj_impl(const RawTy(&&Arr)[N1])); init_from_array(std::move(Arr)); } @@ -212,9 +215,10 @@ template class simd_obj_impl { /// @} // Load the object's value from array. - template std::enable_if_t copy_from(const Ty(&&Arr)[N1]) { - __esimd_dbg_print(copy_from(const Ty(&&Arr)[N1])); - vector_type Tmp; + template + std::enable_if_t copy_from(const RawTy (&&Arr)[N1]) { + __esimd_dbg_print(copy_from(const RawTy(&&Arr)[N1])); + raw_vector_type Tmp; for (auto I = 0; I < N; ++I) { Tmp[I] = Arr[I]; } @@ -222,8 +226,8 @@ template class simd_obj_impl { } // Store the object's value to array. - template std::enable_if_t copy_to(Ty(&&Arr)[N1]) const { - __esimd_dbg_print(copy_to(Ty(&&Arr)[N1])); + template std::enable_if_t copy_to(RawTy (&&Arr)[N1]) const { + __esimd_dbg_print(copy_to(RawTy(&&Arr)[N1])); for (auto I = 0; I < N; ++I) { Arr[I] = data()[I]; } @@ -231,30 +235,31 @@ template class simd_obj_impl { /// @{ /// Conversion operators. - explicit operator const vector_type &() const & { - __esimd_dbg_print(explicit operator const vector_type &() const &); + explicit operator const raw_vector_type &() const & { + __esimd_dbg_print(explicit operator const raw_vector_type &() const &); return M_data; } - explicit operator vector_type &() & { - __esimd_dbg_print(explicit operator vector_type &() &); + explicit operator raw_vector_type &() & { + __esimd_dbg_print(explicit operator raw_vector_type &() &); return M_data; } - /// Explicit conversion for simd_obj_impl into T. + /// Type conversion into a scalar: + /// simd_obj_impl> to Ty. 
template > operator Ty() const { - __esimd_dbg_print(explicit operator Ty()); - return data()[0]; + __esimd_dbg_print(operator Ty()); + return bitcast_to_wrapper_type(data()[0]); } /// @} - vector_type data() const { - __esimd_dbg_print(vector_type data()); + raw_vector_type data() const { + __esimd_dbg_print(raw_vector_type data()); #ifndef __SYCL_DEVICE_ONLY__ return M_data; #else - return __esimd_vload(&M_data); + return __esimd_vload(&M_data); #endif } @@ -269,8 +274,8 @@ template class simd_obj_impl { /// Whole region update with predicates. void merge(const Derived &Val, const simd_mask_type &Mask) { - set(__esimd_wrregion(data(), Val.data(), 0, - Mask.data())); + set(__esimd_wrregion(data(), Val.data(), 0, + Mask.data())); } void merge(const Derived &Val1, Derived Val2, const simd_mask_type &Mask) { @@ -280,7 +285,7 @@ template class simd_obj_impl { /// View this simd_obj_impl object in a different element type. template auto bit_cast_view() &[[clang::lifetimebound]] { - using TopRegionTy = compute_format_type_t; + using TopRegionTy = compute_format_type_t; using RetTy = simd_view; return RetTy{cast_this_to_derived(), TopRegionTy{0}}; } @@ -294,8 +299,7 @@ template class simd_obj_impl { /// View as a 2-dimensional simd_view. template auto bit_cast_view() &[[clang::lifetimebound]] { - using TopRegionTy = - compute_format_type_2d_t; + using TopRegionTy = compute_format_type_2d_t; using RetTy = simd_view; return RetTy{cast_this_to_derived(), TopRegionTy{0, 0}}; } @@ -332,16 +336,16 @@ template class simd_obj_impl { static_assert(Size > 1 || Stride == 1, "Stride must be 1 in single-element region"); Derived &&Val = std::move(cast_this_to_derived()); - return __esimd_rdregion(Val.data(), - Offset); + return __esimd_rdregion(Val.data(), + Offset); } /// Read single element, return value only (not reference). - Ty operator[](int i) const { return data()[i]; } + Ty operator[](int i) const { return bitcast_to_wrapper_type(data()[i]); } /// Read single element, return value only (not reference). __SYCL_DEPRECATED("use operator[] form.") - Ty operator()(int i) const { return data()[i]; } + Ty operator()(int i) const { return bitcast_to_wrapper_type(data()[i]); } /// Return writable view of a single element. 
simd_view> operator[](int i) @@ -360,14 +364,14 @@ template class simd_obj_impl { template resize_a_simd_type_t iselect(const simd &Indices) { - vector_type_t Offsets = Indices.data() * sizeof(Ty); - return __esimd_rdindirect(data(), Offsets); + vector_type_t Offsets = Indices.data() * sizeof(RawTy); + return __esimd_rdindirect(data(), Offsets); } // TODO ESIMD_EXPERIMENTAL /// update single element void iupdate(ushort Index, Ty V) { auto Val = data(); - Val[Index] = V; + Val[Index] = bitcast_to_raw_type(V); set(Val); } @@ -377,9 +381,9 @@ template class simd_obj_impl { void iupdate(const simd &Indices, const resize_a_simd_type_t &Val, const simd_mask_type &Mask) { - vector_type_t Offsets = Indices.data() * sizeof(Ty); - set(__esimd_wrindirect(data(), Val.data(), Offsets, - Mask.data())); + vector_type_t Offsets = Indices.data() * sizeof(RawTy); + set(__esimd_wrindirect(data(), Val.data(), Offsets, + Mask.data())); } /// \name Replicate @@ -454,8 +458,8 @@ template class simd_obj_impl { template resize_a_simd_type_t replicate_vs_w_hs(uint16_t Offset) const { - return __esimd_rdregion(data(), - Offset * sizeof(Ty)); + return __esimd_rdregion( + data(), Offset * sizeof(RawTy)); } ///@} @@ -479,19 +483,18 @@ template class simd_obj_impl { /// Write a simd_obj_impl-vector into a basic region of a simd_obj_impl /// object. - template - ESIMD_INLINE void writeRegion( - RTy Region, - const vector_type_t &Val) { - using ElemTy = typename RTy::element_type; - if constexpr (N * sizeof(Ty) == RTy::length * sizeof(ElemTy)) + template > + ESIMD_INLINE void writeRegion(RTy Region, + const vector_type_t &Val) { + + if constexpr (N * sizeof(RawTy) == RTy::length * sizeof(ElemTy)) // update the entire vector - set(bitcast(Val)); + set(bitcast(Val)); else { static_assert(!RTy::Is_2D); // If element type differs, do bitcast conversion first. - auto Base = bitcast(data()); - constexpr int BN = (N * sizeof(Ty)) / sizeof(ElemTy); + auto Base = bitcast(data()); + constexpr int BN = (N * sizeof(RawTy)) / sizeof(ElemTy); // Access the region information. constexpr int M = RTy::Size_x; constexpr int Stride = RTy::Stride_x; @@ -501,27 +504,26 @@ template class simd_obj_impl { auto Merged = __esimd_wrregion(Base, Val, Offset); // Convert back to the original element type, if needed. - set(bitcast(Merged)); + set(bitcast(Merged)); } } /// Write a simd_obj_impl-vector into a nested region of a simd_obj_impl /// object. - template - ESIMD_INLINE void - writeRegion(std::pair Region, - const vector_type_t &Val) { + template > + ESIMD_INLINE void writeRegion(std::pair Region, + const vector_type_t &Val) { // parent-region type using PaTy = typename shape_type::type; - using ElemTy = typename TR::element_type; - using BT = typename PaTy::element_type; + using BT = __raw_t; constexpr int BN = PaTy::length; if constexpr (PaTy::Size_in_bytes == TR::Size_in_bytes) { writeRegion(Region.second, bitcast(Val)); } else { // Recursively read the base - auto Base = readRegion(data(), Region.second); + auto Base = readRegion(data(), Region.second); // If element type differs, do bitcast conversion first. auto Base1 = bitcast(Base); constexpr int BN1 = PaTy::Size_in_bytes / sizeof(ElemTy); @@ -574,9 +576,10 @@ template class simd_obj_impl { /// global address space, otherwise behavior is undefined. /// @param flags for the copy operation. If the template parameter Flags is /// is element_aligned_tag, \p addr must be aligned by alignof(T). 
If Flags is - /// vector_aligned_tag, \p addr must be aligned by simd_obj_impl's vector_type - /// alignment. If Flags is overaligned_tag, \p addr must be aligned by N. - /// Program not meeting alignment requirements results in undefined behavior. + /// vector_aligned_tag, \p addr must be aligned by simd_obj_impl's + /// raw_vector_type alignment. If Flags is overaligned_tag, \p addr must be + /// aligned by N. Program not meeting alignment requirements results in + /// undefined behavior. template >> ESIMD_INLINE void copy_from(const Ty *addr, Flags = {}) SYCL_ESIMD_FUNCTION; @@ -589,9 +592,10 @@ template class simd_obj_impl { /// @param offset offset to copy from (in bytes). /// @param flags for the copy operation. If the template parameter Flags is /// is element_aligned_tag, offset must be aligned by alignof(T). If Flags is - /// vector_aligned_tag, offset must be aligned by simd_obj_impl's vector_type - /// alignment. If Flags is overaligned_tag, offset must be aligned by N. - /// Program not meeting alignment requirements results in undefined behavior. + /// vector_aligned_tag, offset must be aligned by simd_obj_impl's + /// raw_vector_type alignment. If Flags is overaligned_tag, offset must be + /// aligned by N. Program not meeting alignment requirements results in + /// undefined behavior. template >> @@ -604,9 +608,10 @@ template class simd_obj_impl { /// global address space, otherwise behavior is undefined. /// @param flags for the copy operation. If the template parameter Flags is /// is element_aligned_tag, \p addr must be aligned by alignof(T). If Flags is - /// vector_aligned_tag, \p addr must be aligned by simd_obj_impl's vector_type - /// alignment. If Flags is overaligned_tag, \p addr must be aligned by N. - /// Program not meeting alignment requirements results in undefined behavior. + /// vector_aligned_tag, \p addr must be aligned by simd_obj_impl's + /// raw_vector_type alignment. If Flags is overaligned_tag, \p addr must be + /// aligned by N. Program not meeting alignment requirements results in + /// undefined behavior. template >> ESIMD_INLINE void copy_to(Ty *addr, Flags = {}) const SYCL_ESIMD_FUNCTION; @@ -618,9 +623,10 @@ template class simd_obj_impl { /// @param offset offset to copy from. /// @param flags for the copy operation. If the template parameter Flags is /// is element_aligned_tag, offset must be aligned by alignof(T). If Flags is - /// vector_aligned_tag, offset must be aligned by simd_obj_impl's vector_type - /// alignment. If Flags is overaligned_tag, offset must be aligned by N. - /// Program not meeting alignment requirements results in undefined behavior. + /// vector_aligned_tag, offset must be aligned by simd_obj_impl's + /// raw_vector_type alignment. If Flags is overaligned_tag, offset must be + /// aligned by N. Program not meeting alignment requirements results in + /// undefined behavior. template >> @@ -630,21 +636,22 @@ template class simd_obj_impl { /// @} // Memory operations + // Unary operations. + /// Bitwise inversion, available in all subclasses. template >> Derived operator~() const { - return Derived(~data()); + return Derived{ + detail::vector_unary_op(data())}; } /// Unary logical negation operator, available in all subclasses. - /// Similarly to C++, where !x returns bool, !simd returns as simd_mask, where + /// Similarly to C++, where !x returns bool, !simd returns a simd_mask, where /// each element is a result of comparision with zero. 
+ /// No need to implement via detail::vector_unary_op template >> simd_mask_type operator!() const { - using MaskVecT = typename simd_mask_type::vector_type; - auto R = data() == vector_type(0); - return simd_mask_type{__builtin_convertvector(R, MaskVecT) & - MaskVecT(1)}; + return *this == 0; } #define __ESIMD_DEF_SIMD_OBJ_IMPL_OPASSIGN(BINOP, OPASSIGN, COND) \ @@ -656,7 +663,9 @@ template class simd_obj_impl { Derived &operator OPASSIGN( \ const __SEIEED::simd_obj_impl &RHS) { \ auto Res = *this BINOP RHS; \ - set(__SEIEED::convert(Res.data())); \ + using ResT = decltype(Res); \ + set(__SEIEED::convert_vector(Res.data())); \ return cast_this_to_derived(); \ } \ \ @@ -670,7 +679,9 @@ template class simd_obj_impl { Derived &operator OPASSIGN( \ const __SEIEE::simd_view &RHS) { \ auto Res = *this BINOP RHS.read(); \ - set(__SEIEED::convert(Res.data())); \ + using ResT = decltype(Res); \ + set(__SEIEED::convert_vector(Res.data())); \ return cast_this_to_derived(); \ } \ \ @@ -681,7 +692,7 @@ template class simd_obj_impl { using RHSVecT = __SEIEED::construct_a_simd_type_t; \ return *this OPASSIGN RHSVecT(RHS); \ } else { \ - return *this OPASSIGN Derived((Ty)RHS); \ + return *this OPASSIGN Derived((RawTy)RHS); \ } \ } @@ -720,14 +731,14 @@ template class simd_obj_impl { private: // The underlying data for this vector. - vector_type M_data; + raw_vector_type M_data; protected: - void set(const vector_type &Val) { + void set(const raw_vector_type &Val) { #ifndef __SYCL_DEVICE_ONLY__ M_data = Val; #else - __esimd_vstore(&M_data, Val); + __esimd_vstore(&M_data, Val); #endif } }; @@ -736,8 +747,10 @@ template class simd_obj_impl { template template -void simd_obj_impl::copy_from(const T *Addr, - Flags) SYCL_ESIMD_FUNCTION { +void simd_obj_impl::copy_from( + const simd_obj_impl::element_type *Addr, + Flags) SYCL_ESIMD_FUNCTION { + using UT = simd_obj_impl::element_type; constexpr unsigned Size = sizeof(T) * N; constexpr unsigned Align = Flags::template alignment; @@ -751,14 +764,14 @@ void simd_obj_impl::copy_from(const T *Addr, constexpr unsigned BlockN = BlockSize / sizeof(T); ForHelper::unroll([BlockN, Addr, this](unsigned Block) { select(Block * BlockN) = - block_load(Addr + (Block * BlockN), Flags{}); + block_load(Addr + (Block * BlockN), Flags{}); }); } if constexpr (RemSize > 0) { constexpr unsigned RemN = RemSize / sizeof(T); constexpr unsigned BlockN = BlockSize / sizeof(T); select(NumBlocks * BlockN) = - block_load(Addr + (NumBlocks * BlockN), Flags{}); + block_load(Addr + (NumBlocks * BlockN), Flags{}); } } else if constexpr (sizeof(T) == 8) { simd BC(reinterpret_cast(Addr), Flags{}); @@ -769,7 +782,7 @@ void simd_obj_impl::copy_from(const T *Addr, simd Offsets(0u, sizeof(T)); ForHelper::unroll([Addr, &Offsets, this](unsigned Block) { select(Block * ChunkSize) = - gather(Addr + (Block * ChunkSize), Offsets); + gather(Addr + (Block * ChunkSize), Offsets); }); } constexpr unsigned RemN = N % ChunkSize; @@ -779,14 +792,14 @@ void simd_obj_impl::copy_from(const T *Addr, } else if constexpr (RemN == 8 || RemN == 16) { simd Offsets(0u, sizeof(T)); select(NumChunks * ChunkSize) = - gather(Addr + (NumChunks * ChunkSize), Offsets); + gather(Addr + (NumChunks * ChunkSize), Offsets); } else { constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 
16 : 32; simd_mask_type Pred(0); Pred.template select() = 1; simd Offsets(0u, sizeof(T)); - simd Vals = - gather(Addr + (NumChunks * ChunkSize), Offsets, Pred); + simd Vals = + gather(Addr + (NumChunks * ChunkSize), Offsets, Pred); select(NumChunks * ChunkSize) = Vals.template select(); } @@ -800,6 +813,8 @@ ESIMD_INLINE EnableIfAccessor simd_obj_impl::copy_from(AccessorT acc, uint32_t offset, Flags) SYCL_ESIMD_FUNCTION { + using UT = simd_obj_impl::element_type; + static_assert(sizeof(UT) == sizeof(T)); constexpr unsigned Size = sizeof(T) * N; constexpr unsigned Align = Flags::template alignment; @@ -813,7 +828,7 @@ simd_obj_impl::copy_from(AccessorT acc, uint32_t offset, constexpr unsigned BlockN = BlockSize / sizeof(T); ForHelper::unroll([BlockN, acc, offset, this](unsigned Block) { select(Block * BlockN) = - block_load( + block_load( acc, offset + (Block * BlockSize), Flags{}); }); } @@ -821,7 +836,7 @@ simd_obj_impl::copy_from(AccessorT acc, uint32_t offset, constexpr unsigned RemN = RemSize / sizeof(T); constexpr unsigned BlockN = BlockSize / sizeof(T); select(NumBlocks * BlockN) = - block_load( + block_load( acc, offset + (NumBlocks * BlockSize), Flags{}); } } else if constexpr (sizeof(T) == 8) { @@ -834,7 +849,7 @@ simd_obj_impl::copy_from(AccessorT acc, uint32_t offset, ForHelper::unroll( [acc, offset, &Offsets, this](unsigned Block) { select(Block * ChunkSize) = - gather( + gather( acc, Offsets, offset + (Block * ChunkSize * sizeof(T))); }); } @@ -842,14 +857,14 @@ simd_obj_impl::copy_from(AccessorT acc, uint32_t offset, if constexpr (RemN > 0) { if constexpr (RemN == 1 || RemN == 8 || RemN == 16) { simd Offsets(0u, sizeof(T)); - select(NumChunks * ChunkSize) = gather( + select(NumChunks * ChunkSize) = gather( acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T))); } else { constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 
16 : 32; simd_mask_type Pred(0); Pred.template select() = 1; simd Offsets(0u, sizeof(T)); - simd Vals = gather( + simd Vals = gather( acc, Offsets, offset + (NumChunks * ChunkSize * sizeof(T)), Pred); select(NumChunks * ChunkSize) = Vals.template select(); @@ -860,8 +875,10 @@ simd_obj_impl::copy_from(AccessorT acc, uint32_t offset, template template -void simd_obj_impl::copy_to(T *addr, - Flags) const SYCL_ESIMD_FUNCTION { +void simd_obj_impl::copy_to( + simd_obj_impl::element_type *Addr, + Flags) const SYCL_ESIMD_FUNCTION { + using UT = simd_obj_impl::element_type; constexpr unsigned Size = sizeof(T) * N; constexpr unsigned Align = Flags::template alignment; @@ -869,52 +886,52 @@ void simd_obj_impl::copy_to(T *addr, constexpr unsigned NumBlocks = Size / BlockSize; constexpr unsigned RemSize = Size % BlockSize; - simd Tmp = data(); + simd Tmp{data()}; if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 && detail::isPowerOf2(RemSize / OperandSize::OWORD)) { if constexpr (NumBlocks > 0) { constexpr unsigned BlockN = BlockSize / sizeof(T); - ForHelper::unroll([BlockN, addr, &Tmp](unsigned Block) { - block_store(addr + (Block * BlockN), - Tmp.template select(Block * BlockN)); + ForHelper::unroll([BlockN, Addr, &Tmp](unsigned Block) { + block_store(Addr + (Block * BlockN), + Tmp.template select(Block * BlockN)); }); } if constexpr (RemSize > 0) { constexpr unsigned RemN = RemSize / sizeof(T); constexpr unsigned BlockN = BlockSize / sizeof(T); - block_store(addr + (NumBlocks * BlockN), - Tmp.template select(NumBlocks * BlockN)); + block_store(Addr + (NumBlocks * BlockN), + Tmp.template select(NumBlocks * BlockN)); } } else if constexpr (sizeof(T) == 8) { simd BC = Tmp.template bit_cast_view(); - BC.copy_to(reinterpret_cast(addr), Flags{}); + BC.copy_to(reinterpret_cast(Addr), Flags{}); } else { constexpr unsigned NumChunks = N / ChunkSize; if constexpr (NumChunks > 0) { simd Offsets(0u, sizeof(T)); - ForHelper::unroll([addr, &Offsets, &Tmp](unsigned Block) { - scatter( - addr + (Block * ChunkSize), Offsets, + ForHelper::unroll([Addr, &Offsets, &Tmp](unsigned Block) { + scatter( + Addr + (Block * ChunkSize), Offsets, Tmp.template select(Block * ChunkSize)); }); } constexpr unsigned RemN = N % ChunkSize; if constexpr (RemN > 0) { if constexpr (RemN == 1) { - addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize]; + Addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize]; } else if constexpr (RemN == 8 || RemN == 16) { simd Offsets(0u, sizeof(T)); - scatter(addr + (NumChunks * ChunkSize), Offsets, - Tmp.template select(NumChunks * ChunkSize)); + scatter(Addr + (NumChunks * ChunkSize), Offsets, + Tmp.template select(NumChunks * ChunkSize)); } else { constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 
16 : 32; simd_mask_type Pred(0); Pred.template select() = 1; - simd Vals(0); + simd Vals; Vals.template select() = Tmp.template select(NumChunks * ChunkSize); simd Offsets(0u, sizeof(T)); - scatter(addr + (NumChunks * ChunkSize), Offsets, Vals, Pred); + scatter(Addr + (NumChunks * ChunkSize), Offsets, Vals, Pred); } } } @@ -926,6 +943,7 @@ ESIMD_INLINE EnableIfAccessor simd_obj_impl::copy_to(AccessorT acc, uint32_t offset, Flags) const SYCL_ESIMD_FUNCTION { + using UT = simd_obj_impl::element_type; constexpr unsigned Size = sizeof(T) * N; constexpr unsigned Align = Flags::template alignment; @@ -933,13 +951,14 @@ simd_obj_impl::copy_to(AccessorT acc, uint32_t offset, constexpr unsigned NumBlocks = Size / BlockSize; constexpr unsigned RemSize = Size % BlockSize; - simd Tmp = data(); + simd Tmp{data()}; + if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 && detail::isPowerOf2(RemSize / OperandSize::OWORD)) { if constexpr (NumBlocks > 0) { constexpr unsigned BlockN = BlockSize / sizeof(T); ForHelper::unroll([BlockN, acc, offset, &Tmp](unsigned Block) { - block_store( + block_store( acc, offset + (Block * BlockSize), Tmp.template select(Block * BlockN)); }); @@ -947,7 +966,7 @@ simd_obj_impl::copy_to(AccessorT acc, uint32_t offset, if constexpr (RemSize > 0) { constexpr unsigned RemN = RemSize / sizeof(T); constexpr unsigned BlockN = BlockSize / sizeof(T); - block_store( + block_store( acc, offset + (NumBlocks * BlockSize), Tmp.template select(NumBlocks * BlockN)); } @@ -960,7 +979,7 @@ simd_obj_impl::copy_to(AccessorT acc, uint32_t offset, simd Offsets(0u, sizeof(T)); ForHelper::unroll([acc, offset, &Offsets, &Tmp](unsigned Block) { - scatter( + scatter( acc, Offsets, Tmp.template select(Block * ChunkSize), offset + (Block * ChunkSize * sizeof(T))); }); @@ -969,20 +988,20 @@ simd_obj_impl::copy_to(AccessorT acc, uint32_t offset, if constexpr (RemN > 0) { if constexpr (RemN == 1 || RemN == 8 || RemN == 16) { simd Offsets(0u, sizeof(T)); - scatter( + scatter( acc, Offsets, Tmp.template select(NumChunks * ChunkSize), offset + (NumChunks * ChunkSize * sizeof(T))); } else { constexpr int N1 = RemN < 8 ? 8 : RemN < 16 ? 16 : 32; simd_mask_type Pred(0); Pred.template select() = 1; - simd Vals(0); + simd Vals; Vals.template select() = Tmp.template select(NumChunks * ChunkSize); simd Offsets(0u, sizeof(T)); - scatter(acc, Offsets, Vals, - offset + (NumChunks * ChunkSize * sizeof(T)), - Pred); + scatter(acc, Offsets, Vals, + offset + (NumChunks * ChunkSize * sizeof(T)), + Pred); } } } diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_view_impl.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_view_impl.hpp index d87f64e464909..d4be9820773e2 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_view_impl.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_view_impl.hpp @@ -11,7 +11,7 @@ #pragma once #include -#include +#include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { @@ -51,12 +51,13 @@ class simd_view_impl { /// The element type of this class, which could be different from the element /// type of the base object type. using element_type = typename ShapeTy::element_type; + using raw_element_type = __raw_t; /// The simd type if reading the object. using value_type = get_simd_t; /// The underlying builtin vector type backing the value read from the object. 
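// NOTE (editor's illustration, not part of this patch): the element_type vs.
// raw_element_type/raw_vector_type split introduced above, seen from user code.
// The raw type is an implementation detail (different on host and device for
// sycl::half) and is never spelled by the user; only wrapper values cross the
// API surface. Assumes the ESIMD header and namespace are in scope.
void wrapper_vs_raw_example() SYCL_ESIMD_FUNCTION { // hypothetical helper
  simd<sycl::half, 8> h(0, 1);
  static_assert(std::is_same_v<decltype(h)::element_type, sycl::half>, "");
  sycl::half x = h[3]; // read back as the wrapper type; conversion is internal
  (void)x;
}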
- using vector_type = vector_type_t; + using raw_vector_type = vector_type_t<__raw_t, length>; private: Derived &cast_this_to_derived() { return reinterpret_cast(*this); } @@ -83,7 +84,7 @@ class simd_view_impl { if constexpr (std::is_same_v) return read(); else - return convert(read()); + return convert_vector(read().data()); } /// Implicit conversion to simd_mask_impl type, if element type is compatible. @@ -117,7 +118,7 @@ class simd_view_impl { return value_type{readRegion(M_base.data(), M_region)}; } - typename value_type::vector_type data() const { return read().data(); } + typename value_type::raw_vector_type data() const { return read().data(); } /// Write to this object. Derived &write(const value_type &Val) { @@ -259,7 +260,8 @@ class simd_view_impl { #undef __ESIMD_SHIFT_OP_FILTER #define __ESIMD_ARITH_OP_FILTER \ - is_vectorizable_v &&is_vectorizable_v &&is_simd_type_v + is_valid_simd_elem_type_v &&is_valid_simd_elem_type_v \ + &&is_simd_type_v __ESIMD_DEF_SIMD_VIEW_IMPL_OPASSIGN(+, +=, __ESIMD_ARITH_OP_FILTER) __ESIMD_DEF_SIMD_VIEW_IMPL_OPASSIGN(-, -=, __ESIMD_ARITH_OP_FILTER) @@ -286,7 +288,7 @@ class simd_view_impl { template >> auto operator!() { - using MaskVecT = typename simd_mask_type::vector_type; + using MaskVecT = typename simd_mask_type::raw_vector_type; auto V = read().data() == 0; return simd_mask_type{__builtin_convertvector(V, MaskVecT) & MaskVecT(1)}; @@ -313,12 +315,13 @@ class simd_view_impl { is_simd_type_v)&&(length == SimdT::length)>> Derived &operator=(const simd_obj_impl &Other) { - return write(convert(reinterpret_cast(Other))); + return write(convert_vector( + Other.data())); } - template >> + template >> Derived &operator=(T1 RHS) { - return write(value_type((element_type)RHS)); + return write(value_type(convert_scalar(RHS))); } /// @} diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/type_format.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/type_format.hpp new file mode 100644 index 0000000000000..0af9a1a6718f2 --- /dev/null +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/type_format.hpp @@ -0,0 +1,84 @@ +//==-------------- types.hpp - DPC++ Explicit SIMD API ---------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Meta-functions to compute compile-time element type of a simd_view resulting +// from format operations. +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +__SYCL_INLINE_NAMESPACE(cl) { +namespace __SEIEED { + +template struct compute_format_type; + +template struct compute_format_type_impl { + static constexpr int Size = sizeof(Ty) * N / sizeof(EltTy); + static constexpr int Stride = 1; + using type = region1d_t; +}; + +template class SimdT> +struct compute_format_type, EltTy> + : compute_format_type_impl {}; + +template +struct compute_format_type, EltTy> { + using ShapeTy = typename shape_type::type; + static constexpr int Size = ShapeTy::Size_in_bytes / sizeof(EltTy); + static constexpr int Stride = 1; + using type = region1d_t; +}; + +template +using compute_format_type_t = typename compute_format_type::type; + +// Compute the simd_view type of a 2D format operation. 
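// NOTE (editor's illustration, not part of this patch): what these
// compute_format_type meta-functions drive at the user level - bit_cast_view
// retypes the same bits, so a 1D view has Size = sizeof(Ty) * N / sizeof(EltTy)
// elements (mirrors test_simd_format in sycl/test/esimd/simd.cpp below).
void bit_cast_view_example() SYCL_ESIMD_FUNCTION { // hypothetical helper
  simd<float, 8> v(0, 1);
  auto bytes = v.bit_cast_view<char>();       // 1D view: 8 * 4 / 1 = 32 elements
  auto mat   = v.bit_cast_view<char, 4, 8>(); // 2D view: 4 rows x 8 columns
  static_assert(decltype(bytes)::length == 32, "");
  // decltype(mat)::getSizeX() == 4 and getSizeY() == 8, cf. the test below.
}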
+template +struct compute_format_type_2d; + +template +struct compute_format_type_2d_impl { + static constexpr int Prod = sizeof(Ty) * N / sizeof(EltTy); + static_assert(Prod == Width * Height, "size mismatch"); + + static constexpr int SizeX = Width; + static constexpr int StrideX = 1; + static constexpr int SizeY = Height; + static constexpr int StrideY = 1; + using type = region2d_t; +}; + +template class SimdT> +struct compute_format_type_2d, EltTy, Height, Width> + : compute_format_type_2d_impl {}; + +template +struct compute_format_type_2d, EltTy, Height, + Width> { + using ShapeTy = typename shape_type::type; + static constexpr int Prod = ShapeTy::Size_in_bytes / sizeof(EltTy); + static_assert(Prod == Width * Height, "size mismatch"); + + static constexpr int SizeX = Width; + static constexpr int StrideX = 1; + static constexpr int SizeY = Height; + static constexpr int StrideY = 1; + using type = region2d_t; +}; + +template +using compute_format_type_2d_t = + typename compute_format_type_2d::type; + +} // namespace __SEIEED +} // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/types.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/types.hpp index 9d82cccbf1aaa..2bd5a4defb676 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/types.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/types.hpp @@ -24,6 +24,10 @@ #include +#define __SEIEED sycl::ext::intel::experimental::esimd::detail +#define __SEIEE sycl::ext::intel::experimental::esimd +#define __SEIEEED sycl::ext::intel::experimental::esimd::emu::detail + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace ext { @@ -37,9 +41,19 @@ template class simd_view; namespace detail { +namespace csd = cl::sycl::detail; + +template +using uint_type_t = std::conditional_t< + N == 1, uint8_t, + std::conditional_t< + N == 2, uint16_t, + std::conditional_t>>>; + // forward declarations of major internal simd classes template class simd_mask_impl; -template +template class simd_obj_impl; // @{ @@ -72,100 +86,6 @@ static inline constexpr bool is_clang_vector_type_v = // @} -// @{ -// Checks if given type T derives from simd_obj_impl or is equal to it. -template -struct is_simd_obj_impl_derivative : public std::false_type { - using element_type = invalid_element_type; -}; - -// Specialization for the simd_obj_impl type itself. -template -struct is_simd_obj_impl_derivative> - : public std::true_type { - using element_type = ElT; -}; - -// Specialization for all other types. -template class Derived> -struct is_simd_obj_impl_derivative> - : public std::conditional_t< - std::is_base_of_v>, - Derived>, - std::true_type, std::false_type> { - using element_type = std::conditional_t< - std::is_base_of_v>, - Derived>, - ElT, void>; -}; - -// Convenience shortcut. -template -inline constexpr bool is_simd_obj_impl_derivative_v = - is_simd_obj_impl_derivative::value; -// @} - -// @{ -// "Resizes" given simd type \c T to given number of elements \c N. -template struct resize_a_simd_type; - -// Specialization for the simd_obj_impl type. -template class SimdT> -struct resize_a_simd_type>, Ndst> { - using type = simd_obj_impl>; -}; - -// Specialization for the simd_obj_impl type derivatives. -template class SimdT> -struct resize_a_simd_type, Ndst> { - using type = SimdT; -}; - -// Convenience shortcut. 
-template -using resize_a_simd_type_t = typename resize_a_simd_type::type; -// @} - -// @{ -// Converts element type of given simd type \c SimdT to -// given scalar type \c DstElemT. -template struct convert_simd_elem_type; - -// Specialization for the simd_obj_impl type. -template class SimdT> -struct convert_simd_elem_type>, - DstElemT> { - using type = simd_obj_impl>; -}; - -// Specialization for the simd_obj_impl type derivatives. -template class SimdT> -struct convert_simd_elem_type, DstElemT> { - using type = SimdT; -}; - -// Convenience shortcut. -template -using convert_simd_elem_type_t = - typename convert_simd_elem_type::type; - -// @} - -// Constructs a simd type with the same template type as in \c SimdT, and -// given element type and number. -template -using construct_a_simd_type_t = - convert_simd_elem_type_t, ElT>; - -// @} - -namespace csd = cl::sycl::detail; -using half = cl::sycl::detail::half_impl::StorageT; - template using remove_cvref_t = csd::remove_cv_t>; @@ -198,8 +118,15 @@ struct is_vectorizable : std::conditional_t, template static inline constexpr bool is_vectorizable_v = is_vectorizable::value; -// vector_type, using clang vector type extension. -template struct vector_type { +template +static inline constexpr bool is_esimd_scalar_v = + cl::sycl::detail::is_arithmetic::value; + +template +using is_esimd_scalar = typename std::bool_constant>; + +// raw_vector_type, using clang vector type extension. +template struct raw_vector_type { static_assert(!std::is_const::value, "const element type not supported"); static_assert(is_vectorizable_v, "element type not supported"); static_assert(N > 0, "zero-element vector not supported"); @@ -209,85 +136,96 @@ template struct vector_type { }; template -using vector_type_t = typename vector_type::type; +using vector_type_t = typename raw_vector_type::type; -// must match simd_mask::element_type -template -using simd_mask_storage_t = vector_type_t; +// @{ +// Checks if given type T derives from simd_obj_impl or is equal to it. +template +struct is_simd_obj_impl_derivative : public std::false_type {}; -// Compute the simd_view type of a 1D format operation. -template struct compute_format_type; +// Specialization for the simd_obj_impl type itself. +template +struct is_simd_obj_impl_derivative> + : public std::true_type {}; -template struct compute_format_type_impl { - static constexpr int Size = sizeof(Ty) * N / sizeof(EltTy); - static constexpr int Stride = 1; - using type = region1d_t; -}; +template struct element_type_traits; +template +using __raw_t = typename __SEIEED::element_type_traits::RawT; -template class SimdT> -struct compute_format_type, EltTy> - : compute_format_type_impl {}; - -template -struct compute_format_type, EltTy> - : compute_format_type_impl {}; - -template -struct compute_format_type, EltTy> { - using ShapeTy = typename shape_type::type; - static constexpr int Size = ShapeTy::Size_in_bytes / sizeof(EltTy); - static constexpr int Stride = 1; - using type = region1d_t; +// Specialization for all other types. +template class Derived> +struct is_simd_obj_impl_derivative> + : public std::conditional_t< + std::is_base_of_v, N, Derived>, + Derived>, + std::true_type, std::false_type> {}; + +// Convenience shortcut. +template +inline constexpr bool is_simd_obj_impl_derivative_v = + is_simd_obj_impl_derivative::value; +// @} + +// @{ +// "Resizes" given simd type \c T to given number of elements \c N. +template struct resize_a_simd_type; + +// Specialization for the simd_obj_impl type. 
+template class SimdT> +struct resize_a_simd_type, Nsrc, SimdT>, + Ndst> { + using type = simd_obj_impl<__raw_t, Ndst, SimdT>; }; -template -using compute_format_type_t = typename compute_format_type::type; +// Specialization for the simd_obj_impl type derivatives. +template class SimdT> +struct resize_a_simd_type, Ndst> { + using type = SimdT; +}; -// Compute the simd_view type of a 2D format operation. -template -struct compute_format_type_2d; +// Convenience shortcut. +template +using resize_a_simd_type_t = typename resize_a_simd_type::type; +// @} -template -struct compute_format_type_2d_impl { - static constexpr int Prod = sizeof(Ty) * N / sizeof(EltTy); - static_assert(Prod == Width * Height, "size mismatch"); +// @{ +// Converts element type of given simd type \c SimdT to +// given scalar type \c NewElemT. +template struct convert_simd_elem_type; - static constexpr int SizeX = Width; - static constexpr int StrideX = 1; - static constexpr int SizeY = Height; - static constexpr int StrideY = 1; - using type = region2d_t; +// Specialization for the simd_obj_impl type. +template class SimdT> +struct convert_simd_elem_type< + simd_obj_impl<__raw_t, N, SimdT>, NewElemT> { + using type = simd_obj_impl<__raw_t, N, SimdT>; }; -template class SimdT> -struct compute_format_type_2d, EltTy, Height, Width> - : compute_format_type_2d_impl {}; - -template -struct compute_format_type_2d, EltTy, Height, Width> - : compute_format_type_2d_impl {}; - -template -struct compute_format_type_2d, EltTy, Height, - Width> { - using ShapeTy = typename shape_type::type; - static constexpr int Prod = ShapeTy::Size_in_bytes / sizeof(EltTy); - static_assert(Prod == Width * Height, "size mismatch"); - - static constexpr int SizeX = Width; - static constexpr int StrideX = 1; - static constexpr int SizeY = Height; - static constexpr int StrideY = 1; - using type = region2d_t; +struct convert_simd_elem_type, NewElemT> { + using type = SimdT; }; -template -using compute_format_type_2d_t = - typename compute_format_type_2d::type; +// Convenience shortcut. +template +using convert_simd_elem_type_t = + typename convert_simd_elem_type::type; + +// @} + +// Constructs a simd type with the same template type as in \c SimdT, and +// given element type and number. +template +using construct_a_simd_type_t = + convert_simd_elem_type_t, T>; + +// @} + +// must match simd_mask::element_type +template +using simd_mask_storage_t = vector_type_t; // @{ // Checks if given type is a view of any simd type (simd or simd_mask). 
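// NOTE (editor's illustration, not part of this patch): behavior of the detail
// meta-functions above. They are internal helpers, shown only to illustrate
// that both preserve the concrete simd template while changing the length or
// the element type; construct_a_simd_type_t combines the two steps.
static_assert(std::is_same_v<detail::resize_a_simd_type_t<simd<float, 8>, 16>,
                             simd<float, 16>>, "");
static_assert(std::is_same_v<detail::convert_simd_elem_type_t<simd<float, 8>, int>,
                             simd<int, 8>>, "");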
@@ -371,56 +309,37 @@ struct element_type>> { using type = typename is_clang_vector_type::element_type; }; -// @} - -// @{ -// Get computation type of a binary operator given its operand types: -// - if both types are arithmetic - return CPP's "common real type" of the -// computation (matches C++) -// - if both types are simd types, they must be of the same length N, -// and the returned type is simd, where N is the "common real type" of -// the element type of the operands (diverges from clang) -// - otherwise, one type is simd and another is arithmetic - the simd type is -// returned (matches clang) - -struct invalid_computation_type; - -template struct computation_type { - using type = invalid_computation_type; -}; +template using element_type_t = typename element_type::type; -template -struct computation_type< - T1, T2, std::enable_if_t && is_vectorizable_v>> { - using type = decltype(std::declval() + std::declval()); +// Determine element type of simd_obj_impl's Derived type w/o having to have +// complete instantiation of the Derived type (is required by element_type_t, +// hence can't be used here). +template struct simd_like_obj_info; +template struct simd_like_obj_info> { + using type = T; + static inline constexpr int length = N; }; - -template -struct computation_type< - T1, T2, - std::enable_if_t && is_simd_like_type_v>> { -private: - using Ty1 = typename element_type::type; - using Ty2 = typename element_type::type; - using EltTy = typename computation_type::type; - static constexpr int N1 = T1::length; - static constexpr int N2 = T2::length; - static_assert(N1 == N2, "size mismatch"); - -public: - using type = simd; +template struct simd_like_obj_info> { + using type = simd_mask_elem_type; // equals T + static inline constexpr int length = N; }; -template -using computation_type_t = - typename computation_type, remove_cvref_t>::type; +template +using simd_like_obj_element_type_t = typename simd_like_obj_info::type; +template +static inline constexpr int simd_like_obj_length = + simd_like_obj_info::length; // @} template std::enable_if_t && is_clang_vector_type_v, To> -convert(From Val) { - return __builtin_convertvector(Val, To); + ESIMD_INLINE convert(From Val) { + if constexpr (std::is_same_v) { + return Val; + } else { + return __builtin_convertvector(Val, To); + } } /// Base case for checking if a type U is one of the types. @@ -462,18 +381,6 @@ bitcast(vector_type_t Val) { return reinterpret_cast(Val); } -inline std::ostream &operator<<(std::ostream &O, half const &rhs) { - O << static_cast(rhs); - return O; -} - -inline std::istream &operator>>(std::istream &I, half &rhs) { - float ValFloat = 0.0f; - I >> ValFloat; - rhs = ValFloat; - return I; -} - } // namespace detail // Alias for backward compatibility. 
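// NOTE (editor's illustration, not part of this patch): uint_type_t<N>, defined
// near the top of this header, selects an unsigned integer type by byte size;
// the sub-dword gather/scatter paths in memory.hpp use it to reinterpret small
// wrapper types before promotion. The 4- and 8-byte cases are assumed to map to
// uint32_t / uint64_t. Requires <cstdint>.
static_assert(std::is_same_v<detail::uint_type_t<1>, uint8_t>, "");
static_assert(std::is_same_v<detail::uint_type_t<2>, uint16_t>, "");
static_assert(std::is_same_v<detail::uint_type_t<4>, uint32_t>, ""); // assumed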
diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/util.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/util.hpp index 12d25e1dbd9af..5481d100c92a7 100755 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/util.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/util.hpp @@ -15,10 +15,6 @@ #include -#define __SEIEED sycl::ext::intel::experimental::esimd::detail -#define __SEIEE sycl::ext::intel::experimental::esimd -#define __SEIEEED sycl::ext::intel::experimental::esimd::emu::detail - #ifdef __SYCL_DEVICE_ONLY__ #define __ESIMD_INTRIN SYCL_EXTERNAL SYCL_ESIMD_FUNCTION #else @@ -92,10 +88,6 @@ template struct is_esimd_vector : public std::false_type {}; template struct is_esimd_vector> : public std::true_type {}; -template -using is_esimd_scalar = - typename std::bool_constant::value>; - template using is_hw_int_type = typename std::bool_constant && (sizeof(T) == N)>; @@ -119,7 +111,7 @@ using is_fp_or_dword_type = /// Convert types into vector types template struct simd_type { using type = simd; }; -template struct simd_type> { +template struct simd_type> { using type = simd; }; diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index 4b3a74db9e3c5..5c30ff6df7a83 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -137,13 +137,14 @@ __ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc) { // /// Flat-address gather. /// \ingroup sycl_esimd -template +template > __ESIMD_API std::enable_if_t<((n == 8 || n == 16 || n == 32) && (ElemsPerAddr == 1 || ElemsPerAddr == 2 || ElemsPerAddr == 4)), - simd> -gather(const T *p, simd offsets, simd_mask pred = 1) { + simd> +gather(const Tx *p, simd offsets, simd_mask pred = 1) { detail::IfNotNone::warn(); simd offsets_i = convert(offsets); simd addrs(reinterpret_cast(p)); @@ -177,11 +178,12 @@ gather(const T *p, simd offsets, simd_mask pred = 1) { /// Flat-address scatter. /// \ingroup sycl_esimd -template +template > __ESIMD_API std::enable_if_t<((n == 8 || n == 16 || n == 32) && (ElemsPerAddr == 1 || ElemsPerAddr == 2 || ElemsPerAddr == 4))> -scatter(T *p, simd offsets, simd vals, +scatter(Tx *p, simd offsets, simd vals, simd_mask pred = 1) { simd offsets_i = convert(offsets); simd addrs(reinterpret_cast(p)); @@ -231,10 +233,11 @@ __ESIMD_API std::enable_if_t<((n == 8 || n == 16 || n == 32) && /// Flat-address block-load. /// \ingroup sycl_esimd -template , typename = std::enable_if_t>> -__ESIMD_API simd block_load(const T *addr, Flags = {}) { +__ESIMD_API simd block_load(const Tx *addr, Flags = {}) { detail::IfNotNone::warn(); constexpr unsigned Sz = sizeof(T) * n; static_assert(Sz >= detail::OperandSize::OWORD, @@ -257,10 +260,12 @@ __ESIMD_API simd block_load(const T *addr, Flags = {}) { /// Accessor-based block-load. /// \ingroup sycl_esimd -template >> -__ESIMD_API simd block_load(AccessorTy acc, uint32_t offset, Flags = {}) { + typename = std::enable_if_t>, + class T = detail::__raw_t> +__ESIMD_API simd block_load(AccessorTy acc, uint32_t offset, + Flags = {}) { constexpr unsigned Sz = sizeof(T) * n; static_assert(Sz >= detail::OperandSize::OWORD, "block size must be at least 1 oword"); @@ -295,9 +300,9 @@ __ESIMD_API simd block_load(AccessorTy acc, uint32_t offset, Flags = {}) { /// Flat-address block-store. /// \ingroup sycl_esimd // TODO the above note about cache hints applies to this API as well. 
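// NOTE (editor's illustration, not part of this patch): with the Tx /
// detail::__raw_t redirection above, the USM memory API accepts wrapper element
// types such as sycl::half - the public signatures keep the user-visible type
// while the intrinsics operate on the raw type. `src`/`dst` are hypothetical,
// suitably aligned sycl::half* USM pointers; offsets are byte offsets.
void wrapper_memory_example(const sycl::half *src, sycl::half *dst) SYCL_ESIMD_FUNCTION {
  simd<uint32_t, 16> byte_offs(0, sizeof(sycl::half));             // 0, 2, 4, ...
  simd<sycl::half, 16> g = gather<sycl::half, 16>(src, byte_offs); // element loads
  simd<sycl::half, 16> b = block_load<sycl::half, 16>(src);        // contiguous load
  block_store<sycl::half, 16>(dst, b);
  scatter<sycl::half, 16>(dst, byte_offs, g);
}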
-template -__ESIMD_API void block_store(T *p, simd vals) { +template > +__ESIMD_API void block_store(Tx *p, simd vals) { detail::IfNotNone::warn(); constexpr unsigned Sz = sizeof(T) * n; static_assert(Sz >= detail::OperandSize::OWORD, @@ -315,8 +320,10 @@ __ESIMD_API void block_store(T *p, simd vals) { /// Accessor-based block-store. /// \ingroup sycl_esimd -template -__ESIMD_API void block_store(AccessorTy acc, uint32_t offset, simd vals) { +template > +__ESIMD_API void block_store(AccessorTy acc, uint32_t offset, + simd vals) { constexpr unsigned Sz = sizeof(T) * n; static_assert(Sz >= detail::OperandSize::OWORD, "block size must be at least 1 oword"); @@ -352,12 +359,14 @@ ESIMD_INLINE const auto si = __ESIMD_GET_SURF_HANDLE(acc); if constexpr (sizeof(T) < 4) { - static_assert(std::is_integral::value, - "only integral 1- & 2-byte types are supported"); + using Tint = std::conditional_t, T, + detail::uint_type_t>; + using Treal = __raw_t; + simd vals_int = bitcast(std::move(vals).data()); using PromoT = - typename sycl::detail::conditional_t::value, int32_t, - uint32_t>; - const simd promo_vals = convert(vals); + typename sycl::detail::conditional_t::value, + int32_t, uint32_t>; + const simd promo_vals = convert(std::move(vals_int)); __esimd_scatter_scaled( pred.data(), si, glob_offset, offsets.data(), promo_vals.data()); } else { @@ -380,16 +389,25 @@ gather_impl(AccessorTy acc, simd offsets, uint32_t glob_offset, const auto si = get_surface_index(acc); if constexpr (sizeof(T) < 4) { - static_assert(std::is_integral::value, + using Tint = std::conditional_t, T, + detail::uint_type_t>; + using Treal = __raw_t; + static_assert(std::is_integral::value, "only integral 1- & 2-byte types are supported"); using PromoT = - typename sycl::detail::conditional_t::value, int32_t, - uint32_t>; + typename sycl::detail::conditional_t::value, + int32_t, uint32_t>; const simd promo_vals = __esimd_gather_masked_scaled2(si, glob_offset, offsets.data(), pred.data()); - return convert(promo_vals); + auto Res = convert(promo_vals); + + if constexpr (!std::is_same_v) { + return detail::bitcast(Res.data()); + } else { + return Res; + } } else { return __esimd_gather_masked_scaled2(si, glob_offset, offsets.data(), @@ -537,10 +555,11 @@ __ESIMD_API void scalar_store1(AccessorTy acc, uint32_t offset, T val) { /// @param offsets byte-offsets within the \p buffer to be gathered. /// @param pred predication control used for masking lanes. /// \ingroup sycl_esimd -template +template > __ESIMD_API std::enable_if_t<(N == 16 || N == 32) && (sizeof(T) == 4), - simd> -gather_rgba(const T *p, simd offsets, simd_mask pred = 1) { + simd> +gather_rgba(const Tx *p, simd offsets, simd_mask pred = 1) { simd offsets_i = convert(offsets); simd addrs(reinterpret_cast(p)); @@ -576,10 +595,11 @@ __ESIMD_API std::enable_if_t< /// @param offsets byte-offsets within the \p buffer to be written. /// @param pred predication control used for masking lanes. 
/// \ingroup sycl_esimd -template +template > __ESIMD_API std::enable_if_t<(N == 16 || N == 32) && (sizeof(T) == 4)> -scatter_rgba(T *p, simd offsets, - simd vals, +scatter_rgba(Tx *p, simd offsets, + simd vals, simd_mask pred = 1) { simd offsets_i = convert(offsets); simd addrs(reinterpret_cast(p)); @@ -656,8 +676,8 @@ constexpr bool check_atomic() { static_assert(NumSrc == 1, "One source operand is expected"); return false; } - if constexpr (!is_type()) { - static_assert((is_type()), + if constexpr (!is_type()) { + static_assert((is_type()), "Type F or HF is expected"); return false; } @@ -676,9 +696,8 @@ constexpr bool check_atomic() { "Type UW, UD or UQ is expected"); return false; } - if constexpr (Op == atomic_op::fcmpwr && - !is_type()) { - static_assert((is_type()), + if constexpr (Op == atomic_op::fcmpwr && !is_type()) { + static_assert((is_type()), "Type F or HF is expected"); return false; } @@ -699,9 +718,9 @@ constexpr bool check_atomic() { /// USM address atomic update, version with no source operands: \c inc and \c /// dec. \ingroup sycl_esimd -template -__ESIMD_API std::enable_if_t(), simd> -atomic_update(T *p, simd offset, simd_mask pred) { +template > +__ESIMD_API std::enable_if_t(), simd> +atomic_update(Tx *p, simd offset, simd_mask pred) { simd vAddr(reinterpret_cast(p)); simd offset_i1 = convert(offset); vAddr += offset_i1; @@ -721,9 +740,9 @@ __ESIMD_API std::enable_if_t(), /// USM address atomic update, version with one source operand: e.g. \c add, \c /// sub. \ingroup sycl_esimd -template -__ESIMD_API std::enable_if_t(), simd> -atomic_update(T *p, simd offset, simd src0, +template > +__ESIMD_API std::enable_if_t(), simd> +atomic_update(Tx *p, simd offset, simd src0, simd_mask pred) { simd vAddr(reinterpret_cast(p)); simd offset_i1 = convert(offset); @@ -745,10 +764,10 @@ __ESIMD_API std::enable_if_t(), /// USM address atomic update, version with two source operands: e.g. \c /// cmpxchg. \ingroup sycl_esimd -template -__ESIMD_API std::enable_if_t(), simd> -atomic_update(T *p, simd offset, simd src0, simd src1, - simd_mask pred) { +template > +__ESIMD_API std::enable_if_t(), simd> +atomic_update(Tx *p, simd offset, simd src0, + simd src1, simd_mask pred) { simd vAddr(reinterpret_cast(p)); simd offset_i1 = convert(offset); vAddr += offset_i1; @@ -972,7 +991,7 @@ __ESIMD_API simd slm_block_load(uint32_t offset) { "block size must be at most 16 owords"); const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); - return __esimd_oword_ld(si, offset >> 4); + return __esimd_oword_ld, n>(si, offset >> 4); } /// SLM block-store. @@ -987,15 +1006,14 @@ __ESIMD_API void slm_block_store(uint32_t offset, simd vals) { "block must be 1, 2, 4 or 8 owords long"); static_assert(Sz <= 8 * detail::OperandSize::OWORD, "block size must be at most 8 owords"); - const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); // offset in genx.oword.st is in owords - __esimd_oword_st(si, offset >> 4, vals.data()); + __esimd_oword_st, n>(si, offset >> 4, vals.data()); } /// SLM atomic update operation, no source operands: \c inc and \c dec. -template -__ESIMD_API std::enable_if_t(), simd> +template > +__ESIMD_API std::enable_if_t(), simd> slm_atomic_update(simd offsets, simd_mask pred) { const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); return __esimd_dword_atomic0(pred.data(), si, offsets.data()); @@ -1010,9 +1028,9 @@ __ESIMD_API std::enable_if_t(), } /// SLM atomic update operation, one source operand: e.g. \c add, \c sub. 
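// NOTE (editor's illustration, not part of this patch): the three flat
// atomic_update forms above (zero, one and two source operands), following the
// usage in sycl/test/esimd/flat_atomic.cpp. `ptr` is a hypothetical uint32_t*
// USM pointer; offsets are byte offsets (assumption) and the trailing `1` is an
// all-enabled simd_mask.
void atomic_update_example(uint32_t *ptr) SYCL_ESIMD_FUNCTION { // hypothetical helper
  simd<unsigned, 8> offs(0, sizeof(uint32_t));
  simd<uint32_t, 8> v1(0, 1);
  auto old0 = atomic_update<atomic_op::inc>(ptr, offs, 1);               // no source
  auto old1 = atomic_update<atomic_op::add>(ptr, offs, v1, 1);           // one source
  auto old2 = atomic_update<atomic_op::cmpxchg>(ptr, offs, v1, old1, 1); // two sources
  (void)old0; (void)old2;
}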
-template -__ESIMD_API std::enable_if_t(), simd> -slm_atomic_update(simd offsets, simd src0, +template > +__ESIMD_API std::enable_if_t(), simd> +slm_atomic_update(simd offsets, simd src0, simd_mask pred) { const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); return __esimd_dword_atomic1(pred.data(), si, offsets.data(), @@ -1029,9 +1047,9 @@ __ESIMD_API std::enable_if_t(), } /// SLM atomic, two source operands. -template -__ESIMD_API std::enable_if_t(), simd> -slm_atomic_update(simd offsets, simd src0, simd src1, +template > +__ESIMD_API std::enable_if_t(), simd> +slm_atomic_update(simd offsets, simd src0, simd src1, simd_mask pred) { const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); return __esimd_dword_atomic2(pred.data(), si, offsets.data(), diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/simd.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/simd.hpp index 1a1bd8098a2e2..26803ad39f4b3 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/simd.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/simd.hpp @@ -39,15 +39,16 @@ namespace esimd { /// /// \ingroup sycl_esimd template -class simd - : public detail::simd_obj_impl< - Ty, N, simd, std::enable_if_t>> { - using base_type = detail::simd_obj_impl>; +class simd : public detail::simd_obj_impl< + detail::__raw_t, N, simd, + std::enable_if_t>> { + using base_type = detail::simd_obj_impl, N, simd>; public: using base_type::base_type; - using element_type = typename base_type::element_type; - using vector_type = typename base_type::vector_type; + using element_type = Ty; + using raw_element_type = typename base_type::raw_element_type; + using raw_vector_type = typename base_type::raw_vector_type; static constexpr int length = N; // Implicit conversion constructor from another simd object of the same @@ -56,24 +57,25 @@ class simd class = std::enable_if_t<__SEIEED::is_simd_type_v && (length == SimdT::length)>> simd(const SimdT &RHS) - : base_type(__builtin_convertvector(RHS.data(), vector_type)) { + : base_type(detail::convert_vector, N>( + RHS.data())) { __esimd_dbg_print(simd(const SimdT &RHS)); } // Broadcast constructor with conversion. template >> - simd(T1 Val) : base_type((Ty)Val) { + class = std::enable_if_t>> + simd(T1 Val) : base_type(Val) { __esimd_dbg_print(simd(T1 Val)); } - /// Explicit conversion for simd_obj_impl into T. + /// Type conversion for simd into T. 
template >> + class = sycl::detail::enable_if_t< + (T::length == 1) && detail::is_valid_simd_elem_type_v>> operator To() const { - __esimd_dbg_print(explicit operator To()); - return (To)base_type::data()[0]; + __esimd_dbg_print(operator To()); + return detail::convert_scalar(base_type::data()[0]); } /// @{ @@ -101,15 +103,16 @@ class simd } /// @} -#define __ESIMD_DEF_SIMD_ARITH_UNARY_OP(ARITH_UNARY_OP) \ +#define __ESIMD_DEF_SIMD_ARITH_UNARY_OP(ARITH_UNARY_OP, ID) \ template simd operator ARITH_UNARY_OP() const { \ static_assert(!std::is_unsigned_v, \ #ARITH_UNARY_OP "doesn't apply to unsigned types"); \ - return simd(ARITH_UNARY_OP(base_type::data())); \ + return simd{detail::vector_unary_op( \ + base_type::data())}; \ } - __ESIMD_DEF_SIMD_ARITH_UNARY_OP(-) - __ESIMD_DEF_SIMD_ARITH_UNARY_OP(+) + __ESIMD_DEF_SIMD_ARITH_UNARY_OP(-, minus) + __ESIMD_DEF_SIMD_ARITH_UNARY_OP(+, plus) #undef __ESIMD_DEF_SIMD_ARITH_UNARY_OP }; @@ -120,7 +123,8 @@ ESIMD_INLINE simd convert(const simd &val) { if constexpr (std::is_same_v) return val; else - return __builtin_convertvector(val.data(), detail::vector_type_t); + return detail::convert_vector(val.data()); + ; } #undef __ESIMD_DEF_RELOP diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/simd_view.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/simd_view.hpp index 9000f8fee80f7..ac19f1339a9bf 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/simd_view.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/simd_view.hpp @@ -58,7 +58,8 @@ class simd_view : public detail::simd_view_impl { using value_type = get_simd_t; /// The underlying builtin value type - using vector_type = detail::vector_type_t; + using raw_vector_type = + detail::vector_type_t, length>; protected: /// @{ @@ -93,13 +94,15 @@ class simd_view : public detail::simd_view_impl { } \ \ /* simd_view RELOP SCALAR */ \ - template >> \ + template >> \ ESIMD_INLINE friend bool operator RELOP(const simd_view &X, T1 Y) { \ return (element_type)X RELOP Y; \ } \ \ /* SCALAR RELOP simd_view */ \ - template >> \ + template >> \ ESIMD_INLINE friend bool operator RELOP(T1 X, const simd_view &Y) { \ return X RELOP(element_type) Y; \ } @@ -116,21 +119,22 @@ class simd_view : public detail::simd_view_impl { /// bool b = v[0] > v[1] && v[2] < 42; /// /// \ingroup sycl_esimd -template -class simd_view> - : public detail::simd_view_impl> { +template +class simd_view> + : public detail::simd_view_impl> { template friend class detail::simd_obj_impl; template friend class detail::simd_view_impl; public: - using RegionTy = region1d_scalar_t; + using RegionTy = region1d_scalar_t; using BaseClass = detail::simd_view_impl; using ShapeTy = typename shape_type::type; static constexpr int length = ShapeTy::Size_x * ShapeTy::Size_y; static_assert(1 == length, "length of this view is not equal to 1"); + static_assert(std::is_same_v); /// The element type of this class, which could be different from the element /// type of the base object type. 
- using element_type = typename ShapeTy::element_type; + using element_type = ViewedElemT; using base_type = BaseTy; template using get_simd_t = typename BaseClass::template get_simd_t; @@ -146,8 +150,8 @@ class simd_view> simd_view(BaseTy &Base) : BaseClass(Base) {} operator element_type() const { - const auto v = BaseClass::read(); - return v[0]; + const auto v = BaseClass::read().data(); + return detail::bitcast_to_wrapper_type(std::move(v)[0]); } using BaseClass::operator--; @@ -170,22 +174,23 @@ class simd_view> /// simd v = 1; /// auto v1 = v.select<2, 1>(0); /// auto v2 = v1[0]; // simd_view of a nested region for a single element -template -class simd_view, NestedRegion>> +template +class simd_view, NestedRegion>> : public detail::simd_view_impl< - BaseTy, std::pair, NestedRegion>> { + BaseTy, std::pair, NestedRegion>> { template friend class simd; template friend class detail::simd_view_impl; public: - using RegionTy = std::pair, NestedRegion>; + using RegionTy = std::pair, NestedRegion>; using BaseClass = detail::simd_view_impl; using ShapeTy = typename shape_type::type; static constexpr int length = ShapeTy::Size_x * ShapeTy::Size_y; static_assert(1 == length, "length of this view is not equal to 1"); + static_assert(std::is_same_v); /// The element type of this class, which could be different from the element /// type of the base object type. - using element_type = T; + using element_type = ViewedElemT; private: simd_view(BaseTy &Base, RegionTy Region) : BaseClass(Base, Region) {} @@ -196,7 +201,7 @@ class simd_view, NestedRegion>> operator element_type() const { const auto v = BaseClass::read(); - return v[0]; + return detail::convert_scalar(v[0]); } __ESIMD_DEF_SCALAR_SIMD_VIEW_RELOP(>) diff --git a/sycl/test/esimd/flat_atomic.cpp b/sycl/test/esimd/flat_atomic.cpp index dcda882214641..fc4961fd55f1e 100644 --- a/sycl/test/esimd/flat_atomic.cpp +++ b/sycl/test/esimd/flat_atomic.cpp @@ -27,12 +27,14 @@ void kernel1(uint32_t *ptr) SYCL_ESIMD_FUNCTION { flat_atomic(ptr, offsets, v1, 1); } -void kernel2(uint32_t *ptr) SYCL_ESIMD_FUNCTION { +template void kernel2(T *ptr) SYCL_ESIMD_FUNCTION { simd offsets(0, 1); - simd v1(0, 1); + simd v1(0, 1); - atomic_update(ptr, offsets, v1, v1, 1); + atomic_update(ptr, offsets, v1, v1, 1); // deprecated form: - flat_atomic(ptr, offsets, v1, - v1, 1); + flat_atomic(ptr, offsets, v1, v1, + 1); } + +template void kernel2(uint32_t *) SYCL_ESIMD_FUNCTION; diff --git a/sycl/test/esimd/intrins_trans.cpp b/sycl/test/esimd/intrins_trans.cpp index 974f4b74c1c8e..4663bf44c4bc0 100644 --- a/sycl/test/esimd/intrins_trans.cpp +++ b/sycl/test/esimd/intrins_trans.cpp @@ -179,9 +179,9 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd foo() { // level of testing strength // 2. Test cases above should be refactored not to use user-level APIs like // gather and use __esimd* calls instead. 
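// NOTE (editor's illustration, not part of this patch): what the scalar
// simd_view specializations above provide for wrapper element types; the
// comparison expression is the one quoted in the class comment. Assumes the
// ESIMD header and namespace are in scope.
void scalar_view_example() SYCL_ESIMD_FUNCTION { // hypothetical helper
  simd<sycl::half, 8> v(0, 1);
  sycl::half x = v[3];               // operator element_type(): raw bits -> wrapper value
  bool b = v[0] > v[1] && v[2] < 42; // scalar RELOPs against views and plain scalars
  (void)x; (void)b;
}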
-template using vec = typename simd::vector_type; +template using vec = typename simd::raw_vector_type; -template using mask = typename simd_mask::vector_type; +template using mask = typename simd_mask::raw_vector_type; SYCL_EXTERNAL void use(const vec &x) SYCL_ESIMD_FUNCTION; SYCL_EXTERNAL void use(const vec &x) SYCL_ESIMD_FUNCTION; diff --git a/sycl/test/esimd/simd.cpp b/sycl/test/esimd/simd.cpp index a38b732751312..0c9afd2c4fa05 100644 --- a/sycl/test/esimd/simd.cpp +++ b/sycl/test/esimd/simd.cpp @@ -7,103 +7,135 @@ using namespace sycl::ext::intel::experimental::esimd; -bool test_simd_ctors() SYCL_ESIMD_FUNCTION { - simd v0 = 1; - simd v1(v0); - simd v2(simd(0, 1)); - const simd v3{0, 2, 4, 6, 1, 3, 5, 7}; +template bool test_simd_ctors() SYCL_ESIMD_FUNCTION { + simd v0 = 1; + simd v1(v0); + simd v2(simd(0, 1)); + const simd v3{0, 2, 4, 6, 1, 3, 5, 7}; return v0[0] + v1[1] + v2[2] + v3[3] == 1 + 1 + 2 + 6; } -void test_simd_class_traits() SYCL_ESIMD_FUNCTION { - static_assert(std::is_default_constructible>::value, +template bool test_simd_ctors() SYCL_ESIMD_FUNCTION; +template bool test_simd_ctors() SYCL_ESIMD_FUNCTION; + +template void test_simd_class_traits() SYCL_ESIMD_FUNCTION { + static_assert(std::is_default_constructible>::value, "type trait mismatch"); - static_assert(std::is_trivially_default_constructible>::value, + static_assert(std::is_trivially_default_constructible>::value, "type trait mismatch"); - static_assert(std::is_copy_constructible>::value, + static_assert(std::is_copy_constructible>::value, "type trait mismatch"); - static_assert(!std::is_trivially_copy_constructible>::value, + static_assert(!std::is_trivially_copy_constructible>::value, "type trait mismatch"); - static_assert(std::is_move_constructible>::value, + static_assert(std::is_move_constructible>::value, "type trait mismatch"); - static_assert(!std::is_trivially_move_constructible>::value, + static_assert(!std::is_trivially_move_constructible>::value, "type trait mismatch"); - static_assert(std::is_copy_assignable>::value, + static_assert(std::is_copy_assignable>::value, "type trait mismatch"); - static_assert(std::is_trivially_copy_assignable>::value, + static_assert(std::is_trivially_copy_assignable>::value, "type trait mismatch"); - static_assert(std::is_move_assignable>::value, + static_assert(std::is_move_assignable>::value, "type trait mismatch"); - static_assert(std::is_trivially_move_assignable>::value, + static_assert(std::is_trivially_move_assignable>::value, "type trait mismatch"); } +template void test_simd_class_traits() SYCL_ESIMD_FUNCTION; +template void test_simd_class_traits() SYCL_ESIMD_FUNCTION; + void test_conversion() SYCL_ESIMD_FUNCTION { simd v = 3; simd f = v; simd c = f; - simd c1 = f.select<16, 1>(0); - c.select<32, 1>(0) = f; + simd h = c; + simd c1 = h.template select<16, 1>(0); + c.template select<32, 1>(0) = f; + h.template select<7, 1>(3) = + v.template select<22, 1>(0).template select<7, 3>(1); f = v + static_cast>(c); } -bool test_1d_select() SYCL_ESIMD_FUNCTION { - simd v = 0; - v.select<8, 1>(0) = 1; - v.select<8, 1>(8) = 2; - v.select<8, 1>(16) = 3; - v.select<8, 1>(24) = 4; +template bool test_1d_select() SYCL_ESIMD_FUNCTION { + simd v = 0; + v.template select<8, 1>(0) = 1; + v.template select<8, 1>(8) = 2; + v.template select<8, 1>(16) = 3; + v.template select<8, 1>(24) = 4; return v[0] + v[8] + v[16] + v[24] == (1 + 2 + 3 + 4); } +template bool test_1d_select() SYCL_ESIMD_FUNCTION; +template bool test_1d_select() SYCL_ESIMD_FUNCTION; + +template bool 
test_simd_format() SYCL_ESIMD_FUNCTION { - simd v{0, 1, 2, 3, 4, 5, 6, 7}; - auto ref1 = v.bit_cast_view(); - auto ref2 = v.bit_cast_view(); - auto ref3 = v.bit_cast_view(); + simd v{0, 1, 2, 3, 4, 5, 6, 7}; + auto ref1 = v.template bit_cast_view(); + auto ref2 = v.template bit_cast_view(); + auto ref3 = v.template bit_cast_view(); return (decltype(ref1)::length == 32) && (decltype(ref2)::length == 8) && (decltype(ref3)::getSizeX() == 4) && (decltype(ref3)::getSizeY() == 8); } -bool test_simd_select(int a) SYCL_ESIMD_FUNCTION { +template bool test_simd_format() SYCL_ESIMD_FUNCTION; +template bool +test_simd_format() SYCL_ESIMD_FUNCTION; + +template bool test_simd_select(T1 a) SYCL_ESIMD_FUNCTION { { - simd f = a; - simd c1 = 2; - c1.select<16, 1>(0) = f.select<16, 1>(0); - c1.select<16, 1>(0).select<16, 1>(0) = f.select<16, 1>(0).select<16, 1>(0); + simd f = a; + simd c1 = 2; + c1.template select<16, 1>(0) = f.template select<16, 1>(0); + c1.template select<16, 1>(0).template select<16, 1>(0) = + f.template select<16, 1>(0).template select<16, 1>(0); } { - simd v(0, 1); - auto ref0 = v.select<4, 2>(1); // r{1, 3, 5, 7} - auto ref1 = v.bit_cast_view(); // 0,1,2,3; - // 4,5,6,7; - // 8,9,10,11; - // 12,13,14,15 - auto ref2 = ref1.select<2, 1, 2, 2>(0, 1); - return ref0[0] == 1 && decltype(ref2)::getSizeX() == 2 && - decltype(ref2)::getStrideY() == 1; + simd v(0, 1); + auto ref0 = v.template select<4, 2>(1); // r{1, 3, 5, 7} + auto ref1 = v.template bit_cast_view(); // 0,1,2,3; + // 4,5,6,7; + // 8,9,10,11; + // 12,13,14,15 + auto ref2 = ref1.template select<2, 1, 2, 2>(0, 1); + return (ref0[0] == 1) && (decltype(ref2)::getSizeX() == 2) && + (decltype(ref2)::getStrideY() == 1); } + return false; } -bool test_2d_offset() SYCL_ESIMD_FUNCTION { - simd v = 0; - auto ref = v.bit_cast_view(); - return ref.select<2, 2, 2, 2>(2, 1).getOffsetX() == 1 && - ref.select<2, 2, 2, 2>(2, 1).getOffsetY() == 2; +template bool test_simd_select(float) SYCL_ESIMD_FUNCTION; +template bool + test_simd_select(uint64_t) SYCL_ESIMD_FUNCTION; + +template bool test_2d_offset() SYCL_ESIMD_FUNCTION { + simd v = 0; + auto ref = v.template bit_cast_view(); + return ref.template select<2, 2, 2, 2>(2, 1).getOffsetX() == 1 && + ref.template select<2, 2, 2, 2>(2, 1).getOffsetY() == 2; } +template bool test_2d_offset() SYCL_ESIMD_FUNCTION; +template bool test_2d_offset() SYCL_ESIMD_FUNCTION; + +template bool test_simd_bin_op_promotion() SYCL_ESIMD_FUNCTION { - simd v0 = std::numeric_limits::max(); - simd v1 = 1; - simd v2 = v0 + v1; + simd v0 = std::numeric_limits::max(); + simd v1 = 1; + simd v2 = v0 + v1; return v2[0] == 32768; } -bool test_simd_bin_ops() SYCL_ESIMD_FUNCTION { - simd v0 = 1; - simd v1 = 2; +template bool test_simd_bin_op_promotion() SYCL_ESIMD_FUNCTION; +template bool +test_simd_bin_op_promotion() SYCL_ESIMD_FUNCTION; + +template bool test_simd_bin_ops() SYCL_ESIMD_FUNCTION { + simd v0 = 1; + simd v1 = 2; v0 += v1; - v0 %= v1; + if constexpr (std::is_integral_v) + v0 %= v1; v0 = 2 - v0; v0 -= v1; v0 -= 2; @@ -114,101 +146,142 @@ bool test_simd_bin_ops() SYCL_ESIMD_FUNCTION { return v0[0] == 1; } -bool test_simd_unary_ops() SYCL_ESIMD_FUNCTION { - simd v0 = 1; - simd v1 = 2; - v0 <<= v1; +template bool test_simd_bin_ops() SYCL_ESIMD_FUNCTION; +template bool test_simd_bin_ops() SYCL_ESIMD_FUNCTION; + +template bool test_simd_unary_ops() SYCL_ESIMD_FUNCTION { + simd v0 = 1; + simd v1 = 2; + if constexpr (std::is_integral_v) + v0 <<= v1; v1 = -v0; - v0 = ~v1; + if constexpr (std::is_integral_v) + v0 = ~v1; return 
v1[0] == 1; } -bool test_nested_1d_select() SYCL_ESIMD_FUNCTION { - simd r0(0, 1); +template bool test_simd_unary_ops() SYCL_ESIMD_FUNCTION; +template bool test_simd_unary_ops() SYCL_ESIMD_FUNCTION; + +template bool test_nested_1d_select() SYCL_ESIMD_FUNCTION { + simd r0(0, 1); - auto r1 = r0.select<4, 2>(0); - auto r2 = r1.select<2, 2>(0); - auto r3 = r2.select<1, 0>(1); + auto r1 = r0.template select<4, 2>(0); + auto r2 = r1.template select<2, 2>(0); + auto r3 = r2.template select<1, 0>(1); r3 = 37; return r0[4] == 37; } -bool test_format_1d_read() SYCL_ESIMD_FUNCTION { - simd r = 0x0FF00F0F; - auto rl = r.bit_cast_view(); - auto rl2 = rl.select<8, 2>(0); // 0F0F - auto rh = r.bit_cast_view(); - auto rh2 = rh.select<8, 2>(1); // 0FF0 +template bool test_nested_1d_select() SYCL_ESIMD_FUNCTION; +template bool test_nested_1d_select() SYCL_ESIMD_FUNCTION; + +template bool test_format_1d_read() SYCL_ESIMD_FUNCTION { + simd r = 0x0FF00F0F; + auto rl = r.template bit_cast_view(); + auto rl2 = rl.template select<8, 2>(0); // 0F0F + auto rh = r.template bit_cast_view(); + auto rh2 = rh.template select<8, 2>(1); // 0FF0 return rl2[0] == 0x0F0F && rh2[0] == 0x0FF0; } -bool test_format_1d_write() SYCL_ESIMD_FUNCTION { - simd r; - auto rl = r.bit_cast_view(); - auto rl2 = rl.select<8, 2>(0); - auto rh = r.bit_cast_view(); - auto rh2 = rh.select<8, 2>(1); +template bool test_format_1d_read() SYCL_ESIMD_FUNCTION; +template bool test_format_1d_read() SYCL_ESIMD_FUNCTION; + +template bool test_format_1d_write() SYCL_ESIMD_FUNCTION { + simd r; + auto rl = r.template bit_cast_view(); + auto rl2 = rl.template select<8, 2>(0); + auto rh = r.template bit_cast_view(); + auto rh2 = rh.template select<8, 2>(1); rh2 = 0x0F, rl2 = 0xF0; return r[0] == 0x0FF0; } +template bool test_format_1d_write() SYCL_ESIMD_FUNCTION; +template bool test_format_1d_write() SYCL_ESIMD_FUNCTION; + +template bool test_format_1d_read_write_nested() SYCL_ESIMD_FUNCTION { - simd v = 0; - auto r1 = v.bit_cast_view(); - auto r11 = r1.select<8, 1>(0); - auto r12 = r11.bit_cast_view(); - auto r2 = v.bit_cast_view(); - auto r21 = r2.select<8, 1>(8); - auto r22 = r21.bit_cast_view(); + simd v = 0; + auto r1 = v.template bit_cast_view(); + auto r11 = r1.template select<8, 1>(0); + auto r12 = r11.template bit_cast_view(); + auto r2 = v.template bit_cast_view(); + auto r21 = r2.template select<8, 1>(8); + auto r22 = r21.template bit_cast_view(); r12 += 1, r22 += 2; return v[0] == 1 && v[4] == 2; } -bool test_format_2d_read() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - auto r1 = v0.bit_cast_view(); - simd v1 = r1.select<1, 0, 4, 1>(1, 0).read(); // second row +template bool +test_format_1d_read_write_nested() SYCL_ESIMD_FUNCTION; +template bool +test_format_1d_read_write_nested() SYCL_ESIMD_FUNCTION; + +template bool test_format_2d_read() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + auto r1 = v0.template bit_cast_view(); + simd v1 = r1.template select<1, 0, 4, 1>(1, 0).read(); // second row return v1[0] == 4; } -bool test_format_2d_write() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - auto r1 = v0.bit_cast_view(); - r1.select<1, 0, 4, 1>(1, 0) = 37; +template bool test_format_2d_read() SYCL_ESIMD_FUNCTION; +template bool test_format_2d_read() SYCL_ESIMD_FUNCTION; + +template bool test_format_2d_write() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + auto r1 = v0.template bit_cast_view(); + r1.template select<1, 0, 4, 1>(1, 0) = 37; return v0[4] == 37; } -bool test_select_rvalue() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - v0.select<4, 2>(1).select<2, 2>(0) = 37; 
+template bool test_format_2d_write() SYCL_ESIMD_FUNCTION; +template bool test_format_2d_write() SYCL_ESIMD_FUNCTION; + +template bool test_select_rvalue() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + v0.template select<4, 2>(1).template select<2, 2>(0) = 37; return v0[5] == 37; } -auto test_format_1d_write_rvalue() SYCL_ESIMD_FUNCTION { - simd v0 = 0x0F0F0F0F; - v0.bit_cast_view().select<8, 2>(0) = 0x0E0E; +template bool test_select_rvalue() SYCL_ESIMD_FUNCTION; +template bool test_select_rvalue() SYCL_ESIMD_FUNCTION; + +template bool test_format_1d_write_rvalue() SYCL_ESIMD_FUNCTION { + simd v0 = 0x0F0F0F0F; + v0.template bit_cast_view().template select<8, 2>(0) = 0x0E0E; return v0[2] == 0x0E0E0E0E; } -bool test_format_2d_write_rvalue() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - v0.bit_cast_view().select<1, 0, 4, 1>(0, 0) = 37; +template bool test_format_1d_write_rvalue() SYCL_ESIMD_FUNCTION; +template bool test_format_1d_write_rvalue() SYCL_ESIMD_FUNCTION; + +template bool test_format_2d_write_rvalue() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + v0.template bit_cast_view().template select<1, 0, 4, 1>(0, 0) = 37; return v0[3] == 37; } -auto test_format_2d_read_rvalue() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - auto r1 = v0.bit_cast_view() - .select<1, 0, 4, 1>(1, 0) - .bit_cast_view() - .select<2, 2>(1); +template bool test_format_2d_write_rvalue() SYCL_ESIMD_FUNCTION; +template bool test_format_2d_write_rvalue() SYCL_ESIMD_FUNCTION; + +template bool test_format_2d_read_rvalue() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + auto r1 = v0.template bit_cast_view() + .template select<1, 0, 4, 1>(1, 0) + .template bit_cast_view() + .template select<2, 2>(1); return r1[0] == 5; } -bool test_row_read_write() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - auto m = v0.bit_cast_view(); +template bool test_format_2d_read_rvalue() SYCL_ESIMD_FUNCTION; +template bool test_format_2d_read_rvalue() SYCL_ESIMD_FUNCTION; + +template bool test_row_read_write() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + auto m = v0.template bit_cast_view(); auto r0 = m.row(0); // 0 1 2 3 auto r1 = m.row(1); // 4 5 6 7 @@ -221,9 +294,12 @@ bool test_row_read_write() SYCL_ESIMD_FUNCTION { return r0[0] == 8 && r1[0] == 16; } -bool test_column_read_write() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - auto m = v0.bit_cast_view(); +template bool test_row_read_write() SYCL_ESIMD_FUNCTION; +template bool test_row_read_write() SYCL_ESIMD_FUNCTION; + +template bool test_column_read_write() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + auto m = v0.template bit_cast_view(); auto c0 = m.column(0); // 0 2 auto c1 = m.column(1); // 1 3 @@ -234,44 +310,62 @@ bool test_column_read_write() SYCL_ESIMD_FUNCTION { return v0[0] == 1 && v0[3] == 4; } -bool test_replicate() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - auto v0_rep = v0.replicate<1>(); +template bool test_column_read_write() SYCL_ESIMD_FUNCTION; +template bool test_column_read_write() SYCL_ESIMD_FUNCTION; + +template bool test_replicate() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + auto v0_rep = v0.template replicate<1>(); return v0[0] == v0_rep[0] && v0[7] == v0_rep[7]; } -bool test_replicate1() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - auto v0_rep = v0.replicate_w<4, 2>(2); +template bool test_replicate() SYCL_ESIMD_FUNCTION; +template bool test_replicate() SYCL_ESIMD_FUNCTION; + +template bool test_replicate1() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + auto v0_rep = v0.template replicate_w<4, 2>(2); return v0[2] == v0_rep[2] && v0[3] == v0_rep[5]; } -bool test_replicate2() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - auto 
v0_rep = v0.replicate_vs_w<2, 4, 2>(1); +template bool test_replicate1() SYCL_ESIMD_FUNCTION; +template bool test_replicate1() SYCL_ESIMD_FUNCTION; + +template bool test_replicate2() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + auto v0_rep = v0.template replicate_vs_w<2, 4, 2>(1); return v0_rep[0] == v0[1] && v0_rep[1] == v0[2] && v0_rep[2] == v0[5]; } -bool test_replicate3() SYCL_ESIMD_FUNCTION { - simd v0(0, 1); - auto v0_rep = v0.replicate_vs_w_hs<2, 4, 2, 2>(1); +template bool test_replicate2() SYCL_ESIMD_FUNCTION; +template bool test_replicate2() SYCL_ESIMD_FUNCTION; + +template bool test_replicate3() SYCL_ESIMD_FUNCTION { + simd v0(0, 1); + auto v0_rep = v0.template replicate_vs_w_hs<2, 4, 2, 2>(1); return v0_rep[0] == v0[1] && v0_rep[1] == v0[3] && v0_rep[2] == v0[5]; } -bool test_simd_iselect() SYCL_ESIMD_FUNCTION { - simd v(0, 1); - simd a(0, 2); +template bool test_replicate3() SYCL_ESIMD_FUNCTION; +template bool test_replicate3() SYCL_ESIMD_FUNCTION; + +template bool test_simd_iselect() SYCL_ESIMD_FUNCTION { + simd v(0, 1); + simd a(0, 2); auto data = v.iselect(a); data += 16; - v.iupdate(a, data, simd_mask<8>(1)); - auto ref = v.select<8, 2>(0); + v.template iupdate(a, data, simd_mask<8>(1)); + auto ref = v.template select<8, 2>(0); return ref[0] == 16 && ref[14] == 32; } +template bool test_simd_iselect() SYCL_ESIMD_FUNCTION; +template bool test_simd_iselect() SYCL_ESIMD_FUNCTION; + void test_simd_binop_honor_int_promo() SYCL_ESIMD_FUNCTION { simd a; simd b;