diff --git a/CMakeLists.txt b/CMakeLists.txt
index 47076d84fd..361f97df31 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -221,6 +221,10 @@ OPTION(EMBREE_MIN_WIDTH "Enables min-width feature to enlarge curve and point th
 IF (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND (CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" AND CMAKE_OSX_ARCHITECTURES STREQUAL "") OR ("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES))
   MESSAGE(STATUS "Building for Apple silicon")
   SET(EMBREE_ARM ON)
+# CMAKE_SYSTEM_PROCESSOR is unreliable on Windows, where it reports AMD64 when cross compiling
+ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "Windows" AND CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64")
+  MESSAGE(STATUS "Building for Windows ARM64 (MSVC)")
+  SET(EMBREE_ARM ON)
 ELSEIF(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64")
   MESSAGE(STATUS "Building for AArch64")
   SET(EMBREE_ARM ON)
diff --git a/common/cmake/check_arm_neon.cpp b/common/cmake/check_arm_neon.cpp
index 2e1ff862a8..ed46159f13 100644
--- a/common/cmake/check_arm_neon.cpp
+++ b/common/cmake/check_arm_neon.cpp
@@ -1,7 +1,7 @@
 // Copyright 2009-2020 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
-#if !defined(__ARM_NEON)
+#if !defined(__ARM_NEON) && !defined(_M_ARM64)
 #error "No ARM Neon support"
 #endif
 
diff --git a/common/cmake/msvc.cmake b/common/cmake/msvc.cmake
index 9f08cd0f03..d133780212 100644
--- a/common/cmake/msvc.cmake
+++ b/common/cmake/msvc.cmake
@@ -1,11 +1,18 @@
 ## Copyright 2009-2021 Intel Corporation
 ## SPDX-License-Identifier: Apache-2.0
 
-SET(FLAGS_SSE2 "/D__SSE__ /D__SSE2__")
-SET(FLAGS_SSE42 "${FLAGS_SSE2} /D__SSE3__ /D__SSSE3__ /D__SSE4_1__ /D__SSE4_2__")
-SET(FLAGS_AVX "${FLAGS_SSE42} /arch:AVX")
-SET(FLAGS_AVX2 "${FLAGS_SSE42} /arch:AVX2")
-SET(FLAGS_AVX512 "${FLAGS_AVX2} /arch:AVX512")
+IF (EMBREE_ARM)
+  SET(FLAGS_SSE2 "/D__SSE__ /D__SSE2__")
+  SET(FLAGS_SSE42 "/D__SSE4_2__ /D__SSE4_1__")
+  SET(FLAGS_AVX "/D__AVX__ /D__SSE4_2__ /D__SSE4_1__ /D__BMI__ /D__BMI2__ /D__LZCNT__")
+  SET(FLAGS_AVX2 "/D__AVX2__ /D__AVX__ /D__SSE4_2__ /D__SSE4_1__ /D__BMI__ /D__BMI2__ /D__LZCNT__")
+ELSE()
+  SET(FLAGS_SSE2 "/D__SSE__ /D__SSE2__")
+  SET(FLAGS_SSE42 "${FLAGS_SSE2} /D__SSE3__ /D__SSSE3__ /D__SSE4_1__ /D__SSE4_2__")
+  SET(FLAGS_AVX "${FLAGS_SSE42} /arch:AVX")
+  SET(FLAGS_AVX2 "${FLAGS_SSE42} /arch:AVX2")
+  SET(FLAGS_AVX512 "${FLAGS_AVX2} /arch:AVX512")
+ENDIF()
 
 SET(COMMON_CXX_FLAGS "")
 SET(COMMON_CXX_FLAGS "${COMMON_CXX_FLAGS} /EHsc") # catch C++ exceptions only and extern "C" functions never throw a C++ exception
@@ -17,6 +24,10 @@ IF (EMBREE_STACK_PROTECTOR)
 ELSE()
   SET(COMMON_CXX_FLAGS "${COMMON_CXX_FLAGS} /GS-") # do not protect against return address overrides
 ENDIF()
+IF (EMBREE_ARM)
+  # sse2neon requires the new (conforming) MSVC preprocessor
+  SET(COMMON_CXX_FLAGS "${COMMON_CXX_FLAGS} /Zc:preprocessor")
+ENDIF()
 MACRO(DISABLE_STACK_PROTECTOR_FOR_FILE file)
   IF (EMBREE_STACK_PROTECTOR)
     SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "/GS-")
diff --git a/common/math/bbox.h b/common/math/bbox.h
index 651b29a8fe..6c0c77c82a 100644
--- a/common/math/bbox.h
+++ b/common/math/bbox.h
@@ -82,7 +82,7 @@ namespace embree
     return lower > upper;
   }
 
-#if defined(__SSE__) || defined(__ARM_NEON)
+#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
   template<> __forceinline bool BBox<Vec3fa>::empty() const {
     return !all(le_mask(lower,upper));
   }
@@ -233,7 +233,7 @@ namespace embree
 ////////////////////////////////////////////////////////////////////////////////
 /// SSE / AVX / MIC specializations
 ////////////////////////////////////////////////////////////////////////////////
 
-#if defined (__SSE__) || defined(__ARM_NEON)
+#if defined (__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
 #include "../simd/sse.h"
 #endif
diff --git a/common/math/color.h b/common/math/color.h
index 8b28ff9447..67081155cd 100644
--- a/common/math/color.h
+++ b/common/math/color.h
@@ -160,7 +160,7 @@ namespace embree
   }
 
   __forceinline const Color rcp  ( const Color& a )
   {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
     __m128 reciprocal = _mm_rcp_ps(a.m128);
     reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
     reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
@@ -173,11 +173,11 @@ namespace embree
 #endif
     return _mm_add_ps(r,_mm_mul_ps(r, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(a, r)))); // computes r + r * (1 - a * r)
-#endif //defined(__aarch64__)
+#endif //defined(__aarch64__) || defined(_M_ARM64)
   }
 
   __forceinline const Color rsqrt( const Color& a )
   {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
     __m128 r = _mm_rsqrt_ps(a.m128);
     r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
     r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
@@ -191,7 +191,7 @@ namespace embree
 #endif
     return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
-#endif //defined(__aarch64__)
+#endif //defined(__aarch64__) || defined(_M_ARM64)
   }
 
   __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); }
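Editorial note on the reciprocal sequence in `color.h` above: `vrecpeq_f32`/`_mm_rcp_ps` produce only a rough estimate, and `vrecpsq_f32(a, r)` computes `2 - a*r`, so each `vrecpsq_f32`/`vmulq_f32` pair is one Newton-Raphson step. A minimal standalone sketch of the same refinement in scalar code (helper names are hypothetical, not part of the patch):

```cpp
#include <cstdio>

// One Newton-Raphson step for the reciprocal: r' = r * (2 - a*r),
// which roughly doubles the number of correct bits per step.
static float nr_step(float a, float r) { return r * (2.0f - a * r); }

int main() {
  const float a = 3.0f;
  float r = 0.3f;        // crude seed, standing in for the hardware estimate
  r = nr_step(a, r);     // first vrecpsq_f32/vmulq_f32 pair
  r = nr_step(a, r);     // second pair
  std::printf("refined %.9f, exact %.9f\n", r, 1.0f / a);
  return 0;
}
```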
diff --git a/common/math/emath.h b/common/math/emath.h
index 22a89a7669..9f97387e7e 100644
--- a/common/math/emath.h
+++ b/common/math/emath.h
@@ -12,7 +12,7 @@
 #  include "math_sycl.h"
 #else
 
-#if defined(__ARM_NEON)
+#if defined(__ARM_NEON) || defined(_M_ARM64)
 #include "../simd/arm/emulation.h"
 #else
 #include <immintrin.h>
@@ -60,14 +60,22 @@ namespace embree
 
   __forceinline float rcp ( const float x )
   {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
     // Move scalar to vector register and do rcp.
     __m128 a;
+#if !defined(_M_ARM64)
     a[0] = x;
+#else
+    a.n128_f32[0] = x;
+#endif
     float32x4_t reciprocal = vrecpeq_f32(a);
     reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
     reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+#if !defined(_M_ARM64)
     return reciprocal[0];
+#else
+    return reciprocal.n128_f32[0];
+#endif
 #else
 
     const __m128 a = _mm_set_ss(x);
@@ -84,58 +92,93 @@ namespace embree
     return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
 #endif
 
-#endif //defined(__aarch64__)
+#endif //defined(__aarch64__) || defined(_M_ARM64)
   }
 
   __forceinline float signmsk ( const float x ) {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
     // FP and Neon shares same vector register in arm64
     __m128 a;
     __m128i b;
+#if !defined(_M_ARM64)
     a[0] = x;
     b[0] = 0x80000000;
+#else
+    a.n128_f32[0] = x;
+    b.n128_i32[0] = 0x80000000;
+#endif
     a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
+#if !defined(_M_ARM64)
     return a[0];
+#else
+    return a.n128_f32[0];
+#endif
 #else
     return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
 #endif
   }
 
   __forceinline float xorf( const float x, const float y ) {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
    // FP and Neon shares same vector register in arm64
     __m128 a;
     __m128 b;
+#if !defined(_M_ARM64)
     a[0] = x;
     b[0] = y;
+#else
+    a.n128_f32[0] = x;
+    b.n128_f32[0] = y;
+#endif
     a = _mm_xor_ps(a, b);
+#if !defined(_M_ARM64)
     return a[0];
+#else
+    return a.n128_f32[0];
+#endif
 #else
     return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
 #endif
   }
 
   __forceinline float andf( const float x, const unsigned y ) {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
     // FP and Neon shares same vector register in arm64
     __m128 a;
     __m128i b;
+#if !defined(_M_ARM64)
     a[0] = x;
     b[0] = y;
+#else
+    a.n128_f32[0] = x;
+    b.n128_u32[0] = y;
+#endif
     a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
+#if !defined(_M_ARM64)
     return a[0];
+#else
+    return a.n128_f32[0];
+#endif
 #else
     return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
 #endif
   }
 
   __forceinline float rsqrt( const float x )
   {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
     // FP and Neon shares same vector register in arm64
     __m128 a;
+#if !defined(_M_ARM64)
     a[0] = x;
+#else
+    a.n128_f32[0] = x;
+#endif
     __m128 value = _mm_rsqrt_ps(a);
     value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
     value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
+#if !defined(_M_ARM64)
     return value[0];
+#else
+    return value.n128_f32[0];
+#endif
 #else
 
     const __m128 a = _mm_set_ss(x);
@@ -204,15 +247,24 @@ namespace embree
   __forceinline double floor( const double x ) { return ::floor (x); }
   __forceinline double ceil ( const double x ) { return ::ceil (x); }
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
   __forceinline float mini(float a, float b) {
     // FP and Neon shares same vector register in arm64
    __m128 x;
     __m128 y;
+#if !defined(_M_ARM64)
     x[0] = a;
     y[0] = b;
+#else
+    x.n128_f32[0] = a;
+    y.n128_f32[0] = b;
+#endif
     x = _mm_min_ps(x, y);
-    return x[0];
+#if !defined(_M_ARM64)
+    return x[0];
+#else
+    return x.n128_f32[0];
+#endif
   }
 #elif defined(__SSE4_1__)
   __forceinline float mini(float a, float b) {
@@ -223,15 +275,24 @@ namespace embree
   }
 #endif
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
  __forceinline float maxi(float a, float b) {
     // FP and Neon shares same vector register in arm64
     __m128 x;
     __m128 y;
+#if !defined(_M_ARM64)
     x[0] = a;
     y[0] = b;
+#else
+    x.n128_f32[0] = a;
+    y.n128_f32[0] = b;
+#endif
     x = _mm_max_ps(x, y);
-    return x[0];
+#if !defined(_M_ARM64)
+    return x[0];
+#else
+    return x.n128_f32[0];
+#endif
   }
 #elif defined(__SSE4_1__)
   __forceinline float maxi(float a, float b) {
@@ -250,7 +311,7 @@ namespace embree
   __forceinline int64_t min(int64_t a, int64_t b) { return a<b ? a:b; }
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
   __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi)
diff --git a/common/math/linearspace3.h b/common/math/linearspace3.h
index f6d2318fa0..7685b007dc 100644
--- a/common/math/linearspace3.h
+++ b/common/math/linearspace3.h
@@ -96,7 +96,7 @@ namespace embree
   /*! compute transposed matrix */
   template<> __forceinline const LinearSpace3<Vec3fa> LinearSpace3<Vec3fa>::transposed() const {
     vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz);
-    return LinearSpace3<Vec3fa>(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz));
+    return LinearSpace3<Vec3fa>(Vec3fa(rx.m128()),Vec3fa(ry.m128()),Vec3fa(rz.m128()));
   }
 #endif
diff --git a/common/math/vec2.h b/common/math/vec2.h
index 4e641ec249..776f47ca39 100644
--- a/common/math/vec2.h
+++ b/common/math/vec2.h
@@ -205,7 +205,7 @@ namespace embree
 
 #include "vec2fa.h"
 
-#if defined(__SSE__) || defined(__ARM_NEON)
+#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
 #include "../simd/sse.h"
 #endif
 
@@ -221,7 +221,7 @@ namespace embree
 {
   template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
 
-#if defined(__SSE__) || defined(__ARM_NEON)
+#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
   template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
 #endif
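Editorial note on the `#if !defined(_M_ARM64)` blocks in `emath.h` above: GCC and Clang expose NEON vectors as subscriptable vector types, while MSVC defines `float32x4_t`/`__m128` on ARM64 as the `__n128` union, whose lanes are reached through members such as `n128_f32`. A hedged sketch of the divergence (the `lane0` helper is hypothetical, not part of the patch):

```cpp
#include <arm_neon.h>  // assumes an AArch64 target

// Read lane 0 of a NEON vector on either compiler family.
static inline float lane0(float32x4_t v) {
#if defined(_MSC_VER) && !defined(__clang__)
  return v.n128_f32[0];  // MSVC ARM64: __n128 union member access
#else
  return v[0];           // GCC/Clang: vector-extension subscript
#endif
}
```

`vgetq_lane_f32(v, 0)` would express the same read portably on every compiler; the patch instead mirrors the existing subscript style behind `_M_ARM64` guards.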
diff --git a/common/math/vec2fa.h b/common/math/vec2fa.h
index d57e549e68..3eb517d853 100644
--- a/common/math/vec2fa.h
+++ b/common/math/vec2fa.h
@@ -25,7 +25,7 @@ namespace embree
     typedef float Scalar;
     enum { N = 2 };
     union {
-      __m128 m128;
+      __m128 v;
       struct { float x,y,az,aw; };
     };
 
@@ -34,21 +34,25 @@ namespace embree
    ////////////////////////////////////////////////////////////////////////////////
 
    __forceinline Vec2fa( ) {}
-    __forceinline Vec2fa( const __m128 a ) : m128(a) {}
+    __forceinline Vec2fa( const __m128 a ) : v(a) {}
 
     __forceinline Vec2fa ( const Vec2<float>& other ) { x = other.x; y = other.y; }
     __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; }
 
-    __forceinline Vec2fa ( const Vec2fa& other ) { m128 = other.m128; }
-    __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; }
+    __forceinline Vec2fa ( const Vec2fa& other ) { v = other.v; }
+    __forceinline Vec2fa& operator =( const Vec2fa& other ) { v = other.v; return *this; }
 
-    __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {}
-    __forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {}
+    __forceinline explicit Vec2fa( const float a ) : v(_mm_set1_ps(a)) {}
+    __forceinline Vec2fa( const float x, const float y) : v(_mm_set_ps(y, y, y, x)) {}
 
-    __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
+#if !defined(_M_ARM64) || defined(__clang__)
+    __forceinline explicit Vec2fa( const __m128i a ) : v(_mm_cvtepi32_ps(a)) {}
+#endif
+
+    __forceinline const __m128& m128() const { return v; }
+    __forceinline __m128& m128() { return v; }
 
-    __forceinline operator const __m128&() const { return m128; }
-    __forceinline operator __m128&() { return m128; }
+    __forceinline operator vfloat4() const { return vfloat4(m128()); }
 
    ////////////////////////////////////////////////////////////////////////////////
    /// Loads and Stores
@@ -63,17 +67,17 @@ namespace embree
     }
 
     static __forceinline void storeu ( void* ptr, const Vec2fa& v ) {
-      _mm_storeu_ps((float*)ptr,v);
+      _mm_storeu_ps((float*)ptr,v.m128());
     }
 
    ////////////////////////////////////////////////////////////////////////////////
    /// Constants
    ////////////////////////////////////////////////////////////////////////////////
 
-    __forceinline Vec2fa( ZeroTy ) : m128(_mm_setzero_ps()) {}
-    __forceinline Vec2fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {}
-    __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
-    __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+    __forceinline Vec2fa( ZeroTy ) : v(_mm_setzero_ps()) {}
+    __forceinline Vec2fa( OneTy ) : v(_mm_set1_ps(1.0f)) {}
+    __forceinline Vec2fa( PosInfTy ) : v(_mm_set1_ps(pos_inf)) {}
+    __forceinline Vec2fa( NegInfTy ) : v(_mm_set1_ps(neg_inf)) {}
 
    ////////////////////////////////////////////////////////////////////////////////
    /// Array Access
@@ -90,66 +94,66 @@ namespace embree
  __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; }
  __forceinline Vec2fa operator -( const Vec2fa& a ) {
    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
-    return _mm_xor_ps(a.m128, mask);
+    return _mm_xor_ps(a.v, mask);
  }
  __forceinline Vec2fa abs ( const Vec2fa& a ) {
    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
-    return _mm_and_ps(a.m128, mask);
+    return _mm_and_ps(a.v, mask);
  }
  __forceinline Vec2fa sign ( const Vec2fa& a ) {
-    return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero)));
+    return blendv_ps(Vec2fa(one).m128(), (-Vec2fa(one)).m128(), _mm_cmplt_ps (a.m128(),Vec2fa(zero).m128()));
  }
 
  __forceinline Vec2fa rcp ( const Vec2fa& a )
  {
-#if defined(__aarch64__)
-    __m128 reciprocal = _mm_rcp_ps(a.m128);
-    reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
-    reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    __m128 reciprocal = _mm_rcp_ps(a.v);
+    reciprocal = vmulq_f32(vrecpsq_f32(a.v, reciprocal), reciprocal);
+    reciprocal = vmulq_f32(vrecpsq_f32(a.v, reciprocal), reciprocal);
    return (const Vec2fa)reciprocal;
 #else
 #if defined(__AVX512VL__)
-    const Vec2fa r = _mm_rcp14_ps(a.m128);
+    const Vec2fa r = _mm_rcp14_ps(a.v);
 #else
-    const Vec2fa r = _mm_rcp_ps(a.m128);
+    const Vec2fa r = _mm_rcp_ps(a.v);
 #endif
 
 #if defined(__AVX2__)
-    const Vec2fa h_n = _mm_fnmadd_ps(a, r, vfloat4(1.0));  // First, compute 1 - a * r (which will be very close to 0)
-    const Vec2fa res = _mm_fmadd_ps(r, h_n, r);            // Then compute r + r * h_n
+    const Vec2fa h_n = _mm_fnmadd_ps(a.m128(), r.m128(), vfloat4(1.0).m128());  // First, compute 1 - a * r (which will be very close to 0)
+    const Vec2fa res = _mm_fmadd_ps(r.m128(), h_n.m128(), r.m128());            // Then compute r + r * h_n
 #else
-    const Vec2fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r));  // First, compute 1 - a * r (which will be very close to 0)
-    const Vec2fa res = _mm_add_ps(r,_mm_mul_ps(r, h_n));             // Then compute r + r * h_n
+    const Vec2fa h_n = _mm_sub_ps(vfloat4(1.0f).m128(), _mm_mul_ps(a.m128(), r.m128()));  // First, compute 1 - a * r (which will be very close to 0)
+    const Vec2fa res = _mm_add_ps(r.m128(),_mm_mul_ps(r.m128(), h_n.m128()));             // Then compute r + r * h_n
 #endif
 
    return res;
-#endif //defined(__aarch64__)
+#endif //defined(__aarch64__) || defined(_M_ARM64)
  }
 
-  __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); }
-  __forceinline Vec2fa sqr ( const Vec2fa& a ) { return _mm_mul_ps(a,a); }
+  __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.v); }
+  __forceinline Vec2fa sqr ( const Vec2fa& a ) { return _mm_mul_ps(a.m128(),a.m128()); }
 
  __forceinline Vec2fa rsqrt( const Vec2fa& a )
  {
-#if defined(__aarch64__)
-    __m128 r = _mm_rsqrt_ps(a.m128);
-    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
-    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+#if defined(__aarch64__) || defined(_M_ARM64)
+    __m128 r = _mm_rsqrt_ps(a.v);
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.v, r), r));
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.v, r), r));
    return r;
 #else
 #if defined(__AVX512VL__)
-    __m128 r = _mm_rsqrt14_ps(a.m128);
+    __m128 r = _mm_rsqrt14_ps(a.v);
 #else
-    __m128 r = _mm_rsqrt_ps(a.m128);
+    __m128 r = _mm_rsqrt_ps(a.v);
 #endif
-    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128(), _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
 #endif
  }
 
  __forceinline Vec2fa zero_fix(const Vec2fa& a) {
-    return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
+    return blendv_ps(a.m128(), _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).v, _mm_set1_ps(min_rcp_input)));
  }
  __forceinline Vec2fa rcp_safe(const Vec2fa& a) {
    return rcp(zero_fix(a));
@@ -166,33 +170,33 @@ namespace embree
  /// Binary Operators
  ////////////////////////////////////////////////////////////////////////////////
 
-  __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); }
-  __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
-  __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
+  __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.v, b.v); }
+  __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.v, b.v); }
+  __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.v, b.v); }
  __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); }
  __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; }
-  __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); }
-  __forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
-  __forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
+  __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.v,b.v); }
+  __forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a.v,_mm_set1_ps(b)); }
+  __forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.v); }
 
-  __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }
-  __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }
+  __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.v,b.v); }
+  __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.v,b.v); }
 
-#if defined(__aarch64__) || defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__SSE4_1__)
  __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
-    const vint4 ai = _mm_castps_si128(a);
-    const vint4 bi = _mm_castps_si128(b);
-    const vint4 ci = _mm_min_epi32(ai,bi);
-    return _mm_castsi128_ps(ci);
+    const vint4 ai = _mm_castps_si128(a.m128());
+    const vint4 bi = _mm_castps_si128(b.m128());
+    const vint4 ci = _mm_min_epi32(ai.m128i(),bi.m128i());
+    return _mm_castsi128_ps(ci.m128i());
  }
 #endif
 
-#if defined(__aarch64__) || defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__SSE4_1__)
  __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
-    const vint4 ai = _mm_castps_si128(a);
-    const vint4 bi = _mm_castps_si128(b);
-    const vint4 ci = _mm_max_epi32(ai,bi);
-    return _mm_castsi128_ps(ci);
+    const vint4 ai = _mm_castps_si128(a.m128());
+    const vint4 bi = _mm_castps_si128(b.m128());
+    const vint4 ci = _mm_max_epi32(ai.m128i(),bi.m128i());
+    return _mm_castsi128_ps(ci.m128i());
  }
 #endif
 
@@ -205,10 +209,10 @@ namespace embree
  ////////////////////////////////////////////////////////////////////////////////
 
 #if defined(__AVX2__)
-  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); }
-  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); }
-  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); }
-  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); }
+  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a.m128(),b.m128(),c.m128()); }
+  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a.m128(),b.m128(),c.m128()); }
+  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a.m128(),b.m128(),c.m128()); }
+  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a.m128(),b.m128(),c.m128()); }
 #else
  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; }
  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; }
@@ -245,8 +249,8 @@ namespace embree
  /// Comparison Operators
  ////////////////////////////////////////////////////////////////////////////////
 
-  __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; }
-  __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; }
+  __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.v, b.v)) & 3) == 3; }
+  __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.v, b.v)) & 3) != 0; }
 
  ////////////////////////////////////////////////////////////////////////////////
  /// Euclidean Space Operators
@@ -254,7 +258,7 @@ namespace embree
 
 #if defined(__SSE4_1__)
  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
-    return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F));
+    return _mm_cvtss_f32(_mm_dp_ps(a.m128(),b.m128(),0x3F));
  }
 #else
  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
@@ -279,7 +283,7 @@ namespace embree
  __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) {
    __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
-    return blendv_ps(f, t, mask);
+    return blendv_ps(f.m128(), t.m128(), mask);
  }
 
  __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) {
@@ -297,14 +301,14 @@ namespace embree
  /// Rounding Functions
  ////////////////////////////////////////////////////////////////////////////////
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
  //__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); }
-  __forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); }
-  __forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); }
+  __forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a.v); }
+  __forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a.v); }
 #elif defined (__SSE4_1__)
  //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
-  __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
-  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
+  __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a.m128(), _MM_FROUND_TO_NEG_INF ); }
+  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a.m128(), _MM_FROUND_TO_POS_INF ); }
 #else
  //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); }
  __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); }
diff --git a/common/math/vec3.h b/common/math/vec3.h
index d5e78befe8..45be442054 100644
--- a/common/math/vec3.h
+++ b/common/math/vec3.h
@@ -265,7 +265,7 @@ namespace embree
 /// SSE / AVX / MIC specializations
 ////////////////////////////////////////////////////////////////////////////////
 
-#if defined(__SSE__) || defined(__ARM_NEON)
+#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
 #include "../simd/sse.h"
 #endif
 
@@ -292,14 +292,14 @@ namespace embree
  template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
    x = a.x; y = a.y; z = a.z;
  }
-#elif defined(__SSE__) || defined(__ARM_NEON)
+#elif defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
  template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
-    const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v);
+    const vfloat4 v = vfloat4(a.v); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v);
  }
 #endif
 
-#if defined(__SSE__) || defined(__ARM_NEON)
+#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
  template<> __forceinline Vec3<vfloat4> broadcast<vfloat4,vfloat4>(const Vec3<vfloat4>& a, const size_t k) {
    return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]));
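Editorial note on the recurring `m128()`/`m128i()` accessor change: the original classes converted to `__m128` implicitly via `operator __m128&()`. A hedged sketch of the pattern (the `Wide` type is hypothetical; the stated motivation, avoiding implicit conversions to the MSVC ARM64 union-based `__m128`, is an inference from the patch, not confirmed by it):

```cpp
#include <immintrin.h>  // assumption: x86 target; on ARM an sse2neon-style header stands in

// Explicit accessor instead of an implicit conversion operator: every
// intrinsic call now names the raw register at the call site.
struct Wide {
  __m128 v;
  const __m128& m128() const { return v; }
  __m128&       m128()       { return v; }
};

static inline __m128 twice(const Wide& a) {
  return _mm_add_ps(a.m128(), a.m128());  // was: _mm_add_ps(a, a) via implicit conversion
}
```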
diff --git a/common/math/vec3ba.h b/common/math/vec3ba.h
index bf24a2a3b6..53b5e258a8 100644
--- a/common/math/vec3ba.h
+++ b/common/math/vec3ba.h
@@ -23,7 +23,7 @@ namespace embree
    ALIGNED_STRUCT_(16);
 
    union {
-      __m128 m128;
+      __m128 v;
      struct { int x,y,z; };
    };
 
@@ -35,24 +35,24 @@ namespace embree
    ////////////////////////////////////////////////////////////////////////////////
 
    __forceinline Vec3ba( ) {}
-    __forceinline Vec3ba( const __m128 input ) : m128(input) {}
-    __forceinline Vec3ba( const Vec3ba& other ) : m128(other.m128) {}
-    __forceinline Vec3ba& operator =(const Vec3ba& other) { m128 = other.m128; return *this; }
+    __forceinline Vec3ba( const __m128 input ) : v(input) {}
+    __forceinline Vec3ba( const Vec3ba& other ) : v(other.v) {}
+    __forceinline Vec3ba& operator =(const Vec3ba& other) { v = other.v; return *this; }
 
    __forceinline explicit Vec3ba( bool a )
-      : m128(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
+      : v(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
    __forceinline Vec3ba( bool a, bool b, bool c)
-      : m128(mm_lookupmask_ps[(size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+      : v(mm_lookupmask_ps[(size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
 
-    __forceinline operator const __m128&() const { return m128; }
-    __forceinline operator __m128&() { return m128; }
+    __forceinline const __m128& m128() const { return v; }
+    __forceinline __m128& m128() { return v; }
 
    ////////////////////////////////////////////////////////////////////////////////
    /// Constants
    ////////////////////////////////////////////////////////////////////////////////
 
-    __forceinline Vec3ba( FalseTy ) : m128(_mm_setzero_ps()) {}
-    __forceinline Vec3ba( TrueTy ) : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {}
+    __forceinline Vec3ba( FalseTy ) : v(_mm_setzero_ps()) {}
+    __forceinline Vec3ba( TrueTy ) : v(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {}
 
    ////////////////////////////////////////////////////////////////////////////////
    /// Array Access
@@ -67,15 +67,15 @@ namespace embree
  /// Unary Operators
  ////////////////////////////////////////////////////////////////////////////////
 
-  __forceinline Vec3ba operator !( const Vec3ba& a ) { return _mm_xor_ps(a.m128, Vec3ba(embree::True)); }
+  __forceinline Vec3ba operator !( const Vec3ba& a ) { return _mm_xor_ps(a.v, Vec3ba(embree::True).v); }
 
  ////////////////////////////////////////////////////////////////////////////////
  /// Binary Operators
  ////////////////////////////////////////////////////////////////////////////////
 
-  __forceinline Vec3ba operator &( const Vec3ba& a, const Vec3ba& b ) { return _mm_and_ps(a.m128, b.m128); }
-  __forceinline Vec3ba operator |( const Vec3ba& a, const Vec3ba& b ) { return _mm_or_ps (a.m128, b.m128); }
-  __forceinline Vec3ba operator ^( const Vec3ba& a, const Vec3ba& b ) { return _mm_xor_ps(a.m128, b.m128); }
+  __forceinline Vec3ba operator &( const Vec3ba& a, const Vec3ba& b ) { return _mm_and_ps(a.v, b.v); }
+  __forceinline Vec3ba operator |( const Vec3ba& a, const Vec3ba& b ) { return _mm_or_ps (a.v, b.v); }
+  __forceinline Vec3ba operator ^( const Vec3ba& a, const Vec3ba& b ) { return _mm_xor_ps(a.v, b.v); }
 
  ////////////////////////////////////////////////////////////////////////////////
  /// Assignment Operators
@@ -90,10 +90,10 @@ namespace embree
  ////////////////////////////////////////////////////////////////////////////////
 
  __forceinline bool operator ==( const Vec3ba& a, const Vec3ba& b ) {
-    return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) == 7;
+    return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v)))) & 7) == 7;
  }
  __forceinline bool operator !=( const Vec3ba& a, const Vec3ba& b ) {
-    return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) != 7;
+    return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v)))) & 7) != 7;
  }
 
  __forceinline bool operator < ( const Vec3ba& a, const Vec3ba& b ) {
    if (a.x != b.x) return a.x < b.x;
@@ -106,14 +106,14 @@ namespace embree
  /// Reduction Operations
  ////////////////////////////////////////////////////////////////////////////////
 
-  __forceinline bool reduce_and( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) == 0x7; }
-  __forceinline bool reduce_or ( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) != 0x0; }
+  __forceinline bool reduce_and( const Vec3ba& a ) { return (_mm_movemask_ps(a.m128()) & 0x7) == 0x7; }
+  __forceinline bool reduce_or ( const Vec3ba& a ) { return (_mm_movemask_ps(a.m128()) & 0x7) != 0x0; }
 
-  __forceinline bool all ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x7; }
-  __forceinline bool any ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) != 0x0; }
-  __forceinline bool none ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x0; }
+  __forceinline bool all ( const Vec3ba& b ) { return (_mm_movemask_ps(b.m128()) & 0x7) == 0x7; }
+  __forceinline bool any ( const Vec3ba& b ) { return (_mm_movemask_ps(b.m128()) & 0x7) != 0x0; }
+  __forceinline bool none ( const Vec3ba& b ) { return (_mm_movemask_ps(b.m128()) & 0x7) == 0x0; }
 
-  __forceinline size_t movemask(const Vec3ba& a) { return _mm_movemask_ps(a) & 0x7; }
+  __forceinline size_t movemask(const Vec3ba& a) { return _mm_movemask_ps(a.m128()) & 0x7; }
 
  ////////////////////////////////////////////////////////////////////////////////
  /// Output Operators
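Editorial note on the `& 0x7` masking in `vec3ba.h` above: the reductions only need the sign bits of the low three lanes. NEON has no direct `_mm_movemask_ps`, so emulation layers build it from shifts; a hedged sketch of one common construction (helper name hypothetical, assumes an AArch64 target):

```cpp
#include <arm_neon.h>
#include <cstdint>

// Collect the per-lane sign bits of a 4-lane mask into the low 4 bits of an int.
static inline int movemask_ps_neon(float32x4_t m) {
  uint32x4_t bits = vshrq_n_u32(vreinterpretq_u32_f32(m), 31);  // sign bit -> lane LSB
  const int32_t shifts[4] = {0, 1, 2, 3};
  bits = vshlq_u32(bits, vld1q_s32(shifts));                    // lane i -> bit i
  return (int)vaddvq_u32(bits);                                 // horizontal OR via add
}
// A Vec3ba-style all() then reads (movemask_ps_neon(m) & 0x7) == 0x7.
```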
diff --git a/common/math/vec3fa.h b/common/math/vec3fa.h
index 967e75da74..68d84164f3 100644
--- a/common/math/vec3fa.h
+++ b/common/math/vec3fa.h
@@ -25,7 +25,7 @@ namespace embree
    typedef float Scalar;
    enum { N = 3 };
    union {
-      __m128 m128;
+      __m128 v;
      struct { float x,y,z; };
    };
 
@@ -34,23 +34,25 @@ namespace embree
    ////////////////////////////////////////////////////////////////////////////////
 
    __forceinline Vec3fa( ) {}
-    __forceinline Vec3fa( const __m128 a ) : m128(a) {}
+    __forceinline Vec3fa( const __m128 a ) : v(a) {}
 
-    __forceinline Vec3fa ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); }
-    //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
+    __forceinline Vec3fa ( const Vec3<float>& other ) { v = _mm_set_ps(0, other.z, other.y, other.x); }
+    //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { v = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
 
-    __forceinline Vec3fa ( const Vec3fa& other ) { m128 = other.m128; }
-    __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; }
+    __forceinline Vec3fa ( const Vec3fa& other ) { v = other.v; }
+    __forceinline Vec3fa& operator =( const Vec3fa& other ) { v = other.v; return *this; }
 
-    __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {}
-    __forceinline Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}
+    __forceinline explicit Vec3fa( const float a ) : v(_mm_set1_ps(a)) {}
+    __forceinline Vec3fa( const float x, const float y, const float z) : v(_mm_set_ps(0, z, y, x)) {}
 
-    __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
+#if !defined(_M_ARM64) || defined(__clang__)
+    __forceinline explicit Vec3fa( const __m128i a ) : v(_mm_cvtepi32_ps(a)) {}
+#endif
 
-    __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
-    __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); }
-    __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); }
-    __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }
+    __forceinline explicit operator const vfloat4() const { return vfloat4(v); }
+    __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(v)); }
+    __forceinline explicit operator const Vec2fa() const { return Vec2fa(v); }
+    __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(v)); }
 
    //__forceinline operator const __m128&() const { return m128; }
    //__forceinline operator __m128&() { return m128; }
@@ -60,9 +62,13 @@ namespace embree
    ////////////////////////////////////////////////////////////////////////////////
 
    static __forceinline Vec3fa load( const void* const a ) {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
      __m128 t = _mm_load_ps((float*)a);
+#if !defined(_M_ARM64)
      t[3] = 0.0f;
+#else
+      t.n128_f32[3] = 0.0f;
+#endif
      return Vec3fa(t);
 #else
      return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
@@ -74,17 +80,17 @@ namespace embree
    }
 
    static __forceinline void storeu ( void* ptr, const Vec3fa& v ) {
-      _mm_storeu_ps((float*)ptr,v.m128);
+      _mm_storeu_ps((float*)ptr,v.v);
    }
 
    ////////////////////////////////////////////////////////////////////////////////
    /// Constants
    ////////////////////////////////////////////////////////////////////////////////
 
-    __forceinline Vec3fa( ZeroTy ) : m128(_mm_setzero_ps()) {}
-    __forceinline Vec3fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {}
-    __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
-    __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+    __forceinline Vec3fa( ZeroTy ) : v(_mm_setzero_ps()) {}
+    __forceinline Vec3fa( OneTy ) : v(_mm_set1_ps(1.0f)) {}
+    __forceinline Vec3fa( PosInfTy ) : v(_mm_set1_ps(pos_inf)) {}
+    __forceinline Vec3fa( NegInfTy ) : v(_mm_set1_ps(neg_inf)) {}
 
    ////////////////////////////////////////////////////////////////////////////////
    /// Array Access
@@ -100,72 +106,72 @@ namespace embree
  __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; }
  __forceinline Vec3fa operator -( const Vec3fa& a ) {
-#if defined(__aarch64__)
-    return vnegq_f32(a.m128);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vnegq_f32(a.v);
 #else
    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
-    return _mm_xor_ps(a.m128, mask);
+    return _mm_xor_ps(a.v, mask);
 #endif
  }
  __forceinline Vec3fa abs ( const Vec3fa& a ) {
-#if defined(__aarch64__)
-    return _mm_abs_ps(a.m128);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return _mm_abs_ps(a.v);
 #else
    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
-    return _mm_and_ps(a.m128, mask);
+    return _mm_and_ps(a.v, mask);
 #endif
  }
  __forceinline Vec3fa sign ( const Vec3fa& a ) {
-    return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128));
+    return blendv_ps(Vec3fa(one).v, (-Vec3fa(one)).v, _mm_cmplt_ps (a.v,Vec3fa(zero).v));
  }
 
  __forceinline Vec3fa rcp ( const Vec3fa& a )
  {
-#if defined(__aarch64__)
-    return vdivq_f32(vdupq_n_f32(1.0f),a.m128);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vdivq_f32(vdupq_n_f32(1.0f),a.v);
 #else
 
 #if defined(__AVX512VL__)
-    const Vec3fa r = _mm_rcp14_ps(a.m128);
+    const Vec3fa r = _mm_rcp14_ps(a.v);
 #else
-    const Vec3fa r = _mm_rcp_ps(a.m128);
+    const Vec3fa r = _mm_rcp_ps(a.v);
 #endif
 
 #if defined(__AVX2__)
-    const Vec3fa h_n = _mm_fnmadd_ps(a.m128, r.m128, vfloat4(1.0));  // First, compute 1 - a * r (which will be very close to 0)
-    const Vec3fa res = _mm_fmadd_ps(r.m128, h_n.m128, r.m128);       // Then compute r + r * h_n
+    const Vec3fa h_n = _mm_fnmadd_ps(a.v, r.v, vfloat4(1.0).m128());  // First, compute 1 - a * r (which will be very close to 0)
+    const Vec3fa res = _mm_fmadd_ps(r.v, h_n.v, r.v);                 // Then compute r + r * h_n
 #else
-    const Vec3fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a.m128, r.m128));  // First, compute 1 - a * r (which will be very close to 0)
-    const Vec3fa res = _mm_add_ps(r.m128,_mm_mul_ps(r.m128, h_n.m128));        // Then compute r + r * h_n
+    const Vec3fa h_n = _mm_sub_ps(vfloat4(1.0f).m128(), _mm_mul_ps(a.v, r.v));  // First, compute 1 - a * r (which will be very close to 0)
+    const Vec3fa res = _mm_add_ps(r.v,_mm_mul_ps(r.v, h_n.v));                  // Then compute r + r * h_n
 #endif
 
    return res;
-#endif //defined(__aarch64__)
+#endif //defined(__aarch64__) || defined(_M_ARM64)
  }
 
-  __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); }
-  __forceinline Vec3fa sqr ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); }
+  __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.v); }
+  __forceinline Vec3fa sqr ( const Vec3fa& a ) { return _mm_mul_ps(a.v,a.v); }
 
  __forceinline Vec3fa rsqrt( const Vec3fa& a )
  {
-#if defined(__aarch64__)
-    __m128 r = _mm_rsqrt_ps(a.m128);
-    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
-    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+#if defined(__aarch64__) || defined(_M_ARM64)
+    __m128 r = _mm_rsqrt_ps(a.v);
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.v, r), r));
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.v, r), r));
    return r;
 #else
 
 #if defined(__AVX512VL__)
-    __m128 r = _mm_rsqrt14_ps(a.m128);
+    __m128 r = _mm_rsqrt14_ps(a.v);
 #else
-    __m128 r = _mm_rsqrt_ps(a.m128);
+    __m128 r = _mm_rsqrt_ps(a.v);
 #endif
-    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.v, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
 #endif
  }
 
  __forceinline Vec3fa zero_fix(const Vec3fa& a) {
-    return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
+    return blendv_ps(a.v, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).v, _mm_set1_ps(min_rcp_input)));
  }
  __forceinline Vec3fa rcp_safe(const Vec3fa& a) {
    return rcp(zero_fix(a));
@@ -182,33 +188,33 @@ namespace embree
  /// Binary Operators
  ////////////////////////////////////////////////////////////////////////////////
 
-  __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); }
-  __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
-  __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
+  __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.v, b.v); }
+  __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.v, b.v); }
+  __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.v, b.v); }
  __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); }
  __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; }
-  __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); }
-  __forceinline Vec3fa operator /( const Vec3fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
-  __forceinline Vec3fa operator /( const float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
+  __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.v,b.v); }
+  __forceinline Vec3fa operator /( const Vec3fa& a, const float b ) { return _mm_div_ps(a.v,_mm_set1_ps(b)); }
+  __forceinline Vec3fa operator /( const float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.v); }
 
-  __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); }
-  __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); }
+  __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.v,b.v); }
+  __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.v,b.v); }
 
-#if defined(__aarch64__) || defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__SSE4_1__)
  __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) {
-    const vint4 ai = _mm_castps_si128(a.m128);
-    const vint4 bi = _mm_castps_si128(b.m128);
-    const vint4 ci = _mm_min_epi32(ai,bi);
-    return _mm_castsi128_ps(ci);
+    const vint4 ai = _mm_castps_si128(a.v);
+    const vint4 bi = _mm_castps_si128(b.v);
+    const vint4 ci = _mm_min_epi32(ai.m128i(),bi.m128i());
+    return _mm_castsi128_ps(ci.m128i());
  }
 #endif
 
-#if defined(__aarch64__) || defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__SSE4_1__)
  __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) {
-    const vint4 ai = _mm_castps_si128(a.m128);
-    const vint4 bi = _mm_castps_si128(b.m128);
-    const vint4 ci = _mm_max_epi32(ai,bi);
-    return _mm_castsi128_ps(ci);
+    const vint4 ai = _mm_castps_si128(a.v);
+    const vint4 bi = _mm_castps_si128(b.v);
+    const vint4 ci = _mm_max_epi32(ai.m128i(),bi.m128i());
+    return _mm_castsi128_ps(ci.m128i());
  }
 #endif
 
@@ -220,11 +226,11 @@ namespace embree
  /// Ternary Operators
  ////////////////////////////////////////////////////////////////////////////////
 
-#if defined(__AVX2__) || defined(__ARM_NEON)
-  __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
-  __forceinline Vec3fa msub  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
-  __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
-  __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
+#if defined(__AVX2__) || defined(__ARM_NEON) || defined(_M_ARM64)
+  __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.v,b.v,c.v); }
+  __forceinline Vec3fa msub  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.v,b.v,c.v); }
+  __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.v,b.v,c.v); }
+  __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.v,b.v,c.v); }
 #else
  __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; }
  __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;}
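Editorial note on the widened `madd`/`msub` guard above: on ARM targets, sse2neon maps `_mm_fmadd_ps` to the fused NEON multiply-add, so `a*b+c` becomes a single instruction with one rounding. A minimal sketch of the underlying intrinsic (assumes an AArch64 target; helper name hypothetical):

```cpp
#include <arm_neon.h>

// What an _mm_fmadd_ps(a, b, c) emulation lowers to on AArch64:
// vfmaq_f32(acc, x, y) computes acc + x*y, fused.
static inline float32x4_t madd_neon(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vfmaq_f32(c, a, b);  // c + a*b with a single rounding step
}
```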
@@ -251,30 +257,42 @@ namespace embree
  ////////////////////////////////////////////////////////////////////////////////
  /// Reductions
  ////////////////////////////////////////////////////////////////////////////////
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
  __forceinline float reduce_add(const Vec3fa& v) {
-    float32x4_t t = v.m128;
+    float32x4_t t = v.v;
+#if !defined(_M_ARM64)
    t[3] = 0.0f;
+#else
+    t.n128_f32[3] = 0.0f;
+#endif
    return vaddvq_f32(t);
  }
 
  __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
 
  __forceinline float reduce_min(const Vec3fa& v) {
-    float32x4_t t = v.m128;
+    float32x4_t t = v.v;
+#if !defined(_M_ARM64)
    t[3] = t[2];
+#else
+    t.n128_f32[3] = t.n128_f32[2];
+#endif
    return vminvq_f32(t);
  }
 
  __forceinline float reduce_max(const Vec3fa& v) {
-    float32x4_t t = v.m128;
+    float32x4_t t = v.v;
+#if !defined(_M_ARM64)
    t[3] = t[2];
+#else
+    t.n128_f32[3] = t.n128_f32[2];
+#endif
    return vmaxvq_f32(t);
  }
 #else
  __forceinline float reduce_add(const Vec3fa& v) {
-    const vfloat4 a(v.m128);
+    const vfloat4 a(v.v);
    const vfloat4 b = shuffle<1>(a);
    const vfloat4 c = shuffle<2>(a);
-    return _mm_cvtss_f32(a+b+c);
+    return _mm_cvtss_f32((a+b+c).m128());
  }
 
  __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
@@ -286,19 +304,19 @@ namespace embree
  /// Comparison Operators
  ////////////////////////////////////////////////////////////////////////////////
 
-  __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
-  __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }
+  __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.v, b.v)) & 7) == 7; }
+  __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.v, b.v)) & 7) != 0; }
 
-  __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }
-  __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
-  __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
-  __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); }
- #if defined(__aarch64__)
-  __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); }
-  __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); }
+  __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.v, b.v); }
+  __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.v, b.v); }
+  __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.v, b.v); }
+  __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.v, b.v); }
+ #if defined(__aarch64__) || defined(_M_ARM64)
+  __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.v, b.v); }
+  __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.v, b.v); }
 #else
-  __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); }
-  __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); }
+  __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.v, b.v); }
+  __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.v, b.v); }
 #endif
 
  __forceinline bool isvalid ( const Vec3fa& v ) {
@@ -310,11 +328,11 @@ namespace embree
  }
 
  __forceinline bool isvalid4 ( const Vec3fa& v ) {
-    return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));
+    return all((vfloat4(v.v) > vfloat4(-FLT_LARGE)) & (vfloat4(v.v) < vfloat4(+FLT_LARGE)));
  }
 
  __forceinline bool is_finite4 ( const Vec3fa& a ) {
-    return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));
+    return all((vfloat4(a.v) >= vfloat4(-FLT_MAX)) & (vfloat4(a.v) <= vfloat4(+FLT_MAX)));
  }
 
  ////////////////////////////////////////////////////////////////////////////////
@@ -323,7 +341,7 @@ namespace embree
 
 #if defined(__SSE4_1__)
  __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
-    return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F));
+    return _mm_cvtss_f32(_mm_dp_ps(a.v,b.v,0x7F));
  }
 #else
  __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
@@ -333,11 +351,11 @@ namespace embree
 
  __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b )
  {
-    vfloat4 a0 = vfloat4(a.m128);
-    vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));
-    vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));
-    vfloat4 b1 = vfloat4(b.m128);
-    return Vec3fa(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));
+    vfloat4 a0 = vfloat4(a.v);
+    vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.v));
+    vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.v));
+    vfloat4 b1 = vfloat4(b.v);
+    return Vec3fa(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)).m128());
  }
 
  __forceinline float sqr_length ( const Vec3fa& a ) { return dot(a,a); }
@@ -367,11 +385,11 @@ namespace embree
 
  __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) {
    __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps();
-    return blendv_ps(f.m128, t.m128, mask);
+    return blendv_ps(f.v, t.v, mask);
  }
 
  __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) {
-    return blendv_ps(f.m128, t.m128, s);
+    return blendv_ps(f.v, t.v, s.v);
  }
 
  __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) {
@@ -392,14 +410,14 @@ namespace embree
  /// Rounding Functions
  ////////////////////////////////////////////////////////////////////////////////
 
-#if defined(__aarch64__)
-  __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); }
-  __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); }
-  __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); }
+#if defined(__aarch64__) || defined(_M_ARM64)
+  __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.v); }
+  __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.v); }
+  __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.v); }
 #elif defined (__SSE4_1__)
-  __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); }
-  __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); }
-  __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); }
+  __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.v, _MM_FROUND_TO_NEAREST_INT); }
+  __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.v, _MM_FROUND_TO_NEG_INF ); }
+  __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.v, _MM_FROUND_TO_POS_INF ); }
 #else
  __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); }
  __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); }
@@ -428,7 +446,7 @@ namespace embree
    typedef float Scalar;
    enum { N = 3 };
    union {
-      __m128 m128;
+      __m128 v;
      struct { float x,y,z; union { int a; unsigned u; float w; }; };
    };
 
@@ -437,45 +455,47 @@ namespace embree
    ////////////////////////////////////////////////////////////////////////////////
 
    __forceinline Vec3fx( ) {}
-    __forceinline Vec3fx( const __m128 a ) : m128(a) {}
+    __forceinline Vec3fx( const __m128 a ) : v(a) {}
 
-    __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {}
-    __forceinline operator Vec3fa () const { return Vec3fa(m128); }
+    __forceinline explicit Vec3fx(const Vec3fa& v) : v(v.v) {}
+    __forceinline operator Vec3fa () const { return Vec3fa(v); }
 
-    __forceinline explicit Vec3fx ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); }
-    //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
+    __forceinline explicit Vec3fx ( const Vec3<float>& other ) { v = _mm_set_ps(0, other.z, other.y, other.x); }
+    //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { v = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
 
-    __forceinline Vec3fx ( const Vec3fx& other ) { m128 = other.m128; }
-    __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; }
+    __forceinline Vec3fx ( const Vec3fx& other ) { v = other.v; }
+    __forceinline Vec3fx& operator =( const Vec3fx& other ) { v = other.v; return *this; }
 
-    __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {}
-    __forceinline Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}
+    __forceinline explicit Vec3fx( const float a ) : v(_mm_set1_ps(a)) {}
+    __forceinline Vec3fx( const float x, const float y, const float z) : v(_mm_set_ps(0, z, y, x)) {}
 
-    __forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; }
-    __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; }
+    __forceinline Vec3fx( const Vec3fa& other, const int a1) { v = other.v; a = a1; }
+    __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { v = other.v; u = a1; }
    __forceinline Vec3fx( const Vec3fa& other, const float w1) {
 #if defined (__aarch64__)
-      m128 = other.m128; m128[3] = w1;
+      v = other.v; v[3] = w1;
+#elif defined(_M_ARM64)
+      v = other.v; v.n128_f32[3] = w1;
 #elif defined (__SSE4_1__)
-      m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4);
+      v = _mm_insert_ps(other.v, _mm_set_ss(w1),3 << 4);
 #else
      const vint4 mask(-1,-1,-1,0);
-      m128 = select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1));
+      v = select(vboolf4(_mm_castsi128_ps(mask.m128i())),vfloat4(other.v),vfloat4(w1)).m128();
 #endif
    }
    //__forceinline Vec3fx( const float x, const float y, const float z, const int a) : x(x), y(y), z(z), a(a) {} // not working properly!
    //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly!
-    __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {}
+    __forceinline Vec3fx( const float x, const float y, const float z, const float w) : v(_mm_set_ps(w, z, y, x)) {}
 
-    //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
+    //__forceinline explicit Vec3fx( const __m128i a ) : v(_mm_cvtepi32_ps(a)) {}
 
-    __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
-    __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); }
-    __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); }
-    __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }
+    __forceinline explicit operator const vfloat4() const { return vfloat4(v); }
+    __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(v)); }
+    __forceinline explicit operator const Vec2fa() const { return Vec2fa(v); }
+    __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(v)); }
 
-    //__forceinline operator const __m128&() const { return m128; }
-    //__forceinline operator __m128&() { return m128; }
+    //__forceinline operator const __m128&() const { return v; }
+    //__forceinline operator __m128&() { return v; }
 
    ////////////////////////////////////////////////////////////////////////////////
    /// Loads and Stores
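Editorial note on the `Vec3fx(const Vec3fa& other, const float w1)` constructor above: it replaces a single lane while keeping the rest of the register. On MSVC ARM64 the patch writes `v.n128_f32[3]` directly; the portable NEON intrinsic for the same operation is `vsetq_lane_f32`, sketched below (assumes an AArch64 target; helper name hypothetical):

```cpp
#include <arm_neon.h>

// NEON counterpart of _mm_insert_ps(other, _mm_set_ss(w), 3 << 4):
// replace lane 3 with w, leaving lanes 0..2 untouched.
static inline float32x4_t set_w(float32x4_t v, float w) {
  return vsetq_lane_f32(w, v, 3);
}
```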
@@ -490,17 +510,17 @@ namespace embree
    }
 
    static __forceinline void storeu ( void* ptr, const Vec3fx& v ) {
-      _mm_storeu_ps((float*)ptr,v.m128);
+      _mm_storeu_ps((float*)ptr,v.v);
    }
 
    ////////////////////////////////////////////////////////////////////////////////
    /// Constants
    ////////////////////////////////////////////////////////////////////////////////
 
-    __forceinline Vec3fx( ZeroTy ) : m128(_mm_setzero_ps()) {}
-    __forceinline Vec3fx( OneTy ) : m128(_mm_set1_ps(1.0f)) {}
-    __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
-    __forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+    __forceinline Vec3fx( ZeroTy ) : v(_mm_setzero_ps()) {}
+    __forceinline Vec3fx( OneTy ) : v(_mm_set1_ps(1.0f)) {}
+    __forceinline Vec3fx( PosInfTy ) : v(_mm_set1_ps(pos_inf)) {}
+    __forceinline Vec3fx( NegInfTy ) : v(_mm_set1_ps(neg_inf)) {}
 
    ////////////////////////////////////////////////////////////////////////////////
    /// Array Access
@@ -517,49 +537,49 @@ namespace embree
  __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; }
  __forceinline Vec3fx operator -( const Vec3fx& a ) {
    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
-    return _mm_xor_ps(a.m128, mask);
+    return _mm_xor_ps(a.v, mask);
  }
  __forceinline Vec3fx abs ( const Vec3fx& a ) {
    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
-    return _mm_and_ps(a.m128, mask);
+    return _mm_and_ps(a.v, mask);
  }
  __forceinline Vec3fx sign ( const Vec3fx& a ) {
-    return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128));
+    return blendv_ps(Vec3fx(one).v, (-Vec3fx(one)).v, _mm_cmplt_ps (a.v,Vec3fx(zero).v));
  }
 
  __forceinline Vec3fx rcp ( const Vec3fx& a )
  {
 #if defined(__AVX512VL__)
-    const Vec3fx r = _mm_rcp14_ps(a.m128);
+    const Vec3fx r = _mm_rcp14_ps(a.v);
 #else
-    const Vec3fx r = _mm_rcp_ps(a.m128);
+    const Vec3fx r = _mm_rcp_ps(a.v);
 #endif
 
 #if defined(__AVX2__)
-    const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f)));
+    const Vec3fx res = _mm_mul_ps(r.v,_mm_fnmadd_ps(r.v, a.v, vfloat4(2.0f).m128()));
 #else
-    const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128)));
+    const Vec3fx res = _mm_mul_ps(r.v,_mm_sub_ps(vfloat4(2.0f).m128(), _mm_mul_ps(r.v, a.v)));
    //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
 #endif
 
    return res;
  }
 
-  __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); }
-  __forceinline Vec3fx sqr ( const Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); }
+  __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.v); }
+  __forceinline Vec3fx sqr ( const Vec3fx& a ) { return _mm_mul_ps(a.v,a.v); }
 
  __forceinline Vec3fx rsqrt( const Vec3fx& a )
  {
 #if defined(__AVX512VL__)
-    __m128 r = _mm_rsqrt14_ps(a.m128);
+    __m128 r = _mm_rsqrt14_ps(a.v);
 #else
-    __m128 r = _mm_rsqrt_ps(a.m128);
+    __m128 r = _mm_rsqrt_ps(a.v);
 #endif
-    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.v, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
  }
 
  __forceinline Vec3fx zero_fix(const Vec3fx& a) {
-    return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
+    return blendv_ps(a.v, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).v, _mm_set1_ps(min_rcp_input)));
  }
  __forceinline Vec3fx rcp_safe(const Vec3fx& a) {
    return rcp(zero_fix(a));
@@ -576,33 +596,33 @@ namespace embree
  /// Binary Operators
  ////////////////////////////////////////////////////////////////////////////////
 
-  __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); }
-  __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); }
-  __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); }
+  __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.v, b.v); }
+  __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.v, b.v); }
+  __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.v, b.v); }
  __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); }
  __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; }
-  __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); }
-  __forceinline Vec3fx operator /( const Vec3fx& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
-  __forceinline Vec3fx operator /( const float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
+  __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.v,b.v); }
+  __forceinline Vec3fx operator /( const Vec3fx& a, const float b ) { return _mm_div_ps(a.v,_mm_set1_ps(b)); }
+  __forceinline Vec3fx operator /( const float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.v); }
 
-  __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); }
-  __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); }
+  __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.v,b.v); }
+  __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.v,b.v); }
 
-#if defined(__SSE4_1__) || defined(__aarch64__)
+#if defined(__SSE4_1__) || defined(__aarch64__) || defined(_M_ARM64)
  __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) {
-    const vint4 ai = _mm_castps_si128(a.m128);
-    const vint4 bi = _mm_castps_si128(b.m128);
-    const vint4 ci = _mm_min_epi32(ai,bi);
-    return _mm_castsi128_ps(ci);
+    const vint4 ai = _mm_castps_si128(a.v);
+    const vint4 bi = _mm_castps_si128(b.v);
+    const vint4 ci = _mm_min_epi32(ai.m128i(),bi.m128i());
+    return _mm_castsi128_ps(ci.m128i());
  }
 #endif
 
-#if defined(__SSE4_1__) || defined(__aarch64__)
+#if defined(__SSE4_1__) || defined(__aarch64__) || defined(_M_ARM64)
  __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) {
-    const vint4 ai = _mm_castps_si128(a.m128);
-    const vint4 bi = _mm_castps_si128(b.m128);
-    const vint4 ci = _mm_max_epi32(ai,bi);
-    return _mm_castsi128_ps(ci);
+    const vint4 ai = _mm_castps_si128(a.v);
+    const vint4 bi = _mm_castps_si128(b.v);
+    const vint4 ci = _mm_max_epi32(ai.m128i(),bi.m128i());
+    return _mm_castsi128_ps(ci.m128i());
  }
 #endif
 
@@ -615,10 +635,10 @@ namespace embree
  ////////////////////////////////////////////////////////////////////////////////
 
 #if defined(__AVX2__)
-  __forceinline Vec3fx madd  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
-  __forceinline Vec3fx msub  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
-  __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
-  __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
+  __forceinline Vec3fx madd  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.v,b.v,c.v); }
+  __forceinline Vec3fx msub  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.v,b.v,c.v); }
+  __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.v,b.v,c.v); }
+  __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.v,b.v,c.v); }
 #else
  __forceinline Vec3fx madd  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; }
  __forceinline Vec3fx msub  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; }
@@ -647,10 +667,10 @@ namespace embree
  ////////////////////////////////////////////////////////////////////////////////
 
  __forceinline float reduce_add(const Vec3fx& v) {
-    const vfloat4 a(v.m128);
+    const vfloat4 a(v.v);
    const vfloat4 b = shuffle<1>(a);
    const vfloat4 c = shuffle<2>(a);
-    return _mm_cvtss_f32(a+b+c);
+    return _mm_cvtss_f32((a+b+c).m128());
  }
 
  __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; }
@@ -661,15 +681,15 @@ namespace embree
  /// Comparison Operators
  ////////////////////////////////////////////////////////////////////////////////
 
-  __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; }
-  __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; }
+  __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.v, b.v)) & 7) == 7; }
+  __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.v, b.v)) & 7) != 0; }
 
-  __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }
-  __forceinline Vec3ba
neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } - __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); } - __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); } - __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } - __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } + __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.v, b.v); } + __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.v, b.v); } + __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.v, b.v); } + __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.v, b.v); } + __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.v, b.v); } + __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.v, b.v); } __forceinline bool isvalid ( const Vec3fx& v ) { return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE))); @@ -680,11 +700,11 @@ namespace embree } __forceinline bool isvalid4 ( const Vec3fx& v ) { - return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE))); + return all((vfloat4(v.v) > vfloat4(-FLT_LARGE)) & (vfloat4(v.v) < vfloat4(+FLT_LARGE))); } __forceinline bool is_finite4 ( const Vec3fx& a ) { - return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX))); + return all((vfloat4(a.v) >= vfloat4(-FLT_MAX)) & (vfloat4(a.v) <= vfloat4(+FLT_MAX))); } //////////////////////////////////////////////////////////////////////////////// @@ -693,7 +713,7 @@ namespace embree #if defined(__SSE4_1__) __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { - return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); + return _mm_cvtss_f32(_mm_dp_ps(a.v,b.v,0x7F)); } #else __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { @@ -703,11 +723,11 @@ namespace embree __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b ) { - vfloat4 a0 = vfloat4(a.m128); - vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); - vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128)); - vfloat4 b1 = vfloat4(b.m128); - return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1))); + vfloat4 a0 = vfloat4(a.v); + vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.v)); + vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.v)); + vfloat4 b1 = vfloat4(b.v); + return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)).m128()); } __forceinline float sqr_length ( const Vec3fx& a ) { return dot(a,a); } @@ -737,11 +757,11 @@ namespace embree __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) { __m128 mask = s ? 
_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); - return blendv_ps(f.m128, t.m128, mask); + return blendv_ps(f.v, t.v, mask); } __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) { - return blendv_ps(f.m128, t.m128, s); + return blendv_ps(f.v, t.v, s.m128()); } __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) { @@ -762,14 +782,14 @@ namespace embree /// Rounding Functions //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) - __forceinline Vec3fx trunc(const Vec3fx& a) { return vrndq_f32(a.m128); } - __forceinline Vec3fx floor(const Vec3fx& a) { return vrndmq_f32(a.m128); } - __forceinline Vec3fx ceil (const Vec3fx& a) { return vrndpq_f32(a.m128); } +#if defined(__aarch64__) || defined(_M_ARM64) + __forceinline Vec3fx trunc(const Vec3fx& a) { return vrndq_f32(a.v); } + __forceinline Vec3fx floor(const Vec3fx& a) { return vrndmq_f32(a.v); } + __forceinline Vec3fx ceil (const Vec3fx& a) { return vrndpq_f32(a.v); } #elif defined (__SSE4_1__) - __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } - __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } - __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } + __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.v, _MM_FROUND_TO_NEAREST_INT); } + __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.v, _MM_FROUND_TO_NEG_INF ); } + __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.v, _MM_FROUND_TO_POS_INF ); } #else __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); } __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); } diff --git a/common/math/vec3ia.h b/common/math/vec3ia.h index 1472fe9135..6c0705fdd4 100644 --- a/common/math/vec3ia.h +++ b/common/math/vec3ia.h @@ -23,7 +23,7 @@ namespace embree ALIGNED_STRUCT_(16); union { - __m128i m128; + __m128i v; struct { int x,y,z; }; }; @@ -35,25 +35,31 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// __forceinline Vec3ia( ) {} - __forceinline Vec3ia( const __m128i a ) : m128(a) {} - __forceinline Vec3ia( const Vec3ia& other ) : m128(other.m128) {} - __forceinline Vec3ia& operator =(const Vec3ia& other) { m128 = other.m128; return *this; } + __forceinline Vec3ia( const __m128i a ) : v(a) {} + __forceinline Vec3ia( const Vec3ia& other ) : v(other.v) {} + __forceinline Vec3ia& operator =(const Vec3ia& other) { v = other.v; return *this; } + + __forceinline explicit Vec3ia( const int a ) : v(_mm_set1_epi32(a)) {} + __forceinline Vec3ia( const int x, const int y, const int z) : v(_mm_set_epi32(z, z, y, x)) {} +#if !defined(_M_ARM64) || defined(__clang__) + __forceinline explicit Vec3ia( const __m128 a ) : v(_mm_cvtps_epi32(a)) {} +#endif + + __forceinline const __m128i& m128i() const { return v; } + __forceinline __m128i& m128i() { return v; } - __forceinline explicit Vec3ia( const int a ) : m128(_mm_set1_epi32(a)) {} - __forceinline Vec3ia( const int x, const int y, const int z) : m128(_mm_set_epi32(z, z, y, x)) {} - __forceinline explicit Vec3ia( const __m128 a ) : m128(_mm_cvtps_epi32(a)) {} + __forceinline __m128 m128() const { return _mm_cvtepi32_ps(v); } - __forceinline operator const __m128i&() 
const { return m128; } - __forceinline operator __m128i&() { return m128; } + __forceinline operator vint4() const { return vint4(m128i()); } //////////////////////////////////////////////////////////////////////////////// /// Constants //////////////////////////////////////////////////////////////////////////////// - __forceinline Vec3ia( ZeroTy ) : m128(_mm_setzero_si128()) {} - __forceinline Vec3ia( OneTy ) : m128(_mm_set1_epi32(1)) {} - __forceinline Vec3ia( PosInfTy ) : m128(_mm_set1_epi32(pos_inf)) {} - __forceinline Vec3ia( NegInfTy ) : m128(_mm_set1_epi32(neg_inf)) {} + __forceinline Vec3ia( ZeroTy ) : v(_mm_setzero_si128()) {} + __forceinline Vec3ia( OneTy ) : v(_mm_set1_epi32(1)) {} + __forceinline Vec3ia( PosInfTy ) : v(_mm_set1_epi32(pos_inf)) {} + __forceinline Vec3ia( NegInfTy ) : v(_mm_set1_epi32(neg_inf)) {} //////////////////////////////////////////////////////////////////////////////// /// Array Access @@ -69,49 +75,49 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// __forceinline Vec3ia operator +( const Vec3ia& a ) { return a; } - __forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); } -#if (defined(__aarch64__)) - __forceinline Vec3ia abs ( const Vec3ia& a ) { return vabsq_s32(a.m128); } + __forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.v); } +#if defined(__aarch64__) || defined(_M_ARM64) + __forceinline Vec3ia abs ( const Vec3ia& a ) { return vabsq_s32(a.v); } #elif defined(__SSSE3__) - __forceinline Vec3ia abs ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); } + __forceinline Vec3ia abs ( const Vec3ia& a ) { return _mm_abs_epi32(a.v); } #endif //////////////////////////////////////////////////////////////////////////////// /// Binary Operators //////////////////////////////////////////////////////////////////////////////// - __forceinline Vec3ia operator +( const Vec3ia& a, const Vec3ia& b ) { return _mm_add_epi32(a.m128, b.m128); } + __forceinline Vec3ia operator +( const Vec3ia& a, const Vec3ia& b ) { return _mm_add_epi32(a.v, b.v); } __forceinline Vec3ia operator +( const Vec3ia& a, const int b ) { return a+Vec3ia(b); } __forceinline Vec3ia operator +( const int a, const Vec3ia& b ) { return Vec3ia(a)+b; } - __forceinline Vec3ia operator -( const Vec3ia& a, const Vec3ia& b ) { return _mm_sub_epi32(a.m128, b.m128); } + __forceinline Vec3ia operator -( const Vec3ia& a, const Vec3ia& b ) { return _mm_sub_epi32(a.v, b.v); } __forceinline Vec3ia operator -( const Vec3ia& a, const int b ) { return a-Vec3ia(b); } __forceinline Vec3ia operator -( const int a, const Vec3ia& b ) { return Vec3ia(a)-b; } -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.m128, b.m128); } +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__SSE4_1__) + __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) { return _mm_mullo_epi32(a.v, b.v); } __forceinline Vec3ia operator *( const Vec3ia& a, const int b ) { return a * Vec3ia(b); } __forceinline Vec3ia operator *( const int a, const Vec3ia& b ) { return Vec3ia(a) * b; } #endif - __forceinline Vec3ia operator &( const Vec3ia& a, const Vec3ia& b ) { return _mm_and_si128(a.m128, b.m128); } + __forceinline Vec3ia operator &( const Vec3ia& a, const Vec3ia& b ) { return _mm_and_si128(a.v, b.v); } __forceinline Vec3ia operator &( const Vec3ia& a, const int b ) { return a & Vec3ia(b); } 
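(Editorial aside, not part of the patch: the new m128i()/m128() accessors above replace the old implicit conversion operators, presumably because MSVC's arm64_neon.h maps every NEON vector type, and therefore sse2neon's __m128 and __m128i, onto the same __n128 union; overloads and constructors that differ only in those types then collide, which would also explain why the Vec3ia(const __m128) constructor is compiled out on _M_ARM64 unless clang-cl is used. A sketch of the failure mode under that assumption:)

    #include <arm_neon.h>

    struct V {
        int32x4_t v;
        V(int32x4_t a) : v(a) {}
    #if !defined(_M_ARM64) || defined(__clang__)
        // On GCC/Clang, float32x4_t is a distinct vector type, so this is
        // a legitimate second constructor. On native MSVC ARM64 both
        // float32x4_t and int32x4_t are typedefs of __n128, and this
        // would redeclare the constructor above.
        explicit V(float32x4_t a) : v(vcvtq_s32_f32(a)) {}
    #endif
    };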
__forceinline Vec3ia operator &( const int a, const Vec3ia& b ) { return Vec3ia(a) & b; } - __forceinline Vec3ia operator |( const Vec3ia& a, const Vec3ia& b ) { return _mm_or_si128(a.m128, b.m128); } + __forceinline Vec3ia operator |( const Vec3ia& a, const Vec3ia& b ) { return _mm_or_si128(a.v, b.v); } __forceinline Vec3ia operator |( const Vec3ia& a, const int b ) { return a | Vec3ia(b); } __forceinline Vec3ia operator |( const int a, const Vec3ia& b ) { return Vec3ia(a) | b; } - __forceinline Vec3ia operator ^( const Vec3ia& a, const Vec3ia& b ) { return _mm_xor_si128(a.m128, b.m128); } + __forceinline Vec3ia operator ^( const Vec3ia& a, const Vec3ia& b ) { return _mm_xor_si128(a.v, b.v); } __forceinline Vec3ia operator ^( const Vec3ia& a, const int b ) { return a ^ Vec3ia(b); } __forceinline Vec3ia operator ^( const int a, const Vec3ia& b ) { return Vec3ia(a) ^ b; } - __forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) { return _mm_slli_epi32(a.m128, n); } - __forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { return _mm_srai_epi32(a.m128, n); } + __forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) { return _mm_slli_epi32(a.v, n); } + __forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) { return _mm_srai_epi32(a.v, n); } - __forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return _mm_slli_epi32(a.m128, b); } - __forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return _mm_srai_epi32(a.m128, b); } - __forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { return _mm_srli_epi32(a.m128, b); } + __forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return _mm_slli_epi32(a.v, b); } + __forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return _mm_srai_epi32(a.v, b); } + __forceinline Vec3ia srl ( const Vec3ia& a, const int b ) { return _mm_srli_epi32(a.v, b); } //////////////////////////////////////////////////////////////////////////////// /// Assignment Operators @@ -123,7 +129,7 @@ namespace embree __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { return a = a - b; } __forceinline Vec3ia& operator -=( Vec3ia& a, const int& b ) { return a = a - b; } -#if defined(__aarch64__) || defined(__SSE4_1__) +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__SSE4_1__) __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { return a = a * b; } __forceinline Vec3ia& operator *=( Vec3ia& a, const int& b ) { return a = a * b; } #endif @@ -134,7 +140,7 @@ namespace embree __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { return a = a | b; } __forceinline Vec3ia& operator |=( Vec3ia& a, const int& b ) { return a = a | b; } -#if !defined(__ARM_NEON) +#if !defined(__ARM_NEON) && !defined(_M_ARM64) __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { return a = a << b; } __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { return a = a >> b; } #endif @@ -144,21 +150,21 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) { -#if defined(__aarch64__) || defined(__SSE4_1__) - return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__SSE4_1__) + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f.m128i()), _mm_castsi128_ps(t.m128i()), m.m128())); #else - return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m), t), 
_mm_andnot_si128(_mm_castps_si128(m), f)); + return _mm_or_si128(_mm_and_si128(_mm_castps_si128(m.m128()), t.m128i()), _mm_andnot_si128(_mm_castps_si128(m.m128()), f.m128i())); #endif } //////////////////////////////////////////////////////////////////////////////// /// Reductions //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) - __forceinline int reduce_add(const Vec3ia& v) { return vaddvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0))); } +#if defined(__aarch64__) || defined(_M_ARM64) + __forceinline int reduce_add(const Vec3ia& v) { return vaddvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0)).v); } __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } - __forceinline int reduce_min(const Vec3ia& v) { return vminvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0x7FFFFFFF))); } - __forceinline int reduce_max(const Vec3ia& v) { return vmaxvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0x80000000))); } + __forceinline int reduce_min(const Vec3ia& v) { return vminvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0x7FFFFFFF)).v); } + __forceinline int reduce_max(const Vec3ia& v) { return vmaxvq_s32(select(Vec3ba(1,1,1),v,Vec3ia(0x80000000)).v); } #else __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; } __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; } @@ -170,8 +176,8 @@ namespace embree /// Comparison Operators //////////////////////////////////////////////////////////////////////////////// - __forceinline bool operator ==( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) == 7; } - __forceinline bool operator !=( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.m128, b.m128))) & 7) != 7; } + __forceinline bool operator ==( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.v, b.v))) & 7) == 7; } + __forceinline bool operator !=( const Vec3ia& a, const Vec3ia& b ) { return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(a.v, b.v))) & 7) != 7; } __forceinline bool operator < ( const Vec3ia& a, const Vec3ia& b ) { if (a.x != b.x) return a.x < b.x; if (a.y != b.y) return a.y < b.y; @@ -179,13 +185,13 @@ namespace embree return false; } - __forceinline Vec3ba eq_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpeq_epi32 (a.m128, b.m128)); } - __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.m128, b.m128)); } - __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.m128, b.m128)); } + __forceinline Vec3ba eq_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpeq_epi32 (a.v, b.v)); } + __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmplt_epi32 (a.v, b.v)); } + __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) { return _mm_castsi128_ps(_mm_cmpgt_epi32 (a.v, b.v)); } -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); } - __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); } +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__SSE4_1__) + __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.v,b.v); } + __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return 
_mm_max_epi32(a.v,b.v); } #else __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return select(lt_mask(a,b),a,b); } __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return select(gt_mask(a,b),a,b); } diff --git a/common/math/vec4.h b/common/math/vec4.h index 5647859257..110def1627 100644 --- a/common/math/vec4.h +++ b/common/math/vec4.h @@ -205,7 +205,7 @@ namespace embree /// SSE / AVX / MIC specializations //////////////////////////////////////////////////////////////////////////////// -#if defined(__SSE__) || defined(__ARM_NEON) +#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64) #include "../simd/sse.h" #endif @@ -227,9 +227,9 @@ namespace embree template<> __forceinline Vec4::Vec4( const Vec3fx& a ) { x = a.x; y = a.y; z = a.z; w = a.w; } -#elif defined(__SSE__) || defined(__ARM_NEON) +#elif defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64) template<> __forceinline Vec4::Vec4( const Vec3fx& a ) { - const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v); + const vfloat4 v = vfloat4(a.v); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v); } #endif diff --git a/common/simd/arm/avx2neon.h b/common/simd/arm/avx2neon.h index dd321d3d64..fdb665022f 100644 --- a/common/simd/arm/avx2neon.h +++ b/common/simd/arm/avx2neon.h @@ -1,12 +1,17 @@ #pragma once -#if !defined(__aarch64__) +#if !defined(__aarch64__) && !defined(_M_ARM64) #error "avx2neon is only supported for AARCH64" #endif #include "sse2neon.h" +#if defined(_MSC_VER) +#define AVX2NEON_ABI static inline __forceinline +#include +#else #define AVX2NEON_ABI static inline __attribute__((always_inline)) +#endif struct __m256 { @@ -63,7 +68,11 @@ __m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8) AVX2NEON_ABI int _mm_movemask_popcnt(__m128 a) { +#if defined(_MSC_VER) + return _CountOneBits(_mm_movemask_ps(a)); +#else return __builtin_popcount(_mm_movemask_ps(a)); +#endif } AVX2NEON_ABI @@ -72,7 +81,11 @@ __m128 _mm_maskload_ps (float const * mem_addr, __m128i mask) float32x4_t res; uint32x4_t mask_u32 = vreinterpretq_u32_m128i(mask); for (int i=0;i<4;i++) { +#if !defined(_M_ARM64) if (mask_u32[i] & 0x80000000) res[i] = mem_addr[i]; else res[i] = 0; +#else + if (mask_u32.n128_u32[i] & 0x80000000) res.n128_f32[i] = mem_addr[i]; else res.n128_f32[i] = 0; +#endif } return vreinterpretq_m128_f32(res); } @@ -83,7 +96,11 @@ void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a) float32x4_t a_f32 = vreinterpretq_f32_m128(a); uint32x4_t mask_u32 = vreinterpretq_u32_m128i(mask); for (int i=0;i<4;i++) { +#if !defined(_M_ARM64) if (mask_u32[i] & 0x80000000) mem_addr[i] = a_f32[i]; +#else + if (mask_u32.n128_u32[i] & 0x80000000) mem_addr[i] = a_f32.n128_f32[i]; +#endif } } @@ -93,7 +110,11 @@ void _mm_maskstore_epi32 (int * mem_addr, __m128i mask, __m128i a) uint32x4_t mask_u32 = vreinterpretq_u32_m128i(mask); int32x4_t a_s32 = vreinterpretq_s32_m128i(a); for (int i=0;i<4;i++) { +#if !defined(_M_ARM64) if (mask_u32[i] & 0x80000000) mem_addr[i] = a_s32[i]; +#else + if (mask_u32.n128_u32[i] & 0x80000000) mem_addr[i] = a_s32.n128_i32[i]; +#endif } } @@ -125,7 +146,11 @@ inline float32x4_t dpps_neon<0x7f>(const float32x4_t& a,const float32x4_t& b) { float v; float32x4_t m = _mm_mul_ps(a,b); +#if !defined(_M_ARM64) m[3] = 0; +#else + m.n128_f32[3] = 0; +#endif v = vaddvq_f32(m); return _mm_set1_ps(v); } @@ -149,7 +174,11 @@ __m128 _mm_permutevar_ps (__m128 a, 
__m128i b) float32x4_t x; for (int i=0;i<4;i++) { +#if !defined(_M_ARM64) x[i] = a[b_u32[i]]; +#else + x.n128_f32[i] = a.n128_f32[b_u32.n128_u32[i]]; +#endif } return vreinterpretq_m128_f32(x); } @@ -618,8 +647,18 @@ AVX2NEON_ABI __m256i _mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0) { __m256i res; +#if !defined(_M_ARM64) int64x2_t t0 = {e0,e1}; int64x2_t t1 = {e2,e3}; +#else + int64x2_t t0; + t0.n128_i64[0] = e0; + t0.n128_i64[1] = e1; + + int64x2_t t1; + t1.n128_i64[0] = e2; + t1.n128_i64[1] = e3; +#endif res.lo = __m128i(t0); res.hi = __m128i(t1); return res; @@ -628,8 +667,18 @@ AVX2NEON_ABI __m256i _mm256_setr_epi64x (int64_t e0, int64_t e1, int64_t e2, int64_t e3) { __m256i res; +#if !defined(_M_ARM64) int64x2_t t0 = {e0,e1}; int64x2_t t1 = {e2,e3}; +#else + int64x2_t t0; + t0.n128_i64[0] = e0; + t0.n128_i64[1] = e1; + + int64x2_t t1; + t1.n128_i64[0] = e2; + t1.n128_i64[1] = e3; +#endif res.lo = __m128i(t0); res.hi = __m128i(t1); return res; @@ -640,8 +689,19 @@ __m256i _mm256_setr_epi64x (int64_t e0, int64_t e1, int64_t e2, int64_t e3) AVX2NEON_ABI __m256i _mm256_set_epi8 (char e31, char e30, char e29, char e28, char e27, char e26, char e25, char e24, char e23, char e22, char e21, char e20, char e19, char e18, char e17, char e16, char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0) { +#if(_M_ARM64) + char lo_arr[16] = {e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15}; + char hi_arr[16] = {e16,e17,e18,e19,e20,e21,e22,e23,e24,e25,e26,e27,e28,e29,e30,e31}; + int8x16_t lo; + int8x16_t hi; + for(char i = 0; i < 16; i++) { + lo.n128_i8[i] = lo_arr[i]; + hi.n128_i8[i] = hi_arr[i]; + } +#else int8x16_t lo = {e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15}; int8x16_t hi = {e16,e17,e18,e19,e20,e21,e22,e23,e24,e25,e26,e27,e28,e29,e30,e31}; +#endif __m256i res; res.lo = lo; res.hi = hi; return res; @@ -650,8 +710,19 @@ __m256i _mm256_set_epi8 (char e31, char e30, char e29, char e28, char e27, char AVX2NEON_ABI __m256i _mm256_setr_epi8 (char e0, char e1, char e2, char e3, char e4, char e5, char e6, char e7, char e8, char e9, char e10, char e11, char e12, char e13, char e14, char e15, char e16, char e17, char e18, char e19, char e20, char e21, char e22, char e23, char e24, char e25, char e26, char e27, char e28, char e29, char e30, char e31) { +#if(_M_ARM64) + char lo_arr[16] = {e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15}; + char hi_arr[16] = {e16,e17,e18,e19,e20,e21,e22,e23,e24,e25,e26,e27,e28,e29,e30,e31}; + int8x16_t lo; + int8x16_t hi; + for(char i = 0; i < 16; i++) { + lo.n128_i8[i] = lo_arr[i]; + hi.n128_i8[i] = hi_arr[i]; + } +#else int8x16_t lo = {e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15}; int8x16_t hi = {e16,e17,e18,e19,e20,e21,e22,e23,e24,e25,e26,e27,e28,e29,e30,e31}; +#endif __m256i res; res.lo = lo; res.hi = hi; return res; @@ -661,8 +732,19 @@ __m256i _mm256_setr_epi8 (char e0, char e1, char e2, char e3, char e4, char e5, AVX2NEON_ABI __m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) { +#if(_M_ARM64) + short lo_arr[8] = {e0,e1,e2,e3,e4,e5,e6,e7}; + short hi_arr[8] = {e8,e9,e10,e11,e12,e13,e14,e15}; + int16x8_t lo; + int16x8_t hi; + for(char i = 0; i < 8; i++) { + lo.n128_i16[i] = lo_arr[i]; + hi.n128_i16[i] = hi_arr[i]; + } +#else int16x8_t lo = {e0,e1,e2,e3,e4,e5,e6,e7}; int16x8_t hi = 
{e8,e9,e10,e11,e12,e13,e14,e15}; +#endif __m256i res; res.lo = lo; res.hi = hi; return res; @@ -671,8 +753,19 @@ __m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, AVX2NEON_ABI __m256i _mm256_setr_epi16 (short e0, short e1, short e2, short e3, short e4, short e5, short e6, short e7, short e8, short e9, short e10, short e11, short e12, short e13, short e14, short e15) { +#if(_M_ARM64) + short lo_arr[8] = {e0,e1,e2,e3,e4,e5,e6,e7}; + short hi_arr[8] = {e8,e9,e10,e11,e12,e13,e14,e15}; + int16x8_t lo; + int16x8_t hi; + for(char i = 0; i < 8; i++) { + lo.n128_i16[i] = lo_arr[i]; + hi.n128_i16[i] = hi_arr[i]; + } +#else int16x8_t lo = {e0,e1,e2,e3,e4,e5,e6,e7}; int16x8_t hi = {e8,e9,e10,e11,e12,e13,e14,e15}; +#endif __m256i res; res.lo = lo; res.hi = hi; return res; @@ -767,15 +860,19 @@ double _mm256_permute4x64_pd_select(__m256d a, const int imm8) { switch (imm8 & 3) { case 0: - return ((float64x2_t)a.lo)[0]; + return vgetq_lane_f64(a.lo, 0); case 1: - return ((float64x2_t)a.lo)[1]; + return vgetq_lane_f64(a.lo, 1); case 2: - return ((float64x2_t)a.hi)[0]; + return vgetq_lane_f64(a.hi, 0); case 3: - return ((float64x2_t)a.hi)[1]; + return vgetq_lane_f64(a.hi, 1); } +#if !defined(_M_ARM64) __builtin_unreachable(); +#else + __assume(0); +#endif return 0; } @@ -783,10 +880,10 @@ AVX2NEON_ABI __m256d _mm256_permute4x64_pd (__m256d a, const int imm8) { float64x2_t lo,hi; - lo[0] = _mm256_permute4x64_pd_select(a,imm8 >> 0); - lo[1] = _mm256_permute4x64_pd_select(a,imm8 >> 2); - hi[0] = _mm256_permute4x64_pd_select(a,imm8 >> 4); - hi[1] = _mm256_permute4x64_pd_select(a,imm8 >> 6); + lo = vsetq_lane_f64(_mm256_permute4x64_pd_select(a,imm8 >> 0), lo, 0); + lo = vsetq_lane_f64(_mm256_permute4x64_pd_select(a,imm8 >> 2), lo, 1); + hi = vsetq_lane_f64(_mm256_permute4x64_pd_select(a,imm8 >> 4), hi, 0); + hi = vsetq_lane_f64(_mm256_permute4x64_pd_select(a,imm8 >> 6), hi, 1); __m256d res; res.lo = lo; res.hi = hi; @@ -846,6 +943,13 @@ __m256i _mm256_permute4x64_epi64 (const __m256i a, const int imm8) (uint8_t)(((imm8 >> 6) & 0x3) * sz), }; + // We have to do this shifting because MSVC is strictly adhering to the CPP + // standard, in particular C++03 8.5.1 sub-section 15, which states that + // unions must be initialized by their first member type. + // + // NOTE: We assume little endian here, as per the Windows ARM64 ABI. 
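// [Editor's note, expanding on the comment above; not part of the patch.]
// Under MSVC, uint8x16_t is the __n128 union whose first member is an
// unsigned __int64[2], so a braced initializer must supply two 64-bit
// values rather than sixteen bytes. Packing the bytes with little-endian
// shifts reproduces the lane order of the 16-byte form, e.g.
//   uint8x16_t v = { 0x0706050403020100ull, 0x0f0e0d0c0b0a0908ull };
// gives lanes 0..15 the values 0x00..0x0f, matching what
//   uint8x16_t v = { 0x00, 0x01, /* ... */ 0x0f };
// produces under GCC/Clang. The code below applies the same trick to the
// u64[] table offsets.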
+ +#if !defined(_M_ARM64) uint8x16_t idx_lo = { // lo[0] bytes (uint8_t)(u64[0]+0), (uint8_t)(u64[0]+1), (uint8_t)(u64[0]+2), (uint8_t)(u64[0]+3), @@ -864,6 +968,26 @@ __m256i _mm256_permute4x64_epi64 (const __m256i a, const int imm8) (uint8_t)(u64[3]+0), (uint8_t)(u64[3]+1), (uint8_t)(u64[3]+2), (uint8_t)(u64[3]+3), (uint8_t)(u64[3]+4), (uint8_t)(u64[3]+5), (uint8_t)(u64[3]+6), (uint8_t)(u64[3]+7), }; +#else + uint8x16_t idx_lo = { + // lo[0] bytes + ((uint64_t)(u64[0]+0) << 0) | ((uint64_t)(u64[0]+1) << 8) | ((uint64_t)(u64[0]+2) << 16) | ((uint64_t)(u64[0]+3) << 24) | + ((uint64_t)(u64[0]+4) << 32) | ((uint64_t)(u64[0]+5) << 40) | ((uint64_t)(u64[0]+6) << 48) | ((uint64_t)(u64[0]+7) << 56), + + // lo[1] bytes + ((uint64_t)(u64[1]+0) << 0) | ((uint64_t)(u64[1]+1) << 8) | ((uint64_t)(u64[1]+2) << 16) | ((uint64_t)(u64[1]+3) << 24) | + ((uint64_t)(u64[1]+4) << 32) | ((uint64_t)(u64[1]+5) << 40) | ((uint64_t)(u64[1]+6) << 48) | ((uint64_t)(u64[1]+7) << 56) + }; + uint8x16_t idx_hi = { + // hi[0] bytes + ((uint64_t)(u64[2]+0) << 0) | ((uint64_t)(u64[2]+1) << 8) | ((uint64_t)(u64[2]+2) << 16) | ((uint64_t)(u64[2]+3) << 24) | + ((uint64_t)(u64[2]+4) << 32) | ((uint64_t)(u64[2]+5) << 40) | ((uint64_t)(u64[2]+6) << 48) | ((uint64_t)(u64[2]+7) << 56), + + // hi[1] bytes + ((uint64_t)(u64[3]+0) << 0) | ((uint64_t)(u64[3]+1) << 8) | ((uint64_t)(u64[3]+2) << 16) | ((uint64_t)(u64[3]+3) << 24) | + ((uint64_t)(u64[3]+4) << 32) | ((uint64_t)(u64[3]+5) << 40) | ((uint64_t)(u64[3]+6) << 48) | ((uint64_t)(u64[3]+7) << 56) + }; +#endif uint8x16_t lo = vqtbl2q_u8(tbl, idx_lo); uint8x16_t hi = vqtbl2q_u8(tbl, idx_hi); @@ -1114,8 +1238,13 @@ void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a) float32x4_t a_hi = a.hi; for (int i=0;i<4;i++) { +#if !defined(_M_ARM64) if (mask_lo[i] & 0x80000000) mem_addr[i] = a_lo[i]; if (mask_hi[i] & 0x80000000) mem_addr[i+4] = a_hi[i]; +#else + if (mask_lo.n128_u32[i] & 0x80000000) mem_addr[i] = a_lo.n128_f32[i]; + if (mask_hi.n128_u32[i] & 0x80000000) mem_addr[i+4] = a_hi.n128_f32[i]; +#endif } } @@ -1146,6 +1275,15 @@ __m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8) } +#if defined(_M_ARM64) +// This is required as sse2neon's implementation uses the type "uint16_t", +// which is a narrowing conversion that MSVC doesn't like, and produces noisy warnings. +// However, we know this is constrained to 0-255 as per the docs, so can disable it. +// +// TL;DR: Get rid of noisy MSVC warning that we can safely ignore.
+#pragma warning(push) +#pragma warning(disable: 4838) +#endif AVX2NEON_ABI __m256i _mm256_blend_epi16 (__m256i a, __m256i b, const int imm8) { @@ -1154,7 +1292,9 @@ __m256i _mm256_blend_epi16 (__m256i a, __m256i b, const int imm8) res.hi = _mm_blend_epi16(a.hi,b.hi,imm8); return res; } - +#if defined(_M_ARM64) +#pragma warning(pop) +#endif AVX2NEON_ABI @@ -1165,8 +1305,13 @@ __m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int int32x4_t lo,hi; for (int i=0;i<4;i++) { +#if !defined(_M_ARM64) lo[i] = *(int32_t *)((char *) base_addr + (vindex_lo[i]*scale)); hi[i] = *(int32_t *)((char *) base_addr + (vindex_hi[i]*scale)); +#else + lo.n128_i32[i] = *(int32_t *)((char *) base_addr + (vindex_lo.n128_i32[i]*scale)); + hi.n128_i32[i] = *(int32_t *)((char *) base_addr + (vindex_hi.n128_i32[i]*scale)); +#endif } __m256i res; @@ -1186,8 +1331,13 @@ __m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i lo = hi = _mm_setzero_si128(); for (int i=0;i<4;i++) { +#if !defined(_M_ARM64) if (mask_lo[i] >> 31) lo[i] = *(int32_t *)((char *) base_addr + (vindex_lo[i]*scale)); if (mask_hi[i] >> 31) hi[i] = *(int32_t *)((char *) base_addr + (vindex_hi[i]*scale)); +#else + if (mask_lo.n128_u32[i] >> 31) lo.n128_i32[i] = *(int32_t *)((char *) base_addr + (vindex_lo.n128_i32[i]*scale)); + if (mask_hi.n128_u32[i] >> 31) hi.n128_i32[i] = *(int32_t *)((char *) base_addr + (vindex_hi.n128_i32[i]*scale)); +#endif } __m256i res; diff --git a/common/simd/arm/emulation.h b/common/simd/arm/emulation.h index 8eea1ffe71..f9632d742f 100644 --- a/common/simd/arm/emulation.h +++ b/common/simd/arm/emulation.h @@ -4,7 +4,7 @@ #pragma once /* Make precision match SSE, at the cost of some performance */ -#if !defined(__aarch64__) +#if !defined(__aarch64__) && !defined(_M_ARM64) # define SSE2NEON_PRECISE_DIV 1 # define SSE2NEON_PRECISE_SQRT 1 #endif @@ -26,14 +26,6 @@ __forceinline __m128 _mm_broadcast_ss (float const * mem_addr) // AVX2 emulation leverages Intel FMA defs above. Include after them. #include "avx2neon.h" -/* Dummy defines for floating point control */ -#define _MM_MASK_MASK 0x1f80 -#define _MM_MASK_DIV_ZERO 0x200 -// #define _MM_FLUSH_ZERO_ON 0x8000 -#define _MM_MASK_DENORM 0x100 -#define _MM_SET_EXCEPTION_MASK(x) -// #define _MM_SET_FLUSH_ZERO_MODE(x) - /* __forceinline int _mm_getcsr() { diff --git a/common/simd/arm/sse2neon.h b/common/simd/arm/sse2neon.h index 35e50a6e3e..2c87942c71 100644 --- a/common/simd/arm/sse2neon.h +++ b/common/simd/arm/sse2neon.h @@ -1,6 +1,30 @@ #ifndef SSE2NEON_H #define SSE2NEON_H +/* + * sse2neon is freely redistributable under the MIT License. + * + * Copyright (c) 2015-2026 SSE2NEON Contributors. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + // This header file provides a simple API translation layer // between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions // @@ -26,62 +50,320 @@ // Jonathan Hue // Cuda Chen // Aymen Qader +// Anthony Roberts +// Sean Luchen +// Marcin Serwin +// Ben Niu +// Even Rouault +// Marcus Buretorp -/* - * sse2neon is freely redistributable under the MIT License. +/* Tunable configurations */ + +/* PRECISION FLAGS * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: + * These flags control the precision/performance trade-off for operations where + * NEON behavior diverges from x86 SSE. Default is 0 (performance over + * precision). Set to 1 before including this header for x86-compatible + * behavior. * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. + * Example: + * #define SSE2NEON_PRECISE_MINMAX 1 // Enable before include + * #include "sse2neon.h" * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Recommended configurations: + * - Performance: No flags (default) + * - Balanced: SSE2NEON_PRECISE_MINMAX=1, SSE2NEON_PRECISE_SQRT=1 + * (ARMv7: also consider SSE2NEON_PRECISE_DIV=1 for division) + * - Exact: All flags set to 1 */ -/* Tunable configurations */ - -/* Enable precise implementation of math operations - * This would slow down the computation a bit, but gives consistent result with - * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result) +/* SSE2NEON_PRECISE_MINMAX + * Affects: _mm_min_ps, _mm_max_ps, _mm_min_ss, _mm_max_ss, + * _mm_min_pd, _mm_max_pd, _mm_min_sd, _mm_max_sd + * + * Issue: NEON fmin/fmax propagate NaN differently than SSE. When one operand + * is NaN, SSE returns the second operand while NEON may return NaN. + * + * Default (0): Fast NEON min/max, potential NaN divergence + * Enabled (1): Additional comparison to match x86 NaN handling + * + * Symptoms when disabled: NaN "holes" in rendered images, unexpected NaN + * propagation in signal processing */ -/* _mm_min|max_ps|ss|pd|sd */ #ifndef SSE2NEON_PRECISE_MINMAX #define SSE2NEON_PRECISE_MINMAX (0) #endif -/* _mm_rcp_ps and _mm_div_ps */ + +/* SSE2NEON_PRECISE_DIV + * Affects: _mm_rcp_ps, _mm_rcp_ss (all architectures) + * _mm_div_ps, _mm_div_ss (ARMv7 only, ARMv8 uses native vdivq_f32) + * + * Issue: NEON reciprocal estimate (vrecpe) has ~11-bit precision. 
SSE's rcpps + * provides ~12-bit precision. For division on ARMv7, we use reciprocal + * approximation since there's no native divide instruction. + * + * Default (0): Single Newton-Raphson refinement (~12-bit precision) + * Enabled (1): Two N-R refinements (~24-bit precision) + * + * Note on reciprocals: Enabling this flag makes _mm_rcp_ps MORE accurate than + * SSE's specified ~12-bit precision. This improves ARMv7 division accuracy but + * may differ from code expecting SSE's coarser reciprocal approximation. + * + * WARNING: This flag improves numerical precision only. It does NOT fix + * IEEE-754 corner-case divergence (NaN propagation, signed zero, infinity + * handling). ARMv7 division behavior will still differ from x86 SSE for these + * edge cases. + * + * Symptoms when disabled: Slight precision differences in division-heavy code + */ #ifndef SSE2NEON_PRECISE_DIV #define SSE2NEON_PRECISE_DIV (0) #endif -/* _mm_sqrt_ps and _mm_rsqrt_ps */ + +/* SSE2NEON_PRECISE_SQRT + * Affects: _mm_sqrt_ps, _mm_sqrt_ss, _mm_rsqrt_ps, _mm_rsqrt_ss + * + * Issue: NEON reciprocal square root estimate (vrsqrte) has lower precision + * than x86 SSE's rsqrtps/sqrtps. + * + * Default (0): Single Newton-Raphson refinement + * Enabled (1): Two N-R refinements for improved precision + * + * Symptoms when disabled: Precision loss in physics simulations, graphics + * normalization, or iterative algorithms + */ #ifndef SSE2NEON_PRECISE_SQRT #define SSE2NEON_PRECISE_SQRT (0) #endif -/* _mm_dp_pd */ + +/* SSE2NEON_PRECISE_DP + * Affects: _mm_dp_ps, _mm_dp_pd + * + * Issue: The dot product mask parameter controls which elements participate. + * When an element is masked out, x86 multiplies by 0.0 while NEON + * skips the multiply entirely. + * + * Default (0): Skip masked elements (faster, but 0.0 * NaN = NaN divergence) + * Enabled (1): Multiply masked elements by 0.0 (matches x86 NaN propagation) + * + * Symptoms when disabled: Different results when dot product inputs contain + * NaN in masked-out lanes + */ #ifndef SSE2NEON_PRECISE_DP #define SSE2NEON_PRECISE_DP (0) #endif +/* SSE2NEON_UNDEFINED_ZERO + * Affects: _mm_undefined_ps, _mm_undefined_si128, _mm_undefined_pd + * + * Issue: These intrinsics return vectors with "undefined" contents per Intel + * spec. On x86, this means truly uninitialized memory (garbage values). + * + * MSVC Semantic Drift: MSVC on ARM forces zero-initialization for these + * intrinsics, which differs from x86 behavior where garbage is returned. + * GCC/Clang on ARM match x86 by returning uninitialized memory. + * + * This macro provides explicit control over the behavior: + * Default (0): Compiler-dependent (MSVC=zero, GCC/Clang=undefined) + * Enabled (1): Force zero-initialization on all compilers (safer, portable) + * + * When to enable: + * - Deterministic behavior across compilers is required + * - Debugging memory-related issues where undefined values cause problems + * - Security-sensitive code where uninitialized memory is a concern + * + * Note: Using undefined values without first writing to them is undefined + * behavior. Well-formed code should not depend on either behavior. + */ +#ifndef SSE2NEON_UNDEFINED_ZERO +#define SSE2NEON_UNDEFINED_ZERO (0) +#endif + +/* SSE2NEON_MWAIT_POLICY + * Affects: _mm_mwait + * + * Issue: x86 MONITOR/MWAIT allows a thread to sleep until a write occurs to a + * monitored address range. ARM has no userspace equivalent for address- + * range monitoring. 
_mm_monitor is a no-op; _mm_mwait can only provide + * low-power wait hints without true "wake on store" semantics. + * + * Note: The x86 extensions/hints parameters (C-state hints) are ignored on ARM + * as there is no architectural equivalent. No memory ordering is provided + * beyond what the hint instruction itself offers. + * + * WARNING: Policies 1 and 2 (WFE/WFI) may cause issues: + * - WFE: May sleep until event/interrupt; can wake spuriously. Always check + * your condition in a loop. May trap in EL0 (SCTLR_EL1.nTWE). + * - WFI: May trap (SIGILL) in EL0 on Linux, iOS, macOS (SCTLR_EL1.nTWI). + * - Neither provides "wake on address write" semantics. + * + * Policy values: + * 0 (default): yield - Safe everywhere, never blocks, just a hint + * 1: wfe - Event wait, may sleep until event/interrupt + * 2: wfi - Interrupt wait, may trap in EL0 on many platforms + * + * Recommended usage: + * - Policy 0: General-purpose code, spin-wait loops (safe default) + * - Policy 1: Only if you control both reader/writer and use SEV/SEVL + * - Policy 2: Only for bare-metal or kernel code with known OS support + * + * Migration note: Code relying on x86 MONITOR/MWAIT for lock-free waiting + * should migrate to proper atomics + OS wait primitives (futex, condition + * variables) for correct cross-platform behavior. + */ +#ifndef SSE2NEON_MWAIT_POLICY +#define SSE2NEON_MWAIT_POLICY (0) +#endif + +/* Enable inclusion of windows.h on MSVC platforms + * This makes _mm_clflush functional on windows, as there is no builtin. + */ +#ifndef SSE2NEON_INCLUDE_WINDOWS_H +#define SSE2NEON_INCLUDE_WINDOWS_H (0) +#endif + +/* Consolidated Platform Detection + * + * These macros simplify platform-specific code throughout the header by + * providing single-point definitions for architecture and compiler detection. + * This reduces the 147+ verbose architecture checks to simple macro usage. + * + * Architecture: + * SSE2NEON_ARCH_AARCH64 - 64-bit ARM (AArch64, including Apple Silicon) + * Encompasses: __aarch64__, __arm64__, _M_ARM64, _M_ARM64EC + * + * Compiler: + * SSE2NEON_COMPILER_GCC_COMPAT - GCC or Clang (supports GNU extensions) + * SSE2NEON_COMPILER_MSVC - Microsoft Visual C++ + * SSE2NEON_COMPILER_CLANG - Clang specifically (subset of GCC_COMPAT) + */ + +/* Compiler detection + * + * Check Clang first: it defines __GNUC__ for compatibility. + * Clang-CL also defines _MSC_VER for MSVC ABI compatibility. + * + * Compiler matrix: + * Compiler | GCC_COMPAT | CLANG | MSVC + * -----------+------------+-------+------ + * GCC | 1 | 0 | 0 + * Clang | 1 | 1 | 0 + * Clang-CL | 1 | 1 | 1 + * MSVC | 0 | 0 | 1 + */ +#if defined(__clang__) +/* Clang compiler detected (including Apple Clang) */ +#define SSE2NEON_COMPILER_CLANG 1 +#define SSE2NEON_COMPILER_GCC_COMPAT 1 /* Clang supports GCC extensions */ +#if defined(_MSC_VER) +#define SSE2NEON_COMPILER_MSVC 1 /* Clang-CL: Clang with MSVC on Windows */ +#else +#define SSE2NEON_COMPILER_MSVC 0 +#endif +/* Clang < 11 has known NEON codegen bugs (issue #622) */ +#if __clang_major__ < 11 +#error "Clang versions earlier than 11 are not supported." +#endif + +#elif defined(__GNUC__) +/* GCC compiler (only reached if not Clang, since Clang also defines __GNUC__) + */ +#define SSE2NEON_COMPILER_CLANG 0 +#define SSE2NEON_COMPILER_GCC_COMPAT 1 +#define SSE2NEON_COMPILER_MSVC 0 +/* GCC < 10 has incomplete ARM intrinsics support */ +#if __GNUC__ < 10 +#error "GCC versions earlier than 10 are not supported." 
+#endif + +#elif defined(_MSC_VER) +/* Microsoft Visual C++ (native, not Clang-CL) */ +#define SSE2NEON_COMPILER_CLANG 0 +#define SSE2NEON_COMPILER_GCC_COMPAT 0 /* No GCC extensions available */ +#define SSE2NEON_COMPILER_MSVC 1 + +#else +#error "Unsupported compiler. SSE2NEON requires GCC 10+, Clang 11+, or MSVC." +#endif + +/* Architecture detection */ +#if defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || \ + defined(_M_ARM64EC) +#define SSE2NEON_ARCH_AARCH64 1 +#else +#define SSE2NEON_ARCH_AARCH64 0 +#endif + +/* ARM64EC Support - EXPERIMENTAL with known limitations + * + * ARM64EC is Microsoft's hybrid ABI bridging x64 and ARM64 within a single + * Windows process, enabling incremental migration of x64 applications to ARM64. + * Compiler support remains incomplete (limited LLVM/GCC coverage). + * + * Compiler behavior: + * - MSVC defines both _M_AMD64 and _M_ARM64EC (but NOT _M_ARM64) + * - Requires arm64_neon.h instead of arm_neon.h + * + * Known limitations: + * 1. Windows headers: SSE2NEON_INCLUDE_WINDOWS_H must be 0 (default). + * Include sse2neon.h BEFORE any Windows headers to avoid type conflicts. + * 2. Include order: sse2neon.h must be included BEFORE <intrin.h> or any C++ + * standard headers that pull it in. + * 3. ABI boundary: __m128/SSE types must NOT cross x64/ARM64EC module + * boundaries (exports/imports) as layouts differ between ABIs. + * Users needing cross-ABI SIMD interop should use MSVC's softintrin. + * 4. CRC32 hardware intrinsics are disabled; software fallback is used. + * + * SSE2NEON_ARM64EC is 1 when compiling for ARM64EC with MSVC, 0 otherwise. + * Note: clang-cl ARM64EC builds are not currently detected by this macro. + * + * Recommendation: Use native ARM64 compilation when possible. + */ +#if SSE2NEON_COMPILER_MSVC && defined(_M_ARM64EC) +#define SSE2NEON_ARM64EC 1 +#else +#define SSE2NEON_ARM64EC 0 +#endif + +/* Early ARM64EC + SSE2NEON_INCLUDE_WINDOWS_H check. + * This must come BEFORE any standard includes because <windows.h> and other + * headers can trigger winnt.h, which fails with "Must define a target + * architecture" on ARM64EC before we could emit our own error. + */ +#if SSE2NEON_ARM64EC && SSE2NEON_INCLUDE_WINDOWS_H +#error \ + "SSE2NEON_INCLUDE_WINDOWS_H=1 is not supported on ARM64EC. " \ + "Include <windows.h> separately AFTER sse2neon.h instead." +#endif + +/* Endianness check + * + * SSE2NEON assumes little-endian byte ordering for lane-to-memory mappings. + * Big-endian ARM targets would produce silently incorrect results because + * SSE intrinsics define lane ordering relative to little-endian memory layout. + * + * GCC/Clang define __BYTE_ORDER__. For compilers that don't (e.g., MSVC), + * we check for explicit big-endian ARM macros. MSVC only targets little-endian + * ARM, so no additional check is needed there.
+ */ +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__) +#error "sse2neon requires little-endian target; big-endian is not supported" +#elif defined(__ARMEB__) || defined(__AARCH64EB__) || defined(__BIG_ENDIAN__) +#error "sse2neon requires little-endian target; big-endian is not supported" +#endif + /* compiler specific definitions */ -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_COMPILER_GCC_COMPAT #pragma push_macro("FORCE_INLINE") #pragma push_macro("ALIGN_STRUCT") #define FORCE_INLINE static inline __attribute__((always_inline)) #define ALIGN_STRUCT(x) __attribute__((aligned(x))) #define _sse2neon_likely(x) __builtin_expect(!!(x), 1) #define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0) -#else /* non-GNU / non-clang compilers */ -#warning "Macro name collisions may happen with unsupported compiler." +#elif SSE2NEON_COMPILER_MSVC +#if _MSVC_TRADITIONAL +#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead. +#endif #ifndef FORCE_INLINE #define FORCE_INLINE static inline #endif @@ -99,31 +381,178 @@ #define _sse2neon_const const #endif +#if defined(__cplusplus) +#define _sse2neon_reinterpret_cast(t, e) reinterpret_cast<t>(e) +#define _sse2neon_static_cast(t, e) static_cast<t>(e) +#define _sse2neon_const_cast(t, e) const_cast<t>(e) +#else +#define _sse2neon_reinterpret_cast(t, e) ((t) (e)) +#define _sse2neon_static_cast(t, e) ((t) (e)) +#define _sse2neon_const_cast(t, e) ((t) (e)) +#endif + +/* ARM64EC winnt.h workaround: define architecture macros before any headers + * that might include winnt.h. Windows SDK 10.0.26100.0+ requires _ARM64EC_ or + * _ARM64_ but MSVC 17.x only defines _M_ARM64EC. + */ +#if SSE2NEON_ARM64EC +/* Warn if winnt.h was already included - the workaround won't help */ +#ifdef _WINNT_ +#pragma message( \ + "warning: sse2neon.h included after winnt.h; ARM64EC workaround may fail") +#endif +/* Define _ARM64EC_ for winnt.h architecture check (kept for user detection) */ +#if !defined(_ARM64EC_) +#define _ARM64EC_ 1 +#define _SSE2NEON_DEFINED_ARM64EC_ +#endif +/* Define _M_ARM64 temporarily for headers that derive _ARM64_ from it */ +#if !defined(_M_ARM64) +#define _M_ARM64 1 +#define _SSE2NEON_DEFINED_M_ARM64 +#endif +#endif /* SSE2NEON_ARM64EC */ + +#include <float.h> #include <stdint.h> #include <stdlib.h> +#include <string.h> -#if defined(_WIN32) -/* Definitions for _mm_{malloc,free} are provided by <malloc.h> - * from both MinGW-w64 and MSVC. +FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t val) +{ + double tmp; + memcpy(&tmp, &val, sizeof(uint64_t)); + return tmp; +} + +FORCE_INLINE int64_t sse2neon_recast_f64_s64(double val) +{ + int64_t tmp; + memcpy(&tmp, &val, sizeof(uint64_t)); + return tmp; +} + +/* MSVC provides _mm_{malloc,free} in <malloc.h>; MinGW needs our definitions + * but still uses _aligned_malloc/_aligned_free from <malloc.h>. */ +#if SSE2NEON_COMPILER_MSVC #define SSE2NEON_ALLOC_DEFINED #endif /* If using MSVC */ -#ifdef _MSC_VER +#if SSE2NEON_COMPILER_MSVC + +/* ARM64EC SSE header blocking: pre-define include guards to prevent MSVC SSE + * headers (mmintrin.h, xmmintrin.h, etc.) and Windows SDK softintrin.h from + * loading, as their __m128 union types conflict with sse2neon's NEON types. + */ +#if SSE2NEON_ARM64EC || defined(_M_ARM64EC) +/* Detect if <intrin.h> was already included - SSE types may have leaked. + * Check both _INTRIN_H_ and _INTRIN_H to cover different MSVC versions. */ +#if defined(_INTRIN_H_) || defined(_INTRIN_H) +#error \ + "sse2neon.h must be included BEFORE <intrin.h> or C++ headers on ARM64EC. 
" \ + "SSE type definitions from conflict with sse2neon's NEON types." +#endif +#define _INCLUDED_MM2 +#define _MMINTRIN_H_INCLUDED +#define _XMMINTRIN_H_INCLUDED +#define _EMMINTRIN_H_INCLUDED +#define _PMMINTRIN_H_INCLUDED +#define _TMMINTRIN_H_INCLUDED +#define _SMMINTRIN_H_INCLUDED +#define _NMMINTRIN_H_INCLUDED +#define _WMMINTRIN_H_INCLUDED +#define _IMMINTRIN_H_INCLUDED +#define _ZMMINTRIN_H_INCLUDED +#define _AMMINTRIN_H_INCLUDED +/* Block Windows SDK softintrin */ +#define _SOFTINTRIN_H_ +#define _DISABLE_SOFTINTRIN_ 1 +#endif /* SSE2NEON_ARM64EC */ #include -#if (defined(_M_AMD64) || defined(__x86_64__)) || \ - (defined(_M_ARM) || defined(__arm__)) + +/* Windows headers inclusion. + * ARM64EC case is blocked by early check near SSE2NEON_ARM64EC definition. + */ +#if SSE2NEON_INCLUDE_WINDOWS_H +#include +#include +#endif + +/* Clean up _M_ARM64 (could mislead into pure ARM64 paths). Keep _ARM64EC_. */ +#ifdef _SSE2NEON_DEFINED_ARM64EC_ +#undef _SSE2NEON_DEFINED_ARM64EC_ +#endif +#ifdef _SSE2NEON_DEFINED_M_ARM64 +#undef _M_ARM64 +#undef _SSE2NEON_DEFINED_M_ARM64 +#endif + +#if !defined(__cplusplus) +#error "SSE2NEON only supports C++ compilation with this compiler" +#endif + +#ifdef SSE2NEON_ALLOC_DEFINED +#include +#endif + +/* 64-bit bit scanning available on x64 and AArch64 (including ARM64EC) */ +#if (defined(_M_AMD64) || defined(__x86_64__)) || SSE2NEON_ARCH_AARCH64 #define SSE2NEON_HAS_BITSCAN64 #endif + +#endif /* SSE2NEON_COMPILER_MSVC */ + +/* MinGW uses _aligned_malloc/_aligned_free from */ +#if defined(__MINGW32__) +#include +#endif + +/* Statement expression helpers for macro-based intrinsics. + * + * For GCC/Clang: Uses __extension__({}) statement expressions which have + * natural access to all surrounding variables. + * + * For MSVC: Uses immediately-invoked lambdas. The distinction between + * _sse2neon_define[02] ([=] capture) and _sse2neon_define1 ([] no capture) + * exists for lambda capture semantics, though in practice both work the + * same since 'imm' parameters are compile-time constants that get + * substituted before the lambda is created. + */ +#if SSE2NEON_COMPILER_GCC_COMPAT +#define _sse2neon_define0(type, s, body) \ + __extension__({ \ + type _a = (s); \ + body \ + }) +#define _sse2neon_define1(type, s, body) _sse2neon_define0(type, s, body) +#define _sse2neon_define2(type, a, b, body) \ + __extension__({ \ + type _a = (a), _b = (b); \ + body \ + }) +#define _sse2neon_return(ret) (ret) +#else +#define _sse2neon_define0(type, a, body) [=](type _a) { body }(a) +#define _sse2neon_define1(type, a, body) [](type _a) { body }(a) +#define _sse2neon_define2(type, a, b, body) \ + [=](type _a, type _b) { body }((a), (b)) +#define _sse2neon_return(ret) return ret #endif +#define _sse2neon_init(...) 
+
 /* Compiler barrier */
+#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG
+#define SSE2NEON_BARRIER() _ReadWriteBarrier()
+#else
 #define SSE2NEON_BARRIER()                     \
     do {                                       \
         __asm__ __volatile__("" ::: "memory"); \
         (void) 0;                              \
     } while (0)
+#endif
 
 /* Memory barriers
  * __atomic_thread_fence does not include a compiler barrier; instead,
@@ -140,55 +569,62 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
     !defined(__STDC_NO_ATOMICS__)
     atomic_thread_fence(memory_order_seq_cst);
-#elif defined(__GNUC__) || defined(__clang__)
+#elif SSE2NEON_COMPILER_GCC_COMPAT
     __atomic_thread_fence(__ATOMIC_SEQ_CST);
-#else
-    /* FIXME: MSVC support */
+#else /* MSVC */
+    __dmb(_ARM64_BARRIER_ISH);
 #endif
 }
 
-/* Architecture-specific build options */
-/* FIXME: #pragma GCC push_options is only available on GCC */
-#if defined(__GNUC__)
-#if defined(__arm__) && __ARM_ARCH == 7
-/* According to ARM C Language Extensions Architecture specification,
- * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
- * architecture supported.
+/* Architecture-specific build options.
+ * #pragma GCC push_options/target are GCC-specific; Clang ignores these.
+ * MSVC on ARM always has NEON/SIMD available.
  */
+#if SSE2NEON_COMPILER_GCC_COMPAT
+#if defined(__arm__)
+/* 32-bit ARM: ARMv7-A or ARMv8-A in AArch32 mode */
 #if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
 #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
 #endif
-#if !defined(__clang__)
+#if !SSE2NEON_COMPILER_CLANG
 #pragma GCC push_options
+#if __ARM_ARCH >= 8
+#pragma GCC target("fpu=neon-fp-armv8")
+#else
 #pragma GCC target("fpu=neon")
 #endif
-#elif defined(__aarch64__)
-#if !defined(__clang__)
-#pragma GCC push_options
-#pragma GCC target("+simd")
-#endif
-#elif __ARM_ARCH == 8
-#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
-#error \
-    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
 #endif
-#if !defined(__clang__)
+#elif SSE2NEON_ARCH_AARCH64
+#if !SSE2NEON_COMPILER_CLANG
 #pragma GCC push_options
+#pragma GCC target("+simd")
 #endif
 #else
-#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
+#error "Unsupported target. Must be ARMv7-A+NEON, ARMv8-A, or AArch64."
 #endif
 #endif
 
+/* ARM64EC: use arm64_neon.h (arm_neon.h guards with _M_ARM||_M_ARM64) */
+#if SSE2NEON_ARM64EC || defined(_M_ARM64EC)
+#include <arm64_neon.h>
+#else
 #include <arm_neon.h>
-#if !defined(__aarch64__) && (__ARM_ARCH == 8)
+#endif
+
+/* Include ACLE for CRC32 and other intrinsics on ARMv8+ */
+#if SSE2NEON_ARCH_AARCH64 || __ARM_ARCH >= 8
 #if defined __has_include && __has_include(<arm_acle.h>)
 #include <arm_acle.h>
+#define SSE2NEON_HAS_ACLE 1
+#else
+#define SSE2NEON_HAS_ACLE 0
 #endif
+#else
+#define SSE2NEON_HAS_ACLE 0
 #endif
 
 /* Apple Silicon cache lines are double of what is commonly used by Intel, AMD
- * and other Arm microarchtectures use.
+ * and other Arm microarchitectures use.
  * From sysctl -a on Apple M1:
 * hw.cachelinesize: 128
 */
@@ -198,41 +634,30 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #define SSE2NEON_CACHELINE_SIZE 64
 #endif
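/* Why the cache-line constant matters: state shared between threads is
 * commonly padded to a full line to avoid false sharing, and Apple Silicon
 * needs 128 bytes where most other cores need 64. A small sketch using the
 * constant defined above (illustrative only):
 *
 *     typedef struct {
 *         long counter;
 *         // pad so a neighboring counter written by another thread
 *         // never lands on the same cache line as this one
 *         char pad[SSE2NEON_CACHELINE_SIZE - sizeof(long)];
 *     } padded_counter;
 */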
 
-/* Rounding functions require either Aarch64 instructions or libm failback */
-#if !defined(__aarch64__)
+/* Rounding functions require either Aarch64 instructions or libm fallback */
+#if !SSE2NEON_ARCH_AARCH64
 #include <math.h>
 #endif
 
-/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
- * or even not accessible in user mode.
- * To write or access to these registers in user mode,
- * we have to perform syscall instead.
+/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only or
+ * even not accessible in user mode.
+ * To write or access to these registers in user mode, we have to perform
+ * syscall instead.
  */
-#if !defined(__aarch64__)
+#if !SSE2NEON_ARCH_AARCH64
 #include <sys/time.h>
 #endif
 
 /* "__has_builtin" can be used to query support for built-in functions
  * provided by gcc/clang and other compilers that support it.
+ * GCC 10+ and Clang 11+ have native __has_builtin support.
+ * MSVC does not provide these GCC/Clang builtins.
  */
-#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
-/* Compatibility with gcc <= 9 */
-#if defined(__GNUC__) && (__GNUC__ <= 9)
-#define __has_builtin(x) HAS##x
-#define HAS__builtin_popcount 1
-#define HAS__builtin_popcountll 1
-
-// __builtin_shuffle introduced in GCC 4.7.0
-#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
-#define HAS__builtin_shuffle 1
-#else
-#define HAS__builtin_shuffle 0
-#endif
-
-#define HAS__builtin_shufflevector 0
-#define HAS__builtin_nontemporal_store 0
-#else
+#ifndef __has_builtin
+#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG
 #define __has_builtin(x) 0
+#else
+#error "Unsupported compiler: __has_builtin not available"
+#endif
 #endif
 
@@ -250,10 +675,30 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
 #define _MF_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+\
2), (((fp2)*4)+3), (((fp1)*4)+16+0), (((fp1)*4)+16+1), (((fp1)*4)+16+2), (((fp1)*4)+16+3), (((fp0)*4)+16+0), (((fp0)*4)+16+1), (((fp0)*4)+16+2), (((fp0)*\
4)+16+3) } )
+#elif defined(_M_ARM64)
+#define _MN_SHUFFLE(fp3,fp2,fp1,fp0) ( uint8x16_t{ \
+ ((uint64_t)(((fp3)*4)+0) << 0) | ((uint64_t)(((fp3)*4)+1) << 8) | ((uint64_t)(((fp3)*4)+2) << 16)| ((uint64_t)(((fp3)*4)+3) << 24)| ((uint64_t)(((fp2)*4)+0) << 32)| ((uint64_t)(((fp2)*4)+1) << 40)| ((uint64_t)(((fp2)*4)+2) << 48)| ((uint64_t)(((fp2)*4)+3) << 56), \
+ ((uint64_t)(((fp1)*4)+0) << 0) | ((uint64_t)(((fp1)*4)+1) << 8) | ((uint64_t)(((fp1)*4)+2) << 16)| ((uint64_t)(((fp1)*4)+3) << 24)| ((uint64_t)(((fp0)*4)+0) << 32)| ((uint64_t)(((fp0)*4)+1) << 40)| ((uint64_t)(((fp0)*4)+2) << 48)| ((uint64_t)(((fp0)*4)+3) << 56) } )
+#define _MF_SHUFFLE(fp3,fp2,fp1,fp0) ( uint8x16_t{ \
+ ((uint64_t)(((fp3)*4)+0) << 0) | ((uint64_t)(((fp3)*4)+1) << 8) | ((uint64_t)(((fp3)*4)+2) << 16)| ((uint64_t)(((fp3)*4)+3) << 24)| ((uint64_t)(((fp2)*4)+0) << 32)| ((uint64_t)(((fp2)*4)+1) << 40)| ((uint64_t)(((fp2)*4)+2) << 48)| ((uint64_t)(((fp2)*4)+3) << 56), \
+ ((uint64_t)(((fp1)*4)+16+0) << 0) | ((uint64_t)(((fp1)*4)+16+1) << 8) | ((uint64_t)(((fp1)*4)+16+2) << 16)| ((uint64_t)(((fp1)*4)+16+3) << 24)| ((uint64_t)(((fp0)*4)+16+0) << 32)| ((uint64_t)(((fp0)*4)+16+1) << 40)| ((uint64_t)(((fp0)*4)+16+2) << 48)| ((uint64_t)(((fp0)*4)+16+3) << 56) } )
 #endif
 
+#ifndef _MM_SHUFFLE
 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
     (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+#endif
+
+/**
+ * MACRO for shuffle parameter for _mm_shuffle_pd().
+ * Argument fp1 is a digit[01] that represents the fp from argument "b"
+ * of mm_shuffle_pd that will be placed in fp1 of result.
+ * fp0 is a digit[01] that represents the fp from argument "a" of mm_shuffle_pd
+ * that will be placed in fp0 of result.
+ */
+#ifndef _MM_SHUFFLE2
+#define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0))
+#endif
 
 #if __has_builtin(__builtin_shufflevector)
 #define _sse2neon_shuffle(type, a, b, ...)
\ @@ -283,27 +728,215 @@ FORCE_INLINE void _sse2neon_smp_mb(void) #define _MM_FROUND_CUR_DIRECTION 0x04 #define _MM_FROUND_NO_EXC 0x08 #define _MM_FROUND_RAISE_EXC 0x00 +#ifndef _MM_FROUND_NINT #define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) +#endif +#ifndef _MM_FROUND_FLOOR #define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) +#endif +#ifndef _MM_FROUND_CEIL #define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) +#endif +#ifndef _MM_FROUND_TRUNC #define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) +#endif +#ifndef _MM_FROUND_RINT #define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) +#endif +#ifndef _MM_FROUND_NEARBYINT #define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) +#endif +#ifndef _MM_ROUND_NEAREST #define _MM_ROUND_NEAREST 0x0000 +#endif +#ifndef _MM_ROUND_DOWN #define _MM_ROUND_DOWN 0x2000 +#endif +#ifndef _MM_ROUND_UP #define _MM_ROUND_UP 0x4000 +#endif +#ifndef _MM_ROUND_TOWARD_ZERO #define _MM_ROUND_TOWARD_ZERO 0x6000 -/* Flush zero mode macros. */ +#endif +#ifndef _MM_ROUND_MASK +#define _MM_ROUND_MASK 0x6000 +#endif +/* Flush-to-zero (FTZ) mode macros. + * On x86, FTZ (MXCSR bit 15) flushes denormal outputs to zero. + * On ARM, FPCR/FPSCR bit 24 provides unified FZ+DAZ behavior. + * ARMv7 NEON: Per ARM ARM, Advanced SIMD has "Flush-to-zero mode always + * enabled" - denormals flush regardless of FPSCR.FZ (some impls may vary). + * ARMv8: FPCR.FZ correctly controls denormal handling for NEON ops. + */ +#ifndef _MM_FLUSH_ZERO_MASK #define _MM_FLUSH_ZERO_MASK 0x8000 +#endif +#ifndef _MM_FLUSH_ZERO_ON #define _MM_FLUSH_ZERO_ON 0x8000 +#endif +#ifndef _MM_FLUSH_ZERO_OFF #define _MM_FLUSH_ZERO_OFF 0x0000 -/* Denormals are zeros mode macros. */ +#endif +/* Denormals-are-zero (DAZ) mode macros. + * On x86, DAZ (MXCSR bit 6) treats denormal inputs as zero. + * On ARM, setting DAZ enables the same FPCR/FPSCR bit 24 as FTZ, + * providing unified handling for both input and output denormals. + */ +#ifndef _MM_DENORMALS_ZERO_MASK #define _MM_DENORMALS_ZERO_MASK 0x0040 +#endif +#ifndef _MM_DENORMALS_ZERO_ON #define _MM_DENORMALS_ZERO_ON 0x0040 +#endif +#ifndef _MM_DENORMALS_ZERO_OFF #define _MM_DENORMALS_ZERO_OFF 0x0000 +#endif + +/* MXCSR Exception Flags - NOT EMULATED + * + * SSE provides floating-point exception flags in the MXCSR register (bits 0-5) + * that are NOT emulated on ARM NEON. Code relying on _mm_getcsr() to detect + * floating-point exceptions will silently fail to detect them. 
+ * + * MXCSR Exception Flag Layout (x86): + * Bit 0 (IE): Invalid Operation Exception - NOT EMULATED + * Bit 1 (DE): Denormal Exception - NOT EMULATED + * Bit 2 (ZE): Divide-by-Zero Exception - NOT EMULATED + * Bit 3 (OE): Overflow Exception - NOT EMULATED + * Bit 4 (UE): Underflow Exception - NOT EMULATED + * Bit 5 (PE): Precision Exception - NOT EMULATED + * + * MXCSR Exception Mask Layout (x86): + * Bits 7-12: Exception masks (mask = suppress exception) - NOT EMULATED + * + * Why Not Emulated: + * - ARM NEON does not set sticky exception flags like x86 SSE + * - ARM FPSR (Floating-Point Status Register) has different semantics + * - Emulating per-operation exception tracking would require wrapping every + * floating-point intrinsic with software checks, severely impacting + * performance + * - Thread-local exception state tracking would add significant complexity + * + * Impact: + * - Scientific computing code checking for overflow/underflow will miss events + * - Financial applications validating precision will not detect precision loss + * - Numerical code checking for invalid operations (NaN generation) won't + * detect them + * + * Workarounds: + * - Use explicit NaN/Inf checks after critical operations: isnan(), isinf() + * - Implement application-level range validation for overflow detection + * - Use higher precision arithmetic where precision loss is critical + * + * The macros below are defined for API compatibility but provide no + * functionality. + */ + +/* Exception flag macros (MXCSR bits 0-5) - defined for API compatibility only + */ +#ifndef _MM_EXCEPT_INVALID +#define _MM_EXCEPT_INVALID 0x0001 +#endif +#ifndef _MM_EXCEPT_DENORM +#define _MM_EXCEPT_DENORM 0x0002 +#endif +#ifndef _MM_EXCEPT_DIV_ZERO +#define _MM_EXCEPT_DIV_ZERO 0x0004 +#endif +#ifndef _MM_EXCEPT_OVERFLOW +#define _MM_EXCEPT_OVERFLOW 0x0008 +#endif +#ifndef _MM_EXCEPT_UNDERFLOW +#define _MM_EXCEPT_UNDERFLOW 0x0010 +#endif +#ifndef _MM_EXCEPT_INEXACT +#define _MM_EXCEPT_INEXACT 0x0020 +#endif +#ifndef _MM_EXCEPT_MASK +#define _MM_EXCEPT_MASK \ + (_MM_EXCEPT_INVALID | _MM_EXCEPT_DENORM | _MM_EXCEPT_DIV_ZERO | \ + _MM_EXCEPT_OVERFLOW | _MM_EXCEPT_UNDERFLOW | _MM_EXCEPT_INEXACT) +#endif + +/* Exception mask macros (MXCSR bits 7-12) - defined for API compatibility only + */ +#ifndef _MM_MASK_INVALID +#define _MM_MASK_INVALID 0x0080 +#endif +#ifndef _MM_MASK_DENORM +#define _MM_MASK_DENORM 0x0100 +#endif +#ifndef _MM_MASK_DIV_ZERO +#define _MM_MASK_DIV_ZERO 0x0200 +#endif +#ifndef _MM_MASK_OVERFLOW +#define _MM_MASK_OVERFLOW 0x0400 +#endif +#ifndef _MM_MASK_UNDERFLOW +#define _MM_MASK_UNDERFLOW 0x0800 +#endif +#ifndef _MM_MASK_INEXACT +#define _MM_MASK_INEXACT 0x1000 +#endif +#ifndef _MM_MASK_MASK +#define _MM_MASK_MASK \ + (_MM_MASK_INVALID | _MM_MASK_DENORM | _MM_MASK_DIV_ZERO | \ + _MM_MASK_OVERFLOW | _MM_MASK_UNDERFLOW | _MM_MASK_INEXACT) +#endif -/* indicate immediate constant argument in a given range */ -#define __constrange(a, b) const +/* Exception state accessor macros - silent stubs for API compatibility. + * These macros exist for API compatibility but provide NO functionality. + * On ARM, exception flags are never set by sse2neon intrinsics. 
+ * + * _MM_GET_EXCEPTION_STATE() - Always returns 0 (no exceptions detected) + * _MM_SET_EXCEPTION_STATE() - Silently ignored (cannot clear nonexistent flags) + * _MM_GET_EXCEPTION_MASK() - Always returns all-masked (0x1F80) + * _MM_SET_EXCEPTION_MASK() - Silently ignored (no effect on ARM) + */ +#ifndef _MM_GET_EXCEPTION_STATE +#define _MM_GET_EXCEPTION_STATE() (0) +#endif +#ifndef _MM_SET_EXCEPTION_STATE +#define _MM_SET_EXCEPTION_STATE(x) ((void) (x)) +#endif +#ifndef _MM_GET_EXCEPTION_MASK +#define _MM_GET_EXCEPTION_MASK() (_MM_MASK_MASK) +#endif +#ifndef _MM_SET_EXCEPTION_MASK +#define _MM_SET_EXCEPTION_MASK(x) ((void) (x)) +#endif + +/* Compile-time validation for immediate constant arguments. + * This macro validates that: + * 1. The argument is a compile-time constant (via __builtin_constant_p) + * 2. The argument is within the specified range [min, max] + * + * When validation fails, __builtin_unreachable() is called to trigger + * compiler diagnostics. This pattern follows SIMDe's approach but adapted + * for use within macro bodies rather than as function attributes. + * + * Usage: Place at the beginning of macro bodies that require immediate + * constant arguments. The macro expands to a statement, so use a semicolon: + * SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); + */ +#if defined(__has_builtin) +#if __has_builtin(__builtin_constant_p) && __has_builtin(__builtin_unreachable) +#define SSE2NEON_REQUIRE_CONST_RANGE(arg, min, max) \ + (void) ((__builtin_constant_p(arg) && ((arg) < (min) || (arg) > (max))) \ + ? (__builtin_unreachable(), 0) \ + : 0) +#endif +#endif +#if !defined(SSE2NEON_REQUIRE_CONST_RANGE) +/* Fallback: no compile-time validation */ +#define SSE2NEON_REQUIRE_CONST_RANGE(arg, min, max) ((void) 0) +#endif + +/* Allow users to disable constant validation if needed for testing */ +#ifdef SSE2NEON_DISABLE_CONSTANT_VALIDATION +#undef SSE2NEON_REQUIRE_CONST_RANGE +#define SSE2NEON_REQUIRE_CONST_RANGE(arg, min, max) ((void) 0) +#endif /* A few intrinsics accept traditional data types like ints or floats, but * most operate on data types that are specific to SSE. @@ -316,13 +949,18 @@ typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ // On ARM 32-bit architecture, the float64x2_t is not supported. // The data type __m128d should be represented in a different way for related // intrinsic conversion. -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ #else typedef float32x4_t __m128d; #endif typedef int64x2_t __m128i; /* 128-bit vector containing integers */ +// Some intrinsics operate on unaligned data types. +typedef int16_t ALIGN_STRUCT(1) unaligned_int16_t; +typedef int32_t ALIGN_STRUCT(1) unaligned_int32_t; +typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t; + // __int64 is defined in the Intrinsics Guide which maps to different datatype // in different data model #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) @@ -412,7 +1050,7 @@ typedef int64x2_t __m128i; /* 128-bit vector containing integers */ #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) @@ -449,7 +1087,7 @@ typedef int64x2_t __m128i; /* 128-bit vector containing integers */ // by applications which attempt to access the contents of an __m128 struct // directly. 
It is important to note that accessing the __m128 struct directly // is bad coding practice by Microsoft: @see: -// https://docs.microsoft.com/en-us/cpp/cpp/m128 +// https://learn.microsoft.com/en-us/cpp/cpp/m128 // // However, some legacy source code may try to access the contents of an __m128 // struct directly so the developer can use the SIMDVec as an alias for it. Any @@ -480,10 +1118,128 @@ typedef union ALIGN_STRUCT(16) SIMDVec { uint64_t m128_u64[2]; // as unsigned 64-bit integers. } SIMDVec; -// casting using SIMDVec -#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) -#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) -#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) +// casting using SIMDVec +#define vreinterpretq_nth_u64_m128i(x, n) \ + (_sse2neon_reinterpret_cast(SIMDVec *, &x)->m128_u64[n]) +#define vreinterpretq_nth_u32_m128i(x, n) \ + (_sse2neon_reinterpret_cast(SIMDVec *, &x)->m128_u32[n]) +#define vreinterpretq_nth_u8_m128i(x, n) \ + (_sse2neon_reinterpret_cast(SIMDVec *, &x)->m128_u8[n]) + +/* Portable infinity check using IEEE 754 bit representation. + * Infinity has all exponent bits set and zero mantissa bits. + * This avoids dependency on math.h INFINITY macro or compiler builtins. + */ +FORCE_INLINE int _sse2neon_isinf_f32(float v) +{ + union { + float f; + uint32_t u; + } u = {v}; + /* Mask out sign bit, check if remaining bits equal infinity pattern */ + return (u.u & 0x7FFFFFFF) == 0x7F800000; +} + +FORCE_INLINE int _sse2neon_isinf_f64(double v) +{ + union { + double d; + uint64_t u; + } u = {v}; + return (u.u & 0x7FFFFFFFFFFFFFFFULL) == 0x7FF0000000000000ULL; +} + +/* Safe helper to load double[2] as float32x4_t without strict aliasing + * violation. Used in ARMv7 fallback paths where float64x2_t is not natively + * supported. + */ +FORCE_INLINE float32x4_t sse2neon_vld1q_f32_from_f64pair(const double *p) +{ + float32x4_t tmp; + memcpy(&tmp, p, sizeof(tmp)); + return tmp; +} + +/* Safe float/double to integer conversion with x86 SSE semantics. + * x86 SSE returns the "integer indefinite" value (0x80000000 for int32, + * 0x8000000000000000 for int64) for all out-of-range conversions including + * NaN, infinity, and values exceeding the representable range. + * ARM NEON differs by saturating to INT_MAX/INT_MIN for overflows and + * returning 0 for NaN, so we need these helpers to ensure x86 compatibility. + */ +FORCE_INLINE int32_t _sse2neon_cvtd_s32(double v) +{ + /* Check for NaN or infinity first */ + if (v != v || _sse2neon_isinf_f64(v)) + return INT32_MIN; + /* INT32_MAX is exactly representable as double (2147483647.0) */ + if (v >= _sse2neon_static_cast(double, INT32_MAX) + 1.0) + return INT32_MIN; + if (v < _sse2neon_static_cast(double, INT32_MIN)) + return INT32_MIN; + return _sse2neon_static_cast(int32_t, v); +} + +FORCE_INLINE int32_t _sse2neon_cvtf_s32(float v) +{ + if (v != v || _sse2neon_isinf_f32(v)) + return INT32_MIN; + /* (float)INT32_MAX rounds up to 2147483648.0f, which is out of range. + * Use the double representation for accurate comparison. + */ + if (v >= _sse2neon_static_cast(double, INT32_MAX) + 1.0) + return INT32_MIN; + if (v < _sse2neon_static_cast(double, INT32_MIN)) + return INT32_MIN; + return _sse2neon_static_cast(int32_t, v); +} + +FORCE_INLINE int64_t _sse2neon_cvtd_s64(double v) +{ + if (v != v || _sse2neon_isinf_f64(v)) + return INT64_MIN; + /* (double)INT64_MAX rounds up to 2^63 which is out of range. + * Any double >= 2^63 is out of range for int64. 
+ */ + if (v >= _sse2neon_static_cast(double, INT64_MAX)) + return INT64_MIN; + if (v < _sse2neon_static_cast(double, INT64_MIN)) + return INT64_MIN; + return _sse2neon_static_cast(int64_t, v); +} + +FORCE_INLINE int64_t _sse2neon_cvtf_s64(float v) +{ + if (v != v || _sse2neon_isinf_f32(v)) + return INT64_MIN; + /* (float)INT64_MAX rounds up significantly beyond INT64_MAX */ + if (v >= _sse2neon_static_cast(float, INT64_MAX)) + return INT64_MIN; + if (v < _sse2neon_static_cast(float, INT64_MIN)) + return INT64_MIN; + return _sse2neon_static_cast(int64_t, v); +} + +/* Vectorized helper: apply x86 saturation semantics to NEON conversion result. + * ARM returns 0 for NaN and INT32_MAX for positive overflow, but x86 returns + * INT32_MIN ("integer indefinite") for both. This function fixes up the result. + */ +FORCE_INLINE int32x4_t _sse2neon_cvtps_epi32_fixup(float32x4_t f, int32x4_t cvt) +{ + /* Detect values >= 2147483648.0f (out of INT32 range) */ + float32x4_t max_f = vdupq_n_f32(2147483648.0f); + uint32x4_t overflow = vcgeq_f32(f, max_f); + + /* Detect NaN: x != x for NaN values */ + uint32x4_t is_nan = vmvnq_u32(vceqq_f32(f, f)); + + /* Combine: any overflow or NaN should produce INT32_MIN */ + uint32x4_t need_indefinite = vorrq_u32(overflow, is_nan); + + /* Blend: select INT32_MIN where needed */ + int32x4_t indefinite = vdupq_n_s32(INT32_MIN); + return vbslq_s32(need_indefinite, indefinite, cvt); +} /* SSE macros */ #define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode @@ -493,7 +1249,9 @@ typedef union ALIGN_STRUCT(16) SIMDVec { // Function declaration // SSE -FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(); +FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void); +FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void); +FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int); FORCE_INLINE __m128 _mm_move_ss(__m128, __m128); FORCE_INLINE __m128 _mm_or_ps(__m128, __m128); FORCE_INLINE __m128 _mm_set_ps1(float); @@ -509,7 +1267,7 @@ FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int); FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t); FORCE_INLINE __m128d _mm_set_pd(double, double); FORCE_INLINE __m128i _mm_set1_epi32(int); -FORCE_INLINE __m128i _mm_setzero_si128(); +FORCE_INLINE __m128i _mm_setzero_si128(void); // SSE4.1 FORCE_INLINE __m128d _mm_ceil_pd(__m128d); FORCE_INLINE __m128 _mm_ceil_ps(__m128); @@ -523,10 +1281,9 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t); /* Backwards compatibility for compilers with lack of specific type support */ // Older gcc does not define vld1q_u8_x4 type -#if defined(__GNUC__) && !defined(__clang__) && \ - ((__GNUC__ <= 12 && defined(__arm__)) || \ - (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \ - (__GNUC__ <= 9 && defined(__aarch64__))) +#if defined(__GNUC__) && !defined(__clang__) && \ + ((__GNUC__ <= 13 && defined(__arm__)) || \ + (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__))) FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) { uint8x16x4_t ret; @@ -544,7 +1301,39 @@ FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) } #endif -#if !defined(__aarch64__) +/* Wrapper for vcreate_u64 to handle Apple iOS toolchain variations. + * On iOS, vcreate_u64 may be defined as a macro in arm_neon.h, which can + * cause parsing issues in complex macro expansions. + * This wrapper provides a function-call interface using vdup_n_u64(), which + * is bit-exact and avoids macro expansion pitfalls. 
+ *
+ * Other AArch64 platforms (Linux, macOS, Android) use native vcreate_u64.
+ *
+ * User override: Define SSE2NEON_IOS_COMPAT=1 to enable,
+ * or SSE2NEON_IOS_COMPAT=0 to disable.
+ */
+#if defined(__APPLE__) && SSE2NEON_ARCH_AARCH64
+#include <TargetConditionals.h>
+#endif
+
+#ifndef SSE2NEON_IOS_COMPAT
+#if defined(__APPLE__) && SSE2NEON_ARCH_AARCH64 && TARGET_OS_IOS
+#define SSE2NEON_IOS_COMPAT 1
+#else
+#define SSE2NEON_IOS_COMPAT 0
+#endif
+#endif
+
+#if SSE2NEON_IOS_COMPAT
+FORCE_INLINE uint64x1_t _sse2neon_vcreate_u64(uint64_t a)
+{
+    return vdup_n_u64(a);
+}
+#else
+#define _sse2neon_vcreate_u64(a) vcreate_u64(a)
+#endif
+
+#if !SSE2NEON_ARCH_AARCH64
 /* emulate vaddv u8 variant */
 FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
 {
@@ -559,7 +1348,7 @@ FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
 }
 #endif
 
-#if !defined(__aarch64__)
+#if !SSE2NEON_ARCH_AARCH64
 /* emulate vaddvq u8 variant */
 FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
 {
@@ -577,7 +1366,7 @@ FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
 }
 #endif
 
-#if !defined(__aarch64__)
+#if !SSE2NEON_ARCH_AARCH64
 /* emulate vaddvq u16 variant */
 FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
 {
@@ -585,7 +1374,7 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
     uint64x2_t n = vpaddlq_u32(m);
     uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
 
-    return vget_lane_u32((uint32x2_t) o, 0);
+    return vget_lane_u32(vreinterpret_u32_u64(o), 0);
 }
 #else
 // Wraps vaddvq_u16
@@ -595,6 +1384,33 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
 }
 #endif
 
+/* Fast "any nonzero" check for horizontal reduction in PCMPXSTR operations.
+ * These helpers are optimized for the "any match" test pattern common in
+ * string comparison intrinsics. On ARMv7, OR-based reduction is used instead
+ * of max-based reduction for slightly better performance on some cores.
+ *
+ * For NEON comparison results (0x00 or 0xFF per lane), OR-based reduction
+ * correctly detects any nonzero element because: max(a,b) > 0 IFF OR(a,b) != 0
+ */
+#if !SSE2NEON_ARCH_AARCH64
+/* ARMv7: OR-based reduction - 3 ops vs 4 ops for vpmax cascade */
+FORCE_INLINE uint32_t _sse2neon_any_nonzero_u8x16(uint8x16_t v)
+{
+    uint32x4_t as_u32 = vreinterpretq_u32_u8(v);
+    uint32x2_t or_half = vorr_u32(vget_low_u32(as_u32), vget_high_u32(as_u32));
+    uint32x2_t or_final = vorr_u32(or_half, vrev64_u32(or_half));
+    return vget_lane_u32(or_final, 0);
+}
+
+FORCE_INLINE uint32_t _sse2neon_any_nonzero_u16x8(uint16x8_t v)
+{
+    uint32x4_t as_u32 = vreinterpretq_u32_u16(v);
+    uint32x2_t or_half = vorr_u32(vget_low_u32(as_u32), vget_high_u32(as_u32));
+    uint32x2_t or_final = vorr_u32(or_half, vrev64_u32(or_half));
+    return vget_lane_u32(or_final, 0);
+}
+#endif
+
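/* Typical use of the reduction helpers above: collapsing a NEON comparison
 * mask to a scalar "did any lane match?" answer, as the PCMPXSTR emulation
 * does. A minimal sketch (illustrative only):
 *
 *     #if !SSE2NEON_ARCH_AARCH64
 *     static inline int any_bytes_equal(uint8x16_t a, uint8x16_t b)
 *     {
 *         // vceqq_u8 yields 0xFF in each matching lane, 0x00 elsewhere,
 *         // so the OR-based reduction is nonzero iff some lane matched
 *         return _sse2neon_any_nonzero_u8x16(vceqq_u8(a, b)) != 0;
 *     }
 *     #endif
 */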
 /* Function Naming Conventions
  * The naming convention of SSE intrinsics is straightforward. A generic SSE
  * intrinsic function is given as follows:
@@ -607,7 +1423,7 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
  * This last part, <data_type>, is a little complicated. It identifies the
  * content of the input values, and can be set to any of the following values:
  * + ps - vectors contain floats (ps stands for packed single-precision)
- * + pd - vectors cantain doubles (pd stands for packed double-precision)
+ * + pd - vectors contain doubles (pd stands for packed double-precision)
  * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
  *   signed integers
  * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
@@ -629,50 +1445,16 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
  *     4, 5, 12, 13, 6, 7, 14, 15);
  * // Shuffle packed 8-bit integers
  * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
-
- * Data (Number, Binary, Byte Index):
- +------+------+-------------+------+------+-------------+
- | 1           | 2           | 3           | 4           | Number
- +------+------+------+------+------+------+------+------+
- | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
- +------+------+------+------+------+------+------+------+
- | 0    | 1    | 2    | 3    | 4    | 5    | 6    | 7    | Index
- +------+------+------+------+------+------+------+------+
-
- +------+------+------+------+------+------+------+------+
- | 5           | 6           | 7           | 8           | Number
- +------+------+------+------+------+------+------+------+
- | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
- +------+------+------+------+------+------+------+------+
- | 8    | 9    | 10   | 11   | 12   | 13   | 14   | 15   | Index
- +------+------+------+------+------+------+------+------+
- * Index (Byte Index):
- +------+------+------+------+------+------+------+------+
- | 1    | 0    | 2    | 3    | 8    | 9    | 10   | 11   |
- +------+------+------+------+------+------+------+------+
-
- +------+------+------+------+------+------+------+------+
- | 4    | 5    | 12   | 13   | 6    | 7    | 14   | 15   |
- +------+------+------+------+------+------+------+------+
- * Result:
- +------+------+------+------+------+------+------+------+
- | 1    | 0    | 2    | 3    | 8    | 9    | 10   | 11   | Index
- +------+------+------+------+------+------+------+------+
- | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
- +------+------+------+------+------+------+------+------+
- | 256         | 2           | 5           | 6           | Number
- +------+------+------+------+------+------+------+------+
-
- +------+------+------+------+------+------+------+------+
- | 4    | 5    | 12   | 13   | 6    | 7    | 14   | 15   | Index
- +------+------+------+------+------+------+------+------+
- | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
- +------+------+------+------+------+------+------+------+
- | 3           | 7           | 4           | 8           | Number
- +------+------+------+------+------+------+-------------+
 */

/* Constants for use with _mm_prefetch.
*/ +#if SSE2NEON_ARM64EC +/* winnt.h defines these as macros; undef to allow our enum definition */ +#undef _MM_HINT_NTA +#undef _MM_HINT_T0 +#undef _MM_HINT_T1 +#undef _MM_HINT_T2 +#endif enum _mm_hint { _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ @@ -688,7 +1470,7 @@ typedef struct { uint8_t bit23 : 1; uint8_t bit24 : 1; uint8_t res2 : 7; -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 uint32_t res3; #endif } fpcr_bitfield; @@ -785,11 +1567,16 @@ FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) { - float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); - float32x2_t a22 = - vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); - float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ - float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + /* vtrn interleaves elements: trn1({a[2],a[3]}, {a[0],a[1]}) = {a[2], a[0]} + */ +#if SSE2NEON_ARCH_AARCH64 + float32x2_t a02 = vtrn1_f32(vget_high_f32(_a), vget_low_f32(_a)); +#else + float32x2_t a02 = vtrn_f32(vget_high_f32(_a), vget_low_f32(_a)).val[0]; +#endif + float32x2_t b32 = vget_high_f32(_b); return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); } @@ -828,24 +1615,24 @@ FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); } -// Kahan summation for accurate summation of floating-point numbers. -// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html -FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y) -{ - y -= *c; - float t = *sum + y; - *c = (t - *sum) - y; - *sum = t; -} - -#if defined(__ARM_FEATURE_CRYPTO) && \ - (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64)) +// For MSVC, we check only if it is ARM64, as every single ARM64 processor +// supported by WoA has crypto extensions. If this changes in the future, +// this can be verified via the runtime-only method of: +// IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) +#if ((defined(_M_ARM64) || SSE2NEON_ARM64EC) && !defined(__clang__)) || \ + (defined(__ARM_FEATURE_CRYPTO) && \ + (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))) // Wraps vmull_p64 FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) { poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG + __n64 a1 = {a}, b1 = {b}; + return vreinterpretq_u64_p128(vmull_p64(a1, b1)); +#else return vreinterpretq_u64_p128(vmull_p64(a, b)); +#endif } #else // ARMv7 polyfill // ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. @@ -896,7 +1683,7 @@ static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL // instructions. 
-#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 uint8x16_t lm_p0 = vreinterpretq_u8_u64( vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); uint8x16_t lm_p1 = vreinterpretq_u8_u64( @@ -924,7 +1711,7 @@ static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); // De-interleave -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 uint8x16_t t0 = vreinterpretq_u8_u64( vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); uint8x16_t t1 = vreinterpretq_u8_u64( @@ -955,29 +1742,25 @@ static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) #endif // ARMv7 polyfill // C equivalent: -// __m128i _mm_shuffle_epi32_default(__m128i a, -// __constrange(0, 255) int imm) { +// __m128i _mm_shuffle_epi32_default(__m128i a, const int imm) { +// // imm must be a compile-time constant in range [0, 255] // __m128i ret; -// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; -// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; +// ret[0] = a[(imm) & 0x3]; ret[1] = a[((imm) >> 2) & 0x3]; +// ret[2] = a[((imm) >> 4) & 0x03]; ret[3] = a[((imm) >> 6) & 0x03]; // return ret; // } #define _mm_shuffle_epi32_default(a, imm) \ - __extension__({ \ - int32x4_t ret; \ - ret = vmovq_n_s32( \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \ - ret = vsetq_lane_s32( \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \ - ret, 1); \ - ret = vsetq_lane_s32( \ + vreinterpretq_m128i_s32(vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ + vsetq_lane_s32( \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ - ret, 2); \ - ret = vsetq_lane_s32( \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ - ret, 3); \ - vreinterpretq_m128i_s32(ret); \ - }) + vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), \ + ((imm) >> 2) & 0x3), \ + vmovq_n_s32(vgetq_lane_s32( \ + vreinterpretq_s32_m128i(a), (imm) & (0x3))), \ + 1), \ + 2), \ + 3)) // Takes the upper 64 bits of a and places it in the low end of the result // Takes the lower 64 bits of a and places it into the high end of the result. @@ -1061,62 +1844,49 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); } -// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) -// int imm) -#if defined(__aarch64__) -#define _mm_shuffle_epi32_splat(a, imm) \ - __extension__({ \ - vreinterpretq_m128i_s32( \ - vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \ - }) +#if SSE2NEON_ARCH_AARCH64 +#define _mm_shuffle_epi32_splat(a, imm) \ + vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))) #else -#define _mm_shuffle_epi32_splat(a, imm) \ - __extension__({ \ - vreinterpretq_m128i_s32( \ - vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \ - }) +#define _mm_shuffle_epi32_splat(a, imm) \ + vreinterpretq_m128i_s32( \ + vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))) #endif -// NEON does not support a general purpose permute intrinsic -// Selects four specific single-precision, floating-point values from a and b, -// based on the mask i. +// NEON does not support a general purpose permute intrinsic. +// Shuffle single-precision (32-bit) floating-point elements in a using the +// control in imm8, and store the results in dst. 
// // C equivalent: -// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, -// __constrange(0, 255) int imm) { +// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, const int imm) { +// // imm must be a compile-time constant in range [0, 255] // __m128 ret; -// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; -// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; +// ret[0] = a[(imm) & 0x3]; ret[1] = a[((imm) >> 2) & 0x3]; +// ret[2] = b[((imm) >> 4) & 0x03]; ret[3] = b[((imm) >> 6) & 0x03]; // return ret; // } // -// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx -#define _mm_shuffle_ps_default(a, b, imm) \ - __extension__({ \ - float32x4_t ret; \ - ret = vmovq_n_f32( \ - vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \ - ret = vsetq_lane_f32( \ - vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ - ret, 1); \ - ret = vsetq_lane_f32( \ - vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ - ret, 2); \ - ret = vsetq_lane_f32( \ - vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ - ret, 3); \ - vreinterpretq_m128_f32(ret); \ - }) - -// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified -// by imm. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100) -// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a, -// __constrange(0,255) int -// imm) +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps +#define _mm_shuffle_ps_default(a, b, imm) \ + vreinterpretq_m128_f32(vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ + vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ + vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ + vmovq_n_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), \ + 1), \ + 2), \ + 3)) + +// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8. +// Store the results in the low 64 bits of dst, with the high 64 bits being +// copied from a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16 #define _mm_shufflelo_epi16_function(a, imm) \ - __extension__({ \ - int16x8_t ret = vreinterpretq_s16_m128i(a); \ + _sse2neon_define1( \ + __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \ int16x4_t lowBits = vget_low_s16(ret); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ @@ -1125,18 +1895,15 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) 2); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ 3); \ - vreinterpretq_m128i_s16(ret); \ - }) + _sse2neon_return(vreinterpretq_m128i_s16(ret));) -// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified -// by imm. -// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx -// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, -// __constrange(0,255) int -// imm) +// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8. +// Store the results in the high 64 bits of dst, with the low 64 bits being +// copied from a to dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16 #define _mm_shufflehi_epi16_function(a, imm) \ - __extension__({ \ - int16x8_t ret = vreinterpretq_s16_m128i(a); \ + _sse2neon_define1( \ + __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \ int16x4_t highBits = vget_high_s16(ret); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ @@ -1145,8 +1912,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) 6); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ 7); \ - vreinterpretq_m128i_s16(ret); \ - }) + _sse2neon_return(vreinterpretq_m128i_s16(ret));) /* MMX */ @@ -1155,22 +1921,19 @@ FORCE_INLINE void _mm_empty(void) {} /* SSE */ -// Adds the four single-precision, floating-point values of a and b. -// -// r0 := a0 + b0 -// r1 := a1 + b1 -// r2 := a2 + b2 -// r3 := a3 + b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx +// Add packed single-precision (32-bit) floating-point elements in a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) { return vreinterpretq_m128_f32( vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } -// adds the scalar single-precision floating point values of a and b. -// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx +// Add the lower single-precision (32-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) { float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); @@ -1179,30 +1942,18 @@ FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) return vreinterpretq_m128_f32(vaddq_f32(a, value)); } -// Computes the bitwise AND of the four single-precision, floating-point values -// of a and b. -// -// r0 := a0 & b0 -// r1 := a1 & b1 -// r2 := a2 & b2 -// r3 := a3 & b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx +// Compute the bitwise AND of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); } -// Computes the bitwise AND-NOT of the four single-precision, floating-point -// values of a and b. -// -// r0 := ~a0 & b0 -// r1 := ~a1 & b1 -// r2 := ~a2 & b2 -// r3 := ~a3 & b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx +// Compute the bitwise NOT of packed single-precision (32-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( @@ -1212,12 +1963,6 @@ FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) // Average packed unsigned 16-bit integers in a and b, and store the results in // dst. 
-// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16 FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) { @@ -1227,12 +1972,6 @@ FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) // Average packed unsigned 8-bit integers in a and b, and store the results in // dst. -// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8 FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) { @@ -1240,173 +1979,192 @@ FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); } -// Compares for equality. -// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } -// Compares for equality. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100) +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); } -// Compares for greater than or equal. -// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } -// Compares for greater than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100) +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpge_ps(a, b)); } -// Compares for greater than. -// -// r0 := (a0 > b0) ? 0xffffffff : 0x0 -// r1 := (a1 > b1) ? 0xffffffff : 0x0 -// r2 := (a2 > b2) ? 0xffffffff : 0x0 -// r3 := (a3 > b3) ? 0xffffffff : 0x0 -// -// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } -// Compares for greater than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100) +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for greater-than, store the result in the lower element of dst, and copy +// the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); } -// Compares for less than or equal. -// -// r0 := (a0 <= b0) ? 0xffffffff : 0x0 -// r1 := (a1 <= b1) ? 0xffffffff : 0x0 -// r2 := (a2 <= b2) ? 0xffffffff : 0x0 -// r3 := (a3 <= b3) ? 0xffffffff : 0x0 -// -// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } -// Compares for less than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100) +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for less-than-or-equal, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmple_ps(a, b)); } -// Compares for less than -// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } -// Compares for less than -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100) +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmplt_ps(a, b)); } -// Compares for inequality. -// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } -// Compares for inequality. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100) +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); } -// Compares for not greater than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100) +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } -// Compares for not greater than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100) +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-greater-than-or-equal, store the result in the lower element of +// dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpnge_ps(a, b)); } -// Compares for not greater than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100) +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } -// Compares for not greater than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpngt_ps(a, b)); } -// Compares for not less than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100) +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } -// Compares for not less than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpnle_ps(a, b)); } -// Compares for not less than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100) +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } -// Compares for not less than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100) +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpnlt_ps(a, b)); } -// Compares the four 32-bit floats in a and b to check if any values are NaN. -// Ordered compare between each value returns true for "orderable" and false for -// "not orderable" (NaN). -// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see -// also: +// Compare packed single-precision (32-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps +// +// See also: // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) @@ -1421,15 +2179,18 @@ FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); } -// Compares for ordered. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100) +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpord_ps(a, b)); } -// Compares for unordered. 
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100) +// Compare packed single-precision (32-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) { uint32x4_t f32a = @@ -1439,16 +2200,18 @@ FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); } -// Compares for unordered. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100) +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); } -// Compares the lower single-precision floating point scalar values of a and b -// using an equality operation. : -// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) { uint32x4_t a_eq_b = @@ -1456,9 +2219,9 @@ FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) return vgetq_lane_u32(a_eq_b, 0) & 0x1; } -// Compares the lower single-precision floating point scalar values of a and b -// using a greater than or equal operation. : -// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) { uint32x4_t a_ge_b = @@ -1466,9 +2229,9 @@ FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) return vgetq_lane_u32(a_ge_b, 0) & 0x1; } -// Compares the lower single-precision floating point scalar values of a and b -// using a greater than operation. : -// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) { uint32x4_t a_gt_b = @@ -1476,9 +2239,9 @@ FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) return vgetq_lane_u32(a_gt_b, 0) & 0x1; } -// Compares the lower single-precision floating point scalar values of a and b -// using a less than or equal operation. : -// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) { uint32x4_t a_le_b = @@ -1486,11 +2249,9 @@ FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) return vgetq_lane_u32(a_le_b, 0) & 0x1; } -// Compares the lower single-precision floating point scalar values of a and b -// using a less than operation. : -// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important -// note!! The documentation on MSDN is incorrect! If either of the values is a -// NAN the docs say you will get a one, but in fact, it will return a zero!! +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) { uint32x4_t a_lt_b = @@ -1498,9 +2259,9 @@ FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) return vgetq_lane_u32(a_lt_b, 0) & 0x1; } -// Compares the lower single-precision floating point scalar values of a and b -// using an inequality operation. : -// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) { return !_mm_comieq_ss(a, b); @@ -1510,12 +2271,6 @@ FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) // (32-bit) floating-point elements, store the results in the lower 2 elements // of dst, and copy the upper 2 packed elements from a to the upper elements of // dst. -// -// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) -// dst[95:64] := a[95:64] -// dst[127:96] := a[127:96] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) { @@ -1526,16 +2281,10 @@ FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. -// -// FOR j := 0 to 1 -// i := 32*j -// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) { -#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING) +#if SSE2NEON_ARCH_AARCH64 || defined(__ARM_FEATURE_DIRECTED_ROUNDING) return vreinterpret_m64_s32( vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))))); #else @@ -1547,15 +2296,11 @@ FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) // Convert the signed 32-bit integer b to a single-precision (32-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. 
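A short sketch of the comi* scalar predicates above, with hypothetical values (assumes the header is included as "sse2neon.h"):

    __m128 x = _mm_set_ss(1.0f);
    __m128 y = _mm_set_ss(2.0f);
    int lt = _mm_comilt_ss(x, y);  /* 1: 1.0f < 2.0f */
    int ge = _mm_comige_ss(x, y);  /* 0 */
    int ne = _mm_comineq_ss(x, y); /* 1, implemented above as !_mm_comieq_ss */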
-// -// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -// dst[127:32] := a[127:32] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) { - return vreinterpretq_m128_f32( - vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); + return vreinterpretq_m128_f32(vsetq_lane_f32( + _sse2neon_static_cast(float, b), vreinterpretq_f32_m128(a), 0)); } // Convert the lower single-precision (32-bit) floating-point element in a to a @@ -1563,25 +2308,18 @@ FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si FORCE_INLINE int _mm_cvt_ss2si(__m128 a) { -#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING) +#if SSE2NEON_ARCH_AARCH64 || defined(__ARM_FEATURE_DIRECTED_ROUNDING) return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))), 0); #else float32_t data = vgetq_lane_f32( vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); - return (int32_t) data; + return _sse2neon_static_cast(int32_t, data); #endif } // Convert packed 16-bit integers in a to packed single-precision (32-bit) // floating-point elements, and store the results in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// m := j*32 -// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) { @@ -1592,12 +2330,6 @@ FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) // Convert packed 32-bit integers in b to packed single-precision (32-bit) // floating-point elements, store the results in the lower 2 elements of dst, // and copy the upper 2 packed elements from a to the upper elements of dst. -// -// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) -// dst[95:64] := a[95:64] -// dst[127:96] := a[127:96] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) { @@ -1611,12 +2343,6 @@ FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) // of dst, then convert the packed signed 32-bit integers in b to // single-precision (32-bit) floating-point element, and store the results in // the upper 2 elements of dst. -// -// dst[31:0] := Convert_Int32_To_FP32(a[31:0]) -// dst[63:32] := Convert_Int32_To_FP32(a[63:32]) -// dst[95:64] := Convert_Int32_To_FP32(b[31:0]) -// dst[127:96] := Convert_Int32_To_FP32(b[63:32]) -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) { @@ -1626,13 +2352,6 @@ FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) // Convert the lower packed 8-bit integers in a to packed single-precision // (32-bit) floating-point elements, and store the results in dst. -// -// FOR j := 0 to 3 -// i := j*8 -// m := j*32 -// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) { @@ -1644,17 +2363,6 @@ FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) // packed 16-bit integers, and store the results in dst. Note: this intrinsic // will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and // 0x7FFFFFFF. 
-// -// FOR j := 0 to 3 -// i := 16*j -// k := 32*j -// IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF) -// dst[i+15:i] := 0x7FFF -// ELSE -// dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k]) -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16 FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) { @@ -1664,12 +2372,6 @@ FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. -// -// FOR j := 0 to 1 -// i := 32*j -// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32 #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) @@ -1677,17 +2379,6 @@ FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) // packed 8-bit integers, and store the results in lower 4 elements of dst. // Note: this intrinsic will generate 0x7F, rather than 0x80, for input values // between 0x7F and 0x7FFFFFFF. -// -// FOR j := 0 to 3 -// i := 8*j -// k := 32*j -// IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF) -// dst[i+7:i] := 0x7F -// ELSE -// dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k]) -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8 FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a) { @@ -1697,13 +2388,6 @@ FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a) // Convert packed unsigned 16-bit integers in a to packed single-precision // (32-bit) floating-point elements, and store the results in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// m := j*32 -// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) { @@ -1714,13 +2398,6 @@ FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) // Convert the lower packed unsigned 8-bit integers in a to packed // single-precision (32-bit) floating-point elements, and store the results in // dst. -// -// FOR j := 0 to 3 -// i := j*8 -// m := j*32 -// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) { @@ -1731,31 +2408,20 @@ FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) // Convert the signed 32-bit integer b to a single-precision (32-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. -// -// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -// dst[127:32] := a[127:32] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) // Convert the signed 64-bit integer b to a single-precision (32-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. 
-// -// dst[31:0] := Convert_Int64_To_FP32(b[63:0]) -// dst[127:32] := a[127:32] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) { - return vreinterpretq_m128_f32( - vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); + return vreinterpretq_m128_f32(vsetq_lane_f32( + _sse2neon_static_cast(float, b), vreinterpretq_f32_m128(a), 0)); } // Copy the lower single-precision (32-bit) floating-point element of a to dst. -// -// dst[31:0] := a[31:0] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32 FORCE_INLINE float _mm_cvtss_f32(__m128 a) { @@ -1764,111 +2430,95 @@ FORCE_INLINE float _mm_cvtss_f32(__m128 a) // Convert the lower single-precision (32-bit) floating-point element in a to a // 32-bit integer, and store the result in dst. -// -// dst[31:0] := Convert_FP32_To_Int32(a[31:0]) -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32 #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) // Convert the lower single-precision (32-bit) floating-point element in a to a // 64-bit integer, and store the result in dst. -// -// dst[63:0] := Convert_FP32_To_Int64(a[31:0]) -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64 FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a) { -#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING) - return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0); +#if SSE2NEON_ARCH_AARCH64 || defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return _sse2neon_static_cast( + int64_t, vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0)); #else float32_t data = vgetq_lane_f32( vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); - return (int64_t) data; + return _sse2neon_static_cast(int64_t, data); #endif } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. -// -// FOR j := 0 to 1 -// i := 32*j -// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) { - return vreinterpret_m64_s32( - vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))); + float32x4_t f = vreinterpretq_f32_m128(a); + int32x4_t cvt = vcvtq_s32_f32(f); + int32x4_t result = _sse2neon_cvtps_epi32_fixup(f, cvt); + return vreinterpret_m64_s32(vget_low_s32(result)); } // Convert the lower single-precision (32-bit) floating-point element in a to a // 32-bit integer with truncation, and store the result in dst. -// -// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) -// +// x86 returns INT32_MIN for NaN and out-of-range values. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) { - return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0); + return _sse2neon_cvtf_s32(vgetq_lane_f32(vreinterpretq_f32_m128(a), 0)); } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. 
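The difference between the rounding and truncating scalar conversions above, sketched with a hypothetical value (assumes the default round-to-nearest mode is active):

    __m128 v = _mm_set_ss(-2.7f);
    int r = _mm_cvt_ss2si(v);  /* current rounding mode: -3 under round-to-nearest */
    int t = _mm_cvtt_ss2si(v); /* truncation toward zero: -2 */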
-//
-//   FOR j := 0 to 1
-//     i := 32*j
-//     dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
-//   ENDFOR
-//
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
 #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)

 // Convert the lower single-precision (32-bit) floating-point element in a to a
 // 32-bit integer with truncation, and store the result in dst.
-//
-// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
-//
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
 #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)

 // Convert the lower single-precision (32-bit) floating-point element in a to a
 // 64-bit integer with truncation, and store the result in dst.
-//
-// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
-//
+// x86 returns INT64_MIN for NaN and out-of-range values.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
 FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
 {
-    return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    return _sse2neon_cvtf_s64(vgetq_lane_f32(vreinterpretq_f32_m128(a), 0));
 }

-// Divides the four single-precision, floating-point values of a and b.
-//
-// r0 := a0 / b0
-// r1 := a1 / b1
-// r2 := a2 / b2
-// r3 := a3 / b3
-//
-// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
+// Divide packed single-precision (32-bit) floating-point elements in a by
+// packed elements in b, and store the results in dst.
+// Because ARMv7-A NEON lacks a precise division instruction, division is
+// implemented by estimating the reciprocal of b, refining that estimate with
+// the Newton-Raphson method, and then multiplying a by it. Define
+// SSE2NEON_PRECISE_DIV for an extra refinement step and improved precision on
+// ARMv7-A.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
 FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
 {
-#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
+#if SSE2NEON_ARCH_AARCH64
     return vreinterpretq_m128_f32(
         vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 #else
-    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+    float32x4_t _a = vreinterpretq_f32_m128(a);
+    float32x4_t _b = vreinterpretq_f32_m128(b);
+    float32x4_t recip = vrecpeq_f32(_b);
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, _b));
 #if SSE2NEON_PRECISE_DIV
-    // Additional Netwon-Raphson iteration for accuracy
-    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+    // Additional Newton-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, _b));
 #endif
-    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
+    return vreinterpretq_m128_f32(vmulq_f32(_a, recip));
 #endif
 }

-// Divides the scalar single-precision floating point value of a by b.
-// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
+// Divide the lower single-precision (32-bit) floating-point element in a by the
+// lower single-precision (32-bit) floating-point element in b, store the result
+// in the lower element of dst, and copy the upper 3 packed elements from a to
+// the upper elements of dst.
+// Warning: on ARMv7-A the result can differ from Intel's and is not
+// IEEE-compliant.
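The ARMv7-A fallback above leans on vrecpsq_f32, which computes 2 - a*b. A scalar sketch of one refinement step (refine_recip is a hypothetical helper, not part of the header):

    /* est approximates 1/b, as produced by vrecpeq_f32; one Newton-Raphson
       step roughly doubles the number of correct bits in the estimate. */
    static float refine_recip(float b, float est)
    {
        return est * (2.0f - b * est);
    }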
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) { float32_t value = @@ -1880,35 +2530,65 @@ FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) // Extract a 16-bit integer from a, selected with imm8, and store the result in // the lower element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16 -#define _mm_extract_pi16(a, imm) \ - (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm)) +// imm must be a compile-time constant in range [0, 3] +#define _mm_extract_pi16(a, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 3), \ + _sse2neon_static_cast(int32_t, \ + vget_lane_u16(vreinterpret_u16_m64(a), (imm)))) // Free aligned memory that was allocated with _mm_malloc. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free +// +// WARNING: Only use on pointers from _mm_malloc(). On Windows, passing memory +// from malloc/calloc/new corrupts the heap. See _mm_malloc() for details. #if !defined(SSE2NEON_ALLOC_DEFINED) FORCE_INLINE void _mm_free(void *addr) { +#if defined(_WIN32) + _aligned_free(addr); +#else free(addr); +#endif +} +#endif + +FORCE_INLINE uint64_t _sse2neon_get_fpcr(void) +{ + uint64_t value; +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG + value = _ReadStatusReg(ARM64_FPCR); +#else + __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */ +#endif + return value; } + +FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value) +{ +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG + _WriteStatusReg(ARM64_FPCR, value); +#else + __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */ #endif +} // Macro: Get the flush zero bits from the MXCSR control and status register. // The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or // _MM_FLUSH_ZERO_OFF // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE -FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode() +FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void) { union { fpcr_bitfield field; -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 uint64_t value; #else uint32_t value; #endif } r; -#if defined(__aarch64__) - __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#if SSE2NEON_ARCH_AARCH64 + r.value = _sse2neon_get_fpcr(); #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif @@ -1920,41 +2600,38 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode() // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, // _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE -FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE() -{ - union { - fpcr_bitfield field; -#if defined(__aarch64__) - uint64_t value; -#else - uint32_t value; -#endif - } r; - -#if defined(__aarch64__) - __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */ -#else - __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ -#endif - - if (r.field.bit22) { - return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP; - } else { - return r.field.bit23 ? 
_MM_ROUND_DOWN : _MM_ROUND_NEAREST;
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
+{
+    const int mask = FE_TONEAREST | FE_DOWNWARD | FE_UPWARD | FE_TOWARDZERO;
+    switch (fegetround() & mask) {
+    case FE_TONEAREST:
+        return _MM_ROUND_NEAREST;
+    case FE_DOWNWARD:
+        return _MM_ROUND_DOWN;
+    case FE_UPWARD:
+        return _MM_ROUND_UP;
+    case FE_TOWARDZERO:
+        return _MM_ROUND_TOWARD_ZERO;
+    default:
+        // On success fegetround() returns one of FE_TONEAREST, FE_DOWNWARD,
+        // FE_UPWARD or FE_TOWARDZERO; any other (error) value is treated as
+        // FE_TOWARDZERO (truncate).
+        return _MM_ROUND_TOWARD_ZERO;
+    }
 }

 // Copy a to dst, and insert the 16-bit integer i into dst at the location
 // specified by imm8.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
-#define _mm_insert_pi16(a, b, imm) \
-    __extension__({ \
-        vreinterpret_m64_s16( \
-            vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
-    })
+// imm must be a compile-time constant in range [0, 3]
+#define _mm_insert_pi16(a, b, imm) \
+    (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 3), \
+     vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))))

-// Loads four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
+// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
 FORCE_INLINE __m128 _mm_load_ps(const float *p)
 {
     return vreinterpretq_m128_f32(vld1q_f32(p));
@@ -1971,64 +2648,48 @@ FORCE_INLINE __m128 _mm_load_ps(const float *p)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
 #define _mm_load_ps1 _mm_load1_ps

-// Loads an single - precision, floating - point value into the low word and
-// clears the upper three words.
-// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
+// Load a single-precision (32-bit) floating-point element from memory into the
+// lower of dst, and zero the upper 3 elements. mem_addr does not need to be
+// aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
 FORCE_INLINE __m128 _mm_load_ss(const float *p)
 {
     return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
 }

-// Loads a single single-precision, floating-point value, copying it into all
-// four words
-// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
+// Load a single-precision (32-bit) floating-point element from memory into all
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
 FORCE_INLINE __m128 _mm_load1_ps(const float *p)
 {
     return vreinterpretq_m128_f32(vld1q_dup_f32(p));
 }

-// Sets the upper two single-precision, floating-point values with 64
-// bits of data loaded from the address p; the lower two values are passed
-// through from a.
-//
-// r0 := a0
-// r1 := a1
-// r2 := *p0
-// r3 := *p1
-//
-// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
+// Load 2 single-precision (32-bit) floating-point elements from memory into the
+// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
+// mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) { - return vreinterpretq_m128_f32( - vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); + return vreinterpretq_m128_f32(vcombine_f32( + vget_low_f32(a), + vld1_f32(_sse2neon_reinterpret_cast(const float32_t *, p)))); } -// Sets the lower two single-precision, floating-point values with 64 -// bits of data loaded from the address p; the upper two values are passed -// through from a. -// -// Return Value -// r0 := *p0 -// r1 := *p1 -// r2 := a2 -// r3 := a3 -// -// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx +// Load 2 single-precision (32-bit) floating-point elements from memory into the +// lower 2 elements of dst, and copy the upper 2 elements from a to dst. +// mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) { return vreinterpretq_m128_f32( - vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); + vcombine_f32(vld1_f32(_sse2neon_reinterpret_cast(const float32_t *, p)), + vget_high_f32(a))); } // Load 4 single-precision (32-bit) floating-point elements from memory into dst // in reverse order. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. -// -// dst[31:0] := MEM[mem_addr+127:mem_addr+96] -// dst[63:32] := MEM[mem_addr+95:mem_addr+64] -// dst[95:64] := MEM[mem_addr+63:mem_addr+32] -// dst[127:96] := MEM[mem_addr+31:mem_addr] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps FORCE_INLINE __m128 _mm_loadr_ps(const float *p) { @@ -2036,8 +2697,10 @@ FORCE_INLINE __m128 _mm_loadr_ps(const float *p) return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); } -// Loads four single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx +// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from memory into dst. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps FORCE_INLINE __m128 _mm_loadu_ps(const float *p) { // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are @@ -2046,35 +2709,48 @@ FORCE_INLINE __m128 _mm_loadu_ps(const float *p) } // Load unaligned 16-bit integer from memory into the first element of dst. -// -// dst[15:0] := MEM[mem_addr+15:mem_addr] -// dst[MAX:16] := 0 -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16 FORCE_INLINE __m128i _mm_loadu_si16(const void *p) { - return vreinterpretq_m128i_s16( - vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); + return vreinterpretq_m128i_s16(vsetq_lane_s16( + *_sse2neon_reinterpret_cast(const unaligned_int16_t *, p), + vdupq_n_s16(0), 0)); } // Load unaligned 64-bit integer from memory into the first element of dst. 
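How the half-register loads above compose, as a sketch with hypothetical buffers:

    float lo[2] = {1.0f, 2.0f}, hi[2] = {3.0f, 4.0f};
    __m128 v = _mm_setzero_ps();
    v = _mm_loadl_pi(v, (const __m64 *) lo); /* lanes 0-1 <- lo, lanes 2-3 kept */
    v = _mm_loadh_pi(v, (const __m64 *) hi); /* lanes 2-3 <- hi, lanes 0-1 kept */
    /* v is now {1, 2, 3, 4}, as if loaded with _mm_loadu_ps */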
-// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[MAX:64] := 0 -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64 FORCE_INLINE __m128i _mm_loadu_si64(const void *p) { - return vreinterpretq_m128i_s64( - vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); + return vreinterpretq_m128i_s64(vsetq_lane_s64( + *_sse2neon_reinterpret_cast(const unaligned_int64_t *, p), + vdupq_n_s64(0), 0)); } -// Allocate aligned blocks of memory. -// https://software.intel.com/en-us/ -// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks +// Allocate size bytes of memory, aligned to the alignment specified in align, +// and return a pointer to the allocated memory. _mm_free should be used to free +// memory that is allocated with _mm_malloc. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc +// +// Memory allocated by this function MUST be freed with _mm_free(), NOT with +// standard free() or delete. Mixing allocators: +// - Windows: CORRUPTS HEAP (free on _aligned_malloc memory is invalid) +// - Other platforms: Works (maps to free), but pair for Windows portability +// +// Incorrect usage (causes memory corruption on Windows): +// void *ptr = _mm_malloc(1024, 16); +// free(ptr); // WRONG - use _mm_free() instead +// +// Implementation notes: +// - Windows: Uses _aligned_malloc() +// - Other platforms: Uses posix_memalign() or malloc() for small alignments +// +// See also: _mm_free() for deallocation requirements. #if !defined(SSE2NEON_ALLOC_DEFINED) FORCE_INLINE void *_mm_malloc(size_t size, size_t align) { +#if defined(_WIN32) + return _aligned_malloc(size, align); +#else void *ptr; if (align == 1) return malloc(size); @@ -2083,6 +2759,7 @@ FORCE_INLINE void *_mm_malloc(size_t size, size_t align) if (!posix_memalign(&ptr, align, size)) return ptr; return NULL; +#endif } #endif @@ -2093,11 +2770,11 @@ FORCE_INLINE void *_mm_malloc(size_t size, size_t align) FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) { int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7); - __m128 b = _mm_load_ps((const float *) mem_addr); + __m128 b = _mm_load_ps(_sse2neon_reinterpret_cast(const float *, mem_addr)); int8x8_t masked = vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a), vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b)))); - vst1_s8((int8_t *) mem_addr, masked); + vst1_s8(_sse2neon_reinterpret_cast(int8_t *, mem_addr), masked); } // Conditionally store 8-bit integer elements from a into memory using mask @@ -2108,12 +2785,6 @@ FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) // Compare packed signed 16-bit integers in a and b, and store packed maximum // values in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16 FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) { @@ -2121,9 +2792,11 @@ FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); } -// Computes the maximums of the four single-precision, floating-point values of -// a and b. -// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx +// Compare packed single-precision (32-bit) floating-point elements in a and b, +// and store packed maximum values in dst. 
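Correct pairing of the allocation helpers documented above, as a sketch:

    float *buf = (float *) _mm_malloc(64 * sizeof(float), 16);
    if (buf) {
        /* ... use buf as 16-byte-aligned storage ... */
        _mm_free(buf); /* never free() or delete: on Windows this memory
                          comes from _aligned_malloc() */
    }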
dst does not follow the IEEE Standard +// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or +// signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) { #if SSE2NEON_PRECISE_MINMAX @@ -2138,12 +2811,6 @@ FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) // Compare packed unsigned 8-bit integers in a and b, and store packed maximum // values in dst. -// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8 FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) { @@ -2151,9 +2818,12 @@ FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); } -// Computes the maximum of the two lower scalar single-precision floating point -// values of a and b. -// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper 3 +// packed elements from a to the upper element of dst. dst does not follow the +// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when +// inputs are NaN or signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) { float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); @@ -2162,13 +2832,7 @@ FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) } // Compare packed signed 16-bit integers in a and b, and store packed minimum -// values in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) -// ENDFOR -// +// values in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16 FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) { @@ -2176,9 +2840,11 @@ FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); } -// Computes the minima of the four single-precision, floating-point values of a -// and b. -// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx +// Compare packed single-precision (32-bit) floating-point elements in a and b, +// and store packed minimum values in dst. dst does not follow the IEEE Standard +// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or +// signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) { #if SSE2NEON_PRECISE_MINMAX @@ -2193,12 +2859,6 @@ FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) // Compare packed unsigned 8-bit integers in a and b, and store packed minimum // values in dst. -// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8 FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) { @@ -2206,9 +2866,12 @@ FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); } -// Computes the minimum of the two lower scalar single-precision floating point -// values of a and b. 
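The non-IEEE behavior called out above matters for NaN propagation: x86 maxps/minps return the second operand when either input is NaN, while NEON's vmaxq_f32/vminq_f32 return NaN. A sketch (nanf() from <math.h>; SSE2NEON_PRECISE_MINMAX selects the x86-faithful path):

    __m128 a = _mm_set1_ps(nanf(""));
    __m128 b = _mm_set1_ps(1.0f);
    __m128 m = _mm_max_ps(a, b);
    /* x86, and this header with SSE2NEON_PRECISE_MINMAX=1: all lanes 1.0f (b);
       default NEON vmaxq_f32 path: all lanes NaN */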
-// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper 3 +// packed elements from a to the upper element of dst. dst does not follow the +// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when +// inputs are NaN or signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) { float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); @@ -2216,8 +2879,10 @@ FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); } -// Sets the low word to the single-precision, floating-point value of b -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100) +// Move the lower single-precision (32-bit) floating-point element from b to the +// lower element of dst, and copy the upper 3 packed elements from a to the +// upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) { return vreinterpretq_m128_f32( @@ -2225,25 +2890,26 @@ FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) vreinterpretq_f32_m128(a), 0)); } -// Moves the upper two values of B into the lower two values of A. -// -// r3 := a3 -// r2 := a2 -// r1 := b3 -// r0 := b2 -FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B) -{ - float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A)); - float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B)); +// Move the upper 2 single-precision (32-bit) floating-point elements from b to +// the lower 2 elements of dst, and copy the upper 2 elements from a to the +// upper 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps +FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_ARCH_AARCH64 + return vreinterpretq_m128_u64( + vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a))); +#else + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32(vcombine_f32(b32, a32)); +#endif } -// Moves the lower two values of B into the upper two values of A. -// -// r3 := b1 -// r2 := b0 -// r1 := a1 -// r0 := a0 +// Move the lower 2 single-precision (32-bit) floating-point elements from b to +// the upper 2 elements of dst, and copy the lower 2 elements from a to the +// lower 2 elements of dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) { float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A)); @@ -2257,53 +2923,45 @@ FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) FORCE_INLINE int _mm_movemask_pi8(__m64 a) { uint8x8_t input = vreinterpret_u8_m64(a); -#if defined(__aarch64__) - static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7}; +#if SSE2NEON_ARCH_AARCH64 + static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7}; uint8x8_t tmp = vshr_n_u8(input, 7); - return vaddv_u8(vshl_u8(tmp, shift)); + return vaddv_u8(vshl_u8(tmp, vld1_s8(shift))); #else - // Refer the implementation of `_mm_movemask_epi8` - uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7)); - uint32x2_t paired16 = - vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7)); - uint8x8_t paired32 = - vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14)); - return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4); + // Note: Uses the same method as _mm_movemask_epi8. + uint8x8_t msbs = vshr_n_u8(input, 7); + uint32x2_t bits = vreinterpret_u32_u8(msbs); + bits = vsra_n_u32(bits, bits, 7); + bits = vsra_n_u32(bits, bits, 14); + uint8x8_t output = vreinterpret_u8_u32(bits); + return (vget_lane_u8(output, 4) << 4) | vget_lane_u8(output, 0); #endif } -// NEON does not provide this method -// Creates a 4-bit mask from the most significant bits of the four -// single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx +// Set each bit of mask dst based on the most significant bit of the +// corresponding packed single-precision (32-bit) floating-point element in a. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps FORCE_INLINE int _mm_movemask_ps(__m128 a) { uint32x4_t input = vreinterpretq_u32_m128(a); -#if defined(__aarch64__) - static const int32x4_t shift = {0, 1, 2, 3}; +#if SSE2NEON_ARCH_AARCH64 + static const int32_t shift[4] = {0, 1, 2, 3}; uint32x4_t tmp = vshrq_n_u32(input, 31); - return vaddvq_u32(vshlq_u32(tmp, shift)); + return _sse2neon_static_cast(int, + vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)))); #else - // Uses the exact same method as _mm_movemask_epi8, see that for details. - // Shift out everything but the sign bits with a 32-bit unsigned shift - // right. - uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31)); - // Merge the two pairs together with a 64-bit unsigned shift right + add. - uint8x16_t paired = - vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); - // Extract the result. - return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); + // Note: Uses the same method as _mm_movemask_epi8. + uint32x4_t msbs = vshrq_n_u32(input, 31); + uint64x2_t bits = vreinterpretq_u64_u32(msbs); + bits = vsraq_n_u64(bits, bits, 31); + uint8x16_t output = vreinterpretq_u8_u64(bits); + return (vgetq_lane_u8(output, 8) << 2) | vgetq_lane_u8(output, 0); #endif } -// Multiplies the four single-precision, floating-point values of a and b. -// -// r0 := a0 * b0 -// r1 := a1 * b1 -// r2 := a2 * b2 -// r3 := a3 * b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx +// Multiply packed single-precision (32-bit) floating-point elements in a and b, +// and store the results in dst. 
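A quick numeric check of the sign-bit packing implemented above:

    __m128 v = _mm_setr_ps(-1.0f, 2.0f, -3.0f, 4.0f);
    int m = _mm_movemask_ps(v); /* lanes 0 and 2 are negative: m == 0x5 */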
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) { return vreinterpretq_m128_f32( @@ -2313,10 +2971,6 @@ FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) // Multiply the lower single-precision (32-bit) floating-point element in a and // b, store the result in the lower element of dst, and copy the upper 3 packed // elements from a to the upper elements of dst. -// -// dst[31:0] := a[31:0] * b[31:0] -// dst[127:32] := a[127:32] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) { @@ -2333,9 +2987,9 @@ FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); } -// Computes the bitwise OR of the four single-precision, floating-point values -// of a and b. -// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx +// Compute the bitwise OR of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( @@ -2344,23 +2998,11 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) // Average packed unsigned 8-bit integers in a and b, and store the results in // dst. -// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb #define _m_pavgb(a, b) _mm_avg_pu8(a, b) // Average packed unsigned 16-bit integers in a and b, and store the results in // dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw #define _m_pavgw(a, b) _mm_avg_pu16(a, b) @@ -2406,10 +3048,27 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) // Fetch the line of data from memory that contains address p to a location in -// the cache heirarchy specified by the locality hint i. +// the cache hierarchy specified by the locality hint i. 
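A usage sketch for the locality hints handled below (warm is a hypothetical helper; the hint is advisory on all platforms):

    static void warm(const float *data)
    {
        /* _MM_HINT_T0 targets the innermost cache level; _MM_HINT_NTA asks
           for minimal cache pollution. */
        _mm_prefetch((const char *) data, _MM_HINT_T0);
    }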
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch FORCE_INLINE void _mm_prefetch(char const *p, int i) { + (void) i; +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG + switch (i) { + case _MM_HINT_NTA: + __prefetch2(p, 1); + break; + case _MM_HINT_T0: + __prefetch2(p, 0); + break; + case _MM_HINT_T1: + __prefetch2(p, 2); + break; + case _MM_HINT_T2: + __prefetch2(p, 4); + break; + } +#else switch (i) { case _MM_HINT_NTA: __builtin_prefetch(p, 0, 0); @@ -2424,6 +3083,7 @@ FORCE_INLINE void _mm_prefetch(char const *p, int i) __builtin_prefetch(p, 0, 1); break; } +#endif } // Compute the absolute differences of packed unsigned 8-bit integers in a and @@ -2444,11 +3104,12 @@ FORCE_INLINE void _mm_prefetch(char const *p, int i) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) { - float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in)); - recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); + float32x4_t _in = vreinterpretq_f32_m128(in); + float32x4_t recip = vrecpeq_f32(_in); + recip = vmulq_f32(recip, vrecpsq_f32(recip, _in)); #if SSE2NEON_PRECISE_DIV - // Additional Netwon-Raphson iteration for accuracy - recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); + // Additional Newton-Raphson iteration for accuracy + recip = vmulq_f32(recip, vrecpsq_f32(recip, _in)); #endif return vreinterpretq_m128_f32(recip); } @@ -2457,30 +3118,41 @@ FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) // floating-point element in a, store the result in the lower element of dst, // and copy the upper 3 packed elements from a to the upper elements of dst. The // maximum relative error for this approximation is less than 1.5*2^-12. -// -// dst[31:0] := (1.0 / a[31:0]) -// dst[127:32] := a[127:32] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) { return _mm_move_ss(a, _mm_rcp_ps(a)); } -// Computes the approximations of the reciprocal square roots of the four -// single-precision floating point values of in. -// The current precision is 1% error. -// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx +// Compute the approximate reciprocal square root of packed single-precision +// (32-bit) floating-point elements in a, and store the results in dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) { - float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + float32x4_t _in = vreinterpretq_f32_m128(in); + float32x4_t out = vrsqrteq_f32(_in); + + // Generate masks for detecting whether input has any 0.0f/-0.0f + // (which becomes positive/negative infinity by IEEE-754 arithmetic rules). 
+ const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000); + const uint32x4_t neg_inf = vdupq_n_u32(0xFF800000); + const uint32x4_t has_pos_zero = + vceqq_u32(pos_inf, vreinterpretq_u32_f32(out)); + const uint32x4_t has_neg_zero = + vceqq_u32(neg_inf, vreinterpretq_u32_f32(out)); + + out = vmulq_f32(out, vrsqrtsq_f32(vmulq_f32(_in, out), out)); #if SSE2NEON_PRECISE_SQRT - // Additional Netwon-Raphson iteration for accuracy - out = vmulq_f32( - out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); - out = vmulq_f32( - out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); + // Additional Newton-Raphson iteration for accuracy + out = vmulq_f32(out, vrsqrtsq_f32(vmulq_f32(_in, out), out)); #endif + + // Set output vector element to infinity/negative-infinity if + // the corresponding input vector element is 0.0f/-0.0f. + out = vbslq_f32(has_pos_zero, vreinterpretq_f32_u32(pos_inf), out); + out = vbslq_f32(has_neg_zero, vreinterpretq_f32_u32(neg_inf), out); + return vreinterpretq_m128_f32(out); } @@ -2504,7 +3176,8 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) uint64x1_t t = vpaddl_u32(vpaddl_u16( vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))))); return vreinterpret_m64_u16( - vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0)); + vset_lane_u16(_sse2neon_static_cast(uint16_t, vget_lane_u64(t, 0)), + vdup_n_u16(0), 0)); } // Macro: Set the flush zero bits of the MXCSR control and status register to @@ -2517,38 +3190,40 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag) // regardless of the value of the FZ bit. union { fpcr_bitfield field; -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 uint64_t value; #else uint32_t value; #endif } r; -#if defined(__aarch64__) - __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#if SSE2NEON_ARCH_AARCH64 + r.value = _sse2neon_get_fpcr(); #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON; -#if defined(__aarch64__) - __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */ +#if SSE2NEON_ARCH_AARCH64 + _sse2neon_set_fpcr(r.value); #else - __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ + __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } -// Sets the four single-precision, floating-point values to the four inputs. -// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx +// Set packed single-precision (32-bit) floating-point elements in dst with the +// supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) { float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; return vreinterpretq_m128_f32(vld1q_f32(data)); } -// Sets the four single-precision, floating-point values to w. -// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +// Broadcast single-precision (32-bit) floating-point value a to all elements of +// dst. 
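The zero-input fixup in _mm_rsqrt_ps above preserves x86's signed-infinity results, e.g. (a sketch):

    __m128 z = _mm_setr_ps(0.0f, -0.0f, 4.0f, 1.0f);
    __m128 r = _mm_rsqrt_ps(z);
    /* lanes: +Inf, -Inf, ~0.5f, ~1.0f (relative error < 1.5*2^-12) */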
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
 FORCE_INLINE __m128 _mm_set_ps1(float _w)
 {
     return vreinterpretq_m128_f32(vdupq_n_f32(_w));
@@ -2561,44 +3236,26 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
 FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
 {
-    union {
-        fpcr_bitfield field;
-#if defined(__aarch64__)
-        uint64_t value;
-#else
-        uint32_t value;
-#endif
-    } r;
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
-#else
-    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
-#endif
-
     switch (rounding) {
-    case _MM_ROUND_TOWARD_ZERO:
-        r.field.bit22 = 1;
-        r.field.bit23 = 1;
+    case _MM_ROUND_NEAREST:
+        rounding = FE_TONEAREST;
         break;
     case _MM_ROUND_DOWN:
-        r.field.bit22 = 0;
-        r.field.bit23 = 1;
+        rounding = FE_DOWNWARD;
        break;
     case _MM_ROUND_UP:
-        r.field.bit22 = 1;
-        r.field.bit23 = 0;
+        rounding = FE_UPWARD;
+        break;
+    case _MM_ROUND_TOWARD_ZERO:
+        rounding = FE_TOWARDZERO;
        break;
-    default: //_MM_ROUND_NEAREST
-        r.field.bit22 = 0;
-        r.field.bit23 = 0;
+    default:
+        // rounding must be one of _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+        // _MM_ROUND_UP or _MM_ROUND_TOWARD_ZERO; any other (invalid) value is
+        // treated as FE_TOWARDZERO (truncate).
+        rounding = FE_TOWARDZERO;
    }
-
-#if defined(__aarch64__)
-    __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
-#else
-    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
-#endif
+    fesetround(rounding);
 }

 // Copy single-precision (32-bit) floating-point element a to the lower element
@@ -2609,39 +3266,77 @@ FORCE_INLINE __m128 _mm_set_ss(float a)
     return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
 }

-// Sets the four single-precision, floating-point values to w.
-//
-// r0 := r1 := r2 := r3 := w
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
+// Broadcast single-precision (32-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
 FORCE_INLINE __m128 _mm_set1_ps(float _w)
 {
     return vreinterpretq_m128_f32(vdupq_n_f32(_w));
 }

-// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
+// Set the MXCSR control and status register with the value in unsigned 32-bit
+// integer a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
+//
+// Supported MXCSR fields:
+// - Bits 13-14: Rounding mode (RM) - SUPPORTED via ARM FPCR/FPSCR
+// - Bit 15 (FZ): Flush-to-zero mode - SUPPORTED via ARM FPCR/FPSCR bit 24
+// - Bit 6 (DAZ): Denormals-are-zero mode - SUPPORTED (unified with FZ on ARM)
+//
+// Unsupported MXCSR fields (silently ignored):
+// - Bits 0-5: Exception flags (IE, DE, ZE, OE, UE, PE) - NOT EMULATED
+// - Bits 7-12: Exception masks - NOT EMULATED
+// See "MXCSR Exception Flags - NOT EMULATED" documentation block for details.
+//
+// ARM Platform Behavior:
+// - ARM FPCR/FPSCR bit 24 provides unified FZ+DAZ behavior. Setting either
+//   _MM_FLUSH_ZERO_ON or _MM_DENORMALS_ZERO_ON enables the same ARM bit.
+// - ARMv7 NEON: "Flush-to-zero mode always enabled" per ARM ARM (impl may vary) +// - ARMv8: FPCR.FZ correctly controls denormal handling for NEON operations FORCE_INLINE void _mm_setcsr(unsigned int a) { - _MM_SET_ROUNDING_MODE(a); + _MM_SET_ROUNDING_MODE(a & _MM_ROUND_MASK); + // ARM FPCR.bit24 handles both FZ and DAZ - set if either is requested + _MM_SET_FLUSH_ZERO_MODE( + (a & _MM_FLUSH_ZERO_MASK) | + ((a & _MM_DENORMALS_ZERO_MASK) ? _MM_FLUSH_ZERO_ON : 0)); } -// FIXME: _mm_getcsr() implementation supports reading the rounding mode only. -FORCE_INLINE unsigned int _mm_getcsr() +// Get the unsigned 32-bit value of the MXCSR control and status register. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr +// +// Returned MXCSR fields: +// - Bits 13-14: Rounding mode (RM) - Reflects current ARM FPCR/FPSCR setting +// - Bit 15 (FZ): Flush-to-zero mode - Reflects ARM FPCR/FPSCR bit 24 +// - Bit 6 (DAZ): Denormals-are-zero mode - Mirrors FZ (unified on ARM) +// +// Fields always returned as zero (NOT EMULATED): +// - Bits 0-5: Exception flags - ALWAYS 0 (exceptions not tracked) +// - Bits 7-12: Exception masks - ALWAYS 0 (use _MM_GET_EXCEPTION_MASK() +// instead) See "MXCSR Exception Flags - NOT EMULATED" documentation block for +// details. +// +// ARM Platform Behavior: +// - When ARM FPCR/FPSCR bit 24 is enabled, both FZ and DAZ bits are reported +// as set (the original setting cannot be distinguished). +// - ARMv7 NEON: Returned bits reflect FPSCR, but NEON always flushes denormals +FORCE_INLINE unsigned int _mm_getcsr(void) { - return _MM_GET_ROUNDING_MODE(); + return _MM_GET_ROUNDING_MODE() | _MM_GET_FLUSH_ZERO_MODE() | + _MM_GET_DENORMALS_ZERO_MODE(); } -// Sets the four single-precision, floating-point values to the four inputs in -// reverse order. -// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx +// Set packed single-precision (32-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) { float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; return vreinterpretq_m128_f32(vld1q_f32(data)); } -// Clears the four single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx +// Return vector of type __m128 with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps FORCE_INLINE __m128 _mm_setzero_ps(void) { return vreinterpretq_m128_f32(vdupq_n_f32(0)); @@ -2650,30 +3345,32 @@ FORCE_INLINE __m128 _mm_setzero_ps(void) // Shuffle 16-bit integers in a using the control in imm8, and store the results // in dst. 
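A round-trip sketch of the control-register emulation described above (assumes <assert.h>):

    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    assert(_MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN);
    /* Only rounding, FZ and DAZ survive a set/get round trip; exception flags
       and masks (MXCSR bits 0-5 and 7-12) are not emulated and read back as 0. */
    _mm_setcsr(_MM_ROUND_NEAREST | _MM_FLUSH_ZERO_ON);
    assert((_mm_getcsr() & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON);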
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16 +// imm must be a compile-time constant in range [0, 255] #ifdef _sse2neon_shuffle #define _mm_shuffle_pi16(a, imm) \ __extension__({ \ - vreinterpret_m64_s16(vshuffle_s16( \ - vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \ - ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \ + SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ + vreinterpret_m64_s16( \ + vshuffle_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), \ + ((imm) & 0x3), (((imm) >> 2) & 0x3), \ + (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3))); \ }) #else -#define _mm_shuffle_pi16(a, imm) \ - __extension__({ \ - int16x4_t ret; \ - ret = \ - vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \ - ret = vset_lane_s16( \ - vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \ - 1); \ - ret = vset_lane_s16( \ - vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \ - 2); \ - ret = vset_lane_s16( \ - vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \ - 3); \ - vreinterpret_m64_s16(ret); \ - }) +#define _mm_shuffle_pi16(a, imm) \ + _sse2neon_define1( \ + __m64, a, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); int16x4_t ret; \ + ret = vmov_n_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3))); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 2) & 0x3), ret, \ + 1); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 4) & 0x3), ret, \ + 2); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 6) & 0x3), ret, \ + 3); \ + _sse2neon_return(vreinterpret_m64_s16(ret));) #endif // Perform a serializing operation on all store-to-memory instructions that were @@ -2707,11 +3404,12 @@ FORCE_INLINE void _mm_lfence(void) _sse2neon_smp_mb(); } -// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) -// int imm) +// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int imm) +// imm must be a compile-time constant in range [0, 255] #ifdef _sse2neon_shuffle #define _mm_shuffle_ps(a, b, imm) \ __extension__({ \ + SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ float32x4_t _input1 = vreinterpretq_f32_m128(a); \ float32x4_t _input2 = vreinterpretq_f32_m128(b); \ float32x4_t _shuf = \ @@ -2720,114 +3418,105 @@ FORCE_INLINE void _mm_lfence(void) vreinterpretq_m128_f32(_shuf); \ }) #else // generic -#define _mm_shuffle_ps(a, b, imm) \ - __extension__({ \ - __m128 ret; \ - switch (imm) { \ - case _MM_SHUFFLE(1, 0, 3, 2): \ - ret = _mm_shuffle_ps_1032((a), (b)); \ - break; \ - case _MM_SHUFFLE(2, 3, 0, 1): \ - ret = _mm_shuffle_ps_2301((a), (b)); \ - break; \ - case _MM_SHUFFLE(0, 3, 2, 1): \ - ret = _mm_shuffle_ps_0321((a), (b)); \ - break; \ - case _MM_SHUFFLE(2, 1, 0, 3): \ - ret = _mm_shuffle_ps_2103((a), (b)); \ - break; \ - case _MM_SHUFFLE(1, 0, 1, 0): \ - ret = _mm_movelh_ps((a), (b)); \ - break; \ - case _MM_SHUFFLE(1, 0, 0, 1): \ - ret = _mm_shuffle_ps_1001((a), (b)); \ - break; \ - case _MM_SHUFFLE(0, 1, 0, 1): \ - ret = _mm_shuffle_ps_0101((a), (b)); \ - break; \ - case _MM_SHUFFLE(3, 2, 1, 0): \ - ret = _mm_shuffle_ps_3210((a), (b)); \ - break; \ - case _MM_SHUFFLE(0, 0, 1, 1): \ - ret = _mm_shuffle_ps_0011((a), (b)); \ - break; \ - case _MM_SHUFFLE(0, 0, 2, 2): \ - ret = _mm_shuffle_ps_0022((a), (b)); \ - break; \ - case _MM_SHUFFLE(2, 2, 0, 0): \ - ret = _mm_shuffle_ps_2200((a), (b)); \ - break; \ - case _MM_SHUFFLE(3, 2, 0, 2): \ - ret = 
_mm_shuffle_ps_3202((a), (b)); \ - break; \ - case _MM_SHUFFLE(3, 2, 3, 2): \ - ret = _mm_movehl_ps((b), (a)); \ - break; \ - case _MM_SHUFFLE(1, 1, 3, 3): \ - ret = _mm_shuffle_ps_1133((a), (b)); \ - break; \ - case _MM_SHUFFLE(2, 0, 1, 0): \ - ret = _mm_shuffle_ps_2010((a), (b)); \ - break; \ - case _MM_SHUFFLE(2, 0, 0, 1): \ - ret = _mm_shuffle_ps_2001((a), (b)); \ - break; \ - case _MM_SHUFFLE(2, 0, 3, 2): \ - ret = _mm_shuffle_ps_2032((a), (b)); \ - break; \ - default: \ - ret = _mm_shuffle_ps_default((a), (b), (imm)); \ - break; \ - } \ - ret; \ - }) -#endif - -// Computes the approximations of square roots of the four single-precision, -// floating-point values of a. First computes reciprocal square roots and then -// reciprocals of the four values. -// -// r0 := sqrt(a0) -// r1 := sqrt(a1) -// r2 := sqrt(a2) -// r3 := sqrt(a3) -// -// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx +#define _mm_shuffle_ps(a, b, imm) \ + _sse2neon_define2( \ + __m128, a, b, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); __m128 ret; \ + switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_ps_1032(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_ps_2301(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_ps_0321(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_ps_2103(_a, _b); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_movelh_ps(_a, _b); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_ps_1001(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_ps_0101(_a, _b); \ + break; \ + case _MM_SHUFFLE(3, 2, 1, 0): \ + ret = _mm_shuffle_ps_3210(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 0, 1, 1): \ + ret = _mm_shuffle_ps_0011(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 0, 2, 2): \ + ret = _mm_shuffle_ps_0022(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 2, 0, 0): \ + ret = _mm_shuffle_ps_2200(_a, _b); \ + break; \ + case _MM_SHUFFLE(3, 2, 0, 2): \ + ret = _mm_shuffle_ps_3202(_a, _b); \ + break; \ + case _MM_SHUFFLE(3, 2, 3, 2): \ + ret = _mm_movehl_ps(_b, _a); \ + break; \ + case _MM_SHUFFLE(1, 1, 3, 3): \ + ret = _mm_shuffle_ps_1133(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 0, 1, 0): \ + ret = _mm_shuffle_ps_2010(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 0, 0, 1): \ + ret = _mm_shuffle_ps_2001(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 0, 3, 2): \ + ret = _mm_shuffle_ps_2032(_a, _b); \ + break; \ + default: \ + ret = _mm_shuffle_ps_default(_a, _b, (imm)); \ + break; \ + } _sse2neon_return(ret);) +#endif + +// Compute the square root of packed single-precision (32-bit) floating-point +// elements in a, and store the results in dst. +// Due to ARMv7-A NEON's lack of a precise square root intrinsic, we implement +// square root by multiplying input in with its reciprocal square root before +// using the Newton-Raphson method to approximate the results. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) { -#if SSE2NEON_PRECISE_SQRT - float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in)); +#if SSE2NEON_ARCH_AARCH64 && !SSE2NEON_PRECISE_SQRT + return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); +#else + float32x4_t _in = vreinterpretq_f32_m128(in); + float32x4_t recip = vrsqrteq_f32(_in); - // Test for vrsqrteq_f32(0) -> positive infinity case. - // Change to zero, so that s * 1/sqrt(s) result is zero too. 
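/* Illustrative sketch (not part of the patch), for the _mm_shuffle_ps macro
 * above: _MM_SHUFFLE(d, c, b, a) packs four 2-bit lane indices as
 * (d << 6) | (c << 4) | (b << 2) | a. The switch routes common encodings to
 * dedicated NEON helpers; anything else falls back to
 * _mm_shuffle_ps_default. */
static __m128 shuffle_sketch(__m128 a, __m128 b)
{
    /* dst = { a[0], a[1], b[2], b[3] }: encoding _MM_SHUFFLE(3, 2, 1, 0)
     * (0xE4), which hits the fast _mm_shuffle_ps_3210 case. */
    return _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
}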
+ // Test for vrsqrteq_f32(0) -> infinity case (both +Inf and -Inf). + // vrsqrteq_f32(+0) = +Inf, vrsqrteq_f32(-0) = -Inf + // Change recip to zero so that s * 1/sqrt(s) preserves signed zero: + // +0 * 0 = +0, -0 * 0 = -0 (IEEE-754 sign rule) + const uint32x4_t abs_mask = vdupq_n_u32(0x7FFFFFFF); const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000); const uint32x4_t div_by_zero = - vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip)); + vceqq_u32(pos_inf, vandq_u32(abs_mask, vreinterpretq_u32_f32(recip))); recip = vreinterpretq_f32_u32( vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip))); - // Additional Netwon-Raphson iteration for accuracy - recip = vmulq_f32( - vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), - recip); - recip = vmulq_f32( - vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), - recip); + recip = vmulq_f32(vrsqrtsq_f32(vmulq_f32(recip, recip), _in), recip); + // Additional Newton-Raphson iteration for accuracy + recip = vmulq_f32(vrsqrtsq_f32(vmulq_f32(recip, recip), _in), recip); // sqrt(s) = s * 1/sqrt(s) - return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip)); -#elif defined(__aarch64__) - return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); -#else - float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in)); - float32x4_t sq = vrecpeq_f32(recipsq); - return vreinterpretq_m128_f32(sq); + return vreinterpretq_m128_f32(vmulq_f32(_in, recip)); #endif } -// Computes the approximation of the square root of the scalar single-precision -// floating point value of in. -// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx +// Compute the square root of the lower single-precision (32-bit) floating-point +// element in a, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) { float32_t value = @@ -2836,8 +3525,10 @@ FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0)); } -// Stores four single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps FORCE_INLINE void _mm_store_ps(float *p, __m128 a) { vst1q_f32(p, vreinterpretq_f32_m128(a)); @@ -2846,12 +3537,6 @@ FORCE_INLINE void _mm_store_ps(float *p, __m128 a) // Store the lower single-precision (32-bit) floating-point element from a into // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte // boundary or a general-protection exception may be generated. -// -// MEM[mem_addr+31:mem_addr] := a[31:0] -// MEM[mem_addr+63:mem_addr+32] := a[31:0] -// MEM[mem_addr+95:mem_addr+64] := a[31:0] -// MEM[mem_addr+127:mem_addr+96] := a[31:0] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1 FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) { @@ -2859,8 +3544,9 @@ FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) vst1q_f32(p, vdupq_n_f32(a0)); } -// Stores the lower single - precision, floating - point value. 
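/* Illustrative sketch (not part of the patch), for the ARMv7 _mm_sqrt_ps path
 * above: vrsqrteq_f32 gives a rough 1/sqrt(x) estimate r, each vrsqrtsq_f32
 * step computes (3 - x*r*r)/2 per lane, and multiplying by r performs one
 * Newton-Raphson refinement r' = r * (3 - x*r*r) / 2. Two refinements, then
 * sqrt(x) = x * r. The same recurrence in scalar form: */
static float scalar_sqrt_nr_sketch(float x, float r /* initial rsqrt estimate */)
{
    r = r * (3.0f - x * r * r) * 0.5f; /* first Newton-Raphson refinement */
    r = r * (3.0f - x * r * r) * 0.5f; /* second refinement for accuracy */
    return x * r;                      /* sqrt(x) = x * 1/sqrt(x) */
}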
-// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx +// Store the lower single-precision (32-bit) floating-point element from a into +// memory. mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss FORCE_INLINE void _mm_store_ss(float *p, __m128 a) { vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0); @@ -2869,34 +3555,20 @@ FORCE_INLINE void _mm_store_ss(float *p, __m128 a) // Store the lower single-precision (32-bit) floating-point element from a into // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte // boundary or a general-protection exception may be generated. -// -// MEM[mem_addr+31:mem_addr] := a[31:0] -// MEM[mem_addr+63:mem_addr+32] := a[31:0] -// MEM[mem_addr+95:mem_addr+64] := a[31:0] -// MEM[mem_addr+127:mem_addr+96] := a[31:0] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps #define _mm_store1_ps _mm_store_ps1 -// Stores the upper two single-precision, floating-point values of a to the -// address p. -// -// *p0 := a2 -// *p1 := a3 -// -// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx +// Store the upper 2 single-precision (32-bit) floating-point elements from a +// into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) { *p = vreinterpret_m64_f32(vget_high_f32(a)); } -// Stores the lower two single-precision floating point values of a to the -// address p. -// -// *p0 := a0 -// *p1 := a1 -// -// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx +// Store the lower 2 single-precision (32-bit) floating-point elements from a +// into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) { *p = vreinterpret_m64_f32(vget_low_f32(a)); @@ -2905,12 +3577,6 @@ FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) // Store 4 single-precision (32-bit) floating-point elements from a into memory // in reverse order. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. -// -// MEM[mem_addr+31:mem_addr] := a[127:96] -// MEM[mem_addr+63:mem_addr+32] := a[95:64] -// MEM[mem_addr+95:mem_addr+64] := a[63:32] -// MEM[mem_addr+127:mem_addr+96] := a[31:0] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) { @@ -2919,8 +3585,10 @@ FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) vst1q_f32(p, rev); } -// Stores four single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. 
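/* Illustrative sketch (not part of the patch), for _mm_storel_pi /
 * _mm_storeh_pi above: they write the low and high float pairs of a __m128,
 * mapping directly to vget_low_f32 / vget_high_f32 on NEON. */
static void split_vector_sketch(float lo[2], float hi[2], __m128 v)
{
    _mm_storel_pi((__m64 *) lo, v); /* lo = { v[0], v[1] } */
    _mm_storeh_pi((__m64 *) hi, v); /* hi = { v[2], v[3] } */
}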
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) { vst1q_f32(p, vreinterpretq_f32_m128(a)); @@ -2930,44 +3598,52 @@ FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16 FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a) { - vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0); + vst1q_lane_s16(_sse2neon_reinterpret_cast(int16_t *, p), + vreinterpretq_s16_m128i(a), 0); } // Stores 64-bits of integer data a at the address p. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64 FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a) { - vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0); + vst1q_lane_s64(_sse2neon_reinterpret_cast(int64_t *, p), + vreinterpretq_s64_m128i(a), 0); } // Store 64-bits of integer data from a into memory using a non-temporal memory // hint. +// Note: ARM lacks direct non-temporal store for single 64-bit value. STNP +// requires pair stores; __builtin_nontemporal_store may generate regular store +// on AArch64 for sub-128-bit types. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a) { - vst1_s64((int64_t *) p, vreinterpret_s64_m64(a)); +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1_s64(_sse2neon_reinterpret_cast(int64_t *, p), vreinterpret_s64_m64(a)); +#endif } // Store 128-bits (composed of 4 packed single-precision (32-bit) floating- // point elements) from a into memory using a non-temporal memory hint. +// Note: On AArch64, __builtin_nontemporal_store generates STNP (Store +// Non-temporal Pair), providing true non-temporal hint for 128-bit stores. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) { #if __has_builtin(__builtin_nontemporal_store) - __builtin_nontemporal_store(reinterpret_cast(a), (float32x4_t *) p); + __builtin_nontemporal_store(a, + _sse2neon_reinterpret_cast(float32x4_t *, p)); #else vst1q_f32(p, vreinterpretq_f32_m128(a)); #endif } -// Subtracts the four single-precision, floating-point values of a and b. -// -// r0 := a0 - b0 -// r1 := a1 - b1 -// r2 := a2 - b2 -// r3 := a3 - b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx +// Subtract packed single-precision (32-bit) floating-point elements in b from +// packed single-precision (32-bit) floating-point elements in a, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) { return vreinterpretq_m128_f32( @@ -2978,10 +3654,6 @@ FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) // the lower single-precision (32-bit) floating-point element in a, store the // result in the lower element of dst, and copy the upper 3 packed elements from // a to the upper elements of dst. 
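/* Illustrative sketch (not part of the patch), for the _mm_stream_* notes
 * above: non-temporal stores are hints. On AArch64 with Clang/GCC the 128-bit
 * case can lower to STNP; elsewhere it degrades to a regular store, which is
 * still correct. Assumes _mm_sfence is available (it is elsewhere in this
 * header). */
static void stream_fill_sketch(float *dst /* 16-byte aligned */, int n)
{
    __m128 zero = _mm_setzero_ps();
    for (int i = 0; i + 4 <= n; i += 4)
        _mm_stream_ps(dst + i, zero); /* cache-bypass hint where supported */
    _mm_sfence(); /* order the streaming stores before subsequent accesses */
}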
-// -// dst[31:0] := a[31:0] - b[31:0] -// dst[127:32] := a[127:32] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) { @@ -2992,6 +3664,7 @@ FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the // transposed matrix in these vectors (row0 now contains column 0, etc.). // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS +#ifndef _MM_TRANSPOSE4_PS #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ do { \ float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ @@ -3005,6 +3678,7 @@ FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ vget_high_f32(ROW23.val[1])); \ } while (0) +#endif // according to the documentation, these intrinsics behave the same as the // non-'u' versions. We'll just alias them here. @@ -3016,47 +3690,55 @@ FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) #define _mm_ucomineq_ss _mm_comineq_ss // Return vector of type __m128i with undefined elements. +// Note: MSVC forces zero-initialization while GCC/Clang return truly undefined +// memory. Use SSE2NEON_UNDEFINED_ZERO=1 to force zero on all compilers. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128 FORCE_INLINE __m128i _mm_undefined_si128(void) { -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_UNDEFINED_ZERO || \ + (SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG) + return _mm_setzero_si128(); +#else +#if SSE2NEON_COMPILER_GCC_COMPAT #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wuninitialized" #endif __m128i a; return a; -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_COMPILER_GCC_COMPAT #pragma GCC diagnostic pop #endif +#endif } // Return vector of type __m128 with undefined elements. +// Note: MSVC forces zero-initialization while GCC/Clang return truly undefined +// memory. Use SSE2NEON_UNDEFINED_ZERO=1 to force zero on all compilers. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps FORCE_INLINE __m128 _mm_undefined_ps(void) { -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_UNDEFINED_ZERO || \ + (SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG) + return _mm_setzero_ps(); +#else +#if SSE2NEON_COMPILER_GCC_COMPAT #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wuninitialized" #endif __m128 a; return a; -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_COMPILER_GCC_COMPAT #pragma GCC diagnostic pop #endif +#endif } -// Selects and interleaves the upper two single-precision, floating-point values -// from a and b. -// -// r0 := a2 -// r1 := b2 -// r2 := a3 -// r3 := b3 -// -// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx +// Unpack and interleave single-precision (32-bit) floating-point elements from +// the high half a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128_f32( vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else @@ -3067,18 +3749,12 @@ FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) #endif } -// Selects and interleaves the lower two single-precision, floating-point values -// from a and b. 
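/* Illustrative sketch (not part of the patch), for _mm_undefined_ps /
 * _mm_undefined_si128 above: the contents are unspecified (or zero, on MSVC
 * or with the patch's SSE2NEON_UNDEFINED_ZERO knob), so every lane must be
 * overwritten before the value is read. For deterministic behavior across
 * compilers, define the knob before including the header:
 *
 *   #define SSE2NEON_UNDEFINED_ZERO 1
 *   #include "sse2neon.h"
 */
static __m128 undefined_then_fill_sketch(void)
{
    __m128 v = _mm_undefined_ps(); /* contents unspecified (or zero) */
    v = _mm_set1_ps(1.0f);         /* fully overwrite before any use */
    return v;
}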
-// -// r0 := a0 -// r1 := b0 -// r2 := a1 -// r3 := b1 -// -// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx +// Unpack and interleave single-precision (32-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128_f32( vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else @@ -3089,9 +3765,9 @@ FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) #endif } -// Computes bitwise EXOR (exclusive-or) of the four single-precision, -// floating-point values of a and b. -// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx +// Compute the bitwise XOR of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( @@ -3100,42 +3776,32 @@ FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) /* SSE2 */ -// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or -// unsigned 16-bit integers in b. -// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx +// Add packed 16-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16 FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } -// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or -// unsigned 32-bit integers in b. -// -// r0 := a0 + b0 -// r1 := a1 + b1 -// r2 := a2 + b2 -// r3 := a3 + b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx +// Add packed 32-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32 FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } -// Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or -// unsigned 32-bit integers in b. -// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx +// Add packed 64-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64 FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) { return vreinterpretq_m128i_s64( vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); } -// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or -// unsigned 8-bit integers in b. -// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90) +// Add packed 8-bit integers in a and b, and store the results in dst. 
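/* Illustrative sketch (not part of the patch), for the _mm_add_pd /
 * _mm_add_sd fallbacks above: the hunks replace pointer type-punning like
 * *(double *) &a0 with a lane read plus a value recast. The patch's actual
 * sse2neon_recast_u64_f64 is defined elsewhere in the header; an assumed
 * strict-aliasing-safe shape (requires <string.h> and <stdint.h>): */
static double recast_u64_f64_sketch(uint64_t u)
{
    double d;
    memcpy(&d, &u, sizeof d); /* bit-pattern copy, no aliasing violation */
    return d;
}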
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8 FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( @@ -3147,45 +3813,46 @@ FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = da[0] + db[0]; - c[1] = da[1] + db[1]; - return vld1q_f32((float32_t *) c); + c[0] = a0 + b0; + c[1] = a1 + b1; + return sse2neon_vld1q_f32_from_f64pair(c); #endif } // Add the lower double-precision (64-bit) floating-point element in a and b, // store the result in the lower element of dst, and copy the upper element from // a to the upper element of dst. -// -// dst[63:0] := a[63:0] + b[63:0] -// dst[127:64] := a[127:64] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_add_pd(a, b)); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); double c[2]; - c[0] = da[0] + db[0]; - c[1] = da[1]; - return vld1q_f32((float32_t *) c); + c[0] = a0 + b0; + c[1] = a1; + return sse2neon_vld1q_f32_from_f64pair(c); #endif } // Add 64-bit integers a and b, and store the result in dst. -// -// dst[63:0] := a[63:0] + b[63:0] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64 FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) { @@ -3193,15 +3860,9 @@ FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); } -// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b -// and saturates. -// -// r0 := SignedSaturate(a0 + b0) -// r1 := SignedSaturate(a1 + b1) -// ... -// r7 := SignedSaturate(a7 + b7) -// -// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx +// Add packed signed 16-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16 FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( @@ -3210,12 +3871,6 @@ FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) // Add packed signed 8-bit integers in a and b using saturation, and store the // results in dst. 
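/* Illustrative sketch (not part of the patch), for the _mm_adds_* family
 * above: NEON's vqaddq_* gives the exact SSE saturating-add semantics, e.g.
 * for signed 16-bit lanes 32767 + 1 clamps to 32767 and -32768 + -1 clamps
 * to -32768. */
static __m128i adds_epi16_sketch(void)
{
    __m128i a = _mm_set1_epi16(32767);
    __m128i b = _mm_set1_epi16(1);
    return _mm_adds_epi16(a, b); /* every lane saturates to 32767 */
}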
-// -// FOR j := 0 to 15 -// i := j*8 -// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8 FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) { @@ -3232,9 +3887,9 @@ FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } -// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in -// b and saturates.. -// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx +// Add packed unsigned 8-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8 FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( @@ -3243,12 +3898,6 @@ FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) // Compute the bitwise AND of packed double-precision (64-bit) floating-point // elements in a and b, and store the results in dst. -// -// FOR j := 0 to 1 -// i := j*64 -// dst[i+63:i] := a[i+63:i] AND b[i+63:i] -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) { @@ -3256,12 +3905,9 @@ FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); } -// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in -// b. -// -// r := a & b -// -// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128 FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( @@ -3270,12 +3916,6 @@ FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) // Compute the bitwise NOT of packed double-precision (64-bit) floating-point // elements in a and then AND with b, and store the results in dst. -// -// FOR j := 0 to 1 -// i := j*64 -// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) { @@ -3284,12 +3924,9 @@ FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); } -// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the -// 128-bit value in a. -// -// r := (~a) & b -// -// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx +// Compute the bitwise NOT of 128 bits (representing integer data) in a and then +// AND with b, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128 FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( @@ -3297,30 +3934,18 @@ FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) vreinterpretq_s32_m128i(a))); // *NOTE* argument swap } -// Computes the average of the 8 unsigned 16-bit integers in a and the 8 -// unsigned 16-bit integers in b and rounds. -// -// r0 := (a0 + b0) / 2 -// r1 := (a1 + b1) / 2 -// ... 
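/* Illustrative sketch (not part of the patch), for _mm_avg_epu16 /
 * _mm_avg_epu8 below: both compute the rounding average (a + b + 1) >> 1,
 * which is exactly NEON's vrhaddq_* ("rounding halving add"), so
 * avg(1, 2) = 2, not 1. */
static __m128i avg_epu8_sketch(void)
{
    __m128i a = _mm_set1_epi8(1);
    __m128i b = _mm_set1_epi8(2);
    return _mm_avg_epu8(a, b); /* every lane is (1 + 2 + 1) >> 1 = 2 */
}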
-// r7 := (a7 + b7) / 2 -// -// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16 FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) { - return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), - vreinterpretq_u16_m128i(b)); + return vreinterpretq_m128i_u16( + vrhaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } -// Computes the average of the 16 unsigned 8-bit integers in a and the 16 -// unsigned 8-bit integers in b and rounds. -// -// r0 := (a0 + b0) / 2 -// r1 := (a1 + b1) / 2 -// ... -// r15 := (a15 + b15) / 2 -// -// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8 FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( @@ -3337,6 +3962,16 @@ FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128 #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm) +/* Cast Intrinsics - Zero-Cost Type Reinterpretation + * + * The _mm_cast* intrinsics reinterpret vector types (__m128, __m128d, __m128i) + * without generating any instructions. These are pure type annotations that + * perform bitwise reinterpretation, NOT value conversion. + * + * Maps to ARM NEON vreinterpret_* / vreinterpretq_* (also zero-cost bitcasts). + * https://developer.arm.com/architectures/instruction-sets/intrinsics/#q=vreinterpret + */ + // Cast vector of type __m128d to type __m128. This intrinsic is only used for // compilation and does not generate any instructions, thus it has zero latency. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps @@ -3361,9 +3996,9 @@ FORCE_INLINE __m128d _mm_castps_pd(__m128 a) return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); } -// Applies a type cast to reinterpret four 32-bit floating point values passed -// in as a 128-bit parameter as packed 32-bit integers. -// https://msdn.microsoft.com/en-us/library/bb514099.aspx +// Cast vector of type __m128 to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128 FORCE_INLINE __m128i _mm_castps_si128(__m128 a) { return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); @@ -3374,16 +4009,16 @@ FORCE_INLINE __m128i _mm_castps_si128(__m128 a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a)); #else return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a)); #endif } -// Applies a type cast to reinterpret four 32-bit integers passed in as a -// 128-bit parameter as packed 32-bit floating point values. -// https://msdn.microsoft.com/en-us/library/bb514029.aspx +// Cast vector of type __m128i to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. 
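/* Illustrative sketch (not part of the patch), for the cast intrinsics
 * above: casts only relabel the type; the 128-bit pattern is untouched, so a
 * float/integer round-trip is lossless and emits no instructions. */
static __m128 cast_roundtrip_sketch(__m128 v)
{
    __m128i bits = _mm_castps_si128(v); /* same bits, integer view */
    bits = _mm_and_si128(bits, bits);   /* integer bitwise ops on float data */
    return _mm_castsi128_ps(bits);      /* back to the float view, unchanged */
}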
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) { return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a)); @@ -3404,19 +4039,21 @@ FORCE_INLINE void _mm_clflush(void const *p) * compilation is successful. */ #if defined(__APPLE__) - sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE); -#elif defined(__GNUC__) || defined(__clang__) - uintptr_t ptr = (uintptr_t) p; - __builtin___clear_cache((char *) ptr, - (char *) ptr + SSE2NEON_CACHELINE_SIZE); -#else - /* FIXME: MSVC support */ + sys_icache_invalidate(_sse2neon_const_cast(void *, p), + SSE2NEON_CACHELINE_SIZE); +#elif SSE2NEON_COMPILER_GCC_COMPAT + uintptr_t ptr = _sse2neon_reinterpret_cast(uintptr_t, p); + __builtin___clear_cache( + _sse2neon_reinterpret_cast(char *, ptr), + _sse2neon_reinterpret_cast(char *, ptr) + SSE2NEON_CACHELINE_SIZE); +#elif SSE2NEON_COMPILER_MSVC && SSE2NEON_INCLUDE_WINDOWS_H + FlushInstructionCache(GetCurrentProcess(), p, SSE2NEON_CACHELINE_SIZE); #endif } -// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or -// unsigned 16-bit integers in b for equality. -// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx +// Compare packed 16-bit integers in a and b for equality, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16 FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( @@ -3424,16 +4061,17 @@ FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) } // Compare packed 32-bit integers in a and b for equality, and store the results -// in dst +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32 FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } -// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or -// unsigned 8-bit integers in b for equality. -// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx +// Compare packed 8-bit integers in a and b for equality, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8 FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( @@ -3445,15 +4083,22 @@ FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_u64( vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) - uint32x4_t cmp = - vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); - uint32x4_t swapped = vrev64q_u32(cmp); - return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = a0 == b0 ? 
~UINT64_C(0) : UINT64_C(0); + d[1] = a1 == b1 ? ~UINT64_C(0) : UINT64_C(0); + return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } @@ -3471,17 +4116,21 @@ FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_u64( vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 >= b1 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3493,54 +4142,43 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_cmpge_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } -// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers -// in b for greater than. -// -// r0 := (a0 > b0) ? 0xffff : 0x0 -// r1 := (a1 > b1) ? 0xffff : 0x0 -// ... -// r7 := (a7 > b7) ? 0xffff : 0x0 -// -// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx +// Compare packed signed 16-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16 FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } -// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers -// in b for greater than. -// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx +// Compare packed signed 32-bit integers in a and b for greater-than, and store +// the results in dst. 
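/* Illustrative sketch (not part of the patch), for the double-precision
 * compares above: each lane of the result is all-ones (~0) or all-zeros, so
 * compare results feed straight into bitwise selects. */
static __m128d select_ge_sketch(__m128d a, __m128d b)
{
    __m128d mask = _mm_cmpge_pd(a, b); /* ~0 where a >= b, else 0 */
    /* per-lane dst = mask ? a : b, built from AND / ANDNOT / OR */
    return _mm_or_pd(_mm_and_pd(mask, a), _mm_andnot_pd(mask, b));
}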
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32 FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } -// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers -// in b for greater than. -// -// r0 := (a0 > b0) ? 0xff : 0x0 -// r1 := (a1 > b1) ? 0xff : 0x0 -// ... -// r15 := (a15 > b15) ? 0xff : 0x0 -// -// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx +// Compare packed signed 8-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8 FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( @@ -3552,17 +4190,21 @@ FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_u64( vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 > b1 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3574,15 +4216,16 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_cmpgt_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 > b0 ? 
~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3594,17 +4237,21 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_u64( vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 <= b1 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3616,49 +4263,46 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_cmple_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } -// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers -// in b for less than. -// -// r0 := (a0 < b0) ? 0xffff : 0x0 -// r1 := (a1 < b1) ? 0xffff : 0x0 -// ... -// r7 := (a7 < b7) ? 0xffff : 0x0 -// -// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx +// Compare packed signed 16-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16 FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } - -// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers -// in b for less than. -// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx +// Compare packed signed 32-bit integers in a and b for less-than, and store the +// results in dst. 
Note: This intrinsic emits the pcmpgtd instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32 FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } -// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers -// in b for lesser than. -// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx +// Compare packed signed 8-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8 FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( @@ -3670,17 +4314,21 @@ FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_u64( vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 < b1 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3692,14 +4340,15 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_cmplt_pd(a, b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 < b0 ? 
~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3711,15 +4360,22 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64( vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))))); #else - // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) - uint32x4_t cmp = - vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); - uint32x4_t swapped = vrev64q_u32(cmp); - return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped))); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = a0 != b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 != b1 ? ~UINT64_C(0) : UINT64_C(0); + return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } @@ -3737,20 +4393,22 @@ FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_u64(veorq_u64( vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = - !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = - !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 >= b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 >= b1) ? 
~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3770,20 +4428,22 @@ FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_u64(veorq_u64( vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = - !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = - !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 > b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 > b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3803,20 +4463,22 @@ FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_u64(veorq_u64( vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = - !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = - !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 <= b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 <= b1) ? 
~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3836,20 +4498,22 @@ FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_u64(veorq_u64( vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = - !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = - !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 < b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 < b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3869,7 +4533,7 @@ FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 // Excluding NaNs, any two floating point numbers can be compared. uint64x2_t not_nan_a = vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); @@ -3877,19 +4541,17 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) - ? ~UINT64_C(0) - : UINT64_C(0); - d[1] = ((*(double *) &a1) == (*(double *) &a1) && - (*(double *) &b1) == (*(double *) &b1)) - ? ~UINT64_C(0) - : UINT64_C(0); + d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (a1 == a1 && b1 == b1) ? 
~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3901,17 +4563,15 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_cmpord_pd(a, b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) - ? ~UINT64_C(0) - : UINT64_C(0); + d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3923,7 +4583,7 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 // Two NaNs are not equal in comparison operation. uint64x2_t not_nan_a = vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); @@ -3932,19 +4592,17 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) return vreinterpretq_m128d_s32( vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b)))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) - ? UINT64_C(0) - : ~UINT64_C(0); - d[1] = ((*(double *) &a1) == (*(double *) &a1) && - (*(double *) &b1) == (*(double *) &b1)) - ? UINT64_C(0) - : ~UINT64_C(0); + d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0); + d[1] = (a1 == a1 && b1 == b1) ? 
UINT64_C(0) : ~UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3956,17 +4614,15 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_cmpunord_pd(a, b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) - ? UINT64_C(0) - : ~UINT64_C(0); + d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3978,13 +4634,13 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - - return (*(double *) &a0 >= *(double *) &b0); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return a0 >= b0; #endif } @@ -3993,13 +4649,14 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); - return (*(double *) &a0 > *(double *) &b0); + return a0 > b0; #endif } @@ -4008,13 +4665,14 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); - return (*(double *) &a0 <= *(double *) &b0); + return a0 <= b0; #endif } @@ -4023,13 +4681,14 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) { -#if 
defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); - return (*(double *) &a0 < *(double *) &b0); + return a0 < b0; #endif } @@ -4038,19 +4697,14 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1; #else - uint32x4_t a_not_nan = - vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a)); - uint32x4_t b_not_nan = - vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_eq_b = - vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); - uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan), - vreinterpretq_u64_u32(a_eq_b)); - return vgetq_lane_u64(and_results, 0) & 0x1; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return a0 == b0 ? 1 : 0; #endif } @@ -4064,29 +4718,24 @@ FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b) // Convert packed signed 32-bit integers in a to packed double-precision // (64-bit) floating-point elements, and store the results in dst. -// -// FOR j := 0 to 1 -// i := j*32 -// m := j*64 -// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))))); #else - double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); - double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1); + double a0 = _sse2neon_static_cast( + double, vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0)); + double a1 = _sse2neon_static_cast( + double, vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1)); return _mm_set_pd(a1, a0); #endif } -// Converts the four signed 32-bit integer values of a to single-precision, -// floating-point values -// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) { return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); @@ -4094,153 +4743,135 @@ FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. 
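// The hunks above replace pointer punning like `*(double *) &a0` (undefined
// behavior under strict aliasing) with named recast helpers. Their real
// definitions land earlier in the patch, outside this excerpt; a minimal
// sketch, assuming a memcpy-based implementation (names here are
// hypothetical):
#include <stdint.h>
#include <string.h>

static inline double recast_u64_f64_sketch(uint64_t u)
{
    double d;
    memcpy(&d, &u, sizeof d); // well-defined bit copy; typically a single move
    return d;
}

static inline int64_t recast_f64_s64_sketch(double d)
{
    int64_t i;
    memcpy(&i, &d, sizeof i);
    return i;
}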
-// -// FOR j := 0 to 1 -// i := 32*j -// k := 64*j -// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32 FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a) { -// vrnd32xq_f64 not supported on clang -#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__) - float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a)); - int64x2_t integers = vcvtq_s64_f64(rounded); - return vreinterpretq_m128i_s32( - vcombine_s32(vmovn_s64(integers), vdup_n_s32(0))); -#else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double d0 = ((double *) &rnd)[0]; - double d1 = ((double *) &rnd)[1]; - return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0); -#endif + double d0, d1; + d0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); + return _mm_set_epi32(0, 0, _sse2neon_cvtd_s32(d1), _sse2neon_cvtd_s32(d0)); } // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. -// -// FOR j := 0 to 1 -// i := 32*j -// k := 64*j -// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32 FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a) { __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double d0 = ((double *) &rnd)[0]; - double d1 = ((double *) &rnd)[1]; - int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1}; + double d0, d1; + d0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); + int32_t ALIGN_STRUCT(16) data[2] = { + _sse2neon_cvtd_s32(d0), + _sse2neon_cvtd_s32(d1), + }; return vreinterpret_m64_s32(vld1_s32(data)); } // Convert packed double-precision (64-bit) floating-point elements in a to // packed single-precision (32-bit) floating-point elements, and store the // results in dst. -// -// FOR j := 0 to 1 -// i := 32*j -// k := 64*j -// dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k]) -// ENDFOR -// dst[127:64] := 0 -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); #else - float a0 = (float) ((double *) &a)[0]; - float a1 = (float) ((double *) &a)[1]; - return _mm_set_ps(0, 0, a1, a0); + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_ps(0, 0, _sse2neon_static_cast(float, a1), + _sse2neon_static_cast(float, a0)); #endif } // Convert packed signed 32-bit integers in a to packed double-precision // (64-bit) floating-point elements, and store the results in dst. 
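// `_sse2neon_cvtd_s32`, used above, models x86 cvtsd2si behavior for inputs
// that have no int32 representation. A per-lane sketch of the assumed
// semantics (the patch's actual definition is outside this excerpt):
#include <stdint.h>

static inline int32_t cvtd_s32_sketch(double d)
{
    // x86 produces the "integer indefinite" value INT32_MIN for NaN and
    // for values outside the int32 range.
    if (!(d >= -2147483648.0) || d >= 2147483648.0)
        return INT32_MIN;
    return (int32_t) d; // in range: truncate; callers pre-round as needed
}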
-// -// FOR j := 0 to 1 -// i := j*32 -// m := j*64 -// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a)))); #else - double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0); - double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1); + double a0 = _sse2neon_static_cast( + double, vget_lane_s32(vreinterpret_s32_m64(a), 0)); + double a1 = _sse2neon_static_cast( + double, vget_lane_s32(vreinterpret_s32_m64(a), 1)); return _mm_set_pd(a1, a0); #endif } -// Converts the four single-precision, floating-point values of a to signed -// 32-bit integer values. -// -// r0 := (int) a0 -// r1 := (int) a1 -// r2 := (int) a2 -// r3 := (int) a3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// x86 returns INT32_MIN ("integer indefinite") for NaN and out-of-range values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32 // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A // does not support! It is supported on ARMv8-A however. FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) { #if defined(__ARM_FEATURE_FRINT) - return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a))); -#elif defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING) + float32x4_t f = vreinterpretq_f32_m128(a); + int32x4_t cvt = vcvtq_s32_f32(vrnd32xq_f32(f)); + return vreinterpretq_m128i_s32(_sse2neon_cvtps_epi32_fixup(f, cvt)); +#elif SSE2NEON_ARCH_AARCH64 || defined(__ARM_FEATURE_DIRECTED_ROUNDING) + float32x4_t f = vreinterpretq_f32_m128(a); + int32x4_t cvt; switch (_MM_GET_ROUNDING_MODE()) { case _MM_ROUND_NEAREST: - return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); + cvt = vcvtnq_s32_f32(f); + break; case _MM_ROUND_DOWN: - return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a)); + cvt = vcvtmq_s32_f32(f); + break; case _MM_ROUND_UP: - return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a)); + cvt = vcvtpq_s32_f32(f); + break; default: // _MM_ROUND_TOWARD_ZERO - return vreinterpretq_m128i_s32(vcvtq_s32_f32(a)); + cvt = vcvtq_s32_f32(f); + break; } + return vreinterpretq_m128i_s32(_sse2neon_cvtps_epi32_fixup(f, cvt)); #else - float *f = (float *) &a; + float *f = _sse2neon_reinterpret_cast(float *, &a); switch (_MM_GET_ROUNDING_MODE()) { case _MM_ROUND_NEAREST: { + float32x4_t fv = vreinterpretq_f32_m128(a); uint32x4_t signmask = vdupq_n_u32(0x80000000); - float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), - vdupq_n_f32(0.5f)); /* +/- 0.5 */ - int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( - vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ - int32x4_t r_trunc = vcvtq_s32_f32( - vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + float32x4_t half = + vbslq_f32(signmask, fv, vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = + vcvtq_s32_f32(vaddq_f32(fv, half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = vcvtq_s32_f32(fv); /* truncate to integer: [a] */ int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* 
([a] + {0,1}) & ~1 */ float32x4_t delta = vsubq_f32( - vreinterpretq_f32_m128(a), - vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + fv, vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ - return vreinterpretq_m128i_s32( - vbslq_s32(is_delta_half, r_even, r_normal)); + int32x4_t result = vbslq_s32(is_delta_half, r_even, r_normal); + return vreinterpretq_m128i_s32(_sse2neon_cvtps_epi32_fixup(fv, result)); } case _MM_ROUND_DOWN: - return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]), - floorf(f[0])); + return _mm_set_epi32( + _sse2neon_cvtf_s32(floorf(f[3])), _sse2neon_cvtf_s32(floorf(f[2])), + _sse2neon_cvtf_s32(floorf(f[1])), _sse2neon_cvtf_s32(floorf(f[0]))); case _MM_ROUND_UP: - return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), - ceilf(f[0])); + return _mm_set_epi32( + _sse2neon_cvtf_s32(ceilf(f[3])), _sse2neon_cvtf_s32(ceilf(f[2])), + _sse2neon_cvtf_s32(ceilf(f[1])), _sse2neon_cvtf_s32(ceilf(f[0]))); default: // _MM_ROUND_TOWARD_ZERO - return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1], - (int32_t) f[0]); + return _mm_set_epi32(_sse2neon_cvtf_s32(f[3]), _sse2neon_cvtf_s32(f[2]), + _sse2neon_cvtf_s32(f[1]), + _sse2neon_cvtf_s32(f[0])); } #endif } @@ -4248,79 +4879,59 @@ FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) // Convert packed single-precision (32-bit) floating-point elements in a to // packed double-precision (64-bit) floating-point elements, and store the // results in dst. -// -// FOR j := 0 to 1 -// i := 64*j -// k := 32*j -// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); #else - double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); - double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + double a0 = _sse2neon_static_cast( + double, vgetq_lane_f32(vreinterpretq_f32_m128(a), 0)); + double a1 = _sse2neon_static_cast( + double, vgetq_lane_f32(vreinterpretq_f32_m128(a), 1)); return _mm_set_pd(a1, a0); #endif } // Copy the lower double-precision (64-bit) floating-point element of a to dst. -// -// dst[63:0] := a[63:0] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64 FORCE_INLINE double _mm_cvtsd_f64(__m128d a) { -#if defined(__aarch64__) - return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); +#if SSE2NEON_ARCH_AARCH64 + return _sse2neon_static_cast(double, + vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0)); #else - return ((double *) &a)[0]; + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return _a; #endif } // Convert the lower double-precision (64-bit) floating-point element in a to a // 32-bit integer, and store the result in dst. 
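// The _MM_ROUND_NEAREST branch above emulates round-half-to-even using only
// truncating conversions. The same trick written out per scalar lane, for
// illustration:
#include <stdint.h>

static inline int32_t round_half_even_lane_sketch(float a)
{
    float half = a < 0.0f ? -0.5f : 0.5f;    // +/- 0.5, matching a's sign
    int32_t r_normal = (int32_t) (a + half); // rounds halfway cases away from 0
    int32_t r_trunc = (int32_t) a;           // [a], truncated toward zero
    int32_t plusone =
        (int32_t) ((uint32_t) -r_trunc >> 31); // 1 iff r_trunc > 0
    int32_t r_even = (r_trunc + plusone) & ~1; // nearest even neighbor
    float delta = a - (float) r_trunc;
    return delta == half ? r_even : r_normal; // exactly halfway? pick even
}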
-// -// dst[31:0] := Convert_FP64_To_Int32(a[63:0]) -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32 FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) { -#if defined(__aarch64__) - return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); -#else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double ret = ((double *) &rnd)[0]; - return (int32_t) ret; -#endif + double ret = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + return _sse2neon_cvtd_s32(ret); } // Convert the lower double-precision (64-bit) floating-point element in a to a // 64-bit integer, and store the result in dst. -// -// dst[63:0] := Convert_FP64_To_Int64(a[63:0]) -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64 FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) { -#if defined(__aarch64__) - return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); -#else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double ret = ((double *) &rnd)[0]; - return (int64_t) ret; -#endif + double ret = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + return _sse2neon_cvtd_s64(ret); } // Convert the lower double-precision (64-bit) floating-point element in a to a // 64-bit integer, and store the result in dst. -// -// dst[63:0] := Convert_FP64_To_Int64(a[63:0]) -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x #define _mm_cvtsd_si64x _mm_cvtsd_si64 @@ -4331,20 +4942,19 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128_f32(vsetq_lane_f32( vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0), vreinterpretq_f32_m128(a), 0)); #else - return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0], - vreinterpretq_f32_m128(a), 0)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return vreinterpretq_m128_f32(vsetq_lane_f32( + _sse2neon_static_cast(float, b0), vreinterpretq_f32_m128(a), 0)); #endif } // Copy the lower 32-bit integer in a to dst. -// -// dst[31:0] := a[31:0] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32 FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) { @@ -4352,9 +4962,6 @@ FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) } // Copy the lower 64-bit integer in a to dst. 
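// Usage sketch: with the unified path above, the scalar converts honor the
// current SSE rounding mode on both ARMv7 and AArch64 (to be compiled in a
// translation unit that includes sse2neon.h):
static inline int32_t demo_cvtsd_si32(void)
{
    __m128d v = _mm_set_sd(2.5);
    return _mm_cvtsd_si32(v); // 2 under the default round-to-nearest-even
}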
-// -// dst[63:0] := a[63:0] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64 FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) { @@ -4371,32 +4978,23 @@ FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b) { -#if defined(__aarch64__) - return vreinterpretq_m128d_f64( - vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); +#if SSE2NEON_ARCH_AARCH64 + return vreinterpretq_m128d_f64(vsetq_lane_f64( + _sse2neon_static_cast(double, b), vreinterpretq_f64_m128d(a), 0)); #else - double bf = (double) b; + int64_t _b = sse2neon_recast_f64_s64(_sse2neon_static_cast(double, b)); return vreinterpretq_m128d_s64( - vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); + vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0)); #endif } // Copy the lower 64-bit integer in a to dst. -// -// dst[63:0] := a[63:0] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) -// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, -// zero extending the upper bits. -// -// r0 := a -// r1 := 0x0 -// r2 := 0x0 -// r3 := 0x0 -// -// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx +// Copy 32-bit integer a to the lower elements of dst, and zero the upper +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128 FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) { return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); @@ -4408,21 +5006,19 @@ FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b) { -#if defined(__aarch64__) - return vreinterpretq_m128d_f64( - vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); +#if SSE2NEON_ARCH_AARCH64 + return vreinterpretq_m128d_f64(vsetq_lane_f64( + _sse2neon_static_cast(double, b), vreinterpretq_f64_m128d(a), 0)); #else - double bf = (double) b; + int64_t _b = sse2neon_recast_f64_s64(_sse2neon_static_cast(double, b)); return vreinterpretq_m128d_s64( - vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); + vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0)); #endif } -// Moves 64-bit integer a to the least significant 64 bits of an __m128 object, -// zero extending the upper bits. -// -// r0 := a -// r1 := 0x0 +// Copy 64-bit integer a to the lower element of dst, and zero the upper +// element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128 FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) { return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); @@ -4443,20 +5039,17 @@ FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) // double-precision (64-bit) floating-point element, store the result in the // lower element of dst, and copy the upper element from a to the upper element // of dst. 
-// -// dst[63:0] := Convert_FP32_To_FP64(b[31:0]) -// dst[127:64] := a[127:64] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) { - double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); -#if defined(__aarch64__) + double d = _sse2neon_static_cast( + double, vgetq_lane_f32(vreinterpretq_f32_m128(b), 0)); +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); #else - return vreinterpretq_m128d_s64( - vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0)); + return vreinterpretq_m128d_s64(vsetq_lane_s64( + sse2neon_recast_f64_s64(d), vreinterpretq_s64_m128d(a), 0)); #endif } @@ -4465,9 +5058,10 @@ FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) { - double a0 = ((double *) &a)[0]; - double a1 = ((double *) &a)[1]; - return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0); + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_epi32(0, 0, _sse2neon_cvtd_s32(a1), _sse2neon_cvtd_s32(a0)); } // Convert packed double-precision (64-bit) floating-point elements in a to @@ -4475,77 +5069,105 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) { - double a0 = ((double *) &a)[0]; - double a1 = ((double *) &a)[1]; - int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1}; +#if SSE2NEON_ARCH_AARCH64 + /* Vectorized AArch64 path - branchless, no memory round-trip */ + float64x2_t f = vreinterpretq_f64_m128d(a); + + /* Convert f64 to i64 with truncation toward zero. + * Out-of-range values produce undefined results, but we mask them below. + */ + int64x2_t i64 = vcvtq_s64_f64(f); + + /* Detect values outside INT32 range: >= 2147483648.0 or < -2147483648.0 + * x86 returns INT32_MIN (0x80000000) for these cases. + */ + float64x2_t max_f = vdupq_n_f64(2147483648.0); /* INT32_MAX + 1 */ + float64x2_t min_f = vdupq_n_f64(-2147483648.0); + uint64x2_t overflow = vorrq_u64(vcgeq_f64(f, max_f), vcltq_f64(f, min_f)); + + /* Detect NaN: a value is NaN if it's not equal to itself. + * Use XOR with all-ones since vmvnq_u64 doesn't exist. 
*/ + uint64x2_t eq_self = vceqq_f64(f, f); + uint64x2_t is_nan = veorq_u64(eq_self, vdupq_n_u64(UINT64_MAX)); + + /* Combine: any overflow or NaN should produce INT32_MIN */ + uint64x2_t need_indefinite = vorrq_u64(overflow, is_nan); + + /* Narrow i64 to i32 (simple truncation of upper 32 bits) */ + int32x2_t i32 = vmovn_s64(i64); + + /* Blend: select INT32_MIN where needed, otherwise use converted value */ + uint32x2_t mask32 = vmovn_u64(need_indefinite); + int32x2_t indefinite = vdup_n_s32(INT32_MIN); + return vreinterpret_m64_s32(vbsl_s32(mask32, indefinite, i32)); +#else + /* Scalar fallback for ARMv7 (no f64 SIMD support) */ + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + int32_t ALIGN_STRUCT(16) data[2] = {_sse2neon_cvtd_s32(a0), + _sse2neon_cvtd_s32(a1)}; return vreinterpret_m64_s32(vld1_s32(data)); +#endif } -// Converts the four single-precision, floating-point values of a to signed -// 32-bit integer values using truncate. -// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// x86 returns INT32_MIN ("integer indefinite") for NaN and out-of-range values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32 FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) { - return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); + float32x4_t f = vreinterpretq_f32_m128(a); + int32x4_t cvt = vcvtq_s32_f32(f); + return vreinterpretq_m128i_s32(_sse2neon_cvtps_epi32_fixup(f, cvt)); } // Convert the lower double-precision (64-bit) floating-point element in a to a // 32-bit integer with truncation, and store the result in dst. -// -// dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) { - double ret = *((double *) &a); - return (int32_t) ret; + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return _sse2neon_cvtd_s32(_a); } // Convert the lower double-precision (64-bit) floating-point element in a to a // 64-bit integer with truncation, and store the result in dst. -// -// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) { -#if defined(__aarch64__) - return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); -#else - double ret = *((double *) &a); - return (int64_t) ret; -#endif + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return _sse2neon_cvtd_s64(_a); } // Convert the lower double-precision (64-bit) floating-point element in a to a // 64-bit integer with truncation, and store the result in dst. -// -// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) // Divide packed double-precision (64-bit) floating-point elements in a by // packed elements in b, and store the results in dst. 
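// `_sse2neon_cvtps_epi32_fixup`, applied in the conversions above, is assumed
// to force x86's "integer indefinite" result wherever the raw NEON conversion
// would disagree. A per-lane model of that assumption:
#include <stdint.h>

static inline int32_t cvtps_epi32_fixup_lane_sketch(float f, int32_t cvt)
{
    if (!(f >= -2147483648.0f) || f >= 2147483648.0f)
        return INT32_MIN; // NaN or outside int32 range
    return cvt;           // NEON conversion already matches x86
}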
-// -// FOR j := 0 to 1 -// i := 64*j -// dst[i+63:i] := a[i+63:i] / b[i+63:i] -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = da[0] / db[0]; - c[1] = da[1] / db[1]; - return vld1q_f32((float32_t *) c); + c[0] = a0 / b0; + c[1] = a1 / b1; + return sse2neon_vld1q_f32_from_f64pair(c); #endif } @@ -4556,7 +5178,7 @@ FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 float64x2_t tmp = vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)); return vreinterpretq_m128d_f64( @@ -4566,36 +5188,35 @@ FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) #endif } -// Extracts the selected signed or unsigned 16-bit integer from a and zero -// extends. -// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx -// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) -#define _mm_extract_epi16(a, imm) \ - vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) - -// Inserts the least significant 16 bits of b into the selected 16-bit integer -// of a. -// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx -// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, -// __constrange(0,8) int imm) -#define _mm_insert_epi16(a, b, imm) \ - __extension__({ \ - vreinterpretq_m128i_s16( \ - vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \ - }) +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16 +// FORCE_INLINE int _mm_extract_epi16(__m128i a, const int imm) +// imm must be a compile-time constant in range [0, 7] +#define _mm_extract_epi16(a, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 7), \ + vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))) -// Loads two double-precision from 16-byte aligned memory, floating-point -// values. -// -// dst[127:0] := MEM[mem_addr+127:mem_addr] -// +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16 +// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, const int imm) +// imm must be a compile-time constant in range [0, 7] +#define _mm_insert_epi16(a, b, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 7), \ + vreinterpretq_m128i_s16( \ + vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm)))) + +// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from memory into dst. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. 
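// `SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 7)` above turns an out-of-range
// immediate into a compile-time error. One classic way to build such a guard
// (an assumption; the patch defines the real macro elsewhere):
#define REQUIRE_CONST_RANGE_SKETCH(imm, lo, hi) \
    ((void) sizeof(char[((imm) >= (lo) && (imm) <= (hi)) ? 1 : -1]))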
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd FORCE_INLINE __m128d _mm_load_pd(const double *p) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vld1q_f64(p)); #else - const float *fp = (const float *) p; + const float *fp = _sse2neon_reinterpret_cast(const float *, p); float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; return vreinterpretq_m128d_f32(vld1q_f32(data)); #endif @@ -4603,71 +5224,59 @@ FORCE_INLINE __m128d _mm_load_pd(const double *p) // Load a double-precision (64-bit) floating-point element from memory into both // elements of dst. -// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := MEM[mem_addr+63:mem_addr] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1 #define _mm_load_pd1 _mm_load1_pd // Load a double-precision (64-bit) floating-point element from memory into the // lower of dst, and zero the upper element. mem_addr does not need to be // aligned on any particular boundary. -// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := 0 -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd FORCE_INLINE __m128d _mm_load_sd(const double *p) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); #else - const float *fp = (const float *) p; + const float *fp = _sse2neon_reinterpret_cast(const float *, p); float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0}; return vreinterpretq_m128d_f32(vld1q_f32(data)); #endif } -// Loads 128-bit value. : -// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx +// Load 128-bits of integer data from memory into dst. mem_addr must be aligned +// on a 16-byte boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128 FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) { - return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); + return vreinterpretq_m128i_s32( + vld1q_s32(_sse2neon_reinterpret_cast(const int32_t *, p))); } // Load a double-precision (64-bit) floating-point element from memory into both // elements of dst. -// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := MEM[mem_addr+63:mem_addr] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd FORCE_INLINE __m128d _mm_load1_pd(const double *p) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); #else - return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p)); + return vreinterpretq_m128d_s64( + vdupq_n_s64(*_sse2neon_reinterpret_cast(const int64_t *, p))); #endif } // Load a double-precision (64-bit) floating-point element from memory into the // upper element of dst, and copy the lower element from a to dst. mem_addr does // not need to be aligned on any particular boundary. 
-// -// dst[63:0] := a[63:0] -// dst[127:64] := MEM[mem_addr+63:mem_addr] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); #else - return vreinterpretq_m128d_f32(vcombine_f32( - vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); + return vreinterpretq_m128d_f32( + vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(a)), + vld1_f32(_sse2neon_reinterpret_cast(const float *, p)))); #endif } @@ -4679,25 +5288,22 @@ FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) * lower 64 bits of the result, zeroing the upper 64 bits of the result. */ return vreinterpretq_m128i_s32( - vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); + vcombine_s32(vld1_s32(_sse2neon_reinterpret_cast(int32_t const *, p)), + vcreate_s32(0))); } // Load a double-precision (64-bit) floating-point element from memory into the // lower element of dst, and copy the upper element from a to dst. mem_addr does // not need to be aligned on any particular boundary. -// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := a[127:64] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); #else return vreinterpretq_m128d_f32( - vcombine_f32(vld1_f32((const float *) p), + vcombine_f32(vld1_f32(_sse2neon_reinterpret_cast(const float *, p)), vget_high_f32(vreinterpretq_f32_m128d(a)))); #endif } @@ -4705,18 +5311,14 @@ FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) // Load 2 double-precision (64-bit) floating-point elements from memory into dst // in reverse order. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. -// -// dst[63:0] := MEM[mem_addr+127:mem_addr+64] -// dst[127:64] := MEM[mem_addr+63:mem_addr] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd FORCE_INLINE __m128d _mm_loadr_pd(const double *p) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 float64x2_t v = vld1q_f64(p); return vreinterpretq_m128d_f64(vextq_f64(v, v, 1)); #else - int64x2_t v = vld1q_s64((const int64_t *) p); + int64x2_t v = vld1q_s64(_sse2neon_reinterpret_cast(const int64_t *, p)); return vreinterpretq_m128d_s64(vextq_s64(v, v, 1)); #endif } @@ -4728,38 +5330,33 @@ FORCE_INLINE __m128d _mm_loadu_pd(const double *p) return _mm_load_pd(p); } -// Loads 128-bit value. : -// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx +// Load 128-bits of integer data from memory into dst. mem_addr does not need to +// be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) { - return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); + return vreinterpretq_m128i_s32( + vld1q_s32(_sse2neon_reinterpret_cast(const unaligned_int32_t *, p))); } // Load unaligned 32-bit integer from memory into the first element of dst. 
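// `unaligned_int32_t`, used by the unaligned loads above, tells the compiler
// the pointee may be misaligned. A plausible definition (an assumption; the
// real typedef sits earlier in the patch):
#include <stdint.h>
#if defined(__GNUC__) || defined(__clang__)
typedef int32_t unaligned_int32_t_sketch __attribute__((aligned(1)));
#else
typedef int32_t unaligned_int32_t_sketch; // assume plain loads tolerate it
#endif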
-// -// dst[31:0] := MEM[mem_addr+31:mem_addr] -// dst[MAX:32] := 0 -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32 FORCE_INLINE __m128i _mm_loadu_si32(const void *p) { - return vreinterpretq_m128i_s32( - vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0)); + return vreinterpretq_m128i_s32(vsetq_lane_s32( + *_sse2neon_reinterpret_cast(const unaligned_int32_t *, p), + vdupq_n_s32(0), 0)); } -// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit -// integers from b. -// -// r0 := (a0 * b0) + (a1 * b1) -// r1 := (a2 * b2) + (a3 * b3) -// r2 := (a4 * b4) + (a5 * b5) -// r3 := (a6 * b6) + (a7 * b7) -// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Horizontally add adjacent pairs of intermediate +// 32-bit integers, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16 FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) { int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), vget_low_s16(vreinterpretq_s16_m128i(b))); -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 int32x4_t high = vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)); @@ -4783,25 +5380,25 @@ FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr) { int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7); - __m128 b = _mm_load_ps((const float *) mem_addr); + __m128 b = _mm_load_ps(_sse2neon_reinterpret_cast(const float *, mem_addr)); int8x16_t masked = vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128(b)); - vst1q_s8((int8_t *) mem_addr, masked); + vst1q_s8(_sse2neon_reinterpret_cast(int8_t *, mem_addr), masked); } -// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8 -// signed 16-bit integers from b. -// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16 FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } -// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the -// 16 unsigned 8-bit integers from b. -// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. 
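// Usage sketch for _mm_madd_epi16 above: each 32-bit lane is the sum of two
// adjacent 16-bit products (compiled with sse2neon.h included):
static inline __m128i demo_madd_epi16(void)
{
    __m128i a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
    __m128i b = _mm_set1_epi16(1);
    return _mm_madd_epi16(a, b); // 32-bit lanes: {1+2, 3+4, 5+6, 7+8}
}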
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8 FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( @@ -4813,7 +5410,7 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 #if SSE2NEON_PRECISE_MINMAX float64x2_t _a = vreinterpretq_f64_m128d(a); float64x2_t _b = vreinterpretq_f64_m128d(b); @@ -4823,15 +5420,19 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #endif #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); - uint64_t d[2]; - d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0; - d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + int64_t d[2]; + d[0] = a0 > b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0); + d[1] = a1 > b1 ? sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1); - return vreinterpretq_m128d_u64(vld1q_u64(d)); + return vreinterpretq_m128d_s64(vld1q_s64(d)); #endif } @@ -4841,28 +5442,30 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_max_pd(a, b)); #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]}; - return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2] = {a0 > b0 ? a0 : b0, a1}; + return vreinterpretq_m128d_f32(sse2neon_vld1q_f32_from_f64pair(c)); #endif } -// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 -// signed 16-bit integers from b. -// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16 FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } -// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the -// 16 unsigned 8-bit integers from b. -// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. 
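// The scalar fallback in _mm_max_pd above preserves x86 maxpd's NaN rule:
// `a > b` is false for NaN, so the second operand is returned. Usage sketch:
#include <math.h>

static inline __m128d demo_max_pd_nan(void)
{
    __m128d a = _mm_set_pd(NAN, 1.0); // high lane NaN, low lane 1.0
    __m128d b = _mm_set_pd(2.0, 3.0);
    return _mm_max_pd(a, b); // lanes {3.0, 2.0}; the NaN lane yields b's value
}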
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8 FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( @@ -4874,7 +5477,7 @@ FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 #if SSE2NEON_PRECISE_MINMAX float64x2_t _a = vreinterpretq_f64_m128d(a); float64x2_t _b = vreinterpretq_f64_m128d(b); @@ -4884,14 +5487,18 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #endif #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); - uint64_t d[2]; - d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0; - d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1; - return vreinterpretq_m128d_u64(vld1q_u64(d)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + int64_t d[2]; + d[0] = a0 < b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0); + d[1] = a1 < b1 ? sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1); + return vreinterpretq_m128d_s64(vld1q_s64(d)); #endif } @@ -4901,22 +5508,20 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_min_pd(a, b)); #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]}; - return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2] = {a0 < b0 ? a0 : b0, a1}; + return vreinterpretq_m128d_f32(sse2neon_vld1q_f32_from_f64pair(c)); #endif } // Copy the lower 64-bit integer in a to the lower element of dst, and zero the // upper element. -// -// dst[63:0] := a[63:0] -// dst[127:64] := 0 -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64 FORCE_INLINE __m128i _mm_move_epi64(__m128i a) { @@ -4927,10 +5532,6 @@ FORCE_INLINE __m128i _mm_move_epi64(__m128i a) // Move the lower double-precision (64-bit) floating-point element from b to the // lower element of dst, and copy the upper element from a to the upper element // of dst. 
-// -// dst[63:0] := b[63:0] -// dst[127:64] := a[127:64] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) { @@ -4939,88 +5540,81 @@ FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) vget_high_f32(vreinterpretq_f32_m128d(a)))); } -// NEON does not provide a version of this function. -// Creates a 16-bit mask from the most significant bits of the 16 signed or -// unsigned 8-bit integers in a and zero extends the upper bits. -// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8 +// +// Input (__m128i): 16 bytes, extract bit 7 (MSB) of each +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// |0|1|2|3|4|5|6|7|8|9|A|B|C|D|E|F| byte index +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// | ... | +// MSB MSB +// v v v v v v v v v v v v v v v +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// |0|1|2|3|4|5|6|7|8|9|A|B|C|D|E|F| bit position in result +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +// |<-- low byte ->|<-- high byte->| +// +// Output (int): 16-bit mask where bit[i] = MSB of input byte[i] FORCE_INLINE int _mm_movemask_epi8(__m128i a) { - // Use increasingly wide shifts+adds to collect the sign bits - // together. - // Since the widening shifts would be rather confusing to follow in little - // endian, everything will be illustrated in big endian order instead. This - // has a different result - the bits would actually be reversed on a big - // endian machine. - - // Starting input (only half the elements are shown): - // 89 ff 1d c0 00 10 99 33 uint8x16_t input = vreinterpretq_u8_m128i(a); - // Shift out everything but the sign bits with an unsigned shift right. +#if SSE2NEON_ARCH_AARCH64 + // AArch64: Variable shift + horizontal add (vaddv). + // + // Step 1: Extract MSB of each byte (vshr #7: 0x80->1, 0x7F->0) + uint8x16_t msbs = vshrq_n_u8(input, 7); + + // Step 2: Shift each byte left by its bit position (0-7 per half) // - // Bytes of the vector:: - // 89 ff 1d c0 00 10 99 33 - // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) - // | | | | | | | | - // 01 01 00 01 00 00 01 00 + // msbs: [ 1 ][ 0 ][ 1 ][ 1 ][ 0 ][ 1 ][ 0 ][ 1 ] (example) + // shifts: [ 0 ][ 1 ][ 2 ][ 3 ][ 4 ][ 5 ][ 6 ][ 7 ] + // | | | | | | | | + // <<0 <<1 <<2 <<3 <<4 <<5 <<6 <<7 + // v v v v v v v v + // result: [0x01][0x00][0x04][0x08][0x00][0x20][0x00][0x80] // - // Bits of first important lane(s): - // 10001001 (89) - // \______ - // | - // 00000001 (01) - uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); - - // Merge the even lanes together with a 16-bit unsigned shift right + add. - // 'xx' represents garbage data which will be ignored in the final result. - // In the important bytes, the add functions like a binary OR. + // Horizontal sum: 0x01+0x04+0x08+0x20+0x80 = 0xAD = 0b10101101 + // Each bit in sum corresponds to one input byte's MSB. + static const int8_t shift_table[16] = {0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7}; + int8x16_t shifts = vld1q_s8(shift_table); + uint8x16_t positioned = vshlq_u8(msbs, shifts); + + // Step 3: Sum each half -> bits [7:0] and [15:8] + return vaddv_u8(vget_low_u8(positioned)) | + (vaddv_u8(vget_high_u8(positioned)) << 8); +#else + // ARMv7: Shift-right-accumulate (no vaddv). 
// - // 01 01 00 01 00 00 01 00 - \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7)) - \| \| \| \| - xx 03 xx 01 xx 00 xx 02 + // Step 1: Extract MSB of each byte + uint8x16_t msbs = vshrq_n_u8(input, 7); + uint64x2_t bits = vreinterpretq_u64_u8(msbs); + + // Step 2: Parallel bit collection via shift-right-accumulate // - // 00000001 00000001 (01 01) - // \_______ | - // \| - // xxxxxxxx xxxxxx11 (xx 03) - uint32x4_t paired16 = - vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); - - // Repeat with a wider 32-bit shift + add. - // xx 03 xx 01 xx 00 xx 02 - // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> - // 14)) - // \| \| - // xx xx xx 0d xx xx xx 02 + // Initial (8 bytes shown, byte 0 first, little endian): + // byte: [ 0 ][ 1 ][ 2 ][ 3 ][ 4 ][ 5 ][ 6 ][ 7 ] + // value: [ 01 ][ 00 ][ 01 ][ 01 ][ 00 ][ 01 ][ 00 ][ 01 ] // - // 00000011 00000001 (03 01) - // \\_____ || - // '----.\|| - // xxxxxxxx xxxx1101 (xx 0d) - uint64x2_t paired32 = - vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); - - // Last, an even wider 64-bit shift + add to get our result in the low 8 bit - // lanes. xx xx xx 0d xx xx xx 02 - // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> - // 28)) - // \| - // xx xx xx xx xx xx xx d2 + // vsra(..., 7): add original + (original >> 7) + // byte 0 gains bit 1 = b1, byte 1 gains bit 1 = b2, ... + // so every byte j now holds the pair b(j+1):b(j) in bits [1:0] // - // 00001101 00000010 (0d 02) - // \ \___ | | - // '---. \| | - // xxxxxxxx 11010010 (xx d2) - uint8x16_t paired64 = - vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + // vsra(..., 14): byte j gains bits [3:2] = b(j+3):b(j+2); byte 0 now + // holds b3:b2:b1:b0 + // vsra(..., 28): byte 0 of each 64-bit half gains bits [7:4] and ends + // up holding that half's full 8-bit mask + bits = vsraq_n_u64(bits, bits, 7); + bits = vsraq_n_u64(bits, bits, 14); + bits = vsraq_n_u64(bits, bits, 28); - // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts. - // xx xx xx xx xx xx xx d2 - // || return paired64[0] - // d2 - // Note: Little endian would return the correct value 4b (01001011) instead. - return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); + // Step 3: Extract packed result from byte 0 of each half + uint8x16_t output = vreinterpretq_u8_u64(bits); + return vgetq_lane_u8(output, 0) | (vgetq_lane_u8(output, 8) << 8); +#endif } // Set each bit of mask dst based on the most significant bit of the @@ -5030,13 +5624,11 @@ FORCE_INLINE int _mm_movemask_pd(__m128d a) { uint64x2_t input = vreinterpretq_u64_m128d(a); uint64x2_t high_bits = vshrq_n_u64(input, 63); - return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1); + return _sse2neon_static_cast(int, vgetq_lane_u64(high_bits, 0) | + (vgetq_lane_u64(high_bits, 1) << 1)); } // Copy the lower 64-bit integer in a to dst. -// -// dst[63:0] := a[63:0] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64 FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) { @@ -5045,10 +5637,6 @@ FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) // Copy the 64-bit integer a to the lower element of dst, and zero the upper // element. 
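// Usage sketch for _mm_movemask_epi8 above: bit i of the result mirrors the
// MSB of byte i.
static inline int demo_movemask_epi8(void)
{
    __m128i v = _mm_set_epi8(-1, 0, 0, 0, 0, 0, 0, 0,
                             0, 0, 0, 0, 0, 0, 0, -1);
    return _mm_movemask_epi8(v); // 0x8001: bytes 0 and 15 have their MSB set
}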
-// -// dst[63:0] := a[63:0] -// dst[127:64] := 0 -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64 FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) { @@ -5058,9 +5646,7 @@ FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) // Multiply the low unsigned 32-bit integers from each packed 64-bit element in // a and b, and store the unsigned 64-bit results in dst. -// -// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF) -// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF) +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32 FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) { // vmull_u32 upcasts instead of masking, so we downcast. @@ -5074,16 +5660,22 @@ FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = da[0] * db[0]; - c[1] = da[1] * db[1]; - return vld1q_f32((float32_t *) c); + c[0] = a0 * b0; + c[1] = a1 * b1; + return sse2neon_vld1q_f32_from_f64pair(c); #endif } @@ -5098,9 +5690,6 @@ FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) // Multiply the low unsigned 32-bit integers from a and b, and store the // unsigned 64-bit result in dst. -// -// dst[63:0] := a[31:0] * b[31:0] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32 FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) { @@ -5108,21 +5697,15 @@ FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); } -// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit -// integers from b. -// -// r0 := (a0 * b0)[31:16] -// r1 := (a1 * b1)[31:16] -// ... -// r7 := (a7 * b7)[31:16] -// -// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx +// Multiply the packed signed 16-bit integers in a and b, producing intermediate +// 32-bit integers, and store the high 16 bits of the intermediate integers in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16 FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) { - /* FIXME: issue with large values because of result saturation */ - // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), - // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return - // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1)); + // vmull_s16 is used instead of vqdmulhq_s16 to avoid saturation issues + // with large values (e.g., -32768 * -32768). vmull_s16 produces full 32-bit + // products without saturation. 
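// The saturation issue noted above, concretely: with the old vqdmulhq_s16
// idea, (-32768 * -32768) doubles to 2^31 and saturates to INT32_MAX, so the
// returned high half would be 0x3FFF instead of the correct 0x4000.
static inline __m128i demo_mulhi_epi16_edge(void)
{
    __m128i a = _mm_set1_epi16(-32768);
    return _mm_mulhi_epi16(a, a); // every lane: 0x4000 via the vmull_s16 path
}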
int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a)); int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b)); int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ @@ -5143,7 +5726,7 @@ FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a)); uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b)); uint32x4_t ab3210 = vmull_u16(a3210, b3210); -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 uint32x4_t ab7654 = vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), @@ -5159,15 +5742,9 @@ FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) #endif } -// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or -// unsigned 16-bit integers from b. -// -// r0 := (a0 * b0)[15:0] -// r1 := (a1 * b1)[15:0] -// ... -// r7 := (a7 * b7)[15:0] -// -// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx +// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit +// integers, and store the low 16 bits of the intermediate integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16 FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( @@ -5183,20 +5760,18 @@ FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); } -// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. -// -// r := a | b -// -// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx +// Compute the bitwise OR of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128 FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } -// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and -// saturates. -// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx +// Convert packed signed 16-bit integers from a and b to packed 8-bit integers +// using signed saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16 FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( @@ -5204,19 +5779,9 @@ FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) vqmovn_s16(vreinterpretq_s16_m128i(b)))); } -// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers -// and saturates. -// -// r0 := SignedSaturate(a0) -// r1 := SignedSaturate(a1) -// r2 := SignedSaturate(a2) -// r3 := SignedSaturate(a3) -// r4 := SignedSaturate(b0) -// r5 := SignedSaturate(b1) -// r6 := SignedSaturate(b2) -// r7 := SignedSaturate(b3) -// -// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx +// Convert packed signed 32-bit integers from a and b to packed 16-bit integers +// using signed saturation, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
 FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
 {
     return vreinterpretq_m128i_s16(
@@ -5224,19 +5789,9 @@ FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
             vqmovn_s32(vreinterpretq_s32_m128i(b))));
 }
 
-// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned
-// integers and saturates.
-//
-// r0 := UnsignedSaturate(a0)
-// r1 := UnsignedSaturate(a1)
-// ...
-// r7 := UnsignedSaturate(a7)
-// r8 := UnsignedSaturate(b0)
-// r9 := UnsignedSaturate(b1)
-// ...
-// r15 := UnsignedSaturate(b7)
-//
-// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
+// using unsigned saturation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
 FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
 {
     return vreinterpretq_m128i_u8(
@@ -5249,9 +5804,14 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
 // 'yield' instruction isn't a good fit because it's effectively a nop on most
 // Arm cores. Experience with several databases has shown an 'isb' is
 // a reasonable approximation.
-FORCE_INLINE void _mm_pause()
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
+FORCE_INLINE void _mm_pause(void)
 {
+#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG
+    __isb(_ARM64_BARRIER_SY);
+#else
     __asm__ __volatile__("isb\n");
+#endif
 }
 
 // Compute the absolute differences of packed unsigned 8-bit integers in a and
@@ -5261,12 +5821,13 @@ FORCE_INLINE void _mm_pause()
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
 FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
 {
-    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
+    uint16x8_t t = vpaddlq_u8(
+        vabdq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
     return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
 }
 
-// Sets the 8 signed 16-bit integer values.
-// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
+// Set packed 16-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
 FORCE_INLINE __m128i _mm_set_epi16(short i7,
                                    short i6,
                                    short i5,
@@ -5280,33 +5841,31 @@ FORCE_INLINE __m128i _mm_set_epi16(short i7,
     return vreinterpretq_m128i_s16(vld1q_s16(data));
 }
 
-// Sets the 4 signed 32-bit integer values.
-// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
+// Set packed 32-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
 FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
 {
     int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
     return vreinterpretq_m128i_s32(vld1q_s32(data));
 }
 
-// Returns the __m128i structure with its two 64-bit integer values
-// initialized to the values of the two 64-bit integers passed in.
-// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
+// Set packed 64-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64 FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) { - return _mm_set_epi64x((int64_t) i1, (int64_t) i2); + return _mm_set_epi64x(vget_lane_s64(i1, 0), vget_lane_s64(i2, 0)); } -// Returns the __m128i structure with its two 64-bit integer values -// initialized to the values of the two 64-bit integers passed in. -// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx +// Set packed 64-bit integers in dst with the supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) { return vreinterpretq_m128i_s64( vcombine_s64(vcreate_s64(i2), vcreate_s64(i1))); } -// Sets the 16 signed 8-bit integer values. -// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx +// Set packed 8-bit integers in dst with the supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8 FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, @@ -5324,12 +5883,16 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b1, signed char b0) { - int8_t ALIGN_STRUCT(16) - data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, - (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, - (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, - (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; - return (__m128i) vld1q_s8(data); + int8_t ALIGN_STRUCT(16) data[16] = { + _sse2neon_static_cast(int8_t, b0), _sse2neon_static_cast(int8_t, b1), + _sse2neon_static_cast(int8_t, b2), _sse2neon_static_cast(int8_t, b3), + _sse2neon_static_cast(int8_t, b4), _sse2neon_static_cast(int8_t, b5), + _sse2neon_static_cast(int8_t, b6), _sse2neon_static_cast(int8_t, b7), + _sse2neon_static_cast(int8_t, b8), _sse2neon_static_cast(int8_t, b9), + _sse2neon_static_cast(int8_t, b10), _sse2neon_static_cast(int8_t, b11), + _sse2neon_static_cast(int8_t, b12), _sse2neon_static_cast(int8_t, b13), + _sse2neon_static_cast(int8_t, b14), _sse2neon_static_cast(int8_t, b15)}; + return vreinterpretq_m128i_s8(vld1q_s8(data)); } // Set packed double-precision (64-bit) floating-point elements in dst with the @@ -5338,10 +5901,11 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15, FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) { double ALIGN_STRUCT(16) data[2] = {e0, e1}; -#if defined(__aarch64__) - return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); +#if SSE2NEON_ARCH_AARCH64 + return vreinterpretq_m128d_f64( + vld1q_f64(_sse2neon_reinterpret_cast(float64_t *, data))); #else - return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); + return vreinterpretq_m128d_f32(sse2neon_vld1q_f32_from_f64pair(data)); #endif } @@ -5355,61 +5919,43 @@ FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd FORCE_INLINE __m128d _mm_set_sd(double a) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0)); #else return _mm_set_pd(0, a); #endif } -// Sets the 8 signed 16-bit integer values to w. -// -// r0 := w -// r1 := w -// ... -// r7 := w -// -// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx +// Broadcast 16-bit integer a to all elements of dst. 
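One convention worth spelling out for the _mm_set_* family above: arguments are passed highest element first, which is why each body loads a data[] array written in reversed order. A scalar model (lane 0 is the lowest element):

    #include <stdint.h>

    typedef struct { int32_t lane[4]; } m128i_model;  // lane[0] = element 0

    // _mm_set_epi32(e3, e2, e1, e0): e0 becomes the lowest element.
    static m128i_model set_epi32_model(int32_t e3, int32_t e2,
                                       int32_t e1, int32_t e0)
    {
        m128i_model r = { { e0, e1, e2, e3 } };  // reversed vs argument order
        return r;
    }

The _mm_setr_* variants further down take the same values already in memory order, so their data[] arrays are not reversed.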
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16 FORCE_INLINE __m128i _mm_set1_epi16(short w) { return vreinterpretq_m128i_s16(vdupq_n_s16(w)); } -// Sets the 4 signed 32-bit integer values to i. -// -// r0 := i -// r1 := i -// r2 := i -// r3 := I -// -// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx +// Broadcast 32-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32 FORCE_INLINE __m128i _mm_set1_epi32(int _i) { return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); } -// Sets the 2 signed 64-bit integer values to i. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100) +// Broadcast 64-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64 FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) { - return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i)); + return vreinterpretq_m128i_s64(vdupq_lane_s64(_i, 0)); } -// Sets the 2 signed 64-bit integer values to i. +// Broadcast 64-bit integer a to all elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) { return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); } -// Sets the 16 signed 8-bit integer values to b. -// -// r0 := b -// r1 := b -// ... -// r15 := b -// -// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx +// Broadcast 8-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8 FORCE_INLINE __m128i _mm_set1_epi8(signed char w) { return vreinterpretq_m128i_s8(vdupq_n_s8(w)); @@ -5420,20 +5966,16 @@ FORCE_INLINE __m128i _mm_set1_epi8(signed char w) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd FORCE_INLINE __m128d _mm_set1_pd(double d) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vdupq_n_f64(d)); #else - return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); + int64_t _d = sse2neon_recast_f64_s64(d); + return vreinterpretq_m128d_s64(vdupq_n_s64(_d)); #endif } -// Sets the 8 signed 16-bit integer values in reverse order. -// -// Return Value -// r0 := w0 -// r1 := w1 -// ... -// r7 := w7 +// Set packed 16-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16 FORCE_INLINE __m128i _mm_setr_epi16(short w0, short w1, short w2, @@ -5444,11 +5986,12 @@ FORCE_INLINE __m128i _mm_setr_epi16(short w0, short w7) { int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; - return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); + return vreinterpretq_m128i_s16( + vld1q_s16(_sse2neon_reinterpret_cast(int16_t *, data))); } -// Sets the 4 signed 32-bit integer values in reverse order -// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx +// Set packed 32-bit integers in dst with the supplied values in reverse order. 
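The change from vdupq_n_s64((int64_t) _i) to vdupq_lane_s64(_i, 0) in _mm_set1_epi64 above is presumably about portability: __m64 is a NEON vector type (int64x1_t) in this header, and casting it straight to int64_t only compiles under the GCC/Clang vector extensions. Reading the lane explicitly works everywhere (sketch, assuming <arm_neon.h> on a NEON target):

    #include <arm_neon.h>

    static int64x2_t broadcast_m64_model(int64x1_t v)
    {
        return vdupq_lane_s64(v, 0);  // broadcast lane 0 to both lanes
    }

    static int64_t read_m64_model(int64x1_t v)
    {
        return vget_lane_s64(v, 0);   // scalar extraction, no vector cast
    }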
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32 FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) { int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; @@ -5462,8 +6005,8 @@ FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); } -// Sets the 16 signed 8-bit integer values in reverse order. -// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx +// Set packed 8-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8 FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, signed char b1, signed char b2, @@ -5481,12 +6024,16 @@ FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, signed char b14, signed char b15) { - int8_t ALIGN_STRUCT(16) - data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, - (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, - (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, - (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; - return (__m128i) vld1q_s8(data); + int8_t ALIGN_STRUCT(16) data[16] = { + _sse2neon_static_cast(int8_t, b0), _sse2neon_static_cast(int8_t, b1), + _sse2neon_static_cast(int8_t, b2), _sse2neon_static_cast(int8_t, b3), + _sse2neon_static_cast(int8_t, b4), _sse2neon_static_cast(int8_t, b5), + _sse2neon_static_cast(int8_t, b6), _sse2neon_static_cast(int8_t, b7), + _sse2neon_static_cast(int8_t, b8), _sse2neon_static_cast(int8_t, b9), + _sse2neon_static_cast(int8_t, b10), _sse2neon_static_cast(int8_t, b11), + _sse2neon_static_cast(int8_t, b12), _sse2neon_static_cast(int8_t, b13), + _sse2neon_static_cast(int8_t, b14), _sse2neon_static_cast(int8_t, b15)}; + return vreinterpretq_m128i_s8(vld1q_s8(data)); } // Set packed double-precision (64-bit) floating-point elements in dst with the @@ -5501,27 +6048,29 @@ FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd FORCE_INLINE __m128d _mm_setzero_pd(void) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vdupq_n_f64(0)); #else return vreinterpretq_m128d_f32(vdupq_n_f32(0)); #endif } -// Sets the 128-bit value to zero -// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx +// Return vector of type __m128i with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128 FORCE_INLINE __m128i _mm_setzero_si128(void) { return vreinterpretq_m128i_s32(vdupq_n_s32(0)); } -// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. -// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx -// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, -// __constrange(0,255) int imm) -#ifdef _sse2neon_shuffle +// Shuffle 32-bit integers in a using the control in imm8, and store the results +// in dst. 
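The imm8 control for _mm_shuffle_epi32 packs four 2-bit source-lane selectors, destination lane 0 in the low bits; _MM_SHUFFLE(z, y, x, w) builds (z << 6) | (y << 4) | (x << 2) | w. A scalar model of the selection:

    #include <stdio.h>

    static void shuffle_epi32_model(const int src[4], int dst[4], int imm8)
    {
        for (int j = 0; j < 4; j++)
            dst[j] = src[(imm8 >> (2 * j)) & 0x3];
    }

    int main(void)
    {
        int a[4] = { 10, 11, 12, 13 }, r[4];
        shuffle_epi32_model(a, r, 0x4E);  // 0x4E == _MM_SHUFFLE(1, 0, 3, 2)
        printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]);  // 12 13 10 11
        return 0;
    }

That control value is the _MM_SHUFFLE(1, 0, 3, 2) case routed to _mm_shuffle_epi_1032 in the switch below.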
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32 +// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, const int imm) +// imm must be a compile-time constant in range [0, 255] +#if defined(_sse2neon_shuffle) #define _mm_shuffle_epi32(a, imm) \ __extension__({ \ + SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ int32x4_t _input = vreinterpretq_s32_m128i(a); \ int32x4_t _shuf = \ vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ @@ -5529,84 +6078,84 @@ FORCE_INLINE __m128i _mm_setzero_si128(void) vreinterpretq_m128i_s32(_shuf); \ }) #else // generic -#define _mm_shuffle_epi32(a, imm) \ - __extension__({ \ - __m128i ret; \ - switch (imm) { \ - case _MM_SHUFFLE(1, 0, 3, 2): \ - ret = _mm_shuffle_epi_1032((a)); \ - break; \ - case _MM_SHUFFLE(2, 3, 0, 1): \ - ret = _mm_shuffle_epi_2301((a)); \ - break; \ - case _MM_SHUFFLE(0, 3, 2, 1): \ - ret = _mm_shuffle_epi_0321((a)); \ - break; \ - case _MM_SHUFFLE(2, 1, 0, 3): \ - ret = _mm_shuffle_epi_2103((a)); \ - break; \ - case _MM_SHUFFLE(1, 0, 1, 0): \ - ret = _mm_shuffle_epi_1010((a)); \ - break; \ - case _MM_SHUFFLE(1, 0, 0, 1): \ - ret = _mm_shuffle_epi_1001((a)); \ - break; \ - case _MM_SHUFFLE(0, 1, 0, 1): \ - ret = _mm_shuffle_epi_0101((a)); \ - break; \ - case _MM_SHUFFLE(2, 2, 1, 1): \ - ret = _mm_shuffle_epi_2211((a)); \ - break; \ - case _MM_SHUFFLE(0, 1, 2, 2): \ - ret = _mm_shuffle_epi_0122((a)); \ - break; \ - case _MM_SHUFFLE(3, 3, 3, 2): \ - ret = _mm_shuffle_epi_3332((a)); \ - break; \ - case _MM_SHUFFLE(0, 0, 0, 0): \ - ret = _mm_shuffle_epi32_splat((a), 0); \ - break; \ - case _MM_SHUFFLE(1, 1, 1, 1): \ - ret = _mm_shuffle_epi32_splat((a), 1); \ - break; \ - case _MM_SHUFFLE(2, 2, 2, 2): \ - ret = _mm_shuffle_epi32_splat((a), 2); \ - break; \ - case _MM_SHUFFLE(3, 3, 3, 3): \ - ret = _mm_shuffle_epi32_splat((a), 3); \ - break; \ - default: \ - ret = _mm_shuffle_epi32_default((a), (imm)); \ - break; \ - } \ - ret; \ - }) +#define _mm_shuffle_epi32(a, imm) \ + _sse2neon_define1( \ + __m128i, a, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); __m128i ret; \ + switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_epi_1032(_a); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_epi_2301(_a); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_epi_0321(_a); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_epi_2103(_a); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_shuffle_epi_1010(_a); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_epi_1001(_a); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_epi_0101(_a); \ + break; \ + case _MM_SHUFFLE(2, 2, 1, 1): \ + ret = _mm_shuffle_epi_2211(_a); \ + break; \ + case _MM_SHUFFLE(0, 1, 2, 2): \ + ret = _mm_shuffle_epi_0122(_a); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 2): \ + ret = _mm_shuffle_epi_3332(_a); \ + break; \ + case _MM_SHUFFLE(0, 0, 0, 0): \ + ret = _mm_shuffle_epi32_splat(_a, 0); \ + break; \ + case _MM_SHUFFLE(1, 1, 1, 1): \ + ret = _mm_shuffle_epi32_splat(_a, 1); \ + break; \ + case _MM_SHUFFLE(2, 2, 2, 2): \ + ret = _mm_shuffle_epi32_splat(_a, 2); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 3): \ + ret = _mm_shuffle_epi32_splat(_a, 3); \ + break; \ + default: \ + ret = _mm_shuffle_epi32_default(_a, (imm)); \ + break; \ + } _sse2neon_return(ret);) #endif // Shuffle double-precision (64-bit) floating-point elements using the control // in imm8, and store the results in dst. -// -// dst[63:0] := (imm8[0] == 0) ? 
a[63:0] : a[127:64] -// dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd +// imm8 must be a compile-time constant in range [0, 3] #ifdef _sse2neon_shuffle -#define _mm_shuffle_pd(a, b, imm8) \ - vreinterpretq_m128d_s64( \ - vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \ - imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2)) +#define _mm_shuffle_pd(a, b, imm8) \ + __extension__({ \ + SSE2NEON_REQUIRE_CONST_RANGE(imm8, 0, 3); \ + vreinterpretq_m128d_s64(vshuffleq_s64( \ + vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \ + (imm8) & 0x1, (((imm8) & 0x2) >> 1) + 2)); \ + }) #else -#define _mm_shuffle_pd(a, b, imm8) \ - _mm_castsi128_pd(_mm_set_epi64x( \ - vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ - vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) +#define _mm_shuffle_pd(a, b, imm8) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm8, 0, 3), \ + _mm_castsi128_pd(_mm_set_epi64x( \ + vgetq_lane_s64(vreinterpretq_s64_m128d(b), ((imm8) & 0x2) >> 1), \ + vgetq_lane_s64(vreinterpretq_s64_m128d(a), (imm8) & 0x1)))) #endif -// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, -// __constrange(0,255) int imm) -#ifdef _sse2neon_shuffle +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, const int imm) +// imm must be a compile-time constant in range [0, 255] +#if defined(_sse2neon_shuffle) #define _mm_shufflehi_epi16(a, imm) \ __extension__({ \ + SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ int16x8_t _input = vreinterpretq_s16_m128i(a); \ int16x8_t _shuf = \ vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ @@ -5614,127 +6163,83 @@ FORCE_INLINE __m128i _mm_setzero_si128(void) (((imm) >> 6) & 0x3) + 4); \ vreinterpretq_m128i_s16(_shuf); \ }) -#else // generic -#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +#else +#define _mm_shufflehi_epi16(a, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255), \ + _mm_shufflehi_epi16_function((a), (imm))) #endif -// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, -// __constrange(0,255) int imm) -#ifdef _sse2neon_shuffle +// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, const int imm) +// imm must be a compile-time constant in range [0, 255] +#if defined(_sse2neon_shuffle) #define _mm_shufflelo_epi16(a, imm) \ __extension__({ \ + SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ int16x8_t _input = vreinterpretq_s16_m128i(a); \ int16x8_t _shuf = vshuffleq_s16( \ _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ vreinterpretq_m128i_s16(_shuf); \ }) -#else // generic -#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) +#else +#define _mm_shufflelo_epi16(a, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255), \ + _mm_shufflelo_epi16_function((a), (imm))) #endif // Shift packed 16-bit integers in a left by count while shifting in zeros, and // store the results in dst. 
-// -// FOR j := 0 to 7 -// i := j*16 -// IF count[63:0] > 15 -// dst[i+15:i] := 0 -// ELSE -// dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16 FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (_sse2neon_unlikely(c & ~15)) + if (_sse2neon_unlikely(c > 15)) return _mm_setzero_si128(); - int16x8_t vc = vdupq_n_s16((int16_t) c); + int16x8_t vc = vdupq_n_s16(_sse2neon_static_cast(int16_t, c)); return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); } // Shift packed 32-bit integers in a left by count while shifting in zeros, and // store the results in dst. -// -// FOR j := 0 to 3 -// i := j*32 -// IF count[63:0] > 31 -// dst[i+31:i] := 0 -// ELSE -// dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32 FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (_sse2neon_unlikely(c & ~31)) + if (_sse2neon_unlikely(c > 31)) return _mm_setzero_si128(); - int32x4_t vc = vdupq_n_s32((int32_t) c); + int32x4_t vc = vdupq_n_s32(_sse2neon_static_cast(int32_t, c)); return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); } // Shift packed 64-bit integers in a left by count while shifting in zeros, and // store the results in dst. -// -// FOR j := 0 to 1 -// i := j*64 -// IF count[63:0] > 63 -// dst[i+63:i] := 0 -// ELSE -// dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64 FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (_sse2neon_unlikely(c & ~63)) + if (_sse2neon_unlikely(c > 63)) return _mm_setzero_si128(); - int64x2_t vc = vdupq_n_s64((int64_t) c); + int64x2_t vc = vdupq_n_s64(_sse2neon_static_cast(int64_t, c)); return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); } // Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and // store the results in dst. -// -// FOR j := 0 to 7 -// i := j*16 -// IF imm8[7:0] > 15 -// dst[i+15:i] := 0 -// ELSE -// dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16 FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm) { if (_sse2neon_unlikely(imm & ~15)) return _mm_setzero_si128(); return vreinterpretq_m128i_s16( - vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm))); + vshlq_s16(vreinterpretq_s16_m128i(a), + vdupq_n_s16(_sse2neon_static_cast(int16_t, imm)))); } // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and // store the results in dst. -// -// FOR j := 0 to 3 -// i := j*32 -// IF imm8[7:0] > 31 -// dst[i+31:i] := 0 -// ELSE -// dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32 FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) { @@ -5746,16 +6251,6 @@ FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and // store the results in dst. 
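For the _mm_sll_* intrinsics above, the count comes from the low 64 bits of a vector and is not masked, so any value past the lane width must zero the result; the rewritten guards state this directly (for unsigned c, the old (c & ~15) test and c > 15 are equivalent). Scalar model of one 16-bit lane:

    #include <stdint.h>

    static uint16_t sll16_lane_model(uint16_t x, uint64_t count)
    {
        return (count > 15) ? 0 : (uint16_t) (x << count);
    }

Unlike x86 scalar shifts, which mask the count, these vector shifts treat every oversized count as a full zeroing.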
-// -// FOR j := 0 to 1 -// i := j*64 -// IF imm8[7:0] > 63 -// dst[i+63:i] := 0 -// ELSE -// dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64 FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) { @@ -5767,38 +6262,31 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) // Shift a left by imm8 bytes while shifting in zeros, and store the results in // dst. -// -// tmp := imm8[7:0] -// IF tmp > 15 -// tmp := 16 -// FI -// dst[127:0] := a[127:0] << (tmp*8) -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128 -#define _mm_slli_si128(a, imm) \ - __extension__({ \ - int8x16_t ret; \ - if (_sse2neon_unlikely(imm == 0)) \ - ret = vreinterpretq_s8_m128i(a); \ - else if (_sse2neon_unlikely((imm) & ~15)) \ - ret = vdupq_n_s8(0); \ - else \ - ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(a), \ - ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \ - vreinterpretq_m128i_s8(ret); \ - }) +// imm must be a compile-time constant in range [0, 255] +#define _mm_slli_si128(a, imm) \ + _sse2neon_define1( \ + __m128i, a, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); int8x16_t ret; \ + if (_sse2neon_unlikely((imm) == 0)) ret = vreinterpretq_s8_m128i(_a); \ + else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ + else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \ + (((imm) <= 0 || (imm) > 15) ? 0 : (16 - (imm)))); \ + _sse2neon_return(vreinterpretq_m128i_s8(ret));) // Compute the square root of packed double-precision (64-bit) floating-point // elements in a, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a))); #else - double a0 = sqrt(((double *) &a)[0]); - double a1 = sqrt(((double *) &a)[1]); - return _mm_set_pd(a1, a0); + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double _a0 = sqrt(a0); + double _a1 = sqrt(a1); + return _mm_set_pd(_a1, _a0); #endif } @@ -5808,268 +6296,165 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return _mm_move_sd(a, _mm_sqrt_pd(b)); #else - return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0])); + double _a, _b; + _a = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + _b = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return _mm_set_pd(_a, sqrt(_b)); #endif } // Shift packed 16-bit integers in a right by count while shifting in sign bits, // and store the results in dst. -// -// FOR j := 0 to 7 -// i := j*16 -// IF count[63:0] > 15 -// dst[i+15:i] := (a[i+15] ? 
0xFFFF : 0x0) -// ELSE -// dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16 FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) { - int64_t c = (int64_t) vget_low_s64((int64x2_t) count); - if (_sse2neon_unlikely(c & ~15)) + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c > 15)) return _mm_cmplt_epi16(a, _mm_setzero_si128()); - return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c))); + return vreinterpretq_m128i_s16( + vshlq_s16(vreinterpretq_s16_m128i(a), + vdupq_n_s16(-_sse2neon_static_cast(int16_t, c)))); } // Shift packed 32-bit integers in a right by count while shifting in sign bits, // and store the results in dst. -// -// FOR j := 0 to 3 -// i := j*32 -// IF count[63:0] > 31 -// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) -// ELSE -// dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32 FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) { - int64_t c = (int64_t) vget_low_s64((int64x2_t) count); - if (_sse2neon_unlikely(c & ~31)) + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c > 31)) return _mm_cmplt_epi32(a, _mm_setzero_si128()); - return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c))); + return vreinterpretq_m128i_s32( + vshlq_s32(vreinterpretq_s32_m128i(a), + vdupq_n_s32(-_sse2neon_static_cast(int32_t, c)))); } // Shift packed 16-bit integers in a right by imm8 while shifting in sign // bits, and store the results in dst. -// -// FOR j := 0 to 7 -// i := j*16 -// IF imm8[7:0] > 15 -// dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) -// ELSE -// dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) { - const int count = (imm & ~15) ? 15 : imm; - return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); + const int16_t count = + (imm & ~15) ? 15 : _sse2neon_static_cast(int16_t, imm); + return vreinterpretq_m128i_s16( + vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(-count))); } // Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, // and store the results in dst. -// -// FOR j := 0 to 3 -// i := j*32 -// IF imm8[7:0] > 31 -// dst[i+31:i] := (a[i+31] ? 
0xFFFFFFFF : 0x0) -// ELSE -// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32 -// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) -#define _mm_srai_epi32(a, imm) \ - __extension__({ \ - __m128i ret; \ - if (_sse2neon_unlikely((imm) == 0)) { \ - ret = a; \ - } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \ - ret = vreinterpretq_m128i_s32( \ - vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \ - } else { \ - ret = vreinterpretq_m128i_s32( \ - vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \ - } \ - ret; \ - }) +// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, const int imm) +// imm must be a compile-time constant in range [0, 255] +#define _mm_srai_epi32(a, imm) \ + _sse2neon_define0( \ + __m128i, a, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); __m128i ret; \ + if (_sse2neon_unlikely((imm) == 0)) { \ + ret = _a; \ + } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \ + ret = vreinterpretq_m128i_s32( \ + vshlq_s32(vreinterpretq_s32_m128i(_a), vdupq_n_s32(-(imm)))); \ + } else { \ + ret = vreinterpretq_m128i_s32( \ + vshrq_n_s32(vreinterpretq_s32_m128i(_a), 31)); \ + } _sse2neon_return(ret);) // Shift packed 16-bit integers in a right by count while shifting in zeros, and // store the results in dst. -// -// FOR j := 0 to 7 -// i := j*16 -// IF count[63:0] > 15 -// dst[i+15:i] := 0 -// ELSE -// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16 FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (_sse2neon_unlikely(c & ~15)) + if (_sse2neon_unlikely(c > 15)) return _mm_setzero_si128(); - int16x8_t vc = vdupq_n_s16(-(int16_t) c); + int16x8_t vc = vdupq_n_s16(-_sse2neon_static_cast(int16_t, c)); return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); } // Shift packed 32-bit integers in a right by count while shifting in zeros, and // store the results in dst. -// -// FOR j := 0 to 3 -// i := j*32 -// IF count[63:0] > 31 -// dst[i+31:i] := 0 -// ELSE -// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32 FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (_sse2neon_unlikely(c & ~31)) + if (_sse2neon_unlikely(c > 31)) return _mm_setzero_si128(); - int32x4_t vc = vdupq_n_s32(-(int32_t) c); + int32x4_t vc = vdupq_n_s32(-_sse2neon_static_cast(int32_t, c)); return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); } // Shift packed 64-bit integers in a right by count while shifting in zeros, and // store the results in dst. 
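NEON has no variable right-shift instruction, which is why the _mm_sra_* and _mm_srl_* bodies above shift left by a negated per-lane count (vshlq with a negative value). Scalar models of one 16-bit lane, including the out-of-range behavior (assuming the usual arithmetic >> on signed values):

    #include <stdint.h>

    static int16_t sra16_lane_model(int16_t x, uint64_t c)
    {
        // Arithmetic shift: counts above 15 replicate the sign bit,
        // matching the _mm_cmplt_epi16(a, zero) fallback above.
        return (int16_t) (x >> (c > 15 ? 15 : c));
    }

    static uint16_t srl16_lane_model(uint16_t x, uint64_t c)
    {
        // Logical shift: counts above 15 produce zero.
        return (c > 15) ? 0 : (uint16_t) (x >> c);
    }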
-// -// FOR j := 0 to 1 -// i := j*64 -// IF count[63:0] > 63 -// dst[i+63:i] := 0 -// ELSE -// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64 FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (_sse2neon_unlikely(c & ~63)) + if (_sse2neon_unlikely(c > 63)) return _mm_setzero_si128(); - int64x2_t vc = vdupq_n_s64(-(int64_t) c); + int64x2_t vc = vdupq_n_s64(-_sse2neon_static_cast(int64_t, c)); return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); } // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and // store the results in dst. -// -// FOR j := 0 to 7 -// i := j*16 -// IF imm8[7:0] > 15 -// dst[i+15:i] := 0 -// ELSE -// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16 -#define _mm_srli_epi16(a, imm) \ - __extension__({ \ - __m128i ret; \ - if (_sse2neon_unlikely((imm) & ~15)) { \ - ret = _mm_setzero_si128(); \ - } else { \ - ret = vreinterpretq_m128i_u16( \ - vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \ - } \ - ret; \ - }) +// imm must be a compile-time constant in range [0, 255] +#define _mm_srli_epi16(a, imm) \ + _sse2neon_define0( \ + __m128i, a, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u16(vshlq_u16( \ + vreinterpretq_u16_m128i(_a), \ + vdupq_n_s16(_sse2neon_static_cast(int16_t, -(imm))))); \ + } _sse2neon_return(ret);) // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and // store the results in dst. -// -// FOR j := 0 to 3 -// i := j*32 -// IF imm8[7:0] > 31 -// dst[i+31:i] := 0 -// ELSE -// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32 -// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) -#define _mm_srli_epi32(a, imm) \ - __extension__({ \ - __m128i ret; \ - if (_sse2neon_unlikely((imm) & ~31)) { \ - ret = _mm_setzero_si128(); \ - } else { \ - ret = vreinterpretq_m128i_u32( \ - vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \ - } \ - ret; \ - }) +// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, const int imm) +// imm must be a compile-time constant in range [0, 255] +#define _mm_srli_epi32(a, imm) \ + _sse2neon_define0( \ + __m128i, a, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u32( \ + vshlq_u32(vreinterpretq_u32_m128i(_a), vdupq_n_s32(-(imm)))); \ + } _sse2neon_return(ret);) // Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and // store the results in dst. 
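The byte-granular shifts (_mm_slli_si128 earlier, _mm_srli_si128 just below) move the whole 16-byte register, and both reduce to vextq_s8 over the register and a zero vector. A scalar model of the right shift (the left shift mirrors it with vextq_s8(zero, a, 16 - imm)):

    #include <stdint.h>
    #include <string.h>

    // dst[i] = src[i + imm], zero-filled from the top; imm > 15 clears all.
    static void srli_si128_model(const uint8_t src[16], uint8_t dst[16],
                                 int imm)
    {
        uint8_t concat[32] = { 0 };   // src ++ 16 zero bytes
        memcpy(concat, src, 16);
        int n = (imm < 0 || imm > 15) ? 16 : imm;
        memcpy(dst, concat + n, 16);  // mirrors vextq_s8(a, zero, n)
    }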
-// -// FOR j := 0 to 1 -// i := j*64 -// IF imm8[7:0] > 63 -// dst[i+63:i] := 0 -// ELSE -// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64 -#define _mm_srli_epi64(a, imm) \ - __extension__({ \ - __m128i ret; \ - if (_sse2neon_unlikely((imm) & ~63)) { \ - ret = _mm_setzero_si128(); \ - } else { \ - ret = vreinterpretq_m128i_u64( \ - vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \ - } \ - ret; \ - }) +// imm must be a compile-time constant in range [0, 255] +#define _mm_srli_epi64(a, imm) \ + _sse2neon_define0( \ + __m128i, a, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~63)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u64( \ + vshlq_u64(vreinterpretq_u64_m128i(_a), vdupq_n_s64(-(imm)))); \ + } _sse2neon_return(ret);) // Shift a right by imm8 bytes while shifting in zeros, and store the results in // dst. -// -// tmp := imm8[7:0] -// IF tmp > 15 -// tmp := 16 -// FI -// dst[127:0] := a[127:0] >> (tmp*8) -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128 -#define _mm_srli_si128(a, imm) \ - __extension__({ \ - int8x16_t ret; \ - if (_sse2neon_unlikely((imm) & ~15)) \ - ret = vdupq_n_s8(0); \ - else \ - ret = vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), \ - (imm > 15 ? 0 : imm)); \ - vreinterpretq_m128i_s8(ret); \ - }) +// imm must be a compile-time constant in range [0, 255] +#define _mm_srli_si128(a, imm) \ + _sse2neon_define1( \ + __m128i, a, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); int8x16_t ret; \ + if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ + else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \ + ((imm) > 15 ? 0 : (imm))); \ + _sse2neon_return(vreinterpretq_m128i_s8(ret));) // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point // elements) from a into memory. 
mem_addr must be aligned on a 16-byte boundary @@ -6077,10 +6462,12 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) { -#if defined(__aarch64__) - vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); +#if SSE2NEON_ARCH_AARCH64 + vst1q_f64(_sse2neon_reinterpret_cast(float64_t *, mem_addr), + vreinterpretq_f64_m128d(a)); #else - vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); + vst1q_f32(_sse2neon_reinterpret_cast(float32_t *, mem_addr), + vreinterpretq_f32_m128d(a)); #endif } @@ -6090,13 +6477,13 @@ FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1 FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); - vst1q_f64((float64_t *) mem_addr, + vst1q_f64(_sse2neon_reinterpret_cast(float64_t *, mem_addr), vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); #else float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); - vst1q_f32((float32_t *) mem_addr, + vst1q_f32(_sse2neon_reinterpret_cast(float32_t *, mem_addr), vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); #endif } @@ -6106,18 +6493,22 @@ FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) { -#if defined(__aarch64__) - vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#if SSE2NEON_ARCH_AARCH64 + vst1_f64(_sse2neon_reinterpret_cast(float64_t *, mem_addr), + vget_low_f64(vreinterpretq_f64_m128d(a))); #else - vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a))); + vst1_u64(_sse2neon_reinterpret_cast(uint64_t *, mem_addr), + vget_low_u64(vreinterpretq_u64_m128d(a))); #endif } -// Stores four 32-bit integer values as (as a __m128i value) at the address p. -// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx +// Store 128-bits of integer data from a into memory. mem_addr must be aligned +// on a 16-byte boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128 FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) { - vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); + vst1q_s32(_sse2neon_reinterpret_cast(int32_t *, p), + vreinterpretq_s32_m128i(a)); } // Store the lower double-precision (64-bit) floating-point element from a into @@ -6128,48 +6519,43 @@ FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) // Store the upper double-precision (64-bit) floating-point element from a into // memory. 
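On the store side, _mm_store_pd keeps the x86 16-byte alignment contract even though vst1q on AArch64 tolerates pointers that are only element-aligned (and _mm_storeu_pd below simply forwards to it). Portable callers can satisfy the contract with C11 alignas; a minimal sketch:

    #include <stdalign.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        alignas(16) double buf[2] = { 0.0, 0.0 };
        // buf is now a valid mem_addr for _mm_store_pd / _mm_load_pd.
        printf("16-byte aligned: %d\n", (int) (((uintptr_t) buf % 16) == 0));
        return 0;
    }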
-// -// MEM[mem_addr+63:mem_addr] := a[127:64] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) { -#if defined(__aarch64__) - vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a))); +#if SSE2NEON_ARCH_AARCH64 + vst1_f64(_sse2neon_reinterpret_cast(float64_t *, mem_addr), + vget_high_f64(vreinterpretq_f64_m128d(a))); #else - vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a))); + vst1_f32(_sse2neon_reinterpret_cast(float32_t *, mem_addr), + vget_high_f32(vreinterpretq_f32_m128d(a))); #endif } -// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. -// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx +// Store 64-bit integer from the first element of a into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64 FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) { - vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b))); + vst1_u64(_sse2neon_reinterpret_cast(uint64_t *, a), + vget_low_u64(vreinterpretq_u64_m128i(b))); } // Store the lower double-precision (64-bit) floating-point element from a into // memory. -// -// MEM[mem_addr+63:mem_addr] := a[63:0] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) { -#if defined(__aarch64__) - vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#if SSE2NEON_ARCH_AARCH64 + vst1_f64(_sse2neon_reinterpret_cast(float64_t *, mem_addr), + vget_low_f64(vreinterpretq_f64_m128d(a))); #else - vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a))); + vst1_f32(_sse2neon_reinterpret_cast(float32_t *, mem_addr), + vget_low_f32(vreinterpretq_f32_m128d(a))); #endif } // Store 2 double-precision (64-bit) floating-point elements from a into memory // in reverse order. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. -// -// MEM[mem_addr+63:mem_addr] := a[127:64] -// MEM[mem_addr+127:mem_addr+64] := a[63:0] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) { @@ -6186,65 +6572,89 @@ FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) _mm_store_pd(mem_addr, a); } -// Stores 128-bits of integer data a at the address p. +// Store 128-bits of integer data from a into memory. mem_addr does not need to +// be aligned on any particular boundary. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128 FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) { - vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); + vst1q_s32(_sse2neon_reinterpret_cast(int32_t *, p), + vreinterpretq_s32_m128i(a)); } -// Stores 32-bits of integer data a at the address p. +// Store 32-bit integer from the first element of a into memory. mem_addr does +// not need to be aligned on any particular boundary. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32 FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a) { - vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0); + vst1q_lane_s32(_sse2neon_reinterpret_cast(int32_t *, p), + vreinterpretq_s32_m128i(a), 0); } // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point // elements) from a into memory using a non-temporal memory hint. mem_addr must // be aligned on a 16-byte boundary or a general-protection exception may be // generated. +// Note: On AArch64, __builtin_nontemporal_store generates STNP (Store +// Non-temporal Pair), providing true non-temporal hint for 128-bit stores. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd FORCE_INLINE void _mm_stream_pd(double *p, __m128d a) { #if __has_builtin(__builtin_nontemporal_store) - __builtin_nontemporal_store(reinterpret_cast(a), (float32x4_t *) p); -#elif defined(__aarch64__) + __builtin_nontemporal_store(a, _sse2neon_reinterpret_cast(__m128d *, p)); +#elif SSE2NEON_ARCH_AARCH64 vst1q_f64(p, vreinterpretq_f64_m128d(a)); #else - vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a)); + vst1q_s64(_sse2neon_reinterpret_cast(int64_t *, p), + vreinterpretq_s64_m128d(a)); #endif } -// Stores the data in a to the address p without polluting the caches. If the -// cache line containing address p is already in the cache, the cache will be -// updated. -// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx +// Store 128-bits of integer data from a into memory using a non-temporal memory +// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection +// exception may be generated. +// Note: On AArch64, __builtin_nontemporal_store generates STNP (Store +// Non-temporal Pair), providing true non-temporal hint for 128-bit stores. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128 FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) { #if __has_builtin(__builtin_nontemporal_store) __builtin_nontemporal_store(a, p); #else - vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); + vst1q_s64(_sse2neon_reinterpret_cast(int64_t *, p), + vreinterpretq_s64_m128i(a)); #endif } // Store 32-bit integer a into memory using a non-temporal hint to minimize // cache pollution. If the cache line containing address mem_addr is already in // the cache, the cache will be updated. +// Note: ARM lacks non-temporal store for 32-bit scalar. STNP requires pair +// stores; __builtin_nontemporal_store may generate regular store on AArch64. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32 FORCE_INLINE void _mm_stream_si32(int *p, int a) { - vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0); +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1q_lane_s32(_sse2neon_reinterpret_cast(int32_t *, p), vdupq_n_s32(a), 0); +#endif } // Store 64-bit integer a into memory using a non-temporal hint to minimize // cache pollution. If the cache line containing address mem_addr is already in // the cache, the cache will be updated. +// Note: ARM lacks direct non-temporal store for single 64-bit value. STNP +// requires pair stores; __builtin_nontemporal_store may generate regular store +// on AArch64. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64 FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a) { - vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a)); +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1_s64(_sse2neon_reinterpret_cast(int64_t *, p), + vdup_n_s64(_sse2neon_static_cast(int64_t, a))); +#endif } // Subtract packed 16-bit integers in b from packed 16-bit integers in a, and @@ -6256,25 +6666,18 @@ FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } -// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or -// unsigned 32-bit integers of a. -// -// r0 := a0 - b0 -// r1 := a1 - b1 -// r2 := a2 - b2 -// r3 := a3 - b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx +// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32 FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } -// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a, -// and store the results in dst. -// r0 := a0 - b0 -// r1 := a1 - b1 +// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64 FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) { return vreinterpretq_m128i_s64( @@ -6293,25 +6696,25 @@ FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) // Subtract packed double-precision (64-bit) floating-point elements in b from // packed double-precision (64-bit) floating-point elements in a, and store the // results in dst. -// -// FOR j := 0 to 1 -// i := j*64 -// dst[i+63:i] := a[i+63:i] - b[i+63:i] -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = da[0] - db[0]; - c[1] = da[1] - db[1]; - return vld1q_f32((float32_t *) c); + c[0] = a0 - b0; + c[1] = a1 - b1; + return sse2neon_vld1q_f32_from_f64pair(c); #endif } @@ -6326,9 +6729,6 @@ FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) } // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. 
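The _mm_stream_* bodies above gate on __has_builtin, itself a compiler extension (Clang, and GCC 10 onward); headers using it this way typically also define a zero fallback so the #if still parses elsewhere. A sketch of the pattern (the shim is an assumption, not shown in this hunk):

    #ifndef __has_builtin
    #define __has_builtin(x) 0  /* assumed shim for older compilers */
    #endif

    static void stream_int_model(int *p, int a)
    {
    #if __has_builtin(__builtin_nontemporal_store)
        __builtin_nontemporal_store(a, p);  // may lower to a plain store, per
                                            // the note on 32/64-bit streams
    #else
        *p = a;                             // ordinary store fallback
    #endif
    }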
-// -// dst[63:0] := a[63:0] - b[63:0] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64 FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) { @@ -6336,54 +6736,36 @@ FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); } -// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers -// of a and saturates. -// -// r0 := SignedSaturate(a0 - b0) -// r1 := SignedSaturate(a1 - b1) -// ... -// r7 := SignedSaturate(a7 - b7) -// -// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90) +// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a +// using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16 FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } -// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers -// of a and saturates. -// -// r0 := SignedSaturate(a0 - b0) -// r1 := SignedSaturate(a1 - b1) -// ... -// r15 := SignedSaturate(a15 - b15) -// -// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90) +// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a +// using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8 FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } -// Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit -// integers of a and saturates.. -// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx +// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit +// integers in a using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16 FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } -// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit -// integers of a and saturates. -// -// r0 := UnsignedSaturate(a0 - b0) -// r1 := UnsignedSaturate(a1 - b1) -// ... -// r15 := UnsignedSaturate(a15 - b15) -// -// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90) +// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit +// integers in a using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8 FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( @@ -6398,36 +6780,33 @@ FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) #define _mm_ucomineq_sd _mm_comineq_sd // Return vector of type __m128d with undefined elements. +// Note: MSVC forces zero-initialization while GCC/Clang return truly undefined +// memory. Use SSE2NEON_UNDEFINED_ZERO=1 to force zero on all compilers. 
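The _mm_subs_* group above maps one-to-one onto NEON saturating subtracts (vqsubq_*); the unsigned flavor is the easiest to picture, since the difference clamps at zero instead of wrapping. Scalar model of one lane of _mm_subs_epu8:

    #include <stdint.h>

    static uint8_t subs_epu8_lane_model(uint8_t a, uint8_t b)
    {
        return (a > b) ? (uint8_t) (a - b) : 0;
    }
    // subs_epu8_lane_model(10, 200) == 0, whereas the wrapping
    // (uint8_t) (10 - 200) would be 66.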
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd FORCE_INLINE __m128d _mm_undefined_pd(void) { -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_UNDEFINED_ZERO || \ + (SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG) + return _mm_setzero_pd(); +#else +#if SSE2NEON_COMPILER_GCC_COMPAT #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wuninitialized" #endif __m128d a; return a; -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_COMPILER_GCC_COMPAT #pragma GCC diagnostic pop #endif +#endif } -// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the -// upper 4 signed or unsigned 16-bit integers in b. -// -// r0 := a4 -// r1 := b4 -// r2 := a5 -// r3 := b5 -// r4 := a6 -// r5 := b6 -// r6 := a7 -// r7 := b7 -// -// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx +// Unpack and interleave 16-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16 FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s16( vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); #else @@ -6438,12 +6817,12 @@ FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) #endif } -// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the -// upper 2 signed or unsigned 32-bit integers in b. -// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx +// Unpack and interleave 32-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32 FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s32( vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); #else @@ -6454,33 +6833,27 @@ FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) #endif } -// Interleaves the upper signed or unsigned 64-bit integer in a with the -// upper signed or unsigned 64-bit integer in b. -// -// r0 := a1 -// r1 := b1 +// Unpack and interleave 64-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) { +#if SSE2NEON_ARCH_AARCH64 + return vreinterpretq_m128i_s64( + vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); +#endif } -// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper -// 8 signed or unsigned 8-bit integers in b. -// -// r0 := a8 -// r1 := b8 -// r2 := a9 -// r3 := b9 -// ... -// r14 := a15 -// r15 := b15 -// -// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx +// Unpack and interleave 8-bit integers from the high half of a and b, and store +// the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8 FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s8( vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); #else @@ -6495,18 +6868,10 @@ FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) // Unpack and interleave double-precision (64-bit) floating-point elements from // the high half of a and b, and store the results in dst. -// -// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { -// dst[63:0] := src1[127:64] -// dst[127:64] := src2[127:64] -// RETURN dst[127:0] -// } -// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else @@ -6516,22 +6881,12 @@ FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) #endif } -// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the -// lower 4 signed or unsigned 16-bit integers in b. -// -// r0 := a0 -// r1 := b0 -// r2 := a1 -// r3 := b1 -// r4 := a2 -// r5 := b2 -// r6 := a3 -// r7 := b3 -// -// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx +// Unpack and interleave 16-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16 FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s16( vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); #else @@ -6542,18 +6897,12 @@ FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) #endif } -// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the -// lower 2 signed or unsigned 32 - bit integers in b. -// -// r0 := a0 -// r1 := b0 -// r2 := a1 -// r3 := b1 -// -// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx +// Unpack and interleave 32-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32 FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s32( vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); #else @@ -6564,28 +6913,27 @@ FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) #endif } +// Unpack and interleave 64-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64 FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) { +#if SSE2NEON_ARCH_AARCH64 + return vreinterpretq_m128i_s64( + vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); +#endif } -// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower -// 8 signed or unsigned 8-bit integers in b. 
-// -// r0 := a0 -// r1 := b0 -// r2 := a1 -// r3 := b1 -// ... -// r14 := a7 -// r15 := b7 -// -// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx +// Unpack and interleave 8-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8 FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s8( vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); #else @@ -6598,18 +6946,10 @@ FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) // Unpack and interleave double-precision (64-bit) floating-point elements from // the low half of a and b, and store the results in dst. -// -// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { -// dst[63:0] := src1[63:0] -// dst[127:64] := src2[63:0] -// RETURN dst[127:0] -// } -// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else @@ -6621,12 +6961,6 @@ FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) // Compute the bitwise XOR of packed double-precision (64-bit) floating-point // elements in a and b, and store the results in dst. -// -// FOR j := 0 to 1 -// i := j*64 -// dst[i+63:i] := a[i+63:i] XOR b[i+63:i] -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) { @@ -6634,8 +6968,9 @@ FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); } -// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in -// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx +// Compute the bitwise XOR of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128 FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( @@ -6644,24 +6979,21 @@ FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) /* SSE3 */ +// Rounding mode note: The single-precision horizontal operations +// (_mm_addsub_ps, _mm_hadd_ps, _mm_hsub_ps) are sensitive to rounding mode +// on ARM. On x86, these intrinsics produce consistent results regardless of +// MXCSR rounding mode. On ARM NEON, the current FPCR/FPSCR rounding mode +// affects intermediate results. For consistent cross-platform behavior, call +// _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST) before using these intrinsics. + // Alternatively add and subtract packed double-precision (64-bit) // floating-point elements in a to/from packed elements in b, and store the // results in dst. 
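A short sketch of the workaround suggested in the SSE3 rounding-mode note above. This assumes the _MM_GET_ROUNDING_MODE/_MM_SET_ROUNDING_MODE controls provided elsewhere in the header; the save/restore pattern is one reasonable way to apply them, not the only one:

    #include "sse2neon.h"

    /* Pin rounding to nearest-even around SSE3 horizontal math so the
     * result matches x86 regardless of the ambient FPCR setting. */
    static __m128 hadd_consistent(__m128 a, __m128 b)
    {
        unsigned int saved = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
        __m128 r = _mm_hadd_ps(a, b);
        _MM_SET_ROUNDING_MODE(saved);
        return r;
    }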
-// -// FOR j := 0 to 1 -// i := j*64 -// IF ((j & 1) == 0) -// dst[i+63:i] := a[i+63:i] - b[i+63:i] -// ELSE -// dst[i+63:i] := a[i+63:i] + b[i+63:i] -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) { _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f); -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(mask))); @@ -6672,12 +7004,12 @@ FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) // Alternatively add and subtract packed single-precision (32-bit) // floating-point elements in a to/from packed elements in b, and store the -// results in dst. +// results in dst. See SSE3 rounding mode note above. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) { _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f); -#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */ +#if SSE2NEON_ARCH_AARCH64 || defined(__ARM_FEATURE_FMA) /* VFPv4+ */ return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(mask), vreinterpretq_f32_m128(b))); @@ -6691,23 +7023,31 @@ FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[] = {da[0] + da[1], db[0] + db[1]}; - return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[] = {a0 + a1, b0 + b1}; + return vreinterpretq_m128d_u64( + vld1q_u64(_sse2neon_reinterpret_cast(uint64_t *, c))); #endif } -// Computes pairwise add of each argument as single-precision, floating-point -// values a and b. -// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx +// Horizontally add adjacent pairs of single-precision (32-bit) floating-point +// elements in a and b, and pack the results in dst. +// See SSE3 rounding mode note above. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128_f32( vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else @@ -6723,29 +7063,37 @@ FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) // Horizontally subtract adjacent pairs of double-precision (64-bit) // floating-point elements in a and b, and pack the results in dst. 
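The scalar fallbacks above replace the old (double *) &a casts with per-lane reads. A hedged sketch of why, in plain NEON (sse2neon_recast_u64_f64 is the header's bit-cast helper; memcpy is the portable equivalent used here for illustration):

    #include <arm_neon.h>
    #include <string.h>

    /* Reading a vector through a pointer of a different type is undefined
     * behavior under C strict-aliasing rules. Extracting the raw 64-bit
     * lane and bit-casting it is well defined. */
    static double lane0_as_double(uint64x2_t v)
    {
        uint64_t bits = vgetq_lane_u64(v, 0);
        double d;
        memcpy(&d, &bits, sizeof d);
        return d;
    }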
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd -FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b) +FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b) { -#if defined(__aarch64__) - float64x2_t a = vreinterpretq_f64_m128d(_a); - float64x2_t b = vreinterpretq_f64_m128d(_b); +#if SSE2NEON_ARCH_AARCH64 + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); return vreinterpretq_m128d_f64( - vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b))); -#else - double *da = (double *) &_a; - double *db = (double *) &_b; - double c[] = {da[0] - da[1], db[0] - db[1]}; - return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); + vsubq_f64(vuzp1q_f64(_a, _b), vuzp2q_f64(_a, _b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[] = {a0 - a1, b0 - b1}; + return vreinterpretq_m128d_u64( + vld1q_u64(_sse2neon_reinterpret_cast(uint64_t *, c))); #endif } // Horizontally subtract adjacent pairs of single-precision (32-bit) // floating-point elements in a and b, and pack the results in dst. +// See SSE3 rounding mode note above. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) { float32x4_t a = vreinterpretq_f32_m128(_a); float32x4_t b = vreinterpretq_f32_m128(_b); -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128_f32( vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b))); #else @@ -6757,27 +7105,40 @@ FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) // Load 128-bits of integer data from unaligned memory into dst. This intrinsic // may perform better than _mm_loadu_si128 when the data crosses a cache line // boundary. -// -// dst[127:0] := MEM[mem_addr+127:mem_addr] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128 #define _mm_lddqu_si128 _mm_loadu_si128 // Load a double-precision (64-bit) floating-point element from memory into both // elements of dst. -// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := MEM[mem_addr+63:mem_addr] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd #define _mm_loaddup_pd _mm_load1_pd +// Sets up a linear address range to be monitored by hardware and activates the +// monitor. The address range should be a write-back memory caching type. +// +// ARM implementation notes: +// - This is a NO-OP. ARM has no userspace equivalent for "monitor a cacheline +// and wake on store". There is no "armed" address after calling this. +// - The extensions and hints parameters are ignored (no architectural +// equivalent for x86 C-state hints on ARM). +// - _mm_mwait provides only a low-power hint, not a monitor-armed wait. +// +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_monitor +FORCE_INLINE void _mm_monitor(void const *p, + unsigned int extensions, + unsigned int hints) +{ + (void) p; + (void) extensions; + (void) hints; +} + // Duplicate the low double-precision (64-bit) floating-point element from a, // and store the results in dst. 
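Because _mm_monitor arms nothing on ARM (per the notes above), a monitor/mwait wait loop degrades to ordinary polling. A hedged sketch of the portable pattern, where flag is a hypothetical shared variable; correctness comes from re-checking the predicate, never from assuming an armed wakeup address:

    #include <stdatomic.h>
    #include "sse2neon.h"

    static void wait_for_flag(const atomic_int *flag)
    {
        while (atomic_load_explicit(flag, memory_order_acquire) == 0) {
            _mm_monitor(flag, 0, 0); /* no-op on ARM; x86 source compat */
            _mm_mwait(0, 0);         /* at most a low-power hint on ARM */
        }
    }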
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64( vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); #else @@ -6791,7 +7152,7 @@ FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128_f32( vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a))); #elif defined(_sse2neon_shuffle) @@ -6810,7 +7171,7 @@ FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128_f32( vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a))); #elif defined(_sse2neon_shuffle) @@ -6824,16 +7185,68 @@ FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) #endif } +// Provides a hint that allows the processor to enter an implementation- +// dependent optimized state while waiting for a memory write to the monitored +// address range set up by _mm_monitor. +// +// ARM implementation notes: +// - This is only a LOW-POWER HINT, not a monitor-armed wait. Since _mm_monitor +// is a no-op on ARM, there is no "armed" address range to wake on. +// - The extensions and hints parameters are ignored (no architectural +// equivalent for x86 C-state hints on ARM). +// - No memory ordering is guaranteed beyond what the hint instruction provides. +// - WFI/WFE in EL0 may trap depending on OS configuration (Linux can trap +// EL0 WFI/WFE via SCTLR_EL1; iOS/macOS may also restrict these). +// +// Behavior controlled by SSE2NEON_MWAIT_POLICY (see top of file for details). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mwait +FORCE_INLINE void _mm_mwait(unsigned int extensions, unsigned int hints) +{ + (void) extensions; + (void) hints; + + // ARM implementation: low-power hint via yield/wfe/wfi. + // x86: no-op for compilation (MONITOR/MWAIT require CPL0, trap in + // userspace). +#if SSE2NEON_ARCH_AARCH64 || defined(__arm__) || defined(_M_ARM) || \ + defined(_M_ARM64) + // Use MSVC intrinsics on Windows ARM, inline asm on GCC/Clang. + // Note: GCC's arm_acle.h may not define __yield/__wfe/__wfi on all + // versions. +#if SSE2NEON_MWAIT_POLICY == 0 + // Policy 0: yield - safe everywhere, never blocks +#if SSE2NEON_COMPILER_MSVC + __yield(); +#else + __asm__ __volatile__("yield" ::: "memory"); +#endif + +#elif SSE2NEON_MWAIT_POLICY == 1 + // Policy 1: wfe - event wait, requires SEV/SEVL, may block +#if SSE2NEON_COMPILER_MSVC + __wfe(); +#else + __asm__ __volatile__("wfe" ::: "memory"); +#endif + +#elif SSE2NEON_MWAIT_POLICY == 2 + // Policy 2: wfi - interrupt wait, may trap in EL0 +#if SSE2NEON_COMPILER_MSVC + __wfi(); +#else + __asm__ __volatile__("wfi" ::: "memory"); +#endif + +#else +#error "Invalid SSE2NEON_MWAIT_POLICY value (must be 0, 1, or 2)" +#endif +#endif /* ARM architecture */ +} + /* SSSE3 */ // Compute the absolute value of packed signed 16-bit integers in a, and store // the unsigned results in dst. 
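How the policy knob above is meant to be driven, under the assumption (from the "see top of file" note) that SSE2NEON_MWAIT_POLICY is defined before the header is included, e.g. on the compiler command line:

    /* cc -DSSE2NEON_MWAIT_POLICY=1 ...  (1 = wfe) */
    #define SSE2NEON_MWAIT_POLICY 1 /* 0 = yield (safe default), 1 = wfe, 2 = wfi */
    #include "sse2neon.h"

    /* With policy 1, a waiter parked in _mm_mwait() needs a paired wakeup
     * event from the producer side, e.g. an explicit SEV:
     *   __asm__ __volatile__("sev");   // GCC/Clang; __sev() on MSVC ARM */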
-// -// FOR j := 0 to 7 -// i := j*16 -// dst[i+15:i] := ABS(a[i+15:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16 FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) { @@ -6842,12 +7255,6 @@ FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) // Compute the absolute value of packed signed 32-bit integers in a, and store // the unsigned results in dst. -// -// FOR j := 0 to 3 -// i := j*32 -// dst[i+31:i] := ABS(a[i+31:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32 FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) { @@ -6856,12 +7263,6 @@ FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) // Compute the absolute value of packed signed 8-bit integers in a, and store // the unsigned results in dst. -// -// FOR j := 0 to 15 -// i := j*8 -// dst[i+7:i] := ABS(a[i+7:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8 FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) { @@ -6870,12 +7271,6 @@ FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) // Compute the absolute value of packed signed 16-bit integers in a, and store // the unsigned results in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := ABS(a[i+15:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16 FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) { @@ -6884,12 +7279,6 @@ FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) // Compute the absolute value of packed signed 32-bit integers in a, and store // the unsigned results in dst. -// -// FOR j := 0 to 1 -// i := j*32 -// dst[i+31:i] := ABS(a[i+31:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32 FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) { @@ -6898,12 +7287,6 @@ FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) // Compute the absolute value of packed signed 8-bit integers in a, and store // the unsigned results in dst. -// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := ABS(a[i+7:i]) -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8 FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) { @@ -6912,62 +7295,110 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) // Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift // the result right by imm8 bytes, and store the low 16 bytes in dst. -// -// tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) -// dst[127:0] := tmp[127:0] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8 -#define _mm_alignr_epi8(a, b, imm) \ - __extension__({ \ - uint8x16_t _a = vreinterpretq_u8_m128i(a); \ - uint8x16_t _b = vreinterpretq_u8_m128i(b); \ - __m128i ret; \ - if (_sse2neon_unlikely((imm) & ~31)) \ - ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ - else if (imm >= 16) \ - ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \ - else \ - ret = \ - vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? 
imm : 0)); \ - ret; \ +// imm must be a compile-time constant in range [0, 255] +#if defined(__GNUC__) && !defined(__clang__) +#define _mm_alignr_epi8(a, b, imm) \ + __extension__({ \ + SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ + __m128i _a_m128i = (a); \ + uint8x16_t _a = vreinterpretq_u8_m128i(_a_m128i); \ + uint8x16_t _b = vreinterpretq_u8_m128i(b); \ + __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) \ + ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + else if ((imm) >= 16) \ + ret = vreinterpretq_m128i_s8( \ + vextq_s8(vreinterpretq_s8_m128i(_a_m128i), vdupq_n_s8(0), \ + ((imm) >= 16 && (imm) < 32) ? (imm) - 16 : 0)); \ + else \ + ret = vreinterpretq_m128i_u8( \ + vextq_u8(_b, _a, (imm) < 16 ? (imm) : 0)); \ + ret; \ }) +// Clang path: inline _mm_srli_si128 logic to avoid both: +// 1. Variable shadowing: _mm_srli_si128(_a, ...) creates __m128i _a = (_a) +// 2. Double evaluation: _mm_srli_si128((a), ...) re-evaluates macro arg +#elif SSE2NEON_COMPILER_CLANG +#define _mm_alignr_epi8(a, b, imm) \ + _sse2neon_define2( \ + __m128i, a, b, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ + uint8x16_t __a = vreinterpretq_u8_m128i(_a); \ + uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) ret = \ + vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + else if ((imm) >= 16) ret = vreinterpretq_m128i_s8( \ + vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \ + ((imm) >= 16 && (imm) < 32) ? (imm) - 16 : 0)); \ + else ret = vreinterpretq_m128i_u8( \ + vextq_u8(__b, __a, (imm) < 16 ? (imm) : 0)); \ + _sse2neon_return(ret);) + +// MSVC path: use _a (lambda parameter) since lambda [] cannot capture (a). +// No shadowing issue because lambda parameters shadow captures properly. +#else +#define _mm_alignr_epi8(a, b, imm) \ + _sse2neon_define2( \ + __m128i, a, b, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ + uint8x16_t __a = vreinterpretq_u8_m128i(_a); \ + uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) ret = \ + vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + else if ((imm) >= 16) ret = \ + _mm_srli_si128(_a, (imm) >= 16 ? (imm) - 16 : 0); \ + else ret = vreinterpretq_m128i_u8( \ + vextq_u8(__b, __a, (imm) < 16 ? (imm) : 0)); \ + _sse2neon_return(ret);) + +#endif + // Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift // the result right by imm8 bytes, and store the low 8 bytes in dst. 
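A quick usage sketch of _mm_alignr_epi8 (hypothetical values), which also shows why the macro now insists on a compile-time constant: vextq_u8 encodes the shift amount in the instruction itself, so imm cannot be a runtime value.

    #include "sse2neon.h"

    /* Concatenate a:b as a 32-byte value and take 16 bytes starting at
     * byte 4, i.e. {b4..b15, a0..a3} in byte order. */
    static __m128i shift_window(__m128i a, __m128i b)
    {
        return _mm_alignr_epi8(a, b, 4); /* imm must be a literal constant */
    }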
-// -// tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8) -// dst[63:0] := tmp[63:0] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8 +// imm must be a compile-time constant in range [0, 255] +#if defined(__GNUC__) && !defined(__clang__) #define _mm_alignr_pi8(a, b, imm) \ __extension__({ \ + SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ + __m64 _a = (a), _b = (b); \ __m64 ret; \ if (_sse2neon_unlikely((imm) >= 16)) { \ ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ + } else if ((imm) >= 8) { \ + ret = vreinterpret_m64_u8( \ + vext_u8(vreinterpret_u8_m64(_a), vdup_n_u8(0), (imm) - 8)); \ } else { \ - uint8x8_t tmp_low, tmp_high; \ - if ((imm) >= 8) { \ - const int idx = (imm) -8; \ - tmp_low = vreinterpret_u8_m64(a); \ - tmp_high = vdup_n_u8(0); \ - ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ - } else { \ - const int idx = (imm); \ - tmp_low = vreinterpret_u8_m64(b); \ - tmp_high = vreinterpret_u8_m64(a); \ - ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ - } \ + ret = vreinterpret_m64_u8(vext_u8( \ + vreinterpret_u8_m64(_b), vreinterpret_u8_m64(_a), (imm))); \ } \ ret; \ }) -// Computes pairwise add of each argument as a 16-bit signed or unsigned integer -// values a and b. +#else +#define _mm_alignr_pi8(a, b, imm) \ + _sse2neon_define2( \ + __m64, a, b, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); __m64 ret; \ + if (_sse2neon_unlikely((imm) >= 16)) { \ + ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ + } else if ((imm) >= 8) { \ + ret = vreinterpret_m64_u8(vext_u8(vreinterpret_u8_m64(_a), \ + vdup_n_u8(0), ((imm) - 8) & 7)); \ + } else { \ + ret = vreinterpret_m64_u8(vext_u8( \ + vreinterpret_u8_m64(_b), vreinterpret_u8_m64(_a), (imm) & 7)); \ + } _sse2neon_return(ret);) + +#endif + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16 FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s16(vpaddq_s16(a, b)); #else return vreinterpretq_m128i_s16( @@ -6976,13 +7407,14 @@ FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) #endif } -// Computes pairwise add of each argument as a 32-bit signed or unsigned integer -// values a and b. +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32 FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) { int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s32(vpaddq_s32(a, b)); #else return vreinterpretq_m128i_s32( @@ -7009,11 +7441,12 @@ FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))); } -// Computes saturated pairwise sub of each argument as a 16-bit signed -// integer values a and b. +// Horizontally add adjacent pairs of signed 16-bit integers in a and b using +// saturation, and pack the signed 16-bit results in dst. 
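For context on the two paths above (a hedged sketch, not part of the patch): AArch64 has a full-width pairwise add (vpaddq), while ARMv7 NEON only has the 64-bit form, hence the split-and-combine fallback:

    #include <arm_neon.h>

    /* AArch64: one instruction, int16x8_t r = vpaddq_s16(a, b);
     * ARMv7 equivalent: pairwise-add each 64-bit half, then recombine. */
    static int16x8_t pairwise_add_armv7(int16x8_t a, int16x8_t b)
    {
        int16x4_t lo = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
        int16x4_t hi = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
        return vcombine_s16(lo, hi);
    }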
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16 FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); return vreinterpretq_s64_s16( @@ -7038,7 +7471,7 @@ FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); #else int16x4x2_t res = vuzp_s16(a, b); @@ -7053,7 +7486,7 @@ FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s16( vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); #else @@ -7069,7 +7502,7 @@ FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) { int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s32( vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b))); #else @@ -7085,7 +7518,7 @@ FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); #else int16x4x2_t c = vuzp_s16(a, b); @@ -7100,7 +7533,7 @@ FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b) { int32x2_t a = vreinterpret_s32_m64(_a); int32x2_t b = vreinterpret_s32_m64(_b); -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b))); #else int32x2x2_t c = vuzp_s32(a, b); @@ -7108,14 +7541,14 @@ FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b) #endif } -// Computes saturated pairwise difference of each argument as a 16-bit signed -// integer values a and b. +// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b +// using saturation, and pack the signed 16-bit results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16 FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s16( vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); #else @@ -7131,7 +7564,7 @@ FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); #else int16x4x2_t c = vuzp_s16(a, b); @@ -7143,15 +7576,10 @@ FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b) // signed 8-bit integer from b, producing intermediate signed 16-bit integers. // Horizontally add adjacent pairs of intermediate signed 16-bit integers, // and pack the saturated results in dst. 
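The horizontal-subtract family above leans on one idiom worth spelling out (hedged sketch): vuzp1 gathers the even-indexed lanes {a0,a2,b0,b2} and vuzp2 the odd ones {a1,a3,b1,b3}, so an element-wise subtract of the two yields exactly the x86 "adjacent pairs" result:

    #include <arm_neon.h>

    /* hsub on floats: {a0-a1, a2-a3, b0-b1, b2-b3}. vuzp1q/vuzp2q are
     * A64-only; ARMv7 uses vuzp_f32 on 64-bit halves, as the fallback
     * paths above show. */
    #if defined(__aarch64__)
    static float32x4_t hsub_f32(float32x4_t a, float32x4_t b)
    {
        return vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b));
    }
    #endif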
-// -// FOR j := 0 to 7 -// i := j*16 -// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + -// a[i+7:i]*b[i+7:i] ) -// ENDFOR +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16 FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 uint8x16_t a = vreinterpretq_u8_m128i(_a); int8x16_t b = vreinterpretq_s8_m128i(_b); int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), @@ -7212,12 +7640,7 @@ FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b) // Multiply packed signed 16-bit integers in a and b, producing intermediate // signed 32-bit integers. Shift right by 15 bits while rounding up, and store // the packed 16-bit integers in dst. -// -// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15) -// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15) -// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15) -// ... -// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15) +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16 FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) { // Has issues due to saturation @@ -7260,7 +7683,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b uint8x16_t idx_masked = vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); #elif defined(__GNUC__) int8x16_t ret; @@ -7283,22 +7706,12 @@ FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) // Shuffle packed 8-bit integers in a according to shuffle control mask in the // corresponding 8-bit element of b, and store the results in dst. -// -// FOR j := 0 to 7 -// i := j*8 -// IF b[i+7] == 1 -// dst[i+7:i] := 0 -// ELSE -// index[2:0] := b[i+2:i] -// dst[i+7:i] := a[index*8+7:index*8] -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8 FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b) { const int8x8_t controlMask = - vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07))); + vand_s8(vreinterpret_s8_m64(b), + vdup_n_s8(_sse2neon_static_cast(int8_t, 0x1 << 7 | 0x07))); int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask); return vreinterpret_m64_s8(res); } @@ -7307,16 +7720,7 @@ FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b) // 16-bit integer in b is negative, and store the results in dst. // Element in dst are zeroed out when the corresponding element // in b is zero. -// -// for i in 0..7 -// if b[i] < 0 -// r[i] := -a[i] -// else if b[i] == 0 -// r[i] := 0 -// else -// r[i] := a[i] -// fi -// done +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16 FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); @@ -7326,7 +7730,7 @@ FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) // (b < 0) ? 0xFFFF : 0 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15)); // (b == 0) ? 0xFFFF : 0 -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b)); #else int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0))); @@ -7344,16 +7748,7 @@ FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) // 32-bit integer in b is negative, and store the results in dst. 
// Element in dst are zeroed out when the corresponding element // in b is zero. -// -// for i in 0..3 -// if b[i] < 0 -// r[i] := -a[i] -// else if b[i] == 0 -// r[i] := 0 -// else -// r[i] := a[i] -// fi -// done +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32 FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) { int32x4_t a = vreinterpretq_s32_m128i(_a); @@ -7364,7 +7759,7 @@ FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31)); // (b == 0) ? 0xFFFFFFFF : 0 -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b)); #else int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0))); @@ -7382,16 +7777,7 @@ FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) // 8-bit integer in b is negative, and store the results in dst. // Element in dst are zeroed out when the corresponding element // in b is zero. -// -// for i in 0..15 -// if b[i] < 0 -// r[i] := -a[i] -// else if b[i] == 0 -// r[i] := 0 -// else -// r[i] := a[i] -// fi -// done +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8 FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) { int8x16_t a = vreinterpretq_s8_m128i(_a); @@ -7402,7 +7788,7 @@ FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7)); // (b == 0) ? 0xFF : 0 -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b)); #else int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0))); @@ -7420,18 +7806,6 @@ FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) // Negate packed 16-bit integers in a when the corresponding signed 16-bit // integer in b is negative, and store the results in dst. Element in dst are // zeroed out when the corresponding element in b is zero. -// -// FOR j := 0 to 3 -// i := j*16 -// IF b[i+15:i] < 0 -// dst[i+15:i] := -(a[i+15:i]) -// ELSE IF b[i+15:i] == 0 -// dst[i+15:i] := 0 -// ELSE -// dst[i+15:i] := a[i+15:i] -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16 FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) { @@ -7443,7 +7817,7 @@ FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15)); // (b == 0) ? 0xFFFF : 0 -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b)); #else int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0))); @@ -7461,18 +7835,6 @@ FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) // Negate packed 32-bit integers in a when the corresponding signed 32-bit // integer in b is negative, and store the results in dst. Element in dst are // zeroed out when the corresponding element in b is zero. -// -// FOR j := 0 to 1 -// i := j*32 -// IF b[i+31:i] < 0 -// dst[i+31:i] := -(a[i+31:i]) -// ELSE IF b[i+31:i] == 0 -// dst[i+31:i] := 0 -// ELSE -// dst[i+31:i] := a[i+31:i] -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32 FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) { @@ -7484,7 +7846,7 @@ FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31)); // (b == 0) ? 
0xFFFFFFFF : 0 -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b)); #else int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0))); @@ -7502,18 +7864,6 @@ FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer // in b is negative, and store the results in dst. Element in dst are zeroed out // when the corresponding element in b is zero. -// -// FOR j := 0 to 7 -// i := j*8 -// IF b[i+7:i] < 0 -// dst[i+7:i] := -(a[i+7:i]) -// ELSE IF b[i+7:i] == 0 -// dst[i+7:i] := 0 -// ELSE -// dst[i+7:i] := a[i+7:i] -// FI -// ENDFOR -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8 FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) { @@ -7525,7 +7875,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); // (b == 0) ? 0xFF : 0 -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); #else int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0))); @@ -7544,76 +7894,61 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) // Blend packed 16-bit integers from a and b using control mask imm8, and store // the results in dst. -// -// FOR j := 0 to 7 -// i := j*16 -// IF imm8[j] -// dst[i+15:i] := b[i+15:i] -// ELSE -// dst[i+15:i] := a[i+15:i] -// FI -// ENDFOR -// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, -// __constrange(0,255) int imm) -#define _mm_blend_epi16(a, b, imm) \ - __extension__({ \ - const uint16_t ones = 0xffff; \ - const uint16_t zeros = 0x0000; \ - const uint16_t _mask[8] = {((imm) & (1 << 0)) ? ones : zeros, \ - ((imm) & (1 << 1)) ? ones : zeros, \ - ((imm) & (1 << 2)) ? ones : zeros, \ - ((imm) & (1 << 3)) ? ones : zeros, \ - ((imm) & (1 << 4)) ? ones : zeros, \ - ((imm) & (1 << 5)) ? ones : zeros, \ - ((imm) & (1 << 6)) ? ones : zeros, \ - ((imm) & (1 << 7)) ? ones : zeros}; \ - uint16x8_t _mask_vec = vld1q_u16(_mask); \ - uint16x8_t _a = vreinterpretq_u16_m128i(a); \ - uint16x8_t _b = vreinterpretq_u16_m128i(b); \ - vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \ - }) +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16 +// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, const int imm) +// imm must be a compile-time constant in range [0, 255] +#define _mm_blend_epi16(a, b, imm) \ + _sse2neon_define2( \ + __m128i, a, b, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 255); \ + const uint16_t _mask[8] = _sse2neon_init( \ + ((imm) & (1 << 0)) ? _sse2neon_static_cast(uint16_t, -1) : _sse2neon_static_cast(uint16_t, 0x0), \ + ((imm) & (1 << 1)) ? _sse2neon_static_cast(uint16_t, -1) : _sse2neon_static_cast(uint16_t, 0x0), \ + ((imm) & (1 << 2)) ? _sse2neon_static_cast(uint16_t, -1) : _sse2neon_static_cast(uint16_t, 0x0), \ + ((imm) & (1 << 3)) ? _sse2neon_static_cast(uint16_t, -1) : _sse2neon_static_cast(uint16_t, 0x0), \ + ((imm) & (1 << 4)) ? _sse2neon_static_cast(uint16_t, -1) : _sse2neon_static_cast(uint16_t, 0x0), \ + ((imm) & (1 << 5)) ? _sse2neon_static_cast(uint16_t, -1) : _sse2neon_static_cast(uint16_t, 0x0), \ + ((imm) & (1 << 6)) ? _sse2neon_static_cast(uint16_t, -1) : _sse2neon_static_cast(uint16_t, 0x0), \ + ((imm) & (1 << 7)) ? 
_sse2neon_static_cast(uint16_t, -1) : _sse2neon_static_cast(uint16_t, 0x0)); \ + uint16x8_t _mask_vec = vld1q_u16(_mask); \ + uint16x8_t __a = vreinterpretq_u16_m128i(_a); \ + uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \ + vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));) // Blend packed double-precision (64-bit) floating-point elements from a and b // using control mask imm8, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd -#define _mm_blend_pd(a, b, imm) \ - __extension__({ \ - const uint64_t _mask[2] = { \ - ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \ - ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)}; \ - uint64x2_t _mask_vec = vld1q_u64(_mask); \ - uint64x2_t _a = vreinterpretq_u64_m128d(a); \ - uint64x2_t _b = vreinterpretq_u64_m128d(b); \ - vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \ - }) +// imm must be a compile-time constant in range [0, 3] +#define _mm_blend_pd(a, b, imm) \ + _sse2neon_define2( \ + __m128d, a, b, SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 3); \ + const uint64_t _mask[2] = \ + _sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \ + ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)); \ + uint64x2_t _mask_vec = vld1q_u64(_mask); \ + uint64x2_t __a = vreinterpretq_u64_m128d(_a); \ + uint64x2_t __b = vreinterpretq_u64_m128d(_b); _sse2neon_return( \ + vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, __b, __a)));) // Blend packed single-precision (32-bit) floating-point elements from a and b // using mask, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps -FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) -{ - const uint32_t ALIGN_STRUCT(16) - data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, - ((imm8) & (1 << 1)) ? UINT32_MAX : 0, - ((imm8) & (1 << 2)) ? UINT32_MAX : 0, - ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; - uint32x4_t mask = vld1q_u32(data); - float32x4_t a = vreinterpretq_f32_m128(_a); - float32x4_t b = vreinterpretq_f32_m128(_b); - return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); -} +// imm8 must be a compile-time constant in range [0, 15] +#define _mm_blend_ps(a, b, imm8) \ + _sse2neon_define2( \ + __m128, a, b, SSE2NEON_REQUIRE_CONST_RANGE(imm8, 0, 15); \ + const uint32_t _mask[4] = \ + _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 3)) ? UINT32_MAX : 0); \ + uint32x4_t _mask_vec = vld1q_u32(_mask); \ + float32x4_t __a = vreinterpretq_f32_m128(_a); \ + float32x4_t __b = vreinterpretq_f32_m128(_b); _sse2neon_return( \ + vreinterpretq_m128_f32(vbslq_f32(_mask_vec, __b, __a)));) // Blend packed 8-bit integers from a and b using mask, and store the results in // dst. 
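A usage sketch for the rewritten blend macros (hypothetical values): each low bit of imm8 selects the corresponding lane from b, so 0x5 (bits 0 and 2) takes lanes 0 and 2 from b and keeps lanes 1 and 3 of a:

    #include "sse2neon.h"

    static __m128 mix_xz_from_b(__m128 a, __m128 b)
    {
        /* Bits 0 and 2 set: result = {b0, a1, b2, a3}. */
        return _mm_blend_ps(a, b, 0x5); /* imm8: constant in [0, 15] */
    }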
-// -// FOR j := 0 to 15 -// i := j*8 -// IF mask[i+7] -// dst[i+7:i] := b[i+7:i] -// ELSE -// dst[i+7:i] := a[i+7:i] -// FI -// ENDFOR +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8 FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) { // Use a signed shift right to create a mask with the sign bit @@ -7631,7 +7966,7 @@ FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) { uint64x2_t mask = vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 float64x2_t a = vreinterpretq_f64_m128d(_a); float64x2_t b = vreinterpretq_f64_m128d(_b); return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); @@ -7661,11 +7996,13 @@ FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a))); #else - double *f = (double *) &a; - return _mm_set_pd(ceil(f[1]), ceil(f[0])); + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_pd(ceil(a1), ceil(a0)); #endif } @@ -7675,10 +8012,10 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) { -#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING) +#if SSE2NEON_ARCH_AARCH64 || defined(__ARM_FEATURE_DIRECTED_ROUNDING) return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); #else - float *f = (float *) &a; + float *f = _sse2neon_reinterpret_cast(float *, &a); return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0])); #endif } @@ -7697,10 +8034,6 @@ FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b) // an integer value, store the result as a single-precision floating-point // element in the lower element of dst, and copy the upper 3 packed elements // from a to the upper elements of dst. -// -// dst[31:0] := CEIL(b[31:0]) -// dst[127:32] := a[127:32] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) { @@ -7711,7 +8044,7 @@ FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) // in dst FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128i_u64( vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); #else @@ -7724,16 +8057,18 @@ FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) #endif } -// Converts the four signed 16-bit integers in the lower 64 bits to four signed -// 32-bit integers. +// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32 FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) { return vreinterpretq_m128i_s32( vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); } -// Converts the two signed 16-bit integers in the lower 32 bits two signed -// 32-bit integers. +// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store +// the results in dst. 
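The cvtepi8_epi32-style conversions in these hunks all use the same staged-widening idiom; a hedged sketch in plain NEON (each vmovl doubles the element width, so 8-bit to 32-bit takes two hops):

    #include <arm_neon.h>

    /* Sign-extend the low four s8 lanes to s32, mirroring the
     * _mm_cvtepi8_epi32 implementation above. */
    static int32x4_t widen_s8_to_s32(int8x16_t v)
    {
        int16x8_t w16 = vmovl_s8(vget_low_s8(v)); /* 8 -> 16 bit */
        return vmovl_s16(vget_low_s16(w16));      /* 16 -> 32 bit */
    }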
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64 FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) { int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */ @@ -7742,16 +8077,18 @@ FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) return vreinterpretq_m128i_s64(s64x2); } -// Converts the two signed 32-bit integers in the lower 64 bits to two signed -// 64-bit integers. +// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64 FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) { return vreinterpretq_m128i_s64( vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))); } -// Converts the four unsigned 8-bit integers in the lower 16 bits to four -// unsigned 32-bit integers. +// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16 FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) { int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ @@ -7759,8 +8096,9 @@ FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) return vreinterpretq_m128i_s16(s16x8); } -// Converts the four unsigned 8-bit integers in the lower 32 bits to four -// unsigned 32-bit integers. +// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32 FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) { int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ @@ -7769,8 +8107,9 @@ FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) return vreinterpretq_m128i_s32(s32x4); } -// Converts the two signed 8-bit integers in the lower 32 bits to four -// signed 64-bit integers. +// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit +// integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64 FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) { int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ @@ -7780,16 +8119,18 @@ FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) return vreinterpretq_m128i_s64(s64x2); } -// Converts the four unsigned 16-bit integers in the lower 64 bits to four -// unsigned 32-bit integers. +// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32 FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) { return vreinterpretq_m128i_u32( vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); } -// Converts the two unsigned 16-bit integers in the lower 32 bits to two -// unsigned 64-bit integers. +// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, +// and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64 FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) { uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ @@ -7798,8 +8139,9 @@ FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) return vreinterpretq_m128i_u64(u64x2); } -// Converts the two unsigned 32-bit integers in the lower 64 bits to two -// unsigned 64-bit integers. +// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64 FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) { return vreinterpretq_m128i_u64( @@ -7816,9 +8158,9 @@ FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) return vreinterpretq_m128i_u16(u16x8); } -// Converts the four unsigned 8-bit integers in the lower 32 bits to four -// unsigned 32-bit integers. -// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx +// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32 FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) { uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ @@ -7827,8 +8169,9 @@ FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) return vreinterpretq_m128i_u32(u32x4); } -// Converts the two unsigned 8-bit integers in the lower 16 bits to two -// unsigned 64-bit integers. +// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed +// 64-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64 FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) { uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ @@ -7845,11 +8188,11 @@ FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) { // Generate mask value from constant immediate bit value - const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0; - const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0; + const int64_t bit0Mask = imm & 0x01 ? INT64_C(-1) : 0; + const int64_t bit1Mask = imm & 0x02 ? INT64_C(-1) : 0; #if !SSE2NEON_PRECISE_DP - const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0; - const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0; + const int64_t bit4Mask = imm & 0x10 ? INT64_C(-1) : 0; + const int64_t bit5Mask = imm & 0x20 ? INT64_C(-1) : 0; #endif // Conditional multiplication #if !SSE2NEON_PRECISE_DP @@ -7858,7 +8201,7 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask)); __m128d tmp = _mm_and_pd(mul, mulMask); #else -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) * vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0) : 0; @@ -7866,16 +8209,28 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1) : 0; #else - double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0; - double d1 = (imm & 0x20) ? 
((double *) &a)[1] * ((double *) &b)[1] : 0; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double d0 = (imm & 0x10) ? a0 * b0 : 0; + double d1 = (imm & 0x20) ? a1 * b1 : 0; #endif __m128d tmp = _mm_set_pd(d1, d0); #endif // Sum the products -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp)); #else - double sum = *((double *) &tmp) + *(((double *) &tmp) + 1); + double _tmp0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0)); + double _tmp1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1)); + double sum = _tmp0 + _tmp1; #endif // Conditionally store the sum const __m128d sumMask = @@ -7890,65 +8245,99 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) { -#if defined(__aarch64__) - /* shortcuts */ - if (imm == 0xFF) { - return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b))); - } + /* Early exit: no input selected or no output lanes */ + if ((imm & 0xF0) == 0 || (imm & 0x0F) == 0) + return _mm_setzero_ps(); + + float32x4_t prod = vreinterpretq_f32_m128(_mm_mul_ps(a, b)); + +#if SSE2NEON_ARCH_AARCH64 + /* Fast path: all elements, broadcast to all lanes */ + if (imm == 0xFF) + return _mm_set1_ps(vaddvq_f32(prod)); + + /* Fast path: 3-element dot product (x,y,z), broadcast to all lanes */ if (imm == 0x7F) { - float32x4_t m = _mm_mul_ps(a, b); - m[3] = 0; - return _mm_set1_ps(vaddvq_f32(m)); + prod = vsetq_lane_f32(0.0f, prod, 3); + return _mm_set1_ps(vaddvq_f32(prod)); } -#endif - float s = 0, c = 0; - float32x4_t f32a = vreinterpretq_f32_m128(a); - float32x4_t f32b = vreinterpretq_f32_m128(b); + /* Vectorized generic path: apply input mask, sum, apply output mask */ + const uint32_t input_mask[4] = { + (imm & (1 << 4)) ? ~UINT32_C(0) : UINT32_C(0), + (imm & (1 << 5)) ? ~UINT32_C(0) : UINT32_C(0), + (imm & (1 << 6)) ? ~UINT32_C(0) : UINT32_C(0), + (imm & (1 << 7)) ? ~UINT32_C(0) : UINT32_C(0), + }; + prod = vreinterpretq_f32_u32( + vandq_u32(vreinterpretq_u32_f32(prod), vld1q_u32(input_mask))); + + float32x4_t sum = vdupq_n_f32(vaddvq_f32(prod)); + + const uint32_t output_mask[4] = { + (imm & 0x1) ? ~UINT32_C(0) : UINT32_C(0), + (imm & 0x2) ? ~UINT32_C(0) : UINT32_C(0), + (imm & 0x4) ? ~UINT32_C(0) : UINT32_C(0), + (imm & 0x8) ? ~UINT32_C(0) : UINT32_C(0), + }; + return vreinterpretq_m128_f32(vreinterpretq_f32_u32( + vandq_u32(vreinterpretq_u32_f32(sum), vld1q_u32(output_mask)))); +#else + /* ARMv7: scalar fallback (no vaddvq_f32) */ + float s = 0.0f; - /* To improve the accuracy of floating-point summation, Kahan algorithm - * is used for each operation. - */ if (imm & (1 << 4)) - _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]); + s += vgetq_lane_f32(prod, 0); if (imm & (1 << 5)) - _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]); + s += vgetq_lane_f32(prod, 1); if (imm & (1 << 6)) - _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]); + s += vgetq_lane_f32(prod, 2); if (imm & (1 << 7)) - _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]); - s += c; - - float32x4_t res = { - (imm & 0x1) ? s : 0, - (imm & 0x2) ? s : 0, - (imm & 0x4) ? 
s : 0, - (imm & 0x8) ? s : 0, + s += vgetq_lane_f32(prod, 3); + + const float32_t res[4] = { + (imm & 0x1) ? s : 0.0f, + (imm & 0x2) ? s : 0.0f, + (imm & 0x4) ? s : 0.0f, + (imm & 0x8) ? s : 0.0f, }; - return vreinterpretq_m128_f32(res); + return vreinterpretq_m128_f32(vld1q_f32(res)); +#endif } -// Extracts the selected signed or unsigned 32-bit integer from a and zero -// extends. -// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) -#define _mm_extract_epi32(a, imm) \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) - -// Extracts the selected signed or unsigned 64-bit integer from a and zero -// extends. -// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) -#define _mm_extract_epi64(a, imm) \ - vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) - -// Extracts the selected signed or unsigned 8-bit integer from a and zero -// extends. -// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm) +// Extract a 32-bit integer from a, selected with imm8, and store the result in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32 +// FORCE_INLINE int _mm_extract_epi32(__m128i a, const int imm) +// imm must be a compile-time constant in range [0, 3] +#define _mm_extract_epi32(a, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 3), \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))) + +// Extract a 64-bit integer from a, selected with imm8, and store the result in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64 +// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, const int imm) +// imm must be a compile-time constant in range [0, 1] +#define _mm_extract_epi64(a, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 1), \ + vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))) + +// Extract an 8-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8 -#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) +// FORCE_INLINE int _mm_extract_epi8(__m128i a, const int imm) +// imm must be a compile-time constant in range [0, 15] +#define _mm_extract_epi8(a, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 15), \ + vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))) // Extracts the selected single-precision (32-bit) floating-point from a. 
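A usage sketch for the reworked _mm_dp_ps (a hypothetical 3D dot product): the high nibble of imm masks the multiply inputs, the low nibble masks which output lanes receive the sum, so 0x7F is the common "xyz dot, broadcast everywhere" case that now takes the fast path above:

    #include "sse2neon.h"

    static float dot3(__m128 a, __m128 b)
    {
        /* 0x7F: use lanes 0-2 as inputs, write the sum to all four lanes. */
        return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
    }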
-// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) -#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) +// FORCE_INLINE int _mm_extract_ps(__m128 a, const int imm) +// imm must be a compile-time constant in range [0, 3] +#define _mm_extract_ps(a, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 3), \ + vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))) // Round the packed double-precision (64-bit) floating-point elements in a down // to an integer value, and store the results as packed double-precision @@ -7956,11 +8345,13 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd FORCE_INLINE __m128d _mm_floor_pd(__m128d a) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a))); #else - double *f = (double *) &a; - return _mm_set_pd(floor(f[1]), floor(f[0])); + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_pd(floor(a1), floor(a0)); #endif } @@ -7970,10 +8361,10 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps FORCE_INLINE __m128 _mm_floor_ps(__m128 a) { -#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING) +#if SSE2NEON_ARCH_AARCH64 || defined(__ARM_FEATURE_DIRECTED_ROUNDING) return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); #else - float *f = (float *) &a; + float *f = _sse2neon_reinterpret_cast(float *, &a); return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0])); #endif } @@ -7992,80 +8383,70 @@ FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b) // an integer value, store the result as a single-precision floating-point // element in the lower element of dst, and copy the upper 3 packed elements // from a to the upper elements of dst. -// -// dst[31:0] := FLOOR(b[31:0]) -// dst[127:32] := a[127:32] -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_floor_ps(b)); } -// Inserts the least significant 32 bits of b into the selected 32-bit integer -// of a. -// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, -// __constrange(0,4) int imm) -#define _mm_insert_epi32(a, b, imm) \ - __extension__({ \ - vreinterpretq_m128i_s32( \ - vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \ - }) - -// Inserts the least significant 64 bits of b into the selected 64-bit integer -// of a. -// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, -// __constrange(0,2) int imm) -#define _mm_insert_epi64(a, b, imm) \ - __extension__({ \ - vreinterpretq_m128i_s64( \ - vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \ - }) - -// Inserts the least significant 8 bits of b into the selected 8-bit integer -// of a. -// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, -// __constrange(0,16) int imm) -#define _mm_insert_epi8(a, b, imm) \ - __extension__({ \ - vreinterpretq_m128i_s8( \ - vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \ - }) +// Copy a to dst, and insert the 32-bit integer i into dst at the location +// specified by imm8. 
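SSE2NEON_REQUIRE_CONST_RANGE is what turns an out-of-range immediate into a compile-time diagnostic instead of silently reaching vgetq_lane with a bad lane index. Its definition sits outside this hunk; one plausible shape for such a guard, shown purely as an assumption, is an expression-context static check:

/* Hypothetical sketch only: the array type is ill-formed (negative size)
 * whenever the compile-time constant imm lies outside [lo, hi]. */
#define REQUIRE_CONST_RANGE_SKETCH(imm, lo, hi) \
    ((void) sizeof(char[((imm) >= (lo) && (imm) <= (hi)) ? 1 : -1]))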
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32 +// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, const int imm) +// imm must be a compile-time constant in range [0, 3] +#define _mm_insert_epi32(a, b, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 3), \ + vreinterpretq_m128i_s32( \ + vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm)))) + +// Copy a to dst, and insert the 64-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64 +// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, const int imm) +// imm must be a compile-time constant in range [0, 1] +#define _mm_insert_epi64(a, b, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 1), \ + vreinterpretq_m128i_s64( \ + vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm)))) + +// Copy a to dst, and insert the lower 8-bit integer from i into dst at the +// location specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8 +// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, const int imm) +// imm must be a compile-time constant in range [0, 15] +#define _mm_insert_epi8(a, b, imm) \ + (SSE2NEON_REQUIRE_CONST_RANGE(imm, 0, 15), \ + vreinterpretq_m128i_s8( \ + vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm)))) // Copy a to tmp, then insert a single-precision (32-bit) floating-point // element from b into tmp using the control in imm8. Store tmp to dst using // the mask in imm8 (elements are zeroed out when the corresponding bit is set). // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps +// imm8 must be a compile-time constant in range [0, 255] #define _mm_insert_ps(a, b, imm8) \ - __extension__({ \ + _sse2neon_define2( \ + __m128, a, b, SSE2NEON_REQUIRE_CONST_RANGE(imm8, 0, 255); \ float32x4_t tmp1 = \ - vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3), \ - vreinterpretq_f32_m128(a), 0); \ + vsetq_lane_f32(vgetq_lane_f32(_b, ((imm8) >> 6) & 0x3), \ + vreinterpretq_f32_m128(_a), 0); \ float32x4_t tmp2 = \ - vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \ - ((imm8 >> 4) & 0x3)); \ - const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ - ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ - ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ - ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; \ + vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \ + vreinterpretq_f32_m128(_a), (((imm8) >> 4) & 0x3)); \ + const uint32_t data[4] = \ + _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 3)) ? UINT32_MAX : 0); \ uint32x4_t mask = vld1q_u32(data); \ float32x4_t all_zeros = vdupq_n_f32(0); \ \ - vreinterpretq_m128_f32( \ - vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \ - }) + _sse2neon_return(vreinterpretq_m128_f32( \ + vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));) -// epi versions of min/max -// Computes the pariwise maximums of the four signed 32-bit integer values of a -// and b. -// -// A 128-bit parameter that can be defined with the following equations: -// r0 := (a0 > b0) ? a0 : b0 -// r1 := (a1 > b1) ? a1 : b1 -// r2 := (a2 > b2) ? a2 : b2 -// r3 := (a3 > b3) ? 
a3 : b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx +// Compare packed signed 32-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32 FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( @@ -8099,16 +8480,9 @@ FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); } -// Computes the pariwise minima of the four signed 32-bit integer values of a -// and b. -// -// A 128-bit parameter that can be defined with the following equations: -// r0 := (a0 < b0) ? a0 : b0 -// r1 := (a1 < b1) ? a1 : b1 -// r2 := (a2 < b2) ? a2 : b2 -// r3 := (a3 < b3) ? a3 : b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx +// Compare packed signed 32-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32 FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( @@ -8144,62 +8518,43 @@ FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) // Horizontally compute the minimum amongst the packed unsigned 16-bit integers // in a, store the minimum and index in dst, and zero the remaining bits in dst. -// -// index[2:0] := 0 -// min[15:0] := a[15:0] -// FOR j := 0 to 7 -// i := j*16 -// IF a[i+15:i] < min[15:0] -// index[2:0] := j -// min[15:0] := a[i+15:i] -// FI -// ENDFOR -// dst[15:0] := min[15:0] -// dst[18:16] := index[2:0] -// dst[127:19] := 0 -// // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16 FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) { - __m128i dst; uint16_t min, idx = 0; -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 + uint16x8_t _a = vreinterpretq_u16_m128i(a); // Find the minimum value - min = vminvq_u16(vreinterpretq_u16_m128i(a)); + min = vminvq_u16(_a); // Get the index of the minimum value static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7}; uint16x8_t minv = vdupq_n_u16(min); - uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a)); + uint16x8_t cmeq = vceqq_u16(minv, _a); idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq)); #else + uint16x8_t _a = vreinterpretq_u16_m128i(a); // Find the minimum value - __m64 tmp; - tmp = vreinterpret_m64_u16( - vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)), - vget_high_u16(vreinterpretq_u16_m128i(a)))); - tmp = vreinterpret_m64_u16( - vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); - tmp = vreinterpret_m64_u16( - vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); - min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0); + uint16x4_t tmp = vmin_u16(vget_low_u16(_a), vget_high_u16(_a)); + tmp = vpmin_u16(tmp, tmp); + tmp = vpmin_u16(tmp, tmp); + min = vget_lane_u16(tmp, 0); // Get the index of the minimum value int i; for (i = 0; i < 8; i++) { - if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) { - idx = (uint16_t) i; + if (min == vgetq_lane_u16(_a, 0)) { + idx = _sse2neon_static_cast(uint16_t, i); break; } - a = _mm_srli_si128(a, 2); + _a = vreinterpretq_u16_s8( + vextq_s8(vreinterpretq_s8_u16(_a), vreinterpretq_s8_u16(_a), 2)); } #endif // Generate result - dst = _mm_setzero_si128(); - dst = vreinterpretq_m128i_u16( - vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0)); - dst = vreinterpretq_m128i_u16( - vsetq_lane_u16(idx, 
vreinterpretq_u16_m128i(dst), 1)); - return dst; + uint16x8_t result = vdupq_n_u16(0); + result = vsetq_lane_u16(min, result, 0); + result = vsetq_lane_u16(idx, result, 1); + return vreinterpretq_m128i_u16(result); } // Compute the sum of absolute differences (SADs) of quadruplets of unsigned @@ -8223,8 +8578,10 @@ FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) vreinterpretq_u32_m128i(a), 1)); break; default: -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_COMPILER_GCC_COMPAT __builtin_unreachable(); +#elif SSE2NEON_COMPILER_MSVC + __assume(0); #endif break; } @@ -8247,8 +8604,10 @@ FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3))); break; default: -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_COMPILER_GCC_COMPAT __builtin_unreachable(); +#elif SSE2NEON_COMPILER_MSVC + __assume(0); #endif break; } @@ -8262,7 +8621,7 @@ FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b)); uint8x16_t _a_3 = vextq_u8(_a, _a, 3); c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b)); -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 // |0|4|2|6| c04 = vpaddq_s16(c04, c26); // |1|5|3|7| @@ -8288,9 +8647,7 @@ FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) // Multiply the low signed 32-bit integers from each packed 64-bit element in // a and b, and store the signed 64-bit results in dst. -// -// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0 -// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2 +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32 FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) { // vmull_s32 upcasts instead of masking, so we downcast. @@ -8299,26 +8656,18 @@ FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); } -// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or -// unsigned 32-bit integers from b. -// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx +// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit +// integers, and store the low 32 bits of the intermediate integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32 FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } -// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit -// integers and saturates. -// -// r0 := UnsignedSaturate(a0) -// r1 := UnsignedSaturate(a1) -// r2 := UnsignedSaturate(a2) -// r3 := UnsignedSaturate(a3) -// r4 := UnsignedSaturate(b0) -// r5 := UnsignedSaturate(b1) -// r6 := UnsignedSaturate(b2) -// r7 := UnsignedSaturate(b3) +// Convert packed signed 32-bit integers from a and b to packed 16-bit integers +// using unsigned saturation, and store the results in dst. 
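For reference, both _mm_minpos_epu16 paths above must reproduce the following scalar behavior (a sketch; the helper name is illustrative):

#include <stdint.h>

/* Find the smallest of the eight u16 lanes and the index of its first
 * occurrence; the intrinsic packs these into dst[15:0] and dst[18:16]
 * and zeroes the remaining bits. */
static void minpos_epu16_ref(const uint16_t a[8],
                             uint16_t *min_out, uint16_t *idx_out)
{
    uint16_t min = a[0], idx = 0;
    for (uint16_t j = 1; j < 8; j++) {
        if (a[j] < min) {
            min = a[j];
            idx = j;
        }
    }
    *min_out = min;
    *idx_out = idx;
}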
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32 FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( @@ -8332,23 +8681,25 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) { -#if defined(__aarch64__) + rounding &= ~(_MM_FROUND_RAISE_EXC | _MM_FROUND_NO_EXC); + +#if SSE2NEON_ARCH_AARCH64 switch (rounding) { - case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + case _MM_FROUND_TO_NEAREST_INT: return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a))); - case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + case _MM_FROUND_TO_NEG_INF: return _mm_floor_pd(a); - case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + case _MM_FROUND_TO_POS_INF: return _mm_ceil_pd(a); - case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + case _MM_FROUND_TO_ZERO: return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a))); default: //_MM_FROUND_CUR_DIRECTION return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a))); } #else - double *v_double = (double *) &a; + double *v_double = _sse2neon_reinterpret_cast(double *, &a); - if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || + if (rounding == _MM_FROUND_TO_NEAREST_INT || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { double res[2], tmp; @@ -8381,11 +8732,11 @@ FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) res[i] = (v_double[i] < 0) ? -res[i] : res[i]; } return _mm_set_pd(res[1], res[0]); - } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || + } else if (rounding == _MM_FROUND_TO_NEG_INF || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { return _mm_floor_pd(a); - } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || + } else if (rounding == _MM_FROUND_TO_POS_INF || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { return _mm_ceil_pd(a); @@ -8401,48 +8752,58 @@ FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) { -#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING) + rounding &= ~(_MM_FROUND_RAISE_EXC | _MM_FROUND_NO_EXC); + +#if SSE2NEON_ARCH_AARCH64 || defined(__ARM_FEATURE_DIRECTED_ROUNDING) switch (rounding) { - case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + case _MM_FROUND_TO_NEAREST_INT: return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); - case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + case _MM_FROUND_TO_NEG_INF: return _mm_floor_ps(a); - case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + case _MM_FROUND_TO_POS_INF: return _mm_ceil_ps(a); - case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + case _MM_FROUND_TO_ZERO: return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); default: //_MM_FROUND_CUR_DIRECTION return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); } #else - float *v_float = (float *) &a; + float *v_float = _sse2neon_reinterpret_cast(float *, &a); + float32x4_t v = vreinterpretq_f32_m128(a); + + /* Detect values safe to convert to int32. 
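Stripping _MM_FROUND_RAISE_EXC and _MM_FROUND_NO_EXC up front works because the exception-suppression bits never change the rounding direction, so each direction collapses to a single switch case. A scalar model of the dispatch (a sketch assuming the conventional _MM_FROUND_* encodings, TO_NEAREST_INT = 0 through CUR_DIRECTION = 4):

#include <math.h>

static double round_ref(double x, int rounding)
{
    switch (rounding & 0x7) {
    case 0: return nearbyint(x);  /* TO_NEAREST_INT: ties to even in the
                                   * default FP environment */
    case 1: return floor(x);      /* TO_NEG_INF */
    case 2: return ceil(x);       /* TO_POS_INF */
    case 3: return trunc(x);      /* TO_ZERO */
    default: return nearbyint(x); /* CUR_DIRECTION: honor the current mode */
    }
}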
Values outside this range + * (including infinity, NaN, and large finite values) must be preserved + * as-is since integer conversion would produce undefined results. */ + const float32x4_t max_representable = vdupq_n_f32(2147483520.0f); + uint32x4_t is_safe = + vcleq_f32(vabsq_f32(v), max_representable); /* |v| <= max int32 */ - if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || + if (rounding == _MM_FROUND_TO_NEAREST_INT || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { uint32x4_t signmask = vdupq_n_u32(0x80000000); - float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), - vdupq_n_f32(0.5f)); /* +/- 0.5 */ - int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( - vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ - int32x4_t r_trunc = vcvtq_s32_f32( - vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + float32x4_t half = + vbslq_f32(signmask, v, vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = + vcvtq_s32_f32(vaddq_f32(v, half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = vcvtq_s32_f32(v); /* truncate to integer: [a] */ int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ float32x4_t delta = vsubq_f32( - vreinterpretq_f32_m128(a), - vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + v, vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ - return vreinterpretq_m128_f32( - vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal))); - } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || + float32x4_t rounded = + vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)); + /* Preserve original value for inputs outside int32 range */ + return vreinterpretq_m128_f32(vbslq_f32(is_safe, rounded, v)); + } else if (rounding == _MM_FROUND_TO_NEG_INF || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { return _mm_floor_ps(a); - } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || + } else if (rounding == _MM_FROUND_TO_POS_INF || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { return _mm_ceil_ps(a); @@ -8487,16 +8848,16 @@ FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding) // Load 128-bits of integer data from memory into dst using a non-temporal // memory hint. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. -// -// dst[127:0] := MEM[mem_addr+127:mem_addr] -// +// Note: On AArch64, __builtin_nontemporal_load generates LDNP (Load +// Non-temporal Pair), providing true non-temporal hint for 128-bit loads. 
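The branch-free round-to-nearest-even sequence above is easier to follow on a single lane. A plain-C sketch of the same computation, valid only for the inputs the is_safe mask admits:

static float round_half_even_ref(float x)
{
    float half = (x < 0.0f) ? -0.5f : 0.5f;
    int r_normal = (int) (x + half);  /* round half away from zero */
    int r_trunc = (int) x;            /* truncate toward zero */
    /* 1 iff r_trunc is strictly positive (sign bit of -r_trunc) */
    int plusone = (int) ((unsigned) (-r_trunc) >> 31);
    int r_even = (r_trunc + plusone) & ~1; /* nearest even integer */
    float delta = x - (float) r_trunc;
    return (float) ((delta == half) ? r_even : r_normal);
}

For example, 1.5f and 2.5f both round to 2.0f, matching SSE's default ties-to-even behavior.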
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128 FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) { -#if __has_builtin(__builtin_nontemporal_store) +#if __has_builtin(__builtin_nontemporal_load) return __builtin_nontemporal_load(p); #else - return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); + return vreinterpretq_m128i_s64( + vld1q_s64(_sse2neon_reinterpret_cast(int64_t *, p))); #endif } @@ -8505,8 +8866,9 @@ FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones FORCE_INLINE int _mm_test_all_ones(__m128i a) { - return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == - ~(uint64_t) 0; + return _sse2neon_static_cast(uint64_t, + vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == + ~_sse2neon_static_cast(uint64_t, 0); } // Compute the bitwise AND of 128 bits (representing integer data) in a and @@ -8525,14 +8887,22 @@ FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) // zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, // otherwise return 0. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero +// Note: Argument names may be wrong in the Intel intrinsics guide. FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) { - uint64x2_t zf = - vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a)); - uint64x2_t cf = - vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a)); - uint64x2_t result = vandq_u64(zf, cf); - return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1)); + uint64x2_t v = vreinterpretq_u64_m128i(a); + uint64x2_t m = vreinterpretq_u64_m128i(mask); + + // find ones (set-bits) and zeros (clear-bits) under clip mask + uint64x2_t ones = vandq_u64(m, v); + uint64x2_t zeros = vbicq_u64(m, v); + + // If both 128-bit variables are populated (non-zero) then return 1. + // For comparison purposes, first compact each var down to 32-bits. 
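The three SSE4.1 test intrinsics differ only in which flag they report. A scalar model over the two 64-bit halves (an illustrative sketch):

#include <stdint.h>

static int testz_ref(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    return ((a0 & b0) | (a1 & b1)) == 0;   /* ZF: a AND b is all zeros */
}

static int testc_ref(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    return ((~a0 & b0) | (~a1 & b1)) == 0; /* CF: b AND NOT a is all zeros */
}

/* _mm_test_mix_ones_zeros(a, mask): 1 iff mask selects both ones and
 * zeros from a, i.e. ZF == 0 and CF == 0. */
static int test_mix_ref(uint64_t a0, uint64_t a1, uint64_t m0, uint64_t m1)
{
    return !testz_ref(a0, a1, m0, m1) && !testc_ref(a0, a1, m0, m1);
}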
+ uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros)); + + // if folding minimum is non-zero then both vars must be non-zero + return (vget_lane_u32(vpmin_u32(reduced, reduced), 0) != 0); } // Compute the bitwise AND of 128 bits (representing integer data) in a and b, @@ -8542,9 +8912,9 @@ FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) { - int64x2_t s64 = + int64x2_t s64_vec = vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)); - return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); + return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1)); } // Compute the bitwise AND of 128 bits (representing integer data) in a and b, @@ -8562,17 +8932,17 @@ FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) { - int64x2_t s64 = + int64x2_t s64_vec = vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); - return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); + return !(vgetq_lane_s64(s64_vec, 0) | vgetq_lane_s64(s64_vec, 1)); } /* SSE4.2 */ -const static uint16_t _sse2neon_cmpestr_mask16b[8] ALIGN_STRUCT(16) = { +static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, }; -const static uint8_t _sse2neon_cmpestr_mask8b[16] ALIGN_STRUCT(16) = { +static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, }; @@ -8732,40 +9102,40 @@ const static uint8_t _sse2neon_cmpestr_mask8b[16] ALIGN_STRUCT(16) = { SSE2NEON_CAT(u, size))) \ } while (0) -#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \ - static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \ - int lb) \ - { \ - __m128i mtx[16]; \ - PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ - SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ - return SSE2NEON_CAT( \ - _sse2neon_aggregate_equal_any_, \ - SSE2NEON_CAT( \ - SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ - SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ - type))))(la, lb, mtx); \ +#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \ + static uint16_t _sse2neon_cmp_##type##_equal_any(__m128i a, int la, \ + __m128i b, int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_equal_any_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ + type))))(la, lb, mtx); \ } -#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \ - static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \ - int lb) \ - { \ - __m128i mtx[16]; \ - PCMPSTR_RANGES( \ - a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ - SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \ - return SSE2NEON_CAT( \ - _sse2neon_aggregate_ranges_, \ - SSE2NEON_CAT( \ - SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ - SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ - type))))(la, lb, mtx); \ +#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \ + static uint16_t 
_sse2neon_cmp_##us##type##_ranges(__m128i a, int la, \ + __m128i b, int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_RANGES( \ + a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_ranges_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ + type))))(la, lb, mtx); \ } #define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \ - static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \ - __m128i b, int lb) \ + static uint16_t _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \ + __m128i b, int lb) \ { \ __m128i mtx[16]; \ PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ @@ -8779,40 +9149,88 @@ const static uint8_t _sse2neon_cmpestr_mask8b[16] ALIGN_STRUCT(16) = { SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \ } -static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16]) +static uint16_t _sse2neon_aggregate_equal_any_8x16(int la, + int lb, + __m128i mtx[16]) { - int res = 0; - int m = (1 << la) - 1; - uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); - uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); - uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); - uint8x16_t vec = vcombine_u8(t_lo, t_hi); - for (int j = 0; j < lb; j++) { - mtx[j] = vreinterpretq_m128i_u8( - vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); - mtx[j] = vreinterpretq_m128i_u8( - vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); - int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0; - res |= (tmp << j); + int m = (1 << la) - 1; + uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + uint8x8_t t_lo = + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m & 0xff)), vec_mask); + uint8x8_t t_hi = + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m >> 8)), vec_mask); + uint8x16_t vec = vcombine_u8(t_lo, t_hi); + + /* Process all 16 rows in parallel. + * For each row j, check if any element in mtx[j] (masked by vec) is + * non-zero. Result bit j = 1 if row j has any match. + * + * Key optimization: Process all rows, then mask by lb at the end. + * This allows full SIMD utilization without loop-carried dependencies. + */ +#if SSE2NEON_ARCH_AARCH64 + /* AArch64: Use vmaxvq for horizontal max (equivalent to OR for 0/1) */ +#define SSE2NEON_UMAXV_MATCH(i) \ + ((vmaxvq_u8(vandq_u8(vec, vreinterpretq_u8_m128i(mtx[i]))) ? 1U : 0U) \ + << (i)) + uint16_t res = _sse2neon_static_cast( + uint16_t, (SSE2NEON_UMAXV_MATCH(0) | SSE2NEON_UMAXV_MATCH(1) | + SSE2NEON_UMAXV_MATCH(2) | SSE2NEON_UMAXV_MATCH(3) | + SSE2NEON_UMAXV_MATCH(4) | SSE2NEON_UMAXV_MATCH(5) | + SSE2NEON_UMAXV_MATCH(6) | SSE2NEON_UMAXV_MATCH(7) | + SSE2NEON_UMAXV_MATCH(8) | SSE2NEON_UMAXV_MATCH(9) | + SSE2NEON_UMAXV_MATCH(10) | SSE2NEON_UMAXV_MATCH(11) | + SSE2NEON_UMAXV_MATCH(12) | SSE2NEON_UMAXV_MATCH(13) | + SSE2NEON_UMAXV_MATCH(14) | SSE2NEON_UMAXV_MATCH(15)) & + 0xFFFFu); +#undef SSE2NEON_UMAXV_MATCH +#else + /* ARMv7: Use OR-based horizontal reduction (faster than vpmax cascade). + * The _sse2neon_any_nonzero_u8x16 helper uses 3 OR ops vs 4 vpmax ops. + */ + uint16_t res = 0; + for (int j = 0; j < 16; j++) { + uint8x16_t masked = vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])); + res |= (_sse2neon_any_nonzero_u8x16(masked) ? 
1U : 0U) << j; } - return res; +#endif + /* Mask result to valid range based on lb */ + return res & _sse2neon_static_cast(uint16_t, (1 << lb) - 1); } -static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16]) +static uint16_t _sse2neon_aggregate_equal_any_16x8(int la, + int lb, + __m128i mtx[16]) { - int res = 0; - int m = (1 << la) - 1; + uint16_t m = _sse2neon_static_cast(uint16_t, 1 << la) - 1; uint16x8_t vec = vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b)); - for (int j = 0; j < lb; j++) { - mtx[j] = vreinterpretq_m128i_u16( - vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); - mtx[j] = vreinterpretq_m128i_u16( - vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); - int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0; - res |= (tmp << j); + + /* Process all 8 rows in parallel for 16-bit word mode. + * Result bit j = 1 if any element in row j matches. + */ +#if SSE2NEON_ARCH_AARCH64 + /* AArch64: Use vmaxvq for horizontal max */ +#define SSE2NEON_UMAXV_MATCH16(i) \ + ((vmaxvq_u16(vandq_u16(vec, vreinterpretq_u16_m128i(mtx[i]))) ? 1U : 0U) \ + << (i)) + uint16_t res = _sse2neon_static_cast( + uint16_t, (SSE2NEON_UMAXV_MATCH16(0) | SSE2NEON_UMAXV_MATCH16(1) | + SSE2NEON_UMAXV_MATCH16(2) | SSE2NEON_UMAXV_MATCH16(3) | + SSE2NEON_UMAXV_MATCH16(4) | SSE2NEON_UMAXV_MATCH16(5) | + SSE2NEON_UMAXV_MATCH16(6) | SSE2NEON_UMAXV_MATCH16(7)) & + 0xFFu); +#undef SSE2NEON_UMAXV_MATCH16 +#else + /* ARMv7: Use OR-based horizontal reduction */ + uint16_t res = 0; + for (int j = 0; j < 8; j++) { + uint16x8_t masked = vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])); + res |= (_sse2neon_any_nonzero_u16x8(masked) ? 1U : 0U) << j; } - return res; +#endif + /* Mask result to valid range based on lb */ + return res & _sse2neon_static_cast(uint16_t, (1 << lb) - 1); } /* clang-format off */ @@ -8823,12 +9241,51 @@ static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16]) SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_) -static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16]) +static uint16_t _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16]) { - int res = 0; - int m = (1 << la) - 1; + uint16_t m = _sse2neon_static_cast(uint16_t, 1 << la) - 1; uint16x8_t vec = vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b)); + +#if SSE2NEON_ARCH_AARCH64 + /* Vectorized: process all 8 rows in parallel using vmaxvq. + * For RANGES mode with word elements: + * - Each row has 8 u16 values representing comparisons with 4 range pairs + * - Adjacent u16 elements [2k, 2k+1] form a range: (char >= low, char <= + * high) + * - Result bit j = 1 if any range pair matches for haystack position j + * + * Algorithm per row: + * 1. Mask by la validity: vand(vec, mtx[i]) + * 2. Swap adjacent u16 pairs: vrev32 swaps within each 32-bit lane + * 3. Pair-AND: AND original with swapped to get [m0&m1, m0&m1, ...] + * 4. Horizontal OR via vmaxvq_u16 (faster than vmaxvq_u32) + */ +#define SSE2NEON_RANGES_MATCH16(i) \ + do { \ + uint16x8_t masked = vandq_u16(vec, vreinterpretq_u16_m128i(mtx[i])); \ + uint16x8_t swapped = vrev32q_u16(masked); \ + uint16x8_t pair_and = vandq_u16(masked, swapped); \ + res |= _sse2neon_static_cast(uint16_t, \ + (vmaxvq_u16(pair_and) ? 
1U : 0U) << i); \ + } while (0) + + uint16_t res = 0; + SSE2NEON_RANGES_MATCH16(0); + SSE2NEON_RANGES_MATCH16(1); + SSE2NEON_RANGES_MATCH16(2); + SSE2NEON_RANGES_MATCH16(3); + SSE2NEON_RANGES_MATCH16(4); + SSE2NEON_RANGES_MATCH16(5); + SSE2NEON_RANGES_MATCH16(6); + SSE2NEON_RANGES_MATCH16(7); +#undef SSE2NEON_RANGES_MATCH16 + + /* Mask result to valid range based on lb */ + return res & _sse2neon_static_cast(uint16_t, (1 << lb) - 1); +#else + /* ARMv7 fallback: sequential loop */ + uint16_t res = 0; for (int j = 0; j < lb; j++) { mtx[j] = vreinterpretq_m128i_u16( vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); @@ -8838,25 +9295,70 @@ static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16]) vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16)); uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]), vreinterpretq_u32_m128i(tmp)); -#if defined(__aarch64__) - int t = vaddvq_u32(vec_res) ? 1 : 0; -#else uint64x2_t sumh = vpaddlq_u32(vec_res); - int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1); -#endif + uint16_t t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1); res |= (t << j); } return res; +#endif } -static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16]) +static uint16_t _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16]) { - int res = 0; - int m = (1 << la) - 1; + uint16_t m = _sse2neon_static_cast(uint16_t, (1 << la) - 1); uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); - uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); - uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); + uint8x8_t t_lo = + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m & 0xff)), vec_mask); + uint8x8_t t_hi = + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m >> 8)), vec_mask); uint8x16_t vec = vcombine_u8(t_lo, t_hi); + +#if SSE2NEON_ARCH_AARCH64 + /* Vectorized: process all 16 rows in parallel using vmaxvq. + * For RANGES mode with byte elements: + * - Each row has 16 bytes representing comparisons with 8 range pairs + * - Adjacent bytes [2k, 2k+1] form a range: (char >= low, char <= high) + * - Result bit j = 1 if any range pair matches for haystack position j + * + * Algorithm per row: + * 1. Mask by la validity: vand(vec, mtx[i]) + * 2. Swap adjacent bytes: vrev16 swaps within each 16-bit lane + * 3. Pair-AND: AND original with swapped to get [b0&b1, b0&b1, ...] + * 4. Horizontal OR via vmaxvq_u8 (faster than vmaxvq_u16) + */ +#define SSE2NEON_RANGES_MATCH8(i) \ + do { \ + uint8x16_t masked = vandq_u8(vec, vreinterpretq_u8_m128i(mtx[i])); \ + uint8x16_t swapped = vrev16q_u8(masked); \ + uint8x16_t pair_and = vandq_u8(masked, swapped); \ + res |= _sse2neon_static_cast(uint16_t, (vmaxvq_u8(pair_and) ? 
1U : 0U) \ + << i); \ + } while (0) + + uint16_t res = 0; + SSE2NEON_RANGES_MATCH8(0); + SSE2NEON_RANGES_MATCH8(1); + SSE2NEON_RANGES_MATCH8(2); + SSE2NEON_RANGES_MATCH8(3); + SSE2NEON_RANGES_MATCH8(4); + SSE2NEON_RANGES_MATCH8(5); + SSE2NEON_RANGES_MATCH8(6); + SSE2NEON_RANGES_MATCH8(7); + SSE2NEON_RANGES_MATCH8(8); + SSE2NEON_RANGES_MATCH8(9); + SSE2NEON_RANGES_MATCH8(10); + SSE2NEON_RANGES_MATCH8(11); + SSE2NEON_RANGES_MATCH8(12); + SSE2NEON_RANGES_MATCH8(13); + SSE2NEON_RANGES_MATCH8(14); + SSE2NEON_RANGES_MATCH8(15); +#undef SSE2NEON_RANGES_MATCH8 + + /* Mask result to valid range based on lb */ + return res & _sse2neon_static_cast(uint16_t, (1 << lb) - 1); +#else + /* ARMv7 fallback: sequential loop */ + uint16_t res = 0; for (int j = 0; j < lb; j++) { mtx[j] = vreinterpretq_m128i_u8( vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); @@ -8866,10 +9368,11 @@ static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16]) vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8)); uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]), vreinterpretq_u16_m128i(tmp)); - int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0; + uint16_t t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0; res |= (t << j); } return res; +#endif } #define SSE2NEON_CMP_RANGES_IS_BYTE 1 @@ -8888,22 +9391,29 @@ SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_) #undef SSE2NEON_CMP_RANGES_IS_BYTE #undef SSE2NEON_CMP_RANGES_IS_WORD -static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb) +static uint16_t _sse2neon_cmp_byte_equal_each(__m128i a, + int la, + __m128i b, + int lb) { uint8x16_t mtx = vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)); - int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb)); - int m1 = 0x10000 - (1 << la); - int tb = 0x10000 - (1 << lb); + uint16_t m0 = + _sse2neon_static_cast(uint16_t, (la < lb) ? 
0 : (1 << la) - (1 << lb)); + uint16_t m1 = _sse2neon_static_cast(uint16_t, 0x10000 - (1 << la)); + uint16_t tb = _sse2neon_static_cast(uint16_t, 0x10000 - (1 << lb)); uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi; uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi; vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); - vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask); - vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask); - vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask); - vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask); - tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask); - tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask); + vec0_lo = vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m0)), vec_mask); + vec0_hi = + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m0 >> 8)), vec_mask); + vec1_lo = vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m1)), vec_mask); + vec1_hi = + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m1 >> 8)), vec_mask); + tmp_lo = vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, tb)), vec_mask); + tmp_hi = + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, tb >> 8)), vec_mask); res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx)); res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx)); @@ -8912,17 +9422,21 @@ static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb) res_lo = vand_u8(res_lo, vec_mask); res_hi = vand_u8(res_hi, vec_mask); - int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8); - return res; + return _sse2neon_vaddv_u8(res_lo) + + _sse2neon_static_cast(uint16_t, _sse2neon_vaddv_u8(res_hi) << 8); } -static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb) +static uint16_t _sse2neon_cmp_word_equal_each(__m128i a, + int la, + __m128i b, + int lb) { uint16x8_t mtx = vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); - int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb)); - int m1 = 0x100 - (1 << la); - int tb = 0x100 - (1 << lb); + uint16_t m0 = _sse2neon_static_cast( + uint16_t, (la < lb) ? 0 : ((1 << la) - (1 << lb))); + uint16_t m1 = _sse2neon_static_cast(uint16_t, 0x100 - (1 << la)); + uint16_t tb = _sse2neon_static_cast(uint16_t, 0x100 - (1 << lb)); uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b); uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask); uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask); @@ -8933,53 +9447,308 @@ static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb) return _sse2neon_vaddvq_u16(mtx); } -#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1 -#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0 +/* EQUAL_ORDERED aggregation for 8x16 (byte mode). + * The algorithm checks where string a appears in string b. + * For result bit i: AND together mtx[i][0] & mtx[i+1][1] & mtx[i+2][2] & ... + * + * Vectorization approach: transpose matrix FIRST, then apply masking to + * transposed matrix, then use vextq diagonal extraction. + * After transpose: mtx_T[j][i] = mtx[i][j] = (a[j] == b[i]) + * vextq on mtx_T gives: result[i] = mtx_T[0][i] & mtx_T[1][i+1] & ... + * = mtx[i][0] & mtx[i+1][1] & ... (correct!) + */ +static uint16_t _sse2neon_aggregate_equal_ordered_8x16(int bound, + int la, + int lb, + __m128i mtx[16]) +{ +#if SSE2NEON_ARCH_AARCH64 + uint8x16_t rows[16]; + for (int i = 0; i < 16; i++) + rows[i] = vreinterpretq_u8_m128i(mtx[i]); + + /* Transpose the 16x16 byte matrix using hierarchical vtrn operations. 
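Before the transpose details, the contract being implemented is worth pinning down. A scalar reference for byte-mode EQUAL_ORDERED (a sketch; the helper name is illustrative):

#include <stdint.h>

/* Bit i of the result is set iff the needle a[0..la) matches b starting
 * at position i: needle positions beyond la always match, haystack
 * positions beyond lb never do. */
static uint16_t equal_ordered_ref8(const uint8_t a[16], int la,
                                   const uint8_t b[16], int lb)
{
    uint16_t res = 0;
    for (int i = 0; i < 16; i++) {
        int ok = 1;
        for (int j = 0; j < la; j++) {
            if (i + j >= lb || a[j] != b[i + j]) {
                ok = 0;
                break;
            }
        }
        res |= (uint16_t) (ok << i);
    }
    return res;
}

The transpose-plus-vextq scheme computes exactly this AND along the diagonal, for all sixteen start positions at once.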
+ * After transpose: rows[j][i] = original mtx[i][j] + */ + /* Level 1: Transpose 2x2 blocks of 8-bit elements */ + for (int i = 0; i < 16; i += 2) { + uint8x16x2_t t = vtrnq_u8(rows[i], rows[i + 1]); + rows[i] = t.val[0]; + rows[i + 1] = t.val[1]; + } -#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \ - static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \ - int bound, int la, int lb, __m128i mtx[16]) \ - { \ - int res = 0; \ - int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \ - uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \ - vld1_u##size(_sse2neon_cmpestr_mask##size##b), \ - vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \ - uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \ - vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \ - vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \ - vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \ - uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \ - uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \ - for (int j = 0; j < lb; j++) { \ - mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \ - vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \ - } \ - for (int j = lb; j < bound; j++) { \ - mtx[j] = vreinterpretq_m128i_u##size( \ - vbslq_u##size(vec1, vec_minusone, vec_zero)); \ - } \ - unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \ - (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \ - for (int i = 0; i < bound; i++) { \ - int val = 1; \ - for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \ - val &= ptr[k * bound + j]; \ - res += val << i; \ - } \ - return res; \ + /* Level 2: Transpose 2x2 blocks of 16-bit elements */ + for (int i = 0; i < 16; i += 4) { + uint16x8x2_t t0 = vtrnq_u16(vreinterpretq_u16_u8(rows[i]), + vreinterpretq_u16_u8(rows[i + 2])); + uint16x8x2_t t1 = vtrnq_u16(vreinterpretq_u16_u8(rows[i + 1]), + vreinterpretq_u16_u8(rows[i + 3])); + rows[i] = vreinterpretq_u8_u16(t0.val[0]); + rows[i + 2] = vreinterpretq_u8_u16(t0.val[1]); + rows[i + 1] = vreinterpretq_u8_u16(t1.val[0]); + rows[i + 3] = vreinterpretq_u8_u16(t1.val[1]); } -/* clang-format off */ -#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \ - prefix##IMPL(8, 16, prefix##IS_UBYTE) \ - prefix##IMPL(16, 8, prefix##IS_UWORD) -/* clang-format on */ + /* Level 3: Transpose 2x2 blocks of 32-bit elements */ + for (int i = 0; i < 16; i += 8) { + uint32x4x2_t t0 = vtrnq_u32(vreinterpretq_u32_u8(rows[i]), + vreinterpretq_u32_u8(rows[i + 4])); + uint32x4x2_t t1 = vtrnq_u32(vreinterpretq_u32_u8(rows[i + 1]), + vreinterpretq_u32_u8(rows[i + 5])); + uint32x4x2_t t2 = vtrnq_u32(vreinterpretq_u32_u8(rows[i + 2]), + vreinterpretq_u32_u8(rows[i + 6])); + uint32x4x2_t t3 = vtrnq_u32(vreinterpretq_u32_u8(rows[i + 3]), + vreinterpretq_u32_u8(rows[i + 7])); + rows[i] = vreinterpretq_u8_u32(t0.val[0]); + rows[i + 4] = vreinterpretq_u8_u32(t0.val[1]); + rows[i + 1] = vreinterpretq_u8_u32(t1.val[0]); + rows[i + 5] = vreinterpretq_u8_u32(t1.val[1]); + rows[i + 2] = vreinterpretq_u8_u32(t2.val[0]); + rows[i + 6] = vreinterpretq_u8_u32(t2.val[1]); + rows[i + 3] = vreinterpretq_u8_u32(t3.val[0]); + rows[i + 7] = vreinterpretq_u8_u32(t3.val[1]); + } + + /* Level 4: Swap 64-bit halves between row pairs */ + { + uint8x16_t tmp; +#define SSE2NEON_SWAP_HL_8(a, b) \ + tmp = vcombine_u8(vget_low_u8(a), vget_low_u8(b)); \ + b = vcombine_u8(vget_high_u8(a), vget_high_u8(b)); \ + a = tmp; + + SSE2NEON_SWAP_HL_8(rows[0], rows[8]); + 
SSE2NEON_SWAP_HL_8(rows[1], rows[9]); + SSE2NEON_SWAP_HL_8(rows[2], rows[10]); + SSE2NEON_SWAP_HL_8(rows[3], rows[11]); + SSE2NEON_SWAP_HL_8(rows[4], rows[12]); + SSE2NEON_SWAP_HL_8(rows[5], rows[13]); + SSE2NEON_SWAP_HL_8(rows[6], rows[14]); + SSE2NEON_SWAP_HL_8(rows[7], rows[15]); +#undef SSE2NEON_SWAP_HL_8 + } + + /* Apply masking to TRANSPOSED matrix: + * - Rows j >= la: set entire row to 0xFF (needle positions beyond la) + * - For rows j < la: columns k >= lb set to 0x00 (force AND fail for + * positions that would access haystack beyond lb) + * + * lb_valid has bits set for valid positions (0..lb-1) + * lb_clear has 0xFF for positions < lb, 0x00 for positions >= lb + */ + uint8x16_t vec_ff = vdupq_n_u8(0xFF); + uint16_t lb_valid = + _sse2neon_static_cast(uint16_t, (1U << lb) - 1); /* e.g. lb=6: 0x003F */ + uint8x8_t pos_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + uint8x16_t lb_clear = vcombine_u8( + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, lb_valid)), pos_mask), + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, lb_valid >> 8)), + pos_mask)); + + for (int j = 0; j < la; j++) { + rows[j] = vandq_u8(rows[j], lb_clear); /* clear positions >= lb */ + } + for (int j = la; j < 16; j++) { + rows[j] = vec_ff; + } + + /* vextq diagonal extraction: shift row k by k, then AND all rows. + * result[i] = rows[0][i] & rows[1][i+1] & rows[2][i+2] & ... + */ + uint8x16_t result = vec_ff; + +/* Shift row K by K positions, filling with 0xFF, then AND into result */ +#define SSE2NEON_VEXT_AND_8(K) \ + do { \ + uint8x16_t shifted = vextq_u8(rows[K], vec_ff, K); \ + result = vandq_u8(result, shifted); \ + } while (0) + + SSE2NEON_VEXT_AND_8(0); + SSE2NEON_VEXT_AND_8(1); + SSE2NEON_VEXT_AND_8(2); + SSE2NEON_VEXT_AND_8(3); + SSE2NEON_VEXT_AND_8(4); + SSE2NEON_VEXT_AND_8(5); + SSE2NEON_VEXT_AND_8(6); + SSE2NEON_VEXT_AND_8(7); + SSE2NEON_VEXT_AND_8(8); + SSE2NEON_VEXT_AND_8(9); + SSE2NEON_VEXT_AND_8(10); + SSE2NEON_VEXT_AND_8(11); + SSE2NEON_VEXT_AND_8(12); + SSE2NEON_VEXT_AND_8(13); + SSE2NEON_VEXT_AND_8(14); + SSE2NEON_VEXT_AND_8(15); + +#undef SSE2NEON_VEXT_AND_8 + + /* Convert result to bitmask: each lane is 0xFF (match) or 0x00 (no match). + * Extract MSB of each byte to form 16-bit result using _mm_movemask_epi8 + * approach: shift right to get MSB in LSB, position each bit, sum halves. 
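The MSB extraction used here mirrors _mm_movemask_epi8. What the vshl/vaddv sequence produces, as a lane-by-lane scalar sketch:

#include <stdint.h>

/* Pack the most significant bit of each of 16 bytes into a 16-bit mask,
 * bit i taken from byte i. */
static uint16_t movemask_u8x16_ref(const uint8_t v[16])
{
    uint16_t m = 0;
    for (int i = 0; i < 16; i++)
        m |= (uint16_t) (((v[i] >> 7) & 1) << i);
    return m;
}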
+ */ + uint8x16_t msbs = vshrq_n_u8(result, 7); + static const int8_t shift_table[16] = {0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7}; + int8x16_t shifts = vld1q_s8(shift_table); + uint8x16_t positioned = vshlq_u8(msbs, shifts); + return _sse2neon_static_cast(uint16_t, + vaddv_u8(vget_low_u8(positioned)) | + (vaddv_u8(vget_high_u8(positioned)) << 8)); +#else + /* ARMv7 fallback: apply masking and use scalar extraction */ + uint16_t m1 = _sse2neon_static_cast(uint16_t, 0x10000 - (1 << la)); + uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + uint8x16_t vec1 = vcombine_u8( + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m1)), vec_mask), + vtst_u8(vdup_n_u8(_sse2neon_static_cast(uint8_t, m1 >> 8)), vec_mask)); + uint8x16_t vec_minusone = vdupq_n_u8(0xFF); + uint8x16_t vec_zero = vdupq_n_u8(0); + + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u8( + vbslq_u8(vec1, vec_minusone, vreinterpretq_u8_m128i(mtx[j]))); + } + for (int j = lb; j < bound; j++) { + mtx[j] = vreinterpretq_m128i_u8(vbslq_u8(vec1, vec_minusone, vec_zero)); + } -SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_) + uint16_t res = 0; + unsigned char *ptr = _sse2neon_reinterpret_cast(unsigned char *, mtx); + for (int i = 0; i < bound; i++) { + int val = 1; + for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) + val &= ptr[k * bound + j]; + res += _sse2neon_static_cast(uint16_t, val << i); + } + return res; +#endif +} + +/* EQUAL_ORDERED aggregation for 16x8 (word mode). + * Same algorithm as 8x16 but for 16-bit elements with 8 lanes. + * + * Vectorization approach: transpose matrix FIRST, then apply masking to + * transposed matrix, then use vextq diagonal extraction. + */ +static uint16_t _sse2neon_aggregate_equal_ordered_16x8(int bound, + int la, + int lb, + __m128i mtx[16]) +{ +#if SSE2NEON_ARCH_AARCH64 + uint16x8_t rows[8]; + for (int i = 0; i < 8; i++) + rows[i] = vreinterpretq_u16_m128i(mtx[i]); + + /* Transpose the 8x8 word matrix using hierarchical vtrn operations. + * After transpose: rows[j][i] = original mtx[i][j] + */ + /* Level 1: Transpose 2x2 blocks of 16-bit elements */ + for (int i = 0; i < 8; i += 2) { + uint16x8x2_t t = vtrnq_u16(rows[i], rows[i + 1]); + rows[i] = t.val[0]; + rows[i + 1] = t.val[1]; + } + + /* Level 2: Transpose 2x2 blocks of 32-bit elements */ + for (int i = 0; i < 8; i += 4) { + uint32x4x2_t t0 = vtrnq_u32(vreinterpretq_u32_u16(rows[i]), + vreinterpretq_u32_u16(rows[i + 2])); + uint32x4x2_t t1 = vtrnq_u32(vreinterpretq_u32_u16(rows[i + 1]), + vreinterpretq_u32_u16(rows[i + 3])); + rows[i] = vreinterpretq_u16_u32(t0.val[0]); + rows[i + 2] = vreinterpretq_u16_u32(t0.val[1]); + rows[i + 1] = vreinterpretq_u16_u32(t1.val[0]); + rows[i + 3] = vreinterpretq_u16_u32(t1.val[1]); + } + + /* Level 3: Swap 64-bit halves between row pairs */ + { + uint16x8_t tmp; +#define SSE2NEON_SWAP_HL_16(a, b) \ + tmp = vcombine_u16(vget_low_u16(a), vget_low_u16(b)); \ + b = vcombine_u16(vget_high_u16(a), vget_high_u16(b)); \ + a = tmp; + + SSE2NEON_SWAP_HL_16(rows[0], rows[4]); + SSE2NEON_SWAP_HL_16(rows[1], rows[5]); + SSE2NEON_SWAP_HL_16(rows[2], rows[6]); + SSE2NEON_SWAP_HL_16(rows[3], rows[7]); +#undef SSE2NEON_SWAP_HL_16 + } + + /* Apply masking to TRANSPOSED matrix: + * - Rows j >= la: set entire row to 0xFFFF + * - For rows j < la: columns k >= lb set to 0x0000 + */ + uint16x8_t vec_ff = vdupq_n_u16(0xFFFF); + uint16_t lb_valid = + _sse2neon_static_cast(uint16_t, (1U << lb) - 1); /* e.g. 
lb=6: 0x003F */ + uint16x8_t pos_mask = vld1q_u16(_sse2neon_cmpestr_mask16b); + uint16x8_t lb_clear = vtstq_u16(vdupq_n_u16(lb_valid), pos_mask); + + for (int j = 0; j < la; j++) { + rows[j] = vandq_u16(rows[j], lb_clear); + } + for (int j = la; j < 8; j++) { + rows[j] = vec_ff; + } + + /* vextq diagonal extraction: shift row k by k, then AND all rows */ + uint16x8_t result = vec_ff; + +#define SSE2NEON_VEXT_AND_16(K) \ + do { \ + uint16x8_t shifted = vextq_u16(rows[K], vec_ff, K); \ + result = vandq_u16(result, shifted); \ + } while (0) + + SSE2NEON_VEXT_AND_16(0); + SSE2NEON_VEXT_AND_16(1); + SSE2NEON_VEXT_AND_16(2); + SSE2NEON_VEXT_AND_16(3); + SSE2NEON_VEXT_AND_16(4); + SSE2NEON_VEXT_AND_16(5); + SSE2NEON_VEXT_AND_16(6); + SSE2NEON_VEXT_AND_16(7); + +#undef SSE2NEON_VEXT_AND_16 + + /* Convert result to bitmask: each lane is 0xFFFF or 0x0000. + * Extract MSB of each word and form 8-bit result. + */ + uint16x8_t msbs = vshrq_n_u16(result, 15); + uint16x8_t positioned = vmulq_u16(msbs, pos_mask); + return _sse2neon_static_cast(uint16_t, _sse2neon_vaddvq_u16(positioned)); +#else + /* ARMv7 fallback: apply masking and use scalar extraction */ + uint16_t m1 = _sse2neon_static_cast(uint16_t, 0x100 - (1 << la)); + uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b); + uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask); + uint16x8_t vec_minusone = vdupq_n_u16(0xFFFF); + uint16x8_t vec_zero = vdupq_n_u16(0); + + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u16( + vbslq_u16(vec1, vec_minusone, vreinterpretq_u16_m128i(mtx[j]))); + } + for (int j = lb; j < bound; j++) { + mtx[j] = + vreinterpretq_m128i_u16(vbslq_u16(vec1, vec_minusone, vec_zero)); + } -#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE -#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD + uint16_t res = 0; + unsigned short *ptr = _sse2neon_reinterpret_cast(unsigned short *, mtx); + for (int i = 0; i < bound; i++) { + int val = 1; + for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) + val &= ptr[k * bound + j]; + res += _sse2neon_static_cast(uint16_t, val << i); + } + return res; +#endif +} /* clang-format off */ #define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \ @@ -8989,42 +9758,48 @@ SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_) SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_) -#define SSE2NEON_CMPESTR_LIST \ - _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \ - _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \ - _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \ - _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \ - _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \ - _(CMP_UWORD_RANGES, cmp_uword_ranges) \ - _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \ - _(CMP_SWORD_RANGES, cmp_sword_ranges) \ - _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \ - _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \ - _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \ - _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \ - _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ - _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \ - _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ - _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered) +#define SSE2NEON_CMPESTR_LIST \ + _SSE2NEON(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \ + _SSE2NEON(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \ + _SSE2NEON(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \ + _SSE2NEON(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \ + _SSE2NEON(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \ + _SSE2NEON(CMP_UWORD_RANGES, cmp_uword_ranges) \ + _SSE2NEON(CMP_SBYTE_RANGES, 
cmp_sbyte_ranges) \ + _SSE2NEON(CMP_SWORD_RANGES, cmp_sword_ranges) \ + _SSE2NEON(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \ + _SSE2NEON(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \ + _SSE2NEON(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \ + _SSE2NEON(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \ + _SSE2NEON(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ + _SSE2NEON(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \ + _SSE2NEON(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ + _SSE2NEON(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered) enum { -#define _(name, func_suffix) name, +#define _SSE2NEON(name, func_suffix) name, SSE2NEON_CMPESTR_LIST -#undef _ +#undef _SSE2NEON }; -typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb); +typedef uint16_t (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb); static cmpestr_func_t _sse2neon_cmpfunc_table[] = { -#define _(name, func_suffix) _sse2neon_##func_suffix, +#define _SSE2NEON(name, func_suffix) _sse2neon_##func_suffix, SSE2NEON_CMPESTR_LIST -#undef _ +#undef _SSE2NEON }; -FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound) +FORCE_INLINE uint16_t _sse2neon_sido_negative(int res, + int lb, + int imm8, + int bound) { switch (imm8 & 0x30) { case _SIDD_NEGATIVE_POLARITY: res ^= 0xffffffff; break; + case _SIDD_MASKED_POSITIVE_POLARITY: + res &= (1 << lb) - 1; + break; case _SIDD_MASKED_NEGATIVE_POLARITY: res ^= (1 << lb) - 1; break; @@ -9032,15 +9807,15 @@ FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound) break; } - return res & ((bound == 8) ? 0xFF : 0xFFFF); + return _sse2neon_static_cast(uint16_t, res &((bound == 8) ? 0xFF : 0xFFFF)); } FORCE_INLINE int _sse2neon_clz(unsigned int x) { -#if _MSC_VER +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG unsigned long cnt = 0; - if (_BitScanForward(&cnt, x)) - return cnt; + if (_BitScanReverse(&cnt, x)) + return 31 - cnt; return 32; #else return x != 0 ? __builtin_clz(x) : 32; @@ -9049,10 +9824,10 @@ FORCE_INLINE int _sse2neon_clz(unsigned int x) FORCE_INLINE int _sse2neon_ctz(unsigned int x) { -#if _MSC_VER +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG unsigned long cnt = 0; - if (_BitScanReverse(&cnt, x)) - return 31 - cnt; + if (_BitScanForward(&cnt, x)) + return cnt; return 32; #else return x != 0 ? __builtin_ctz(x) : 32; @@ -9061,20 +9836,19 @@ FORCE_INLINE int _sse2neon_ctz(unsigned int x) FORCE_INLINE int _sse2neon_ctzll(unsigned long long x) { -#if _MSC_VER +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG unsigned long cnt; -#ifdef defined(SSE2NEON_HAS_BITSCAN64) - (defined(_M_AMD64) || defined(__x86_64__)) - if((_BitScanForward64(&cnt, x)) - return (int)(cnt); +#if defined(SSE2NEON_HAS_BITSCAN64) + if (_BitScanForward64(&cnt, x)) + return (int) (cnt); #else if (_BitScanForward(&cnt, (unsigned long) (x))) return (int) cnt; if (_BitScanForward(&cnt, (unsigned long) (x >> 32))) return (int) (cnt + 32); -#endif +#endif /* SSE2NEON_HAS_BITSCAN64 */ return 64; -#else +#else /* assume GNU compatible compilers */ return x != 0 ? __builtin_ctzll(x) : 64; #endif } @@ -9082,7 +9856,7 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x) #define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y) #define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \ - const int var = (imm & 0x01) ? 8 : 16 + const int var = ((imm) & 0x01) ? 
8 : 16 #define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \ int tmp1 = la ^ (la >> 31); \ @@ -9097,32 +9871,35 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x) // As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the // length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of // string a and b. -#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \ - SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \ - SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \ - int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \ +#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \ + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \ + SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \ + uint16_t r2 = (_sse2neon_cmpfunc_table[(imm8) & 0x0f])(a, la, b, lb); \ r2 = _sse2neon_sido_negative(r2, lb, imm8, bound) -#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \ - return (r2 == 0) ? bound \ - : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \ - : _sse2neon_ctz(r2)) +#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \ + return (r2 == 0) ? bound \ + : (((imm8) & 0x40) ? (31 - _sse2neon_clz(r2)) \ + : _sse2neon_ctz(r2)) #define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \ __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ - if (imm8 & 0x40) { \ + if ((imm8) & 0x40) { \ if (bound == 8) { \ uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \ vld1q_u16(_sse2neon_cmpestr_mask16b)); \ dst = vreinterpretq_m128i_u16(vbslq_u16( \ - tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \ + tmp, vdupq_n_u16(_sse2neon_static_cast(uint16_t, -1)), \ + vreinterpretq_u16_m128i(dst))); \ } else { \ - uint8x16_t vec_r2 = \ - vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \ + uint8x16_t vec_r2 = vcombine_u8( \ + vdup_n_u8(_sse2neon_static_cast(uint8_t, r2)), \ + vdup_n_u8(_sse2neon_static_cast(uint8_t, r2 >> 8))); \ uint8x16_t tmp = \ vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \ dst = vreinterpretq_m128i_u8( \ - vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \ + vbslq_u8(tmp, vdupq_n_u8(_sse2neon_static_cast(uint8_t, -1)), \ + vreinterpretq_u8_m128i(dst))); \ } \ } else { \ if (bound == 16) { \ @@ -9130,7 +9907,8 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x) vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \ } else { \ dst = vreinterpretq_m128i_u8( \ - vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \ + vsetq_lane_u8(_sse2neon_static_cast(uint8_t, r2 & 0xff), \ + vreinterpretq_u8_m128i(dst), 0)); \ } \ } \ return dst @@ -9147,7 +9925,7 @@ FORCE_INLINE int _mm_cmpestra(__m128i a, { int lb_cpy = lb; SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); - return !r2 & (lb_cpy > bound); + return !r2 & (lb_cpy >= bound); } // Compare packed strings in a and b with lengths la and lb using the control in @@ -9208,6 +9986,9 @@ FORCE_INLINE int _mm_cmpestrs(__m128i a, int lb, const int imm8) { + (void) a; + (void) b; + (void) lb; SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); return la <= (bound - 1); } @@ -9221,13 +10002,16 @@ FORCE_INLINE int _mm_cmpestrz(__m128i a, int lb, const int imm8) { + (void) a; + (void) b; + (void) la; SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); return lb <= (bound - 1); } #define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \ do { \ - if (imm8 & 0x01) { \ + if ((imm8) & 0x01) { \ uint16x8_t equal_mask_##str = \ vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \ uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ @@ -9302,6 +10086,7 @@ FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8) // 
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
 FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
 {
+    (void) b;
     SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
     int la;
     SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);
@@ -9313,6 +10098,7 @@ FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
 FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
 {
+    (void) a;
     SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
     int lb;
     SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);
@@ -9323,7 +10109,7 @@ FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
 // in b for greater than.
 FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
 {
-#if defined(__aarch64__)
+#if SSE2NEON_ARCH_AARCH64
     return vreinterpretq_m128i_u64(
         vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
 #else
@@ -9333,84 +10119,170 @@ FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
 #endif
 }
 
+/* A function-like macro to generate the CRC-32C calculation using Barrett
+ * reduction.
+ *
+ * The input parameters are as follows:
+ * - 'crc' means the initial value or CRC.
+ * - 'v' means the element of the input message.
+ * - 'bit' means the element size of the input message (e.g., if each message
+ *   element is one byte then 'bit' will be 8, as 1 byte equals 8 bits).
+ * - 'shift' represents a toggle to perform shifting.
+ *
+ * As a reminder, the CRC calculation uses the bit-reflected sense.
+ *
+ * As for the two mysterious variables 'p' and 'mu', here is what they
+ * represent:
+ * 1. 'p' stands for the polynomial P(x) in the CRC calculation.
+ *    As we are using CRC-32C, 'p' has the value of 0x105EC76F1 (0x1EDC6F41 in
+ *    bit-reflected form).
+ * 2. 'mu' stands for the multiplicative inverse of 'p' in GF(64).
+ *    'mu' has the value of 0x1dea713f1.
+ *    (mu_{64} = \lfloor 2^{64} / P(x) \rfloor = 0x11f91caf6)
+ *    (the bit-reflected form of 0x11f91caf6 is 0x1dea713f1)
+ *
+ * The CRC value is calculated as follows:
+ * 1. Update (XOR) 'crc' with the new input message element 'v'.
+ * 2. Create the 'orig' and 'tmp' vectors.
+ *    Before creating the vectors, we store 'crc' in the lower half of the
+ *    vector, then shift left by 'bit' bits so that the result of the
+ *    carry-less multiplication always appears in the upper half of the
+ *    destination vector. Doing so can reduce some masking and subtraction
+ *    operations. The one exception is that there is no need to perform
+ *    shifting if 'bit' is 64.
+ * 3. Do a carry-less multiplication of the lower half of 'tmp' with 'mu'.
+ * 4. Do a carry-less multiplication of the upper half of 'tmp' with 'p'.
+ * 5. Extract the lower (in the bit-reflected sense) 32 bits of the upper half
+ *    of 'tmp'.
+ */
+#define SSE2NEON_CRC32C_BASE(crc, v, bit, shift)                               \
+    do {                                                                       \
+        crc ^= v;                                                              \
+        uint64x2_t orig =                                                      \
+            vcombine_u64(_sse2neon_vcreate_u64(SSE2NEON_IIF(shift)(            \
+                             (uint64_t) (crc) << (bit), (uint64_t) (crc))),    \
+                         _sse2neon_vcreate_u64(0x0));                          \
+        uint64x2_t tmp = orig;                                                 \
+        uint64_t p = 0x105EC76F1;                                              \
+        uint64_t mu = 0x1dea713f1;                                             \
+        tmp =                                                                  \
+            _sse2neon_vmull_p64(vget_low_u64(tmp), _sse2neon_vcreate_u64(mu)); \
+        tmp =                                                                  \
+            _sse2neon_vmull_p64(vget_high_u64(tmp), _sse2neon_vcreate_u64(p)); \
+        crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 2);                   \
+    } while (0)
+
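// Illustrative cross-check (a sketch, not part of the header itself): the
// reflected CRC-32C polynomial 0x82F63B78 drives a plain bitwise
// implementation that must agree with the Barrett-reduction macro above and
// with the hardware crc32c* paths below.
#include <stdint.h>
#include <stddef.h>

static uint32_t crc32c_bitwise(uint32_t crc, const uint8_t *buf, size_t len)
{
    while (len--) {
        crc ^= *buf++;
        for (int k = 0; k < 8; k++)
            // Shift right one bit; XOR in the polynomial iff bit 0 was set.
            crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1)));
    }
    return crc;
}

// With the usual ~0 seed and final inversion, the standard test vector holds:
//   ~crc32c_bitwise(~0u, (const uint8_t *) "123456789", 9) == 0xE3069283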
 // Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 16-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
+// unsigned 16-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
 FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
 {
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+#if SSE2NEON_ARCH_AARCH64 && defined(__ARM_FEATURE_CRC32) && !SSE2NEON_ARM64EC
     __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
                          : [c] "+r"(crc)
                          : [v] "r"(v));
-#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+#elif ((__ARM_ARCH >= 8) && defined(__ARM_FEATURE_CRC32)) || \
+    (SSE2NEON_COMPILER_MSVC && defined(_M_ARM64) && !SSE2NEON_ARM64EC && \
+     !SSE2NEON_COMPILER_CLANG)
     crc = __crc32ch(crc, v);
+#elif defined(__ARM_FEATURE_CRYPTO)
+    SSE2NEON_CRC32C_BASE(crc, v, 16, 1);
 #else
-    crc = _mm_crc32_u8(crc, v & 0xff);
-    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
+    crc = _mm_crc32_u8(crc, _sse2neon_static_cast(uint8_t, v & 0xff));
+    crc = _mm_crc32_u8(crc, _sse2neon_static_cast(uint8_t, (v >> 8) & 0xff));
 #endif
     return crc;
 }
 
 // Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 32-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
+// unsigned 32-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
 FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
 {
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+#if SSE2NEON_ARCH_AARCH64 && defined(__ARM_FEATURE_CRC32) && !SSE2NEON_ARM64EC
     __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
                          : [c] "+r"(crc)
                          : [v] "r"(v));
-#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+#elif ((__ARM_ARCH >= 8) && defined(__ARM_FEATURE_CRC32)) || \
+    (SSE2NEON_COMPILER_MSVC && defined(_M_ARM64) && !SSE2NEON_ARM64EC && \
+     !SSE2NEON_COMPILER_CLANG)
     crc = __crc32cw(crc, v);
+#elif defined(__ARM_FEATURE_CRYPTO)
+    SSE2NEON_CRC32C_BASE(crc, v, 32, 1);
 #else
-    crc = _mm_crc32_u16(crc, v & 0xffff);
-    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
+    crc = _mm_crc32_u16(crc, _sse2neon_static_cast(uint16_t, v & 0xffff));
+    crc =
+        _mm_crc32_u16(crc, _sse2neon_static_cast(uint16_t, (v >> 16) & 0xffff));
 #endif
     return crc;
 }
 
 // Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 64-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
+// unsigned 64-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
 FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
 {
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+#if SSE2NEON_ARCH_AARCH64 && defined(__ARM_FEATURE_CRC32) && !SSE2NEON_ARM64EC
     __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
                          : [c] "+r"(crc)
                          : [v] "r"(v));
-#else
-    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
-    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
+#elif (SSE2NEON_COMPILER_MSVC && defined(_M_ARM64) && !SSE2NEON_ARM64EC && \
+       !SSE2NEON_COMPILER_CLANG)
+    crc = __crc32cd(_sse2neon_static_cast(uint32_t, crc), v);
+#elif defined(__ARM_FEATURE_CRYPTO)
+    SSE2NEON_CRC32C_BASE(crc, v, 64, 0);
+#else
+    crc = _mm_crc32_u32(_sse2neon_static_cast(uint32_t, crc),
+                        _sse2neon_static_cast(uint32_t, v & 0xffffffff));
+    crc =
+        _mm_crc32_u32(_sse2neon_static_cast(uint32_t, crc),
+                      _sse2neon_static_cast(uint32_t, (v >> 32) & 0xffffffff));
 #endif
     return crc;
 }
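// The 16-entry half-byte table used by the _mm_crc32_u8 fallback below can be
// derived from the same bitwise recurrence, four shift/xor steps per nibble.
// A sketch, assuming <stdint.h>; crc32c_nibble is a hypothetical helper and
// not part of sse2neon:
static uint32_t crc32c_nibble(uint32_t nib)
{
    uint32_t x = nib & 0x0F;
    for (int k = 0; k < 4; k++)
        x = (x >> 1) ^ (0x82F63B78u & (0u - (x & 1)));
    return x;  // e.g. crc32c_nibble(1) == 0x105ec76f, the table's second entry
}
// Processing one byte is then two nibble steps, low nibble first, exactly as
// the fallback below does:
//   crc ^= v;
//   crc = (crc >> 4) ^ crc32c_nibble(crc & 0x0F);
//   crc = (crc >> 4) ^ crc32c_nibble(crc & 0x0F);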
 // Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 8-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
+// unsigned 8-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
 {
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+#if SSE2NEON_ARCH_AARCH64 && defined(__ARM_FEATURE_CRC32) && !SSE2NEON_ARM64EC
     __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
                          : [c] "+r"(crc)
                          : [v] "r"(v));
-#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+#elif ((__ARM_ARCH >= 8) && defined(__ARM_FEATURE_CRC32)) || \
+    (SSE2NEON_COMPILER_MSVC && defined(_M_ARM64) && !SSE2NEON_ARM64EC && \
+     !SSE2NEON_COMPILER_CLANG)
     crc = __crc32cb(crc, v);
-#else
+#elif defined(__ARM_FEATURE_CRYPTO)
+    SSE2NEON_CRC32C_BASE(crc, v, 8, 1);
+#else  // Fall back to the generic table-lookup approach
+    // Adapted from: https://create.stephan-brumme.com/crc32/
+    // Apply the half-byte algorithm for the best ratio between
+    // performance and lookup-table size.
+    crc ^= v;
-    for (int bit = 0; bit < 8; bit++) {
-        if (crc & 1)
-            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
-        else
-            crc = (crc >> 1);
-    }
+
+    // The lookup table just needs to store every 16th entry
+    // of the standard lookup table.
+    static const uint32_t crc32_half_byte_tbl[] = {
+        0x00000000, 0x105ec76f, 0x20bd8ede, 0x30e349b1, 0x417b1dbc, 0x5125dad3,
+        0x61c69362, 0x7198540d, 0x82f63b78, 0x92a8fc17, 0xa24bb5a6, 0xb21572c9,
+        0xc38d26c4, 0xd3d3e1ab, 0xe330a81a, 0xf36e6f75,
+    };
+
+    crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
+    crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
 #endif
     return crc;
 }
 
 /* AES */
 
-#if !defined(__ARM_FEATURE_CRYPTO)
+/* AES software fallback tables.
+ * Needed when __ARM_FEATURE_CRYPTO is not available, OR on ARM64EC where
+ * hardware crypto intrinsics may not be accessible despite the feature macro.
+ */
+#if !defined(__ARM_FEATURE_CRYPTO) || SSE2NEON_ARM64EC || defined(_M_ARM64EC)
 /* clang-format off */
 #define SSE2NEON_AES_SBOX(w) \
 { \
@@ -9500,8 +10372,67 @@ static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
 static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
 #undef SSE2NEON_AES_H0
+// File-scope constants for AES permutations, hoisted from the inline
+// functions to ensure a single load across multiple intrinsic calls.
+// ShiftRows permutation indices for encryption
+static const uint8_t ALIGN_STRUCT(16) _sse2neon_aes_shift_rows[16] = {
+    0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
+    0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
+};
+// InvShiftRows permutation indices for decryption
+static const uint8_t ALIGN_STRUCT(16) _sse2neon_aes_inv_shift_rows[16] = {
+    0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
+    0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
+};
+// Rotate right by 8 bits within each 32-bit word (for MixColumns)
+static const uint8_t ALIGN_STRUCT(16) _sse2neon_aes_ror32by8[16] = {
+    0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+    0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+};
+
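// Quick sanity anchors for the tables above (illustrative notes, not part of
// the header): the AES S-box maps 0x00 -> 0x63 and the inverse S-box undoes
// it (_sse2neon_sbox[0x00] == 0x63, _sse2neon_rsbox[0x63] == 0x00). Likewise,
// xtime doubles in GF(2^8) per FIPS-197: xtime(0x57) == 0xae and
// xtime(0xae) == 0x47, since 0xae << 1 = 0x15c and 0x15c ^ 0x11b = 0x047.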
+#if SSE2NEON_ARCH_AARCH64
+// NEON S-box lookup using 4x64-byte tables; reused by aesenc/dec/keygenassist.
+// Uses vsubq_u8 instead of the C++ operator- for MSVC compatibility.
+FORCE_INLINE uint8x16_t _sse2neon_aes_subbytes(uint8x16_t x)
+{
+    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), x);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40),
+                   vsubq_u8(x, vdupq_n_u8(0x40)));
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80),
+                   vsubq_u8(x, vdupq_n_u8(0x80)));
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0),
+                   vsubq_u8(x, vdupq_n_u8(0xc0)));
+    return v;
+}
+
+FORCE_INLINE uint8x16_t _sse2neon_aes_inv_subbytes(uint8x16_t x)
+{
+    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), x);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40),
+                   vsubq_u8(x, vdupq_n_u8(0x40)));
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80),
+                   vsubq_u8(x, vdupq_n_u8(0x80)));
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0),
+                   vsubq_u8(x, vdupq_n_u8(0xc0)));
+    return v;
+}
+
+// AES xtime: multiply by {02} in GF(2^8) with reduction polynomial 0x11b.
+// Uses an arithmetic right shift to generate the mask: if the MSB is set,
+// XOR with 0x1b.
+FORCE_INLINE uint8x16_t _sse2neon_aes_xtime(uint8x16_t v)
+{
+    // Arithmetic right shift by 7 gives 0xFF for bytes >= 0x80, 0x00 otherwise
+    uint8x16_t mask =
+        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(v), 7));
+    // AND with reduction polynomial 0x1b
+    uint8x16_t reduced = vandq_u8(mask, vdupq_n_u8(0x1b));
+    // Shift left and XOR with reduction
+    return veorq_u8(vshlq_n_u8(v, 1), reduced);
+}
+#endif
+
 /* x_time function and matrix multiply function */
-#if !defined(__aarch64__)
+#if !SSE2NEON_ARCH_AARCH64
 #define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
 #define SSE2NEON_MULTIPLY(x, y) \
     (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \
@@ -9510,57 +10441,49 @@ static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
      ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
 #endif
 
-// In the absence of crypto extensions, implement aesenc using regular neon
+// In the absence of crypto extensions, implement aesenc using regular NEON
 // intrinsics instead. See:
 // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
 // https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
-// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
-// for more information Reproduced with permission of the author.
+// for more information.
 FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
 {
-#if defined(__aarch64__)
-    static const uint8_t shift_rows[] = {
-        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
-        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
-    };
-    static const uint8_t ror32by8[] = {
-        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-    };
-
+#if SSE2NEON_ARCH_AARCH64
     uint8x16_t v;
     uint8x16_t w = vreinterpretq_u8_m128i(a);
 
     /* shift rows */
-    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+    w = vqtbl1q_u8(w, vld1q_u8(_sse2neon_aes_shift_rows));
 
     /* sub bytes */
-    // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and
-    // look up each of the table. After each lookup, we load the next table
-    // which locates at the next 64-bytes. In the meantime, the index in the
-    // table would be smaller than it was, so the index parameters of
-    // `vqtbx4q_u8()` need to be added the same constant as the loaded tables.
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
-    // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
-
-    /* mix columns */
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+    v = _sse2neon_aes_subbytes(w);
+
+    /* mix columns:
+     * MixColumns multiplies each column by the matrix:
+     * [02 03 01 01]
+     * [01 02 03 01]
+     * [01 01 02 03]
+     * [03 01 01 02]
+     * Using: w = xtime(v) ^ rot16(v), then out = w ^ ror8(v ^ w),
+     * which matches the three statements below.
+     */
+    w = _sse2neon_aes_xtime(v);  // w = v * {02}
+    w = veorq_u8(w, vreinterpretq_u8_u16(vrev32q_u16(vreinterpretq_u16_u8(v))));
+    w = veorq_u8(w,
+                 vqtbl1q_u8(veorq_u8(v, w), vld1q_u8(_sse2neon_aes_ror32by8)));
 
     /* add round key */
-    return vreinterpretq_m128i_u8(w) ^ RoundKey;
+    return vreinterpretq_m128i_u8(
+        veorq_u8(w, vreinterpretq_u8_m128i(RoundKey)));
 #else /* ARMv7-A implementation for a table-based AES */
-#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
-    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
-     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
-// muliplying 'x' by 2 in GF(2^8)
+#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
+    ((_sse2neon_static_cast(uint32_t, b3) << 24) | \
+     (_sse2neon_static_cast(uint32_t, b2) << 16) | \
+     (_sse2neon_static_cast(uint32_t, b1) << 8) | \
+     _sse2neon_static_cast(uint32_t, b0))
+// multiplying 'x' by 2 in GF(2^8)
 #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
-// muliplying 'x' by 3 in GF(2^8)
+// multiplying 'x' by 3 in GF(2^8)
 #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
 #define SSE2NEON_AES_U0(p) \
     SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
@@ -9615,69 +10538,114 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
 FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
 {
-#if defined(__aarch64__)
-    static const uint8_t inv_shift_rows[] = {
-        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
-        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
-    };
-    static const uint8_t ror32by8[] = {
-        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-    };
-
+#if SSE2NEON_ARCH_AARCH64
     uint8x16_t v;
     uint8x16_t w = vreinterpretq_u8_m128i(a);
 
     // inverse shift rows
-    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
+    w = vqtbl1q_u8(w, vld1q_u8(_sse2neon_aes_inv_shift_rows));
 
     // inverse sub bytes
-    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
-    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
-
-    // inverse mix columns
-    // muliplying 'v' by 4 in GF(2^8)
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
-    v ^= w;
-    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
-
-    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
-                                 0x1b);  // muliplying 'v' by 2 in GF(2^8)
-    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+    v = _sse2neon_aes_inv_subbytes(w);
+
+    /* inverse mix columns:
+     * InvMixColumns multiplies each column by the
matrix: + * [0E 0B 0D 09] + * [09 0E 0B 0D] + * [0D 09 0E 0B] + * [0B 0D 09 0E] + * Computed as: v*{04} ^ v ^ rotate(v*{04}, 16) then standard MixColumns + */ + // v*{04} = xtime(xtime(v)) + w = _sse2neon_aes_xtime(v); + w = _sse2neon_aes_xtime(w); + v = veorq_u8(v, w); + v = veorq_u8(v, vreinterpretq_u8_u16(vrev32q_u16(vreinterpretq_u16_u8(w)))); + + // Apply standard MixColumns to transformed v + w = _sse2neon_aes_xtime(v); + w = veorq_u8(w, vreinterpretq_u8_u16(vrev32q_u16(vreinterpretq_u16_u8(v)))); + w = veorq_u8(w, + vqtbl1q_u8(veorq_u8(v, w), vld1q_u8(_sse2neon_aes_ror32by8))); // add round key - return vreinterpretq_m128i_u8(w) ^ RoundKey; - -#else /* ARMv7-A NEON implementation */ - /* FIXME: optimized for NEON */ - uint8_t i, e, f, g, h, v[4][4]; - uint8_t *_a = (uint8_t *) &a; - for (i = 0; i < 16; ++i) { - v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]]; - } - - // inverse mix columns - for (i = 0; i < 4; ++i) { - e = v[i][0]; - f = v[i][1]; - g = v[i][2]; - h = v[i][3]; - - v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^ - SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09); - v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^ - SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d); - v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^ - SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b); - v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^ - SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e); - } + return vreinterpretq_m128i_u8( + veorq_u8(w, vreinterpretq_u8_m128i(RoundKey))); + +#else /* ARMv7-A implementation using inverse T-tables */ + // GF(2^8) multiplication helpers for InvMixColumns coefficients +#define SSE2NEON_AES_DEC_B2W(b0, b1, b2, b3) \ + ((_sse2neon_static_cast(uint32_t, b3) << 24) | \ + (_sse2neon_static_cast(uint32_t, b2) << 16) | \ + (_sse2neon_static_cast(uint32_t, b1) << 8) | \ + _sse2neon_static_cast(uint32_t, b0)) + // xtime: multiply by 2 in GF(2^8), using 0x011b to clear bit 8 +#define SSE2NEON_AES_DEC_X2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b)) + // multiply by 4 in GF(2^8) +#define SSE2NEON_AES_DEC_X4(x) SSE2NEON_AES_DEC_X2(SSE2NEON_AES_DEC_X2(x)) + // multiply by 8 in GF(2^8) +#define SSE2NEON_AES_DEC_X8(x) SSE2NEON_AES_DEC_X2(SSE2NEON_AES_DEC_X4(x)) + // InvMixColumns coefficients: 0x09, 0x0b, 0x0d, 0x0e +#define SSE2NEON_AES_DEC_F9(x) (SSE2NEON_AES_DEC_X8(x) ^ (x)) +#define SSE2NEON_AES_DEC_FB(x) \ + (SSE2NEON_AES_DEC_X8(x) ^ SSE2NEON_AES_DEC_X2(x) ^ (x)) +#define SSE2NEON_AES_DEC_FD(x) \ + (SSE2NEON_AES_DEC_X8(x) ^ SSE2NEON_AES_DEC_X4(x) ^ (x)) +#define SSE2NEON_AES_DEC_FE(x) \ + (SSE2NEON_AES_DEC_X8(x) ^ SSE2NEON_AES_DEC_X4(x) ^ SSE2NEON_AES_DEC_X2(x)) + // Inverse T-table generators combining InvSubBytes + InvMixColumns +#define SSE2NEON_AES_DEC_V0(p) \ + SSE2NEON_AES_DEC_B2W(SSE2NEON_AES_DEC_FE(p), SSE2NEON_AES_DEC_F9(p), \ + SSE2NEON_AES_DEC_FD(p), SSE2NEON_AES_DEC_FB(p)) +#define SSE2NEON_AES_DEC_V1(p) \ + SSE2NEON_AES_DEC_B2W(SSE2NEON_AES_DEC_FB(p), SSE2NEON_AES_DEC_FE(p), \ + SSE2NEON_AES_DEC_F9(p), SSE2NEON_AES_DEC_FD(p)) +#define SSE2NEON_AES_DEC_V2(p) \ + SSE2NEON_AES_DEC_B2W(SSE2NEON_AES_DEC_FD(p), SSE2NEON_AES_DEC_FB(p), \ + SSE2NEON_AES_DEC_FE(p), SSE2NEON_AES_DEC_F9(p)) +#define SSE2NEON_AES_DEC_V3(p) \ + SSE2NEON_AES_DEC_B2W(SSE2NEON_AES_DEC_F9(p), SSE2NEON_AES_DEC_FD(p), \ + SSE2NEON_AES_DEC_FB(p), SSE2NEON_AES_DEC_FE(p)) + + // Inverse T-tables: combine InvShiftRows + InvSubBytes + InvMixColumns + // Each table entry is the 
InvMixColumns result for that S-box output + static const uint32_t ALIGN_STRUCT(16) aes_inv_table[4][256] = { + SSE2NEON_AES_RSBOX(SSE2NEON_AES_DEC_V0), + SSE2NEON_AES_RSBOX(SSE2NEON_AES_DEC_V1), + SSE2NEON_AES_RSBOX(SSE2NEON_AES_DEC_V2), + SSE2NEON_AES_RSBOX(SSE2NEON_AES_DEC_V3), + }; +#undef SSE2NEON_AES_DEC_B2W +#undef SSE2NEON_AES_DEC_X2 +#undef SSE2NEON_AES_DEC_X4 +#undef SSE2NEON_AES_DEC_X8 +#undef SSE2NEON_AES_DEC_F9 +#undef SSE2NEON_AES_DEC_FB +#undef SSE2NEON_AES_DEC_FD +#undef SSE2NEON_AES_DEC_FE +#undef SSE2NEON_AES_DEC_V0 +#undef SSE2NEON_AES_DEC_V1 +#undef SSE2NEON_AES_DEC_V2 +#undef SSE2NEON_AES_DEC_V3 + + uint32_t x0 = _mm_cvtsi128_si32(a); + uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); + uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA)); + uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); + + // InvShiftRows is integrated into table indexing: + // Row 0: no shift, Row 1: right by 1, Row 2: right by 2, Row 3: right by 3 + __m128i out = _mm_set_epi32( + (aes_inv_table[0][x3 & 0xff] ^ aes_inv_table[1][(x2 >> 8) & 0xff] ^ + aes_inv_table[2][(x1 >> 16) & 0xff] ^ aes_inv_table[3][x0 >> 24]), + (aes_inv_table[0][x2 & 0xff] ^ aes_inv_table[1][(x1 >> 8) & 0xff] ^ + aes_inv_table[2][(x0 >> 16) & 0xff] ^ aes_inv_table[3][x3 >> 24]), + (aes_inv_table[0][x1 & 0xff] ^ aes_inv_table[1][(x0 >> 8) & 0xff] ^ + aes_inv_table[2][(x3 >> 16) & 0xff] ^ aes_inv_table[3][x2 >> 24]), + (aes_inv_table[0][x0 & 0xff] ^ aes_inv_table[1][(x3 >> 8) & 0xff] ^ + aes_inv_table[2][(x2 >> 16) & 0xff] ^ aes_inv_table[3][x1 >> 24])); - return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey; + return _mm_xor_si128(out, RoundKey); #endif } @@ -9686,26 +10654,19 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) { -#if defined(__aarch64__) - static const uint8_t shift_rows[] = { - 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, - 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, - }; - +#if SSE2NEON_ARCH_AARCH64 uint8x16_t v; uint8x16_t w = vreinterpretq_u8_m128i(a); - // shift rows - w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + // shift rows - use file-scope constant + w = vqtbl1q_u8(w, vld1q_u8(_sse2neon_aes_shift_rows)); // sub bytes - v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0); + v = _sse2neon_aes_subbytes(w); // add round key - return vreinterpretq_m128i_u8(v) ^ RoundKey; + return vreinterpretq_m128i_u8( + veorq_u8(v, vreinterpretq_u8_m128i(RoundKey))); #else /* ARMv7-A implementation */ uint8_t v[16] = { @@ -9727,7 +10688,126 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)], }; - return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey; + return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8(v)), RoundKey); +#endif +} + +FORCE_INLINE uint8x16_t _sse2neon_vqtbl1q_u8(uint8x16_t t, uint8x16_t idx) +{ +#if SSE2NEON_ARCH_AARCH64 + return vqtbl1q_u8(t, idx); +#else + // Split 'idx' into two D registers. 
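// (Why the split: ARMv7 NEON has no 128-bit table lookup. VTBL/VTBX operate
// on 64-bit D registers, so the Q-register table and indices are processed as
// two halves. Like AArch64 TBL, vtbl2_u8 returns 0 for any index outside the
// 16-byte table, which keeps this emulation bit-exact.)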
+    uint8x8_t idx_low = vget_low_u8(idx);
+    uint8x8_t idx_high = vget_high_u8(idx);
+
+    uint8x8x2_t tbl = {
+        vget_low_u8(t),
+        vget_high_u8(t),
+    };
+
+    // Perform the lookup using vtbl2_u8,
+    // first for the first 8 bytes of the result.
+    uint8x8_t ret_low = vtbl2_u8(tbl, idx_low);
+    // Then for the second 8 bytes of the result.
+    uint8x8_t ret_high = vtbl2_u8(tbl, idx_high);
+
+    // Combine the results.
+    return vcombine_u8(ret_low, ret_high);
+#endif
+}
+
+FORCE_INLINE uint8x16_t _sse2neon_vqtbl4q_u8(uint8x16x4_t t, uint8x16_t idx)
+{
+#if SSE2NEON_ARCH_AARCH64
+    return vqtbl4q_u8(t, idx);
+#else
+    // Split 'idx' into two D registers.
+    uint8x8_t idx_lo = vget_low_u8(idx);
+    uint8x8_t idx_hi = vget_high_u8(idx);
+
+    uint8x8x4_t tbl_chunk_0 = {
+        vget_low_u8(t.val[0]),
+        vget_high_u8(t.val[0]),
+        vget_low_u8(t.val[1]),
+        vget_high_u8(t.val[1]),
+    };
+
+    uint8x8x4_t tbl_chunk_1 = {
+        vget_low_u8(t.val[2]),
+        vget_high_u8(t.val[2]),
+        vget_low_u8(t.val[3]),
+        vget_high_u8(t.val[3]),
+    };
+
+    // Shift indices down by 32 so index 32 becomes 0 for the new table.
+    uint8x16_t idx_minus_32 = vsubq_u8(idx, vdupq_n_u8(32));
+    uint8x8_t idx_lo_mod = vget_low_u8(idx_minus_32);
+    uint8x8_t idx_hi_mod = vget_high_u8(idx_minus_32);
+
+    // Pass 1: Use vtbl4_u8 (VTBL).
+    // NOTE: VTBL produces 0 if the indices are larger than 31.
+    uint8x8_t ret_lo = vtbl4_u8(tbl_chunk_0, idx_lo);
+    uint8x8_t ret_hi = vtbl4_u8(tbl_chunk_0, idx_hi);
+
+    // Pass 2: Use vtbx4_u8 (VTBX).
+    // It takes the result of Pass 1 as the accumulator.
+    ret_lo = vtbx4_u8(ret_lo, tbl_chunk_1, idx_lo_mod);
+    ret_hi = vtbx4_u8(ret_hi, tbl_chunk_1, idx_hi_mod);
+
+    // Combine the results.
+    return vcombine_u8(ret_lo, ret_hi);
+#endif
+}
+
+FORCE_INLINE uint8x16_t _sse2neon_vqtbx4q_u8(uint8x16_t acc,
+                                             uint8x16x4_t t,
+                                             uint8x16_t idx)
+{
+#if SSE2NEON_ARCH_AARCH64
+    return vqtbx4q_u8(acc, t, idx);
+#else
+    // Split 'acc' into two D registers.
+    uint8x8_t ret_low = vget_low_u8(acc);
+    uint8x8_t ret_high = vget_high_u8(acc);
+    // Split 'idx' into two D registers.
+    uint8x8_t idx_low = vget_low_u8(idx);
+    uint8x8_t idx_high = vget_high_u8(idx);
+
+    uint8x8x4_t tbl_chunk_0 = {
+        vget_low_u8(t.val[0]),
+        vget_high_u8(t.val[0]),
+        vget_low_u8(t.val[1]),
+        vget_high_u8(t.val[1]),
+    };
+
+    uint8x8x4_t tbl_chunk_1 = {
+        vget_low_u8(t.val[2]),
+        vget_high_u8(t.val[2]),
+        vget_low_u8(t.val[3]),
+        vget_high_u8(t.val[3]),
+    };
+
+    // Adjust indices: we want to map index 32 to index 0 of this new table.
+    // To do so, we subtract 32 from all indices.
+    // NOTE: If the original index is smaller than 32, the adjusted index wraps
+    // around due to unsigned underflow (e.g., 5 - 32 = 229).
+    // Since 229 > 31, vtbx4_u8 (VTBX) preserves the result of the first pass.
+    // This is the intended behavior.
+    uint8x16_t idx_minus_32 = vsubq_u8(idx, vdupq_n_u8(32));
+    uint8x8_t idx_low_mod = vget_low_u8(idx_minus_32);
+    uint8x8_t idx_high_mod = vget_high_u8(idx_minus_32);
+
+    // Perform vtbx4_u8 on the first chunk.
+    ret_low = vtbx4_u8(ret_low, tbl_chunk_0, idx_low);
+    ret_high = vtbx4_u8(ret_high, tbl_chunk_0, idx_high);
+
+    // Perform vtbx4_u8 on the second chunk.
+    ret_low = vtbx4_u8(ret_low, tbl_chunk_1, idx_low_mod);
+    ret_high = vtbx4_u8(ret_high, tbl_chunk_1, idx_high_mod);
+
+    // Combine the results.
+ return vcombine_u8(ret_low, ret_high); #endif } @@ -9736,36 +10816,42 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) { -#if defined(__aarch64__) - static const uint8_t inv_shift_rows[] = { - 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb, - 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3, - }; - +#if SSE2NEON_ARCH_AARCH64 uint8x16_t v; uint8x16_t w = vreinterpretq_u8_m128i(a); - // inverse shift rows - w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows)); + // inverse shift rows - use file-scope constant + w = vqtbl1q_u8(w, vld1q_u8(_sse2neon_aes_inv_shift_rows)); // inverse sub bytes - v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0); + v = _sse2neon_aes_inv_subbytes(w); // add round key - return vreinterpretq_m128i_u8(v) ^ RoundKey; + return vreinterpretq_m128i_u8( + veorq_u8(v, vreinterpretq_u8_m128i(RoundKey))); -#else /* ARMv7-A NEON implementation */ - /* FIXME: optimized for NEON */ - uint8_t v[4][4]; - uint8_t *_a = (uint8_t *) &a; - for (int i = 0; i < 16; ++i) { - v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]]; - } +#else /* ARMv7-A implementation */ + // Inverse shift rows indices: 0,13,10,7,4,1,14,11,8,5,2,15,12,9,6,3 + uint8_t v[16] = { + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)], + _sse2neon_rsbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)], + }; - return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey; + return _mm_xor_si128(vreinterpretq_m128i_u8(vld1q_u8(v)), RoundKey); #endif } @@ -9773,29 +10859,28 @@ FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) { -#if defined(__aarch64__) - static const uint8_t ror32by8[] = { - 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, - 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, - }; +#if SSE2NEON_ARCH_AARCH64 uint8x16_t v = vreinterpretq_u8_m128i(a); uint8x16_t w; - // multiplying 'v' by 4 in GF(2^8) - w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); - w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b); - v ^= w; - v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w); - - // multiplying 'v' by 2 in GF(2^8) - w = (v << 1) ^ (uint8x16_t) 
(((int8x16_t) v >> 7) & 0x1b); - w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); - w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + /* InvMixColumns: same algorithm as in _mm_aesdec_si128 */ + // v*{04} = xtime(xtime(v)) + w = _sse2neon_aes_xtime(v); + w = _sse2neon_aes_xtime(w); + v = veorq_u8(v, w); + v = veorq_u8(v, vreinterpretq_u8_u16(vrev32q_u16(vreinterpretq_u16_u8(w)))); + + // Apply standard MixColumns pattern + w = _sse2neon_aes_xtime(v); + w = veorq_u8(w, vreinterpretq_u8_u16(vrev32q_u16(vreinterpretq_u16_u8(v)))); + w = veorq_u8(w, + vqtbl1q_u8(veorq_u8(v, w), vld1q_u8(_sse2neon_aes_ror32by8))); return vreinterpretq_m128i_u8(w); #else /* ARMv7-A NEON implementation */ uint8_t i, e, f, g, h, v[4][4]; - vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a)); + vst1q_u8(_sse2neon_reinterpret_cast(uint8_t *, v), + vreinterpretq_u8_m128i(a)); for (i = 0; i < 4; ++i) { e = v[i][0]; f = v[i][1]; @@ -9812,40 +10897,43 @@ FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e); } - return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)); + return vreinterpretq_m128i_u8( + vld1q_u8(_sse2neon_reinterpret_cast(uint8_t *, v))); #endif } +// Assist in expanding the AES cipher key by computing steps towards generating +// a round key for encryption cipher using data from a and an 8-bit round +// constant specified in imm8, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 +// // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. // This instruction generates a round key for AES encryption. See // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ // for details. -// -// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 uint8x16_t _a = vreinterpretq_u8_m128i(a); - uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80); - v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0); + uint8x16_t sub = _sse2neon_aes_subbytes(_a); - uint32x4_t select_mask = {0xffffffff, 0x0, 0xffffffff, 0x0}; - uint64x2_t v_mask = vshrq_n_u64(vreinterpretq_u64_u8(v), 32); - uint32x4_t x = vbslq_u32(select_mask, vreinterpretq_u32_u64(v_mask), - vreinterpretq_u32_u8(v)); - uint32x4_t ror_x = vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 24)); - uint32x4_t ror_xor_x = veorq_u32(ror_x, vdupq_n_u32(rcon)); + uint32x4_t sub_u32 = vreinterpretq_u32_u8(sub); + uint32x4_t rot = + vorrq_u32(vshrq_n_u32(sub_u32, 8), vshlq_n_u32(sub_u32, 24)); + uint32x4_t rcon_vec = + vdupq_n_u32(_sse2neon_static_cast(uint32_t, rcon)); // lane-wise xor + uint32x4_t rot_xor = veorq_u32(rot, rcon_vec); - return vreinterpretq_m128i_u32(vbslq_u32(select_mask, x, ror_xor_x)); + return vreinterpretq_m128i_u32(vtrn2q_u32(sub_u32, rot_xor)); #else /* ARMv7-A NEON implementation */ uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); for (int i = 0; i < 4; ++i) { - ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]]; - ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]]; + (_sse2neon_reinterpret_cast(uint8_t *, &X1))[i] = + 
_sse2neon_sbox[(_sse2neon_reinterpret_cast(uint8_t *, &X1))[i]]; + (_sse2neon_reinterpret_cast(uint8_t *, &X3))[i] = + _sse2neon_sbox[(_sse2neon_reinterpret_cast(uint8_t *, &X3))[i]]; } return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); @@ -9854,7 +10942,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) #undef SSE2NEON_AES_SBOX #undef SSE2NEON_AES_RSBOX -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 #undef SSE2NEON_XT #undef SSE2NEON_MULTIPLY #endif @@ -9868,9 +10956,9 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) // for more details. FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) { - return vreinterpretq_m128i_u8( - vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^ - vreinterpretq_u8_m128i(b)); + return vreinterpretq_m128i_u8(veorq_u8( + vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + vreinterpretq_u8_m128i(b))); } // Perform one round of an AES decryption flow on data (state) in a using the @@ -9899,35 +10987,67 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) { return vreinterpretq_m128i_u8( - vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^ - vreinterpretq_u8_m128i(RoundKey); + veorq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)), + vreinterpretq_u8_m128i(RoundKey))); } // Perform the InvMixColumns transformation on a and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) { - return vreinterpretq_m128i_u8(vaesimcq_u8(a)); + return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a))); } // Assist in expanding the AES cipher key by computing steps towards generating // a round key for encryption cipher using data from a and an 8-bit round -// constant specified in imm8, and store the result in dst." +// constant specified in imm8, and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) { // AESE does ShiftRows and SubBytes on A - uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); + uint8x16_t sb_ = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); +#if !SSE2NEON_COMPILER_MSVC || SSE2NEON_COMPILER_CLANG uint8x16_t dest = { // Undo ShiftRows step from AESE and extract X1 and X3 - u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) - u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) - u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) - u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) + sb_[0x4], sb_[0x1], sb_[0xE], sb_[0xB], // SubBytes(X1) + sb_[0x1], sb_[0xE], sb_[0xB], sb_[0x4], // ROT(SubBytes(X1)) + sb_[0xC], sb_[0x9], sb_[0x6], sb_[0x3], // SubBytes(X3) + sb_[0x9], sb_[0x6], sb_[0x3], sb_[0xC], // ROT(SubBytes(X3)) }; - uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon}; + uint32x4_t r = {0, _sse2neon_static_cast(unsigned, rcon), 0, + _sse2neon_static_cast(unsigned, rcon)}; return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); +#else + // We have to do this hack because MSVC is strictly adhering to the CPP + // standard, in particular C++03 8.5.1 sub-section 15, which states that + // unions must be initialized by their first member type. 
+ + // As per the Windows ARM64 ABI, it is always little endian, so this works + __n128 dest{ + ((uint64_t) sb_.n128_u8[0x4] << 0) | + ((uint64_t) sb_.n128_u8[0x1] << 8) | + ((uint64_t) sb_.n128_u8[0xE] << 16) | + ((uint64_t) sb_.n128_u8[0xB] << 24) | + ((uint64_t) sb_.n128_u8[0x1] << 32) | + ((uint64_t) sb_.n128_u8[0xE] << 40) | + ((uint64_t) sb_.n128_u8[0xB] << 48) | + ((uint64_t) sb_.n128_u8[0x4] << 56), + ((uint64_t) sb_.n128_u8[0xC] << 0) | + ((uint64_t) sb_.n128_u8[0x9] << 8) | + ((uint64_t) sb_.n128_u8[0x6] << 16) | + ((uint64_t) sb_.n128_u8[0x3] << 24) | + ((uint64_t) sb_.n128_u8[0x9] << 32) | + ((uint64_t) sb_.n128_u8[0x6] << 40) | + ((uint64_t) sb_.n128_u8[0x3] << 48) | + ((uint64_t) sb_.n128_u8[0xC] << 56), + }; + + dest.n128_u32[1] = dest.n128_u32[1] ^ rcon; + dest.n128_u32[3] = dest.n128_u32[3] ^ rcon; + + return dest; +#endif } #endif @@ -9958,19 +11078,19 @@ FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) } } -FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode() +FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void) { union { fpcr_bitfield field; -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 uint64_t value; #else uint32_t value; #endif } r; -#if defined(__aarch64__) - __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#if SSE2NEON_ARCH_AARCH64 + r.value = _sse2neon_get_fpcr(); #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif @@ -9983,9 +11103,11 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode() // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32 FORCE_INLINE int _mm_popcnt_u32(unsigned int a) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 #if __has_builtin(__builtin_popcount) return __builtin_popcount(a); +#elif SSE2NEON_COMPILER_MSVC + return _CountOneBits(a); #else return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); #endif @@ -9995,7 +11117,7 @@ FORCE_INLINE int _mm_popcnt_u32(unsigned int a) uint16x4_t count16x4_val; uint32x2_t count32x2_val; - input_val = vld1_u8((uint8_t *) &a); + input_val = vld1_u8(_sse2neon_reinterpret_cast(uint8_t *, &a)); count8x8_val = vcnt_u8(input_val); count16x4_val = vpaddl_u8(count8x8_val); count32x2_val = vpaddl_u16(count16x4_val); @@ -10010,9 +11132,11 @@ FORCE_INLINE int _mm_popcnt_u32(unsigned int a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64 FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 #if __has_builtin(__builtin_popcountll) return __builtin_popcountll(a); +#elif SSE2NEON_COMPILER_MSVC + return _CountOneBits64(a); #else return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); #endif @@ -10023,7 +11147,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) uint32x2_t count32x2_val; uint64x1_t count64x1_val; - input_val = vld1_u8((uint8_t *) &a); + input_val = vld1_u8(_sse2neon_reinterpret_cast(uint8_t *, &a)); count8x8_val = vcnt_u8(input_val); count16x4_val = vpaddl_u8(count8x8_val); count32x2_val = vpaddl_u16(count16x4_val); @@ -10039,43 +11163,45 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) // regardless of the value of the FZ bit. 
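// (Context for the bitfield write below: FZ, flush-to-zero, is bit 24 of the
// AArch64 FPCR and of the AArch32 FPSCR, which is what 'fpcr_bitfield.bit24'
// refers to on both paths.)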
union { fpcr_bitfield field; -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 uint64_t value; #else uint32_t value; #endif } r; -#if defined(__aarch64__) - __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#if SSE2NEON_ARCH_AARCH64 + r.value = _sse2neon_get_fpcr(); #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON; -#if defined(__aarch64__) - __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */ +#if SSE2NEON_ARCH_AARCH64 + _sse2neon_set_fpcr(r.value); #else - __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ + __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } // Return the current 64-bit value of the processor's time-stamp counter. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc - FORCE_INLINE uint64_t _rdtsc(void) { -#if defined(__aarch64__) +#if SSE2NEON_ARCH_AARCH64 uint64_t val; /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the - * system counter is at least 56 bits wide; from Armv8.6, the counter - * must be 64 bits wide. So the system counter could be less than 64 - * bits wide and it is attributed with the flag 'cap_user_time_short' - * is true. + * system counter is at least 56 bits wide; from Armv8.6, the counter must + * be 64 bits wide. So the system counter could be less than 64 bits wide + * and it is attributed with the flag 'cap_user_time_short' is true. */ +#if SSE2NEON_COMPILER_MSVC && !SSE2NEON_COMPILER_CLANG + val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2)); +#else __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val)); +#endif return val; #else @@ -10099,7 +11225,7 @@ FORCE_INLINE uint64_t _rdtsc(void) #endif } -#if defined(__GNUC__) || defined(__clang__) +#if SSE2NEON_COMPILER_GCC_COMPAT #pragma pop_macro("ALIGN_STRUCT") #pragma pop_macro("FORCE_INLINE") #endif diff --git a/common/simd/simd.h b/common/simd/simd.h index e777d2df01..3088867c8d 100644 --- a/common/simd/simd.h +++ b/common/simd/simd.h @@ -6,7 +6,7 @@ #include "../math/emath.h" /* include SSE wrapper classes */ -#if defined(__SSE__) || defined(__ARM_NEON) +#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64) # include "sse.h" #endif diff --git a/common/simd/sse.h b/common/simd/sse.h index 04d90533dd..bf9d2c04f0 100644 --- a/common/simd/sse.h +++ b/common/simd/sse.h @@ -11,7 +11,7 @@ namespace embree { -#if defined(__aarch64__) || defined(__SSE4_1__) +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__SSE4_1__) __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { return _mm_blendv_ps(f,t,mask); } diff --git a/common/simd/vboold4_avx.h b/common/simd/vboold4_avx.h index 450bd7a4eb..6d22e26e7c 100644 --- a/common/simd/vboold4_avx.h +++ b/common/simd/vboold4_avx.h @@ -39,9 +39,9 @@ namespace embree __forceinline vboold(__m256d a) : v(a) {} __forceinline vboold(__m256i a) : v(_mm256_castsi256_pd(a)) {} - __forceinline operator const __m256() const { return _mm256_castpd_ps(v); } - __forceinline operator const __m256i() const { return _mm256_castpd_si256(v); } - __forceinline operator const __m256d() const { return v; } + __forceinline __m256 m256() const { return _mm256_castpd_ps(v); } + __forceinline __m256d m256d() const { return v; } + __forceinline __m256i mask32() const { return _mm256_castpd_si256(v); } __forceinline vboold(int a) { @@ -62,7 +62,7 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// 
__forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {} -#if !defined(__aarch64__) +#if !defined(__aarch64__) && !defined(_M_ARM64) __forceinline vboold(TrueTy) : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {} #else __forceinline vboold(TrueTy) : v(_mm256_cmpeq_pd(_mm256_setzero_pd(), _mm256_setzero_pd())) {} @@ -80,17 +80,17 @@ namespace embree /// Unary Operators //////////////////////////////////////////////////////////////////////////////// - __forceinline vboold4 operator !(const vboold4& a) { return _mm256_xor_pd(a, vboold4(embree::True)); } + __forceinline vboold4 operator !(const vboold4& a) { return _mm256_xor_pd(a.m256d(), vboold4(embree::True).m256d()); } //////////////////////////////////////////////////////////////////////////////// /// Binary Operators //////////////////////////////////////////////////////////////////////////////// - __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm256_and_pd(a, b); } - __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm256_or_pd (a, b); } - __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } + __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm256_and_pd(a.m256d(), b.m256d()); } + __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm256_or_pd (a.m256d(), b.m256d()); } + __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a.m256d(), b.m256d()); } - __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm256_andnot_pd(b, a); } + __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm256_andnot_pd(b.m256d(), a.m256d()); } __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; } __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; } @@ -100,31 +100,31 @@ namespace embree /// Comparison Operators + Select //////////////////////////////////////////////////////////////////////////////// - __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); } - __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(_mm256_xor_pd(a,b),vboold4(embree::True)); } + __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a.m256d(), b.m256d()); } + __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(_mm256_xor_pd(a.m256d(),b.m256d()),vboold4(embree::True).m256d()); } __forceinline vboold4 select(const vboold4& mask, const vboold4& t, const vboold4& f) { - return _mm256_blendv_pd(f, t, mask); + return _mm256_blendv_pd(f.m256d(), t.m256d(), mask.m256d()); } //////////////////////////////////////////////////////////////////////////////// /// Movement/Shifting/Shuffling Functions //////////////////////////////////////////////////////////////////////////////// -#if !defined(__aarch64__) - __forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); } - __forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); } +#if !defined(__aarch64__) && !defined(_M_ARM64) + __forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a.m256d(), b.m256d()); } + __forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a.m256d(), b.m256d()); } 
#endif #if defined(__AVX2__) template __forceinline vboold4 shuffle(const vboold4& v) { - return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i3, i2, i1, i0)); + return _mm256_permute4x64_pd(v.m256d(), _MM_SHUFFLE(i3, i2, i1, i0)); } template __forceinline vboold4 shuffle(const vboold4& v) { - return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i, i, i, i)); + return _mm256_permute4x64_pd(v.m256d(), _MM_SHUFFLE(i, i, i, i)); } #endif @@ -133,19 +133,19 @@ namespace embree /// Reduction Operations //////////////////////////////////////////////////////////////////////////////// - __forceinline bool reduce_and(const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; } - __forceinline bool reduce_or (const vboold4& a) { return !_mm256_testz_pd(a,a); } + __forceinline bool reduce_and(const vboold4& a) { return _mm256_movemask_pd(a.m256d()) == (unsigned int)0xf; } + __forceinline bool reduce_or (const vboold4& a) { return !_mm256_testz_pd(a.m256d(),a.m256d()); } - __forceinline bool all (const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; } - __forceinline bool any (const vboold4& a) { return !_mm256_testz_pd(a,a); } - __forceinline bool none(const vboold4& a) { return _mm256_testz_pd(a,a) != 0; } + __forceinline bool all (const vboold4& a) { return _mm256_movemask_pd(a.m256d()) == (unsigned int)0xf; } + __forceinline bool any (const vboold4& a) { return !_mm256_testz_pd(a.m256d(),a.m256d()); } + __forceinline bool none(const vboold4& a) { return _mm256_testz_pd(a.m256d(),a.m256d()) != 0; } __forceinline bool all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); } __forceinline bool any (const vboold4& valid, const vboold4& b) { return any(valid & b); } __forceinline bool none(const vboold4& valid, const vboold4& b) { return none(valid & b); } - __forceinline unsigned int movemask(const vboold4& a) { return _mm256_movemask_pd(a); } - __forceinline size_t popcnt (const vboold4& a) { return popcnt((size_t)_mm256_movemask_pd(a)); } + __forceinline unsigned int movemask(const vboold4& a) { return _mm256_movemask_pd(a.m256d()); } + __forceinline size_t popcnt (const vboold4& a) { return popcnt((size_t)_mm256_movemask_pd(a.m256d())); } //////////////////////////////////////////////////////////////////////////////// /// Get/Set Functions diff --git a/common/simd/vboold4_avx512.h b/common/simd/vboold4_avx512.h index ceaad7bba5..be8244c21b 100644 --- a/common/simd/vboold4_avx512.h +++ b/common/simd/vboold4_avx512.h @@ -32,12 +32,17 @@ namespace embree __forceinline vboold4& operator =(const vboold4& f) { v = f.v; return *this; } __forceinline vboold(const __mmask8 &t) { v = t; } - __forceinline operator __mmask8() const { return v; } __forceinline vboold(bool b) { v = b ? 
0xf : 0x0; } __forceinline vboold(int t) { v = (__mmask8)t; } __forceinline vboold(unsigned int t) { v = (__mmask8)t; } + /* return packed 8 bits mask */ + __forceinline __mmask8 packedMask8() const { return v; } + + /* return packed 16 bits mask */ + __forceinline __mmask16 packedMask16() const { return (__mmask16)v; } + /* return int8 mask */ __forceinline __m128i mask8() const { return _mm_movm_epi8(v); @@ -73,17 +78,17 @@ namespace embree /// Unary Operators //////////////////////////////////////////////////////////////////////////////// - __forceinline vboold4 operator !(const vboold4& a) { return _mm512_kandn(a, 0xf); } + __forceinline vboold4 operator !(const vboold4& a) { return _mm512_kandn(a.packedMask16(), 0xf); } //////////////////////////////////////////////////////////////////////////////// /// Binary Operators //////////////////////////////////////////////////////////////////////////////// - __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm512_kand(a, b); } - __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm512_kor(a, b); } - __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } + __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm512_kand(a.packedMask16(), b.packedMask16()); } + __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm512_kor(a.packedMask16(), b.packedMask16()); } + __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm512_kxor(a.packedMask16(), b.packedMask16()); } - __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm512_kandn(b, a); } + __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm512_kandn(b.packedMask16(), a.packedMask16()); } //////////////////////////////////////////////////////////////////////////////// /// Assignment Operators @@ -97,11 +102,11 @@ namespace embree /// Comparison Operators + Select //////////////////////////////////////////////////////////////////////////////// - __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); } - __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } + __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm512_kxor(a.packedMask16(), b.packedMask16()); } + __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm512_kand(_mm512_kxnor(a.packedMask16(), b.packedMask16()), 0xf); } __forceinline vboold4 select(const vboold4& s, const vboold4& a, const vboold4& b) { - return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); + return _mm512_kor(_mm512_kand(s.packedMask16(), a.packedMask16()), _mm512_kandn(s.packedMask16(), b.packedMask16())); } //////////////////////////////////////////////////////////////////////////////// @@ -109,21 +114,21 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// __forceinline int all (const vboold4& a) { return a.v == 0xf; } - __forceinline int any (const vboold4& a) { return _mm512_kortestz(a, a) == 0; } - __forceinline int none(const vboold4& a) { return _mm512_kortestz(a, a) != 0; } + __forceinline int any (const vboold4& a) { return _mm512_kortestz(a.packedMask16(), a.packedMask16()) == 0; } + __forceinline int none(const vboold4& a) { return _mm512_kortestz(a.packedMask16(), a.packedMask16()) != 0; } __forceinline int all (const 
  __forceinline int all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); }
  __forceinline int any (const vboold4& valid, const vboold4& b) { return any(valid & b); }
  __forceinline int none(const vboold4& valid, const vboold4& b) { return none(valid & b); }

-  __forceinline size_t movemask(const vboold4& a) { return _mm512_kmov(a); }
+  __forceinline size_t movemask(const vboold4& a) { return _mm512_kmov(a.packedMask16()); }
   __forceinline size_t popcnt (const vboold4& a) { return popcnt(a.v); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Conversion Operations
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline unsigned int toInt(const vboold4& a) { return mm512_mask2int(a); }
+  __forceinline unsigned int toInt(const vboold4& a) { return mm512_mask2int(a.packedMask16()); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Get/Set Functions
diff --git a/common/simd/vboold8_avx512.h b/common/simd/vboold8_avx512.h
index 66d2054872..2238dce661 100644
--- a/common/simd/vboold8_avx512.h
+++ b/common/simd/vboold8_avx512.h
@@ -32,12 +32,17 @@ namespace embree
     __forceinline vboold8& operator =(const vboold8& f) { v = f.v; return *this; }

     __forceinline vboold(const __mmask8& t) { v = t; }
-    __forceinline operator __mmask8() const { return v; }

     __forceinline vboold(bool b) { v = b ? 0xff : 0x00; }
     __forceinline vboold(int t) { v = (__mmask8)t; }
     __forceinline vboold(unsigned int t) { v = (__mmask8)t; }

+    /* return packed 8-bit mask */
+    __forceinline __mmask8 packedMask8() const { return v; }
+
+    /* return packed 16-bit mask */
+    __forceinline __mmask16 packedMask16() const { return (__mmask16)v; }
+
     /* return int8 mask */
     __forceinline __m128i mask8() const {
       return _mm_movm_epi8(v);
@@ -68,17 +73,17 @@ namespace embree
  /// Unary Operators
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboold8 operator !(const vboold8& a) { return _mm512_knot(a); }
+  __forceinline vboold8 operator !(const vboold8& a) { return _mm512_knot(a.packedMask16()); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Binary Operators
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboold8 operator &(const vboold8& a, const vboold8& b) { return _mm512_kand(a, b); }
-  __forceinline vboold8 operator |(const vboold8& a, const vboold8& b) { return _mm512_kor(a, b); }
-  __forceinline vboold8 operator ^(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); }
+  __forceinline vboold8 operator &(const vboold8& a, const vboold8& b) { return _mm512_kand(a.packedMask16(), b.packedMask16()); }
+  __forceinline vboold8 operator |(const vboold8& a, const vboold8& b) { return _mm512_kor(a.packedMask16(), b.packedMask16()); }
+  __forceinline vboold8 operator ^(const vboold8& a, const vboold8& b) { return _mm512_kxor(a.packedMask16(), b.packedMask16()); }

-  __forceinline vboold8 andn(const vboold8& a, const vboold8& b) { return _mm512_kandn(b, a); }
+  __forceinline vboold8 andn(const vboold8& a, const vboold8& b) { return _mm512_kandn(b.packedMask16(), a.packedMask16()); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Assignment Operators
@@ -92,11 +97,11 @@ namespace embree
  /// Comparison Operators + Select
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboold8 operator !=(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); }
-  __forceinline vboold8 operator ==(const vboold8& a, const vboold8& b) { return _mm512_kxnor(a, b); }
+  __forceinline vboold8 operator !=(const vboold8& a, const vboold8& b) { return _mm512_kxor(a.packedMask16(), b.packedMask16()); }
+  __forceinline vboold8 operator ==(const vboold8& a, const vboold8& b) { return _mm512_kxnor(a.packedMask16(), b.packedMask16()); }

  __forceinline vboold8 select(const vboold8& s, const vboold8& a, const vboold8& b) {
-    return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b));
+    return _mm512_kor(_mm512_kand(s.packedMask16(), a.packedMask16()), _mm512_kandn(s.packedMask16(), b.packedMask16()));
  }

  ////////////////////////////////////////////////////////////////////////////////
@@ -104,21 +109,21 @@ namespace embree
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline int all (const vboold8& a) { return a.v == 0xff; }
-  __forceinline int any (const vboold8& a) { return _mm512_kortestz(a, a) == 0; }
-  __forceinline int none(const vboold8& a) { return _mm512_kortestz(a, a) != 0; }
+  __forceinline int any (const vboold8& a) { return _mm512_kortestz(a.packedMask16(), a.packedMask16()) == 0; }
+  __forceinline int none(const vboold8& a) { return _mm512_kortestz(a.packedMask16(), a.packedMask16()) != 0; }

  __forceinline int all (const vboold8& valid, const vboold8& b) { return all((!valid) | b); }
  __forceinline int any (const vboold8& valid, const vboold8& b) { return any(valid & b); }
  __forceinline int none(const vboold8& valid, const vboold8& b) { return none(valid & b); }

-  __forceinline size_t movemask(const vboold8& a) { return _mm512_kmov(a); }
+  __forceinline size_t movemask(const vboold8& a) { return _mm512_kmov(a.packedMask16()); }
   __forceinline size_t popcnt (const vboold8& a) { return popcnt(a.v); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Conversion Operations
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline unsigned int toInt(const vboold8& a) { return mm512_mask2int(a); }
+  __forceinline unsigned int toInt(const vboold8& a) { return mm512_mask2int(a.packedMask16()); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Get/Set Functions
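
The edit above is the template for every AVX-512 mask type touched by this patch: the implicit `operator __mmask8()`/`operator __mmask16()` conversions are deleted, and each intrinsic call site now names the mask width it wants via packedMask8()/packedMask16() (the _mm512_k* mask intrinsics take __mmask16, so the narrower types widen explicitly). Roughly this shape, as a stand-alone sketch whose names and types are illustrative rather than embree's real layout:

    #include <cstdint>

    // Illustrative 8-wide mask: an explicit accessor replaces the old
    // implicit conversion operator, so widening to the 16-bit form the
    // k-register intrinsics expect is always visible at the call site.
    struct bool8 {
      uint8_t v;
      uint8_t  packedMask8()  const { return v; }           // natural width
      uint16_t packedMask16() const { return (uint16_t)v; } // explicit widening
    };

    // mirrors _mm512_kand(a.packedMask16(), b.packedMask16())
    inline uint16_t andMasks(const bool8& a, const bool8& b) {
      return (uint16_t)(a.packedMask16() & b.packedMask16());
    }
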
diff --git a/common/simd/vboolf16_avx512.h b/common/simd/vboolf16_avx512.h
index 86b718f025..af8fb701ef 100644
--- a/common/simd/vboolf16_avx512.h
+++ b/common/simd/vboolf16_avx512.h
@@ -33,12 +33,17 @@ namespace embree
     __forceinline vboolf16& operator =(const vboolf16& f) { v = f.v; return *this; }

     __forceinline vboolf(const __mmask16& t) { v = t; }
-    __forceinline operator __mmask16() const { return v; }

     __forceinline vboolf(bool b) { v = b ? 0xFFFF : 0x0000; }
     __forceinline vboolf(int t) { v = (__mmask16)t; }
     __forceinline vboolf(unsigned int t) { v = (__mmask16)t; }

+    /* return packed 16-bit mask */
+    __forceinline __mmask16 packedMask16() const { return v; }
+
+    /* return packed 8-bit mask */
+    __forceinline __mmask8 packedMask8() const { return (__mmask8)v; }
+
     /* return int8 mask */
     __forceinline __m128i mask8() const {
       return _mm_movm_epi8(v);
@@ -69,17 +74,17 @@ namespace embree
  /// Unary Operators
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboolf16 operator !(const vboolf16& a) { return _mm512_knot(a); }
+  __forceinline vboolf16 operator !(const vboolf16& a) { return _mm512_knot(a.packedMask16()); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Binary Operators
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboolf16 operator &(const vboolf16& a, const vboolf16& b) { return _mm512_kand(a,b); }
-  __forceinline vboolf16 operator |(const vboolf16& a, const vboolf16& b) { return _mm512_kor(a,b); }
-  __forceinline vboolf16 operator ^(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a,b); }
+  __forceinline vboolf16 operator &(const vboolf16& a, const vboolf16& b) { return _mm512_kand(a.packedMask16(),b.packedMask16()); }
+  __forceinline vboolf16 operator |(const vboolf16& a, const vboolf16& b) { return _mm512_kor(a.packedMask16(),b.packedMask16()); }
+  __forceinline vboolf16 operator ^(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a.packedMask16(),b.packedMask16()); }

-  __forceinline vboolf16 andn(const vboolf16& a, const vboolf16& b) { return _mm512_kandn(b,a); }
+  __forceinline vboolf16 andn(const vboolf16& a, const vboolf16& b) { return _mm512_kandn(b.packedMask16(),a.packedMask16()); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Assignment Operators
@@ -93,33 +98,33 @@ namespace embree
  /// Comparison Operators + Select
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboolf16 operator !=(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a, b); }
-  __forceinline vboolf16 operator ==(const vboolf16& a, const vboolf16& b) { return _mm512_kxnor(a, b); }
+  __forceinline vboolf16 operator !=(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a.packedMask16(), b.packedMask16()); }
+  __forceinline vboolf16 operator ==(const vboolf16& a, const vboolf16& b) { return _mm512_kxnor(a.packedMask16(), b.packedMask16()); }

  __forceinline vboolf16 select(const vboolf16& s, const vboolf16& a, const vboolf16& b) {
-    return _mm512_kor(_mm512_kand(s,a),_mm512_kandn(s,b));
+    return _mm512_kor(_mm512_kand(s.packedMask16(),a.packedMask16()),_mm512_kandn(s.packedMask16(),b.packedMask16()));
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Reduction Operations
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline int all (const vboolf16& a) { return _mm512_kortestc(a,a) != 0; }
-  __forceinline int any (const vboolf16& a) { return _mm512_kortestz(a,a) == 0; }
-  __forceinline int none(const vboolf16& a) { return _mm512_kortestz(a,a) != 0; }
+  __forceinline int all (const vboolf16& a) { return _mm512_kortestc(a.packedMask16(),a.packedMask16()) != 0; }
+  __forceinline int any (const vboolf16& a) { return _mm512_kortestz(a.packedMask16(),a.packedMask16()) == 0; }
+  __forceinline int none(const vboolf16& a) { return _mm512_kortestz(a.packedMask16(),a.packedMask16()) != 0; }

  __forceinline int all (const vboolf16& valid, const vboolf16& b) { return all((!valid) | b); }
  __forceinline int any (const vboolf16& valid, const vboolf16& b) { return any(valid & b); }
  __forceinline int none(const vboolf16& valid, const vboolf16& b) { return none(valid & b); }

-  __forceinline size_t movemask(const vboolf16& a) { return _mm512_kmov(a); }
+  __forceinline size_t movemask(const vboolf16& a) { return _mm512_kmov(a.packedMask16()); }
   __forceinline size_t popcnt (const vboolf16& a) { return popcnt(a.v); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Conversion Operations
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline unsigned int toInt (const vboolf16& a) { return mm512_mask2int(a); }
+  __forceinline unsigned int toInt (const vboolf16& a) { return mm512_mask2int(a.packedMask16()); }
   __forceinline vboolf16 toMask(const int& a) { return mm512_int2mask(a); }

  ////////////////////////////////////////////////////////////////////////////////
diff --git a/common/simd/vboolf4_avx512.h b/common/simd/vboolf4_avx512.h
index e65f66b025..af2de36e95 100644
--- a/common/simd/vboolf4_avx512.h
+++ b/common/simd/vboolf4_avx512.h
@@ -32,7 +32,6 @@ namespace embree
     __forceinline vboolf4& operator =(const vboolf4& f) { v = f.v; return *this; }

     __forceinline vboolf(const __mmask8 &t) { v = t; }
-    __forceinline operator __mmask8() const { return v; }

     __forceinline vboolf(bool b) { v = b ? 0xf : 0x0; }
     __forceinline vboolf(int t) { v = (__mmask8)t; }
@@ -41,6 +40,12 @@ namespace embree
     __forceinline vboolf(bool a, bool b, bool c, bool d) : v((__mmask8)((int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {}

+    /* return packed 8-bit mask */
+    __forceinline __mmask8 packedMask8() const { return v; }
+
+    /* return packed 16-bit mask */
+    __forceinline __mmask16 packedMask16() const { return (__mmask16)v; }
+
     /* return int8 mask */
     __forceinline __m128i mask8() const {
       return _mm_movm_epi8(v);
@@ -76,17 +81,17 @@ namespace embree
  /// Unary Operators
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboolf4 operator !(const vboolf4& a) { return _mm512_kandn(a, 0xf); }
+  __forceinline vboolf4 operator !(const vboolf4& a) { return _mm512_kandn(a.packedMask16(), 0xf); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Binary Operators
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm512_kand(a, b); }
-  __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm512_kor(a, b); }
-  __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); }
+  __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm512_kand(a.packedMask16(), b.packedMask16()); }
+  __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm512_kor(a.packedMask16(), b.packedMask16()); }
+  __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a.packedMask16(), b.packedMask16()); }

-  __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm512_kandn(b, a); }
+  __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm512_kandn(b.packedMask16(), a.packedMask16()); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Assignment Operators
@@ -100,11 +105,11 @@ namespace embree
  /// Comparison Operators + Select
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); }
-  __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); }
+  __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a.packedMask16(), b.packedMask16()); }
+  __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm512_kand(_mm512_kxnor(a.packedMask16(), b.packedMask16()), 0xf); }

  __forceinline vboolf4 select(const vboolf4& s, const vboolf4& a, const vboolf4& b) {
-    return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b));
+    return _mm512_kor(_mm512_kand(s.packedMask16(), a.packedMask16()), _mm512_kandn(s.packedMask16(), b.packedMask16()));
  }

  ////////////////////////////////////////////////////////////////////////////////
@@ -112,21 +117,21 @@ namespace embree
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline int all (const vboolf4& a) { return a.v == 0xf; }
-  __forceinline int any (const vboolf4& a) { return _mm512_kortestz(a, a) == 0; }
-  __forceinline int none(const vboolf4& a) { return _mm512_kortestz(a, a) != 0; }
+  __forceinline int any (const vboolf4& a) { return _mm512_kortestz(a.packedMask16(), a.packedMask16()) == 0; }
+  __forceinline int none(const vboolf4& a) { return _mm512_kortestz(a.packedMask16(), a.packedMask16()) != 0; }

  __forceinline int all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); }
  __forceinline int any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); }
  __forceinline int none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); }

-  __forceinline size_t movemask(const vboolf4& a) { return _mm512_kmov(a); }
+  __forceinline size_t movemask(const vboolf4& a) { return _mm512_kmov(a.packedMask16()); }
   __forceinline size_t popcnt (const vboolf4& a) { return popcnt(a.v); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Conversion Operations
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline unsigned int toInt(const vboolf4& a) { return mm512_mask2int(a); }
+  __forceinline unsigned int toInt(const vboolf4& a) { return mm512_mask2int(a.packedMask16()); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Get/Set Functions
diff --git a/common/simd/vboolf4_sse2.h b/common/simd/vboolf4_sse2.h
index e96525c9a7..3bb4566ba7 100644
--- a/common/simd/vboolf4_sse2.h
+++ b/common/simd/vboolf4_sse2.h
@@ -24,7 +24,7 @@ namespace embree
     typedef vfloat4 Float;

     enum { size = 4 };             // number of SIMD elements
-    union { __m128 v; int i[4]; }; // data
+    union {__m128 v; int i[4]; };  // data

    ////////////////////////////////////////////////////////////////////////////////
    /// Constructors, Assignment & Cast Operators
@@ -35,10 +35,11 @@ namespace embree
     __forceinline vboolf4& operator =(const vboolf4& other) { v = other.v; return *this; }

     __forceinline vboolf(__m128 input) : v(input) {}
-    __forceinline operator const __m128&() const { return v; }
+    __forceinline const __m128& m128() const { return v; }
+    __forceinline __m128& m128() { return v; }

 #if !defined(__EMSCRIPTEN__)
-    __forceinline operator const __m128i() const { return _mm_castps_si128(v); }
-    __forceinline operator const __m128d() const { return _mm_castps_pd(v); }
+    __forceinline const __m128i m128i() const { return _mm_castps_si128(v); }
+    __forceinline const __m128d m128d() const { return _mm_castps_pd(v); }
 #endif

     __forceinline vboolf(bool a)
@@ -74,17 +75,17 @@ namespace embree
  /// Unary Operators
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboolf4 operator !(const vboolf4& a) { return _mm_xor_ps(a, vboolf4(embree::True)); }
+  __forceinline vboolf4 operator !(const vboolf4& a) { return _mm_xor_ps(a.m128(), (__m128)vboolf4(embree::True).m128()); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Binary Operators
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm_and_ps(a, b); }
-  __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm_or_ps (a, b); }
-  __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); }
+  __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm_and_ps(a.m128(), b.m128()); }
+  __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm_or_ps (a.m128(), b.m128()); }
+  __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a.m128(), b.m128()); }

-  __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm_andnot_ps(b, a); }
+  __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm_andnot_ps(b.m128(), a.m128()); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Assignment Operators
@@ -98,14 +99,15 @@ namespace embree
  /// Comparison Operators + Select
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); }
-  __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); }
+  __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a.m128(), b.m128()); }
+  // NOTE: a.m128i() and b.m128i() should be equivalent here
+  __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a.mask32(), b.mask32())); }

  __forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) {
-#if defined(__aarch64__) || defined(__SSE4_1__)
-    return _mm_blendv_ps(f, t, m);
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__SSE4_1__)
+    return _mm_blendv_ps(f.m128(), t.m128(), m.m128());
 #else
-    return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
+    return _mm_or_ps(_mm_and_ps(m.m128(), t.m128()), _mm_andnot_ps(m.m128(), f.m128()));
 #endif
  }

@@ -113,28 +115,42 @@ namespace embree
  /// Movement/Shifting/Shuffling Functions
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); }
-  __forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); }
+  __forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a.m128(), b.m128()); }
+  __forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a.m128(), b.m128()); }

-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
  template<int i0, int i1, int i2, int i3>
  __forceinline vboolf4 shuffle(const vboolf4& v) {
+#if !defined(_M_ARM64)
    return vreinterpretq_f32_u8(vqtbl1q_u8( vreinterpretq_u8_s32((int32x4_t)v.v), _MN_SHUFFLE(i0, i1, i2, i3)));
+#else
+    // Avoids C4576 (no mixing C+CPP syntax) and C4002 (comma inside macro invocation)
+    uint8x16_t _shuffle = _MN_SHUFFLE(i0, i1, i2, i3);
+    // NOTE: v.v is passed directly here; unlike the path above, no int32x4_t cast is applied
+    return vreinterpretq_f32_u8(vqtbl1q_u8( vreinterpretq_u8_s32(v.v), _shuffle));
+#endif
  }

  template<int i0, int i1, int i2, int i3>
  __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) {
+#if !defined(_M_ARM64)
    return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+#else
+    // Avoids C4576 (no mixing C+CPP syntax) and C4002 (comma inside macro invocation)
+    uint8x16x2_t _ab = {(uint8x16_t)a.v, (uint8x16_t)b.v};
+    uint8x16_t _shuffle = _MF_SHUFFLE(i0, i1, i2, i3);
+    return vreinterpretq_f32_u8(vqtbl2q_u8(_ab, _shuffle));
+#endif
  }
 #else
  template<int i0, int i1, int i2, int i3>
  __forceinline vboolf4 shuffle(const vboolf4& v) {
-    return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)));
+    return _mm_castsi128_ps(_mm_shuffle_epi32(v.mask32(), _MM_SHUFFLE(i3, i2, i1, i0)));
  }

  template<int i0, int i1, int i2, int i3>
  __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) {
-    return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+    return _mm_shuffle_ps(a.m128(), b.m128(), _MM_SHUFFLE(i3, i2, i1, i0));
  }
 #endif
@@ -144,13 +160,13 @@ namespace embree
  }

 #if defined(__SSE3__)
-  template<> __forceinline vboolf4 shuffle<0, 0, 2, 2>(const vboolf4& v) { return _mm_moveldup_ps(v); }
-  template<> __forceinline vboolf4 shuffle<1, 1, 3, 3>(const vboolf4& v) { return _mm_movehdup_ps(v); }
-  template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); }
+  template<> __forceinline vboolf4 shuffle<0, 0, 2, 2>(const vboolf4& v) { return _mm_moveldup_ps(v.m128()); }
+  template<> __forceinline vboolf4 shuffle<1, 1, 3, 3>(const vboolf4& v) { return _mm_movehdup_ps(v.m128()); }
+  template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v.m128d())); }
 #endif

-#if defined(__SSE4_1__) && !defined(__aarch64__)
-  template<int dst, int src, int clr> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
+#if defined(__SSE4_1__) && !defined(__aarch64__) && !defined(_M_ARM64)
+  template<int dst, int src, int clr> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a.m128(), b.m128(), (dst << 4) | (src << 6) | clr); }
   template<int dst> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert<dst, 0, 0>(a, b); }
   template<int dst> __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert<dst>(a, vboolf4(b)); }
 #endif
@@ -159,22 +175,22 @@ namespace embree
  /// Reduction Operations
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline bool reduce_and(const vboolf4& a) { return _mm_movemask_ps(a) == 0xf; }
-  __forceinline bool reduce_or (const vboolf4& a) { return _mm_movemask_ps(a) != 0x0; }
+  __forceinline bool reduce_and(const vboolf4& a) { return _mm_movemask_ps(a.m128()) == 0xf; }
+  __forceinline bool reduce_or (const vboolf4& a) { return _mm_movemask_ps(a.m128()) != 0x0; }

-  __forceinline bool all (const vboolf4& b) { return _mm_movemask_ps(b) == 0xf; }
-  __forceinline bool any (const vboolf4& b) { return _mm_movemask_ps(b) != 0x0; }
-  __forceinline bool none(const vboolf4& b) { return _mm_movemask_ps(b) == 0x0; }
+  __forceinline bool all (const vboolf4& b) { return _mm_movemask_ps(b.m128()) == 0xf; }
+  __forceinline bool any (const vboolf4& b) { return _mm_movemask_ps(b.m128()) != 0x0; }
+  __forceinline bool none(const vboolf4& b) { return _mm_movemask_ps(b.m128()) == 0x0; }

  __forceinline bool all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); }
  __forceinline bool any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); }
  __forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); }

-  __forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); }
-#if defined(__aarch64__)
+  __forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a.m128()); }
+#if defined(__aarch64__) || defined(_M_ARM64)
   __forceinline size_t popcnt(const vboolf4& a) { return vaddvq_s32(vandq_u32(vreinterpretq_u32_f32(a.v),_mm_set1_epi32(1))); }
 #elif defined(__SSE4_2__)
-  __forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); }
+  __forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a.m128())); }
 #else
   __forceinline size_t popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); }
 #endif
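
The `_shuffle`/`_ab` locals in the _M_ARM64 shuffle branches above exist only to keep MSVC happy: _MN_SHUFFLE/_MF_SHUFFLE expand to brace-initialized byte vectors, which MSVC rejects when written directly inside a call (C4576, a C-style compound literal in C++ code) and whose inner commas can be mis-parsed as extra macro arguments (C4002). A compilable stand-alone analog of the workaround, with hypothetical names in place of the NEON types and macros:

    #include <cstdio>

    struct Vec4 { unsigned char b[4]; };
    // Hypothetical stand-in for _MN_SHUFFLE: expands to a braced initializer.
    #define BYTES(i0, i1, i2, i3) { { (unsigned char)(i0), (unsigned char)(i1), \
                                      (unsigned char)(i2), (unsigned char)(i3) } }

    static unsigned char firstByte(Vec4 v) { return v.b[0]; }

    int main() {
      // firstByte(BYTES(7, 1, 2, 3)); // braces are not an expression here;
      //                               // MSVC reports C4576/C4002 on such uses
      Vec4 s = BYTES(7, 1, 2, 3);      // bind the expansion to a named local...
      printf("%d\n", firstByte(s));    // ...then pass the local into the call
      return 0;
    }
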
diff --git a/common/simd/vboolf8_avx.h b/common/simd/vboolf8_avx.h
index 18cede19c6..4480bd5ac7 100644
--- a/common/simd/vboolf8_avx.h
+++ b/common/simd/vboolf8_avx.h
@@ -39,9 +39,8 @@ namespace embree
     __forceinline vboolf8& operator =(const vboolf8& a) { v = a.v; return *this; }

     __forceinline vboolf(__m256 a) : v(a) {}
-    __forceinline operator const __m256&() const { return v; }
-    __forceinline operator const __m256i() const { return _mm256_castps_si256(v); }
-    __forceinline operator const __m256d() const { return _mm256_castps_pd(v); }
+    __forceinline const __m256& m256() const { return v; }
+    __forceinline const __m256d m256d() const { return _mm256_castps_pd(v); }

     __forceinline vboolf(int a)
     {
@@ -57,14 +56,14 @@ namespace embree
 #endif
     }

-    __forceinline vboolf(const vboolf4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {}
-    __forceinline vboolf(const vboolf4& a, const vboolf4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {}
+    __forceinline vboolf(const vboolf4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a.m128()),a.m128(),1)) {}
+    __forceinline vboolf(const vboolf4& a, const vboolf4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a.m128()),b.m128(),1)) {}
     __forceinline vboolf(__m128 a, __m128 b) : vl(a), vh(b) {}

-    __forceinline vboolf(bool a) : v(vboolf8(vboolf4(a), vboolf4(a))) {}
-    __forceinline vboolf(bool a, bool b) : v(vboolf8(vboolf4(a), vboolf4(b))) {}
-    __forceinline vboolf(bool a, bool b, bool c, bool d) : v(vboolf8(vboolf4(a,b), vboolf4(c,d))) {}
-    __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) : v(vboolf8(vboolf4(a,b,c,d), vboolf4(e,f,g,h))) {}
+    __forceinline vboolf(bool a) : v(vboolf8(vboolf4(a).m128(), vboolf4(a).m128()).m256()) {}
+    __forceinline vboolf(bool a, bool b) : v(vboolf8(vboolf4(a).m128(), vboolf4(b).m128()).m256()) {}
+    __forceinline vboolf(bool a, bool b, bool c, bool d) : v(vboolf8(vboolf4(a,b).m128(), vboolf4(c,d).m128()).m256()) {}
+    __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) : v(vboolf8(vboolf4(a,b,c,d).m128(), vboolf4(e,f,g,h).m128()).m256()) {}

     /* return int32 mask */
     __forceinline __m256i mask32() const {
@@ -90,17 +89,17 @@ namespace embree
  /// Unary Operators
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboolf8 operator !(const vboolf8& a) { return _mm256_xor_ps(a, vboolf8(embree::True)); }
+  __forceinline vboolf8 operator !(const vboolf8& a) { return _mm256_xor_ps(a.m256(), vboolf8(embree::True).m256()); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Binary Operators
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm256_and_ps(a, b); }
-  __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm256_or_ps (a, b); }
-  __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); }
+  __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm256_and_ps(a.m256(), b.m256()); }
+  __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm256_or_ps (a.m256(), b.m256()); }
+  __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a.m256(), b.m256()); }

-  __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm256_andnot_ps(b, a); }
+  __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm256_andnot_ps(b.m256(), a.m256()); }

  __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; }
  __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; }
@@ -110,70 +109,70 @@ namespace embree
  /// Comparison Operators + Select
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); }
-  __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(_mm256_xor_ps(a,b),vboolf8(embree::True)); }
+  __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a.m256(), b.m256()); }
+  __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(_mm256_xor_ps(a.m256(),b.m256()),vboolf8(embree::True).m256()); }

  __forceinline vboolf8 select(const vboolf8& mask, const vboolf8& t, const vboolf8& f) {
-    return _mm256_blendv_ps(f, t, mask);
+    return _mm256_blendv_ps(f.m256(), t.m256(), mask.m256());
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Movement/Shifting/Shuffling Functions
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboolf8 unpacklo(const vboolf8& a, const vboolf8& b) { return _mm256_unpacklo_ps(a, b); }
-  __forceinline vboolf8 unpackhi(const vboolf8& a, const vboolf8& b) { return _mm256_unpackhi_ps(a, b); }
+  __forceinline vboolf8 unpacklo(const vboolf8& a, const vboolf8& b) { return _mm256_unpacklo_ps(a.m256(), b.m256()); }
+  __forceinline vboolf8 unpackhi(const vboolf8& a, const vboolf8& b) { return _mm256_unpackhi_ps(a.m256(), b.m256()); }

  template<int i>
  __forceinline vboolf8 shuffle(const vboolf8& v) {
-    return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i));
+    return _mm256_permute_ps(v.m256(), _MM_SHUFFLE(i, i, i, i));
  }

  template<int i0, int i1>
  __forceinline vboolf8 shuffle4(const vboolf8& v) {
-    return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0));
+    return _mm256_permute2f128_ps(v.m256(), v.m256(), (i1 << 4) | (i0 << 0));
  }

  template<int i0, int i1>
  __forceinline vboolf8 shuffle4(const vboolf8& a, const vboolf8& b) {
-    return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0));
+    return _mm256_permute2f128_ps(a.m256(), b.m256(), (i1 << 4) | (i0 << 0));
  }

  template<int i0, int i1, int i2, int i3>
  __forceinline vboolf8 shuffle(const vboolf8& v) {
-    return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0));
+    return _mm256_permute_ps(v.m256(), _MM_SHUFFLE(i3, i2, i1, i0));
  }

  template<int i0, int i1, int i2, int i3>
  __forceinline vboolf8 shuffle(const vboolf8& a, const vboolf8& b) {
-    return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+    return _mm256_shuffle_ps(a.m256(), b.m256(), _MM_SHUFFLE(i3, i2, i1, i0));
  }

-  template<> __forceinline vboolf8 shuffle<0, 0, 2, 2>(const vboolf8& v) { return _mm256_moveldup_ps(v); }
-  template<> __forceinline vboolf8 shuffle<1, 1, 3, 3>(const vboolf8& v) { return _mm256_movehdup_ps(v); }
-  template<> __forceinline vboolf8 shuffle<0, 1, 0, 1>(const vboolf8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); }
+  template<> __forceinline vboolf8 shuffle<0, 0, 2, 2>(const vboolf8& v) { return _mm256_moveldup_ps(v.m256()); }
+  template<> __forceinline vboolf8 shuffle<1, 1, 3, 3>(const vboolf8& v) { return _mm256_movehdup_ps(v.m256()); }
+  template<> __forceinline vboolf8 shuffle<0, 1, 0, 1>(const vboolf8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v.m256()))); }

-  template<int i> __forceinline vboolf8 insert4(const vboolf8& a, const vboolf4& b) { return _mm256_insertf128_ps(a, b, i); }
-  template<int i> __forceinline vboolf4 extract4 (const vboolf8& a) { return _mm256_extractf128_ps(a, i); }
-  template<>      __forceinline vboolf4 extract4<0>(const vboolf8& a) { return _mm256_castps256_ps128(a); }
+  template<int i> __forceinline vboolf8 insert4(const vboolf8& a, const vboolf4& b) { return _mm256_insertf128_ps(a.m256(), b.m128(), i); }
+  template<int i> __forceinline vboolf4 extract4 (const vboolf8& a) { return _mm256_extractf128_ps(a.m256(), i); }
+  template<>      __forceinline vboolf4 extract4<0>(const vboolf8& a) { return _mm256_castps256_ps128(a.m256()); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Reduction Operations
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline bool reduce_and(const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; }
-  __forceinline bool reduce_or (const vboolf8& a) { return !_mm256_testz_ps(a,a); }
+  __forceinline bool reduce_and(const vboolf8& a) { return _mm256_movemask_ps(a.m256()) == (unsigned int)0xff; }
+  __forceinline bool reduce_or (const vboolf8& a) { return !_mm256_testz_ps(a.m256(),a.m256()); }

-  __forceinline bool all (const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; }
-  __forceinline bool any (const vboolf8& a) { return !_mm256_testz_ps(a,a); }
-  __forceinline bool none(const vboolf8& a) { return _mm256_testz_ps(a,a) != 0; }
+  __forceinline bool all (const vboolf8& a) { return _mm256_movemask_ps(a.m256()) == (unsigned int)0xff; }
+  __forceinline bool any (const vboolf8& a) { return !_mm256_testz_ps(a.m256(),a.m256()); }
+  __forceinline bool none(const vboolf8& a) { return _mm256_testz_ps(a.m256(),a.m256()) != 0; }

  __forceinline bool all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); }
  __forceinline bool any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); }
  __forceinline bool none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); }

-  __forceinline unsigned int movemask(const vboolf8& a) { return _mm256_movemask_ps(a); }
-  __forceinline size_t       popcnt  (const vboolf8& a) { return popcnt((size_t)_mm256_movemask_ps(a)); }
+  __forceinline unsigned int movemask(const vboolf8& a) { return _mm256_movemask_ps(a.m256()); }
+  __forceinline size_t       popcnt  (const vboolf8& a) { return popcnt((size_t)_mm256_movemask_ps(a.m256())); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Get/Set Functions
diff --git a/common/simd/vboolf8_avx512.h b/common/simd/vboolf8_avx512.h
index 73ff5666e1..daa1079903 100644
--- a/common/simd/vboolf8_avx512.h
+++ b/common/simd/vboolf8_avx512.h
@@ -32,7 +32,6 @@ namespace embree
     __forceinline vboolf8& operator =(const vboolf8& f) { v = f.v; return *this; }

     __forceinline vboolf(const __mmask8 &t) { v = t; }
-    __forceinline operator __mmask8() const { return v; }

     __forceinline vboolf(bool b) { v = b ? 0xff : 0x00; }
     __forceinline vboolf(int t) { v = (__mmask8)t; }
@@ -41,6 +40,12 @@ namespace embree
     __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h)
       : v((__mmask8)((int(h) << 7) | (int(g) << 6) | (int(f) << 5) | (int(e) << 4) | (int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {}

+    /* return packed 8-bit mask */
+    __forceinline __mmask8 packedMask8() const { return v; }
+
+    /* return packed 16-bit mask */
+    __forceinline __mmask16 packedMask16() const { return (__mmask16)v; }
+
     /* return int8 mask */
     __forceinline __m128i mask8() const {
       return _mm_movm_epi8(v);
@@ -76,17 +81,17 @@ namespace embree
  /// Unary Operators
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboolf8 operator !(const vboolf8& a) { return _mm512_knot(a); }
+  __forceinline vboolf8 operator !(const vboolf8& a) { return _mm512_knot(a.packedMask16()); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Binary Operators
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm512_kand(a, b); }
-  __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm512_kor(a, b); }
-  __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); }
+  __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm512_kand(a.packedMask16(), b.packedMask16()); }
+  __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm512_kor(a.packedMask16(), b.packedMask16()); }
+  __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a.packedMask16(), b.packedMask16()); }

-  __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm512_kandn(b, a); }
+  __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm512_kandn(b.packedMask16(), a.packedMask16()); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Assignment Operators
@@ -100,11 +105,11 @@ namespace embree
  /// Comparison Operators + Select
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); }
-  __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm512_kxnor(a, b); }
+  __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a.packedMask16(), b.packedMask16()); }
+  __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm512_kxnor(a.packedMask16(), b.packedMask16()); }

  __forceinline vboolf8 select(const vboolf8& s, const vboolf8& a, const vboolf8& b) {
-    return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b));
+    return _mm512_kor(_mm512_kand(s.packedMask16(), a.packedMask16()), _mm512_kandn(s.packedMask16(), b.packedMask16()));
  }

  ////////////////////////////////////////////////////////////////////////////////
@@ -112,21 +117,21 @@ namespace embree
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline int all (const vboolf8& a) { return a.v == 0xff; }
-  __forceinline int any (const vboolf8& a) { return _mm512_kortestz(a, a) == 0; }
-  __forceinline int none(const vboolf8& a) { return _mm512_kortestz(a, a) != 0; }
+  __forceinline int any (const vboolf8& a) { return _mm512_kortestz(a.packedMask16(), a.packedMask16()) == 0; }
+  __forceinline int none(const vboolf8& a) { return _mm512_kortestz(a.packedMask16(), a.packedMask16()) != 0; }

  __forceinline int all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); }
  __forceinline int any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); }
  __forceinline int none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); }

-  __forceinline size_t movemask(const vboolf8& a) { return _mm512_kmov(a); }
+  __forceinline size_t movemask(const vboolf8& a) { return _mm512_kmov(a.packedMask16()); }
   __forceinline size_t popcnt (const vboolf8& a) { return popcnt(a.v); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Conversion Operations
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline unsigned int toInt(const vboolf8& a) { return mm512_mask2int(a); }
+  __forceinline unsigned int toInt(const vboolf8& a) { return mm512_mask2int(a.packedMask16()); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Get/Set Functions
diff --git a/common/simd/vdouble4_avx.h b/common/simd/vdouble4_avx.h
index 208bb7ac99..99d49624ac 100644
--- a/common/simd/vdouble4_avx.h
+++ b/common/simd/vdouble4_avx.h
@@ -36,7 +36,7 @@ namespace embree
     __forceinline vdouble4& operator =(const vdouble4& f) { v = f.v; return *this; }

     __forceinline vdouble(const __m256d& t) { v = t; }
-    __forceinline operator __m256d() const { return v; }
+    __forceinline __m256d m256d() const { return v; }

     __forceinline vdouble(double i) {
       v = _mm256_set1_pd(i);
@@ -61,7 +61,7 @@ namespace embree
    ////////////////////////////////////////////////////////////////////////////////

    static __forceinline void store_nt(double *__restrict__ ptr, const vdouble4& a) {
-      _mm256_stream_pd(ptr, a);
+      _mm256_stream_pd(ptr, a.m256d());
    }

    static __forceinline vdouble4 loadu(const double* addr) {
@@ -77,11 +77,11 @@ namespace embree
    }

    static __forceinline void store(double* ptr, const vdouble4& v) {
-      _mm256_store_pd(ptr, v);
+      _mm256_store_pd(ptr, v.m256d());
    }

    static __forceinline void storeu(double* ptr, const vdouble4& v) {
-      _mm256_storeu_pd(ptr, v);
+      _mm256_storeu_pd(ptr, v.m256d());
    }

    static __forceinline vdouble4 broadcast(const void* a) { return _mm256_set1_pd(*(double*)a); }
@@ -99,46 +99,46 @@ namespace embree
  ////////////////////////////////////////////////////////////////////////////////

 #if defined(__AVX2__)
-  __forceinline vdouble4 asDouble(const vllong4& a) { return _mm256_castsi256_pd(a); }
-  __forceinline vllong4 asLLong (const vdouble4& a) { return _mm256_castpd_si256(a); }
+  __forceinline vdouble4 asDouble(const vllong4& a) { return _mm256_castsi256_pd(a.m256i()); }
+  __forceinline vllong4 asLLong (const vdouble4& a) { return _mm256_castpd_si256(a.m256d()); }
 #endif

  __forceinline vdouble4 operator +(const vdouble4& a) { return a; }
-  __forceinline vdouble4 operator -(const vdouble4& a) { return _mm256_sub_pd(_mm256_setzero_pd(), a); }
+  __forceinline vdouble4 operator -(const vdouble4& a) { return _mm256_sub_pd(_mm256_setzero_pd(), a.m256d()); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Binary Operators
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vdouble4 operator +(const vdouble4& a, const vdouble4& b) { return _mm256_add_pd(a, b); }
+  __forceinline vdouble4 operator +(const vdouble4& a, const vdouble4& b) { return _mm256_add_pd(a.m256d(), b.m256d()); }
   __forceinline vdouble4 operator +(const vdouble4& a, double          b) { return a + vdouble4(b); }
   __forceinline vdouble4 operator +(double          a, const vdouble4& b) { return vdouble4(a) + b; }

-  __forceinline vdouble4 operator -(const vdouble4& a, const vdouble4& b) { return _mm256_sub_pd(a, b); }
+  __forceinline vdouble4 operator -(const vdouble4& a, const vdouble4& b) { return _mm256_sub_pd(a.m256d(), b.m256d()); }
   __forceinline vdouble4 operator -(const vdouble4& a, double          b) { return a - vdouble4(b); }
   __forceinline vdouble4 operator -(double          a, const vdouble4& b) { return vdouble4(a) - b; }

-  __forceinline vdouble4 operator *(const vdouble4& a, const vdouble4& b) { return _mm256_mul_pd(a, b); }
+  __forceinline vdouble4 operator *(const vdouble4& a, const vdouble4& b) { return _mm256_mul_pd(a.m256d(), b.m256d()); }
   __forceinline vdouble4 operator *(const vdouble4& a, double          b) { return a * vdouble4(b); }
   __forceinline vdouble4 operator *(double          a, const vdouble4& b) { return vdouble4(a) * b; }

-  __forceinline vdouble4 operator &(const vdouble4& a, const vdouble4& b) { return _mm256_and_pd(a, b); }
+  __forceinline vdouble4 operator &(const vdouble4& a, const vdouble4& b) { return _mm256_and_pd(a.m256d(), b.m256d()); }
   __forceinline vdouble4 operator &(const vdouble4& a, double          b) { return a & vdouble4(b); }
   __forceinline vdouble4 operator &(double          a, const vdouble4& b) { return vdouble4(a) & b; }

-  __forceinline vdouble4 operator |(const vdouble4& a, const vdouble4& b) { return _mm256_or_pd(a, b); }
+  __forceinline vdouble4 operator |(const vdouble4& a, const vdouble4& b) { return _mm256_or_pd(a.m256d(), b.m256d()); }
   __forceinline vdouble4 operator |(const vdouble4& a, double          b) { return a | vdouble4(b); }
   __forceinline vdouble4 operator |(double          a, const vdouble4& b) { return vdouble4(a) | b; }

-  __forceinline vdouble4 operator ^(const vdouble4& a, const vdouble4& b) { return _mm256_xor_pd(a, b); }
+  __forceinline vdouble4 operator ^(const vdouble4& a, const vdouble4& b) { return _mm256_xor_pd(a.m256d(), b.m256d()); }
   __forceinline vdouble4 operator ^(const vdouble4& a, double          b) { return a ^ vdouble4(b); }
   __forceinline vdouble4 operator ^(double          a, const vdouble4& b) { return vdouble4(a) ^ b; }

-  __forceinline vdouble4 min(const vdouble4& a, const vdouble4& b) { return _mm256_min_pd(a, b); }
+  __forceinline vdouble4 min(const vdouble4& a, const vdouble4& b) { return _mm256_min_pd(a.m256d(), b.m256d()); }
   __forceinline vdouble4 min(const vdouble4& a, double          b) { return min(a,vdouble4(b)); }
   __forceinline vdouble4 min(double          a, const vdouble4& b) { return min(vdouble4(a),b); }

-  __forceinline vdouble4 max(const vdouble4& a, const vdouble4& b) { return _mm256_max_pd(a, b); }
+  __forceinline vdouble4 max(const vdouble4& a, const vdouble4& b) { return _mm256_max_pd(a.m256d(), b.m256d()); }
   __forceinline vdouble4 max(const vdouble4& a, double          b) { return max(a,vdouble4(b)); }
   __forceinline vdouble4 max(double          a, const vdouble4& b) { return max(vdouble4(a),b); }

@@ -147,10 +147,10 @@ namespace embree
  ////////////////////////////////////////////////////////////////////////////////

 #if defined(__FMA__)
-  __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmadd_pd(a,b,c); }
-  __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmsub_pd(a,b,c); }
-  __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmadd_pd(a,b,c); }
-  __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmsub_pd(a,b,c); }
+  __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmadd_pd(a.m256d(),b.m256d(),c.m256d()); }
+  __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmsub_pd(a.m256d(),b.m256d(),c.m256d()); }
+  __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmadd_pd(a.m256d(),b.m256d(),c.m256d()); }
+  __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmsub_pd(a.m256d(),b.m256d(),c.m256d()); }
 #else
   __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b+c; }
   __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b-c; }
@@ -183,26 +183,26 @@ namespace embree
  ////////////////////////////////////////////////////////////////////////////////

 #if defined(__AVX512VL__)
-  __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_EQ); }
-  __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_NE); }
-  __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LT); }
-  __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); }
-  __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); }
-  __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); }
-#elif !defined(__aarch64__)
-  __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }
-  __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
-  __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }
-  __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); }
-  __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); }
-  __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }
+  __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a.m256d(), b.m256d(), _MM_CMPINT_EQ); }
+  __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a.m256d(), b.m256d(), _MM_CMPINT_NE); }
+  __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a.m256d(), b.m256d(), _MM_CMPINT_LT); }
+  __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a.m256d(), b.m256d(), _MM_CMPINT_GE); }
+  __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a.m256d(), b.m256d(), _MM_CMPINT_GT); }
+  __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a.m256d(), b.m256d(), _MM_CMPINT_LE); }
+#elif !defined(__aarch64__) && !defined(_M_ARM64)
+  __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a.m256d(), b.m256d(), _CMP_EQ_OQ); }
+  __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a.m256d(), b.m256d(), _CMP_NEQ_UQ); }
+  __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a.m256d(), b.m256d(), _CMP_LT_OS); }
+  __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a.m256d(), b.m256d(), _CMP_NLT_US); }
+  __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a.m256d(), b.m256d(), _CMP_NLE_US); }
+  __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a.m256d(), b.m256d(), _CMP_LE_OS); }
 #else
-  __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmpeq_pd(a, b); }
-  __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpneq_pd(a, b); }
-  __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmplt_pd(a, b); }
-  __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpnlt_pd(a, b); }
-  __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmpnle_pd(a, b); }
-  __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmple_pd(a, b); }
+  __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmpeq_pd(a.m256d(), b.m256d()); }
+  __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpneq_pd(a.m256d(), b.m256d()); }
+  __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmplt_pd(a.m256d(), b.m256d()); }
+  __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpnlt_pd(a.m256d(), b.m256d()); }
+  __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmpnle_pd(a.m256d(), b.m256d()); }
+  __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmple_pd(a.m256d(), b.m256d()); }
 #endif

  __forceinline vboold4 operator ==(const vdouble4& a, double b) { return a == vdouble4(b); }
@@ -231,12 +231,12 @@ namespace embree
  __forceinline vboold4 le(const vdouble4& a, const vdouble4& b) { return a <= b; }

 #if defined(__AVX512VL__)
-  __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_EQ); }
-  __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_NE); }
-  __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LT); }
-  __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GE); }
-  __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GT); }
-  __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LE); }
+  __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask.packedMask8(), a.m256d(), b.m256d(), _MM_CMPINT_EQ); }
+  __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask.packedMask8(), a.m256d(), b.m256d(), _MM_CMPINT_NE); }
+  __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask.packedMask8(), a.m256d(), b.m256d(), _MM_CMPINT_LT); }
+  __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask.packedMask8(), a.m256d(), b.m256d(), _MM_CMPINT_GE); }
+  __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask.packedMask8(), a.m256d(), b.m256d(), _MM_CMPINT_GT); }
+  __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask.packedMask8(), a.m256d(), b.m256d(), _MM_CMPINT_LE); }
 #else
   __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a == b); }
   __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a != b); }
@@ -248,9 +248,9 @@ namespace embree

  __forceinline vdouble4 select(const vboold4& m, const vdouble4& t, const vdouble4& f) {
 #if defined(__AVX512VL__)
-    return _mm256_mask_blend_pd(m, f, t);
+    return _mm256_mask_blend_pd(m.packedMask8(), f.m256d(), t.m256d());
 #else
-    return _mm256_blendv_pd(f, t, m);
+    return _mm256_blendv_pd(f.m256d(), t.m256d(), m.m256d());
 #endif
  }

@@ -260,7 +260,7 @@ namespace embree

  template<int i0, int i1>
  __forceinline vdouble4 shuffle(const vdouble4& v) {
-    return _mm256_permute_pd(v, (i1 << 3) | (i0 << 2) | (i1 << 1) | i0);
+    return _mm256_permute_pd(v.m256d(), (i1 << 3) | (i0 << 2) | (i1 << 1) | i0);
  }

  template<int i>
@@ -270,11 +270,11 @@ namespace embree

  template<int i0, int i1>
  __forceinline vdouble4 shuffle2(const vdouble4& v) {
-    return _mm256_permute2f128_pd(v, v, (i1 << 4) | i0);
+    return _mm256_permute2f128_pd(v.m256d(), v.m256d(), (i1 << 4) | i0);
  }

  __forceinline double toScalar(const vdouble4& v) {
-    return _mm_cvtsd_f64(_mm256_castpd256_pd128(v));
+    return _mm_cvtsd_f64(_mm256_castpd256_pd128(v.m256d()));
  }

  ////////////////////////////////////////////////////////////////////////////////
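
A note on the predicates in the vdouble4 comparison block above: `>=` is lowered to _CMP_NLT_US ("not less-than, unordered") and `!=` to _CMP_NEQ_UQ ("not equal, unordered"), so both report true when a NaN is involved, while the ordered forms (_CMP_LT_OS, _CMP_LE_OS, _CMP_EQ_OQ) report false. A scalar model of that behavior, illustrative only:

    #include <cmath>
    #include <cassert>

    // Scalar equivalents of the AVX predicate immediates used above.
    inline bool cmp_nlt_us(double a, double b) { return !(a < b); }  // >=, NaN -> true
    inline bool cmp_neq_uq(double a, double b) { return !(a == b); } // !=, NaN -> true
    inline bool cmp_lt_os (double a, double b) { return a < b; }     // <,  NaN -> false

    int main() {
      double nan = std::nan("");
      assert( cmp_nlt_us(nan, 1.0)); // unordered predicates are true on NaN
      assert( cmp_neq_uq(nan, 1.0));
      assert(!cmp_lt_os (nan, 1.0)); // ordered predicates are false on NaN
      return 0;
    }
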
diff --git a/common/simd/vdouble8_avx512.h b/common/simd/vdouble8_avx512.h
index 98d21bfe4a..5f2e73bd32 100644
--- a/common/simd/vdouble8_avx512.h
+++ b/common/simd/vdouble8_avx512.h
@@ -36,8 +36,8 @@ namespace embree
     __forceinline vdouble8& operator =(const vdouble8& f) { v = f.v; return *this; }

     __forceinline vdouble(const __m512d& t) { v = t; }
-    __forceinline operator __m512d() const { return v; }
-    __forceinline operator __m256d() const { return _mm512_castpd512_pd256(v); }
+    __forceinline __m512d m512d() const { return v; }
+    __forceinline __m256d m256d() const { return _mm512_castpd512_pd256(v); }

     __forceinline vdouble(double i) {
       v = _mm512_set1_pd(i);
@@ -68,7 +68,7 @@ namespace embree
    ////////////////////////////////////////////////////////////////////////////////

    static __forceinline void store_nt(void *__restrict__ ptr, const vdouble8& a) {
-      _mm512_stream_pd((double*)ptr, a);
+      _mm512_stream_pd((double*)ptr, a.m512d());
    }

    static __forceinline vdouble8 loadu(const void* addr) {
@@ -84,27 +84,27 @@ namespace embree
    }

    static __forceinline void store(void* ptr, const vdouble8& v) {
-      _mm512_store_pd(ptr, v);
+      _mm512_store_pd(ptr, v.m512d());
    }

    static __forceinline void storeu(void* ptr, const vdouble8& v) {
-      _mm512_storeu_pd(ptr, v);
+      _mm512_storeu_pd(ptr, v.m512d());
    }

    static __forceinline void storeu(const vboold8& mask, double* ptr, const vdouble8& f) {
-      _mm512_mask_storeu_pd(ptr, mask, f);
+      _mm512_mask_storeu_pd(ptr, mask.packedMask8(), f.m512d());
    }

    static __forceinline void store(const vboold8& mask, void* addr, const vdouble8& v2) {
-      _mm512_mask_store_pd(addr, mask, v2);
+      _mm512_mask_store_pd(addr, mask.packedMask8(), v2.m512d());
    }

    static __forceinline vdouble8 compact(const vboold8& mask, vdouble8& v) {
-      return _mm512_mask_compress_pd(v, mask, v);
+      return _mm512_mask_compress_pd(v.m512d(), mask.packedMask8(), v.m512d());
    }

    static __forceinline vdouble8 compact(const vboold8& mask, const vdouble8& a, vdouble8& b) {
-      return _mm512_mask_compress_pd(a, mask, b);
+      return _mm512_mask_compress_pd(a.m512d(), mask.packedMask8(), b.m512d());
    }

    static __forceinline vdouble8 broadcast(const void* a) { return _mm512_set1_pd(*(double*)a); }
@@ -122,72 +122,72 @@ namespace embree
  /// Unary Operators
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vdouble8 asDouble(const vllong8& a) { return _mm512_castsi512_pd(a); }
-  __forceinline vllong8 asLLong (const vdouble8& a) { return _mm512_castpd_si512(a); }
+  __forceinline vdouble8 asDouble(const vllong8& a) { return _mm512_castsi512_pd(a.m512i()); }
+  __forceinline vllong8 asLLong (const vdouble8& a) { return _mm512_castpd_si512(a.m512d()); }

  __forceinline vdouble8 operator +(const vdouble8& a) { return a; }
-  __forceinline vdouble8 operator -(const vdouble8& a) { return _mm512_sub_pd(_mm512_setzero_pd(), a); }
+  __forceinline vdouble8 operator -(const vdouble8& a) { return _mm512_sub_pd(_mm512_setzero_pd(), a.m512d()); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Binary Operators
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vdouble8 operator +(const vdouble8& a, const vdouble8& b) { return _mm512_add_pd(a, b); }
+  __forceinline vdouble8 operator +(const vdouble8& a, const vdouble8& b) { return _mm512_add_pd(a.m512d(), b.m512d()); }
   __forceinline vdouble8 operator +(const vdouble8& a, double          b) { return a + vdouble8(b); }
   __forceinline vdouble8 operator +(double          a, const vdouble8& b) { return vdouble8(a) + b; }

-  __forceinline vdouble8 operator -(const vdouble8& a, const vdouble8& b) { return _mm512_sub_pd(a, b); }
+  __forceinline vdouble8 operator -(const vdouble8& a, const vdouble8& b) { return _mm512_sub_pd(a.m512d(), b.m512d()); }
   __forceinline vdouble8 operator -(const vdouble8& a, double          b) { return a - vdouble8(b); }
   __forceinline vdouble8 operator -(double          a, const vdouble8& b) { return vdouble8(a) - b; }

-  __forceinline vdouble8 operator *(const vdouble8& a, const vdouble8& b) { return _mm512_mul_pd(a, b); }
+  __forceinline vdouble8 operator *(const vdouble8& a, const vdouble8& b) { return _mm512_mul_pd(a.m512d(), b.m512d()); }
   __forceinline vdouble8 operator *(const vdouble8& a, double          b) { return a * vdouble8(b); }
   __forceinline vdouble8 operator *(double          a, const vdouble8& b) { return vdouble8(a) * b; }

-  __forceinline vdouble8 operator &(const vdouble8& a, const vdouble8& b) { return _mm512_and_pd(a, b); }
+  __forceinline vdouble8 operator &(const vdouble8& a, const vdouble8& b) { return _mm512_and_pd(a.m512d(), b.m512d()); }
   __forceinline vdouble8 operator &(const vdouble8& a, double          b) { return a & vdouble8(b); }
   __forceinline vdouble8 operator &(double          a, const vdouble8& b) { return vdouble8(a) & b; }

-  __forceinline vdouble8 operator |(const vdouble8& a, const vdouble8& b) { return _mm512_or_pd(a, b); }
+  __forceinline vdouble8 operator |(const vdouble8& a, const vdouble8& b) { return _mm512_or_pd(a.m512d(), b.m512d()); }
   __forceinline vdouble8 operator |(const vdouble8& a, double          b) { return a | vdouble8(b); }
   __forceinline vdouble8 operator |(double          a, const vdouble8& b) { return vdouble8(a) | b; }

-  __forceinline vdouble8 operator ^(const vdouble8& a, const vdouble8& b) { return _mm512_xor_pd(a, b); }
+  __forceinline vdouble8 operator ^(const vdouble8& a, const vdouble8& b) { return _mm512_xor_pd(a.m512d(), b.m512d()); }
   __forceinline vdouble8 operator ^(const vdouble8& a, double          b) { return a ^ vdouble8(b); }
   __forceinline vdouble8 operator ^(double          a, const vdouble8& b) { return vdouble8(a) ^ b; }

-  __forceinline vdouble8 operator <<(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), n)); }
-  __forceinline vdouble8 operator >>(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), n)); }
+  __forceinline vdouble8 operator <<(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a.m512d()), n)); }
+  __forceinline vdouble8 operator >>(const vdouble8& a, const unsigned int n) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a.m512d()), n)); }

-  __forceinline vdouble8 operator <<(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_sllv_epi64(_mm512_castpd_si512(a), n)); }
-  __forceinline vdouble8 operator >>(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_srav_epi64(_mm512_castpd_si512(a), n)); }
+  __forceinline vdouble8 operator <<(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_sllv_epi64(_mm512_castpd_si512(a.m512d()), n.m512i())); }
+  __forceinline vdouble8 operator >>(const vdouble8& a, const vllong8& n) { return _mm512_castsi512_pd(_mm512_srav_epi64(_mm512_castpd_si512(a.m512d()), n.m512i())); }

-  __forceinline vdouble8 sll (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a), b)); }
-  __forceinline vdouble8 sra (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), b)); }
-  __forceinline vdouble8 srl (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srli_epi64(_mm512_castpd_si512(a), b)); }
+  __forceinline vdouble8 sll (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a.m512d()), b)); }
+  __forceinline vdouble8 sra (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a.m512d()), b)); }
+  __forceinline vdouble8 srl (const vdouble8& a, const unsigned int b) { return _mm512_castsi512_pd(_mm512_srli_epi64(_mm512_castpd_si512(a.m512d()), b)); }

-  __forceinline vdouble8 min(const vdouble8& a, const vdouble8& b) { return _mm512_min_pd(a, b); }
+  __forceinline vdouble8 min(const vdouble8& a, const vdouble8& b) { return _mm512_min_pd(a.m512d(), b.m512d()); }
   __forceinline vdouble8 min(const vdouble8& a, double          b) { return min(a,vdouble8(b)); }
   __forceinline vdouble8 min(double          a, const vdouble8& b) { return min(vdouble8(a),b); }

-  __forceinline vdouble8 max(const vdouble8& a, const vdouble8& b) { return _mm512_max_pd(a, b); }
+  __forceinline vdouble8 max(const vdouble8& a, const vdouble8& b) { return _mm512_max_pd(a.m512d(), b.m512d()); }
   __forceinline vdouble8 max(const vdouble8& a, double          b) { return max(a,vdouble8(b)); }
   __forceinline vdouble8 max(double          a, const vdouble8& b) { return max(vdouble8(a),b); }

-  __forceinline vdouble8 mask_add(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_add_pd(c,mask,a,b); }
-  __forceinline vdouble8 mask_sub(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_sub_pd(c,mask,a,b); }
+  __forceinline vdouble8 mask_add(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_add_pd(c.m512d(),mask.packedMask8(),a.m512d(),b.m512d()); }
+  __forceinline vdouble8 mask_sub(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_sub_pd(c.m512d(),mask.packedMask8(),a.m512d(),b.m512d()); }

-  __forceinline vdouble8 mask_and(const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_and_pd(c,m,a,b); }
-  __forceinline vdouble8 mask_or (const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_or_pd(c,m,a,b); }
+  __forceinline vdouble8 mask_and(const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_and_pd(c.m512d(),m.packedMask8(),a.m512d(),b.m512d()); }
+  __forceinline vdouble8 mask_or (const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return _mm512_mask_or_pd(c.m512d(),m.packedMask8(),a.m512d(),b.m512d()); }

  ////////////////////////////////////////////////////////////////////////////////
  /// Ternary Operators
  ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vdouble8 madd (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmadd_pd(a,b,c); }
-  __forceinline vdouble8 msub (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmsub_pd(a,b,c); }
-  __forceinline vdouble8 nmadd(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmadd_pd(a,b,c); }
-  __forceinline vdouble8 nmsub(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmsub_pd(a,b,c); }
+  __forceinline vdouble8 madd (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmadd_pd(a.m512d(),b.m512d(),c.m512d()); }
+  __forceinline vdouble8 msub (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fmsub_pd(a.m512d(),b.m512d(),c.m512d()); }
+  __forceinline vdouble8 nmadd(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmadd_pd(a.m512d(),b.m512d(),c.m512d()); }
+  __forceinline vdouble8 nmsub(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return _mm512_fnmsub_pd(a.m512d(),b.m512d(),c.m512d()); }

  ////////////////////////////////////////////////////////////////////////////////
@@ -217,46 +217,46 @@ namespace embree
   /// Comparison Operators + Select
   ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboold8 operator ==(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold8 operator ==(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a.m512d(),b.m512d(),_MM_CMPINT_EQ); }
   __forceinline vboold8 operator ==(const vdouble8& a, double b) { return a == vdouble8(b); }
   __forceinline vboold8 operator ==(double a, const vdouble8& b) { return vdouble8(a) == b; }

-  __forceinline vboold8 operator !=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboold8 operator !=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a.m512d(),b.m512d(),_MM_CMPINT_NE); }
   __forceinline vboold8 operator !=(const vdouble8& a, double b) { return a != vdouble8(b); }
   __forceinline vboold8 operator !=(double a, const vdouble8& b) { return vdouble8(a) != b; }

-  __forceinline vboold8 operator < (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboold8 operator < (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a.m512d(),b.m512d(),_MM_CMPINT_LT); }
   __forceinline vboold8 operator < (const vdouble8& a, double b) { return a < vdouble8(b); }
   __forceinline vboold8 operator < (double a, const vdouble8& b) { return vdouble8(a) < b; }

-  __forceinline vboold8 operator >=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboold8 operator >=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a.m512d(),b.m512d(),_MM_CMPINT_GE); }
   __forceinline vboold8 operator >=(const vdouble8& a, double b) { return a >= vdouble8(b); }
   __forceinline vboold8 operator >=(double a, const vdouble8& b) { return vdouble8(a) >= b; }

-  __forceinline vboold8 operator > (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboold8 operator > (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a.m512d(),b.m512d(),_MM_CMPINT_GT); }
   __forceinline vboold8 operator > (const vdouble8& a, double b) { return a > vdouble8(b); }
   __forceinline vboold8 operator > (double a, const vdouble8& b) { return vdouble8(a) > b; }

-  __forceinline vboold8 operator <=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboold8 operator <=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a.m512d(),b.m512d(),_MM_CMPINT_LE); }
   __forceinline vboold8 operator <=(const vdouble8& a, double b) { return a <= vdouble8(b); }
   __forceinline vboold8 operator <=(double a, const vdouble8& b) { return vdouble8(a) <= b; }

-  __forceinline vboold8 eq(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_EQ); }
-  __forceinline vboold8 ne(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_NE); }
-  __forceinline vboold8 lt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LT); }
-  __forceinline vboold8 ge(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GE); }
-  __forceinline vboold8 gt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_GT); }
-  __forceinline vboold8 le(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboold8 eq(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a.m512d(),b.m512d(),_MM_CMPINT_EQ); }
+  __forceinline vboold8 ne(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a.m512d(),b.m512d(),_MM_CMPINT_NE); }
+  __forceinline vboold8 lt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a.m512d(),b.m512d(),_MM_CMPINT_LT); }
+  __forceinline vboold8 ge(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a.m512d(),b.m512d(),_MM_CMPINT_GE); }
+  __forceinline vboold8 gt(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a.m512d(),b.m512d(),_MM_CMPINT_GT); }
+  __forceinline vboold8 le(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a.m512d(),b.m512d(),_MM_CMPINT_LE); }

-  __forceinline vboold8 eq(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_EQ); }
-  __forceinline vboold8 ne(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_NE); }
-  __forceinline vboold8 lt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LT); }
-  __forceinline vboold8 ge(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GE); }
-  __forceinline vboold8 gt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GT); }
-  __forceinline vboold8 le(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LE); }
+  __forceinline vboold8 eq(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask.packedMask8(),a.m512d(),b.m512d(),_MM_CMPINT_EQ); }
+  __forceinline vboold8 ne(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask.packedMask8(),a.m512d(),b.m512d(),_MM_CMPINT_NE); }
+  __forceinline vboold8 lt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask.packedMask8(),a.m512d(),b.m512d(),_MM_CMPINT_LT); }
+  __forceinline vboold8 ge(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask.packedMask8(),a.m512d(),b.m512d(),_MM_CMPINT_GE); }
+  __forceinline vboold8 gt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask.packedMask8(),a.m512d(),b.m512d(),_MM_CMPINT_GT); }
+  __forceinline vboold8 le(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask.packedMask8(),a.m512d(),b.m512d(),_MM_CMPINT_LE); }

   __forceinline vdouble8 select(const vboold8& m, const vdouble8& t, const vdouble8& f) {
-    return _mm512_mask_or_pd(f,m,t,t);
+    return _mm512_mask_or_pd(f.m512d(),m.packedMask8(),t.m512d(),t.m512d());
   }

   ////////////////////////////////////////////////////////////////////////////////
@@ -265,7 +265,7 @@ namespace embree

   template<int i0, int i1>
   __forceinline vdouble8 shuffle(const vdouble8& v) {
-    return _mm512_permute_pd(v, (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0);
+    return _mm512_permute_pd(v.m512d(), (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0);
   }

   template<int i>
@@ -275,12 +275,12 @@ namespace embree

   template<int i0, int i1, int i2, int i3>
   __forceinline vdouble8 shuffle(const vdouble8& v) {
-    return _mm512_permutex_pd(v, _MM_SHUFFLE(i3, i2, i1, i0));
+    return _mm512_permutex_pd(v.m512d(), _MM_SHUFFLE(i3, i2, i1, i0));
   }

   template<int i0, int i1>
   __forceinline vdouble8 shuffle4(const vdouble8& v) {
-    return _mm512_shuffle_f64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2));
+    return _mm512_shuffle_f64x2(v.m512d(), v.m512d(), _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2));
   }

   template<int i>
@@ -290,11 +290,11 @@ namespace embree

   template<int i>
   __forceinline vdouble8 align_shift_right(const vdouble8& a, const vdouble8& b) {
-    return _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(a), _mm512_castpd_si512(b), i));
+    return _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(a.m512d()), _mm512_castpd_si512(b.m512d()), i));
   }

   __forceinline double toScalar(const vdouble8& v) {
-    return _mm_cvtsd_f64(_mm512_castpd512_pd128(v));
+    return _mm_cvtsd_f64(_mm512_castpd512_pd128(v.m512d()));
   }

   ////////////////////////////////////////////////////////////////////////////////
@@ -322,7 +322,7 @@ namespace embree
   ////////////////////////////////////////////////////////////////////////////////

   __forceinline vdouble8 permute(const vdouble8& v, const vllong8& index) {
-    return _mm512_permutexvar_pd(index, v);
+    return _mm512_permutexvar_pd(index.m512i(), v.m512d());
   }

   __forceinline vdouble8 reverse(const vdouble8& a) {
diff --git a/common/simd/vfloat16_avx512.h b/common/simd/vfloat16_avx512.h
index b6160a438c..e98b45fee1 100644
--- a/common/simd/vfloat16_avx512.h
+++ b/common/simd/vfloat16_avx512.h
@@ -35,13 +35,16 @@ namespace embree
     ////////////////////////////////////////////////////////////////////////////////

     __forceinline vfloat() {}
-    __forceinline vfloat(const vfloat16& t) { v = t; }
+    __forceinline vfloat(const vfloat16& t) { v = t.v; }
     __forceinline vfloat16& operator =(const vfloat16& f) { v = f.v; return *this; }

     __forceinline vfloat(const __m512& t) { v = t; }
-    __forceinline operator __m512() const { return v; }
-    __forceinline operator __m256() const { return _mm512_castps512_ps256(v); }
-    __forceinline operator __m128() const { return _mm512_castps512_ps128(v); }
+    __forceinline __m512 m512() const { return v; }
+    __forceinline __m256 m256() const { return _mm512_castps512_ps256(v); }
+    __forceinline __m128 m128() const { return _mm512_castps512_ps128(v); }
+
+    __forceinline __m512i m512i() const { return _mm512_cvtps_epi32(v); }
+    __forceinline __m512i vec_int() const { return m512i(); }

     __forceinline vfloat(float f) {
       v = _mm512_set1_ps(f);
@@ -52,31 +55,31 @@ namespace embree
     }

     __forceinline vfloat(const vfloat4& i) {
-      v = _mm512_broadcast_f32x4(i);
+      v = _mm512_broadcast_f32x4(i.m128());
     }

     __forceinline vfloat(const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d) {
-      v = _mm512_castps128_ps512(a);
-      v = _mm512_insertf32x4(v, b, 1);
-      v = _mm512_insertf32x4(v, c, 2);
-      v = _mm512_insertf32x4(v, d, 3);
+      v = _mm512_castps128_ps512(a.m128());
+      v = _mm512_insertf32x4(v, b.m128(), 1);
+      v = _mm512_insertf32x4(v, c.m128(), 2);
+      v = _mm512_insertf32x4(v, d.m128(), 3);
     }

     __forceinline vfloat(const vboolf16& mask, const vfloat4& a, const vfloat4& b) {
-      v = _mm512_broadcast_f32x4(a);
-      v = _mm512_mask_broadcast_f32x4(v,mask,b);
+      v = _mm512_broadcast_f32x4(a.m128());
+      v = _mm512_mask_broadcast_f32x4(v,mask.packedMask16(),b.m128());
     }

     __forceinline vfloat(const vfloat8& i) {
-      v = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(i)));
+      v = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(i.m256())));
     }

     __forceinline vfloat(const vfloat8& a, const vfloat8& b) {
-      v = _mm512_castps256_ps512(a);
+      v = _mm512_castps256_ps512(a.m256());
 #if defined(__AVX512DQ__)
-      v = _mm512_insertf32x8(v, b, 1);
+      v = _mm512_insertf32x8(v, b.m256(), 1);
 #else
-      v = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(v), _mm256_castps_pd(b), 1));
+      v = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(v), _mm256_castps_pd(b.m256()), 1));
 #endif
     }

@@ -88,11 +91,11 @@ namespace embree
     }*/

     __forceinline explicit vfloat(const vint16& a) {
-      v = _mm512_cvtepi32_ps(a);
+      v = _mm512_cvtepi32_ps(a.m512i());
     }

     __forceinline explicit vfloat(const vuint16& a) {
-      v = _mm512_cvtepu32_ps(a);
+      v = _mm512_cvtepu32_ps(a.m512i());
     }

   ////////////////////////////////////////////////////////////////////////////////
@@ -114,17 +117,17 @@ namespace embree
     static __forceinline vfloat16 load (const void* ptr) { return _mm512_load_ps((float*)ptr); }
     static __forceinline vfloat16 loadu(const void* ptr) { return _mm512_loadu_ps((float*)ptr); }

-    static __forceinline vfloat16 load (const vboolf16& mask, const void* ptr) { return _mm512_mask_load_ps (_mm512_setzero_ps(),mask,(float*)ptr); }
-    static __forceinline vfloat16 loadu(const vboolf16& mask, const void* ptr) { return _mm512_mask_loadu_ps(_mm512_setzero_ps(),mask,(float*)ptr); }
+    static __forceinline vfloat16 load (const vboolf16& mask, const void* ptr) { return _mm512_mask_load_ps (_mm512_setzero_ps(),mask.packedMask16(),(float*)ptr); }
+    static __forceinline vfloat16 loadu(const vboolf16& mask, const void* ptr) { return _mm512_mask_loadu_ps(_mm512_setzero_ps(),mask.packedMask16(),(float*)ptr); }

-    static __forceinline void store (void* ptr, const vfloat16& v) { _mm512_store_ps ((float*)ptr,v); }
-    static __forceinline void storeu(void* ptr, const vfloat16& v) { _mm512_storeu_ps((float*)ptr,v); }
+    static __forceinline void store (void* ptr, const vfloat16& v) { _mm512_store_ps ((float*)ptr,v.m512()); }
+    static __forceinline void storeu(void* ptr, const vfloat16& v) { _mm512_storeu_ps((float*)ptr,v.m512()); }

-    static __forceinline void store (const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_store_ps ((float*)ptr,mask,v); }
-    static __forceinline void storeu(const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_storeu_ps((float*)ptr,mask,v); }
+    static __forceinline void store (const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_store_ps ((float*)ptr,mask.packedMask16(),v.m512()); }
+    static __forceinline void storeu(const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_storeu_ps((float*)ptr,mask.packedMask16(),v.m512()); }

     static __forceinline void store_nt(void* __restrict__ ptr, const vfloat16& a) {
-      _mm512_stream_ps((float*)ptr,a);
+      _mm512_stream_ps((float*)ptr,a.m512());
     }

     static __forceinline vfloat16 broadcast(const float* f) {
@@ -133,23 +136,23 @@ namespace embree

     template<int scale = 4>
     static __forceinline vfloat16 gather(const float* ptr, const vint16& index) {
-      return _mm512_i32gather_ps(index, ptr, scale);
+      return _mm512_i32gather_ps(index.m512i(), ptr, scale);
     }

     template<int scale = 4>
     static __forceinline vfloat16 gather(const vboolf16& mask, const float* ptr, const vint16& index) {
       vfloat16 r = zero;
-      return _mm512_mask_i32gather_ps(r, mask, index, ptr, scale);
+      return _mm512_mask_i32gather_ps(r.m512(), mask.packedMask16(), index.m512i(), ptr, scale);
     }

     template<int scale = 4>
     static __forceinline void scatter(float* ptr, const vint16& index, const vfloat16& v) {
-      _mm512_i32scatter_ps(ptr, index, v, scale);
+      _mm512_i32scatter_ps(ptr, index.m512i(), v.m512(), scale);
     }

     template<int scale = 4>
     static __forceinline void scatter(const vboolf16& mask, float* ptr, const vint16& index, const vfloat16& v) {
-      _mm512_mask_i32scatter_ps(ptr, mask, index, v, scale);
+      _mm512_mask_i32scatter_ps(ptr, mask.packedMask16(), index.m512i(), v.m512(), scale);
     }

   ////////////////////////////////////////////////////////////////////////////////
@@ -164,91 +167,91 @@ namespace embree
   /// Unary Operators
   ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vfloat16 asFloat(const vint16& a) { return _mm512_castsi512_ps(a); }
-  __forceinline vint16 asInt (const vfloat16& a) { return _mm512_castps_si512(a); }
-  __forceinline vuint16 asUInt (const vfloat16& a) { return _mm512_castps_si512(a); }
+  __forceinline vfloat16 asFloat(const vint16& a) { return _mm512_castsi512_ps(a.m512i()); }
+  __forceinline vint16 asInt (const vfloat16& a) { return _mm512_castps_si512(a.m512()); }
+  __forceinline vuint16 asUInt (const vfloat16& a) { return _mm512_castps_si512(a.m512()); }

-  __forceinline vint16 toInt (const vfloat16& a) { return vint16(a); }
-  __forceinline vfloat16 toFloat(const vint16& a) { return vfloat16(a); }
+  __forceinline vint16 toInt (const vfloat16& a) { return vint16(a.m512i()); }
+  __forceinline vfloat16 toFloat(const vint16& a) { return vfloat16(a.m512()); }

   __forceinline vfloat16 operator +(const vfloat16& a) { return a; }
-  __forceinline vfloat16 operator -(const vfloat16& a) { return _mm512_mul_ps(a,vfloat16(-1)); }
+  __forceinline vfloat16 operator -(const vfloat16& a) { return _mm512_mul_ps(a.m512(),vfloat16(-1).m512()); }

-  __forceinline vfloat16 abs (const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x7FFFFFFF))); }
-  __forceinline vfloat16 signmsk(const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x80000000))); }
+  __forceinline vfloat16 abs (const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a.m512()),_mm512_set1_epi32(0x7FFFFFFF))); }
+  __forceinline vfloat16 signmsk(const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a.m512()),_mm512_set1_epi32(0x80000000))); }

   __forceinline vfloat16 rcp(const vfloat16& a) {
-    const vfloat16 r = _mm512_rcp14_ps(a);
-    return _mm512_fmadd_ps(r, _mm512_fnmadd_ps(a, r, vfloat16(1.0)), r); // computes r + r * (1 - a*r)
+    const vfloat16 r = _mm512_rcp14_ps(a.m512());
+    return _mm512_fmadd_ps(r.m512(), _mm512_fnmadd_ps(a.m512(), r.m512(), vfloat16(1.0).m512()), r.m512()); // computes r + r * (1 - a*r)
   }

-  __forceinline vfloat16 sqr (const vfloat16& a) { return _mm512_mul_ps(a,a); }
-  __forceinline vfloat16 sqrt(const vfloat16& a) { return _mm512_sqrt_ps(a); }
+  __forceinline vfloat16 sqr (const vfloat16& a) { return _mm512_mul_ps(a.m512(),a.m512()); }
+  __forceinline vfloat16 sqrt(const vfloat16& a) { return _mm512_sqrt_ps(a.m512()); }

   __forceinline vfloat16 rsqrt(const vfloat16& a) {
-    const vfloat16 r = _mm512_rsqrt14_ps(a);
-    return _mm512_fmadd_ps(_mm512_set1_ps(1.5f), r,
-                           _mm512_mul_ps(_mm512_mul_ps(_mm512_mul_ps(a, _mm512_set1_ps(-0.5f)), r), _mm512_mul_ps(r, r)));
+    const vfloat16 r = _mm512_rsqrt14_ps(a.m512());
+    return _mm512_fmadd_ps(_mm512_set1_ps(1.5f), r.m512(),
+                           _mm512_mul_ps(_mm512_mul_ps(_mm512_mul_ps(a.m512(), _mm512_set1_ps(-0.5f)), r.m512()), _mm512_mul_ps(r.m512(), r.m512())));
   }

   ////////////////////////////////////////////////////////////////////////////////
   /// Binary Operators
   ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vfloat16 operator +(const vfloat16& a, const vfloat16& b) { return _mm512_add_ps(a, b); }
+  __forceinline vfloat16 operator +(const vfloat16& a, const vfloat16& b) { return _mm512_add_ps(a.m512(), b.m512()); }
   __forceinline vfloat16 operator +(const vfloat16& a, float b) { return a + vfloat16(b); }
   __forceinline vfloat16 operator +(float a, const vfloat16& b) { return vfloat16(a) + b; }

-  __forceinline vfloat16 operator -(const vfloat16& a, const vfloat16& b) { return _mm512_sub_ps(a, b); }
+  __forceinline vfloat16 operator -(const vfloat16& a, const vfloat16& b) { return _mm512_sub_ps(a.m512(), b.m512()); }
   __forceinline vfloat16 operator -(const vfloat16& a, float b) { return a - vfloat16(b); }
   __forceinline vfloat16 operator -(float a, const vfloat16& b) { return vfloat16(a) - b; }

-  __forceinline vfloat16 operator *(const vfloat16& a, const vfloat16& b) { return _mm512_mul_ps(a, b); }
+  __forceinline vfloat16 operator *(const vfloat16& a, const vfloat16& b) { return _mm512_mul_ps(a.m512(), b.m512()); }
   __forceinline vfloat16 operator *(const vfloat16& a, float b) { return a * vfloat16(b); }
   __forceinline vfloat16 operator *(float a, const vfloat16& b) { return vfloat16(a) * b; }

-  __forceinline vfloat16 operator /(const vfloat16& a, const vfloat16& b) { return _mm512_div_ps(a,b); }
+  __forceinline vfloat16 operator /(const vfloat16& a, const vfloat16& b) { return _mm512_div_ps(a.m512(),b.m512()); }
   __forceinline vfloat16 operator /(const vfloat16& a, float b) { return a/vfloat16(b); }
   __forceinline vfloat16 operator /(float a, const vfloat16& b) { return vfloat16(a)/b; }

-  __forceinline vfloat16 operator &(const vfloat16& a, const vfloat16& b) { return _mm512_and_ps(a,b); }
-  __forceinline vfloat16 operator |(const vfloat16& a, const vfloat16& b) { return _mm512_or_ps(a,b); }
+  __forceinline vfloat16 operator &(const vfloat16& a, const vfloat16& b) { return _mm512_and_ps(a.m512(),b.m512()); }
+  __forceinline vfloat16 operator |(const vfloat16& a, const vfloat16& b) { return _mm512_or_ps(a.m512(),b.m512()); }

   __forceinline vfloat16 operator ^(const vfloat16& a, const vfloat16& b) {
-    return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b)));
+    return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a.m512()),_mm512_castps_si512(b.m512())));
   }

-  __forceinline vfloat16 min(const vfloat16& a, const vfloat16& b) { return _mm512_min_ps(a,b); }
-  __forceinline vfloat16 min(const vfloat16& a, float b) { return _mm512_min_ps(a,vfloat16(b)); }
-  __forceinline vfloat16 min(const float& a, const vfloat16& b) { return _mm512_min_ps(vfloat16(a),b); }
+  __forceinline vfloat16 min(const vfloat16& a, const vfloat16& b) { return _mm512_min_ps(a.m512(),b.m512()); }
+  __forceinline vfloat16 min(const vfloat16& a, float b) { return _mm512_min_ps(a.m512(),vfloat16(b).m512()); }
+  __forceinline vfloat16 min(const float& a, const vfloat16& b) { return _mm512_min_ps(vfloat16(a).m512(),b.m512()); }

-  __forceinline vfloat16 max(const vfloat16& a, const vfloat16& b) { return _mm512_max_ps(a,b); }
-  __forceinline vfloat16 max(const vfloat16& a, float b) { return _mm512_max_ps(a,vfloat16(b)); }
-  __forceinline vfloat16 max(const float& a, const vfloat16& b) { return _mm512_max_ps(vfloat16(a),b); }
+  __forceinline vfloat16 max(const vfloat16& a, const vfloat16& b) { return _mm512_max_ps(a.m512(),b.m512()); }
+  __forceinline vfloat16 max(const vfloat16& a, float b) { return _mm512_max_ps(a.m512(),vfloat16(b).m512()); }
+  __forceinline vfloat16 max(const float& a, const vfloat16& b) { return _mm512_max_ps(vfloat16(a).m512(),b.m512()); }

   __forceinline vfloat16 mini(const vfloat16& a, const vfloat16& b) {
-    const vint16 ai = _mm512_castps_si512(a);
-    const vint16 bi = _mm512_castps_si512(b);
-    const vint16 ci = _mm512_min_epi32(ai,bi);
-    return _mm512_castsi512_ps(ci);
+    const vint16 ai = _mm512_castps_si512(a.m512());
+    const vint16 bi = _mm512_castps_si512(b.m512());
+    const vint16 ci = _mm512_min_epi32(ai.m512i(),bi.m512i());
+    return _mm512_castsi512_ps(ci.m512i());
   }

   __forceinline vfloat16 maxi(const vfloat16& a, const vfloat16& b) {
-    const vint16 ai = _mm512_castps_si512(a);
-    const vint16 bi = _mm512_castps_si512(b);
-    const vint16 ci = _mm512_max_epi32(ai,bi);
-    return _mm512_castsi512_ps(ci);
+    const vint16 ai = _mm512_castps_si512(a.m512());
+    const vint16 bi = _mm512_castps_si512(b.m512());
+    const vint16 ci = _mm512_max_epi32(ai.m512i(),bi.m512i());
+    return _mm512_castsi512_ps(ci.m512i());
   }

   ////////////////////////////////////////////////////////////////////////////////
   /// Ternary Operators
   ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(a,b,c); }
-  __forceinline vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); }
-  __forceinline vfloat16 nmadd(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(a,b,c); }
-  __forceinline vfloat16 nmsub(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmsub_ps(a,b,c); }
+  __forceinline vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(a.m512(),b.m512(),c.m512()); }
+  __forceinline vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a.m512(),b.m512(),c.m512()); }
+  __forceinline vfloat16 nmadd(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(a.m512(),b.m512(),c.m512()); }
+  __forceinline vfloat16 nmsub(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmsub_ps(a.m512(),b.m512(),c.m512()); }

   ////////////////////////////////////////////////////////////////////////////////
   /// Assignment Operators
@@ -270,46 +273,46 @@ namespace embree
   /// Comparison Operators + Select
   ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vboolf16 operator ==(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf16 operator ==(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a.m512(),b.m512(),_MM_CMPINT_EQ); }
   __forceinline vboolf16 operator ==(const vfloat16& a, float b) { return a == vfloat16(b); }
   __forceinline vboolf16 operator ==(float a, const vfloat16& b) { return vfloat16(a) == b; }

-  __forceinline vboolf16 operator !=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 operator !=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a.m512(),b.m512(),_MM_CMPINT_NE); }
   __forceinline vboolf16 operator !=(const vfloat16& a, float b) { return a != vfloat16(b); }
   __forceinline vboolf16 operator !=(float a, const vfloat16& b) { return vfloat16(a) != b; }

-  __forceinline vboolf16 operator < (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 operator < (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a.m512(),b.m512(),_MM_CMPINT_LT); }
   __forceinline vboolf16 operator < (const vfloat16& a, float b) { return a < vfloat16(b); }
   __forceinline vboolf16 operator < (float a, const vfloat16& b) { return vfloat16(a) < b; }

-  __forceinline vboolf16 operator >=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 operator >=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a.m512(),b.m512(),_MM_CMPINT_GE); }
   __forceinline vboolf16 operator >=(const vfloat16& a, float b) { return a >= vfloat16(b); }
   __forceinline vboolf16 operator >=(float a, const vfloat16& b) { return vfloat16(a) >= b; }

-  __forceinline vboolf16 operator > (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 operator > (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a.m512(),b.m512(),_MM_CMPINT_GT); }
   __forceinline vboolf16 operator > (const vfloat16& a, float b) { return a > vfloat16(b); }
   __forceinline vboolf16 operator > (float a, const vfloat16& b) { return vfloat16(a) > b; }

-  __forceinline vboolf16 operator <=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 operator <=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a.m512(),b.m512(),_MM_CMPINT_LE); }
   __forceinline vboolf16 operator <=(const vfloat16& a, float b) { return a <= vfloat16(b); }
   __forceinline vboolf16 operator <=(float a, const vfloat16& b) { return vfloat16(a) <= b; }

-  __forceinline vboolf16 eq(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); }
-  __forceinline vboolf16 ne(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); }
-  __forceinline vboolf16 lt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); }
-  __forceinline vboolf16 ge(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); }
-  __forceinline vboolf16 gt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); }
-  __forceinline vboolf16 le(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); }
-
-  __forceinline vboolf16 eq(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_EQ); }
-  __forceinline vboolf16 ne(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_NE); }
-  __forceinline vboolf16 lt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LT); }
-  __forceinline vboolf16 ge(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GE); }
-  __forceinline vboolf16 gt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GT); }
-  __forceinline vboolf16 le(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 eq(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a.m512(),b.m512(),_MM_CMPINT_EQ); }
+  __forceinline vboolf16 ne(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a.m512(),b.m512(),_MM_CMPINT_NE); }
+  __forceinline vboolf16 lt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a.m512(),b.m512(),_MM_CMPINT_LT); }
+  __forceinline vboolf16 ge(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a.m512(),b.m512(),_MM_CMPINT_GE); }
+  __forceinline vboolf16 gt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a.m512(),b.m512(),_MM_CMPINT_GT); }
+  __forceinline vboolf16 le(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a.m512(),b.m512(),_MM_CMPINT_LE); }
+
+  __forceinline vboolf16 eq(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask.packedMask16(),a.m512(),b.m512(),_MM_CMPINT_EQ); }
+  __forceinline vboolf16 ne(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask.packedMask16(),a.m512(),b.m512(),_MM_CMPINT_NE); }
+  __forceinline vboolf16 lt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask.packedMask16(),a.m512(),b.m512(),_MM_CMPINT_LT); }
+  __forceinline vboolf16 ge(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask.packedMask16(),a.m512(),b.m512(),_MM_CMPINT_GE); }
+  __forceinline vboolf16 gt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask.packedMask16(),a.m512(),b.m512(),_MM_CMPINT_GT); }
+  __forceinline vboolf16 le(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask.packedMask16(),a.m512(),b.m512(),_MM_CMPINT_LE); }

   __forceinline vfloat16 select(const vboolf16& s, const vfloat16& t, const vfloat16& f) {
-    return _mm512_mask_blend_ps(s, f, t);
+    return _mm512_mask_blend_ps(s.packedMask16(), f.m512(), t.m512());
   }

   __forceinline vfloat16 lerp(const vfloat16& a, const vfloat16& b, const vfloat16& t) {
@@ -332,55 +335,55 @@ namespace embree
   ////////////////////////////////////////////////////////////////////////////////

   __forceinline vfloat16 floor(const vfloat16& a) {
-    return _mm512_floor_ps(a);
+    return _mm512_floor_ps(a.m512());
   }
   __forceinline vfloat16 ceil (const vfloat16& a) {
-    return _mm512_ceil_ps(a);
+    return _mm512_ceil_ps(a.m512());
   }
   __forceinline vfloat16 round (const vfloat16& a) {
-    return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+    return _mm512_roundscale_ps(a.m512(), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
   }

   __forceinline vint16 floori (const vfloat16& a) {
-    return _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+    return _mm512_cvt_roundps_epi32(a.m512(), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
   }

   ////////////////////////////////////////////////////////////////////////////////
   /// Movement/Shifting/Shuffling Functions
   ////////////////////////////////////////////////////////////////////////////////

-  __forceinline vfloat16 unpacklo(const vfloat16& a, const vfloat16& b) { return _mm512_unpacklo_ps(a, b); }
-  __forceinline vfloat16 unpackhi(const vfloat16& a, const vfloat16& b) { return _mm512_unpackhi_ps(a, b); }
+  __forceinline vfloat16 unpacklo(const vfloat16& a, const vfloat16& b) { return _mm512_unpacklo_ps(a.m512(), b.m512()); }
+  __forceinline vfloat16 unpackhi(const vfloat16& a, const vfloat16& b) { return _mm512_unpackhi_ps(a.m512(), b.m512()); }

   template<int i>
   __forceinline vfloat16 shuffle(const vfloat16& v) {
-    return _mm512_permute_ps(v, _MM_SHUFFLE(i, i, i, i));
+    return _mm512_permute_ps(v.m512(), _MM_SHUFFLE(i, i, i, i));
   }

   template<int i0, int i1, int i2, int i3>
   __forceinline vfloat16 shuffle(const vfloat16& v) {
-    return _mm512_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0));
+    return _mm512_permute_ps(v.m512(), _MM_SHUFFLE(i3, i2, i1, i0));
   }

   template<int i>
   __forceinline vfloat16 shuffle4(const vfloat16& v) {
-    return _mm512_shuffle_f32x4(v, v ,_MM_SHUFFLE(i, i, i, i));
+    return _mm512_shuffle_f32x4(v.m512(), v.m512() ,_MM_SHUFFLE(i, i, i, i));
   }

   template<int i0, int i1, int i2, int i3>
   __forceinline vfloat16 shuffle4(const vfloat16& v) {
-    return _mm512_shuffle_f32x4(v, v, _MM_SHUFFLE(i3, i2, i1, i0));
+    return _mm512_shuffle_f32x4(v.m512(), v.m512(), _MM_SHUFFLE(i3, i2, i1, i0));
   }

   __forceinline vfloat16 interleave4_even(const vfloat16& a, const vfloat16& b) {
-    return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xcc), _mm512_castps_si512(b), (_MM_PERM_ENUM)0x4e));
+    return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a.m512()), mm512_int2mask(0xcc), _mm512_castps_si512(b.m512()), (_MM_PERM_ENUM)0x4e));
   }

   __forceinline vfloat16 interleave4_odd(const vfloat16& a, const vfloat16& b) {
-    return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x33), _mm512_castps_si512(a), (_MM_PERM_ENUM)0x4e));
+    return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b.m512()), mm512_int2mask(0x33), _mm512_castps_si512(a.m512()), (_MM_PERM_ENUM)0x4e));
   }

   __forceinline vfloat16 permute(vfloat16 v, __m512i index) {
-    return _mm512_castsi512_ps(_mm512_permutexvar_epi32(index, _mm512_castps_si512(v)));
+    return _mm512_castsi512_ps(_mm512_permutexvar_epi32(index, _mm512_castps_si512(v.m512())));
   }

   __forceinline vfloat16 reverse(const vfloat16& v) {
@@ -389,12 +392,12 @@ namespace embree

   template<int i>
   __forceinline vfloat16 align_shift_right(const vfloat16& a, const vfloat16& b) {
-    return _mm512_castsi512_ps(_mm512_alignr_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b),i));
+    return _mm512_castsi512_ps(_mm512_alignr_epi32(_mm512_castps_si512(a.m512()),_mm512_castps_si512(b.m512()),i));
   };

   template<int i>
   __forceinline vfloat16 mask_align_shift_right(const vboolf16& mask, vfloat16& c, const vfloat16& a, const vfloat16& b) {
-    return _mm512_castsi512_ps(_mm512_mask_alignr_epi32(_mm512_castps_si512(c),mask,_mm512_castps_si512(a),_mm512_castps_si512(b),i));
+    return _mm512_castsi512_ps(_mm512_mask_alignr_epi32(_mm512_castps_si512(c.m512()),mask.packedMask16(),_mm512_castps_si512(a.m512()),_mm512_castps_si512(b.m512()),i));
   };

   __forceinline vfloat16 shift_left_1(const vfloat16& a) {
@@ -406,27 +409,27 @@ namespace embree
     return align_shift_right<1>(zero,x);
   }

-  __forceinline float toScalar(const vfloat16& v) { return mm512_cvtss_f32(v); }
+  __forceinline float toScalar(const vfloat16& v) { return mm512_cvtss_f32(v.m512()); }

-  template<int i> __forceinline vfloat16 insert4(const vfloat16& a, const vfloat4& b) { return _mm512_insertf32x4(a, b, i); }
+  template<int i> __forceinline vfloat16 insert4(const vfloat16& a, const vfloat4& b) { return _mm512_insertf32x4(a.m512(), b.m128(), i); }

   template<int N, int i> vfloat<N> extractN(const vfloat16& v);

-  template<> __forceinline vfloat4 extractN<4,0>(const vfloat16& v) { return _mm512_castps512_ps128(v); }
-  template<> __forceinline vfloat4 extractN<4,1>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 1); }
-  template<> __forceinline vfloat4 extractN<4,2>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 2); }
-  template<> __forceinline vfloat4 extractN<4,3>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 3); }
+  template<> __forceinline vfloat4 extractN<4,0>(const vfloat16& v) { return _mm512_castps512_ps128(v.m512()); }
+  template<> __forceinline vfloat4 extractN<4,1>(const vfloat16& v) { return _mm512_extractf32x4_ps(v.m512(), 1); }
+  template<> __forceinline vfloat4 extractN<4,2>(const vfloat16& v) { return _mm512_extractf32x4_ps(v.m512(), 2); }
+  template<> __forceinline vfloat4 extractN<4,3>(const vfloat16& v) { return _mm512_extractf32x4_ps(v.m512(), 3); }

-  template<> __forceinline vfloat8 extractN<8,0>(const vfloat16& v) { return _mm512_castps512_ps256(v); }
-  template<> __forceinline vfloat8 extractN<8,1>(const vfloat16& v) { return _mm512_extractf32x8_ps(v, 1); }
+  template<> __forceinline vfloat8 extractN<8,0>(const vfloat16& v) { return _mm512_castps512_ps256(v.m512()); }
+  template<> __forceinline vfloat8 extractN<8,1>(const vfloat16& v) { return _mm512_extractf32x8_ps(v.m512(), 1); }

-  template<int i> __forceinline vfloat4 extract4 (const vfloat16& v) { return _mm512_extractf32x4_ps(v, i); }
-  template<> __forceinline vfloat4 extract4<0>(const vfloat16& v) { return _mm512_castps512_ps128(v); }
+  template<int i> __forceinline vfloat4 extract4 (const vfloat16& v) { return _mm512_extractf32x4_ps(v.m512(), i); }
+  template<> __forceinline vfloat4 extract4<0>(const vfloat16& v) { return _mm512_castps512_ps128(v.m512()); }

-  template<int i> __forceinline vfloat8 extract8 (const vfloat16& v) { return _mm512_extractf32x8_ps(v, i); }
-  template<> __forceinline vfloat8 extract8<0>(const vfloat16& v) { return _mm512_castps512_ps256(v); }
+  template<int i> __forceinline vfloat8 extract8 (const vfloat16& v) { return _mm512_extractf32x8_ps(v.m512(), i); }
+  template<> __forceinline vfloat8 extract8<0>(const vfloat16& v) { return _mm512_castps512_ps256(v.m512()); }

   ////////////////////////////////////////////////////////////////////////////////
   /// Transpose
@@ -513,11 +516,11 @@ namespace embree
   __forceinline float reduce_max(const vfloat16& v) { return toScalar(vreduce_max(v)); }

   __forceinline size_t select_min(const vfloat16& v) {
-    return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_min(v)),_MM_CMPINT_EQ)));
+    return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v.m512()),_mm512_castps_si512(vreduce_min(v).m512()),_MM_CMPINT_EQ)));
   }

   __forceinline size_t select_max(const vfloat16& v) {
-    return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_max(v)),_MM_CMPINT_EQ)));
+    return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v.m512()),_mm512_castps_si512(vreduce_max(v).m512()),_MM_CMPINT_EQ)));
   }

   __forceinline size_t select_min(const vboolf16& valid, const vfloat16& v)
diff --git a/common/simd/vfloat4_sse2.h b/common/simd/vfloat4_sse2.h
index fccf11fe0c..e8792d564b 100644
--- a/common/simd/vfloat4_sse2.h
+++ b/common/simd/vfloat4_sse2.h
@@ -37,21 +37,23 @@ namespace embree
     __forceinline vfloat4& operator =(const vfloat4& other) { v = other.v; return *this; }

     __forceinline vfloat(__m128 a) : v(a) {}
-    __forceinline operator const __m128&() const { return v; }
-    __forceinline operator __m128&() { return v; }
+    __forceinline const __m128& m128() const { return v; }
+    __forceinline __m128& m128() { return v; }
+    __forceinline __m128i m128i() const { return _mm_cvtps_epi32(v); }
+    __forceinline __m128i vec_int() const { return m128i(); }

     __forceinline vfloat(float a) : v(_mm_set1_ps(a)) {}
     __forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {}

-    __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {}
-#if defined(__aarch64__)
+    __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a.m128i())) {}
+#if defined(__aarch64__) || defined(_M_ARM64)
     __forceinline explicit vfloat(const vuint4& x) {
       v = vcvtq_f32_u32(vreinterpretq_u32_s32(x.v));
     }
#else __forceinline explicit vfloat(const vuint4& x) { - const __m128i a = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF)); - const __m128i b = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31 + const __m128i a = _mm_and_si128(x.m128i(),_mm_set1_epi32(0x7FFFFFFF)); + const __m128i b = _mm_and_si128(_mm_srai_epi32(x.m128i(),31),_mm_set1_epi32(0x4F000000)); //0x4F000000 = 2^31 const __m128 af = _mm_cvtepi32_ps(a); const __m128 bf = _mm_castsi128_ps(b); v = _mm_add_ps(af,bf); @@ -76,25 +78,27 @@ namespace embree static __forceinline vfloat4 load (const void* a) { return _mm_load_ps((float*)a); } static __forceinline vfloat4 loadu(const void* a) { return _mm_loadu_ps((float*)a); } - static __forceinline void store (void* ptr, const vfloat4& v) { _mm_store_ps((float*)ptr,v); } - static __forceinline void storeu(void* ptr, const vfloat4& v) { _mm_storeu_ps((float*)ptr,v); } + static __forceinline void store (void* ptr, const vfloat4& v) { _mm_store_ps((float*)ptr,v.m128()); } + static __forceinline void storeu(void* ptr, const vfloat4& v) { _mm_storeu_ps((float*)ptr,v.m128()); } #if defined(__AVX512VL__) - static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_ps (_mm_setzero_ps(),mask,(float*)ptr); } - static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_ps(_mm_setzero_ps(),mask,(float*)ptr); } + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_ps (_mm_setzero_ps(),mask.packedMask8(),(float*)ptr); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_ps(_mm_setzero_ps(),mask.packedMask8(),(float*)ptr); } - static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_store_ps ((float*)ptr,mask,v); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_storeu_ps((float*)ptr,mask,v); } + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_store_ps ((float*)ptr,mask.packedMask8(),v.m128()); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_storeu_ps((float*)ptr,mask.packedMask8(),v.m128()); } #elif defined(__AVX__) - static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } - static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } + // Can be mask.m128i() + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask.mask32()); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask.mask32()); } - static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } +//WARNING not sure about mask.128i() + static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask.mask32(),v.m128()); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask.mask32(),v.m128()); } #else - static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return 
_mm_and_ps(_mm_load_ps ((float*)ptr),mask); } - static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_loadu_ps((float*)ptr),mask); } + static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_load_ps ((float*)ptr),mask.m128()); } + static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_loadu_ps((float*)ptr),mask.m128()); } static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { store (ptr,select(mask,v,load (ptr))); } static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { storeu(ptr,select(mask,v,loadu(ptr))); } @@ -114,7 +118,7 @@ namespace embree #endif } -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) static __forceinline vfloat4 load(const char* ptr) { return __m128(_mm_load4epi8_f32(((__m128i*)ptr))); } @@ -128,7 +132,7 @@ namespace embree } #endif -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) static __forceinline vfloat4 load(const unsigned char* ptr) { return __m128(_mm_load4epu8_f32(((__m128i*)ptr))); } @@ -143,7 +147,7 @@ namespace embree } #endif -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) static __forceinline vfloat4 load(const short* ptr) { return __m128(_mm_load4epi16_f32(((__m128i*)ptr))); } @@ -158,26 +162,26 @@ namespace embree #endif static __forceinline vfloat4 load(const unsigned short* ptr) { - return _mm_mul_ps(vfloat4(vint4::load(ptr)),vfloat4(1.0f/65535.0f)); + return _mm_mul_ps(vfloat4(vint4::load(ptr)).m128(),vfloat4(1.0f/65535.0f).m128()); } static __forceinline void store_nt(void* ptr, const vfloat4& v) { #if defined (__SSE4_1__) -#if defined(__aarch64__) - _mm_stream_ps((float*)ptr,v); +#if defined(__aarch64__) || defined(_M_ARM64) + _mm_stream_ps((float*)ptr,v.m128()); #else - _mm_stream_ps((float*)ptr,v); + _mm_stream_ps((float*)ptr,v.m128()); #endif #else - _mm_store_ps((float*)ptr,v); + _mm_store_ps((float*)ptr,v.m128()); #endif } template static __forceinline vfloat4 gather(const float* ptr, const vint4& index) { -#if defined(__AVX2__) && !defined(__aarch64__) - return _mm_i32gather_ps(ptr, index, scale); +#if defined(__AVX2__) && !defined(__aarch64__) && !defined(_M_ARM64) + return _mm_i32gather_ps(ptr, index.m128i(), scale); #else return vfloat4( *(float*)(((char*)ptr)+scale*index[0]), @@ -191,9 +195,9 @@ namespace embree static __forceinline vfloat4 gather(const vboolf4& mask, const float* ptr, const vint4& index) { vfloat4 r = zero; #if defined(__AVX512VL__) - return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale); -#elif defined(__AVX2__) && !defined(__aarch64__) - return _mm_mask_i32gather_ps(r, ptr, index, mask, scale); + return _mm_mmask_i32gather_ps(r.m128(), mask.packedMask8(), index.m128i(), ptr, scale); +#elif defined(__AVX2__) && !defined(__aarch64__) && !defined(_M_ARM64) + return _mm_mask_i32gather_ps(r.m128(), ptr, index.m128i(), mask.m128(), scale); #else if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]); if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]); @@ -207,7 +211,7 @@ namespace embree static __forceinline void scatter(void* ptr, const vint4& index, const vfloat4& v) { #if defined(__AVX512VL__) - _mm_i32scatter_ps((float*)ptr, index, v, scale); + _mm_i32scatter_ps((float*)ptr, index.m128i(), v.m128(), scale); #else *(float*)(((char*)ptr)+scale*index[0]) = v[0]; *(float*)(((char*)ptr)+scale*index[1]) = v[1]; @@ -220,7 +224,7 @@ namespace 
embree static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vfloat4& v) { #if defined(__AVX512VL__) - _mm_mask_i32scatter_ps((float*)ptr ,mask, index, v, scale); + _mm_mask_i32scatter_ps((float*)ptr ,mask.packedMask8(), index.m128i(), v.m128(), scale); #else if (likely(mask[0])) *(float*)(((char*)ptr)+scale*index[0]) = v[0]; if (likely(mask[1])) *(float*)(((char*)ptr)+scale*index[1]) = v[1]; @@ -245,11 +249,11 @@ namespace embree friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) { #if defined(__AVX512VL__) - return _mm_mask_blend_ps(m, f, t); -#elif defined(__SSE4_1__) || (defined(__aarch64__)) - return _mm_blendv_ps(f, t, m); + return _mm_mask_blend_ps(m.packedMask8(), f.m128(), t.m128()); +#elif defined(__SSE4_1__) || (defined(__aarch64__) || defined(_M_ARM64)) + return _mm_blendv_ps(f.m128(), t.m128(), m.m128()); #else - return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); + return _mm_or_ps(_mm_and_ps(m.m128(), t.m128()), _mm_andnot_ps(m.m128(), f.m128())); #endif } }; @@ -271,79 +275,79 @@ namespace embree /// Unary Operators //////////////////////////////////////////////////////////////////////////////// - __forceinline vfloat4 asFloat(const vint4& a) { return _mm_castsi128_ps(a); } - __forceinline vint4 asInt (const vfloat4& a) { return _mm_castps_si128(a); } - __forceinline vuint4 asUInt (const vfloat4& a) { return _mm_castps_si128(a); } + __forceinline vfloat4 asFloat(const vint4& a) { return _mm_castsi128_ps(a.m128i()); } + __forceinline vint4 asInt (const vfloat4& a) { return _mm_castps_si128(a.m128()); } + __forceinline vuint4 asUInt (const vfloat4& a) { return _mm_castps_si128(a.m128()); } - __forceinline vint4 toInt (const vfloat4& a) { return vint4(a); } - __forceinline vfloat4 toFloat(const vint4& a) { return vfloat4(a); } + __forceinline vint4 toInt (const vfloat4& a) { return vint4(a.m128i()); } + __forceinline vfloat4 toFloat(const vint4& a) { return vfloat4(a.m128()); } __forceinline vfloat4 operator +(const vfloat4& a) { return a; } -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) __forceinline vfloat4 operator -(const vfloat4& a) { - return vnegq_f32(a); + return vnegq_f32(a.m128()); } #else - __forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } + __forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a.m128(), _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } #endif -#if defined(__aarch64__) - __forceinline vfloat4 abs(const vfloat4& a) { return _mm_abs_ps(a); } +#if defined(__aarch64__) || defined(_M_ARM64) + __forceinline vfloat4 abs(const vfloat4& a) { return _mm_abs_ps(a.m128()); } #else - __forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); } + __forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a.m128(), _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); } #endif #if defined(__AVX512VL__) - __forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); } + __forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a.m128(), vfloat4(zero).m128(), _CMP_LT_OQ), vfloat4(one).m128(), (-vfloat4(one)).m128()); } #else - __forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); } + __forceinline vfloat4 sign(const vfloat4& a) { 
return blendv_ps(vfloat4(one).m128(), (-vfloat4(one)).m128(), _mm_cmplt_ps(a.m128(), vfloat4(zero).m128())); } #endif - __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } + __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a.m128(),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } __forceinline vfloat4 rcp(const vfloat4& a) { -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) return vfloat4(vdivq_f32(vdupq_n_f32(1.0f),a.v)); #else #if defined(__AVX512VL__) - const vfloat4 r = _mm_rcp14_ps(a); + const vfloat4 r = _mm_rcp14_ps(a.m128()); #else - const vfloat4 r = _mm_rcp_ps(a); + const vfloat4 r = _mm_rcp_ps(a.m128()); #endif #if defined(__AVX2__) - return _mm_fmadd_ps(r, _mm_fnmadd_ps(a, r, vfloat4(1.0f)), r); // computes r + r * (1 - a * r) + return _mm_fmadd_ps(r.m128(), _mm_fnmadd_ps(a.m128(), r.m128(), vfloat4(1.0f).m128()), r.m128()); // computes r + r * (1 - a * r) #else - return _mm_add_ps(r,_mm_mul_ps(r, _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r)))); // computes r + r * (1 - a * r) + return _mm_add_ps(r.m128(),_mm_mul_ps(r.m128(), _mm_sub_ps(vfloat4(1.0f).m128(), _mm_mul_ps(a.m128(), r.m128())))); // computes r + r * (1 - a * r) #endif -#endif //defined(__aarch64__) +#endif //defined(__aarch64__) || defined(_M_ARM64) } - __forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); } - __forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); } + __forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a.m128(),a.m128()); } + __forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a.m128()); } __forceinline vfloat4 rsqrt(const vfloat4& a) { -#if defined(__aarch64__) - vfloat4 r = _mm_rsqrt_ps(a); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); - r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); +#if defined(__aarch64__) || defined(_M_ARM64) + vfloat4 r = _mm_rsqrt_ps(a.m128()); + r = vmulq_f32(r.m128(), vrsqrtsq_f32(vmulq_f32(a.m128(), r.m128()), r.m128())); + r = vmulq_f32(r.m128(), vrsqrtsq_f32(vmulq_f32(a.m128(), r.m128()), r.m128())); + r = vmulq_f32(r.m128(), vrsqrtsq_f32(vmulq_f32(a.m128(), r.m128()), r.m128())); return r; #else #if defined(__AVX512VL__) - vfloat4 r = _mm_rsqrt14_ps(a); + vfloat4 r = _mm_rsqrt14_ps(a.m128()); #else - vfloat4 r = _mm_rsqrt_ps(a); + vfloat4 r = _mm_rsqrt_ps(a.m128()); #endif #if defined(__AVX2__) - r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r, _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + r = _mm_fmadd_ps(_mm_set1_ps(1.5f), r.m128(), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128(), _mm_set1_ps(-0.5f)), r.m128()), _mm_mul_ps(r.m128(), r.m128()))); #else - r = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); + r = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r.m128()), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128(), _mm_set1_ps(-0.5f)), r.m128()), _mm_mul_ps(r.m128(), r.m128()))); #endif #endif @@ -351,11 +355,11 @@ namespace embree } __forceinline vboolf4 isnan(const vfloat4& a) { - const vfloat4 b = _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); + const vfloat4 b = _mm_and_ps(a.m128(), _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); #if defined(__AVX512VL__) - return _mm_cmp_epi32_mask(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000), _MM_CMPINT_GT); + return _mm_cmp_epi32_mask(_mm_castps_si128(b.m128()), _mm_set1_epi32(0x7f800000), 
_MM_CMPINT_GT); #else - return _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000))); + return _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_castps_si128(b.m128()), _mm_set1_epi32(0x7f800000))); #endif } @@ -363,63 +367,63 @@ namespace embree /// Binary Operators //////////////////////////////////////////////////////////////////////////////// - __forceinline vfloat4 operator +(const vfloat4& a, const vfloat4& b) { return _mm_add_ps(a, b); } + __forceinline vfloat4 operator +(const vfloat4& a, const vfloat4& b) { return _mm_add_ps(a.m128(), b.m128()); } __forceinline vfloat4 operator +(const vfloat4& a, float b) { return a + vfloat4(b); } __forceinline vfloat4 operator +(float a, const vfloat4& b) { return vfloat4(a) + b; } - __forceinline vfloat4 operator -(const vfloat4& a, const vfloat4& b) { return _mm_sub_ps(a, b); } + __forceinline vfloat4 operator -(const vfloat4& a, const vfloat4& b) { return _mm_sub_ps(a.m128(), b.m128()); } __forceinline vfloat4 operator -(const vfloat4& a, float b) { return a - vfloat4(b); } __forceinline vfloat4 operator -(float a, const vfloat4& b) { return vfloat4(a) - b; } - __forceinline vfloat4 operator *(const vfloat4& a, const vfloat4& b) { return _mm_mul_ps(a, b); } + __forceinline vfloat4 operator *(const vfloat4& a, const vfloat4& b) { return _mm_mul_ps(a.m128(), b.m128()); } __forceinline vfloat4 operator *(const vfloat4& a, float b) { return a * vfloat4(b); } __forceinline vfloat4 operator *(float a, const vfloat4& b) { return vfloat4(a) * b; } - __forceinline vfloat4 operator /(const vfloat4& a, const vfloat4& b) { return _mm_div_ps(a,b); } + __forceinline vfloat4 operator /(const vfloat4& a, const vfloat4& b) { return _mm_div_ps(a.m128(),b.m128()); } __forceinline vfloat4 operator /(const vfloat4& a, float b) { return a/vfloat4(b); } __forceinline vfloat4 operator /(float a, const vfloat4& b) { return vfloat4(a)/b; } - __forceinline vfloat4 operator &(const vfloat4& a, const vfloat4& b) { return _mm_and_ps(a,b); } - __forceinline vfloat4 operator |(const vfloat4& a, const vfloat4& b) { return _mm_or_ps(a,b); } - __forceinline vfloat4 operator ^(const vfloat4& a, const vfloat4& b) { return _mm_xor_ps(a,b); } - __forceinline vfloat4 operator ^(const vfloat4& a, const vint4& b) { return _mm_xor_ps(a,_mm_castsi128_ps(b)); } + __forceinline vfloat4 operator &(const vfloat4& a, const vfloat4& b) { return _mm_and_ps(a.m128(),b.m128()); } + __forceinline vfloat4 operator |(const vfloat4& a, const vfloat4& b) { return _mm_or_ps(a.m128(),b.m128()); } + __forceinline vfloat4 operator ^(const vfloat4& a, const vfloat4& b) { return _mm_xor_ps(a.m128(),b.m128()); } + __forceinline vfloat4 operator ^(const vfloat4& a, const vint4& b) { return _mm_xor_ps(a.m128(),_mm_castsi128_ps(b.m128i())); } - __forceinline vfloat4 min(const vfloat4& a, const vfloat4& b) { return _mm_min_ps(a,b); } - __forceinline vfloat4 min(const vfloat4& a, float b) { return _mm_min_ps(a,vfloat4(b)); } - __forceinline vfloat4 min(float a, const vfloat4& b) { return _mm_min_ps(vfloat4(a),b); } + __forceinline vfloat4 min(const vfloat4& a, const vfloat4& b) { return _mm_min_ps(a.m128(),b.m128()); } + __forceinline vfloat4 min(const vfloat4& a, float b) { return _mm_min_ps(a.m128(),vfloat4(b).m128()); } + __forceinline vfloat4 min(float a, const vfloat4& b) { return _mm_min_ps(vfloat4(a).m128(),b.m128()); } - __forceinline vfloat4 max(const vfloat4& a, const vfloat4& b) { return _mm_max_ps(a,b); } - __forceinline vfloat4 max(const vfloat4& a, float b) { return 
_mm_max_ps(a,vfloat4(b)); } - __forceinline vfloat4 max(float a, const vfloat4& b) { return _mm_max_ps(vfloat4(a),b); } + __forceinline vfloat4 max(const vfloat4& a, const vfloat4& b) { return _mm_max_ps(a.m128(),b.m128()); } + __forceinline vfloat4 max(const vfloat4& a, float b) { return _mm_max_ps(a.m128(),vfloat4(b).m128()); } + __forceinline vfloat4 max(float a, const vfloat4& b) { return _mm_max_ps(vfloat4(a).m128(),b.m128()); } -#if defined(__SSE4_1__) || defined(__aarch64__) +#if defined(__SSE4_1__) || defined(__aarch64__) || defined(_M_ARM64) __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { - const vint4 ai = _mm_castps_si128(a); - const vint4 bi = _mm_castps_si128(b); - const vint4 ci = _mm_min_epi32(ai,bi); - return _mm_castsi128_ps(ci); + const vint4 ai = _mm_castps_si128(a.m128()); + const vint4 bi = _mm_castps_si128(b.m128()); + const vint4 ci = _mm_min_epi32(ai.m128i(),bi.m128i()); + return _mm_castsi128_ps(ci.m128i()); } __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) { - const vint4 ai = _mm_castps_si128(a); - const vint4 bi = _mm_castps_si128(b); - const vint4 ci = _mm_max_epi32(ai,bi); - return _mm_castsi128_ps(ci); + const vint4 ai = _mm_castps_si128(a.m128()); + const vint4 bi = _mm_castps_si128(b.m128()); + const vint4 ci = _mm_max_epi32(ai.m128i(),bi.m128i()); + return _mm_castsi128_ps(ci.m128i()); } __forceinline vfloat4 minui(const vfloat4& a, const vfloat4& b) { - const vint4 ai = _mm_castps_si128(a); - const vint4 bi = _mm_castps_si128(b); - const vint4 ci = _mm_min_epu32(ai,bi); - return _mm_castsi128_ps(ci); + const vint4 ai = _mm_castps_si128(a.m128()); + const vint4 bi = _mm_castps_si128(b.m128()); + const vint4 ci = _mm_min_epu32(ai.m128i(),bi.m128i()); + return _mm_castsi128_ps(ci.m128i()); } __forceinline vfloat4 maxui(const vfloat4& a, const vfloat4& b) { - const vint4 ai = _mm_castps_si128(a); - const vint4 bi = _mm_castps_si128(b); - const vint4 ci = _mm_max_epu32(ai,bi); - return _mm_castsi128_ps(ci); + const vint4 ai = _mm_castps_si128(a.m128()); + const vint4 bi = _mm_castps_si128(b.m128()); + const vint4 ci = _mm_max_epu32(ai.m128i(),bi.m128i()); + return _mm_castsi128_ps(ci.m128i()); } #else __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { @@ -435,11 +439,11 @@ namespace embree /// Ternary Operators //////////////////////////////////////////////////////////////////////////////// -#if defined(__AVX2__) || defined(__ARM_NEON) - __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmadd_ps(a,b,c); } - __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmsub_ps(a,b,c); } - __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmadd_ps(a,b,c); } - __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); } +#if defined(__AVX2__) || defined(__ARM_NEON) || defined(_M_ARM64) + __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmadd_ps(a.m128(),b.m128(),c.m128()); } + __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmsub_ps(a.m128(),b.m128(),c.m128()); } + __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmadd_ps(a.m128(),b.m128(),c.m128()); } + __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a.m128(),b.m128(),c.m128()); } #else 
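// [Editor's note, not part of the patch] On AArch64 the _mm_fmadd_ps family is
// typically supplied by sse2neon on top of the native fused NEON ops (e.g.
// vfmaq_f32), while the fallback below emulates the same results with a
// separate multiply and add. A scalar sketch of the four variants for reference:
//
//   float madd (float a, float b, float c) { return  a*b + c; } // _mm_fmadd_ps
//   float msub (float a, float b, float c) { return  a*b - c; } // _mm_fmsub_ps
//   float nmadd(float a, float b, float c) { return -a*b + c; } // _mm_fnmadd_ps
//   float nmsub(float a, float b, float c) { return -a*b - c; } // _mm_fnmsub_ps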
__forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; } __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;} @@ -469,24 +473,24 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// #if defined(__AVX512VL__) - __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_EQ); } - __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_NE); } - __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LT); } - __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GE); } - __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GT); } - __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LE); } + __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a.m128(), b.m128(), _MM_CMPINT_EQ); } + __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a.m128(), b.m128(), _MM_CMPINT_NE); } + __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a.m128(), b.m128(), _MM_CMPINT_LT); } + __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a.m128(), b.m128(), _MM_CMPINT_GE); } + __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a.m128(), b.m128(), _MM_CMPINT_GT); } + __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a.m128(), b.m128(), _MM_CMPINT_LE); } #else - __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); } - __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); } - __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); } -#if defined(__aarch64__) - __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpge_ps (a, b); } - __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpgt_ps (a, b); } + __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a.m128(), b.m128()); } + __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a.m128(), b.m128()); } + __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a.m128(), b.m128()); } +#if defined(__aarch64__) || defined(_M_ARM64) + __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpge_ps (a.m128(), b.m128()); } + __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpgt_ps (a.m128(), b.m128()); } #else - __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); } - __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); } + __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a.m128(), b.m128()); } + __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a.m128(), b.m128()); } #endif - __forceinline vboolf4 
operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); } + __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a.m128(), b.m128()); } #endif __forceinline vboolf4 operator ==(const vfloat4& a, float b) { return a == vfloat4(b); } @@ -515,12 +519,12 @@ namespace embree __forceinline vboolf4 le(const vfloat4& a, const vfloat4& b) { return a <= b; } #if defined(__AVX512VL__) - __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); } - __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); } - __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); } - __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); } - __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); } - __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); } + __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask.packedMask8(), a.m128(), b.m128(), _MM_CMPINT_EQ); } + __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask.packedMask8(), a.m128(), b.m128(), _MM_CMPINT_NE); } + __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask.packedMask8(), a.m128(), b.m128(), _MM_CMPINT_LT); } + __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask.packedMask8(), a.m128(), b.m128(), _MM_CMPINT_GE); } + __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask.packedMask8(), a.m128(), b.m128(), _MM_CMPINT_GT); } + __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask.packedMask8(), a.m128(), b.m128(), _MM_CMPINT_LE); } #else __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a == b); } __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a != b); } @@ -534,7 +538,7 @@ namespace embree __forceinline vfloat4 select(const vfloat4& t, const vfloat4& f) { #if defined(__SSE4_1__) - return _mm_blend_ps(f, t, mask); + return _mm_blend_ps(f.m128(), t.m128(), mask); #else return select(vboolf4(mask), t, f); #endif @@ -560,16 +564,16 @@ namespace embree /// Rounding Functions //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } // towards -inf __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } // toward +inf __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } // towards 0 __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } // to nearest, ties to even. NOTE(LTE): arm clang uses vrndnq, old gcc uses vrndqn? 
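// [Editor's note, not part of the patch] The four rounding modes above must
// agree between NEON (vrndmq/vrndpq/vrndq/vrndnq) and SSE4.1 (_mm_round_ps).
// A minimal scalar sanity check, assuming <cmath> and the default FE_TONEAREST
// rounding mode:
//
//   #include <cassert>
//   #include <cmath>
//   inline void check_rounding() {
//     assert(std::floor(-1.5f) == -2.0f);    // towards -inf
//     assert(std::ceil (-1.5f) == -1.0f);    // towards +inf
//     assert(std::trunc(-1.5f) == -1.0f);    // towards zero
//     assert(std::nearbyint(2.5f) == 2.0f);  // nearest, ties to even (not 3)
//   }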
#elif defined (__SSE4_1__)
- __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
- __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); }
- __forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO ); }
- __forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
+ __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a.m128(), _MM_FROUND_TO_NEG_INF ); }
+ __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a.m128(), _MM_FROUND_TO_POS_INF ); }
+ __forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a.m128(), _MM_FROUND_TO_ZERO ); }
+ __forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a.m128(), _MM_FROUND_TO_NEAREST_INT); }
#else
__forceinline vfloat4 floor(const vfloat4& a) { return vfloat4(floorf(a[0]),floorf(a[1]),floorf(a[2]),floorf(a[3])); }
__forceinline vfloat4 ceil (const vfloat4& a) { return vfloat4(ceilf (a[0]),ceilf (a[1]),ceilf (a[2]),ceilf (a[3])); }
@@ -579,12 +583,12 @@ namespace embree
__forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); }
__forceinline vint4 floori(const vfloat4& a) {
-#if defined(__aarch64__)
- return vcvtq_s32_f32(floor(a));
+#if defined(__aarch64__) || defined(_M_ARM64)
+ return vcvtq_s32_f32(floor(a).m128());
#elif defined(__SSE4_1__)
- return vint4(floor(a));
+ return vint4(floor(a).m128i());
#else
- return vint4(a-vfloat4(0.5f));
+ return vint4((a-vfloat4(0.5f)).m128i());
#endif
}
@@ -592,34 +596,47 @@ namespace embree
/// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
- __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); }
- __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); }
+ __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a.m128(), b.m128()); }
+ __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a.m128(), b.m128()); }
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
template<int i0, int i1, int i2, int i3> __forceinline vfloat4 shuffle(const vfloat4& v) {
+#if !defined(_M_ARM64)
return vreinterpretq_f32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
+#else
+ // Avoids C4576 (no mixing C+CPP syntax), and C4002 (comma inside macro invocation)
+ uint8x16_t _shuffle = _MN_SHUFFLE(i0, i1, i2, i3);
+ return vreinterpretq_f32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _shuffle));
+#endif
}
template<int i0, int i1, int i2, int i3> __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) {
+#if !defined(_M_ARM64)
return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+#else
+ // Avoids C4576 (no mixing C+CPP syntax), and C4002 (comma inside macro invocation)
+ uint8x16_t _shuffle = _MF_SHUFFLE(i0, i1, i2, i3);
+ uint8x16x2_t _ab = {(uint8x16_t)a.v, (uint8x16_t)b.v};
+ return vreinterpretq_f32_u8(vqtbl2q_u8( _ab, _shuffle));
+#endif
}
#else
template<int i0, int i1, int i2, int i3> __forceinline vfloat4 shuffle(const vfloat4& v) {
- return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+ return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.m128()), _MM_SHUFFLE(i3, i2, i1, i0)));
}
template<int i0, int i1, int i2, int i3> __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) {
- return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+ return _mm_shuffle_ps(a.m128(), b.m128(), _MM_SHUFFLE(i3, i2, i1, i0));
}
#endif
-#if defined(__SSE3__) && !defined(__aarch64__)
- template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); }
- template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); }
- template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); }
+#if defined(__SSE3__) && !defined(__aarch64__) && !defined(_M_ARM64)
+ template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v.m128()); }
+ template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v.m128()); }
+ template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v.m128()))); }
#endif
template<int i>
@@ -627,15 +644,15 @@ namespace embree
return shuffle<i,i,i,i>(v);
}
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
template<int i> __forceinline float extract(const vfloat4& a) { return a[i]; }
#else
- template<int i> __forceinline float extract (const vfloat4& a) { return _mm_cvtss_f32(shuffle<i,i,i,i>(a)); }
- template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); }
+ template<int i> __forceinline float extract (const vfloat4& a) { return _mm_cvtss_f32(shuffle<i,i,i,i>(a).m128()); }
+ template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a.m128()); }
#endif
-#if defined (__SSE4_1__) && !defined(__aarch64__)
- template<int dst, int src, int clr> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
+#if defined (__SSE4_1__) && !defined(__aarch64__) && !defined(_M_ARM64)
+ template<int dst, int src, int clr> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a.m128(), b.m128(), (dst << 4) | (src << 6) | clr); }
template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert<dst, src, 0>(a, b); }
template<int dst> __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert<dst, 0>(a, _mm_set_ss(b)); }
#else
@@ -643,15 +660,15 @@ namespace embree
template<int dst> __forceinline vfloat4 insert(const vfloat4& a, float b) { vfloat4 c = a; c[dst&3] = b; return c; }
#endif
- __forceinline float toScalar(const vfloat4& v) { return _mm_cvtss_f32(v); }
+ __forceinline float toScalar(const vfloat4& v) { return _mm_cvtss_f32(v.m128()); }
__forceinline vfloat4 shift_right_1(const vfloat4& x) {
- return _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(x), 4));
+ return _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(x.m128()), 4));
}
#if defined (__AVX2__)
__forceinline vfloat4 permute(const vfloat4 &a, const __m128i &index) {
- return _mm_permutevar_ps(a,index);
+ return _mm_permutevar_ps(a.m128(),index);
}
__forceinline vfloat4 broadcast1f(const void* a) { return _mm_broadcast_ss((float*)a); }
@@ -661,7 +678,7 @@ namespace embree
#if defined(__AVX512VL__)
template<int i> __forceinline vfloat4 align_shift_right(const vfloat4& a, const vfloat4& b) {
- return _mm_castsi128_ps(_mm_alignr_epi32(_mm_castps_si128(a), _mm_castps_si128(b), i));
+ return _mm_castsi128_ps(_mm_alignr_epi32(_mm_castps_si128(a.m128()), _mm_castps_si128(b.m128()), i));
}
#endif
@@ -736,24 +753,24 @@ namespace embree
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////
-#if defined(__aarch64__)
- __forceinline vfloat4
vreduce_min(const vfloat4& v) { float h = vminvq_f32(v); return vdupq_n_f32(h); } - __forceinline vfloat4 vreduce_max(const vfloat4& v) { float h = vmaxvq_f32(v); return vdupq_n_f32(h); } - __forceinline vfloat4 vreduce_add(const vfloat4& v) { float h = vaddvq_f32(v); return vdupq_n_f32(h); } +#if defined(__aarch64__) || defined(_M_ARM64) + __forceinline vfloat4 vreduce_min(const vfloat4& v) { float h = vminvq_f32(v.m128()); return vdupq_n_f32(h); } + __forceinline vfloat4 vreduce_max(const vfloat4& v) { float h = vmaxvq_f32(v.m128()); return vdupq_n_f32(h); } + __forceinline vfloat4 vreduce_add(const vfloat4& v) { float h = vaddvq_f32(v.m128()); return vdupq_n_f32(h); } #else __forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); } __forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); } __forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v) + v ; return shuffle<2,3,0,1>(h) + h ; } #endif -#if defined(__aarch64__) - __forceinline float reduce_min(const vfloat4& v) { return vminvq_f32(v); } - __forceinline float reduce_max(const vfloat4& v) { return vmaxvq_f32(v); } - __forceinline float reduce_add(const vfloat4& v) { return vaddvq_f32(v); } +#if defined(__aarch64__) || defined(_M_ARM64) + __forceinline float reduce_min(const vfloat4& v) { return vminvq_f32(v.m128()); } + __forceinline float reduce_max(const vfloat4& v) { return vmaxvq_f32(v.m128()); } + __forceinline float reduce_add(const vfloat4& v) { return vaddvq_f32(v.m128()); } #else - __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); } - __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); } - __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); } + __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v).m128()); } + __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v).m128()); } + __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v).m128()); } #endif __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v) diff --git a/common/simd/vfloat8_avx.h b/common/simd/vfloat8_avx.h index b09d5e641d..a1d5a1cb92 100644 --- a/common/simd/vfloat8_avx.h +++ b/common/simd/vfloat8_avx.h @@ -35,11 +35,13 @@ namespace embree __forceinline vfloat8& operator =(const vfloat8& other) { v = other.v; return *this; } __forceinline vfloat(__m256 a) : v(a) {} - __forceinline operator const __m256&() const { return v; } - __forceinline operator __m256&() { return v; } + __forceinline const __m256& m256() const { return v; } + __forceinline __m256& m256() { return v; } + __forceinline __m256i m256i() const { return _mm256_cvtps_epi32(v); } + __forceinline __m256i vec_int() const { return m256i(); } - __forceinline explicit vfloat(const vfloat4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {} - __forceinline vfloat(const vfloat4& a, const vfloat4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} + __forceinline explicit vfloat(const vfloat4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a.m128()),a.m128(),1)) {} + __forceinline vfloat(const vfloat4& a, const vfloat4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a.m128()),b.m128(),1)) {} __forceinline explicit vfloat(const char* a) : v(_mm256_loadu_ps((const float*)a)) {} 
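// [Editor's note, not part of the patch] The recurring change in this file is
// mechanical: the implicit `operator __m256&()` conversions are replaced by
// explicit m256()/m256i() accessors. On the MSVC ARM64 path the AVX types are
// emulated as structs (note the .v.lo/.v.hi halves used elsewhere in this
// diff), where implicit user-defined conversions no longer resolve cleanly at
// intrinsic call sites. A minimal sketch of the pattern, with hypothetical
// names:
//
//   struct wide8 {
//     __m256 v;
//     const __m256& m256() const { return v; } // explicit accessor
//     __m256&       m256()       { return v; } // instead of operator __m256&()
//   };
//   inline wide8 add(const wide8& a, const wide8& b) {
//     return wide8{ _mm256_add_ps(a.m256(), b.m256()) }; // spelled out at call sites
//   }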
__forceinline vfloat(float a) : v(_mm256_set1_ps(a)) {}
@@ -96,22 +98,22 @@ namespace embree
static __forceinline vfloat8 load (const void* ptr) { return _mm256_load_ps((float*)ptr); }
static __forceinline vfloat8 loadu(const void* ptr) { return _mm256_loadu_ps((float*)ptr); }
- static __forceinline void store (void* ptr, const vfloat8& v) { return _mm256_store_ps((float*)ptr,v); }
- static __forceinline void storeu(void* ptr, const vfloat8& v) { return _mm256_storeu_ps((float*)ptr,v); }
+ static __forceinline void store (void* ptr, const vfloat8& v) { return _mm256_store_ps((float*)ptr,v.m256()); }
+ static __forceinline void storeu(void* ptr, const vfloat8& v) { return _mm256_storeu_ps((float*)ptr,v.m256()); }
#if defined(__AVX512VL__)
- static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_ps (_mm256_setzero_ps(),mask,(float*)ptr); }
- static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_ps(_mm256_setzero_ps(),mask,(float*)ptr); }
+ static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_ps (_mm256_setzero_ps(),mask.packedMask8(),(float*)ptr); }
+ static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_ps(_mm256_setzero_ps(),mask.packedMask8(),(float*)ptr); }
- static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); }
- static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); }
+ static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask.packedMask8(),v.m256()); }
+ static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask.packedMask8(),v.m256()); }
#else
static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,_mm256_castps_si256(mask.v)); }
static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,_mm256_castps_si256(mask.v)); }
- static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),v); }
- static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),v); }
+ static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),v.m256()); }
+ static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),v.m256()); }
#endif
#if defined(__AVX2__)
@@ -121,13 +123,13 @@ namespace embree
#endif
static __forceinline void store_nt(void* ptr, const vfloat8& v) {
- _mm256_stream_ps((float*)ptr,v);
+ _mm256_stream_ps((float*)ptr,v.m256());
}
template<int scale = 4> static __forceinline vfloat8 gather(const float* ptr, const vint8& index) {
-#if defined(__AVX2__) && !defined(__aarch64__)
- return _mm256_i32gather_ps(ptr, index ,scale);
+#if defined(__AVX2__) && !defined(__aarch64__) && !defined(_M_ARM64)
+ return _mm256_i32gather_ps(ptr, index.m256i() ,scale);
#else
return vfloat8(
*(float*)(((char*)ptr)+scale*index[0]),
@@ -145,9 +147,9 @@ namespace embree
static __forceinline vfloat8 gather(const vboolf8& mask, const float* ptr, const vint8& index) {
vfloat8 r = zero;
#if defined(__AVX512VL__)
- return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale);
-#elif defined(__AVX2__) && !defined(__aarch64__)
- return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale);
+ return _mm256_mmask_i32gather_ps(r.m256(), mask.packedMask8(), index.m256i(), ptr, scale);
+#elif defined(__AVX2__) && !defined(__aarch64__) && !defined(_M_ARM64)
+ return _mm256_mask_i32gather_ps(r.m256(), ptr, index.m256i(), mask.m256(), scale);
#else
if (likely(mask[0])) r[0] = *(float*)(((char*)ptr)+scale*index[0]);
if (likely(mask[1])) r[1] = *(float*)(((char*)ptr)+scale*index[1]);
@@ -165,7 +167,7 @@ namespace embree
static __forceinline void scatter(void* ptr, const vint8& ofs, const vfloat8& v)
{
#if defined(__AVX512VL__)
- _mm256_i32scatter_ps((float*)ptr, ofs, v, scale);
+ _mm256_i32scatter_ps((float*)ptr, ofs.m256i(), v.m256(), scale);
#else
*(float*)(((char*)ptr)+scale*ofs[0]) = v[0];
*(float*)(((char*)ptr)+scale*ofs[1]) = v[1];
@@ -182,7 +184,7 @@ namespace embree
static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vfloat8& v)
{
#if defined(__AVX512VL__)
- _mm256_mask_i32scatter_ps((float*)ptr, mask, ofs, v, scale);
+ _mm256_mask_i32scatter_ps((float*)ptr, mask.packedMask8(), ofs.m256i(), v.m256(), scale);
#else
if (likely(mask[0])) *(float*)(((char*)ptr)+scale*ofs[0]) = v[0];
if (likely(mask[1])) *(float*)(((char*)ptr)+scale*ofs[1]) = v[1];
@@ -208,17 +210,17 @@ namespace embree
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
- __forceinline vfloat8 asFloat(const vint8& a) { return _mm256_castsi256_ps(a); }
- __forceinline vint8 asInt (const vfloat8& a) { return _mm256_castps_si256(a); }
+ __forceinline vfloat8 asFloat(const vint8& a) { return _mm256_castsi256_ps(a.m256i()); }
+ __forceinline vint8 asInt (const vfloat8& a) { return _mm256_castps_si256(a.m256()); }
- __forceinline vint8 toInt (const vfloat8& a) { return vint8(a); }
- __forceinline vfloat8 toFloat(const vint8& a) { return vfloat8(a); }
+ __forceinline vint8 toInt (const vfloat8& a) { return vint8(a.m256i()); }
+ __forceinline vfloat8 toFloat(const vint8& a) { return vfloat8(a.m256()); }
__forceinline vfloat8 operator +(const vfloat8& a) { return a; }
-#if !defined(__aarch64__)
+#if !defined(__aarch64__) && !defined(_M_ARM64)
__forceinline vfloat8 operator -(const vfloat8& a) {
const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
- return _mm256_xor_ps(a, mask);
+ return _mm256_xor_ps(a.m256(), mask);
}
#else
__forceinline vfloat8 operator -(const vfloat8& a) {
@@ -229,10 +231,10 @@ namespace embree
}
#endif
-#if !defined(__aarch64__)
+#if !defined(__aarch64__) && !defined(_M_ARM64)
__forceinline vfloat8 abs(const vfloat8& a) {
const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
- return _mm256_and_ps(a, mask);
+ return _mm256_and_ps(a.m256(), mask);
}
#else
__forceinline vfloat8 abs(const vfloat8& a) {
@@ -243,17 +245,17 @@ __forceinline vfloat8 abs(const vfloat8& a) {
}
#endif
-#if !defined(__aarch64__)
- __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); }
+#if !defined(__aarch64__) && !defined(_M_ARM64)
+ __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one).m256(), (-vfloat8(one)).m256(), _mm256_cmp_ps(a.m256(), vfloat8(zero).m256(), _CMP_NGE_UQ)); }
#else
- __forceinline vfloat8 sign (const vfloat8& a) { return
_mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmplt_ps(a, vfloat8(zero))); } + __forceinline vfloat8 sign (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one).m256(), (-vfloat8(one)).m256(), _mm256_cmplt_ps(a.m256(), vfloat8(zero).m256())); } #endif - __forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); } + __forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a.m256(),_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); } static __forceinline vfloat8 rcp(const vfloat8& a) { -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) vfloat8 ret; const float32x4_t one = vdupq_n_f32(1.0f); ret.v.lo = vdivq_f32(one, a.v.lo); @@ -262,37 +264,37 @@ __forceinline vfloat8 abs(const vfloat8& a) { #endif #if defined(__AVX512VL__) - const vfloat8 r = _mm256_rcp14_ps(a); + const vfloat8 r = _mm256_rcp14_ps(a.m256()); #else - const vfloat8 r = _mm256_rcp_ps(a); + const vfloat8 r = _mm256_rcp_ps(a.m256()); #endif #if defined(__AVX2__) // First, compute 1 - a * r (which will be very close to 0) - const vfloat8 h_n = _mm256_fnmadd_ps(a, r, vfloat8(1.0f)); + const vfloat8 h_n = _mm256_fnmadd_ps(a.m256(), r.m256(), vfloat8(1.0f).m256()); // Then compute r + r * h_n - return _mm256_fmadd_ps(r, h_n, r); + return _mm256_fmadd_ps(r.m256(), h_n.m256(), r.m256()); #else - return _mm256_add_ps(r,_mm256_mul_ps(r, _mm256_sub_ps(vfloat8(1.0f), _mm256_mul_ps(a, r)))); // computes r + r * (1 - a * r) + return _mm256_add_ps(r.m256(),_mm256_mul_ps(r.m256(), _mm256_sub_ps(vfloat8(1.0f).m256(), _mm256_mul_ps(a.m256(), r.m256())))); // computes r + r * (1 - a * r) #endif } - __forceinline vfloat8 sqr (const vfloat8& a) { return _mm256_mul_ps(a,a); } - __forceinline vfloat8 sqrt(const vfloat8& a) { return _mm256_sqrt_ps(a); } + __forceinline vfloat8 sqr (const vfloat8& a) { return _mm256_mul_ps(a.m256(),a.m256()); } + __forceinline vfloat8 sqrt(const vfloat8& a) { return _mm256_sqrt_ps(a.m256()); } static __forceinline vfloat8 rsqrt(const vfloat8& a) { #if defined(__AVX512VL__) - const vfloat8 r = _mm256_rsqrt14_ps(a); + const vfloat8 r = _mm256_rsqrt14_ps(a.m256()); #else - const vfloat8 r = _mm256_rsqrt_ps(a); + const vfloat8 r = _mm256_rsqrt_ps(a.m256()); #endif #if defined(__AVX2__) - return _mm256_fmadd_ps(_mm256_set1_ps(1.5f), r, - _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r))); + return _mm256_fmadd_ps(_mm256_set1_ps(1.5f), r.m256(), + _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a.m256(), _mm256_set1_ps(-0.5f)), r.m256()), _mm256_mul_ps(r.m256(), r.m256()))); #else - return _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(1.5f), r), - _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r), _mm256_mul_ps(r, r))); + return _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(1.5f), r.m256()), + _mm256_mul_ps(_mm256_mul_ps(_mm256_mul_ps(a.m256(), _mm256_set1_ps(-0.5f)), r.m256()), _mm256_mul_ps(r.m256(), r.m256()))); #endif } @@ -300,64 +302,64 @@ __forceinline vfloat8 abs(const vfloat8& a) { /// Binary Operators //////////////////////////////////////////////////////////////////////////////// - __forceinline vfloat8 operator +(const vfloat8& a, const vfloat8& b) { return _mm256_add_ps(a, b); } + __forceinline vfloat8 operator +(const vfloat8& a, const vfloat8& b) { return _mm256_add_ps(a.m256(), b.m256()); } __forceinline vfloat8 operator +(const vfloat8& a, float b) { return a + vfloat8(b); } __forceinline vfloat8 operator +(float a, const vfloat8& b) { 
return vfloat8(a) + b; } - __forceinline vfloat8 operator -(const vfloat8& a, const vfloat8& b) { return _mm256_sub_ps(a, b); } + __forceinline vfloat8 operator -(const vfloat8& a, const vfloat8& b) { return _mm256_sub_ps(a.m256(), b.m256()); } __forceinline vfloat8 operator -(const vfloat8& a, float b) { return a - vfloat8(b); } __forceinline vfloat8 operator -(float a, const vfloat8& b) { return vfloat8(a) - b; } - __forceinline vfloat8 operator *(const vfloat8& a, const vfloat8& b) { return _mm256_mul_ps(a, b); } + __forceinline vfloat8 operator *(const vfloat8& a, const vfloat8& b) { return _mm256_mul_ps(a.m256(), b.m256()); } __forceinline vfloat8 operator *(const vfloat8& a, float b) { return a * vfloat8(b); } __forceinline vfloat8 operator *(float a, const vfloat8& b) { return vfloat8(a) * b; } - __forceinline vfloat8 operator /(const vfloat8& a, const vfloat8& b) { return _mm256_div_ps(a, b); } + __forceinline vfloat8 operator /(const vfloat8& a, const vfloat8& b) { return _mm256_div_ps(a.m256(), b.m256()); } __forceinline vfloat8 operator /(const vfloat8& a, float b) { return a / vfloat8(b); } __forceinline vfloat8 operator /(float a, const vfloat8& b) { return vfloat8(a) / b; } - __forceinline vfloat8 operator &(const vfloat8& a, const vfloat8& b) { return _mm256_and_ps(a,b); } - __forceinline vfloat8 operator |(const vfloat8& a, const vfloat8& b) { return _mm256_or_ps(a,b); } - __forceinline vfloat8 operator ^(const vfloat8& a, const vfloat8& b) { return _mm256_xor_ps(a,b); } - __forceinline vfloat8 operator ^(const vfloat8& a, const vint8& b) { return _mm256_xor_ps(a,_mm256_castsi256_ps(b)); } + __forceinline vfloat8 operator &(const vfloat8& a, const vfloat8& b) { return _mm256_and_ps(a.m256(),b.m256()); } + __forceinline vfloat8 operator |(const vfloat8& a, const vfloat8& b) { return _mm256_or_ps(a.m256(),b.m256()); } + __forceinline vfloat8 operator ^(const vfloat8& a, const vfloat8& b) { return _mm256_xor_ps(a.m256(),b.m256()); } + __forceinline vfloat8 operator ^(const vfloat8& a, const vint8& b) { return _mm256_xor_ps(a.m256(),_mm256_castsi256_ps(b.m256i())); } - __forceinline vfloat8 min(const vfloat8& a, const vfloat8& b) { return _mm256_min_ps(a, b); } - __forceinline vfloat8 min(const vfloat8& a, float b) { return _mm256_min_ps(a, vfloat8(b)); } - __forceinline vfloat8 min(float a, const vfloat8& b) { return _mm256_min_ps(vfloat8(a), b); } + __forceinline vfloat8 min(const vfloat8& a, const vfloat8& b) { return _mm256_min_ps(a.m256(), b.m256()); } + __forceinline vfloat8 min(const vfloat8& a, float b) { return _mm256_min_ps(a.m256(), vfloat8(b).m256()); } + __forceinline vfloat8 min(float a, const vfloat8& b) { return _mm256_min_ps(vfloat8(a).m256(), b.m256()); } - __forceinline vfloat8 max(const vfloat8& a, const vfloat8& b) { return _mm256_max_ps(a, b); } - __forceinline vfloat8 max(const vfloat8& a, float b) { return _mm256_max_ps(a, vfloat8(b)); } - __forceinline vfloat8 max(float a, const vfloat8& b) { return _mm256_max_ps(vfloat8(a), b); } + __forceinline vfloat8 max(const vfloat8& a, const vfloat8& b) { return _mm256_max_ps(a.m256(), b.m256()); } + __forceinline vfloat8 max(const vfloat8& a, float b) { return _mm256_max_ps(a.m256(), vfloat8(b).m256()); } + __forceinline vfloat8 max(float a, const vfloat8& b) { return _mm256_max_ps(vfloat8(a).m256(), b.m256()); } /* need "static __forceinline for MSVC, otherwise we'll link the wrong version in debug mode */ #if defined(__AVX2__) static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) { - const 
vint8 ai = _mm256_castps_si256(a); - const vint8 bi = _mm256_castps_si256(b); - const vint8 ci = _mm256_min_epi32(ai,bi); - return _mm256_castsi256_ps(ci); + const vint8 ai = _mm256_castps_si256(a.m256()); + const vint8 bi = _mm256_castps_si256(b.m256()); + const vint8 ci = _mm256_min_epi32(ai.m256i(),bi.m256i()); + return _mm256_castsi256_ps(ci.m256i()); } static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) { - const vint8 ai = _mm256_castps_si256(a); - const vint8 bi = _mm256_castps_si256(b); - const vint8 ci = _mm256_max_epi32(ai,bi); - return _mm256_castsi256_ps(ci); + const vint8 ai = _mm256_castps_si256(a.m256()); + const vint8 bi = _mm256_castps_si256(b.m256()); + const vint8 ci = _mm256_max_epi32(ai.m256i(),bi.m256i()); + return _mm256_castsi256_ps(ci.m256i()); } static __forceinline vfloat8 minui(const vfloat8& a, const vfloat8& b) { - const vint8 ai = _mm256_castps_si256(a); - const vint8 bi = _mm256_castps_si256(b); - const vint8 ci = _mm256_min_epu32(ai,bi); - return _mm256_castsi256_ps(ci); + const vint8 ai = _mm256_castps_si256(a.m256()); + const vint8 bi = _mm256_castps_si256(b.m256()); + const vint8 ci = _mm256_min_epu32(ai.m256i(),bi.m256i()); + return _mm256_castsi256_ps(ci.m256i()); } static __forceinline vfloat8 maxui(const vfloat8& a, const vfloat8& b) { - const vint8 ai = _mm256_castps_si256(a); - const vint8 bi = _mm256_castps_si256(b); - const vint8 ci = _mm256_max_epu32(ai,bi); - return _mm256_castsi256_ps(ci); + const vint8 ai = _mm256_castps_si256(a.m256()); + const vint8 bi = _mm256_castps_si256(b.m256()); + const vint8 ci = _mm256_max_epu32(ai.m256i(),bi.m256i()); + return _mm256_castsi256_ps(ci.m256i()); } #else @@ -377,10 +379,10 @@ __forceinline vfloat8 abs(const vfloat8& a) { //////////////////////////////////////////////////////////////////////////////// #if defined(__AVX2__) - static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmadd_ps(a,b,c); } - static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmsub_ps(a,b,c); } - static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmadd_ps(a,b,c); } - static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmsub_ps(a,b,c); } + static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmadd_ps(a.m256(),b.m256(),c.m256()); } + static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmsub_ps(a.m256(),b.m256(),c.m256()); } + static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmadd_ps(a.m256(),b.m256(),c.m256()); } + static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmsub_ps(a.m256(),b.m256(),c.m256()); } #else static __forceinline vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b+c; } static __forceinline vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b-c; } @@ -409,44 +411,44 @@ __forceinline vfloat8 abs(const vfloat8& a) { //////////////////////////////////////////////////////////////////////////////// #if defined(__AVX512VL__) - static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_EQ); } - static __forceinline vboolf8 operator !=(const vfloat8& a, const 
vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_NE); } - static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LT); } - static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GE); } - static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GT); } - static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LE); } + static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a.m256(), b.m256(), _MM_CMPINT_EQ); } + static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a.m256(), b.m256(), _MM_CMPINT_NE); } + static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a.m256(), b.m256(), _MM_CMPINT_LT); } + static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a.m256(), b.m256(), _MM_CMPINT_GE); } + static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a.m256(), b.m256(), _MM_CMPINT_GT); } + static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a.m256(), b.m256(), _MM_CMPINT_LE); } static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { - return _mm256_mask_blend_ps(m, f, t); + return _mm256_mask_blend_ps(m.packedMask8(), f.m256(), t.m256()); } -#elif !defined(__aarch64__) - static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); } - static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); } - static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); } - static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); } - static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); } - static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); } +#elif !defined(__aarch64__) && !defined(_M_ARM64) + static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a.m256(), b.m256(), _CMP_EQ_OQ); } + static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a.m256(), b.m256(), _CMP_NEQ_UQ); } + static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a.m256(), b.m256(), _CMP_LT_OS); } + static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a.m256(), b.m256(), _CMP_NLT_US); } + static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a.m256(), b.m256(), _CMP_NLE_US); } + static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a.m256(), b.m256(), _CMP_LE_OS); } static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) { - return _mm256_blendv_ps(f, t, m); + return _mm256_blendv_ps(f.m256(), t.m256(), m.m256()); } #else - static __forceinline vboolf8 operator ==(const 
vfloat8& a, const vfloat8& b) { return _mm256_cmpeq_ps(a, b); }
- static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpneq_ps(a, b); }
- static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmplt_ps(a, b); }
- static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpge_ps(a, b); }
- static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmpgt_ps(a, b); }
- static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmple_ps(a, b); }
+ static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmpeq_ps(a.m256(), b.m256()); }
+ static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpneq_ps(a.m256(), b.m256()); }
+ static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmplt_ps(a.m256(), b.m256()); }
+ static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpge_ps(a.m256(), b.m256()); }
+ static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmpgt_ps(a.m256(), b.m256()); }
+ static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmple_ps(a.m256(), b.m256()); }
static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
- return _mm256_blendv_ps(f, t, m);
+ return _mm256_blendv_ps(f.m256(), t.m256(), m.m256());
}
#endif
template<int mask> __forceinline vfloat8 select(const vfloat8& t, const vfloat8& f) {
- return _mm256_blend_ps(f, t, mask);
+ return _mm256_blend_ps(f.m256(), t.m256(), mask);
}
__forceinline vboolf8 operator ==(const vfloat8& a, const float& b) { return a == vfloat8(b); }
@@ -475,12 +477,12 @@ __forceinline vfloat8 abs(const vfloat8& a) {
__forceinline vboolf8 le(const vfloat8& a, const vfloat8& b) { return a <= b; }
#if defined(__AVX512VL__)
- static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); }
- static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); }
- static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); }
- static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); }
- static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); }
- static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); }
+ static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask.packedMask8(), a.m256(), b.m256(), _MM_CMPINT_EQ); }
+ static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask.packedMask8(), a.m256(), b.m256(), _MM_CMPINT_NE); }
+ static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask.packedMask8(), a.m256(), b.m256(), _MM_CMPINT_LT); }
+ static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask.packedMask8(), a.m256(), b.m256(), _MM_CMPINT_GE); }
+ static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask.packedMask8(), a.m256(), b.m256(), _MM_CMPINT_GT); }
+ static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask.packedMask8(), a.m256(), b.m256(), _MM_CMPINT_LE); }
#else
static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a == b); }
static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a != b); }
@@ -510,14 +512,14 @@ __forceinline vfloat8 abs(const vfloat8& a) {
/// Rounding Functions
////////////////////////////////////////////////////////////////////////////////
-#if !defined(__aarch64__)
- __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF ); }
- __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF ); }
- __forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO ); }
- __forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
+#if !defined(__aarch64__) && !defined(_M_ARM64)
+ __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a.m256(), _MM_FROUND_TO_NEG_INF ); }
+ __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a.m256(), _MM_FROUND_TO_POS_INF ); }
+ __forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a.m256(), _MM_FROUND_TO_ZERO ); }
+ __forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a.m256(), _MM_FROUND_TO_NEAREST_INT); }
#else
- __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_floor_ps(a); }
- __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_ceil_ps(a); }
+ __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_floor_ps(a.m256()); }
+ __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_ceil_ps(a.m256()); }
#endif
@@ -527,68 +529,68 @@ __forceinline vfloat8 abs(const vfloat8& a) {
/// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
- __forceinline vfloat8 unpacklo(const vfloat8& a, const vfloat8& b) { return _mm256_unpacklo_ps(a, b); }
- __forceinline vfloat8 unpackhi(const vfloat8& a, const vfloat8& b) { return _mm256_unpackhi_ps(a, b); }
+ __forceinline vfloat8 unpacklo(const vfloat8& a, const vfloat8& b) { return _mm256_unpacklo_ps(a.m256(), b.m256()); }
+ __forceinline vfloat8 unpackhi(const vfloat8& a, const vfloat8& b) { return _mm256_unpackhi_ps(a.m256(), b.m256()); }
template<int i> __forceinline vfloat8 shuffle(const vfloat8& v) {
- return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i));
+ return _mm256_permute_ps(v.m256(), _MM_SHUFFLE(i, i, i, i));
}
template<int i0, int i1> __forceinline vfloat8 shuffle4(const vfloat8& v) {
- return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0));
+ return _mm256_permute2f128_ps(v.m256(), v.m256(), (i1 << 4) | (i0 << 0));
}
template<int i0, int i1> __forceinline vfloat8 shuffle4(const vfloat8& a, const vfloat8& b) {
- return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0));
+ return _mm256_permute2f128_ps(a.m256(), b.m256(), (i1 << 4) | (i0 << 0));
}
template<int i0, int i1, int i2, int i3> __forceinline vfloat8 shuffle(const vfloat8& v) {
- return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0));
+ return _mm256_permute_ps(v.m256(), _MM_SHUFFLE(i3, i2, i1, i0));
}
template<int i0, int i1, int i2, int i3> __forceinline vfloat8 shuffle(const vfloat8& a, const vfloat8& b) {
- return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+ return _mm256_shuffle_ps(a.m256(), b.m256(), _MM_SHUFFLE(i3, i2, i1, i0));
}
-#if !defined(__aarch64__)
- template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); }
- template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); }
- template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); }
+#if !defined(__aarch64__) && !defined(_M_ARM64)
+ template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v.m256()); }
+ template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v.m256()); }
+ template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v.m256()))); }
#endif
__forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); }
- template<int i> __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); }
- template<int i> __forceinline vfloat4 extract4 (const vfloat8& a) { return _mm256_extractf128_ps(a, i); }
- template<> __forceinline vfloat4 extract4<0>(const vfloat8& a) { return _mm256_castps256_ps128(a); }
+ template<int i> __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a.m256(), b.m128(), i); }
+ template<int i> __forceinline vfloat4 extract4 (const vfloat8& a) { return _mm256_extractf128_ps(a.m256(), i); }
+ template<> __forceinline vfloat4 extract4<0>(const vfloat8& a) { return _mm256_castps256_ps128(a.m256()); }
- __forceinline float toScalar(const vfloat8& v) { return _mm_cvtss_f32(_mm256_castps256_ps128(v)); }
+ __forceinline float toScalar(const vfloat8& v) { return _mm_cvtss_f32(_mm256_castps256_ps128(v.m256())); }
-#if defined (__AVX2__) && !defined(__aarch64__)
+#if defined (__AVX2__) && !defined(__aarch64__) && !defined(_M_ARM64)
static __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) {
- return _mm256_permutevar8x32_ps(a, index);
+ return _mm256_permutevar8x32_ps(a.m256(), index);
}
#endif
#if defined(__AVX512VL__)
template<int i> static __forceinline vfloat8 align_shift_right(const vfloat8& a, const vfloat8& b) {
- return _mm256_castsi256_ps(_mm256_alignr_epi32(_mm256_castps_si256(a), _mm256_castps_si256(b), i));
+ return _mm256_castsi256_ps(_mm256_alignr_epi32(_mm256_castps_si256(a.m256()), _mm256_castps_si256(b.m256()), i));
}
#endif
#if defined (__AVX_I__)
template<int mode> static __forceinline vint4 convert_to_hf16(const vfloat8& a) {
- return _mm256_cvtps_ph(a, mode);
+ return _mm256_cvtps_ph(a.m256(), mode);
}
static __forceinline vfloat8 convert_from_hf16(const vint4& a) {
- return _mm256_cvtph_ps(a);
+ return _mm256_cvtph_ps(a.m128i());
}
#endif
@@ -600,12 +602,12 @@ __forceinline vfloat8 abs(const vfloat8& a) {
static __forceinline vfloat8 shift_right_1(const vfloat8& x) {
const vfloat8 t0 = shuffle<1,2,3,0>(x);
const vfloat8 t1 = shuffle4<1,0>(t0);
- return _mm256_blend_ps(t0,t1,0x88);
+ return _mm256_blend_ps(t0.m256(),t1.m256(),0x88);
}
#endif
__forceinline vint8 floori(const vfloat8& a) {
- return vint8(floor(a));
+ return vint8(floor(a).m256i());
}
////////////////////////////////////////////////////////////////////////////////
@@ -665,7 +667,7 @@
__forceinline vfloat8 abs(const vfloat8& a) { //////////////////////////////////////////////////////////////////////////////// /// Reductions //////////////////////////////////////////////////////////////////////////////// -#if !defined(__aarch64__) +#if !defined(__aarch64__) && !defined(_M_ARM64) __forceinline vfloat8 vreduce_min2(const vfloat8& v) { return min(v,shuffle<1,0,3,2>(v)); } __forceinline vfloat8 vreduce_min4(const vfloat8& v) { vfloat8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } __forceinline vfloat8 vreduce_min (const vfloat8& v) { vfloat8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } @@ -713,7 +715,7 @@ __forceinline vfloat8 abs(const vfloat8& a) { //} __forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { - return _mm256_dp_ps(a,b,0x7F); + return _mm256_dp_ps(a.m256(),b.m256(),0x7F); } __forceinline vfloat8 cross(const vfloat8& a, const vfloat8& b) diff --git a/common/simd/vint16_avx512.h b/common/simd/vint16_avx512.h index 3720c3c9d6..e9cde2cdd8 100644 --- a/common/simd/vint16_avx512.h +++ b/common/simd/vint16_avx512.h @@ -38,8 +38,11 @@ namespace embree __forceinline vint16& operator =(const vint16& f) { v = f.v; return *this; } __forceinline vint(const __m512i& t) { v = t; } - __forceinline operator __m512i() const { return v; } - __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } + __forceinline __m512i m512i() const { return v; } + __forceinline __m256i m256i() const { return _mm512_castsi512_si256(v); } + __forceinline __m512 m512() const { return _mm512_cvtepi32_ps(v); } + __forceinline __m512i vec_int() const { return m512i(); } + __forceinline __m512 vec_float() const { return m512(); } __forceinline vint(int i) { v = _mm512_set1_epi32(i); @@ -58,23 +61,23 @@ namespace embree } __forceinline vint(const vint4& i) { - v = _mm512_broadcast_i32x4(i); + v = _mm512_broadcast_i32x4(i.m128i()); } __forceinline vint(const vint4& a, const vint4& b, const vint4& c, const vint4& d) { - v = _mm512_castsi128_si512(a); - v = _mm512_inserti32x4(v, b, 1); - v = _mm512_inserti32x4(v, c, 2); - v = _mm512_inserti32x4(v, d, 3); + v = _mm512_castsi128_si512(a.m128i()); + v = _mm512_inserti32x4(v, b.m128i(), 1); + v = _mm512_inserti32x4(v, c.m128i(), 2); + v = _mm512_inserti32x4(v, d.m128i(), 3); } __forceinline vint(const vint8& i) { - v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i)))); + v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i.m256i())))); } __forceinline vint(const vint8& a, const vint8& b) { - v = _mm512_castsi256_si512(a); - v = _mm512_inserti64x4(v, b, 1); + v = _mm512_castsi256_si512(a.m256i()); + v = _mm512_inserti64x4(v, b.m256i(), 1); } __forceinline explicit vint(const __m512& f) { @@ -106,52 +109,52 @@ namespace embree static __forceinline vint16 loadu(const void* addr) { return _mm512_loadu_si512(addr); } - static __forceinline vint16 load (const vboolf16& mask, const void* addr) { return _mm512_mask_load_epi32 (_mm512_setzero_epi32(),mask,addr); } - static __forceinline vint16 loadu(const vboolf16& mask, const void* addr) { return _mm512_mask_loadu_epi32(_mm512_setzero_epi32(),mask,addr); } + static __forceinline vint16 load (const vboolf16& mask, const void* addr) { return _mm512_mask_load_epi32 (_mm512_setzero_epi32(),mask.packedMask16(),addr); } + static __forceinline vint16 loadu(const vboolf16& mask, const void* addr) { return _mm512_mask_loadu_epi32(_mm512_setzero_epi32(),mask.packedMask16(),addr); } - static 
__forceinline void store (void* ptr, const vint16& v) { _mm512_store_si512 (ptr,v); }
- static __forceinline void storeu(void* ptr, const vint16& v) { _mm512_storeu_si512(ptr,v); }
+ static __forceinline void store (void* ptr, const vint16& v) { _mm512_store_si512 (ptr,v.m512i()); }
+ static __forceinline void storeu(void* ptr, const vint16& v) { _mm512_storeu_si512(ptr,v.m512i()); }
- static __forceinline void store (const vboolf16& mask, void* addr, const vint16& v2) { _mm512_mask_store_epi32(addr,mask,v2); }
- static __forceinline void storeu(const vboolf16& mask, void* ptr, const vint16& f) { _mm512_mask_storeu_epi32((int*)ptr,mask,f); }
+ static __forceinline void store (const vboolf16& mask, void* addr, const vint16& v2) { _mm512_mask_store_epi32(addr,mask.packedMask16(),v2.m512i()); }
+ static __forceinline void storeu(const vboolf16& mask, void* ptr, const vint16& f) { _mm512_mask_storeu_epi32((int*)ptr,mask.packedMask16(),f.m512i()); }
- static __forceinline void store_nt(void* __restrict__ ptr, const vint16& a) { _mm512_stream_si512((__m512i*)ptr,a); }
+ static __forceinline void store_nt(void* __restrict__ ptr, const vint16& a) { _mm512_stream_si512((__m512i*)ptr,a.m512i()); }
static __forceinline vint16 compact(const vboolf16& mask, vint16 &v) {
- return _mm512_mask_compress_epi32(v,mask,v);
+ return _mm512_mask_compress_epi32(v.m512i(),mask.packedMask16(),v.m512i());
}
static __forceinline vint16 compact(const vboolf16& mask, const vint16 &a, vint16 &b) {
- return _mm512_mask_compress_epi32(a,mask,b);
+ return _mm512_mask_compress_epi32(a.m512i(),mask.packedMask16(),b.m512i());
}
static __forceinline vint16 expand(const vboolf16& mask, const vint16& a, vint16& b) {
- return _mm512_mask_expand_epi32(b,mask,a);
+ return _mm512_mask_expand_epi32(b.m512i(),mask.packedMask16(),a.m512i());
}
template<int scale> static __forceinline vint16 gather(const int* ptr, const vint16& index) {
- return _mm512_i32gather_epi32(index,ptr,scale);
+ return _mm512_i32gather_epi32(index.m512i(),ptr,scale);
}
template<int scale> static __forceinline vint16 gather(const vboolf16& mask, const int* ptr, const vint16& index) {
- return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale);
+ return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask.packedMask16(),index.m512i(),ptr,scale);
}
template<int scale> static __forceinline vint16 gather(const vboolf16& mask, vint16& dest, const int* ptr, const vint16& index) {
- return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale);
+ return _mm512_mask_i32gather_epi32(dest.m512i(),mask.packedMask16(),index.m512i(),ptr,scale);
}
template<int scale> static __forceinline void scatter(int* ptr, const vint16& index, const vint16& v) {
- _mm512_i32scatter_epi32((int*)ptr,index,v,scale);
+ _mm512_i32scatter_epi32((int*)ptr,index.m512i(),v.m512i(),scale);
}
template<int scale> static __forceinline void scatter(const vboolf16& mask, int* ptr, const vint16& index, const vint16& v) {
- _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale);
+ _mm512_mask_i32scatter_epi32((int*)ptr,mask.packedMask16(),index.m512i(),v.m512i(),scale);
}
////////////////////////////////////////////////////////////////////////////////
@@ -169,65 +172,65 @@ namespace embree
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
- __forceinline vboolf16 asBool(const vint16& a) { return _mm512_movepi32_mask(a); }
+ __forceinline vboolf16 asBool(const vint16& a) { return _mm512_movepi32_mask(a.m512i()); }
__forceinline vint16 operator +(const vint16& a) { return a; }
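// [Editor's note, not part of the patch] On AVX-512, vboolf16 wraps a __mmask16
// with one bit per 32-bit lane, and packedMask16() appears to be the new
// explicit accessor for that raw mask, mirroring m512i() on vint16.
// Illustrative use of the mask type, assuming AVX-512F:
//
//   #include <immintrin.h>
//   inline int count_negative(__m512i v) {
//     // bit i of m is set iff lane i compares less-than zero
//     __mmask16 m = _mm512_cmp_epi32_mask(v, _mm512_setzero_si512(), _MM_CMPINT_LT);
//     return _mm_popcnt_u32((unsigned)m); // number of negative lanes
//   }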
- __forceinline vint16 operator -(const vint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } + __forceinline vint16 operator -(const vint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a.m512i()); } //////////////////////////////////////////////////////////////////////////////// /// Binary Operators //////////////////////////////////////////////////////////////////////////////// - __forceinline vint16 operator +(const vint16& a, const vint16& b) { return _mm512_add_epi32(a, b); } + __forceinline vint16 operator +(const vint16& a, const vint16& b) { return _mm512_add_epi32(a.m512i(), b.m512i()); } __forceinline vint16 operator +(const vint16& a, int b) { return a + vint16(b); } __forceinline vint16 operator +(int a, const vint16& b) { return vint16(a) + b; } - __forceinline vint16 operator -(const vint16& a, const vint16& b) { return _mm512_sub_epi32(a, b); } + __forceinline vint16 operator -(const vint16& a, const vint16& b) { return _mm512_sub_epi32(a.m512i(), b.m512i()); } __forceinline vint16 operator -(const vint16& a, int b) { return a - vint16(b); } __forceinline vint16 operator -(int a, const vint16& b) { return vint16(a) - b; } - __forceinline vint16 operator *(const vint16& a, const vint16& b) { return _mm512_mullo_epi32(a, b); } + __forceinline vint16 operator *(const vint16& a, const vint16& b) { return _mm512_mullo_epi32(a.m512i(), b.m512i()); } __forceinline vint16 operator *(const vint16& a, int b) { return a * vint16(b); } __forceinline vint16 operator *(int a, const vint16& b) { return vint16(a) * b; } - __forceinline vint16 operator &(const vint16& a, const vint16& b) { return _mm512_and_epi32(a, b); } + __forceinline vint16 operator &(const vint16& a, const vint16& b) { return _mm512_and_epi32(a.m512i(), b.m512i()); } __forceinline vint16 operator &(const vint16& a, int b) { return a & vint16(b); } __forceinline vint16 operator &(int a, const vint16& b) { return vint16(a) & b; } - __forceinline vint16 operator |(const vint16& a, const vint16& b) { return _mm512_or_epi32(a, b); } + __forceinline vint16 operator |(const vint16& a, const vint16& b) { return _mm512_or_epi32(a.m512i(), b.m512i()); } __forceinline vint16 operator |(const vint16& a, int b) { return a | vint16(b); } __forceinline vint16 operator |(int a, const vint16& b) { return vint16(a) | b; } - __forceinline vint16 operator ^(const vint16& a, const vint16& b) { return _mm512_xor_epi32(a, b); } + __forceinline vint16 operator ^(const vint16& a, const vint16& b) { return _mm512_xor_epi32(a.m512i(), b.m512i()); } __forceinline vint16 operator ^(const vint16& a, int b) { return a ^ vint16(b); } __forceinline vint16 operator ^(int a, const vint16& b) { return vint16(a) ^ b; } - __forceinline vint16 operator <<(const vint16& a, int n) { return _mm512_slli_epi32(a, n); } - __forceinline vint16 operator >>(const vint16& a, int n) { return _mm512_srai_epi32(a, n); } + __forceinline vint16 operator <<(const vint16& a, int n) { return _mm512_slli_epi32(a.m512i(), n); } + __forceinline vint16 operator >>(const vint16& a, int n) { return _mm512_srai_epi32(a.m512i(), n); } - __forceinline vint16 operator <<(const vint16& a, const vint16& n) { return _mm512_sllv_epi32(a, n); } - __forceinline vint16 operator >>(const vint16& a, const vint16& n) { return _mm512_srav_epi32(a, n); } + __forceinline vint16 operator <<(const vint16& a, const vint16& n) { return _mm512_sllv_epi32(a.m512i(), n.m512i()); } + __forceinline vint16 operator >>(const vint16& a, const vint16& n) { return 
_mm512_srav_epi32(a.m512i(), n.m512i()); } - __forceinline vint16 sll (const vint16& a, int b) { return _mm512_slli_epi32(a, b); } - __forceinline vint16 sra (const vint16& a, int b) { return _mm512_srai_epi32(a, b); } - __forceinline vint16 srl (const vint16& a, int b) { return _mm512_srli_epi32(a, b); } + __forceinline vint16 sll (const vint16& a, int b) { return _mm512_slli_epi32(a.m512i(), b); } + __forceinline vint16 sra (const vint16& a, int b) { return _mm512_srai_epi32(a.m512i(), b); } + __forceinline vint16 srl (const vint16& a, int b) { return _mm512_srli_epi32(a.m512i(), b); } - __forceinline vint16 min(const vint16& a, const vint16& b) { return _mm512_min_epi32(a, b); } + __forceinline vint16 min(const vint16& a, const vint16& b) { return _mm512_min_epi32(a.m512i(), b.m512i()); } __forceinline vint16 min(const vint16& a, int b) { return min(a,vint16(b)); } __forceinline vint16 min(int a, const vint16& b) { return min(vint16(a),b); } - __forceinline vint16 max(const vint16& a, const vint16& b) { return _mm512_max_epi32(a, b); } + __forceinline vint16 max(const vint16& a, const vint16& b) { return _mm512_max_epi32(a.m512i(), b.m512i()); } __forceinline vint16 max(const vint16& a, int b) { return max(a,vint16(b)); } __forceinline vint16 max(int a, const vint16& b) { return max(vint16(a),b); } - __forceinline vint16 umin(const vint16& a, const vint16& b) { return _mm512_min_epu32(a, b); } - __forceinline vint16 umax(const vint16& a, const vint16& b) { return _mm512_max_epu32(a, b); } + __forceinline vint16 umin(const vint16& a, const vint16& b) { return _mm512_min_epu32(a.m512i(), b.m512i()); } + __forceinline vint16 umax(const vint16& a, const vint16& b) { return _mm512_max_epu32(a.m512i(), b.m512i()); } - __forceinline vint16 mask_add(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } - __forceinline vint16 mask_sub(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); } + __forceinline vint16 mask_add(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_add_epi32(c.m512i(),mask.packedMask16(),a.m512i(),b.m512i()); } + __forceinline vint16 mask_sub(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_sub_epi32(c.m512i(),mask.packedMask16(),a.m512i(),b.m512i()); } - __forceinline vint16 mask_and(const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } - __forceinline vint16 mask_or (const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_or_epi32(c,m,a,b); } + __forceinline vint16 mask_and(const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_and_epi32(c.m512i(),m.packedMask16(),a.m512i(),b.m512i()); } + __forceinline vint16 mask_or (const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_or_epi32(c.m512i(),m.packedMask16(),a.m512i(),b.m512i()); } //////////////////////////////////////////////////////////////////////////////// /// Assignment Operators @@ -256,107 +259,107 @@ namespace embree /// Comparison Operators + Select //////////////////////////////////////////////////////////////////////////////// - __forceinline vboolf16 operator ==(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 operator ==(const vint16& a, const vint16& b) { return 
_mm512_cmp_epi32_mask(a.m512i(),b.m512i(),_MM_CMPINT_EQ); } __forceinline vboolf16 operator ==(const vint16& a, int b) { return a == vint16(b); } __forceinline vboolf16 operator ==(int a, const vint16& b) { return vint16(a) == b; } - __forceinline vboolf16 operator !=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 operator !=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a.m512i(),b.m512i(),_MM_CMPINT_NE); } __forceinline vboolf16 operator !=(const vint16& a, int b) { return a != vint16(b); } __forceinline vboolf16 operator !=(int a, const vint16& b) { return vint16(a) != b; } - __forceinline vboolf16 operator < (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 operator < (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a.m512i(),b.m512i(),_MM_CMPINT_LT); } __forceinline vboolf16 operator < (const vint16& a, int b) { return a < vint16(b); } __forceinline vboolf16 operator < (int a, const vint16& b) { return vint16(a) < b; } - __forceinline vboolf16 operator >=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 operator >=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a.m512i(),b.m512i(),_MM_CMPINT_GE); } __forceinline vboolf16 operator >=(const vint16& a, int b) { return a >= vint16(b); } __forceinline vboolf16 operator >=(int a, const vint16& b) { return vint16(a) >= b; } - __forceinline vboolf16 operator > (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 operator > (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a.m512i(),b.m512i(),_MM_CMPINT_GT); } __forceinline vboolf16 operator > (const vint16& a, int b) { return a > vint16(b); } __forceinline vboolf16 operator > (int a, const vint16& b) { return vint16(a) > b; } - __forceinline vboolf16 operator <=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 operator <=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a.m512i(),b.m512i(),_MM_CMPINT_LE); } __forceinline vboolf16 operator <=(const vint16& a, int b) { return a <= vint16(b); } __forceinline vboolf16 operator <=(int a, const vint16& b) { return vint16(a) <= b; } - __forceinline vboolf16 eq(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 ne(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 lt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 ge(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 gt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 le(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } - __forceinline vboolf16 uint_le(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } - __forceinline vboolf16 uint_gt(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } - - __forceinline vboolf16 eq(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 ne(const vboolf16 mask, const vint16& a, 
const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 lt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 ge(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LE); } - __forceinline vboolf16 uint_le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); } - __forceinline vboolf16 uint_gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 eq(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a.m512i(),b.m512i(),_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a.m512i(),b.m512i(),_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a.m512i(),b.m512i(),_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a.m512i(),b.m512i(),_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a.m512i(),b.m512i(),_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a.m512i(),b.m512i(),_MM_CMPINT_LE); } + __forceinline vboolf16 uint_le(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a.m512i(),b.m512i(),_MM_CMPINT_LE); } + __forceinline vboolf16 uint_gt(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a.m512i(),b.m512i(),_MM_CMPINT_GT); } + + __forceinline vboolf16 eq(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask.packedMask16(),a.m512i(),b.m512i(),_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask.packedMask16(),a.m512i(),b.m512i(),_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask.packedMask16(),a.m512i(),b.m512i(),_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask.packedMask16(),a.m512i(),b.m512i(),_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask.packedMask16(),a.m512i(),b.m512i(),_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask.packedMask16(),a.m512i(),b.m512i(),_MM_CMPINT_LE); } + __forceinline vboolf16 uint_le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask.packedMask16(),a.m512i(),b.m512i(),_MM_CMPINT_LE); } + __forceinline vboolf16 uint_gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask.packedMask16(),a.m512i(),b.m512i(),_MM_CMPINT_GT); } __forceinline vint16 select(const vboolf16& m, const vint16& t, const vint16& f) { - return 
_mm512_mask_or_epi32(f,m,t,t); + return _mm512_mask_or_epi32(f.m512i(),m.packedMask16(),t.m512i(),t.m512i()); } //////////////////////////////////////////////////////////////////////////////// // Movement/Shifting/Shuffling Functions //////////////////////////////////////////////////////////////////////////////// - __forceinline vint16 unpacklo(const vint16& a, const vint16& b) { return _mm512_unpacklo_epi32(a, b); } - __forceinline vint16 unpackhi(const vint16& a, const vint16& b) { return _mm512_unpackhi_epi32(a, b); } + __forceinline vint16 unpacklo(const vint16& a, const vint16& b) { return _mm512_unpacklo_epi32(a.m512i(), b.m512i()); } + __forceinline vint16 unpackhi(const vint16& a, const vint16& b) { return _mm512_unpackhi_epi32(a.m512i(), b.m512i()); } template __forceinline vint16 shuffle(const vint16& v) { - return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v.m512i()), _MM_SHUFFLE(i, i, i, i))); } template __forceinline vint16 shuffle(const vint16& v) { - return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v.m512i()), _MM_SHUFFLE(i3, i2, i1, i0))); } template __forceinline vint16 shuffle4(const vint16& v) { - return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v.m512i()), _mm512_castsi512_ps(v.m512i()), _MM_SHUFFLE(i, i, i, i))); } template __forceinline vint16 shuffle4(const vint16& v) { - return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v.m512i()), _mm512_castsi512_ps(v.m512i()), _MM_SHUFFLE(i3, i2, i1, i0))); } template __forceinline vint16 align_shift_right(const vint16& a, const vint16& b) { - return _mm512_alignr_epi32(a, b, i); + return _mm512_alignr_epi32(a.m512i(), b.m512i(), i); }; __forceinline int toScalar(const vint16& v) { - return _mm_cvtsi128_si32(_mm512_castsi512_si128(v)); + return _mm_cvtsi128_si32(_mm512_castsi512_si128(v.m512i())); } - template __forceinline vint16 insert4(const vint16& a, const vint4& b) { return _mm512_inserti32x4(a, b, i); } + template __forceinline vint16 insert4(const vint16& a, const vint4& b) { return _mm512_inserti32x4(a.m512i(), b.m128i(), i); } template vint extractN(const vint16& v); - template<> __forceinline vint4 extractN<4,0>(const vint16& v) { return _mm512_castsi512_si128(v); } - template<> __forceinline vint4 extractN<4,1>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 1); } - template<> __forceinline vint4 extractN<4,2>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 2); } - template<> __forceinline vint4 extractN<4,3>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 3); } + template<> __forceinline vint4 extractN<4,0>(const vint16& v) { return _mm512_castsi512_si128(v.m512i()); } + template<> __forceinline vint4 extractN<4,1>(const vint16& v) { return _mm512_extracti32x4_epi32(v.m512i(), 1); } + template<> __forceinline vint4 extractN<4,2>(const vint16& v) { return _mm512_extracti32x4_epi32(v.m512i(), 2); } + template<> __forceinline vint4 extractN<4,3>(const vint16& v) { return _mm512_extracti32x4_epi32(v.m512i(), 3); } - template<> __forceinline vint8 extractN<8,0>(const vint16& v) { return 
_mm512_castsi512_si256(v); } - template<> __forceinline vint8 extractN<8,1>(const vint16& v) { return _mm512_extracti32x8_epi32(v, 1); } + template<> __forceinline vint8 extractN<8,0>(const vint16& v) { return _mm512_castsi512_si256(v.m512i()); } + template<> __forceinline vint8 extractN<8,1>(const vint16& v) { return _mm512_extracti32x8_epi32(v.m512i(), 1); } - template __forceinline vint4 extract4 (const vint16& v) { return _mm512_extracti32x4_epi32(v, i); } - template<> __forceinline vint4 extract4<0>(const vint16& v) { return _mm512_castsi512_si128(v); } + template __forceinline vint4 extract4 (const vint16& v) { return _mm512_extracti32x4_epi32(v.m512i(), i); } + template<> __forceinline vint4 extract4<0>(const vint16& v) { return _mm512_castsi512_si128(v.m512i()); } - template __forceinline vint8 extract8 (const vint16& v) { return _mm512_extracti32x8_epi32(v, i); } - template<> __forceinline vint8 extract8<0>(const vint16& v) { return _mm512_castsi512_si256(v); } + template __forceinline vint8 extract8 (const vint16& v) { return _mm512_extracti32x8_epi32(v.m512i(), i); } + template<> __forceinline vint8 extract8<0>(const vint16& v) { return _mm512_castsi512_si256(v.m512i()); } //////////////////////////////////////////////////////////////////////////////// /// Reductions @@ -399,12 +402,12 @@ namespace embree __forceinline vint16 conflict(const vint16& index) { - return _mm512_conflict_epi32(index); + return _mm512_conflict_epi32(index.m512i()); } __forceinline vint16 conflict(const vboolf16& mask, vint16& dest, const vint16& index) { - return _mm512_mask_conflict_epi32(dest,mask,index); + return _mm512_mask_conflict_epi32(dest.m512i(),mask.packedMask16(),index.m512i()); } __forceinline vint16 convert_uint32_t(const __m512& f) { @@ -412,7 +415,7 @@ namespace embree } __forceinline vint16 permute(vint16 v, vint16 index) { - return _mm512_permutexvar_epi32(index,v); + return _mm512_permutexvar_epi32(index.m512i(),v.m512i()); } __forceinline vint16 reverse(const vint16 &a) { @@ -446,7 +449,7 @@ namespace embree __forceinline void gather_prefetch64(const void* base_addr, const vbool16& mask, const vint16& offset) { #if defined(__AVX512PF__) - _mm512_mask_prefetch_i64gather_pd(offset, mask, base_addr, scale, hint); + _mm512_mask_prefetch_i64gather_pd(offset.m512i(), mask.packedMask8(), base_addr, scale, hint); #endif } diff --git a/common/simd/vint4_sse2.h b/common/simd/vint4_sse2.h index e9e4a5a2c2..5dfb40ef5f 100644 --- a/common/simd/vint4_sse2.h +++ b/common/simd/vint4_sse2.h @@ -37,17 +37,24 @@ namespace embree __forceinline vint4& operator =(const vint4& a) { v = a.v; return *this; } __forceinline vint(__m128i a) : v(a) {} - __forceinline operator const __m128i&() const { return v; } - __forceinline operator __m128i&() { return v; } + __forceinline const __m128i& m128i() const { return v; } + __forceinline __m128i& m128i() { return v; } + __forceinline __m128 m128() const { return _mm_cvtepi32_ps(v); } + __forceinline __m128d m128d() const { return _mm_cvtepi32_pd(v); } + __forceinline __m128 vec_float() const { return m128(); } + __forceinline __m128i vec_int() const { return m128i(); } __forceinline vint(int a) : v(_mm_set1_epi32(a)) {} __forceinline vint(int a, int b, int c, int d) : v(_mm_set_epi32(d, c, b, a)) {} +#if !defined(_M_ARM64) || defined(__clang__) __forceinline explicit vint(__m128 a) : v(_mm_cvtps_epi32(a)) {} +#endif #if defined(__AVX512VL__) - __forceinline explicit vint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} + __forceinline explicit vint(const vboolf4& a) : 
v(_mm_movm_epi32(a.packedMask8())) {}
#else
- __forceinline explicit vint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {}
+// NOTE: reinterprets the float mask bit pattern as integer lanes; still to be verified against the AVX512VL path
+ __forceinline explicit vint(const vboolf4& a) : v(_mm_castps_si128((__m128)a.v)) {}
#endif
__forceinline vint(long long a, long long b) : v(_mm_set_epi64x(b,a)) {}
@@ -74,39 +81,39 @@ namespace embree
static __forceinline vint4 load (const void* a) { return _mm_load_si128((__m128i*)a); }
static __forceinline vint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); }
- static __forceinline void store (void* ptr, const vint4& v) { _mm_store_si128((__m128i*)ptr,v); }
- static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128((__m128i*)ptr,v); }
+ static __forceinline void store (void* ptr, const vint4& v) { _mm_store_si128((__m128i*)ptr,v.m128i()); }
+ static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128((__m128i*)ptr,v.m128i()); }
#if defined(__AVX512VL__)
static __forceinline vint4 compact(const vboolf4& mask, vint4 &v) {
- return _mm_mask_compress_epi32(v, mask, v);
+ return _mm_mask_compress_epi32(v.m128i(), mask.packedMask8(), v.m128i());
}
static __forceinline vint4 compact(const vboolf4& mask, vint4 &a, const vint4& b) {
- return _mm_mask_compress_epi32(a, mask, b);
+ return _mm_mask_compress_epi32(a.m128i(), mask.packedMask8(), b.m128i());
}
- static __forceinline vint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); }
- static __forceinline vint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); }
+ static __forceinline vint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask.packedMask8(),ptr); }
+ static __forceinline vint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask.packedMask8(),ptr); }
- static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_store_epi32 (ptr,mask,v); }
- static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); }
+ static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_store_epi32 (ptr,mask.packedMask8(),v.m128i()); }
+ static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_storeu_epi32(ptr,mask.packedMask8(),v.m128i()); }
#elif defined(__AVX__)
- static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
- static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+ static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask.mask32())); }
+ static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask.mask32())); }
- static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
- static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+ static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask.mask32(),_mm_castsi128_ps(i.m128i())); }
+ static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask.mask32(),_mm_castsi128_ps(i.m128i())); }
#else
- static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); }
- static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); }
+ static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask.mask32()); }
+ static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask.mask32()); }
static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { store (ptr,select(mask,i,load (ptr))); }
static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); }
#endif
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
static __forceinline vint4 load(const unsigned char* ptr) {
return _mm_load4epu8_epi32(((__m128i*)ptr));
}
@@ -134,7 +141,7 @@ namespace embree
#endif
static __forceinline vint4 load(const unsigned short* ptr) {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
return __m128i(vmovl_u16(vld1_u16(ptr)));
#elif defined (__SSE4_1__)
return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr));
@@ -144,13 +151,13 @@ namespace embree
}
static __forceinline void store(unsigned char* ptr, const vint4& v) {
-#if defined(__aarch64__)
- int32x4_t x = v;
+#if defined(__aarch64__) || defined(_M_ARM64)
+ int32x4_t x = v.v;
uint16x4_t y = vqmovn_u32(uint32x4_t(x));
uint8x8_t z = vqmovn_u16(vcombine_u16(y, y));
vst1_lane_u32((uint32_t *)ptr,uint32x2_t(z), 0);
#elif defined(__SSE4_1__)
- __m128i x = v;
+ __m128i x = v.m128i();
x = _mm_packus_epi32(x, x);
x = _mm_packus_epi16(x, x);
*(int*)ptr = _mm_cvtsi128_si32(x);
@@ -161,7 +168,7 @@ namespace embree
}
static __forceinline void store(unsigned short* ptr, const vint4& v) {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
uint32x4_t x = uint32x4_t(v.v);
uint16x4_t y = vqmovn_u32(x);
vst1_u16(ptr, y);
@@ -172,7 +179,7 @@ namespace embree
}
static __forceinline vint4 load_nt(void* ptr) {
-#if defined(__aarch64__) || defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__SSE4_1__)
return _mm_stream_load_si128((__m128i*)ptr);
#else
return _mm_load_si128((__m128i*)ptr);
@@ -180,17 +187,17 @@ namespace embree
}
static __forceinline void store_nt(void* ptr, const vint4& v) {
-#if !defined(__aarch64__) && defined(__SSE4_1__)
- _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v));
+#if (!defined(__aarch64__) || defined(_M_ARM64)) && defined(__SSE4_1__)
+ _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v.m128i()));
#else
- _mm_store_si128((__m128i*)ptr,v);
+ _mm_store_si128((__m128i*)ptr,v.m128i());
#endif
}
template<int scale>
static __forceinline vint4 gather(const int* ptr, const vint4& index) {
-#if defined(__AVX2__) && !defined(__aarch64__)
- return _mm_i32gather_epi32(ptr, index, scale);
+#if defined(__AVX2__) && !defined(__aarch64__) && !defined(_M_ARM64)
+ return _mm_i32gather_epi32(ptr, index.m128i(), scale);
#else
return vint4(
*(int*)(((char*)ptr)+scale*index[0]),
@@ -204,9 +211,9 @@ namespace embree
static __forceinline vint4 gather(const vboolf4& mask, const int* ptr, const vint4& index) {
vint4 r = zero;
#if defined(__AVX512VL__)
- return _mm_mmask_i32gather_epi32(r, mask, index, ptr,
scale); -#elif defined(__AVX2__) && !defined(__aarch64__) - return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale); + return _mm_mmask_i32gather_epi32(r.m128i(), mask.packedMask8(), index.m128i(), ptr, scale); +#elif defined(__AVX2__) && !defined(__aarch64__) && !defined(_M_ARM64) + return _mm_mask_i32gather_epi32(r.m128i(), ptr, index.m128i(), mask.mask32(), scale); #else if (likely(mask[0])) r[0] = *(int*)(((char*)ptr)+scale*index[0]); if (likely(mask[1])) r[1] = *(int*)(((char*)ptr)+scale*index[1]); @@ -220,7 +227,7 @@ namespace embree static __forceinline void scatter(void* ptr, const vint4& index, const vint4& v) { #if defined(__AVX512VL__) - _mm_i32scatter_epi32((int*)ptr, index, v, scale); + _mm_i32scatter_epi32((int*)ptr, index.m128i(), v.m128i(), scale); #else *(int*)(((char*)ptr)+scale*index[0]) = v[0]; *(int*)(((char*)ptr)+scale*index[1]) = v[1]; @@ -233,7 +240,7 @@ namespace embree static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vint4& v) { #if defined(__AVX512VL__) - _mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale); + _mm_mask_i32scatter_epi32((int*)ptr, mask.packedMask8(), index.m128i(), v.m128i(), scale); #else if (likely(mask[0])) *(int*)(((char*)ptr)+scale*index[0]) = v[0]; if (likely(mask[1])) *(int*)(((char*)ptr)+scale*index[1]) = v[1]; @@ -242,7 +249,7 @@ namespace embree #endif } -#if defined(__x86_64__) || defined(__aarch64__) +#if defined(__x86_64__) || defined(__aarch64__) || defined(_M_ARM64) static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); } #endif @@ -255,13 +262,13 @@ namespace embree friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) { #if defined(__AVX512VL__) - return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); -#elif defined(__aarch64__) + return _mm_mask_blend_epi32(m.packedMask8(), (__m128i)f.m128i(), (__m128i)t.m128i()); +#elif defined(__aarch64__) || defined(_M_ARM64) return _mm_castps_si128(_mm_blendv_ps((__m128)f.v,(__m128) t.v, (__m128)m.v)); #elif defined(__SSE4_1__) - return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f.m128i()), _mm_castsi128_ps(t.m128i()), m.m128())); #else - return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); + return _mm_or_si128(_mm_and_si128(m.mask32(), t.m128i()), _mm_andnot_si128(m.mask32(), f.m128i())); #endif } }; @@ -271,57 +278,57 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// #if defined(__AVX512VL__) - __forceinline vboolf4 asBool(const vint4& a) { return _mm_movepi32_mask(a); } + __forceinline vboolf4 asBool(const vint4& a) { return _mm_movepi32_mask(a.m128i()); } #else - __forceinline vboolf4 asBool(const vint4& a) { return _mm_castsi128_ps(a); } + __forceinline vboolf4 asBool(const vint4& a) { return _mm_castsi128_ps(a.m128i()); } #endif __forceinline vint4 operator +(const vint4& a) { return a; } - __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } -#if defined(__aarch64__) + __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128i()); } +#if defined(__aarch64__) || defined(_M_ARM64) __forceinline vint4 abs(const vint4& a) { return vabsq_s32(a.v); } #elif defined(__SSSE3__) - __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); } + __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a.m128i()); } #endif 
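// Worth noting about the vint4 accessors introduced above: m128i() hands back
// the raw integer register, while m128() is a lane-wise value conversion
// (_mm_cvtepi32_ps), not a bit reinterpretation. That is why the patched call
// sites that need the bit pattern still wrap m128i() in a cast intrinsic.
// A small sketch of the distinction (vint4_sketch is illustrative, not
// Embree's real class):
#include <immintrin.h>

struct vint4_sketch {
  __m128i v;
  const __m128i& m128i() const { return v; }           // raw register bits
  __m128 m128() const { return _mm_cvtepi32_ps(v); }   // int lanes converted to float values
};

inline __m128 bits_as_float(const vint4_sketch& a) {
  return _mm_castsi128_ps(a.m128i());  // reinterpret: int 1 becomes a denormal float
}

inline __m128 values_as_float(const vint4_sketch& a) {
  return a.m128();                     // convert: int 1 becomes 1.0f
}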
//////////////////////////////////////////////////////////////////////////////// /// Binary Operators //////////////////////////////////////////////////////////////////////////////// - __forceinline vint4 operator +(const vint4& a, const vint4& b) { return _mm_add_epi32(a, b); } + __forceinline vint4 operator +(const vint4& a, const vint4& b) { return _mm_add_epi32(a.m128i(), b.m128i()); } __forceinline vint4 operator +(const vint4& a, int b) { return a + vint4(b); } __forceinline vint4 operator +(int a, const vint4& b) { return vint4(a) + b; } - __forceinline vint4 operator -(const vint4& a, const vint4& b) { return _mm_sub_epi32(a, b); } + __forceinline vint4 operator -(const vint4& a, const vint4& b) { return _mm_sub_epi32(a.m128i(), b.m128i()); } __forceinline vint4 operator -(const vint4& a, int b) { return a - vint4(b); } __forceinline vint4 operator -(int a, const vint4& b) { return vint4(a) - b; } -#if (defined(__aarch64__)) || defined(__SSE4_1__) - __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); } +#if (defined(__aarch64__)) || defined(_M_ARM64) || defined(__SSE4_1__) + __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a.m128i(), b.m128i()); } #else __forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } #endif __forceinline vint4 operator *(const vint4& a, int b) { return a * vint4(b); } __forceinline vint4 operator *(int a, const vint4& b) { return vint4(a) * b; } - __forceinline vint4 operator &(const vint4& a, const vint4& b) { return _mm_and_si128(a, b); } + __forceinline vint4 operator &(const vint4& a, const vint4& b) { return _mm_and_si128(a.m128i(), b.m128i()); } __forceinline vint4 operator &(const vint4& a, int b) { return a & vint4(b); } __forceinline vint4 operator &(int a, const vint4& b) { return vint4(a) & b; } - __forceinline vint4 operator |(const vint4& a, const vint4& b) { return _mm_or_si128(a, b); } + __forceinline vint4 operator |(const vint4& a, const vint4& b) { return _mm_or_si128(a.m128i(), b.m128i()); } __forceinline vint4 operator |(const vint4& a, int b) { return a | vint4(b); } __forceinline vint4 operator |(int a, const vint4& b) { return vint4(a) | b; } - __forceinline vint4 operator ^(const vint4& a, const vint4& b) { return _mm_xor_si128(a, b); } + __forceinline vint4 operator ^(const vint4& a, const vint4& b) { return _mm_xor_si128(a.m128i(), b.m128i()); } __forceinline vint4 operator ^(const vint4& a, int b) { return a ^ vint4(b); } __forceinline vint4 operator ^(int a, const vint4& b) { return vint4(a) ^ b; } - __forceinline vint4 operator <<(const vint4& a, const int n) { return _mm_slli_epi32(a, n); } - __forceinline vint4 operator >>(const vint4& a, const int n) { return _mm_srai_epi32(a, n); } + __forceinline vint4 operator <<(const vint4& a, const int n) { return _mm_slli_epi32(a.m128i(), n); } + __forceinline vint4 operator >>(const vint4& a, const int n) { return _mm_srai_epi32(a.m128i(), n); } - __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); } - __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); } - __forceinline vint4 srl (const vint4& a, int b) { return _mm_srli_epi32(a, b); } + __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a.m128i(), b); } + __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a.m128i(), b); } + __forceinline vint4 srl (const vint4& a, int b) { return 
_mm_srli_epi32(a.m128i(), b); } //////////////////////////////////////////////////////////////////////////////// /// Assignment Operators @@ -333,7 +340,7 @@ namespace embree __forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; } __forceinline vint4& operator -=(vint4& a, int b) { return a = a - b; } -#if (defined(__aarch64__)) || defined(__SSE4_1__) +#if (defined(__aarch64__)) || defined(_M_ARM64) || defined(__SSE4_1__) __forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; } __forceinline vint4& operator *=(vint4& a, int b) { return a = a * b; } #endif @@ -352,18 +359,18 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// #if defined(__AVX512VL__) - __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a.m128i(),b.m128i(),_MM_CMPINT_EQ); } + __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a.m128i(),b.m128i(),_MM_CMPINT_NE); } + __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a.m128i(),b.m128i(),_MM_CMPINT_LT); } + __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a.m128i(),b.m128i(),_MM_CMPINT_GE); } + __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a.m128i(),b.m128i(),_MM_CMPINT_GT); } + __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a.m128i(),b.m128i(),_MM_CMPINT_LE); } #else - __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } + __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a.m128i(), b.m128i())); } __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return !(a == b); } - __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmplt_epi32(a, b)); } + __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmplt_epi32(a.m128i(), b.m128i())); } __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return !(a < b); } - __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epi32(a, b)); } + __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epi32(a.m128i(), b.m128i())); } __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return !(a > b); } #endif @@ -393,12 +400,12 @@ namespace embree __forceinline vboolf4 le(const vint4& a, const vint4& b) { return a <= b; } #if defined(__AVX512VL__) - __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return 
_mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); } - __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); } - __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); } - __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); } - __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); } - __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); } + __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask.packedMask8(), a.m128i(), b.m128i(), _MM_CMPINT_EQ); } + __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask.packedMask8(), a.m128i(), b.m128i(), _MM_CMPINT_NE); } + __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask.packedMask8(), a.m128i(), b.m128i(), _MM_CMPINT_LT); } + __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask.packedMask8(), a.m128i(), b.m128i(), _MM_CMPINT_GE); } + __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask.packedMask8(), a.m128i(), b.m128i(), _MM_CMPINT_GT); } + __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask.packedMask8(), a.m128i(), b.m128i(), _MM_CMPINT_LE); } #else __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a == b); } __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a != b); } @@ -411,18 +418,18 @@ namespace embree template __forceinline vint4 select(const vint4& t, const vint4& f) { #if defined(__SSE4_1__) - return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); + return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f.m128i()), _mm_castsi128_ps(t.m128i()), mask)); #else return select(vboolf4(mask), t, f); #endif } -#if defined(__aarch64__) || defined(__SSE4_1__) - __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); } - __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); } +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__SSE4_1__) + __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a.m128i(), b.m128i()); } + __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a.m128i(), b.m128i()); } - __forceinline vint4 umin(const vint4& a, const vint4& b) { return _mm_min_epu32(a, b); } - __forceinline vint4 umax(const vint4& a, const vint4& b) { return _mm_max_epu32(a, b); } + __forceinline vint4 umin(const vint4& a, const vint4& b) { return _mm_min_epu32(a.m128i(), b.m128i()); } + __forceinline vint4 umax(const vint4& a, const vint4& b) { return _mm_max_epu32(a.m128i(), b.m128i()); } #else __forceinline vint4 min(const vint4& a, const vint4& b) { return select(a < b,a,b); } @@ -438,33 +445,44 @@ namespace embree // Movement/Shifting/Shuffling Functions 
////////////////////////////////////////////////////////////////////////////////
- __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
- __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); }
+ __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a.m128i()), _mm_castsi128_ps(b.m128i()))); }
+ __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a.m128i()), _mm_castsi128_ps(b.m128i()))); }
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
template<int i0, int i1, int i2, int i3>
__forceinline vint4 shuffle(const vint4& v) {
+#if !defined(_M_ARM64)
return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
+#else
+ uint8x16_t _shuffle = _MN_SHUFFLE(i0, i1, i2, i3);
+ return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _shuffle));
+#endif
}
template<int i0, int i1, int i2, int i3>
__forceinline vint4 shuffle(const vint4& a, const vint4& b) {
+#if !defined(_M_ARM64)
return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+#else
+ uint8x16x2_t _ab = {(uint8x16_t)a.v, (uint8x16_t)b.v};
+ uint8x16_t _shuffle = _MF_SHUFFLE(i0, i1, i2, i3);
+ return vreinterpretq_s32_u8(vqtbl2q_u8( _ab, _shuffle));
+#endif
}
#else
template<int i0, int i1, int i2, int i3>
__forceinline vint4 shuffle(const vint4& v) {
- return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
+ return _mm_shuffle_epi32(v.m128i(), _MM_SHUFFLE(i3, i2, i1, i0));
}
template<int i0, int i1, int i2, int i3>
__forceinline vint4 shuffle(const vint4& a, const vint4& b) {
- return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+ return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a.m128i()), _mm_castsi128_ps(b.m128i()), _MM_SHUFFLE(i3, i2, i1, i0)));
}
#endif
#if defined(__SSE3__)
- template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
- template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
- template<> __forceinline vint4 shuffle<0, 1, 0, 1>(const vint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); }
+ template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v.m128i()))); }
+ template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v.m128i()))); }
+ template<> __forceinline vint4 shuffle<0, 1, 0, 1>(const vint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v.m128i()))); }
#endif
template<int i>
@@ -472,23 +490,27 @@ namespace embree
return shuffle<i,i,i,i>(v);
}
-#if defined(__SSE4_1__) && !defined(__aarch64__)
- template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); }
- template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); }
+#if defined(__SSE4_1__) && !defined(__aarch64__) && !defined(_M_ARM64)
+ template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b.m128i(), src); }
+ template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a.m128i(), b, dst); }
#else
template<int src> __forceinline int extract(const vint4& b) { return b[src&3]; }
template<int dst> __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; }
#endif
- template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); }
+ template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b.m128i()); }
- __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); }
+ __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v.m128i()); }
#if defined(__aarch64__)
__forceinline size_t toSizeT(const vint4& v) {
uint64x2_t x = uint64x2_t(v.v);
return x[0];
}
+#elif defined(_M_ARM64)
+ __forceinline size_t toSizeT(const vint4& v) {
+ return v.v.n128_u64[0];
+ }
#else
__forceinline size_t toSizeT(const vint4& v) {
#if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround
@@ -497,7 +519,7 @@ __forceinline size_t toSizeT(const vint4& v) {
// FIXME(LTE): Do we need a swap(i.e. use lane 1)?
return vgetq_lane_u64(*(reinterpret_cast<const uint64x2_t*>(&v)), 0);
#else
- return _mm_cvtsi128_si64(v);
+ return _mm_cvtsi128_si64(v.m128i());
#endif
}
#endif
@@ -505,12 +527,12 @@ __forceinline size_t toSizeT(const vint4& v) {
#if defined(__AVX512VL__)
__forceinline vint4 permute(const vint4 &a, const vint4 &index) {
- return _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(a),index));
+ return _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(a.m128i()),index.m128i()));
}
template<int i>
__forceinline vint4 align_shift_right(const vint4& a, const vint4& b) {
- return _mm_alignr_epi32(a, b, i);
+ return _mm_alignr_epi32(a.m128i(), b.m128i(), i);
}
#endif
@@ -518,16 +540,16 @@ __forceinline size_t toSizeT(const vint4& v) {
/// Reductions
////////////////////////////////////////////////////////////////////////////////
-#if defined(__aarch64__) || defined(__SSE4_1__)
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__SSE4_1__)
-#if defined(__aarch64__)
- __forceinline vint4 vreduce_min(const vint4& v) { int h = vminvq_s32(v); return vdupq_n_s32(h); }
- __forceinline vint4 vreduce_max(const vint4& v) { int h = vmaxvq_s32(v); return vdupq_n_s32(h); }
- __forceinline vint4 vreduce_add(const vint4& v) { int h = vaddvq_s32(v); return vdupq_n_s32(h); }
+#if defined(__aarch64__) || defined(_M_ARM64)
+ __forceinline vint4 vreduce_min(const vint4& v) { int h = vminvq_s32(v.v); return vdupq_n_s32(h); }
+ __forceinline vint4 vreduce_max(const vint4& v) { int h = vmaxvq_s32(v.v); return vdupq_n_s32(h); }
+ __forceinline vint4 vreduce_add(const vint4& v) { int h = vaddvq_s32(v.v); return vdupq_n_s32(h); }
- __forceinline int reduce_min(const vint4& v) { return vminvq_s32(v); }
- __forceinline int reduce_max(const vint4& v) { return vmaxvq_s32(v); }
- __forceinline int reduce_add(const vint4& v) { return vaddvq_s32(v); }
+ __forceinline int reduce_min(const vint4& v) { return vminvq_s32(v.v); }
+ __forceinline int reduce_max(const vint4& v) { return vmaxvq_s32(v.v); }
+ __forceinline int reduce_add(const vint4& v) { return vaddvq_s32(v.v); }
#else
__forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
__forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
@@ -556,7 +578,7 @@ __forceinline size_t toSizeT(const vint4& v) {
/// Sorting networks
////////////////////////////////////////////////////////////////////////////////
-#if (defined(__aarch64__)) || defined(__SSE4_1__)
+#if (defined(__aarch64__)) || defined(_M_ARM64) || defined(__SSE4_1__)
__forceinline vint4 usort_ascending(const vint4& v) { diff --git a/common/simd/vint8_avx.h b/common/simd/vint8_avx.h index 48f5a9b203..9411966ce5 100644 --- a/common/simd/vint8_avx.h +++ b/common/simd/vint8_avx.h @@ -39,11 +39,14 @@ namespace embree __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; } __forceinline vint(__m256i a) : v(a) {} - __forceinline operator const __m256i&() const { return v; } - __forceinline operator __m256i&() { return v; } - - __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} - __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline const __m256i& m256i() const { return v; } + __forceinline __m256i& m256i() { return v; } + __forceinline __m256 m256() const { return _mm256_cvtepi32_ps(v); } + __forceinline __m256 vec_float() const { return m256(); } + __forceinline __m256i vec_int() const { return m256i(); } + + __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a.m128i()),a.m128i(),1)) {} + __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a.m128i()),b.m128i(),1)) {} __forceinline vint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {} __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} @@ -73,17 +76,17 @@ namespace embree static __forceinline vint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); } static __forceinline vint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); } - static __forceinline vint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } - static __forceinline vint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + static __forceinline vint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask.mask32())); } + static __forceinline vint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask.mask32())); } - static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } - static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } + static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f.m256i())); } + static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f.m256i())); } - static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); } + static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f.m256i())); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f.m256i())); } static __forceinline void store_nt(void* ptr, const vint8& v) { - 
_mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v.m256i())); } static __forceinline vint8 load(const unsigned char* ptr) { @@ -190,7 +193,7 @@ namespace embree /// Unary Operators //////////////////////////////////////////////////////////////////////////////// - __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } + __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a.m256i()); } __forceinline vint8 operator +(const vint8& a) { return a; } __forceinline vint8 operator -(const vint8& a) { return vint8(_mm_sub_epi32(_mm_setzero_si128(), a.vl), _mm_sub_epi32(_mm_setzero_si128(), a.vh)); } @@ -212,15 +215,15 @@ namespace embree __forceinline vint8 operator *(const vint8& a, int b) { return a * vint8(b); } __forceinline vint8 operator *(int a, const vint8& b) { return vint8(a) * b; } - __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a.m256i()), _mm256_castsi256_ps(b.m256i()))); } __forceinline vint8 operator &(const vint8& a, int b) { return a & vint8(b); } __forceinline vint8 operator &(int a, const vint8& b) { return vint8(a) & b; } - __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a.m256i()), _mm256_castsi256_ps(b.m256i()))); } __forceinline vint8 operator |(const vint8& a, int b) { return a | vint8(b); } __forceinline vint8 operator |(int a, const vint8& b) { return vint8(a) | b; } - __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a.m256i()), _mm256_castsi256_ps(b.m256i()))); } __forceinline vint8 operator ^(const vint8& a, int b) { return a ^ vint8(b); } __forceinline vint8 operator ^(int a, const vint8& b) { return vint8(a) ^ b; } @@ -315,51 +318,51 @@ namespace embree __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); } __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { - return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f.m256i()), _mm256_castsi256_ps(t.m256i()), m.m256())); } //////////////////////////////////////////////////////////////////////////////// /// Movement/Shifting/Shuffling Functions //////////////////////////////////////////////////////////////////////////////// - __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a.m256i()), _mm256_castsi256_ps(b.m256i()))); } + __forceinline 
vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a.m256i()), _mm256_castsi256_ps(b.m256i()))); } template __forceinline vint8 shuffle(const vint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v.m256i()), _MM_SHUFFLE(i, i, i, i))); } template __forceinline vint8 shuffle4(const vint8& v) { - return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + return _mm256_permute2f128_si256(v.m256i(), v.m256i(), (i1 << 4) | (i0 << 0)); } template __forceinline vint8 shuffle4(const vint8& a, const vint8& b) { - return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + return _mm256_permute2f128_si256(a.m256i(), b.m256i(), (i1 << 4) | (i0 << 0)); } template __forceinline vint8 shuffle(const vint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v.m256i()), _MM_SHUFFLE(i3, i2, i1, i0))); } template __forceinline vint8 shuffle(const vint8& a, const vint8& b) { - return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a.m256i()), _mm256_castsi256_ps(b.m256i()), _MM_SHUFFLE(i3, i2, i1, i0))); } - template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v.m256i()))); } + template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v.m256i()))); } + template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v.m256i()))))); } __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } - template __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } - template __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } - template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } + template __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a.m256i(), b.m128i(), i); } + template __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a.m256i(), i); } + template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a.m256i()); } - __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v.m256i())); } //////////////////////////////////////////////////////////////////////////////// diff --git a/common/simd/vint8_avx2.h 
diff --git a/common/simd/vint8_avx2.h b/common/simd/vint8_avx2.h
index d48efac3f4..6f04c926fa 100644
--- a/common/simd/vint8_avx2.h
+++ b/common/simd/vint8_avx2.h
@@ -38,11 +38,14 @@ namespace embree
     __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; }
 
     __forceinline vint(__m256i a) : v(a) {}
-    __forceinline operator const __m256i&() const { return v; }
-    __forceinline operator __m256i&() { return v; }
-
-    __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {}
-    __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+    __forceinline const __m256i& m256i() const { return v; }
+    __forceinline __m256i& m256i() { return v; }
+    __forceinline __m256 m256() const { return _mm256_cvtepi32_ps(v); }
+    __forceinline __m256 vec_float() const { return m256(); }
+    __forceinline __m256i vec_int() const { return m256i(); }
+
+    __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a.m128i()),a.m128i(),1)) {}
+    __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a.m128i()),b.m128i(),1)) {}
     __forceinline vint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
 
     __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {}
@@ -54,9 +57,9 @@ namespace embree
     __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {}
 
 #if defined(__AVX512VL__)
-    __forceinline explicit vint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {}
+    __forceinline explicit vint(const vboolf8& a) : v(_mm256_movm_epi32(a.packedMask8())) {}
 #else
-    __forceinline explicit vint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {}
+    __forceinline explicit vint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a.m256())) {}
 #endif
 
     ////////////////////////////////////////////////////////////////////////////////
@@ -83,29 +86,29 @@ namespace embree
     static __forceinline vint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); }
     static __forceinline vint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); }
 
-    static __forceinline void store (void* ptr, const vint8& v) { _mm256_store_si256((__m256i*)ptr,v); }
-    static __forceinline void storeu(void* ptr, const vint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); }
+    static __forceinline void store (void* ptr, const vint8& v) { _mm256_store_si256((__m256i*)ptr,v.m256i()); }
+    static __forceinline void storeu(void* ptr, const vint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v.m256i())); }
 
 #if defined(__AVX512VL__)
 
     static __forceinline vint8 compact(const vboolf8& mask, vint8 &v) {
-      return _mm256_mask_compress_epi32(v, mask, v);
+      return _mm256_mask_compress_epi32(v.m256i(), mask.packedMask8(), v.m256i());
     }
     static __forceinline vint8 compact(const vboolf8& mask, vint8 &a, const vint8& b) {
-      return _mm256_mask_compress_epi32(a, mask, b);
+      return _mm256_mask_compress_epi32(a.m256i(), mask.packedMask8(), b.m256i());
     }
 
-    static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); }
-    static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); }
+    static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask.packedMask8(),ptr); }
+    static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask.packedMask8(),ptr); }
 
-    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); }
-    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); }
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_store_epi32 (ptr,mask.packedMask8(),v.m256i()); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_storeu_epi32(ptr,mask.packedMask8(),v.m256i()); }
 #else
-    static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); }
-    static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); }
+    static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask.mask32())); }
+    static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask.mask32())); }
 
-    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); }
-    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); }
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask.mask32(),v.m256i()); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask.mask32(),v.m256i()); }
 #endif
 
     static __forceinline vint8 load_nt(void* ptr) {
@@ -113,7 +116,7 @@
     }
 
     static __forceinline void store_nt(void* ptr, const vint8& v) {
-      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v.m256i()));
     }
 
     static __forceinline void store(unsigned char* ptr, const vint8& i)
@@ -129,16 +132,16 @@
     template<int scale = 4>
     static __forceinline vint8 gather(const int *const ptr, const vint8& index) {
-      return _mm256_i32gather_epi32(ptr, index, scale);
+      return _mm256_i32gather_epi32(ptr, index.m256i(), scale);
     }
 
     template<int scale = 4>
     static __forceinline vint8 gather(const vboolf8& mask, const int *const ptr, const vint8& index) {
       vint8 r = zero;
 #if defined(__AVX512VL__)
-      return _mm256_mmask_i32gather_epi32(r, mask, index, ptr, scale);
+      return _mm256_mmask_i32gather_epi32(r.m256i(), mask.packedMask8(), index.m256i(), ptr, scale);
 #else
-      return _mm256_mask_i32gather_epi32(r, ptr, index, mask, scale);
+      return _mm256_mask_i32gather_epi32(r.m256i(), ptr, index.m256i(), mask.mask32(), scale);
 #endif
     }
 
@@ -146,7 +149,7 @@
     template<int scale = 4>
     static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v)
     {
 #if defined(__AVX512VL__)
-      _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale);
+      _mm256_i32scatter_epi32((int*)ptr, ofs.m256i(), v.m256i(), scale);
 #else
       *(int*)(((char*)ptr)+scale*ofs[0]) = v[0];
       *(int*)(((char*)ptr)+scale*ofs[1]) = v[1];
@@ -163,7 +166,7 @@
     template<int scale = 4>
     static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v)
     {
 #if defined(__AVX512VL__)
-      _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale);
+      _mm256_mask_i32scatter_epi32((int*)ptr, mask.packedMask8(), ofs.m256i(), v.m256i(), scale);
 #else
       if (likely(mask[0])) *(int*)(((char*)ptr)+scale*ofs[0]) = v[0];
       if (likely(mask[1])) *(int*)(((char*)ptr)+scale*ofs[1]) = v[1];
@@ -191,67 +194,67 @@
   ////////////////////////////////////////////////////////////////////////////////
 
 #if defined(__AVX512VL__)
-    static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_movepi32_mask(a); }
+    static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_movepi32_mask(a.m256i()); }
 #else
-    static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); }
+    static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a.m256i()); }
 #endif
 
   __forceinline vint8 operator +(const vint8& a) { return a; }
-  __forceinline vint8 operator -(const vint8& a) { return _mm256_sub_epi32(_mm256_setzero_si256(), a); }
-  __forceinline vint8 abs       (const vint8& a) { return _mm256_abs_epi32(a); }
+  __forceinline vint8 operator -(const vint8& a) { return _mm256_sub_epi32(_mm256_setzero_si256(), a.m256i()); }
+  __forceinline vint8 abs       (const vint8& a) { return _mm256_abs_epi32(a.m256i()); }
 
   ////////////////////////////////////////////////////////////////////////////////
   /// Binary Operators
   ////////////////////////////////////////////////////////////////////////////////
 
-  __forceinline vint8 operator +(const vint8& a, const vint8& b) { return _mm256_add_epi32(a, b); }
+  __forceinline vint8 operator +(const vint8& a, const vint8& b) { return _mm256_add_epi32(a.m256i(), b.m256i()); }
   __forceinline vint8 operator +(const vint8& a, int b) { return a + vint8(b); }
   __forceinline vint8 operator +(int a, const vint8& b) { return vint8(a) + b; }
 
-  __forceinline vint8 operator -(const vint8& a, const vint8& b) { return _mm256_sub_epi32(a, b); }
+  __forceinline vint8 operator -(const vint8& a, const vint8& b) { return _mm256_sub_epi32(a.m256i(), b.m256i()); }
   __forceinline vint8 operator -(const vint8& a, int b) { return a - vint8(b); }
   __forceinline vint8 operator -(int a, const vint8& b) { return vint8(a) - b; }
 
-  __forceinline vint8 operator *(const vint8& a, const vint8& b) { return _mm256_mullo_epi32(a, b); }
+  __forceinline vint8 operator *(const vint8& a, const vint8& b) { return _mm256_mullo_epi32(a.m256i(), b.m256i()); }
   __forceinline vint8 operator *(const vint8& a, int b) { return a * vint8(b); }
   __forceinline vint8 operator *(int a, const vint8& b) { return vint8(a) * b; }
 
-  __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_and_si256(a, b); }
+  __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_and_si256(a.m256i(), b.m256i()); }
   __forceinline vint8 operator &(const vint8& a, int b) { return a & vint8(b); }
   __forceinline vint8 operator &(int a, const vint8& b) { return vint8(a) & b; }
 
-  __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_or_si256(a, b); }
+  __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_or_si256(a.m256i(), b.m256i()); }
   __forceinline vint8 operator |(const vint8& a, int b) { return a | vint8(b); }
   __forceinline vint8 operator |(int a, const vint8& b) { return vint8(a) | b; }
 
-  __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_xor_si256(a, b); }
+  __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_xor_si256(a.m256i(), b.m256i()); }
   __forceinline vint8 operator ^(const vint8& a, int b) { return a ^ vint8(b); }
   __forceinline vint8 operator ^(int a, const vint8& b) { return vint8(a) ^ b; }
 
-  __forceinline vint8 operator <<(const vint8& a, int n) { return _mm256_slli_epi32(a, n); }
-  __forceinline vint8 operator >>(const vint8& a, int n) { return _mm256_srai_epi32(a, n); }
+  __forceinline vint8 operator <<(const vint8& a, int n) { return _mm256_slli_epi32(a.m256i(), n); }
+  __forceinline vint8 operator >>(const vint8& a, int n) { return _mm256_srai_epi32(a.m256i(), n); }
 
-  __forceinline vint8 operator <<(const vint8& a, const vint8& n) { return _mm256_sllv_epi32(a, n); }
-  __forceinline vint8 operator >>(const vint8& a, const vint8& n) { return _mm256_srav_epi32(a, n); }
+  __forceinline vint8 operator <<(const vint8& a, const vint8& n) { return _mm256_sllv_epi32(a.m256i(), n.m256i()); }
+  __forceinline vint8 operator >>(const vint8& a, const vint8& n) { return _mm256_srav_epi32(a.m256i(), n.m256i()); }
 
-  __forceinline vint8 sll(const vint8& a, int b) { return _mm256_slli_epi32(a, b); }
-  __forceinline vint8 sra(const vint8& a, int b) { return _mm256_srai_epi32(a, b); }
-  __forceinline vint8 srl(const vint8& a, int b) { return _mm256_srli_epi32(a, b); }
+  __forceinline vint8 sll(const vint8& a, int b) { return _mm256_slli_epi32(a.m256i(), b); }
+  __forceinline vint8 sra(const vint8& a, int b) { return _mm256_srai_epi32(a.m256i(), b); }
+  __forceinline vint8 srl(const vint8& a, int b) { return _mm256_srli_epi32(a.m256i(), b); }
 
-  __forceinline vint8 sll(const vint8& a, const vint8& b) { return _mm256_sllv_epi32(a, b); }
-  __forceinline vint8 sra(const vint8& a, const vint8& b) { return _mm256_srav_epi32(a, b); }
-  __forceinline vint8 srl(const vint8& a, const vint8& b) { return _mm256_srlv_epi32(a, b); }
+  __forceinline vint8 sll(const vint8& a, const vint8& b) { return _mm256_sllv_epi32(a.m256i(), b.m256i()); }
+  __forceinline vint8 sra(const vint8& a, const vint8& b) { return _mm256_srav_epi32(a.m256i(), b.m256i()); }
+  __forceinline vint8 srl(const vint8& a, const vint8& b) { return _mm256_srlv_epi32(a.m256i(), b.m256i()); }
 
-  __forceinline vint8 min(const vint8& a, const vint8& b) { return _mm256_min_epi32(a, b); }
+  __forceinline vint8 min(const vint8& a, const vint8& b) { return _mm256_min_epi32(a.m256i(), b.m256i()); }
   __forceinline vint8 min(const vint8& a, int b) { return min(a,vint8(b)); }
   __forceinline vint8 min(int a, const vint8& b) { return min(vint8(a),b); }
 
-  __forceinline vint8 max(const vint8& a, const vint8& b) { return _mm256_max_epi32(a, b); }
+  __forceinline vint8 max(const vint8& a, const vint8& b) { return _mm256_max_epi32(a.m256i(), b.m256i()); }
   __forceinline vint8 max(const vint8& a, int b) { return max(a,vint8(b)); }
   __forceinline vint8 max(int a, const vint8& b) { return max(vint8(a),b); }
 
-  __forceinline vint8 umin(const vint8& a, const vint8& b) { return _mm256_min_epu32(a, b); }
-  __forceinline vint8 umax(const vint8& a, const vint8& b) { return _mm256_max_epu32(a, b); }
+  __forceinline vint8 umin(const vint8& a, const vint8& b) { return _mm256_min_epu32(a.m256i(), b.m256i()); }
+  __forceinline vint8 umax(const vint8& a, const vint8& b) { return _mm256_max_epu32(a.m256i(), b.m256i()); }
 
   ////////////////////////////////////////////////////////////////////////////////
   /// Assignment Operators
@@ -280,32 +283,32 @@
   ////////////////////////////////////////////////////////////////////////////////
 
 #if defined(__AVX512VL__)
-  static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); }
-  static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
-  static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
-  static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
-  static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
-  static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+  static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a.m256i(),b.m256i(),_MM_CMPINT_EQ); }
+  static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a.m256i(),b.m256i(),_MM_CMPINT_NE); }
+  static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a.m256i(),b.m256i(),_MM_CMPINT_LT); }
+  static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a.m256i(),b.m256i(),_MM_CMPINT_GE); }
+  static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a.m256i(),b.m256i(),_MM_CMPINT_GT); }
+  static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a.m256i(),b.m256i(),_MM_CMPINT_LE); }
 
   static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) {
-    return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t);
+    return _mm256_mask_blend_epi32(m.packedMask8(), (__m256i)f.m256i(), (__m256i)t.m256i());
   }
 #else
-  static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); }
+  static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a.m256i(), b.m256i())); }
   static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); }
-  static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b, a)); }
+  static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b.m256i(), a.m256i())); }
   static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a < b); }
-  static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, b)); }
+  static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a.m256i(), b.m256i())); }
   static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a > b); }
 
   static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) {
-    return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
+    return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f.m256i()), _mm256_castsi256_ps(t.m256i()), m.m256()));
   }
 #endif
 
   template<int mask>
   __forceinline vint8 select(const vint8& t, const vint8& f) {
-    return _mm256_blend_epi32(f, t, mask);
+    return _mm256_blend_epi32(f.m256i(), t.m256i(), mask);
   }
 
   __forceinline vboolf8 operator ==(const vint8& a, int b) { return a == vint8(b); }
@@ -334,12 +337,12 @@ namespace embree
   __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; }
 
 #if defined(__AVX512VL__)
-  static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); }
-  static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); }
-  static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); }
-  static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); }
-  static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); }
-  static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); }
+  static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask.packedMask8(), a.m256i(), b.m256i(), _MM_CMPINT_EQ); }
+  static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask.packedMask8(), a.m256i(), b.m256i(), _MM_CMPINT_NE); }
+  static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask.packedMask8(), a.m256i(), b.m256i(), _MM_CMPINT_LT); }
+  static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask.packedMask8(), a.m256i(), b.m256i(), _MM_CMPINT_GE); }
+  static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask.packedMask8(), a.m256i(), b.m256i(), _MM_CMPINT_GT); }
+  static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask.packedMask8(), a.m256i(), b.m256i(), _MM_CMPINT_LE); }
 #else
   static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); }
   static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); }
@@ -353,61 +356,61 @@
   /// Movement/Shifting/Shuffling Functions
   ////////////////////////////////////////////////////////////////////////////////
 
-  __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_unpacklo_epi32(a, b); }
-  __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_unpackhi_epi32(a, b); }
+  __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_unpacklo_epi32(a.m256i(), b.m256i()); }
+  __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_unpackhi_epi32(a.m256i(), b.m256i()); }
 
   template<int i>
   __forceinline vint8 shuffle(const vint8& v) {
-    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v.m256i()), _MM_SHUFFLE(i, i, i, i)));
   }
 
   template<int i0, int i1>
   __forceinline vint8 shuffle4(const vint8& v) {
-    return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+    return _mm256_permute2f128_si256(v.m256i(), v.m256i(), (i1 << 4) | (i0 << 0));
   }
 
   template<int i0, int i1>
   __forceinline vint8 shuffle4(const vint8& a, const vint8& b) {
-    return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+    return _mm256_permute2f128_si256(a.m256i(), b.m256i(), (i1 << 4) | (i0 << 0));
   }
 
   template<int i0, int i1, int i2, int i3>
   __forceinline vint8 shuffle(const vint8& v) {
-    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v.m256i()), _MM_SHUFFLE(i3, i2, i1, i0)));
   }
 
   template<int i0, int i1, int i2, int i3>
   __forceinline vint8 shuffle(const vint8& a, const vint8& b) {
-    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a.m256i()), _mm256_castsi256_ps(b.m256i()), _MM_SHUFFLE(i3, i2, i1, i0)));
   }
 
-  template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); }
-  template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); }
-  template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); }
+  template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v.m256i()))); }
+  template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v.m256i()))); }
+  template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v.m256i()))))); }
 
   __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); }
 
-  template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); }
-  template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); }
-  template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); }
+  template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a.m256i(), b.m128i(), i); }
+  template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a.m256i(), i); }
+  template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a.m256i()); }
 
-  __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); }
+  __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v.m256i())); }
 
 #if !defined(__aarch64__)
 
   __forceinline vint8 permute(const vint8& v, const __m256i& index) {
-    return _mm256_permutevar8x32_epi32(v, index);
+    return _mm256_permutevar8x32_epi32(v.m256i(), index);
   }
 
   __forceinline vint8 shuffle(const vint8& v, const __m256i& index) {
-    return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index));
+    return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v.m256i()), index));
   }
 
   template<int i>
   static __forceinline vint8 align_shift_right(const vint8& a, const vint8& b) {
 #if defined(__AVX512VL__)
-    return _mm256_alignr_epi32(a, b, i);
+    return _mm256_alignr_epi32(a.m256i(), b.m256i(), i);
 #else
-    return _mm256_alignr_epi8(a, b, 4*i);
+    return _mm256_alignr_epi8(a.m256i(), b.m256i(), 4*i);
 #endif
   }
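The vboolf8 accessors used above reflect two different mask layouts selected by __AVX512VL__: a compact bitmask (one bit per lane) handed to the *_mask_* intrinsics via packedMask8(), versus a 256-bit all-ones-per-lane vector handed to blend/maskload intrinsics via m256()/mask32(). A rough sketch of the two representations; the struct bodies here are illustrative guesses, not the actual Embree definitions:

    // Two possible vboolf8 layouts behind the accessors in this patch.
    #include <immintrin.h>

    #if defined(__AVX512VL__)
    struct vboolf8 {
      __mmask8 v;                                  // one bit per lane
      __mmask8 packedMask8() const { return v; }
    };
    static inline __m256i select8(const vboolf8& m, __m256i t, __m256i f) {
      return _mm256_mask_blend_epi32(m.packedMask8(), f, t);   // bitmask blend
    }
    #else
    struct vboolf8 {
      __m256 v;                                    // all-ones lanes mark "true"
      __m256  m256()   const { return v; }
      __m256i mask32() const { return _mm256_castps_si256(v); }
    };
    static inline __m256i select8(const vboolf8& m, __m256i t, __m256i f) {
      return _mm256_castps_si256(_mm256_blendv_ps(
          _mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m.m256()));  // vector blend
    }
    #endif

Either way, the accessor name tells the reader which representation a given intrinsic expects, which is exactly the information the removed implicit conversions used to hide.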
diff --git a/common/simd/vllong4_avx2.h b/common/simd/vllong4_avx2.h
index 6c86845877..de68b30b67 100644
--- a/common/simd/vllong4_avx2.h
+++ b/common/simd/vllong4_avx2.h
@@ -36,8 +36,7 @@ namespace embree
     __forceinline vllong4& operator =(const vllong4& f) { v = f.v; return *this; }
 
     __forceinline vllong(const __m256i& t) { v = t; }
-    __forceinline operator __m256i() const { return v; }
-    __forceinline operator __m256d() const { return _mm256_castsi256_pd(v); }
+    __forceinline __m256i m256i() const { return v; }
 
     __forceinline vllong(long long i) {
@@ -63,7 +62,7 @@ namespace embree
     ////////////////////////////////////////////////////////////////////////////////
 
     static __forceinline void store_nt(void* __restrict__ ptr, const vllong4& a) {
-      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(a));
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(a.m256i()));
     }
 
     static __forceinline vllong4 loadu(const void* addr)
@@ -80,26 +79,26 @@
     }
 
     static __forceinline void store(void* ptr, const vllong4& v) {
-      _mm256_store_si256((__m256i*)ptr,v);
+      _mm256_store_si256((__m256i*)ptr,v.m256i());
     }
 
     static __forceinline void storeu(void* ptr, const vllong4& v) {
-      _mm256_storeu_si256((__m256i*)ptr,v);
+      _mm256_storeu_si256((__m256i*)ptr,v.m256i());
     }
 
     static __forceinline void storeu(const vboold4& mask, long long* ptr, const vllong4& f) {
 #if defined(__AVX512VL__)
-      _mm256_mask_storeu_epi64(ptr,mask,f);
+      _mm256_mask_storeu_epi64(ptr,mask.packedMask8(),f.m256i());
 #else
-      _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f));
+      _mm256_maskstore_pd((double*)ptr,mask.mask32(),_mm256_castsi256_pd(f.m256i()));
 #endif
     }
 
     static __forceinline void store(const vboold4& mask, void* ptr, const vllong4& f) {
 #if defined(__AVX512VL__)
-      _mm256_mask_store_epi64(ptr,mask,f);
+      _mm256_mask_store_epi64(ptr,mask.packedMask8(),f.m256i());
 #else
-      _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f));
+      _mm256_maskstore_pd((double*)ptr,mask.mask32(),_mm256_castsi256_pd(f.m256i()));
 #endif
     }
 
@@ -118,9 +117,9 @@
   __forceinline vllong4 select(const vboold4& m, const vllong4& t, const vllong4& f) {
 #if defined(__AVX512VL__)
-    return _mm256_mask_blend_epi64(m, f, t);
+    return _mm256_mask_blend_epi64(m.packedMask8(), f.m256i(), t.m256i());
 #else
-    return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(f), _mm256_castsi256_pd(t), m));
+    return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(f.m256i()), _mm256_castsi256_pd(t.m256i()), m.m256d()));
 #endif
   }
 
@@ -129,51 +128,51 @@
   ////////////////////////////////////////////////////////////////////////////////
 
 #if defined(__AVX512VL__)
-  __forceinline vboold4 asBool(const vllong4& a) { return _mm256_movepi64_mask(a); }
+  __forceinline vboold4 asBool(const vllong4& a) { return _mm256_movepi64_mask(a.m256i()); }
 #else
-  __forceinline vboold4 asBool(const vllong4& a) { return _mm256_castsi256_pd(a); }
+  __forceinline vboold4 asBool(const vllong4& a) { return _mm256_castsi256_pd(a.m256i()); }
 #endif
 
   __forceinline vllong4 operator +(const vllong4& a) { return a; }
-  __forceinline vllong4 operator -(const vllong4& a) { return _mm256_sub_epi64(_mm256_setzero_si256(), a); }
+  __forceinline vllong4 operator -(const vllong4& a) { return _mm256_sub_epi64(_mm256_setzero_si256(), a.m256i()); }
 
   ////////////////////////////////////////////////////////////////////////////////
   /// Binary Operators
   ////////////////////////////////////////////////////////////////////////////////
 
-  __forceinline vllong4 operator +(const vllong4& a, const vllong4& b) { return _mm256_add_epi64(a, b); }
+  __forceinline vllong4 operator +(const vllong4& a, const vllong4& b) { return _mm256_add_epi64(a.m256i(), b.m256i()); }
   __forceinline vllong4 operator +(const vllong4& a, long long b) { return a + vllong4(b); }
   __forceinline vllong4 operator +(long long a, const vllong4& b) { return vllong4(a) + b; }
 
-  __forceinline vllong4 operator -(const vllong4& a, const vllong4& b) { return _mm256_sub_epi64(a, b); }
+  __forceinline vllong4 operator -(const vllong4& a, const vllong4& b) { return _mm256_sub_epi64(a.m256i(), b.m256i()); }
   __forceinline vllong4 operator -(const vllong4& a, long long b) { return a - vllong4(b); }
   __forceinline vllong4 operator -(long long a, const vllong4& b) { return vllong4(a) - b; }
 
   /* only low 32bit part */
-  __forceinline vllong4 operator *(const vllong4& a, const vllong4& b) { return _mm256_mul_epi32(a, b); }
+  __forceinline vllong4 operator *(const vllong4& a, const vllong4& b) { return _mm256_mul_epi32(a.m256i(), b.m256i()); }
   __forceinline vllong4 operator *(const vllong4& a, long long b) { return a * vllong4(b); }
   __forceinline vllong4 operator *(long long a, const vllong4& b) { return vllong4(a) * b; }
 
-  __forceinline vllong4 operator &(const vllong4& a, const vllong4& b) { return _mm256_and_si256(a, b); }
+  __forceinline vllong4 operator &(const vllong4& a, const vllong4& b) { return _mm256_and_si256(a.m256i(), b.m256i()); }
   __forceinline vllong4 operator &(const vllong4& a, long long b) { return a & vllong4(b); }
   __forceinline vllong4 operator &(long long a, const vllong4& b) { return vllong4(a) & b; }
 
-  __forceinline vllong4 operator |(const vllong4& a, const vllong4& b) { return _mm256_or_si256(a, b); }
+  __forceinline vllong4 operator |(const vllong4& a, const vllong4& b) { return _mm256_or_si256(a.m256i(), b.m256i()); }
   __forceinline vllong4 operator |(const vllong4& a, long long b) { return a | vllong4(b); }
   __forceinline vllong4 operator |(long long a, const vllong4& b) { return vllong4(a) | b; }
 
-  __forceinline vllong4 operator ^(const vllong4& a, const vllong4& b) { return _mm256_xor_si256(a, b); }
+  __forceinline vllong4 operator ^(const vllong4& a, const vllong4& b) { return _mm256_xor_si256(a.m256i(), b.m256i()); }
   __forceinline vllong4 operator ^(const vllong4& a, long long b) { return a ^ vllong4(b); }
   __forceinline vllong4 operator ^(long long a, const vllong4& b) { return vllong4(a) ^ b; }
 
-  __forceinline vllong4 operator <<(const vllong4& a, long long n) { return _mm256_slli_epi64(a, (int)n); }
+  __forceinline vllong4 operator <<(const vllong4& a, long long n) { return _mm256_slli_epi64(a.m256i(), (int)n); }
   //__forceinline vllong4 operator >>(const vllong4& a, long long n) { return _mm256_srai_epi64(a, n); }
 
-  __forceinline vllong4 operator <<(const vllong4& a, const vllong4& n) { return _mm256_sllv_epi64(a, n); }
+  __forceinline vllong4 operator <<(const vllong4& a, const vllong4& n) { return _mm256_sllv_epi64(a.m256i(), n.m256i()); }
   //__forceinline vllong4 operator >>(const vllong4& a, const vllong4& n) { return _mm256_srav_epi64(a, n); }
 
   //__forceinline vllong4 sra(const vllong4& a, long long b) { return _mm256_srai_epi64(a, b); }
-  __forceinline vllong4 srl(const vllong4& a, long long b) { return _mm256_srli_epi64(a, (int)b); }
+  __forceinline vllong4 srl(const vllong4& a, long long b) { return _mm256_srli_epi64(a.m256i(), (int)b); }
 
   //__forceinline vllong4 min(const vllong4& a, const vllong4& b) { return _mm256_min_epi64(a, b); }
   //__forceinline vllong4 min(const vllong4& a, long long b) { return min(a,vllong4(b)); }
@@ -184,8 +183,8 @@
   //__forceinline vllong4 max(long long a, const vllong4& b) { return max(vllong4(a),b); }
 
 #if defined(__AVX512VL__)
-  __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_and_epi64(c,m,a,b); }
-  __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_or_epi64(c,m,a,b); }
+  __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_and_epi64(c.m256i(),m.packedMask8(),a.m256i(),b.m256i()); }
+  __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_or_epi64(c.m256i(),m.packedMask8(),a.m256i(),b.m256i()); }
 #else
   __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a & b, c); }
   __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a | b, c); }
@@ -218,17 +217,17 @@
   ////////////////////////////////////////////////////////////////////////////////
 
 #if defined(__AVX512VL__)
-  __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); }
-  __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
-  __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
-  __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
-  __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
-  __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a.m256i(),b.m256i(),_MM_CMPINT_EQ); }
+  __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a.m256i(),b.m256i(),_MM_CMPINT_NE); }
+  __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a.m256i(),b.m256i(),_MM_CMPINT_LT); }
+  __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a.m256i(),b.m256i(),_MM_CMPINT_GE); }
+  __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a.m256i(),b.m256i(),_MM_CMPINT_GT); }
+  __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a.m256i(),b.m256i(),_MM_CMPINT_LE); }
 #else
-  __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmpeq_epi64(a,b); }
+  __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmpeq_epi64(a.m256i(),b.m256i()); }
   __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return !(a == b); }
-  __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(a,b); }
-  __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(b,a); }
+  __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(a.m256i(),b.m256i()); }
+  __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(b.m256i(),a.m256i()); }
   __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return !(a < b); }
   __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return !(a > b); }
 #endif
@@ -259,12 +258,12 @@ namespace embree
   __forceinline vboold4 le(const vllong4& a, const vllong4& b) { return a <= b; }
 
 #if defined(__AVX512VL__)
-  __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_EQ); }
-  __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_NE); }
-  __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LT); }
-  __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GE); }
-  __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GT); }
-  __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LE); }
+  __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask.packedMask8(), a.m256i(), b.m256i(), _MM_CMPINT_EQ); }
+  __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask.packedMask8(), a.m256i(), b.m256i(), _MM_CMPINT_NE); }
+  __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask.packedMask8(), a.m256i(), b.m256i(), _MM_CMPINT_LT); }
+  __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask.packedMask8(), a.m256i(), b.m256i(), _MM_CMPINT_GE); }
+  __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask.packedMask8(), a.m256i(), b.m256i(), _MM_CMPINT_GT); }
+  __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask.packedMask8(), a.m256i(), b.m256i(), _MM_CMPINT_LE); }
 #else
   __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a == b); }
   __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a != b); }
@@ -280,7 +279,7 @@
   template<int i0, int i1>
   __forceinline vllong4 shuffle(const vllong4& v) {
-    return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(v), (i1 << 3) | (i0 << 2) | (i1 << 1) | i0));
+    return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(v.m256i()), (i1 << 3) | (i0 << 2) | (i1 << 1) | i0));
   }
 
   template<int i>
@@ -290,25 +289,25 @@
   template<int i0, int i1>
   __forceinline vllong4 shuffle2(const vllong4& v) {
-    return _mm256_castpd_si256(_mm256_permute2f128_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(v), (i1 << 4) | i0));
+    return _mm256_castpd_si256(_mm256_permute2f128_pd(_mm256_castsi256_pd(v.m256i()), _mm256_castsi256_pd(v.m256i()), (i1 << 4) | i0));
   }
 
   __forceinline long long toScalar(const vllong4& v) {
-    return _mm_cvtsi128_si64(_mm256_castsi256_si128(v));
+    return _mm_cvtsi128_si64(_mm256_castsi256_si128(v.m256i()));
   }
 
 #if defined(__AVX512VL__)
   __forceinline vllong4 permute(const vllong4& a, const __m256i& index) {
     // workaround for GCC 7.x
 #if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
-    return _mm256_permutex2var_epi64(a,index,a);
+    return _mm256_permutex2var_epi64(a.m256i(),index,a.m256i());
 #else
-    return _mm256_permutexvar_epi64(index,a);
+    return _mm256_permutexvar_epi64(index,a.m256i());
 #endif
   }
 
   __forceinline vllong4 permutex2var(const vllong4& index, const vllong4& a, const vllong4& b) {
-    return _mm256_permutex2var_epi64(a,index,b);
+    return _mm256_permutex2var_epi64(a.m256i(),index.m256i(),b.m256i());
   }
 #endif
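Worth noting in the vllong4 code above: when __AVX512VL__ is unavailable there are no masked 64-bit integer intrinsics, so mask_and/mask_or fall back to composing the plain operation with a blend-based select, and masked scatter (in the vint8 file) falls back to per-lane scalar stores. A self-contained sketch of that fallback composition; the helper names select64 and mask_and are mine, not Embree's:

    // Emulating a masked AND with AVX/AVX2 only: compute a & b everywhere,
    // then blend with the pass-through value c on the 64-bit lane sign bits.
    #include <immintrin.h>

    static inline __m256i select64(__m256d m, __m256i t, __m256i f) {
      // lanes of m that are all-ones pick t, zero lanes keep f
      return _mm256_castpd_si256(
          _mm256_blendv_pd(_mm256_castsi256_pd(f), _mm256_castsi256_pd(t), m));
    }

    // mask_and(m, c, a, b): lanes with m set receive a & b, the rest keep c
    static inline __m256i mask_and(__m256d m, __m256i c, __m256i a, __m256i b) {
      return select64(m, _mm256_and_si256(a, b), c);
    }

The AVX-512 branch collapses this into a single _mm256_mask_and_epi64, which is why the two code paths need differently shaped mask arguments (m.m256d() versus m.packedMask8()).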
diff --git a/common/simd/vllong8_avx512.h b/common/simd/vllong8_avx512.h
index ee69411637..afa12d6049 100644
--- a/common/simd/vllong8_avx512.h
+++ b/common/simd/vllong8_avx512.h
@@ -36,8 +36,7 @@ namespace embree
     __forceinline vllong8& operator =(const vllong8& f) { v = f.v; return *this; }
 
     __forceinline vllong(const __m512i& t) { v = t; }
-    __forceinline operator __m512i() const { return v; }
-    __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); }
+    __forceinline __m512i m512i() const { return v; }
 
     __forceinline vllong(long long i) {
       v = _mm512_set1_epi64(i);
@@ -54,7 +53,7 @@
     }
 
     __forceinline vllong(const vllong<4>& i) {
-      v = _mm512_broadcast_i64x4(i);
+      v = _mm512_broadcast_i64x4(i.m256i());
     }
 
     ////////////////////////////////////////////////////////////////////////////////
@@ -71,7 +70,7 @@
     ////////////////////////////////////////////////////////////////////////////////
 
     static __forceinline void store_nt(void* __restrict__ ptr, const vllong8& a) {
-      _mm512_stream_si512((__m512i*)ptr,a);
+      _mm512_stream_si512((__m512i*)ptr,a.m512i());
     }
 
     static __forceinline vllong8 loadu(const void* addr) {
@@ -91,31 +90,31 @@
     }
 
     static __forceinline void store(void* ptr, const vllong8& v) {
-      _mm512_store_si512(ptr,v);
+      _mm512_store_si512(ptr,v.m512i());
    }
 
     static __forceinline void storeu(void* ptr, const vllong8& v) {
-      _mm512_storeu_si512(ptr,v);
+      _mm512_storeu_si512(ptr,v.m512i());
     }
 
     static __forceinline void storeu(const vboold8& mask, long long* ptr, const vllong8& f) {
-      _mm512_mask_storeu_epi64(ptr,mask,f);
+      _mm512_mask_storeu_epi64(ptr,mask.packedMask8(),f.m512i());
     }
 
     static __forceinline void store(const vboold8& mask, void* addr, const vllong8& v2) {
-      _mm512_mask_store_epi64(addr,mask,v2);
+      _mm512_mask_store_epi64(addr,mask.packedMask8(),v2.m512i());
     }
 
     static __forceinline vllong8 compact(const vboold8& mask, vllong8& v) {
-      return _mm512_mask_compress_epi64(v,mask,v);
+      return _mm512_mask_compress_epi64(v.m512i(),mask.packedMask8(),v.m512i());
     }
 
     static __forceinline vllong8 compact(const vboold8& mask, const vllong8& a, vllong8& b) {
-      return _mm512_mask_compress_epi64(a,mask,b);
+      return _mm512_mask_compress_epi64(a.m512i(),mask.packedMask8(),b.m512i());
     }
 
     static __forceinline vllong8 expand(const vboold8& mask, const vllong8& a, vllong8& b) {
-      return _mm512_mask_expand_epi64(b,mask,a);
+      return _mm512_mask_expand_epi64(b.m512i(),mask.packedMask8(),a.m512i());
     }
 
     ////////////////////////////////////////////////////////////////////////////////
@@ -131,62 +130,62 @@
   /// Unary Operators
   ////////////////////////////////////////////////////////////////////////////////
 
-  __forceinline vboold8 asBool(const vllong8& a) { return _mm512_movepi64_mask(a); }
+  __forceinline vboold8 asBool(const vllong8& a) { return _mm512_movepi64_mask(a.m512i()); }
 
   __forceinline vllong8 operator +(const vllong8& a) { return a; }
-  __forceinline vllong8 operator -(const vllong8& a) { return _mm512_sub_epi64(_mm512_setzero_epi32(), a); }
+  __forceinline vllong8 operator -(const vllong8& a) { return _mm512_sub_epi64(_mm512_setzero_epi32(), a.m512i()); }
 
   ////////////////////////////////////////////////////////////////////////////////
   /// Binary Operators
   ////////////////////////////////////////////////////////////////////////////////
 
-  __forceinline vllong8 operator +(const vllong8& a, const vllong8& b) { return _mm512_add_epi64(a, b); }
+  __forceinline vllong8 operator +(const vllong8& a, const vllong8& b) { return _mm512_add_epi64(a.m512i(), b.m512i()); }
   __forceinline vllong8 operator +(const vllong8& a, long long b) { return a + vllong8(b); }
   __forceinline vllong8 operator +(long long a, const vllong8& b) { return vllong8(a) + b; }
 
-  __forceinline vllong8 operator -(const vllong8& a, const vllong8& b) { return _mm512_sub_epi64(a, b); }
+  __forceinline vllong8 operator -(const vllong8& a, const vllong8& b) { return _mm512_sub_epi64(a.m512i(), b.m512i()); }
   __forceinline vllong8 operator -(const vllong8& a, long long b) { return a - vllong8(b); }
   __forceinline vllong8 operator -(long long a, const vllong8& b) { return vllong8(a) - b; }
 
-  __forceinline vllong8 operator *(const vllong8& a, const vllong8& b) { return _mm512_mullo_epi64(a, b); }
+  __forceinline vllong8 operator *(const vllong8& a, const vllong8& b) { return _mm512_mullo_epi64(a.m512i(), b.m512i()); }
   __forceinline vllong8 operator *(const vllong8& a, long long b) { return a * vllong8(b); }
   __forceinline vllong8 operator *(long long a, const vllong8& b) { return vllong8(a) * b; }
 
-  __forceinline vllong8 operator &(const vllong8& a, const vllong8& b) { return _mm512_and_epi64(a, b); }
+  __forceinline vllong8 operator &(const vllong8& a, const vllong8& b) { return _mm512_and_epi64(a.m512i(), b.m512i()); }
   __forceinline vllong8 operator &(const vllong8& a, long long b) { return a & vllong8(b); }
   __forceinline vllong8 operator &(long long a, const vllong8& b) { return vllong8(a) & b; }
 
-  __forceinline vllong8 operator |(const vllong8& a, const vllong8& b) { return _mm512_or_epi64(a, b); }
+  __forceinline vllong8 operator |(const vllong8& a, const vllong8& b) { return _mm512_or_epi64(a.m512i(), b.m512i()); }
   __forceinline vllong8 operator |(const vllong8& a, long long b) { return a | vllong8(b); }
   __forceinline vllong8 operator |(long long a, const vllong8& b) { return vllong8(a) | b; }
 
-  __forceinline vllong8 operator ^(const vllong8& a, const vllong8& b) { return _mm512_xor_epi64(a, b); }
+  __forceinline vllong8 operator ^(const vllong8& a, const vllong8& b) { return _mm512_xor_epi64(a.m512i(), b.m512i()); }
   __forceinline vllong8 operator ^(const vllong8& a, long long b) { return a ^ vllong8(b); }
   __forceinline vllong8 operator ^(long long a, const vllong8& b) { return vllong8(a) ^ b; }
 
-  __forceinline vllong8 operator <<(const vllong8& a, long long n) { return _mm512_slli_epi64(a, n); }
-  __forceinline vllong8 operator >>(const vllong8& a, long long n) { return _mm512_srai_epi64(a, n); }
+  __forceinline vllong8 operator <<(const vllong8& a, long long n) { return _mm512_slli_epi64(a.m512i(), n); }
+  __forceinline vllong8 operator >>(const vllong8& a, long long n) { return _mm512_srai_epi64(a.m512i(), n); }
 
-  __forceinline vllong8 operator <<(const vllong8& a, const vllong8& n) { return _mm512_sllv_epi64(a, n); }
-  __forceinline vllong8 operator >>(const vllong8& a, const vllong8& n) { return _mm512_srav_epi64(a, n); }
+  __forceinline vllong8 operator <<(const vllong8& a, const vllong8& n) { return _mm512_sllv_epi64(a.m512i(), n.m512i()); }
+  __forceinline vllong8 operator >>(const vllong8& a, const vllong8& n) { return _mm512_srav_epi64(a.m512i(), n.m512i()); }
 
-  __forceinline vllong8 sll (const vllong8& a, long long b) { return _mm512_slli_epi64(a, b); }
-  __forceinline vllong8 sra (const vllong8& a, long long b) { return _mm512_srai_epi64(a, b); }
-  __forceinline vllong8 srl (const vllong8& a, long long b) { return _mm512_srli_epi64(a, b); }
+  __forceinline vllong8 sll (const vllong8& a, long long b) { return _mm512_slli_epi64(a.m512i(), b); }
+  __forceinline vllong8 sra (const vllong8& a, long long b) { return _mm512_srai_epi64(a.m512i(), b); }
+  __forceinline vllong8 srl (const vllong8& a, long long b) { return _mm512_srli_epi64(a.m512i(), b); }
 
-  __forceinline vllong8 min(const vllong8& a, const vllong8& b) { return _mm512_min_epi64(a, b); }
+  __forceinline vllong8 min(const vllong8& a, const vllong8& b) { return _mm512_min_epi64(a.m512i(), b.m512i()); }
   __forceinline vllong8 min(const vllong8& a, long long b) { return min(a,vllong8(b)); }
   __forceinline vllong8 min(long long a, const vllong8& b) { return min(vllong8(a),b); }
 
-  __forceinline vllong8 max(const vllong8& a, const vllong8& b) { return _mm512_max_epi64(a, b); }
+  __forceinline vllong8 max(const vllong8& a, const vllong8& b) { return _mm512_max_epi64(a.m512i(), b.m512i()); }
   __forceinline vllong8 max(const vllong8& a, long long b) { return max(a,vllong8(b)); }
   __forceinline vllong8 max(long long a, const vllong8& b) { return max(vllong8(a),b); }
 
-  __forceinline vllong8 mask_add(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_add_epi64(c,m,a,b); }
-  __forceinline vllong8 mask_sub(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_sub_epi64(c,m,a,b); }
+  __forceinline vllong8 mask_add(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_add_epi64(c.m512i(),m.packedMask8(),a.m512i(),b.m512i()); }
+  __forceinline vllong8 mask_sub(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_sub_epi64(c.m512i(),m.packedMask8(),a.m512i(),b.m512i()); }
 
-  __forceinline vllong8 mask_and(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_and_epi64(c,m,a,b); }
-  __forceinline vllong8 mask_or (const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_or_epi64(c,m,a,b); }
+  __forceinline vllong8 mask_and(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_and_epi64(c.m512i(),m.packedMask8(),a.m512i(),b.m512i()); }
+  __forceinline vllong8 mask_or (const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_or_epi64(c.m512i(),m.packedMask8(),a.m512i(),b.m512i()); }
 
   ////////////////////////////////////////////////////////////////////////////////
   /// Assignment Operators
@@ -214,46 +213,46 @@
   /// Comparison Operators + Select
   ////////////////////////////////////////////////////////////////////////////////
 
-  __forceinline vboold8 operator ==(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold8 operator ==(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a.m512i(),b.m512i(),_MM_CMPINT_EQ); }
   __forceinline vboold8 operator ==(const vllong8& a, long long b) { return a == vllong8(b); }
   __forceinline vboold8 operator ==(long long a, const vllong8& b) { return vllong8(a) == b; }
 
-  __forceinline vboold8 operator !=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboold8 operator !=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a.m512i(),b.m512i(),_MM_CMPINT_NE); }
   __forceinline vboold8 operator !=(const vllong8& a, long long b) { return a != vllong8(b); }
   __forceinline vboold8 operator !=(long long a, const vllong8& b) { return vllong8(a) != b; }
 
-  __forceinline vboold8 operator < (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboold8 operator < (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a.m512i(),b.m512i(),_MM_CMPINT_LT); }
   __forceinline vboold8 operator < (const vllong8& a, long long b) { return a < vllong8(b); }
   __forceinline vboold8 operator < (long long a, const vllong8& b) { return vllong8(a) < b; }
 
-  __forceinline vboold8 operator >=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboold8 operator >=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a.m512i(),b.m512i(),_MM_CMPINT_GE); }
   __forceinline vboold8 operator >=(const vllong8& a, long long b) { return a >= vllong8(b); }
   __forceinline vboold8 operator >=(long long a, const vllong8& b) { return vllong8(a) >= b; }
 
-  __forceinline vboold8 operator > (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboold8 operator > (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a.m512i(),b.m512i(),_MM_CMPINT_GT); }
   __forceinline vboold8 operator > (const vllong8& a, long long b) { return a > vllong8(b); }
   __forceinline vboold8 operator > (long long a, const vllong8& b) { return vllong8(a) > b; }
 
-  __forceinline vboold8 operator <=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboold8 operator <=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a.m512i(),b.m512i(),_MM_CMPINT_LE); }
   __forceinline vboold8 operator <=(const vllong8& a, long long b) { return a <= vllong8(b); }
   __forceinline vboold8 operator <=(long long a, const vllong8& b) { return vllong8(a) <= b; }
 
-  __forceinline vboold8 eq(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); }
-  __forceinline vboold8 ne(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
-  __forceinline vboold8 lt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
-  __forceinline vboold8 ge(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
-  __forceinline vboold8 gt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
-  __forceinline vboold8 le(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboold8 eq(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a.m512i(),b.m512i(),_MM_CMPINT_EQ); }
+  __forceinline vboold8 ne(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a.m512i(),b.m512i(),_MM_CMPINT_NE); }
+  __forceinline vboold8 lt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a.m512i(),b.m512i(),_MM_CMPINT_LT); }
+  __forceinline vboold8 ge(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a.m512i(),b.m512i(),_MM_CMPINT_GE); }
+  __forceinline vboold8 gt(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a.m512i(),b.m512i(),_MM_CMPINT_GT); }
+  __forceinline vboold8 le(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a.m512i(),b.m512i(),_MM_CMPINT_LE); }
 
-  __forceinline vboold8 eq(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_EQ); }
-  __forceinline vboold8 ne(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_NE); }
-  __forceinline vboold8 lt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LT); }
-  __forceinline vboold8 ge(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GE); }
-  __forceinline vboold8 gt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GT); }
-  __forceinline vboold8 le(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LE); }
+  __forceinline vboold8 eq(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask.packedMask8(),a.m512i(),b.m512i(),_MM_CMPINT_EQ); }
+  __forceinline vboold8 ne(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask.packedMask8(),a.m512i(),b.m512i(),_MM_CMPINT_NE); }
+  __forceinline vboold8 lt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask.packedMask8(),a.m512i(),b.m512i(),_MM_CMPINT_LT); }
+  __forceinline vboold8 ge(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask.packedMask8(),a.m512i(),b.m512i(),_MM_CMPINT_GE); }
+  __forceinline vboold8 gt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask.packedMask8(),a.m512i(),b.m512i(),_MM_CMPINT_GT); }
+  __forceinline vboold8 le(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask.packedMask8(),a.m512i(),b.m512i(),_MM_CMPINT_LE); }
 
   __forceinline vllong8 select(const vboold8& m, const vllong8& t, const vllong8& f) {
-    return _mm512_mask_or_epi64(f,m,t,t);
+    return _mm512_mask_or_epi64(f.m512i(),m.packedMask8(),t.m512i(),t.m512i());
   }
 
   ////////////////////////////////////////////////////////////////////////////////
@@ -262,7 +261,7 @@ namespace embree
 
   template<int i0, int i1>
   __forceinline vllong8 shuffle(const vllong8& v) {
-    return _mm512_castpd_si512(_mm512_permute_pd(_mm512_castsi512_pd(v), (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0));
+    return _mm512_castpd_si512(_mm512_permute_pd(_mm512_castsi512_pd(v.m512i()), (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0));
   }
 
   template<int i>
@@ -272,12 +271,12 @@
   template<int i0, int i1, int i2, int i3>
   __forceinline vllong8 shuffle(const vllong8& v) {
-    return _mm512_permutex_epi64(v, _MM_SHUFFLE(i3, i2, i1, i0));
+    return _mm512_permutex_epi64(v.m512i(), _MM_SHUFFLE(i3, i2, i1, i0));
   }
 
   template<int i0, int i1>
   __forceinline vllong8 shuffle4(const vllong8& v) {
-    return _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2));
+    return _mm512_shuffle_i64x2(v.m512i(), v.m512i(), _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2));
   }
 
   template<int i>
@@ -287,11 +286,11 @@
   template<int i>
   __forceinline vllong8 align_shift_right(const vllong8& a, const vllong8& b) {
-    return _mm512_alignr_epi64(a, b, i);
+    return _mm512_alignr_epi64(a.m512i(), b.m512i(), i);
   };
 
   __forceinline long long toScalar(const vllong8& v) {
-    return _mm_cvtsi128_si64(_mm512_castsi512_si128(v));
+    return _mm_cvtsi128_si64(_mm512_castsi512_si128(v.m512i()));
   }
 
   ////////////////////////////////////////////////////////////////////////////////
@@ -329,7 +328,7 @@
   ////////////////////////////////////////////////////////////////////////////////
 
   __forceinline vllong8 permute(const vllong8& v, const vllong8& index) {
-    return _mm512_permutexvar_epi64(index,v);
+    return _mm512_permutexvar_epi64(index.m512i(),v.m512i());
   }
 
   __forceinline vllong8 reverse(const vllong8& a) {
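One detail from the vllong8 section above: select() is built from a masked OR rather than a dedicated blend. For lanes where the mask is set it writes t | t, which equals t; lanes where the mask is clear keep the pass-through source f, so the masked OR behaves exactly like a blend. A standalone check of that identity, assuming an AVX-512 capable host to actually run it:

    // select(m, t, f) via masked OR: active lanes get t | t == t,
    // inactive lanes keep the pass-through source f.
    #include <immintrin.h>
    #include <cstdio>

    int main() {
      __m512i t = _mm512_set1_epi64(1);
      __m512i f = _mm512_set1_epi64(2);
      __mmask8 m = 0x0F;                             // select t in the low 4 lanes
      __m512i r = _mm512_mask_or_epi64(f, m, t, t);  // same result as a blend of f/t
      long long out[8];
      _mm512_storeu_si512(out, r);
      for (int i = 0; i < 8; i++) printf("%lld ", out[i]);  // prints: 1 1 1 1 2 2 2 2
      return 0;
    }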
vint16& index) { - return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale); + return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask.packedMask16(),index.m512i(),ptr,scale); } template static __forceinline vuint16 gather(const vboolf16& mask, vuint16& dest, const unsigned int* ptr, const vint16& index) { - return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale); + return _mm512_mask_i32gather_epi32(dest.m512i(),mask.packedMask16(),index.m512i(),ptr,scale); } template static __forceinline void scatter(unsigned int* ptr, const vint16& index, const vuint16& v) { - _mm512_i32scatter_epi32((int*)ptr,index,v,scale); + _mm512_i32scatter_epi32((int*)ptr,index.m512i(),v.m512i(),scale); } template static __forceinline void scatter(const vboolf16& mask, unsigned int* ptr, const vint16& index, const vuint16& v) { - _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale); + _mm512_mask_i32scatter_epi32((int*)ptr,mask.packedMask16(),index.m512i(),v.m512i(),scale); } //////////////////////////////////////////////////////////////////////////////// @@ -173,62 +173,62 @@ namespace embree /// Unary Operators //////////////////////////////////////////////////////////////////////////////// - __forceinline vboolf16 asBool(const vuint16& a) { return _mm512_movepi32_mask(a); } + __forceinline vboolf16 asBool(const vuint16& a) { return _mm512_movepi32_mask(a.m512i()); } __forceinline vuint16 operator +(const vuint16& a) { return a; } - __forceinline vuint16 operator -(const vuint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } + __forceinline vuint16 operator -(const vuint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a.m512i()); } //////////////////////////////////////////////////////////////////////////////// /// Binary Operators //////////////////////////////////////////////////////////////////////////////// - __forceinline vuint16 operator +(const vuint16& a, const vuint16& b) { return _mm512_add_epi32(a, b); } + __forceinline vuint16 operator +(const vuint16& a, const vuint16& b) { return _mm512_add_epi32(a.m512i(), b.m512i()); } __forceinline vuint16 operator +(const vuint16& a, unsigned int b) { return a + vuint16(b); } __forceinline vuint16 operator +(unsigned int a, const vuint16& b) { return vuint16(a) + b; } - __forceinline vuint16 operator -(const vuint16& a, const vuint16& b) { return _mm512_sub_epi32(a, b); } + __forceinline vuint16 operator -(const vuint16& a, const vuint16& b) { return _mm512_sub_epi32(a.m512i(), b.m512i()); } __forceinline vuint16 operator -(const vuint16& a, unsigned int b) { return a - vuint16(b); } __forceinline vuint16 operator -(unsigned int a, const vuint16& b) { return vuint16(a) - b; } - __forceinline vuint16 operator *(const vuint16& a, const vuint16& b) { return _mm512_mul_epu32(a, b); } + __forceinline vuint16 operator *(const vuint16& a, const vuint16& b) { return _mm512_mul_epu32(a.m512i(), b.m512i()); } __forceinline vuint16 operator *(const vuint16& a, unsigned int b) { return a * vuint16(b); } __forceinline vuint16 operator *(unsigned int a, const vuint16& b) { return vuint16(a) * b; } - __forceinline vuint16 operator &(const vuint16& a, const vuint16& b) { return _mm512_and_epi32(a, b); } + __forceinline vuint16 operator &(const vuint16& a, const vuint16& b) { return _mm512_and_epi32(a.m512i(), b.m512i()); } __forceinline vuint16 operator &(const vuint16& a, unsigned int b) { return a & vuint16(b); } __forceinline vuint16 operator &(unsigned int a, const vuint16& b) { return vuint16(a) & b; } - 
__forceinline vuint16 operator |(const vuint16& a, const vuint16& b) { return _mm512_or_epi32(a, b); } + __forceinline vuint16 operator |(const vuint16& a, const vuint16& b) { return _mm512_or_epi32(a.m512i(), b.m512i()); } __forceinline vuint16 operator |(const vuint16& a, unsigned int b) { return a | vuint16(b); } __forceinline vuint16 operator |(unsigned int a, const vuint16& b) { return vuint16(a) | b; } - __forceinline vuint16 operator ^(const vuint16& a, const vuint16& b) { return _mm512_xor_epi32(a, b); } + __forceinline vuint16 operator ^(const vuint16& a, const vuint16& b) { return _mm512_xor_epi32(a.m512i(), b.m512i()); } __forceinline vuint16 operator ^(const vuint16& a, unsigned int b) { return a ^ vuint16(b); } __forceinline vuint16 operator ^(unsigned int a, const vuint16& b) { return vuint16(a) ^ b; } - __forceinline vuint16 operator <<(const vuint16& a, unsigned int n) { return _mm512_slli_epi32(a, n); } - __forceinline vuint16 operator >>(const vuint16& a, unsigned int n) { return _mm512_srli_epi32(a, n); } + __forceinline vuint16 operator <<(const vuint16& a, unsigned int n) { return _mm512_slli_epi32(a.m512i(), n); } + __forceinline vuint16 operator >>(const vuint16& a, unsigned int n) { return _mm512_srli_epi32(a.m512i(), n); } - __forceinline vuint16 operator <<(const vuint16& a, const vuint16& n) { return _mm512_sllv_epi32(a, n); } - __forceinline vuint16 operator >>(const vuint16& a, const vuint16& n) { return _mm512_srlv_epi32(a, n); } + __forceinline vuint16 operator <<(const vuint16& a, const vuint16& n) { return _mm512_sllv_epi32(a.m512i(), n.m512i()); } + __forceinline vuint16 operator >>(const vuint16& a, const vuint16& n) { return _mm512_srlv_epi32(a.m512i(), n.m512i()); } - __forceinline vuint16 sll (const vuint16& a, unsigned int b) { return _mm512_slli_epi32(a, b); } - __forceinline vuint16 sra (const vuint16& a, unsigned int b) { return _mm512_srai_epi32(a, b); } - __forceinline vuint16 srl (const vuint16& a, unsigned int b) { return _mm512_srli_epi32(a, b); } + __forceinline vuint16 sll (const vuint16& a, unsigned int b) { return _mm512_slli_epi32(a.m512i(), b); } + __forceinline vuint16 sra (const vuint16& a, unsigned int b) { return _mm512_srai_epi32(a.m512i(), b); } + __forceinline vuint16 srl (const vuint16& a, unsigned int b) { return _mm512_srli_epi32(a.m512i(), b); } - __forceinline vuint16 min(const vuint16& a, const vuint16& b) { return _mm512_min_epu32(a, b); } + __forceinline vuint16 min(const vuint16& a, const vuint16& b) { return _mm512_min_epu32(a.m512i(), b.m512i()); } __forceinline vuint16 min(const vuint16& a, unsigned int b) { return min(a,vuint16(b)); } __forceinline vuint16 min(unsigned int a, const vuint16& b) { return min(vuint16(a),b); } - __forceinline vuint16 max(const vuint16& a, const vuint16& b) { return _mm512_max_epu32(a, b); } + __forceinline vuint16 max(const vuint16& a, const vuint16& b) { return _mm512_max_epu32(a.m512i(), b.m512i()); } __forceinline vuint16 max(const vuint16& a, unsigned int b) { return max(a,vuint16(b)); } __forceinline vuint16 max(unsigned int a, const vuint16& b) { return max(vuint16(a),b); } - __forceinline vuint16 mask_add(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } - __forceinline vuint16 mask_sub(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); } + __forceinline vuint16 mask_add(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return 
_mm512_mask_add_epi32(c.m512i(),mask.packedMask16(),a.m512i(),b.m512i()); } + __forceinline vuint16 mask_sub(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_sub_epi32(c.m512i(),mask.packedMask16(),a.m512i(),b.m512i()); } - __forceinline vuint16 mask_and(const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } - __forceinline vuint16 mask_or (const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_or_epi32(c,m,a,b); } + __forceinline vuint16 mask_and(const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_and_epi32(c.m512i(),m.packedMask16(),a.m512i(),b.m512i()); } + __forceinline vuint16 mask_or (const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_or_epi32(c.m512i(),m.packedMask16(),a.m512i(),b.m512i()); } //////////////////////////////////////////////////////////////////////////////// /// Assignment Operators @@ -257,47 +257,47 @@ namespace embree /// Comparison Operators + Select //////////////////////////////////////////////////////////////////////////////// - __forceinline vboolf16 operator ==(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } + __forceinline vboolf16 operator ==(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a.m512i(),b.m512i(),_MM_CMPINT_EQ); } __forceinline vboolf16 operator ==(const vuint16& a, unsigned int b) { return a == vuint16(b); } __forceinline vboolf16 operator ==(unsigned int a, const vuint16& b) { return vuint16(a) == b; } - __forceinline vboolf16 operator !=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf16 operator !=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a.m512i(),b.m512i(),_MM_CMPINT_NE); } __forceinline vboolf16 operator !=(const vuint16& a, unsigned int b) { return a != vuint16(b); } __forceinline vboolf16 operator !=(unsigned int a, const vuint16& b) { return vuint16(a) != b; } - __forceinline vboolf16 operator < (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } + __forceinline vboolf16 operator < (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a.m512i(),b.m512i(),_MM_CMPINT_LT); } __forceinline vboolf16 operator < (const vuint16& a, unsigned int b) { return a < vuint16(b); } __forceinline vboolf16 operator < (unsigned int a, const vuint16& b) { return vuint16(a) < b; } - __forceinline vboolf16 operator >=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } + __forceinline vboolf16 operator >=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a.m512i(),b.m512i(),_MM_CMPINT_GE); } __forceinline vboolf16 operator >=(const vuint16& a, unsigned int b) { return a >= vuint16(b); } __forceinline vboolf16 operator >=(unsigned int a, const vuint16& b) { return vuint16(a) >= b; } - __forceinline vboolf16 operator > (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } + __forceinline vboolf16 operator > (const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a.m512i(),b.m512i(),_MM_CMPINT_GT); } __forceinline vboolf16 operator > (const vuint16& a, unsigned int b) { return a > vuint16(b); } __forceinline vboolf16 operator > (unsigned int a, const vuint16& b) { return vuint16(a) > b; } - __forceinline vboolf16 operator <=(const vuint16& a, const vuint16& b) { 
return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 operator <=(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a.m512i(),b.m512i(),_MM_CMPINT_LE); } __forceinline vboolf16 operator <=(const vuint16& a, unsigned int b) { return a <= vuint16(b); } __forceinline vboolf16 operator <=(unsigned int a, const vuint16& b) { return vuint16(a) <= b; } - __forceinline vboolf16 eq(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 ne(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 lt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 ge(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 gt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 le(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } - - __forceinline vboolf16 eq(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_EQ); } - __forceinline vboolf16 ne(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_NE); } - __forceinline vboolf16 lt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LT); } - __forceinline vboolf16 ge(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GE); } - __forceinline vboolf16 gt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); } - __forceinline vboolf16 le(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); } + __forceinline vboolf16 eq(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a.m512i(),b.m512i(),_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a.m512i(),b.m512i(),_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a.m512i(),b.m512i(),_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a.m512i(),b.m512i(),_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a.m512i(),b.m512i(),_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vuint16& a, const vuint16& b) { return _mm512_cmp_epu32_mask(a.m512i(),b.m512i(),_MM_CMPINT_LE); } + + __forceinline vboolf16 eq(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask.packedMask16(),a.m512i(),b.m512i(),_MM_CMPINT_EQ); } + __forceinline vboolf16 ne(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask.packedMask16(),a.m512i(),b.m512i(),_MM_CMPINT_NE); } + __forceinline vboolf16 lt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask.packedMask16(),a.m512i(),b.m512i(),_MM_CMPINT_LT); } + __forceinline vboolf16 ge(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask.packedMask16(),a.m512i(),b.m512i(),_MM_CMPINT_GE); } + __forceinline vboolf16 gt(const vboolf16 mask, const 
vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask.packedMask16(),a.m512i(),b.m512i(),_MM_CMPINT_GT); } + __forceinline vboolf16 le(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmp_epu32_mask(mask.packedMask16(),a.m512i(),b.m512i(),_MM_CMPINT_LE); } __forceinline vuint16 select(const vboolf16& m, const vuint16& t, const vuint16& f) { - return _mm512_mask_or_epi32(f,m,t,t); + return _mm512_mask_or_epi32(f.m512i(),m.packedMask16(),t.m512i(),t.m512i()); } //////////////////////////////////////////////////////////////////////////////// @@ -306,31 +306,31 @@ namespace embree template<int i> __forceinline vuint16 shuffle(const vuint16& v) { - return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i))); + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v.m512i()), _MM_SHUFFLE(i, i, i, i))); } template<int i0, int i1, int i2, int i3> __forceinline vuint16 shuffle(const vuint16& v) { - return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v.m512i()), _MM_SHUFFLE(i3, i2, i1, i0))); } template<int i> __forceinline vuint16 shuffle4(const vuint16& v) { - return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v) ,_MM_SHUFFLE(i, i, i, i))); + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v.m512i()), _mm512_castsi512_ps(v.m512i()) ,_MM_SHUFFLE(i, i, i, i))); } template<int i0, int i1, int i2, int i3> __forceinline vuint16 shuffle4(const vuint16& v) { - return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v.m512i()), _mm512_castsi512_ps(v.m512i()), _MM_SHUFFLE(i3, i2, i1, i0))); } template<int i> __forceinline vuint16 align_shift_right(const vuint16& a, const vuint16& b) { - return _mm512_alignr_epi32(a, b, i); + return _mm512_alignr_epi32(a.m512i(), b.m512i(), i); }; __forceinline unsigned int toScalar(const vuint16& v) { - return _mm_cvtsi128_si32(_mm512_castsi512_si128(v)); + return _mm_cvtsi128_si32(_mm512_castsi512_si128(v.m512i())); } //////////////////////////////////////////////////////////////////////////////// @@ -373,7 +373,7 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// __forceinline vuint16 permute(vuint16 v, vuint16 index) { - return _mm512_permutexvar_epi32(index,v); + return _mm512_permutexvar_epi32(index.m512i(),v.m512i()); } __forceinline vuint16 reverse(const vuint16& a) { diff --git a/common/simd/vuint4_sse2.h b/common/simd/vuint4_sse2.h index c2e86c6633..67f3578e46 100644 --- a/common/simd/vuint4_sse2.h +++ b/common/simd/vuint4_sse2.h @@ -37,8 +37,8 @@ namespace embree __forceinline vuint4& operator =(const vuint4& a) { v = a.v; return *this; } __forceinline vuint(const __m128i a) : v(a) {} - __forceinline operator const __m128i&() const { return v; } - __forceinline operator __m128i&() { return v; } + __forceinline const __m128i& m128i() const { return v; } + __forceinline __m128i& m128i() { return v; } __forceinline vuint(unsigned int a) : v(_mm_set1_epi32(a)) {} @@ -49,9 +49,9 @@ namespace embree #endif #if defined(__AVX512VL__) - __forceinline explicit vuint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} + __forceinline explicit vuint(const vboolf4& a) : v(_mm_movm_epi32(a.packedMask8())) {} #else - __forceinline explicit vuint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} + __forceinline
explicit vuint(const vboolf4& a) : v(_mm_castps_si128(a.m128())) {} #endif //////////////////////////////////////////////////////////////////////////////// @@ -72,30 +72,30 @@ namespace embree static __forceinline vuint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } static __forceinline vuint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } - static __forceinline void store (void* ptr, const vuint4& v) { _mm_store_si128((__m128i*)ptr,v); } - static __forceinline void storeu(void* ptr, const vuint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } + static __forceinline void store (void* ptr, const vuint4& v) { _mm_store_si128((__m128i*)ptr,v.m128i()); } + static __forceinline void storeu(void* ptr, const vuint4& v) { _mm_storeu_si128((__m128i*)ptr,v.m128i()); } #if defined(__AVX512VL__) - static __forceinline vuint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); } - static __forceinline vuint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); } + static __forceinline vuint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask.packedMask8(),ptr); } + static __forceinline vuint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask.packedMask8(),ptr); } - static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_store_epi32 (ptr,mask,v); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); } + static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_store_epi32 (ptr,mask.packedMask8(),v.m128i()); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_storeu_epi32(ptr,mask.packedMask8(),v.m128i()); } #elif defined(__AVX__) - static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } - static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } + static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask.mask32())); } + static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask.mask32())); } - static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } - static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } + static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask.mask32(),_mm_castsi128_ps(i.m128i())); } + static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask.mask32(),_mm_castsi128_ps(i.m128i())); } #else - static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); } - static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); } + static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 
((__m128i*)a),mask.mask32()); } + static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask.mask32()); } static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { store (ptr,select(mask,i,load (ptr))); } static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } #endif -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) static __forceinline vuint4 load(const unsigned char* ptr) { return _mm_load4epu8_epi32(((__m128i*)ptr)); } @@ -114,7 +114,7 @@ namespace embree #endif static __forceinline vuint4 load(const unsigned short* ptr) { -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) return _mm_load4epu16_epi32(((__m128i*)ptr)); #elif defined (__SSE4_1__) return _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); @@ -124,7 +124,7 @@ } static __forceinline vuint4 load_nt(void* ptr) { -#if (defined(__aarch64__)) || defined(__SSE4_1__) +#if (defined(__aarch64__)) || defined(_M_ARM64) || defined(__SSE4_1__) return _mm_stream_load_si128((__m128i*)ptr); #else return _mm_load_si128((__m128i*)ptr); @@ -132,17 +132,17 @@ } static __forceinline void store_nt(void* ptr, const vuint4& v) { -#if !defined(__aarch64__) && defined(__SSE4_1__) - _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); +#if !defined(__aarch64__) && !defined(_M_ARM64) && defined(__SSE4_1__) + _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v.m128i())); #else - _mm_store_si128((__m128i*)ptr,v); + _mm_store_si128((__m128i*)ptr,v.m128i()); #endif } template<int scale> static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) { -#if defined(__AVX2__) && !defined(__aarch64__) - return _mm_i32gather_epi32((const int*)ptr, index, scale); +#if defined(__AVX2__) && !defined(__aarch64__) && !defined(_M_ARM64) + return _mm_i32gather_epi32((const int*)ptr, index.m128i(), scale); #else return vuint4( *(unsigned int*)(((char*)ptr)+scale*index[0]), @@ -156,9 +156,9 @@ static __forceinline vuint4 gather(const vboolf4& mask, const unsigned int* ptr, const vint4& index) { vuint4 r = zero; #if defined(__AVX512VL__) - return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale); -#elif defined(__AVX2__) && !defined(__aarch64__) - return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale); + return _mm_mmask_i32gather_epi32(r.m128i(), mask.packedMask8(), index.m128i(), ptr, scale); +#elif defined(__AVX2__) && !defined(__aarch64__) && !defined(_M_ARM64) + return _mm_mask_i32gather_epi32(r.m128i(), (const int*)ptr, index.m128i(), mask.mask32(), scale); #else if (likely(mask[0])) r[0] = *(unsigned int*)(((char*)ptr)+scale*index[0]); if (likely(mask[1])) r[1] = *(unsigned int*)(((char*)ptr)+scale*index[1]); @@ -177,11 +177,11 @@ friend __forceinline vuint4 select(const vboolf4& m, const vuint4& t, const vuint4& f) { #if defined(__AVX512VL__) - return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t); + return _mm_mask_blend_epi32(m.packedMask8(), (__m128i)f.m128i(), (__m128i)t.m128i()); #elif defined(__SSE4_1__) - return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f.m128i()), _mm_castsi128_ps(t.m128i()), m.m128())); #else - return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); + return _mm_or_si128(_mm_and_si128(m.mask32(), t.m128i()), _mm_andnot_si128(m.mask32(), f.m128i()));
#endif } }; @@ -191,23 +191,23 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// #if defined(__AVX512VL__) - __forceinline vboolf4 asBool(const vuint4& a) { return _mm_movepi32_mask(a); } + __forceinline vboolf4 asBool(const vuint4& a) { return _mm_movepi32_mask(a.m128i()); } #else - __forceinline vboolf4 asBool(const vuint4& a) { return _mm_castsi128_ps(a); } + __forceinline vboolf4 asBool(const vuint4& a) { return _mm_castsi128_ps(a.m128i()); } #endif __forceinline vuint4 operator +(const vuint4& a) { return a; } - __forceinline vuint4 operator -(const vuint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } + __forceinline vuint4 operator -(const vuint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128i()); } //////////////////////////////////////////////////////////////////////////////// /// Binary Operators //////////////////////////////////////////////////////////////////////////////// - __forceinline vuint4 operator +(const vuint4& a, const vuint4& b) { return _mm_add_epi32(a, b); } + __forceinline vuint4 operator +(const vuint4& a, const vuint4& b) { return _mm_add_epi32(a.m128i(), b.m128i()); } __forceinline vuint4 operator +(const vuint4& a, unsigned int b) { return a + vuint4(b); } __forceinline vuint4 operator +(unsigned int a, const vuint4& b) { return vuint4(a) + b; } - __forceinline vuint4 operator -(const vuint4& a, const vuint4& b) { return _mm_sub_epi32(a, b); } + __forceinline vuint4 operator -(const vuint4& a, const vuint4& b) { return _mm_sub_epi32(a.m128i(), b.m128i()); } __forceinline vuint4 operator -(const vuint4& a, unsigned int b) { return a - vuint4(b); } __forceinline vuint4 operator -(unsigned int a, const vuint4& b) { return vuint4(a) - b; } @@ -219,24 +219,33 @@ namespace embree // __forceinline vuint4 operator *(const vuint4& a, unsigned int b) { return a * vuint4(b); } // __forceinline vuint4 operator *(unsigned int a, const vuint4& b) { return vuint4(a) * b; } - __forceinline vuint4 operator &(const vuint4& a, const vuint4& b) { return _mm_and_si128(a, b); } + __forceinline vuint4 operator &(const vuint4& a, const vuint4& b) { return _mm_and_si128(a.m128i(), b.m128i()); } __forceinline vuint4 operator &(const vuint4& a, unsigned int b) { return a & vuint4(b); } __forceinline vuint4 operator &(unsigned int a, const vuint4& b) { return vuint4(a) & b; } - __forceinline vuint4 operator |(const vuint4& a, const vuint4& b) { return _mm_or_si128(a, b); } + __forceinline vuint4 operator |(const vuint4& a, const vuint4& b) { return _mm_or_si128(a.m128i(), b.m128i()); } __forceinline vuint4 operator |(const vuint4& a, unsigned int b) { return a | vuint4(b); } __forceinline vuint4 operator |(unsigned int a, const vuint4& b) { return vuint4(a) | b; } - __forceinline vuint4 operator ^(const vuint4& a, const vuint4& b) { return _mm_xor_si128(a, b); } - __forceinline vuint4 operator ^(const vuint4& a, unsigned int b) { return a ^ vuint4(b); } + __forceinline vuint4 operator ^(const vuint4& a, const vuint4& b) { return _mm_xor_si128(a.m128i(), b.m128i()); } + __forceinline vuint4 operator ^(const vuint4& a, unsigned int b) { return a ^ vuint4(b); } __forceinline vuint4 operator ^(unsigned int a, const vuint4& b) { return vuint4(a) ^ b; } - __forceinline vuint4 operator <<(const vuint4& a, unsigned int n) { return _mm_slli_epi32(a, n); } - __forceinline vuint4 operator >>(const vuint4& a, unsigned int n) { return _mm_srli_epi32(a, n); } +#if !defined(_M_ARM64) + __forceinline vuint4 operator <<(const
vuint4& a, unsigned int n) { return _mm_slli_epi32(a.m128i(), n); } + __forceinline vuint4 operator >>(const vuint4& a, unsigned int n) { return _mm_srli_epi32(a.m128i(), n); } - __forceinline vuint4 sll (const vuint4& a, unsigned int b) { return _mm_slli_epi32(a, b); } - __forceinline vuint4 sra (const vuint4& a, unsigned int b) { return _mm_srai_epi32(a, b); } - __forceinline vuint4 srl (const vuint4& a, unsigned int b) { return _mm_srli_epi32(a, b); } + __forceinline vuint4 sll (const vuint4& a, unsigned int b) { return _mm_slli_epi32(a.m128i(), b); } + __forceinline vuint4 sra (const vuint4& a, unsigned int b) { return _mm_srai_epi32(a.m128i(), b); } + __forceinline vuint4 srl (const vuint4& a, unsigned int b) { return _mm_srli_epi32(a.m128i(), b); } +#else + __forceinline vuint4 operator <<(const vuint4& a, unsigned int n) { return _mm_slli_epi32(a.m128i(), (int)n); } + __forceinline vuint4 operator >>(const vuint4& a, unsigned int n) { return _mm_srli_epi32(a.m128i(), (int)n); } + + __forceinline vuint4 sll (const vuint4& a, unsigned int b) { return _mm_slli_epi32(a.m128i(), (int)b); } + __forceinline vuint4 sra (const vuint4& a, unsigned int b) { return _mm_srai_epi32(a.m128i(), (int)b); } + __forceinline vuint4 srl (const vuint4& a, unsigned int b) { return _mm_srli_epi32(a.m128i(), (int)b); } +#endif //////////////////////////////////////////////////////////////////////////////// /// Assignment Operators @@ -267,14 +276,14 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// #if defined(__AVX512VL__) - __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } + __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a.m128i(),b.m128i(),_MM_CMPINT_EQ); } + __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a.m128i(),b.m128i(),_MM_CMPINT_NE); } //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } #else - __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } + __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a.m128i(), b.m128i())); } __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return !(a == b); } //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmplt_epu32(a, b)); } //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return !(a < b); } @@ -308,8 +317,8 @@ namespace embree //__forceinline vboolf4 le(const vuint4& a, const vuint4& b) { return a <= b; } #if defined(__AVX512VL__) - __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); } - __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, 
a, b, _MM_CMPINT_NE); } + __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask.packedMask8(), a.m128i(), b.m128i(), _MM_CMPINT_EQ); } + __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask.packedMask8(), a.m128i(), b.m128i(), _MM_CMPINT_NE); } //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); } //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); } //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); } @@ -326,7 +335,7 @@ namespace embree template<int mask> __forceinline vuint4 select(const vuint4& t, const vuint4& f) { #if defined(__SSE4_1__) - return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); + return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f.m128i()), _mm_castsi128_ps(t.m128i()), mask)); #else return select(vboolf4(mask), t, f); #endif @@ -350,33 +359,44 @@ namespace embree // Movement/Shifting/Shuffling Functions //////////////////////////////////////////////////////////////////////////////// - __forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } - __forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } + __forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a.m128i()), _mm_castsi128_ps(b.m128i()))); } + __forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a.m128i()), _mm_castsi128_ps(b.m128i()))); } -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) template<int i0, int i1, int i2, int i3> __forceinline vuint4 shuffle(const vuint4& v) { +#if !defined(_M_ARM64) return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); +#else + uint8x16_t _shuffle = _MN_SHUFFLE(i0, i1, i2, i3); + return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _shuffle)); +#endif } template<int i0, int i1, int i2, int i3> __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { +#if !defined(_M_ARM64) return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); +#else + uint8x16x2_t _ab = {(uint8x16_t)a.v, (uint8x16_t)b.v}; + uint8x16_t _shuffle = _MF_SHUFFLE(i0, i1, i2, i3); + return vreinterpretq_s32_u8(vqtbl2q_u8( _ab, _shuffle)); +#endif } #else template<int i0, int i1, int i2, int i3> __forceinline vuint4 shuffle(const vuint4& v) { - return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); + return _mm_shuffle_epi32(v.m128i(), _MM_SHUFFLE(i3, i2, i1, i0)); } template<int i0, int i1, int i2, int i3> __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { - return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a.m128i()), _mm_castsi128_ps(b.m128i()), _MM_SHUFFLE(i3, i2, i1, i0))); } #endif #if defined(__SSE3__) - template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } - template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } - template<> __forceinline vuint4 shuffle<0, 1, 0, 1>(const vuint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); }
+ template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v.m128i()))); } + template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v.m128i()))); } + template<> __forceinline vuint4 shuffle<0, 1, 0, 1>(const vuint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v.m128i()))); } #endif template<int i> @@ -384,17 +404,17 @@ namespace embree return shuffle<i,i,i,i>(v); } -#if defined(__SSE4_1__) && !defined(__aarch64__) - template<int src> __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); } - template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); } +#if defined(__SSE4_1__) && !defined(__aarch64__) && !defined(_M_ARM64) + template<int src> __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b.m128i(), src); } + template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a.m128i(), b, dst); } #else template<int src> __forceinline unsigned int extract(const vuint4& b) { return b[src&3]; } template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { vuint4 c = a; c[dst&3] = b; return c; } #endif - template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); } + template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b.m128i()); } - __forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); } + __forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v.m128i()); } //////////////////////////////////////////////////////////////////////////////// /// Reductions diff --git a/common/simd/vuint8_avx.h b/common/simd/vuint8_avx.h index cb8b5158c1..cf462b5124 100644 --- a/common/simd/vuint8_avx.h +++ b/common/simd/vuint8_avx.h @@ -39,11 +39,11 @@ namespace embree __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; } __forceinline vuint(__m256i a) : v(a) {} - __forceinline operator const __m256i&() const { return v; } - __forceinline operator __m256i&() { return v; } + __forceinline const __m256i& m256i() const { return v; } + __forceinline __m256i& m256i() { return v; } - __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} - __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a.m128i()),a.m128i(),1)) {} + __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a.m128i()),b.m128i(),1)) {} __forceinline vuint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {} __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} @@ -71,17 +71,17 @@ namespace embree static __forceinline vuint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); } static __forceinline vuint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); } - static __forceinline vuint8 load (const vboolf8& mask, const void* a) { return
_mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } - static __forceinline vuint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } + static __forceinline vuint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask.mask32())); } + static __forceinline vuint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask.mask32())); } - static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); } - static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); } + static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f.m256i())); } + static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f.m256i())); } - static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.v),_mm256_castsi256_ps(f)); } + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.m256()),_mm256_castsi256_ps(f.m256i())); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,_mm256_castps_si256(mask.m256()),_mm256_castsi256_ps(f.m256i())); } static __forceinline void store_nt(void* ptr, const vuint8& v) { - _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v.m256i())); } static __forceinline vuint8 load(const unsigned char* ptr) { @@ -188,7 +188,7 @@ namespace embree /// Unary Operators //////////////////////////////////////////////////////////////////////////////// - __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } + __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a.m256i()); } __forceinline vuint8 operator +(const vuint8& a) { return a; } @@ -208,15 +208,15 @@ namespace embree //__forceinline vuint8 operator *(const vuint8& a, unsigned int b) { return a * vuint8(b); } //__forceinline vuint8 operator *(unsigned int a, const vuint8& b) { return vuint8(a) * b; } - __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a.m256i()), _mm256_castsi256_ps(b.m256i()))); } __forceinline vuint8 operator &(const vuint8& a, unsigned int b) { return a & vuint8(b); } __forceinline vuint8 operator &(unsigned int a, const vuint8& b) { return vuint8(a) & b; } - __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a.m256i()), _mm256_castsi256_ps(b.m256i()))); } __forceinline vuint8 operator |(const vuint8& a, unsigned int b) { return a | vuint8(b); } __forceinline vuint8 operator 
|(unsigned int a, const vuint8& b) { return vuint8(a) | b; } - __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a.m256i()), _mm256_castsi256_ps(b.m256i()))); } __forceinline vuint8 operator ^(const vuint8& a, unsigned int b) { return a ^ vuint8(b); } __forceinline vuint8 operator ^(unsigned int a, const vuint8& b) { return vuint8(a) ^ b; } @@ -295,7 +295,7 @@ namespace embree __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); } __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { - return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f.m256i()), _mm256_castsi256_ps(t.m256i()), m.m256())); } @@ -303,43 +303,43 @@ namespace embree /// Movement/Shifting/Shuffling Functions //////////////////////////////////////////////////////////////////////////////// - __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } - __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } + __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a.m256i()), _mm256_castsi256_ps(b.m256i()))); } + __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a.m256i()), _mm256_castsi256_ps(b.m256i()))); } template<int i> __forceinline vuint8 shuffle(const vuint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v.m256i()), _MM_SHUFFLE(i, i, i, i))); } template<int i0, int i1> __forceinline vuint8 shuffle4(const vuint8& v) { - return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + return _mm256_permute2f128_si256(v.m256i(), v.m256i(), (i1 << 4) | (i0 << 0)); } template<int i0, int i1> __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) { - return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + return _mm256_permute2f128_si256(a.m256i(), b.m256i(), (i1 << 4) | (i0 << 0)); } template<int i0, int i1, int i2, int i3> __forceinline vuint8 shuffle(const vuint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v.m256i()), _MM_SHUFFLE(i3, i2, i1, i0))); } template<int i0, int i1, int i2, int i3> __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) { - return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a.m256i()), _mm256_castsi256_ps(b.m256i()), _MM_SHUFFLE(i3, i2, i1, i0))); } - template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); }
+ template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v.m256i()))); } + template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v.m256i()))); } + template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v.m256i()))))); } - template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); } - template<int i> __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); } - template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); } + template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a.m256i(), b.m128i(), i); } + template<int i> __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a.m256i(), i); } + template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a.m256i()); } - __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v.m256i())); } //////////////////////////////////////////////////////////////////////////////// diff --git a/common/simd/vuint8_avx2.h b/common/simd/vuint8_avx2.h index 959143724b..fb6ec62ae8 100644 --- a/common/simd/vuint8_avx2.h +++ b/common/simd/vuint8_avx2.h @@ -38,11 +38,11 @@ namespace embree __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; } __forceinline vuint(__m256i a) : v(a) {} - __forceinline operator const __m256i&() const { return v; } - __forceinline operator __m256i&() { return v; } + __forceinline const __m256i& m256i() const { return v; } + __forceinline __m256i& m256i() { return v; } - __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} - __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} + __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a.m128i()),a.m128i(),1)) {} + __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a.m128i()),b.m128i(),1)) {} __forceinline vuint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} @@ -54,9 +54,9 @@ namespace embree __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} #if defined(__AVX512VL__) - __forceinline explicit vuint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {} + __forceinline explicit vuint(const vboolf8& a) : v(_mm256_movm_epi32(a.packedMask8())) {} #else - __forceinline explicit vuint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {} + __forceinline explicit vuint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a.m256())) {} #endif //////////////////////////////////////////////////////////////////////////////// @@ -82,29 +82,29 @@ namespace embree static __forceinline vuint8 load(const void* ptr) { return
_mm256_load_si256((__m256i*)ptr); } static __forceinline vuint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); } - static __forceinline void store (void* ptr, const vuint8& v) { _mm256_store_si256((__m256i*)ptr,v); } - static __forceinline void storeu(void* ptr, const vuint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); } + static __forceinline void store (void* ptr, const vuint8& v) { _mm256_store_si256((__m256i*)ptr,v.m256i()); } + static __forceinline void storeu(void* ptr, const vuint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v.m256i())); } #if defined(__AVX512VL__) static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &v) { - return _mm256_mask_compress_epi32(v, mask, v); + return _mm256_mask_compress_epi32(v.m256i(), mask.packedMask8(), v.m256i()); } static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &a, const vuint8& b) { - return _mm256_mask_compress_epi32(a, mask, b); + return _mm256_mask_compress_epi32(a.m256i(), mask.packedMask8(), b.m256i()); } - static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); } - static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); } + static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask.packedMask8(),ptr); } + static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask.packedMask8(),ptr); } - static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); } + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_store_epi32 (ptr,mask.packedMask8(),v.m256i()); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_storeu_epi32(ptr,mask.packedMask8(),v.m256i()); } #else - static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } - static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } + static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask.mask32())); } + static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask.mask32())); } - static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } - static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } + static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask.mask32(),v.m256i()); } + static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask.mask32(),v.m256i()); } #endif static __forceinline vuint8 load_nt(void* ptr) { @@ -112,7 +112,7 @@ namespace embree } static __forceinline void store_nt(void* ptr, const vuint8& v) { - 
_mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v)); + _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v.m256i())); } static __forceinline void store(unsigned char* ptr, const vuint8& i) @@ -128,16 +128,16 @@ namespace embree template<int scale> static __forceinline vuint8 gather(const unsigned int *const ptr, const vint8& index) { - return _mm256_i32gather_epi32((const int*) ptr, index, scale); + return _mm256_i32gather_epi32((const int*) ptr, index.m256i(), scale); } template<int scale> static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int *const ptr, const vint8& index) { vuint8 r = zero; #if defined(__AVX512VL__) - return _mm256_mmask_i32gather_epi32(r, mask, index, (const int*) ptr, scale); + return _mm256_mmask_i32gather_epi32(r.m256i(), mask.packedMask8(), index.m256i(), (const int*) ptr, scale); #else - return _mm256_mask_i32gather_epi32(r, (const int*) ptr, index, mask, scale); + return _mm256_mask_i32gather_epi32(r.m256i(), (const int*) ptr, index.m256i(), mask.mask32(), scale); #endif } @@ -145,7 +145,7 @@ namespace embree static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v) { #if defined(__AVX512VL__) - _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale); + _mm256_i32scatter_epi32((int*)ptr, ofs.m256i(), v.m256i(), scale); #else *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; @@ -162,7 +162,7 @@ namespace embree static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v) { #if defined(__AVX512VL__) - _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale); + _mm256_mask_i32scatter_epi32((int*)ptr, mask.packedMask8(), ofs.m256i(), v.m256i(), scale); #else if (likely(mask[0])) *(unsigned int*)(((char*)ptr)+scale*ofs[0]) = v[0]; if (likely(mask[1])) *(unsigned int*)(((char*)ptr)+scale*ofs[1]) = v[1]; @@ -190,9 +190,9 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// #if defined(__AVX512VL__) - __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_movepi32_mask(a); } + __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_movepi32_mask(a.m256i()); } #else - __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } + __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a.m256i()); } #endif __forceinline vuint8 operator +(const vuint8& a) { return a; } @@ -201,11 +201,11 @@ namespace embree /// Binary Operators //////////////////////////////////////////////////////////////////////////////// - __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return _mm256_add_epi32(a, b); } + __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return _mm256_add_epi32(a.m256i(), b.m256i()); } __forceinline vuint8 operator +(const vuint8& a, unsigned int b) { return a + vuint8(b); } __forceinline vuint8 operator +(unsigned int a, const vuint8& b) { return vuint8(a) + b; } - __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return _mm256_sub_epi32(a, b); } + __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return _mm256_sub_epi32(a.m256i(), b.m256i()); } __forceinline vuint8 operator -(const vuint8& a, unsigned int b) { return a - vuint8(b); } __forceinline vuint8 operator -(unsigned int a, const vuint8& b) { return vuint8(a) - b; } @@ -213,37 +213,37 @@ namespace embree //__forceinline vuint8 operator *(const vuint8& a, unsigned int b) { return a * vuint8(b); } //__forceinline vuint8 operator
*(unsigned int a, const vuint8& b) { return vuint8(a) * b; } - __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_and_si256(a, b); } + __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_and_si256(a.m256i(), b.m256i()); } __forceinline vuint8 operator &(const vuint8& a, unsigned int b) { return a & vuint8(b); } __forceinline vuint8 operator &(unsigned int a, const vuint8& b) { return vuint8(a) & b; } - __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_or_si256(a, b); } + __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_or_si256(a.m256i(), b.m256i()); } __forceinline vuint8 operator |(const vuint8& a, unsigned int b) { return a | vuint8(b); } __forceinline vuint8 operator |(unsigned int a, const vuint8& b) { return vuint8(a) | b; } - __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_xor_si256(a, b); } + __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_xor_si256(a.m256i(), b.m256i()); } __forceinline vuint8 operator ^(const vuint8& a, unsigned int b) { return a ^ vuint8(b); } __forceinline vuint8 operator ^(unsigned int a, const vuint8& b) { return vuint8(a) ^ b; } - __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return _mm256_slli_epi32(a, n); } - __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return _mm256_srli_epi32(a, n); } + __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return _mm256_slli_epi32(a.m256i(), n); } + __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return _mm256_srli_epi32(a.m256i(), n); } - __forceinline vuint8 operator <<(const vuint8& a, const vuint8& n) { return _mm256_sllv_epi32(a, n); } - __forceinline vuint8 operator >>(const vuint8& a, const vuint8& n) { return _mm256_srlv_epi32(a, n); } + __forceinline vuint8 operator <<(const vuint8& a, const vuint8& n) { return _mm256_sllv_epi32(a.m256i(), n.m256i()); } + __forceinline vuint8 operator >>(const vuint8& a, const vuint8& n) { return _mm256_srlv_epi32(a.m256i(), n.m256i()); } - __forceinline vuint8 sll(const vuint8& a, unsigned int b) { return _mm256_slli_epi32(a, b); } - __forceinline vuint8 sra(const vuint8& a, unsigned int b) { return _mm256_srai_epi32(a, b); } - __forceinline vuint8 srl(const vuint8& a, unsigned int b) { return _mm256_srli_epi32(a, b); } + __forceinline vuint8 sll(const vuint8& a, unsigned int b) { return _mm256_slli_epi32(a.m256i(), b); } + __forceinline vuint8 sra(const vuint8& a, unsigned int b) { return _mm256_srai_epi32(a.m256i(), b); } + __forceinline vuint8 srl(const vuint8& a, unsigned int b) { return _mm256_srli_epi32(a.m256i(), b); } - __forceinline vuint8 sll(const vuint8& a, const vuint8& b) { return _mm256_sllv_epi32(a, b); } - __forceinline vuint8 sra(const vuint8& a, const vuint8& b) { return _mm256_srav_epi32(a, b); } - __forceinline vuint8 srl(const vuint8& a, const vuint8& b) { return _mm256_srlv_epi32(a, b); } + __forceinline vuint8 sll(const vuint8& a, const vuint8& b) { return _mm256_sllv_epi32(a.m256i(), b.m256i()); } + __forceinline vuint8 sra(const vuint8& a, const vuint8& b) { return _mm256_srav_epi32(a.m256i(), b.m256i()); } + __forceinline vuint8 srl(const vuint8& a, const vuint8& b) { return _mm256_srlv_epi32(a.m256i(), b.m256i()); } - __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return _mm256_min_epu32(a, b); } + __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return 
_mm256_min_epu32(a.m256i(), b.m256i()); } __forceinline vuint8 min(const vuint8& a, unsigned int b) { return min(a,vuint8(b)); } __forceinline vuint8 min(unsigned int a, const vuint8& b) { return min(vuint8(a),b); } - __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return _mm256_max_epu32(a, b); } + __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return _mm256_max_epu32(a.m256i(), b.m256i()); } __forceinline vuint8 max(const vuint8& a, unsigned int b) { return max(a,vuint8(b)); } __forceinline vuint8 max(unsigned int a, const vuint8& b) { return max(vuint8(a),b); } @@ -274,18 +274,18 @@ namespace embree //////////////////////////////////////////////////////////////////////////////// #if defined(__AVX512VL__) - __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } - __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_NE); } - __forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LT); } - __forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GE); } - __forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } - __forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } + __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a.m256i(),b.m256i(),_MM_CMPINT_EQ); } + __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a.m256i(),b.m256i(),_MM_CMPINT_NE); } + __forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a.m256i(),b.m256i(),_MM_CMPINT_LT); } + __forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a.m256i(),b.m256i(),_MM_CMPINT_GE); } + __forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a.m256i(),b.m256i(),_MM_CMPINT_GT); } + __forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a.m256i(),b.m256i(),_MM_CMPINT_LE); } __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { - return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t); + return _mm256_mask_blend_epi32(m.packedMask8(), (__m256i)f.m256i(), (__m256i)t.m256i()); } #else - __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } + __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a.m256i(), b.m256i())); } __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); } //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(b, a)); } //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a < b); } @@ -293,13 +293,13 @@ namespace embree //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a > b); } __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { - return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); + return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f.m256i()), 
_mm256_castsi256_ps(t.m256i()), m.m256())); } #endif template __forceinline vuint8 select(const vuint8& t, const vuint8& f) { - return _mm256_blend_epi32(f, t, mask); + return _mm256_blend_epi32(f.m256i(), t.m256i(), mask); } __forceinline vboolf8 operator ==(const vuint8& a, unsigned int b) { return a == vuint8(b); } @@ -328,12 +328,12 @@ namespace embree //__forceinline vboolf8 le(const vuint8& a, const vuint8& b) { return a <= b; } #if defined(__AVX512VL__) - __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); } - __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); } - __forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); } - __forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); } - __forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); } - __forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); } + __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask.packedMask8(), a.m256i(), b.m256i(), _MM_CMPINT_EQ); } + __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask.packedMask8(), a.m256i(), b.m256i(), _MM_CMPINT_NE); } + __forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask.packedMask8(), a.m256i(), b.m256i(), _MM_CMPINT_LT); } + __forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask.packedMask8(), a.m256i(), b.m256i(), _MM_CMPINT_GE); } + __forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask.packedMask8(), a.m256i(), b.m256i(), _MM_CMPINT_GT); } + __forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask.packedMask8(), a.m256i(), b.m256i(), _MM_CMPINT_LE); } #else __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); } __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); } @@ -347,59 +347,59 @@ namespace embree /// Movement/Shifting/Shuffling Functions //////////////////////////////////////////////////////////////////////////////// - __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_unpacklo_epi32(a, b); } - __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_unpackhi_epi32(a, b); } + __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_unpacklo_epi32(a.m256i(), b.m256i()); } + __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_unpackhi_epi32(a.m256i(), b.m256i()); } template __forceinline vuint8 shuffle(const vuint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i))); + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v.m256i()), _MM_SHUFFLE(i, i, i, i))); } template __forceinline vuint8 
shuffle4(const vuint8& v) { - return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0)); + return _mm256_permute2f128_si256(v.m256i(), v.m256i(), (i1 << 4) | (i0 << 0)); } template __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) { - return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0)); + return _mm256_permute2f128_si256(a.m256i(), b.m256i(), (i1 << 4) | (i0 << 0)); } template __forceinline vuint8 shuffle(const vuint8& v) { - return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0))); + return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v.m256i()), _MM_SHUFFLE(i3, i2, i1, i0))); } template __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) { - return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0))); + return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a.m256i()), _mm256_castsi256_ps(b.m256i()), _MM_SHUFFLE(i3, i2, i1, i0))); } - template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } - template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } + template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v.m256i()))); } + template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v.m256i()))); } + template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v.m256i()))))); } - template __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); } - template __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); } - template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); } + template __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a.m256i(), b.m128i(), i); } + template __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a.m256i(), i); } + template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a.m256i()); } - __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } + __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v.m256i())); } #if !defined(__aarch64__) __forceinline vuint8 permute(const vuint8& v, const __m256i& index) { - return _mm256_permutevar8x32_epi32(v, index); + return _mm256_permutevar8x32_epi32(v.m256i(), index); } __forceinline vuint8 shuffle(const vuint8& v, const __m256i& index) { - return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index)); + return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v.m256i()), index)); } template __forceinline vuint8 align_shift_right(const vuint8& a, const vuint8& b) { #if defined(__AVX512VL__) - return _mm256_alignr_epi32(a, b, i); + return _mm256_alignr_epi32(a.m256i(), b.m256i(), i); #else - return 
_mm256_alignr_epi8(a, b, 4*i); + return _mm256_alignr_epi8(a.m256i(), b.m256i(), 4*i); #endif } #endif // !defined(__aarch64__) diff --git a/common/sys/intrinsics.h b/common/sys/intrinsics.h index f5074bb29d..d5ae320526 100644 --- a/common/sys/intrinsics.h +++ b/common/sys/intrinsics.h @@ -9,7 +9,7 @@ #include #endif -#if defined(__ARM_NEON) +#if defined(__ARM_NEON) || defined(_M_ARM64) #include "../simd/arm/emulation.h" #else #include @@ -74,7 +74,7 @@ namespace embree } __forceinline int bsf(int v) { -#if defined(__AVX2__) && !defined(__aarch64__) +#if defined(__AVX2__) && !defined(__aarch64__) && !defined(_M_ARM64) return _tzcnt_u32(v); #else unsigned long r = 0; _BitScanForward(&r,v); return r; @@ -82,14 +82,14 @@ namespace embree } __forceinline unsigned bsf(unsigned v) { -#if defined(__AVX2__) && !defined(__aarch64__) +#if defined(__AVX2__) && !defined(__aarch64__) && !defined(_M_ARM64) return _tzcnt_u32(v); #else unsigned long r = 0; _BitScanForward(&r,v); return r; #endif } -#if defined(__X86_64__) || defined (__aarch64__) +#if defined(__X86_64__) || defined (__aarch64__) || defined(_M_ARM64) __forceinline size_t bsf(size_t v) { #if defined(__AVX2__) return _tzcnt_u64(v); @@ -113,7 +113,7 @@ namespace embree return i; } -#if defined(__X86_64__) || defined (__aarch64__) +#if defined(__X86_64__) || defined (__aarch64__) || defined(_M_ARM64) __forceinline size_t bscf(size_t& v) { size_t i = bsf(v); @@ -123,7 +123,7 @@ namespace embree #endif __forceinline int bsr(int v) { -#if defined(__AVX2__) && !defined(__aarch64__) +#if defined(__AVX2__) && !defined(__aarch64__) && !defined(_M_ARM64) return 31 - _lzcnt_u32(v); #else unsigned long r = 0; _BitScanReverse(&r,v); return r; @@ -131,14 +131,14 @@ namespace embree } __forceinline unsigned bsr(unsigned v) { -#if defined(__AVX2__) && !defined(__aarch64__) +#if defined(__AVX2__) && !defined(__aarch64__) && !defined(_M_ARM64) return 31 - _lzcnt_u32(v); #else unsigned long r = 0; _BitScanReverse(&r,v); return r; #endif } -#if defined(__X86_64__) || defined (__aarch64__) +#if defined(__X86_64__) || defined (__aarch64__) || defined(_M_ARM64) __forceinline size_t bsr(size_t v) { #if defined(__AVX2__) return 63 -_lzcnt_u64(v); @@ -150,7 +150,7 @@ namespace embree __forceinline int lzcnt(const int x) { -#if defined(__AVX2__) && !defined(__aarch64__) +#if defined(__AVX2__) && !defined(__aarch64__) && !defined(_M_ARM64) return _lzcnt_u32(x); #else if (unlikely(x == 0)) return 32; @@ -474,7 +474,7 @@ namespace embree #else -#if defined(__SSE4_2__) || defined(__ARM_NEON) +#if defined(__SSE4_2__) || defined(__ARM_NEON) || defined(_M_ARM64) __forceinline int popcnt(int in) { return _mm_popcnt_u32(in); diff --git a/common/sys/platform.h b/common/sys/platform.h index 6dc0cf3318..8f7089dd1d 100644 --- a/common/sys/platform.h +++ b/common/sys/platform.h @@ -58,7 +58,7 @@ #endif /* detect 64 bit platform */ -#if defined(__X86_64__) || defined(__aarch64__) +#if defined(__X86_64__) || defined(__aarch64__) || defined(_M_ARM64) #define __64BIT__ #endif diff --git a/common/sys/sysinfo.cpp b/common/sys/sysinfo.cpp index 5f375cd95c..f0a2cedda8 100644 --- a/common/sys/sysinfo.cpp +++ b/common/sys/sysinfo.cpp @@ -98,7 +98,7 @@ namespace embree name[2] = cpuinfo[2]; name[3] = 0; return (char*)name; -#elif defined(__ARM_NEON) +#elif defined(__ARM_NEON) || defined(_M_ARM64) return "ARM"; #else return "Unknown"; @@ -174,7 +174,7 @@ namespace embree if (DisplayFamily_DisplayModel == 0x0685) return CPU::XEON_PHI_KNIGHTS_MILL; if (DisplayFamily_DisplayModel == 0x0657) 
return CPU::XEON_PHI_KNIGHTS_LANDING; -#elif defined(__ARM_NEON) +#elif defined(__ARM_NEON) || defined(_M_ARM64) return CPU::ARM; #endif @@ -359,7 +359,7 @@ namespace embree #endif return cpu_features; -#elif defined(__ARM_NEON) || defined(__EMSCRIPTEN__) +#elif defined(__ARM_NEON) || defined(_M_ARM64) || defined(__EMSCRIPTEN__) int cpu_features = CPU_FEATURE_NEON|CPU_FEATURE_SSE|CPU_FEATURE_SSE2; cpu_features |= CPU_FEATURE_SSE3|CPU_FEATURE_SSSE3|CPU_FEATURE_SSE42; diff --git a/common/sys/sysinfo.h b/common/sys/sysinfo.h index 02eacbe717..0adc8b494a 100644 --- a/common/sys/sysinfo.h +++ b/common/sys/sysinfo.h @@ -55,7 +55,7 @@ # define isa sse # define ISA SSE # define ISA_STR "SSE" -#elif defined(__ARM_NEON) +#elif defined(__ARM_NEON) || defined(_M_ARM64) // NOTE(LTE): Use sse2 for `isa` for the compatibility at the moment. #define isa sse2 #define ISA NEON diff --git a/common/sys/thread.cpp b/common/sys/thread.cpp index 8b072067e6..a3ea4f9c04 100644 --- a/common/sys/thread.cpp +++ b/common/sys/thread.cpp @@ -6,7 +6,7 @@ #include "estring.h" #include -#if defined(__ARM_NEON) +#if defined(__ARM_NEON) || defined(_M_ARM64) #include "../simd/arm/emulation.h" #else #include diff --git a/kernels/builders/bvh_builder_morton.h b/kernels/builders/bvh_builder_morton.h index 87d4786810..67d23a3e55 100644 --- a/kernels/builders/bvh_builder_morton.h +++ b/kernels/builders/bvh_builder_morton.h @@ -88,7 +88,7 @@ namespace embree const vfloat4 lower = (vfloat4)box.lower; const vfloat4 upper = (vfloat4)box.upper; const vfloat4 centroid = lower+upper; - return vint4((centroid-base)*scale); + return vint4(((centroid-base)*scale).m128i()); } __forceinline unsigned int code (const BBox3fa& box) const diff --git a/kernels/builders/heuristic_binning.h b/kernels/builders/heuristic_binning.h index d66726d09b..e097726937 100644 --- a/kernels/builders/heuristic_binning.h +++ b/kernels/builders/heuristic_binning.h @@ -63,12 +63,12 @@ namespace embree assert(i[2] >= 0 && (size_t)i[2] < num); // we clamp to handle corner cases that could calculate out of bounds bin - return Vec3ia(clamp(i,vint4(0),vint4(num-1))); + return Vec3ia(clamp(i,vint4(0),vint4(num-1)).m128i()); } /*! faster but unsafe binning */ __forceinline Vec3ia bin_unsafe(const Vec3fa& p) const { - return Vec3ia(floori((vfloat4(p)-ofs)*scale)); + return Vec3ia(floori((vfloat4(p)-ofs)*scale).m128i()); } /*! faster but unsafe binning */ diff --git a/kernels/builders/primrefgen_presplit.h b/kernels/builders/primrefgen_presplit.h index a63371235d..6de59f1480 100644 --- a/kernels/builders/primrefgen_presplit.h +++ b/kernels/builders/primrefgen_presplit.h @@ -47,7 +47,7 @@ namespace embree Vec3ia iupper(floor(gupper)); /* this ignores dimensions that are empty */ - iupper = (Vec3ia)select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper)); + iupper = (Vec3ia)(select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper))).m128i(); /* compute a morton code for the lower and upper grid coordinates. */ const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z); @@ -85,7 +85,7 @@ namespace embree Vec3ia iupper(floor(gupper)); /* this ignores dimensions that are empty */ - iupper = (Vec3ia)select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper)); + iupper = (Vec3ia)(select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper))).m128i(); /* compute a morton code for the lower and upper grid coordinates. 
*/ const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z); diff --git a/kernels/bvh/bvh.cpp b/kernels/bvh/bvh.cpp index f6cf626465..b4d1d0ff3b 100644 --- a/kernels/bvh/bvh.cpp +++ b/kernels/bvh/bvh.cpp @@ -183,7 +183,7 @@ namespace embree template class BVHN<8>; #endif -#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) || defined(__aarch64__) +#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) || defined(__aarch64__) || defined(_M_ARM64) template class BVHN<4>; #endif } diff --git a/kernels/bvh/bvh_builder_morton.cpp b/kernels/bvh/bvh_builder_morton.cpp index f93fa16340..f90b85c508 100644 --- a/kernels/bvh/bvh_builder_morton.cpp +++ b/kernels/bvh/bvh_builder_morton.cpp @@ -122,7 +122,7 @@ namespace embree } Triangle4::store_nt(accel,Triangle4(v0,v1,v2,vgeomID,vprimID)); - BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); + BBox3fx box_o = BBox3fx((Vec3fx)lower.m128(),(Vec3fx)upper.m128()); #if ROTATE_TREE if (N == 4) box_o.lower.a = unsigned(current.size()); @@ -177,7 +177,7 @@ namespace embree v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z; } Triangle4v::store_nt(accel,Triangle4v(v0,v1,v2,vgeomID,vprimID)); - BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); + BBox3fx box_o = BBox3fx((Vec3fx)lower.m128(),(Vec3fx)upper.m128()); #if ROTATE_TREE if (N == 4) box_o.lower.a = current.size(); @@ -242,7 +242,7 @@ namespace embree v2[i] = 0; } Triangle4i::store_nt(accel,Triangle4i(v0,v1,v2,vgeomID,vprimID)); - BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); + BBox3fx box_o = BBox3fx((Vec3fx)lower.m128(),(Vec3fx)upper.m128()); #if ROTATE_TREE if (N == 4) box_o.lower.a = current.size(); @@ -299,7 +299,7 @@ namespace embree v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z; } Quad4v::store_nt(accel,Quad4v(v0,v1,v2,v3,vgeomID,vprimID)); - BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper); + BBox3fx box_o = BBox3fx((Vec3fx)lower.m128(),(Vec3fx)upper.m128()); #if ROTATE_TREE if (N == 4) box_o.lower.a = current.size(); diff --git a/kernels/bvh/bvh_node_qaabb.h b/kernels/bvh/bvh_node_qaabb.h index 99671ddc5a..248e35cd16 100644 --- a/kernels/bvh/bvh_node_qaabb.h +++ b/kernels/bvh/bvh_node_qaabb.h @@ -55,12 +55,12 @@ namespace embree if (decode_scale == 0.0f) decode_scale = 2.0f*FLT_MIN; // result may have been flushed to zero assert(madd(decode_scale,float(MAX_QUAN),minF) >= maxF); const float encode_scale = diff > 0 ? 
(float(MAX_QUAN) / diff) : 0.0f; - vint ilower = max(vint(floor((lower - vfloat(minF))*vfloat(encode_scale))),MIN_QUAN); - vint iupper = min(vint(ceil ((upper - vfloat(minF))*vfloat(encode_scale))),MAX_QUAN); + vint ilower = max(vint(floor((lower - vfloat(minF))*vfloat(encode_scale)).vec_int()),MIN_QUAN); + vint iupper = min(vint(ceil ((upper - vfloat(minF))*vfloat(encode_scale)).vec_int()),MAX_QUAN); /* lower/upper correction */ - vbool m_lower_correction = (madd(vfloat(ilower),decode_scale,minF)) > lower; - vbool m_upper_correction = (madd(vfloat(iupper),decode_scale,minF)) < upper; + vbool m_lower_correction = (madd(vfloat(ilower.vec_float()),decode_scale,minF)) > lower; + vbool m_upper_correction = (madd(vfloat(iupper.vec_float()),decode_scale,minF)) < upper; ilower = max(select(m_lower_correction,ilower-1,ilower),MIN_QUAN); iupper = min(select(m_upper_correction,iupper+1,iupper),MAX_QUAN); @@ -75,8 +75,8 @@ namespace embree scale = decode_scale; #if defined(DEBUG) - vfloat extract_lower( vint::loadu(lower_quant) ); - vfloat extract_upper( vint::loadu(upper_quant) ); + vfloat extract_lower( vint::loadu(lower_quant).vec_float() ); + vfloat extract_upper( vint::loadu(upper_quant).vec_float() ); vfloat final_extract_lower = madd(extract_lower,decode_scale,minF); vfloat final_extract_upper = madd(extract_upper,decode_scale,minF); assert( (movemask(final_extract_lower <= lower ) & movemask(m_valid)) == movemask(m_valid)); @@ -96,20 +96,20 @@ namespace embree #if defined(__AVX512F__) // KNL __forceinline vbool16 validMask16() const { return le(0xff,vint<16>::loadu(lower_x),vint<16>::loadu(upper_x)); } #endif - __forceinline vfloat dequantizeLowerX() const { return madd(vfloat(vint::loadu(lower_x)),scale.x,vfloat(start.x)); } + __forceinline vfloat dequantizeLowerX() const { return madd(vfloat(vint::loadu(lower_x).vec_float()),scale.x,vfloat(start.x)); } - __forceinline vfloat dequantizeUpperX() const { return madd(vfloat(vint::loadu(upper_x)),scale.x,vfloat(start.x)); } + __forceinline vfloat dequantizeUpperX() const { return madd(vfloat(vint::loadu(upper_x).vec_float()),scale.x,vfloat(start.x)); } - __forceinline vfloat dequantizeLowerY() const { return madd(vfloat(vint::loadu(lower_y)),scale.y,vfloat(start.y)); } + __forceinline vfloat dequantizeLowerY() const { return madd(vfloat(vint::loadu(lower_y).vec_float()),scale.y,vfloat(start.y)); } - __forceinline vfloat dequantizeUpperY() const { return madd(vfloat(vint::loadu(upper_y)),scale.y,vfloat(start.y)); } + __forceinline vfloat dequantizeUpperY() const { return madd(vfloat(vint::loadu(upper_y).vec_float()),scale.y,vfloat(start.y)); } - __forceinline vfloat dequantizeLowerZ() const { return madd(vfloat(vint::loadu(lower_z)),scale.z,vfloat(start.z)); } + __forceinline vfloat dequantizeLowerZ() const { return madd(vfloat(vint::loadu(lower_z).vec_float()),scale.z,vfloat(start.z)); } - __forceinline vfloat dequantizeUpperZ() const { return madd(vfloat(vint::loadu(upper_z)),scale.z,vfloat(start.z)); } + __forceinline vfloat dequantizeUpperZ() const { return madd(vfloat(vint::loadu(upper_z).vec_float()),scale.z,vfloat(start.z)); } template - __forceinline vfloat dequantize(const size_t offset) const { return vfloat(vint::loadu(all_planes+offset)); } + __forceinline vfloat dequantize(const size_t offset) const { return vfloat(vint::loadu(all_planes+offset).vec_float()); } #if defined(__AVX512F__) __forceinline vfloat16 dequantizeLowerUpperX(const vint16 &p) const { return 
madd(vfloat16(permute(vint<16>::loadu(lower_x),p)),scale.x,vfloat16(start.x)); } diff --git a/kernels/bvh/bvh_statistics.cpp b/kernels/bvh/bvh_statistics.cpp index 40f9043736..66ac082e4d 100644 --- a/kernels/bvh/bvh_statistics.cpp +++ b/kernels/bvh/bvh_statistics.cpp @@ -159,7 +159,7 @@ namespace embree template class BVHNStatistics<8>; #endif -#if !defined(__AVX__) || (!defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)) || defined(__aarch64__) +#if !defined(__AVX__) || (!defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)) || defined(__aarch64__) || defined(_M_ARM64) template class BVHNStatistics<4>; #endif } diff --git a/kernels/bvh/bvh_traverser1.h b/kernels/bvh/bvh_traverser1.h index 8ce01b57f5..ecf764fb1d 100644 --- a/kernels/bvh/bvh_traverser1.h +++ b/kernels/bvh/bvh_traverser1.h @@ -24,22 +24,22 @@ namespace embree { const vint dist_shift = align_shift_right(dist,dist); const vboolf m_geq = d >= dist; - const vboolf m_geq_shift = m_geq << 1; + const vboolf m_geq_shift = m_geq.v << 1; dist = select(m_geq,d,dist); dist = select(m_geq_shift,dist_shift,dist); } template __forceinline void isort_quick_update(vint &dist, const vint &d) { - dist = align_shift_right(dist,permute(d,vint(zero))); + dist = align_shift_right(dist,permute(d,vint(zero).vec_int())); } __forceinline size_t permuteExtract(const vint8& index, const vllong4& n0, const vllong4& n1) { - return toScalar(permutex2var((__m256i)index,n0,n1)); + return toScalar(permutex2var((__m256i)index.m256i(),n0,n1)); } __forceinline float permuteExtract(const vint8& index, const vfloat8& n) { - return toScalar(permute(n,index)); + return toScalar(permute(n,index.m256i())); } #endif @@ -287,7 +287,7 @@ namespace embree distance_i = align_shift_right<1>(distance_i,distance_i); cur = permuteExtract(distance_i,n0,n1); BVH::prefetch(cur,types); - const vint8 new_dist(permute(distance_i,vint8(zero))); + const vint8 new_dist(permute(distance_i,vint8(zero).m256i())); mask &= mask-1; isort_update<8>(dist,new_dist); diff --git a/kernels/bvh/node_intersector1.h b/kernels/bvh/node_intersector1.h index 17641fa888..3bbc5ec51d 100644 --- a/kernels/bvh/node_intersector1.h +++ b/kernels/bvh/node_intersector1.h @@ -9,7 +9,7 @@ #define __FMA_X4__ #endif -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) #define __FMA_X4__ #endif @@ -40,7 +40,7 @@ namespace embree rdir = Vec3vf(ray_rdir.x,ray_rdir.y,ray_rdir.z); #if defined(__FMA_X4__) const Vec3fa ray_org_rdir = ray_org*ray_rdir; -#if !defined(__aarch64__) +#if !defined(__aarch64__) && !defined(_M_ARM64) org_rdir = Vec3vf(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z); #else //for aarch64, we do not have msub equal instruction, so we negeate orig and use madd @@ -65,7 +65,7 @@ namespace embree dir = Vec3vf(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]); rdir = Vec3vf(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); #if defined(__FMA_X4__) -#if !defined(__aarch64__) +#if !defined(__aarch64__) && !defined(_M_ARM64) org_rdir = org*rdir; #else neg_org_rdir = -(org*rdir); @@ -82,7 +82,7 @@ namespace embree Vec3fa org_xyz, dir_xyz; Vec3vf org, dir, rdir; #if defined(__FMA_X4__) -#if !defined(__aarch64__) +#if !defined(__aarch64__) && !defined(_M_ARM64) Vec3vf org_rdir; #else //aarch64 version are keeping negation of the org_rdir and use madd @@ -430,7 +430,7 @@ namespace embree __forceinline size_t intersectNode<4>(const typename BVH4::AABBNode* node, const TravRay<4,false>& ray, vfloat4& dist) { #if defined(__FMA_X4__) -#if defined(__aarch64__) +#if defined(__aarch64__) || 
defined(_M_ARM64) const vfloat4 tNearX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x); const vfloat4 tNearY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y); const vfloat4 tNearZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z); @@ -454,7 +454,7 @@ namespace embree const vfloat4 tFarZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z; #endif -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) const vfloat4 tNear = maxi(tNearX, tNearY, tNearZ, ray.tnear); const vfloat4 tFar = mini(tFarX, tFarY, tFarZ, ray.tfar); const vbool4 vmask = asInt(tNear) <= asInt(tFar); @@ -485,7 +485,7 @@ namespace embree __forceinline size_t intersectNode<8>(const typename BVH8::AABBNode* node, const TravRay<8,false>& ray, vfloat8& dist) { #if defined(__AVX2__) -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) const vfloat8 tNearX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x); const vfloat8 tNearY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y); const vfloat8 tNearZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z); @@ -567,7 +567,7 @@ namespace embree const vfloat* pFarY = (const vfloat*)((const char*)&node->lower_x+ray.farY); const vfloat* pFarZ = (const vfloat*)((const char*)&node->lower_x+ray.farZ); #if defined(__FMA_X4__) -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) const vfloat tNearX = madd(madd(time,pNearX[6],vfloat(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x); const vfloat tNearY = madd(madd(time,pNearY[6],vfloat(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y); const vfloat tNearZ = madd(madd(time,pNearZ[6],vfloat(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z); @@ -652,7 +652,7 @@ namespace embree const vfloat* pFarY = (const vfloat*)((const char*)&node->lower_x+ray.farY); const vfloat* pFarZ = (const vfloat*)((const char*)&node->lower_x+ray.farZ); #if defined (__FMA_X4__) -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) const vfloat tNearX = madd(madd(time,pNearX[6],vfloat(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x); const vfloat tNearY = madd(madd(time,pNearY[6],vfloat(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y); const vfloat tNearZ = madd(madd(time,pNearZ[6],vfloat(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z); @@ -750,7 +750,7 @@ namespace embree const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z); #if defined(__FMA_X4__) -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) const vfloat4 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); const vfloat4 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); const vfloat4 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); @@ -774,7 +774,7 @@ namespace embree const vfloat4 tFarZ = (upper_z - ray.org.z) * ray.rdir.z; #endif -#if defined(__aarch64__) || defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW +#if (defined(__aarch64__) || defined(_M_ARM64)) || defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear); const vfloat4 tFar = mini(tFarX ,tFarY ,tFarZ ,ray.tfar); const vbool4 vmask = asInt(tNear) > asInt(tFar); @@ -847,7 +847,7 @@ namespace embree const vfloat8 upper_z = 
madd(node->dequantize<8>(ray.farZ >> 2),scale_z,start_z); #if defined(__AVX2__) -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) const vfloat8 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); const vfloat8 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); const vfloat8 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); @@ -938,7 +938,7 @@ namespace embree const vfloat lower_z = node->dequantizeLowerZ(time); const vfloat upper_z = node->dequantizeUpperZ(time); #if defined(__FMA_X4__) -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) const vfloat tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); const vfloat tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); const vfloat tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); diff --git a/kernels/bvh/node_intersector_frustum.h b/kernels/bvh/node_intersector_frustum.h index cad4e6de2d..877eb501f8 100644 --- a/kernels/bvh/node_intersector_frustum.h +++ b/kernels/bvh/node_intersector_frustum.h @@ -75,7 +75,7 @@ namespace embree min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir); max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir); -#if defined (__aarch64__) +#if defined (__aarch64__) || defined(_M_ARM64) neg_min_org_rdir = -(min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org)); neg_max_org_rdir = -(max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org)); #else @@ -99,7 +99,7 @@ namespace embree Vec3fa min_rdir; Vec3fa max_rdir; -#if defined (__aarch64__) +#if defined (__aarch64__) || defined(_M_ARM64) Vec3fa neg_min_org_rdir; Vec3fa neg_max_org_rdir; #else @@ -199,7 +199,7 @@ namespace embree const vfloat bmaxY = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.farY); const vfloat bmaxZ = *(const vfloat*)((const char*)&node->lower_x + frustum.nf.farZ); -#if defined (__aarch64__) +#if defined (__aarch64__) || defined(_M_ARM64) const vfloat fminX = madd(bminX, vfloat(frustum.min_rdir.x), vfloat(frustum.neg_min_org_rdir.x)); const vfloat fminY = madd(bminY, vfloat(frustum.min_rdir.y), vfloat(frustum.neg_min_org_rdir.y)); const vfloat fminZ = madd(bminZ, vfloat(frustum.min_rdir.z), vfloat(frustum.neg_min_org_rdir.z)); diff --git a/kernels/bvh/node_intersector_packet.h b/kernels/bvh/node_intersector_packet.h index 4deacd620d..708a008a07 100644 --- a/kernels/bvh/node_intersector_packet.h +++ b/kernels/bvh/node_intersector_packet.h @@ -39,7 +39,7 @@ namespace embree org = ray_org; dir = ray_dir; rdir = rcp_safe(ray_dir); -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) neg_org_rdir = -(org * rdir); #elif defined(__AVX2__) org_rdir = org * rdir; @@ -57,7 +57,7 @@ namespace embree Vec3vf org; Vec3vf dir; Vec3vf rdir; -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) Vec3vf neg_org_rdir; #elif defined(__AVX2__) Vec3vf org_rdir; @@ -123,7 +123,7 @@ namespace embree const TravRayKFast& ray, vfloat& dist) { -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) const vfloat lclipMinX = madd(node->lower_x[i], ray.rdir.x, ray.neg_org_rdir.x); const vfloat lclipMinY = madd(node->lower_y[i], ray.rdir.y, ray.neg_org_rdir.y); const vfloat lclipMinZ = madd(node->lower_z[i], ray.rdir.z, ray.neg_org_rdir.z); @@ -210,7 +210,7 @@ namespace embree const vfloat vupper_y = madd(time, vfloat(node->upper_dy[i]), vfloat(node->upper_y[i])); const vfloat vupper_z = madd(time, vfloat(node->upper_dz[i]), vfloat(node->upper_z[i])); -#if defined(__aarch64__) +#if defined(__aarch64__) || 
defined(_M_ARM64) const vfloat lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x); const vfloat lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y); const vfloat lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z); @@ -320,7 +320,7 @@ namespace embree const vfloat vupper_y = madd(time, vfloat(node->upper_dy[i]), vfloat(node->upper_y[i])); const vfloat vupper_z = madd(time, vfloat(node->upper_dz[i]), vfloat(node->upper_z[i])); -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) const vfloat lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x); const vfloat lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y); const vfloat lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z); @@ -489,7 +489,7 @@ namespace embree const vfloat lower_z = node->dequantizeLowerZ(); const vfloat upper_z = node->dequantizeUpperZ(); - #if defined(__aarch64__) + #if defined(__aarch64__) || defined(_M_ARM64) const vfloat lclipMinX = madd(lower_x[i], ray.rdir.x, ray.neg_org_rdir.x); const vfloat lclipMinY = madd(lower_y[i], ray.rdir.y, ray.neg_org_rdir.y); const vfloat lclipMinZ = madd(lower_z[i], ray.rdir.z, ray.neg_org_rdir.z); @@ -581,7 +581,7 @@ namespace embree const vfloat lower_z = node->template dequantizeLowerZ(i,time); const vfloat upper_z = node->template dequantizeUpperZ(i,time); -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(_M_ARM64) const vfloat lclipMinX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); const vfloat lclipMinY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); const vfloat lclipMinZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); diff --git a/kernels/common/accel.h b/kernels/common/accel.h index 7d959377ae..49b30b9bc4 100644 --- a/kernels/common/accel.h +++ b/kernels/common/accel.h @@ -306,7 +306,7 @@ namespace embree intersector16.intersect(valid,this,ray,context); } -#if defined(__SSE__) || defined(__ARM_NEON) +#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64) __forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, RayQueryContext* context) { const vint<4> mask = valid.mask32(); intersect4(&mask,(RTCRayHit4&)ray,context); } @@ -367,7 +367,7 @@ namespace embree intersector16.occluded(valid,this,ray,context); } -#if defined(__SSE__) || defined(__ARM_NEON) +#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64) __forceinline void occluded(const vbool4& valid, RayK<4>& ray, RayQueryContext* context) { const vint<4> mask = valid.mask32(); occluded4(&mask,(RTCRay4&)ray,context); } diff --git a/kernels/common/acceln.cpp b/kernels/common/acceln.cpp index 9edb684db7..696d6fd9e7 100644 --- a/kernels/common/acceln.cpp +++ b/kernels/common/acceln.cpp @@ -89,7 +89,7 @@ namespace embree for (size_t i=0; i<This->accels.size(); i++) { if (This->accels[i]->isEmpty()) continue; This->accels[i]->intersectors.occluded4(valid,ray,context); -#if defined(__SSE2__) || defined(__ARM_NEON) +#if defined(__SSE2__) || defined(__ARM_NEON) || defined(_M_ARM64) vbool4 valid0 = asBool(((vint4*)valid)[0]); vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); if (unlikely(none(valid0 & hit0))) break; @@ -103,7 +103,7 @@ namespace embree for (size_t i=0; i<This->accels.size(); i++) { if (This->accels[i]->isEmpty()) continue; This->accels[i]->intersectors.occluded8(valid,ray,context); -#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA +#if defined(__SSE2__) || defined(__ARM_NEON) || defined(_M_ARM64) // FIXME: use higher ISA vbool4 valid0 = asBool(((vint4*)valid)[0]); vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >=
vfloat4(zero); vbool4 valid1 = asBool(((vint4*)valid)[1]); @@ -119,7 +119,7 @@ for (size_t i=0; i<This->accels.size(); i++) { if (This->accels[i]->isEmpty()) continue; This->accels[i]->intersectors.occluded16(valid,ray,context); -#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA +#if defined(__SSE2__) || defined(__ARM_NEON) || defined(_M_ARM64) // FIXME: use higher ISA vbool4 valid0 = asBool(((vint4*)valid)[0]); vbool4 hit0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero); vbool4 valid1 = asBool(((vint4*)valid)[1]); diff --git a/kernels/common/buffer.h b/kernels/common/buffer.h index 984ed43ddd..6f91b0c6bf 100644 --- a/kernels/common/buffer.h +++ b/kernels/common/buffer.h @@ -420,7 +420,7 @@ namespace embree __forceinline const Vec3fa operator [](size_t i) const { assert(i timeScaled = time * numTimeSegments; const vfloat itimef = clamp(floor(timeScaled), vfloat(zero), numTimeSegments-1.0f); ftime = timeScaled - itimef; - return vint(itimef); + return vint(itimef.vec_int()); } template @@ -243,7 +243,7 @@ const vfloat timeScaled = (time-start_time)/(end_time-start_time) * numTimeSegments; const vfloat itimef = clamp(floor(timeScaled), vfloat(zero), numTimeSegments-1.0f); ftime = timeScaled - itimef; - return vint(itimef); + return vint(itimef.vec_int()); } /* calculate overlapping time segment range */ diff --git a/kernels/common/isa.h b/kernels/common/isa.h index 9e1132e1a0..be08a75292 100644 --- a/kernels/common/isa.h +++ b/kernels/common/isa.h @@ -44,7 +44,7 @@ namespace embree #define SELECT_SYMBOL_DEFAULT(features,intersector) \ intersector = isa::intersector; -#if defined(__SSE__) || defined(__ARM_NEON) +#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64) #if !defined(EMBREE_TARGET_SIMD4) #define EMBREE_TARGET_SIMD4 #endif diff --git a/kernels/common/state.cpp b/kernels/common/state.cpp index cce5eafce1..871977ce5c 100644 --- a/kernels/common/state.cpp +++ b/kernels/common/state.cpp @@ -149,7 +149,7 @@ namespace embree } bool State::checkISASupport() { -#if defined(__ARM_NEON) +#if defined(__ARM_NEON) || defined(_M_ARM64) /* * NEON CPU type is a mixture of NEON and SSE2 */ @@ -175,7 +175,7 @@ * functions */ #if defined(DEBUG) #if defined(EMBREE_TARGET_SSE2) -#if !defined(__ARM_NEON) +#if !defined(__ARM_NEON) && !defined(_M_ARM64) assert(sse2::getISA() <= SSE2); #endif #endif diff --git a/kernels/geometry/curveNi_intersector.h b/kernels/geometry/curveNi_intersector.h index 137ec06d0c..ecb5144a23 100644 --- a/kernels/geometry/curveNi_intersector.h +++ b/kernels/geometry/curveNi_intersector.h @@ -31,8 +31,8 @@ namespace embree const float scale = *prim.scale(N); #else const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); - const Vec3fa offset = Vec3fa(offset_scale); - const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); + const Vec3fa offset = Vec3fa(offset_scale.m128()); + const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale).m128()); #endif const Vec3fa org1 = (ray.org-offset)*scale; const Vec3fa dir1 = ray.dir*scale; @@ -313,8 +313,8 @@ namespace embree const float scale = *prim.scale(N); #else const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); - const Vec3fa offset = Vec3fa(offset_scale); - const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); + const Vec3fa offset = Vec3fa(offset_scale.m128()); + const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale).m128()); #endif const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); const Vec3fa
ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); diff --git a/kernels/geometry/curveNi_mb_intersector.h b/kernels/geometry/curveNi_mb_intersector.h index 4c14c2f004..1186ab4df2 100644 --- a/kernels/geometry/curveNi_mb_intersector.h +++ b/kernels/geometry/curveNi_mb_intersector.h @@ -32,8 +32,8 @@ namespace embree const float scale = *prim.scale(N); #else const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); - const Vec3fa offset = Vec3fa(offset_scale); - const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); + const Vec3fa offset = Vec3fa(offset_scale.m128()); + const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale).m128()); #endif const Vec3fa org1 = (ray.org-offset)*scale; const Vec3fa dir1 = ray.dir*scale; @@ -283,8 +283,8 @@ namespace embree const float scale = *prim.scale(N); #else const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); - const Vec3fa offset = Vec3fa(offset_scale); - const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); + const Vec3fa offset = Vec3fa(offset_scale.m128()); + const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale).m128()); #endif const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); diff --git a/kernels/geometry/grid_soa.cpp b/kernels/geometry/grid_soa.cpp index 615070be9d..6772b9779e 100644 --- a/kernels/geometry/grid_soa.cpp +++ b/kernels/geometry/grid_soa.cpp @@ -33,8 +33,8 @@ namespace embree /* encode UVs */ for (unsigned i=0; i(u,v); } diff --git a/kernels/geometry/instance_intersector.cpp b/kernels/geometry/instance_intersector.cpp index a9209c69c3..75deefea25 100644 --- a/kernels/geometry/instance_intersector.cpp +++ b/kernels/geometry/instance_intersector.cpp @@ -306,7 +306,7 @@ namespace embree return occluded; } -#if defined(__SSE__) || defined(__ARM_NEON) +#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64) template struct InstanceIntersectorK<4>; template struct InstanceIntersectorKMB<4>; #endif diff --git a/kernels/geometry/linei.h b/kernels/geometry/linei.h index 3305025fc9..d87f54451e 100644 --- a/kernels/geometry/linei.h +++ b/kernels/geometry/linei.h @@ -651,8 +651,8 @@ namespace embree pL = lerp(aL,bL,vfloat8(ftime)); pR = lerp(aR,bR,vfloat8(ftime)); - pL = select(vboolf4(leftExists), pL, Vec4vf8(inf)); - pR = select(vboolf4(rightExists), pR, Vec4vf8(inf)); + pL = select(vboolf8(leftExists), pL, Vec4vf8(inf)); + pR = select(vboolf8(rightExists), pR, Vec4vf8(inf)); } template<> diff --git a/kernels/geometry/subgrid_intersector_moeller.h b/kernels/geometry/subgrid_intersector_moeller.h index 2666847333..5b7960d791 100644 --- a/kernels/geometry/subgrid_intersector_moeller.h +++ b/kernels/geometry/subgrid_intersector_moeller.h @@ -25,8 +25,8 @@ namespace embree const vint syM(sy + stepY); const float inv_resX = rcp((float)((int)g.resX-1)); const float inv_resY = rcp((float)((int)g.resY-1)); - hit.U = (hit.U + (vfloat)sxM * hit.absDen) * inv_resX; - hit.V = (hit.V + (vfloat)syM * hit.absDen) * inv_resY; + hit.U = (hit.U + (vfloat)sxM.vec_float() * hit.absDen) * inv_resX; + hit.V = (hit.V + (vfloat)syM.vec_float() * hit.absDen) * inv_resY; } template diff --git a/kernels/geometry/subgrid_intersector_pluecker.h b/kernels/geometry/subgrid_intersector_pluecker.h index 4919f927ae..0b2511f10e 100644 --- a/kernels/geometry/subgrid_intersector_pluecker.h +++ b/kernels/geometry/subgrid_intersector_pluecker.h @@ -22,8 +22,8 @@ namespace embree const vint syM(sy + stepY); const float inv_resX = rcp((float)((int)g.resX-1)); const float 
inv_resY = rcp((float)((int)g.resY-1)); - hit.U = (hit.U + vfloat(sxM) * hit.UVW) * inv_resX; - hit.V = (hit.V + vfloat(syM) * hit.UVW) * inv_resY; + hit.U = (hit.U + vfloat(sxM.vec_float()) * hit.UVW) * inv_resX; + hit.V = (hit.V + vfloat(syM.vec_float()) * hit.UVW) * inv_resY; } template diff --git a/kernels/subdiv/feature_adaptive_eval_grid.h b/kernels/subdiv/feature_adaptive_eval_grid.h index 4755aba28d..d06d5658fe 100644 --- a/kernels/subdiv/feature_adaptive_eval_grid.h +++ b/kernels/subdiv/feature_adaptive_eval_grid.h @@ -148,13 +148,13 @@ namespace embree } #else foreach2(lx0,lx1,ly0,ly1,[&](const vboolx& valid, const vintx& ix, const vintx& iy) { - const vfloatx lu = select(ix == swidth -1, vfloatx(1.0f), (vfloatx(ix)-srange.lower.x)*scale_x); - const vfloatx lv = select(iy == sheight-1, vfloatx(1.0f), (vfloatx(iy)-srange.lower.y)*scale_y); + const vfloatx lu = select(ix == swidth -1, vfloatx(1.0f), (vfloatx(ix.vec_float())-srange.lower.x)*scale_x); + const vfloatx lv = select(iy == sheight-1, vfloatx(1.0f), (vfloatx(iy.vec_float())-srange.lower.y)*scale_y); const Vec3vfx p = patch.eval(lu,lv); Vec3vfx n = zero; if (unlikely(Nx != nullptr)) n = normalize_safe(patch.normal(lu,lv)); - const vfloatx u = vfloatx(ix)*rcp_swidth; - const vfloatx v = vfloatx(iy)*rcp_sheight; + const vfloatx u = vfloatx(ix.vec_float())*rcp_swidth; + const vfloatx v = vfloatx(iy.vec_float())*rcp_sheight; const vintx ofs = (iy-y0)*dwidth+(ix-x0); if (likely(all(valid)) && all(iy==iy[0])) { const unsigned ofs2 = ofs[0]; diff --git a/kernels/subdiv/feature_adaptive_eval_simd.h b/kernels/subdiv/feature_adaptive_eval_simd.h index edab0db12f..a788244924 100644 --- a/kernels/subdiv/feature_adaptive_eval_simd.h +++ b/kernels/subdiv/feature_adaptive_eval_simd.h @@ -156,8 +156,8 @@ namespace embree /* parametrization for arbitrary polygons */ else { - const vint l = (vint)floor(0.5f*uv.x); const vfloat u = 2.0f*frac(0.5f*uv.x)-0.5f; - const vint h = (vint)floor(0.5f*uv.y); const vfloat v = 2.0f*frac(0.5f*uv.y)-0.5f; + const vint l = (vint)floor(0.5f*uv.x).m128i(); const vfloat u = 2.0f*frac(0.5f*uv.x)-0.5f; + const vint h = (vint)floor(0.5f*uv.y).m128i(); const vfloat v = 2.0f*frac(0.5f*uv.y)-0.5f; const vint i = (h<<2)+l; assert(all(valid,i bounds() const { const BBox b = LR.bounds(); - const BBox bl(Vec2fa(b.lower),Vec2fa(b.upper)); - const BBox br(Vec2fa(shuffle<2,3,2,3>(b.lower)),Vec2fa(shuffle<2,3,2,3>(b.upper))); + const BBox bl(Vec2fa(b.lower.m128()),Vec2fa(b.upper.m128())); + const BBox br(Vec2fa(shuffle<2,3,2,3>(b.lower).m128()),Vec2fa(shuffle<2,3,2,3>(b.upper).m128())); return merge(bl,br); } }; @@ -280,18 +280,18 @@ namespace embree : LR(shuffle<0,1,0,1>(vfloat4(L.v0),vfloat4(R.v0)),shuffle<0,1,0,1>(vfloat4(L.v1),vfloat4(R.v1)),shuffle<0,1,0,1>(vfloat4(L.v2),vfloat4(R.v2)),shuffle<0,1,0,1>(vfloat4(L.v3),vfloat4(R.v3))) {} __forceinline CubicBezierCurve getL() const { - return CubicBezierCurve(Vec2fa(LR.v0),Vec2fa(LR.v1),Vec2fa(LR.v2),Vec2fa(LR.v3)); + return CubicBezierCurve(Vec2fa(LR.v0.m128()),Vec2fa(LR.v1.m128()),Vec2fa(LR.v2.m128()),Vec2fa(LR.v3.m128())); } __forceinline CubicBezierCurve getR() const { - return CubicBezierCurve(Vec2fa(shuffle<2,3,2,3>(LR.v0)),Vec2fa(shuffle<2,3,2,3>(LR.v1)),Vec2fa(shuffle<2,3,2,3>(LR.v2)),Vec2fa(shuffle<2,3,2,3>(LR.v3))); + return CubicBezierCurve(Vec2fa(shuffle<2,3,2,3>(LR.v0).m128()),Vec2fa(shuffle<2,3,2,3>(LR.v1).m128()),Vec2fa(shuffle<2,3,2,3>(LR.v2).m128()),Vec2fa(shuffle<2,3,2,3>(LR.v3).m128())); } __forceinline BBox bounds() const { const BBox b = LR.bounds(); - 
const BBox bl(Vec2fa(b.lower),Vec2fa(b.upper)); - const BBox br(Vec2fa(shuffle<2,3,2,3>(b.lower)),Vec2fa(shuffle<2,3,2,3>(b.upper))); + const BBox bl(Vec2fa(b.lower.m128()),Vec2fa(b.upper.m128())); + const BBox br(Vec2fa(shuffle<2,3,2,3>(b.lower).m128()),Vec2fa(shuffle<2,3,2,3>(b.upper).m128())); return merge(bl,br); } @@ -363,27 +363,27 @@ namespace embree __forceinline Vec2fa eval(const float u, const float v) const { const vfloat4 p = LR.eval(u); - return Vec2fa(lerp(shuffle<0,1,0,1>(p),shuffle<2,3,2,3>(p),v)); + return Vec2fa(lerp(shuffle<0,1,0,1>(p),shuffle<2,3,2,3>(p),v).m128()); } __forceinline Vec2fa eval_du(const float u, const float v) const { const vfloat4 dpdu = LR.eval_dt(u); - return Vec2fa(lerp(shuffle<0,1,0,1>(dpdu),shuffle<2,3,2,3>(dpdu),v)); + return Vec2fa(lerp(shuffle<0,1,0,1>(dpdu),shuffle<2,3,2,3>(dpdu),v).m128()); } __forceinline Vec2fa eval_dv(const float u, const float v) const { const vfloat4 p = LR.eval(u); - return Vec2fa(shuffle<2,3,2,3>(p)-shuffle<0,1,0,1>(p)); + return Vec2fa((shuffle<2,3,2,3>(p)-shuffle<0,1,0,1>(p)).m128()); } __forceinline void eval(const float u, const float v, Vec2fa& p, Vec2fa& dpdu, Vec2fa& dpdv) const { vfloat4 p0, dp0du; LR.eval(u,p0,dp0du); - p = Vec2fa(lerp(shuffle<0,1,0,1>(p0),shuffle<2,3,2,3>(p0),v)); - dpdu = Vec2fa(lerp(shuffle<0,1,0,1>(dp0du),shuffle<2,3,2,3>(dp0du),v)); - dpdv = Vec2fa(shuffle<2,3,2,3>(p0)-shuffle<0,1,0,1>(p0)); + p = Vec2fa(lerp(shuffle<0,1,0,1>(p0),shuffle<2,3,2,3>(p0),v).m128()); + dpdu = Vec2fa(lerp(shuffle<0,1,0,1>(dp0du),shuffle<2,3,2,3>(dp0du),v).m128()); + dpdv = Vec2fa((shuffle<2,3,2,3>(p0)-shuffle<0,1,0,1>(p0)).m128()); } __forceinline TensorLinearQuadraticBezierSurface derivative_u() const { diff --git a/kernels/subdiv/patch_eval_grid.h b/kernels/subdiv/patch_eval_grid.h index 167e1ebe1c..15f2a416e5 100644 --- a/kernels/subdiv/patch_eval_grid.h +++ b/kernels/subdiv/patch_eval_grid.h @@ -80,13 +80,13 @@ namespace embree } #else foreach2(lx0,lx1,ly0,ly1,[&](const vboolx& valid, const vintx& ix, const vintx& iy) { - const vfloatx lu = select(ix == swidth -1, vfloatx(1.0f), (vfloatx(ix)-srange.lower.x)*scale_x); - const vfloatx lv = select(iy == sheight-1, vfloatx(1.0f), (vfloatx(iy)-srange.lower.y)*scale_y); + const vfloatx lu = select(ix == swidth -1, vfloatx(1.0f), (vfloatx(ix.vec_float())-srange.lower.x)*scale_x); + const vfloatx lv = select(iy == sheight-1, vfloatx(1.0f), (vfloatx(iy.vec_float())-srange.lower.y)*scale_y); const Vec3vfx p = patch->patch.eval(lu,lv); Vec3vfx n = zero; if (unlikely(Nx != nullptr)) n = normalize_safe(patch->patch.normal(lu,lv)); - const vfloatx u = vfloatx(ix)*rcp_swidth; - const vfloatx v = vfloatx(iy)*rcp_sheight; + const vfloatx u = vfloatx(ix.vec_float())*rcp_swidth; + const vfloatx v = vfloatx(iy.vec_float())*rcp_sheight; const vintx ofs = (iy-y0)*dwidth+(ix-x0); if (likely(all(valid)) && all(iy==iy[0])) { const unsigned ofs2 = ofs[0]; diff --git a/kernels/subdiv/patch_eval_simd.h b/kernels/subdiv/patch_eval_simd.h index fef88a4492..76ab69030d 100644 --- a/kernels/subdiv/patch_eval_simd.h +++ b/kernels/subdiv/patch_eval_simd.h @@ -65,8 +65,8 @@ namespace embree vbool eval_general(const vbool& valid, const typename Patch::SubdividedGeneralPatch* patch, const vfloat& U, const vfloat& V, const size_t depth) { vbool ret = false; - const vint l = (vint)floor(0.5f*U); const vfloat u = 2.0f*frac(0.5f*U)-0.5f; - const vint h = (vint)floor(0.5f*V); const vfloat v = 2.0f*frac(0.5f*V)-0.5f; + const vint l = (vint)floor(0.5f*U).m128i(); const vfloat u = 
2.0f*frac(0.5f*U)-0.5f; + const vint h = (vint)floor(0.5f*V).m128i(); const vfloat v = 2.0f*frac(0.5f*V)-0.5f; const vint i = (h<<2)+l; assert(all(valid,i<patch->N)); foreach_unique(valid,i,[&](const vbool& valid, const int i) { ret |= eval(valid,patch->child[i],u,v,1.0f,depth+1); diff --git a/kernels/subdiv/tessellation.h b/kernels/subdiv/tessellation.h index abde4f2bde..dbc7fd87b0 100644 --- a/kernels/subdiv/tessellation.h +++ b/kernels/subdiv/tessellation.h @@ -112,8 +112,8 @@ namespace embree const vint8 grid_u_segments = vint8(swidth)-1; const vint8 grid_v_segments = vint8(sheight)-1; - const vfloat8 inv_grid_u_segments = rcp(vfloat8(grid_u_segments)); - const vfloat8 inv_grid_v_segments = rcp(vfloat8(grid_v_segments)); + const vfloat8 inv_grid_u_segments = rcp(vfloat8(grid_u_segments.m256())); + const vfloat8 inv_grid_v_segments = rcp(vfloat8(grid_v_segments.m256())); unsigned int index = 0; vint8 v_i( zero ); @@ -126,8 +126,8 @@ namespace embree for (unsigned int x=0;x 0.01f*(vfloat4)max(abs(in[k][i]),abs(out[k][j])).m128)) + if (any(abs((vfloat4)in[k][i].v-(vfloat4)out[k][j].v) > 0.01f*(vfloat4)max(abs(in[k][i]),abs(out[k][j])).v)) return false; } }
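
Note on the accessor pattern used throughout the hunks above: the mechanical change is to replace implicit conversions from the SIMD wrapper types (vint4, vuint8, vboolf8, vfloat<N>, ...) to raw registers with explicit accessors such as .m128(), .m128i(), .m256(), .m256i(), .mask32(), .packedMask8(), .vec_int() and .vec_float(). Two distinct kinds of accessor are involved: the register accessors (.m128i(), .m256i(), ...) hand back the underlying bits unchanged, while .vec_int()/.vec_float() perform an actual value conversion, so choosing between them at a call site is a semantic decision, not merely a cast. The sketch below illustrates the likely motivation; vfloat4_sketch/vint4_sketch are hypothetical stand-ins rather than Embree's actual definitions, and the stated cause is an assumption: MSVC's arm64_neon.h maps the NEON vector types onto the __n128 container type, so a wrapper exposing implicit conversion operators to both __m128 and __m128i (as typedef'd by sse2neon) ends up with colliding or ambiguous conversions, which the explicit accessors resolve by naming the intent.

#include <arm_neon.h>

// Hypothetical stand-ins for the wrapper classes (not Embree's actual
// definitions); the accessor names mirror the ones used in the diff.
struct vfloat4_sketch {
  float32x4_t v;
  float32x4_t m128()    const { return v; }                 // raw register bits
  int32x4_t   vec_int() const { return vcvtq_s32_f32(v); }  // value conversion (truncating)
};

struct vint4_sketch {
  int32x4_t v;
  int32x4_t   m128i()     const { return v; }                // raw register bits
  float32x4_t vec_float() const { return vcvtq_f32_s32(v); } // value conversion
};

int main() {
  vfloat4_sketch a { vdupq_n_f32(2.5f) };
  // Quantize by an explicit value conversion, then hand the raw integer
  // register to an integer intrinsic; the two steps can no longer be
  // confused with a silent bit reinterpretation.
  vint4_sketch q { a.vec_int() };
  int32x4_t doubled = vaddq_s32(q.m128i(), q.m128i());
  return vgetq_lane_s32(doubled, 0) == 4 ? 0 : 1;  // 2.5f truncates to 2
}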