RenderKit · dousse-adobe · Jun 7, 2023 · Jun 8, 2023 · Feb 10, 2026 · Feb 10, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -221,6 +221,10 @@ OPTION(EMBREE_MIN_WIDTH "Enables min-width feature to enlarge curve and point th
 IF (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND (CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" AND CMAKE_OSX_ARCHITECTURES STREQUAL "") OR ("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES))
   MESSAGE(STATUS "Building for Apple silicon")
   SET(EMBREE_ARM ON)
+# CMAKE_SYSTEM_PROCESSOR is unreliable on Windows: when cross-compiling from an x64 host it still reports AMD64, so test the generator platform (-A ARM64) instead.
+ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "Windows" AND CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64")
+  MESSAGE(STATUS "Building for Windows ARM64 (MSVC)")
+  SET(EMBREE_ARM ON)
 ELSEIF(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64")
   MESSAGE(STATUS "Building for AArch64")
   SET(EMBREE_ARM ON)

diff --git a/common/cmake/check_arm_neon.cpp b/common/cmake/check_arm_neon.cpp
@@ -1,7 +1,7 @@
 // Copyright 2009-2020 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 
-#if !defined(__ARM_NEON)
+#if !defined(__ARM_NEON) && !defined(_M_ARM64)
 #error "No ARM Neon support"
 #endif
 

diff --git a/common/cmake/msvc.cmake b/common/cmake/msvc.cmake
@@ -1,11 +1,18 @@
 ## Copyright 2009-2021 Intel Corporation
 ## SPDX-License-Identifier: Apache-2.0
 
-SET(FLAGS_SSE2  "/D__SSE__ /D__SSE2__")
-SET(FLAGS_SSE42 "${FLAGS_SSE2} /D__SSE3__ /D__SSSE3__ /D__SSE4_1__ /D__SSE4_2__")
-SET(FLAGS_AVX   "${FLAGS_SSE42} /arch:AVX")
-SET(FLAGS_AVX2  "${FLAGS_SSE42} /arch:AVX2")
-SET(FLAGS_AVX512  "${FLAGS_AVX2} /arch:AVX512")
+IF (EMBREE_ARM)
+  SET(FLAGS_SSE2 "/D__SSE__ /D__SSE2__")
+  SET(FLAGS_SSE42 "/D__SSE4_2__  /D__SSE4_1__")
+  SET(FLAGS_AVX "/D__AVX__ /D__SSE4_2__  /D__SSE4_1__  /D__BMI__ /D__BMI2__ /D__LZCNT__")
+  SET(FLAGS_AVX2 "/D__AVX2__ /D__AVX__ /D__SSE4_2__  /D__SSE4_1__  /D__BMI__ /D__BMI2__ /D__LZCNT__")
+ELSE()
+  SET(FLAGS_SSE2  "/D__SSE__ /D__SSE2__")
+  SET(FLAGS_SSE42 "${FLAGS_SSE2} /D__SSE3__ /D__SSSE3__ /D__SSE4_1__ /D__SSE4_2__")
+  SET(FLAGS_AVX   "${FLAGS_SSE42} /arch:AVX")
+  SET(FLAGS_AVX2  "${FLAGS_SSE42} /arch:AVX2")
+  SET(FLAGS_AVX512  "${FLAGS_AVX2} /arch:AVX512")
+ENDIF()
 
 SET(COMMON_CXX_FLAGS "")
 SET(COMMON_CXX_FLAGS "${COMMON_CXX_FLAGS} /EHsc")        # catch C++ exceptions only and extern "C" functions never throw a C++ exception
@@ -17,6 +24,10 @@ IF (EMBREE_STACK_PROTECTOR)
 ELSE()
   SET(COMMON_CXX_FLAGS "${COMMON_CXX_FLAGS} /GS-")          # do not protect against return address overrides
 ENDIF()
+IF (EMBREE_ARM)
+  # sse2neon needs MSVC's new standard-conforming preprocessor, which is enabled with /Zc:preprocessor
+  SET(COMMON_CXX_FLAGS "${COMMON_CXX_FLAGS} /Zc:preprocessor")
+ENDIF()
 MACRO(DISABLE_STACK_PROTECTOR_FOR_FILE file)
   IF (EMBREE_STACK_PROTECTOR)
     SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "/GS-")

diff --git a/common/math/bbox.h b/common/math/bbox.h
@@ -82,7 +82,7 @@ namespace embree
     return lower > upper;
   }
 
-#if defined(__SSE__) || defined(__ARM_NEON)
+#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
   template<> __forceinline bool BBox<Vec3fa>::empty() const {
     return !all(le_mask(lower,upper));
   }
@@ -233,7 +233,7 @@ namespace embree
 /// SSE / AVX / MIC specializations
 ////////////////////////////////////////////////////////////////////////////////
 
-#if defined (__SSE__) || defined(__ARM_NEON)
+#if defined (__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
 #include "../simd/sse.h"
 #endif
 

diff --git a/common/math/color.h b/common/math/color.h
@@ -160,7 +160,7 @@ namespace embree
   }
   __forceinline const Color rcp  ( const Color& a )
   {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
     __m128 reciprocal = _mm_rcp_ps(a.m128);
     reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
     reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
@@ -173,11 +173,11 @@ namespace embree
 #endif
     return _mm_add_ps(r,_mm_mul_ps(r, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(a, r))));   // computes r + r * (1 - a * r)
 
-#endif  //defined(__aarch64__)
+#endif  //defined(__aarch64__) || defined(_M_ARM64)
   }
   __forceinline const Color rsqrt( const Color& a )
   {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
     __m128 r = _mm_rsqrt_ps(a.m128);
     r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
     r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
@@ -191,7 +191,7 @@ namespace embree
 #endif
     return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
 
-#endif  //defined(__aarch64__)
+#endif  //defined(__aarch64__) || defined(_M_ARM64)
   }
   __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); }
 

diff --git a/common/math/emath.h b/common/math/emath.h
@@ -12,7 +12,7 @@
 #  include "math_sycl.h"
 #else
 
-#if defined(__ARM_NEON)
+#if defined(__ARM_NEON) || defined(_M_ARM64)
 #include "../simd/arm/emulation.h"
 #else
 #include <emmintrin.h>
@@ -60,14 +60,22 @@ namespace embree
 
   __forceinline float rcp  ( const float x )
   {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
       // Move scalar to vector register and do rcp.
       __m128 a;
+#if !defined(_M_ARM64)
       a[0] = x;
+#else
+      a.n128_f32[0] = x;
+#endif
       float32x4_t reciprocal = vrecpeq_f32(a);
       reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
       reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+#if !defined(_M_ARM64)
       return reciprocal[0];
+#else
+      return reciprocal.n128_f32[0];
+#endif
 #else
 
     const __m128 a = _mm_set_ss(x);
@@ -84,58 +92,93 @@ namespace embree
     return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
 #endif
 
-#endif  //defined(__aarch64__)
+#endif  //defined(__aarch64__) || defined(_M_ARM64)
   }
 
   __forceinline float signmsk ( const float x ) {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
       // FP and Neon shares same vector register in arm64
       __m128 a;
       __m128i b;
+#if !defined(_M_ARM64)
       a[0] = x;
       b[0] = 0x80000000;
+#else
+      a.n128_f32[0] = x;
+      b.n128_i32[0] = 0x80000000;
+#endif
       a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
+#if !defined(_M_ARM64)
       return a[0];
+#else
+      return a.n128_f32[0];
+#endif
 #else
     return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
 #endif
   }
   __forceinline float xorf( const float x, const float y ) {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
       // FP and Neon shares same vector register in arm64
       __m128 a;
       __m128 b;
+#if !defined(_M_ARM64)
       a[0] = x;
       b[0] = y;
+#else
+      a.n128_f32[0] = x;
+      b.n128_f32[0] = y;
+#endif
       a = _mm_xor_ps(a, b);
+#if !defined(_M_ARM64)
       return a[0];
+#else
+      return a.n128_f32[0];
+#endif
 #else
     return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
 #endif
   }
   __forceinline float andf( const float x, const unsigned y ) {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
       // FP and Neon shares same vector register in arm64
       __m128 a;
       __m128i b;
+#if  !defined(_M_ARM64)
       a[0] = x;
       b[0] = y;
+#else
+      a.n128_f32[0] = x;
+      b.n128_u32[0] = y;
+#endif
       a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
+#if !defined(_M_ARM64)
       return a[0];
+#else
+      return a.n128_f32[0];
+#endif
 #else
     return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
 #endif
   }
   __forceinline float rsqrt( const float x )
   {
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
       // FP and Neon shares same vector register in arm64
       __m128 a;
+#if !defined(_M_ARM64)
       a[0] = x;
+#else
+      a.n128_f32[0] = x;
+#endif
       __m128 value = _mm_rsqrt_ps(a);
       value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
       value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
+#if !defined(_M_ARM64)  
       return value[0];
+#else
+      return value.n128_f32[0];
+#endif
 #else
 
     const __m128 a = _mm_set_ss(x);
@@ -204,15 +247,24 @@ namespace embree
   __forceinline double floor( const double x ) { return ::floor (x); }
   __forceinline double ceil ( const double x ) { return ::ceil (x); }
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
     __forceinline float mini(float a, float b) {
         // FP and Neon shares same vector register in arm64
         __m128 x;
         __m128 y;
+#if !defined(_M_ARM64)
         x[0] = a;
         y[0] = b;
+#else
+      x.n128_f32[0] = a;
+      y.n128_f32[0] = b;
+#endif
         x = _mm_min_ps(x, y);
-        return x[0];
+#if !defined(_M_ARM64)
+      return x[0];
+#else
+      return x.n128_f32[0];
+#endif
     }
 #elif defined(__SSE4_1__)
   __forceinline float mini(float a, float b) {
@@ -223,15 +275,24 @@ namespace embree
   }
 #endif
 
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(_M_ARM64)
     __forceinline float maxi(float a, float b) {
         // FP and Neon shares same vector register in arm64
         __m128 x;
         __m128 y;
+#if !defined(_M_ARM64)
         x[0] = a;
         y[0] = b;
+#else
+      x.n128_f32[0] = a;
+      y.n128_f32[0] = b;
+#endif
         x = _mm_max_ps(x, y);
-        return x[0];
+#if !defined(_M_ARM64)
+      return x[0];
+#else
+      return x.n128_f32[0];
+#endif
     }
 #elif defined(__SSE4_1__)
   __forceinline float maxi(float a, float b) {
@@ -250,7 +311,7 @@ namespace embree
   __forceinline  int64_t min(int64_t  a, int64_t  b) { return a<b ? a:b; }
   __forceinline    float min(float    a, float    b) { return a<b ? a:b; }
   __forceinline   double min(double   a, double   b) { return a<b ? a:b; }
-#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
+#if defined(__64BIT__) || defined(__EMSCRIPTEN__) || (defined(_M_ARM64) && !defined(__clang__))
   __forceinline   size_t min(size_t   a, size_t   b) { return a<b ? a:b; }
 #endif
 #if defined(__EMSCRIPTEN__)
@@ -270,7 +331,7 @@ namespace embree
   __forceinline  int64_t max(int64_t  a, int64_t  b) { return a<b ? b:a; }
   __forceinline    float max(float    a, float    b) { return a<b ? b:a; }
   __forceinline   double max(double   a, double   b) { return a<b ? b:a; }
-#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
+#if defined(__64BIT__) || defined(__EMSCRIPTEN__) || (defined(_M_ARM64) && !defined(__clang__))
   __forceinline   size_t max(size_t   a, size_t   b) { return a<b ? b:a; }
 #endif
 #if defined(__EMSCRIPTEN__)
@@ -423,7 +484,7 @@ __forceinline float nmsub ( const float a, const float b, const float c) { retur
     return x | (y << 1) | (z << 2);
   }
 
-#if defined(__AVX2__) && !defined(__aarch64__)
+#if defined(__AVX2__) && !defined(__aarch64__) && !defined(_M_ARM64)
 
   template<>
     __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi)

diff --git a/common/math/linearspace3.h b/common/math/linearspace3.h
@@ -96,7 +96,7 @@ namespace embree
   /*! compute transposed matrix */
   template<> __forceinline const LinearSpace3<Vec3fa> LinearSpace3<Vec3fa>::transposed() const { 
     vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz);
-    return LinearSpace3<Vec3fa>(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz)); 
+    return LinearSpace3<Vec3fa>(Vec3fa(rx.m128()),Vec3fa(ry.m128()),Vec3fa(rz.m128())); 
   }
 #endif
 

diff --git a/common/math/vec2.h b/common/math/vec2.h
@@ -205,7 +205,7 @@ namespace embree
 
 #include "vec2fa.h"
 
-#if defined(__SSE__) || defined(__ARM_NEON)
+#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
 #include "../simd/sse.h"
 #endif
 
@@ -221,7 +221,7 @@ namespace embree
 {
   template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
 
-#if defined(__SSE__) || defined(__ARM_NEON)
+#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
   template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
 #endif