Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,10 @@ OPTION(EMBREE_MIN_WIDTH "Enables min-width feature to enlarge curve and point th
IF (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND (CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" AND CMAKE_OSX_ARCHITECTURES STREQUAL "") OR ("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES))
MESSAGE(STATUS "Building for Apple silicon")
SET(EMBREE_ARM ON)
# CMAKE_SYSTEM_PROCESSOR is unreliable on Windows: when cross-compiling it still reports AMD64, so check the generator platform instead
ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "Windows" AND CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64")
MESSAGE(STATUS "Building for Windows ARM64 (MSVC)")
SET(EMBREE_ARM ON)
ELSEIF(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64")
MESSAGE(STATUS "Building for AArch64")
SET(EMBREE_ARM ON)
Expand Down
2 changes: 1 addition & 1 deletion common/cmake/check_arm_neon.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#if !defined(__ARM_NEON)
#if !defined(__ARM_NEON) && !defined(_M_ARM64)
#error "No ARM Neon support"
#endif

Expand Down
21 changes: 16 additions & 5 deletions common/cmake/msvc.cmake
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
## Copyright 2009-2021 Intel Corporation
## SPDX-License-Identifier: Apache-2.0

SET(FLAGS_SSE2 "/D__SSE__ /D__SSE2__")
SET(FLAGS_SSE42 "${FLAGS_SSE2} /D__SSE3__ /D__SSSE3__ /D__SSE4_1__ /D__SSE4_2__")
SET(FLAGS_AVX "${FLAGS_SSE42} /arch:AVX")
SET(FLAGS_AVX2 "${FLAGS_SSE42} /arch:AVX2")
SET(FLAGS_AVX512 "${FLAGS_AVX2} /arch:AVX512")
# Per-ISA compiler flag sets for MSVC.
#
# FLAGS_SSE2 is the same for every target, so it is defined once up front.
# For EMBREE_ARM the remaining levels consist of preprocessor defines only
# (no /arch: switch is passed) and FLAGS_AVX512 is not defined at all.
# NOTE(review): presumably the defines only steer code that is emulated on
# ARM -- confirm.
# For all other targets each level extends the one below it and AVX and
# newer add the matching /arch: switch.
SET(FLAGS_SSE2 "/D__SSE__ /D__SSE2__")
IF (NOT EMBREE_ARM)
  SET(FLAGS_SSE42 "${FLAGS_SSE2} /D__SSE3__ /D__SSSE3__ /D__SSE4_1__ /D__SSE4_2__")
  SET(FLAGS_AVX "${FLAGS_SSE42} /arch:AVX")
  SET(FLAGS_AVX2 "${FLAGS_SSE42} /arch:AVX2")
  SET(FLAGS_AVX512 "${FLAGS_AVX2} /arch:AVX512")
ELSE()
  SET(FLAGS_SSE42 "/D__SSE4_2__ /D__SSE4_1__")
  # AVX = SSE4.x defines plus the AVX and bit-manipulation defines.
  SET(FLAGS_AVX "/D__AVX__ ${FLAGS_SSE42} /D__BMI__ /D__BMI2__ /D__LZCNT__")
  # AVX2 = everything in AVX plus the AVX2 define in front.
  SET(FLAGS_AVX2 "/D__AVX2__ ${FLAGS_AVX}")
ENDIF()

SET(COMMON_CXX_FLAGS "")
SET(COMMON_CXX_FLAGS "${COMMON_CXX_FLAGS} /EHsc") # catch C++ exceptions only and extern "C" functions never throw a C++ exception
Expand All @@ -17,6 +24,10 @@ IF (EMBREE_STACK_PROTECTOR)
ELSE()
SET(COMMON_CXX_FLAGS "${COMMON_CXX_FLAGS} /GS-") # do not protect against return address overrides
ENDIF()
# ARM builds only: extend the common C++ flags with /Zc:preprocessor.
IF (EMBREE_ARM)
# sse2neon uses the new preprocessor; /Zc:preprocessor switches MSVC to its
# standards-conforming preprocessor.
SET(COMMON_CXX_FLAGS "${COMMON_CXX_FLAGS} /Zc:preprocessor")
ENDIF()
MACRO(DISABLE_STACK_PROTECTOR_FOR_FILE file)
IF (EMBREE_STACK_PROTECTOR)
SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "/GS-")
Expand Down
4 changes: 2 additions & 2 deletions common/math/bbox.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ namespace embree
return lower > upper;
}

#if defined(__SSE__) || defined(__ARM_NEON)
#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
template<> __forceinline bool BBox<Vec3fa>::empty() const {
return !all(le_mask(lower,upper));
}
Expand Down Expand Up @@ -233,7 +233,7 @@ namespace embree
/// SSE / AVX / MIC specializations
////////////////////////////////////////////////////////////////////////////////

#if defined (__SSE__) || defined(__ARM_NEON)
#if defined (__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
#include "../simd/sse.h"
#endif

Expand Down
8 changes: 4 additions & 4 deletions common/math/color.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ namespace embree
}
__forceinline const Color rcp ( const Color& a )
{
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
__m128 reciprocal = _mm_rcp_ps(a.m128);
reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
Expand All @@ -173,11 +173,11 @@ namespace embree
#endif
return _mm_add_ps(r,_mm_mul_ps(r, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(a, r)))); // computes r + r * (1 - a * r)

#endif //defined(__aarch64__)
#endif //defined(__aarch64__) || defined(_M_ARM64)
}
__forceinline const Color rsqrt( const Color& a )
{
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
__m128 r = _mm_rsqrt_ps(a.m128);
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
Expand All @@ -191,7 +191,7 @@ namespace embree
#endif
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));

#endif //defined(__aarch64__)
#endif //defined(__aarch64__) || defined(_M_ARM64)
}
__forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); }

Expand Down
89 changes: 75 additions & 14 deletions common/math/emath.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# include "math_sycl.h"
#else

#if defined(__ARM_NEON)
#if defined(__ARM_NEON) || defined(_M_ARM64)
#include "../simd/arm/emulation.h"
#else
#include <emmintrin.h>
Expand Down Expand Up @@ -60,14 +60,22 @@ namespace embree

__forceinline float rcp ( const float x )
{
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
// Move scalar to vector register and do rcp.
__m128 a;
#if !defined(_M_ARM64)
a[0] = x;
#else
a.n128_f32[0] = x;
#endif
float32x4_t reciprocal = vrecpeq_f32(a);
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
#if !defined(_M_ARM64)
return reciprocal[0];
#else
return reciprocal.n128_f32[0];
#endif
#else

const __m128 a = _mm_set_ss(x);
Expand All @@ -84,58 +92,93 @@ namespace embree
return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
#endif

#endif //defined(__aarch64__)
#endif //defined(__aarch64__) || defined(_M_ARM64)
}

__forceinline float signmsk ( const float x ) {
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
// FP and Neon shares same vector register in arm64
__m128 a;
__m128i b;
#if !defined(_M_ARM64)
a[0] = x;
b[0] = 0x80000000;
#else
a.n128_f32[0] = x;
b.n128_i32[0] = 0x80000000;
#endif
a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
#if !defined(_M_ARM64)
return a[0];
#else
return a.n128_f32[0];
#endif
#else
return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
#endif
}
__forceinline float xorf( const float x, const float y ) {
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
// FP and Neon shares same vector register in arm64
__m128 a;
__m128 b;
#if !defined(_M_ARM64)
a[0] = x;
b[0] = y;
#else
a.n128_f32[0] = x;
b.n128_f32[0] = y;
#endif
a = _mm_xor_ps(a, b);
#if !defined(_M_ARM64)
return a[0];
#else
return a.n128_f32[0];
#endif
#else
return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
#endif
}
__forceinline float andf( const float x, const unsigned y ) {
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
// FP and Neon shares same vector register in arm64
__m128 a;
__m128i b;
#if !defined(_M_ARM64)
a[0] = x;
b[0] = y;
#else
a.n128_f32[0] = x;
b.n128_u32[0] = y;
#endif
a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
#if !defined(_M_ARM64)
return a[0];
#else
return a.n128_f32[0];
#endif
#else
return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
#endif
}
__forceinline float rsqrt( const float x )
{
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
// FP and Neon shares same vector register in arm64
__m128 a;
#if !defined(_M_ARM64)
a[0] = x;
#else
a.n128_f32[0] = x;
#endif
__m128 value = _mm_rsqrt_ps(a);
value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
#if !defined(_M_ARM64)
return value[0];
#else
return value.n128_f32[0];
#endif
#else

const __m128 a = _mm_set_ss(x);
Expand Down Expand Up @@ -204,15 +247,24 @@ namespace embree
__forceinline double floor( const double x ) { return ::floor (x); }
__forceinline double ceil ( const double x ) { return ::ceil (x); }

#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
__forceinline float mini(float a, float b) {
// FP and Neon shares same vector register in arm64
__m128 x;
__m128 y;
#if !defined(_M_ARM64)
x[0] = a;
y[0] = b;
#else
x.n128_f32[0] = a;
y.n128_f32[0] = b;
#endif
x = _mm_min_ps(x, y);
return x[0];
#if !defined(_M_ARM64)
return x[0];
#else
return x.n128_f32[0];
#endif
}
#elif defined(__SSE4_1__)
__forceinline float mini(float a, float b) {
Expand All @@ -223,15 +275,24 @@ namespace embree
}
#endif

#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
__forceinline float maxi(float a, float b) {
// FP and Neon shares same vector register in arm64
__m128 x;
__m128 y;
#if !defined(_M_ARM64)
x[0] = a;
y[0] = b;
#else
x.n128_f32[0] = a;
y.n128_f32[0] = b;
#endif
x = _mm_max_ps(x, y);
return x[0];
#if !defined(_M_ARM64)
return x[0];
#else
return x.n128_f32[0];
#endif
}
#elif defined(__SSE4_1__)
__forceinline float maxi(float a, float b) {
Expand All @@ -250,7 +311,7 @@ namespace embree
__forceinline int64_t min(int64_t a, int64_t b) { return a<b ? a:b; }
__forceinline float min(float a, float b) { return a<b ? a:b; }
__forceinline double min(double a, double b) { return a<b ? a:b; }
#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
#if defined(__64BIT__) || defined(__EMSCRIPTEN__) || (defined(_M_ARM64) && !defined(__clang__))
__forceinline size_t min(size_t a, size_t b) { return a<b ? a:b; }
#endif
#if defined(__EMSCRIPTEN__)
Expand All @@ -270,7 +331,7 @@ namespace embree
__forceinline int64_t max(int64_t a, int64_t b) { return a<b ? b:a; }
__forceinline float max(float a, float b) { return a<b ? b:a; }
__forceinline double max(double a, double b) { return a<b ? b:a; }
#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
#if defined(__64BIT__) || defined(__EMSCRIPTEN__) || (defined(_M_ARM64) && !defined(__clang__))
__forceinline size_t max(size_t a, size_t b) { return a<b ? b:a; }
#endif
#if defined(__EMSCRIPTEN__)
Expand Down Expand Up @@ -423,7 +484,7 @@ __forceinline float nmsub ( const float a, const float b, const float c) { retur
return x | (y << 1) | (z << 2);
}

#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__AVX2__) && !defined(__aarch64__) && !defined(_M_ARM64)

template<>
__forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi)
Expand Down
2 changes: 1 addition & 1 deletion common/math/linearspace3.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ namespace embree
/*! compute transposed matrix */
template<> __forceinline const LinearSpace3<Vec3fa> LinearSpace3<Vec3fa>::transposed() const {
vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz);
return LinearSpace3<Vec3fa>(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz));
return LinearSpace3<Vec3fa>(Vec3fa(rx.m128()),Vec3fa(ry.m128()),Vec3fa(rz.m128()));
}
#endif

Expand Down
4 changes: 2 additions & 2 deletions common/math/vec2.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ namespace embree

#include "vec2fa.h"

#if defined(__SSE__) || defined(__ARM_NEON)
#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
#include "../simd/sse.h"
#endif

Expand All @@ -221,7 +221,7 @@ namespace embree
{
template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}

#if defined(__SSE__) || defined(__ARM_NEON)
#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
#endif

Expand Down
Loading
Loading