initial commit, 4.5 stable
Some checks failed
🔗 GHA / 📊 Static checks (push) Has been cancelled
🔗 GHA / 🤖 Android (push) Has been cancelled
🔗 GHA / 🍏 iOS (push) Has been cancelled
🔗 GHA / 🐧 Linux (push) Has been cancelled
🔗 GHA / 🍎 macOS (push) Has been cancelled
🔗 GHA / 🏁 Windows (push) Has been cancelled
🔗 GHA / 🌐 Web (push) Has been cancelled

This commit is contained in:
2025-09-16 20:46:46 -04:00
commit 9d30169a8d
13378 changed files with 7050105 additions and 0 deletions

View File

@@ -0,0 +1,99 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2024 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// A vector consisting of 16 bytes
class [[nodiscard]] alignas(JPH_VECTOR_ALIGNMENT) BVec16
{
public:
JPH_OVERRIDE_NEW_DELETE
// Underlying vector type
#if defined(JPH_USE_SSE)
using Type = __m128i;
#elif defined(JPH_USE_NEON)
using Type = uint8x16_t;
#else
using Type = struct { uint64 mData[2]; }; // Generic fallback: the 16 bytes stored as two 64-bit halves
#endif
/// Constructor
BVec16() = default; ///< Intentionally not initialized for performance reasons
BVec16(const BVec16 &inRHS) = default;
BVec16 & operator = (const BVec16 &inRHS) = default;
JPH_INLINE BVec16(Type inRHS) : mValue(inRHS) { } ///< Implicit conversion from the underlying vector type
/// Create a vector from 16 bytes
JPH_INLINE BVec16(uint8 inB0, uint8 inB1, uint8 inB2, uint8 inB3, uint8 inB4, uint8 inB5, uint8 inB6, uint8 inB7, uint8 inB8, uint8 inB9, uint8 inB10, uint8 inB11, uint8 inB12, uint8 inB13, uint8 inB14, uint8 inB15);
/// Create a vector from two uint64's
JPH_INLINE BVec16(uint64 inV0, uint64 inV1);
/// Comparison
JPH_INLINE bool operator == (BVec16Arg inV2) const;
JPH_INLINE bool operator != (BVec16Arg inV2) const { return !(*this == inV2); }
/// Vector with all zeros
static JPH_INLINE BVec16 sZero();
/// Replicate int inV across all components
static JPH_INLINE BVec16 sReplicate(uint8 inV);
/// Load 16 bytes from memory
static JPH_INLINE BVec16 sLoadByte16(const uint8 *inV);
/// Equals (component wise), highest bit of each component that is set is considered true
static JPH_INLINE BVec16 sEquals(BVec16Arg inV1, BVec16Arg inV2);
/// Logical or (component wise)
static JPH_INLINE BVec16 sOr(BVec16Arg inV1, BVec16Arg inV2);
/// Logical xor (component wise)
static JPH_INLINE BVec16 sXor(BVec16Arg inV1, BVec16Arg inV2);
/// Logical and (component wise)
static JPH_INLINE BVec16 sAnd(BVec16Arg inV1, BVec16Arg inV2);
/// Logical not (component wise)
static JPH_INLINE BVec16 sNot(BVec16Arg inV1);
/// Get component by index
JPH_INLINE uint8 operator [] (uint inCoordinate) const { JPH_ASSERT(inCoordinate < 16); return mU8[inCoordinate]; }
JPH_INLINE uint8 & operator [] (uint inCoordinate) { JPH_ASSERT(inCoordinate < 16); return mU8[inCoordinate]; }
/// Test if any of the components are true (true is when highest bit of component is set)
JPH_INLINE bool TestAnyTrue() const;
/// Test if all components are true (true is when highest bit of component is set)
JPH_INLINE bool TestAllTrue() const;
/// Store if mU8[0] is true in bit 0, mU8[1] in bit 1, etc. (true is when highest bit of component is set)
JPH_INLINE int GetTrues() const;
/// To String
friend ostream & operator << (ostream &inStream, BVec16Arg inV)
{
inStream << uint(inV.mU8[0]) << ", " << uint(inV.mU8[1]) << ", " << uint(inV.mU8[2]) << ", " << uint(inV.mU8[3]) << ", "
<< uint(inV.mU8[4]) << ", " << uint(inV.mU8[5]) << ", " << uint(inV.mU8[6]) << ", " << uint(inV.mU8[7]) << ", "
<< uint(inV.mU8[8]) << ", " << uint(inV.mU8[9]) << ", " << uint(inV.mU8[10]) << ", " << uint(inV.mU8[11]) << ", "
<< uint(inV.mU8[12]) << ", " << uint(inV.mU8[13]) << ", " << uint(inV.mU8[14]) << ", " << uint(inV.mU8[15]);
return inStream;
}
/// The 16 bytes, viewable as the native SIMD type, as individual bytes or as two 64-bit halves (all union members alias the same storage)
union
{
Type mValue;
uint8 mU8[16];
uint64 mU64[2];
};
};
static_assert(std::is_trivial<BVec16>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
#include "BVec16.inl"

View File

@@ -0,0 +1,177 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2024 Jorrit Rouwe
// SPDX-License-Identifier: MIT
JPH_NAMESPACE_BEGIN
/// Create a vector from 16 individual bytes, inB0 ends up in component 0, inB15 in component 15
BVec16::BVec16(uint8 inB0, uint8 inB1, uint8 inB2, uint8 inB3, uint8 inB4, uint8 inB5, uint8 inB6, uint8 inB7, uint8 inB8, uint8 inB9, uint8 inB10, uint8 inB11, uint8 inB12, uint8 inB13, uint8 inB14, uint8 inB15)
{
#if defined(JPH_USE_SSE)
// Note: _mm_set_epi8 takes its arguments from the highest element to the lowest
mValue = _mm_set_epi8(char(inB15), char(inB14), char(inB13), char(inB12), char(inB11), char(inB10), char(inB9), char(inB8), char(inB7), char(inB6), char(inB5), char(inB4), char(inB3), char(inB2), char(inB1), char(inB0));
#elif defined(JPH_USE_NEON)
// Pack the low/high 8 bytes each into a uint64 and combine into one 128-bit register
uint8x8_t v1 = vcreate_u8(uint64(inB0) | (uint64(inB1) << 8) | (uint64(inB2) << 16) | (uint64(inB3) << 24) | (uint64(inB4) << 32) | (uint64(inB5) << 40) | (uint64(inB6) << 48) | (uint64(inB7) << 56));
uint8x8_t v2 = vcreate_u8(uint64(inB8) | (uint64(inB9) << 8) | (uint64(inB10) << 16) | (uint64(inB11) << 24) | (uint64(inB12) << 32) | (uint64(inB13) << 40) | (uint64(inB14) << 48) | (uint64(inB15) << 56));
mValue = vcombine_u8(v1, v2);
#else
// Scalar fallback: write the bytes through the union
mU8[0] = inB0;
mU8[1] = inB1;
mU8[2] = inB2;
mU8[3] = inB3;
mU8[4] = inB4;
mU8[5] = inB5;
mU8[6] = inB6;
mU8[7] = inB7;
mU8[8] = inB8;
mU8[9] = inB9;
mU8[10] = inB10;
mU8[11] = inB11;
mU8[12] = inB12;
mU8[13] = inB13;
mU8[14] = inB14;
mU8[15] = inB15;
#endif
}
/// Create a vector from two uint64's; inV0 holds bytes 0-7, inV1 holds bytes 8-15
BVec16::BVec16(uint64 inV0, uint64 inV1)
{
mU64[0] = inV0;
mU64[1] = inV1;
}
/// Exact comparison: true only when all 16 bytes are equal
bool BVec16::operator == (BVec16Arg inV2) const
{
return sEquals(*this, inV2).TestAllTrue();
}
/// Vector with all 16 bytes zero
BVec16 BVec16::sZero()
{
#if defined(JPH_USE_SSE)
return _mm_setzero_si128();
#elif defined(JPH_USE_NEON)
return vdupq_n_u8(0);
#else
return BVec16(0, 0);
#endif
}
/// Replicate inV to all 16 bytes
BVec16 BVec16::sReplicate(uint8 inV)
{
#if defined(JPH_USE_SSE)
return _mm_set1_epi8(char(inV));
#elif defined(JPH_USE_NEON)
return vdupq_n_u8(inV);
#else
// Scalar fallback: each shift-or doubles the number of replicated bytes (1 -> 2 -> 4 -> 8), then the same uint64 is used for both halves
uint64 v(inV);
v |= v << 8;
v |= v << 16;
v |= v << 32;
return BVec16(v, v);
#endif
}
/// Load 16 bytes from (possibly unaligned) memory
BVec16 BVec16::sLoadByte16(const uint8 *inV)
{
#if defined(JPH_USE_SSE)
// Unaligned load, so inV does not need 16 byte alignment
return _mm_loadu_si128(reinterpret_cast<const __m128i *>(inV));
#elif defined(JPH_USE_NEON)
return vld1q_u8(inV);
#else
return BVec16(inV[0], inV[1], inV[2], inV[3], inV[4], inV[5], inV[6], inV[7], inV[8], inV[9], inV[10], inV[11], inV[12], inV[13], inV[14], inV[15]);
#endif
}
/// Component wise equals; each result byte is 0xff when equal and 0x00 when not (on SSE/NEON), in the fallback only the high bit of each byte is set
BVec16 BVec16::sEquals(BVec16Arg inV1, BVec16Arg inV2)
{
#if defined(JPH_USE_SSE)
return _mm_cmpeq_epi8(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vceqq_u8(inV1.mValue, inV2.mValue);
#else
// Scalar fallback: compare 8 bytes at a time; folds the per-bit equality down so that bit 7 of each byte is the AND of all 8 bit comparisons of that byte
auto equals = [](uint64 inV1, uint64 inV2) {
uint64 r = inV1 ^ ~inV2; // Bits that are equal are 1
r &= r << 1; // Combine bit 0 through 1
r &= r << 2; // Combine bit 0 through 3
r &= r << 4; // Combine bit 0 through 7
r &= 0x8080808080808080UL; // Keep only the highest bit of each byte
return r;
};
return BVec16(equals(inV1.mU64[0], inV2.mU64[0]), equals(inV1.mU64[1], inV2.mU64[1]));
#endif
}
/// Bitwise or of all 128 bits
BVec16 BVec16::sOr(BVec16Arg inV1, BVec16Arg inV2)
{
#if defined(JPH_USE_SSE)
return _mm_or_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vorrq_u8(inV1.mValue, inV2.mValue);
#else
return BVec16(inV1.mU64[0] | inV2.mU64[0], inV1.mU64[1] | inV2.mU64[1]);
#endif
}
/// Bitwise xor of all 128 bits
BVec16 BVec16::sXor(BVec16Arg inV1, BVec16Arg inV2)
{
#if defined(JPH_USE_SSE)
return _mm_xor_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return veorq_u8(inV1.mValue, inV2.mValue);
#else
return BVec16(inV1.mU64[0] ^ inV2.mU64[0], inV1.mU64[1] ^ inV2.mU64[1]);
#endif
}
/// Bitwise and of all 128 bits
BVec16 BVec16::sAnd(BVec16Arg inV1, BVec16Arg inV2)
{
#if defined(JPH_USE_SSE)
return _mm_and_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vandq_u8(inV1.mValue, inV2.mValue);
#else
return BVec16(inV1.mU64[0] & inV2.mU64[0], inV1.mU64[1] & inV2.mU64[1]);
#endif
}
/// Bitwise not of all 128 bits
BVec16 BVec16::sNot(BVec16Arg inV1)
{
#if defined(JPH_USE_SSE)
// SSE has no native not, so xor with all ones
return sXor(inV1, sReplicate(0xff));
#elif defined(JPH_USE_NEON)
return vmvnq_u8(inV1.mValue);
#else
return BVec16(~inV1.mU64[0], ~inV1.mU64[1]);
#endif
}
/// Collect the high (sign) bit of each of the 16 bytes into a 16-bit mask; byte i maps to bit i
int BVec16::GetTrues() const
{
#if defined(JPH_USE_SSE)
return _mm_movemask_epi8(mValue);
#else
// Generic path (also taken on NEON, which has no movemask equivalent); NOTE(review): a vshrn-based mask extraction could speed up NEON here — verify against how other Jolt vector classes implement this
int result = 0;
for (int i = 0; i < 16; ++i)
result |= int(mU8[i] >> 7) << i;
return result;
#endif
}
/// True when at least one byte has its high bit set
bool BVec16::TestAnyTrue() const
{
#if defined(JPH_USE_SSE)
return _mm_movemask_epi8(mValue) != 0;
#else
// Generic path (also taken on NEON): OR both 64-bit halves and keep only the high bit of each byte
return ((mU64[0] | mU64[1]) & 0x8080808080808080UL) != 0;
#endif
}
/// True when every byte has its high bit set
bool BVec16::TestAllTrue() const
{
#if defined(JPH_USE_SSE)
return _mm_movemask_epi8(mValue) == 0b1111111111111111;
#else
// Generic path (also taken on NEON): AND both 64-bit halves; all 16 high bits must survive
return ((mU64[0] & mU64[1]) & 0x8080808080808080UL) == 0x8080808080808080UL;
#endif
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,158 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2022 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/MathTypes.h>
JPH_NAMESPACE_BEGIN
/// Holds a 4x4 matrix of floats with the last column consisting of doubles
class [[nodiscard]] alignas(JPH_DVECTOR_ALIGNMENT) DMat44
{
public:
JPH_OVERRIDE_NEW_DELETE
// Underlying column type
using Type = Vec4::Type; ///< Native type of the 3 single precision rotation columns
using DType = DVec3::Type; ///< Native type of the double precision translation column
using DTypeArg = DVec3::TypeArg;
// Argument type
using ArgType = DMat44Arg;
/// Constructor
DMat44() = default; ///< Intentionally not initialized for performance reasons
JPH_INLINE DMat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, DVec3Arg inC4);
DMat44(const DMat44 &inM2) = default;
DMat44 & operator = (const DMat44 &inM2) = default;
JPH_INLINE explicit DMat44(Mat44Arg inM);
JPH_INLINE DMat44(Mat44Arg inRot, DVec3Arg inT);
JPH_INLINE DMat44(Type inC1, Type inC2, Type inC3, DTypeArg inC4);
/// Zero matrix
static JPH_INLINE DMat44 sZero();
/// Identity matrix
static JPH_INLINE DMat44 sIdentity();
/// Rotate from quaternion
static JPH_INLINE DMat44 sRotation(QuatArg inQuat) { return DMat44(Mat44::sRotation(inQuat), DVec3::sZero()); }
/// Get matrix that translates
static JPH_INLINE DMat44 sTranslation(DVec3Arg inV) { return DMat44(Vec4(1, 0, 0, 0), Vec4(0, 1, 0, 0), Vec4(0, 0, 1, 0), inV); }
/// Get matrix that rotates and translates
static JPH_INLINE DMat44 sRotationTranslation(QuatArg inR, DVec3Arg inT) { return DMat44(Mat44::sRotation(inR), inT); }
/// Get inverse matrix of sRotationTranslation
static JPH_INLINE DMat44 sInverseRotationTranslation(QuatArg inR, DVec3Arg inT);
/// Get matrix that scales (produces a matrix with (inV, 1) on its diagonal)
static JPH_INLINE DMat44 sScale(Vec3Arg inV) { return DMat44(Mat44::sScale(inV), DVec3::sZero()); }
/// Convert to Mat44 rounding to nearest
JPH_INLINE Mat44 ToMat44() const { return Mat44(mCol[0], mCol[1], mCol[2], Vec3(mCol3)); }
/// Comparison
JPH_INLINE bool operator == (DMat44Arg inM2) const;
JPH_INLINE bool operator != (DMat44Arg inM2) const { return !(*this == inM2); }
/// Test if two matrices are close
JPH_INLINE bool IsClose(DMat44Arg inM2, float inMaxDistSq = 1.0e-12f) const;
/// Multiply matrix by matrix
JPH_INLINE DMat44 operator * (Mat44Arg inM) const;
/// Multiply matrix by matrix
JPH_INLINE DMat44 operator * (DMat44Arg inM) const;
/// Multiply vector by matrix
JPH_INLINE DVec3 operator * (Vec3Arg inV) const;
/// Multiply vector by matrix
JPH_INLINE DVec3 operator * (DVec3Arg inV) const;
/// Multiply vector by only 3x3 part of the matrix
JPH_INLINE Vec3 Multiply3x3(Vec3Arg inV) const { return GetRotation().Multiply3x3(inV); }
/// Multiply vector by only 3x3 part of the matrix
JPH_INLINE DVec3 Multiply3x3(DVec3Arg inV) const;
/// Multiply vector by only 3x3 part of the transpose of the matrix (\f$result = this^T \: inV\f$)
JPH_INLINE Vec3 Multiply3x3Transposed(Vec3Arg inV) const { return GetRotation().Multiply3x3Transposed(inV); }
/// Scale a matrix: result = this * Mat44::sScale(inScale)
JPH_INLINE DMat44 PreScaled(Vec3Arg inScale) const;
/// Scale a matrix: result = Mat44::sScale(inScale) * this
JPH_INLINE DMat44 PostScaled(Vec3Arg inScale) const;
/// Pre multiply by translation matrix: result = this * Mat44::sTranslation(inTranslation)
JPH_INLINE DMat44 PreTranslated(Vec3Arg inTranslation) const;
/// Pre multiply by translation matrix: result = this * Mat44::sTranslation(inTranslation)
JPH_INLINE DMat44 PreTranslated(DVec3Arg inTranslation) const;
/// Post multiply by translation matrix: result = Mat44::sTranslation(inTranslation) * this (i.e. add inTranslation to the 4-th column)
JPH_INLINE DMat44 PostTranslated(Vec3Arg inTranslation) const;
/// Post multiply by translation matrix: result = Mat44::sTranslation(inTranslation) * this (i.e. add inTranslation to the 4-th column)
JPH_INLINE DMat44 PostTranslated(DVec3Arg inTranslation) const;
/// Access to the columns
JPH_INLINE Vec3 GetAxisX() const { return Vec3(mCol[0]); }
JPH_INLINE void SetAxisX(Vec3Arg inV) { mCol[0] = Vec4(inV, 0.0f); }
JPH_INLINE Vec3 GetAxisY() const { return Vec3(mCol[1]); }
JPH_INLINE void SetAxisY(Vec3Arg inV) { mCol[1] = Vec4(inV, 0.0f); }
JPH_INLINE Vec3 GetAxisZ() const { return Vec3(mCol[2]); }
JPH_INLINE void SetAxisZ(Vec3Arg inV) { mCol[2] = Vec4(inV, 0.0f); }
JPH_INLINE DVec3 GetTranslation() const { return mCol3; }
JPH_INLINE void SetTranslation(DVec3Arg inV) { mCol3 = inV; }
// Note: only the 3 rotation columns can be accessed by index; the translation column is accessed through Get/SetTranslation
JPH_INLINE Vec3 GetColumn3(uint inCol) const { JPH_ASSERT(inCol < 3); return Vec3(mCol[inCol]); }
JPH_INLINE void SetColumn3(uint inCol, Vec3Arg inV) { JPH_ASSERT(inCol < 3); mCol[inCol] = Vec4(inV, 0.0f); }
JPH_INLINE Vec4 GetColumn4(uint inCol) const { JPH_ASSERT(inCol < 3); return mCol[inCol]; }
JPH_INLINE void SetColumn4(uint inCol, Vec4Arg inV) { JPH_ASSERT(inCol < 3); mCol[inCol] = inV; }
/// Transpose 3x3 subpart of matrix
JPH_INLINE Mat44 Transposed3x3() const { return GetRotation().Transposed3x3(); }
/// Inverse 4x4 matrix
JPH_INLINE DMat44 Inversed() const;
/// Inverse 4x4 matrix when it only contains rotation and translation
JPH_INLINE DMat44 InversedRotationTranslation() const;
/// Get rotation part only (note: retains the first 3 values from the bottom row)
JPH_INLINE Mat44 GetRotation() const { return Mat44(mCol[0], mCol[1], mCol[2], Vec4(0, 0, 0, 1)); }
/// Updates the rotation part of this matrix (the first 3 columns)
JPH_INLINE void SetRotation(Mat44Arg inRotation);
/// Convert to quaternion
JPH_INLINE Quat GetQuaternion() const { return GetRotation().GetQuaternion(); }
/// Get matrix that transforms a direction with the same transform as this matrix (length is not preserved)
JPH_INLINE Mat44 GetDirectionPreservingMatrix() const { return GetRotation().Inversed3x3().Transposed3x3(); }
/// Works identical to Mat44::Decompose
JPH_INLINE DMat44 Decompose(Vec3 &outScale) const { return DMat44(GetRotation().Decompose(outScale), mCol3); }
/// To String
friend ostream & operator << (ostream &inStream, DMat44Arg inM)
{
inStream << inM.mCol[0] << ", " << inM.mCol[1] << ", " << inM.mCol[2] << ", " << inM.mCol3;
return inStream;
}
private:
Vec4 mCol[3]; ///< Rotation columns
DVec3 mCol3; ///< Translation column, 4th element is assumed to be 1
};
static_assert(std::is_trivial<DMat44>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
#include "DMat44.inl"

View File

@@ -0,0 +1,310 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/DVec3.h>
JPH_NAMESPACE_BEGIN
/// Construct from 3 single precision rotation columns and a double precision translation column
DMat44::DMat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, DVec3Arg inC4) :
mCol { inC1, inC2, inC3 },
mCol3(inC4)
{
}
/// Construct from the underlying native register types
DMat44::DMat44(Type inC1, Type inC2, Type inC3, DTypeArg inC4) :
mCol { inC1, inC2, inC3 },
mCol3(inC4)
{
}
/// Widen a single precision matrix; the float translation is converted to double precision
DMat44::DMat44(Mat44Arg inM) :
mCol { inM.GetColumn4(0), inM.GetColumn4(1), inM.GetColumn4(2) },
mCol3(inM.GetTranslation())
{
}
/// Combine a single precision rotation with a double precision translation
DMat44::DMat44(Mat44Arg inRot, DVec3Arg inT) :
mCol { inRot.GetColumn4(0), inRot.GetColumn4(1), inRot.GetColumn4(2) },
mCol3(inT)
{
}
/// Matrix with all elements zero
DMat44 DMat44::sZero()
{
	Vec4 zero = Vec4::sZero();
	return DMat44(zero, zero, zero, DVec3::sZero());
}
/// Identity matrix: unit vectors on the diagonal, zero translation
DMat44 DMat44::sIdentity()
{
	Vec4 x(1, 0, 0, 0), y(0, 1, 0, 0), z(0, 0, 1, 0);
	return DMat44(x, y, z, DVec3::sZero());
}
/// Inverse of sRotationTranslation(inR, inT): the rotation of the conjugate quaternion with translation -R^-1 * inT
DMat44 DMat44::sInverseRotationTranslation(QuatArg inR, DVec3Arg inT)
{
	DMat44 result(Mat44::sRotation(inR.Conjugated()), DVec3::sZero());
	result.SetTranslation(-result.Multiply3x3(inT));
	return result;
}
/// Exact comparison of all 3 rotation columns and the translation column
bool DMat44::operator == (DMat44Arg inM2) const
{
	for (int c = 0; c < 3; ++c)
		if (!(mCol[c] == inM2.mCol[c]))
			return false;
	return mCol3 == inM2.mCol3;
}
/// Test if each column of both matrices is within inMaxDistSq of each other (the translation column is compared in double precision)
bool DMat44::IsClose(DMat44Arg inM2, float inMaxDistSq) const
{
	return mCol[0].IsClose(inM2.mCol[0], inMaxDistSq)
		&& mCol[1].IsClose(inM2.mCol[1], inMaxDistSq)
		&& mCol[2].IsClose(inM2.mCol[2], inMaxDistSq)
		&& mCol3.IsClose(inM2.mCol3, double(inMaxDistSq));
}
/// Transform inV as a point: rotation part (computed in single precision) * inV, widened to double and added to the translation column
DVec3 DMat44::operator * (Vec3Arg inV) const
{
#if defined(JPH_USE_AVX)
// 3x3 * inV in single precision
__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
// Widen to double, add translation, and duplicate Z into W
return DVec3::sFixW(_mm256_add_pd(mCol3.mValue, _mm256_cvtps_pd(t)));
#elif defined(JPH_USE_SSE)
__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
// Widen the XY and ZZ halves separately (the shuffle broadcasts Z so the high half gets Z in both lanes)
__m128d low = _mm_add_pd(mCol3.mValue.mLow, _mm_cvtps_pd(t));
__m128d high = _mm_add_pd(mCol3.mValue.mHigh, _mm_cvtps_pd(_mm_shuffle_ps(t, t, _MM_SHUFFLE(2, 2, 2, 2))));
return DVec3({ low, high });
#elif defined(JPH_USE_NEON)
float32x4_t t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
float64x2_t low = vaddq_f64(mCol3.mValue.val[0], vcvt_f64_f32(vget_low_f32(t)));
float64x2_t high = vaddq_f64(mCol3.mValue.val[1], vcvt_high_f64_f32(t));
return DVec3::sFixW({ low, high });
#else
// Scalar fallback: note that the 3x3 product is evaluated in float before being widened to double, matching the SIMD paths
return DVec3(
mCol3.mF64[0] + double(mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2]),
mCol3.mF64[1] + double(mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2]),
mCol3.mF64[2] + double(mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2]));
#endif
}
/// Transform inV as a point: the float rotation columns are widened to double, multiplied by inV in double precision and added to the translation
DVec3 DMat44::operator * (DVec3Arg inV) const
{
#if defined(JPH_USE_AVX)
__m256d t = _mm256_add_pd(mCol3.mValue, _mm256_mul_pd(_mm256_cvtps_pd(mCol[0].mValue), _mm256_set1_pd(inV.mF64[0])));
t = _mm256_add_pd(t, _mm256_mul_pd(_mm256_cvtps_pd(mCol[1].mValue), _mm256_set1_pd(inV.mF64[1])));
t = _mm256_add_pd(t, _mm256_mul_pd(_mm256_cvtps_pd(mCol[2].mValue), _mm256_set1_pd(inV.mF64[2])));
return DVec3::sFixW(t);
#elif defined(JPH_USE_SSE)
// Broadcast each component of inV
__m128d xxxx = _mm_set1_pd(inV.mF64[0]);
__m128d yyyy = _mm_set1_pd(inV.mF64[1]);
__m128d zzzz = _mm_set1_pd(inV.mF64[2]);
__m128 col0 = mCol[0].mValue;
__m128 col1 = mCol[1].mValue;
__m128 col2 = mCol[2].mValue;
// Low half holds elements X and Y of the result
__m128d t_low = _mm_add_pd(mCol3.mValue.mLow, _mm_mul_pd(_mm_cvtps_pd(col0), xxxx));
t_low = _mm_add_pd(t_low, _mm_mul_pd(_mm_cvtps_pd(col1), yyyy));
t_low = _mm_add_pd(t_low, _mm_mul_pd(_mm_cvtps_pd(col2), zzzz));
// High half holds element Z (broadcast by the shuffles, so W ends up equal to Z)
__m128d t_high = _mm_add_pd(mCol3.mValue.mHigh, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col0, col0, _MM_SHUFFLE(2, 2, 2, 2))), xxxx));
t_high = _mm_add_pd(t_high, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col1, col1, _MM_SHUFFLE(2, 2, 2, 2))), yyyy));
t_high = _mm_add_pd(t_high, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col2, col2, _MM_SHUFFLE(2, 2, 2, 2))), zzzz));
return DVec3({ t_low, t_high });
#elif defined(JPH_USE_NEON)
float64x2_t xxxx = vdupq_laneq_f64(inV.mValue.val[0], 0);
float64x2_t yyyy = vdupq_laneq_f64(inV.mValue.val[0], 1);
float64x2_t zzzz = vdupq_laneq_f64(inV.mValue.val[1], 0);
float32x4_t col0 = mCol[0].mValue;
float32x4_t col1 = mCol[1].mValue;
float32x4_t col2 = mCol[2].mValue;
float64x2_t t_low = vaddq_f64(mCol3.mValue.val[0], vmulq_f64(vcvt_f64_f32(vget_low_f32(col0)), xxxx));
t_low = vaddq_f64(t_low, vmulq_f64(vcvt_f64_f32(vget_low_f32(col1)), yyyy));
t_low = vaddq_f64(t_low, vmulq_f64(vcvt_f64_f32(vget_low_f32(col2)), zzzz));
float64x2_t t_high = vaddq_f64(mCol3.mValue.val[1], vmulq_f64(vcvt_high_f64_f32(col0), xxxx));
t_high = vaddq_f64(t_high, vmulq_f64(vcvt_high_f64_f32(col1), yyyy));
t_high = vaddq_f64(t_high, vmulq_f64(vcvt_high_f64_f32(col2), zzzz));
return DVec3::sFixW({ t_low, t_high });
#else
// Scalar fallback: each matrix element is promoted to double before the multiply, matching the SIMD paths
return DVec3(
mCol3.mF64[0] + double(mCol[0].mF32[0]) * inV.mF64[0] + double(mCol[1].mF32[0]) * inV.mF64[1] + double(mCol[2].mF32[0]) * inV.mF64[2],
mCol3.mF64[1] + double(mCol[0].mF32[1]) * inV.mF64[0] + double(mCol[1].mF32[1]) * inV.mF64[1] + double(mCol[2].mF32[1]) * inV.mF64[2],
mCol3.mF64[2] + double(mCol[0].mF32[2]) * inV.mF64[0] + double(mCol[1].mF32[2]) * inV.mF64[1] + double(mCol[2].mF32[2]) * inV.mF64[2]);
#endif
}
/// Multiply inV by the 3x3 rotation part only (same structure as operator * (DVec3Arg) but without adding the translation column)
DVec3 DMat44::Multiply3x3(DVec3Arg inV) const
{
#if defined(JPH_USE_AVX)
__m256d t = _mm256_mul_pd(_mm256_cvtps_pd(mCol[0].mValue), _mm256_set1_pd(inV.mF64[0]));
t = _mm256_add_pd(t, _mm256_mul_pd(_mm256_cvtps_pd(mCol[1].mValue), _mm256_set1_pd(inV.mF64[1])));
t = _mm256_add_pd(t, _mm256_mul_pd(_mm256_cvtps_pd(mCol[2].mValue), _mm256_set1_pd(inV.mF64[2])));
return DVec3::sFixW(t);
#elif defined(JPH_USE_SSE)
// Broadcast each component of inV
__m128d xxxx = _mm_set1_pd(inV.mF64[0]);
__m128d yyyy = _mm_set1_pd(inV.mF64[1]);
__m128d zzzz = _mm_set1_pd(inV.mF64[2]);
__m128 col0 = mCol[0].mValue;
__m128 col1 = mCol[1].mValue;
__m128 col2 = mCol[2].mValue;
// Low half holds elements X and Y, high half holds Z (broadcast, so W equals Z)
__m128d t_low = _mm_mul_pd(_mm_cvtps_pd(col0), xxxx);
t_low = _mm_add_pd(t_low, _mm_mul_pd(_mm_cvtps_pd(col1), yyyy));
t_low = _mm_add_pd(t_low, _mm_mul_pd(_mm_cvtps_pd(col2), zzzz));
__m128d t_high = _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col0, col0, _MM_SHUFFLE(2, 2, 2, 2))), xxxx);
t_high = _mm_add_pd(t_high, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col1, col1, _MM_SHUFFLE(2, 2, 2, 2))), yyyy));
t_high = _mm_add_pd(t_high, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col2, col2, _MM_SHUFFLE(2, 2, 2, 2))), zzzz));
return DVec3({ t_low, t_high });
#elif defined(JPH_USE_NEON)
float64x2_t xxxx = vdupq_laneq_f64(inV.mValue.val[0], 0);
float64x2_t yyyy = vdupq_laneq_f64(inV.mValue.val[0], 1);
float64x2_t zzzz = vdupq_laneq_f64(inV.mValue.val[1], 0);
float32x4_t col0 = mCol[0].mValue;
float32x4_t col1 = mCol[1].mValue;
float32x4_t col2 = mCol[2].mValue;
float64x2_t t_low = vmulq_f64(vcvt_f64_f32(vget_low_f32(col0)), xxxx);
t_low = vaddq_f64(t_low, vmulq_f64(vcvt_f64_f32(vget_low_f32(col1)), yyyy));
t_low = vaddq_f64(t_low, vmulq_f64(vcvt_f64_f32(vget_low_f32(col2)), zzzz));
float64x2_t t_high = vmulq_f64(vcvt_high_f64_f32(col0), xxxx);
t_high = vaddq_f64(t_high, vmulq_f64(vcvt_high_f64_f32(col1), yyyy));
t_high = vaddq_f64(t_high, vmulq_f64(vcvt_high_f64_f32(col2), zzzz));
return DVec3::sFixW({ t_low, t_high });
#else
// Scalar fallback: each matrix element is promoted to double before the multiply, matching the SIMD paths
return DVec3(
double(mCol[0].mF32[0]) * inV.mF64[0] + double(mCol[1].mF32[0]) * inV.mF64[1] + double(mCol[2].mF32[0]) * inV.mF64[2],
double(mCol[0].mF32[1]) * inV.mF64[0] + double(mCol[1].mF32[1]) * inV.mF64[1] + double(mCol[2].mF32[1]) * inV.mF64[2],
double(mCol[0].mF32[2]) * inV.mF64[0] + double(mCol[1].mF32[2]) * inV.mF64[1] + double(mCol[2].mF32[2]) * inV.mF64[2]);
#endif
}
/// Multiply by a single precision matrix; the rotation 3x3 product is computed in single precision, the translation through operator * (Vec3Arg)
DMat44 DMat44::operator * (Mat44Arg inM) const
{
DMat44 result;
// Rotation part
#if defined(JPH_USE_SSE)
for (int i = 0; i < 3; ++i)
{
// Result column i = linear combination of our rotation columns weighted by inM's column i
__m128 c = inM.GetColumn4(i).mValue;
__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
result.mCol[i].mValue = t;
}
#elif defined(JPH_USE_NEON)
for (int i = 0; i < 3; ++i)
{
Type c = inM.GetColumn4(i).mValue;
Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
result.mCol[i].mValue = t;
}
#else
for (int i = 0; i < 3; ++i)
{
Vec4 coli = inM.GetColumn4(i);
result.mCol[i] = mCol[0] * coli.mF32[0] + mCol[1] * coli.mF32[1] + mCol[2] * coli.mF32[2];
}
#endif
// Translation part: transform inM's float translation as a point (adds our translation)
result.mCol3 = *this * inM.GetTranslation();
return result;
}
/// Multiply by a double precision matrix: the rotation part is the single precision 3x3 product,
/// the translation part is this * inM.GetTranslation() (computed in double precision)
DMat44 DMat44::operator * (DMat44Arg inM) const
{
	DMat44 result;
	// Rotation part
#if defined(JPH_USE_SSE)
	for (int i = 0; i < 3; ++i)
	{
		// Result column i = linear combination of our rotation columns weighted by inM's column i
		__m128 c = inM.mCol[i].mValue;
		__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
		result.mCol[i].mValue = t;
	}
#elif defined(JPH_USE_NEON)
	for (int i = 0; i < 3; ++i)
	{
		// Access the column member directly for consistency with the SSE and scalar branches (avoids the JPH_ASSERT in GetColumn4; i < 3 always holds here)
		Type c = inM.mCol[i].mValue;
		Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
		t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
		t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
		result.mCol[i].mValue = t;
	}
#else
	for (int i = 0; i < 3; ++i)
	{
		Vec4 coli = inM.mCol[i];
		result.mCol[i] = mCol[0] * coli.mF32[0] + mCol[1] * coli.mF32[1] + mCol[2] * coli.mF32[2];
	}
#endif
	// Translation part: transform inM's translation as a point (adds our translation)
	result.mCol3 = *this * inM.GetTranslation();
	return result;
}
/// Replace the 3 rotation columns with those of inRotation; the translation column is left untouched
void DMat44::SetRotation(Mat44Arg inRotation)
{
	for (uint c = 0; c < 3; ++c)
		mCol[c] = inRotation.GetColumn4(c);
}
/// result = this * Mat44::sScale(inScale): each rotation column is scaled by the matching component, translation unaffected
DMat44 DMat44::PreScaled(Vec3Arg inScale) const
{
	Vec4 c0 = inScale.GetX() * mCol[0];
	Vec4 c1 = inScale.GetY() * mCol[1];
	Vec4 c2 = inScale.GetZ() * mCol[2];
	return DMat44(c0, c1, c2, mCol3);
}
/// result = Mat44::sScale(inScale) * this: every column (translation included) is scaled component wise by (inScale, 1)
DMat44 DMat44::PostScaled(Vec3Arg inScale) const
{
	Vec4 s(inScale, 1);
	Vec4 c0 = s * mCol[0];
	Vec4 c1 = s * mCol[1];
	Vec4 c2 = s * mCol[2];
	return DMat44(c0, c1, c2, DVec3(s) * mCol3);
}
/// result = this * Mat44::sTranslation(inTranslation): rotate the offset into our space and add it to our translation
DMat44 DMat44::PreTranslated(Vec3Arg inTranslation) const
{
	DVec3 translation = GetTranslation() + Multiply3x3(inTranslation);
	return DMat44(mCol[0], mCol[1], mCol[2], translation);
}
/// result = this * Mat44::sTranslation(inTranslation): rotate the double precision offset into our space and add it to our translation
DMat44 DMat44::PreTranslated(DVec3Arg inTranslation) const
{
	DVec3 translation = GetTranslation() + Multiply3x3(inTranslation);
	return DMat44(mCol[0], mCol[1], mCol[2], translation);
}
/// result = Mat44::sTranslation(inTranslation) * this: simply offsets the translation column
DMat44 DMat44::PostTranslated(Vec3Arg inTranslation) const
{
	DVec3 translation = GetTranslation() + inTranslation;
	return DMat44(mCol[0], mCol[1], mCol[2], translation);
}
/// result = Mat44::sTranslation(inTranslation) * this: simply offsets the translation column
DMat44 DMat44::PostTranslated(DVec3Arg inTranslation) const
{
	DVec3 translation = GetTranslation() + inTranslation;
	return DMat44(mCol[0], mCol[1], mCol[2], translation);
}
/// General inverse: invert the 3x3 block, then the translation of the inverse is -R^-1 * T
DMat44 DMat44::Inversed() const
{
	DMat44 inv(GetRotation().Inversed3x3());
	inv.SetTranslation(-inv.Multiply3x3(mCol3));
	return inv;
}
/// Inverse for a pure rotation + translation matrix: the inverse of the 3x3 block is its transpose, translation becomes -R^T * T
DMat44 DMat44::InversedRotationTranslation() const
{
	DMat44 inv(GetRotation().Transposed3x3());
	inv.SetTranslation(-inv.Multiply3x3(mCol3));
	return inv;
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,291 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/Double3.h>
JPH_NAMESPACE_BEGIN
/// 3 component vector of doubles (stored as 4 vectors).
/// Note that we keep the 4th component the same as the 3rd component to avoid divisions by zero when JPH_FLOATING_POINT_EXCEPTIONS_ENABLED defined
class [[nodiscard]] alignas(JPH_DVECTOR_ALIGNMENT) DVec3
{
public:
JPH_OVERRIDE_NEW_DELETE
// Underlying vector type
#if defined(JPH_USE_AVX)
using Type = __m256d;
using TypeArg = __m256d;
#elif defined(JPH_USE_SSE)
using Type = struct { __m128d mLow, mHigh; };
using TypeArg = const Type &;
#elif defined(JPH_USE_NEON)
using Type = float64x2x2_t;
using TypeArg = const Type &;
#else
using Type = struct { double mData[4]; };
using TypeArg = const Type &;
#endif
// Argument type
using ArgType = DVec3Arg;
/// Constructor
DVec3() = default; ///< Intentionally not initialized for performance reasons
DVec3(const DVec3 &inRHS) = default;
DVec3 & operator = (const DVec3 &inRHS) = default;
JPH_INLINE explicit DVec3(Vec3Arg inRHS);
JPH_INLINE explicit DVec3(Vec4Arg inRHS);
JPH_INLINE DVec3(TypeArg inRHS) : mValue(inRHS) { CheckW(); }
/// Create a vector from 3 components
JPH_INLINE DVec3(double inX, double inY, double inZ);
/// Load 3 doubles from memory
explicit JPH_INLINE DVec3(const Double3 &inV);
/// Vector with all zeros
static JPH_INLINE DVec3 sZero();
/// Vector with all ones
static JPH_INLINE DVec3 sOne();
/// Vectors with the principal axis
static JPH_INLINE DVec3 sAxisX() { return DVec3(1, 0, 0); }
static JPH_INLINE DVec3 sAxisY() { return DVec3(0, 1, 0); }
static JPH_INLINE DVec3 sAxisZ() { return DVec3(0, 0, 1); }
/// Replicate inV across all components
static JPH_INLINE DVec3 sReplicate(double inV);
/// Vector with all NaN's
static JPH_INLINE DVec3 sNaN();
/// Load 3 doubles from memory (reads 64 bits extra which it doesn't use)
static JPH_INLINE DVec3 sLoadDouble3Unsafe(const Double3 &inV);
/// Store 3 doubles to memory
JPH_INLINE void StoreDouble3(Double3 *outV) const;
/// Convert to float vector 3 rounding to nearest
JPH_INLINE explicit operator Vec3() const;
/// Prepare to convert to float vector 3 rounding towards zero (returns DVec3 that can be converted to a Vec3 to get the rounding)
JPH_INLINE DVec3 PrepareRoundToZero() const;
/// Prepare to convert to float vector 3 rounding towards positive/negative inf (returns DVec3 that can be converted to a Vec3 to get the rounding)
JPH_INLINE DVec3 PrepareRoundToInf() const;
/// Convert to float vector 3 rounding down
JPH_INLINE Vec3 ToVec3RoundDown() const;
/// Convert to float vector 3 rounding up
JPH_INLINE Vec3 ToVec3RoundUp() const;
/// Return the minimum value of each of the components
static JPH_INLINE DVec3 sMin(DVec3Arg inV1, DVec3Arg inV2);
/// Return the maximum of each of the components
static JPH_INLINE DVec3 sMax(DVec3Arg inV1, DVec3Arg inV2);
/// Clamp a vector between min and max (component wise)
static JPH_INLINE DVec3 sClamp(DVec3Arg inV, DVec3Arg inMin, DVec3Arg inMax);
/// Equals (component wise)
static JPH_INLINE DVec3 sEquals(DVec3Arg inV1, DVec3Arg inV2);
/// Less than (component wise)
static JPH_INLINE DVec3 sLess(DVec3Arg inV1, DVec3Arg inV2);
/// Less than or equal (component wise)
static JPH_INLINE DVec3 sLessOrEqual(DVec3Arg inV1, DVec3Arg inV2);
/// Greater than (component wise)
static JPH_INLINE DVec3 sGreater(DVec3Arg inV1, DVec3Arg inV2);
/// Greater than or equal (component wise)
static JPH_INLINE DVec3 sGreaterOrEqual(DVec3Arg inV1, DVec3Arg inV2);
/// Calculates inMul1 * inMul2 + inAdd
static JPH_INLINE DVec3 sFusedMultiplyAdd(DVec3Arg inMul1, DVec3Arg inMul2, DVec3Arg inAdd);
/// Component wise select, returns inNotSet when highest bit of inControl = 0 and inSet when highest bit of inControl = 1
static JPH_INLINE DVec3 sSelect(DVec3Arg inNotSet, DVec3Arg inSet, DVec3Arg inControl);
/// Logical or (component wise)
static JPH_INLINE DVec3 sOr(DVec3Arg inV1, DVec3Arg inV2);
/// Logical xor (component wise)
static JPH_INLINE DVec3 sXor(DVec3Arg inV1, DVec3Arg inV2);
/// Logical and (component wise)
static JPH_INLINE DVec3 sAnd(DVec3Arg inV1, DVec3Arg inV2);
/// Store if X is true in bit 0, Y in bit 1, Z in bit 2 and W in bit 3 (true is when highest bit of component is set)
JPH_INLINE int GetTrues() const;
/// Test if any of the components are true (true is when highest bit of component is set)
JPH_INLINE bool TestAnyTrue() const;
/// Test if all components are true (true is when highest bit of component is set)
JPH_INLINE bool TestAllTrue() const;
/// Get individual components.
/// Lane 0 of a SIMD register can be extracted directly; the remaining lanes are read through the union with mF64.
#if defined(JPH_USE_AVX)
JPH_INLINE double GetX() const { return _mm_cvtsd_f64(_mm256_castpd256_pd128(mValue)); }
JPH_INLINE double GetY() const { return mF64[1]; }
JPH_INLINE double GetZ() const { return mF64[2]; }
#elif defined(JPH_USE_SSE)
JPH_INLINE double GetX() const { return _mm_cvtsd_f64(mValue.mLow); }
JPH_INLINE double GetY() const { return mF64[1]; }
JPH_INLINE double GetZ() const { return _mm_cvtsd_f64(mValue.mHigh); }
#elif defined(JPH_USE_NEON)
JPH_INLINE double GetX() const { return vgetq_lane_f64(mValue.val[0], 0); }
JPH_INLINE double GetY() const { return vgetq_lane_f64(mValue.val[0], 1); }
JPH_INLINE double GetZ() const { return vgetq_lane_f64(mValue.val[1], 0); }
#else
JPH_INLINE double GetX() const { return mF64[0]; }
JPH_INLINE double GetY() const { return mF64[1]; }
JPH_INLINE double GetZ() const { return mF64[2]; }
#endif
/// Set individual components (writes go through the union with mF64)
JPH_INLINE void SetX(double inX) { mF64[0] = inX; }
JPH_INLINE void SetY(double inY) { mF64[1] = inY; }
JPH_INLINE void SetZ(double inZ) { mF64[2] = mF64[3] = inZ; } // Assure Z and W are the same
/// Set all components
JPH_INLINE void Set(double inX, double inY, double inZ) { *this = DVec3(inX, inY, inZ); }
/// Get double component by index
JPH_INLINE double operator [] (uint inCoordinate) const { JPH_ASSERT(inCoordinate < 3); return mF64[inCoordinate]; }
/// Set double component by index (re-runs sFixW since writing Z must also update W)
JPH_INLINE void SetComponent(uint inCoordinate, double inValue) { JPH_ASSERT(inCoordinate < 3); mF64[inCoordinate] = inValue; mValue = sFixW(mValue); } // Assure Z and W are the same
/// Comparison (component wise equality of X, Y and Z)
JPH_INLINE bool operator == (DVec3Arg inV2) const;
JPH_INLINE bool operator != (DVec3Arg inV2) const { return !(*this == inV2); }
/// Test if two vectors are close (squared distance at most inMaxDistSq)
JPH_INLINE bool IsClose(DVec3Arg inV2, double inMaxDistSq = 1.0e-24) const;
/// Test if vector is near zero (squared length at most inMaxDistSq)
JPH_INLINE bool IsNearZero(double inMaxDistSq = 1.0e-24) const;
/// Test if vector is normalized (squared length within inTolerance of 1)
JPH_INLINE bool IsNormalized(double inTolerance = 1.0e-12) const;
/// Test if vector contains NaN elements
JPH_INLINE bool IsNaN() const;
/// Multiply two double vectors (component wise)
JPH_INLINE DVec3 operator * (DVec3Arg inV2) const;
/// Multiply vector with double
JPH_INLINE DVec3 operator * (double inV2) const;
/// Multiply vector with double
friend JPH_INLINE DVec3 operator * (double inV1, DVec3Arg inV2);
/// Divide vector by double
JPH_INLINE DVec3 operator / (double inV2) const;
/// Multiply vector with double
JPH_INLINE DVec3 & operator *= (double inV2);
/// Multiply vector with vector
JPH_INLINE DVec3 & operator *= (DVec3Arg inV2);
/// Divide vector by double
JPH_INLINE DVec3 & operator /= (double inV2);
/// Add two vectors (component wise)
JPH_INLINE DVec3 operator + (Vec3Arg inV2) const;
/// Add two double vectors (component wise)
JPH_INLINE DVec3 operator + (DVec3Arg inV2) const;
/// Add two vectors (component wise)
JPH_INLINE DVec3 & operator += (Vec3Arg inV2);
/// Add two double vectors (component wise)
JPH_INLINE DVec3 & operator += (DVec3Arg inV2);
/// Negate
JPH_INLINE DVec3 operator - () const;
/// Subtract two vectors (component wise)
JPH_INLINE DVec3 operator - (Vec3Arg inV2) const;
/// Subtract two double vectors (component wise)
JPH_INLINE DVec3 operator - (DVec3Arg inV2) const;
/// Subtract two vectors (component wise)
JPH_INLINE DVec3 & operator -= (Vec3Arg inV2);
/// Subtract two vectors (component wise)
JPH_INLINE DVec3 & operator -= (DVec3Arg inV2);
/// Divide (component wise)
JPH_INLINE DVec3 operator / (DVec3Arg inV2) const;
/// Return the absolute value of each of the components
JPH_INLINE DVec3 Abs() const;
/// Reciprocal vector (1 / value) for each of the components
JPH_INLINE DVec3 Reciprocal() const;
/// Cross product
JPH_INLINE DVec3 Cross(DVec3Arg inV2) const;
/// Dot product
JPH_INLINE double Dot(DVec3Arg inV2) const;
/// Squared length of vector
JPH_INLINE double LengthSq() const;
/// Length of vector
JPH_INLINE double Length() const;
/// Normalize vector (no check for zero length)
JPH_INLINE DVec3 Normalized() const;
/// Component wise square root
JPH_INLINE DVec3 Sqrt() const;
/// Get vector that contains the sign of each element (returns 1 if positive, -1 if negative)
JPH_INLINE DVec3 GetSign() const;
/// To String (writes "X, Y, Z" to the stream)
friend ostream & operator << (ostream &inStream, DVec3Arg inV)
{
inStream << inV.mF64[0] << ", " << inV.mF64[1] << ", " << inV.mF64[2];
return inStream;
}
/// Internal helper function that checks that W is equal to Z, so e.g. dividing by it should not generate div by 0
JPH_INLINE void CheckW() const;
/// Internal helper function that ensures that the Z component is replicated to the W component to prevent divisions by zero
static JPH_INLINE Type sFixW(TypeArg inValue);
/// Representations of true and false for boolean operations.
/// cTrue is the all-ones bit pattern (a NaN when interpreted as a double) and is only meaningful as a bit mask.
inline static const double cTrue = BitCast<double>(~uint64(0));
inline static const double cFalse = 0.0;
/// Scalar access aliases the SIMD register; mF64[3] (W) always mirrors mF64[2] (Z), see sFixW
union
{
Type mValue;
double mF64[4];
};
};
static_assert(std::is_trivial<DVec3>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
#include "DVec3.inl"

View File

@@ -0,0 +1,941 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/HashCombine.h>
// Create a std::hash/JPH::Hash for DVec3
JPH_MAKE_HASHABLE(JPH::DVec3, t.GetX(), t.GetY(), t.GetZ())
JPH_NAMESPACE_BEGIN
// Widen a single precision Vec3 to double precision
DVec3::DVec3(Vec3Arg inRHS)
{
#if defined(JPH_USE_AVX)
	mValue = _mm256_cvtps_pd(inRHS.mValue);
#elif defined(JPH_USE_SSE)
	mValue.mLow = _mm_cvtps_pd(inRHS.mValue);
	mValue.mHigh = _mm_cvtps_pd(_mm_shuffle_ps(inRHS.mValue, inRHS.mValue, _MM_SHUFFLE(2, 2, 2, 2))); // Broadcast Z so the high half becomes (Z, Z)
#elif defined(JPH_USE_NEON)
	mValue.val[0] = vcvt_f64_f32(vget_low_f32(inRHS.mValue));
	mValue.val[1] = vcvt_high_f64_f32(inRHS.mValue); // NOTE(review): W of the result comes from inRHS's W lane - assumes Vec3 keeps W valid, verify against Vec3
#else
	mF64[0] = (double)inRHS.GetX();
	mF64[1] = (double)inRHS.GetY();
	mF64[2] = (double)inRHS.GetZ();
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2]; // Maintain the Z == W invariant (see sFixW)
#endif
#endif
}

// Widen the XYZ components of a single precision Vec4 (W is discarded)
DVec3::DVec3(Vec4Arg inRHS) :
	DVec3(Vec3(inRHS))
{
}

// Construct from three doubles
DVec3::DVec3(double inX, double inY, double inZ)
{
#if defined(JPH_USE_AVX)
	mValue = _mm256_set_pd(inZ, inZ, inY, inX); // Assure Z and W are the same
#elif defined(JPH_USE_SSE)
	mValue.mLow = _mm_set_pd(inY, inX);
	mValue.mHigh = _mm_set1_pd(inZ); // Assure Z and W are the same
#elif defined(JPH_USE_NEON)
	mValue.val[0] = vcombine_f64(vcreate_f64(BitCast<uint64>(inX)), vcreate_f64(BitCast<uint64>(inY)));
	mValue.val[1] = vdupq_n_f64(inZ); // Assure Z and W are the same
#else
	mF64[0] = inX;
	mF64[1] = inY;
	mF64[2] = inZ;
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2];
#endif
#endif
}

// Load from a Double3; all paths read exactly the 3 doubles of inV (contrast with sLoadDouble3Unsafe)
DVec3::DVec3(const Double3 &inV)
{
#if defined(JPH_USE_AVX)
	Type x = _mm256_castpd128_pd256(_mm_load_sd(&inV.x));
	Type y = _mm256_castpd128_pd256(_mm_load_sd(&inV.y));
	Type z = _mm256_broadcast_sd(&inV.z);
	Type xy = _mm256_unpacklo_pd(x, y);
	mValue = _mm256_blend_pd(xy, z, 0b1100); // Assure Z and W are the same
#elif defined(JPH_USE_SSE)
	mValue.mLow = _mm_loadu_pd(&inV.x);
	mValue.mHigh = _mm_set1_pd(inV.z); // Assure Z and W are the same
#elif defined(JPH_USE_NEON)
	mValue.val[0] = vld1q_f64(&inV.x);
	mValue.val[1] = vdupq_n_f64(inV.z); // Assure Z and W are the same
#else
	mF64[0] = inV.x;
	mF64[1] = inV.y;
	mF64[2] = inV.z;
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2];
#endif
#endif
}
// Asserts the invariant that the internal W component is a bit-exact copy of Z.
// Only checked when floating point exceptions are enabled, since then a garbage W could cause a spurious div-by-zero trap.
void DVec3::CheckW() const
{
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	// Avoid asserts when both components are NaN: compare the raw bit patterns instead of the doubles
	JPH_ASSERT(reinterpret_cast<const uint64 *>(mF64)[2] == reinterpret_cast<const uint64 *>(mF64)[3]);
#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
}

/// Internal helper function that ensures that the Z component is replicated to the W component to prevent divisions by zero
DVec3::Type DVec3::sFixW(TypeArg inValue)
{
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
#if defined(JPH_USE_AVX)
	return _mm256_shuffle_pd(inValue, inValue, 2); // Produces (X, Y, Z, Z)
#elif defined(JPH_USE_SSE)
	Type value;
	value.mLow = inValue.mLow;
	value.mHigh = _mm_shuffle_pd(inValue.mHigh, inValue.mHigh, 0); // Duplicate Z into W
	return value;
#elif defined(JPH_USE_NEON)
	Type value;
	value.val[0] = inValue.val[0];
	value.val[1] = vdupq_laneq_f64(inValue.val[1], 0); // Duplicate Z into W
	return value;
#else
	Type value;
	value.mData[0] = inValue.mData[0];
	value.mData[1] = inValue.mData[1];
	value.mData[2] = inValue.mData[2];
	value.mData[3] = inValue.mData[2]; // Duplicate Z into W
	return value;
#endif
#else
	// Without floating point exceptions an arbitrary W is harmless, pass the value through unchanged
	return inValue;
#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
}
// Vector with all components (including the internal W) set to zero
DVec3 DVec3::sZero()
{
#if defined(JPH_USE_AVX)
	return _mm256_setzero_pd();
#elif defined(JPH_USE_SSE)
	__m128d zero = _mm_setzero_pd();
	return DVec3({ zero, zero });
#elif defined(JPH_USE_NEON)
	float64x2_t zero = vdupq_n_f64(0.0);
	return DVec3({ zero, zero });
#else
	return DVec3(0, 0, 0);
#endif
}

// Vector with all components (including the internal W) set to inV
DVec3 DVec3::sReplicate(double inV)
{
#if defined(JPH_USE_AVX)
	return _mm256_set1_pd(inV);
#elif defined(JPH_USE_SSE)
	__m128d value = _mm_set1_pd(inV);
	return DVec3({ value, value });
#elif defined(JPH_USE_NEON)
	float64x2_t value = vdupq_n_f64(inV);
	return DVec3({ value, value });
#else
	return DVec3(inV, inV, inV);
#endif
}

// Vector with all components set to one
DVec3 DVec3::sOne()
{
	return sReplicate(1.0);
}

// Vector with all components set to quiet NaN
DVec3 DVec3::sNaN()
{
	return sReplicate(numeric_limits<double>::quiet_NaN());
}

// Load 3 doubles. "Unsafe" because the AVX and NEON paths load 4 doubles and thus read
// 8 bytes past the end of inV; the caller must guarantee that memory is readable.
DVec3 DVec3::sLoadDouble3Unsafe(const Double3 &inV)
{
#if defined(JPH_USE_AVX)
	Type v = _mm256_loadu_pd(&inV.x);
#elif defined(JPH_USE_SSE)
	Type v;
	v.mLow = _mm_loadu_pd(&inV.x);
	v.mHigh = _mm_set1_pd(inV.z);
#elif defined(JPH_USE_NEON)
	Type v = vld1q_f64_x2(&inV.x);
#else
	Type v = { inV.x, inV.y, inV.z };
#endif
	return sFixW(v); // The over-reading paths leave garbage in W; restore the Z == W invariant when needed
}
// Write the X, Y and Z components to outV; the internal W component is not stored
void DVec3::StoreDouble3(Double3 *outV) const
{
	outV->x = GetX();
	outV->y = GetY();
	outV->z = GetZ();
}
// Narrowing conversion to single precision
DVec3::operator Vec3() const
{
#if defined(JPH_USE_AVX)
	return _mm256_cvtpd_ps(mValue);
#elif defined(JPH_USE_SSE)
	__m128 low = _mm_cvtpd_ps(mValue.mLow); // (X, Y, -, -)
	__m128 high = _mm_cvtpd_ps(mValue.mHigh); // (Z, W, -, -)
	return _mm_shuffle_ps(low, high, _MM_SHUFFLE(1, 0, 1, 0));
#elif defined(JPH_USE_NEON)
	return vcvt_high_f32_f64(vcvtx_f32_f64(mValue.val[0]), mValue.val[1]); // NOTE(review): vcvtx converts X/Y with round-to-odd while the high half uses default rounding - verify this asymmetry is intended
#else
	return Vec3((float)GetX(), (float)GetY(), (float)GetZ());
#endif
}
// Component wise minimum
DVec3 DVec3::sMin(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_min_pd(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_min_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_min_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vminq_f64(inV1.mValue.val[0], inV2.mValue.val[0]), vminq_f64(inV1.mValue.val[1], inV2.mValue.val[1]) });
#else
	return DVec3(min(inV1.mF64[0], inV2.mF64[0]),
	min(inV1.mF64[1], inV2.mF64[1]),
	min(inV1.mF64[2], inV2.mF64[2]));
#endif
}

// Component wise maximum
DVec3 DVec3::sMax(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_max_pd(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_max_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_max_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vmaxq_f64(inV1.mValue.val[0], inV2.mValue.val[0]), vmaxq_f64(inV1.mValue.val[1], inV2.mValue.val[1]) });
#else
	return DVec3(max(inV1.mF64[0], inV2.mF64[0]),
	max(inV1.mF64[1], inV2.mF64[1]),
	max(inV1.mF64[2], inV2.mF64[2]));
#endif
}
// Clamp each component of inV to [inMin, inMax].
// The upper clamp is applied first, then the lower clamp, so when inMin > inMax for a component, inMin wins.
DVec3 DVec3::sClamp(DVec3Arg inV, DVec3Arg inMin, DVec3Arg inMax)
{
	DVec3 upper_clamped = sMin(inV, inMax);
	return sMax(upper_clamped, inMin);
}
// Component wise equality; each component of the result is all bits set (cTrue) when equal, all bits clear otherwise.
// Ordered comparison: a NaN operand yields false.
DVec3 DVec3::sEquals(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_EQ_OQ)
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_cmpeq_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_cmpeq_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vreinterpretq_f64_u64(vceqq_f64(inV1.mValue.val[0], inV2.mValue.val[0])), vreinterpretq_f64_u64(vceqq_f64(inV1.mValue.val[1], inV2.mValue.val[1])) });
#else
	return DVec3(inV1.mF64[0] == inV2.mF64[0]? cTrue : cFalse,
	inV1.mF64[1] == inV2.mF64[1]? cTrue : cFalse,
	inV1.mF64[2] == inV2.mF64[2]? cTrue : cFalse);
#endif
}

// Component wise less than; result components are cTrue / cFalse masks
DVec3 DVec3::sLess(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_LT_OQ);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_cmplt_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_cmplt_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vreinterpretq_f64_u64(vcltq_f64(inV1.mValue.val[0], inV2.mValue.val[0])), vreinterpretq_f64_u64(vcltq_f64(inV1.mValue.val[1], inV2.mValue.val[1])) });
#else
	return DVec3(inV1.mF64[0] < inV2.mF64[0]? cTrue : cFalse,
	inV1.mF64[1] < inV2.mF64[1]? cTrue : cFalse,
	inV1.mF64[2] < inV2.mF64[2]? cTrue : cFalse);
#endif
}

// Component wise less than or equal; result components are cTrue / cFalse masks
DVec3 DVec3::sLessOrEqual(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_LE_OQ);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_cmple_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_cmple_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vreinterpretq_f64_u64(vcleq_f64(inV1.mValue.val[0], inV2.mValue.val[0])), vreinterpretq_f64_u64(vcleq_f64(inV1.mValue.val[1], inV2.mValue.val[1])) });
#else
	return DVec3(inV1.mF64[0] <= inV2.mF64[0]? cTrue : cFalse,
	inV1.mF64[1] <= inV2.mF64[1]? cTrue : cFalse,
	inV1.mF64[2] <= inV2.mF64[2]? cTrue : cFalse);
#endif
}

// Component wise greater than; result components are cTrue / cFalse masks
DVec3 DVec3::sGreater(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_GT_OQ);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_cmpgt_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_cmpgt_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vreinterpretq_f64_u64(vcgtq_f64(inV1.mValue.val[0], inV2.mValue.val[0])), vreinterpretq_f64_u64(vcgtq_f64(inV1.mValue.val[1], inV2.mValue.val[1])) });
#else
	return DVec3(inV1.mF64[0] > inV2.mF64[0]? cTrue : cFalse,
	inV1.mF64[1] > inV2.mF64[1]? cTrue : cFalse,
	inV1.mF64[2] > inV2.mF64[2]? cTrue : cFalse);
#endif
}

// Component wise greater than or equal; result components are cTrue / cFalse masks
DVec3 DVec3::sGreaterOrEqual(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_GE_OQ);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_cmpge_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_cmpge_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vreinterpretq_f64_u64(vcgeq_f64(inV1.mValue.val[0], inV2.mValue.val[0])), vreinterpretq_f64_u64(vcgeq_f64(inV1.mValue.val[1], inV2.mValue.val[1])) });
#else
	return DVec3(inV1.mF64[0] >= inV2.mF64[0]? cTrue : cFalse,
	inV1.mF64[1] >= inV2.mF64[1]? cTrue : cFalse,
	inV1.mF64[2] >= inV2.mF64[2]? cTrue : cFalse);
#endif
}
// Calculates inMul1 * inMul2 + inAdd.
// With JPH_USE_FMADD this is a true fused operation (single rounding); otherwise it is a multiply followed by an add (two roundings).
// There is no dedicated SSE path: the generic fallback compiles to the SSE operator* / operator+ in that configuration.
DVec3 DVec3::sFusedMultiplyAdd(DVec3Arg inMul1, DVec3Arg inMul2, DVec3Arg inAdd)
{
#if defined(JPH_USE_AVX)
#ifdef JPH_USE_FMADD
	return _mm256_fmadd_pd(inMul1.mValue, inMul2.mValue, inAdd.mValue);
#else
	return _mm256_add_pd(_mm256_mul_pd(inMul1.mValue, inMul2.mValue), inAdd.mValue);
#endif
#elif defined(JPH_USE_NEON)
	return DVec3({ vmlaq_f64(inAdd.mValue.val[0], inMul1.mValue.val[0], inMul2.mValue.val[0]), vmlaq_f64(inAdd.mValue.val[1], inMul1.mValue.val[1], inMul2.mValue.val[1]) });
#else
	return inMul1 * inMul2 + inAdd;
#endif
}

// Component wise select based on the highest bit (sign bit) of each component of inControl
DVec3 DVec3::sSelect(DVec3Arg inNotSet, DVec3Arg inSet, DVec3Arg inControl)
{
#if defined(JPH_USE_AVX)
	return _mm256_blendv_pd(inNotSet.mValue, inSet.mValue, inControl.mValue);
#elif defined(JPH_USE_SSE4_1)
	Type v = { _mm_blendv_pd(inNotSet.mValue.mLow, inSet.mValue.mLow, inControl.mValue.mLow), _mm_blendv_pd(inNotSet.mValue.mHigh, inSet.mValue.mHigh, inControl.mValue.mHigh) };
	return sFixW(v);
#elif defined(JPH_USE_NEON)
	// Arithmetic shift right by 63 broadcasts each lane's sign bit into a full 64-bit mask for the bit select
	Type v = { vbslq_f64(vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_f64(inControl.mValue.val[0]), 63)), inSet.mValue.val[0], inNotSet.mValue.val[0]),
	vbslq_f64(vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_f64(inControl.mValue.val[1]), 63)), inSet.mValue.val[1], inNotSet.mValue.val[1]) };
	return sFixW(v);
#else
	DVec3 result;
	for (int i = 0; i < 3; i++)
	result.mF64[i] = (BitCast<uint64>(inControl.mF64[i]) & (uint64(1) << 63))? inSet.mF64[i] : inNotSet.mF64[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	result.mF64[3] = result.mF64[2]; // Restore the Z == W invariant
#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	return result;
#endif
}
// Bitwise or of the raw component bits (used to combine comparison masks)
DVec3 DVec3::sOr(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_or_pd(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_or_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_or_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(inV1.mValue.val[0]), vreinterpretq_u64_f64(inV2.mValue.val[0]))),
	vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(inV1.mValue.val[1]), vreinterpretq_u64_f64(inV2.mValue.val[1]))) });
#else
	return DVec3(BitCast<double>(BitCast<uint64>(inV1.mF64[0]) | BitCast<uint64>(inV2.mF64[0])),
	BitCast<double>(BitCast<uint64>(inV1.mF64[1]) | BitCast<uint64>(inV2.mF64[1])),
	BitCast<double>(BitCast<uint64>(inV1.mF64[2]) | BitCast<uint64>(inV2.mF64[2])));
#endif
}

// Bitwise xor of the raw component bits
DVec3 DVec3::sXor(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_xor_pd(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_xor_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_xor_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(inV1.mValue.val[0]), vreinterpretq_u64_f64(inV2.mValue.val[0]))),
	vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(inV1.mValue.val[1]), vreinterpretq_u64_f64(inV2.mValue.val[1]))) });
#else
	return DVec3(BitCast<double>(BitCast<uint64>(inV1.mF64[0]) ^ BitCast<uint64>(inV2.mF64[0])),
	BitCast<double>(BitCast<uint64>(inV1.mF64[1]) ^ BitCast<uint64>(inV2.mF64[1])),
	BitCast<double>(BitCast<uint64>(inV1.mF64[2]) ^ BitCast<uint64>(inV2.mF64[2])));
#endif
}

// Bitwise and of the raw component bits
DVec3 DVec3::sAnd(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_and_pd(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_and_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_and_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(inV1.mValue.val[0]), vreinterpretq_u64_f64(inV2.mValue.val[0]))),
	vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(inV1.mValue.val[1]), vreinterpretq_u64_f64(inV2.mValue.val[1]))) });
#else
	return DVec3(BitCast<double>(BitCast<uint64>(inV1.mF64[0]) & BitCast<uint64>(inV2.mF64[0])),
	BitCast<double>(BitCast<uint64>(inV1.mF64[1]) & BitCast<uint64>(inV2.mF64[1])),
	BitCast<double>(BitCast<uint64>(inV1.mF64[2]) & BitCast<uint64>(inV2.mF64[2])));
#endif
}
// Collect the sign bits of X, Y and Z into bits 0..2; the internal W component is masked out with 0x7
int DVec3::GetTrues() const
{
#if defined(JPH_USE_AVX)
	return _mm256_movemask_pd(mValue) & 0x7;
#elif defined(JPH_USE_SSE)
	return (_mm_movemask_pd(mValue.mLow) + (_mm_movemask_pd(mValue.mHigh) << 2)) & 0x7;
#else
	// Also used for NEON (no movemask equivalent): read the sign bits through the mF64 union
	return int((BitCast<uint64>(mF64[0]) >> 63) | ((BitCast<uint64>(mF64[1]) >> 63) << 1) | ((BitCast<uint64>(mF64[2]) >> 63) << 2));
#endif
}

// True when at least one of X, Y, Z has its highest bit set
bool DVec3::TestAnyTrue() const
{
	return GetTrues() != 0;
}

// True when all of X, Y, Z have their highest bit set
bool DVec3::TestAllTrue() const
{
	return GetTrues() == 0x7;
}
// Equal only when all three components compare equal (a NaN component makes this false)
bool DVec3::operator == (DVec3Arg inV2) const
{
	DVec3 equal_mask = sEquals(*this, inV2);
	return equal_mask.TestAllTrue();
}

// Compare the squared distance between the two points against the squared tolerance
bool DVec3::IsClose(DVec3Arg inV2, double inMaxDistSq) const
{
	DVec3 delta = inV2 - *this;
	return delta.LengthSq() <= inMaxDistSq;
}

// A vector is near zero when its squared length is within the squared tolerance
bool DVec3::IsNearZero(double inMaxDistSq) const
{
	double len_sq = LengthSq();
	return len_sq <= inMaxDistSq;
}
// Component wise multiply with another double vector
DVec3 DVec3::operator * (DVec3Arg inV2) const
{
#if defined(JPH_USE_AVX)
	return _mm256_mul_pd(mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_mul_pd(mValue.mLow, inV2.mValue.mLow), _mm_mul_pd(mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vmulq_f64(mValue.val[0], inV2.mValue.val[0]), vmulq_f64(mValue.val[1], inV2.mValue.val[1]) });
#else
	return DVec3(mF64[0] * inV2.mF64[0], mF64[1] * inV2.mF64[1], mF64[2] * inV2.mF64[2]);
#endif
}

// Scale all components by inV2
DVec3 DVec3::operator * (double inV2) const
{
#if defined(JPH_USE_AVX)
	return _mm256_mul_pd(mValue, _mm256_set1_pd(inV2));
#elif defined(JPH_USE_SSE)
	__m128d v = _mm_set1_pd(inV2);
	return DVec3({ _mm_mul_pd(mValue.mLow, v), _mm_mul_pd(mValue.mHigh, v) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vmulq_n_f64(mValue.val[0], inV2), vmulq_n_f64(mValue.val[1], inV2) });
#else
	return DVec3(mF64[0] * inV2, mF64[1] * inV2, mF64[2] * inV2);
#endif
}

// Scale all components of inV2 by inV1 (scalar on the left)
DVec3 operator * (double inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_mul_pd(_mm256_set1_pd(inV1), inV2.mValue);
#elif defined(JPH_USE_SSE)
	__m128d v = _mm_set1_pd(inV1);
	return DVec3({ _mm_mul_pd(v, inV2.mValue.mLow), _mm_mul_pd(v, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vmulq_n_f64(inV2.mValue.val[0], inV1), vmulq_n_f64(inV2.mValue.val[1], inV1) });
#else
	return DVec3(inV1 * inV2.mF64[0], inV1 * inV2.mF64[1], inV1 * inV2.mF64[2]);
#endif
}

// Divide all components by inV2 (no check for division by zero)
DVec3 DVec3::operator / (double inV2) const
{
#if defined(JPH_USE_AVX)
	return _mm256_div_pd(mValue, _mm256_set1_pd(inV2));
#elif defined(JPH_USE_SSE)
	__m128d v = _mm_set1_pd(inV2);
	return DVec3({ _mm_div_pd(mValue.mLow, v), _mm_div_pd(mValue.mHigh, v) });
#elif defined(JPH_USE_NEON)
	float64x2_t v = vdupq_n_f64(inV2);
	return DVec3({ vdivq_f64(mValue.val[0], v), vdivq_f64(mValue.val[1], v) });
#else
	return DVec3(mF64[0] / inV2, mF64[1] / inV2, mF64[2] / inV2);
#endif
}
// Scale all components in place.
// The scalar fallbacks only touch X, Y, Z and then re-mirror Z into W to keep the invariant.
DVec3 &DVec3::operator *= (double inV2)
{
#if defined(JPH_USE_AVX)
	mValue = _mm256_mul_pd(mValue, _mm256_set1_pd(inV2));
#elif defined(JPH_USE_SSE)
	__m128d v = _mm_set1_pd(inV2);
	mValue.mLow = _mm_mul_pd(mValue.mLow, v);
	mValue.mHigh = _mm_mul_pd(mValue.mHigh, v);
#elif defined(JPH_USE_NEON)
	mValue.val[0] = vmulq_n_f64(mValue.val[0], inV2);
	mValue.val[1] = vmulq_n_f64(mValue.val[1], inV2);
#else
	for (int i = 0; i < 3; ++i)
	mF64[i] *= inV2;
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2];
#endif
#endif
	return *this;
}

// Component wise multiply in place
DVec3 &DVec3::operator *= (DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	mValue = _mm256_mul_pd(mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	mValue.mLow = _mm_mul_pd(mValue.mLow, inV2.mValue.mLow);
	mValue.mHigh = _mm_mul_pd(mValue.mHigh, inV2.mValue.mHigh);
#elif defined(JPH_USE_NEON)
	mValue.val[0] = vmulq_f64(mValue.val[0], inV2.mValue.val[0]);
	mValue.val[1] = vmulq_f64(mValue.val[1], inV2.mValue.val[1]);
#else
	for (int i = 0; i < 3; ++i)
	mF64[i] *= inV2.mF64[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2];
#endif
#endif
	return *this;
}

// Divide all components in place (no check for division by zero)
DVec3 &DVec3::operator /= (double inV2)
{
#if defined(JPH_USE_AVX)
	mValue = _mm256_div_pd(mValue, _mm256_set1_pd(inV2));
#elif defined(JPH_USE_SSE)
	__m128d v = _mm_set1_pd(inV2);
	mValue.mLow = _mm_div_pd(mValue.mLow, v);
	mValue.mHigh = _mm_div_pd(mValue.mHigh, v);
#elif defined(JPH_USE_NEON)
	float64x2_t v = vdupq_n_f64(inV2);
	mValue.val[0] = vdivq_f64(mValue.val[0], v);
	mValue.val[1] = vdivq_f64(mValue.val[1], v);
#else
	for (int i = 0; i < 3; ++i)
	mF64[i] /= inV2;
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2];
#endif
#endif
	return *this;
}
// Add a single precision vector; inV2 is widened to double before the add
DVec3 DVec3::operator + (Vec3Arg inV2) const
{
#if defined(JPH_USE_AVX)
	return _mm256_add_pd(mValue, _mm256_cvtps_pd(inV2.mValue));
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_add_pd(mValue.mLow, _mm_cvtps_pd(inV2.mValue)), _mm_add_pd(mValue.mHigh, _mm_cvtps_pd(_mm_shuffle_ps(inV2.mValue, inV2.mValue, _MM_SHUFFLE(2, 2, 2, 2)))) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vaddq_f64(mValue.val[0], vcvt_f64_f32(vget_low_f32(inV2.mValue))), vaddq_f64(mValue.val[1], vcvt_high_f64_f32(inV2.mValue)) });
#else
	return DVec3(mF64[0] + inV2.mF32[0], mF64[1] + inV2.mF32[1], mF64[2] + inV2.mF32[2]);
#endif
}

// Add two double vectors (component wise)
DVec3 DVec3::operator + (DVec3Arg inV2) const
{
#if defined(JPH_USE_AVX)
	return _mm256_add_pd(mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_add_pd(mValue.mLow, inV2.mValue.mLow), _mm_add_pd(mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vaddq_f64(mValue.val[0], inV2.mValue.val[0]), vaddq_f64(mValue.val[1], inV2.mValue.val[1]) });
#else
	return DVec3(mF64[0] + inV2.mF64[0], mF64[1] + inV2.mF64[1], mF64[2] + inV2.mF64[2]);
#endif
}

// Add a single precision vector in place; the scalar fallback re-mirrors Z into W
DVec3 &DVec3::operator += (Vec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	mValue = _mm256_add_pd(mValue, _mm256_cvtps_pd(inV2.mValue));
#elif defined(JPH_USE_SSE)
	mValue.mLow = _mm_add_pd(mValue.mLow, _mm_cvtps_pd(inV2.mValue));
	mValue.mHigh = _mm_add_pd(mValue.mHigh, _mm_cvtps_pd(_mm_shuffle_ps(inV2.mValue, inV2.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
#elif defined(JPH_USE_NEON)
	mValue.val[0] = vaddq_f64(mValue.val[0], vcvt_f64_f32(vget_low_f32(inV2.mValue)));
	mValue.val[1] = vaddq_f64(mValue.val[1], vcvt_high_f64_f32(inV2.mValue));
#else
	for (int i = 0; i < 3; ++i)
	mF64[i] += inV2.mF32[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2];
#endif
#endif
	return *this;
}

// Add a double vector in place; the scalar fallback re-mirrors Z into W
DVec3 &DVec3::operator += (DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	mValue = _mm256_add_pd(mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	mValue.mLow = _mm_add_pd(mValue.mLow, inV2.mValue.mLow);
	mValue.mHigh = _mm_add_pd(mValue.mHigh, inV2.mValue.mHigh);
#elif defined(JPH_USE_NEON)
	mValue.val[0] = vaddq_f64(mValue.val[0], inV2.mValue.val[0]);
	mValue.val[1] = vaddq_f64(mValue.val[1], inV2.mValue.val[1]);
#else
	for (int i = 0; i < 3; ++i)
	mF64[i] += inV2.mF64[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2];
#endif
#endif
	return *this;
}
// Negate.
// With JPH_CROSS_PLATFORM_DETERMINISTIC the negation is written as 0 - x: plain negation flips the
// sign bit of zero (producing -0.0), while subtraction from zero yields +0.0, keeping results
// bit-identical across platforms.
DVec3 DVec3::operator - () const
{
#if defined(JPH_USE_AVX)
	return _mm256_sub_pd(_mm256_setzero_pd(), mValue);
#elif defined(JPH_USE_SSE)
	__m128d zero = _mm_setzero_pd();
	return DVec3({ _mm_sub_pd(zero, mValue.mLow), _mm_sub_pd(zero, mValue.mHigh) });
#elif defined(JPH_USE_NEON)
#ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
	float64x2_t zero = vdupq_n_f64(0);
	return DVec3({ vsubq_f64(zero, mValue.val[0]), vsubq_f64(zero, mValue.val[1]) });
#else
	return DVec3({ vnegq_f64(mValue.val[0]), vnegq_f64(mValue.val[1]) });
#endif
#else
#ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
	return DVec3(0.0 - mF64[0], 0.0 - mF64[1], 0.0 - mF64[2]);
#else
	return DVec3(-mF64[0], -mF64[1], -mF64[2]);
#endif
#endif
}
// Subtract a single precision vector; inV2 is widened to double before the subtract
DVec3 DVec3::operator - (Vec3Arg inV2) const
{
#if defined(JPH_USE_AVX)
	return _mm256_sub_pd(mValue, _mm256_cvtps_pd(inV2.mValue));
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_sub_pd(mValue.mLow, _mm_cvtps_pd(inV2.mValue)), _mm_sub_pd(mValue.mHigh, _mm_cvtps_pd(_mm_shuffle_ps(inV2.mValue, inV2.mValue, _MM_SHUFFLE(2, 2, 2, 2)))) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vsubq_f64(mValue.val[0], vcvt_f64_f32(vget_low_f32(inV2.mValue))), vsubq_f64(mValue.val[1], vcvt_high_f64_f32(inV2.mValue)) });
#else
	return DVec3(mF64[0] - inV2.mF32[0], mF64[1] - inV2.mF32[1], mF64[2] - inV2.mF32[2]);
#endif
}

// Subtract two double vectors (component wise)
DVec3 DVec3::operator - (DVec3Arg inV2) const
{
#if defined(JPH_USE_AVX)
	return _mm256_sub_pd(mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_sub_pd(mValue.mLow, inV2.mValue.mLow), _mm_sub_pd(mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vsubq_f64(mValue.val[0], inV2.mValue.val[0]), vsubq_f64(mValue.val[1], inV2.mValue.val[1]) });
#else
	return DVec3(mF64[0] - inV2.mF64[0], mF64[1] - inV2.mF64[1], mF64[2] - inV2.mF64[2]);
#endif
}

// Subtract a single precision vector in place; the scalar fallback re-mirrors Z into W
DVec3 &DVec3::operator -= (Vec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	mValue = _mm256_sub_pd(mValue, _mm256_cvtps_pd(inV2.mValue));
#elif defined(JPH_USE_SSE)
	mValue.mLow = _mm_sub_pd(mValue.mLow, _mm_cvtps_pd(inV2.mValue));
	mValue.mHigh = _mm_sub_pd(mValue.mHigh, _mm_cvtps_pd(_mm_shuffle_ps(inV2.mValue, inV2.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
#elif defined(JPH_USE_NEON)
	mValue.val[0] = vsubq_f64(mValue.val[0], vcvt_f64_f32(vget_low_f32(inV2.mValue)));
	mValue.val[1] = vsubq_f64(mValue.val[1], vcvt_high_f64_f32(inV2.mValue));
#else
	for (int i = 0; i < 3; ++i)
	mF64[i] -= inV2.mF32[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2];
#endif
#endif
	return *this;
}

// Subtract a double vector in place; the scalar fallback re-mirrors Z into W
DVec3 &DVec3::operator -= (DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	mValue = _mm256_sub_pd(mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	mValue.mLow = _mm_sub_pd(mValue.mLow, inV2.mValue.mLow);
	mValue.mHigh = _mm_sub_pd(mValue.mHigh, inV2.mValue.mHigh);
#elif defined(JPH_USE_NEON)
	mValue.val[0] = vsubq_f64(mValue.val[0], inV2.mValue.val[0]);
	mValue.val[1] = vsubq_f64(mValue.val[1], inV2.mValue.val[1]);
#else
	for (int i = 0; i < 3; ++i)
	mF64[i] -= inV2.mF64[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2];
#endif
#endif
	return *this;
}
// Component wise divide.
// CheckW asserts that inV2's internal W mirrors Z, so the full-width SIMD divide cannot trap on lane 3.
DVec3 DVec3::operator / (DVec3Arg inV2) const
{
	inV2.CheckW();
#if defined(JPH_USE_AVX)
	return _mm256_div_pd(mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_div_pd(mValue.mLow, inV2.mValue.mLow), _mm_div_pd(mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vdivq_f64(mValue.val[0], inV2.mValue.val[0]), vdivq_f64(mValue.val[1], inV2.mValue.val[1]) });
#else
	return DVec3(mF64[0] / inV2.mF64[0], mF64[1] / inV2.mF64[1], mF64[2] / inV2.mF64[2]);
#endif
}
// Component wise absolute value
DVec3 DVec3::Abs() const
{
#if defined(JPH_USE_AVX512)
	return _mm256_range_pd(mValue, mValue, 0b1000); // imm 0b1000: min op with the sign bit cleared; with identical operands this yields |x|
#elif defined(JPH_USE_AVX)
	return _mm256_max_pd(_mm256_sub_pd(_mm256_setzero_pd(), mValue), mValue); // max(-x, x)
#elif defined(JPH_USE_SSE)
	__m128d zero = _mm_setzero_pd();
	return DVec3({ _mm_max_pd(_mm_sub_pd(zero, mValue.mLow), mValue.mLow), _mm_max_pd(_mm_sub_pd(zero, mValue.mHigh), mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vabsq_f64(mValue.val[0]), vabsq_f64(mValue.val[1]) });
#else
	return DVec3(abs(mF64[0]), abs(mF64[1]), abs(mF64[2]));
#endif
}
// Component wise 1 / value.
// The internal W component always mirrors Z, so the division in operator/ cannot hit a zero W.
DVec3 DVec3::Reciprocal() const
{
	return sOne() / DVec3(mValue);
}
// Cross product: (Y1*Z2 - Z1*Y2, Z1*X2 - X1*Z2, X1*Y2 - Y1*X2)
DVec3 DVec3::Cross(DVec3Arg inV2) const
{
#if defined(JPH_USE_AVX2)
	// _MM_SHUFFLE(0, 0, 2, 1) rotates the components left: (Y, Z, X, X)
	__m256d t1 = _mm256_permute4x64_pd(inV2.mValue, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
	t1 = _mm256_mul_pd(t1, mValue);
	__m256d t2 = _mm256_permute4x64_pd(mValue, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
	t2 = _mm256_mul_pd(t2, inV2.mValue);
	__m256d t3 = _mm256_sub_pd(t1, t2);
	return _mm256_permute4x64_pd(t3, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
#else
	return DVec3(mF64[1] * inV2.mF64[2] - mF64[2] * inV2.mF64[1],
	mF64[2] * inV2.mF64[0] - mF64[0] * inV2.mF64[2],
	mF64[0] * inV2.mF64[1] - mF64[1] * inV2.mF64[0]);
#endif
}
// Dot product of the X, Y and Z components (the internal W component does not contribute)
double DVec3::Dot(DVec3Arg inV2) const
{
#if defined(JPH_USE_AVX)
	__m256d mul = _mm256_mul_pd(mValue, inV2.mValue);
	__m128d xy = _mm256_castpd256_pd128(mul);
	__m128d yx = _mm_shuffle_pd(xy, xy, 1); // Swap so both lanes hold x*x2 + y*y2 after the add
	__m128d sum = _mm_add_pd(xy, yx);
	__m128d zw = _mm256_extractf128_pd(mul, 1);
	sum = _mm_add_pd(sum, zw); // Lane 0 now holds x*x2 + y*y2 + z*z2
	return _mm_cvtsd_f64(sum);
#elif defined(JPH_USE_SSE)
	__m128d xy = _mm_mul_pd(mValue.mLow, inV2.mValue.mLow);
	__m128d yx = _mm_shuffle_pd(xy, xy, 1);
	__m128d sum = _mm_add_pd(xy, yx);
	__m128d z = _mm_mul_sd(mValue.mHigh, inV2.mValue.mHigh); // Only the Z lane is multiplied, W is ignored
	sum = _mm_add_pd(sum, z);
	return _mm_cvtsd_f64(sum);
#elif defined(JPH_USE_NEON)
	float64x2_t mul_low = vmulq_f64(mValue.val[0], inV2.mValue.val[0]);
	float64x2_t mul_high = vmulq_f64(mValue.val[1], inV2.mValue.val[1]);
	return vaddvq_f64(mul_low) + vgetq_lane_f64(mul_high, 0); // Horizontal add of X/Y plus the Z lane
#else
	double dot = 0.0;
	for (int i = 0; i < 3; i++)
	dot += mF64[i] * inV2.mF64[i];
	return dot;
#endif
}

// Squared length of vector
double DVec3::LengthSq() const
{
	return Dot(*this);
}
// Component wise square root (negative components yield NaN, matching the scalar sqrt)
DVec3 DVec3::Sqrt() const
{
#if defined(JPH_USE_AVX)
	return _mm256_sqrt_pd(mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_sqrt_pd(mValue.mLow), _mm_sqrt_pd(mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vsqrtq_f64(mValue.val[0]), vsqrtq_f64(mValue.val[1]) });
#else
	return DVec3(sqrt(mF64[0]), sqrt(mF64[1]), sqrt(mF64[2]));
#endif
}
// Length of vector: square root of the dot product with itself
double DVec3::Length() const
{
	return sqrt(LengthSq());
}

// Scale to unit length. Note: does not handle zero-length vectors.
DVec3 DVec3::Normalized() const
{
	double len = Length();
	return *this / len;
}

// A vector is normalized when its squared length deviates from 1 by at most inTolerance
bool DVec3::IsNormalized(double inTolerance) const
{
	double deviation = LengthSq() - 1.0;
	return abs(deviation) <= inTolerance;
}
// Test if any of the X, Y, Z components are NaN (the internal W component is excluded via the 0x7 mask)
bool DVec3::IsNaN() const
{
#if defined(JPH_USE_AVX512)
	return (_mm256_fpclass_pd_mask(mValue, 0b10000001) & 0x7) != 0; // Classify quiet NaN (bit 0) and signaling NaN (bit 7)
#elif defined(JPH_USE_AVX)
	return (_mm256_movemask_pd(_mm256_cmp_pd(mValue, mValue, _CMP_UNORD_Q)) & 0x7) != 0; // Unordered compare with itself is true only for NaN
#elif defined(JPH_USE_SSE)
	return ((_mm_movemask_pd(_mm_cmpunord_pd(mValue.mLow, mValue.mLow)) + (_mm_movemask_pd(_mm_cmpunord_pd(mValue.mHigh, mValue.mHigh)) << 2)) & 0x7) != 0;
#else
	return isnan(mF64[0]) || isnan(mF64[1]) || isnan(mF64[2]);
#endif
}
// Returns a vector containing the sign of each component: 1.0 when the sign bit is clear, -1.0 when it is set.
// Note that this means -0.0 maps to -1.0 and a NaN maps according to its sign bit.
DVec3 DVec3::GetSign() const
{
#if defined(JPH_USE_AVX512)
	// Per-element fixup using the token table 0xA9A90A00 (see the _mm256_fixupimm_pd documentation for the encoding)
	return _mm256_fixupimm_pd(mValue, mValue, _mm256_set1_epi32(0xA9A90A00), 0);
#elif defined(JPH_USE_AVX)
	// (x & -1.0) | 1.0: the AND keeps the sign bit of x (the exponent bits it also keeps are forced on
	// by the OR with +1.0), so the result is exactly +/-1.0
	__m256d minus_one = _mm256_set1_pd(-1.0);
	__m256d one = _mm256_set1_pd(1.0);
	return _mm256_or_pd(_mm256_and_pd(mValue, minus_one), one);
#elif defined(JPH_USE_SSE)
	__m128d minus_one = _mm_set1_pd(-1.0);
	__m128d one = _mm_set1_pd(1.0);
	return DVec3({ _mm_or_pd(_mm_and_pd(mValue.mLow, minus_one), one), _mm_or_pd(_mm_and_pd(mValue.mHigh, minus_one), one) });
#elif defined(JPH_USE_NEON)
	// Use double literals (the original float literals -1.0f / 1.0f converted exactly, but double matches the f64 intrinsics and the other paths)
	uint64x2_t minus_one = vreinterpretq_u64_f64(vdupq_n_f64(-1.0));
	uint64x2_t one = vreinterpretq_u64_f64(vdupq_n_f64(1.0));
	return DVec3({ vreinterpretq_f64_u64(vorrq_u64(vandq_u64(vreinterpretq_u64_f64(mValue.val[0]), minus_one), one)),
	vreinterpretq_f64_u64(vorrq_u64(vandq_u64(vreinterpretq_u64_f64(mValue.val[1]), minus_one), one)) });
#else
	return DVec3(std::signbit(mF64[0])? -1.0 : 1.0,
	std::signbit(mF64[1])? -1.0 : 1.0,
	std::signbit(mF64[2])? -1.0 : 1.0);
#endif
}
DVec3 DVec3::PrepareRoundToZero() const
{
	// Float has 23 bit mantissa, double 52 bit mantissa => we lose 29 bits when converting from double to float
	constexpr uint64 cDoubleToFloatMantissaLoss = (1U << 29) - 1;
	// Clearing the mantissa bits that will be lost makes a later double -> float conversion exact;
	// truncating the magnitude rounds each component towards zero (works for both signs, IEEE sign-magnitude)
#if defined(JPH_USE_AVX)
	return _mm256_and_pd(mValue, _mm256_castsi256_pd(_mm256_set1_epi64x(int64_t(~cDoubleToFloatMantissaLoss))));
#elif defined(JPH_USE_SSE)
	__m128d mask = _mm_castsi128_pd(_mm_set1_epi64x(int64_t(~cDoubleToFloatMantissaLoss)));
	return DVec3({ _mm_and_pd(mValue.mLow, mask), _mm_and_pd(mValue.mHigh, mask) });
#elif defined(JPH_USE_NEON)
	uint64x2_t mask = vdupq_n_u64(~cDoubleToFloatMantissaLoss);
	return DVec3({ vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(mValue.val[0]), mask)),
				   vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(mValue.val[1]), mask)) });
#else
	double x = BitCast<double>(BitCast<uint64>(mF64[0]) & ~cDoubleToFloatMantissaLoss);
	double y = BitCast<double>(BitCast<uint64>(mF64[1]) & ~cDoubleToFloatMantissaLoss);
	double z = BitCast<double>(BitCast<uint64>(mF64[2]) & ~cDoubleToFloatMantissaLoss);
	return DVec3(x, y, z);
#endif
}
DVec3 DVec3::PrepareRoundToInf() const
{
	// Float has 23 bit mantissa, double 52 bit mantissa => we lose 29 bits when converting from double to float
	constexpr uint64 cDoubleToFloatMantissaLoss = (1U << 29) - 1;
	// Setting all mantissa bits that will be lost makes a later double -> float conversion round the
	// magnitude up (away from zero). Values whose lost bits are already all zero are exactly
	// representable as float and must be left untouched, hence the is_zero select below.
#if defined(JPH_USE_AVX512)
	__m256i mantissa_loss = _mm256_set1_epi64x(cDoubleToFloatMantissaLoss);
	// testn sets the mask bit when (value & mantissa_loss) == 0, i.e. when the value is exactly representable
	__mmask8 is_zero = _mm256_testn_epi64_mask(_mm256_castpd_si256(mValue), mantissa_loss);
	__m256d value_or_mantissa_loss = _mm256_or_pd(mValue, _mm256_castsi256_pd(mantissa_loss));
	return _mm256_mask_blend_pd(is_zero, value_or_mantissa_loss, mValue);
#elif defined(JPH_USE_AVX)
	__m256i mantissa_loss = _mm256_set1_epi64x(cDoubleToFloatMantissaLoss);
	__m256d value_and_mantissa_loss = _mm256_and_pd(mValue, _mm256_castsi256_pd(mantissa_loss));
	__m256d is_zero = _mm256_cmp_pd(value_and_mantissa_loss, _mm256_setzero_pd(), _CMP_EQ_OQ);
	__m256d value_or_mantissa_loss = _mm256_or_pd(mValue, _mm256_castsi256_pd(mantissa_loss));
	return _mm256_blendv_pd(value_or_mantissa_loss, mValue, is_zero);
#elif defined(JPH_USE_SSE4_1)
	__m128i mantissa_loss = _mm_set1_epi64x(cDoubleToFloatMantissaLoss);
	__m128d zero = _mm_setzero_pd();
	__m128d value_and_mantissa_loss_low = _mm_and_pd(mValue.mLow, _mm_castsi128_pd(mantissa_loss));
	__m128d is_zero_low = _mm_cmpeq_pd(value_and_mantissa_loss_low, zero);
	__m128d value_or_mantissa_loss_low = _mm_or_pd(mValue.mLow, _mm_castsi128_pd(mantissa_loss));
	__m128d value_and_mantissa_loss_high = _mm_and_pd(mValue.mHigh, _mm_castsi128_pd(mantissa_loss));
	__m128d is_zero_high = _mm_cmpeq_pd(value_and_mantissa_loss_high, zero);
	__m128d value_or_mantissa_loss_high = _mm_or_pd(mValue.mHigh, _mm_castsi128_pd(mantissa_loss));
	return DVec3({ _mm_blendv_pd(value_or_mantissa_loss_low, mValue.mLow, is_zero_low), _mm_blendv_pd(value_or_mantissa_loss_high, mValue.mHigh, is_zero_high) });
#elif defined(JPH_USE_NEON)
	uint64x2_t mantissa_loss = vdupq_n_u64(cDoubleToFloatMantissaLoss);
	float64x2_t zero = vdupq_n_f64(0.0);
	float64x2_t value_and_mantissa_loss_low = vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(mValue.val[0]), mantissa_loss));
	uint64x2_t is_zero_low = vceqq_f64(value_and_mantissa_loss_low, zero);
	float64x2_t value_or_mantissa_loss_low = vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(mValue.val[0]), mantissa_loss));
	float64x2_t value_and_mantissa_loss_high = vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(mValue.val[1]), mantissa_loss));
	float64x2_t value_low = vbslq_f64(is_zero_low, mValue.val[0], value_or_mantissa_loss_low);
	uint64x2_t is_zero_high = vceqq_f64(value_and_mantissa_loss_high, zero);
	float64x2_t value_or_mantissa_loss_high = vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(mValue.val[1]), mantissa_loss));
	float64x2_t value_high = vbslq_f64(is_zero_high, mValue.val[1], value_or_mantissa_loss_high);
	return DVec3({ value_low, value_high });
#else
	uint64 ux = BitCast<uint64>(mF64[0]);
	uint64 uy = BitCast<uint64>(mF64[1]);
	uint64 uz = BitCast<uint64>(mF64[2]);
	double x = BitCast<double>((ux & cDoubleToFloatMantissaLoss) == 0? ux : (ux | cDoubleToFloatMantissaLoss));
	double y = BitCast<double>((uy & cDoubleToFloatMantissaLoss) == 0? uy : (uy | cDoubleToFloatMantissaLoss));
	double z = BitCast<double>((uz & cDoubleToFloatMantissaLoss) == 0? uz : (uz | cDoubleToFloatMantissaLoss));
	return DVec3(x, y, z);
#endif
}
Vec3 DVec3::ToVec3RoundDown() const
{
	// Rounding down means towards zero for positive components, away from zero (towards -inf) for negative components
	DVec3 is_negative = DVec3::sLess(*this, DVec3::sZero());
	DVec3 rounded = DVec3::sSelect(PrepareRoundToZero(), PrepareRoundToInf(), is_negative);
	return Vec3(rounded);
}
Vec3 DVec3::ToVec3RoundUp() const
{
	// Rounding up means away from zero (towards +inf) for positive components, towards zero for negative components
	DVec3 is_negative = DVec3::sLess(*this, DVec3::sZero());
	DVec3 rounded = DVec3::sSelect(PrepareRoundToInf(), PrepareRoundToZero(), is_negative);
	return Vec3(rounded);
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,48 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/HashCombine.h>
JPH_NAMESPACE_BEGIN
/// Class that holds 3 doubles. Used as a storage class. Convert to DVec3 for calculations.
class [[nodiscard]] Double3
{
public:
	JPH_OVERRIDE_NEW_DELETE

	Double3() = default; ///< Intentionally not initialized for performance reasons
	Double3(const Double3 &inRHS) = default;
	Double3 & operator = (const Double3 &inRHS) = default;
	Double3(double inX, double inY, double inZ) : x(inX), y(inY), z(inZ) { }

	/// Get component by index (0 = x, 1 = y, 2 = z)
	double operator [] (int inCoordinate) const
	{
		JPH_ASSERT(inCoordinate >= 0 && inCoordinate < 3); // Also guard against negative indices, reading before &x is out of bounds
		return *(&x + inCoordinate);
	}

	/// Exact component wise equality
	bool operator == (const Double3 &inRHS) const
	{
		return x == inRHS.x && y == inRHS.y && z == inRHS.z;
	}

	bool operator != (const Double3 &inRHS) const
	{
		return x != inRHS.x || y != inRHS.y || z != inRHS.z;
	}

	double x;
	double y;
	double z;
};
static_assert(std::is_trivial<Double3>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
// Create a std::hash/JPH::Hash for Double3
JPH_MAKE_HASHABLE(JPH::Double3, t.x, t.y, t.z)

View File

@@ -0,0 +1,31 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2022 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Dynamic resizable matrix class
class [[nodiscard]] DynMatrix
{
public:
	/// Constructor
	DynMatrix(const DynMatrix &) = default;

	/// Construct an inRows x inCols matrix
	DynMatrix(uint inRows, uint inCols) : mRows(inRows), mCols(inCols) { mElements.resize(inRows * inCols); }

	/// Access an element, row major storage: element (r, c) lives at index r * mCols + c
	float operator () (uint inRow, uint inCol) const { JPH_ASSERT(inRow < mRows && inCol < mCols); return mElements[inRow * mCols + inCol]; }
	float & operator () (uint inRow, uint inCol) { JPH_ASSERT(inRow < mRows && inCol < mCols); return mElements[inRow * mCols + inCol]; }

	/// Get dimensions
	uint GetCols() const { return mCols; }
	uint GetRows() const { return mRows; }

private:
	uint mRows; ///< Number of rows
	uint mCols; ///< Number of columns
	Array<float> mElements; ///< Row major storage of mRows * mCols elements
};
JPH_NAMESPACE_END

View File

@@ -0,0 +1,177 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/FPException.h>
JPH_NAMESPACE_BEGIN
/// Function to determine the eigen vectors and values of a N x N real symmetric matrix
/// by Jacobi transformations. This method is most suitable for N < 10.
///
/// Taken and adapted from Numerical Recipes paragraph 11.1
///
/// An eigen vector is a vector v for which \f$A \: v = \lambda \: v\f$
///
/// Where:
/// A: A square matrix.
/// \f$\lambda\f$: a non-zero constant value.
///
/// @see https://en.wikipedia.org/wiki/Eigenvalues_and_eigenvectors
///
/// Matrix is a matrix type, which has dimensions N x N.
/// @param inMatrix is the matrix of which to return the eigenvalues and vectors
/// @param outEigVec will contain a matrix whose columns contain the normalized eigenvectors (must be identity before call)
/// @param outEigVal will contain the eigenvalues
template <class Vector, class Matrix>
bool EigenValueSymmetric(const Matrix &inMatrix, Matrix &outEigVec, Vector &outEigVal)
{
	// This algorithm can generate infinite values, see comment below
	FPExceptionDisableInvalid disable_invalid;
	JPH_UNUSED(disable_invalid);

	// Maximum number of sweeps to make
	const int cMaxSweeps = 50;

	// Get problem dimension
	const uint n = inMatrix.GetRows();

	// Make sure the dimensions are right
	JPH_ASSERT(inMatrix.GetRows() == n);
	JPH_ASSERT(inMatrix.GetCols() == n);
	JPH_ASSERT(outEigVec.GetRows() == n);
	JPH_ASSERT(outEigVec.GetCols() == n);
	JPH_ASSERT(outEigVal.GetRows() == n);
	JPH_ASSERT(outEigVec.IsIdentity());

	// Get the matrix in a so we can mess with it
	Matrix a = inMatrix;

	// b holds the current eigenvalue estimates, z accumulates the rotation corrections of one sweep
	Vector b, z;

	for (uint ip = 0; ip < n; ++ip)
	{
		// Initialize b to diagonal of a
		b[ip] = a(ip, ip);

		// Initialize output to diagonal of a
		outEigVal[ip] = a(ip, ip);

		// Reset z
		z[ip] = 0.0f;
	}

	for (int sweep = 0; sweep < cMaxSweeps; ++sweep)
	{
		// Get the sum of the off-diagonal elements of a
		float sm = 0.0f;
		for (uint ip = 0; ip < n - 1; ++ip)
			for (uint iq = ip + 1; iq < n; ++iq)
				sm += abs(a(ip, iq));
		float avg_sm = sm / Square(n);

		// Normal return, convergence to machine underflow
		if (avg_sm < FLT_MIN) // Original code: sm == 0.0f, when the average is denormal, we also consider it machine underflow
		{
			// Sanity checks
			#ifdef JPH_ENABLE_ASSERTS
				for (uint c = 0; c < n; ++c)
				{
					// Check if the eigenvector is normalized
					JPH_ASSERT(outEigVec.GetColumn(c).IsNormalized());

					// Check if inMatrix * eigen_vector = eigen_value * eigen_vector
					Vector mat_eigvec = inMatrix * outEigVec.GetColumn(c);
					Vector eigval_eigvec = outEigVal[c] * outEigVec.GetColumn(c);
					JPH_ASSERT(mat_eigvec.IsClose(eigval_eigvec, max(mat_eigvec.LengthSq(), eigval_eigvec.LengthSq()) * 1.0e-6f));
				}
			#endif

			// Success
			return true;
		}

		// On the first three sweeps use a fraction of the sum of the off diagonal elements as threshold
		// Note that we pick a minimum threshold of FLT_MIN because dividing by a denormalized number is likely to result in infinity.
		float thresh = sweep < 4? 0.2f * avg_sm : FLT_MIN; // Original code: 0.0f instead of FLT_MIN

		for (uint ip = 0; ip < n - 1; ++ip)
			for (uint iq = ip + 1; iq < n; ++iq)
			{
				float &a_pq = a(ip, iq);
				float &eigval_p = outEigVal[ip];
				float &eigval_q = outEigVal[iq];

				float abs_a_pq = abs(a_pq);
				float g = 100.0f * abs_a_pq;

				// After four sweeps, skip the rotation if the off-diagonal element is small
				// (the float additions below only leave the value unchanged when g is negligible compared to the eigenvalues)
				if (sweep > 4
					&& abs(eigval_p) + g == abs(eigval_p)
					&& abs(eigval_q) + g == abs(eigval_q))
				{
					a_pq = 0.0f;
				}
				else if (abs_a_pq > thresh)
				{
					// Perform a Jacobi rotation that zeroes out a(ip, iq)
					float h = eigval_q - eigval_p;
					float abs_h = abs(h);

					// t = tan(rotation angle)
					float t;
					if (abs_h + g == abs_h)
					{
						t = a_pq / h;
					}
					else
					{
						float theta = 0.5f * h / a_pq; // Warning: Can become infinite if a(ip, iq) is very small which may trigger an invalid float exception
						t = 1.0f / (abs(theta) + sqrt(1.0f + theta * theta)); // If theta becomes inf, t will be 0 so the infinite is not a problem for the algorithm
						if (theta < 0.0f) t = -t;
					}

					// c = cos(rotation angle), s = sin(rotation angle), tau = tan(half rotation angle)
					float c = 1.0f / sqrt(1.0f + t * t);
					float s = t * c;
					float tau = s / (1.0f + c);
					h = t * a_pq;

					a_pq = 0.0f;

					z[ip] -= h;
					z[iq] += h;

					eigval_p -= h;
					eigval_q += h;

					// Apply the rotation to the remaining elements of a and accumulate it into the eigenvectors
					#define JPH_EVS_ROTATE(a, i, j, k, l)		\
						g = a(i, j),							\
						h = a(k, l),							\
						a(i, j) = g - s * (h + g * tau),		\
						a(k, l) = h + s * (g - h * tau)

					uint j;
					for (j = 0; j < ip; ++j)		JPH_EVS_ROTATE(a, j, ip, j, iq);
					for (j = ip + 1; j < iq; ++j)	JPH_EVS_ROTATE(a, ip, j, j, iq);
					for (j = iq + 1; j < n; ++j)	JPH_EVS_ROTATE(a, ip, j, iq, j);
					for (j = 0; j < n; ++j)			JPH_EVS_ROTATE(outEigVec, j, ip, j, iq);

					#undef JPH_EVS_ROTATE
				}
			}

		// Update eigenvalues with the sum of t * a_pq and reinitialize z
		for (uint ip = 0; ip < n; ++ip)
		{
			b[ip] += z[ip];
			outEigVal[ip] = b[ip];
			z[ip] = 0.0f;
		}
	}

	// Failure
	JPH_ASSERT(false, "Too many iterations");
	return false;
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,42 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Find the roots of \f$inA \: x^2 + inB \: x + inC = 0\f$.
/// @return The number of roots, actual roots in outX1 and outX2.
/// If number of roots returned is 1 then outX1 == outX2.
template <typename T>
inline int FindRoot(const T inA, const T inB, const T inC, T &outX1, T &outX2)
{
	// Degenerate case: without a quadratic term the equation is linear: inB x + inC = 0
	if (inA == T(0))
	{
		// A constant equation has no roots
		if (inB == T(0))
			return 0;

		// Single root of the linear equation
		outX1 = outX2 = -inC / inB;
		return 1;
	}

	// The discriminant decides whether there are real roots at all
	const T discriminant = Square(inB) - T(4) * inA * inC;
	if (discriminant < T(0))
		return 0;

	// Numerically stable formulation, see Numerical Recipes in C, Chapter 5.6 Quadratic and Cubic Equations:
	// q = -(b + sign(b) sqrt(det)) / 2, x1 = q / a, x2 = c / q
	const T q = (inB + Sign(inB) * sqrt(discriminant)) / T(-2);
	outX1 = q / inA;
	if (q == T(0))
	{
		// Double root
		outX2 = outX1;
		return 1;
	}
	outX2 = inC / q;
	return 2;
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,36 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Class that holds 2 floats, used as a storage class mainly.
class [[nodiscard]] Float2
{
public:
	JPH_OVERRIDE_NEW_DELETE

	Float2() = default; ///< Intentionally not initialized for performance reasons
	Float2(const Float2 &inRHS) = default;
	Float2 & operator = (const Float2 &inRHS) = default;
	Float2(float inX, float inY) : x(inX), y(inY) { }

	/// Exact component wise comparison
	bool operator == (const Float2 &inRHS) const { return x == inRHS.x && y == inRHS.y; }
	bool operator != (const Float2 &inRHS) const { return !(*this == inRHS); }

	/// To String
	friend ostream & operator << (ostream &inStream, const Float2 &inV)
	{
		return inStream << inV.x << ", " << inV.y;
	}

	float x;
	float y;
};
static_assert(std::is_trivial<Float2>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END

View File

@@ -0,0 +1,50 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/HashCombine.h>
JPH_NAMESPACE_BEGIN
/// Class that holds 3 floats. Used as a storage class. Convert to Vec3 for calculations.
class [[nodiscard]] Float3
{
public:
	JPH_OVERRIDE_NEW_DELETE

	Float3() = default; ///< Intentionally not initialized for performance reasons
	Float3(const Float3 &inRHS) = default;
	Float3 & operator = (const Float3 &inRHS) = default;
	constexpr Float3(float inX, float inY, float inZ) : x(inX), y(inY), z(inZ) { }

	/// Get component by index (0 = x, 1 = y, 2 = z)
	float operator [] (int inCoordinate) const
	{
		JPH_ASSERT(inCoordinate >= 0 && inCoordinate < 3); // Also guard against negative indices, reading before &x is out of bounds
		return *(&x + inCoordinate);
	}

	/// Exact component wise equality
	bool operator == (const Float3 &inRHS) const
	{
		return x == inRHS.x && y == inRHS.y && z == inRHS.z;
	}

	bool operator != (const Float3 &inRHS) const
	{
		return x != inRHS.x || y != inRHS.y || z != inRHS.z;
	}

	float x;
	float y;
	float z;
};
using VertexList = Array<Float3>;
static_assert(std::is_trivial<Float3>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
// Create a std::hash/JPH::Hash for Float3
JPH_MAKE_HASHABLE(JPH::Float3, t.x, t.y, t.z)

View File

@@ -0,0 +1,33 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Class that holds 4 float values. Convert to Vec4 to perform calculations.
class [[nodiscard]] Float4
{
public:
	JPH_OVERRIDE_NEW_DELETE

	Float4() = default; ///< Intentionally not initialized for performance reasons
	Float4(const Float4 &inRHS) = default;
	Float4(float inX, float inY, float inZ, float inW) : x(inX), y(inY), z(inZ), w(inW) { }

	/// Get component by index (0 = x, 1 = y, 2 = z, 3 = w)
	float operator [] (int inCoordinate) const
	{
		JPH_ASSERT(inCoordinate >= 0 && inCoordinate < 4); // Also guard against negative indices, reading before &x is out of bounds
		return *(&x + inCoordinate);
	}

	float x;
	float y;
	float z;
	float w;
};
static_assert(std::is_trivial<Float4>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END

View File

@@ -0,0 +1,102 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// This function performs Gauss-Jordan elimination to solve a matrix equation.
/// A must be an NxN matrix and B must be an NxM matrix forming the equation A * x = B
/// on output B will contain x and A will be destroyed.
///
/// This code can be used for example to compute the inverse of a matrix.
/// Set A to the matrix to invert, set B to identity and let GaussianElimination solve
/// the equation, on return B will be the inverse of A. And A is destroyed.
///
/// Taken and adapted from Numerical Recipes in C paragraph 2.1
template <class MatrixA, class MatrixB>
bool GaussianElimination(MatrixA &ioA, MatrixB &ioB, float inTolerance = 1.0e-16f)
{
	// Get problem dimensions
	const uint n = ioA.GetCols();
	const uint m = ioB.GetCols();

	// Check matrix requirement
	JPH_ASSERT(ioA.GetRows() == n);
	JPH_ASSERT(ioB.GetRows() == n);

	// Create array for bookkeeping on pivoting, ipiv[c] counts how often column c has been used as pivot
	int *ipiv = (int *)JPH_STACK_ALLOC(n * sizeof(int));
	memset(ipiv, 0, n * sizeof(int));

	for (uint i = 0; i < n; ++i)
	{
		// Initialize pivot element as the diagonal
		uint pivot_row = i, pivot_col = i;

		// Determine pivot element: full pivoting, search all rows/columns not yet pivoted on for the largest absolute element
		float largest_element = 0.0f;
		for (uint j = 0; j < n; ++j)
			if (ipiv[j] != 1)
				for (uint k = 0; k < n; ++k)
				{
					if (ipiv[k] == 0)
					{
						float element = abs(ioA(j, k));
						if (element >= largest_element)
						{
							largest_element = element;
							pivot_row = j;
							pivot_col = k;
						}
					}
					else if (ipiv[k] > 1)
					{
						// A column used as pivot more than once indicates a singular matrix
						return false;
					}
				}

		// Mark this column as used
		++ipiv[pivot_col];

		// Exchange rows when needed so that the pivot element is at ioA(pivot_col, pivot_col) instead of at ioA(pivot_row, pivot_col)
		if (pivot_row != pivot_col)
		{
			for (uint j = 0; j < n; ++j)
				std::swap(ioA(pivot_row, j), ioA(pivot_col, j));
			for (uint j = 0; j < m; ++j)
				std::swap(ioB(pivot_row, j), ioB(pivot_col, j));
		}

		// Get diagonal element that we are about to set to 1
		float diagonal_element = ioA(pivot_col, pivot_col);
		if (abs(diagonal_element) < inTolerance)
			return false;

		// Divide the whole row by the pivot element, making ioA(pivot_col, pivot_col) = 1
		for (uint j = 0; j < n; ++j)
			ioA(pivot_col, j) /= diagonal_element;
		for (uint j = 0; j < m; ++j)
			ioB(pivot_col, j) /= diagonal_element;
		ioA(pivot_col, pivot_col) = 1.0f;

		// Next reduce the rows, except for the pivot one,
		// after this step the pivot_col column is zero except for the pivot element which is 1
		for (uint j = 0; j < n; ++j)
			if (j != pivot_col)
			{
				float element = ioA(j, pivot_col);
				for (uint k = 0; k < n; ++k)
					ioA(j, k) -= ioA(pivot_col, k) * element;
				for (uint k = 0; k < m; ++k)
					ioB(j, k) -= ioB(pivot_col, k) * element;
				ioA(j, pivot_col) = 0.0f;
			}
	}

	// Success
	return true;
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,208 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/Vec4.h>
#include <Jolt/Core/FPException.h>
JPH_NAMESPACE_BEGIN
using HalfFloat = uint16; ///< A 16 bit half precision float stored as its IEEE 754 binary16 bit pattern

// Define half float constant values
static constexpr HalfFloat HALF_FLT_MAX = 0x7bff;			///< Largest finite half float (65504)
static constexpr HalfFloat HALF_FLT_MAX_NEGATIVE = 0xfbff;	///< Most negative finite half float (-65504)
static constexpr HalfFloat HALF_FLT_INF = 0x7c00;			///< Positive infinity (exponent all ones, mantissa zero)
static constexpr HalfFloat HALF_FLT_INF_NEGATIVE = 0xfc00;	///< Negative infinity
static constexpr HalfFloat HALF_FLT_NANQ = 0x7e00;			///< Quiet NaN
static constexpr HalfFloat HALF_FLT_NANQ_NEGATIVE = 0xfe00;	///< Quiet NaN with the sign bit set
namespace HalfFloatConversion {
// Layout of a float (IEEE 754 binary32: 1 sign bit, 8 exponent bits, 23 mantissa bits)
static constexpr int FLOAT_SIGN_POS = 31;
static constexpr int FLOAT_EXPONENT_POS = 23;
static constexpr int FLOAT_EXPONENT_BITS = 8;
static constexpr int FLOAT_EXPONENT_MASK = (1 << FLOAT_EXPONENT_BITS) - 1;
static constexpr int FLOAT_EXPONENT_BIAS = 127;
static constexpr int FLOAT_MANTISSA_BITS = 23;
static constexpr int FLOAT_MANTISSA_MASK = (1 << FLOAT_MANTISSA_BITS) - 1;
static constexpr int FLOAT_EXPONENT_AND_MANTISSA_MASK = FLOAT_MANTISSA_MASK + (FLOAT_EXPONENT_MASK << FLOAT_EXPONENT_POS); ///< All bits except the sign bit

// Layout of half float (IEEE 754 binary16: 1 sign bit, 5 exponent bits, 10 mantissa bits)
static constexpr int HALF_FLT_SIGN_POS = 15;
static constexpr int HALF_FLT_EXPONENT_POS = 10;
static constexpr int HALF_FLT_EXPONENT_BITS = 5;
static constexpr int HALF_FLT_EXPONENT_MASK = (1 << HALF_FLT_EXPONENT_BITS) - 1;
static constexpr int HALF_FLT_EXPONENT_BIAS = 15;
static constexpr int HALF_FLT_MANTISSA_BITS = 10;
static constexpr int HALF_FLT_MANTISSA_MASK = (1 << HALF_FLT_MANTISSA_BITS) - 1;
static constexpr int HALF_FLT_EXPONENT_AND_MANTISSA_MASK = HALF_FLT_MANTISSA_MASK + (HALF_FLT_EXPONENT_MASK << HALF_FLT_EXPONENT_POS); ///< All bits except the sign bit

/// Define half-float rounding modes
enum ERoundingMode
{
	ROUND_TO_NEG_INF, ///< Round to negative infinity
	ROUND_TO_POS_INF, ///< Round to positive infinity
	ROUND_TO_NEAREST, ///< Round to nearest value
};
/// Convert a float (32-bits) to a half float (16-bits), fallback version when no intrinsics available
template <int RoundingMode>
inline HalfFloat FromFloatFallback(float inV)
{
	// Reinterpret the float as an uint32
	uint32 value = BitCast<uint32>(inV);

	// Extract exponent
	uint32 exponent = (value >> FLOAT_EXPONENT_POS) & FLOAT_EXPONENT_MASK;

	// Extract mantissa
	uint32 mantissa = value & FLOAT_MANTISSA_MASK;

	// Extract the sign and move it into the right spot for the half float (so we can just or it in at the end)
	HalfFloat hf_sign = HalfFloat(value >> (FLOAT_SIGN_POS - HALF_FLT_SIGN_POS)) & (1 << HALF_FLT_SIGN_POS);

	// Check NaN or INF
	if (exponent == FLOAT_EXPONENT_MASK) // NaN or INF
		return hf_sign | (mantissa == 0? HALF_FLT_INF : HALF_FLT_NANQ);

	// Rebias the exponent for half floats
	int rebiased_exponent = int(exponent) - FLOAT_EXPONENT_BIAS + HALF_FLT_EXPONENT_BIAS;

	// Check overflow to infinity
	if (rebiased_exponent >= HALF_FLT_EXPONENT_MASK)
	{
		// Round to infinity when rounding to nearest or when rounding towards the sign of the value, otherwise clamp to the largest finite half float
		bool round_up = RoundingMode == ROUND_TO_NEAREST || (hf_sign == 0) == (RoundingMode == ROUND_TO_POS_INF);
		return hf_sign | (round_up? HALF_FLT_INF : HALF_FLT_MAX);
	}

	// Check underflow to zero
	if (rebiased_exponent < -HALF_FLT_MANTISSA_BITS)
	{
		// Too small even for a denormal, result is zero or the smallest denormal (bit pattern 1) depending on the rounding direction
		bool round_up = RoundingMode != ROUND_TO_NEAREST && (hf_sign == 0) == (RoundingMode == ROUND_TO_POS_INF) && (value & FLOAT_EXPONENT_AND_MANTISSA_MASK) != 0;
		return hf_sign | (round_up? 1 : 0);
	}

	HalfFloat hf_exponent;
	int shift;
	if (rebiased_exponent <= 0)
	{
		// Underflow to denormalized number
		hf_exponent = 0;
		mantissa |= 1 << FLOAT_MANTISSA_BITS; // Add the implicit 1 bit to the mantissa
		shift = FLOAT_MANTISSA_BITS - HALF_FLT_MANTISSA_BITS + 1 - rebiased_exponent;
	}
	else
	{
		// Normal half float
		hf_exponent = HalfFloat(rebiased_exponent << HALF_FLT_EXPONENT_POS);
		shift = FLOAT_MANTISSA_BITS - HALF_FLT_MANTISSA_BITS;
	}

	// Compose the half float
	HalfFloat hf_mantissa = HalfFloat(mantissa >> shift);
	HalfFloat hf = hf_sign | hf_exponent | hf_mantissa;

	// Calculate the remaining bits that we're discarding
	uint remainder = mantissa & ((1 << shift) - 1);

	if constexpr (RoundingMode == ROUND_TO_NEAREST)
	{
		// Round to nearest
		uint round_threshold = 1 << (shift - 1);
		if (remainder > round_threshold // Above threshold, we must always round
			|| (remainder == round_threshold && (hf_mantissa & 1))) // When equal, round to nearest even
			hf++; // May overflow to infinity
	}
	else
	{
		// Round up or down (truncate) depending on the rounding mode
		bool round_up = (hf_sign == 0) == (RoundingMode == ROUND_TO_POS_INF) && remainder != 0;
		if (round_up)
			hf++; // May overflow to infinity
	}

	return hf;
}
/// Convert a float (32-bits) to a half float (16-bits)
template <int RoundingMode>
JPH_INLINE HalfFloat FromFloat(float inV)
{
#ifdef JPH_USE_F16C
	// The conversion can overflow to infinity for out of range values, suppress the resulting FP overflow exception
	FPExceptionDisableOverflow disable_overflow;
	JPH_UNUSED(disable_overflow);

	// _mm_cvtps_ph produces 8 half floats, only the first is used
	union
	{
		__m128i u128;
		HalfFloat u16[8];
	} hf;
	__m128 val = _mm_load_ss(&inV);
	switch (RoundingMode)
	{
	case ROUND_TO_NEG_INF:
		hf.u128 = _mm_cvtps_ph(val, _MM_FROUND_TO_NEG_INF);
		break;
	case ROUND_TO_POS_INF:
		hf.u128 = _mm_cvtps_ph(val, _MM_FROUND_TO_POS_INF);
		break;
	case ROUND_TO_NEAREST:
		hf.u128 = _mm_cvtps_ph(val, _MM_FROUND_TO_NEAREST_INT);
		break;
	}
	return hf.u16[0];
#else
	// No hardware support, use the bit manipulating fallback
	return FromFloatFallback<RoundingMode>(inV);
#endif
}
/// Convert 4 half floats (lower 64 bits) to floats, fallback version when no intrinsics available
inline Vec4 ToFloatFallback(UVec4Arg inValue)
{
	// Unpack half floats to 4 uint32's
	UVec4 value = inValue.Expand4Uint16Lo();

	// Normal half float path, extract the exponent and mantissa, shift them into place and update the exponent bias
	UVec4 exponent_mantissa = UVec4::sAnd(value, UVec4::sReplicate(HALF_FLT_EXPONENT_AND_MANTISSA_MASK)).LogicalShiftLeft<FLOAT_EXPONENT_POS - HALF_FLT_EXPONENT_POS>() + UVec4::sReplicate((FLOAT_EXPONENT_BIAS - HALF_FLT_EXPONENT_BIAS) << FLOAT_EXPONENT_POS);

	// Denormalized half float path, renormalize the float (the float subtraction normalizes the mantissa and removes the offset that was added to the exponent)
	UVec4 exponent_mantissa_denormalized = ((exponent_mantissa + UVec4::sReplicate(1 << FLOAT_EXPONENT_POS)).ReinterpretAsFloat() - UVec4::sReplicate((FLOAT_EXPONENT_BIAS - HALF_FLT_EXPONENT_BIAS + 1) << FLOAT_EXPONENT_POS).ReinterpretAsFloat()).ReinterpretAsInt();

	// NaN / INF path, set all exponent bits
	UVec4 exponent_mantissa_nan_inf = UVec4::sOr(exponent_mantissa, UVec4::sReplicate(FLOAT_EXPONENT_MASK << FLOAT_EXPONENT_POS));

	// Get the exponent to determine which of the paths we should take
	UVec4 exponent_mask = UVec4::sReplicate(HALF_FLT_EXPONENT_MASK << HALF_FLT_EXPONENT_POS);
	UVec4 exponent = UVec4::sAnd(value, exponent_mask);
	UVec4 is_denormalized = UVec4::sEquals(exponent, UVec4::sZero()); // Exponent all zeros: zero or denormal
	UVec4 is_nan_inf = UVec4::sEquals(exponent, exponent_mask); // Exponent all ones: NaN or INF

	// Select the correct result
	UVec4 result_exponent_mantissa = UVec4::sSelect(UVec4::sSelect(exponent_mantissa, exponent_mantissa_nan_inf, is_nan_inf), exponent_mantissa_denormalized, is_denormalized);

	// Extract the sign bit and shift it to the left
	UVec4 sign = UVec4::sAnd(value, UVec4::sReplicate(1 << HALF_FLT_SIGN_POS)).LogicalShiftLeft<FLOAT_SIGN_POS - HALF_FLT_SIGN_POS>();

	// Construct the float
	return UVec4::sOr(sign, result_exponent_mantissa).ReinterpretAsFloat();
}
/// Convert 4 half floats (lower 64 bits) to floats
JPH_INLINE Vec4 ToFloat(UVec4Arg inValue)
{
#if defined(JPH_USE_F16C)
	// Hardware conversion of the 4 half floats stored in the lower 64 bits
	return _mm_cvtph_ps(inValue.mValue);
#elif defined(JPH_USE_NEON)
	// Reinterpret the lower 64 bits as 4 half floats and widen them to 32 bit floats
	return vcvt_f32_f16(vreinterpret_f16_u32(vget_low_u32(inValue.mValue)));
#else
	// No hardware support, use the bit manipulating fallback
	return ToFloatFallback(inValue);
#endif
}
} // HalfFloatConversion
JPH_NAMESPACE_END

View File

@@ -0,0 +1,243 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/MathTypes.h>
JPH_NAMESPACE_BEGIN
/// Holds a 4x4 matrix of floats, but supports also operations on the 3x3 upper left part of the matrix.
class [[nodiscard]] alignas(JPH_VECTOR_ALIGNMENT) Mat44
{
public:
JPH_OVERRIDE_NEW_DELETE
// Underlying column type
using Type = Vec4::Type;
// Argument type
using ArgType = Mat44Arg;
/// Constructor
Mat44() = default; ///< Intentionally not initialized for performance reasons
JPH_INLINE Mat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, Vec4Arg inC4);
JPH_INLINE Mat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, Vec3Arg inC4);
Mat44(const Mat44 &inM2) = default;
Mat44 & operator = (const Mat44 &inM2) = default;
JPH_INLINE Mat44(Type inC1, Type inC2, Type inC3, Type inC4);
/// Zero matrix
static JPH_INLINE Mat44 sZero();
/// Identity matrix
static JPH_INLINE Mat44 sIdentity();
/// Matrix filled with NaN's
static JPH_INLINE Mat44 sNaN();
/// Load 16 floats from memory
static JPH_INLINE Mat44 sLoadFloat4x4(const Float4 *inV);
/// Load 16 floats from memory, 16 bytes aligned
static JPH_INLINE Mat44 sLoadFloat4x4Aligned(const Float4 *inV);
/// Rotate around X, Y or Z axis (angle in radians)
static JPH_INLINE Mat44 sRotationX(float inX);
static JPH_INLINE Mat44 sRotationY(float inY);
static JPH_INLINE Mat44 sRotationZ(float inZ);
/// Rotate around arbitrary axis
static JPH_INLINE Mat44 sRotation(Vec3Arg inAxis, float inAngle);
/// Rotate from quaternion
static JPH_INLINE Mat44 sRotation(QuatArg inQuat);
/// Get matrix that translates
static JPH_INLINE Mat44 sTranslation(Vec3Arg inV);
/// Get matrix that rotates and translates
static JPH_INLINE Mat44 sRotationTranslation(QuatArg inR, Vec3Arg inT);
/// Get inverse matrix of sRotationTranslation
static JPH_INLINE Mat44 sInverseRotationTranslation(QuatArg inR, Vec3Arg inT);
/// Get matrix that scales uniformly
static JPH_INLINE Mat44 sScale(float inScale);
/// Get matrix that scales (produces a matrix with (inV, 1) on its diagonal)
static JPH_INLINE Mat44 sScale(Vec3Arg inV);
/// Get outer product of inV and inV2 (equivalent to \f$inV1 \otimes inV2\f$)
static JPH_INLINE Mat44 sOuterProduct(Vec3Arg inV1, Vec3Arg inV2);
/// Get matrix that represents a cross product \f$A \times B = \text{sCrossProduct}(A) \: B\f$
static JPH_INLINE Mat44 sCrossProduct(Vec3Arg inV);
/// Returns matrix ML so that \f$ML(q) \: p = q \: p\f$ (where p and q are quaternions)
static JPH_INLINE Mat44 sQuatLeftMultiply(QuatArg inQ);
/// Returns matrix MR so that \f$MR(q) \: p = p \: q\f$ (where p and q are quaternions)
static JPH_INLINE Mat44 sQuatRightMultiply(QuatArg inQ);
/// Returns a look at matrix that transforms from world space to view space
/// @param inPos Position of the camera
/// @param inTarget Target of the camera
/// @param inUp Up vector
static JPH_INLINE Mat44 sLookAt(Vec3Arg inPos, Vec3Arg inTarget, Vec3Arg inUp);
/// Returns a right-handed perspective projection matrix
static JPH_INLINE Mat44 sPerspective(float inFovY, float inAspect, float inNear, float inFar);
/// Get float component by element index
JPH_INLINE float operator () (uint inRow, uint inColumn) const { JPH_ASSERT(inRow < 4); JPH_ASSERT(inColumn < 4); return mCol[inColumn].mF32[inRow]; }
JPH_INLINE float & operator () (uint inRow, uint inColumn) { JPH_ASSERT(inRow < 4); JPH_ASSERT(inColumn < 4); return mCol[inColumn].mF32[inRow]; }
/// Comparison
JPH_INLINE bool operator == (Mat44Arg inM2) const;
JPH_INLINE bool operator != (Mat44Arg inM2) const { return !(*this == inM2); }
/// Test if two matrices are close
JPH_INLINE bool IsClose(Mat44Arg inM2, float inMaxDistSq = 1.0e-12f) const;
/// Multiply matrix by matrix
JPH_INLINE Mat44 operator * (Mat44Arg inM) const;
/// Multiply vector by matrix
JPH_INLINE Vec3 operator * (Vec3Arg inV) const;
JPH_INLINE Vec4 operator * (Vec4Arg inV) const;
/// Multiply vector by only 3x3 part of the matrix
JPH_INLINE Vec3 Multiply3x3(Vec3Arg inV) const;
/// Multiply vector by only 3x3 part of the transpose of the matrix (\f$result = this^T \: inV\f$)
JPH_INLINE Vec3 Multiply3x3Transposed(Vec3Arg inV) const;
/// Multiply 3x3 matrix by 3x3 matrix
JPH_INLINE Mat44 Multiply3x3(Mat44Arg inM) const;
/// Multiply transpose of 3x3 matrix by 3x3 matrix (\f$result = this^T \: inM\f$)
JPH_INLINE Mat44 Multiply3x3LeftTransposed(Mat44Arg inM) const;
/// Multiply 3x3 matrix by the transpose of a 3x3 matrix (\f$result = this \: inM^T\f$)
JPH_INLINE Mat44 Multiply3x3RightTransposed(Mat44Arg inM) const;
/// Multiply matrix with float
JPH_INLINE Mat44 operator * (float inV) const;
/// Multiply float with matrix (scalar multiplication commutes, so this forwards to matrix * float)
friend JPH_INLINE Mat44 operator * (float inV, Mat44Arg inM) { return inM * inV; }
/// Multiply matrix with float in place
JPH_INLINE Mat44 & operator *= (float inV);
/// Per element addition of matrix
JPH_INLINE Mat44 operator + (Mat44Arg inM) const;
/// Negate
JPH_INLINE Mat44 operator - () const;
/// Per element subtraction of matrix
JPH_INLINE Mat44 operator - (Mat44Arg inM) const;
/// Per element addition of matrix
JPH_INLINE Mat44 & operator += (Mat44Arg inM);
/// Access to the columns (matrix is stored column major, so an axis is a column of the 3x3 part)
JPH_INLINE Vec3 GetAxisX() const { return Vec3(mCol[0]); }
JPH_INLINE void SetAxisX(Vec3Arg inV) { mCol[0] = Vec4(inV, 0.0f); }
JPH_INLINE Vec3 GetAxisY() const { return Vec3(mCol[1]); }
JPH_INLINE void SetAxisY(Vec3Arg inV) { mCol[1] = Vec4(inV, 0.0f); }
JPH_INLINE Vec3 GetAxisZ() const { return Vec3(mCol[2]); }
JPH_INLINE void SetAxisZ(Vec3Arg inV) { mCol[2] = Vec4(inV, 0.0f); }
/// Translation lives in column 3; the setter writes 1 in the w component to keep the matrix affine
JPH_INLINE Vec3 GetTranslation() const { return Vec3(mCol[3]); }
JPH_INLINE void SetTranslation(Vec3Arg inV) { mCol[3] = Vec4(inV, 1.0f); }
/// Diagonal of the 3x3 part (setters leave the off-diagonal elements untouched)
JPH_INLINE Vec3 GetDiagonal3() const { return Vec3(mCol[0][0], mCol[1][1], mCol[2][2]); }
JPH_INLINE void SetDiagonal3(Vec3Arg inV) { mCol[0][0] = inV.GetX(); mCol[1][1] = inV.GetY(); mCol[2][2] = inV.GetZ(); }
/// Full 4 element diagonal
JPH_INLINE Vec4 GetDiagonal4() const { return Vec4(mCol[0][0], mCol[1][1], mCol[2][2], mCol[3][3]); }
JPH_INLINE void SetDiagonal4(Vec4Arg inV) { mCol[0][0] = inV.GetX(); mCol[1][1] = inV.GetY(); mCol[2][2] = inV.GetZ(); mCol[3][3] = inV.GetW(); }
/// Generic column access; SetColumn3 writes w = 1 for column 3 (translation) and w = 0 otherwise
JPH_INLINE Vec3 GetColumn3(uint inCol) const { JPH_ASSERT(inCol < 4); return Vec3(mCol[inCol]); }
JPH_INLINE void SetColumn3(uint inCol, Vec3Arg inV) { JPH_ASSERT(inCol < 4); mCol[inCol] = Vec4(inV, inCol == 3? 1.0f : 0.0f); }
JPH_INLINE Vec4 GetColumn4(uint inCol) const { JPH_ASSERT(inCol < 4); return mCol[inCol]; }
JPH_INLINE void SetColumn4(uint inCol, Vec4Arg inV) { JPH_ASSERT(inCol < 4); mCol[inCol] = inV; }
/// Store matrix to memory
JPH_INLINE void StoreFloat4x4(Float4 *outV) const;
/// Transpose matrix
JPH_INLINE Mat44 Transposed() const;
/// Transpose 3x3 subpart of matrix
JPH_INLINE Mat44 Transposed3x3() const;
/// Inverse 4x4 matrix
JPH_INLINE Mat44 Inversed() const;
/// Inverse 4x4 matrix when it only contains rotation and translation
JPH_INLINE Mat44 InversedRotationTranslation() const;
/// Get the determinant of a 3x3 matrix
JPH_INLINE float GetDeterminant3x3() const;
/// Get the adjoint of a 3x3 matrix
JPH_INLINE Mat44 Adjointed3x3() const;
/// Inverse 3x3 matrix
JPH_INLINE Mat44 Inversed3x3() const;
/// *this = inM.Inversed3x3(), returns false if the matrix is singular in which case *this is unchanged
JPH_INLINE bool SetInversed3x3(Mat44Arg inM);
/// Get rotation part only (note: retains the first 3 values from the bottom row)
JPH_INLINE Mat44 GetRotation() const;
/// Get rotation part only (note: also clears the bottom row)
JPH_INLINE Mat44 GetRotationSafe() const;
/// Updates the rotation part of this matrix (the first 3 columns)
JPH_INLINE void SetRotation(Mat44Arg inRotation);
/// Convert to quaternion
JPH_INLINE Quat GetQuaternion() const;
/// Get matrix that transforms a direction with the same transform as this matrix (length is not preserved).
/// This is the inverse-transpose of the 3x3 part, which transforms normals correctly under non-uniform scale.
JPH_INLINE Mat44 GetDirectionPreservingMatrix() const { return GetRotation().Inversed3x3().Transposed3x3(); }
/// Pre multiply by translation matrix: result = this * Mat44::sTranslation(inTranslation)
JPH_INLINE Mat44 PreTranslated(Vec3Arg inTranslation) const;
/// Post multiply by translation matrix: result = Mat44::sTranslation(inTranslation) * this (i.e. add inTranslation to the 4-th column)
JPH_INLINE Mat44 PostTranslated(Vec3Arg inTranslation) const;
/// Scale a matrix: result = this * Mat44::sScale(inScale)
JPH_INLINE Mat44 PreScaled(Vec3Arg inScale) const;
/// Scale a matrix: result = Mat44::sScale(inScale) * this
JPH_INLINE Mat44 PostScaled(Vec3Arg inScale) const;
/// Decompose a matrix into a rotation & translation part and into a scale part so that:
/// this = return_value * Mat44::sScale(outScale).
/// This equation only holds when the matrix is orthogonal, if it is not the returned matrix
/// will be made orthogonal using the modified Gram-Schmidt algorithm (see: https://en.wikipedia.org/wiki/Gram%E2%80%93Schmidt_process)
JPH_INLINE Mat44 Decompose(Vec3 &outScale) const;
#ifndef JPH_DOUBLE_PRECISION
/// In single precision mode just return the matrix itself
JPH_INLINE Mat44 ToMat44() const { return *this; }
#endif // !JPH_DOUBLE_PRECISION
/// To String: streams the four columns separated by commas
friend ostream & operator << (ostream &inStream, Mat44Arg inM)
{
	inStream << inM.mCol[0] << ", " << inM.mCol[1] << ", " << inM.mCol[2] << ", " << inM.mCol[3];
	return inStream;
}
private:
Vec4 mCol[4]; ///< Column
};
static_assert(std::is_trivial<Mat44>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
#include "Mat44.inl"

View File

@@ -0,0 +1,952 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/Vec3.h>
#include <Jolt/Math/Vec4.h>
#include <Jolt/Math/Quat.h>
JPH_NAMESPACE_BEGIN
#define JPH_EL(r, c) mCol[c].mF32[r]
/// Construct from four column vectors
Mat44::Mat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, Vec4Arg inC4) :
	mCol { inC1, inC2, inC3, inC4 }
{
}
/// Construct from three column vectors and a translation (w of the last column is set to 1)
Mat44::Mat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, Vec3Arg inC4) :
	mCol { inC1, inC2, inC3, Vec4(inC4, 1.0f) }
{
}
/// Construct from four raw SIMD registers
Mat44::Mat44(Type inC1, Type inC2, Type inC3, Type inC4) :
	mCol { inC1, inC2, inC3, inC4 }
{
}
/// All-zero matrix
Mat44 Mat44::sZero()
{
	return Mat44(Vec4::sZero(), Vec4::sZero(), Vec4::sZero(), Vec4::sZero());
}
/// Identity matrix
Mat44 Mat44::sIdentity()
{
	return Mat44(Vec4(1, 0, 0, 0), Vec4(0, 1, 0, 0), Vec4(0, 0, 1, 0), Vec4(0, 0, 0, 1));
}
/// Matrix filled with NaN, useful to catch uninitialized use
Mat44 Mat44::sNaN()
{
	return Mat44(Vec4::sNaN(), Vec4::sNaN(), Vec4::sNaN(), Vec4::sNaN());
}
/// Load 4 columns of 4 floats from (possibly unaligned) memory
Mat44 Mat44::sLoadFloat4x4(const Float4 *inV)
{
	Mat44 result;
	for (int c = 0; c < 4; ++c)
		result.mCol[c] = Vec4::sLoadFloat4(inV + c);
	return result;
}
/// Load 4 columns of 4 floats from 16 byte aligned memory
Mat44 Mat44::sLoadFloat4x4Aligned(const Float4 *inV)
{
	Mat44 result;
	for (int c = 0; c < 4; ++c)
		result.mCol[c] = Vec4::sLoadFloat4Aligned(inV + c);
	return result;
}
/// Rotation of inX radians around the X axis.
/// Uses the vectorized Vec4::SinCos rather than std::sin/cos — presumably so results are
/// deterministic across platforms, matching the pattern used elsewhere in this file.
Mat44 Mat44::sRotationX(float inX)
{
	Vec4 sv, cv;
	Vec4::sReplicate(inX).SinCos(sv, cv);
	float s = sv.GetX(), c = cv.GetX();
	// Column major: column 1 = (0, c, s, 0), column 2 = (0, -s, c, 0)
	return Mat44(Vec4(1, 0, 0, 0), Vec4(0, c, s, 0), Vec4(0, -s, c, 0), Vec4(0, 0, 0, 1));
}
/// Rotation of inY radians around the Y axis
Mat44 Mat44::sRotationY(float inY)
{
	Vec4 sv, cv;
	Vec4::sReplicate(inY).SinCos(sv, cv);
	float s = sv.GetX(), c = cv.GetX();
	return Mat44(Vec4(c, 0, -s, 0), Vec4(0, 1, 0, 0), Vec4(s, 0, c, 0), Vec4(0, 0, 0, 1));
}
/// Rotation of inZ radians around the Z axis
Mat44 Mat44::sRotationZ(float inZ)
{
	Vec4 sv, cv;
	Vec4::sReplicate(inZ).SinCos(sv, cv);
	float s = sv.GetX(), c = cv.GetX();
	return Mat44(Vec4(c, s, 0, 0), Vec4(-s, c, 0, 0), Vec4(0, 0, 1, 0), Vec4(0, 0, 0, 1));
}
/// Convert a (normalized) quaternion to a 4x4 rotation matrix.
/// Both code paths are written to produce bit-identical results (see inline notes).
Mat44 Mat44::sRotation(QuatArg inQuat)
{
	JPH_ASSERT(inQuat.IsNormalized());
	// See: https://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation section 'Quaternion-derived rotation matrix'
#ifdef JPH_USE_SSE4_1
	__m128 xyzw = inQuat.mValue.mValue;
	__m128 two_xyzw = _mm_add_ps(xyzw, xyzw);
	__m128 yzxw = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 0, 2, 1));
	__m128 two_yzxw = _mm_add_ps(yzxw, yzxw);
	__m128 zxyw = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 1, 0, 2));
	__m128 two_zxyw = _mm_add_ps(zxyw, zxyw);
	__m128 wwww = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 3, 3, 3));
	__m128 diagonal = _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(two_yzxw, yzxw)), _mm_mul_ps(two_zxyw, zxyw));	// (1 - 2 y^2 - 2 z^2, 1 - 2 x^2 - 2 z^2, 1 - 2 x^2 - 2 y^2, 1 - 4 w^2)
	__m128 plus = _mm_add_ps(_mm_mul_ps(two_xyzw, zxyw), _mm_mul_ps(two_yzxw, wwww));	// 2 * (xz + yw, xy + zw, yz + xw, ww)
	__m128 minus = _mm_sub_ps(_mm_mul_ps(two_yzxw, xyzw), _mm_mul_ps(two_zxyw, wwww));	// 2 * (xy - zw, yz - xw, xz - yw, 0)
	// Workaround for compiler changing _mm_sub_ps(_mm_mul_ps(...), ...) into a fused multiply sub instruction, resulting in w not being 0
	// There doesn't appear to be a reliable way to turn this off in Clang
	minus = _mm_insert_ps(minus, minus, 0b1000);
	// Blend the diagonal, 'plus' and 'minus' terms into the three rotation columns
	__m128 col0 = _mm_blend_ps(_mm_blend_ps(plus, diagonal, 0b0001), minus, 0b1100);	// (1 - 2 y^2 - 2 z^2, 2 xy + 2 zw, 2 xz - 2 yw, 0)
	__m128 col1 = _mm_blend_ps(_mm_blend_ps(diagonal, minus, 0b1001), plus, 0b0100);	// (2 xy - 2 zw, 1 - 2 x^2 - 2 z^2, 2 yz + 2 xw, 0)
	__m128 col2 = _mm_blend_ps(_mm_blend_ps(minus, plus, 0b0001), diagonal, 0b0100);	// (2 xz + 2 yw, 2 yz - 2 xw, 1 - 2 x^2 - 2 y^2, 0)
	__m128 col3 = _mm_set_ps(1, 0, 0, 0);
	return Mat44(col0, col1, col2, col3);
#else
	float x = inQuat.GetX();
	float y = inQuat.GetY();
	float z = inQuat.GetZ();
	float w = inQuat.GetW();
	float tx = x + x; // Note: Using x + x instead of 2.0f * x to force this function to return the same value as the SSE4.1 version across platforms.
	float ty = y + y;
	float tz = z + z;
	float xx = tx * x;
	float yy = ty * y;
	float zz = tz * z;
	float xy = tx * y;
	float xz = tx * z;
	float xw = tx * w;
	float yz = ty * z;
	float yw = ty * w;
	float zw = tz * w;
	return Mat44(Vec4((1.0f - yy) - zz, xy + zw, xz - yw, 0.0f), // Note: Added extra brackets to force this function to return the same value as the SSE4.1 version across platforms.
				Vec4(xy - zw, (1.0f - zz) - xx, yz + xw, 0.0f),
				Vec4(xz + yw, yz - xw, (1.0f - xx) - yy, 0.0f),
				Vec4(0.0f, 0.0f, 0.0f, 1.0f));
#endif
}
/// Rotation of inAngle radians around the (normalized) axis inAxis
Mat44 Mat44::sRotation(Vec3Arg inAxis, float inAngle)
{
	// Build the equivalent quaternion and convert it to a matrix
	Quat rotation = Quat::sRotation(inAxis, inAngle);
	return sRotation(rotation);
}
/// Pure translation matrix: identity with inV in the 4th column
Mat44 Mat44::sTranslation(Vec3Arg inV)
{
	Mat44 result = sIdentity();
	result.SetTranslation(inV);
	return result;
}
/// Combined rotation + translation: rotation fills the 3x3 part, inT goes into column 3
Mat44 Mat44::sRotationTranslation(QuatArg inR, Vec3Arg inT)
{
	Mat44 result = sRotation(inR);
	result.SetTranslation(inT);
	return result;
}
/// Inverse of sRotationTranslation(inR, inT): (R T)^-1 = R^-1 * T^-1,
/// i.e. rotate by the conjugate quaternion and translate by the (rotated) negated translation
Mat44 Mat44::sInverseRotationTranslation(QuatArg inR, Vec3Arg inT)
{
	Mat44 m = sRotation(inR.Conjugated());
	m.SetTranslation(-m.Multiply3x3(inT));
	return m;
}
/// Uniform scale matrix: inScale on the first three diagonal elements, 1 in the w position
Mat44 Mat44::sScale(float inScale)
{
	Vec4 scale_x(inScale, 0, 0, 0);
	Vec4 scale_y(0, inScale, 0, 0);
	Vec4 scale_z(0, 0, inScale, 0);
	return Mat44(scale_x, scale_y, scale_z, Vec4(0, 0, 0, 1));
}
/// Non-uniform scale matrix: the components of inV on the diagonal of the 3x3 part
Mat44 Mat44::sScale(Vec3Arg inV)
{
	Mat44 result = sIdentity();
	result.SetDiagonal3(inV);
	return result;
}
/// Outer product inV1 * inV2^T in the 3x3 part (element (r, c) = inV1[r] * inV2[c]),
/// with the last column set to (0, 0, 0, 1)
Mat44 Mat44::sOuterProduct(Vec3Arg inV1, Vec3Arg inV2)
{
	Vec4 v1(inV1, 0);
	// Each result column is inV1 scaled by one component of inV2
	return Mat44(v1 * inV2.SplatX(), v1 * inV2.SplatY(), v1 * inV2.SplatZ(), Vec4(0, 0, 0, 1));
}
/// Skew-symmetric matrix M such that M * v == inV.Cross(v) for any vector v
Mat44 Mat44::sCrossProduct(Vec3Arg inV)
{
#ifdef JPH_USE_SSE4_1
	// Zero out the W component
	__m128 zero = _mm_setzero_ps();
	__m128 v = _mm_blend_ps(inV.mValue, zero, 0b1000);
	// Negate
	__m128 min_v = _mm_sub_ps(zero, v);
	// Shuffle the positive and negated components into the three skew-symmetric columns
	return Mat44(
		_mm_shuffle_ps(v, min_v, _MM_SHUFFLE(3, 1, 2, 3)), // [0, z, -y, 0]
		_mm_shuffle_ps(min_v, v, _MM_SHUFFLE(3, 0, 3, 2)), // [-z, 0, x, 0]
		_mm_blend_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 1)), _mm_shuffle_ps(min_v, min_v, _MM_SHUFFLE(3, 3, 0, 3)), 0b0010), // [y, -x, 0, 0]
		Vec4(0, 0, 0, 1));
#else
	float x = inV.GetX();
	float y = inV.GetY();
	float z = inV.GetZ();
	return Mat44(
		Vec4(0, z, -y, 0),
		Vec4(-z, 0, x, 0),
		Vec4(y, -x, 0, 0),
		Vec4(0, 0, 0, 1));
#endif
}
/// Right-handed view matrix looking from inPos towards inTarget (camera looks down -Z).
/// Falls back to default axes when the direction or right vector degenerates (target == pos,
/// or direction parallel to inUp). Returns the world-to-camera transform.
Mat44 Mat44::sLookAt(Vec3Arg inPos, Vec3Arg inTarget, Vec3Arg inUp)
{
	Vec3 direction = (inTarget - inPos).NormalizedOr(-Vec3::sAxisZ());
	Vec3 right = direction.Cross(inUp).NormalizedOr(Vec3::sAxisX());
	Vec3 up = right.Cross(direction);
	// Build the camera-to-world basis, then invert it (rotation + translation only)
	return Mat44(Vec4(right, 0), Vec4(up, 0), Vec4(-direction, 0), Vec4(inPos, 1)).InversedRotationTranslation();
}
/// Right-handed perspective projection matrix.
/// From the math below: z = -inNear maps to depth 0 and z = -inFar maps to depth 1
/// (clip.z = range * z + range * near, clip.w = -z), i.e. a [0, 1] depth range.
/// @param inFovY Vertical field of view in radians
/// @param inAspect Width / height of the viewport
Mat44 Mat44::sPerspective(float inFovY, float inAspect, float inNear, float inFar)
{
	float height = 1.0f / Tan(0.5f * inFovY);
	float width = height / inAspect;
	float range = inFar / (inNear - inFar);
	return Mat44(Vec4(width, 0.0f, 0.0f, 0.0f), Vec4(0.0f, height, 0.0f, 0.0f), Vec4(0.0f, 0.0f, range, -1.0f), Vec4(0.0f, 0.0f, range * inNear, 0.0f));
}
/// Exact per-element comparison of all 16 elements (note: like any float ==, NaN elements compare unequal)
bool Mat44::operator == (Mat44Arg inM2) const
{
	// AND the per-component equality masks of the four columns together and test that every lane is set
	return UVec4::sAnd(
		UVec4::sAnd(Vec4::sEquals(mCol[0], inM2.mCol[0]), Vec4::sEquals(mCol[1], inM2.mCol[1])),
		UVec4::sAnd(Vec4::sEquals(mCol[2], inM2.mCol[2]), Vec4::sEquals(mCol[3], inM2.mCol[3]))
	).TestAllTrue();
}
/// Test if this matrix is close to inM2: every column must be within sqrt(inMaxDistSq) of the corresponding column
bool Mat44::IsClose(Mat44Arg inM2, float inMaxDistSq) const
{
	return mCol[0].IsClose(inM2.mCol[0], inMaxDistSq)
		&& mCol[1].IsClose(inM2.mCol[1], inMaxDistSq)
		&& mCol[2].IsClose(inM2.mCol[2], inMaxDistSq)
		&& mCol[3].IsClose(inM2.mCol[3], inMaxDistSq);
}
/// 4x4 matrix product: each result column is a linear combination of our columns,
/// weighted by the components of the corresponding column of inM
Mat44 Mat44::operator * (Mat44Arg inM) const
{
	Mat44 result;
#if defined(JPH_USE_SSE)
	for (int i = 0; i < 4; ++i)
	{
		__m128 c = inM.mCol[i].mValue;
		// result.col[i] = col[0] * c.x + col[1] * c.y + col[2] * c.z + col[3] * c.w
		__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[3].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3))));
		result.mCol[i].mValue = t;
	}
#elif defined(JPH_USE_NEON)
	for (int i = 0; i < 4; ++i)
	{
		Type c = inM.mCol[i].mValue;
		Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
		t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
		t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
		t = vmlaq_f32(t, mCol[3].mValue, vdupq_laneq_f32(c, 3));
		result.mCol[i].mValue = t;
	}
#else
	for (int i = 0; i < 4; ++i)
		result.mCol[i] = mCol[0] * inM.mCol[i].mF32[0] + mCol[1] * inM.mCol[i].mF32[1] + mCol[2] * inM.mCol[i].mF32[2] + mCol[3] * inM.mCol[i].mF32[3];
#endif
	return result;
}
/// Transform a point: treats inV as having an implicit w = 1, so the translation column is added
Vec3 Mat44::operator * (Vec3Arg inV) const
{
#if defined(JPH_USE_SSE)
	__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
	t = _mm_add_ps(t, mCol[3].mValue); // Translation column added unscaled (w assumed 1)
	return Vec3::sFixW(t);
#elif defined(JPH_USE_NEON)
	Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
	t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
	t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
	t = vaddq_f32(t, mCol[3].mValue); // Don't combine this with the first mul into a fused multiply add, causes precision issues
	return Vec3::sFixW(t);
#else
	return Vec3(
		mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2] + mCol[3].mF32[0],
		mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2] + mCol[3].mF32[1],
		mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2] + mCol[3].mF32[2]);
#endif
}
/// Full 4-component matrix * vector product (w is taken from inV, not assumed to be 1)
Vec4 Mat44::operator * (Vec4Arg inV) const
{
#if defined(JPH_USE_SSE)
	__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[3].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(3, 3, 3, 3))));
	return t;
#elif defined(JPH_USE_NEON)
	Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
	t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
	t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
	t = vmlaq_f32(t, mCol[3].mValue, vdupq_laneq_f32(inV.mValue, 3));
	return t;
#else
	return Vec4(
		mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2] + mCol[3].mF32[0] * inV.mF32[3],
		mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2] + mCol[3].mF32[1] * inV.mF32[3],
		mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2] + mCol[3].mF32[2] * inV.mF32[3],
		mCol[0].mF32[3] * inV.mF32[0] + mCol[1].mF32[3] * inV.mF32[1] + mCol[2].mF32[3] * inV.mF32[2] + mCol[3].mF32[3] * inV.mF32[3]);
#endif
}
/// Transform a direction: multiply by the 3x3 part only (translation column ignored)
Vec3 Mat44::Multiply3x3(Vec3Arg inV) const
{
#if defined(JPH_USE_SSE)
	__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
	return Vec3::sFixW(t);
#elif defined(JPH_USE_NEON)
	Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
	t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
	t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
	return Vec3::sFixW(t);
#else
	return Vec3(
		mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2],
		mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2],
		mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2]);
#endif
}
/// Multiply inV by the transpose of the 3x3 part, i.e. dot inV with each column
Vec3 Mat44::Multiply3x3Transposed(Vec3Arg inV) const
{
#if defined(JPH_USE_SSE4_1)
	// _mm_dp_ps mask 0x7f: dot the x, y, z lanes (high nibble 0x7) and broadcast the result to all lanes (low nibble 0xf)
	__m128 x = _mm_dp_ps(mCol[0].mValue, inV.mValue, 0x7f);
	__m128 y = _mm_dp_ps(mCol[1].mValue, inV.mValue, 0x7f);
	__m128 xy = _mm_blend_ps(x, y, 0b0010);
	__m128 z = _mm_dp_ps(mCol[2].mValue, inV.mValue, 0x7f);
	__m128 xyzz = _mm_blend_ps(xy, z, 0b1100);
	return xyzz;
#else
	return Transposed3x3().Multiply3x3(inV);
#endif
}
/// 3x3 part of this matrix times the 3x3 part of inM; the last column of the result is (0, 0, 0, 1).
/// Requires that the w components of our first three columns are zero (asserted) so the
/// full-width SIMD multiplies don't pollute the result.
Mat44 Mat44::Multiply3x3(Mat44Arg inM) const
{
	JPH_ASSERT(mCol[0][3] == 0.0f);
	JPH_ASSERT(mCol[1][3] == 0.0f);
	JPH_ASSERT(mCol[2][3] == 0.0f);
	Mat44 result;
#if defined(JPH_USE_SSE)
	for (int i = 0; i < 3; ++i)
	{
		__m128 c = inM.mCol[i].mValue;
		__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
		result.mCol[i].mValue = t;
	}
#elif defined(JPH_USE_NEON)
	for (int i = 0; i < 3; ++i)
	{
		Type c = inM.mCol[i].mValue;
		Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
		t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
		t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
		result.mCol[i].mValue = t;
	}
#else
	for (int i = 0; i < 3; ++i)
		result.mCol[i] = mCol[0] * inM.mCol[i].mF32[0] + mCol[1] * inM.mCol[i].mF32[1] + mCol[2] * inM.mCol[i].mF32[2];
#endif
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
}
/// Computes this^T * inM on the 3x3 parts; the last column of the result is (0, 0, 0, 1)
Mat44 Mat44::Multiply3x3LeftTransposed(Mat44Arg inM) const
{
	// Transpose the left hand side, then do a regular 3x3 multiply column by column
	Mat44 transposed = Transposed3x3();
	Mat44 result;
	for (int c = 0; c < 3; ++c)
		result.mCol[c] = transposed.mCol[0] * inM.mCol[c].SplatX() + transposed.mCol[1] * inM.mCol[c].SplatY() + transposed.mCol[2] * inM.mCol[c].SplatZ();
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
}
/// Computes this * inM^T on the 3x3 parts; the last column of the result is (0, 0, 0, 1).
/// Requires that the w components of our first three columns are zero (asserted).
Mat44 Mat44::Multiply3x3RightTransposed(Mat44Arg inM) const
{
	JPH_ASSERT(mCol[0][3] == 0.0f);
	JPH_ASSERT(mCol[1][3] == 0.0f);
	JPH_ASSERT(mCol[2][3] == 0.0f);
	// Column c of the result combines our columns weighted by row c of inM (= column c of inM^T)
	Mat44 result;
	for (int c = 0; c < 3; ++c)
		result.mCol[c] = mCol[0] * inM.mCol[0].mF32[c] + mCol[1] * inM.mCol[1].mF32[c] + mCol[2] * inM.mCol[2].mF32[c];
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
}
/// Multiply every element of the matrix by inV
Mat44 Mat44::operator * (float inV) const
{
	Mat44 result;
	result.mCol[0] = mCol[0] * inV;
	result.mCol[1] = mCol[1] * inV;
	result.mCol[2] = mCol[2] * inV;
	result.mCol[3] = mCol[3] * inV;
	return result;
}
/// Multiply every element of the matrix by inV in place
Mat44 &Mat44::operator *= (float inV)
{
	mCol[0] *= inV;
	mCol[1] *= inV;
	mCol[2] *= inV;
	mCol[3] *= inV;
	return *this;
}
/// Per element addition of two matrices
Mat44 Mat44::operator + (Mat44Arg inM) const
{
	return Mat44(mCol[0] + inM.mCol[0], mCol[1] + inM.mCol[1], mCol[2] + inM.mCol[2], mCol[3] + inM.mCol[3]);
}
/// Per element negation
Mat44 Mat44::operator - () const
{
	return Mat44(-mCol[0], -mCol[1], -mCol[2], -mCol[3]);
}
/// Per element subtraction of two matrices
Mat44 Mat44::operator - (Mat44Arg inM) const
{
	return Mat44(mCol[0] - inM.mCol[0], mCol[1] - inM.mCol[1], mCol[2] - inM.mCol[2], mCol[3] - inM.mCol[3]);
}
/// Per element addition of inM to this matrix in place
Mat44 &Mat44::operator += (Mat44Arg inM)
{
	mCol[0] += inM.mCol[0];
	mCol[1] += inM.mCol[1];
	mCol[2] += inM.mCol[2];
	mCol[3] += inM.mCol[3];
	return *this;
}
/// Store the four columns consecutively to memory
void Mat44::StoreFloat4x4(Float4 *outV) const
{
	mCol[0].StoreFloat4(outV);
	mCol[1].StoreFloat4(outV + 1);
	mCol[2].StoreFloat4(outV + 2);
	mCol[3].StoreFloat4(outV + 3);
}
/// Full 4x4 transpose
Mat44 Mat44::Transposed() const
{
#if defined(JPH_USE_SSE)
	// Standard SSE 4x4 transpose: interleave column pairs, then interleave the intermediates
	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 tmp3 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 tmp2 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 tmp4 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	Mat44 result;
	result.mCol[0].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(2, 0, 2, 0));
	result.mCol[1].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3, 1, 3, 1));
	result.mCol[2].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(2, 0, 2, 0));
	result.mCol[3].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(3, 1, 3, 1));
	return result;
#elif defined(JPH_USE_NEON)
	// Two rounds of vector zips perform the 4x4 transpose
	float32x4x2_t tmp1 = vzipq_f32(mCol[0].mValue, mCol[2].mValue);
	float32x4x2_t tmp2 = vzipq_f32(mCol[1].mValue, mCol[3].mValue);
	float32x4x2_t tmp3 = vzipq_f32(tmp1.val[0], tmp2.val[0]);
	float32x4x2_t tmp4 = vzipq_f32(tmp1.val[1], tmp2.val[1]);
	Mat44 result;
	result.mCol[0].mValue = tmp3.val[0];
	result.mCol[1].mValue = tmp3.val[1];
	result.mCol[2].mValue = tmp4.val[0];
	result.mCol[3].mValue = tmp4.val[1];
	return result;
#else
	Mat44 result;
	for (int c = 0; c < 4; ++c)
		for (int r = 0; r < 4; ++r)
			result.mCol[r].mF32[c] = mCol[c].mF32[r];
	return result;
#endif
}
/// Transpose the 3x3 part only; the w components of the first three columns are zeroed
/// and the last column of the result is (0, 0, 0, 1)
Mat44 Mat44::Transposed3x3() const
{
#if defined(JPH_USE_SSE)
	// Same shuffle pattern as Transposed(), but zero takes the place of the 4th column
	__m128 zero = _mm_setzero_ps();
	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 tmp3 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 tmp2 = _mm_shuffle_ps(mCol[2].mValue, zero, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 tmp4 = _mm_shuffle_ps(mCol[2].mValue, zero, _MM_SHUFFLE(3, 2, 3, 2));
	Mat44 result;
	result.mCol[0].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(2, 0, 2, 0));
	result.mCol[1].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3, 1, 3, 1));
	result.mCol[2].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(JPH_USE_NEON)
	float32x4x2_t tmp1 = vzipq_f32(mCol[0].mValue, mCol[2].mValue);
	float32x4x2_t tmp2 = vzipq_f32(mCol[1].mValue, vdupq_n_f32(0));
	float32x4x2_t tmp3 = vzipq_f32(tmp1.val[0], tmp2.val[0]);
	float32x4x2_t tmp4 = vzipq_f32(tmp1.val[1], tmp2.val[1]);
	Mat44 result;
	result.mCol[0].mValue = tmp3.val[0];
	result.mCol[1].mValue = tmp3.val[1];
	result.mCol[2].mValue = tmp4.val[0];
#else
	Mat44 result;
	for (int c = 0; c < 3; ++c)
	{
		for (int r = 0; r < 3; ++r)
			result.mCol[c].mF32[r] = mCol[r].mF32[c];
		result.mCol[c].mF32[3] = 0;
	}
#endif
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
}
/// General 4x4 inverse via cofactor expansion. No singularity check: a singular matrix
/// produces inf/NaN elements (division by a zero determinant).
Mat44 Mat44::Inversed() const
{
#if defined(JPH_USE_SSE)
	// Algorithm from: http://download.intel.com/design/PentiumIII/sml/24504301.pdf
	// Streaming SIMD Extensions - Inverse of 4x4 Matrix
	// Adapted to load data using _mm_shuffle_ps instead of loading from memory
	// Replaced _mm_rcp_ps with _mm_div_ps for better accuracy
	// Extract the four rows from the stored columns (in-register transpose)
	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 row1 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
	row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
	tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 row3 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
	row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
	// Accumulate the cofactor (minor) vectors from products of row pairs
	tmp1 = _mm_mul_ps(row2, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 minor0 = _mm_mul_ps(row1, tmp1);
	__m128 minor1 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(row1, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	__m128 minor3 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
	minor3 = _mm_shuffle_ps(minor3, minor3, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	__m128 minor2 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(row0, row1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
	tmp1 = _mm_mul_ps(row0, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_mul_ps(row0, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
	// Determinant = row0 . minor0 (horizontal add), then scale all minors by 1 / det
	__m128 det = _mm_mul_ps(row0, minor0);
	det = _mm_add_ps(_mm_shuffle_ps(det, det, _MM_SHUFFLE(2, 3, 0, 1)), det); // Original code did (x + z) + (y + w), changed to (x + y) + (z + w) to match the ARM code below and make the result cross platform deterministic
	det = _mm_add_ss(_mm_shuffle_ps(det, det, _MM_SHUFFLE(1, 0, 3, 2)), det);
	det = _mm_div_ss(_mm_set_ss(1.0f), det);
	det = _mm_shuffle_ps(det, det, _MM_SHUFFLE(0, 0, 0, 0));
	Mat44 result;
	result.mCol[0].mValue = _mm_mul_ps(det, minor0);
	result.mCol[1].mValue = _mm_mul_ps(det, minor1);
	result.mCol[2].mValue = _mm_mul_ps(det, minor2);
	result.mCol[3].mValue = _mm_mul_ps(det, minor3);
	return result;
#elif defined(JPH_USE_NEON)
	// Adapted from the SSE version, there's surprising few articles about efficient ways of calculating an inverse for ARM on the internet
	Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
	Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, mCol[3].mValue, 0, 1, 4, 5);
	Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
	row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
	Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, mCol[3].mValue, 2, 3, 6, 7);
	Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
	row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
	tmp1 = vmulq_f32(row2, row3);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	Type minor0 = vmulq_f32(row1, tmp1);
	Type minor1 = vmulq_f32(row0, tmp1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
	minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
	minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
	tmp1 = vmulq_f32(row1, row2);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
	Type minor3 = vmulq_f32(row0, tmp1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
	minor3 = vsubq_f32(vmulq_f32(row0, tmp1), minor3);
	minor3 = JPH_NEON_SHUFFLE_F32x4(minor3, minor3, 2, 3, 0, 1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
	tmp1 = vmulq_f32(tmp1, row3);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
	minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
	Type minor2 = vmulq_f32(row0, tmp1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
	minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
	minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
	tmp1 = vmulq_f32(row0, row1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
	minor3 = vsubq_f32(vmulq_f32(row2, tmp1), minor3);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
	minor3 = vsubq_f32(minor3, vmulq_f32(row2, tmp1));
	tmp1 = vmulq_f32(row0, row3);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
	minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
	minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
	tmp1 = vmulq_f32(row0, row2);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
	minor3 = vsubq_f32(minor3, vmulq_f32(row1, tmp1));
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
	minor3 = vaddq_f32(vmulq_f32(row1, tmp1), minor3);
	// Determinant via horizontal add, then scale all minors by the reciprocal
	Type det = vmulq_f32(row0, minor0);
	det = vdupq_n_f32(vaddvq_f32(det));
	det = vdivq_f32(vdupq_n_f32(1.0f), det);
	Mat44 result;
	result.mCol[0].mValue = vmulq_f32(det, minor0);
	result.mCol[1].mValue = vmulq_f32(det, minor1);
	result.mCol[2].mValue = vmulq_f32(det, minor2);
	result.mCol[3].mValue = vmulq_f32(det, minor3);
	return result;
#else
	// Scalar fallback: classic adjugate / determinant using precomputed 2x2 sub-determinants
	float m00 = JPH_EL(0, 0), m10 = JPH_EL(1, 0), m20 = JPH_EL(2, 0), m30 = JPH_EL(3, 0);
	float m01 = JPH_EL(0, 1), m11 = JPH_EL(1, 1), m21 = JPH_EL(2, 1), m31 = JPH_EL(3, 1);
	float m02 = JPH_EL(0, 2), m12 = JPH_EL(1, 2), m22 = JPH_EL(2, 2), m32 = JPH_EL(3, 2);
	float m03 = JPH_EL(0, 3), m13 = JPH_EL(1, 3), m23 = JPH_EL(2, 3), m33 = JPH_EL(3, 3);
	float m10211120 = m10 * m21 - m11 * m20;
	float m10221220 = m10 * m22 - m12 * m20;
	float m10231320 = m10 * m23 - m13 * m20;
	float m10311130 = m10 * m31 - m11 * m30;
	float m10321230 = m10 * m32 - m12 * m30;
	float m10331330 = m10 * m33 - m13 * m30;
	float m11221221 = m11 * m22 - m12 * m21;
	float m11231321 = m11 * m23 - m13 * m21;
	float m11321231 = m11 * m32 - m12 * m31;
	float m11331331 = m11 * m33 - m13 * m31;
	float m12231322 = m12 * m23 - m13 * m22;
	float m12331332 = m12 * m33 - m13 * m32;
	float m20312130 = m20 * m31 - m21 * m30;
	float m20322230 = m20 * m32 - m22 * m30;
	float m20332330 = m20 * m33 - m23 * m30;
	float m21322231 = m21 * m32 - m22 * m31;
	float m21332331 = m21 * m33 - m23 * m31;
	float m22332332 = m22 * m33 - m23 * m32;
	Vec4 col0(m11 * m22332332 - m12 * m21332331 + m13 * m21322231, -m10 * m22332332 + m12 * m20332330 - m13 * m20322230, m10 * m21332331 - m11 * m20332330 + m13 * m20312130, -m10 * m21322231 + m11 * m20322230 - m12 * m20312130);
	Vec4 col1(-m01 * m22332332 + m02 * m21332331 - m03 * m21322231, m00 * m22332332 - m02 * m20332330 + m03 * m20322230, -m00 * m21332331 + m01 * m20332330 - m03 * m20312130, m00 * m21322231 - m01 * m20322230 + m02 * m20312130);
	Vec4 col2(m01 * m12331332 - m02 * m11331331 + m03 * m11321231, -m00 * m12331332 + m02 * m10331330 - m03 * m10321230, m00 * m11331331 - m01 * m10331330 + m03 * m10311130, -m00 * m11321231 + m01 * m10321230 - m02 * m10311130);
	Vec4 col3(-m01 * m12231322 + m02 * m11231321 - m03 * m11221221, m00 * m12231322 - m02 * m10231320 + m03 * m10221220, -m00 * m11231321 + m01 * m10231320 - m03 * m10211120, m00 * m11221221 - m01 * m10221220 + m02 * m10211120);
	float det = m00 * col0.mF32[0] + m01 * col0.mF32[1] + m02 * col0.mF32[2] + m03 * col0.mF32[3];
	return Mat44(col0 / det, col1 / det, col2 / det, col3 / det);
#endif
}
/// Fast inverse for a matrix that only contains rotation and translation:
/// (R T)^-1 has rotation R^T and translation -R^T * t
Mat44 Mat44::InversedRotationTranslation() const
{
	Mat44 result = Transposed3x3();
	result.SetTranslation(-result.Multiply3x3(GetTranslation()));
	return result;
}
float Mat44::GetDeterminant3x3() const
{
return GetAxisX().Dot(GetAxisY().Cross(GetAxisZ()));
}
/// Adjoint (transpose of the cofactor matrix) of the 3x3 part, built from cyclic 2x2
/// sub-determinants. JPH_EL(r, c) accesses row r of column c. Last column is (0, 0, 0, 1).
Mat44 Mat44::Adjointed3x3() const
{
	return Mat44(
		Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0)
			- Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0),
		Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0)
			- Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0),
		Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0)
			- Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0),
		Vec4(0, 0, 0, 1));
}
/// Inverse of the 3x3 part: adjoint divided by the determinant. No singularity check —
/// a zero determinant yields inf/NaN elements (use SetInversed3x3 for a checked version).
Mat44 Mat44::Inversed3x3() const
{
	float det = GetDeterminant3x3();
	// Same cyclic sub-determinant construction as Adjointed3x3, scaled by 1 / det
	return Mat44(
		(Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0)
			- Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0)) / det,
		(Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0)
			- Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0)) / det,
		(Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0)
			- Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0)) / det,
		Vec4(0, 0, 0, 1));
}
bool Mat44::SetInversed3x3(Mat44Arg inM)
{
	// Checked inverse of the 3x3 part: inverse = adjugate / determinant
	float det = inM.GetDeterminant3x3();

	// If the determinant is zero the matrix is singular and we return false
	if (det == 0.0f)
		return false;

	// Finish calculating the inverse: scale the first three adjugate columns
	*this = inM.Adjointed3x3();
	for (int c = 0; c < 3; ++c)
		mCol[c] /= det;
	return true;
}
Quat Mat44::GetQuaternion() const
{
	// Converts the 3x3 rotation part to a quaternion using the usual
	// largest-diagonal approach: pick the biggest of trace / m00 / m11 / m22 so
	// the sqrt argument (and hence 's') is as large as possible, keeping the
	// 0.5f / s division numerically stable.
	// Elements are accessed column-major: mCol[c].mF32[r] is element (r, c).
	// NOTE(review): assumes the 3x3 part is a pure rotation — confirm at call sites.
	float tr = mCol[0].mF32[0] + mCol[1].mF32[1] + mCol[2].mF32[2];

	if (tr >= 0.0f)
	{
		// Trace dominant: s = 2 * w
		float s = sqrt(tr + 1.0f);
		float is = 0.5f / s;
		return Quat(
			(mCol[1].mF32[2] - mCol[2].mF32[1]) * is,
			(mCol[2].mF32[0] - mCol[0].mF32[2]) * is,
			(mCol[0].mF32[1] - mCol[1].mF32[0]) * is,
			0.5f * s);
	}
	else
	{
		// Find the largest diagonal element
		int i = 0;
		if (mCol[1].mF32[1] > mCol[0].mF32[0]) i = 1;
		if (mCol[2].mF32[2] > mCol[i].mF32[i]) i = 2;

		if (i == 0)
		{
			// m00 dominant: s = 2 * x
			float s = sqrt(mCol[0].mF32[0] - (mCol[1].mF32[1] + mCol[2].mF32[2]) + 1);
			float is = 0.5f / s;
			return Quat(
				0.5f * s,
				(mCol[1].mF32[0] + mCol[0].mF32[1]) * is,
				(mCol[0].mF32[2] + mCol[2].mF32[0]) * is,
				(mCol[1].mF32[2] - mCol[2].mF32[1]) * is);
		}
		else if (i == 1)
		{
			// m11 dominant: s = 2 * y
			float s = sqrt(mCol[1].mF32[1] - (mCol[2].mF32[2] + mCol[0].mF32[0]) + 1);
			float is = 0.5f / s;
			return Quat(
				(mCol[1].mF32[0] + mCol[0].mF32[1]) * is,
				0.5f * s,
				(mCol[2].mF32[1] + mCol[1].mF32[2]) * is,
				(mCol[2].mF32[0] - mCol[0].mF32[2]) * is);
		}
		else
		{
			// m22 dominant: s = 2 * z
			JPH_ASSERT(i == 2);

			float s = sqrt(mCol[2].mF32[2] - (mCol[0].mF32[0] + mCol[1].mF32[1]) + 1);
			float is = 0.5f / s;
			return Quat(
				(mCol[0].mF32[2] + mCol[2].mF32[0]) * is,
				(mCol[2].mF32[1] + mCol[1].mF32[2]) * is,
				0.5f * s,
				(mCol[0].mF32[1] - mCol[1].mF32[0]) * is);
		}
	}
}
Mat44 Mat44::sQuatLeftMultiply(QuatArg inQ)
{
	// Builds the 4x4 matrix M such that M * v equals the quaternion product
	// inQ * Quat(v), for quaternions stored as [x, y, z, w] in a Vec4.
	// Each column is a sign-flipped swizzle of inQ's components.
	return Mat44(
		Vec4(1, 1, -1, -1) * inQ.mValue.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(),
		Vec4(-1, 1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>(),
		Vec4(1, -1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>(),
		inQ.mValue);
}
Mat44 Mat44::sQuatRightMultiply(QuatArg inQ)
{
	// Builds the 4x4 matrix M such that M * v equals the quaternion product
	// Quat(v) * inQ (inQ on the right); same swizzles as sQuatLeftMultiply but
	// with different sign patterns.
	return Mat44(
		Vec4(1, -1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(),
		Vec4(1, 1, -1, -1) * inQ.mValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>(),
		Vec4(-1, 1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>(),
		inQ.mValue);
}
Mat44 Mat44::GetRotation() const
{
	// Strip the translation: keep the three basis columns and reset column 3.
	// The bottom row of the first three columns must already be zero.
	JPH_ASSERT(mCol[0][3] == 0.0f);
	JPH_ASSERT(mCol[1][3] == 0.0f);
	JPH_ASSERT(mCol[2][3] == 0.0f);

	Mat44 rotation(mCol[0], mCol[1], mCol[2], Vec4(0, 0, 0, 1));
	return rotation;
}
Mat44 Mat44::GetRotationSafe() const
{
	// Like GetRotation but also forces the w lane of the first three columns to
	// zero instead of asserting that it already is.
#if defined(JPH_USE_AVX512)
	// Mask-move with 0b0111 copies lanes 0-2 and zeroes lane 3
	return Mat44(_mm_maskz_mov_ps(0b0111, mCol[0].mValue),
				 _mm_maskz_mov_ps(0b0111, mCol[1].mValue),
				 _mm_maskz_mov_ps(0b0111, mCol[2].mValue),
				 Vec4(0, 0, 0, 1));
#elif defined(JPH_USE_SSE4_1)
	// Blend mask 8 (bit 3) takes lane 3 from the zero vector
	__m128 zero = _mm_setzero_ps();
	return Mat44(_mm_blend_ps(mCol[0].mValue, zero, 8),
				 _mm_blend_ps(mCol[1].mValue, zero, 8),
				 _mm_blend_ps(mCol[2].mValue, zero, 8),
				 Vec4(0, 0, 0, 1));
#elif defined(JPH_USE_NEON)
	// Overwrite lane 3 with 0
	return Mat44(vsetq_lane_f32(0, mCol[0].mValue, 3),
				 vsetq_lane_f32(0, mCol[1].mValue, 3),
				 vsetq_lane_f32(0, mCol[2].mValue, 3),
				 Vec4(0, 0, 0, 1));
#else
	// Scalar fallback: rebuild the columns with w = 0
	return Mat44(Vec4(mCol[0].mF32[0], mCol[0].mF32[1], mCol[0].mF32[2], 0),
				 Vec4(mCol[1].mF32[0], mCol[1].mF32[1], mCol[1].mF32[2], 0),
				 Vec4(mCol[2].mF32[0], mCol[2].mF32[1], mCol[2].mF32[2], 0),
				 Vec4(0, 0, 0, 1));
#endif
}
void Mat44::SetRotation(Mat44Arg inRotation)
{
	// Copy only the three rotation columns; the translation column is left untouched
	for (int c = 0; c < 3; ++c)
		mCol[c] = inRotation.mCol[c];
}
Mat44 Mat44::PreTranslated(Vec3Arg inTranslation) const
{
	// this * translation-matrix: only the translation column changes,
	// the offset is rotated/scaled by the 3x3 part first
	Vec4 new_translation(GetTranslation() + Multiply3x3(inTranslation), 1);
	return Mat44(mCol[0], mCol[1], mCol[2], new_translation);
}
Mat44 Mat44::PostTranslated(Vec3Arg inTranslation) const
{
	// translation-matrix * this: the offset is added to the translation directly
	Vec4 new_translation(GetTranslation() + inTranslation, 1);
	return Mat44(mCol[0], mCol[1], mCol[2], new_translation);
}
Mat44 Mat44::PreScaled(Vec3Arg inScale) const
{
	// this * scale-matrix: each basis column is scaled by the matching component,
	// the translation column is unchanged
	return Mat44(mCol[0] * inScale.GetX(), mCol[1] * inScale.GetY(), mCol[2] * inScale.GetZ(), mCol[3]);
}
Mat44 Mat44::PostScaled(Vec3Arg inScale) const
{
	// scale-matrix * this: scales the rows, so every column (including the
	// translation) is multiplied componentwise; w stays intact via the 1
	Vec4 scale4(inScale, 1);
	return Mat44(mCol[0] * scale4, mCol[1] * scale4, mCol[2] * scale4, mCol[3] * scale4);
}
Mat44 Mat44::Decompose(Vec3 &outScale) const
{
	// Factors this matrix into an orthonormal rotation/translation matrix (returned)
	// and a per-axis scale (written to outScale). outScale.z is negated when the
	// basis is left handed so the returned matrix stays right handed.

	// Start the modified Gram-Schmidt algorithm
	// X axis will just be normalized
	Vec3 x = GetAxisX();

	// Make Y axis perpendicular to X
	Vec3 y = GetAxisY();
	float x_dot_x = x.LengthSq();
	y -= (x.Dot(y) / x_dot_x) * x;

	// Make Z axis perpendicular to X
	Vec3 z = GetAxisZ();
	z -= (x.Dot(z) / x_dot_x) * x;

	// Make Z axis perpendicular to Y
	float y_dot_y = y.LengthSq();
	z -= (y.Dot(z) / y_dot_y) * y;

	// Determine the scale (lengths of the orthogonalized axes)
	float z_dot_z = z.LengthSq();
	outScale = Vec3(x_dot_x, y_dot_y, z_dot_z).Sqrt();

	// If the resulting x, y and z vectors don't form a right handed matrix, flip the z axis.
	if (x.Cross(y).Dot(z) < 0.0f)
		outScale.SetZ(-outScale.GetZ());

	// Determine the rotation and translation (axes normalized by the signed scale)
	return Mat44(Vec4(x / outScale.GetX(), 0), Vec4(y / outScale.GetY(), 0), Vec4(z / outScale.GetZ(), 0), GetColumn4(3));
}
#undef JPH_EL
JPH_NAMESPACE_END

208
thirdparty/jolt_physics/Jolt/Math/Math.h vendored Normal file
View File

@@ -0,0 +1,208 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// The constant \f$\pi\f$
static constexpr float JPH_PI = 3.14159265358979323846f;

/// A large floating point value which, when squared, is still much smaller than FLT_MAX (1e30 << ~3.4e38)
static constexpr float cLargeFloat = 1.0e15f;
/// Convert a value from degrees to radians
JPH_INLINE constexpr float DegreesToRadians(float inV)
{
	constexpr float cDegToRad = JPH_PI / 180.0f;
	return inV * cDegToRad;
}
/// Convert a value from radians to degrees
JPH_INLINE constexpr float RadiansToDegrees(float inV)
{
	constexpr float cRadToDeg = 180.0f / JPH_PI;
	return inV * cRadToDeg;
}
/// Convert angle in radians to the range \f$[-\pi, \pi]\f$
inline float CenterAngleAroundZero(float inV)
{
	// Shift by whole turns until the angle is inside the target range;
	// the branches are mutually exclusive so at most one loop runs
	if (inV < -JPH_PI)
	{
		while (inV < -JPH_PI)
			inV += 2.0f * JPH_PI;
	}
	else if (inV > JPH_PI)
	{
		while (inV > JPH_PI)
			inV -= 2.0f * JPH_PI;
	}
	JPH_ASSERT(inV >= -JPH_PI && inV <= JPH_PI);
	return inV;
}
/// Clamp a value between two values
template <typename T>
JPH_INLINE constexpr T Clamp(T inV, T inMin, T inMax)
{
	// Apply the lower bound first, then the upper bound
	T lower_bounded = max(inV, inMin);
	return min(lower_bounded, inMax);
}
/// Square a value
template <typename T>
JPH_INLINE constexpr T Square(T inV)
{
	T product = inV * inV;
	return product;
}
/// Returns \f$inV^3\f$.
template <typename T>
JPH_INLINE constexpr T Cubed(T inV)
{
	// (inV * inV) * inV, same evaluation order as writing the product out
	T squared = inV * inV;
	return squared * inV;
}
/// Get the sign of a value. Note that zero (and anything not less than zero) maps to +1.
template <typename T>
JPH_INLINE constexpr T Sign(T inV)
{
	if (inV < 0)
		return T(-1);
	return T(1);
}
/// Check if inV is a power of 2 (zero and negative values are not)
template <typename T>
constexpr bool IsPowerOf2(T inV)
{
	// A power of two has exactly one bit set; inV & (inV - 1) clears the lowest set bit
	if (inV <= 0)
		return false;
	return (inV & (inV - 1)) == 0;
}
/// Align inV up to the next inAlignment bytes
template <typename T>
inline T AlignUp(T inV, uint64 inAlignment)
{
JPH_ASSERT(IsPowerOf2(inAlignment));
return T((uint64(inV) + inAlignment - 1) & ~(inAlignment - 1));
}
/// Check if inV is inAlignment aligned
template <typename T>
inline bool IsAligned(T inV, uint64 inAlignment)
{
JPH_ASSERT(IsPowerOf2(inAlignment));
return (uint64(inV) & (inAlignment - 1)) == 0;
}
/// Compute number of trailing zero bits (how many low bits are zero); returns 32 for an input of 0
inline uint CountTrailingZeros(uint32 inValue)
{
#if defined(JPH_CPU_X86) || defined(JPH_CPU_WASM)
	#if defined(JPH_USE_TZCNT)
	// TZCNT is defined to return 32 for an input of 0, no guard needed
	return _tzcnt_u32(inValue);
	#elif defined(JPH_COMPILER_MSVC)
	// _BitScanForward leaves the result undefined for 0, so guard explicitly
	if (inValue == 0)
		return 32;
	unsigned long result;
	_BitScanForward(&result, inValue);
	return result;
	#else
	// __builtin_ctz is undefined for 0, so guard explicitly
	if (inValue == 0)
		return 32;
	return __builtin_ctz(inValue);
	#endif
#elif defined(JPH_CPU_ARM)
	#if defined(JPH_COMPILER_MSVC)
	// Same MSVC intrinsic as on x86, with the same zero guard
	if (inValue == 0)
		return 32;
	unsigned long result;
	_BitScanForward(&result, inValue);
	return result;
	#else
	if (inValue == 0)
		return 32;
	return __builtin_ctz(inValue);
	#endif
#elif defined(JPH_CPU_E2K) || defined(JPH_CPU_RISCV) || defined(JPH_CPU_PPC) || defined(JPH_CPU_LOONGARCH)
	return inValue ? __builtin_ctz(inValue) : 32;
#else
#error Undefined
#endif
}
/// Compute the number of leading zero bits (how many high bits are zero); returns 32 for an input of 0
inline uint CountLeadingZeros(uint32 inValue)
{
#if defined(JPH_CPU_X86) || defined(JPH_CPU_WASM)
	#if defined(JPH_USE_LZCNT)
	// LZCNT is defined to return 32 for an input of 0, no guard needed
	return _lzcnt_u32(inValue);
	#elif defined(JPH_COMPILER_MSVC)
	// _BitScanReverse finds the highest set bit index, so convert to a zero count;
	// result is undefined for 0, hence the guard
	if (inValue == 0)
		return 32;
	unsigned long result;
	_BitScanReverse(&result, inValue);
	return 31 - result;
	#else
	// __builtin_clz is undefined for 0, so guard explicitly
	if (inValue == 0)
		return 32;
	return __builtin_clz(inValue);
	#endif
#elif defined(JPH_CPU_ARM)
	#if defined(JPH_COMPILER_MSVC)
	// ARM CLZ handles 0 (returns 32), so no guard is needed here
	return _CountLeadingZeros(inValue);
	#else
	return __builtin_clz(inValue);
	#endif
#elif defined(JPH_CPU_E2K) || defined(JPH_CPU_RISCV) || defined(JPH_CPU_PPC) || defined(JPH_CPU_LOONGARCH)
	return inValue ? __builtin_clz(inValue) : 32;
#else
#error Undefined
#endif
}
/// Count the number of 1 bits in a value (population count)
inline uint CountBits(uint32 inValue)
{
#if defined(JPH_COMPILER_CLANG) || defined(JPH_COMPILER_GCC)
	return __builtin_popcount(inValue);
#elif defined(JPH_COMPILER_MSVC)
	#if defined(JPH_USE_SSE4_2)
	return _mm_popcnt_u32(inValue);
	#elif defined(JPH_USE_NEON) && (_MSC_VER >= 1930) // _CountOneBits not available on MSVC2019
	return _CountOneBits(inValue);
	#else
	// SWAR fallback: sum bit pairs, then nibbles, then add all bytes via the
	// 0x01010101 multiply and read the total from the top byte
	inValue = inValue - ((inValue >> 1) & 0x55555555);
	inValue = (inValue & 0x33333333) + ((inValue >> 2) & 0x33333333);
	inValue = (inValue + (inValue >> 4)) & 0x0F0F0F0F;
	return (inValue * 0x01010101) >> 24;
	#endif
#else
#error Undefined
#endif
}
/// Get the next higher power of 2 of a value, or the value itself if the value is already a power of 2
inline uint32 GetNextPowerOf2(uint32 inValue)
{
	// 0 and 1 both map to 1; everything else rounds up via the highest set bit of (inValue - 1)
	if (inValue <= 1)
		return uint32(1);
	return uint32(1) << (32 - CountLeadingZeros(inValue - 1));
}
// Simple implementation of C++20 std::bit_cast (unfortunately not constexpr)
template <class To, class From>
JPH_INLINE To BitCast(const From &inValue)
{
	// Both types must have the same size and To must be default constructible
	static_assert(std::is_trivially_constructible_v<To>);
	static_assert(sizeof(From) == sizeof(To));

	// Reinterpret the bits through a union.
	// NOTE(review): union type punning is technically UB in ISO C++ (well-defined in C
	// and supported by the major compilers); std::memcpy or std::bit_cast would be the
	// strictly conforming alternative — confirm whether that trade-off is intended.
	union FromTo
	{
		To mTo;
		From mFrom;
	};

	FromTo convert;
	convert.mFrom = inValue;
	return convert.mTo;
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,32 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
// Forward declarations of all math types
class Vec3;
class DVec3;
class Vec4;
class UVec4;
class BVec16;
class Quat;
class Mat44;
class DMat44;

// Types to use for passing arguments to functions.
// The small vector/quaternion types are passed by value, the matrix types by
// const reference; DVec3 passes by value only when AVX is available
// (presumably so it fits in a single register — confirm against Core config).
using Vec3Arg = const Vec3;
#ifdef JPH_USE_AVX
	using DVec3Arg = const DVec3;
#else
	using DVec3Arg = const DVec3 &;
#endif
using Vec4Arg = const Vec4;
using UVec4Arg = const UVec4;
using BVec16Arg = const BVec16;
using QuatArg = const Quat;
using Mat44Arg = const Mat44 &;
using DMat44Arg = const DMat44 &;
JPH_NAMESPACE_END

View File

@@ -0,0 +1,259 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/Vector.h>
#include <Jolt/Math/GaussianElimination.h>
JPH_NAMESPACE_BEGIN
/// Templatized matrix class (column major, Rows x Cols, stored as Cols columns of Vector<Rows>)
template <uint Rows, uint Cols>
class [[nodiscard]] Matrix
{
public:
	/// Constructor
	inline Matrix() = default; ///< Elements are not initialized by the default constructor
	inline Matrix(const Matrix &inM2) { *this = inM2; }

	/// Dimensions
	inline uint GetRows() const { return Rows; }
	inline uint GetCols() const { return Cols; }

	/// Zero matrix
	inline void SetZero()
	{
		for (uint c = 0; c < Cols; ++c)
			mCol[c].SetZero();
	}

	inline static Matrix sZero() { Matrix m; m.SetZero(); return m; }

	/// Check if this matrix consists of all zeros
	inline bool IsZero() const
	{
		for (uint c = 0; c < Cols; ++c)
			if (!mCol[c].IsZero())
				return false;

		return true;
	}

	/// Identity matrix (also valid for non-square matrices: ones on the main diagonal)
	inline void SetIdentity()
	{
		// Clear matrix
		SetZero();

		// Set diagonal to 1
		for (uint rc = 0, min_rc = min(Rows, Cols); rc < min_rc; ++rc)
			mCol[rc].mF32[rc] = 1.0f;
	}

	inline static Matrix sIdentity() { Matrix m; m.SetIdentity(); return m; }

	/// Check if this matrix is identity
	bool IsIdentity() const { return *this == sIdentity(); }

	/// Diagonal matrix: inV supplies the diagonal, everything else is zeroed
	inline void SetDiagonal(const Vector<Rows < Cols? Rows : Cols> &inV)
	{
		// Clear matrix
		SetZero();

		// Set diagonal
		for (uint rc = 0, min_rc = min(Rows, Cols); rc < min_rc; ++rc)
			mCol[rc].mF32[rc] = inV[rc];
	}

	inline static Matrix sDiagonal(const Vector<Rows < Cols? Rows : Cols> &inV)
	{
		Matrix m;
		m.SetDiagonal(inV);
		return m;
	}

	/// Copy a (part) of another matrix into this matrix: an inNumRows x inNumCols block
	/// starting at (inSourceRow, inSourceCol) in inM is written to (inDestRow, inDestCol)
	template <class OtherMatrix>
	void CopyPart(const OtherMatrix &inM, uint inSourceRow, uint inSourceCol, uint inNumRows, uint inNumCols, uint inDestRow, uint inDestCol)
	{
		for (uint c = 0; c < inNumCols; ++c)
			for (uint r = 0; r < inNumRows; ++r)
				mCol[inDestCol + c].mF32[inDestRow + r] = inM(inSourceRow + r, inSourceCol + c);
	}

	/// Get float component by element index (row, column); storage is column major
	inline float operator () (uint inRow, uint inColumn) const
	{
		JPH_ASSERT(inRow < Rows);
		JPH_ASSERT(inColumn < Cols);
		return mCol[inColumn].mF32[inRow];
	}

	inline float & operator () (uint inRow, uint inColumn)
	{
		JPH_ASSERT(inRow < Rows);
		JPH_ASSERT(inColumn < Cols);
		return mCol[inColumn].mF32[inRow];
	}

	/// Comparison
	inline bool operator == (const Matrix &inM2) const
	{
		for (uint c = 0; c < Cols; ++c)
			if (mCol[c] != inM2.mCol[c])
				return false;

		return true;
	}

	inline bool operator != (const Matrix &inM2) const
	{
		for (uint c = 0; c < Cols; ++c)
			if (mCol[c] != inM2.mCol[c])
				return true;

		return false;
	}

	/// Assignment
	inline Matrix & operator = (const Matrix &inM2)
	{
		for (uint c = 0; c < Cols; ++c)
			mCol[c] = inM2.mCol[c];

		return *this;
	}

	/// Multiply matrix by matrix
	template <uint OtherCols>
	inline Matrix<Rows, OtherCols> operator * (const Matrix<Cols, OtherCols> &inM) const
	{
		Matrix<Rows, OtherCols> m;
		for (uint c = 0; c < OtherCols; ++c)
			for (uint r = 0; r < Rows; ++r)
			{
				float dot = 0.0f;
				for (uint i = 0; i < Cols; ++i)
					dot += mCol[i].mF32[r] * inM.mCol[c].mF32[i];
				m.mCol[c].mF32[r] = dot;
			}
		return m;
	}

	/// Multiply vector by matrix
	inline Vector<Rows> operator * (const Vector<Cols> &inV) const
	{
		Vector<Rows> v;
		for (uint r = 0; r < Rows; ++r)
		{
			float dot = 0.0f;
			for (uint c = 0; c < Cols; ++c)
				dot += mCol[c].mF32[r] * inV.mF32[c];
			v.mF32[r] = dot;
		}
		return v;
	}

	/// Multiply matrix with float
	inline Matrix operator * (float inV) const
	{
		Matrix m;
		for (uint c = 0; c < Cols; ++c)
			m.mCol[c] = mCol[c] * inV;
		return m;
	}

	inline friend Matrix operator * (float inV, const Matrix &inM)
	{
		return inM * inV;
	}

	/// Per element addition of matrix
	inline Matrix operator + (const Matrix &inM) const
	{
		Matrix m;
		for (uint c = 0; c < Cols; ++c)
			m.mCol[c] = mCol[c] + inM.mCol[c];
		return m;
	}

	/// Per element subtraction of matrix
	inline Matrix operator - (const Matrix &inM) const
	{
		Matrix m;
		for (uint c = 0; c < Cols; ++c)
			m.mCol[c] = mCol[c] - inM.mCol[c];
		return m;
	}

	/// Transpose matrix
	inline Matrix<Cols, Rows> Transposed() const
	{
		Matrix<Cols, Rows> m;
		for (uint r = 0; r < Rows; ++r)
			for (uint c = 0; c < Cols; ++c)
				m.mCol[r].mF32[c] = mCol[c].mF32[r];
		return m;
	}

	/// Inverse matrix through Gaussian elimination; only valid for square matrices
	/// (asserts otherwise). Returns false when inM is singular.
	bool SetInversed(const Matrix &inM)
	{
		if constexpr (Rows != Cols) JPH_ASSERT(false);
		Matrix copy(inM);
		SetIdentity();
		return GaussianElimination(copy, *this);
	}

	/// Returns the inverse of this matrix.
	/// NOTE(review): the failure result of SetInversed is discarded, so the return
	/// value is unspecified for singular input — use SetInversed directly when the
	/// input may be singular.
	inline Matrix Inversed() const
	{
		Matrix m;
		m.SetInversed(*this);
		return m;
	}

	/// To String
	friend ostream & operator << (ostream &inStream, const Matrix &inM)
	{
		for (uint i = 0; i < Cols - 1; ++i)
			inStream << inM.mCol[i] << ", ";
		inStream << inM.mCol[Cols - 1];
		return inStream;
	}

	/// Column access
	const Vector<Rows> & GetColumn(int inIdx) const { return mCol[inIdx]; }
	Vector<Rows> & GetColumn(int inIdx) { return mCol[inIdx]; }

	Vector<Rows> mCol[Cols]; ///< Column
};
// The template specialization doesn't sit well with Doxygen
#ifndef JPH_PLATFORM_DOXYGEN

/// Specialization of SetInversed for 2x2 matrix: uses the closed-form inverse
/// (adjugate / determinant) instead of Gaussian elimination
template <>
inline bool Matrix<2, 2>::SetInversed(const Matrix<2, 2> &inM)
{
	// Fetch elements (column major: mCol[col].mF32[row])
	float a = inM.mCol[0].mF32[0];
	float b = inM.mCol[1].mF32[0];
	float c = inM.mCol[0].mF32[1];
	float d = inM.mCol[1].mF32[1];

	// Calculate determinant
	float det = a * d - b * c;
	if (det == 0.0f)
		return false; // Singular, *this is left unmodified

	// Construct inverse: [d -b; -c a] / det
	mCol[0].mF32[0] = d / det;
	mCol[1].mF32[0] = -b / det;
	mCol[0].mF32[1] = -c / det;
	mCol[1].mF32[1] = a / det;
	return true;
}

#endif // !JPH_PLATFORM_DOXYGEN
JPH_NAMESPACE_END

255
thirdparty/jolt_physics/Jolt/Math/Quat.h vendored Normal file
View File

@@ -0,0 +1,255 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/Vec3.h>
#include <Jolt/Math/Vec4.h>
JPH_NAMESPACE_BEGIN
/// Quaternion class, quaternions are 4 dimensional vectors which can describe rotations in 3 dimensional
/// space if their length is 1.
///
/// They are written as:
///
/// \f$q = w + x \: i + y \: j + z \: k\f$
///
/// or in vector notation:
///
/// \f$q = [w, v] = [w, x, y, z]\f$
///
/// Where:
///
/// w = the real part
/// v = the imaginary part, (x, y, z)
///
/// Note that we store the quaternion in a Vec4 as [x, y, z, w] because that makes
/// it easy to extract the rotation axis of the quaternion:
///
/// q = [cos(angle / 2), sin(angle / 2) * rotation_axis]
class [[nodiscard]] alignas(JPH_VECTOR_ALIGNMENT) Quat
{
public:
	JPH_OVERRIDE_NEW_DELETE

	///@name Constructors
	///@{
	inline Quat() = default; ///< Intentionally not initialized for performance reasons
	Quat(const Quat &inRHS) = default;
	Quat & operator = (const Quat &inRHS) = default;
	inline Quat(float inX, float inY, float inZ, float inW) : mValue(inX, inY, inZ, inW) { }
	inline explicit Quat(Vec4Arg inV) : mValue(inV) { }
	///@}

	///@name Tests
	///@{

	/// Check if two quaternions are exactly equal
	inline bool operator == (QuatArg inRHS) const { return mValue == inRHS.mValue; }

	/// Check if two quaternions are different
	inline bool operator != (QuatArg inRHS) const { return mValue != inRHS.mValue; }

	/// If this quaternion is close to inRHS. Note that q and -q represent the same rotation, this is not checked here.
	inline bool IsClose(QuatArg inRHS, float inMaxDistSq = 1.0e-12f) const { return mValue.IsClose(inRHS.mValue, inMaxDistSq); }

	/// If the length of this quaternion is 1 +/- inTolerance
	inline bool IsNormalized(float inTolerance = 1.0e-5f) const { return mValue.IsNormalized(inTolerance); }

	/// If any component of this quaternion is a NaN (not a number)
	inline bool IsNaN() const { return mValue.IsNaN(); }

	///@}
	///@name Get components
	///@{

	/// Get X component (imaginary part i)
	JPH_INLINE float GetX() const { return mValue.GetX(); }

	/// Get Y component (imaginary part j)
	JPH_INLINE float GetY() const { return mValue.GetY(); }

	/// Get Z component (imaginary part k)
	JPH_INLINE float GetZ() const { return mValue.GetZ(); }

	/// Get W component (real part)
	JPH_INLINE float GetW() const { return mValue.GetW(); }

	/// Get the imaginary part of the quaternion
	JPH_INLINE Vec3 GetXYZ() const { return Vec3(mValue); }

	/// Get the quaternion as a Vec4
	JPH_INLINE Vec4 GetXYZW() const { return mValue; }

	/// Set individual components
	JPH_INLINE void SetX(float inX) { mValue.SetX(inX); }
	JPH_INLINE void SetY(float inY) { mValue.SetY(inY); }
	JPH_INLINE void SetZ(float inZ) { mValue.SetZ(inZ); }
	JPH_INLINE void SetW(float inW) { mValue.SetW(inW); }

	/// Set all components
	JPH_INLINE void Set(float inX, float inY, float inZ, float inW) { mValue.Set(inX, inY, inZ, inW); }

	///@}
	///@name Default quaternions
	///@{

	/// @return [0, 0, 0, 0]
	JPH_INLINE static Quat sZero() { return Quat(Vec4::sZero()); }

	/// @return [1, 0, 0, 0] (or in storage format Quat(0, 0, 0, 1))
	JPH_INLINE static Quat sIdentity() { return Quat(0, 0, 0, 1); }

	///@}

	/// Rotation from axis and angle
	JPH_INLINE static Quat sRotation(Vec3Arg inAxis, float inAngle);

	/// Get axis and angle that represents this quaternion, outAngle will always be in the range \f$[0, \pi]\f$
	JPH_INLINE void GetAxisAngle(Vec3 &outAxis, float &outAngle) const;

	/// Create quaternion that rotates a vector from the direction of inFrom to the direction of inTo along the shortest path
	/// @see https://www.euclideanspace.com/maths/algebra/vectors/angleBetween/index.htm
	JPH_INLINE static Quat sFromTo(Vec3Arg inFrom, Vec3Arg inTo);

	/// Random unit quaternion
	template <class Random>
	inline static Quat sRandom(Random &inRandom);

	/// Conversion from Euler angles. Rotation order is X then Y then Z (RotZ * RotY * RotX). Angles in radians.
	inline static Quat sEulerAngles(Vec3Arg inAngles);

	/// Conversion to Euler angles. Rotation order is X then Y then Z (RotZ * RotY * RotX). Angles in radians.
	inline Vec3 GetEulerAngles() const;

	///@name Length / normalization operations
	///@{

	/// Squared length of quaternion.
	/// @return Squared length of quaternion (\f$|v|^2\f$)
	JPH_INLINE float LengthSq() const { return mValue.LengthSq(); }

	/// Length of quaternion.
	/// @return Length of quaternion (\f$|v|\f$)
	JPH_INLINE float Length() const { return mValue.Length(); }

	/// Normalize the quaternion (make it length 1)
	JPH_INLINE Quat Normalized() const { return Quat(mValue.Normalized()); }

	///@}
	///@name Additions / multiplications
	///@{
	JPH_INLINE void operator += (QuatArg inRHS) { mValue += inRHS.mValue; }
	JPH_INLINE void operator -= (QuatArg inRHS) { mValue -= inRHS.mValue; }
	JPH_INLINE void operator *= (float inValue) { mValue *= inValue; }
	JPH_INLINE void operator /= (float inValue) { mValue /= inValue; }
	JPH_INLINE Quat operator - () const { return Quat(-mValue); }
	JPH_INLINE Quat operator + (QuatArg inRHS) const { return Quat(mValue + inRHS.mValue); }
	JPH_INLINE Quat operator - (QuatArg inRHS) const { return Quat(mValue - inRHS.mValue); }
	JPH_INLINE Quat operator * (QuatArg inRHS) const;
	JPH_INLINE Quat operator * (float inValue) const { return Quat(mValue * inValue); }
	inline friend Quat operator * (float inValue, QuatArg inRHS) { return Quat(inRHS.mValue * inValue); }
	JPH_INLINE Quat operator / (float inValue) const { return Quat(mValue / inValue); }
	///@}

	/// Rotate a vector by this quaternion
	JPH_INLINE Vec3 operator * (Vec3Arg inValue) const;

	/// Rotate a vector by the inverse of this quaternion
	JPH_INLINE Vec3 InverseRotate(Vec3Arg inValue) const;

	/// Rotate a the vector (1, 0, 0) with this quaternion
	JPH_INLINE Vec3 RotateAxisX() const;

	/// Rotate a the vector (0, 1, 0) with this quaternion
	JPH_INLINE Vec3 RotateAxisY() const;

	/// Rotate a the vector (0, 0, 1) with this quaternion
	JPH_INLINE Vec3 RotateAxisZ() const;

	/// Dot product
	JPH_INLINE float Dot(QuatArg inRHS) const { return mValue.Dot(inRHS.mValue); }

	/// The conjugate [w, -x, -y, -z] is the same as the inverse for unit quaternions
	JPH_INLINE Quat Conjugated() const { return Quat(Vec4::sXor(mValue, UVec4(0x80000000, 0x80000000, 0x80000000, 0).ReinterpretAsFloat())); }

	/// Get inverse quaternion.
	/// NOTE(review): this divides the conjugate by Length() once; the algebraic inverse
	/// of a non-unit quaternion is conjugate / |q|^2, so this matches the true inverse
	/// only for (near) unit quaternions — confirm intended usage.
	JPH_INLINE Quat Inversed() const { return Conjugated() / Length(); }

	/// Ensures that the W component is positive by negating the entire quaternion if it is not. This is useful when you want to store a quaternion as a 3 vector by discarding W and reconstructing it as sqrt(1 - x^2 - y^2 - z^2).
	JPH_INLINE Quat EnsureWPositive() const { return Quat(Vec4::sXor(mValue, Vec4::sAnd(mValue.SplatW(), UVec4::sReplicate(0x80000000).ReinterpretAsFloat()))); }

	/// Get a quaternion that is perpendicular to this quaternion
	JPH_INLINE Quat GetPerpendicular() const { return Quat(Vec4(1, -1, 1, -1) * mValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>()); }

	/// Get rotation angle around inAxis (uses Swing Twist Decomposition to get the twist quaternion and uses q(axis, angle) = [cos(angle / 2), axis * sin(angle / 2)])
	JPH_INLINE float GetRotationAngle(Vec3Arg inAxis) const { return GetW() == 0.0f? JPH_PI : 2.0f * ATan(GetXYZ().Dot(inAxis) / GetW()); }

	/// Swing Twist Decomposition: any quaternion can be split up as:
	///
	/// \f[q = q_{swing} \: q_{twist}\f]
	///
	/// where \f$q_{twist}\f$ rotates only around axis v.
	///
	/// \f$q_{twist}\f$ is:
	///
	/// \f[q_{twist} = \frac{[q_w, q_{ijk} \cdot v \: v]}{\left|[q_w, q_{ijk} \cdot v \: v]\right|}\f]
	///
	/// where q_w is the real part of the quaternion and q_i the imaginary part (a 3 vector).
	///
	/// The swing can then be calculated as:
	///
	/// \f[q_{swing} = q \: q_{twist}^* \f]
	///
	/// Where \f$q_{twist}^*\f$ = complex conjugate of \f$q_{twist}\f$
	JPH_INLINE Quat GetTwist(Vec3Arg inAxis) const;

	/// Decomposes quaternion into swing and twist component:
	///
	/// \f$q = q_{swing} \: q_{twist}\f$
	///
	/// where \f$q_{swing} \: \hat{x} = q_{twist} \: \hat{y} = q_{twist} \: \hat{z} = 0\f$
	///
	/// In other words:
	///
	/// - \f$q_{twist}\f$ only rotates around the X-axis.
	/// - \f$q_{swing}\f$ only rotates around the Y and Z-axis.
	///
	/// @see Gino van den Bergen - Rotational Joint Limits in Quaternion Space - GDC 2016
	JPH_INLINE void GetSwingTwist(Quat &outSwing, Quat &outTwist) const;

	/// Linear interpolation between two quaternions (for small steps).
	/// @param inFraction is in the range [0, 1]
	/// @param inDestination The destination quaternion
	/// @return (1 - inFraction) * this + fraction * inDestination
	JPH_INLINE Quat LERP(QuatArg inDestination, float inFraction) const;

	/// Spherical linear interpolation between two quaternions.
	/// @param inFraction is in the range [0, 1]
	/// @param inDestination The destination quaternion
	/// @return When fraction is zero this quaternion is returned, when fraction is 1 inDestination is returned.
	/// When fraction is between 0 and 1 an interpolation along the shortest path is returned.
	JPH_INLINE Quat SLERP(QuatArg inDestination, float inFraction) const;

	/// Load 3 floats from memory (X, Y and Z component and then calculates W) reads 32 bits extra which it doesn't use
	static JPH_INLINE Quat sLoadFloat3Unsafe(const Float3 &inV);

	/// Store 3 as floats to memory (X, Y and Z component)
	JPH_INLINE void StoreFloat3(Float3 *outV) const;

	/// To String
	friend ostream & operator << (ostream &inStream, QuatArg inQ) { inStream << inQ.mValue; return inStream; }

	/// 4 vector that stores [x, y, z, w] parts of the quaternion
	Vec4 mValue;
};
static_assert(std::is_trivial<Quat>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
#include "Quat.inl"

View File

@@ -0,0 +1,328 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
JPH_NAMESPACE_BEGIN
Quat Quat::operator * (QuatArg inRHS) const
{
	// Hamilton product of two quaternions stored as [x, y, z, w]
#if defined(JPH_USE_SSE4_1)
	// Taken from: http://momchil-velikov.blogspot.nl/2013/10/fast-sse-quternion-multiplication.html
	__m128 abcd = mValue.mValue;
	__m128 xyzw = inRHS.mValue.mValue;

	__m128 t0 = _mm_shuffle_ps(abcd, abcd, _MM_SHUFFLE(3, 3, 3, 3));
	__m128 t1 = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(2, 3, 0, 1));

	__m128 t3 = _mm_shuffle_ps(abcd, abcd, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 t4 = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(1, 0, 3, 2));

	__m128 t5 = _mm_shuffle_ps(abcd, abcd, _MM_SHUFFLE(1, 1, 1, 1));
	__m128 t6 = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(2, 0, 3, 1));

	// [d,d,d,d] * [z,w,x,y] = [dz,dw,dx,dy]
	__m128 m0 = _mm_mul_ps(t0, t1);

	// [a,a,a,a] * [y,x,w,z] = [ay,ax,aw,az]
	__m128 m1 = _mm_mul_ps(t3, t4);

	// [b,b,b,b] * [z,x,w,y] = [bz,bx,bw,by]
	__m128 m2 = _mm_mul_ps(t5, t6);

	// [c,c,c,c] * [w,z,x,y] = [cw,cz,cx,cy]
	__m128 t7 = _mm_shuffle_ps(abcd, abcd, _MM_SHUFFLE(2, 2, 2, 2));
	__m128 t8 = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 2, 0, 1));
	__m128 m3 = _mm_mul_ps(t7, t8);

	// addsub alternately subtracts/adds lanes, which supplies the sign pattern of the product
	// [dz,dw,dx,dy] + -[ay,ax,aw,az] = [dz+ay,dw-ax,dx+aw,dy-az]
	__m128 e = _mm_addsub_ps(m0, m1);

	// [dx+aw,dz+ay,dy-az,dw-ax]
	e = _mm_shuffle_ps(e, e, _MM_SHUFFLE(1, 3, 0, 2));

	// [dx+aw,dz+ay,dy-az,dw-ax] + -[bz,bx,bw,by] = [dx+aw+bz,dz+ay-bx,dy-az+bw,dw-ax-by]
	e = _mm_addsub_ps(e, m2);

	// [dz+ay-bx,dw-ax-by,dy-az+bw,dx+aw+bz]
	e = _mm_shuffle_ps(e, e, _MM_SHUFFLE(2, 0, 1, 3));

	// [dz+ay-bx,dw-ax-by,dy-az+bw,dx+aw+bz] + -[cw,cz,cx,cy] = [dz+ay-bx+cw,dw-ax-by-cz,dy-az+bw+cx,dx+aw+bz-cy]
	e = _mm_addsub_ps(e, m3);

	// [dw-ax-by-cz,dz+ay-bx+cw,dy-az+bw+cx,dx+aw+bz-cy]
	return Quat(Vec4(_mm_shuffle_ps(e, e, _MM_SHUFFLE(2, 3, 1, 0))));
#else
	// Scalar fallback: the product written out component by component
	float lx = mValue.GetX();
	float ly = mValue.GetY();
	float lz = mValue.GetZ();
	float lw = mValue.GetW();

	float rx = inRHS.mValue.GetX();
	float ry = inRHS.mValue.GetY();
	float rz = inRHS.mValue.GetZ();
	float rw = inRHS.mValue.GetW();

	float x = lw * rx + lx * rw + ly * rz - lz * ry;
	float y = lw * ry - lx * rz + ly * rw + lz * rx;
	float z = lw * rz + lx * ry - ly * rx + lz * rw;
	float w = lw * rw - lx * rx - ly * ry - lz * rz;

	return Quat(x, y, z, w);
#endif
}
Quat Quat::sRotation(Vec3Arg inAxis, float inAngle)
{
	// returns [inAxis * sin(0.5f * inAngle), cos(0.5f * inAngle)]
	JPH_ASSERT(inAxis.IsNormalized());
	Vec4 s, c;
	Vec4::sReplicate(0.5f * inAngle).SinCos(s, c);
	// The select mask takes sin(half angle) * axis for x/y/z and cos(half angle) for the w lane
	return Quat(Vec4::sSelect(Vec4(inAxis) * s, c, UVec4(0, 0, 0, 0xffffffffU)));
}
void Quat::GetAxisAngle(Vec3 &outAxis, float &outAngle) const
{
	JPH_ASSERT(IsNormalized());

	// Flip the quaternion if needed so w >= 0, which keeps the angle in [0, pi]
	Quat w_pos = EnsureWPositive();
	float abs_w = w_pos.GetW();
	if (abs_w >= 1.0f)
	{
		// (Near) identity rotation: no well-defined axis
		outAxis = Vec3::sZero();
		outAngle = 0.0f;
	}
	else
	{
		// w = cos(angle / 2), xyz = sin(angle / 2) * axis
		outAngle = 2.0f * ACos(abs_w);
		outAxis = w_pos.GetXYZ().NormalizedOr(Vec3::sZero());
	}
}
Quat Quat::sFromTo(Vec3Arg inFrom, Vec3Arg inTo)
{
	/*
		Uses (inFrom = v1, inTo = v2):

		angle = arcos(v1 . v2 / |v1||v2|)
		axis = normalize(v1 x v2)

		Quaternion is then:

		s = sin(angle / 2)
		x = axis.x * s
		y = axis.y * s
		z = axis.z * s
		w = cos(angle / 2)

		Using identities:

		sin(2 * a) = 2 * sin(a) * cos(a)
		cos(2 * a) = cos(a)^2 - sin(a)^2
		sin(a)^2 + cos(a)^2 = 1

		This reduces to:

		x = (v1 x v2).x
		y = (v1 x v2).y
		z = (v1 x v2).z
		w = |v1||v2| + v1 . v2

		which then needs to be normalized because the whole equation was multiplied by 2 cos(angle / 2)
	*/
	float len_v1_v2 = sqrt(inFrom.LengthSq() * inTo.LengthSq());
	float w = len_v1_v2 + inFrom.Dot(inTo);

	// w == 0 covers two degenerate cases that need special handling
	if (w == 0.0f)
	{
		if (len_v1_v2 == 0.0f)
		{
			// If either of the vectors has zero length, there is no rotation and we return identity
			return Quat::sIdentity();
		}
		else
		{
			// If vectors are perpendicular, take one of the many 180 degree rotations that exist
			return Quat(Vec4(inFrom.GetNormalizedPerpendicular(), 0));
		}
	}

	Vec3 v = inFrom.Cross(inTo);
	return Quat(Vec4(v, w)).Normalized();
}
template <class Random>
Quat Quat::sRandom(Random &inRandom)
{
	// Builds a uniformly distributed unit quaternion from one uniform radius split
	// (x0) and two uniform angles (see e.g. the "subgroup algorithm" for random
	// rotations). The four components are sin/cos pairs scaled by r1/r2, so the
	// result always has unit length.
	std::uniform_real_distribution<float> zero_to_one(0.0f, 1.0f);
	float x0 = zero_to_one(inRandom);
	float r1 = sqrt(1.0f - x0), r2 = sqrt(x0);

	std::uniform_real_distribution<float> zero_to_two_pi(0.0f, 2.0f * JPH_PI);
	Vec4 s, c;
	Vec4(zero_to_two_pi(inRandom), zero_to_two_pi(inRandom), 0, 0).SinCos(s, c);
	return Quat(s.GetX() * r1, c.GetX() * r1, s.GetY() * r2, c.GetY() * r2);
}
Quat Quat::sEulerAngles(Vec3Arg inAngles)
{
	// Composes the rotation RotZ * RotY * RotX (see the contract documented on the
	// declaration in Quat.h) written out in terms of the half-angle sines/cosines
	Vec4 half(0.5f * inAngles);
	Vec4 s, c;
	half.SinCos(s, c);

	float cx = c.GetX();
	float sx = s.GetX();
	float cy = c.GetY();
	float sy = s.GetY();
	float cz = c.GetZ();
	float sz = s.GetZ();

	return Quat(
		cz * sx * cy - sz * cx * sy,
		cz * cx * sy + sz * sx * cy,
		sz * cx * cy - cz * sx * sy,
		cz * cx * cy + sz * sx * sy);
}
Vec3 Quat::GetEulerAngles() const
{
	// Extract X/Y/Z Euler angles (rotation order X then Y then Z, see Quat.h)
	float y_sq = GetY() * GetY();

	// X (roll)
	float sin_x = 2.0f * (GetW() * GetX() + GetY() * GetZ());
	float cos_x = 1.0f - 2.0f * (GetX() * GetX() + y_sq);

	// Y (pitch), clamped so rounding can never push the ASin argument out of [-1, 1]
	float sin_y = 2.0f * (GetW() * GetY() - GetZ() * GetX());
	if (sin_y > 1.0f)
		sin_y = 1.0f;
	if (sin_y < -1.0f)
		sin_y = -1.0f;

	// Z (yaw)
	float sin_z = 2.0f * (GetW() * GetZ() + GetX() * GetY());
	float cos_z = 1.0f - 2.0f * (y_sq + GetZ() * GetZ());

	return Vec3(ATan2(sin_x, cos_x), ASin(sin_y), ATan2(sin_z, cos_z));
}
Quat Quat::GetTwist(Vec3Arg inAxis) const
{
	// Project the imaginary part onto the twist axis and keep the real part,
	// then normalize; degenerate (zero) projections fall back to identity
	Quat twist(Vec4(GetXYZ().Dot(inAxis) * inAxis, GetW()));
	float len_sq = twist.LengthSq();
	return len_sq != 0.0f? twist / sqrt(len_sq) : Quat::sIdentity();
}
void Quat::GetSwingTwist(Quat &outSwing, Quat &outTwist) const
{
	// Decomposes into q = swing * twist where the twist rotates only around X
	// (see the declaration in Quat.h); s is the length of the (w, x) pair, which
	// is the un-normalized twist
	float x = GetX(), y = GetY(), z = GetZ(), w = GetW();
	float s = sqrt(Square(w) + Square(x));
	if (s != 0.0f)
	{
		outTwist = Quat(x / s, 0, 0, w / s);
		outSwing = Quat(0, (w * y - x * z) / s, (w * z + x * y) / s, s);
	}
	else
	{
		// If both x and w are zero, this must be a 180 degree rotation around either y or z
		outTwist = Quat::sIdentity();
		outSwing = *this;
	}
}
Quat Quat::LERP(QuatArg inDestination, float inFraction) const
{
	// Blend the raw 4-vectors: (1 - t) * this + t * destination.
	// Note that the result is not re-normalized.
	float remainder = 1.0f - inFraction;
	return Quat(Vec4::sReplicate(remainder) * mValue + Vec4::sReplicate(inFraction) * inDestination.mValue);
}
Quat Quat::SLERP(QuatArg inDestination, float inFraction) const
{
	// Difference at which to LERP instead of SLERP
	const float delta = 0.0001f;

	// Calc cosine
	float sign_scale1 = 1.0f;
	float cos_omega = Dot(inDestination);

	// Adjust signs (if necessary) so we interpolate along the shortest path
	// (q and -q represent the same rotation)
	if (cos_omega < 0.0f)
	{
		cos_omega = -cos_omega;
		sign_scale1 = -1.0f;
	}

	// Calculate coefficients
	float scale0, scale1;
	if (1.0f - cos_omega > delta)
	{
		// Standard case (slerp): weights sin((1-t)w)/sin(w) and sin(tw)/sin(w)
		float omega = ACos(cos_omega);
		float sin_omega = Sin(omega);
		scale0 = Sin((1.0f - inFraction) * omega) / sin_omega;
		scale1 = sign_scale1 * Sin(inFraction * omega) / sin_omega;
	}
	else
	{
		// Quaternions are very close so we can do a linear interpolation
		// (avoids dividing by a near-zero sin_omega)
		scale0 = 1.0f - inFraction;
		scale1 = sign_scale1 * inFraction;
	}

	// Interpolate between the two quaternions; the result is re-normalized
	return Quat(Vec4::sReplicate(scale0) * mValue + Vec4::sReplicate(scale1) * inDestination.mValue).Normalized();
}
Vec3 Quat::operator * (Vec3Arg inValue) const
{
// Rotating a vector by a quaternion is done by: p' = q * p * q^-1 (q^-1 = conjugated(q) for a unit quaternion)
JPH_ASSERT(IsNormalized());
return Vec3((*this * Quat(Vec4(inValue, 0)) * Conjugated()).mValue);
}
Vec3 Quat::InverseRotate(Vec3Arg inValue) const
{
JPH_ASSERT(IsNormalized());
return Vec3((Conjugated() * Quat(Vec4(inValue, 0)) * *this).mValue);
}
Vec3 Quat::RotateAxisX() const
{
// This is *this * Vec3::sAxisX() written out:
JPH_ASSERT(IsNormalized());
float x = GetX(), y = GetY(), z = GetZ(), w = GetW();
float tx = 2.0f * x, tw = 2.0f * w;
return Vec3(tx * x + tw * w - 1.0f, tx * y + z * tw, tx * z - y * tw);
}
Vec3 Quat::RotateAxisY() const
{
// This is *this * Vec3::sAxisY() written out:
JPH_ASSERT(IsNormalized());
float x = GetX(), y = GetY(), z = GetZ(), w = GetW();
float ty = 2.0f * y, tw = 2.0f * w;
return Vec3(x * ty - z * tw, tw * w + ty * y - 1.0f, x * tw + ty * z);
}
Vec3 Quat::RotateAxisZ() const
{
// This is *this * Vec3::sAxisZ() written out:
JPH_ASSERT(IsNormalized());
float x = GetX(), y = GetY(), z = GetZ(), w = GetW();
float tz = 2.0f * z, tw = 2.0f * w;
return Vec3(x * tz + y * tw, y * tz - x * tw, tw * w + tz * z - 1.0f);
}
void Quat::StoreFloat3(Float3 *outV) const
{
JPH_ASSERT(IsNormalized());
EnsureWPositive().GetXYZ().StoreFloat3(outV);
}
Quat Quat::sLoadFloat3Unsafe(const Float3 &inV)
{
Vec3 v = Vec3::sLoadFloat3Unsafe(inV);
float w = sqrt(max(1.0f - v.LengthSq(), 0.0f)); // It is possible that the length of v is a fraction above 1, and we don't want to introduce NaN's in that case so we clamp to 0
return Quat(Vec4(v, w));
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,44 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2022 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/DVec3.h>
#include <Jolt/Math/DMat44.h>
JPH_NAMESPACE_BEGIN
#ifdef JPH_DOUBLE_PRECISION
// Define real to double
using Real = double;
using Real3 = Double3;
using RVec3 = DVec3;
using RVec3Arg = DVec3Arg;
using RMat44 = DMat44;
using RMat44Arg = DMat44Arg;
#define JPH_RVECTOR_ALIGNMENT JPH_DVECTOR_ALIGNMENT
#else
// Define real to float
using Real = float;
using Real3 = Float3;
using RVec3 = Vec3;
using RVec3Arg = Vec3Arg;
using RMat44 = Mat44;
using RMat44Arg = Mat44Arg;
#define JPH_RVECTOR_ALIGNMENT JPH_VECTOR_ALIGNMENT
#endif // JPH_DOUBLE_PRECISION
// Put the 'real' operator in a namespace so that users can opt in to use it:
// using namespace JPH::literals;
namespace literals {
constexpr Real operator ""_r (long double inValue) { return Real(inValue); }
};
JPH_NAMESPACE_END

View File

@@ -0,0 +1,19 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Enum indicating which component to use when swizzling
enum
{
SWIZZLE_X = 0, ///< Use the X component
SWIZZLE_Y = 1, ///< Use the Y component
SWIZZLE_Z = 2, ///< Use the Z component
SWIZZLE_W = 3, ///< Use the W component
SWIZZLE_UNUSED = 2, ///< We always use the Z component when we don't specifically want to initialize a value, this is consistent with what is done in Vec3(x, y, z), Vec3(Float3 &) and Vec3::sLoadFloat3Unsafe
};
JPH_NAMESPACE_END

View File

@@ -0,0 +1,79 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
// Note that this file exists because std::sin etc. are not platform independent and will lead to non-deterministic simulation
/// Sine of x (input in radians)
JPH_INLINE float Sin(float inX)
{
Vec4 s, c;
Vec4::sReplicate(inX).SinCos(s, c);
return s.GetX();
}
/// Cosine of x (input in radians)
JPH_INLINE float Cos(float inX)
{
Vec4 s, c;
Vec4::sReplicate(inX).SinCos(s, c);
return c.GetX();
}
/// Tangent of x (input in radians)
JPH_INLINE float Tan(float inX)
{
return Vec4::sReplicate(inX).Tan().GetX();
}
/// Arc sine of x (returns value in the range [-PI / 2, PI / 2])
/// Note that all input values will be clamped to the range [-1, 1] and this function will not return NaNs like std::asin
JPH_INLINE float ASin(float inX)
{
return Vec4::sReplicate(inX).ASin().GetX();
}
/// Arc cosine of x (returns value in the range [0, PI])
/// Note that all input values will be clamped to the range [-1, 1] and this function will not return NaNs like std::acos
JPH_INLINE float ACos(float inX)
{
return Vec4::sReplicate(inX).ACos().GetX();
}
/// An approximation of ACos, max error is 4.2e-3 over the entire range [-1, 1], is approximately 2.5x faster than ACos
JPH_INLINE float ACosApproximate(float inX)
{
// See: https://www.johndcook.com/blog/2022/09/06/inverse-cosine-near-1/
// See also: https://seblagarde.wordpress.com/2014/12/01/inverse-trigonometric-functions-gpu-optimization-for-amd-gcn-architecture/
// Taylor of cos(x) = 1 - x^2 / 2 + ...
// Substitute x = sqrt(2 y) we get: cos(sqrt(2 y)) = 1 - y
// Substitute z = 1 - y we get: cos(sqrt(2 (1 - z))) = z <=> acos(z) = sqrt(2 (1 - z))
// To avoid the discontinuity at 1, instead of using the Taylor expansion of acos(x) we use acos(x) / sqrt(2 (1 - x)) = 1 + (1 - x) / 12 + ...
// Since the approximation was made at 1, it has quite a large error at 0 meaning that if we want to extend to the
// range [-1, 1] by mirroring the range [0, 1], the value at 0+ is not the same as 0-.
// So we observe that the form of the Taylor expansion is f(x) = sqrt(1 - x) * (a + b x) and we fit the function so that f(0) = pi / 2
// this gives us a = pi / 2. f(1) = 0 regardless of b. We search for a constant b that minimizes the error in the range [0, 1].
float abs_x = min(abs(inX), 1.0f); // Ensure that we don't get a value larger than 1
float val = sqrt(1.0f - abs_x) * (JPH_PI / 2 - 0.175394f * abs_x);
// Our approximation is valid in the range [0, 1], extend it to the range [-1, 1]
return inX < 0? JPH_PI - val : val;
}
/// Arc tangent of x (returns value in the range [-PI / 2, PI / 2])
JPH_INLINE float ATan(float inX)
{
return Vec4::sReplicate(inX).ATan().GetX();
}
/// Arc tangent of y / x using the signs of the arguments to determine the correct quadrant (returns value in the range [-PI, PI])
JPH_INLINE float ATan2(float inY, float inX)
{
return Vec4::sATan2(Vec4::sReplicate(inY), Vec4::sReplicate(inX)).GetX();
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,220 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/Vec4.h>
JPH_NAMESPACE_BEGIN
class [[nodiscard]] alignas(JPH_VECTOR_ALIGNMENT) UVec4
{
public:
JPH_OVERRIDE_NEW_DELETE
// Underlying vector type
#if defined(JPH_USE_SSE)
using Type = __m128i;
#elif defined(JPH_USE_NEON)
using Type = uint32x4_t;
#else
using Type = struct { uint32 mData[4]; };
#endif
/// Constructor
UVec4() = default; ///< Intentionally not initialized for performance reasons
UVec4(const UVec4 &inRHS) = default;
UVec4 & operator = (const UVec4 &inRHS) = default;
JPH_INLINE UVec4(Type inRHS) : mValue(inRHS) { }
/// Create a vector from 4 integer components
JPH_INLINE UVec4(uint32 inX, uint32 inY, uint32 inZ, uint32 inW);
/// Comparison
JPH_INLINE bool operator == (UVec4Arg inV2) const;
JPH_INLINE bool operator != (UVec4Arg inV2) const { return !(*this == inV2); }
/// Swizzle the elements in inV
template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
JPH_INLINE UVec4 Swizzle() const;
/// Vector with all zeros
static JPH_INLINE UVec4 sZero();
/// Replicate int inV across all components
static JPH_INLINE UVec4 sReplicate(uint32 inV);
/// Load 1 int from memory and place it in the X component, zeros Y, Z and W
static JPH_INLINE UVec4 sLoadInt(const uint32 *inV);
/// Load 4 ints from memory
static JPH_INLINE UVec4 sLoadInt4(const uint32 *inV);
/// Load 4 ints from memory, aligned to 16 bytes
static JPH_INLINE UVec4 sLoadInt4Aligned(const uint32 *inV);
/// Gather 4 ints from memory at inBase + inOffsets[i] * Scale
template <const int Scale>
static JPH_INLINE UVec4 sGatherInt4(const uint32 *inBase, UVec4Arg inOffsets);
/// Return the minimum value of each of the components
static JPH_INLINE UVec4 sMin(UVec4Arg inV1, UVec4Arg inV2);
/// Return the maximum of each of the components
static JPH_INLINE UVec4 sMax(UVec4Arg inV1, UVec4Arg inV2);
/// Equals (component wise)
static JPH_INLINE UVec4 sEquals(UVec4Arg inV1, UVec4Arg inV2);
/// Component wise select, returns inNotSet when highest bit of inControl = 0 and inSet when highest bit of inControl = 1
static JPH_INLINE UVec4 sSelect(UVec4Arg inNotSet, UVec4Arg inSet, UVec4Arg inControl);
/// Logical or (component wise)
static JPH_INLINE UVec4 sOr(UVec4Arg inV1, UVec4Arg inV2);
/// Logical xor (component wise)
static JPH_INLINE UVec4 sXor(UVec4Arg inV1, UVec4Arg inV2);
/// Logical and (component wise)
static JPH_INLINE UVec4 sAnd(UVec4Arg inV1, UVec4Arg inV2);
/// Logical not (component wise)
static JPH_INLINE UVec4 sNot(UVec4Arg inV1);
/// Sorts the elements in inIndex so that the values that correspond to trues in inValue are the first elements.
/// The remaining elements will be set to inValue.w.
/// I.e. if inValue = (true, false, true, false) and inIndex = (1, 2, 3, 4) the function returns (1, 3, 4, 4).
static JPH_INLINE UVec4 sSort4True(UVec4Arg inValue, UVec4Arg inIndex);
/// Get individual components
#if defined(JPH_USE_SSE)
JPH_INLINE uint32 GetX() const { return uint32(_mm_cvtsi128_si32(mValue)); }
JPH_INLINE uint32 GetY() const { return mU32[1]; }
JPH_INLINE uint32 GetZ() const { return mU32[2]; }
JPH_INLINE uint32 GetW() const { return mU32[3]; }
#elif defined(JPH_USE_NEON)
JPH_INLINE uint32 GetX() const { return vgetq_lane_u32(mValue, 0); }
JPH_INLINE uint32 GetY() const { return vgetq_lane_u32(mValue, 1); }
JPH_INLINE uint32 GetZ() const { return vgetq_lane_u32(mValue, 2); }
JPH_INLINE uint32 GetW() const { return vgetq_lane_u32(mValue, 3); }
#else
JPH_INLINE uint32 GetX() const { return mU32[0]; }
JPH_INLINE uint32 GetY() const { return mU32[1]; }
JPH_INLINE uint32 GetZ() const { return mU32[2]; }
JPH_INLINE uint32 GetW() const { return mU32[3]; }
#endif
/// Set individual components
JPH_INLINE void SetX(uint32 inX) { mU32[0] = inX; }
JPH_INLINE void SetY(uint32 inY) { mU32[1] = inY; }
JPH_INLINE void SetZ(uint32 inZ) { mU32[2] = inZ; }
JPH_INLINE void SetW(uint32 inW) { mU32[3] = inW; }
/// Get component by index
JPH_INLINE uint32 operator [] (uint inCoordinate) const { JPH_ASSERT(inCoordinate < 4); return mU32[inCoordinate]; }
JPH_INLINE uint32 & operator [] (uint inCoordinate) { JPH_ASSERT(inCoordinate < 4); return mU32[inCoordinate]; }
/// Multiplies each of the 4 integer components with an integer (discards any overflow)
JPH_INLINE UVec4 operator * (UVec4Arg inV2) const;
/// Adds an integer value to all integer components (discards any overflow)
JPH_INLINE UVec4 operator + (UVec4Arg inV2);
/// Add two integer vectors (component wise)
JPH_INLINE UVec4 & operator += (UVec4Arg inV2);
/// Replicate the X component to all components
JPH_INLINE UVec4 SplatX() const;
/// Replicate the Y component to all components
JPH_INLINE UVec4 SplatY() const;
/// Replicate the Z component to all components
JPH_INLINE UVec4 SplatZ() const;
/// Replicate the W component to all components
JPH_INLINE UVec4 SplatW() const;
/// Convert each component from an int to a float
JPH_INLINE Vec4 ToFloat() const;
/// Reinterpret UVec4 as a Vec4 (doesn't change the bits)
JPH_INLINE Vec4 ReinterpretAsFloat() const;
/// Store 4 ints to memory
JPH_INLINE void StoreInt4(uint32 *outV) const;
/// Store 4 ints to memory, aligned to 16 bytes
JPH_INLINE void StoreInt4Aligned(uint32 *outV) const;
/// Test if any of the components are true (true is when highest bit of component is set)
JPH_INLINE bool TestAnyTrue() const;
/// Test if any of X, Y or Z components are true (true is when highest bit of component is set)
JPH_INLINE bool TestAnyXYZTrue() const;
/// Test if all components are true (true is when highest bit of component is set)
JPH_INLINE bool TestAllTrue() const;
/// Test if X, Y and Z components are true (true is when highest bit of component is set)
JPH_INLINE bool TestAllXYZTrue() const;
/// Count the number of components that are true (true is when highest bit of component is set)
JPH_INLINE int CountTrues() const;
/// Store if X is true in bit 0, Y in bit 1, Z in bit 2 and W in bit 3 (true is when highest bit of component is set)
JPH_INLINE int GetTrues() const;
/// Shift all components by Count bits to the left (filling with zeros from the left)
template <const uint Count>
JPH_INLINE UVec4 LogicalShiftLeft() const;
/// Shift all components by Count bits to the right (filling with zeros from the right)
template <const uint Count>
JPH_INLINE UVec4 LogicalShiftRight() const;
/// Shift all components by Count bits to the right (shifting in the value of the highest bit)
template <const uint Count>
JPH_INLINE UVec4 ArithmeticShiftRight() const;
/// Takes the lower 4 16 bits and expands them to X, Y, Z and W
JPH_INLINE UVec4 Expand4Uint16Lo() const;
/// Takes the upper 4 16 bits and expands them to X, Y, Z and W
JPH_INLINE UVec4 Expand4Uint16Hi() const;
/// Takes byte 0 .. 3 and expands them to X, Y, Z and W
JPH_INLINE UVec4 Expand4Byte0() const;
/// Takes byte 4 .. 7 and expands them to X, Y, Z and W
JPH_INLINE UVec4 Expand4Byte4() const;
/// Takes byte 8 .. 11 and expands them to X, Y, Z and W
JPH_INLINE UVec4 Expand4Byte8() const;
/// Takes byte 12 .. 15 and expands them to X, Y, Z and W
JPH_INLINE UVec4 Expand4Byte12() const;
/// Shift vector components by 4 - Count floats to the left, so if Count = 1 the resulting vector is (W, 0, 0, 0), when Count = 3 the resulting vector is (Y, Z, W, 0)
JPH_INLINE UVec4 ShiftComponents4Minus(int inCount) const;
/// To String
friend ostream & operator << (ostream &inStream, UVec4Arg inV)
{
inStream << inV.mU32[0] << ", " << inV.mU32[1] << ", " << inV.mU32[2] << ", " << inV.mU32[3];
return inStream;
}
union
{
Type mValue;
uint32 mU32[4];
};
};
static_assert(std::is_trivial<UVec4>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
#include "UVec4.inl"

View File

@@ -0,0 +1,581 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
JPH_NAMESPACE_BEGIN
UVec4::UVec4(uint32 inX, uint32 inY, uint32 inZ, uint32 inW)
{
#if defined(JPH_USE_SSE)
mValue = _mm_set_epi32(int(inW), int(inZ), int(inY), int(inX));
#elif defined(JPH_USE_NEON)
uint32x2_t xy = vcreate_u32(static_cast<uint64>(inX) | (static_cast<uint64>(inY) << 32));
uint32x2_t zw = vcreate_u32(static_cast<uint64>(inZ) | (static_cast<uint64>(inW) << 32));
mValue = vcombine_u32(xy, zw);
#else
mU32[0] = inX;
mU32[1] = inY;
mU32[2] = inZ;
mU32[3] = inW;
#endif
}
bool UVec4::operator == (UVec4Arg inV2) const
{
return sEquals(*this, inV2).TestAllTrue();
}
template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
UVec4 UVec4::Swizzle() const
{
static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
#if defined(JPH_USE_SSE)
return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
#elif defined(JPH_USE_NEON)
return JPH_NEON_SHUFFLE_U32x4(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
#else
return UVec4(mU32[SwizzleX], mU32[SwizzleY], mU32[SwizzleZ], mU32[SwizzleW]);
#endif
}
UVec4 UVec4::sZero()
{
#if defined(JPH_USE_SSE)
return _mm_setzero_si128();
#elif defined(JPH_USE_NEON)
return vdupq_n_u32(0);
#else
return UVec4(0, 0, 0, 0);
#endif
}
UVec4 UVec4::sReplicate(uint32 inV)
{
#if defined(JPH_USE_SSE)
return _mm_set1_epi32(int(inV));
#elif defined(JPH_USE_NEON)
return vdupq_n_u32(inV);
#else
return UVec4(inV, inV, inV, inV);
#endif
}
UVec4 UVec4::sLoadInt(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
return _mm_castps_si128(_mm_load_ss(reinterpret_cast<const float*>(inV)));
#elif defined(JPH_USE_NEON)
return vsetq_lane_u32(*inV, vdupq_n_u32(0), 0);
#else
return UVec4(*inV, 0, 0, 0);
#endif
}
UVec4 UVec4::sLoadInt4(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
return _mm_loadu_si128(reinterpret_cast<const __m128i *>(inV));
#elif defined(JPH_USE_NEON)
return vld1q_u32(inV);
#else
return UVec4(inV[0], inV[1], inV[2], inV[3]);
#endif
}
UVec4 UVec4::sLoadInt4Aligned(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
return _mm_load_si128(reinterpret_cast<const __m128i *>(inV));
#elif defined(JPH_USE_NEON)
return vld1q_u32(inV); // ARM doesn't make distinction between aligned or not
#else
return UVec4(inV[0], inV[1], inV[2], inV[3]);
#endif
}
template <const int Scale>
UVec4 UVec4::sGatherInt4(const uint32 *inBase, UVec4Arg inOffsets)
{
#ifdef JPH_USE_AVX2
return _mm_i32gather_epi32(reinterpret_cast<const int *>(inBase), inOffsets.mValue, Scale);
#else
const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
uint32 x = *reinterpret_cast<const uint32 *>(base + inOffsets.GetX() * Scale);
uint32 y = *reinterpret_cast<const uint32 *>(base + inOffsets.GetY() * Scale);
uint32 z = *reinterpret_cast<const uint32 *>(base + inOffsets.GetZ() * Scale);
uint32 w = *reinterpret_cast<const uint32 *>(base + inOffsets.GetW() * Scale);
return UVec4(x, y, z, w);
#endif
}
UVec4 UVec4::sMin(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE4_1)
return _mm_min_epu32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vminq_u32(inV1.mValue, inV2.mValue);
#else
UVec4 result;
for (int i = 0; i < 4; i++)
result.mU32[i] = min(inV1.mU32[i], inV2.mU32[i]);
return result;
#endif
}
UVec4 UVec4::sMax(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE4_1)
return _mm_max_epu32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vmaxq_u32(inV1.mValue, inV2.mValue);
#else
UVec4 result;
for (int i = 0; i < 4; i++)
result.mU32[i] = max(inV1.mU32[i], inV2.mU32[i]);
return result;
#endif
}
UVec4 UVec4::sEquals(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
return _mm_cmpeq_epi32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vceqq_u32(inV1.mValue, inV2.mValue);
#else
return UVec4(inV1.mU32[0] == inV2.mU32[0]? 0xffffffffu : 0,
inV1.mU32[1] == inV2.mU32[1]? 0xffffffffu : 0,
inV1.mU32[2] == inV2.mU32[2]? 0xffffffffu : 0,
inV1.mU32[3] == inV2.mU32[3]? 0xffffffffu : 0);
#endif
}
UVec4 UVec4::sSelect(UVec4Arg inNotSet, UVec4Arg inSet, UVec4Arg inControl)
{
#if defined(JPH_USE_SSE4_1) && !defined(JPH_PLATFORM_WASM) // _mm_blendv_ps has problems on FireFox
return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(inNotSet.mValue), _mm_castsi128_ps(inSet.mValue), _mm_castsi128_ps(inControl.mValue)));
#elif defined(JPH_USE_SSE)
__m128 is_set = _mm_castsi128_ps(_mm_srai_epi32(inControl.mValue, 31));
return _mm_castps_si128(_mm_or_ps(_mm_and_ps(is_set, _mm_castsi128_ps(inSet.mValue)), _mm_andnot_ps(is_set, _mm_castsi128_ps(inNotSet.mValue))));
#elif defined(JPH_USE_NEON)
return vbslq_u32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.mValue), 31)), inSet.mValue, inNotSet.mValue);
#else
UVec4 result;
for (int i = 0; i < 4; i++)
result.mU32[i] = (inControl.mU32[i] & 0x80000000u) ? inSet.mU32[i] : inNotSet.mU32[i];
return result;
#endif
}
UVec4 UVec4::sOr(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
return _mm_or_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vorrq_u32(inV1.mValue, inV2.mValue);
#else
return UVec4(inV1.mU32[0] | inV2.mU32[0],
inV1.mU32[1] | inV2.mU32[1],
inV1.mU32[2] | inV2.mU32[2],
inV1.mU32[3] | inV2.mU32[3]);
#endif
}
UVec4 UVec4::sXor(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
return _mm_xor_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return veorq_u32(inV1.mValue, inV2.mValue);
#else
return UVec4(inV1.mU32[0] ^ inV2.mU32[0],
inV1.mU32[1] ^ inV2.mU32[1],
inV1.mU32[2] ^ inV2.mU32[2],
inV1.mU32[3] ^ inV2.mU32[3]);
#endif
}
UVec4 UVec4::sAnd(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
return _mm_and_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vandq_u32(inV1.mValue, inV2.mValue);
#else
return UVec4(inV1.mU32[0] & inV2.mU32[0],
inV1.mU32[1] & inV2.mU32[1],
inV1.mU32[2] & inV2.mU32[2],
inV1.mU32[3] & inV2.mU32[3]);
#endif
}
UVec4 UVec4::sNot(UVec4Arg inV1)
{
#if defined(JPH_USE_AVX512)
return _mm_ternarylogic_epi32(inV1.mValue, inV1.mValue, inV1.mValue, 0b01010101);
#elif defined(JPH_USE_SSE)
return sXor(inV1, sReplicate(0xffffffff));
#elif defined(JPH_USE_NEON)
return vmvnq_u32(inV1.mValue);
#else
return UVec4(~inV1.mU32[0], ~inV1.mU32[1], ~inV1.mU32[2], ~inV1.mU32[3]);
#endif
}
UVec4 UVec4::sSort4True(UVec4Arg inValue, UVec4Arg inIndex)
{
// If inValue.z is false then shift W to Z
UVec4 v = UVec4::sSelect(inIndex.Swizzle<SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>(), inIndex, inValue.SplatZ());
// If inValue.y is false then shift Z and further to Y and further
v = UVec4::sSelect(v.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_W>(), v, inValue.SplatY());
// If inValue.x is false then shift X and further to Y and further
v = UVec4::sSelect(v.Swizzle<SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_W>(), v, inValue.SplatX());
return v;
}
UVec4 UVec4::operator * (UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
return _mm_mullo_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vmulq_u32(mValue, inV2.mValue);
#else
UVec4 result;
for (int i = 0; i < 4; i++)
result.mU32[i] = mU32[i] * inV2.mU32[i];
return result;
#endif
}
UVec4 UVec4::operator + (UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
return _mm_add_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vaddq_u32(mValue, inV2.mValue);
#else
return UVec4(mU32[0] + inV2.mU32[0],
mU32[1] + inV2.mU32[1],
mU32[2] + inV2.mU32[2],
mU32[3] + inV2.mU32[3]);
#endif
}
UVec4 &UVec4::operator += (UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
mValue = _mm_add_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
mValue = vaddq_u32(mValue, inV2.mValue);
#else
for (int i = 0; i < 4; ++i)
mU32[i] += inV2.mU32[i];
#endif
return *this;
}
UVec4 UVec4::SplatX() const
{
#if defined(JPH_USE_SSE)
return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
return vdupq_laneq_u32(mValue, 0);
#else
return UVec4(mU32[0], mU32[0], mU32[0], mU32[0]);
#endif
}
UVec4 UVec4::SplatY() const
{
#if defined(JPH_USE_SSE)
return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
return vdupq_laneq_u32(mValue, 1);
#else
return UVec4(mU32[1], mU32[1], mU32[1], mU32[1]);
#endif
}
UVec4 UVec4::SplatZ() const
{
#if defined(JPH_USE_SSE)
return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
return vdupq_laneq_u32(mValue, 2);
#else
return UVec4(mU32[2], mU32[2], mU32[2], mU32[2]);
#endif
}
UVec4 UVec4::SplatW() const
{
#if defined(JPH_USE_SSE)
return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(3, 3, 3, 3));
#elif defined(JPH_USE_NEON)
return vdupq_laneq_u32(mValue, 3);
#else
return UVec4(mU32[3], mU32[3], mU32[3], mU32[3]);
#endif
}
Vec4 UVec4::ToFloat() const
{
#if defined(JPH_USE_SSE)
return _mm_cvtepi32_ps(mValue);
#elif defined(JPH_USE_NEON)
return vcvtq_f32_u32(mValue);
#else
return Vec4((float)mU32[0], (float)mU32[1], (float)mU32[2], (float)mU32[3]);
#endif
}
Vec4 UVec4::ReinterpretAsFloat() const
{
#if defined(JPH_USE_SSE)
return Vec4(_mm_castsi128_ps(mValue));
#elif defined(JPH_USE_NEON)
return vreinterpretq_f32_u32(mValue);
#else
return *reinterpret_cast<const Vec4 *>(this);
#endif
}
void UVec4::StoreInt4(uint32 *outV) const
{
#if defined(JPH_USE_SSE)
_mm_storeu_si128(reinterpret_cast<__m128i *>(outV), mValue);
#elif defined(JPH_USE_NEON)
vst1q_u32(outV, mValue);
#else
for (int i = 0; i < 4; ++i)
outV[i] = mU32[i];
#endif
}
void UVec4::StoreInt4Aligned(uint32 *outV) const
{
#if defined(JPH_USE_SSE)
_mm_store_si128(reinterpret_cast<__m128i *>(outV), mValue);
#elif defined(JPH_USE_NEON)
vst1q_u32(outV, mValue); // ARM doesn't make distinction between aligned or not
#else
for (int i = 0; i < 4; ++i)
outV[i] = mU32[i];
#endif
}
int UVec4::CountTrues() const
{
#if defined(JPH_USE_SSE)
return CountBits(_mm_movemask_ps(_mm_castsi128_ps(mValue)));
#elif defined(JPH_USE_NEON)
return vaddvq_u32(vshrq_n_u32(mValue, 31));
#else
return (mU32[0] >> 31) + (mU32[1] >> 31) + (mU32[2] >> 31) + (mU32[3] >> 31);
#endif
}
int UVec4::GetTrues() const
{
#if defined(JPH_USE_SSE)
return _mm_movemask_ps(_mm_castsi128_ps(mValue));
#elif defined(JPH_USE_NEON)
int32x4_t shift = JPH_NEON_INT32x4(0, 1, 2, 3);
return vaddvq_u32(vshlq_u32(vshrq_n_u32(mValue, 31), shift));
#else
return (mU32[0] >> 31) | ((mU32[1] >> 31) << 1) | ((mU32[2] >> 31) << 2) | ((mU32[3] >> 31) << 3);
#endif
}
bool UVec4::TestAnyTrue() const
{
return GetTrues() != 0;
}
bool UVec4::TestAnyXYZTrue() const
{
return (GetTrues() & 0b111) != 0;
}
bool UVec4::TestAllTrue() const
{
return GetTrues() == 0b1111;
}
bool UVec4::TestAllXYZTrue() const
{
return (GetTrues() & 0b111) == 0b111;
}
template <const uint Count>
UVec4 UVec4::LogicalShiftLeft() const
{
static_assert(Count <= 31, "Invalid shift");
#if defined(JPH_USE_SSE)
return _mm_slli_epi32(mValue, Count);
#elif defined(JPH_USE_NEON)
return vshlq_n_u32(mValue, Count);
#else
return UVec4(mU32[0] << Count, mU32[1] << Count, mU32[2] << Count, mU32[3] << Count);
#endif
}
template <const uint Count>
UVec4 UVec4::LogicalShiftRight() const
{
static_assert(Count <= 31, "Invalid shift");
#if defined(JPH_USE_SSE)
return _mm_srli_epi32(mValue, Count);
#elif defined(JPH_USE_NEON)
return vshrq_n_u32(mValue, Count);
#else
return UVec4(mU32[0] >> Count, mU32[1] >> Count, mU32[2] >> Count, mU32[3] >> Count);
#endif
}
template <const uint Count>
UVec4 UVec4::ArithmeticShiftRight() const
{
static_assert(Count <= 31, "Invalid shift");
#if defined(JPH_USE_SSE)
return _mm_srai_epi32(mValue, Count);
#elif defined(JPH_USE_NEON)
return vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(mValue), Count));
#else
return UVec4(uint32(int32_t(mU32[0]) >> Count),
uint32(int32_t(mU32[1]) >> Count),
uint32(int32_t(mU32[2]) >> Count),
uint32(int32_t(mU32[3]) >> Count));
#endif
}
UVec4 UVec4::Expand4Uint16Lo() const
{
#if defined(JPH_USE_SSE)
return _mm_unpacklo_epi16(mValue, _mm_castps_si128(_mm_setzero_ps()));
#elif defined(JPH_USE_NEON)
uint16x4_t value = vget_low_u16(vreinterpretq_u16_u32(mValue));
uint16x4_t zero = vdup_n_u16(0);
return vreinterpretq_u32_u16(vcombine_u16(vzip1_u16(value, zero), vzip2_u16(value, zero)));
#else
return UVec4(mU32[0] & 0xffff,
(mU32[0] >> 16) & 0xffff,
mU32[1] & 0xffff,
(mU32[1] >> 16) & 0xffff);
#endif
}
UVec4 UVec4::Expand4Uint16Hi() const
{
#if defined(JPH_USE_SSE)
return _mm_unpackhi_epi16(mValue, _mm_castps_si128(_mm_setzero_ps()));
#elif defined(JPH_USE_NEON)
uint16x4_t value = vget_high_u16(vreinterpretq_u16_u32(mValue));
uint16x4_t zero = vdup_n_u16(0);
return vreinterpretq_u32_u16(vcombine_u16(vzip1_u16(value, zero), vzip2_u16(value, zero)));
#else
return UVec4(mU32[2] & 0xffff,
(mU32[2] >> 16) & 0xffff,
mU32[3] & 0xffff,
(mU32[3] >> 16) & 0xffff);
#endif
}
UVec4 UVec4::Expand4Byte0() const
{
#if defined(JPH_USE_SSE4_1)
return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff03), int(0xffffff02), int(0xffffff01), int(0xffffff00)));
#elif defined(JPH_USE_NEON)
uint8x16_t idx = JPH_NEON_UINT8x16(0x00, 0x7f, 0x7f, 0x7f, 0x01, 0x7f, 0x7f, 0x7f, 0x02, 0x7f, 0x7f, 0x7f, 0x03, 0x7f, 0x7f, 0x7f);
return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
UVec4 result;
for (int i = 0; i < 4; i++)
result.mU32[i] = (mU32[0] >> (i * 8)) & 0xff;
return result;
#endif
}
UVec4 UVec4::Expand4Byte4() const
{
#if defined(JPH_USE_SSE4_1)
return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff07), int(0xffffff06), int(0xffffff05), int(0xffffff04)));
#elif defined(JPH_USE_NEON)
uint8x16_t idx = JPH_NEON_UINT8x16(0x04, 0x7f, 0x7f, 0x7f, 0x05, 0x7f, 0x7f, 0x7f, 0x06, 0x7f, 0x7f, 0x7f, 0x07, 0x7f, 0x7f, 0x7f);
return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
UVec4 result;
for (int i = 0; i < 4; i++)
result.mU32[i] = (mU32[1] >> (i * 8)) & 0xff;
return result;
#endif
}
UVec4 UVec4::Expand4Byte8() const
{
#if defined(JPH_USE_SSE4_1)
return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0b), int(0xffffff0a), int(0xffffff09), int(0xffffff08)));
#elif defined(JPH_USE_NEON)
uint8x16_t idx = JPH_NEON_UINT8x16(0x08, 0x7f, 0x7f, 0x7f, 0x09, 0x7f, 0x7f, 0x7f, 0x0a, 0x7f, 0x7f, 0x7f, 0x0b, 0x7f, 0x7f, 0x7f);
return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
UVec4 result;
for (int i = 0; i < 4; i++)
result.mU32[i] = (mU32[2] >> (i * 8)) & 0xff;
return result;
#endif
}
UVec4 UVec4::Expand4Byte12() const
{
#if defined(JPH_USE_SSE4_1)
return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0f), int(0xffffff0e), int(0xffffff0d), int(0xffffff0c)));
#elif defined(JPH_USE_NEON)
uint8x16_t idx = JPH_NEON_UINT8x16(0x0c, 0x7f, 0x7f, 0x7f, 0x0d, 0x7f, 0x7f, 0x7f, 0x0e, 0x7f, 0x7f, 0x7f, 0x0f, 0x7f, 0x7f, 0x7f);
return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
UVec4 result;
for (int i = 0; i < 4; i++)
result.mU32[i] = (mU32[3] >> (i * 8)) & 0xff;
return result;
#endif
}
UVec4 UVec4::ShiftComponents4Minus(int inCount) const
{
#if defined(JPH_USE_SSE4_1) || defined(JPH_USE_NEON)
alignas(UVec4) static constexpr uint32 sFourMinusXShuffle[5][4] =
{
{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff },
{ 0x0f0e0d0c, 0xffffffff, 0xffffffff, 0xffffffff },
{ 0x0b0a0908, 0x0f0e0d0c, 0xffffffff, 0xffffffff },
{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0xffffffff },
{ 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c }
};
#endif
#if defined(JPH_USE_SSE4_1)
return _mm_shuffle_epi8(mValue, *reinterpret_cast<const UVec4::Type *>(sFourMinusXShuffle[inCount]));
#elif defined(JPH_USE_NEON)
uint8x16_t idx = vreinterpretq_u8_u32(*reinterpret_cast<const UVec4::Type *>(sFourMinusXShuffle[inCount]));
return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
UVec4 result = UVec4::sZero();
for (int i = 0; i < inCount; i++)
result.mU32[i] = mU32[i + 4 - inCount];
return result;
#endif
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,71 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#include <Jolt/Math/Vec3.h>
JPH_NAMESPACE_BEGIN
static void sAddVertex(StaticArray<Vec3, 1026> &ioVertices, Vec3Arg inVertex)
{
bool found = false;
for (const Vec3 &v : ioVertices)
if (v == inVertex)
{
found = true;
break;
}
if (!found)
ioVertices.push_back(inVertex);
}
// Recursively subdivide the spherical triangle (inDir1, inDir2, inDir3), adding the edge midpoints
// (projected back onto the unit sphere) to ioVertices. inLevel controls the number of subdivision levels left.
static void sCreateVertices(StaticArray<Vec3, 1026> &ioVertices, Vec3Arg inDir1, Vec3Arg inDir2, Vec3Arg inDir3, int inLevel)
{
	// Midpoints of the 3 edges, normalized so they lie on the unit sphere
	Vec3 mid12 = (inDir1 + inDir2).Normalized();
	Vec3 mid23 = (inDir2 + inDir3).Normalized();
	Vec3 mid31 = (inDir3 + inDir1).Normalized();

	sAddVertex(ioVertices, mid12);
	sAddVertex(ioVertices, mid23);
	sAddVertex(ioVertices, mid31);

	// Recurse into the 4 sub triangles until the requested depth is reached
	if (inLevel > 0)
	{
		sCreateVertices(ioVertices, inDir1, mid12, mid31, inLevel - 1);
		sCreateVertices(ioVertices, mid12, mid23, mid31, inLevel - 1);
		sCreateVertices(ioVertices, mid12, inDir2, mid23, inLevel - 1);
		sCreateVertices(ioVertices, mid31, mid23, inDir3, inLevel - 1);
	}
}
// Precomputed set of vertices approximately uniformly covering the unit sphere,
// built by subdividing the 8 octant triangles of an octahedron 3 times.
const StaticArray<Vec3, 1026> Vec3::sUnitSphere = []() {
	constexpr int cLevel = 3;

	StaticArray<Vec3, 1026> vertices;

	// Seed with the six principal axis directions
	const Vec3 x = Vec3::sAxisX(), y = Vec3::sAxisY(), z = Vec3::sAxisZ();
	vertices.push_back(x);
	vertices.push_back(-x);
	vertices.push_back(y);
	vertices.push_back(-y);
	vertices.push_back(z);
	vertices.push_back(-z);

	// Subdivide each of the 8 octants (bit 0 = X sign, bit 1 = Y sign, bit 2 = Z sign)
	for (int octant = 0; octant < 8; ++octant)
	{
		Vec3 dir1 = (octant & 1) != 0 ? -x : x;
		Vec3 dir2 = (octant & 2) != 0 ? -y : y;
		Vec3 dir3 = (octant & 4) != 0 ? -z : z;
		sCreateVertices(vertices, dir1, dir2, dir3, cLevel);
	}

	return vertices;
}();
JPH_NAMESPACE_END

298
thirdparty/jolt_physics/Jolt/Math/Vec3.h vendored Normal file
View File

@@ -0,0 +1,298 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/StaticArray.h>
#include <Jolt/Math/Float3.h>
#include <Jolt/Math/Swizzle.h>
#include <Jolt/Math/MathTypes.h>
JPH_NAMESPACE_BEGIN
/// 3 component vector (stored as 4 vectors).
/// Note that we keep the 4th component the same as the 3rd component to avoid divisions by zero when JPH_FLOATING_POINT_EXCEPTIONS_ENABLED defined
class [[nodiscard]] alignas(JPH_VECTOR_ALIGNMENT) Vec3
{
public:
	JPH_OVERRIDE_NEW_DELETE

	// Underlying vector type (native SIMD register where available, array fallback otherwise)
	#if defined(JPH_USE_SSE)
	using Type = __m128;
	#elif defined(JPH_USE_NEON)
	using Type = float32x4_t;
	#else
	using Type = Vec4::Type;
	#endif

	// Argument type (passed by value in registers where the ABI allows, see Vec3Arg)
	using ArgType = Vec3Arg;

	/// Constructor
	Vec3() = default; ///< Intentionally not initialized for performance reasons
	Vec3(const Vec3 &inRHS) = default;
	Vec3 & operator = (const Vec3 &inRHS) = default;
	explicit JPH_INLINE Vec3(Vec4Arg inRHS);
	JPH_INLINE Vec3(Type inRHS) : mValue(inRHS) { CheckW(); }

	/// Load 3 floats from memory
	explicit JPH_INLINE Vec3(const Float3 &inV);

	/// Create a vector from 3 components
	JPH_INLINE Vec3(float inX, float inY, float inZ);

	/// Vector with all zeros
	static JPH_INLINE Vec3 sZero();

	/// Vector with all ones
	static JPH_INLINE Vec3 sOne();

	/// Vector with all NaN's
	static JPH_INLINE Vec3 sNaN();

	/// Vectors with the principal axis
	static JPH_INLINE Vec3 sAxisX() { return Vec3(1, 0, 0); }
	static JPH_INLINE Vec3 sAxisY() { return Vec3(0, 1, 0); }
	static JPH_INLINE Vec3 sAxisZ() { return Vec3(0, 0, 1); }

	/// Replicate inV across all components
	static JPH_INLINE Vec3 sReplicate(float inV);

	/// Load 3 floats from memory (reads 32 bits extra which it doesn't use)
	static JPH_INLINE Vec3 sLoadFloat3Unsafe(const Float3 &inV);

	/// Return the minimum value of each of the components
	static JPH_INLINE Vec3 sMin(Vec3Arg inV1, Vec3Arg inV2);

	/// Return the maximum of each of the components
	static JPH_INLINE Vec3 sMax(Vec3Arg inV1, Vec3Arg inV2);

	/// Clamp a vector between min and max (component wise)
	static JPH_INLINE Vec3 sClamp(Vec3Arg inV, Vec3Arg inMin, Vec3Arg inMax);

	/// Equals (component wise)
	static JPH_INLINE UVec4 sEquals(Vec3Arg inV1, Vec3Arg inV2);

	/// Less than (component wise)
	static JPH_INLINE UVec4 sLess(Vec3Arg inV1, Vec3Arg inV2);

	/// Less than or equal (component wise)
	static JPH_INLINE UVec4 sLessOrEqual(Vec3Arg inV1, Vec3Arg inV2);

	/// Greater than (component wise)
	static JPH_INLINE UVec4 sGreater(Vec3Arg inV1, Vec3Arg inV2);

	/// Greater than or equal (component wise)
	static JPH_INLINE UVec4 sGreaterOrEqual(Vec3Arg inV1, Vec3Arg inV2);

	/// Calculates inMul1 * inMul2 + inAdd
	static JPH_INLINE Vec3 sFusedMultiplyAdd(Vec3Arg inMul1, Vec3Arg inMul2, Vec3Arg inAdd);

	/// Component wise select, returns inNotSet when highest bit of inControl = 0 and inSet when highest bit of inControl = 1
	static JPH_INLINE Vec3 sSelect(Vec3Arg inNotSet, Vec3Arg inSet, UVec4Arg inControl);

	/// Logical or (component wise)
	static JPH_INLINE Vec3 sOr(Vec3Arg inV1, Vec3Arg inV2);

	/// Logical xor (component wise)
	static JPH_INLINE Vec3 sXor(Vec3Arg inV1, Vec3Arg inV2);

	/// Logical and (component wise)
	static JPH_INLINE Vec3 sAnd(Vec3Arg inV1, Vec3Arg inV2);

	/// Get unit vector given spherical coordinates
	/// inTheta \f$\in [0, \pi]\f$ is angle between vector and z-axis
	/// inPhi \f$\in [0, 2 \pi]\f$ is the angle in the xy-plane starting from the x axis and rotating counter clockwise around the z-axis
	static JPH_INLINE Vec3 sUnitSpherical(float inTheta, float inPhi);

	/// A set of vectors uniformly spanning the surface of a unit sphere, usable for debug purposes
	JPH_EXPORT static const StaticArray<Vec3, 1026> sUnitSphere;

	/// Get random unit vector
	template <class Random>
	static inline Vec3 sRandom(Random &inRandom);

	/// Get individual components
	#if defined(JPH_USE_SSE)
	JPH_INLINE float GetX() const { return _mm_cvtss_f32(mValue); }
	JPH_INLINE float GetY() const { return mF32[1]; }
	JPH_INLINE float GetZ() const { return mF32[2]; }
	#elif defined(JPH_USE_NEON)
	JPH_INLINE float GetX() const { return vgetq_lane_f32(mValue, 0); }
	JPH_INLINE float GetY() const { return vgetq_lane_f32(mValue, 1); }
	JPH_INLINE float GetZ() const { return vgetq_lane_f32(mValue, 2); }
	#else
	JPH_INLINE float GetX() const { return mF32[0]; }
	JPH_INLINE float GetY() const { return mF32[1]; }
	JPH_INLINE float GetZ() const { return mF32[2]; }
	#endif

	/// Set individual components
	JPH_INLINE void SetX(float inX) { mF32[0] = inX; }
	JPH_INLINE void SetY(float inY) { mF32[1] = inY; }
	JPH_INLINE void SetZ(float inZ) { mF32[2] = mF32[3] = inZ; } // Assure Z and W are the same

	/// Set all components
	JPH_INLINE void Set(float inX, float inY, float inZ) { *this = Vec3(inX, inY, inZ); }

	/// Get float component by index
	JPH_INLINE float operator [] (uint inCoordinate) const { JPH_ASSERT(inCoordinate < 3); return mF32[inCoordinate]; }

	/// Set float component by index
	JPH_INLINE void SetComponent(uint inCoordinate, float inValue) { JPH_ASSERT(inCoordinate < 3); mF32[inCoordinate] = inValue; mValue = sFixW(mValue); } // Assure Z and W are the same

	/// Comparison
	JPH_INLINE bool operator == (Vec3Arg inV2) const;
	JPH_INLINE bool operator != (Vec3Arg inV2) const { return !(*this == inV2); }

	/// Test if two vectors are close
	JPH_INLINE bool IsClose(Vec3Arg inV2, float inMaxDistSq = 1.0e-12f) const;

	/// Test if vector is near zero
	JPH_INLINE bool IsNearZero(float inMaxDistSq = 1.0e-12f) const;

	/// Test if vector is normalized
	JPH_INLINE bool IsNormalized(float inTolerance = 1.0e-6f) const;

	/// Test if vector contains NaN elements
	JPH_INLINE bool IsNaN() const;

	/// Multiply two float vectors (component wise)
	JPH_INLINE Vec3 operator * (Vec3Arg inV2) const;

	/// Multiply vector with float
	JPH_INLINE Vec3 operator * (float inV2) const;

	/// Multiply vector with float
	friend JPH_INLINE Vec3 operator * (float inV1, Vec3Arg inV2);

	/// Divide vector by float
	JPH_INLINE Vec3 operator / (float inV2) const;

	/// Multiply vector with float
	JPH_INLINE Vec3 & operator *= (float inV2);

	/// Multiply vector with vector
	JPH_INLINE Vec3 & operator *= (Vec3Arg inV2);

	/// Divide vector by float
	JPH_INLINE Vec3 & operator /= (float inV2);

	/// Add two float vectors (component wise)
	JPH_INLINE Vec3 operator + (Vec3Arg inV2) const;

	/// Add two float vectors (component wise)
	JPH_INLINE Vec3 & operator += (Vec3Arg inV2);

	/// Negate
	JPH_INLINE Vec3 operator - () const;

	/// Subtract two float vectors (component wise)
	JPH_INLINE Vec3 operator - (Vec3Arg inV2) const;

	/// Subtract two float vectors (component wise)
	JPH_INLINE Vec3 & operator -= (Vec3Arg inV2);

	/// Divide (component wise)
	JPH_INLINE Vec3 operator / (Vec3Arg inV2) const;

	/// Swizzle the elements in inV
	template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ>
	JPH_INLINE Vec3 Swizzle() const;

	/// Replicate the X component to all components
	JPH_INLINE Vec4 SplatX() const;

	/// Replicate the Y component to all components
	JPH_INLINE Vec4 SplatY() const;

	/// Replicate the Z component to all components
	JPH_INLINE Vec4 SplatZ() const;

	/// Get index of component with lowest value
	JPH_INLINE int GetLowestComponentIndex() const;

	/// Get index of component with highest value
	JPH_INLINE int GetHighestComponentIndex() const;

	/// Return the absolute value of each of the components
	JPH_INLINE Vec3 Abs() const;

	/// Reciprocal vector (1 / value) for each of the components
	JPH_INLINE Vec3 Reciprocal() const;

	/// Cross product
	JPH_INLINE Vec3 Cross(Vec3Arg inV2) const;

	/// Dot product, returns the dot product in X, Y and Z components
	JPH_INLINE Vec3 DotV(Vec3Arg inV2) const;

	/// Dot product, returns the dot product in X, Y, Z and W components
	JPH_INLINE Vec4 DotV4(Vec3Arg inV2) const;

	/// Dot product
	JPH_INLINE float Dot(Vec3Arg inV2) const;

	/// Squared length of vector
	JPH_INLINE float LengthSq() const;

	/// Length of vector
	JPH_INLINE float Length() const;

	/// Normalize vector
	JPH_INLINE Vec3 Normalized() const;

	/// Normalize vector or return inZeroValue if the length of the vector is zero
	JPH_INLINE Vec3 NormalizedOr(Vec3Arg inZeroValue) const;

	/// Store 3 floats to memory
	JPH_INLINE void StoreFloat3(Float3 *outV) const;

	/// Convert each component from a float to an int
	JPH_INLINE UVec4 ToInt() const;

	/// Reinterpret Vec3 as a UVec4 (doesn't change the bits)
	JPH_INLINE UVec4 ReinterpretAsInt() const;

	/// Get the minimum of X, Y and Z
	JPH_INLINE float ReduceMin() const;

	/// Get the maximum of X, Y and Z
	JPH_INLINE float ReduceMax() const;

	/// Component wise square root
	JPH_INLINE Vec3 Sqrt() const;

	/// Get normalized vector that is perpendicular to this vector
	JPH_INLINE Vec3 GetNormalizedPerpendicular() const;

	/// Get vector that contains the sign of each element (returns 1.0f if positive, -1.0f if negative)
	JPH_INLINE Vec3 GetSign() const;

	/// To String
	friend ostream & operator << (ostream &inStream, Vec3Arg inV)
	{
		inStream << inV.mF32[0] << ", " << inV.mF32[1] << ", " << inV.mF32[2];
		return inStream;
	}

	/// Internal helper function that checks that W is equal to Z, so e.g. dividing by it should not generate div by 0
	JPH_INLINE void CheckW() const;

	/// Internal helper function that ensures that the Z component is replicated to the W component to prevent divisions by zero
	static JPH_INLINE Type sFixW(Type inValue);

	// Storage: accessible both as the native SIMD register and as 4 floats (W mirrors Z, see class comment)
	union
	{
		Type mValue;
		float mF32[4];
	};
};
static_assert(std::is_trivial<Vec3>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
#include "Vec3.inl"

View File

@@ -0,0 +1,860 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Math/Vec4.h>
#include <Jolt/Math/UVec4.h>
#include <Jolt/Core/HashCombine.h>
JPH_SUPPRESS_WARNINGS_STD_BEGIN
#include <random>
JPH_SUPPRESS_WARNINGS_STD_END
// Create a std::hash/JPH::Hash for Vec3
JPH_MAKE_HASHABLE(JPH::Vec3, t.GetX(), t.GetY(), t.GetZ())
JPH_NAMESPACE_BEGIN
// Assert that the W component is bitwise equal to the Z component (bit compare so that two identical NaNs also pass)
void Vec3::CheckW() const
{
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	// Avoid asserts when both components are NaN
	JPH_ASSERT(reinterpret_cast<const uint32 *>(mF32)[2] == reinterpret_cast<const uint32 *>(mF32)[3]);
#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
}

// Replicate the Z component into the (unused) W component so that dividing by W cannot raise a div-by-zero exception
JPH_INLINE Vec3::Type Vec3::sFixW(Type inValue)
{
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(inValue, inValue, _MM_SHUFFLE(2, 2, 1, 0));
#elif defined(JPH_USE_NEON)
	return JPH_NEON_SHUFFLE_F32x4(inValue, inValue, 0, 1, 2, 2);
#else
	Type value;
	value.mData[0] = inValue.mData[0];
	value.mData[1] = inValue.mData[1];
	value.mData[2] = inValue.mData[2];
	value.mData[3] = inValue.mData[2];
	return value;
#endif
#else
	// Without floating point exceptions enabled the W component can be left as-is
	return inValue;
#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
}
// Convert a Vec4 to a Vec3, replicating Z into W
Vec3::Vec3(Vec4Arg inRHS) :
	mValue(sFixW(inRHS.mValue))
{
}

// Load 3 floats from a Float3 (safe: reads exactly 12 bytes)
Vec3::Vec3(const Float3 &inV)
{
#if defined(JPH_USE_SSE)
	Type x = _mm_load_ss(&inV.x);
	Type y = _mm_load_ss(&inV.y);
	Type z = _mm_load_ss(&inV.z);
	Type xy = _mm_unpacklo_ps(x, y);
	mValue = _mm_shuffle_ps(xy, z, _MM_SHUFFLE(0, 0, 1, 0)); // Assure Z and W are the same
#elif defined(JPH_USE_NEON)
	float32x2_t xy = vld1_f32(&inV.x);
	float32x2_t zz = vdup_n_f32(inV.z); // Assure Z and W are the same
	mValue = vcombine_f32(xy, zz);
#else
	mF32[0] = inV[0];
	mF32[1] = inV[1];
	mF32[2] = inV[2];
	mF32[3] = inV[2]; // Not strictly needed when JPH_FLOATING_POINT_EXCEPTIONS_ENABLED is off but prevents warnings about uninitialized variables
#endif
}

// Construct from 3 components; W is set equal to Z
Vec3::Vec3(float inX, float inY, float inZ)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_set_ps(inZ, inZ, inY, inX);
#elif defined(JPH_USE_NEON)
	// Pack the X and Y bit patterns into a 64 bit half, duplicate Z into the other half
	uint32x2_t xy = vcreate_u32(static_cast<uint64>(BitCast<uint32>(inX)) | (static_cast<uint64>(BitCast<uint32>(inY)) << 32));
	uint32x2_t zz = vreinterpret_u32_f32(vdup_n_f32(inZ));
	mValue = vreinterpretq_f32_u32(vcombine_u32(xy, zz));
#else
	mF32[0] = inX;
	mF32[1] = inY;
	mF32[2] = inZ;
	mF32[3] = inZ; // Not strictly needed when JPH_FLOATING_POINT_EXCEPTIONS_ENABLED is off but prevents warnings about uninitialized variables
#endif
}
// Reorder components according to the compile time swizzle indices; W copies the resulting Z lane
template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ>
Vec3 Vec3::Swizzle() const
{
	static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
	static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
	static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");

#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(SwizzleZ, SwizzleZ, SwizzleY, SwizzleX)); // Assure Z and W are the same
#elif defined(JPH_USE_NEON)
	return JPH_NEON_SHUFFLE_F32x4(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleZ);
#else
	return Vec3(mF32[SwizzleX], mF32[SwizzleY], mF32[SwizzleZ]);
#endif
}
// All components zero
Vec3 Vec3::sZero()
{
#if defined(JPH_USE_SSE)
	return _mm_setzero_ps();
#elif defined(JPH_USE_NEON)
	return vdupq_n_f32(0);
#else
	return Vec3(0, 0, 0);
#endif
}

// Broadcast inV into all components
Vec3 Vec3::sReplicate(float inV)
{
#if defined(JPH_USE_SSE)
	return _mm_set1_ps(inV);
#elif defined(JPH_USE_NEON)
	return vdupq_n_f32(inV);
#else
	return Vec3(inV, inV, inV);
#endif
}

// All components one
Vec3 Vec3::sOne()
{
	return sReplicate(1.0f);
}

// All components quiet NaN (useful to detect use of uninitialized data)
Vec3 Vec3::sNaN()
{
	return sReplicate(numeric_limits<float>::quiet_NaN());
}

// Load 3 floats; reads 4 floats from memory so the source must have at least 4 bytes of padding after it
Vec3 Vec3::sLoadFloat3Unsafe(const Float3 &inV)
{
#if defined(JPH_USE_SSE)
	Type v = _mm_loadu_ps(&inV.x);
#elif defined(JPH_USE_NEON)
	Type v = vld1q_f32(&inV.x);
#else
	Type v = { inV.x, inV.y, inV.z };
#endif
	return sFixW(v);
}
// Component wise minimum
Vec3 Vec3::sMin(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_min_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vminq_f32(inV1.mValue, inV2.mValue);
#else
	return Vec3(min(inV1.mF32[0], inV2.mF32[0]),
				min(inV1.mF32[1], inV2.mF32[1]),
				min(inV1.mF32[2], inV2.mF32[2]));
#endif
}

// Component wise maximum
Vec3 Vec3::sMax(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_max_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmaxq_f32(inV1.mValue, inV2.mValue);
#else
	return Vec3(max(inV1.mF32[0], inV2.mF32[0]),
				max(inV1.mF32[1], inV2.mF32[1]),
				max(inV1.mF32[2], inV2.mF32[2]));
#endif
}

// Clamp each component of inV to [inMin, inMax]
Vec3 Vec3::sClamp(Vec3Arg inV, Vec3Arg inMin, Vec3Arg inMax)
{
	return sMax(sMin(inV, inMax), inMin);
}
// Component wise ==, result components are all-ones (true) or zero (false); W mirrors Z so that all-true tests also pass on W
UVec4 Vec3::sEquals(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpeq_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vceqq_f32(inV1.mValue, inV2.mValue);
#else
	uint32 z = inV1.mF32[2] == inV2.mF32[2]? 0xffffffffu : 0;
	return UVec4(inV1.mF32[0] == inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] == inV2.mF32[1]? 0xffffffffu : 0,
				 z,
				 z);
#endif
}

// Component wise <
UVec4 Vec3::sLess(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmplt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcltq_f32(inV1.mValue, inV2.mValue);
#else
	uint32 z = inV1.mF32[2] < inV2.mF32[2]? 0xffffffffu : 0;
	return UVec4(inV1.mF32[0] < inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] < inV2.mF32[1]? 0xffffffffu : 0,
				 z,
				 z);
#endif
}

// Component wise <=
UVec4 Vec3::sLessOrEqual(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmple_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcleq_f32(inV1.mValue, inV2.mValue);
#else
	uint32 z = inV1.mF32[2] <= inV2.mF32[2]? 0xffffffffu : 0;
	return UVec4(inV1.mF32[0] <= inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] <= inV2.mF32[1]? 0xffffffffu : 0,
				 z,
				 z);
#endif
}

// Component wise >
UVec4 Vec3::sGreater(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpgt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcgtq_f32(inV1.mValue, inV2.mValue);
#else
	uint32 z = inV1.mF32[2] > inV2.mF32[2]? 0xffffffffu : 0;
	return UVec4(inV1.mF32[0] > inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] > inV2.mF32[1]? 0xffffffffu : 0,
				 z,
				 z);
#endif
}

// Component wise >=
UVec4 Vec3::sGreaterOrEqual(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpge_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcgeq_f32(inV1.mValue, inV2.mValue);
#else
	uint32 z = inV1.mF32[2] >= inV2.mF32[2]? 0xffffffffu : 0;
	return UVec4(inV1.mF32[0] >= inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] >= inV2.mF32[1]? 0xffffffffu : 0,
				 z,
				 z);
#endif
}
// inMul1 * inMul2 + inAdd, using a single fused instruction where the target supports it
Vec3 Vec3::sFusedMultiplyAdd(Vec3Arg inMul1, Vec3Arg inMul2, Vec3Arg inAdd)
{
#if defined(JPH_USE_SSE)
#ifdef JPH_USE_FMADD
	return _mm_fmadd_ps(inMul1.mValue, inMul2.mValue, inAdd.mValue);
#else
	return _mm_add_ps(_mm_mul_ps(inMul1.mValue, inMul2.mValue), inAdd.mValue);
#endif
#elif defined(JPH_USE_NEON)
	return vmlaq_f32(inAdd.mValue, inMul1.mValue, inMul2.mValue);
#else
	return Vec3(inMul1.mF32[0] * inMul2.mF32[0] + inAdd.mF32[0],
				inMul1.mF32[1] * inMul2.mF32[1] + inAdd.mF32[1],
				inMul1.mF32[2] * inMul2.mF32[2] + inAdd.mF32[2]);
#endif
}

// Per component: highest bit of inControl set selects inSet, otherwise inNotSet; W is fixed up to mirror Z afterwards
Vec3 Vec3::sSelect(Vec3Arg inNotSet, Vec3Arg inSet, UVec4Arg inControl)
{
#if defined(JPH_USE_SSE4_1) && !defined(JPH_PLATFORM_WASM) // _mm_blendv_ps has problems on FireFox
	Type v = _mm_blendv_ps(inNotSet.mValue, inSet.mValue, _mm_castsi128_ps(inControl.mValue));
	return sFixW(v);
#elif defined(JPH_USE_SSE)
	// Broadcast the sign bit of each control lane over the whole lane, then mask-merge
	__m128 is_set = _mm_castsi128_ps(_mm_srai_epi32(inControl.mValue, 31));
	Type v = _mm_or_ps(_mm_and_ps(is_set, inSet.mValue), _mm_andnot_ps(is_set, inNotSet.mValue));
	return sFixW(v);
#elif defined(JPH_USE_NEON)
	Type v = vbslq_f32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.mValue), 31)), inSet.mValue, inNotSet.mValue);
	return sFixW(v);
#else
	Vec3 result;
	for (int i = 0; i < 3; i++)
		result.mF32[i] = (inControl.mU32[i] & 0x80000000u) ? inSet.mF32[i] : inNotSet.mF32[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	result.mF32[3] = result.mF32[2];
#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	return result;
#endif
}
// Bitwise OR of the float bit patterns
Vec3 Vec3::sOr(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_or_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
	return Vec3(UVec4::sOr(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat());
#endif
}

// Bitwise XOR of the float bit patterns
Vec3 Vec3::sXor(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_xor_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
	return Vec3(UVec4::sXor(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat());
#endif
}

// Bitwise AND of the float bit patterns
Vec3 Vec3::sAnd(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_and_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
	return Vec3(UVec4::sAnd(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat());
#endif
}

// Unit vector from spherical coordinates: (sin(theta) cos(phi), sin(theta) sin(phi), cos(theta))
Vec3 Vec3::sUnitSpherical(float inTheta, float inPhi)
{
	Vec4 s, c;
	Vec4(inTheta, inPhi, 0, 0).SinCos(s, c);
	return Vec3(s.GetX() * c.GetY(), s.GetX() * s.GetY(), c.GetX());
}
// Random unit vector
// NOTE(review): theta is sampled uniformly in [0, pi], which biases points towards the poles
// compared to an area-uniform sphere distribution (that would need cos(theta) uniform) — confirm this is intended
template <class Random>
Vec3 Vec3::sRandom(Random &inRandom)
{
	std::uniform_real_distribution<float> zero_to_one(0.0f, 1.0f);
	float theta = JPH_PI * zero_to_one(inRandom);
	float phi = 2.0f * JPH_PI * zero_to_one(inRandom);
	return sUnitSpherical(theta, phi);
}

// True when all of X, Y and Z are equal (exact float compare)
bool Vec3::operator == (Vec3Arg inV2) const
{
	return sEquals(*this, inV2).TestAllXYZTrue();
}

// True when the squared distance between the vectors is at most inMaxDistSq
bool Vec3::IsClose(Vec3Arg inV2, float inMaxDistSq) const
{
	return (inV2 - *this).LengthSq() <= inMaxDistSq;
}

// True when the squared length is at most inMaxDistSq
bool Vec3::IsNearZero(float inMaxDistSq) const
{
	return LengthSq() <= inMaxDistSq;
}
// Component wise multiply
Vec3 Vec3::operator * (Vec3Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmulq_f32(mValue, inV2.mValue);
#else
	return Vec3(mF32[0] * inV2.mF32[0], mF32[1] * inV2.mF32[1], mF32[2] * inV2.mF32[2]);
#endif
}

// Scale by a scalar
Vec3 Vec3::operator * (float inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	return vmulq_n_f32(mValue, inV2);
#else
	return Vec3(mF32[0] * inV2, mF32[1] * inV2, mF32[2] * inV2);
#endif
}

// Scale by a scalar (scalar on the left)
Vec3 operator * (float inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(_mm_set1_ps(inV1), inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmulq_n_f32(inV2.mValue, inV1);
#else
	return Vec3(inV1 * inV2.mF32[0], inV1 * inV2.mF32[1], inV1 * inV2.mF32[2]);
#endif
}

// Divide by a scalar
Vec3 Vec3::operator / (float inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	return vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
	return Vec3(mF32[0] / inV2, mF32[1] / inV2, mF32[2] / inV2);
#endif
}
// In-place scale by a scalar
Vec3 &Vec3::operator *= (float inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	mValue = vmulq_n_f32(mValue, inV2);
#else
	for (int i = 0; i < 3; ++i)
		mF32[i] *= inV2;
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF32[3] = mF32[2]; // Keep W in sync with Z
#endif
#endif
	return *this;
}

// In-place component wise multiply
Vec3 &Vec3::operator *= (Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vmulq_f32(mValue, inV2.mValue);
#else
	for (int i = 0; i < 3; ++i)
		mF32[i] *= inV2.mF32[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF32[3] = mF32[2]; // Keep W in sync with Z
#endif
#endif
	return *this;
}

// In-place divide by a scalar
Vec3 &Vec3::operator /= (float inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	mValue = vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
	for (int i = 0; i < 3; ++i)
		mF32[i] /= inV2;
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF32[3] = mF32[2]; // Keep W in sync with Z
#endif
#endif
	return *this;
}
// Component wise add
Vec3 Vec3::operator + (Vec3Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vaddq_f32(mValue, inV2.mValue);
#else
	return Vec3(mF32[0] + inV2.mF32[0], mF32[1] + inV2.mF32[1], mF32[2] + inV2.mF32[2]);
#endif
}

// In-place component wise add
Vec3 &Vec3::operator += (Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vaddq_f32(mValue, inV2.mValue);
#else
	for (int i = 0; i < 3; ++i)
		mF32[i] += inV2.mF32[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF32[3] = mF32[2]; // Keep W in sync with Z
#endif
#endif
	return *this;
}

// Negation; the deterministic variant uses 0 - x so that negating 0 yields +0 on all platforms
Vec3 Vec3::operator - () const
{
#if defined(JPH_USE_SSE)
	return _mm_sub_ps(_mm_setzero_ps(), mValue);
#elif defined(JPH_USE_NEON)
	#ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
	return vsubq_f32(vdupq_n_f32(0), mValue);
	#else
	return vnegq_f32(mValue);
	#endif
#else
	#ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
	return Vec3(0.0f - mF32[0], 0.0f - mF32[1], 0.0f - mF32[2]);
	#else
	return Vec3(-mF32[0], -mF32[1], -mF32[2]);
	#endif
#endif
}
// Component wise subtract
Vec3 Vec3::operator - (Vec3Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vsubq_f32(mValue, inV2.mValue);
#else
	return Vec3(mF32[0] - inV2.mF32[0], mF32[1] - inV2.mF32[1], mF32[2] - inV2.mF32[2]);
#endif
}

// In-place component wise subtract
Vec3 &Vec3::operator -= (Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vsubq_f32(mValue, inV2.mValue);
#else
	for (int i = 0; i < 3; ++i)
		mF32[i] -= inV2.mF32[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF32[3] = mF32[2]; // Keep W in sync with Z
#endif
#endif
	return *this;
}

// Component wise divide; the W lanes are also divided, hence the CheckW guard on the divisor
Vec3 Vec3::operator / (Vec3Arg inV2) const
{
	inV2.CheckW(); // Check W equals Z to avoid div by zero
#if defined(JPH_USE_SSE)
	return _mm_div_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vdivq_f32(mValue, inV2.mValue);
#else
	return Vec3(mF32[0] / inV2.mF32[0], mF32[1] / inV2.mF32[1], mF32[2] / inV2.mF32[2]);
#endif
}
// Broadcast the X component to all 4 lanes of a Vec4
Vec4 Vec3::SplatX() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 0);
#else
	return Vec4(mF32[0], mF32[0], mF32[0], mF32[0]);
#endif
}

// Broadcast the Y component to all 4 lanes of a Vec4
Vec4 Vec3::SplatY() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 1);
#else
	return Vec4(mF32[1], mF32[1], mF32[1], mF32[1]);
#endif
}

// Broadcast the Z component to all 4 lanes of a Vec4
Vec4 Vec3::SplatZ() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 2);
#else
	return Vec4(mF32[2], mF32[2], mF32[2], mF32[2]);
#endif
}
int Vec3::GetLowestComponentIndex() const
{
return GetX() < GetY() ? (GetZ() < GetX() ? 2 : 0) : (GetZ() < GetY() ? 2 : 1);
}
int Vec3::GetHighestComponentIndex() const
{
return GetX() > GetY() ? (GetZ() > GetX() ? 2 : 0) : (GetZ() > GetY() ? 2 : 1);
}
// Component wise absolute value
Vec3 Vec3::Abs() const
{
#if defined(JPH_USE_AVX512)
	return _mm_range_ps(mValue, mValue, 0b1000);
#elif defined(JPH_USE_SSE)
	// max(-x, x) clears the sign bit for all finite values
	return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), mValue), mValue);
#elif defined(JPH_USE_NEON)
	return vabsq_f32(mValue);
#else
	return Vec3(abs(mF32[0]), abs(mF32[1]), abs(mF32[2]));
#endif
}

// Component wise 1 / value; relies on W mirroring Z so the W lane never divides by zero
Vec3 Vec3::Reciprocal() const
{
	return sOne() / mValue;
}
// Cross product, computed via the standard shuffle trick: (a.yzx * b - a * b.yzx).yzx
Vec3 Vec3::Cross(Vec3Arg inV2) const
{
#if defined(JPH_USE_SSE)
	Type t1 = _mm_shuffle_ps(inV2.mValue, inV2.mValue, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
	t1 = _mm_mul_ps(t1, mValue);
	Type t2 = _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
	t2 = _mm_mul_ps(t2, inV2.mValue);
	Type t3 = _mm_sub_ps(t1, t2);
	return _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
#elif defined(JPH_USE_NEON)
	Type t1 = JPH_NEON_SHUFFLE_F32x4(inV2.mValue, inV2.mValue, 1, 2, 0, 0); // Assure Z and W are the same
	t1 = vmulq_f32(t1, mValue);
	Type t2 = JPH_NEON_SHUFFLE_F32x4(mValue, mValue, 1, 2, 0, 0); // Assure Z and W are the same
	t2 = vmulq_f32(t2, inV2.mValue);
	Type t3 = vsubq_f32(t1, t2);
	return JPH_NEON_SHUFFLE_F32x4(t3, t3, 1, 2, 0, 0); // Assure Z and W are the same
#else
	return Vec3(mF32[1] * inV2.mF32[2] - mF32[2] * inV2.mF32[1],
				mF32[2] * inV2.mF32[0] - mF32[0] * inV2.mF32[2],
				mF32[0] * inV2.mF32[1] - mF32[1] * inV2.mF32[0]);
#endif
}
// 3-component dot product, broadcast into all components of a Vec3
Vec3 Vec3::DotV(Vec3Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_dp_ps(mValue, inV2.mValue, 0x7f); // Mask 0x7f: multiply XYZ lanes only, write result to all lanes
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	mul = vsetq_lane_f32(0, mul, 3); // Zero the W lane so it doesn't contribute to the sum
	return vdupq_n_f32(vaddvq_f32(mul));
#else
	float dot = 0.0f;
	for (int i = 0; i < 3; i++)
		dot += mF32[i] * inV2.mF32[i];
	return Vec3::sReplicate(dot);
#endif
}

// 3-component dot product, broadcast into all components of a Vec4
Vec4 Vec3::DotV4(Vec3Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_dp_ps(mValue, inV2.mValue, 0x7f);
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	mul = vsetq_lane_f32(0, mul, 3);
	return vdupq_n_f32(vaddvq_f32(mul));
#else
	float dot = 0.0f;
	for (int i = 0; i < 3; i++)
		dot += mF32[i] * inV2.mF32[i];
	return Vec4::sReplicate(dot);
#endif
}

// 3-component dot product as a scalar
float Vec3::Dot(Vec3Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0x7f));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	mul = vsetq_lane_f32(0, mul, 3);
	return vaddvq_f32(mul);
#else
	float dot = 0.0f;
	for (int i = 0; i < 3; i++)
		dot += mF32[i] * inV2.mF32[i];
	return dot;
#endif
}
// Squared length (dot product with itself)
float Vec3::LengthSq() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_dp_ps(mValue, mValue, 0x7f));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	mul = vsetq_lane_f32(0, mul, 3); // Zero the W lane so it doesn't contribute to the sum
	return vaddvq_f32(mul);
#else
	float len_sq = 0.0f;
	for (int i = 0; i < 3; i++)
		len_sq += mF32[i] * mF32[i];
	return len_sq;
#endif
}

// Length: sqrt of the squared length
float Vec3::Length() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(mValue, mValue, 0x7f)));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	mul = vsetq_lane_f32(0, mul, 3);
	float32x2_t sum = vdup_n_f32(vaddvq_f32(mul));
	return vget_lane_f32(vsqrt_f32(sum), 0);
#else
	return sqrt(LengthSq());
#endif
}
// Component wise square root
Vec3 Vec3::Sqrt() const
{
#if defined(JPH_USE_SSE)
	return _mm_sqrt_ps(mValue);
#elif defined(JPH_USE_NEON)
	return vsqrtq_f32(mValue);
#else
	return Vec3(sqrt(mF32[0]), sqrt(mF32[1]), sqrt(mF32[2]));
#endif
}

// Normalize; result is undefined (NaN/Inf) for a zero length vector, use NormalizedOr in that case
Vec3 Vec3::Normalized() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_div_ps(mValue, _mm_sqrt_ps(_mm_dp_ps(mValue, mValue, 0x7f)))
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	mul = vsetq_lane_f32(0, mul, 3);
	float32x4_t sum = vdupq_n_f32(vaddvq_f32(mul));
	return vdivq_f32(mValue, vsqrtq_f32(sum));
#else
	return *this / Length();
#endif
}
// Normalize, returning inZeroValue when the length is (near) zero
Vec3 Vec3::NormalizedOr(Vec3Arg inZeroValue) const
{
#if defined(JPH_USE_SSE4_1) && !defined(JPH_PLATFORM_WASM) // _mm_blendv_ps has problems on FireFox
	Type len_sq = _mm_dp_ps(mValue, mValue, 0x7f);
	// clang with '-ffast-math' (which you should not use!) can generate _mm_rsqrt_ps
	// instructions which produce INFs/NaNs when they get a denormal float as input.
	// We therefore treat denormals as zero here.
	Type is_zero = _mm_cmple_ps(len_sq, _mm_set1_ps(FLT_MIN));
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	// With FP exceptions enabled we may not execute the division when the length is zero, so branch instead of blending
	if (_mm_movemask_ps(is_zero) == 0xf)
		return inZeroValue;
	else
		return _mm_div_ps(mValue, _mm_sqrt_ps(len_sq));
#else
	return _mm_blendv_ps(_mm_div_ps(mValue, _mm_sqrt_ps(len_sq)), inZeroValue.mValue, is_zero);
#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	mul = vsetq_lane_f32(0, mul, 3);
	float32x4_t len_sq = vdupq_n_f32(vaddvq_f32(mul));
	uint32x4_t is_zero = vcleq_f32(len_sq, vdupq_n_f32(FLT_MIN));
	return vbslq_f32(is_zero, inZeroValue.mValue, vdivq_f32(mValue, vsqrtq_f32(len_sq)));
#else
	float len_sq = LengthSq();
	if (len_sq <= FLT_MIN) // Treat denormals as zero, see SSE path for rationale
		return inZeroValue;
	else
		return *this / sqrt(len_sq);
#endif
}
// True when the squared length is within inTolerance of 1
bool Vec3::IsNormalized(float inTolerance) const
{
	return abs(LengthSq() - 1.0f) <= inTolerance;
}

// True when any of X, Y or Z is NaN (the W lane is ignored)
bool Vec3::IsNaN() const
{
#if defined(JPH_USE_AVX512)
	return (_mm_fpclass_ps_mask(mValue, 0b10000001) & 0x7) != 0;
#elif defined(JPH_USE_SSE)
	return (_mm_movemask_ps(_mm_cmpunord_ps(mValue, mValue)) & 0x7) != 0;
#elif defined(JPH_USE_NEON)
	uint32x4_t mask = JPH_NEON_UINT32x4(1, 1, 1, 0);
	uint32x4_t is_equal = vceqq_f32(mValue, mValue); // If a number is not equal to itself it's a NaN
	return vaddvq_u32(vandq_u32(is_equal, mask)) != 3;
#else
	return isnan(mF32[0]) || isnan(mF32[1]) || isnan(mF32[2]);
#endif
}
// Store X, Y and Z to memory (writes exactly 12 bytes)
void Vec3::StoreFloat3(Float3 *outV) const
{
#if defined(JPH_USE_SSE)
	_mm_store_ss(&outV->x, mValue);
	Vec3 t = Swizzle<SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_UNUSED>();
	_mm_store_ss(&outV->y, t.mValue);
	t = t.Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_UNUSED>();
	_mm_store_ss(&outV->z, t.mValue);
#elif defined(JPH_USE_NEON)
	float32x2_t xy = vget_low_f32(mValue);
	vst1_f32(&outV->x, xy);
	vst1q_lane_f32(&outV->z, mValue, 2);
#else
	outV->x = mF32[0];
	outV->y = mF32[1];
	outV->z = mF32[2];
#endif
}

// Truncate each component to an integer
UVec4 Vec3::ToInt() const
{
#if defined(JPH_USE_SSE)
	return _mm_cvttps_epi32(mValue);
#elif defined(JPH_USE_NEON)
	return vcvtq_u32_f32(mValue);
#else
	return UVec4(uint32(mF32[0]), uint32(mF32[1]), uint32(mF32[2]), uint32(mF32[3]));
#endif
}
UVec4 Vec3::ReinterpretAsInt() const
{
	// Bitwise reinterpretation of the 128-bit register, no numeric conversion takes place
#if defined(JPH_USE_SSE)
	return UVec4(_mm_castps_si128(mValue));
#elif defined(JPH_USE_NEON)
	return vreinterpretq_u32_f32(mValue);
#else
	return *reinterpret_cast<const UVec4 *>(this);
#endif
}
float Vec3::ReduceMin() const
{
	// Fold Y into X, then Z into X, then read the X lane: min(min(x, y), z)
	Vec3 v = sMin(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_Z>());
	v = sMin(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
	return v.GetX();
}
float Vec3::ReduceMax() const
{
	// Fold Y into X, then Z into X, then read the X lane: max(max(x, y), z)
	Vec3 v = sMax(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_Z>());
	v = sMax(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
	return v.GetX();
}
Vec3 Vec3::GetNormalizedPerpendicular() const
{
	// Build a perpendicular by zeroing one component and swapping the other two with a sign flip
	// (dot product with *this is zero by construction), branching on the larger of |X| and |Y|
	// to avoid dividing by a tiny length.
	// NOTE(review): a zero input vector makes len 0 and the division produce NaN - callers must pass a non-zero vector.
	if (abs(mF32[0]) > abs(mF32[1]))
	{
		float len = sqrt(mF32[0] * mF32[0] + mF32[2] * mF32[2]);
		return Vec3(mF32[2], 0.0f, -mF32[0]) / len;
	}
	else
	{
		float len = sqrt(mF32[1] * mF32[1] + mF32[2] * mF32[2]);
		return Vec3(0.0f, mF32[2], -mF32[1]) / len;
	}
}
Vec3 Vec3::GetSign() const
{
	// Per component: 1.0f when the sign bit is clear (incl. +0), -1.0f when it is set (incl. -0)
#if defined(JPH_USE_AVX512)
	// Fixup lookup table maps each float class directly onto +1 / -1
	return _mm_fixupimm_ps(mValue, mValue, _mm_set1_epi32(0xA9A90A00), 0);
#elif defined(JPH_USE_SSE)
	// AND with the bit pattern of -1.0f keeps the sign bit, OR with 1.0f forces the magnitude to one
	Type minus_one = _mm_set1_ps(-1.0f);
	Type one = _mm_set1_ps(1.0f);
	return _mm_or_ps(_mm_and_ps(mValue, minus_one), one);
#elif defined(JPH_USE_NEON)
	// Same sign-bit trick as the SSE path, done through integer reinterpretation
	Type minus_one = vdupq_n_f32(-1.0f);
	Type one = vdupq_n_f32(1.0f);
	return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(mValue), vreinterpretq_u32_f32(minus_one)), vreinterpretq_u32_f32(one)));
#else
	return Vec3(std::signbit(mF32[0])? -1.0f : 1.0f,
	std::signbit(mF32[1])? -1.0f : 1.0f,
	std::signbit(mF32[2])? -1.0f : 1.0f);
#endif
}
JPH_NAMESPACE_END

286
thirdparty/jolt_physics/Jolt/Math/Vec4.h vendored Normal file
View File

@@ -0,0 +1,286 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/Float4.h>
#include <Jolt/Math/Swizzle.h>
#include <Jolt/Math/MathTypes.h>
JPH_NAMESPACE_BEGIN
/// A 4 component vector of floats, backed by a 16-byte aligned SIMD register where available
class [[nodiscard]] alignas(JPH_VECTOR_ALIGNMENT) Vec4
{
public:
	JPH_OVERRIDE_NEW_DELETE

	// Underlying vector type (native SIMD register when SSE/NEON is enabled, plain floats otherwise)
#if defined(JPH_USE_SSE)
	using Type = __m128;
#elif defined(JPH_USE_NEON)
	using Type = float32x4_t;
#else
	using Type = struct { float mData[4]; };
#endif

	/// Constructor
	Vec4() = default; ///< Intentionally not initialized for performance reasons
	Vec4(const Vec4 &inRHS) = default;
	Vec4 & operator = (const Vec4 &inRHS) = default;
	explicit JPH_INLINE Vec4(Vec3Arg inRHS); ///< WARNING: W component undefined!
	JPH_INLINE Vec4(Vec3Arg inRHS, float inW);
	JPH_INLINE Vec4(Type inRHS) : mValue(inRHS) { }

	/// Create a vector from 4 components
	JPH_INLINE Vec4(float inX, float inY, float inZ, float inW);

	/// Vector with all zeros
	static JPH_INLINE Vec4 sZero();

	/// Vector with all ones
	static JPH_INLINE Vec4 sOne();

	/// Vector with all NaN's
	static JPH_INLINE Vec4 sNaN();

	/// Replicate inV across all components
	static JPH_INLINE Vec4 sReplicate(float inV);

	/// Load 4 floats from memory
	static JPH_INLINE Vec4 sLoadFloat4(const Float4 *inV);

	/// Load 4 floats from memory, 16 bytes aligned
	static JPH_INLINE Vec4 sLoadFloat4Aligned(const Float4 *inV);

	/// Gather 4 floats from memory at inBase + inOffsets[i] * Scale
	template <const int Scale>
	static JPH_INLINE Vec4 sGatherFloat4(const float *inBase, UVec4Arg inOffsets);

	/// Return the minimum value of each of the components
	static JPH_INLINE Vec4 sMin(Vec4Arg inV1, Vec4Arg inV2);

	/// Return the maximum of each of the components
	static JPH_INLINE Vec4 sMax(Vec4Arg inV1, Vec4Arg inV2);

	/// Equals (component wise)
	static JPH_INLINE UVec4 sEquals(Vec4Arg inV1, Vec4Arg inV2);

	/// Less than (component wise)
	static JPH_INLINE UVec4 sLess(Vec4Arg inV1, Vec4Arg inV2);

	/// Less than or equal (component wise)
	static JPH_INLINE UVec4 sLessOrEqual(Vec4Arg inV1, Vec4Arg inV2);

	/// Greater than (component wise)
	static JPH_INLINE UVec4 sGreater(Vec4Arg inV1, Vec4Arg inV2);

	/// Greater than or equal (component wise)
	static JPH_INLINE UVec4 sGreaterOrEqual(Vec4Arg inV1, Vec4Arg inV2);

	/// Calculates inMul1 * inMul2 + inAdd
	static JPH_INLINE Vec4 sFusedMultiplyAdd(Vec4Arg inMul1, Vec4Arg inMul2, Vec4Arg inAdd);

	/// Component wise select, returns inNotSet when highest bit of inControl = 0 and inSet when highest bit of inControl = 1
	static JPH_INLINE Vec4 sSelect(Vec4Arg inNotSet, Vec4Arg inSet, UVec4Arg inControl);

	/// Logical or (component wise)
	static JPH_INLINE Vec4 sOr(Vec4Arg inV1, Vec4Arg inV2);

	/// Logical xor (component wise)
	static JPH_INLINE Vec4 sXor(Vec4Arg inV1, Vec4Arg inV2);

	/// Logical and (component wise)
	static JPH_INLINE Vec4 sAnd(Vec4Arg inV1, Vec4Arg inV2);

	/// Sort the four elements of ioValue and sort ioIndex at the same time.
	/// Based on a sorting network: http://en.wikipedia.org/wiki/Sorting_network
	static JPH_INLINE void sSort4(Vec4 &ioValue, UVec4 &ioIndex);

	/// Reverse sort the four elements of ioValue (highest first) and sort ioIndex at the same time.
	/// Based on a sorting network: http://en.wikipedia.org/wiki/Sorting_network
	static JPH_INLINE void sSort4Reverse(Vec4 &ioValue, UVec4 &ioIndex);

	/// Get individual components
#if defined(JPH_USE_SSE)
	JPH_INLINE float GetX() const { return _mm_cvtss_f32(mValue); }
	JPH_INLINE float GetY() const { return mF32[1]; }
	JPH_INLINE float GetZ() const { return mF32[2]; }
	JPH_INLINE float GetW() const { return mF32[3]; }
#elif defined(JPH_USE_NEON)
	JPH_INLINE float GetX() const { return vgetq_lane_f32(mValue, 0); }
	JPH_INLINE float GetY() const { return vgetq_lane_f32(mValue, 1); }
	JPH_INLINE float GetZ() const { return vgetq_lane_f32(mValue, 2); }
	JPH_INLINE float GetW() const { return vgetq_lane_f32(mValue, 3); }
#else
	JPH_INLINE float GetX() const { return mF32[0]; }
	JPH_INLINE float GetY() const { return mF32[1]; }
	JPH_INLINE float GetZ() const { return mF32[2]; }
	JPH_INLINE float GetW() const { return mF32[3]; }
#endif

	/// Set individual components
	JPH_INLINE void SetX(float inX) { mF32[0] = inX; }
	JPH_INLINE void SetY(float inY) { mF32[1] = inY; }
	JPH_INLINE void SetZ(float inZ) { mF32[2] = inZ; }
	JPH_INLINE void SetW(float inW) { mF32[3] = inW; }

	/// Set all components
	JPH_INLINE void Set(float inX, float inY, float inZ, float inW) { *this = Vec4(inX, inY, inZ, inW); }

	/// Get float component by index
	JPH_INLINE float operator [] (uint inCoordinate) const { JPH_ASSERT(inCoordinate < 4); return mF32[inCoordinate]; }
	JPH_INLINE float & operator [] (uint inCoordinate) { JPH_ASSERT(inCoordinate < 4); return mF32[inCoordinate]; }

	/// Comparison
	JPH_INLINE bool operator == (Vec4Arg inV2) const;
	JPH_INLINE bool operator != (Vec4Arg inV2) const { return !(*this == inV2); }

	/// Test if two vectors are close
	JPH_INLINE bool IsClose(Vec4Arg inV2, float inMaxDistSq = 1.0e-12f) const;

	/// Test if vector is normalized
	JPH_INLINE bool IsNormalized(float inTolerance = 1.0e-6f) const;

	/// Test if vector contains NaN elements
	JPH_INLINE bool IsNaN() const;

	/// Multiply two float vectors (component wise)
	JPH_INLINE Vec4 operator * (Vec4Arg inV2) const;

	/// Multiply vector with float
	JPH_INLINE Vec4 operator * (float inV2) const;

	/// Multiply vector with float
	friend JPH_INLINE Vec4 operator * (float inV1, Vec4Arg inV2);

	/// Divide vector by float
	JPH_INLINE Vec4 operator / (float inV2) const;

	/// Multiply vector with float
	JPH_INLINE Vec4 & operator *= (float inV2);

	/// Multiply vector with vector
	JPH_INLINE Vec4 & operator *= (Vec4Arg inV2);

	/// Divide vector by float
	JPH_INLINE Vec4 & operator /= (float inV2);

	/// Add two float vectors (component wise)
	JPH_INLINE Vec4 operator + (Vec4Arg inV2) const;

	/// Add two float vectors (component wise)
	JPH_INLINE Vec4 & operator += (Vec4Arg inV2);

	/// Negate
	JPH_INLINE Vec4 operator - () const;

	/// Subtract two float vectors (component wise)
	JPH_INLINE Vec4 operator - (Vec4Arg inV2) const;

	/// Subtract two float vectors (component wise)
	JPH_INLINE Vec4 & operator -= (Vec4Arg inV2);

	/// Divide (component wise)
	JPH_INLINE Vec4 operator / (Vec4Arg inV2) const;

	/// Swizzle the elements in inV
	template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
	JPH_INLINE Vec4 Swizzle() const;

	/// Replicate the X component to all components
	JPH_INLINE Vec4 SplatX() const;

	/// Replicate the Y component to all components
	JPH_INLINE Vec4 SplatY() const;

	/// Replicate the Z component to all components
	JPH_INLINE Vec4 SplatZ() const;

	/// Replicate the W component to all components
	JPH_INLINE Vec4 SplatW() const;

	/// Return the absolute value of each of the components
	JPH_INLINE Vec4 Abs() const;

	/// Reciprocal vector (1 / value) for each of the components
	JPH_INLINE Vec4 Reciprocal() const;

	/// Dot product, returns the dot product in X, Y and Z components
	JPH_INLINE Vec4 DotV(Vec4Arg inV2) const;

	/// Dot product
	JPH_INLINE float Dot(Vec4Arg inV2) const;

	/// Squared length of vector
	JPH_INLINE float LengthSq() const;

	/// Length of vector
	JPH_INLINE float Length() const;

	/// Normalize vector
	JPH_INLINE Vec4 Normalized() const;

	/// Store 4 floats to memory
	JPH_INLINE void StoreFloat4(Float4 *outV) const;

	/// Convert each component from a float to an int
	JPH_INLINE UVec4 ToInt() const;

	/// Reinterpret Vec4 as a UVec4 (doesn't change the bits)
	JPH_INLINE UVec4 ReinterpretAsInt() const;

	/// Store if X is negative in bit 0, Y in bit 1, Z in bit 2 and W in bit 3
	JPH_INLINE int GetSignBits() const;

	/// Get the minimum of X, Y, Z and W
	JPH_INLINE float ReduceMin() const;

	/// Get the maximum of X, Y, Z and W
	JPH_INLINE float ReduceMax() const;

	/// Component wise square root
	JPH_INLINE Vec4 Sqrt() const;

	/// Get vector that contains the sign of each element (returns 1.0f if positive, -1.0f if negative)
	JPH_INLINE Vec4 GetSign() const;

	/// Calculate the sine and cosine for each element of this vector (input in radians)
	inline void SinCos(Vec4 &outSin, Vec4 &outCos) const;

	/// Calculate the tangent for each element of this vector (input in radians)
	inline Vec4 Tan() const;

	/// Calculate the arc sine for each element of this vector (returns value in the range [-PI / 2, PI / 2])
	/// Note that all input values will be clamped to the range [-1, 1] and this function will not return NaNs like std::asin
	inline Vec4 ASin() const;

	/// Calculate the arc cosine for each element of this vector (returns value in the range [0, PI])
	/// Note that all input values will be clamped to the range [-1, 1] and this function will not return NaNs like std::acos
	inline Vec4 ACos() const;

	/// Calculate the arc tangent for each element of this vector (returns value in the range [-PI / 2, PI / 2])
	inline Vec4 ATan() const;

	/// Calculate the arc tangent of y / x using the signs of the arguments to determine the correct quadrant (returns value in the range [-PI, PI])
	inline static Vec4 sATan2(Vec4Arg inY, Vec4Arg inX);

	/// To String
	friend ostream & operator << (ostream &inStream, Vec4Arg inV)
	{
		inStream << inV.mF32[0] << ", " << inV.mF32[1] << ", " << inV.mF32[2] << ", " << inV.mF32[3];
		return inStream;
	}

	// The same storage viewed either as the native SIMD register or as 4 individual floats
	union
	{
		Type mValue;
		float mF32[4];
	};
};
static_assert(std::is_trivial<Vec4>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
#include "Vec4.inl"

View File

@@ -0,0 +1,986 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Math/Trigonometry.h>
#include <Jolt/Math/Vec3.h>
#include <Jolt/Math/UVec4.h>
JPH_NAMESPACE_BEGIN
// Constructor
// Reuses the Vec3 register as-is; W is undefined (matches the warning in the header declaration)
Vec4::Vec4(Vec3Arg inRHS) :
	mValue(inRHS.mValue)
{
}
// Construct from a Vec3 plus an explicit W component
Vec4::Vec4(Vec3Arg inRHS, float inW)
{
#if defined(JPH_USE_SSE4_1)
	// Blend mask 8 replaces only lane 3 (W) with inW
	mValue = _mm_blend_ps(inRHS.mValue, _mm_set1_ps(inW), 8);
#elif defined(JPH_USE_NEON)
	mValue = vsetq_lane_f32(inW, inRHS.mValue, 3);
#else
	for (int i = 0; i < 3; i++)
		mF32[i] = inRHS.mF32[i];
	mF32[3] = inW;
#endif
}
// Construct from 4 individual components
Vec4::Vec4(float inX, float inY, float inZ, float inW)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_set_ps(inW, inZ, inY, inX);
#elif defined(JPH_USE_NEON)
	// Pack the components pairwise into 64-bit halves through their bit patterns, then combine
	uint32x2_t xy = vcreate_u32(static_cast<uint64>(BitCast<uint32>(inX)) | (static_cast<uint64>(BitCast<uint32>(inY)) << 32));
	uint32x2_t zw = vcreate_u32(static_cast<uint64>(BitCast<uint32>(inZ)) | (static_cast<uint64>(BitCast<uint32>(inW)) << 32));
	mValue = vreinterpretq_f32_u32(vcombine_u32(xy, zw));
#else
	mF32[0] = inX;
	mF32[1] = inY;
	mF32[2] = inZ;
	mF32[3] = inW;
#endif
}
// Rearrange components: result lane i takes source lane Swizzle<i> (compile-time, 0..3 each)
template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
Vec4 Vec4::Swizzle() const
{
	static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
	static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
	static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
	static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");

#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
#elif defined(JPH_USE_NEON)
	return JPH_NEON_SHUFFLE_F32x4(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
#else
	return Vec4(mF32[SwizzleX], mF32[SwizzleY], mF32[SwizzleZ], mF32[SwizzleW]);
#endif
}
// All-zero vector
Vec4 Vec4::sZero()
{
#if defined(JPH_USE_SSE)
	return _mm_setzero_ps();
#elif defined(JPH_USE_NEON)
	return vdupq_n_f32(0);
#else
	return Vec4(0, 0, 0, 0);
#endif
}
// Broadcast inV into all 4 lanes
Vec4 Vec4::sReplicate(float inV)
{
#if defined(JPH_USE_SSE)
	return _mm_set1_ps(inV);
#elif defined(JPH_USE_NEON)
	return vdupq_n_f32(inV);
#else
	return Vec4(inV, inV, inV, inV);
#endif
}
// All-ones vector (1.0f in every lane)
Vec4 Vec4::sOne()
{
	return sReplicate(1.0f);
}
// Vector with a quiet NaN in every lane
Vec4 Vec4::sNaN()
{
	return sReplicate(numeric_limits<float>::quiet_NaN());
}
// Unaligned load of 4 floats
Vec4 Vec4::sLoadFloat4(const Float4 *inV)
{
#if defined(JPH_USE_SSE)
	return _mm_loadu_ps(&inV->x);
#elif defined(JPH_USE_NEON)
	return vld1q_f32(&inV->x);
#else
	return Vec4(inV->x, inV->y, inV->z, inV->w);
#endif
}
// Load of 4 floats; on SSE the pointer must be 16-byte aligned (_mm_load_ps)
Vec4 Vec4::sLoadFloat4Aligned(const Float4 *inV)
{
#if defined(JPH_USE_SSE)
	return _mm_load_ps(&inV->x);
#elif defined(JPH_USE_NEON)
	return vld1q_f32(&inV->x);
#else
	return Vec4(inV->x, inV->y, inV->z, inV->w);
#endif
}
// Gather 4 floats from inBase + inOffsets[i] * Scale, where Scale is a byte multiplier
template <const int Scale>
Vec4 Vec4::sGatherFloat4(const float *inBase, UVec4Arg inOffsets)
{
#if defined(JPH_USE_SSE)
	#ifdef JPH_USE_AVX2
	// Hardware gather
	return _mm_i32gather_ps(inBase, inOffsets.mValue, Scale);
	#else
	// Emulate the gather with 4 scalar loads, then interleave the lanes back together
	const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
	Type x = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale));
	Type y = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale));
	Type xy = _mm_unpacklo_ps(x, y);
	Type z = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale));
	Type w = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale));
	Type zw = _mm_unpacklo_ps(z, w);
	return _mm_movelh_ps(xy, zw);
	#endif
#else
	const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
	float x = *reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale);
	float y = *reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale);
	float z = *reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale);
	float w = *reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale);
	return Vec4(x, y, z, w);
#endif
}
// Component wise minimum
Vec4 Vec4::sMin(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_min_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vminq_f32(inV1.mValue, inV2.mValue);
#else
	return Vec4(min(inV1.mF32[0], inV2.mF32[0]),
	min(inV1.mF32[1], inV2.mF32[1]),
	min(inV1.mF32[2], inV2.mF32[2]),
	min(inV1.mF32[3], inV2.mF32[3]));
#endif
}
// Component wise maximum
Vec4 Vec4::sMax(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_max_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmaxq_f32(inV1.mValue, inV2.mValue);
#else
	return Vec4(max(inV1.mF32[0], inV2.mF32[0]),
	max(inV1.mF32[1], inV2.mF32[1]),
	max(inV1.mF32[2], inV2.mF32[2]),
	max(inV1.mF32[3], inV2.mF32[3]));
#endif
}
// Component wise equality; a true lane is all ones (0xffffffff), a false lane all zeros
UVec4 Vec4::sEquals(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpeq_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vceqq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] == inV2.mF32[0]? 0xffffffffu : 0,
	inV1.mF32[1] == inV2.mF32[1]? 0xffffffffu : 0,
	inV1.mF32[2] == inV2.mF32[2]? 0xffffffffu : 0,
	inV1.mF32[3] == inV2.mF32[3]? 0xffffffffu : 0);
#endif
}
// Component wise less-than; a true lane is all ones, a false lane all zeros
UVec4 Vec4::sLess(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmplt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcltq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] < inV2.mF32[0]? 0xffffffffu : 0,
	inV1.mF32[1] < inV2.mF32[1]? 0xffffffffu : 0,
	inV1.mF32[2] < inV2.mF32[2]? 0xffffffffu : 0,
	inV1.mF32[3] < inV2.mF32[3]? 0xffffffffu : 0);
#endif
}
// Component wise less-than-or-equal; a true lane is all ones, a false lane all zeros
UVec4 Vec4::sLessOrEqual(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmple_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcleq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] <= inV2.mF32[0]? 0xffffffffu : 0,
	inV1.mF32[1] <= inV2.mF32[1]? 0xffffffffu : 0,
	inV1.mF32[2] <= inV2.mF32[2]? 0xffffffffu : 0,
	inV1.mF32[3] <= inV2.mF32[3]? 0xffffffffu : 0);
#endif
}
// Component wise greater-than; a true lane is all ones, a false lane all zeros
UVec4 Vec4::sGreater(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpgt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcgtq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] > inV2.mF32[0]? 0xffffffffu : 0,
	inV1.mF32[1] > inV2.mF32[1]? 0xffffffffu : 0,
	inV1.mF32[2] > inV2.mF32[2]? 0xffffffffu : 0,
	inV1.mF32[3] > inV2.mF32[3]? 0xffffffffu : 0);
#endif
}
// Component wise greater-than-or-equal; a true lane is all ones, a false lane all zeros
UVec4 Vec4::sGreaterOrEqual(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpge_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcgeq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] >= inV2.mF32[0]? 0xffffffffu : 0,
	inV1.mF32[1] >= inV2.mF32[1]? 0xffffffffu : 0,
	inV1.mF32[2] >= inV2.mF32[2]? 0xffffffffu : 0,
	inV1.mF32[3] >= inV2.mF32[3]? 0xffffffffu : 0);
#endif
}
// inMul1 * inMul2 + inAdd per component (uses a hardware FMA instruction only when JPH_USE_FMADD is defined)
Vec4 Vec4::sFusedMultiplyAdd(Vec4Arg inMul1, Vec4Arg inMul2, Vec4Arg inAdd)
{
#if defined(JPH_USE_SSE)
	#ifdef JPH_USE_FMADD
	return _mm_fmadd_ps(inMul1.mValue, inMul2.mValue, inAdd.mValue);
	#else
	return _mm_add_ps(_mm_mul_ps(inMul1.mValue, inMul2.mValue), inAdd.mValue);
	#endif
#elif defined(JPH_USE_NEON)
	// Multiply-accumulate
	return vmlaq_f32(inAdd.mValue, inMul1.mValue, inMul2.mValue);
#else
	return Vec4(inMul1.mF32[0] * inMul2.mF32[0] + inAdd.mF32[0],
	inMul1.mF32[1] * inMul2.mF32[1] + inAdd.mF32[1],
	inMul1.mF32[2] * inMul2.mF32[2] + inAdd.mF32[2],
	inMul1.mF32[3] * inMul2.mF32[3] + inAdd.mF32[3]);
#endif
}
// Per component: pick inSet when the highest bit of the control lane is 1, else inNotSet.
// Only bit 31 of each control lane matters; the fallback paths broadcast it with an arithmetic shift right.
Vec4 Vec4::sSelect(Vec4Arg inNotSet, Vec4Arg inSet, UVec4Arg inControl)
{
#if defined(JPH_USE_SSE4_1) && !defined(JPH_PLATFORM_WASM) // _mm_blendv_ps has problems on FireFox
	return _mm_blendv_ps(inNotSet.mValue, inSet.mValue, _mm_castsi128_ps(inControl.mValue))
;
#elif defined(JPH_USE_SSE)
	__m128 is_set = _mm_castsi128_ps(_mm_srai_epi32(inControl.mValue, 31));
	return _mm_or_ps(_mm_and_ps(is_set, inSet.mValue), _mm_andnot_ps(is_set, inNotSet.mValue));
#elif defined(JPH_USE_NEON)
	return vbslq_f32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.mValue), 31)), inSet.mValue, inNotSet.mValue);
#else
	Vec4 result;
	for (int i = 0; i < 4; i++)
		result.mF32[i] = (inControl.mU32[i] & 0x80000000u) ? inSet.mF32[i] : inNotSet.mF32[i];
	return result;
#endif
}
// Bitwise OR of the two vectors' bit patterns
Vec4 Vec4::sOr(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_or_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
	return UVec4::sOr(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat();
#endif
}
// Bitwise XOR of the two vectors' bit patterns
Vec4 Vec4::sXor(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_xor_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
	return UVec4::sXor(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat();
#endif
}
// Bitwise AND of the two vectors' bit patterns
Vec4 Vec4::sAnd(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_and_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
	return UVec4::sAnd(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat();
#endif
}
// Sort the 4 lanes of ioValue ascending with a 3-pass sorting network, swapping the
// corresponding lanes of ioIndex in lockstep so indices keep tracking their values
void Vec4::sSort4(Vec4 &ioValue, UVec4 &ioIndex)
{
	// Pass 1, test 1st vs 3rd, 2nd vs 4th
	Vec4 v1 = ioValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	UVec4 i1 = ioIndex.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	// The comparison result is swizzled so each pair of lanes gets a consistent swap decision
	UVec4 c1 = sLess(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v1, c1);
	ioIndex = UVec4::sSelect(ioIndex, i1, c1);

	// Pass 2, test 1st vs 2nd, 3rd vs 4th
	Vec4 v2 = ioValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 i2 = ioIndex.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 c2 = sLess(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v2, c2);
	ioIndex = UVec4::sSelect(ioIndex, i2, c2);

	// Pass 3, test 2nd vs 3rd component
	Vec4 v3 = ioValue.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 i3 = ioIndex.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 c3 = sLess(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v3, c3);
	ioIndex = UVec4::sSelect(ioIndex, i3, c3);
}
// Sort the 4 lanes of ioValue descending (highest first) with the same 3-pass network as
// sSort4, only with the comparison direction flipped; ioIndex is swapped in lockstep
void Vec4::sSort4Reverse(Vec4 &ioValue, UVec4 &ioIndex)
{
	// Pass 1, test 1st vs 3rd, 2nd vs 4th
	Vec4 v1 = ioValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	UVec4 i1 = ioIndex.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	UVec4 c1 = sGreater(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v1, c1);
	ioIndex = UVec4::sSelect(ioIndex, i1, c1);

	// Pass 2, test 1st vs 2nd, 3rd vs 4th
	Vec4 v2 = ioValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 i2 = ioIndex.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 c2 = sGreater(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v2, c2);
	ioIndex = UVec4::sSelect(ioIndex, i2, c2);

	// Pass 3, test 2nd vs 3rd component
	Vec4 v3 = ioValue.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 i3 = ioIndex.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 c3 = sGreater(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v3, c3);
	ioIndex = UVec4::sSelect(ioIndex, i3, c3);
}
// All four lanes must compare equal (a NaN lane never compares equal, so any NaN makes this false)
bool Vec4::operator == (Vec4Arg inV2) const
{
	return sEquals(*this, inV2).TestAllTrue();
}
bool Vec4::IsClose(Vec4Arg inV2, float inMaxDistSq) const
{
	// Two vectors are close when the squared distance between them is within inMaxDistSq
	Vec4 diff = inV2 - *this;
	return diff.LengthSq() <= inMaxDistSq;
}
bool Vec4::IsNormalized(float inTolerance) const
{
	// A vector counts as normalized when its squared length deviates from 1 by at most inTolerance
	float deviation = LengthSq() - 1.0f;
	return abs(deviation) <= inTolerance;
}
// True when any of the 4 lanes is a NaN
bool Vec4::IsNaN() const
{
#if defined(JPH_USE_AVX512)
	// Class mask 0b10000001 selects quiet and signaling NaNs
	return _mm_fpclass_ps_mask(mValue, 0b10000001) != 0;
#elif defined(JPH_USE_SSE)
	// cmpunord is true only when a lane compares unordered with itself, i.e. it is a NaN
	return _mm_movemask_ps(_mm_cmpunord_ps(mValue, mValue)) != 0;
#elif defined(JPH_USE_NEON)
	uint32x4_t is_equal = vceqq_f32(mValue, mValue); // If a number is not equal to itself it's a NaN
	// Each non-NaN lane contributes 1 after the shift; a horizontal sum != 4 means at least one NaN
	return vaddvq_u32(vshrq_n_u32(is_equal, 31)) != 4;
#else
	return isnan(mF32[0]) || isnan(mF32[1]) || isnan(mF32[2]) || isnan(mF32[3]);
#endif
}
// Component wise multiply
Vec4 Vec4::operator * (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmulq_f32(mValue, inV2.mValue);
#else
	return Vec4(mF32[0] * inV2.mF32[0],
	mF32[1] * inV2.mF32[1],
	mF32[2] * inV2.mF32[2],
	mF32[3] * inV2.mF32[3]);
#endif
}
// Multiply each component by the scalar inV2
Vec4 Vec4::operator * (float inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	return vmulq_n_f32(mValue, inV2);
#else
	return Vec4(mF32[0] * inV2, mF32[1] * inV2, mF32[2] * inV2, mF32[3] * inV2);
#endif
}
/// Multiply vector with float (free function so the scalar can appear on the left hand side)
Vec4 operator * (float inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(_mm_set1_ps(inV1), inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmulq_n_f32(inV2.mValue, inV1);
#else
	return Vec4(inV1 * inV2.mF32[0],
	inV1 * inV2.mF32[1],
	inV1 * inV2.mF32[2],
	inV1 * inV2.mF32[3]);
#endif
}
// Divide each component by the scalar inV2 (no zero check; inV2 == 0 yields inf/NaN lanes)
Vec4 Vec4::operator / (float inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	return vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
	return Vec4(mF32[0] / inV2, mF32[1] / inV2, mF32[2] / inV2, mF32[3] / inV2);
#endif
}
// In-place multiply by scalar
Vec4 &Vec4::operator *= (float inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	mValue = vmulq_n_f32(mValue, inV2);
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] *= inV2;
#endif
	return *this;
}
// In-place component wise multiply
Vec4 &Vec4::operator *= (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vmulq_f32(mValue, inV2.mValue);
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] *= inV2.mF32[i];
#endif
	return *this;
}
// In-place divide by scalar (no zero check)
Vec4 &Vec4::operator /= (float inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	mValue = vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] /= inV2;
#endif
	return *this;
}
// Component wise add
Vec4 Vec4::operator + (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vaddq_f32(mValue, inV2.mValue);
#else
	return Vec4(mF32[0] + inV2.mF32[0],
	mF32[1] + inV2.mF32[1],
	mF32[2] + inV2.mF32[2],
	mF32[3] + inV2.mF32[3]);
#endif
}
// In-place component wise add
Vec4 &Vec4::operator += (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vaddq_f32(mValue, inV2.mValue);
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] += inV2.mF32[i];
#endif
	return *this;
}
// Negate all components.
// When JPH_CROSS_PLATFORM_DETERMINISTIC is defined, 0 - x is used instead of a hardware negate
// so the result (notably for signed zeros) is bit-identical across platforms.
Vec4 Vec4::operator - () const
{
#if defined(JPH_USE_SSE)
	return _mm_sub_ps(_mm_setzero_ps(), mValue);
#elif defined(JPH_USE_NEON)
	#ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
	return vsubq_f32(vdupq_n_f32(0), mValue);
	#else
	return vnegq_f32(mValue);
	#endif
#else
	#ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
	return Vec4(0.0f - mF32[0], 0.0f - mF32[1], 0.0f - mF32[2], 0.0f - mF32[3]);
	#else
	return Vec4(-mF32[0], -mF32[1], -mF32[2], -mF32[3]);
	#endif
#endif
}
// Component wise subtract
Vec4 Vec4::operator - (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vsubq_f32(mValue, inV2.mValue);
#else
	return Vec4(mF32[0] - inV2.mF32[0],
	mF32[1] - inV2.mF32[1],
	mF32[2] - inV2.mF32[2],
	mF32[3] - inV2.mF32[3]);
#endif
}
// In-place component wise subtract
Vec4 &Vec4::operator -= (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vsubq_f32(mValue, inV2.mValue);
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] -= inV2.mF32[i];
#endif
	return *this;
}
// Component wise divide (no zero check; zero divisor lanes yield inf/NaN)
Vec4 Vec4::operator / (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_div_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vdivq_f32(mValue, inV2.mValue);
#else
	return Vec4(mF32[0] / inV2.mF32[0],
	mF32[1] / inV2.mF32[1],
	mF32[2] / inV2.mF32[2],
	mF32[3] / inV2.mF32[3]);
#endif
}
// Broadcast the X lane to all 4 lanes
Vec4 Vec4::SplatX() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 0);
#else
	return Vec4(mF32[0], mF32[0], mF32[0], mF32[0]);
#endif
}
// Broadcast the Y lane to all 4 lanes
Vec4 Vec4::SplatY() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 1);
#else
	return Vec4(mF32[1], mF32[1], mF32[1], mF32[1]);
#endif
}
// Broadcast the Z lane to all 4 lanes
Vec4 Vec4::SplatZ() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 2);
#else
	return Vec4(mF32[2], mF32[2], mF32[2], mF32[2]);
#endif
}
// Broadcast the W lane to all 4 lanes
Vec4 Vec4::SplatW() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(3, 3, 3, 3));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 3);
#else
	return Vec4(mF32[3], mF32[3], mF32[3], mF32[3]);
#endif
}
// Component wise absolute value
Vec4 Vec4::Abs() const
{
#if defined(JPH_USE_AVX512)
	// Range op 0b1000 returns |a| (absolute maximum of a and a)
	return _mm_range_ps(mValue, mValue, 0b1000);
#elif defined(JPH_USE_SSE)
	// max(-x, x) == |x|
	return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), mValue), mValue);
#elif defined(JPH_USE_NEON)
	return vabsq_f32(mValue);
#else
	return Vec4(abs(mF32[0]), abs(mF32[1]), abs(mF32[2]), abs(mF32[3]));
#endif
}
// 1 / value per component, computed as a full-precision divide (zero lanes yield inf)
Vec4 Vec4::Reciprocal() const
{
	return sOne() / mValue;
}
// 4-component dot product, replicated across all lanes of the result
Vec4 Vec4::DotV(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	// Mask 0xff: multiply all 4 lanes, broadcast the sum to all 4 lanes
	return _mm_dp_ps(mValue, inV2.mValue, 0xff);
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	return vdupq_n_f32(vaddvq_f32(mul));
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return Vec4::sReplicate((mF32[0] * inV2.mF32[0] + mF32[1] * inV2.mF32[1]) + (mF32[2] * inV2.mF32[2] + mF32[3] * inV2.mF32[3]));
#endif
}
// 4-component dot product, returned as a scalar
float Vec4::Dot(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0xff));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	return vaddvq_f32(mul);
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return (mF32[0] * inV2.mF32[0] + mF32[1] * inV2.mF32[1]) + (mF32[2] * inV2.mF32[2] + mF32[3] * inV2.mF32[3]);
#endif
}
// Squared length: dot product of the vector with itself
float Vec4::LengthSq() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_dp_ps(mValue, mValue, 0xff));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	return vaddvq_f32(mul);
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return (mF32[0] * mF32[0] + mF32[1] * mF32[1]) + (mF32[2] * mF32[2] + mF32[3] * mF32[3]);
#endif
}
// Length: sqrt of the squared length
float Vec4::Length() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	// Take the square root on a 64-bit vector and extract the scalar
	float32x2_t sum = vdup_n_f32(vaddvq_f32(mul));
	return vget_lane_f32(vsqrt_f32(sum), 0);
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return sqrt((mF32[0] * mF32[0] + mF32[1] * mF32[1]) + (mF32[2] * mF32[2] + mF32[3] * mF32[3]));
#endif
}
// Component wise square root (negative lanes yield NaN)
Vec4 Vec4::Sqrt() const
{
#if defined(JPH_USE_SSE)
	return _mm_sqrt_ps(mValue);
#elif defined(JPH_USE_NEON)
	return vsqrtq_f32(mValue);
#else
	return Vec4(sqrt(mF32[0]), sqrt(mF32[1]), sqrt(mF32[2]), sqrt(mF32[3]));
#endif
}
// Per component: 1.0f when the sign bit is clear (incl. +0), -1.0f when it is set (incl. -0)
Vec4 Vec4::GetSign() const
{
#if defined(JPH_USE_AVX512)
	// Fixup lookup table maps each float class directly onto +1 / -1
	return _mm_fixupimm_ps(mValue, mValue, _mm_set1_epi32(0xA9A90A00), 0);
#elif defined(JPH_USE_SSE)
	// AND with the bit pattern of -1.0f keeps the sign bit, OR with 1.0f forces the magnitude to one
	Type minus_one = _mm_set1_ps(-1.0f);
	Type one = _mm_set1_ps(1.0f);
	return _mm_or_ps(_mm_and_ps(mValue, minus_one), one);
#elif defined(JPH_USE_NEON)
	// Same sign-bit trick as the SSE path, done through integer reinterpretation
	Type minus_one = vdupq_n_f32(-1.0f);
	Type one = vdupq_n_f32(1.0f);
	return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(mValue), vreinterpretq_u32_f32(minus_one)), vreinterpretq_u32_f32(one)));
#else
	return Vec4(std::signbit(mF32[0])? -1.0f : 1.0f,
	std::signbit(mF32[1])? -1.0f : 1.0f,
	std::signbit(mF32[2])? -1.0f : 1.0f,
	std::signbit(mF32[3])? -1.0f : 1.0f);
#endif
}
// Return this vector scaled to length 1.
// NOTE(review): a zero vector divides by zero and yields NaN lanes - callers must pass a non-zero vector.
Vec4 Vec4::Normalized() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_div_ps(mValue, _mm_sqrt_ps(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	float32x4_t sum = vdupq_n_f32(vaddvq_f32(mul));
	return vdivq_f32(mValue, vsqrtq_f32(sum));
#else
	return *this / Length();
#endif
}
// Store all 4 components to memory (unaligned store)
void Vec4::StoreFloat4(Float4 *outV) const
{
#if defined(JPH_USE_SSE)
	_mm_storeu_ps(&outV->x, mValue);
#elif defined(JPH_USE_NEON)
	vst1q_f32(&outV->x, mValue);
#else
	for (int i = 0; i < 4; ++i)
		(&outV->x)[i] = mF32[i];
#endif
}
// Convert each component from float to integer.
// NOTE(review): SSE truncates to *signed* int32, NEON converts to *unsigned*, scalar casts to uint32 -
// results for negative or out-of-range input differ per platform; confirm callers only pass in-range values.
UVec4 Vec4::ToInt() const
{
#if defined(JPH_USE_SSE)
	return _mm_cvttps_epi32(mValue);
#elif defined(JPH_USE_NEON)
	return vcvtq_u32_f32(mValue);
#else
	return UVec4(uint32(mF32[0]), uint32(mF32[1]), uint32(mF32[2]), uint32(mF32[3]));
#endif
}
/// Reinterpret the 128 bits of this vector as 4 unsigned integers (pure bit cast, no numeric conversion)
UVec4 Vec4::ReinterpretAsInt() const
{
#if defined(JPH_USE_SSE)
return UVec4(_mm_castps_si128(mValue)); // Free: only changes the register's type
#elif defined(JPH_USE_NEON)
return vreinterpretq_u32_f32(mValue);
#else
// NOTE(review): relies on Vec4 and UVec4 sharing the same layout; strictly a strict-aliasing violation but presumably tolerated by the supported compilers
return *reinterpret_cast<const UVec4 *>(this);
#endif
}
/// Get a 4-bit mask with the sign bits of the components: bit 0 = sign of X, bit 1 = Y, bit 2 = Z, bit 3 = W
int Vec4::GetSignBits() const
{
#if defined(JPH_USE_SSE)
return _mm_movemask_ps(mValue);
#elif defined(JPH_USE_NEON)
// Shift each sign bit down to bit 0, shift it back up to its lane index, then sum the lanes horizontally to combine them into one mask
int32x4_t shift = JPH_NEON_INT32x4(0, 1, 2, 3);
return vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(mValue), 31), shift));
#else
// Scalar fallback: signbit matches the vectorized paths (sign bit only, so -0.0 counts as negative)
return (std::signbit(mF32[0])? 1 : 0) | (std::signbit(mF32[1])? 2 : 0) | (std::signbit(mF32[2])? 4 : 0) | (std::signbit(mF32[3])? 8 : 0);
#endif
}
float Vec4::ReduceMin() const
{
Vec4 v = sMin(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_W, SWIZZLE_UNUSED>());
v = sMin(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
return v.GetX();
}
float Vec4::ReduceMax() const
{
Vec4 v = sMax(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_W, SWIZZLE_UNUSED>());
v = sMax(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
return v.GetX();
}
/// Calculate the sine and cosine of each component at the same time (input angle in radians)
void Vec4::SinCos(Vec4 &outSin, Vec4 &outCos) const
{
// Implementation based on sinf.c from the cephes library, combines sinf and cosf in a single function, changes octants to quadrants and vectorizes it
// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)
// Make argument positive and remember sign for sin only since cos is symmetric around x (highest bit of a float is the sign bit)
UVec4 sin_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
Vec4 x = Vec4::sXor(*this, sin_sign.ReinterpretAsFloat());
// x / (PI / 2) rounded to nearest int gives us the quadrant closest to x (0.6366197723675814 = 2 / PI)
UVec4 quadrant = (0.6366197723675814f * x + Vec4::sReplicate(0.5f)).ToInt();
// Make x relative to the closest quadrant.
// This does x = x - quadrant * PI / 2 using a two step Cody-Waite argument reduction.
// This improves the accuracy of the result by avoiding loss of significant bits in the subtraction.
// We start with x = x - quadrant * PI / 2, PI / 2 in hexadecimal notation is 0x3fc90fdb, we remove the lowest 16 bits to
// get 0x3fc90000 (= 1.5703125) this means we can now multiply with a number of up to 2^16 without losing any bits.
// This leaves us with: x = (x - quadrant * 1.5703125) - quadrant * (PI / 2 - 1.5703125).
// PI / 2 - 1.5703125 in hexadecimal is 0x39fdaa22, stripping the lowest 12 bits we get 0x39fda000 (= 0.0004837512969970703125)
// This leaves us with: x = ((x - quadrant * 1.5703125) - quadrant * 0.0004837512969970703125) - quadrant * (PI / 2 - 1.5703125 - 0.0004837512969970703125)
// See: https://stackoverflow.com/questions/42455143/sine-cosine-modular-extended-precision-arithmetic
// After this we have x in the range [-PI / 4, PI / 4].
Vec4 float_quadrant = quadrant.ToFloat();
x = ((x - float_quadrant * 1.5703125f) - float_quadrant * 0.0004837512969970703125f) - float_quadrant * 7.549789948768648e-8f;
// Calculate x2 = x^2
Vec4 x2 = x * x;
// Taylor expansion:
// Cos(x) = 1 - x^2/2! + x^4/4! - x^6/6! + x^8/8! + ... = (((x2/8!- 1/6!) * x2 + 1/4!) * x2 - 1/2!) * x2 + 1
Vec4 taylor_cos = ((2.443315711809948e-5f * x2 - Vec4::sReplicate(1.388731625493765e-3f)) * x2 + Vec4::sReplicate(4.166664568298827e-2f)) * x2 * x2 - 0.5f * x2 + Vec4::sOne();
// Sin(x) = x - x^3/3! + x^5/5! - x^7/7! + ... = ((-x2/7! + 1/5!) * x2 - 1/3!) * x2 * x + x
Vec4 taylor_sin = ((-1.9515295891e-4f * x2 + Vec4::sReplicate(8.3321608736e-3f)) * x2 - Vec4::sReplicate(1.6666654611e-1f)) * x2 * x + x;
// The lowest 2 bits of quadrant indicate the quadrant that we are in.
// Let x be the original input value and x' our value that has been mapped to the range [-PI / 4, PI / 4].
// since cos(x) = sin(x - PI / 2) and since we want to use the Taylor expansion as close as possible to 0,
// we can alternate between using the Taylor expansion for sin and cos according to the following table:
//
// quadrant	sin(x)		cos(x)
// XXX00b	sin(x')		cos(x')
// XXX01b	cos(x')		-sin(x')
// XXX10b	-sin(x')	-cos(x')
// XXX11b	-cos(x')	sin(x')
//
// So: sin_sign = bit2, cos_sign = bit1 ^ bit2, bit1 determines if we use sin or cos Taylor expansion
// (both bits are moved into the float sign-bit position so they can be applied with a single xor)
UVec4 bit1 = quadrant.LogicalShiftLeft<31>();
UVec4 bit2 = UVec4::sAnd(quadrant.LogicalShiftLeft<30>(), UVec4::sReplicate(0x80000000U));
// Select which one of the results is sin and which one is cos
Vec4 s = Vec4::sSelect(taylor_sin, taylor_cos, bit1);
Vec4 c = Vec4::sSelect(taylor_cos, taylor_sin, bit1);
// Update the signs
sin_sign = UVec4::sXor(sin_sign, bit2);
UVec4 cos_sign = UVec4::sXor(bit1, bit2);
// Correct the signs
outSin = Vec4::sXor(s, sin_sign.ReinterpretAsFloat());
outCos = Vec4::sXor(c, cos_sign.ReinterpretAsFloat());
}
/// Calculate the tangent of each component (input angle in radians)
Vec4 Vec4::Tan() const
{
// Implementation based on tanf.c from the cephes library, see Vec4::SinCos for further details
// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)
// Make argument positive (tan is an odd function, so we can put the sign back at the end)
UVec4 tan_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
Vec4 x = Vec4::sXor(*this, tan_sign.ReinterpretAsFloat());
// x / (PI / 2) rounded to nearest int gives us the quadrant closest to x (0.6366197723675814 = 2 / PI)
UVec4 quadrant = (0.6366197723675814f * x + Vec4::sReplicate(0.5f)).ToInt();
// Remap x to range [-PI / 4, PI / 4] using the same two step Cody-Waite argument reduction, see Vec4::SinCos
Vec4 float_quadrant = quadrant.ToFloat();
x = ((x - float_quadrant * 1.5703125f) - float_quadrant * 0.0004837512969970703125f) - float_quadrant * 7.549789948768648e-8f;
// Calculate x2 = x^2
Vec4 x2 = x * x;
// Roughly equivalent to the Taylor expansion:
// Tan(x) = x + x^3/3 + 2*x^5/15 + 17*x^7/315 + 62*x^9/2835 + ...
Vec4 tan =
(((((9.38540185543e-3f * x2 + Vec4::sReplicate(3.11992232697e-3f)) * x2 + Vec4::sReplicate(2.44301354525e-2f)) * x2
+ Vec4::sReplicate(5.34112807005e-2f)) * x2 + Vec4::sReplicate(1.33387994085e-1f)) * x2 + Vec4::sReplicate(3.33331568548e-1f)) * x2 * x + x;
// For the 2nd and 4th quadrant we need to invert the value (tan(x) = -1 / tan(x - PI / 2)); bit1 moves the quadrant's lowest bit into the sign-bit position for the select
UVec4 bit1 = quadrant.LogicalShiftLeft<31>();
tan = Vec4::sSelect(tan, Vec4::sReplicate(-1.0f) / (tan JPH_IF_FLOATING_POINT_EXCEPTIONS_ENABLED(+ Vec4::sReplicate(FLT_MIN))), bit1); // Add small epsilon to prevent div by zero, works because tan is always positive
// Put the sign back
return Vec4::sXor(tan, tan_sign.ReinterpretAsFloat());
}
/// Calculate the arc sine of each component; the input is clamped to [-1, 1], the result is in the range [-PI / 2, PI / 2]
Vec4 Vec4::ASin() const
{
// Implementation based on asinf.c from the cephes library
// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)
// Make argument positive (asin is an odd function, so we can put the sign back at the end)
UVec4 asin_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
Vec4 a = Vec4::sXor(*this, asin_sign.ReinterpretAsFloat());
// ASin is not defined outside the range [-1, 1] but it often happens that a value is slightly above 1 so we just clamp here
a = Vec4::sMin(a, Vec4::sOne());
// When |x| <= 0.5 we use the asin approximation as is (z = x^2, evaluate the polynomial at x directly)
Vec4 z1 = a * a;
Vec4 x1 = a;
// When |x| > 0.5 we use the identity asin(x) = PI / 2 - 2 * asin(sqrt((1 - x) / 2))
Vec4 z2 = 0.5f * (Vec4::sOne() - a);
Vec4 x2 = z2.Sqrt();
// Select which of the two situations we have
UVec4 greater = Vec4::sGreater(a, Vec4::sReplicate(0.5f));
Vec4 z = Vec4::sSelect(z1, z2, greater);
Vec4 x = Vec4::sSelect(x1, x2, greater);
// Polynomial approximation of asin
z = ((((4.2163199048e-2f * z + Vec4::sReplicate(2.4181311049e-2f)) * z + Vec4::sReplicate(4.5470025998e-2f)) * z + Vec4::sReplicate(7.4953002686e-2f)) * z + Vec4::sReplicate(1.6666752422e-1f)) * z * x + x;
// If |x| > 0.5 we need to apply the remainder of the identity above: PI / 2 - 2 * z
z = Vec4::sSelect(z, Vec4::sReplicate(0.5f * JPH_PI) - (z + z), greater);
// Put the sign back
return Vec4::sXor(z, asin_sign.ReinterpretAsFloat());
}
/// Calculate the arc cosine of each component (result in range [0, PI])
Vec4 Vec4::ACos() const
{
	// Not the most accurate, but simple: uses the identity acos(x) = PI / 2 - asin(x)
	Vec4 half_pi = Vec4::sReplicate(0.5f * JPH_PI);
	return half_pi - ASin();
}
/// Calculate the arc tangent of each component (result in range [-PI / 2, PI / 2])
Vec4 Vec4::ATan() const
{
// Implementation based on atanf.c from the cephes library
// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)
// Make argument positive (atan is an odd function, so we can put the sign back at the end)
UVec4 atan_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
Vec4 x = Vec4::sXor(*this, atan_sign.ReinterpretAsFloat());
Vec4 y = Vec4::sZero();
// If x > Tan(PI / 8): use atan(x) = PI / 4 + atan((x - 1) / (x + 1))
UVec4 greater1 = Vec4::sGreater(x, Vec4::sReplicate(0.4142135623730950f));
Vec4 x1 = (x - Vec4::sOne()) / (x + Vec4::sOne());
// If x > Tan(3 * PI / 8): use atan(x) = PI / 2 + atan(-1 / x)
UVec4 greater2 = Vec4::sGreater(x, Vec4::sReplicate(2.414213562373095f));
Vec4 x2 = Vec4::sReplicate(-1.0f) / (x JPH_IF_FLOATING_POINT_EXCEPTIONS_ENABLED(+ Vec4::sReplicate(FLT_MIN))); // Add small epsilon to prevent div by zero, works because x is always positive
// Apply first if
x = Vec4::sSelect(x, x1, greater1);
y = Vec4::sSelect(y, Vec4::sReplicate(0.25f * JPH_PI), greater1);
// Apply second if (overrides the first when both conditions hold, since greater2 implies greater1)
x = Vec4::sSelect(x, x2, greater2);
y = Vec4::sSelect(y, Vec4::sReplicate(0.5f * JPH_PI), greater2);
// Polynomial approximation on the reduced argument, added to the quadrant offset in y
Vec4 z = x * x;
y += (((8.05374449538e-2f * z - Vec4::sReplicate(1.38776856032e-1f)) * z + Vec4::sReplicate(1.99777106478e-1f)) * z - Vec4::sReplicate(3.33329491539e-1f)) * z * x + x;
// Put the sign back
return Vec4::sXor(y, atan_sign.ReinterpretAsFloat());
}
/// Calculate the arc tangent of inY / inX using the signs of both arguments to determine the correct quadrant (result in range [-PI, PI])
Vec4 Vec4::sATan2(Vec4Arg inY, Vec4Arg inX)
{
UVec4 sign_mask = UVec4::sReplicate(0x80000000U);
// Determine absolute value and sign of y
UVec4 y_sign = UVec4::sAnd(inY.ReinterpretAsInt(), sign_mask);
Vec4 y_abs = Vec4::sXor(inY, y_sign.ReinterpretAsFloat());
// Determine absolute value and sign of x
UVec4 x_sign = UVec4::sAnd(inX.ReinterpretAsInt(), sign_mask);
Vec4 x_abs = Vec4::sXor(inX, x_sign.ReinterpretAsFloat());
// Always divide smallest / largest to avoid dividing by zero
UVec4 x_is_numerator = Vec4::sLess(x_abs, y_abs);
Vec4 numerator = Vec4::sSelect(y_abs, x_abs, x_is_numerator);
Vec4 denominator = Vec4::sSelect(x_abs, y_abs, x_is_numerator);
Vec4 atan = (numerator / denominator).ATan();
// If we calculated x / y instead of y / x the result is PI / 2 - result (note that this is true because we know the result is positive because the input was positive)
atan = Vec4::sSelect(atan, Vec4::sReplicate(0.5f * JPH_PI) - atan, x_is_numerator);
// Now we need to map to the correct quadrant
// x_sign y_sign result
// +1 +1 atan
// -1 +1 -atan + PI
// -1 -1 atan - PI
// +1 -1 -atan
// This can be written as: x_sign * y_sign * (atan - (x_sign < 0? PI : 0))
// ArithmeticShiftRight<31> turns the sign bit into an all-ones/all-zeros mask, so PI is subtracted only where x was negative
atan -= Vec4::sAnd(x_sign.ArithmeticShiftRight<31>().ReinterpretAsFloat(), Vec4::sReplicate(JPH_PI));
// Multiplying by x_sign * y_sign is done by xor-ing the combined sign bit into the result
atan = Vec4::sXor(atan, UVec4::sXor(x_sign, y_sign).ReinterpretAsFloat());
return atan;
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,211 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Templatized fixed-size vector class of Rows floats.
/// General purpose (not SIMD optimized), used e.g. for small linear algebra problems of arbitrary dimension.
template <uint Rows>
class [[nodiscard]] Vector
{
public:
	/// Constructor, intentionally not initialized for performance reasons
	inline Vector() = default;
	inline Vector(const Vector &) = default;

	/// Number of rows in this vector
	inline uint GetRows() const { return Rows; }

	/// Set all components to zero
	inline void SetZero()
	{
		for (uint r = 0; r < Rows; ++r)
			mF32[r] = 0.0f;
	}

	/// Vector with all zeros
	inline static Vector sZero() { Vector v; v.SetZero(); return v; }

	/// Copy a (part) of another vector into this vector
	/// @param inV Vector to read from (anything that supports operator [])
	/// @param inSourceRow Index of the first row to read from inV
	/// @param inNumRows Number of rows to copy
	/// @param inDestRow Index of the first row to write in this vector
	template <class OtherVector>
	void CopyPart(const OtherVector &inV, uint inSourceRow, uint inNumRows, uint inDestRow)
	{
		for (uint r = 0; r < inNumRows; ++r)
			mF32[inDestRow + r] = inV[inSourceRow + r];
	}

	/// Get float component by index
	inline float operator [] (uint inCoordinate) const
	{
		JPH_ASSERT(inCoordinate < Rows);
		return mF32[inCoordinate];
	}

	inline float & operator [] (uint inCoordinate)
	{
		JPH_ASSERT(inCoordinate < Rows);
		return mF32[inCoordinate];
	}

	/// Comparison (exact, no epsilon)
	inline bool operator == (const Vector &inV2) const
	{
		for (uint r = 0; r < Rows; ++r)
			if (mF32[r] != inV2.mF32[r])
				return false;
		return true;
	}

	inline bool operator != (const Vector &inV2) const
	{
		// Defined in terms of operator == so the two can never get out of sync
		return !(*this == inV2);
	}

	/// Test if vector consists of all zeros
	inline bool IsZero() const
	{
		for (uint r = 0; r < Rows; ++r)
			if (mF32[r] != 0.0f)
				return false;
		return true;
	}

	/// Test if two vectors are close to each other (squared distance at most inMaxDistSq)
	inline bool IsClose(const Vector &inV2, float inMaxDistSq = 1.0e-12f) const
	{
		return (inV2 - *this).LengthSq() <= inMaxDistSq;
	}

	/// Assignment
	inline Vector & operator = (const Vector &) = default;

	/// Multiply vector with float
	inline Vector operator * (const float inV2) const
	{
		Vector v;
		for (uint r = 0; r < Rows; ++r)
			v.mF32[r] = mF32[r] * inV2;
		return v;
	}

	inline Vector & operator *= (const float inV2)
	{
		for (uint r = 0; r < Rows; ++r)
			mF32[r] *= inV2;
		return *this;
	}

	/// Multiply vector with float
	inline friend Vector operator * (const float inV1, const Vector &inV2)
	{
		return inV2 * inV1;
	}

	/// Divide vector by float
	inline Vector operator / (float inV2) const
	{
		Vector v;
		for (uint r = 0; r < Rows; ++r)
			v.mF32[r] = mF32[r] / inV2;
		return v;
	}

	inline Vector & operator /= (float inV2)
	{
		for (uint r = 0; r < Rows; ++r)
			mF32[r] /= inV2;
		return *this;
	}

	/// Add two float vectors (component wise)
	inline Vector operator + (const Vector &inV2) const
	{
		Vector v;
		for (uint r = 0; r < Rows; ++r)
			v.mF32[r] = mF32[r] + inV2.mF32[r];
		return v;
	}

	inline Vector & operator += (const Vector &inV2)
	{
		for (uint r = 0; r < Rows; ++r)
			mF32[r] += inV2.mF32[r];
		return *this;
	}

	/// Negate
	inline Vector operator - () const
	{
		Vector v;
		for (uint r = 0; r < Rows; ++r)
			v.mF32[r] = -mF32[r];
		return v;
	}

	/// Subtract two float vectors (component wise)
	inline Vector operator - (const Vector &inV2) const
	{
		Vector v;
		for (uint r = 0; r < Rows; ++r)
			v.mF32[r] = mF32[r] - inV2.mF32[r];
		return v;
	}

	inline Vector & operator -= (const Vector &inV2)
	{
		for (uint r = 0; r < Rows; ++r)
			mF32[r] -= inV2.mF32[r];
		return *this;
	}

	/// Dot product
	inline float Dot(const Vector &inV2) const
	{
		float dot = 0.0f;
		for (uint r = 0; r < Rows; ++r)
			dot += mF32[r] * inV2.mF32[r];
		return dot;
	}

	/// Squared length of vector
	inline float LengthSq() const
	{
		return Dot(*this);
	}

	/// Length of vector
	inline float Length() const
	{
		return sqrt(LengthSq());
	}

	/// Check if vector is normalized (length 1 within inToleranceSq).
	/// Marked const (fix): this is a pure query and must be callable on a const Vector, consistent with IsZero / IsClose.
	inline bool IsNormalized(float inToleranceSq = 1.0e-6f) const
	{
		return abs(LengthSq() - 1.0f) <= inToleranceSq;
	}

	/// Normalize vector (divides by the length, so the result is not finite for a zero vector)
	inline Vector Normalized() const
	{
		return *this / Length();
	}

	/// To String
	friend ostream & operator << (ostream &inStream, const Vector &inV)
	{
		inStream << "[";
		for (uint i = 0; i < Rows - 1; ++i)
			inStream << inV.mF32[i] << ", ";
		inStream << inV.mF32[Rows - 1] << "]";
		return inStream;
	}

	/// Components of this vector
	float mF32[Rows];
};
JPH_NAMESPACE_END