initial commit, 4.5 stable
Some checks failed
🔗 GHA / 📊 Static checks (push) Has been cancelled
🔗 GHA / 🤖 Android (push) Has been cancelled
🔗 GHA / 🍏 iOS (push) Has been cancelled
🔗 GHA / 🐧 Linux (push) Has been cancelled
🔗 GHA / 🍎 macOS (push) Has been cancelled
🔗 GHA / 🏁 Windows (push) Has been cancelled
🔗 GHA / 🌐 Web (push) Has been cancelled

This commit is contained in:
2025-09-16 20:46:46 -04:00
commit 9d30169a8d
13378 changed files with 7050105 additions and 0 deletions

View File

@@ -0,0 +1,99 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2024 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// A vector consisting of 16 bytes
class [[nodiscard]] alignas(JPH_VECTOR_ALIGNMENT) BVec16
{
public:
JPH_OVERRIDE_NEW_DELETE
// Underlying vector type
#if defined(JPH_USE_SSE)
using Type = __m128i;
#elif defined(JPH_USE_NEON)
using Type = uint8x16_t;
#else
using Type = struct { uint64 mData[2]; }; // Generic fallback: the 16 bytes stored as two 64-bit halves
#endif
/// Constructor
BVec16() = default; ///< Intentionally not initialized for performance reasons
BVec16(const BVec16 &inRHS) = default;
BVec16 & operator = (const BVec16 &inRHS) = default;
JPH_INLINE BVec16(Type inRHS) : mValue(inRHS) { } ///< Implicit conversion from the underlying vector type
/// Create a vector from 16 bytes
JPH_INLINE BVec16(uint8 inB0, uint8 inB1, uint8 inB2, uint8 inB3, uint8 inB4, uint8 inB5, uint8 inB6, uint8 inB7, uint8 inB8, uint8 inB9, uint8 inB10, uint8 inB11, uint8 inB12, uint8 inB13, uint8 inB14, uint8 inB15);
/// Create a vector from two uint64's
JPH_INLINE BVec16(uint64 inV0, uint64 inV1);
/// Comparison
JPH_INLINE bool operator == (BVec16Arg inV2) const;
JPH_INLINE bool operator != (BVec16Arg inV2) const { return !(*this == inV2); }
/// Vector with all zeros
static JPH_INLINE BVec16 sZero();
/// Replicate int inV across all components
static JPH_INLINE BVec16 sReplicate(uint8 inV);
/// Load 16 bytes from memory
static JPH_INLINE BVec16 sLoadByte16(const uint8 *inV);
/// Equals (component wise), highest bit of each component that is set is considered true
static JPH_INLINE BVec16 sEquals(BVec16Arg inV1, BVec16Arg inV2);
/// Logical or (component wise)
static JPH_INLINE BVec16 sOr(BVec16Arg inV1, BVec16Arg inV2);
/// Logical xor (component wise)
static JPH_INLINE BVec16 sXor(BVec16Arg inV1, BVec16Arg inV2);
/// Logical and (component wise)
static JPH_INLINE BVec16 sAnd(BVec16Arg inV1, BVec16Arg inV2);
/// Logical not (component wise)
static JPH_INLINE BVec16 sNot(BVec16Arg inV1);
/// Get component by index
JPH_INLINE uint8 operator [] (uint inCoordinate) const { JPH_ASSERT(inCoordinate < 16); return mU8[inCoordinate]; }
JPH_INLINE uint8 & operator [] (uint inCoordinate) { JPH_ASSERT(inCoordinate < 16); return mU8[inCoordinate]; }
/// Test if any of the components are true (true is when highest bit of component is set)
JPH_INLINE bool TestAnyTrue() const;
/// Test if all components are true (true is when highest bit of component is set)
JPH_INLINE bool TestAllTrue() const;
/// Store if mU8[0] is true in bit 0, mU8[1] in bit 1, etc. (true is when highest bit of component is set)
JPH_INLINE int GetTrues() const;
/// To String
friend ostream & operator << (ostream &inStream, BVec16Arg inV)
{
inStream << uint(inV.mU8[0]) << ", " << uint(inV.mU8[1]) << ", " << uint(inV.mU8[2]) << ", " << uint(inV.mU8[3]) << ", "
<< uint(inV.mU8[4]) << ", " << uint(inV.mU8[5]) << ", " << uint(inV.mU8[6]) << ", " << uint(inV.mU8[7]) << ", "
<< uint(inV.mU8[8]) << ", " << uint(inV.mU8[9]) << ", " << uint(inV.mU8[10]) << ", " << uint(inV.mU8[11]) << ", "
<< uint(inV.mU8[12]) << ", " << uint(inV.mU8[13]) << ", " << uint(inV.mU8[14]) << ", " << uint(inV.mU8[15]);
return inStream;
}
/// The 16 bytes, viewable as the native SIMD type, as individual bytes or as two 64-bit halves (all union members alias the same storage)
union
{
Type mValue;
uint8 mU8[16];
uint64 mU64[2];
};
};
static_assert(std::is_trivial<BVec16>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
#include "BVec16.inl"

View File

@@ -0,0 +1,177 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2024 Jorrit Rouwe
// SPDX-License-Identifier: MIT
JPH_NAMESPACE_BEGIN
/// Create a vector from 16 individual bytes, inB0 ends up in component 0, inB15 in component 15
BVec16::BVec16(uint8 inB0, uint8 inB1, uint8 inB2, uint8 inB3, uint8 inB4, uint8 inB5, uint8 inB6, uint8 inB7, uint8 inB8, uint8 inB9, uint8 inB10, uint8 inB11, uint8 inB12, uint8 inB13, uint8 inB14, uint8 inB15)
{
#if defined(JPH_USE_SSE)
// Note: _mm_set_epi8 takes its arguments from the highest element to the lowest
mValue = _mm_set_epi8(char(inB15), char(inB14), char(inB13), char(inB12), char(inB11), char(inB10), char(inB9), char(inB8), char(inB7), char(inB6), char(inB5), char(inB4), char(inB3), char(inB2), char(inB1), char(inB0));
#elif defined(JPH_USE_NEON)
// Pack the low/high 8 bytes each into a uint64 and combine into one 128-bit register
uint8x8_t v1 = vcreate_u8(uint64(inB0) | (uint64(inB1) << 8) | (uint64(inB2) << 16) | (uint64(inB3) << 24) | (uint64(inB4) << 32) | (uint64(inB5) << 40) | (uint64(inB6) << 48) | (uint64(inB7) << 56));
uint8x8_t v2 = vcreate_u8(uint64(inB8) | (uint64(inB9) << 8) | (uint64(inB10) << 16) | (uint64(inB11) << 24) | (uint64(inB12) << 32) | (uint64(inB13) << 40) | (uint64(inB14) << 48) | (uint64(inB15) << 56));
mValue = vcombine_u8(v1, v2);
#else
// Scalar fallback: write the bytes through the union
mU8[0] = inB0;
mU8[1] = inB1;
mU8[2] = inB2;
mU8[3] = inB3;
mU8[4] = inB4;
mU8[5] = inB5;
mU8[6] = inB6;
mU8[7] = inB7;
mU8[8] = inB8;
mU8[9] = inB9;
mU8[10] = inB10;
mU8[11] = inB11;
mU8[12] = inB12;
mU8[13] = inB13;
mU8[14] = inB14;
mU8[15] = inB15;
#endif
}
/// Create a vector from two uint64's; inV0 holds bytes 0-7, inV1 holds bytes 8-15
BVec16::BVec16(uint64 inV0, uint64 inV1)
{
mU64[0] = inV0;
mU64[1] = inV1;
}
/// Exact comparison: true only when all 16 bytes are equal
bool BVec16::operator == (BVec16Arg inV2) const
{
return sEquals(*this, inV2).TestAllTrue();
}
/// Vector with all 16 bytes zero
BVec16 BVec16::sZero()
{
#if defined(JPH_USE_SSE)
return _mm_setzero_si128();
#elif defined(JPH_USE_NEON)
return vdupq_n_u8(0);
#else
return BVec16(0, 0);
#endif
}
/// Replicate inV to all 16 bytes
BVec16 BVec16::sReplicate(uint8 inV)
{
#if defined(JPH_USE_SSE)
return _mm_set1_epi8(char(inV));
#elif defined(JPH_USE_NEON)
return vdupq_n_u8(inV);
#else
// Scalar fallback: each shift-or doubles the number of replicated bytes (1 -> 2 -> 4 -> 8), then the same uint64 is used for both halves
uint64 v(inV);
v |= v << 8;
v |= v << 16;
v |= v << 32;
return BVec16(v, v);
#endif
}
/// Load 16 bytes from (possibly unaligned) memory
BVec16 BVec16::sLoadByte16(const uint8 *inV)
{
#if defined(JPH_USE_SSE)
// Unaligned load, so inV does not need 16 byte alignment
return _mm_loadu_si128(reinterpret_cast<const __m128i *>(inV));
#elif defined(JPH_USE_NEON)
return vld1q_u8(inV);
#else
return BVec16(inV[0], inV[1], inV[2], inV[3], inV[4], inV[5], inV[6], inV[7], inV[8], inV[9], inV[10], inV[11], inV[12], inV[13], inV[14], inV[15]);
#endif
}
/// Component wise equals; each result byte is 0xff when equal and 0x00 when not (on SSE/NEON), in the fallback only the high bit of each byte is set
BVec16 BVec16::sEquals(BVec16Arg inV1, BVec16Arg inV2)
{
#if defined(JPH_USE_SSE)
return _mm_cmpeq_epi8(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vceqq_u8(inV1.mValue, inV2.mValue);
#else
// Scalar fallback: compare 8 bytes at a time; folds the per-bit equality down so that bit 7 of each byte is the AND of all 8 bit comparisons of that byte
auto equals = [](uint64 inV1, uint64 inV2) {
uint64 r = inV1 ^ ~inV2; // Bits that are equal are 1
r &= r << 1; // Combine bit 0 through 1
r &= r << 2; // Combine bit 0 through 3
r &= r << 4; // Combine bit 0 through 7
r &= 0x8080808080808080UL; // Keep only the highest bit of each byte
return r;
};
return BVec16(equals(inV1.mU64[0], inV2.mU64[0]), equals(inV1.mU64[1], inV2.mU64[1]));
#endif
}
/// Bitwise or of all 128 bits
BVec16 BVec16::sOr(BVec16Arg inV1, BVec16Arg inV2)
{
#if defined(JPH_USE_SSE)
return _mm_or_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vorrq_u8(inV1.mValue, inV2.mValue);
#else
return BVec16(inV1.mU64[0] | inV2.mU64[0], inV1.mU64[1] | inV2.mU64[1]);
#endif
}
/// Bitwise xor of all 128 bits
BVec16 BVec16::sXor(BVec16Arg inV1, BVec16Arg inV2)
{
#if defined(JPH_USE_SSE)
return _mm_xor_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return veorq_u8(inV1.mValue, inV2.mValue);
#else
return BVec16(inV1.mU64[0] ^ inV2.mU64[0], inV1.mU64[1] ^ inV2.mU64[1]);
#endif
}
/// Bitwise and of all 128 bits
BVec16 BVec16::sAnd(BVec16Arg inV1, BVec16Arg inV2)
{
#if defined(JPH_USE_SSE)
return _mm_and_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vandq_u8(inV1.mValue, inV2.mValue);
#else
return BVec16(inV1.mU64[0] & inV2.mU64[0], inV1.mU64[1] & inV2.mU64[1]);
#endif
}
/// Bitwise not of all 128 bits
BVec16 BVec16::sNot(BVec16Arg inV1)
{
#if defined(JPH_USE_SSE)
// SSE has no native not, so xor with all ones
return sXor(inV1, sReplicate(0xff));
#elif defined(JPH_USE_NEON)
return vmvnq_u8(inV1.mValue);
#else
return BVec16(~inV1.mU64[0], ~inV1.mU64[1]);
#endif
}
/// Collect the high (sign) bit of each of the 16 bytes into a 16-bit mask; byte i maps to bit i
int BVec16::GetTrues() const
{
#if defined(JPH_USE_SSE)
return _mm_movemask_epi8(mValue);
#else
// Generic path (also taken on NEON, which has no movemask equivalent); NOTE(review): a vshrn-based mask extraction could speed up NEON here — verify against how other Jolt vector classes implement this
int result = 0;
for (int i = 0; i < 16; ++i)
result |= int(mU8[i] >> 7) << i;
return result;
#endif
}
/// True when at least one byte has its high bit set
bool BVec16::TestAnyTrue() const
{
#if defined(JPH_USE_SSE)
return _mm_movemask_epi8(mValue) != 0;
#else
// Generic path (also taken on NEON): OR both 64-bit halves and keep only the high bit of each byte
return ((mU64[0] | mU64[1]) & 0x8080808080808080UL) != 0;
#endif
}
/// True when every byte has its high bit set
bool BVec16::TestAllTrue() const
{
#if defined(JPH_USE_SSE)
return _mm_movemask_epi8(mValue) == 0b1111111111111111;
#else
// Generic path (also taken on NEON): AND both 64-bit halves; all 16 high bits must survive
return ((mU64[0] & mU64[1]) & 0x8080808080808080UL) == 0x8080808080808080UL;
#endif
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,158 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2022 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/MathTypes.h>
JPH_NAMESPACE_BEGIN
/// Holds a 4x4 matrix of floats with the last column consisting of doubles
class [[nodiscard]] alignas(JPH_DVECTOR_ALIGNMENT) DMat44
{
public:
JPH_OVERRIDE_NEW_DELETE
// Underlying column type
using Type = Vec4::Type; ///< Native type of the 3 single precision rotation columns
using DType = DVec3::Type; ///< Native type of the double precision translation column
using DTypeArg = DVec3::TypeArg;
// Argument type
using ArgType = DMat44Arg;
/// Constructor
DMat44() = default; ///< Intentionally not initialized for performance reasons
JPH_INLINE DMat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, DVec3Arg inC4);
DMat44(const DMat44 &inM2) = default;
DMat44 & operator = (const DMat44 &inM2) = default;
JPH_INLINE explicit DMat44(Mat44Arg inM);
JPH_INLINE DMat44(Mat44Arg inRot, DVec3Arg inT);
JPH_INLINE DMat44(Type inC1, Type inC2, Type inC3, DTypeArg inC4);
/// Zero matrix
static JPH_INLINE DMat44 sZero();
/// Identity matrix
static JPH_INLINE DMat44 sIdentity();
/// Rotate from quaternion
static JPH_INLINE DMat44 sRotation(QuatArg inQuat) { return DMat44(Mat44::sRotation(inQuat), DVec3::sZero()); }
/// Get matrix that translates
static JPH_INLINE DMat44 sTranslation(DVec3Arg inV) { return DMat44(Vec4(1, 0, 0, 0), Vec4(0, 1, 0, 0), Vec4(0, 0, 1, 0), inV); }
/// Get matrix that rotates and translates
static JPH_INLINE DMat44 sRotationTranslation(QuatArg inR, DVec3Arg inT) { return DMat44(Mat44::sRotation(inR), inT); }
/// Get inverse matrix of sRotationTranslation
static JPH_INLINE DMat44 sInverseRotationTranslation(QuatArg inR, DVec3Arg inT);
/// Get matrix that scales (produces a matrix with (inV, 1) on its diagonal)
static JPH_INLINE DMat44 sScale(Vec3Arg inV) { return DMat44(Mat44::sScale(inV), DVec3::sZero()); }
/// Convert to Mat44 rounding to nearest
JPH_INLINE Mat44 ToMat44() const { return Mat44(mCol[0], mCol[1], mCol[2], Vec3(mCol3)); }
/// Comparison
JPH_INLINE bool operator == (DMat44Arg inM2) const;
JPH_INLINE bool operator != (DMat44Arg inM2) const { return !(*this == inM2); }
/// Test if two matrices are close
JPH_INLINE bool IsClose(DMat44Arg inM2, float inMaxDistSq = 1.0e-12f) const;
/// Multiply matrix by matrix
JPH_INLINE DMat44 operator * (Mat44Arg inM) const;
/// Multiply matrix by matrix
JPH_INLINE DMat44 operator * (DMat44Arg inM) const;
/// Multiply vector by matrix
JPH_INLINE DVec3 operator * (Vec3Arg inV) const;
/// Multiply vector by matrix
JPH_INLINE DVec3 operator * (DVec3Arg inV) const;
/// Multiply vector by only 3x3 part of the matrix
JPH_INLINE Vec3 Multiply3x3(Vec3Arg inV) const { return GetRotation().Multiply3x3(inV); }
/// Multiply vector by only 3x3 part of the matrix
JPH_INLINE DVec3 Multiply3x3(DVec3Arg inV) const;
/// Multiply vector by only 3x3 part of the transpose of the matrix (\f$result = this^T \: inV\f$)
JPH_INLINE Vec3 Multiply3x3Transposed(Vec3Arg inV) const { return GetRotation().Multiply3x3Transposed(inV); }
/// Scale a matrix: result = this * Mat44::sScale(inScale)
JPH_INLINE DMat44 PreScaled(Vec3Arg inScale) const;
/// Scale a matrix: result = Mat44::sScale(inScale) * this
JPH_INLINE DMat44 PostScaled(Vec3Arg inScale) const;
/// Pre multiply by translation matrix: result = this * Mat44::sTranslation(inTranslation)
JPH_INLINE DMat44 PreTranslated(Vec3Arg inTranslation) const;
/// Pre multiply by translation matrix: result = this * Mat44::sTranslation(inTranslation)
JPH_INLINE DMat44 PreTranslated(DVec3Arg inTranslation) const;
/// Post multiply by translation matrix: result = Mat44::sTranslation(inTranslation) * this (i.e. add inTranslation to the 4-th column)
JPH_INLINE DMat44 PostTranslated(Vec3Arg inTranslation) const;
/// Post multiply by translation matrix: result = Mat44::sTranslation(inTranslation) * this (i.e. add inTranslation to the 4-th column)
JPH_INLINE DMat44 PostTranslated(DVec3Arg inTranslation) const;
/// Access to the columns
JPH_INLINE Vec3 GetAxisX() const { return Vec3(mCol[0]); }
JPH_INLINE void SetAxisX(Vec3Arg inV) { mCol[0] = Vec4(inV, 0.0f); }
JPH_INLINE Vec3 GetAxisY() const { return Vec3(mCol[1]); }
JPH_INLINE void SetAxisY(Vec3Arg inV) { mCol[1] = Vec4(inV, 0.0f); }
JPH_INLINE Vec3 GetAxisZ() const { return Vec3(mCol[2]); }
JPH_INLINE void SetAxisZ(Vec3Arg inV) { mCol[2] = Vec4(inV, 0.0f); }
JPH_INLINE DVec3 GetTranslation() const { return mCol3; }
JPH_INLINE void SetTranslation(DVec3Arg inV) { mCol3 = inV; }
// Note: only the 3 rotation columns can be accessed by index; the translation column is accessed through Get/SetTranslation
JPH_INLINE Vec3 GetColumn3(uint inCol) const { JPH_ASSERT(inCol < 3); return Vec3(mCol[inCol]); }
JPH_INLINE void SetColumn3(uint inCol, Vec3Arg inV) { JPH_ASSERT(inCol < 3); mCol[inCol] = Vec4(inV, 0.0f); }
JPH_INLINE Vec4 GetColumn4(uint inCol) const { JPH_ASSERT(inCol < 3); return mCol[inCol]; }
JPH_INLINE void SetColumn4(uint inCol, Vec4Arg inV) { JPH_ASSERT(inCol < 3); mCol[inCol] = inV; }
/// Transpose 3x3 subpart of matrix
JPH_INLINE Mat44 Transposed3x3() const { return GetRotation().Transposed3x3(); }
/// Inverse 4x4 matrix
JPH_INLINE DMat44 Inversed() const;
/// Inverse 4x4 matrix when it only contains rotation and translation
JPH_INLINE DMat44 InversedRotationTranslation() const;
/// Get rotation part only (note: retains the first 3 values from the bottom row)
JPH_INLINE Mat44 GetRotation() const { return Mat44(mCol[0], mCol[1], mCol[2], Vec4(0, 0, 0, 1)); }
/// Updates the rotation part of this matrix (the first 3 columns)
JPH_INLINE void SetRotation(Mat44Arg inRotation);
/// Convert to quaternion
JPH_INLINE Quat GetQuaternion() const { return GetRotation().GetQuaternion(); }
/// Get matrix that transforms a direction with the same transform as this matrix (length is not preserved)
JPH_INLINE Mat44 GetDirectionPreservingMatrix() const { return GetRotation().Inversed3x3().Transposed3x3(); }
/// Works identical to Mat44::Decompose
JPH_INLINE DMat44 Decompose(Vec3 &outScale) const { return DMat44(GetRotation().Decompose(outScale), mCol3); }
/// To String
friend ostream & operator << (ostream &inStream, DMat44Arg inM)
{
inStream << inM.mCol[0] << ", " << inM.mCol[1] << ", " << inM.mCol[2] << ", " << inM.mCol3;
return inStream;
}
private:
Vec4 mCol[3]; ///< Rotation columns
DVec3 mCol3; ///< Translation column, 4th element is assumed to be 1
};
static_assert(std::is_trivial<DMat44>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
#include "DMat44.inl"

View File

@@ -0,0 +1,310 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/DVec3.h>
JPH_NAMESPACE_BEGIN
/// Construct from 3 single precision rotation columns and a double precision translation column
DMat44::DMat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, DVec3Arg inC4) :
mCol { inC1, inC2, inC3 },
mCol3(inC4)
{
}
/// Construct from the underlying native register types
DMat44::DMat44(Type inC1, Type inC2, Type inC3, DTypeArg inC4) :
mCol { inC1, inC2, inC3 },
mCol3(inC4)
{
}
/// Widen a single precision matrix; the float translation is converted to double precision
DMat44::DMat44(Mat44Arg inM) :
mCol { inM.GetColumn4(0), inM.GetColumn4(1), inM.GetColumn4(2) },
mCol3(inM.GetTranslation())
{
}
/// Combine a single precision rotation with a double precision translation
DMat44::DMat44(Mat44Arg inRot, DVec3Arg inT) :
mCol { inRot.GetColumn4(0), inRot.GetColumn4(1), inRot.GetColumn4(2) },
mCol3(inT)
{
}
/// Matrix with all elements zero
DMat44 DMat44::sZero()
{
	Vec4 zero = Vec4::sZero();
	return DMat44(zero, zero, zero, DVec3::sZero());
}
/// Identity matrix: unit vectors on the diagonal, zero translation
DMat44 DMat44::sIdentity()
{
	Vec4 x(1, 0, 0, 0), y(0, 1, 0, 0), z(0, 0, 1, 0);
	return DMat44(x, y, z, DVec3::sZero());
}
/// Inverse of sRotationTranslation(inR, inT): the rotation of the conjugate quaternion with translation -R^-1 * inT
DMat44 DMat44::sInverseRotationTranslation(QuatArg inR, DVec3Arg inT)
{
	DMat44 result(Mat44::sRotation(inR.Conjugated()), DVec3::sZero());
	result.SetTranslation(-result.Multiply3x3(inT));
	return result;
}
/// Exact comparison of all 3 rotation columns and the translation column
bool DMat44::operator == (DMat44Arg inM2) const
{
	for (int c = 0; c < 3; ++c)
		if (!(mCol[c] == inM2.mCol[c]))
			return false;
	return mCol3 == inM2.mCol3;
}
/// Test if each column of both matrices is within inMaxDistSq of each other (the translation column is compared in double precision)
bool DMat44::IsClose(DMat44Arg inM2, float inMaxDistSq) const
{
	return mCol[0].IsClose(inM2.mCol[0], inMaxDistSq)
		&& mCol[1].IsClose(inM2.mCol[1], inMaxDistSq)
		&& mCol[2].IsClose(inM2.mCol[2], inMaxDistSq)
		&& mCol3.IsClose(inM2.mCol3, double(inMaxDistSq));
}
/// Transform inV as a point: rotation part (computed in single precision) * inV, widened to double and added to the translation column
DVec3 DMat44::operator * (Vec3Arg inV) const
{
#if defined(JPH_USE_AVX)
// 3x3 * inV in single precision
__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
// Widen to double, add translation, and duplicate Z into W
return DVec3::sFixW(_mm256_add_pd(mCol3.mValue, _mm256_cvtps_pd(t)));
#elif defined(JPH_USE_SSE)
__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
// Widen the XY and ZZ halves separately (the shuffle broadcasts Z so the high half gets Z in both lanes)
__m128d low = _mm_add_pd(mCol3.mValue.mLow, _mm_cvtps_pd(t));
__m128d high = _mm_add_pd(mCol3.mValue.mHigh, _mm_cvtps_pd(_mm_shuffle_ps(t, t, _MM_SHUFFLE(2, 2, 2, 2))));
return DVec3({ low, high });
#elif defined(JPH_USE_NEON)
float32x4_t t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
float64x2_t low = vaddq_f64(mCol3.mValue.val[0], vcvt_f64_f32(vget_low_f32(t)));
float64x2_t high = vaddq_f64(mCol3.mValue.val[1], vcvt_high_f64_f32(t));
return DVec3::sFixW({ low, high });
#else
// Scalar fallback: note that the 3x3 product is evaluated in float before being widened to double, matching the SIMD paths
return DVec3(
mCol3.mF64[0] + double(mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2]),
mCol3.mF64[1] + double(mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2]),
mCol3.mF64[2] + double(mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2]));
#endif
}
/// Transform inV as a point: the float rotation columns are widened to double, multiplied by inV in double precision and added to the translation
DVec3 DMat44::operator * (DVec3Arg inV) const
{
#if defined(JPH_USE_AVX)
__m256d t = _mm256_add_pd(mCol3.mValue, _mm256_mul_pd(_mm256_cvtps_pd(mCol[0].mValue), _mm256_set1_pd(inV.mF64[0])));
t = _mm256_add_pd(t, _mm256_mul_pd(_mm256_cvtps_pd(mCol[1].mValue), _mm256_set1_pd(inV.mF64[1])));
t = _mm256_add_pd(t, _mm256_mul_pd(_mm256_cvtps_pd(mCol[2].mValue), _mm256_set1_pd(inV.mF64[2])));
return DVec3::sFixW(t);
#elif defined(JPH_USE_SSE)
// Broadcast each component of inV
__m128d xxxx = _mm_set1_pd(inV.mF64[0]);
__m128d yyyy = _mm_set1_pd(inV.mF64[1]);
__m128d zzzz = _mm_set1_pd(inV.mF64[2]);
__m128 col0 = mCol[0].mValue;
__m128 col1 = mCol[1].mValue;
__m128 col2 = mCol[2].mValue;
// Low half holds elements X and Y of the result
__m128d t_low = _mm_add_pd(mCol3.mValue.mLow, _mm_mul_pd(_mm_cvtps_pd(col0), xxxx));
t_low = _mm_add_pd(t_low, _mm_mul_pd(_mm_cvtps_pd(col1), yyyy));
t_low = _mm_add_pd(t_low, _mm_mul_pd(_mm_cvtps_pd(col2), zzzz));
// High half holds element Z (broadcast by the shuffles, so W ends up equal to Z)
__m128d t_high = _mm_add_pd(mCol3.mValue.mHigh, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col0, col0, _MM_SHUFFLE(2, 2, 2, 2))), xxxx));
t_high = _mm_add_pd(t_high, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col1, col1, _MM_SHUFFLE(2, 2, 2, 2))), yyyy));
t_high = _mm_add_pd(t_high, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col2, col2, _MM_SHUFFLE(2, 2, 2, 2))), zzzz));
return DVec3({ t_low, t_high });
#elif defined(JPH_USE_NEON)
float64x2_t xxxx = vdupq_laneq_f64(inV.mValue.val[0], 0);
float64x2_t yyyy = vdupq_laneq_f64(inV.mValue.val[0], 1);
float64x2_t zzzz = vdupq_laneq_f64(inV.mValue.val[1], 0);
float32x4_t col0 = mCol[0].mValue;
float32x4_t col1 = mCol[1].mValue;
float32x4_t col2 = mCol[2].mValue;
float64x2_t t_low = vaddq_f64(mCol3.mValue.val[0], vmulq_f64(vcvt_f64_f32(vget_low_f32(col0)), xxxx));
t_low = vaddq_f64(t_low, vmulq_f64(vcvt_f64_f32(vget_low_f32(col1)), yyyy));
t_low = vaddq_f64(t_low, vmulq_f64(vcvt_f64_f32(vget_low_f32(col2)), zzzz));
float64x2_t t_high = vaddq_f64(mCol3.mValue.val[1], vmulq_f64(vcvt_high_f64_f32(col0), xxxx));
t_high = vaddq_f64(t_high, vmulq_f64(vcvt_high_f64_f32(col1), yyyy));
t_high = vaddq_f64(t_high, vmulq_f64(vcvt_high_f64_f32(col2), zzzz));
return DVec3::sFixW({ t_low, t_high });
#else
// Scalar fallback: each matrix element is promoted to double before the multiply, matching the SIMD paths
return DVec3(
mCol3.mF64[0] + double(mCol[0].mF32[0]) * inV.mF64[0] + double(mCol[1].mF32[0]) * inV.mF64[1] + double(mCol[2].mF32[0]) * inV.mF64[2],
mCol3.mF64[1] + double(mCol[0].mF32[1]) * inV.mF64[0] + double(mCol[1].mF32[1]) * inV.mF64[1] + double(mCol[2].mF32[1]) * inV.mF64[2],
mCol3.mF64[2] + double(mCol[0].mF32[2]) * inV.mF64[0] + double(mCol[1].mF32[2]) * inV.mF64[1] + double(mCol[2].mF32[2]) * inV.mF64[2]);
#endif
}
/// Multiply inV by the 3x3 rotation part only (same structure as operator * (DVec3Arg) but without adding the translation column)
DVec3 DMat44::Multiply3x3(DVec3Arg inV) const
{
#if defined(JPH_USE_AVX)
__m256d t = _mm256_mul_pd(_mm256_cvtps_pd(mCol[0].mValue), _mm256_set1_pd(inV.mF64[0]));
t = _mm256_add_pd(t, _mm256_mul_pd(_mm256_cvtps_pd(mCol[1].mValue), _mm256_set1_pd(inV.mF64[1])));
t = _mm256_add_pd(t, _mm256_mul_pd(_mm256_cvtps_pd(mCol[2].mValue), _mm256_set1_pd(inV.mF64[2])));
return DVec3::sFixW(t);
#elif defined(JPH_USE_SSE)
// Broadcast each component of inV
__m128d xxxx = _mm_set1_pd(inV.mF64[0]);
__m128d yyyy = _mm_set1_pd(inV.mF64[1]);
__m128d zzzz = _mm_set1_pd(inV.mF64[2]);
__m128 col0 = mCol[0].mValue;
__m128 col1 = mCol[1].mValue;
__m128 col2 = mCol[2].mValue;
// Low half holds elements X and Y, high half holds Z (broadcast, so W equals Z)
__m128d t_low = _mm_mul_pd(_mm_cvtps_pd(col0), xxxx);
t_low = _mm_add_pd(t_low, _mm_mul_pd(_mm_cvtps_pd(col1), yyyy));
t_low = _mm_add_pd(t_low, _mm_mul_pd(_mm_cvtps_pd(col2), zzzz));
__m128d t_high = _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col0, col0, _MM_SHUFFLE(2, 2, 2, 2))), xxxx);
t_high = _mm_add_pd(t_high, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col1, col1, _MM_SHUFFLE(2, 2, 2, 2))), yyyy));
t_high = _mm_add_pd(t_high, _mm_mul_pd(_mm_cvtps_pd(_mm_shuffle_ps(col2, col2, _MM_SHUFFLE(2, 2, 2, 2))), zzzz));
return DVec3({ t_low, t_high });
#elif defined(JPH_USE_NEON)
float64x2_t xxxx = vdupq_laneq_f64(inV.mValue.val[0], 0);
float64x2_t yyyy = vdupq_laneq_f64(inV.mValue.val[0], 1);
float64x2_t zzzz = vdupq_laneq_f64(inV.mValue.val[1], 0);
float32x4_t col0 = mCol[0].mValue;
float32x4_t col1 = mCol[1].mValue;
float32x4_t col2 = mCol[2].mValue;
float64x2_t t_low = vmulq_f64(vcvt_f64_f32(vget_low_f32(col0)), xxxx);
t_low = vaddq_f64(t_low, vmulq_f64(vcvt_f64_f32(vget_low_f32(col1)), yyyy));
t_low = vaddq_f64(t_low, vmulq_f64(vcvt_f64_f32(vget_low_f32(col2)), zzzz));
float64x2_t t_high = vmulq_f64(vcvt_high_f64_f32(col0), xxxx);
t_high = vaddq_f64(t_high, vmulq_f64(vcvt_high_f64_f32(col1), yyyy));
t_high = vaddq_f64(t_high, vmulq_f64(vcvt_high_f64_f32(col2), zzzz));
return DVec3::sFixW({ t_low, t_high });
#else
// Scalar fallback: each matrix element is promoted to double before the multiply, matching the SIMD paths
return DVec3(
double(mCol[0].mF32[0]) * inV.mF64[0] + double(mCol[1].mF32[0]) * inV.mF64[1] + double(mCol[2].mF32[0]) * inV.mF64[2],
double(mCol[0].mF32[1]) * inV.mF64[0] + double(mCol[1].mF32[1]) * inV.mF64[1] + double(mCol[2].mF32[1]) * inV.mF64[2],
double(mCol[0].mF32[2]) * inV.mF64[0] + double(mCol[1].mF32[2]) * inV.mF64[1] + double(mCol[2].mF32[2]) * inV.mF64[2]);
#endif
}
/// Multiply by a single precision matrix; the rotation 3x3 product is computed in single precision, the translation through operator * (Vec3Arg)
DMat44 DMat44::operator * (Mat44Arg inM) const
{
DMat44 result;
// Rotation part
#if defined(JPH_USE_SSE)
for (int i = 0; i < 3; ++i)
{
// Result column i = linear combination of our rotation columns weighted by inM's column i
__m128 c = inM.GetColumn4(i).mValue;
__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
result.mCol[i].mValue = t;
}
#elif defined(JPH_USE_NEON)
for (int i = 0; i < 3; ++i)
{
Type c = inM.GetColumn4(i).mValue;
Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
result.mCol[i].mValue = t;
}
#else
for (int i = 0; i < 3; ++i)
{
Vec4 coli = inM.GetColumn4(i);
result.mCol[i] = mCol[0] * coli.mF32[0] + mCol[1] * coli.mF32[1] + mCol[2] * coli.mF32[2];
}
#endif
// Translation part: transform inM's float translation as a point (adds our translation)
result.mCol3 = *this * inM.GetTranslation();
return result;
}
/// Multiply by a double precision matrix: the rotation part is the single precision 3x3 product,
/// the translation part is this * inM.GetTranslation() (computed in double precision)
DMat44 DMat44::operator * (DMat44Arg inM) const
{
	DMat44 result;
	// Rotation part
#if defined(JPH_USE_SSE)
	for (int i = 0; i < 3; ++i)
	{
		// Result column i = linear combination of our rotation columns weighted by inM's column i
		__m128 c = inM.mCol[i].mValue;
		__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
		result.mCol[i].mValue = t;
	}
#elif defined(JPH_USE_NEON)
	for (int i = 0; i < 3; ++i)
	{
		// Access the column member directly for consistency with the SSE and scalar branches (avoids the JPH_ASSERT in GetColumn4; i < 3 always holds here)
		Type c = inM.mCol[i].mValue;
		Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
		t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
		t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
		result.mCol[i].mValue = t;
	}
#else
	for (int i = 0; i < 3; ++i)
	{
		Vec4 coli = inM.mCol[i];
		result.mCol[i] = mCol[0] * coli.mF32[0] + mCol[1] * coli.mF32[1] + mCol[2] * coli.mF32[2];
	}
#endif
	// Translation part: transform inM's translation as a point (adds our translation)
	result.mCol3 = *this * inM.GetTranslation();
	return result;
}
/// Replace the 3 rotation columns with those of inRotation; the translation column is left untouched
void DMat44::SetRotation(Mat44Arg inRotation)
{
	for (uint c = 0; c < 3; ++c)
		mCol[c] = inRotation.GetColumn4(c);
}
/// result = this * Mat44::sScale(inScale): each rotation column is scaled by the matching component, translation unaffected
DMat44 DMat44::PreScaled(Vec3Arg inScale) const
{
	Vec4 c0 = inScale.GetX() * mCol[0];
	Vec4 c1 = inScale.GetY() * mCol[1];
	Vec4 c2 = inScale.GetZ() * mCol[2];
	return DMat44(c0, c1, c2, mCol3);
}
/// result = Mat44::sScale(inScale) * this: every column (translation included) is scaled component wise by (inScale, 1)
DMat44 DMat44::PostScaled(Vec3Arg inScale) const
{
	Vec4 s(inScale, 1);
	Vec4 c0 = s * mCol[0];
	Vec4 c1 = s * mCol[1];
	Vec4 c2 = s * mCol[2];
	return DMat44(c0, c1, c2, DVec3(s) * mCol3);
}
/// result = this * Mat44::sTranslation(inTranslation): rotate the offset into our space and add it to our translation
DMat44 DMat44::PreTranslated(Vec3Arg inTranslation) const
{
	DVec3 translation = GetTranslation() + Multiply3x3(inTranslation);
	return DMat44(mCol[0], mCol[1], mCol[2], translation);
}
/// result = this * Mat44::sTranslation(inTranslation): rotate the double precision offset into our space and add it to our translation
DMat44 DMat44::PreTranslated(DVec3Arg inTranslation) const
{
	DVec3 translation = GetTranslation() + Multiply3x3(inTranslation);
	return DMat44(mCol[0], mCol[1], mCol[2], translation);
}
/// result = Mat44::sTranslation(inTranslation) * this: simply offsets the translation column
DMat44 DMat44::PostTranslated(Vec3Arg inTranslation) const
{
	DVec3 translation = GetTranslation() + inTranslation;
	return DMat44(mCol[0], mCol[1], mCol[2], translation);
}
/// result = Mat44::sTranslation(inTranslation) * this: simply offsets the translation column
DMat44 DMat44::PostTranslated(DVec3Arg inTranslation) const
{
	DVec3 translation = GetTranslation() + inTranslation;
	return DMat44(mCol[0], mCol[1], mCol[2], translation);
}
/// General inverse: invert the 3x3 block, then the translation of the inverse is -R^-1 * T
DMat44 DMat44::Inversed() const
{
	DMat44 inv(GetRotation().Inversed3x3());
	inv.SetTranslation(-inv.Multiply3x3(mCol3));
	return inv;
}
/// Inverse for a pure rotation + translation matrix: the inverse of the 3x3 block is its transpose, translation becomes -R^T * T
DMat44 DMat44::InversedRotationTranslation() const
{
	DMat44 inv(GetRotation().Transposed3x3());
	inv.SetTranslation(-inv.Multiply3x3(mCol3));
	return inv;
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,291 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/Double3.h>
JPH_NAMESPACE_BEGIN
/// 3 component vector of doubles (stored as 4 vectors).
/// Note that we keep the 4th component the same as the 3rd component to avoid divisions by zero when JPH_FLOATING_POINT_EXCEPTIONS_ENABLED defined
class [[nodiscard]] alignas(JPH_DVECTOR_ALIGNMENT) DVec3
{
public:
JPH_OVERRIDE_NEW_DELETE
// Underlying vector type
#if defined(JPH_USE_AVX)
using Type = __m256d;
using TypeArg = __m256d;
#elif defined(JPH_USE_SSE)
using Type = struct { __m128d mLow, mHigh; };
using TypeArg = const Type &;
#elif defined(JPH_USE_NEON)
using Type = float64x2x2_t;
using TypeArg = const Type &;
#else
using Type = struct { double mData[4]; };
using TypeArg = const Type &;
#endif
// Argument type
using ArgType = DVec3Arg;
/// Constructor
DVec3() = default; ///< Intentionally not initialized for performance reasons
DVec3(const DVec3 &inRHS) = default;
DVec3 & operator = (const DVec3 &inRHS) = default;
JPH_INLINE explicit DVec3(Vec3Arg inRHS);
JPH_INLINE explicit DVec3(Vec4Arg inRHS);
JPH_INLINE DVec3(TypeArg inRHS) : mValue(inRHS) { CheckW(); }
/// Create a vector from 3 components
JPH_INLINE DVec3(double inX, double inY, double inZ);
/// Load 3 doubles from memory
explicit JPH_INLINE DVec3(const Double3 &inV);
/// Vector with all zeros
static JPH_INLINE DVec3 sZero();
/// Vector with all ones
static JPH_INLINE DVec3 sOne();
/// Vectors with the principal axis
static JPH_INLINE DVec3 sAxisX() { return DVec3(1, 0, 0); }
static JPH_INLINE DVec3 sAxisY() { return DVec3(0, 1, 0); }
static JPH_INLINE DVec3 sAxisZ() { return DVec3(0, 0, 1); }
/// Replicate inV across all components
static JPH_INLINE DVec3 sReplicate(double inV);
/// Vector with all NaN's
static JPH_INLINE DVec3 sNaN();
/// Load 3 doubles from memory (reads 64 bits extra which it doesn't use)
static JPH_INLINE DVec3 sLoadDouble3Unsafe(const Double3 &inV);
/// Store 3 doubles to memory
JPH_INLINE void StoreDouble3(Double3 *outV) const;
/// Convert to float vector 3 rounding to nearest
JPH_INLINE explicit operator Vec3() const;
/// Prepare to convert to float vector 3 rounding towards zero (returns DVec3 that can be converted to a Vec3 to get the rounding)
JPH_INLINE DVec3 PrepareRoundToZero() const;
/// Prepare to convert to float vector 3 rounding towards positive/negative inf (returns DVec3 that can be converted to a Vec3 to get the rounding)
JPH_INLINE DVec3 PrepareRoundToInf() const;
/// Convert to float vector 3 rounding down
JPH_INLINE Vec3 ToVec3RoundDown() const;
/// Convert to float vector 3 rounding up
JPH_INLINE Vec3 ToVec3RoundUp() const;
/// Return the minimum value of each of the components
static JPH_INLINE DVec3 sMin(DVec3Arg inV1, DVec3Arg inV2);
/// Return the maximum of each of the components
static JPH_INLINE DVec3 sMax(DVec3Arg inV1, DVec3Arg inV2);
/// Clamp a vector between min and max (component wise)
static JPH_INLINE DVec3 sClamp(DVec3Arg inV, DVec3Arg inMin, DVec3Arg inMax);
/// Equals (component wise)
static JPH_INLINE DVec3 sEquals(DVec3Arg inV1, DVec3Arg inV2);
/// Less than (component wise)
static JPH_INLINE DVec3 sLess(DVec3Arg inV1, DVec3Arg inV2);
/// Less than or equal (component wise)
static JPH_INLINE DVec3 sLessOrEqual(DVec3Arg inV1, DVec3Arg inV2);
/// Greater than (component wise)
static JPH_INLINE DVec3 sGreater(DVec3Arg inV1, DVec3Arg inV2);
/// Greater than or equal (component wise)
static JPH_INLINE DVec3 sGreaterOrEqual(DVec3Arg inV1, DVec3Arg inV2);
/// Calculates inMul1 * inMul2 + inAdd
static JPH_INLINE DVec3 sFusedMultiplyAdd(DVec3Arg inMul1, DVec3Arg inMul2, DVec3Arg inAdd);
/// Component wise select, returns inNotSet when highest bit of inControl = 0 and inSet when highest bit of inControl = 1
static JPH_INLINE DVec3 sSelect(DVec3Arg inNotSet, DVec3Arg inSet, DVec3Arg inControl);
/// Logical or (component wise)
static JPH_INLINE DVec3 sOr(DVec3Arg inV1, DVec3Arg inV2);
/// Logical xor (component wise)
static JPH_INLINE DVec3 sXor(DVec3Arg inV1, DVec3Arg inV2);
/// Logical and (component wise)
static JPH_INLINE DVec3 sAnd(DVec3Arg inV1, DVec3Arg inV2);
/// Store if X is true in bit 0, Y in bit 1, Z in bit 2 and W in bit 3 (true is when highest bit of component is set)
JPH_INLINE int GetTrues() const;
/// Test if any of the components are true (true is when highest bit of component is set)
JPH_INLINE bool TestAnyTrue() const;
/// Test if all components are true (true is when highest bit of component is set)
JPH_INLINE bool TestAllTrue() const;
/// Get individual components.
/// Lane 0 of a SIMD register can be extracted directly; the remaining lanes are read through the union with mF64.
#if defined(JPH_USE_AVX)
JPH_INLINE double GetX() const { return _mm_cvtsd_f64(_mm256_castpd256_pd128(mValue)); }
JPH_INLINE double GetY() const { return mF64[1]; }
JPH_INLINE double GetZ() const { return mF64[2]; }
#elif defined(JPH_USE_SSE)
JPH_INLINE double GetX() const { return _mm_cvtsd_f64(mValue.mLow); }
JPH_INLINE double GetY() const { return mF64[1]; }
JPH_INLINE double GetZ() const { return _mm_cvtsd_f64(mValue.mHigh); }
#elif defined(JPH_USE_NEON)
JPH_INLINE double GetX() const { return vgetq_lane_f64(mValue.val[0], 0); }
JPH_INLINE double GetY() const { return vgetq_lane_f64(mValue.val[0], 1); }
JPH_INLINE double GetZ() const { return vgetq_lane_f64(mValue.val[1], 0); }
#else
JPH_INLINE double GetX() const { return mF64[0]; }
JPH_INLINE double GetY() const { return mF64[1]; }
JPH_INLINE double GetZ() const { return mF64[2]; }
#endif
/// Set individual components (writes go through the union with mF64)
JPH_INLINE void SetX(double inX) { mF64[0] = inX; }
JPH_INLINE void SetY(double inY) { mF64[1] = inY; }
JPH_INLINE void SetZ(double inZ) { mF64[2] = mF64[3] = inZ; } // Assure Z and W are the same
/// Set all components
JPH_INLINE void Set(double inX, double inY, double inZ) { *this = DVec3(inX, inY, inZ); }
/// Get double component by index
JPH_INLINE double operator [] (uint inCoordinate) const { JPH_ASSERT(inCoordinate < 3); return mF64[inCoordinate]; }
/// Set double component by index (re-runs sFixW since writing Z must also update W)
JPH_INLINE void SetComponent(uint inCoordinate, double inValue) { JPH_ASSERT(inCoordinate < 3); mF64[inCoordinate] = inValue; mValue = sFixW(mValue); } // Assure Z and W are the same
/// Comparison (component wise equality of X, Y and Z)
JPH_INLINE bool operator == (DVec3Arg inV2) const;
JPH_INLINE bool operator != (DVec3Arg inV2) const { return !(*this == inV2); }
/// Test if two vectors are close (squared distance at most inMaxDistSq)
JPH_INLINE bool IsClose(DVec3Arg inV2, double inMaxDistSq = 1.0e-24) const;
/// Test if vector is near zero (squared length at most inMaxDistSq)
JPH_INLINE bool IsNearZero(double inMaxDistSq = 1.0e-24) const;
/// Test if vector is normalized (squared length within inTolerance of 1)
JPH_INLINE bool IsNormalized(double inTolerance = 1.0e-12) const;
/// Test if vector contains NaN elements
JPH_INLINE bool IsNaN() const;
/// Multiply two double vectors (component wise)
JPH_INLINE DVec3 operator * (DVec3Arg inV2) const;
/// Multiply vector with double
JPH_INLINE DVec3 operator * (double inV2) const;
/// Multiply vector with double
friend JPH_INLINE DVec3 operator * (double inV1, DVec3Arg inV2);
/// Divide vector by double
JPH_INLINE DVec3 operator / (double inV2) const;
/// Multiply vector with double
JPH_INLINE DVec3 & operator *= (double inV2);
/// Multiply vector with vector
JPH_INLINE DVec3 & operator *= (DVec3Arg inV2);
/// Divide vector by double
JPH_INLINE DVec3 & operator /= (double inV2);
/// Add two vectors (component wise)
JPH_INLINE DVec3 operator + (Vec3Arg inV2) const;
/// Add two double vectors (component wise)
JPH_INLINE DVec3 operator + (DVec3Arg inV2) const;
/// Add two vectors (component wise)
JPH_INLINE DVec3 & operator += (Vec3Arg inV2);
/// Add two double vectors (component wise)
JPH_INLINE DVec3 & operator += (DVec3Arg inV2);
/// Negate
JPH_INLINE DVec3 operator - () const;
/// Subtract two vectors (component wise)
JPH_INLINE DVec3 operator - (Vec3Arg inV2) const;
/// Subtract two double vectors (component wise)
JPH_INLINE DVec3 operator - (DVec3Arg inV2) const;
/// Subtract two vectors (component wise)
JPH_INLINE DVec3 & operator -= (Vec3Arg inV2);
/// Subtract two vectors (component wise)
JPH_INLINE DVec3 & operator -= (DVec3Arg inV2);
/// Divide (component wise)
JPH_INLINE DVec3 operator / (DVec3Arg inV2) const;
/// Return the absolute value of each of the components
JPH_INLINE DVec3 Abs() const;
/// Reciprocal vector (1 / value) for each of the components
JPH_INLINE DVec3 Reciprocal() const;
/// Cross product
JPH_INLINE DVec3 Cross(DVec3Arg inV2) const;
/// Dot product
JPH_INLINE double Dot(DVec3Arg inV2) const;
/// Squared length of vector
JPH_INLINE double LengthSq() const;
/// Length of vector
JPH_INLINE double Length() const;
/// Normalize vector (no check for zero length)
JPH_INLINE DVec3 Normalized() const;
/// Component wise square root
JPH_INLINE DVec3 Sqrt() const;
/// Get vector that contains the sign of each element (returns 1 if positive, -1 if negative)
JPH_INLINE DVec3 GetSign() const;
/// To String (writes "X, Y, Z" to the stream)
friend ostream & operator << (ostream &inStream, DVec3Arg inV)
{
inStream << inV.mF64[0] << ", " << inV.mF64[1] << ", " << inV.mF64[2];
return inStream;
}
/// Internal helper function that checks that W is equal to Z, so e.g. dividing by it should not generate div by 0
JPH_INLINE void CheckW() const;
/// Internal helper function that ensures that the Z component is replicated to the W component to prevent divisions by zero
static JPH_INLINE Type sFixW(TypeArg inValue);
/// Representations of true and false for boolean operations.
/// cTrue is the all-ones bit pattern (a NaN when interpreted as a double) and is only meaningful as a bit mask.
inline static const double cTrue = BitCast<double>(~uint64(0));
inline static const double cFalse = 0.0;
/// Scalar access aliases the SIMD register; mF64[3] (W) always mirrors mF64[2] (Z), see sFixW
union
{
Type mValue;
double mF64[4];
};
};
static_assert(std::is_trivial<DVec3>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
#include "DVec3.inl"

View File

@@ -0,0 +1,941 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/HashCombine.h>
// Create a std::hash/JPH::Hash for DVec3
JPH_MAKE_HASHABLE(JPH::DVec3, t.GetX(), t.GetY(), t.GetZ())
JPH_NAMESPACE_BEGIN
// Widen a single precision Vec3 to double precision
DVec3::DVec3(Vec3Arg inRHS)
{
#if defined(JPH_USE_AVX)
	mValue = _mm256_cvtps_pd(inRHS.mValue);
#elif defined(JPH_USE_SSE)
	mValue.mLow = _mm_cvtps_pd(inRHS.mValue);
	mValue.mHigh = _mm_cvtps_pd(_mm_shuffle_ps(inRHS.mValue, inRHS.mValue, _MM_SHUFFLE(2, 2, 2, 2))); // Broadcast Z so the high half becomes (Z, Z)
#elif defined(JPH_USE_NEON)
	mValue.val[0] = vcvt_f64_f32(vget_low_f32(inRHS.mValue));
	mValue.val[1] = vcvt_high_f64_f32(inRHS.mValue); // NOTE(review): W of the result comes from inRHS's W lane - assumes Vec3 keeps W valid, verify against Vec3
#else
	mF64[0] = (double)inRHS.GetX();
	mF64[1] = (double)inRHS.GetY();
	mF64[2] = (double)inRHS.GetZ();
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2]; // Maintain the Z == W invariant (see sFixW)
#endif
#endif
}

// Widen the XYZ components of a single precision Vec4 (W is discarded)
DVec3::DVec3(Vec4Arg inRHS) :
	DVec3(Vec3(inRHS))
{
}

// Construct from three doubles
DVec3::DVec3(double inX, double inY, double inZ)
{
#if defined(JPH_USE_AVX)
	mValue = _mm256_set_pd(inZ, inZ, inY, inX); // Assure Z and W are the same
#elif defined(JPH_USE_SSE)
	mValue.mLow = _mm_set_pd(inY, inX);
	mValue.mHigh = _mm_set1_pd(inZ); // Assure Z and W are the same
#elif defined(JPH_USE_NEON)
	mValue.val[0] = vcombine_f64(vcreate_f64(BitCast<uint64>(inX)), vcreate_f64(BitCast<uint64>(inY)));
	mValue.val[1] = vdupq_n_f64(inZ); // Assure Z and W are the same
#else
	mF64[0] = inX;
	mF64[1] = inY;
	mF64[2] = inZ;
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2];
#endif
#endif
}

// Load from a Double3; all paths read exactly the 3 doubles of inV (contrast with sLoadDouble3Unsafe)
DVec3::DVec3(const Double3 &inV)
{
#if defined(JPH_USE_AVX)
	Type x = _mm256_castpd128_pd256(_mm_load_sd(&inV.x));
	Type y = _mm256_castpd128_pd256(_mm_load_sd(&inV.y));
	Type z = _mm256_broadcast_sd(&inV.z);
	Type xy = _mm256_unpacklo_pd(x, y);
	mValue = _mm256_blend_pd(xy, z, 0b1100); // Assure Z and W are the same
#elif defined(JPH_USE_SSE)
	mValue.mLow = _mm_loadu_pd(&inV.x);
	mValue.mHigh = _mm_set1_pd(inV.z); // Assure Z and W are the same
#elif defined(JPH_USE_NEON)
	mValue.val[0] = vld1q_f64(&inV.x);
	mValue.val[1] = vdupq_n_f64(inV.z); // Assure Z and W are the same
#else
	mF64[0] = inV.x;
	mF64[1] = inV.y;
	mF64[2] = inV.z;
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2];
#endif
#endif
}
// Asserts the invariant that the internal W component is a bit-exact copy of Z.
// Only checked when floating point exceptions are enabled, since then a garbage W could cause a spurious div-by-zero trap.
void DVec3::CheckW() const
{
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	// Avoid asserts when both components are NaN: compare the raw bit patterns instead of the doubles
	JPH_ASSERT(reinterpret_cast<const uint64 *>(mF64)[2] == reinterpret_cast<const uint64 *>(mF64)[3]);
#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
}

/// Internal helper function that ensures that the Z component is replicated to the W component to prevent divisions by zero
DVec3::Type DVec3::sFixW(TypeArg inValue)
{
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
#if defined(JPH_USE_AVX)
	return _mm256_shuffle_pd(inValue, inValue, 2); // Produces (X, Y, Z, Z)
#elif defined(JPH_USE_SSE)
	Type value;
	value.mLow = inValue.mLow;
	value.mHigh = _mm_shuffle_pd(inValue.mHigh, inValue.mHigh, 0); // Duplicate Z into W
	return value;
#elif defined(JPH_USE_NEON)
	Type value;
	value.val[0] = inValue.val[0];
	value.val[1] = vdupq_laneq_f64(inValue.val[1], 0); // Duplicate Z into W
	return value;
#else
	Type value;
	value.mData[0] = inValue.mData[0];
	value.mData[1] = inValue.mData[1];
	value.mData[2] = inValue.mData[2];
	value.mData[3] = inValue.mData[2]; // Duplicate Z into W
	return value;
#endif
#else
	// Without floating point exceptions an arbitrary W is harmless, pass the value through unchanged
	return inValue;
#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
}
// Vector with all components (including the internal W) set to zero
DVec3 DVec3::sZero()
{
#if defined(JPH_USE_AVX)
	return _mm256_setzero_pd();
#elif defined(JPH_USE_SSE)
	__m128d zero = _mm_setzero_pd();
	return DVec3({ zero, zero });
#elif defined(JPH_USE_NEON)
	float64x2_t zero = vdupq_n_f64(0.0);
	return DVec3({ zero, zero });
#else
	return DVec3(0, 0, 0);
#endif
}

// Vector with all components (including the internal W) set to inV
DVec3 DVec3::sReplicate(double inV)
{
#if defined(JPH_USE_AVX)
	return _mm256_set1_pd(inV);
#elif defined(JPH_USE_SSE)
	__m128d value = _mm_set1_pd(inV);
	return DVec3({ value, value });
#elif defined(JPH_USE_NEON)
	float64x2_t value = vdupq_n_f64(inV);
	return DVec3({ value, value });
#else
	return DVec3(inV, inV, inV);
#endif
}

// Vector with all components set to one
DVec3 DVec3::sOne()
{
	return sReplicate(1.0);
}

// Vector with all components set to quiet NaN
DVec3 DVec3::sNaN()
{
	return sReplicate(numeric_limits<double>::quiet_NaN());
}

// Load 3 doubles. "Unsafe" because the AVX and NEON paths load 4 doubles and thus read
// 8 bytes past the end of inV; the caller must guarantee that memory is readable.
DVec3 DVec3::sLoadDouble3Unsafe(const Double3 &inV)
{
#if defined(JPH_USE_AVX)
	Type v = _mm256_loadu_pd(&inV.x);
#elif defined(JPH_USE_SSE)
	Type v;
	v.mLow = _mm_loadu_pd(&inV.x);
	v.mHigh = _mm_set1_pd(inV.z);
#elif defined(JPH_USE_NEON)
	Type v = vld1q_f64_x2(&inV.x);
#else
	Type v = { inV.x, inV.y, inV.z };
#endif
	return sFixW(v); // The over-reading paths leave garbage in W; restore the Z == W invariant when needed
}
// Write the X, Y and Z components to outV; the internal W component is not stored
void DVec3::StoreDouble3(Double3 *outV) const
{
	outV->x = GetX();
	outV->y = GetY();
	outV->z = GetZ();
}
// Narrowing conversion to single precision
DVec3::operator Vec3() const
{
#if defined(JPH_USE_AVX)
	return _mm256_cvtpd_ps(mValue);
#elif defined(JPH_USE_SSE)
	__m128 low = _mm_cvtpd_ps(mValue.mLow); // (X, Y, -, -)
	__m128 high = _mm_cvtpd_ps(mValue.mHigh); // (Z, W, -, -)
	return _mm_shuffle_ps(low, high, _MM_SHUFFLE(1, 0, 1, 0));
#elif defined(JPH_USE_NEON)
	return vcvt_high_f32_f64(vcvtx_f32_f64(mValue.val[0]), mValue.val[1]); // NOTE(review): vcvtx converts X/Y with round-to-odd while the high half uses default rounding - verify this asymmetry is intended
#else
	return Vec3((float)GetX(), (float)GetY(), (float)GetZ());
#endif
}
// Component wise minimum
DVec3 DVec3::sMin(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_min_pd(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_min_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_min_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vminq_f64(inV1.mValue.val[0], inV2.mValue.val[0]), vminq_f64(inV1.mValue.val[1], inV2.mValue.val[1]) });
#else
	return DVec3(min(inV1.mF64[0], inV2.mF64[0]),
	min(inV1.mF64[1], inV2.mF64[1]),
	min(inV1.mF64[2], inV2.mF64[2]));
#endif
}

// Component wise maximum
DVec3 DVec3::sMax(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_max_pd(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_max_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_max_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vmaxq_f64(inV1.mValue.val[0], inV2.mValue.val[0]), vmaxq_f64(inV1.mValue.val[1], inV2.mValue.val[1]) });
#else
	return DVec3(max(inV1.mF64[0], inV2.mF64[0]),
	max(inV1.mF64[1], inV2.mF64[1]),
	max(inV1.mF64[2], inV2.mF64[2]));
#endif
}
// Clamp each component of inV to [inMin, inMax].
// The upper clamp is applied first, then the lower clamp, so when inMin > inMax for a component, inMin wins.
DVec3 DVec3::sClamp(DVec3Arg inV, DVec3Arg inMin, DVec3Arg inMax)
{
	DVec3 upper_clamped = sMin(inV, inMax);
	return sMax(upper_clamped, inMin);
}
// Component wise equality; each component of the result is all bits set (cTrue) when equal, all bits clear otherwise.
// Ordered comparison: a NaN operand yields false.
DVec3 DVec3::sEquals(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_EQ_OQ)
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_cmpeq_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_cmpeq_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vreinterpretq_f64_u64(vceqq_f64(inV1.mValue.val[0], inV2.mValue.val[0])), vreinterpretq_f64_u64(vceqq_f64(inV1.mValue.val[1], inV2.mValue.val[1])) });
#else
	return DVec3(inV1.mF64[0] == inV2.mF64[0]? cTrue : cFalse,
	inV1.mF64[1] == inV2.mF64[1]? cTrue : cFalse,
	inV1.mF64[2] == inV2.mF64[2]? cTrue : cFalse);
#endif
}

// Component wise less than; result components are cTrue / cFalse masks
DVec3 DVec3::sLess(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_LT_OQ);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_cmplt_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_cmplt_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vreinterpretq_f64_u64(vcltq_f64(inV1.mValue.val[0], inV2.mValue.val[0])), vreinterpretq_f64_u64(vcltq_f64(inV1.mValue.val[1], inV2.mValue.val[1])) });
#else
	return DVec3(inV1.mF64[0] < inV2.mF64[0]? cTrue : cFalse,
	inV1.mF64[1] < inV2.mF64[1]? cTrue : cFalse,
	inV1.mF64[2] < inV2.mF64[2]? cTrue : cFalse);
#endif
}

// Component wise less than or equal; result components are cTrue / cFalse masks
DVec3 DVec3::sLessOrEqual(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_LE_OQ);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_cmple_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_cmple_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vreinterpretq_f64_u64(vcleq_f64(inV1.mValue.val[0], inV2.mValue.val[0])), vreinterpretq_f64_u64(vcleq_f64(inV1.mValue.val[1], inV2.mValue.val[1])) });
#else
	return DVec3(inV1.mF64[0] <= inV2.mF64[0]? cTrue : cFalse,
	inV1.mF64[1] <= inV2.mF64[1]? cTrue : cFalse,
	inV1.mF64[2] <= inV2.mF64[2]? cTrue : cFalse);
#endif
}

// Component wise greater than; result components are cTrue / cFalse masks
DVec3 DVec3::sGreater(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_GT_OQ);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_cmpgt_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_cmpgt_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vreinterpretq_f64_u64(vcgtq_f64(inV1.mValue.val[0], inV2.mValue.val[0])), vreinterpretq_f64_u64(vcgtq_f64(inV1.mValue.val[1], inV2.mValue.val[1])) });
#else
	return DVec3(inV1.mF64[0] > inV2.mF64[0]? cTrue : cFalse,
	inV1.mF64[1] > inV2.mF64[1]? cTrue : cFalse,
	inV1.mF64[2] > inV2.mF64[2]? cTrue : cFalse);
#endif
}

// Component wise greater than or equal; result components are cTrue / cFalse masks
DVec3 DVec3::sGreaterOrEqual(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_cmp_pd(inV1.mValue, inV2.mValue, _CMP_GE_OQ);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_cmpge_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_cmpge_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vreinterpretq_f64_u64(vcgeq_f64(inV1.mValue.val[0], inV2.mValue.val[0])), vreinterpretq_f64_u64(vcgeq_f64(inV1.mValue.val[1], inV2.mValue.val[1])) });
#else
	return DVec3(inV1.mF64[0] >= inV2.mF64[0]? cTrue : cFalse,
	inV1.mF64[1] >= inV2.mF64[1]? cTrue : cFalse,
	inV1.mF64[2] >= inV2.mF64[2]? cTrue : cFalse);
#endif
}
// Calculates inMul1 * inMul2 + inAdd.
// With JPH_USE_FMADD this is a true fused operation (single rounding); otherwise it is a multiply followed by an add (two roundings).
// There is no dedicated SSE path: the generic fallback compiles to the SSE operator* / operator+ in that configuration.
DVec3 DVec3::sFusedMultiplyAdd(DVec3Arg inMul1, DVec3Arg inMul2, DVec3Arg inAdd)
{
#if defined(JPH_USE_AVX)
#ifdef JPH_USE_FMADD
	return _mm256_fmadd_pd(inMul1.mValue, inMul2.mValue, inAdd.mValue);
#else
	return _mm256_add_pd(_mm256_mul_pd(inMul1.mValue, inMul2.mValue), inAdd.mValue);
#endif
#elif defined(JPH_USE_NEON)
	return DVec3({ vmlaq_f64(inAdd.mValue.val[0], inMul1.mValue.val[0], inMul2.mValue.val[0]), vmlaq_f64(inAdd.mValue.val[1], inMul1.mValue.val[1], inMul2.mValue.val[1]) });
#else
	return inMul1 * inMul2 + inAdd;
#endif
}

// Component wise select based on the highest bit (sign bit) of each component of inControl
DVec3 DVec3::sSelect(DVec3Arg inNotSet, DVec3Arg inSet, DVec3Arg inControl)
{
#if defined(JPH_USE_AVX)
	return _mm256_blendv_pd(inNotSet.mValue, inSet.mValue, inControl.mValue);
#elif defined(JPH_USE_SSE4_1)
	Type v = { _mm_blendv_pd(inNotSet.mValue.mLow, inSet.mValue.mLow, inControl.mValue.mLow), _mm_blendv_pd(inNotSet.mValue.mHigh, inSet.mValue.mHigh, inControl.mValue.mHigh) };
	return sFixW(v);
#elif defined(JPH_USE_NEON)
	// Arithmetic shift right by 63 broadcasts each lane's sign bit into a full 64-bit mask for the bit select
	Type v = { vbslq_f64(vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_f64(inControl.mValue.val[0]), 63)), inSet.mValue.val[0], inNotSet.mValue.val[0]),
	vbslq_f64(vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_f64(inControl.mValue.val[1]), 63)), inSet.mValue.val[1], inNotSet.mValue.val[1]) };
	return sFixW(v);
#else
	DVec3 result;
	for (int i = 0; i < 3; i++)
	result.mF64[i] = (BitCast<uint64>(inControl.mF64[i]) & (uint64(1) << 63))? inSet.mF64[i] : inNotSet.mF64[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	result.mF64[3] = result.mF64[2]; // Restore the Z == W invariant
#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	return result;
#endif
}
// Bitwise or of the raw component bits (used to combine comparison masks)
DVec3 DVec3::sOr(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_or_pd(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_or_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_or_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(inV1.mValue.val[0]), vreinterpretq_u64_f64(inV2.mValue.val[0]))),
	vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(inV1.mValue.val[1]), vreinterpretq_u64_f64(inV2.mValue.val[1]))) });
#else
	return DVec3(BitCast<double>(BitCast<uint64>(inV1.mF64[0]) | BitCast<uint64>(inV2.mF64[0])),
	BitCast<double>(BitCast<uint64>(inV1.mF64[1]) | BitCast<uint64>(inV2.mF64[1])),
	BitCast<double>(BitCast<uint64>(inV1.mF64[2]) | BitCast<uint64>(inV2.mF64[2])));
#endif
}

// Bitwise xor of the raw component bits
DVec3 DVec3::sXor(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_xor_pd(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_xor_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_xor_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(inV1.mValue.val[0]), vreinterpretq_u64_f64(inV2.mValue.val[0]))),
	vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(inV1.mValue.val[1]), vreinterpretq_u64_f64(inV2.mValue.val[1]))) });
#else
	return DVec3(BitCast<double>(BitCast<uint64>(inV1.mF64[0]) ^ BitCast<uint64>(inV2.mF64[0])),
	BitCast<double>(BitCast<uint64>(inV1.mF64[1]) ^ BitCast<uint64>(inV2.mF64[1])),
	BitCast<double>(BitCast<uint64>(inV1.mF64[2]) ^ BitCast<uint64>(inV2.mF64[2])));
#endif
}

// Bitwise and of the raw component bits
DVec3 DVec3::sAnd(DVec3Arg inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_and_pd(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_and_pd(inV1.mValue.mLow, inV2.mValue.mLow), _mm_and_pd(inV1.mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(inV1.mValue.val[0]), vreinterpretq_u64_f64(inV2.mValue.val[0]))),
	vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(inV1.mValue.val[1]), vreinterpretq_u64_f64(inV2.mValue.val[1]))) });
#else
	return DVec3(BitCast<double>(BitCast<uint64>(inV1.mF64[0]) & BitCast<uint64>(inV2.mF64[0])),
	BitCast<double>(BitCast<uint64>(inV1.mF64[1]) & BitCast<uint64>(inV2.mF64[1])),
	BitCast<double>(BitCast<uint64>(inV1.mF64[2]) & BitCast<uint64>(inV2.mF64[2])));
#endif
}
// Collect the sign bits of X, Y and Z into bits 0..2; the internal W component is masked out with 0x7
int DVec3::GetTrues() const
{
#if defined(JPH_USE_AVX)
	return _mm256_movemask_pd(mValue) & 0x7;
#elif defined(JPH_USE_SSE)
	return (_mm_movemask_pd(mValue.mLow) + (_mm_movemask_pd(mValue.mHigh) << 2)) & 0x7;
#else
	// Also used for NEON (no movemask equivalent): read the sign bits through the mF64 union
	return int((BitCast<uint64>(mF64[0]) >> 63) | ((BitCast<uint64>(mF64[1]) >> 63) << 1) | ((BitCast<uint64>(mF64[2]) >> 63) << 2));
#endif
}

// True when at least one of X, Y, Z has its highest bit set
bool DVec3::TestAnyTrue() const
{
	return GetTrues() != 0;
}

// True when all of X, Y, Z have their highest bit set
bool DVec3::TestAllTrue() const
{
	return GetTrues() == 0x7;
}
// Equal only when all three components compare equal (a NaN component makes this false)
bool DVec3::operator == (DVec3Arg inV2) const
{
	DVec3 equal_mask = sEquals(*this, inV2);
	return equal_mask.TestAllTrue();
}

// Compare the squared distance between the two points against the squared tolerance
bool DVec3::IsClose(DVec3Arg inV2, double inMaxDistSq) const
{
	DVec3 delta = inV2 - *this;
	return delta.LengthSq() <= inMaxDistSq;
}

// A vector is near zero when its squared length is within the squared tolerance
bool DVec3::IsNearZero(double inMaxDistSq) const
{
	double len_sq = LengthSq();
	return len_sq <= inMaxDistSq;
}
// Component wise multiply with another double vector
DVec3 DVec3::operator * (DVec3Arg inV2) const
{
#if defined(JPH_USE_AVX)
	return _mm256_mul_pd(mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_mul_pd(mValue.mLow, inV2.mValue.mLow), _mm_mul_pd(mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vmulq_f64(mValue.val[0], inV2.mValue.val[0]), vmulq_f64(mValue.val[1], inV2.mValue.val[1]) });
#else
	return DVec3(mF64[0] * inV2.mF64[0], mF64[1] * inV2.mF64[1], mF64[2] * inV2.mF64[2]);
#endif
}

// Scale all components by inV2
DVec3 DVec3::operator * (double inV2) const
{
#if defined(JPH_USE_AVX)
	return _mm256_mul_pd(mValue, _mm256_set1_pd(inV2));
#elif defined(JPH_USE_SSE)
	__m128d v = _mm_set1_pd(inV2);
	return DVec3({ _mm_mul_pd(mValue.mLow, v), _mm_mul_pd(mValue.mHigh, v) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vmulq_n_f64(mValue.val[0], inV2), vmulq_n_f64(mValue.val[1], inV2) });
#else
	return DVec3(mF64[0] * inV2, mF64[1] * inV2, mF64[2] * inV2);
#endif
}

// Scale all components of inV2 by inV1 (scalar on the left)
DVec3 operator * (double inV1, DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	return _mm256_mul_pd(_mm256_set1_pd(inV1), inV2.mValue);
#elif defined(JPH_USE_SSE)
	__m128d v = _mm_set1_pd(inV1);
	return DVec3({ _mm_mul_pd(v, inV2.mValue.mLow), _mm_mul_pd(v, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vmulq_n_f64(inV2.mValue.val[0], inV1), vmulq_n_f64(inV2.mValue.val[1], inV1) });
#else
	return DVec3(inV1 * inV2.mF64[0], inV1 * inV2.mF64[1], inV1 * inV2.mF64[2]);
#endif
}

// Divide all components by inV2 (no check for division by zero)
DVec3 DVec3::operator / (double inV2) const
{
#if defined(JPH_USE_AVX)
	return _mm256_div_pd(mValue, _mm256_set1_pd(inV2));
#elif defined(JPH_USE_SSE)
	__m128d v = _mm_set1_pd(inV2);
	return DVec3({ _mm_div_pd(mValue.mLow, v), _mm_div_pd(mValue.mHigh, v) });
#elif defined(JPH_USE_NEON)
	float64x2_t v = vdupq_n_f64(inV2);
	return DVec3({ vdivq_f64(mValue.val[0], v), vdivq_f64(mValue.val[1], v) });
#else
	return DVec3(mF64[0] / inV2, mF64[1] / inV2, mF64[2] / inV2);
#endif
}
// Scale all components in place.
// The scalar fallbacks only touch X, Y, Z and then re-mirror Z into W to keep the invariant.
DVec3 &DVec3::operator *= (double inV2)
{
#if defined(JPH_USE_AVX)
	mValue = _mm256_mul_pd(mValue, _mm256_set1_pd(inV2));
#elif defined(JPH_USE_SSE)
	__m128d v = _mm_set1_pd(inV2);
	mValue.mLow = _mm_mul_pd(mValue.mLow, v);
	mValue.mHigh = _mm_mul_pd(mValue.mHigh, v);
#elif defined(JPH_USE_NEON)
	mValue.val[0] = vmulq_n_f64(mValue.val[0], inV2);
	mValue.val[1] = vmulq_n_f64(mValue.val[1], inV2);
#else
	for (int i = 0; i < 3; ++i)
	mF64[i] *= inV2;
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2];
#endif
#endif
	return *this;
}

// Component wise multiply in place
DVec3 &DVec3::operator *= (DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	mValue = _mm256_mul_pd(mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	mValue.mLow = _mm_mul_pd(mValue.mLow, inV2.mValue.mLow);
	mValue.mHigh = _mm_mul_pd(mValue.mHigh, inV2.mValue.mHigh);
#elif defined(JPH_USE_NEON)
	mValue.val[0] = vmulq_f64(mValue.val[0], inV2.mValue.val[0]);
	mValue.val[1] = vmulq_f64(mValue.val[1], inV2.mValue.val[1]);
#else
	for (int i = 0; i < 3; ++i)
	mF64[i] *= inV2.mF64[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2];
#endif
#endif
	return *this;
}

// Divide all components in place (no check for division by zero)
DVec3 &DVec3::operator /= (double inV2)
{
#if defined(JPH_USE_AVX)
	mValue = _mm256_div_pd(mValue, _mm256_set1_pd(inV2));
#elif defined(JPH_USE_SSE)
	__m128d v = _mm_set1_pd(inV2);
	mValue.mLow = _mm_div_pd(mValue.mLow, v);
	mValue.mHigh = _mm_div_pd(mValue.mHigh, v);
#elif defined(JPH_USE_NEON)
	float64x2_t v = vdupq_n_f64(inV2);
	mValue.val[0] = vdivq_f64(mValue.val[0], v);
	mValue.val[1] = vdivq_f64(mValue.val[1], v);
#else
	for (int i = 0; i < 3; ++i)
	mF64[i] /= inV2;
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2];
#endif
#endif
	return *this;
}
// Add a single precision vector; inV2 is widened to double before the add
DVec3 DVec3::operator + (Vec3Arg inV2) const
{
#if defined(JPH_USE_AVX)
	return _mm256_add_pd(mValue, _mm256_cvtps_pd(inV2.mValue));
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_add_pd(mValue.mLow, _mm_cvtps_pd(inV2.mValue)), _mm_add_pd(mValue.mHigh, _mm_cvtps_pd(_mm_shuffle_ps(inV2.mValue, inV2.mValue, _MM_SHUFFLE(2, 2, 2, 2)))) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vaddq_f64(mValue.val[0], vcvt_f64_f32(vget_low_f32(inV2.mValue))), vaddq_f64(mValue.val[1], vcvt_high_f64_f32(inV2.mValue)) });
#else
	return DVec3(mF64[0] + inV2.mF32[0], mF64[1] + inV2.mF32[1], mF64[2] + inV2.mF32[2]);
#endif
}

// Add two double vectors (component wise)
DVec3 DVec3::operator + (DVec3Arg inV2) const
{
#if defined(JPH_USE_AVX)
	return _mm256_add_pd(mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_add_pd(mValue.mLow, inV2.mValue.mLow), _mm_add_pd(mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vaddq_f64(mValue.val[0], inV2.mValue.val[0]), vaddq_f64(mValue.val[1], inV2.mValue.val[1]) });
#else
	return DVec3(mF64[0] + inV2.mF64[0], mF64[1] + inV2.mF64[1], mF64[2] + inV2.mF64[2]);
#endif
}

// Add a single precision vector in place; the scalar fallback re-mirrors Z into W
DVec3 &DVec3::operator += (Vec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	mValue = _mm256_add_pd(mValue, _mm256_cvtps_pd(inV2.mValue));
#elif defined(JPH_USE_SSE)
	mValue.mLow = _mm_add_pd(mValue.mLow, _mm_cvtps_pd(inV2.mValue));
	mValue.mHigh = _mm_add_pd(mValue.mHigh, _mm_cvtps_pd(_mm_shuffle_ps(inV2.mValue, inV2.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
#elif defined(JPH_USE_NEON)
	mValue.val[0] = vaddq_f64(mValue.val[0], vcvt_f64_f32(vget_low_f32(inV2.mValue)));
	mValue.val[1] = vaddq_f64(mValue.val[1], vcvt_high_f64_f32(inV2.mValue));
#else
	for (int i = 0; i < 3; ++i)
	mF64[i] += inV2.mF32[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2];
#endif
#endif
	return *this;
}

// Add a double vector in place; the scalar fallback re-mirrors Z into W
DVec3 &DVec3::operator += (DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	mValue = _mm256_add_pd(mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	mValue.mLow = _mm_add_pd(mValue.mLow, inV2.mValue.mLow);
	mValue.mHigh = _mm_add_pd(mValue.mHigh, inV2.mValue.mHigh);
#elif defined(JPH_USE_NEON)
	mValue.val[0] = vaddq_f64(mValue.val[0], inV2.mValue.val[0]);
	mValue.val[1] = vaddq_f64(mValue.val[1], inV2.mValue.val[1]);
#else
	for (int i = 0; i < 3; ++i)
	mF64[i] += inV2.mF64[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2];
#endif
#endif
	return *this;
}
// Negate.
// With JPH_CROSS_PLATFORM_DETERMINISTIC the negation is written as 0 - x: plain negation flips the
// sign bit of zero (producing -0.0), while subtraction from zero yields +0.0, keeping results
// bit-identical across platforms.
DVec3 DVec3::operator - () const
{
#if defined(JPH_USE_AVX)
	return _mm256_sub_pd(_mm256_setzero_pd(), mValue);
#elif defined(JPH_USE_SSE)
	__m128d zero = _mm_setzero_pd();
	return DVec3({ _mm_sub_pd(zero, mValue.mLow), _mm_sub_pd(zero, mValue.mHigh) });
#elif defined(JPH_USE_NEON)
#ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
	float64x2_t zero = vdupq_n_f64(0);
	return DVec3({ vsubq_f64(zero, mValue.val[0]), vsubq_f64(zero, mValue.val[1]) });
#else
	return DVec3({ vnegq_f64(mValue.val[0]), vnegq_f64(mValue.val[1]) });
#endif
#else
#ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
	return DVec3(0.0 - mF64[0], 0.0 - mF64[1], 0.0 - mF64[2]);
#else
	return DVec3(-mF64[0], -mF64[1], -mF64[2]);
#endif
#endif
}
// Subtract a single precision vector; inV2 is widened to double before the subtract
DVec3 DVec3::operator - (Vec3Arg inV2) const
{
#if defined(JPH_USE_AVX)
	return _mm256_sub_pd(mValue, _mm256_cvtps_pd(inV2.mValue));
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_sub_pd(mValue.mLow, _mm_cvtps_pd(inV2.mValue)), _mm_sub_pd(mValue.mHigh, _mm_cvtps_pd(_mm_shuffle_ps(inV2.mValue, inV2.mValue, _MM_SHUFFLE(2, 2, 2, 2)))) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vsubq_f64(mValue.val[0], vcvt_f64_f32(vget_low_f32(inV2.mValue))), vsubq_f64(mValue.val[1], vcvt_high_f64_f32(inV2.mValue)) });
#else
	return DVec3(mF64[0] - inV2.mF32[0], mF64[1] - inV2.mF32[1], mF64[2] - inV2.mF32[2]);
#endif
}

// Subtract two double vectors (component wise)
DVec3 DVec3::operator - (DVec3Arg inV2) const
{
#if defined(JPH_USE_AVX)
	return _mm256_sub_pd(mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_sub_pd(mValue.mLow, inV2.mValue.mLow), _mm_sub_pd(mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vsubq_f64(mValue.val[0], inV2.mValue.val[0]), vsubq_f64(mValue.val[1], inV2.mValue.val[1]) });
#else
	return DVec3(mF64[0] - inV2.mF64[0], mF64[1] - inV2.mF64[1], mF64[2] - inV2.mF64[2]);
#endif
}

// Subtract a single precision vector in place; the scalar fallback re-mirrors Z into W
DVec3 &DVec3::operator -= (Vec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	mValue = _mm256_sub_pd(mValue, _mm256_cvtps_pd(inV2.mValue));
#elif defined(JPH_USE_SSE)
	mValue.mLow = _mm_sub_pd(mValue.mLow, _mm_cvtps_pd(inV2.mValue));
	mValue.mHigh = _mm_sub_pd(mValue.mHigh, _mm_cvtps_pd(_mm_shuffle_ps(inV2.mValue, inV2.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
#elif defined(JPH_USE_NEON)
	mValue.val[0] = vsubq_f64(mValue.val[0], vcvt_f64_f32(vget_low_f32(inV2.mValue)));
	mValue.val[1] = vsubq_f64(mValue.val[1], vcvt_high_f64_f32(inV2.mValue));
#else
	for (int i = 0; i < 3; ++i)
	mF64[i] -= inV2.mF32[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2];
#endif
#endif
	return *this;
}

// Subtract a double vector in place; the scalar fallback re-mirrors Z into W
DVec3 &DVec3::operator -= (DVec3Arg inV2)
{
#if defined(JPH_USE_AVX)
	mValue = _mm256_sub_pd(mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	mValue.mLow = _mm_sub_pd(mValue.mLow, inV2.mValue.mLow);
	mValue.mHigh = _mm_sub_pd(mValue.mHigh, inV2.mValue.mHigh);
#elif defined(JPH_USE_NEON)
	mValue.val[0] = vsubq_f64(mValue.val[0], inV2.mValue.val[0]);
	mValue.val[1] = vsubq_f64(mValue.val[1], inV2.mValue.val[1]);
#else
	for (int i = 0; i < 3; ++i)
	mF64[i] -= inV2.mF64[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF64[3] = mF64[2];
#endif
#endif
	return *this;
}
// Component wise divide.
// CheckW asserts that inV2's internal W mirrors Z, so the full-width SIMD divide cannot trap on lane 3.
DVec3 DVec3::operator / (DVec3Arg inV2) const
{
	inV2.CheckW();
#if defined(JPH_USE_AVX)
	return _mm256_div_pd(mValue, inV2.mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_div_pd(mValue.mLow, inV2.mValue.mLow), _mm_div_pd(mValue.mHigh, inV2.mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vdivq_f64(mValue.val[0], inV2.mValue.val[0]), vdivq_f64(mValue.val[1], inV2.mValue.val[1]) });
#else
	return DVec3(mF64[0] / inV2.mF64[0], mF64[1] / inV2.mF64[1], mF64[2] / inV2.mF64[2]);
#endif
}
// Component wise absolute value
DVec3 DVec3::Abs() const
{
#if defined(JPH_USE_AVX512)
	return _mm256_range_pd(mValue, mValue, 0b1000); // imm 0b1000: min op with the sign bit cleared; with identical operands this yields |x|
#elif defined(JPH_USE_AVX)
	return _mm256_max_pd(_mm256_sub_pd(_mm256_setzero_pd(), mValue), mValue); // max(-x, x)
#elif defined(JPH_USE_SSE)
	__m128d zero = _mm_setzero_pd();
	return DVec3({ _mm_max_pd(_mm_sub_pd(zero, mValue.mLow), mValue.mLow), _mm_max_pd(_mm_sub_pd(zero, mValue.mHigh), mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vabsq_f64(mValue.val[0]), vabsq_f64(mValue.val[1]) });
#else
	return DVec3(abs(mF64[0]), abs(mF64[1]), abs(mF64[2]));
#endif
}
// Component wise 1 / value.
// The internal W component always mirrors Z, so the division in operator/ cannot hit a zero W.
DVec3 DVec3::Reciprocal() const
{
	return sOne() / DVec3(mValue);
}
// Cross product: (Y1*Z2 - Z1*Y2, Z1*X2 - X1*Z2, X1*Y2 - Y1*X2)
DVec3 DVec3::Cross(DVec3Arg inV2) const
{
#if defined(JPH_USE_AVX2)
	// _MM_SHUFFLE(0, 0, 2, 1) rotates the components left: (Y, Z, X, X)
	__m256d t1 = _mm256_permute4x64_pd(inV2.mValue, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
	t1 = _mm256_mul_pd(t1, mValue);
	__m256d t2 = _mm256_permute4x64_pd(mValue, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
	t2 = _mm256_mul_pd(t2, inV2.mValue);
	__m256d t3 = _mm256_sub_pd(t1, t2);
	return _mm256_permute4x64_pd(t3, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
#else
	return DVec3(mF64[1] * inV2.mF64[2] - mF64[2] * inV2.mF64[1],
	mF64[2] * inV2.mF64[0] - mF64[0] * inV2.mF64[2],
	mF64[0] * inV2.mF64[1] - mF64[1] * inV2.mF64[0]);
#endif
}
// Dot product of the X, Y and Z components (the internal W component does not contribute)
double DVec3::Dot(DVec3Arg inV2) const
{
#if defined(JPH_USE_AVX)
	__m256d mul = _mm256_mul_pd(mValue, inV2.mValue);
	__m128d xy = _mm256_castpd256_pd128(mul);
	__m128d yx = _mm_shuffle_pd(xy, xy, 1); // Swap so both lanes hold x*x2 + y*y2 after the add
	__m128d sum = _mm_add_pd(xy, yx);
	__m128d zw = _mm256_extractf128_pd(mul, 1);
	sum = _mm_add_pd(sum, zw); // Lane 0 now holds x*x2 + y*y2 + z*z2
	return _mm_cvtsd_f64(sum);
#elif defined(JPH_USE_SSE)
	__m128d xy = _mm_mul_pd(mValue.mLow, inV2.mValue.mLow);
	__m128d yx = _mm_shuffle_pd(xy, xy, 1);
	__m128d sum = _mm_add_pd(xy, yx);
	__m128d z = _mm_mul_sd(mValue.mHigh, inV2.mValue.mHigh); // Only the Z lane is multiplied, W is ignored
	sum = _mm_add_pd(sum, z);
	return _mm_cvtsd_f64(sum);
#elif defined(JPH_USE_NEON)
	float64x2_t mul_low = vmulq_f64(mValue.val[0], inV2.mValue.val[0]);
	float64x2_t mul_high = vmulq_f64(mValue.val[1], inV2.mValue.val[1]);
	return vaddvq_f64(mul_low) + vgetq_lane_f64(mul_high, 0); // Horizontal add of X/Y plus the Z lane
#else
	double dot = 0.0;
	for (int i = 0; i < 3; i++)
	dot += mF64[i] * inV2.mF64[i];
	return dot;
#endif
}

// Squared length of vector
double DVec3::LengthSq() const
{
	return Dot(*this);
}
// Component wise square root (negative components yield NaN, matching the scalar sqrt)
DVec3 DVec3::Sqrt() const
{
#if defined(JPH_USE_AVX)
	return _mm256_sqrt_pd(mValue);
#elif defined(JPH_USE_SSE)
	return DVec3({ _mm_sqrt_pd(mValue.mLow), _mm_sqrt_pd(mValue.mHigh) });
#elif defined(JPH_USE_NEON)
	return DVec3({ vsqrtq_f64(mValue.val[0]), vsqrtq_f64(mValue.val[1]) });
#else
	return DVec3(sqrt(mF64[0]), sqrt(mF64[1]), sqrt(mF64[2]));
#endif
}
// Length of vector: square root of the dot product with itself
double DVec3::Length() const
{
	return sqrt(LengthSq());
}

// Scale to unit length. Note: does not handle zero-length vectors.
DVec3 DVec3::Normalized() const
{
	double len = Length();
	return *this / len;
}

// A vector is normalized when its squared length deviates from 1 by at most inTolerance
bool DVec3::IsNormalized(double inTolerance) const
{
	double deviation = LengthSq() - 1.0;
	return abs(deviation) <= inTolerance;
}
// Test if any of the X, Y, Z components are NaN (the internal W component is excluded via the 0x7 mask)
bool DVec3::IsNaN() const
{
#if defined(JPH_USE_AVX512)
	return (_mm256_fpclass_pd_mask(mValue, 0b10000001) & 0x7) != 0; // Classify quiet NaN (bit 0) and signaling NaN (bit 7)
#elif defined(JPH_USE_AVX)
	return (_mm256_movemask_pd(_mm256_cmp_pd(mValue, mValue, _CMP_UNORD_Q)) & 0x7) != 0; // Unordered compare with itself is true only for NaN
#elif defined(JPH_USE_SSE)
	return ((_mm_movemask_pd(_mm_cmpunord_pd(mValue.mLow, mValue.mLow)) + (_mm_movemask_pd(_mm_cmpunord_pd(mValue.mHigh, mValue.mHigh)) << 2)) & 0x7) != 0;
#else
	return isnan(mF64[0]) || isnan(mF64[1]) || isnan(mF64[2]);
#endif
}
// Returns a vector containing the sign of each component: 1.0 when the sign bit is clear, -1.0 when it is set.
// Note that this means -0.0 maps to -1.0 and a NaN maps according to its sign bit.
DVec3 DVec3::GetSign() const
{
#if defined(JPH_USE_AVX512)
	// Per-element fixup using the token table 0xA9A90A00 (see the _mm256_fixupimm_pd documentation for the encoding)
	return _mm256_fixupimm_pd(mValue, mValue, _mm256_set1_epi32(0xA9A90A00), 0);
#elif defined(JPH_USE_AVX)
	// (x & -1.0) | 1.0: the AND keeps the sign bit of x (the exponent bits it also keeps are forced on
	// by the OR with +1.0), so the result is exactly +/-1.0
	__m256d minus_one = _mm256_set1_pd(-1.0);
	__m256d one = _mm256_set1_pd(1.0);
	return _mm256_or_pd(_mm256_and_pd(mValue, minus_one), one);
#elif defined(JPH_USE_SSE)
	__m128d minus_one = _mm_set1_pd(-1.0);
	__m128d one = _mm_set1_pd(1.0);
	return DVec3({ _mm_or_pd(_mm_and_pd(mValue.mLow, minus_one), one), _mm_or_pd(_mm_and_pd(mValue.mHigh, minus_one), one) });
#elif defined(JPH_USE_NEON)
	// Use double literals (the original float literals -1.0f / 1.0f converted exactly, but double matches the f64 intrinsics and the other paths)
	uint64x2_t minus_one = vreinterpretq_u64_f64(vdupq_n_f64(-1.0));
	uint64x2_t one = vreinterpretq_u64_f64(vdupq_n_f64(1.0));
	return DVec3({ vreinterpretq_f64_u64(vorrq_u64(vandq_u64(vreinterpretq_u64_f64(mValue.val[0]), minus_one), one)),
	vreinterpretq_f64_u64(vorrq_u64(vandq_u64(vreinterpretq_u64_f64(mValue.val[1]), minus_one), one)) });
#else
	return DVec3(std::signbit(mF64[0])? -1.0 : 1.0,
	std::signbit(mF64[1])? -1.0 : 1.0,
	std::signbit(mF64[2])? -1.0 : 1.0);
#endif
}
DVec3 DVec3::PrepareRoundToZero() const
{
	// Float has 23 bit mantissa, double 52 bit mantissa => we lose 29 bits when converting from double to float
	constexpr uint64 cDoubleToFloatMantissaLoss = (1U << 29) - 1;
	// Clearing the mantissa bits that will be lost makes a later double -> float conversion exact;
	// truncating the magnitude rounds each component towards zero (works for both signs, IEEE sign-magnitude)
#if defined(JPH_USE_AVX)
	return _mm256_and_pd(mValue, _mm256_castsi256_pd(_mm256_set1_epi64x(int64_t(~cDoubleToFloatMantissaLoss))));
#elif defined(JPH_USE_SSE)
	__m128d mask = _mm_castsi128_pd(_mm_set1_epi64x(int64_t(~cDoubleToFloatMantissaLoss)));
	return DVec3({ _mm_and_pd(mValue.mLow, mask), _mm_and_pd(mValue.mHigh, mask) });
#elif defined(JPH_USE_NEON)
	uint64x2_t mask = vdupq_n_u64(~cDoubleToFloatMantissaLoss);
	return DVec3({ vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(mValue.val[0]), mask)),
				   vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(mValue.val[1]), mask)) });
#else
	double x = BitCast<double>(BitCast<uint64>(mF64[0]) & ~cDoubleToFloatMantissaLoss);
	double y = BitCast<double>(BitCast<uint64>(mF64[1]) & ~cDoubleToFloatMantissaLoss);
	double z = BitCast<double>(BitCast<uint64>(mF64[2]) & ~cDoubleToFloatMantissaLoss);
	return DVec3(x, y, z);
#endif
}
DVec3 DVec3::PrepareRoundToInf() const
{
	// Float has 23 bit mantissa, double 52 bit mantissa => we lose 29 bits when converting from double to float
	constexpr uint64 cDoubleToFloatMantissaLoss = (1U << 29) - 1;
	// Setting all mantissa bits that will be lost makes a later double -> float conversion round the
	// magnitude up (away from zero). Values whose lost bits are already all zero are exactly
	// representable as float and must be left untouched, hence the is_zero select below.
#if defined(JPH_USE_AVX512)
	__m256i mantissa_loss = _mm256_set1_epi64x(cDoubleToFloatMantissaLoss);
	// testn sets the mask bit when (value & mantissa_loss) == 0, i.e. when the value is exactly representable
	__mmask8 is_zero = _mm256_testn_epi64_mask(_mm256_castpd_si256(mValue), mantissa_loss);
	__m256d value_or_mantissa_loss = _mm256_or_pd(mValue, _mm256_castsi256_pd(mantissa_loss));
	return _mm256_mask_blend_pd(is_zero, value_or_mantissa_loss, mValue);
#elif defined(JPH_USE_AVX)
	__m256i mantissa_loss = _mm256_set1_epi64x(cDoubleToFloatMantissaLoss);
	__m256d value_and_mantissa_loss = _mm256_and_pd(mValue, _mm256_castsi256_pd(mantissa_loss));
	__m256d is_zero = _mm256_cmp_pd(value_and_mantissa_loss, _mm256_setzero_pd(), _CMP_EQ_OQ);
	__m256d value_or_mantissa_loss = _mm256_or_pd(mValue, _mm256_castsi256_pd(mantissa_loss));
	return _mm256_blendv_pd(value_or_mantissa_loss, mValue, is_zero);
#elif defined(JPH_USE_SSE4_1)
	__m128i mantissa_loss = _mm_set1_epi64x(cDoubleToFloatMantissaLoss);
	__m128d zero = _mm_setzero_pd();
	__m128d value_and_mantissa_loss_low = _mm_and_pd(mValue.mLow, _mm_castsi128_pd(mantissa_loss));
	__m128d is_zero_low = _mm_cmpeq_pd(value_and_mantissa_loss_low, zero);
	__m128d value_or_mantissa_loss_low = _mm_or_pd(mValue.mLow, _mm_castsi128_pd(mantissa_loss));
	__m128d value_and_mantissa_loss_high = _mm_and_pd(mValue.mHigh, _mm_castsi128_pd(mantissa_loss));
	__m128d is_zero_high = _mm_cmpeq_pd(value_and_mantissa_loss_high, zero);
	__m128d value_or_mantissa_loss_high = _mm_or_pd(mValue.mHigh, _mm_castsi128_pd(mantissa_loss));
	return DVec3({ _mm_blendv_pd(value_or_mantissa_loss_low, mValue.mLow, is_zero_low), _mm_blendv_pd(value_or_mantissa_loss_high, mValue.mHigh, is_zero_high) });
#elif defined(JPH_USE_NEON)
	uint64x2_t mantissa_loss = vdupq_n_u64(cDoubleToFloatMantissaLoss);
	float64x2_t zero = vdupq_n_f64(0.0);
	float64x2_t value_and_mantissa_loss_low = vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(mValue.val[0]), mantissa_loss));
	uint64x2_t is_zero_low = vceqq_f64(value_and_mantissa_loss_low, zero);
	float64x2_t value_or_mantissa_loss_low = vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(mValue.val[0]), mantissa_loss));
	float64x2_t value_and_mantissa_loss_high = vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(mValue.val[1]), mantissa_loss));
	float64x2_t value_low = vbslq_f64(is_zero_low, mValue.val[0], value_or_mantissa_loss_low);
	uint64x2_t is_zero_high = vceqq_f64(value_and_mantissa_loss_high, zero);
	float64x2_t value_or_mantissa_loss_high = vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(mValue.val[1]), mantissa_loss));
	float64x2_t value_high = vbslq_f64(is_zero_high, mValue.val[1], value_or_mantissa_loss_high);
	return DVec3({ value_low, value_high });
#else
	uint64 ux = BitCast<uint64>(mF64[0]);
	uint64 uy = BitCast<uint64>(mF64[1]);
	uint64 uz = BitCast<uint64>(mF64[2]);
	double x = BitCast<double>((ux & cDoubleToFloatMantissaLoss) == 0? ux : (ux | cDoubleToFloatMantissaLoss));
	double y = BitCast<double>((uy & cDoubleToFloatMantissaLoss) == 0? uy : (uy | cDoubleToFloatMantissaLoss));
	double z = BitCast<double>((uz & cDoubleToFloatMantissaLoss) == 0? uz : (uz | cDoubleToFloatMantissaLoss));
	return DVec3(x, y, z);
#endif
}
Vec3 DVec3::ToVec3RoundDown() const
{
	// Rounding down means towards zero for positive components, away from zero (towards -inf) for negative components
	DVec3 is_negative = DVec3::sLess(*this, DVec3::sZero());
	DVec3 rounded = DVec3::sSelect(PrepareRoundToZero(), PrepareRoundToInf(), is_negative);
	return Vec3(rounded);
}
Vec3 DVec3::ToVec3RoundUp() const
{
	// Rounding up means away from zero (towards +inf) for positive components, towards zero for negative components
	DVec3 is_negative = DVec3::sLess(*this, DVec3::sZero());
	DVec3 rounded = DVec3::sSelect(PrepareRoundToInf(), PrepareRoundToZero(), is_negative);
	return Vec3(rounded);
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,48 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/HashCombine.h>
JPH_NAMESPACE_BEGIN
/// Class that holds 3 doubles. Used as a storage class. Convert to DVec3 for calculations.
class [[nodiscard]] Double3
{
public:
	JPH_OVERRIDE_NEW_DELETE

	Double3() = default; ///< Intentionally not initialized for performance reasons
	Double3(const Double3 &inRHS) = default;
	Double3 & operator = (const Double3 &inRHS) = default;
	Double3(double inX, double inY, double inZ) : x(inX), y(inY), z(inZ) { }

	/// Get component by index (0 = x, 1 = y, 2 = z)
	double operator [] (int inCoordinate) const
	{
		JPH_ASSERT(inCoordinate >= 0 && inCoordinate < 3); // Also guard against negative indices, reading before &x is out of bounds
		return *(&x + inCoordinate);
	}

	/// Exact component wise equality
	bool operator == (const Double3 &inRHS) const
	{
		return x == inRHS.x && y == inRHS.y && z == inRHS.z;
	}

	bool operator != (const Double3 &inRHS) const
	{
		return x != inRHS.x || y != inRHS.y || z != inRHS.z;
	}

	double x;
	double y;
	double z;
};
static_assert(std::is_trivial<Double3>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
// Create a std::hash/JPH::Hash for Double3
JPH_MAKE_HASHABLE(JPH::Double3, t.x, t.y, t.z)

View File

@@ -0,0 +1,31 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2022 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Dynamic resizable matrix class
class [[nodiscard]] DynMatrix
{
public:
	/// Constructor
	DynMatrix(const DynMatrix &) = default;

	/// Construct an inRows x inCols matrix
	DynMatrix(uint inRows, uint inCols) : mRows(inRows), mCols(inCols) { mElements.resize(inRows * inCols); }

	/// Access an element, row major storage: element (r, c) lives at index r * mCols + c
	float operator () (uint inRow, uint inCol) const { JPH_ASSERT(inRow < mRows && inCol < mCols); return mElements[inRow * mCols + inCol]; }
	float & operator () (uint inRow, uint inCol) { JPH_ASSERT(inRow < mRows && inCol < mCols); return mElements[inRow * mCols + inCol]; }

	/// Get dimensions
	uint GetCols() const { return mCols; }
	uint GetRows() const { return mRows; }

private:
	uint mRows; ///< Number of rows
	uint mCols; ///< Number of columns
	Array<float> mElements; ///< Row major storage of mRows * mCols elements
};
JPH_NAMESPACE_END

View File

@@ -0,0 +1,177 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/FPException.h>
JPH_NAMESPACE_BEGIN
/// Function to determine the eigen vectors and values of a N x N real symmetric matrix
/// by Jacobi transformations. This method is most suitable for N < 10.
///
/// Taken and adapted from Numerical Recipes paragraph 11.1
///
/// An eigen vector is a vector v for which \f$A \: v = \lambda \: v\f$
///
/// Where:
/// A: A square matrix.
/// \f$\lambda\f$: a non-zero constant value.
///
/// @see https://en.wikipedia.org/wiki/Eigenvalues_and_eigenvectors
///
/// Matrix is a matrix type, which has dimensions N x N.
/// @param inMatrix is the matrix of which to return the eigenvalues and vectors
/// @param outEigVec will contain a matrix whose columns contain the normalized eigenvectors (must be identity before call)
/// @param outEigVal will contain the eigenvalues
template <class Vector, class Matrix>
bool EigenValueSymmetric(const Matrix &inMatrix, Matrix &outEigVec, Vector &outEigVal)
{
	// This algorithm can generate infinite values, see comment below
	FPExceptionDisableInvalid disable_invalid;
	JPH_UNUSED(disable_invalid);

	// Maximum number of sweeps to make
	const int cMaxSweeps = 50;

	// Get problem dimension
	const uint n = inMatrix.GetRows();

	// Make sure the dimensions are right
	JPH_ASSERT(inMatrix.GetRows() == n);
	JPH_ASSERT(inMatrix.GetCols() == n);
	JPH_ASSERT(outEigVec.GetRows() == n);
	JPH_ASSERT(outEigVec.GetCols() == n);
	JPH_ASSERT(outEigVal.GetRows() == n);
	JPH_ASSERT(outEigVec.IsIdentity());

	// Get the matrix in a so we can mess with it
	Matrix a = inMatrix;

	// b holds the current eigenvalue estimates, z accumulates the rotation corrections of one sweep
	Vector b, z;

	for (uint ip = 0; ip < n; ++ip)
	{
		// Initialize b to diagonal of a
		b[ip] = a(ip, ip);

		// Initialize output to diagonal of a
		outEigVal[ip] = a(ip, ip);

		// Reset z
		z[ip] = 0.0f;
	}

	for (int sweep = 0; sweep < cMaxSweeps; ++sweep)
	{
		// Get the sum of the off-diagonal elements of a
		float sm = 0.0f;
		for (uint ip = 0; ip < n - 1; ++ip)
			for (uint iq = ip + 1; iq < n; ++iq)
				sm += abs(a(ip, iq));
		float avg_sm = sm / Square(n);

		// Normal return, convergence to machine underflow
		if (avg_sm < FLT_MIN) // Original code: sm == 0.0f, when the average is denormal, we also consider it machine underflow
		{
			// Sanity checks
			#ifdef JPH_ENABLE_ASSERTS
				for (uint c = 0; c < n; ++c)
				{
					// Check if the eigenvector is normalized
					JPH_ASSERT(outEigVec.GetColumn(c).IsNormalized());

					// Check if inMatrix * eigen_vector = eigen_value * eigen_vector
					Vector mat_eigvec = inMatrix * outEigVec.GetColumn(c);
					Vector eigval_eigvec = outEigVal[c] * outEigVec.GetColumn(c);
					JPH_ASSERT(mat_eigvec.IsClose(eigval_eigvec, max(mat_eigvec.LengthSq(), eigval_eigvec.LengthSq()) * 1.0e-6f));
				}
			#endif

			// Success
			return true;
		}

		// On the first three sweeps use a fraction of the sum of the off diagonal elements as threshold
		// Note that we pick a minimum threshold of FLT_MIN because dividing by a denormalized number is likely to result in infinity.
		float thresh = sweep < 4? 0.2f * avg_sm : FLT_MIN; // Original code: 0.0f instead of FLT_MIN

		for (uint ip = 0; ip < n - 1; ++ip)
			for (uint iq = ip + 1; iq < n; ++iq)
			{
				float &a_pq = a(ip, iq);
				float &eigval_p = outEigVal[ip];
				float &eigval_q = outEigVal[iq];

				float abs_a_pq = abs(a_pq);
				float g = 100.0f * abs_a_pq;

				// After four sweeps, skip the rotation if the off-diagonal element is small
				// (the float additions below only leave the value unchanged when g is negligible compared to the eigenvalues)
				if (sweep > 4
					&& abs(eigval_p) + g == abs(eigval_p)
					&& abs(eigval_q) + g == abs(eigval_q))
				{
					a_pq = 0.0f;
				}
				else if (abs_a_pq > thresh)
				{
					// Perform a Jacobi rotation that zeroes out a(ip, iq)
					float h = eigval_q - eigval_p;
					float abs_h = abs(h);

					// t = tan(rotation angle)
					float t;
					if (abs_h + g == abs_h)
					{
						t = a_pq / h;
					}
					else
					{
						float theta = 0.5f * h / a_pq; // Warning: Can become infinite if a(ip, iq) is very small which may trigger an invalid float exception
						t = 1.0f / (abs(theta) + sqrt(1.0f + theta * theta)); // If theta becomes inf, t will be 0 so the infinite is not a problem for the algorithm
						if (theta < 0.0f) t = -t;
					}

					// c = cos(rotation angle), s = sin(rotation angle), tau = tan(half rotation angle)
					float c = 1.0f / sqrt(1.0f + t * t);
					float s = t * c;
					float tau = s / (1.0f + c);
					h = t * a_pq;

					a_pq = 0.0f;

					z[ip] -= h;
					z[iq] += h;

					eigval_p -= h;
					eigval_q += h;

					// Apply the rotation to the remaining elements of a and accumulate it into the eigenvectors
					#define JPH_EVS_ROTATE(a, i, j, k, l)		\
						g = a(i, j),							\
						h = a(k, l),							\
						a(i, j) = g - s * (h + g * tau),		\
						a(k, l) = h + s * (g - h * tau)

					uint j;
					for (j = 0; j < ip; ++j)		JPH_EVS_ROTATE(a, j, ip, j, iq);
					for (j = ip + 1; j < iq; ++j)	JPH_EVS_ROTATE(a, ip, j, j, iq);
					for (j = iq + 1; j < n; ++j)	JPH_EVS_ROTATE(a, ip, j, iq, j);
					for (j = 0; j < n; ++j)			JPH_EVS_ROTATE(outEigVec, j, ip, j, iq);

					#undef JPH_EVS_ROTATE
				}
			}

		// Update eigenvalues with the sum of t * a_pq and reinitialize z
		for (uint ip = 0; ip < n; ++ip)
		{
			b[ip] += z[ip];
			outEigVal[ip] = b[ip];
			z[ip] = 0.0f;
		}
	}

	// Failure
	JPH_ASSERT(false, "Too many iterations");
	return false;
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,42 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Find the roots of \f$inA \: x^2 + inB \: x + inC = 0\f$.
/// @return The number of roots, actual roots in outX1 and outX2.
/// If number of roots returned is 1 then outX1 == outX2.
template <typename T>
inline int FindRoot(const T inA, const T inB, const T inC, T &outX1, T &outX2)
{
	// Degenerate case: without a quadratic term the equation is linear: inB x + inC = 0
	if (inA == T(0))
	{
		// A constant equation has no roots
		if (inB == T(0))
			return 0;

		// Single root of the linear equation
		outX1 = outX2 = -inC / inB;
		return 1;
	}

	// The discriminant decides whether there are real roots at all
	const T discriminant = Square(inB) - T(4) * inA * inC;
	if (discriminant < T(0))
		return 0;

	// Numerically stable formulation, see Numerical Recipes in C, Chapter 5.6 Quadratic and Cubic Equations:
	// q = -(b + sign(b) sqrt(det)) / 2, x1 = q / a, x2 = c / q
	const T q = (inB + Sign(inB) * sqrt(discriminant)) / T(-2);
	outX1 = q / inA;
	if (q == T(0))
	{
		// Double root
		outX2 = outX1;
		return 1;
	}
	outX2 = inC / q;
	return 2;
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,36 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Class that holds 2 floats, used as a storage class mainly.
class [[nodiscard]] Float2
{
public:
	JPH_OVERRIDE_NEW_DELETE

	Float2() = default; ///< Intentionally not initialized for performance reasons
	Float2(const Float2 &inRHS) = default;
	Float2 & operator = (const Float2 &inRHS) = default;
	Float2(float inX, float inY) : x(inX), y(inY) { }

	/// Exact component wise comparison
	bool operator == (const Float2 &inRHS) const { return x == inRHS.x && y == inRHS.y; }
	bool operator != (const Float2 &inRHS) const { return !(*this == inRHS); }

	/// To String
	friend ostream & operator << (ostream &inStream, const Float2 &inV)
	{
		return inStream << inV.x << ", " << inV.y;
	}

	float x;
	float y;
};
static_assert(std::is_trivial<Float2>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END

View File

@@ -0,0 +1,50 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/HashCombine.h>
JPH_NAMESPACE_BEGIN
/// Class that holds 3 floats. Used as a storage class. Convert to Vec3 for calculations.
class [[nodiscard]] Float3
{
public:
	JPH_OVERRIDE_NEW_DELETE

	Float3() = default; ///< Intentionally not initialized for performance reasons
	Float3(const Float3 &inRHS) = default;
	Float3 & operator = (const Float3 &inRHS) = default;
	constexpr Float3(float inX, float inY, float inZ) : x(inX), y(inY), z(inZ) { }

	/// Get component by index (0 = x, 1 = y, 2 = z)
	float operator [] (int inCoordinate) const
	{
		JPH_ASSERT(inCoordinate >= 0 && inCoordinate < 3); // Also guard against negative indices, reading before &x is out of bounds
		return *(&x + inCoordinate);
	}

	/// Exact component wise equality
	bool operator == (const Float3 &inRHS) const
	{
		return x == inRHS.x && y == inRHS.y && z == inRHS.z;
	}

	bool operator != (const Float3 &inRHS) const
	{
		return x != inRHS.x || y != inRHS.y || z != inRHS.z;
	}

	float x;
	float y;
	float z;
};
using VertexList = Array<Float3>;
static_assert(std::is_trivial<Float3>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
// Create a std::hash/JPH::Hash for Float3
JPH_MAKE_HASHABLE(JPH::Float3, t.x, t.y, t.z)

View File

@@ -0,0 +1,33 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Class that holds 4 float values. Convert to Vec4 to perform calculations.
class [[nodiscard]] Float4
{
public:
	JPH_OVERRIDE_NEW_DELETE

	Float4() = default; ///< Intentionally not initialized for performance reasons
	Float4(const Float4 &inRHS) = default;
	Float4(float inX, float inY, float inZ, float inW) : x(inX), y(inY), z(inZ), w(inW) { }

	/// Get component by index (0 = x, 1 = y, 2 = z, 3 = w)
	float operator [] (int inCoordinate) const
	{
		JPH_ASSERT(inCoordinate >= 0 && inCoordinate < 4); // Also guard against negative indices, reading before &x is out of bounds
		return *(&x + inCoordinate);
	}

	float x;
	float y;
	float z;
	float w;
};
static_assert(std::is_trivial<Float4>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END

View File

@@ -0,0 +1,102 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// This function performs Gauss-Jordan elimination to solve a matrix equation.
/// A must be an NxN matrix and B must be an NxM matrix forming the equation A * x = B
/// on output B will contain x and A will be destroyed.
///
/// This code can be used for example to compute the inverse of a matrix.
/// Set A to the matrix to invert, set B to identity and let GaussianElimination solve
/// the equation, on return B will be the inverse of A. And A is destroyed.
///
/// Taken and adapted from Numerical Recipes in C paragraph 2.1
template <class MatrixA, class MatrixB>
bool GaussianElimination(MatrixA &ioA, MatrixB &ioB, float inTolerance = 1.0e-16f)
{
	// Get problem dimensions
	const uint n = ioA.GetCols();
	const uint m = ioB.GetCols();

	// Check matrix requirement
	JPH_ASSERT(ioA.GetRows() == n);
	JPH_ASSERT(ioB.GetRows() == n);

	// Create array for bookkeeping on pivoting, ipiv[c] counts how often column c has been used as pivot
	int *ipiv = (int *)JPH_STACK_ALLOC(n * sizeof(int));
	memset(ipiv, 0, n * sizeof(int));

	for (uint i = 0; i < n; ++i)
	{
		// Initialize pivot element as the diagonal
		uint pivot_row = i, pivot_col = i;

		// Determine pivot element: full pivoting, search all rows/columns not yet pivoted on for the largest absolute element
		float largest_element = 0.0f;
		for (uint j = 0; j < n; ++j)
			if (ipiv[j] != 1)
				for (uint k = 0; k < n; ++k)
				{
					if (ipiv[k] == 0)
					{
						float element = abs(ioA(j, k));
						if (element >= largest_element)
						{
							largest_element = element;
							pivot_row = j;
							pivot_col = k;
						}
					}
					else if (ipiv[k] > 1)
					{
						// A column used as pivot more than once indicates a singular matrix
						return false;
					}
				}

		// Mark this column as used
		++ipiv[pivot_col];

		// Exchange rows when needed so that the pivot element is at ioA(pivot_col, pivot_col) instead of at ioA(pivot_row, pivot_col)
		if (pivot_row != pivot_col)
		{
			for (uint j = 0; j < n; ++j)
				std::swap(ioA(pivot_row, j), ioA(pivot_col, j));
			for (uint j = 0; j < m; ++j)
				std::swap(ioB(pivot_row, j), ioB(pivot_col, j));
		}

		// Get diagonal element that we are about to set to 1
		float diagonal_element = ioA(pivot_col, pivot_col);
		if (abs(diagonal_element) < inTolerance)
			return false;

		// Divide the whole row by the pivot element, making ioA(pivot_col, pivot_col) = 1
		for (uint j = 0; j < n; ++j)
			ioA(pivot_col, j) /= diagonal_element;
		for (uint j = 0; j < m; ++j)
			ioB(pivot_col, j) /= diagonal_element;
		ioA(pivot_col, pivot_col) = 1.0f;

		// Next reduce the rows, except for the pivot one,
		// after this step the pivot_col column is zero except for the pivot element which is 1
		for (uint j = 0; j < n; ++j)
			if (j != pivot_col)
			{
				float element = ioA(j, pivot_col);
				for (uint k = 0; k < n; ++k)
					ioA(j, k) -= ioA(pivot_col, k) * element;
				for (uint k = 0; k < m; ++k)
					ioB(j, k) -= ioB(pivot_col, k) * element;
				ioA(j, pivot_col) = 0.0f;
			}
	}

	// Success
	return true;
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,208 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/Vec4.h>
#include <Jolt/Core/FPException.h>
JPH_NAMESPACE_BEGIN
using HalfFloat = uint16; ///< A 16 bit half precision float stored as its IEEE 754 binary16 bit pattern

// Define half float constant values
static constexpr HalfFloat HALF_FLT_MAX = 0x7bff;			///< Largest finite half float (65504)
static constexpr HalfFloat HALF_FLT_MAX_NEGATIVE = 0xfbff;	///< Most negative finite half float (-65504)
static constexpr HalfFloat HALF_FLT_INF = 0x7c00;			///< Positive infinity (exponent all ones, mantissa zero)
static constexpr HalfFloat HALF_FLT_INF_NEGATIVE = 0xfc00;	///< Negative infinity
static constexpr HalfFloat HALF_FLT_NANQ = 0x7e00;			///< Quiet NaN
static constexpr HalfFloat HALF_FLT_NANQ_NEGATIVE = 0xfe00;	///< Quiet NaN with the sign bit set
namespace HalfFloatConversion {
// Layout of a float (IEEE 754 binary32: 1 sign bit, 8 exponent bits, 23 mantissa bits)
static constexpr int FLOAT_SIGN_POS = 31;
static constexpr int FLOAT_EXPONENT_POS = 23;
static constexpr int FLOAT_EXPONENT_BITS = 8;
static constexpr int FLOAT_EXPONENT_MASK = (1 << FLOAT_EXPONENT_BITS) - 1;
static constexpr int FLOAT_EXPONENT_BIAS = 127;
static constexpr int FLOAT_MANTISSA_BITS = 23;
static constexpr int FLOAT_MANTISSA_MASK = (1 << FLOAT_MANTISSA_BITS) - 1;
static constexpr int FLOAT_EXPONENT_AND_MANTISSA_MASK = FLOAT_MANTISSA_MASK + (FLOAT_EXPONENT_MASK << FLOAT_EXPONENT_POS); ///< All bits except the sign bit

// Layout of half float (IEEE 754 binary16: 1 sign bit, 5 exponent bits, 10 mantissa bits)
static constexpr int HALF_FLT_SIGN_POS = 15;
static constexpr int HALF_FLT_EXPONENT_POS = 10;
static constexpr int HALF_FLT_EXPONENT_BITS = 5;
static constexpr int HALF_FLT_EXPONENT_MASK = (1 << HALF_FLT_EXPONENT_BITS) - 1;
static constexpr int HALF_FLT_EXPONENT_BIAS = 15;
static constexpr int HALF_FLT_MANTISSA_BITS = 10;
static constexpr int HALF_FLT_MANTISSA_MASK = (1 << HALF_FLT_MANTISSA_BITS) - 1;
static constexpr int HALF_FLT_EXPONENT_AND_MANTISSA_MASK = HALF_FLT_MANTISSA_MASK + (HALF_FLT_EXPONENT_MASK << HALF_FLT_EXPONENT_POS); ///< All bits except the sign bit

/// Define half-float rounding modes
enum ERoundingMode
{
	ROUND_TO_NEG_INF, ///< Round to negative infinity
	ROUND_TO_POS_INF, ///< Round to positive infinity
	ROUND_TO_NEAREST, ///< Round to nearest value
};
/// Convert a float (32-bits) to a half float (16-bits), fallback version when no intrinsics available
template <int RoundingMode>
inline HalfFloat FromFloatFallback(float inV)
{
	// Reinterpret the float as an uint32
	uint32 value = BitCast<uint32>(inV);

	// Extract exponent
	uint32 exponent = (value >> FLOAT_EXPONENT_POS) & FLOAT_EXPONENT_MASK;

	// Extract mantissa
	uint32 mantissa = value & FLOAT_MANTISSA_MASK;

	// Extract the sign and move it into the right spot for the half float (so we can just or it in at the end)
	HalfFloat hf_sign = HalfFloat(value >> (FLOAT_SIGN_POS - HALF_FLT_SIGN_POS)) & (1 << HALF_FLT_SIGN_POS);

	// Check NaN or INF
	if (exponent == FLOAT_EXPONENT_MASK) // NaN or INF
		return hf_sign | (mantissa == 0? HALF_FLT_INF : HALF_FLT_NANQ);

	// Rebias the exponent for half floats
	int rebiased_exponent = int(exponent) - FLOAT_EXPONENT_BIAS + HALF_FLT_EXPONENT_BIAS;

	// Check overflow to infinity
	if (rebiased_exponent >= HALF_FLT_EXPONENT_MASK)
	{
		// Round to infinity when rounding to nearest or when rounding towards the sign of the value, otherwise clamp to the largest finite half float
		bool round_up = RoundingMode == ROUND_TO_NEAREST || (hf_sign == 0) == (RoundingMode == ROUND_TO_POS_INF);
		return hf_sign | (round_up? HALF_FLT_INF : HALF_FLT_MAX);
	}

	// Check underflow to zero
	if (rebiased_exponent < -HALF_FLT_MANTISSA_BITS)
	{
		// Too small even for a denormal, result is zero or the smallest denormal (bit pattern 1) depending on the rounding direction
		bool round_up = RoundingMode != ROUND_TO_NEAREST && (hf_sign == 0) == (RoundingMode == ROUND_TO_POS_INF) && (value & FLOAT_EXPONENT_AND_MANTISSA_MASK) != 0;
		return hf_sign | (round_up? 1 : 0);
	}

	HalfFloat hf_exponent;
	int shift;
	if (rebiased_exponent <= 0)
	{
		// Underflow to denormalized number
		hf_exponent = 0;
		mantissa |= 1 << FLOAT_MANTISSA_BITS; // Add the implicit 1 bit to the mantissa
		shift = FLOAT_MANTISSA_BITS - HALF_FLT_MANTISSA_BITS + 1 - rebiased_exponent;
	}
	else
	{
		// Normal half float
		hf_exponent = HalfFloat(rebiased_exponent << HALF_FLT_EXPONENT_POS);
		shift = FLOAT_MANTISSA_BITS - HALF_FLT_MANTISSA_BITS;
	}

	// Compose the half float
	HalfFloat hf_mantissa = HalfFloat(mantissa >> shift);
	HalfFloat hf = hf_sign | hf_exponent | hf_mantissa;

	// Calculate the remaining bits that we're discarding
	uint remainder = mantissa & ((1 << shift) - 1);

	if constexpr (RoundingMode == ROUND_TO_NEAREST)
	{
		// Round to nearest
		uint round_threshold = 1 << (shift - 1);
		if (remainder > round_threshold // Above threshold, we must always round
			|| (remainder == round_threshold && (hf_mantissa & 1))) // When equal, round to nearest even
			hf++; // May overflow to infinity
	}
	else
	{
		// Round up or down (truncate) depending on the rounding mode
		bool round_up = (hf_sign == 0) == (RoundingMode == ROUND_TO_POS_INF) && remainder != 0;
		if (round_up)
			hf++; // May overflow to infinity
	}

	return hf;
}
/// Convert a float (32-bits) to a half float (16-bits)
template <int RoundingMode>
JPH_INLINE HalfFloat FromFloat(float inV)
{
#ifdef JPH_USE_F16C
	// The conversion can overflow to infinity for out of range values, suppress the resulting FP overflow exception
	FPExceptionDisableOverflow disable_overflow;
	JPH_UNUSED(disable_overflow);

	// _mm_cvtps_ph produces 8 half floats, only the first is used
	union
	{
		__m128i u128;
		HalfFloat u16[8];
	} hf;
	__m128 val = _mm_load_ss(&inV);
	switch (RoundingMode)
	{
	case ROUND_TO_NEG_INF:
		hf.u128 = _mm_cvtps_ph(val, _MM_FROUND_TO_NEG_INF);
		break;
	case ROUND_TO_POS_INF:
		hf.u128 = _mm_cvtps_ph(val, _MM_FROUND_TO_POS_INF);
		break;
	case ROUND_TO_NEAREST:
		hf.u128 = _mm_cvtps_ph(val, _MM_FROUND_TO_NEAREST_INT);
		break;
	}
	return hf.u16[0];
#else
	// No hardware support, use the bit manipulating fallback
	return FromFloatFallback<RoundingMode>(inV);
#endif
}
/// Convert 4 half floats (lower 64 bits) to floats, fallback version when no intrinsics available
inline Vec4 ToFloatFallback(UVec4Arg inValue)
{
	// Unpack half floats to 4 uint32's
	UVec4 value = inValue.Expand4Uint16Lo();

	// Normal half float path, extract the exponent and mantissa, shift them into place and update the exponent bias
	UVec4 exponent_mantissa = UVec4::sAnd(value, UVec4::sReplicate(HALF_FLT_EXPONENT_AND_MANTISSA_MASK)).LogicalShiftLeft<FLOAT_EXPONENT_POS - HALF_FLT_EXPONENT_POS>() + UVec4::sReplicate((FLOAT_EXPONENT_BIAS - HALF_FLT_EXPONENT_BIAS) << FLOAT_EXPONENT_POS);

	// Denormalized half float path, renormalize the float (the float subtraction normalizes the mantissa and removes the offset that was added to the exponent)
	UVec4 exponent_mantissa_denormalized = ((exponent_mantissa + UVec4::sReplicate(1 << FLOAT_EXPONENT_POS)).ReinterpretAsFloat() - UVec4::sReplicate((FLOAT_EXPONENT_BIAS - HALF_FLT_EXPONENT_BIAS + 1) << FLOAT_EXPONENT_POS).ReinterpretAsFloat()).ReinterpretAsInt();

	// NaN / INF path, set all exponent bits
	UVec4 exponent_mantissa_nan_inf = UVec4::sOr(exponent_mantissa, UVec4::sReplicate(FLOAT_EXPONENT_MASK << FLOAT_EXPONENT_POS));

	// Get the exponent to determine which of the paths we should take
	UVec4 exponent_mask = UVec4::sReplicate(HALF_FLT_EXPONENT_MASK << HALF_FLT_EXPONENT_POS);
	UVec4 exponent = UVec4::sAnd(value, exponent_mask);
	UVec4 is_denormalized = UVec4::sEquals(exponent, UVec4::sZero()); // Exponent all zeros: zero or denormal
	UVec4 is_nan_inf = UVec4::sEquals(exponent, exponent_mask); // Exponent all ones: NaN or INF

	// Select the correct result
	UVec4 result_exponent_mantissa = UVec4::sSelect(UVec4::sSelect(exponent_mantissa, exponent_mantissa_nan_inf, is_nan_inf), exponent_mantissa_denormalized, is_denormalized);

	// Extract the sign bit and shift it to the left
	UVec4 sign = UVec4::sAnd(value, UVec4::sReplicate(1 << HALF_FLT_SIGN_POS)).LogicalShiftLeft<FLOAT_SIGN_POS - HALF_FLT_SIGN_POS>();

	// Construct the float
	return UVec4::sOr(sign, result_exponent_mantissa).ReinterpretAsFloat();
}
/// Convert 4 half floats (lower 64 bits) to floats
JPH_INLINE Vec4 ToFloat(UVec4Arg inValue)
{
#if defined(JPH_USE_F16C)
	// Hardware conversion of the 4 half floats stored in the lower 64 bits
	return _mm_cvtph_ps(inValue.mValue);
#elif defined(JPH_USE_NEON)
	// Reinterpret the lower 64 bits as 4 half floats and widen them to 32 bit floats
	return vcvt_f32_f16(vreinterpret_f16_u32(vget_low_u32(inValue.mValue)));
#else
	// No hardware support, use the bit manipulating fallback
	return ToFloatFallback(inValue);
#endif
}
} // HalfFloatConversion
JPH_NAMESPACE_END

View File

@@ -0,0 +1,243 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/MathTypes.h>
JPH_NAMESPACE_BEGIN
/// Holds a 4x4 matrix of floats, but supports also operations on the 3x3 upper left part of the matrix.
class [[nodiscard]] alignas(JPH_VECTOR_ALIGNMENT) Mat44
{
public:
JPH_OVERRIDE_NEW_DELETE
// Underlying column type
using Type = Vec4::Type;
// Argument type
using ArgType = Mat44Arg;
/// Constructor
Mat44() = default; ///< Intentionally not initialized for performance reasons
JPH_INLINE Mat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, Vec4Arg inC4);
JPH_INLINE Mat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, Vec3Arg inC4);
Mat44(const Mat44 &inM2) = default;
Mat44 & operator = (const Mat44 &inM2) = default;
JPH_INLINE Mat44(Type inC1, Type inC2, Type inC3, Type inC4);
/// Zero matrix
static JPH_INLINE Mat44 sZero();
/// Identity matrix
static JPH_INLINE Mat44 sIdentity();
/// Matrix filled with NaN's
static JPH_INLINE Mat44 sNaN();
/// Load 16 floats from memory
static JPH_INLINE Mat44 sLoadFloat4x4(const Float4 *inV);
/// Load 16 floats from memory, 16 bytes aligned
static JPH_INLINE Mat44 sLoadFloat4x4Aligned(const Float4 *inV);
/// Rotate around X, Y or Z axis (angle in radians)
static JPH_INLINE Mat44 sRotationX(float inX);
static JPH_INLINE Mat44 sRotationY(float inY);
static JPH_INLINE Mat44 sRotationZ(float inZ);
/// Rotate around arbitrary axis
static JPH_INLINE Mat44 sRotation(Vec3Arg inAxis, float inAngle);
/// Rotate from quaternion
static JPH_INLINE Mat44 sRotation(QuatArg inQuat);
/// Get matrix that translates
static JPH_INLINE Mat44 sTranslation(Vec3Arg inV);
/// Get matrix that rotates and translates
static JPH_INLINE Mat44 sRotationTranslation(QuatArg inR, Vec3Arg inT);
/// Get inverse matrix of sRotationTranslation
static JPH_INLINE Mat44 sInverseRotationTranslation(QuatArg inR, Vec3Arg inT);
/// Get matrix that scales uniformly
static JPH_INLINE Mat44 sScale(float inScale);
/// Get matrix that scales (produces a matrix with (inV, 1) on its diagonal)
static JPH_INLINE Mat44 sScale(Vec3Arg inV);
/// Get outer product of inV and inV2 (equivalent to \f$inV1 \otimes inV2\f$)
static JPH_INLINE Mat44 sOuterProduct(Vec3Arg inV1, Vec3Arg inV2);
/// Get matrix that represents a cross product \f$A \times B = \text{sCrossProduct}(A) \: B\f$
static JPH_INLINE Mat44 sCrossProduct(Vec3Arg inV);
/// Returns matrix ML so that \f$ML(q) \: p = q \: p\f$ (where p and q are quaternions)
static JPH_INLINE Mat44 sQuatLeftMultiply(QuatArg inQ);
/// Returns matrix MR so that \f$MR(q) \: p = p \: q\f$ (where p and q are quaternions)
static JPH_INLINE Mat44 sQuatRightMultiply(QuatArg inQ);
/// Returns a look at matrix that transforms from world space to view space
/// @param inPos Position of the camera
/// @param inTarget Target of the camera
/// @param inUp Up vector
static JPH_INLINE Mat44 sLookAt(Vec3Arg inPos, Vec3Arg inTarget, Vec3Arg inUp);
/// Returns a right-handed perspective projection matrix
static JPH_INLINE Mat44 sPerspective(float inFovY, float inAspect, float inNear, float inFar);
/// Get float component by element index
JPH_INLINE float operator () (uint inRow, uint inColumn) const { JPH_ASSERT(inRow < 4); JPH_ASSERT(inColumn < 4); return mCol[inColumn].mF32[inRow]; }
JPH_INLINE float & operator () (uint inRow, uint inColumn) { JPH_ASSERT(inRow < 4); JPH_ASSERT(inColumn < 4); return mCol[inColumn].mF32[inRow]; }
/// Comparison
JPH_INLINE bool operator == (Mat44Arg inM2) const;
JPH_INLINE bool operator != (Mat44Arg inM2) const { return !(*this == inM2); }
/// Test if two matrices are close
JPH_INLINE bool IsClose(Mat44Arg inM2, float inMaxDistSq = 1.0e-12f) const;
/// Multiply matrix by matrix
JPH_INLINE Mat44 operator * (Mat44Arg inM) const;
/// Multiply vector by matrix
JPH_INLINE Vec3 operator * (Vec3Arg inV) const;
JPH_INLINE Vec4 operator * (Vec4Arg inV) const;
/// Multiply vector by only 3x3 part of the matrix
JPH_INLINE Vec3 Multiply3x3(Vec3Arg inV) const;
/// Multiply vector by only 3x3 part of the transpose of the matrix (\f$result = this^T \: inV\f$)
JPH_INLINE Vec3 Multiply3x3Transposed(Vec3Arg inV) const;
/// Multiply 3x3 matrix by 3x3 matrix
JPH_INLINE Mat44 Multiply3x3(Mat44Arg inM) const;
/// Multiply transpose of 3x3 matrix by 3x3 matrix (\f$result = this^T \: inM\f$)
JPH_INLINE Mat44 Multiply3x3LeftTransposed(Mat44Arg inM) const;
/// Multiply 3x3 matrix by the transpose of a 3x3 matrix (\f$result = this \: inM^T\f$)
JPH_INLINE Mat44 Multiply3x3RightTransposed(Mat44Arg inM) const;
/// Multiply matrix with float
JPH_INLINE Mat44 operator * (float inV) const;
/// Multiply float with matrix (scalar multiplication commutes, so this forwards to matrix * float)
friend JPH_INLINE Mat44 operator * (float inV, Mat44Arg inM) { return inM * inV; }
/// Multiply matrix with float in place
JPH_INLINE Mat44 & operator *= (float inV);
/// Per element addition of matrix
JPH_INLINE Mat44 operator + (Mat44Arg inM) const;
/// Negate
JPH_INLINE Mat44 operator - () const;
/// Per element subtraction of matrix
JPH_INLINE Mat44 operator - (Mat44Arg inM) const;
/// Per element addition of matrix
JPH_INLINE Mat44 & operator += (Mat44Arg inM);
/// Access to the columns (matrix is stored column major, so an axis is a column of the 3x3 part)
JPH_INLINE Vec3 GetAxisX() const { return Vec3(mCol[0]); }
JPH_INLINE void SetAxisX(Vec3Arg inV) { mCol[0] = Vec4(inV, 0.0f); }
JPH_INLINE Vec3 GetAxisY() const { return Vec3(mCol[1]); }
JPH_INLINE void SetAxisY(Vec3Arg inV) { mCol[1] = Vec4(inV, 0.0f); }
JPH_INLINE Vec3 GetAxisZ() const { return Vec3(mCol[2]); }
JPH_INLINE void SetAxisZ(Vec3Arg inV) { mCol[2] = Vec4(inV, 0.0f); }
/// Translation lives in column 3; the setter writes 1 in the w component to keep the matrix affine
JPH_INLINE Vec3 GetTranslation() const { return Vec3(mCol[3]); }
JPH_INLINE void SetTranslation(Vec3Arg inV) { mCol[3] = Vec4(inV, 1.0f); }
/// Diagonal of the 3x3 part (setters leave the off-diagonal elements untouched)
JPH_INLINE Vec3 GetDiagonal3() const { return Vec3(mCol[0][0], mCol[1][1], mCol[2][2]); }
JPH_INLINE void SetDiagonal3(Vec3Arg inV) { mCol[0][0] = inV.GetX(); mCol[1][1] = inV.GetY(); mCol[2][2] = inV.GetZ(); }
/// Full 4 element diagonal
JPH_INLINE Vec4 GetDiagonal4() const { return Vec4(mCol[0][0], mCol[1][1], mCol[2][2], mCol[3][3]); }
JPH_INLINE void SetDiagonal4(Vec4Arg inV) { mCol[0][0] = inV.GetX(); mCol[1][1] = inV.GetY(); mCol[2][2] = inV.GetZ(); mCol[3][3] = inV.GetW(); }
/// Generic column access; SetColumn3 writes w = 1 for column 3 (translation) and w = 0 otherwise
JPH_INLINE Vec3 GetColumn3(uint inCol) const { JPH_ASSERT(inCol < 4); return Vec3(mCol[inCol]); }
JPH_INLINE void SetColumn3(uint inCol, Vec3Arg inV) { JPH_ASSERT(inCol < 4); mCol[inCol] = Vec4(inV, inCol == 3? 1.0f : 0.0f); }
JPH_INLINE Vec4 GetColumn4(uint inCol) const { JPH_ASSERT(inCol < 4); return mCol[inCol]; }
JPH_INLINE void SetColumn4(uint inCol, Vec4Arg inV) { JPH_ASSERT(inCol < 4); mCol[inCol] = inV; }
/// Store matrix to memory
JPH_INLINE void StoreFloat4x4(Float4 *outV) const;
/// Transpose matrix
JPH_INLINE Mat44 Transposed() const;
/// Transpose 3x3 subpart of matrix
JPH_INLINE Mat44 Transposed3x3() const;
/// Inverse 4x4 matrix
JPH_INLINE Mat44 Inversed() const;
/// Inverse 4x4 matrix when it only contains rotation and translation
JPH_INLINE Mat44 InversedRotationTranslation() const;
/// Get the determinant of a 3x3 matrix
JPH_INLINE float GetDeterminant3x3() const;
/// Get the adjoint of a 3x3 matrix
JPH_INLINE Mat44 Adjointed3x3() const;
/// Inverse 3x3 matrix
JPH_INLINE Mat44 Inversed3x3() const;
/// *this = inM.Inversed3x3(), returns false if the matrix is singular in which case *this is unchanged
JPH_INLINE bool SetInversed3x3(Mat44Arg inM);
/// Get rotation part only (note: retains the first 3 values from the bottom row)
JPH_INLINE Mat44 GetRotation() const;
/// Get rotation part only (note: also clears the bottom row)
JPH_INLINE Mat44 GetRotationSafe() const;
/// Updates the rotation part of this matrix (the first 3 columns)
JPH_INLINE void SetRotation(Mat44Arg inRotation);
/// Convert to quaternion
JPH_INLINE Quat GetQuaternion() const;
/// Get matrix that transforms a direction with the same transform as this matrix (length is not preserved).
/// This is the inverse-transpose of the 3x3 part, which transforms normals correctly under non-uniform scale.
JPH_INLINE Mat44 GetDirectionPreservingMatrix() const { return GetRotation().Inversed3x3().Transposed3x3(); }
/// Pre multiply by translation matrix: result = this * Mat44::sTranslation(inTranslation)
JPH_INLINE Mat44 PreTranslated(Vec3Arg inTranslation) const;
/// Post multiply by translation matrix: result = Mat44::sTranslation(inTranslation) * this (i.e. add inTranslation to the 4-th column)
JPH_INLINE Mat44 PostTranslated(Vec3Arg inTranslation) const;
/// Scale a matrix: result = this * Mat44::sScale(inScale)
JPH_INLINE Mat44 PreScaled(Vec3Arg inScale) const;
/// Scale a matrix: result = Mat44::sScale(inScale) * this
JPH_INLINE Mat44 PostScaled(Vec3Arg inScale) const;
/// Decompose a matrix into a rotation & translation part and into a scale part so that:
/// this = return_value * Mat44::sScale(outScale).
/// This equation only holds when the matrix is orthogonal, if it is not the returned matrix
/// will be made orthogonal using the modified Gram-Schmidt algorithm (see: https://en.wikipedia.org/wiki/Gram%E2%80%93Schmidt_process)
JPH_INLINE Mat44 Decompose(Vec3 &outScale) const;
#ifndef JPH_DOUBLE_PRECISION
/// In single precision mode just return the matrix itself
JPH_INLINE Mat44 ToMat44() const { return *this; }
#endif // !JPH_DOUBLE_PRECISION
/// To String: streams the four columns separated by commas
friend ostream & operator << (ostream &inStream, Mat44Arg inM)
{
	inStream << inM.mCol[0] << ", " << inM.mCol[1] << ", " << inM.mCol[2] << ", " << inM.mCol[3];
	return inStream;
}
private:
Vec4 mCol[4]; ///< Column
};
static_assert(std::is_trivial<Mat44>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
#include "Mat44.inl"

View File

@@ -0,0 +1,952 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/Vec3.h>
#include <Jolt/Math/Vec4.h>
#include <Jolt/Math/Quat.h>
JPH_NAMESPACE_BEGIN
#define JPH_EL(r, c) mCol[c].mF32[r]
/// Construct from four column vectors
Mat44::Mat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, Vec4Arg inC4) :
	mCol { inC1, inC2, inC3, inC4 }
{
}
/// Construct from three column vectors and a translation (w of the last column is set to 1)
Mat44::Mat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, Vec3Arg inC4) :
	mCol { inC1, inC2, inC3, Vec4(inC4, 1.0f) }
{
}
/// Construct from four raw SIMD registers
Mat44::Mat44(Type inC1, Type inC2, Type inC3, Type inC4) :
	mCol { inC1, inC2, inC3, inC4 }
{
}
/// All-zero matrix
Mat44 Mat44::sZero()
{
	return Mat44(Vec4::sZero(), Vec4::sZero(), Vec4::sZero(), Vec4::sZero());
}
/// Identity matrix
Mat44 Mat44::sIdentity()
{
	return Mat44(Vec4(1, 0, 0, 0), Vec4(0, 1, 0, 0), Vec4(0, 0, 1, 0), Vec4(0, 0, 0, 1));
}
/// Matrix filled with NaN, useful to catch uninitialized use
Mat44 Mat44::sNaN()
{
	return Mat44(Vec4::sNaN(), Vec4::sNaN(), Vec4::sNaN(), Vec4::sNaN());
}
/// Load 4 columns of 4 floats from (possibly unaligned) memory
Mat44 Mat44::sLoadFloat4x4(const Float4 *inV)
{
	Mat44 result;
	for (int c = 0; c < 4; ++c)
		result.mCol[c] = Vec4::sLoadFloat4(inV + c);
	return result;
}
/// Load 4 columns of 4 floats from 16 byte aligned memory
Mat44 Mat44::sLoadFloat4x4Aligned(const Float4 *inV)
{
	Mat44 result;
	for (int c = 0; c < 4; ++c)
		result.mCol[c] = Vec4::sLoadFloat4Aligned(inV + c);
	return result;
}
/// Rotation of inX radians around the X axis.
/// Uses the vectorized Vec4::SinCos rather than std::sin/cos — presumably so results are
/// deterministic across platforms, matching the pattern used elsewhere in this file.
Mat44 Mat44::sRotationX(float inX)
{
	Vec4 sv, cv;
	Vec4::sReplicate(inX).SinCos(sv, cv);
	float s = sv.GetX(), c = cv.GetX();
	// Column major: column 1 = (0, c, s, 0), column 2 = (0, -s, c, 0)
	return Mat44(Vec4(1, 0, 0, 0), Vec4(0, c, s, 0), Vec4(0, -s, c, 0), Vec4(0, 0, 0, 1));
}
/// Rotation of inY radians around the Y axis
Mat44 Mat44::sRotationY(float inY)
{
	Vec4 sv, cv;
	Vec4::sReplicate(inY).SinCos(sv, cv);
	float s = sv.GetX(), c = cv.GetX();
	return Mat44(Vec4(c, 0, -s, 0), Vec4(0, 1, 0, 0), Vec4(s, 0, c, 0), Vec4(0, 0, 0, 1));
}
/// Rotation of inZ radians around the Z axis
Mat44 Mat44::sRotationZ(float inZ)
{
	Vec4 sv, cv;
	Vec4::sReplicate(inZ).SinCos(sv, cv);
	float s = sv.GetX(), c = cv.GetX();
	return Mat44(Vec4(c, s, 0, 0), Vec4(-s, c, 0, 0), Vec4(0, 0, 1, 0), Vec4(0, 0, 0, 1));
}
/// Convert a (normalized) quaternion to a 4x4 rotation matrix.
/// Both code paths are written to produce bit-identical results (see inline notes).
Mat44 Mat44::sRotation(QuatArg inQuat)
{
	JPH_ASSERT(inQuat.IsNormalized());
	// See: https://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation section 'Quaternion-derived rotation matrix'
#ifdef JPH_USE_SSE4_1
	__m128 xyzw = inQuat.mValue.mValue;
	__m128 two_xyzw = _mm_add_ps(xyzw, xyzw);
	__m128 yzxw = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 0, 2, 1));
	__m128 two_yzxw = _mm_add_ps(yzxw, yzxw);
	__m128 zxyw = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 1, 0, 2));
	__m128 two_zxyw = _mm_add_ps(zxyw, zxyw);
	__m128 wwww = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 3, 3, 3));
	__m128 diagonal = _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(two_yzxw, yzxw)), _mm_mul_ps(two_zxyw, zxyw));	// (1 - 2 y^2 - 2 z^2, 1 - 2 x^2 - 2 z^2, 1 - 2 x^2 - 2 y^2, 1 - 4 w^2)
	__m128 plus = _mm_add_ps(_mm_mul_ps(two_xyzw, zxyw), _mm_mul_ps(two_yzxw, wwww));	// 2 * (xz + yw, xy + zw, yz + xw, ww)
	__m128 minus = _mm_sub_ps(_mm_mul_ps(two_yzxw, xyzw), _mm_mul_ps(two_zxyw, wwww));	// 2 * (xy - zw, yz - xw, xz - yw, 0)
	// Workaround for compiler changing _mm_sub_ps(_mm_mul_ps(...), ...) into a fused multiply sub instruction, resulting in w not being 0
	// There doesn't appear to be a reliable way to turn this off in Clang
	minus = _mm_insert_ps(minus, minus, 0b1000);
	// Blend the diagonal, 'plus' and 'minus' terms into the three rotation columns
	__m128 col0 = _mm_blend_ps(_mm_blend_ps(plus, diagonal, 0b0001), minus, 0b1100);	// (1 - 2 y^2 - 2 z^2, 2 xy + 2 zw, 2 xz - 2 yw, 0)
	__m128 col1 = _mm_blend_ps(_mm_blend_ps(diagonal, minus, 0b1001), plus, 0b0100);	// (2 xy - 2 zw, 1 - 2 x^2 - 2 z^2, 2 yz + 2 xw, 0)
	__m128 col2 = _mm_blend_ps(_mm_blend_ps(minus, plus, 0b0001), diagonal, 0b0100);	// (2 xz + 2 yw, 2 yz - 2 xw, 1 - 2 x^2 - 2 y^2, 0)
	__m128 col3 = _mm_set_ps(1, 0, 0, 0);
	return Mat44(col0, col1, col2, col3);
#else
	float x = inQuat.GetX();
	float y = inQuat.GetY();
	float z = inQuat.GetZ();
	float w = inQuat.GetW();
	float tx = x + x; // Note: Using x + x instead of 2.0f * x to force this function to return the same value as the SSE4.1 version across platforms.
	float ty = y + y;
	float tz = z + z;
	float xx = tx * x;
	float yy = ty * y;
	float zz = tz * z;
	float xy = tx * y;
	float xz = tx * z;
	float xw = tx * w;
	float yz = ty * z;
	float yw = ty * w;
	float zw = tz * w;
	return Mat44(Vec4((1.0f - yy) - zz, xy + zw, xz - yw, 0.0f), // Note: Added extra brackets to force this function to return the same value as the SSE4.1 version across platforms.
				Vec4(xy - zw, (1.0f - zz) - xx, yz + xw, 0.0f),
				Vec4(xz + yw, yz - xw, (1.0f - xx) - yy, 0.0f),
				Vec4(0.0f, 0.0f, 0.0f, 1.0f));
#endif
}
/// Rotation of inAngle radians around the (normalized) axis inAxis
Mat44 Mat44::sRotation(Vec3Arg inAxis, float inAngle)
{
	// Build the equivalent quaternion and convert it to a matrix
	Quat rotation = Quat::sRotation(inAxis, inAngle);
	return sRotation(rotation);
}
/// Pure translation matrix: identity with inV in the 4th column
Mat44 Mat44::sTranslation(Vec3Arg inV)
{
	Mat44 result = sIdentity();
	result.SetTranslation(inV);
	return result;
}
/// Combined rotation + translation: rotation fills the 3x3 part, inT goes into column 3
Mat44 Mat44::sRotationTranslation(QuatArg inR, Vec3Arg inT)
{
	Mat44 result = sRotation(inR);
	result.SetTranslation(inT);
	return result;
}
/// Inverse of sRotationTranslation(inR, inT): (R T)^-1 = R^-1 * T^-1,
/// i.e. rotate by the conjugate quaternion and translate by the (rotated) negated translation
Mat44 Mat44::sInverseRotationTranslation(QuatArg inR, Vec3Arg inT)
{
	Mat44 m = sRotation(inR.Conjugated());
	m.SetTranslation(-m.Multiply3x3(inT));
	return m;
}
/// Uniform scale matrix: inScale on the first three diagonal elements, 1 in the w position
Mat44 Mat44::sScale(float inScale)
{
	Vec4 scale_x(inScale, 0, 0, 0);
	Vec4 scale_y(0, inScale, 0, 0);
	Vec4 scale_z(0, 0, inScale, 0);
	return Mat44(scale_x, scale_y, scale_z, Vec4(0, 0, 0, 1));
}
/// Non-uniform scale matrix: the components of inV on the diagonal of the 3x3 part
Mat44 Mat44::sScale(Vec3Arg inV)
{
	Mat44 result = sIdentity();
	result.SetDiagonal3(inV);
	return result;
}
/// Outer product inV1 * inV2^T in the 3x3 part (element (r, c) = inV1[r] * inV2[c]),
/// with the last column set to (0, 0, 0, 1)
Mat44 Mat44::sOuterProduct(Vec3Arg inV1, Vec3Arg inV2)
{
	Vec4 v1(inV1, 0);
	// Each result column is inV1 scaled by one component of inV2
	return Mat44(v1 * inV2.SplatX(), v1 * inV2.SplatY(), v1 * inV2.SplatZ(), Vec4(0, 0, 0, 1));
}
/// Skew-symmetric matrix M such that M * v == inV.Cross(v) for any vector v
Mat44 Mat44::sCrossProduct(Vec3Arg inV)
{
#ifdef JPH_USE_SSE4_1
	// Zero out the W component
	__m128 zero = _mm_setzero_ps();
	__m128 v = _mm_blend_ps(inV.mValue, zero, 0b1000);
	// Negate
	__m128 min_v = _mm_sub_ps(zero, v);
	// Shuffle the positive and negated components into the three skew-symmetric columns
	return Mat44(
		_mm_shuffle_ps(v, min_v, _MM_SHUFFLE(3, 1, 2, 3)), // [0, z, -y, 0]
		_mm_shuffle_ps(min_v, v, _MM_SHUFFLE(3, 0, 3, 2)), // [-z, 0, x, 0]
		_mm_blend_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 1)), _mm_shuffle_ps(min_v, min_v, _MM_SHUFFLE(3, 3, 0, 3)), 0b0010), // [y, -x, 0, 0]
		Vec4(0, 0, 0, 1));
#else
	float x = inV.GetX();
	float y = inV.GetY();
	float z = inV.GetZ();
	return Mat44(
		Vec4(0, z, -y, 0),
		Vec4(-z, 0, x, 0),
		Vec4(y, -x, 0, 0),
		Vec4(0, 0, 0, 1));
#endif
}
/// Right-handed view matrix looking from inPos towards inTarget (camera looks down -Z).
/// Falls back to default axes when the direction or right vector degenerates (target == pos,
/// or direction parallel to inUp). Returns the world-to-camera transform.
Mat44 Mat44::sLookAt(Vec3Arg inPos, Vec3Arg inTarget, Vec3Arg inUp)
{
	Vec3 direction = (inTarget - inPos).NormalizedOr(-Vec3::sAxisZ());
	Vec3 right = direction.Cross(inUp).NormalizedOr(Vec3::sAxisX());
	Vec3 up = right.Cross(direction);
	// Build the camera-to-world basis, then invert it (rotation + translation only)
	return Mat44(Vec4(right, 0), Vec4(up, 0), Vec4(-direction, 0), Vec4(inPos, 1)).InversedRotationTranslation();
}
/// Right-handed perspective projection matrix.
/// From the math below: z = -inNear maps to depth 0 and z = -inFar maps to depth 1
/// (clip.z = range * z + range * near, clip.w = -z), i.e. a [0, 1] depth range.
/// @param inFovY Vertical field of view in radians
/// @param inAspect Width / height of the viewport
Mat44 Mat44::sPerspective(float inFovY, float inAspect, float inNear, float inFar)
{
	float height = 1.0f / Tan(0.5f * inFovY);
	float width = height / inAspect;
	float range = inFar / (inNear - inFar);
	return Mat44(Vec4(width, 0.0f, 0.0f, 0.0f), Vec4(0.0f, height, 0.0f, 0.0f), Vec4(0.0f, 0.0f, range, -1.0f), Vec4(0.0f, 0.0f, range * inNear, 0.0f));
}
/// Exact per-element comparison of all 16 elements (note: like any float ==, NaN elements compare unequal)
bool Mat44::operator == (Mat44Arg inM2) const
{
	// AND the per-component equality masks of the four columns together and test that every lane is set
	return UVec4::sAnd(
		UVec4::sAnd(Vec4::sEquals(mCol[0], inM2.mCol[0]), Vec4::sEquals(mCol[1], inM2.mCol[1])),
		UVec4::sAnd(Vec4::sEquals(mCol[2], inM2.mCol[2]), Vec4::sEquals(mCol[3], inM2.mCol[3]))
	).TestAllTrue();
}
/// Test if this matrix is close to inM2: every column must be within sqrt(inMaxDistSq) of the corresponding column
bool Mat44::IsClose(Mat44Arg inM2, float inMaxDistSq) const
{
	return mCol[0].IsClose(inM2.mCol[0], inMaxDistSq)
		&& mCol[1].IsClose(inM2.mCol[1], inMaxDistSq)
		&& mCol[2].IsClose(inM2.mCol[2], inMaxDistSq)
		&& mCol[3].IsClose(inM2.mCol[3], inMaxDistSq);
}
/// 4x4 matrix product: each result column is a linear combination of our columns,
/// weighted by the components of the corresponding column of inM
Mat44 Mat44::operator * (Mat44Arg inM) const
{
	Mat44 result;
#if defined(JPH_USE_SSE)
	for (int i = 0; i < 4; ++i)
	{
		__m128 c = inM.mCol[i].mValue;
		// result.col[i] = col[0] * c.x + col[1] * c.y + col[2] * c.z + col[3] * c.w
		__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[3].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3))));
		result.mCol[i].mValue = t;
	}
#elif defined(JPH_USE_NEON)
	for (int i = 0; i < 4; ++i)
	{
		Type c = inM.mCol[i].mValue;
		Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
		t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
		t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
		t = vmlaq_f32(t, mCol[3].mValue, vdupq_laneq_f32(c, 3));
		result.mCol[i].mValue = t;
	}
#else
	for (int i = 0; i < 4; ++i)
		result.mCol[i] = mCol[0] * inM.mCol[i].mF32[0] + mCol[1] * inM.mCol[i].mF32[1] + mCol[2] * inM.mCol[i].mF32[2] + mCol[3] * inM.mCol[i].mF32[3];
#endif
	return result;
}
/// Transform a point: treats inV as having an implicit w = 1, so the translation column is added
Vec3 Mat44::operator * (Vec3Arg inV) const
{
#if defined(JPH_USE_SSE)
	__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
	t = _mm_add_ps(t, mCol[3].mValue); // Translation column added unscaled (w assumed 1)
	return Vec3::sFixW(t);
#elif defined(JPH_USE_NEON)
	Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
	t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
	t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
	t = vaddq_f32(t, mCol[3].mValue); // Don't combine this with the first mul into a fused multiply add, causes precision issues
	return Vec3::sFixW(t);
#else
	return Vec3(
		mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2] + mCol[3].mF32[0],
		mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2] + mCol[3].mF32[1],
		mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2] + mCol[3].mF32[2]);
#endif
}
/// Full 4-component matrix * vector product (w is taken from inV, not assumed to be 1)
Vec4 Mat44::operator * (Vec4Arg inV) const
{
#if defined(JPH_USE_SSE)
	__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[3].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(3, 3, 3, 3))));
	return t;
#elif defined(JPH_USE_NEON)
	Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
	t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
	t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
	t = vmlaq_f32(t, mCol[3].mValue, vdupq_laneq_f32(inV.mValue, 3));
	return t;
#else
	return Vec4(
		mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2] + mCol[3].mF32[0] * inV.mF32[3],
		mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2] + mCol[3].mF32[1] * inV.mF32[3],
		mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2] + mCol[3].mF32[2] * inV.mF32[3],
		mCol[0].mF32[3] * inV.mF32[0] + mCol[1].mF32[3] * inV.mF32[1] + mCol[2].mF32[3] * inV.mF32[2] + mCol[3].mF32[3] * inV.mF32[3]);
#endif
}
/// Transform a direction: multiply by the 3x3 part only (translation column ignored)
Vec3 Mat44::Multiply3x3(Vec3Arg inV) const
{
#if defined(JPH_USE_SSE)
	__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
	return Vec3::sFixW(t);
#elif defined(JPH_USE_NEON)
	Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
	t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
	t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
	return Vec3::sFixW(t);
#else
	return Vec3(
		mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2],
		mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2],
		mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2]);
#endif
}
/// Multiply inV by the transpose of the 3x3 part, i.e. dot inV with each column
Vec3 Mat44::Multiply3x3Transposed(Vec3Arg inV) const
{
#if defined(JPH_USE_SSE4_1)
	// _mm_dp_ps mask 0x7f: dot the x, y, z lanes (high nibble 0x7) and broadcast the result to all lanes (low nibble 0xf)
	__m128 x = _mm_dp_ps(mCol[0].mValue, inV.mValue, 0x7f);
	__m128 y = _mm_dp_ps(mCol[1].mValue, inV.mValue, 0x7f);
	__m128 xy = _mm_blend_ps(x, y, 0b0010);
	__m128 z = _mm_dp_ps(mCol[2].mValue, inV.mValue, 0x7f);
	__m128 xyzz = _mm_blend_ps(xy, z, 0b1100);
	return xyzz;
#else
	return Transposed3x3().Multiply3x3(inV);
#endif
}
/// 3x3 part of this matrix times the 3x3 part of inM; the last column of the result is (0, 0, 0, 1).
/// Requires that the w components of our first three columns are zero (asserted) so the
/// full-width SIMD multiplies don't pollute the result.
Mat44 Mat44::Multiply3x3(Mat44Arg inM) const
{
	JPH_ASSERT(mCol[0][3] == 0.0f);
	JPH_ASSERT(mCol[1][3] == 0.0f);
	JPH_ASSERT(mCol[2][3] == 0.0f);
	Mat44 result;
#if defined(JPH_USE_SSE)
	for (int i = 0; i < 3; ++i)
	{
		__m128 c = inM.mCol[i].mValue;
		__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
		result.mCol[i].mValue = t;
	}
#elif defined(JPH_USE_NEON)
	for (int i = 0; i < 3; ++i)
	{
		Type c = inM.mCol[i].mValue;
		Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
		t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
		t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
		result.mCol[i].mValue = t;
	}
#else
	for (int i = 0; i < 3; ++i)
		result.mCol[i] = mCol[0] * inM.mCol[i].mF32[0] + mCol[1] * inM.mCol[i].mF32[1] + mCol[2] * inM.mCol[i].mF32[2];
#endif
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
}
/// Computes this^T * inM on the 3x3 parts; the last column of the result is (0, 0, 0, 1)
Mat44 Mat44::Multiply3x3LeftTransposed(Mat44Arg inM) const
{
	// Transpose the left hand side, then do a regular 3x3 multiply column by column
	Mat44 transposed = Transposed3x3();
	Mat44 result;
	for (int c = 0; c < 3; ++c)
		result.mCol[c] = transposed.mCol[0] * inM.mCol[c].SplatX() + transposed.mCol[1] * inM.mCol[c].SplatY() + transposed.mCol[2] * inM.mCol[c].SplatZ();
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
}
/// Computes this * inM^T on the 3x3 parts; the last column of the result is (0, 0, 0, 1).
/// Requires that the w components of our first three columns are zero (asserted).
Mat44 Mat44::Multiply3x3RightTransposed(Mat44Arg inM) const
{
	JPH_ASSERT(mCol[0][3] == 0.0f);
	JPH_ASSERT(mCol[1][3] == 0.0f);
	JPH_ASSERT(mCol[2][3] == 0.0f);
	// Column c of the result combines our columns weighted by row c of inM (= column c of inM^T)
	Mat44 result;
	for (int c = 0; c < 3; ++c)
		result.mCol[c] = mCol[0] * inM.mCol[0].mF32[c] + mCol[1] * inM.mCol[1].mF32[c] + mCol[2] * inM.mCol[2].mF32[c];
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
}
/// Multiply every element of the matrix by inV
Mat44 Mat44::operator * (float inV) const
{
	Mat44 result;
	result.mCol[0] = mCol[0] * inV;
	result.mCol[1] = mCol[1] * inV;
	result.mCol[2] = mCol[2] * inV;
	result.mCol[3] = mCol[3] * inV;
	return result;
}
/// Multiply every element of the matrix by inV in place
Mat44 &Mat44::operator *= (float inV)
{
	mCol[0] *= inV;
	mCol[1] *= inV;
	mCol[2] *= inV;
	mCol[3] *= inV;
	return *this;
}
/// Per element addition of two matrices
Mat44 Mat44::operator + (Mat44Arg inM) const
{
	return Mat44(mCol[0] + inM.mCol[0], mCol[1] + inM.mCol[1], mCol[2] + inM.mCol[2], mCol[3] + inM.mCol[3]);
}
/// Per element negation
Mat44 Mat44::operator - () const
{
	return Mat44(-mCol[0], -mCol[1], -mCol[2], -mCol[3]);
}
/// Per element subtraction of two matrices
Mat44 Mat44::operator - (Mat44Arg inM) const
{
	return Mat44(mCol[0] - inM.mCol[0], mCol[1] - inM.mCol[1], mCol[2] - inM.mCol[2], mCol[3] - inM.mCol[3]);
}
/// Per element addition of inM to this matrix in place
Mat44 &Mat44::operator += (Mat44Arg inM)
{
	mCol[0] += inM.mCol[0];
	mCol[1] += inM.mCol[1];
	mCol[2] += inM.mCol[2];
	mCol[3] += inM.mCol[3];
	return *this;
}
/// Store the four columns consecutively to memory
void Mat44::StoreFloat4x4(Float4 *outV) const
{
	mCol[0].StoreFloat4(outV);
	mCol[1].StoreFloat4(outV + 1);
	mCol[2].StoreFloat4(outV + 2);
	mCol[3].StoreFloat4(outV + 3);
}
/// Full 4x4 transpose
Mat44 Mat44::Transposed() const
{
#if defined(JPH_USE_SSE)
	// Standard SSE 4x4 transpose: interleave column pairs, then interleave the intermediates
	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 tmp3 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 tmp2 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 tmp4 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	Mat44 result;
	result.mCol[0].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(2, 0, 2, 0));
	result.mCol[1].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3, 1, 3, 1));
	result.mCol[2].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(2, 0, 2, 0));
	result.mCol[3].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(3, 1, 3, 1));
	return result;
#elif defined(JPH_USE_NEON)
	// Two rounds of vector zips perform the 4x4 transpose
	float32x4x2_t tmp1 = vzipq_f32(mCol[0].mValue, mCol[2].mValue);
	float32x4x2_t tmp2 = vzipq_f32(mCol[1].mValue, mCol[3].mValue);
	float32x4x2_t tmp3 = vzipq_f32(tmp1.val[0], tmp2.val[0]);
	float32x4x2_t tmp4 = vzipq_f32(tmp1.val[1], tmp2.val[1]);
	Mat44 result;
	result.mCol[0].mValue = tmp3.val[0];
	result.mCol[1].mValue = tmp3.val[1];
	result.mCol[2].mValue = tmp4.val[0];
	result.mCol[3].mValue = tmp4.val[1];
	return result;
#else
	Mat44 result;
	for (int c = 0; c < 4; ++c)
		for (int r = 0; r < 4; ++r)
			result.mCol[r].mF32[c] = mCol[c].mF32[r];
	return result;
#endif
}
/// Transpose the 3x3 part only; the w components of the first three columns are zeroed
/// and the last column of the result is (0, 0, 0, 1)
Mat44 Mat44::Transposed3x3() const
{
#if defined(JPH_USE_SSE)
	// Same shuffle pattern as Transposed(), but zero takes the place of the 4th column
	__m128 zero = _mm_setzero_ps();
	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 tmp3 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 tmp2 = _mm_shuffle_ps(mCol[2].mValue, zero, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 tmp4 = _mm_shuffle_ps(mCol[2].mValue, zero, _MM_SHUFFLE(3, 2, 3, 2));
	Mat44 result;
	result.mCol[0].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(2, 0, 2, 0));
	result.mCol[1].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3, 1, 3, 1));
	result.mCol[2].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(JPH_USE_NEON)
	float32x4x2_t tmp1 = vzipq_f32(mCol[0].mValue, mCol[2].mValue);
	float32x4x2_t tmp2 = vzipq_f32(mCol[1].mValue, vdupq_n_f32(0));
	float32x4x2_t tmp3 = vzipq_f32(tmp1.val[0], tmp2.val[0]);
	float32x4x2_t tmp4 = vzipq_f32(tmp1.val[1], tmp2.val[1]);
	Mat44 result;
	result.mCol[0].mValue = tmp3.val[0];
	result.mCol[1].mValue = tmp3.val[1];
	result.mCol[2].mValue = tmp4.val[0];
#else
	Mat44 result;
	for (int c = 0; c < 3; ++c)
	{
		for (int r = 0; r < 3; ++r)
			result.mCol[c].mF32[r] = mCol[r].mF32[c];
		result.mCol[c].mF32[3] = 0;
	}
#endif
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
}
/// General 4x4 inverse via cofactor expansion. No singularity check: a singular matrix
/// produces inf/NaN elements (division by a zero determinant).
Mat44 Mat44::Inversed() const
{
#if defined(JPH_USE_SSE)
	// Algorithm from: http://download.intel.com/design/PentiumIII/sml/24504301.pdf
	// Streaming SIMD Extensions - Inverse of 4x4 Matrix
	// Adapted to load data using _mm_shuffle_ps instead of loading from memory
	// Replaced _mm_rcp_ps with _mm_div_ps for better accuracy
	// Extract the four rows from the stored columns (in-register transpose)
	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 row1 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
	row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
	tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 row3 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
	row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
	// Accumulate the cofactor (minor) vectors from products of row pairs
	tmp1 = _mm_mul_ps(row2, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 minor0 = _mm_mul_ps(row1, tmp1);
	__m128 minor1 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(row1, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	__m128 minor3 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
	minor3 = _mm_shuffle_ps(minor3, minor3, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	__m128 minor2 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(row0, row1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
	tmp1 = _mm_mul_ps(row0, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_mul_ps(row0, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
	// Determinant = row0 . minor0 (horizontal add), then scale all minors by 1 / det
	__m128 det = _mm_mul_ps(row0, minor0);
	det = _mm_add_ps(_mm_shuffle_ps(det, det, _MM_SHUFFLE(2, 3, 0, 1)), det); // Original code did (x + z) + (y + w), changed to (x + y) + (z + w) to match the ARM code below and make the result cross platform deterministic
	det = _mm_add_ss(_mm_shuffle_ps(det, det, _MM_SHUFFLE(1, 0, 3, 2)), det);
	det = _mm_div_ss(_mm_set_ss(1.0f), det);
	det = _mm_shuffle_ps(det, det, _MM_SHUFFLE(0, 0, 0, 0));
	Mat44 result;
	result.mCol[0].mValue = _mm_mul_ps(det, minor0);
	result.mCol[1].mValue = _mm_mul_ps(det, minor1);
	result.mCol[2].mValue = _mm_mul_ps(det, minor2);
	result.mCol[3].mValue = _mm_mul_ps(det, minor3);
	return result;
#elif defined(JPH_USE_NEON)
	// Adapted from the SSE version, there's surprising few articles about efficient ways of calculating an inverse for ARM on the internet
	Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
	Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, mCol[3].mValue, 0, 1, 4, 5);
	Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
	row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
	Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, mCol[3].mValue, 2, 3, 6, 7);
	Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
	row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
	tmp1 = vmulq_f32(row2, row3);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	Type minor0 = vmulq_f32(row1, tmp1);
	Type minor1 = vmulq_f32(row0, tmp1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
	minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
	minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
	tmp1 = vmulq_f32(row1, row2);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
	Type minor3 = vmulq_f32(row0, tmp1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
	minor3 = vsubq_f32(vmulq_f32(row0, tmp1), minor3);
	minor3 = JPH_NEON_SHUFFLE_F32x4(minor3, minor3, 2, 3, 0, 1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
	tmp1 = vmulq_f32(tmp1, row3);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
	minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
	Type minor2 = vmulq_f32(row0, tmp1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
	minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
	minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
	tmp1 = vmulq_f32(row0, row1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
	minor3 = vsubq_f32(vmulq_f32(row2, tmp1), minor3);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
	minor3 = vsubq_f32(minor3, vmulq_f32(row2, tmp1));
	tmp1 = vmulq_f32(row0, row3);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
	minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
	minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
	tmp1 = vmulq_f32(row0, row2);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
	minor3 = vsubq_f32(minor3, vmulq_f32(row1, tmp1));
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
	minor3 = vaddq_f32(vmulq_f32(row1, tmp1), minor3);
	// Determinant via horizontal add, then scale all minors by the reciprocal
	Type det = vmulq_f32(row0, minor0);
	det = vdupq_n_f32(vaddvq_f32(det));
	det = vdivq_f32(vdupq_n_f32(1.0f), det);
	Mat44 result;
	result.mCol[0].mValue = vmulq_f32(det, minor0);
	result.mCol[1].mValue = vmulq_f32(det, minor1);
	result.mCol[2].mValue = vmulq_f32(det, minor2);
	result.mCol[3].mValue = vmulq_f32(det, minor3);
	return result;
#else
	// Scalar fallback: classic adjugate / determinant using precomputed 2x2 sub-determinants
	float m00 = JPH_EL(0, 0), m10 = JPH_EL(1, 0), m20 = JPH_EL(2, 0), m30 = JPH_EL(3, 0);
	float m01 = JPH_EL(0, 1), m11 = JPH_EL(1, 1), m21 = JPH_EL(2, 1), m31 = JPH_EL(3, 1);
	float m02 = JPH_EL(0, 2), m12 = JPH_EL(1, 2), m22 = JPH_EL(2, 2), m32 = JPH_EL(3, 2);
	float m03 = JPH_EL(0, 3), m13 = JPH_EL(1, 3), m23 = JPH_EL(2, 3), m33 = JPH_EL(3, 3);
	float m10211120 = m10 * m21 - m11 * m20;
	float m10221220 = m10 * m22 - m12 * m20;
	float m10231320 = m10 * m23 - m13 * m20;
	float m10311130 = m10 * m31 - m11 * m30;
	float m10321230 = m10 * m32 - m12 * m30;
	float m10331330 = m10 * m33 - m13 * m30;
	float m11221221 = m11 * m22 - m12 * m21;
	float m11231321 = m11 * m23 - m13 * m21;
	float m11321231 = m11 * m32 - m12 * m31;
	float m11331331 = m11 * m33 - m13 * m31;
	float m12231322 = m12 * m23 - m13 * m22;
	float m12331332 = m12 * m33 - m13 * m32;
	float m20312130 = m20 * m31 - m21 * m30;
	float m20322230 = m20 * m32 - m22 * m30;
	float m20332330 = m20 * m33 - m23 * m30;
	float m21322231 = m21 * m32 - m22 * m31;
	float m21332331 = m21 * m33 - m23 * m31;
	float m22332332 = m22 * m33 - m23 * m32;
	Vec4 col0(m11 * m22332332 - m12 * m21332331 + m13 * m21322231, -m10 * m22332332 + m12 * m20332330 - m13 * m20322230, m10 * m21332331 - m11 * m20332330 + m13 * m20312130, -m10 * m21322231 + m11 * m20322230 - m12 * m20312130);
	Vec4 col1(-m01 * m22332332 + m02 * m21332331 - m03 * m21322231, m00 * m22332332 - m02 * m20332330 + m03 * m20322230, -m00 * m21332331 + m01 * m20332330 - m03 * m20312130, m00 * m21322231 - m01 * m20322230 + m02 * m20312130);
	Vec4 col2(m01 * m12331332 - m02 * m11331331 + m03 * m11321231, -m00 * m12331332 + m02 * m10331330 - m03 * m10321230, m00 * m11331331 - m01 * m10331330 + m03 * m10311130, -m00 * m11321231 + m01 * m10321230 - m02 * m10311130);
	Vec4 col3(-m01 * m12231322 + m02 * m11231321 - m03 * m11221221, m00 * m12231322 - m02 * m10231320 + m03 * m10221220, -m00 * m11231321 + m01 * m10231320 - m03 * m10211120, m00 * m11221221 - m01 * m10221220 + m02 * m10211120);
	float det = m00 * col0.mF32[0] + m01 * col0.mF32[1] + m02 * col0.mF32[2] + m03 * col0.mF32[3];
	return Mat44(col0 / det, col1 / det, col2 / det, col3 / det);
#endif
}
/// Fast inverse for a matrix that only contains rotation and translation:
/// (R T)^-1 has rotation R^T and translation -R^T * t
Mat44 Mat44::InversedRotationTranslation() const
{
	Mat44 result = Transposed3x3();
	result.SetTranslation(-result.Multiply3x3(GetTranslation()));
	return result;
}
float Mat44::GetDeterminant3x3() const
{
return GetAxisX().Dot(GetAxisY().Cross(GetAxisZ()));
}
/// Adjoint (transpose of the cofactor matrix) of the 3x3 part, built from cyclic 2x2
/// sub-determinants. JPH_EL(r, c) accesses row r of column c. Last column is (0, 0, 0, 1).
Mat44 Mat44::Adjointed3x3() const
{
	return Mat44(
		Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0)
			- Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0),
		Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0)
			- Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0),
		Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0)
			- Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0),
		Vec4(0, 0, 0, 1));
}
/// Inverse of the 3x3 part: adjoint divided by the determinant. No singularity check —
/// a zero determinant yields inf/NaN elements (use SetInversed3x3 for a checked version).
Mat44 Mat44::Inversed3x3() const
{
	float det = GetDeterminant3x3();
	// Same cyclic sub-determinant construction as Adjointed3x3, scaled by 1 / det
	return Mat44(
		(Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0)
			- Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0)) / det,
		(Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0)
			- Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0)) / det,
		(Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0)
			- Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0)) / det,
		Vec4(0, 0, 0, 1));
}
bool Mat44::SetInversed3x3(Mat44Arg inM)
{
	// Checked inverse of the 3x3 part: inverse = adjugate / determinant
	float det = inM.GetDeterminant3x3();

	// If the determinant is zero the matrix is singular and we return false
	if (det == 0.0f)
		return false;

	// Finish calculating the inverse: scale the first three adjugate columns
	*this = inM.Adjointed3x3();
	for (int c = 0; c < 3; ++c)
		mCol[c] /= det;
	return true;
}
Quat Mat44::GetQuaternion() const
{
	// Converts the 3x3 rotation part to a quaternion using the usual
	// largest-diagonal approach: pick the biggest of trace / m00 / m11 / m22 so
	// the sqrt argument (and hence 's') is as large as possible, keeping the
	// 0.5f / s division numerically stable.
	// Elements are accessed column-major: mCol[c].mF32[r] is element (r, c).
	// NOTE(review): assumes the 3x3 part is a pure rotation — confirm at call sites.
	float tr = mCol[0].mF32[0] + mCol[1].mF32[1] + mCol[2].mF32[2];

	if (tr >= 0.0f)
	{
		// Trace dominant: s = 2 * w
		float s = sqrt(tr + 1.0f);
		float is = 0.5f / s;
		return Quat(
			(mCol[1].mF32[2] - mCol[2].mF32[1]) * is,
			(mCol[2].mF32[0] - mCol[0].mF32[2]) * is,
			(mCol[0].mF32[1] - mCol[1].mF32[0]) * is,
			0.5f * s);
	}
	else
	{
		// Find the largest diagonal element
		int i = 0;
		if (mCol[1].mF32[1] > mCol[0].mF32[0]) i = 1;
		if (mCol[2].mF32[2] > mCol[i].mF32[i]) i = 2;

		if (i == 0)
		{
			// m00 dominant: s = 2 * x
			float s = sqrt(mCol[0].mF32[0] - (mCol[1].mF32[1] + mCol[2].mF32[2]) + 1);
			float is = 0.5f / s;
			return Quat(
				0.5f * s,
				(mCol[1].mF32[0] + mCol[0].mF32[1]) * is,
				(mCol[0].mF32[2] + mCol[2].mF32[0]) * is,
				(mCol[1].mF32[2] - mCol[2].mF32[1]) * is);
		}
		else if (i == 1)
		{
			// m11 dominant: s = 2 * y
			float s = sqrt(mCol[1].mF32[1] - (mCol[2].mF32[2] + mCol[0].mF32[0]) + 1);
			float is = 0.5f / s;
			return Quat(
				(mCol[1].mF32[0] + mCol[0].mF32[1]) * is,
				0.5f * s,
				(mCol[2].mF32[1] + mCol[1].mF32[2]) * is,
				(mCol[2].mF32[0] - mCol[0].mF32[2]) * is);
		}
		else
		{
			// m22 dominant: s = 2 * z
			JPH_ASSERT(i == 2);

			float s = sqrt(mCol[2].mF32[2] - (mCol[0].mF32[0] + mCol[1].mF32[1]) + 1);
			float is = 0.5f / s;
			return Quat(
				(mCol[0].mF32[2] + mCol[2].mF32[0]) * is,
				(mCol[2].mF32[1] + mCol[1].mF32[2]) * is,
				0.5f * s,
				(mCol[0].mF32[1] - mCol[1].mF32[0]) * is);
		}
	}
}
Mat44 Mat44::sQuatLeftMultiply(QuatArg inQ)
{
	// Builds the 4x4 matrix M such that M * v equals the quaternion product
	// inQ * Quat(v), for quaternions stored as [x, y, z, w] in a Vec4.
	// Each column is a sign-flipped swizzle of inQ's components.
	return Mat44(
		Vec4(1, 1, -1, -1) * inQ.mValue.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(),
		Vec4(-1, 1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>(),
		Vec4(1, -1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>(),
		inQ.mValue);
}
Mat44 Mat44::sQuatRightMultiply(QuatArg inQ)
{
	// Builds the 4x4 matrix M such that M * v equals the quaternion product
	// Quat(v) * inQ (inQ on the right); same swizzles as sQuatLeftMultiply but
	// with different sign patterns.
	return Mat44(
		Vec4(1, -1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(),
		Vec4(1, 1, -1, -1) * inQ.mValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>(),
		Vec4(-1, 1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>(),
		inQ.mValue);
}
Mat44 Mat44::GetRotation() const
{
	// Strip the translation: keep the three basis columns and reset column 3.
	// The bottom row of the first three columns must already be zero.
	JPH_ASSERT(mCol[0][3] == 0.0f);
	JPH_ASSERT(mCol[1][3] == 0.0f);
	JPH_ASSERT(mCol[2][3] == 0.0f);

	Mat44 rotation(mCol[0], mCol[1], mCol[2], Vec4(0, 0, 0, 1));
	return rotation;
}
Mat44 Mat44::GetRotationSafe() const
{
	// Like GetRotation but also forces the w lane of the first three columns to
	// zero instead of asserting that it already is.
#if defined(JPH_USE_AVX512)
	// Mask-move with 0b0111 copies lanes 0-2 and zeroes lane 3
	return Mat44(_mm_maskz_mov_ps(0b0111, mCol[0].mValue),
				 _mm_maskz_mov_ps(0b0111, mCol[1].mValue),
				 _mm_maskz_mov_ps(0b0111, mCol[2].mValue),
				 Vec4(0, 0, 0, 1));
#elif defined(JPH_USE_SSE4_1)
	// Blend mask 8 (bit 3) takes lane 3 from the zero vector
	__m128 zero = _mm_setzero_ps();
	return Mat44(_mm_blend_ps(mCol[0].mValue, zero, 8),
				 _mm_blend_ps(mCol[1].mValue, zero, 8),
				 _mm_blend_ps(mCol[2].mValue, zero, 8),
				 Vec4(0, 0, 0, 1));
#elif defined(JPH_USE_NEON)
	// Overwrite lane 3 with 0
	return Mat44(vsetq_lane_f32(0, mCol[0].mValue, 3),
				 vsetq_lane_f32(0, mCol[1].mValue, 3),
				 vsetq_lane_f32(0, mCol[2].mValue, 3),
				 Vec4(0, 0, 0, 1));
#else
	// Scalar fallback: rebuild the columns with w = 0
	return Mat44(Vec4(mCol[0].mF32[0], mCol[0].mF32[1], mCol[0].mF32[2], 0),
				 Vec4(mCol[1].mF32[0], mCol[1].mF32[1], mCol[1].mF32[2], 0),
				 Vec4(mCol[2].mF32[0], mCol[2].mF32[1], mCol[2].mF32[2], 0),
				 Vec4(0, 0, 0, 1));
#endif
}
void Mat44::SetRotation(Mat44Arg inRotation)
{
	// Copy only the three rotation columns; the translation column is left untouched
	for (int c = 0; c < 3; ++c)
		mCol[c] = inRotation.mCol[c];
}
Mat44 Mat44::PreTranslated(Vec3Arg inTranslation) const
{
	// this * translation-matrix: only the translation column changes,
	// the offset is rotated/scaled by the 3x3 part first
	Vec4 new_translation(GetTranslation() + Multiply3x3(inTranslation), 1);
	return Mat44(mCol[0], mCol[1], mCol[2], new_translation);
}
Mat44 Mat44::PostTranslated(Vec3Arg inTranslation) const
{
	// translation-matrix * this: the offset is added to the translation directly
	Vec4 new_translation(GetTranslation() + inTranslation, 1);
	return Mat44(mCol[0], mCol[1], mCol[2], new_translation);
}
Mat44 Mat44::PreScaled(Vec3Arg inScale) const
{
	// this * scale-matrix: each basis column is scaled by the matching component,
	// the translation column is unchanged
	return Mat44(mCol[0] * inScale.GetX(), mCol[1] * inScale.GetY(), mCol[2] * inScale.GetZ(), mCol[3]);
}
Mat44 Mat44::PostScaled(Vec3Arg inScale) const
{
	// scale-matrix * this: scales the rows, so every column (including the
	// translation) is multiplied componentwise; w stays intact via the 1
	Vec4 scale4(inScale, 1);
	return Mat44(mCol[0] * scale4, mCol[1] * scale4, mCol[2] * scale4, mCol[3] * scale4);
}
Mat44 Mat44::Decompose(Vec3 &outScale) const
{
	// Factors this matrix into an orthonormal rotation/translation matrix (returned)
	// and a per-axis scale (written to outScale). outScale.z is negated when the
	// basis is left handed so the returned matrix stays right handed.

	// Start the modified Gram-Schmidt algorithm
	// X axis will just be normalized
	Vec3 x = GetAxisX();

	// Make Y axis perpendicular to X
	Vec3 y = GetAxisY();
	float x_dot_x = x.LengthSq();
	y -= (x.Dot(y) / x_dot_x) * x;

	// Make Z axis perpendicular to X
	Vec3 z = GetAxisZ();
	z -= (x.Dot(z) / x_dot_x) * x;

	// Make Z axis perpendicular to Y
	float y_dot_y = y.LengthSq();
	z -= (y.Dot(z) / y_dot_y) * y;

	// Determine the scale (lengths of the orthogonalized axes)
	float z_dot_z = z.LengthSq();
	outScale = Vec3(x_dot_x, y_dot_y, z_dot_z).Sqrt();

	// If the resulting x, y and z vectors don't form a right handed matrix, flip the z axis.
	if (x.Cross(y).Dot(z) < 0.0f)
		outScale.SetZ(-outScale.GetZ());

	// Determine the rotation and translation (axes normalized by the signed scale)
	return Mat44(Vec4(x / outScale.GetX(), 0), Vec4(y / outScale.GetY(), 0), Vec4(z / outScale.GetZ(), 0), GetColumn4(3));
}
#undef JPH_EL
JPH_NAMESPACE_END

208
thirdparty/jolt_physics/Jolt/Math/Math.h vendored Normal file
View File

@@ -0,0 +1,208 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// The constant \f$\pi\f$
static constexpr float JPH_PI = 3.14159265358979323846f;

/// A large floating point value which, when squared, is still much smaller than FLT_MAX (1e30 << ~3.4e38)
static constexpr float cLargeFloat = 1.0e15f;
/// Convert a value from degrees to radians
JPH_INLINE constexpr float DegreesToRadians(float inV)
{
	constexpr float cDegToRad = JPH_PI / 180.0f;
	return inV * cDegToRad;
}
/// Convert a value from radians to degrees
JPH_INLINE constexpr float RadiansToDegrees(float inV)
{
	constexpr float cRadToDeg = 180.0f / JPH_PI;
	return inV * cRadToDeg;
}
/// Convert angle in radians to the range \f$[-\pi, \pi]\f$
inline float CenterAngleAroundZero(float inV)
{
	// Shift by whole turns until the angle is inside the target range;
	// the branches are mutually exclusive so at most one loop runs
	if (inV < -JPH_PI)
	{
		while (inV < -JPH_PI)
			inV += 2.0f * JPH_PI;
	}
	else if (inV > JPH_PI)
	{
		while (inV > JPH_PI)
			inV -= 2.0f * JPH_PI;
	}
	JPH_ASSERT(inV >= -JPH_PI && inV <= JPH_PI);
	return inV;
}
/// Clamp a value between two values
template <typename T>
JPH_INLINE constexpr T Clamp(T inV, T inMin, T inMax)
{
	// Apply the lower bound first, then the upper bound
	T lower_bounded = max(inV, inMin);
	return min(lower_bounded, inMax);
}
/// Square a value
template <typename T>
JPH_INLINE constexpr T Square(T inV)
{
	T product = inV * inV;
	return product;
}
/// Returns \f$inV^3\f$.
template <typename T>
JPH_INLINE constexpr T Cubed(T inV)
{
	// (inV * inV) * inV, same evaluation order as writing the product out
	T squared = inV * inV;
	return squared * inV;
}
/// Get the sign of a value. Note that zero (and anything not less than zero) maps to +1.
template <typename T>
JPH_INLINE constexpr T Sign(T inV)
{
	if (inV < 0)
		return T(-1);
	return T(1);
}
/// Check if inV is a power of 2 (zero and negative values are not)
template <typename T>
constexpr bool IsPowerOf2(T inV)
{
	// A power of two has exactly one bit set; inV & (inV - 1) clears the lowest set bit
	if (inV <= 0)
		return false;
	return (inV & (inV - 1)) == 0;
}
/// Align inV up to the next inAlignment bytes
template <typename T>
inline T AlignUp(T inV, uint64 inAlignment)
{
JPH_ASSERT(IsPowerOf2(inAlignment));
return T((uint64(inV) + inAlignment - 1) & ~(inAlignment - 1));
}
/// Check if inV is inAlignment aligned
template <typename T>
inline bool IsAligned(T inV, uint64 inAlignment)
{
JPH_ASSERT(IsPowerOf2(inAlignment));
return (uint64(inV) & (inAlignment - 1)) == 0;
}
/// Compute number of trailing zero bits (how many low bits are zero); returns 32 for an input of 0
inline uint CountTrailingZeros(uint32 inValue)
{
#if defined(JPH_CPU_X86) || defined(JPH_CPU_WASM)
	#if defined(JPH_USE_TZCNT)
	// TZCNT is defined to return 32 for an input of 0, no guard needed
	return _tzcnt_u32(inValue);
	#elif defined(JPH_COMPILER_MSVC)
	// _BitScanForward leaves the result undefined for 0, so guard explicitly
	if (inValue == 0)
		return 32;
	unsigned long result;
	_BitScanForward(&result, inValue);
	return result;
	#else
	// __builtin_ctz is undefined for 0, so guard explicitly
	if (inValue == 0)
		return 32;
	return __builtin_ctz(inValue);
	#endif
#elif defined(JPH_CPU_ARM)
	#if defined(JPH_COMPILER_MSVC)
	// Same MSVC intrinsic as on x86, with the same zero guard
	if (inValue == 0)
		return 32;
	unsigned long result;
	_BitScanForward(&result, inValue);
	return result;
	#else
	if (inValue == 0)
		return 32;
	return __builtin_ctz(inValue);
	#endif
#elif defined(JPH_CPU_E2K) || defined(JPH_CPU_RISCV) || defined(JPH_CPU_PPC) || defined(JPH_CPU_LOONGARCH)
	return inValue ? __builtin_ctz(inValue) : 32;
#else
#error Undefined
#endif
}
/// Compute the number of leading zero bits (how many high bits are zero); returns 32 for an input of 0
inline uint CountLeadingZeros(uint32 inValue)
{
#if defined(JPH_CPU_X86) || defined(JPH_CPU_WASM)
	#if defined(JPH_USE_LZCNT)
	// LZCNT is defined to return 32 for an input of 0, no guard needed
	return _lzcnt_u32(inValue);
	#elif defined(JPH_COMPILER_MSVC)
	// _BitScanReverse finds the highest set bit index, so convert to a zero count;
	// result is undefined for 0, hence the guard
	if (inValue == 0)
		return 32;
	unsigned long result;
	_BitScanReverse(&result, inValue);
	return 31 - result;
	#else
	// __builtin_clz is undefined for 0, so guard explicitly
	if (inValue == 0)
		return 32;
	return __builtin_clz(inValue);
	#endif
#elif defined(JPH_CPU_ARM)
	#if defined(JPH_COMPILER_MSVC)
	// ARM CLZ handles 0 (returns 32), so no guard is needed here
	return _CountLeadingZeros(inValue);
	#else
	return __builtin_clz(inValue);
	#endif
#elif defined(JPH_CPU_E2K) || defined(JPH_CPU_RISCV) || defined(JPH_CPU_PPC) || defined(JPH_CPU_LOONGARCH)
	return inValue ? __builtin_clz(inValue) : 32;
#else
#error Undefined
#endif
}
/// Count the number of 1 bits in a value (population count)
inline uint CountBits(uint32 inValue)
{
#if defined(JPH_COMPILER_CLANG) || defined(JPH_COMPILER_GCC)
	return __builtin_popcount(inValue);
#elif defined(JPH_COMPILER_MSVC)
	#if defined(JPH_USE_SSE4_2)
	return _mm_popcnt_u32(inValue);
	#elif defined(JPH_USE_NEON) && (_MSC_VER >= 1930) // _CountOneBits not available on MSVC2019
	return _CountOneBits(inValue);
	#else
	// SWAR fallback: sum bit pairs, then nibbles, then add all bytes via the
	// 0x01010101 multiply and read the total from the top byte
	inValue = inValue - ((inValue >> 1) & 0x55555555);
	inValue = (inValue & 0x33333333) + ((inValue >> 2) & 0x33333333);
	inValue = (inValue + (inValue >> 4)) & 0x0F0F0F0F;
	return (inValue * 0x01010101) >> 24;
	#endif
#else
#error Undefined
#endif
}
/// Get the next higher power of 2 of a value, or the value itself if the value is already a power of 2
inline uint32 GetNextPowerOf2(uint32 inValue)
{
	// 0 and 1 both map to 1; everything else rounds up via the highest set bit of (inValue - 1)
	if (inValue <= 1)
		return uint32(1);
	return uint32(1) << (32 - CountLeadingZeros(inValue - 1));
}
// Simple implementation of C++20 std::bit_cast (unfortunately not constexpr)
template <class To, class From>
JPH_INLINE To BitCast(const From &inValue)
{
	// Both types must have the same size and To must be default constructible
	static_assert(std::is_trivially_constructible_v<To>);
	static_assert(sizeof(From) == sizeof(To));

	// Reinterpret the bits through a union.
	// NOTE(review): union type punning is technically UB in ISO C++ (well-defined in C
	// and supported by the major compilers); std::memcpy or std::bit_cast would be the
	// strictly conforming alternative — confirm whether that trade-off is intended.
	union FromTo
	{
		To mTo;
		From mFrom;
	};

	FromTo convert;
	convert.mFrom = inValue;
	return convert.mTo;
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,32 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
// Forward declarations of all math types
class Vec3;
class DVec3;
class Vec4;
class UVec4;
class BVec16;
class Quat;
class Mat44;
class DMat44;

// Types to use for passing arguments to functions.
// The small vector/quaternion types are passed by value, the matrix types by
// const reference; DVec3 passes by value only when AVX is available
// (presumably so it fits in a single register — confirm against Core config).
using Vec3Arg = const Vec3;
#ifdef JPH_USE_AVX
	using DVec3Arg = const DVec3;
#else
	using DVec3Arg = const DVec3 &;
#endif
using Vec4Arg = const Vec4;
using UVec4Arg = const UVec4;
using BVec16Arg = const BVec16;
using QuatArg = const Quat;
using Mat44Arg = const Mat44 &;
using DMat44Arg = const DMat44 &;
JPH_NAMESPACE_END

View File

@@ -0,0 +1,259 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/Vector.h>
#include <Jolt/Math/GaussianElimination.h>
JPH_NAMESPACE_BEGIN
/// Templatized matrix class (column major, Rows x Cols, stored as Cols columns of Vector<Rows>)
template <uint Rows, uint Cols>
class [[nodiscard]] Matrix
{
public:
	/// Constructor
	inline Matrix() = default; ///< Elements are not initialized by the default constructor
	inline Matrix(const Matrix &inM2) { *this = inM2; }

	/// Dimensions
	inline uint GetRows() const { return Rows; }
	inline uint GetCols() const { return Cols; }

	/// Zero matrix
	inline void SetZero()
	{
		for (uint c = 0; c < Cols; ++c)
			mCol[c].SetZero();
	}

	inline static Matrix sZero() { Matrix m; m.SetZero(); return m; }

	/// Check if this matrix consists of all zeros
	inline bool IsZero() const
	{
		for (uint c = 0; c < Cols; ++c)
			if (!mCol[c].IsZero())
				return false;

		return true;
	}

	/// Identity matrix (also valid for non-square matrices: ones on the main diagonal)
	inline void SetIdentity()
	{
		// Clear matrix
		SetZero();

		// Set diagonal to 1
		for (uint rc = 0, min_rc = min(Rows, Cols); rc < min_rc; ++rc)
			mCol[rc].mF32[rc] = 1.0f;
	}

	inline static Matrix sIdentity() { Matrix m; m.SetIdentity(); return m; }

	/// Check if this matrix is identity
	bool IsIdentity() const { return *this == sIdentity(); }

	/// Diagonal matrix: inV supplies the diagonal, everything else is zeroed
	inline void SetDiagonal(const Vector<Rows < Cols? Rows : Cols> &inV)
	{
		// Clear matrix
		SetZero();

		// Set diagonal
		for (uint rc = 0, min_rc = min(Rows, Cols); rc < min_rc; ++rc)
			mCol[rc].mF32[rc] = inV[rc];
	}

	inline static Matrix sDiagonal(const Vector<Rows < Cols? Rows : Cols> &inV)
	{
		Matrix m;
		m.SetDiagonal(inV);
		return m;
	}

	/// Copy a (part) of another matrix into this matrix: an inNumRows x inNumCols block
	/// starting at (inSourceRow, inSourceCol) in inM is written to (inDestRow, inDestCol)
	template <class OtherMatrix>
	void CopyPart(const OtherMatrix &inM, uint inSourceRow, uint inSourceCol, uint inNumRows, uint inNumCols, uint inDestRow, uint inDestCol)
	{
		for (uint c = 0; c < inNumCols; ++c)
			for (uint r = 0; r < inNumRows; ++r)
				mCol[inDestCol + c].mF32[inDestRow + r] = inM(inSourceRow + r, inSourceCol + c);
	}

	/// Get float component by element index (row, column); storage is column major
	inline float operator () (uint inRow, uint inColumn) const
	{
		JPH_ASSERT(inRow < Rows);
		JPH_ASSERT(inColumn < Cols);
		return mCol[inColumn].mF32[inRow];
	}

	inline float & operator () (uint inRow, uint inColumn)
	{
		JPH_ASSERT(inRow < Rows);
		JPH_ASSERT(inColumn < Cols);
		return mCol[inColumn].mF32[inRow];
	}

	/// Comparison
	inline bool operator == (const Matrix &inM2) const
	{
		for (uint c = 0; c < Cols; ++c)
			if (mCol[c] != inM2.mCol[c])
				return false;

		return true;
	}

	inline bool operator != (const Matrix &inM2) const
	{
		for (uint c = 0; c < Cols; ++c)
			if (mCol[c] != inM2.mCol[c])
				return true;

		return false;
	}

	/// Assignment
	inline Matrix & operator = (const Matrix &inM2)
	{
		for (uint c = 0; c < Cols; ++c)
			mCol[c] = inM2.mCol[c];

		return *this;
	}

	/// Multiply matrix by matrix
	template <uint OtherCols>
	inline Matrix<Rows, OtherCols> operator * (const Matrix<Cols, OtherCols> &inM) const
	{
		Matrix<Rows, OtherCols> m;
		for (uint c = 0; c < OtherCols; ++c)
			for (uint r = 0; r < Rows; ++r)
			{
				float dot = 0.0f;
				for (uint i = 0; i < Cols; ++i)
					dot += mCol[i].mF32[r] * inM.mCol[c].mF32[i];
				m.mCol[c].mF32[r] = dot;
			}
		return m;
	}

	/// Multiply vector by matrix
	inline Vector<Rows> operator * (const Vector<Cols> &inV) const
	{
		Vector<Rows> v;
		for (uint r = 0; r < Rows; ++r)
		{
			float dot = 0.0f;
			for (uint c = 0; c < Cols; ++c)
				dot += mCol[c].mF32[r] * inV.mF32[c];
			v.mF32[r] = dot;
		}
		return v;
	}

	/// Multiply matrix with float
	inline Matrix operator * (float inV) const
	{
		Matrix m;
		for (uint c = 0; c < Cols; ++c)
			m.mCol[c] = mCol[c] * inV;
		return m;
	}

	inline friend Matrix operator * (float inV, const Matrix &inM)
	{
		return inM * inV;
	}

	/// Per element addition of matrix
	inline Matrix operator + (const Matrix &inM) const
	{
		Matrix m;
		for (uint c = 0; c < Cols; ++c)
			m.mCol[c] = mCol[c] + inM.mCol[c];
		return m;
	}

	/// Per element subtraction of matrix
	inline Matrix operator - (const Matrix &inM) const
	{
		Matrix m;
		for (uint c = 0; c < Cols; ++c)
			m.mCol[c] = mCol[c] - inM.mCol[c];
		return m;
	}

	/// Transpose matrix
	inline Matrix<Cols, Rows> Transposed() const
	{
		Matrix<Cols, Rows> m;
		for (uint r = 0; r < Rows; ++r)
			for (uint c = 0; c < Cols; ++c)
				m.mCol[r].mF32[c] = mCol[c].mF32[r];
		return m;
	}

	/// Inverse matrix through Gaussian elimination; only valid for square matrices
	/// (asserts otherwise). Returns false when inM is singular.
	bool SetInversed(const Matrix &inM)
	{
		if constexpr (Rows != Cols) JPH_ASSERT(false);
		Matrix copy(inM);
		SetIdentity();
		return GaussianElimination(copy, *this);
	}

	/// Returns the inverse of this matrix.
	/// NOTE(review): the failure result of SetInversed is discarded, so the return
	/// value is unspecified for singular input — use SetInversed directly when the
	/// input may be singular.
	inline Matrix Inversed() const
	{
		Matrix m;
		m.SetInversed(*this);
		return m;
	}

	/// To String
	friend ostream & operator << (ostream &inStream, const Matrix &inM)
	{
		for (uint i = 0; i < Cols - 1; ++i)
			inStream << inM.mCol[i] << ", ";
		inStream << inM.mCol[Cols - 1];
		return inStream;
	}

	/// Column access
	const Vector<Rows> & GetColumn(int inIdx) const { return mCol[inIdx]; }
	Vector<Rows> & GetColumn(int inIdx) { return mCol[inIdx]; }

	Vector<Rows> mCol[Cols]; ///< Column
};
// The template specialization doesn't sit well with Doxygen
#ifndef JPH_PLATFORM_DOXYGEN

/// Specialization of SetInversed for 2x2 matrix: uses the closed-form inverse
/// (adjugate / determinant) instead of Gaussian elimination
template <>
inline bool Matrix<2, 2>::SetInversed(const Matrix<2, 2> &inM)
{
	// Fetch elements (column major: mCol[col].mF32[row])
	float a = inM.mCol[0].mF32[0];
	float b = inM.mCol[1].mF32[0];
	float c = inM.mCol[0].mF32[1];
	float d = inM.mCol[1].mF32[1];

	// Calculate determinant
	float det = a * d - b * c;
	if (det == 0.0f)
		return false; // Singular, *this is left unmodified

	// Construct inverse: [d -b; -c a] / det
	mCol[0].mF32[0] = d / det;
	mCol[1].mF32[0] = -b / det;
	mCol[0].mF32[1] = -c / det;
	mCol[1].mF32[1] = a / det;
	return true;
}

#endif // !JPH_PLATFORM_DOXYGEN
JPH_NAMESPACE_END

255
thirdparty/jolt_physics/Jolt/Math/Quat.h vendored Normal file
View File

@@ -0,0 +1,255 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/Vec3.h>
#include <Jolt/Math/Vec4.h>
JPH_NAMESPACE_BEGIN
/// Quaternion class, quaternions are 4 dimensional vectors which can describe rotations in 3 dimensional
/// space if their length is 1.
///
/// They are written as:
///
/// \f$q = w + x \: i + y \: j + z \: k\f$
///
/// or in vector notation:
///
/// \f$q = [w, v] = [w, x, y, z]\f$
///
/// Where:
///
/// w = the real part
/// v = the imaginary part, (x, y, z)
///
/// Note that we store the quaternion in a Vec4 as [x, y, z, w] because that makes
/// it easy to extract the rotation axis of the quaternion:
///
/// q = [cos(angle / 2), sin(angle / 2) * rotation_axis]
class [[nodiscard]] alignas(JPH_VECTOR_ALIGNMENT) Quat
{
public:
	JPH_OVERRIDE_NEW_DELETE

	///@name Constructors
	///@{
	inline Quat() = default; ///< Intentionally not initialized for performance reasons
	Quat(const Quat &inRHS) = default;
	Quat & operator = (const Quat &inRHS) = default;
	inline Quat(float inX, float inY, float inZ, float inW) : mValue(inX, inY, inZ, inW) { }
	inline explicit Quat(Vec4Arg inV) : mValue(inV) { }
	///@}

	///@name Tests
	///@{

	/// Check if two quaternions are exactly equal
	inline bool operator == (QuatArg inRHS) const { return mValue == inRHS.mValue; }

	/// Check if two quaternions are different
	inline bool operator != (QuatArg inRHS) const { return mValue != inRHS.mValue; }

	/// If this quaternion is close to inRHS. Note that q and -q represent the same rotation, this is not checked here.
	inline bool IsClose(QuatArg inRHS, float inMaxDistSq = 1.0e-12f) const { return mValue.IsClose(inRHS.mValue, inMaxDistSq); }

	/// If the length of this quaternion is 1 +/- inTolerance
	inline bool IsNormalized(float inTolerance = 1.0e-5f) const { return mValue.IsNormalized(inTolerance); }

	/// If any component of this quaternion is a NaN (not a number)
	inline bool IsNaN() const { return mValue.IsNaN(); }

	///@}
	///@name Get components
	///@{

	/// Get X component (imaginary part i)
	JPH_INLINE float GetX() const { return mValue.GetX(); }

	/// Get Y component (imaginary part j)
	JPH_INLINE float GetY() const { return mValue.GetY(); }

	/// Get Z component (imaginary part k)
	JPH_INLINE float GetZ() const { return mValue.GetZ(); }

	/// Get W component (real part)
	JPH_INLINE float GetW() const { return mValue.GetW(); }

	/// Get the imaginary part of the quaternion
	JPH_INLINE Vec3 GetXYZ() const { return Vec3(mValue); }

	/// Get the quaternion as a Vec4
	JPH_INLINE Vec4 GetXYZW() const { return mValue; }

	/// Set individual components
	JPH_INLINE void SetX(float inX) { mValue.SetX(inX); }
	JPH_INLINE void SetY(float inY) { mValue.SetY(inY); }
	JPH_INLINE void SetZ(float inZ) { mValue.SetZ(inZ); }
	JPH_INLINE void SetW(float inW) { mValue.SetW(inW); }

	/// Set all components
	JPH_INLINE void Set(float inX, float inY, float inZ, float inW) { mValue.Set(inX, inY, inZ, inW); }

	///@}
	///@name Default quaternions
	///@{

	/// @return [0, 0, 0, 0]
	JPH_INLINE static Quat sZero() { return Quat(Vec4::sZero()); }

	/// @return [1, 0, 0, 0] (or in storage format Quat(0, 0, 0, 1))
	JPH_INLINE static Quat sIdentity() { return Quat(0, 0, 0, 1); }

	///@}

	/// Rotation from axis and angle
	JPH_INLINE static Quat sRotation(Vec3Arg inAxis, float inAngle);

	/// Get axis and angle that represents this quaternion, outAngle will always be in the range \f$[0, \pi]\f$
	JPH_INLINE void GetAxisAngle(Vec3 &outAxis, float &outAngle) const;

	/// Create quaternion that rotates a vector from the direction of inFrom to the direction of inTo along the shortest path
	/// @see https://www.euclideanspace.com/maths/algebra/vectors/angleBetween/index.htm
	JPH_INLINE static Quat sFromTo(Vec3Arg inFrom, Vec3Arg inTo);

	/// Random unit quaternion
	template <class Random>
	inline static Quat sRandom(Random &inRandom);

	/// Conversion from Euler angles. Rotation order is X then Y then Z (RotZ * RotY * RotX). Angles in radians.
	inline static Quat sEulerAngles(Vec3Arg inAngles);

	/// Conversion to Euler angles. Rotation order is X then Y then Z (RotZ * RotY * RotX). Angles in radians.
	inline Vec3 GetEulerAngles() const;

	///@name Length / normalization operations
	///@{

	/// Squared length of quaternion.
	/// @return Squared length of quaternion (\f$|v|^2\f$)
	JPH_INLINE float LengthSq() const { return mValue.LengthSq(); }

	/// Length of quaternion.
	/// @return Length of quaternion (\f$|v|\f$)
	JPH_INLINE float Length() const { return mValue.Length(); }

	/// Normalize the quaternion (make it length 1)
	JPH_INLINE Quat Normalized() const { return Quat(mValue.Normalized()); }

	///@}
	///@name Additions / multiplications
	///@{
	JPH_INLINE void operator += (QuatArg inRHS) { mValue += inRHS.mValue; }
	JPH_INLINE void operator -= (QuatArg inRHS) { mValue -= inRHS.mValue; }
	JPH_INLINE void operator *= (float inValue) { mValue *= inValue; }
	JPH_INLINE void operator /= (float inValue) { mValue /= inValue; }
	JPH_INLINE Quat operator - () const { return Quat(-mValue); }
	JPH_INLINE Quat operator + (QuatArg inRHS) const { return Quat(mValue + inRHS.mValue); }
	JPH_INLINE Quat operator - (QuatArg inRHS) const { return Quat(mValue - inRHS.mValue); }
	JPH_INLINE Quat operator * (QuatArg inRHS) const;
	JPH_INLINE Quat operator * (float inValue) const { return Quat(mValue * inValue); }
	inline friend Quat operator * (float inValue, QuatArg inRHS) { return Quat(inRHS.mValue * inValue); }
	JPH_INLINE Quat operator / (float inValue) const { return Quat(mValue / inValue); }
	///@}

	/// Rotate a vector by this quaternion
	JPH_INLINE Vec3 operator * (Vec3Arg inValue) const;

	/// Rotate a vector by the inverse of this quaternion
	JPH_INLINE Vec3 InverseRotate(Vec3Arg inValue) const;

	/// Rotate a the vector (1, 0, 0) with this quaternion
	JPH_INLINE Vec3 RotateAxisX() const;

	/// Rotate a the vector (0, 1, 0) with this quaternion
	JPH_INLINE Vec3 RotateAxisY() const;

	/// Rotate a the vector (0, 0, 1) with this quaternion
	JPH_INLINE Vec3 RotateAxisZ() const;

	/// Dot product
	JPH_INLINE float Dot(QuatArg inRHS) const { return mValue.Dot(inRHS.mValue); }

	/// The conjugate [w, -x, -y, -z] is the same as the inverse for unit quaternions
	JPH_INLINE Quat Conjugated() const { return Quat(Vec4::sXor(mValue, UVec4(0x80000000, 0x80000000, 0x80000000, 0).ReinterpretAsFloat())); }

	/// Get inverse quaternion.
	/// NOTE(review): this divides the conjugate by Length() once; the algebraic inverse
	/// of a non-unit quaternion is conjugate / |q|^2, so this matches the true inverse
	/// only for (near) unit quaternions — confirm intended usage.
	JPH_INLINE Quat Inversed() const { return Conjugated() / Length(); }

	/// Ensures that the W component is positive by negating the entire quaternion if it is not. This is useful when you want to store a quaternion as a 3 vector by discarding W and reconstructing it as sqrt(1 - x^2 - y^2 - z^2).
	JPH_INLINE Quat EnsureWPositive() const { return Quat(Vec4::sXor(mValue, Vec4::sAnd(mValue.SplatW(), UVec4::sReplicate(0x80000000).ReinterpretAsFloat()))); }

	/// Get a quaternion that is perpendicular to this quaternion
	JPH_INLINE Quat GetPerpendicular() const { return Quat(Vec4(1, -1, 1, -1) * mValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>()); }

	/// Get rotation angle around inAxis (uses Swing Twist Decomposition to get the twist quaternion and uses q(axis, angle) = [cos(angle / 2), axis * sin(angle / 2)])
	JPH_INLINE float GetRotationAngle(Vec3Arg inAxis) const { return GetW() == 0.0f? JPH_PI : 2.0f * ATan(GetXYZ().Dot(inAxis) / GetW()); }

	/// Swing Twist Decomposition: any quaternion can be split up as:
	///
	/// \f[q = q_{swing} \: q_{twist}\f]
	///
	/// where \f$q_{twist}\f$ rotates only around axis v.
	///
	/// \f$q_{twist}\f$ is:
	///
	/// \f[q_{twist} = \frac{[q_w, q_{ijk} \cdot v \: v]}{\left|[q_w, q_{ijk} \cdot v \: v]\right|}\f]
	///
	/// where q_w is the real part of the quaternion and q_i the imaginary part (a 3 vector).
	///
	/// The swing can then be calculated as:
	///
	/// \f[q_{swing} = q \: q_{twist}^* \f]
	///
	/// Where \f$q_{twist}^*\f$ = complex conjugate of \f$q_{twist}\f$
	JPH_INLINE Quat GetTwist(Vec3Arg inAxis) const;

	/// Decomposes quaternion into swing and twist component:
	///
	/// \f$q = q_{swing} \: q_{twist}\f$
	///
	/// where \f$q_{swing} \: \hat{x} = q_{twist} \: \hat{y} = q_{twist} \: \hat{z} = 0\f$
	///
	/// In other words:
	///
	/// - \f$q_{twist}\f$ only rotates around the X-axis.
	/// - \f$q_{swing}\f$ only rotates around the Y and Z-axis.
	///
	/// @see Gino van den Bergen - Rotational Joint Limits in Quaternion Space - GDC 2016
	JPH_INLINE void GetSwingTwist(Quat &outSwing, Quat &outTwist) const;

	/// Linear interpolation between two quaternions (for small steps).
	/// @param inFraction is in the range [0, 1]
	/// @param inDestination The destination quaternion
	/// @return (1 - inFraction) * this + fraction * inDestination
	JPH_INLINE Quat LERP(QuatArg inDestination, float inFraction) const;

	/// Spherical linear interpolation between two quaternions.
	/// @param inFraction is in the range [0, 1]
	/// @param inDestination The destination quaternion
	/// @return When fraction is zero this quaternion is returned, when fraction is 1 inDestination is returned.
	/// When fraction is between 0 and 1 an interpolation along the shortest path is returned.
	JPH_INLINE Quat SLERP(QuatArg inDestination, float inFraction) const;

	/// Load 3 floats from memory (X, Y and Z component and then calculates W) reads 32 bits extra which it doesn't use
	static JPH_INLINE Quat sLoadFloat3Unsafe(const Float3 &inV);

	/// Store 3 as floats to memory (X, Y and Z component)
	JPH_INLINE void StoreFloat3(Float3 *outV) const;

	/// To String
	friend ostream & operator << (ostream &inStream, QuatArg inQ) { inStream << inQ.mValue; return inStream; }

	/// 4 vector that stores [x, y, z, w] parts of the quaternion
	Vec4 mValue;
};
static_assert(std::is_trivial<Quat>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
#include "Quat.inl"

View File

@@ -0,0 +1,328 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
JPH_NAMESPACE_BEGIN
Quat Quat::operator * (QuatArg inRHS) const
{
	// Hamilton product of two quaternions stored as [x, y, z, w]
#if defined(JPH_USE_SSE4_1)
	// Taken from: http://momchil-velikov.blogspot.nl/2013/10/fast-sse-quternion-multiplication.html
	__m128 abcd = mValue.mValue;
	__m128 xyzw = inRHS.mValue.mValue;

	__m128 t0 = _mm_shuffle_ps(abcd, abcd, _MM_SHUFFLE(3, 3, 3, 3));
	__m128 t1 = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(2, 3, 0, 1));

	__m128 t3 = _mm_shuffle_ps(abcd, abcd, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 t4 = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(1, 0, 3, 2));

	__m128 t5 = _mm_shuffle_ps(abcd, abcd, _MM_SHUFFLE(1, 1, 1, 1));
	__m128 t6 = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(2, 0, 3, 1));

	// [d,d,d,d] * [z,w,x,y] = [dz,dw,dx,dy]
	__m128 m0 = _mm_mul_ps(t0, t1);

	// [a,a,a,a] * [y,x,w,z] = [ay,ax,aw,az]
	__m128 m1 = _mm_mul_ps(t3, t4);

	// [b,b,b,b] * [z,x,w,y] = [bz,bx,bw,by]
	__m128 m2 = _mm_mul_ps(t5, t6);

	// [c,c,c,c] * [w,z,x,y] = [cw,cz,cx,cy]
	__m128 t7 = _mm_shuffle_ps(abcd, abcd, _MM_SHUFFLE(2, 2, 2, 2));
	__m128 t8 = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 2, 0, 1));
	__m128 m3 = _mm_mul_ps(t7, t8);

	// addsub alternately subtracts/adds lanes, which supplies the sign pattern of the product
	// [dz,dw,dx,dy] + -[ay,ax,aw,az] = [dz+ay,dw-ax,dx+aw,dy-az]
	__m128 e = _mm_addsub_ps(m0, m1);

	// [dx+aw,dz+ay,dy-az,dw-ax]
	e = _mm_shuffle_ps(e, e, _MM_SHUFFLE(1, 3, 0, 2));

	// [dx+aw,dz+ay,dy-az,dw-ax] + -[bz,bx,bw,by] = [dx+aw+bz,dz+ay-bx,dy-az+bw,dw-ax-by]
	e = _mm_addsub_ps(e, m2);

	// [dz+ay-bx,dw-ax-by,dy-az+bw,dx+aw+bz]
	e = _mm_shuffle_ps(e, e, _MM_SHUFFLE(2, 0, 1, 3));

	// [dz+ay-bx,dw-ax-by,dy-az+bw,dx+aw+bz] + -[cw,cz,cx,cy] = [dz+ay-bx+cw,dw-ax-by-cz,dy-az+bw+cx,dx+aw+bz-cy]
	e = _mm_addsub_ps(e, m3);

	// [dw-ax-by-cz,dz+ay-bx+cw,dy-az+bw+cx,dx+aw+bz-cy]
	return Quat(Vec4(_mm_shuffle_ps(e, e, _MM_SHUFFLE(2, 3, 1, 0))));
#else
	// Scalar fallback: the product written out component by component
	float lx = mValue.GetX();
	float ly = mValue.GetY();
	float lz = mValue.GetZ();
	float lw = mValue.GetW();

	float rx = inRHS.mValue.GetX();
	float ry = inRHS.mValue.GetY();
	float rz = inRHS.mValue.GetZ();
	float rw = inRHS.mValue.GetW();

	float x = lw * rx + lx * rw + ly * rz - lz * ry;
	float y = lw * ry - lx * rz + ly * rw + lz * rx;
	float z = lw * rz + lx * ry - ly * rx + lz * rw;
	float w = lw * rw - lx * rx - ly * ry - lz * rz;

	return Quat(x, y, z, w);
#endif
}
Quat Quat::sRotation(Vec3Arg inAxis, float inAngle)
{
	// returns [inAxis * sin(0.5f * inAngle), cos(0.5f * inAngle)]
	JPH_ASSERT(inAxis.IsNormalized());
	Vec4 s, c;
	Vec4::sReplicate(0.5f * inAngle).SinCos(s, c);
	// The select mask takes sin(half angle) * axis for x/y/z and cos(half angle) for the w lane
	return Quat(Vec4::sSelect(Vec4(inAxis) * s, c, UVec4(0, 0, 0, 0xffffffffU)));
}
void Quat::GetAxisAngle(Vec3 &outAxis, float &outAngle) const
{
	JPH_ASSERT(IsNormalized());

	// Flip the quaternion if needed so w >= 0, which keeps the angle in [0, pi]
	Quat w_pos = EnsureWPositive();
	float abs_w = w_pos.GetW();
	if (abs_w >= 1.0f)
	{
		// (Near) identity rotation: no well-defined axis
		outAxis = Vec3::sZero();
		outAngle = 0.0f;
	}
	else
	{
		// w = cos(angle / 2), xyz = sin(angle / 2) * axis
		outAngle = 2.0f * ACos(abs_w);
		outAxis = w_pos.GetXYZ().NormalizedOr(Vec3::sZero());
	}
}
Quat Quat::sFromTo(Vec3Arg inFrom, Vec3Arg inTo)
{
	/*
		Uses (inFrom = v1, inTo = v2):

		angle = arcos(v1 . v2 / |v1||v2|)
		axis = normalize(v1 x v2)

		Quaternion is then:

		s = sin(angle / 2)
		x = axis.x * s
		y = axis.y * s
		z = axis.z * s
		w = cos(angle / 2)

		Using identities:

		sin(2 * a) = 2 * sin(a) * cos(a)
		cos(2 * a) = cos(a)^2 - sin(a)^2
		sin(a)^2 + cos(a)^2 = 1

		This reduces to:

		x = (v1 x v2).x
		y = (v1 x v2).y
		z = (v1 x v2).z
		w = |v1||v2| + v1 . v2

		which then needs to be normalized because the whole equation was multiplied by 2 cos(angle / 2)
	*/
	float len_v1_v2 = sqrt(inFrom.LengthSq() * inTo.LengthSq());
	float w = len_v1_v2 + inFrom.Dot(inTo);

	// w == 0 covers two degenerate cases that need special handling
	if (w == 0.0f)
	{
		if (len_v1_v2 == 0.0f)
		{
			// If either of the vectors has zero length, there is no rotation and we return identity
			return Quat::sIdentity();
		}
		else
		{
			// If vectors are perpendicular, take one of the many 180 degree rotations that exist
			return Quat(Vec4(inFrom.GetNormalizedPerpendicular(), 0));
		}
	}

	Vec3 v = inFrom.Cross(inTo);
	return Quat(Vec4(v, w)).Normalized();
}
template <class Random>
Quat Quat::sRandom(Random &inRandom)
{
	// Builds a uniformly distributed unit quaternion from one uniform radius split
	// (x0) and two uniform angles (see e.g. the "subgroup algorithm" for random
	// rotations). The four components are sin/cos pairs scaled by r1/r2, so the
	// result always has unit length.
	std::uniform_real_distribution<float> zero_to_one(0.0f, 1.0f);
	float x0 = zero_to_one(inRandom);
	float r1 = sqrt(1.0f - x0), r2 = sqrt(x0);

	std::uniform_real_distribution<float> zero_to_two_pi(0.0f, 2.0f * JPH_PI);
	Vec4 s, c;
	Vec4(zero_to_two_pi(inRandom), zero_to_two_pi(inRandom), 0, 0).SinCos(s, c);
	return Quat(s.GetX() * r1, c.GetX() * r1, s.GetY() * r2, c.GetY() * r2);
}
Quat Quat::sEulerAngles(Vec3Arg inAngles)
{
	// Composes the rotation RotZ * RotY * RotX (see the contract documented on the
	// declaration in Quat.h) written out in terms of the half-angle sines/cosines
	Vec4 half(0.5f * inAngles);
	Vec4 s, c;
	half.SinCos(s, c);

	float cx = c.GetX();
	float sx = s.GetX();
	float cy = c.GetY();
	float sy = s.GetY();
	float cz = c.GetZ();
	float sz = s.GetZ();

	return Quat(
		cz * sx * cy - sz * cx * sy,
		cz * cx * sy + sz * sx * cy,
		sz * cx * cy - cz * sx * sy,
		cz * cx * cy + sz * sx * sy);
}
Vec3 Quat::GetEulerAngles() const
{
	// Extract X/Y/Z Euler angles (rotation order X then Y then Z, see Quat.h)
	float y_sq = GetY() * GetY();

	// X (roll)
	float sin_x = 2.0f * (GetW() * GetX() + GetY() * GetZ());
	float cos_x = 1.0f - 2.0f * (GetX() * GetX() + y_sq);

	// Y (pitch), clamped so rounding can never push the ASin argument out of [-1, 1]
	float sin_y = 2.0f * (GetW() * GetY() - GetZ() * GetX());
	if (sin_y > 1.0f)
		sin_y = 1.0f;
	if (sin_y < -1.0f)
		sin_y = -1.0f;

	// Z (yaw)
	float sin_z = 2.0f * (GetW() * GetZ() + GetX() * GetY());
	float cos_z = 1.0f - 2.0f * (y_sq + GetZ() * GetZ());

	return Vec3(ATan2(sin_x, cos_x), ASin(sin_y), ATan2(sin_z, cos_z));
}
Quat Quat::GetTwist(Vec3Arg inAxis) const
{
	// Project the imaginary part onto the twist axis and keep the real part,
	// then normalize; degenerate (zero) projections fall back to identity
	Quat twist(Vec4(GetXYZ().Dot(inAxis) * inAxis, GetW()));
	float len_sq = twist.LengthSq();
	return len_sq != 0.0f? twist / sqrt(len_sq) : Quat::sIdentity();
}
void Quat::GetSwingTwist(Quat &outSwing, Quat &outTwist) const
{
	// Decomposes into q = swing * twist where the twist rotates only around X
	// (see the declaration in Quat.h); s is the length of the (w, x) pair, which
	// is the un-normalized twist
	float x = GetX(), y = GetY(), z = GetZ(), w = GetW();
	float s = sqrt(Square(w) + Square(x));
	if (s != 0.0f)
	{
		outTwist = Quat(x / s, 0, 0, w / s);
		outSwing = Quat(0, (w * y - x * z) / s, (w * z + x * y) / s, s);
	}
	else
	{
		// If both x and w are zero, this must be a 180 degree rotation around either y or z
		outTwist = Quat::sIdentity();
		outSwing = *this;
	}
}
Quat Quat::LERP(QuatArg inDestination, float inFraction) const
{
	// Blend the raw 4-vectors: (1 - t) * this + t * destination.
	// Note that the result is not re-normalized.
	float remainder = 1.0f - inFraction;
	return Quat(Vec4::sReplicate(remainder) * mValue + Vec4::sReplicate(inFraction) * inDestination.mValue);
}
Quat Quat::SLERP(QuatArg inDestination, float inFraction) const
{
	// Difference at which to LERP instead of SLERP
	const float delta = 0.0001f;

	// Calc cosine
	float sign_scale1 = 1.0f;
	float cos_omega = Dot(inDestination);

	// Adjust signs (if necessary) so we interpolate along the shortest path
	// (q and -q represent the same rotation)
	if (cos_omega < 0.0f)
	{
		cos_omega = -cos_omega;
		sign_scale1 = -1.0f;
	}

	// Calculate coefficients
	float scale0, scale1;
	if (1.0f - cos_omega > delta)
	{
		// Standard case (slerp): weights sin((1-t)w)/sin(w) and sin(tw)/sin(w)
		float omega = ACos(cos_omega);
		float sin_omega = Sin(omega);
		scale0 = Sin((1.0f - inFraction) * omega) / sin_omega;
		scale1 = sign_scale1 * Sin(inFraction * omega) / sin_omega;
	}
	else
	{
		// Quaternions are very close so we can do a linear interpolation
		// (avoids dividing by a near-zero sin_omega)
		scale0 = 1.0f - inFraction;
		scale1 = sign_scale1 * inFraction;
	}

	// Interpolate between the two quaternions; the result is re-normalized
	return Quat(Vec4::sReplicate(scale0) * mValue + Vec4::sReplicate(scale1) * inDestination.mValue).Normalized();
}
Vec3 Quat::operator * (Vec3Arg inValue) const
{
// Rotating a vector by a quaternion is done by: p' = q * p * q^-1 (q^-1 = conjugated(q) for a unit quaternion)
JPH_ASSERT(IsNormalized());
return Vec3((*this * Quat(Vec4(inValue, 0)) * Conjugated()).mValue);
}
Vec3 Quat::InverseRotate(Vec3Arg inValue) const
{
JPH_ASSERT(IsNormalized());
return Vec3((Conjugated() * Quat(Vec4(inValue, 0)) * *this).mValue);
}
Vec3 Quat::RotateAxisX() const
{
// This is *this * Vec3::sAxisX() written out:
JPH_ASSERT(IsNormalized());
float x = GetX(), y = GetY(), z = GetZ(), w = GetW();
float tx = 2.0f * x, tw = 2.0f * w;
return Vec3(tx * x + tw * w - 1.0f, tx * y + z * tw, tx * z - y * tw);
}
Vec3 Quat::RotateAxisY() const
{
// This is *this * Vec3::sAxisY() written out:
JPH_ASSERT(IsNormalized());
float x = GetX(), y = GetY(), z = GetZ(), w = GetW();
float ty = 2.0f * y, tw = 2.0f * w;
return Vec3(x * ty - z * tw, tw * w + ty * y - 1.0f, x * tw + ty * z);
}
Vec3 Quat::RotateAxisZ() const
{
// This is *this * Vec3::sAxisZ() written out:
JPH_ASSERT(IsNormalized());
float x = GetX(), y = GetY(), z = GetZ(), w = GetW();
float tz = 2.0f * z, tw = 2.0f * w;
return Vec3(x * tz + y * tw, y * tz - x * tw, tw * w + tz * z - 1.0f);
}
void Quat::StoreFloat3(Float3 *outV) const
{
JPH_ASSERT(IsNormalized());
EnsureWPositive().GetXYZ().StoreFloat3(outV);
}
Quat Quat::sLoadFloat3Unsafe(const Float3 &inV)
{
Vec3 v = Vec3::sLoadFloat3Unsafe(inV);
float w = sqrt(max(1.0f - v.LengthSq(), 0.0f)); // It is possible that the length of v is a fraction above 1, and we don't want to introduce NaN's in that case so we clamp to 0
return Quat(Vec4(v, w));
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,44 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2022 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/DVec3.h>
#include <Jolt/Math/DMat44.h>
JPH_NAMESPACE_BEGIN
#ifdef JPH_DOUBLE_PRECISION
// Define real to double
using Real = double;
using Real3 = Double3;
using RVec3 = DVec3;
using RVec3Arg = DVec3Arg;
using RMat44 = DMat44;
using RMat44Arg = DMat44Arg;
#define JPH_RVECTOR_ALIGNMENT JPH_DVECTOR_ALIGNMENT
#else
// Define real to float
using Real = float;
using Real3 = Float3;
using RVec3 = Vec3;
using RVec3Arg = Vec3Arg;
using RMat44 = Mat44;
using RMat44Arg = Mat44Arg;
#define JPH_RVECTOR_ALIGNMENT JPH_VECTOR_ALIGNMENT
#endif // JPH_DOUBLE_PRECISION
// Put the 'real' operator in a namespace so that users can opt in to use it:
// using namespace JPH::literals;
namespace literals {
constexpr Real operator ""_r (long double inValue) { return Real(inValue); }
};
JPH_NAMESPACE_END

View File

@@ -0,0 +1,19 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Enum indicating which component to use when swizzling
enum
{
SWIZZLE_X = 0, ///< Use the X component
SWIZZLE_Y = 1, ///< Use the Y component
SWIZZLE_Z = 2, ///< Use the Z component
SWIZZLE_W = 3, ///< Use the W component
SWIZZLE_UNUSED = 2, ///< We always use the Z component when we don't specifically want to initialize a value, this is consistent with what is done in Vec3(x, y, z), Vec3(Float3 &) and Vec3::sLoadFloat3Unsafe
};
JPH_NAMESPACE_END

View File

@@ -0,0 +1,79 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
// Note that this file exists because std::sin etc. are not platform independent and will lead to non-deterministic simulation
/// Sine of x (input in radians)
JPH_INLINE float Sin(float inX)
{
Vec4 s, c;
Vec4::sReplicate(inX).SinCos(s, c);
return s.GetX();
}
/// Cosine of x (input in radians)
JPH_INLINE float Cos(float inX)
{
Vec4 s, c;
Vec4::sReplicate(inX).SinCos(s, c);
return c.GetX();
}
/// Tangent of x (input in radians)
JPH_INLINE float Tan(float inX)
{
return Vec4::sReplicate(inX).Tan().GetX();
}
/// Arc sine of x (returns value in the range [-PI / 2, PI / 2])
/// Note that all input values will be clamped to the range [-1, 1] and this function will not return NaNs like std::asin
JPH_INLINE float ASin(float inX)
{
return Vec4::sReplicate(inX).ASin().GetX();
}
/// Arc cosine of x (returns value in the range [0, PI])
/// Note that all input values will be clamped to the range [-1, 1] and this function will not return NaNs like std::acos
JPH_INLINE float ACos(float inX)
{
return Vec4::sReplicate(inX).ACos().GetX();
}
/// An approximation of ACos, max error is 4.2e-3 over the entire range [-1, 1], is approximately 2.5x faster than ACos
JPH_INLINE float ACosApproximate(float inX)
{
// See: https://www.johndcook.com/blog/2022/09/06/inverse-cosine-near-1/
// See also: https://seblagarde.wordpress.com/2014/12/01/inverse-trigonometric-functions-gpu-optimization-for-amd-gcn-architecture/
// Taylor of cos(x) = 1 - x^2 / 2 + ...
// Substitute x = sqrt(2 y) we get: cos(sqrt(2 y)) = 1 - y
// Substitute z = 1 - y we get: cos(sqrt(2 (1 - z))) = z <=> acos(z) = sqrt(2 (1 - z))
// To avoid the discontinuity at 1, instead of using the Taylor expansion of acos(x) we use acos(x) / sqrt(2 (1 - x)) = 1 + (1 - x) / 12 + ...
// Since the approximation was made at 1, it has quite a large error at 0 meaning that if we want to extend to the
// range [-1, 1] by mirroring the range [0, 1], the value at 0+ is not the same as 0-.
// So we observe that the form of the Taylor expansion is f(x) = sqrt(1 - x) * (a + b x) and we fit the function so that f(0) = pi / 2
// this gives us a = pi / 2. f(1) = 0 regardless of b. We search for a constant b that minimizes the error in the range [0, 1].
float abs_x = min(abs(inX), 1.0f); // Ensure that we don't get a value larger than 1
float val = sqrt(1.0f - abs_x) * (JPH_PI / 2 - 0.175394f * abs_x);
// Our approximation is valid in the range [0, 1], extend it to the range [-1, 1]
return inX < 0? JPH_PI - val : val;
}
/// Arc tangent of x (returns value in the range [-PI / 2, PI / 2])
JPH_INLINE float ATan(float inX)
{
return Vec4::sReplicate(inX).ATan().GetX();
}
/// Arc tangent of y / x using the signs of the arguments to determine the correct quadrant (returns value in the range [-PI, PI])
JPH_INLINE float ATan2(float inY, float inX)
{
return Vec4::sATan2(Vec4::sReplicate(inY), Vec4::sReplicate(inX)).GetX();
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,220 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/Vec4.h>
JPH_NAMESPACE_BEGIN
class [[nodiscard]] alignas(JPH_VECTOR_ALIGNMENT) UVec4
{
public:
JPH_OVERRIDE_NEW_DELETE
// Underlying vector type
#if defined(JPH_USE_SSE)
using Type = __m128i;
#elif defined(JPH_USE_NEON)
using Type = uint32x4_t;
#else
using Type = struct { uint32 mData[4]; };
#endif
/// Constructor
UVec4() = default; ///< Intentionally not initialized for performance reasons
UVec4(const UVec4 &inRHS) = default;
UVec4 & operator = (const UVec4 &inRHS) = default;
JPH_INLINE UVec4(Type inRHS) : mValue(inRHS) { }
/// Create a vector from 4 integer components
JPH_INLINE UVec4(uint32 inX, uint32 inY, uint32 inZ, uint32 inW);
/// Comparison
JPH_INLINE bool operator == (UVec4Arg inV2) const;
JPH_INLINE bool operator != (UVec4Arg inV2) const { return !(*this == inV2); }
/// Swizzle the elements in inV
template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
JPH_INLINE UVec4 Swizzle() const;
/// Vector with all zeros
static JPH_INLINE UVec4 sZero();
/// Replicate int inV across all components
static JPH_INLINE UVec4 sReplicate(uint32 inV);
/// Load 1 int from memory and place it in the X component, zeros Y, Z and W
static JPH_INLINE UVec4 sLoadInt(const uint32 *inV);
/// Load 4 ints from memory
static JPH_INLINE UVec4 sLoadInt4(const uint32 *inV);
/// Load 4 ints from memory, aligned to 16 bytes
static JPH_INLINE UVec4 sLoadInt4Aligned(const uint32 *inV);
/// Gather 4 ints from memory at inBase + inOffsets[i] * Scale
template <const int Scale>
static JPH_INLINE UVec4 sGatherInt4(const uint32 *inBase, UVec4Arg inOffsets);
/// Return the minimum value of each of the components
static JPH_INLINE UVec4 sMin(UVec4Arg inV1, UVec4Arg inV2);
/// Return the maximum of each of the components
static JPH_INLINE UVec4 sMax(UVec4Arg inV1, UVec4Arg inV2);
/// Equals (component wise)
static JPH_INLINE UVec4 sEquals(UVec4Arg inV1, UVec4Arg inV2);
/// Component wise select, returns inNotSet when highest bit of inControl = 0 and inSet when highest bit of inControl = 1
static JPH_INLINE UVec4 sSelect(UVec4Arg inNotSet, UVec4Arg inSet, UVec4Arg inControl);
/// Logical or (component wise)
static JPH_INLINE UVec4 sOr(UVec4Arg inV1, UVec4Arg inV2);
/// Logical xor (component wise)
static JPH_INLINE UVec4 sXor(UVec4Arg inV1, UVec4Arg inV2);
/// Logical and (component wise)
static JPH_INLINE UVec4 sAnd(UVec4Arg inV1, UVec4Arg inV2);
/// Logical not (component wise)
static JPH_INLINE UVec4 sNot(UVec4Arg inV1);
/// Sorts the elements in inIndex so that the values that correspond to trues in inValue are the first elements.
/// The remaining elements will be set to inValue.w.
/// I.e. if inValue = (true, false, true, false) and inIndex = (1, 2, 3, 4) the function returns (1, 3, 4, 4).
static JPH_INLINE UVec4 sSort4True(UVec4Arg inValue, UVec4Arg inIndex);
/// Get individual components
#if defined(JPH_USE_SSE)
JPH_INLINE uint32 GetX() const { return uint32(_mm_cvtsi128_si32(mValue)); }
JPH_INLINE uint32 GetY() const { return mU32[1]; }
JPH_INLINE uint32 GetZ() const { return mU32[2]; }
JPH_INLINE uint32 GetW() const { return mU32[3]; }
#elif defined(JPH_USE_NEON)
JPH_INLINE uint32 GetX() const { return vgetq_lane_u32(mValue, 0); }
JPH_INLINE uint32 GetY() const { return vgetq_lane_u32(mValue, 1); }
JPH_INLINE uint32 GetZ() const { return vgetq_lane_u32(mValue, 2); }
JPH_INLINE uint32 GetW() const { return vgetq_lane_u32(mValue, 3); }
#else
JPH_INLINE uint32 GetX() const { return mU32[0]; }
JPH_INLINE uint32 GetY() const { return mU32[1]; }
JPH_INLINE uint32 GetZ() const { return mU32[2]; }
JPH_INLINE uint32 GetW() const { return mU32[3]; }
#endif
/// Set individual components
JPH_INLINE void SetX(uint32 inX) { mU32[0] = inX; }
JPH_INLINE void SetY(uint32 inY) { mU32[1] = inY; }
JPH_INLINE void SetZ(uint32 inZ) { mU32[2] = inZ; }
JPH_INLINE void SetW(uint32 inW) { mU32[3] = inW; }
/// Get component by index
JPH_INLINE uint32 operator [] (uint inCoordinate) const { JPH_ASSERT(inCoordinate < 4); return mU32[inCoordinate]; }
JPH_INLINE uint32 & operator [] (uint inCoordinate) { JPH_ASSERT(inCoordinate < 4); return mU32[inCoordinate]; }
/// Multiplies each of the 4 integer components with an integer (discards any overflow)
JPH_INLINE UVec4 operator * (UVec4Arg inV2) const;
/// Adds an integer value to all integer components (discards any overflow)
JPH_INLINE UVec4 operator + (UVec4Arg inV2);
/// Add two integer vectors (component wise)
JPH_INLINE UVec4 & operator += (UVec4Arg inV2);
/// Replicate the X component to all components
JPH_INLINE UVec4 SplatX() const;
/// Replicate the Y component to all components
JPH_INLINE UVec4 SplatY() const;
/// Replicate the Z component to all components
JPH_INLINE UVec4 SplatZ() const;
/// Replicate the W component to all components
JPH_INLINE UVec4 SplatW() const;
/// Convert each component from an int to a float
JPH_INLINE Vec4 ToFloat() const;
/// Reinterpret UVec4 as a Vec4 (doesn't change the bits)
JPH_INLINE Vec4 ReinterpretAsFloat() const;
/// Store 4 ints to memory
JPH_INLINE void StoreInt4(uint32 *outV) const;
/// Store 4 ints to memory, aligned to 16 bytes
JPH_INLINE void StoreInt4Aligned(uint32 *outV) const;
/// Test if any of the components are true (true is when highest bit of component is set)
JPH_INLINE bool TestAnyTrue() const;
/// Test if any of X, Y or Z components are true (true is when highest bit of component is set)
JPH_INLINE bool TestAnyXYZTrue() const;
/// Test if all components are true (true is when highest bit of component is set)
JPH_INLINE bool TestAllTrue() const;
/// Test if X, Y and Z components are true (true is when highest bit of component is set)
JPH_INLINE bool TestAllXYZTrue() const;
/// Count the number of components that are true (true is when highest bit of component is set)
JPH_INLINE int CountTrues() const;
/// Store if X is true in bit 0, Y in bit 1, Z in bit 2 and W in bit 3 (true is when highest bit of component is set)
JPH_INLINE int GetTrues() const;
/// Shift all components by Count bits to the left (filling with zeros from the left)
template <const uint Count>
JPH_INLINE UVec4 LogicalShiftLeft() const;
/// Shift all components by Count bits to the right (filling with zeros from the right)
template <const uint Count>
JPH_INLINE UVec4 LogicalShiftRight() const;
/// Shift all components by Count bits to the right (shifting in the value of the highest bit)
template <const uint Count>
JPH_INLINE UVec4 ArithmeticShiftRight() const;
/// Takes the lower 4 16 bits and expands them to X, Y, Z and W
JPH_INLINE UVec4 Expand4Uint16Lo() const;
/// Takes the upper 4 16 bits and expands them to X, Y, Z and W
JPH_INLINE UVec4 Expand4Uint16Hi() const;
/// Takes byte 0 .. 3 and expands them to X, Y, Z and W
JPH_INLINE UVec4 Expand4Byte0() const;
/// Takes byte 4 .. 7 and expands them to X, Y, Z and W
JPH_INLINE UVec4 Expand4Byte4() const;
/// Takes byte 8 .. 11 and expands them to X, Y, Z and W
JPH_INLINE UVec4 Expand4Byte8() const;
/// Takes byte 12 .. 15 and expands them to X, Y, Z and W
JPH_INLINE UVec4 Expand4Byte12() const;
/// Shift vector components by 4 - Count floats to the left, so if Count = 1 the resulting vector is (W, 0, 0, 0), when Count = 3 the resulting vector is (Y, Z, W, 0)
JPH_INLINE UVec4 ShiftComponents4Minus(int inCount) const;
/// To String
friend ostream & operator << (ostream &inStream, UVec4Arg inV)
{
inStream << inV.mU32[0] << ", " << inV.mU32[1] << ", " << inV.mU32[2] << ", " << inV.mU32[3];
return inStream;
}
union
{
Type mValue;
uint32 mU32[4];
};
};
static_assert(std::is_trivial<UVec4>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
#include "UVec4.inl"

View File

@@ -0,0 +1,581 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
JPH_NAMESPACE_BEGIN
UVec4::UVec4(uint32 inX, uint32 inY, uint32 inZ, uint32 inW)
{
#if defined(JPH_USE_SSE)
mValue = _mm_set_epi32(int(inW), int(inZ), int(inY), int(inX));
#elif defined(JPH_USE_NEON)
uint32x2_t xy = vcreate_u32(static_cast<uint64>(inX) | (static_cast<uint64>(inY) << 32));
uint32x2_t zw = vcreate_u32(static_cast<uint64>(inZ) | (static_cast<uint64>(inW) << 32));
mValue = vcombine_u32(xy, zw);
#else
mU32[0] = inX;
mU32[1] = inY;
mU32[2] = inZ;
mU32[3] = inW;
#endif
}
bool UVec4::operator == (UVec4Arg inV2) const
{
return sEquals(*this, inV2).TestAllTrue();
}
template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
UVec4 UVec4::Swizzle() const
{
static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
#if defined(JPH_USE_SSE)
return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
#elif defined(JPH_USE_NEON)
return JPH_NEON_SHUFFLE_U32x4(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
#else
return UVec4(mU32[SwizzleX], mU32[SwizzleY], mU32[SwizzleZ], mU32[SwizzleW]);
#endif
}
UVec4 UVec4::sZero()
{
#if defined(JPH_USE_SSE)
return _mm_setzero_si128();
#elif defined(JPH_USE_NEON)
return vdupq_n_u32(0);
#else
return UVec4(0, 0, 0, 0);
#endif
}
UVec4 UVec4::sReplicate(uint32 inV)
{
#if defined(JPH_USE_SSE)
return _mm_set1_epi32(int(inV));
#elif defined(JPH_USE_NEON)
return vdupq_n_u32(inV);
#else
return UVec4(inV, inV, inV, inV);
#endif
}
UVec4 UVec4::sLoadInt(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
return _mm_castps_si128(_mm_load_ss(reinterpret_cast<const float*>(inV)));
#elif defined(JPH_USE_NEON)
return vsetq_lane_u32(*inV, vdupq_n_u32(0), 0);
#else
return UVec4(*inV, 0, 0, 0);
#endif
}
UVec4 UVec4::sLoadInt4(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
return _mm_loadu_si128(reinterpret_cast<const __m128i *>(inV));
#elif defined(JPH_USE_NEON)
return vld1q_u32(inV);
#else
return UVec4(inV[0], inV[1], inV[2], inV[3]);
#endif
}
UVec4 UVec4::sLoadInt4Aligned(const uint32 *inV)
{
#if defined(JPH_USE_SSE)
return _mm_load_si128(reinterpret_cast<const __m128i *>(inV));
#elif defined(JPH_USE_NEON)
return vld1q_u32(inV); // ARM doesn't make distinction between aligned or not
#else
return UVec4(inV[0], inV[1], inV[2], inV[3]);
#endif
}
template <const int Scale>
UVec4 UVec4::sGatherInt4(const uint32 *inBase, UVec4Arg inOffsets)
{
#ifdef JPH_USE_AVX2
return _mm_i32gather_epi32(reinterpret_cast<const int *>(inBase), inOffsets.mValue, Scale);
#else
const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
uint32 x = *reinterpret_cast<const uint32 *>(base + inOffsets.GetX() * Scale);
uint32 y = *reinterpret_cast<const uint32 *>(base + inOffsets.GetY() * Scale);
uint32 z = *reinterpret_cast<const uint32 *>(base + inOffsets.GetZ() * Scale);
uint32 w = *reinterpret_cast<const uint32 *>(base + inOffsets.GetW() * Scale);
return UVec4(x, y, z, w);
#endif
}
UVec4 UVec4::sMin(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE4_1)
return _mm_min_epu32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vminq_u32(inV1.mValue, inV2.mValue);
#else
UVec4 result;
for (int i = 0; i < 4; i++)
result.mU32[i] = min(inV1.mU32[i], inV2.mU32[i]);
return result;
#endif
}
UVec4 UVec4::sMax(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE4_1)
return _mm_max_epu32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vmaxq_u32(inV1.mValue, inV2.mValue);
#else
UVec4 result;
for (int i = 0; i < 4; i++)
result.mU32[i] = max(inV1.mU32[i], inV2.mU32[i]);
return result;
#endif
}
UVec4 UVec4::sEquals(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
return _mm_cmpeq_epi32(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vceqq_u32(inV1.mValue, inV2.mValue);
#else
return UVec4(inV1.mU32[0] == inV2.mU32[0]? 0xffffffffu : 0,
inV1.mU32[1] == inV2.mU32[1]? 0xffffffffu : 0,
inV1.mU32[2] == inV2.mU32[2]? 0xffffffffu : 0,
inV1.mU32[3] == inV2.mU32[3]? 0xffffffffu : 0);
#endif
}
UVec4 UVec4::sSelect(UVec4Arg inNotSet, UVec4Arg inSet, UVec4Arg inControl)
{
#if defined(JPH_USE_SSE4_1) && !defined(JPH_PLATFORM_WASM) // _mm_blendv_ps has problems on FireFox
return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(inNotSet.mValue), _mm_castsi128_ps(inSet.mValue), _mm_castsi128_ps(inControl.mValue)));
#elif defined(JPH_USE_SSE)
__m128 is_set = _mm_castsi128_ps(_mm_srai_epi32(inControl.mValue, 31));
return _mm_castps_si128(_mm_or_ps(_mm_and_ps(is_set, _mm_castsi128_ps(inSet.mValue)), _mm_andnot_ps(is_set, _mm_castsi128_ps(inNotSet.mValue))));
#elif defined(JPH_USE_NEON)
return vbslq_u32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.mValue), 31)), inSet.mValue, inNotSet.mValue);
#else
UVec4 result;
for (int i = 0; i < 4; i++)
result.mU32[i] = (inControl.mU32[i] & 0x80000000u) ? inSet.mU32[i] : inNotSet.mU32[i];
return result;
#endif
}
UVec4 UVec4::sOr(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
return _mm_or_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vorrq_u32(inV1.mValue, inV2.mValue);
#else
return UVec4(inV1.mU32[0] | inV2.mU32[0],
inV1.mU32[1] | inV2.mU32[1],
inV1.mU32[2] | inV2.mU32[2],
inV1.mU32[3] | inV2.mU32[3]);
#endif
}
UVec4 UVec4::sXor(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
return _mm_xor_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return veorq_u32(inV1.mValue, inV2.mValue);
#else
return UVec4(inV1.mU32[0] ^ inV2.mU32[0],
inV1.mU32[1] ^ inV2.mU32[1],
inV1.mU32[2] ^ inV2.mU32[2],
inV1.mU32[3] ^ inV2.mU32[3]);
#endif
}
UVec4 UVec4::sAnd(UVec4Arg inV1, UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
return _mm_and_si128(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vandq_u32(inV1.mValue, inV2.mValue);
#else
return UVec4(inV1.mU32[0] & inV2.mU32[0],
inV1.mU32[1] & inV2.mU32[1],
inV1.mU32[2] & inV2.mU32[2],
inV1.mU32[3] & inV2.mU32[3]);
#endif
}
UVec4 UVec4::sNot(UVec4Arg inV1)
{
#if defined(JPH_USE_AVX512)
return _mm_ternarylogic_epi32(inV1.mValue, inV1.mValue, inV1.mValue, 0b01010101);
#elif defined(JPH_USE_SSE)
return sXor(inV1, sReplicate(0xffffffff));
#elif defined(JPH_USE_NEON)
return vmvnq_u32(inV1.mValue);
#else
return UVec4(~inV1.mU32[0], ~inV1.mU32[1], ~inV1.mU32[2], ~inV1.mU32[3]);
#endif
}
UVec4 UVec4::sSort4True(UVec4Arg inValue, UVec4Arg inIndex)
{
// If inValue.z is false then shift W to Z
UVec4 v = UVec4::sSelect(inIndex.Swizzle<SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>(), inIndex, inValue.SplatZ());
// If inValue.y is false then shift Z and further to Y and further
v = UVec4::sSelect(v.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_W>(), v, inValue.SplatY());
// If inValue.x is false then shift X and further to Y and further
v = UVec4::sSelect(v.Swizzle<SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_W>(), v, inValue.SplatX());
return v;
}
UVec4 UVec4::operator * (UVec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
return _mm_mullo_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vmulq_u32(mValue, inV2.mValue);
#else
UVec4 result;
for (int i = 0; i < 4; i++)
result.mU32[i] = mU32[i] * inV2.mU32[i];
return result;
#endif
}
UVec4 UVec4::operator + (UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
return _mm_add_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
return vaddq_u32(mValue, inV2.mValue);
#else
return UVec4(mU32[0] + inV2.mU32[0],
mU32[1] + inV2.mU32[1],
mU32[2] + inV2.mU32[2],
mU32[3] + inV2.mU32[3]);
#endif
}
UVec4 &UVec4::operator += (UVec4Arg inV2)
{
#if defined(JPH_USE_SSE)
mValue = _mm_add_epi32(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
mValue = vaddq_u32(mValue, inV2.mValue);
#else
for (int i = 0; i < 4; ++i)
mU32[i] += inV2.mU32[i];
#endif
return *this;
}
UVec4 UVec4::SplatX() const
{
#if defined(JPH_USE_SSE)
return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
return vdupq_laneq_u32(mValue, 0);
#else
return UVec4(mU32[0], mU32[0], mU32[0], mU32[0]);
#endif
}
UVec4 UVec4::SplatY() const
{
#if defined(JPH_USE_SSE)
return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
return vdupq_laneq_u32(mValue, 1);
#else
return UVec4(mU32[1], mU32[1], mU32[1], mU32[1]);
#endif
}
UVec4 UVec4::SplatZ() const
{
#if defined(JPH_USE_SSE)
return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
return vdupq_laneq_u32(mValue, 2);
#else
return UVec4(mU32[2], mU32[2], mU32[2], mU32[2]);
#endif
}
UVec4 UVec4::SplatW() const
{
#if defined(JPH_USE_SSE)
return _mm_shuffle_epi32(mValue, _MM_SHUFFLE(3, 3, 3, 3));
#elif defined(JPH_USE_NEON)
return vdupq_laneq_u32(mValue, 3);
#else
return UVec4(mU32[3], mU32[3], mU32[3], mU32[3]);
#endif
}
Vec4 UVec4::ToFloat() const
{
#if defined(JPH_USE_SSE)
return _mm_cvtepi32_ps(mValue);
#elif defined(JPH_USE_NEON)
return vcvtq_f32_u32(mValue);
#else
return Vec4((float)mU32[0], (float)mU32[1], (float)mU32[2], (float)mU32[3]);
#endif
}
Vec4 UVec4::ReinterpretAsFloat() const
{
#if defined(JPH_USE_SSE)
return Vec4(_mm_castsi128_ps(mValue));
#elif defined(JPH_USE_NEON)
return vreinterpretq_f32_u32(mValue);
#else
return *reinterpret_cast<const Vec4 *>(this);
#endif
}
void UVec4::StoreInt4(uint32 *outV) const
{
#if defined(JPH_USE_SSE)
_mm_storeu_si128(reinterpret_cast<__m128i *>(outV), mValue);
#elif defined(JPH_USE_NEON)
vst1q_u32(outV, mValue);
#else
for (int i = 0; i < 4; ++i)
outV[i] = mU32[i];
#endif
}
void UVec4::StoreInt4Aligned(uint32 *outV) const
{
#if defined(JPH_USE_SSE)
_mm_store_si128(reinterpret_cast<__m128i *>(outV), mValue);
#elif defined(JPH_USE_NEON)
vst1q_u32(outV, mValue); // ARM doesn't make distinction between aligned or not
#else
for (int i = 0; i < 4; ++i)
outV[i] = mU32[i];
#endif
}
int UVec4::CountTrues() const
{
#if defined(JPH_USE_SSE)
return CountBits(_mm_movemask_ps(_mm_castsi128_ps(mValue)));
#elif defined(JPH_USE_NEON)
return vaddvq_u32(vshrq_n_u32(mValue, 31));
#else
return (mU32[0] >> 31) + (mU32[1] >> 31) + (mU32[2] >> 31) + (mU32[3] >> 31);
#endif
}
int UVec4::GetTrues() const
{
#if defined(JPH_USE_SSE)
return _mm_movemask_ps(_mm_castsi128_ps(mValue));
#elif defined(JPH_USE_NEON)
int32x4_t shift = JPH_NEON_INT32x4(0, 1, 2, 3);
return vaddvq_u32(vshlq_u32(vshrq_n_u32(mValue, 31), shift));
#else
return (mU32[0] >> 31) | ((mU32[1] >> 31) << 1) | ((mU32[2] >> 31) << 2) | ((mU32[3] >> 31) << 3);
#endif
}
bool UVec4::TestAnyTrue() const
{
return GetTrues() != 0;
}
bool UVec4::TestAnyXYZTrue() const
{
return (GetTrues() & 0b111) != 0;
}
bool UVec4::TestAllTrue() const
{
return GetTrues() == 0b1111;
}
bool UVec4::TestAllXYZTrue() const
{
return (GetTrues() & 0b111) == 0b111;
}
template <const uint Count>
UVec4 UVec4::LogicalShiftLeft() const
{
static_assert(Count <= 31, "Invalid shift");
#if defined(JPH_USE_SSE)
return _mm_slli_epi32(mValue, Count);
#elif defined(JPH_USE_NEON)
return vshlq_n_u32(mValue, Count);
#else
return UVec4(mU32[0] << Count, mU32[1] << Count, mU32[2] << Count, mU32[3] << Count);
#endif
}
template <const uint Count>
UVec4 UVec4::LogicalShiftRight() const
{
static_assert(Count <= 31, "Invalid shift");
#if defined(JPH_USE_SSE)
return _mm_srli_epi32(mValue, Count);
#elif defined(JPH_USE_NEON)
return vshrq_n_u32(mValue, Count);
#else
return UVec4(mU32[0] >> Count, mU32[1] >> Count, mU32[2] >> Count, mU32[3] >> Count);
#endif
}
template <const uint Count>
UVec4 UVec4::ArithmeticShiftRight() const
{
static_assert(Count <= 31, "Invalid shift");
#if defined(JPH_USE_SSE)
return _mm_srai_epi32(mValue, Count);
#elif defined(JPH_USE_NEON)
return vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(mValue), Count));
#else
return UVec4(uint32(int32_t(mU32[0]) >> Count),
uint32(int32_t(mU32[1]) >> Count),
uint32(int32_t(mU32[2]) >> Count),
uint32(int32_t(mU32[3]) >> Count));
#endif
}
UVec4 UVec4::Expand4Uint16Lo() const
{
#if defined(JPH_USE_SSE)
return _mm_unpacklo_epi16(mValue, _mm_castps_si128(_mm_setzero_ps()));
#elif defined(JPH_USE_NEON)
uint16x4_t value = vget_low_u16(vreinterpretq_u16_u32(mValue));
uint16x4_t zero = vdup_n_u16(0);
return vreinterpretq_u32_u16(vcombine_u16(vzip1_u16(value, zero), vzip2_u16(value, zero)));
#else
return UVec4(mU32[0] & 0xffff,
(mU32[0] >> 16) & 0xffff,
mU32[1] & 0xffff,
(mU32[1] >> 16) & 0xffff);
#endif
}
UVec4 UVec4::Expand4Uint16Hi() const
{
#if defined(JPH_USE_SSE)
return _mm_unpackhi_epi16(mValue, _mm_castps_si128(_mm_setzero_ps()));
#elif defined(JPH_USE_NEON)
uint16x4_t value = vget_high_u16(vreinterpretq_u16_u32(mValue));
uint16x4_t zero = vdup_n_u16(0);
return vreinterpretq_u32_u16(vcombine_u16(vzip1_u16(value, zero), vzip2_u16(value, zero)));
#else
return UVec4(mU32[2] & 0xffff,
(mU32[2] >> 16) & 0xffff,
mU32[3] & 0xffff,
(mU32[3] >> 16) & 0xffff);
#endif
}
UVec4 UVec4::Expand4Byte0() const
{
#if defined(JPH_USE_SSE4_1)
return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff03), int(0xffffff02), int(0xffffff01), int(0xffffff00)));
#elif defined(JPH_USE_NEON)
uint8x16_t idx = JPH_NEON_UINT8x16(0x00, 0x7f, 0x7f, 0x7f, 0x01, 0x7f, 0x7f, 0x7f, 0x02, 0x7f, 0x7f, 0x7f, 0x03, 0x7f, 0x7f, 0x7f);
return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
UVec4 result;
for (int i = 0; i < 4; i++)
result.mU32[i] = (mU32[0] >> (i * 8)) & 0xff;
return result;
#endif
}
UVec4 UVec4::Expand4Byte4() const
{
#if defined(JPH_USE_SSE4_1)
return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff07), int(0xffffff06), int(0xffffff05), int(0xffffff04)));
#elif defined(JPH_USE_NEON)
uint8x16_t idx = JPH_NEON_UINT8x16(0x04, 0x7f, 0x7f, 0x7f, 0x05, 0x7f, 0x7f, 0x7f, 0x06, 0x7f, 0x7f, 0x7f, 0x07, 0x7f, 0x7f, 0x7f);
return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
UVec4 result;
for (int i = 0; i < 4; i++)
result.mU32[i] = (mU32[1] >> (i * 8)) & 0xff;
return result;
#endif
}
UVec4 UVec4::Expand4Byte8() const
{
#if defined(JPH_USE_SSE4_1)
return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0b), int(0xffffff0a), int(0xffffff09), int(0xffffff08)));
#elif defined(JPH_USE_NEON)
uint8x16_t idx = JPH_NEON_UINT8x16(0x08, 0x7f, 0x7f, 0x7f, 0x09, 0x7f, 0x7f, 0x7f, 0x0a, 0x7f, 0x7f, 0x7f, 0x0b, 0x7f, 0x7f, 0x7f);
return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
UVec4 result;
for (int i = 0; i < 4; i++)
result.mU32[i] = (mU32[2] >> (i * 8)) & 0xff;
return result;
#endif
}
UVec4 UVec4::Expand4Byte12() const
{
#if defined(JPH_USE_SSE4_1)
return _mm_shuffle_epi8(mValue, _mm_set_epi32(int(0xffffff0f), int(0xffffff0e), int(0xffffff0d), int(0xffffff0c)));
#elif defined(JPH_USE_NEON)
uint8x16_t idx = JPH_NEON_UINT8x16(0x0c, 0x7f, 0x7f, 0x7f, 0x0d, 0x7f, 0x7f, 0x7f, 0x0e, 0x7f, 0x7f, 0x7f, 0x0f, 0x7f, 0x7f, 0x7f);
return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
UVec4 result;
for (int i = 0; i < 4; i++)
result.mU32[i] = (mU32[3] >> (i * 8)) & 0xff;
return result;
#endif
}
UVec4 UVec4::ShiftComponents4Minus(int inCount) const
{
#if defined(JPH_USE_SSE4_1) || defined(JPH_USE_NEON)
alignas(UVec4) static constexpr uint32 sFourMinusXShuffle[5][4] =
{
{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff },
{ 0x0f0e0d0c, 0xffffffff, 0xffffffff, 0xffffffff },
{ 0x0b0a0908, 0x0f0e0d0c, 0xffffffff, 0xffffffff },
{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0xffffffff },
{ 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c }
};
#endif
#if defined(JPH_USE_SSE4_1)
return _mm_shuffle_epi8(mValue, *reinterpret_cast<const UVec4::Type *>(sFourMinusXShuffle[inCount]));
#elif defined(JPH_USE_NEON)
uint8x16_t idx = vreinterpretq_u8_u32(*reinterpret_cast<const UVec4::Type *>(sFourMinusXShuffle[inCount]));
return vreinterpretq_u32_s8(vqtbl1q_s8(vreinterpretq_s8_u32(mValue), idx));
#else
UVec4 result = UVec4::sZero();
for (int i = 0; i < inCount; i++)
result.mU32[i] = mU32[i + 4 - inCount];
return result;
#endif
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,71 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Jolt.h>
#include <Jolt/Math/Vec3.h>
JPH_NAMESPACE_BEGIN
static void sAddVertex(StaticArray<Vec3, 1026> &ioVertices, Vec3Arg inVertex)
{
bool found = false;
for (const Vec3 &v : ioVertices)
if (v == inVertex)
{
found = true;
break;
}
if (!found)
ioVertices.push_back(inVertex);
}
// Recursively subdivide the spherical triangle (inDir1, inDir2, inDir3), adding the edge midpoints
// (projected back onto the unit sphere) to ioVertices. inLevel controls the number of subdivision levels left.
static void sCreateVertices(StaticArray<Vec3, 1026> &ioVertices, Vec3Arg inDir1, Vec3Arg inDir2, Vec3Arg inDir3, int inLevel)
{
	// Midpoints of the 3 edges, normalized so they lie on the unit sphere
	Vec3 mid12 = (inDir1 + inDir2).Normalized();
	Vec3 mid23 = (inDir2 + inDir3).Normalized();
	Vec3 mid31 = (inDir3 + inDir1).Normalized();

	sAddVertex(ioVertices, mid12);
	sAddVertex(ioVertices, mid23);
	sAddVertex(ioVertices, mid31);

	// Recurse into the 4 sub triangles until the requested depth is reached
	if (inLevel > 0)
	{
		sCreateVertices(ioVertices, inDir1, mid12, mid31, inLevel - 1);
		sCreateVertices(ioVertices, mid12, mid23, mid31, inLevel - 1);
		sCreateVertices(ioVertices, mid12, inDir2, mid23, inLevel - 1);
		sCreateVertices(ioVertices, mid31, mid23, inDir3, inLevel - 1);
	}
}
// Precomputed set of vertices approximately uniformly covering the unit sphere,
// built by subdividing the 8 octant triangles of an octahedron 3 times.
const StaticArray<Vec3, 1026> Vec3::sUnitSphere = []() {
	constexpr int cLevel = 3;

	StaticArray<Vec3, 1026> vertices;

	// Seed with the six principal axis directions
	const Vec3 x = Vec3::sAxisX(), y = Vec3::sAxisY(), z = Vec3::sAxisZ();
	vertices.push_back(x);
	vertices.push_back(-x);
	vertices.push_back(y);
	vertices.push_back(-y);
	vertices.push_back(z);
	vertices.push_back(-z);

	// Subdivide each of the 8 octants (bit 0 = X sign, bit 1 = Y sign, bit 2 = Z sign)
	for (int octant = 0; octant < 8; ++octant)
	{
		Vec3 dir1 = (octant & 1) != 0 ? -x : x;
		Vec3 dir2 = (octant & 2) != 0 ? -y : y;
		Vec3 dir3 = (octant & 4) != 0 ? -z : z;
		sCreateVertices(vertices, dir1, dir2, dir3, cLevel);
	}

	return vertices;
}();
JPH_NAMESPACE_END

298
thirdparty/jolt_physics/Jolt/Math/Vec3.h vendored Normal file
View File

@@ -0,0 +1,298 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Core/StaticArray.h>
#include <Jolt/Math/Float3.h>
#include <Jolt/Math/Swizzle.h>
#include <Jolt/Math/MathTypes.h>
JPH_NAMESPACE_BEGIN
/// 3 component vector (stored as 4 vectors).
/// Note that we keep the 4th component the same as the 3rd component to avoid divisions by zero when JPH_FLOATING_POINT_EXCEPTIONS_ENABLED defined
class [[nodiscard]] alignas(JPH_VECTOR_ALIGNMENT) Vec3
{
public:
	JPH_OVERRIDE_NEW_DELETE

	// Underlying vector type (native SIMD register where available, array fallback otherwise)
	#if defined(JPH_USE_SSE)
	using Type = __m128;
	#elif defined(JPH_USE_NEON)
	using Type = float32x4_t;
	#else
	using Type = Vec4::Type;
	#endif

	// Argument type (passed by value in registers where the ABI allows, see Vec3Arg)
	using ArgType = Vec3Arg;

	/// Constructor
	Vec3() = default; ///< Intentionally not initialized for performance reasons
	Vec3(const Vec3 &inRHS) = default;
	Vec3 & operator = (const Vec3 &inRHS) = default;
	explicit JPH_INLINE Vec3(Vec4Arg inRHS);
	JPH_INLINE Vec3(Type inRHS) : mValue(inRHS) { CheckW(); }

	/// Load 3 floats from memory
	explicit JPH_INLINE Vec3(const Float3 &inV);

	/// Create a vector from 3 components
	JPH_INLINE Vec3(float inX, float inY, float inZ);

	/// Vector with all zeros
	static JPH_INLINE Vec3 sZero();

	/// Vector with all ones
	static JPH_INLINE Vec3 sOne();

	/// Vector with all NaN's
	static JPH_INLINE Vec3 sNaN();

	/// Vectors with the principal axis
	static JPH_INLINE Vec3 sAxisX() { return Vec3(1, 0, 0); }
	static JPH_INLINE Vec3 sAxisY() { return Vec3(0, 1, 0); }
	static JPH_INLINE Vec3 sAxisZ() { return Vec3(0, 0, 1); }

	/// Replicate inV across all components
	static JPH_INLINE Vec3 sReplicate(float inV);

	/// Load 3 floats from memory (reads 32 bits extra which it doesn't use)
	static JPH_INLINE Vec3 sLoadFloat3Unsafe(const Float3 &inV);

	/// Return the minimum value of each of the components
	static JPH_INLINE Vec3 sMin(Vec3Arg inV1, Vec3Arg inV2);

	/// Return the maximum of each of the components
	static JPH_INLINE Vec3 sMax(Vec3Arg inV1, Vec3Arg inV2);

	/// Clamp a vector between min and max (component wise)
	static JPH_INLINE Vec3 sClamp(Vec3Arg inV, Vec3Arg inMin, Vec3Arg inMax);

	/// Equals (component wise)
	static JPH_INLINE UVec4 sEquals(Vec3Arg inV1, Vec3Arg inV2);

	/// Less than (component wise)
	static JPH_INLINE UVec4 sLess(Vec3Arg inV1, Vec3Arg inV2);

	/// Less than or equal (component wise)
	static JPH_INLINE UVec4 sLessOrEqual(Vec3Arg inV1, Vec3Arg inV2);

	/// Greater than (component wise)
	static JPH_INLINE UVec4 sGreater(Vec3Arg inV1, Vec3Arg inV2);

	/// Greater than or equal (component wise)
	static JPH_INLINE UVec4 sGreaterOrEqual(Vec3Arg inV1, Vec3Arg inV2);

	/// Calculates inMul1 * inMul2 + inAdd
	static JPH_INLINE Vec3 sFusedMultiplyAdd(Vec3Arg inMul1, Vec3Arg inMul2, Vec3Arg inAdd);

	/// Component wise select, returns inNotSet when highest bit of inControl = 0 and inSet when highest bit of inControl = 1
	static JPH_INLINE Vec3 sSelect(Vec3Arg inNotSet, Vec3Arg inSet, UVec4Arg inControl);

	/// Logical or (component wise)
	static JPH_INLINE Vec3 sOr(Vec3Arg inV1, Vec3Arg inV2);

	/// Logical xor (component wise)
	static JPH_INLINE Vec3 sXor(Vec3Arg inV1, Vec3Arg inV2);

	/// Logical and (component wise)
	static JPH_INLINE Vec3 sAnd(Vec3Arg inV1, Vec3Arg inV2);

	/// Get unit vector given spherical coordinates
	/// inTheta \f$\in [0, \pi]\f$ is angle between vector and z-axis
	/// inPhi \f$\in [0, 2 \pi]\f$ is the angle in the xy-plane starting from the x axis and rotating counter clockwise around the z-axis
	static JPH_INLINE Vec3 sUnitSpherical(float inTheta, float inPhi);

	/// A set of vectors uniformly spanning the surface of a unit sphere, usable for debug purposes
	JPH_EXPORT static const StaticArray<Vec3, 1026> sUnitSphere;

	/// Get random unit vector
	template <class Random>
	static inline Vec3 sRandom(Random &inRandom);

	/// Get individual components
	#if defined(JPH_USE_SSE)
	JPH_INLINE float GetX() const { return _mm_cvtss_f32(mValue); }
	JPH_INLINE float GetY() const { return mF32[1]; }
	JPH_INLINE float GetZ() const { return mF32[2]; }
	#elif defined(JPH_USE_NEON)
	JPH_INLINE float GetX() const { return vgetq_lane_f32(mValue, 0); }
	JPH_INLINE float GetY() const { return vgetq_lane_f32(mValue, 1); }
	JPH_INLINE float GetZ() const { return vgetq_lane_f32(mValue, 2); }
	#else
	JPH_INLINE float GetX() const { return mF32[0]; }
	JPH_INLINE float GetY() const { return mF32[1]; }
	JPH_INLINE float GetZ() const { return mF32[2]; }
	#endif

	/// Set individual components
	JPH_INLINE void SetX(float inX) { mF32[0] = inX; }
	JPH_INLINE void SetY(float inY) { mF32[1] = inY; }
	JPH_INLINE void SetZ(float inZ) { mF32[2] = mF32[3] = inZ; } // Assure Z and W are the same

	/// Set all components
	JPH_INLINE void Set(float inX, float inY, float inZ) { *this = Vec3(inX, inY, inZ); }

	/// Get float component by index
	JPH_INLINE float operator [] (uint inCoordinate) const { JPH_ASSERT(inCoordinate < 3); return mF32[inCoordinate]; }

	/// Set float component by index
	JPH_INLINE void SetComponent(uint inCoordinate, float inValue) { JPH_ASSERT(inCoordinate < 3); mF32[inCoordinate] = inValue; mValue = sFixW(mValue); } // Assure Z and W are the same

	/// Comparison
	JPH_INLINE bool operator == (Vec3Arg inV2) const;
	JPH_INLINE bool operator != (Vec3Arg inV2) const { return !(*this == inV2); }

	/// Test if two vectors are close
	JPH_INLINE bool IsClose(Vec3Arg inV2, float inMaxDistSq = 1.0e-12f) const;

	/// Test if vector is near zero
	JPH_INLINE bool IsNearZero(float inMaxDistSq = 1.0e-12f) const;

	/// Test if vector is normalized
	JPH_INLINE bool IsNormalized(float inTolerance = 1.0e-6f) const;

	/// Test if vector contains NaN elements
	JPH_INLINE bool IsNaN() const;

	/// Multiply two float vectors (component wise)
	JPH_INLINE Vec3 operator * (Vec3Arg inV2) const;

	/// Multiply vector with float
	JPH_INLINE Vec3 operator * (float inV2) const;

	/// Multiply vector with float
	friend JPH_INLINE Vec3 operator * (float inV1, Vec3Arg inV2);

	/// Divide vector by float
	JPH_INLINE Vec3 operator / (float inV2) const;

	/// Multiply vector with float
	JPH_INLINE Vec3 & operator *= (float inV2);

	/// Multiply vector with vector
	JPH_INLINE Vec3 & operator *= (Vec3Arg inV2);

	/// Divide vector by float
	JPH_INLINE Vec3 & operator /= (float inV2);

	/// Add two float vectors (component wise)
	JPH_INLINE Vec3 operator + (Vec3Arg inV2) const;

	/// Add two float vectors (component wise)
	JPH_INLINE Vec3 & operator += (Vec3Arg inV2);

	/// Negate
	JPH_INLINE Vec3 operator - () const;

	/// Subtract two float vectors (component wise)
	JPH_INLINE Vec3 operator - (Vec3Arg inV2) const;

	/// Subtract two float vectors (component wise)
	JPH_INLINE Vec3 & operator -= (Vec3Arg inV2);

	/// Divide (component wise)
	JPH_INLINE Vec3 operator / (Vec3Arg inV2) const;

	/// Swizzle the elements in inV
	template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ>
	JPH_INLINE Vec3 Swizzle() const;

	/// Replicate the X component to all components
	JPH_INLINE Vec4 SplatX() const;

	/// Replicate the Y component to all components
	JPH_INLINE Vec4 SplatY() const;

	/// Replicate the Z component to all components
	JPH_INLINE Vec4 SplatZ() const;

	/// Get index of component with lowest value
	JPH_INLINE int GetLowestComponentIndex() const;

	/// Get index of component with highest value
	JPH_INLINE int GetHighestComponentIndex() const;

	/// Return the absolute value of each of the components
	JPH_INLINE Vec3 Abs() const;

	/// Reciprocal vector (1 / value) for each of the components
	JPH_INLINE Vec3 Reciprocal() const;

	/// Cross product
	JPH_INLINE Vec3 Cross(Vec3Arg inV2) const;

	/// Dot product, returns the dot product in X, Y and Z components
	JPH_INLINE Vec3 DotV(Vec3Arg inV2) const;

	/// Dot product, returns the dot product in X, Y, Z and W components
	JPH_INLINE Vec4 DotV4(Vec3Arg inV2) const;

	/// Dot product
	JPH_INLINE float Dot(Vec3Arg inV2) const;

	/// Squared length of vector
	JPH_INLINE float LengthSq() const;

	/// Length of vector
	JPH_INLINE float Length() const;

	/// Normalize vector
	JPH_INLINE Vec3 Normalized() const;

	/// Normalize vector or return inZeroValue if the length of the vector is zero
	JPH_INLINE Vec3 NormalizedOr(Vec3Arg inZeroValue) const;

	/// Store 3 floats to memory
	JPH_INLINE void StoreFloat3(Float3 *outV) const;

	/// Convert each component from a float to an int
	JPH_INLINE UVec4 ToInt() const;

	/// Reinterpret Vec3 as a UVec4 (doesn't change the bits)
	JPH_INLINE UVec4 ReinterpretAsInt() const;

	/// Get the minimum of X, Y and Z
	JPH_INLINE float ReduceMin() const;

	/// Get the maximum of X, Y and Z
	JPH_INLINE float ReduceMax() const;

	/// Component wise square root
	JPH_INLINE Vec3 Sqrt() const;

	/// Get normalized vector that is perpendicular to this vector
	JPH_INLINE Vec3 GetNormalizedPerpendicular() const;

	/// Get vector that contains the sign of each element (returns 1.0f if positive, -1.0f if negative)
	JPH_INLINE Vec3 GetSign() const;

	/// To String
	friend ostream & operator << (ostream &inStream, Vec3Arg inV)
	{
		inStream << inV.mF32[0] << ", " << inV.mF32[1] << ", " << inV.mF32[2];
		return inStream;
	}

	/// Internal helper function that checks that W is equal to Z, so e.g. dividing by it should not generate div by 0
	JPH_INLINE void CheckW() const;

	/// Internal helper function that ensures that the Z component is replicated to the W component to prevent divisions by zero
	static JPH_INLINE Type sFixW(Type inValue);

	// Storage: accessible both as the native SIMD register and as 4 floats (W mirrors Z, see class comment)
	union
	{
		Type mValue;
		float mF32[4];
	};
};
static_assert(std::is_trivial<Vec3>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
#include "Vec3.inl"

View File

@@ -0,0 +1,860 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Math/Vec4.h>
#include <Jolt/Math/UVec4.h>
#include <Jolt/Core/HashCombine.h>
JPH_SUPPRESS_WARNINGS_STD_BEGIN
#include <random>
JPH_SUPPRESS_WARNINGS_STD_END
// Create a std::hash/JPH::Hash for Vec3
JPH_MAKE_HASHABLE(JPH::Vec3, t.GetX(), t.GetY(), t.GetZ())
JPH_NAMESPACE_BEGIN
// Assert that the W component is bitwise equal to the Z component (bit compare so that two identical NaNs also pass)
void Vec3::CheckW() const
{
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	// Avoid asserts when both components are NaN
	JPH_ASSERT(reinterpret_cast<const uint32 *>(mF32)[2] == reinterpret_cast<const uint32 *>(mF32)[3]);
#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
}

// Replicate the Z component into the (unused) W component so that dividing by W cannot raise a div-by-zero exception
JPH_INLINE Vec3::Type Vec3::sFixW(Type inValue)
{
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(inValue, inValue, _MM_SHUFFLE(2, 2, 1, 0));
#elif defined(JPH_USE_NEON)
	return JPH_NEON_SHUFFLE_F32x4(inValue, inValue, 0, 1, 2, 2);
#else
	Type value;
	value.mData[0] = inValue.mData[0];
	value.mData[1] = inValue.mData[1];
	value.mData[2] = inValue.mData[2];
	value.mData[3] = inValue.mData[2];
	return value;
#endif
#else
	// Without floating point exceptions enabled the W component can be left as-is
	return inValue;
#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
}
// Convert a Vec4 to a Vec3, replicating Z into W
Vec3::Vec3(Vec4Arg inRHS) :
	mValue(sFixW(inRHS.mValue))
{
}

// Load 3 floats from a Float3 (safe: reads exactly 12 bytes)
Vec3::Vec3(const Float3 &inV)
{
#if defined(JPH_USE_SSE)
	Type x = _mm_load_ss(&inV.x);
	Type y = _mm_load_ss(&inV.y);
	Type z = _mm_load_ss(&inV.z);
	Type xy = _mm_unpacklo_ps(x, y);
	mValue = _mm_shuffle_ps(xy, z, _MM_SHUFFLE(0, 0, 1, 0)); // Assure Z and W are the same
#elif defined(JPH_USE_NEON)
	float32x2_t xy = vld1_f32(&inV.x);
	float32x2_t zz = vdup_n_f32(inV.z); // Assure Z and W are the same
	mValue = vcombine_f32(xy, zz);
#else
	mF32[0] = inV[0];
	mF32[1] = inV[1];
	mF32[2] = inV[2];
	mF32[3] = inV[2]; // Not strictly needed when JPH_FLOATING_POINT_EXCEPTIONS_ENABLED is off but prevents warnings about uninitialized variables
#endif
}

// Construct from 3 components; W is set equal to Z
Vec3::Vec3(float inX, float inY, float inZ)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_set_ps(inZ, inZ, inY, inX);
#elif defined(JPH_USE_NEON)
	// Pack the X and Y bit patterns into a 64 bit half, duplicate Z into the other half
	uint32x2_t xy = vcreate_u32(static_cast<uint64>(BitCast<uint32>(inX)) | (static_cast<uint64>(BitCast<uint32>(inY)) << 32));
	uint32x2_t zz = vreinterpret_u32_f32(vdup_n_f32(inZ));
	mValue = vreinterpretq_f32_u32(vcombine_u32(xy, zz));
#else
	mF32[0] = inX;
	mF32[1] = inY;
	mF32[2] = inZ;
	mF32[3] = inZ; // Not strictly needed when JPH_FLOATING_POINT_EXCEPTIONS_ENABLED is off but prevents warnings about uninitialized variables
#endif
}
// Reorder components according to the compile time swizzle indices; W copies the resulting Z lane
template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ>
Vec3 Vec3::Swizzle() const
{
	static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
	static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
	static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");

#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(SwizzleZ, SwizzleZ, SwizzleY, SwizzleX)); // Assure Z and W are the same
#elif defined(JPH_USE_NEON)
	return JPH_NEON_SHUFFLE_F32x4(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleZ);
#else
	return Vec3(mF32[SwizzleX], mF32[SwizzleY], mF32[SwizzleZ]);
#endif
}
// All components zero
Vec3 Vec3::sZero()
{
#if defined(JPH_USE_SSE)
	return _mm_setzero_ps();
#elif defined(JPH_USE_NEON)
	return vdupq_n_f32(0);
#else
	return Vec3(0, 0, 0);
#endif
}

// Broadcast inV into all components
Vec3 Vec3::sReplicate(float inV)
{
#if defined(JPH_USE_SSE)
	return _mm_set1_ps(inV);
#elif defined(JPH_USE_NEON)
	return vdupq_n_f32(inV);
#else
	return Vec3(inV, inV, inV);
#endif
}

// All components one
Vec3 Vec3::sOne()
{
	return sReplicate(1.0f);
}

// All components quiet NaN (useful to detect use of uninitialized data)
Vec3 Vec3::sNaN()
{
	return sReplicate(numeric_limits<float>::quiet_NaN());
}

// Load 3 floats; reads 4 floats from memory so the source must have at least 4 bytes of padding after it
Vec3 Vec3::sLoadFloat3Unsafe(const Float3 &inV)
{
#if defined(JPH_USE_SSE)
	Type v = _mm_loadu_ps(&inV.x);
#elif defined(JPH_USE_NEON)
	Type v = vld1q_f32(&inV.x);
#else
	Type v = { inV.x, inV.y, inV.z };
#endif
	return sFixW(v);
}
// Component wise minimum
Vec3 Vec3::sMin(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_min_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vminq_f32(inV1.mValue, inV2.mValue);
#else
	return Vec3(min(inV1.mF32[0], inV2.mF32[0]),
				min(inV1.mF32[1], inV2.mF32[1]),
				min(inV1.mF32[2], inV2.mF32[2]));
#endif
}

// Component wise maximum
Vec3 Vec3::sMax(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_max_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmaxq_f32(inV1.mValue, inV2.mValue);
#else
	return Vec3(max(inV1.mF32[0], inV2.mF32[0]),
				max(inV1.mF32[1], inV2.mF32[1]),
				max(inV1.mF32[2], inV2.mF32[2]));
#endif
}

// Clamp each component of inV to [inMin, inMax]
Vec3 Vec3::sClamp(Vec3Arg inV, Vec3Arg inMin, Vec3Arg inMax)
{
	return sMax(sMin(inV, inMax), inMin);
}
// Component wise ==, result components are all-ones (true) or zero (false); W mirrors Z so that all-true tests also pass on W
UVec4 Vec3::sEquals(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpeq_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vceqq_f32(inV1.mValue, inV2.mValue);
#else
	uint32 z = inV1.mF32[2] == inV2.mF32[2]? 0xffffffffu : 0;
	return UVec4(inV1.mF32[0] == inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] == inV2.mF32[1]? 0xffffffffu : 0,
				 z,
				 z);
#endif
}

// Component wise <
UVec4 Vec3::sLess(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmplt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcltq_f32(inV1.mValue, inV2.mValue);
#else
	uint32 z = inV1.mF32[2] < inV2.mF32[2]? 0xffffffffu : 0;
	return UVec4(inV1.mF32[0] < inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] < inV2.mF32[1]? 0xffffffffu : 0,
				 z,
				 z);
#endif
}

// Component wise <=
UVec4 Vec3::sLessOrEqual(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmple_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcleq_f32(inV1.mValue, inV2.mValue);
#else
	uint32 z = inV1.mF32[2] <= inV2.mF32[2]? 0xffffffffu : 0;
	return UVec4(inV1.mF32[0] <= inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] <= inV2.mF32[1]? 0xffffffffu : 0,
				 z,
				 z);
#endif
}

// Component wise >
UVec4 Vec3::sGreater(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpgt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcgtq_f32(inV1.mValue, inV2.mValue);
#else
	uint32 z = inV1.mF32[2] > inV2.mF32[2]? 0xffffffffu : 0;
	return UVec4(inV1.mF32[0] > inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] > inV2.mF32[1]? 0xffffffffu : 0,
				 z,
				 z);
#endif
}

// Component wise >=
UVec4 Vec3::sGreaterOrEqual(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpge_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcgeq_f32(inV1.mValue, inV2.mValue);
#else
	uint32 z = inV1.mF32[2] >= inV2.mF32[2]? 0xffffffffu : 0;
	return UVec4(inV1.mF32[0] >= inV2.mF32[0]? 0xffffffffu : 0,
				 inV1.mF32[1] >= inV2.mF32[1]? 0xffffffffu : 0,
				 z,
				 z);
#endif
}
// inMul1 * inMul2 + inAdd, using a single fused instruction where the target supports it
Vec3 Vec3::sFusedMultiplyAdd(Vec3Arg inMul1, Vec3Arg inMul2, Vec3Arg inAdd)
{
#if defined(JPH_USE_SSE)
#ifdef JPH_USE_FMADD
	return _mm_fmadd_ps(inMul1.mValue, inMul2.mValue, inAdd.mValue);
#else
	return _mm_add_ps(_mm_mul_ps(inMul1.mValue, inMul2.mValue), inAdd.mValue);
#endif
#elif defined(JPH_USE_NEON)
	return vmlaq_f32(inAdd.mValue, inMul1.mValue, inMul2.mValue);
#else
	return Vec3(inMul1.mF32[0] * inMul2.mF32[0] + inAdd.mF32[0],
				inMul1.mF32[1] * inMul2.mF32[1] + inAdd.mF32[1],
				inMul1.mF32[2] * inMul2.mF32[2] + inAdd.mF32[2]);
#endif
}

// Per component: highest bit of inControl set selects inSet, otherwise inNotSet; W is fixed up to mirror Z afterwards
Vec3 Vec3::sSelect(Vec3Arg inNotSet, Vec3Arg inSet, UVec4Arg inControl)
{
#if defined(JPH_USE_SSE4_1) && !defined(JPH_PLATFORM_WASM) // _mm_blendv_ps has problems on FireFox
	Type v = _mm_blendv_ps(inNotSet.mValue, inSet.mValue, _mm_castsi128_ps(inControl.mValue));
	return sFixW(v);
#elif defined(JPH_USE_SSE)
	// Broadcast the sign bit of each control lane over the whole lane, then mask-merge
	__m128 is_set = _mm_castsi128_ps(_mm_srai_epi32(inControl.mValue, 31));
	Type v = _mm_or_ps(_mm_and_ps(is_set, inSet.mValue), _mm_andnot_ps(is_set, inNotSet.mValue));
	return sFixW(v);
#elif defined(JPH_USE_NEON)
	Type v = vbslq_f32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.mValue), 31)), inSet.mValue, inNotSet.mValue);
	return sFixW(v);
#else
	Vec3 result;
	for (int i = 0; i < 3; i++)
		result.mF32[i] = (inControl.mU32[i] & 0x80000000u) ? inSet.mF32[i] : inNotSet.mF32[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	result.mF32[3] = result.mF32[2];
#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	return result;
#endif
}
// Bitwise OR of the float bit patterns
Vec3 Vec3::sOr(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_or_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
	return Vec3(UVec4::sOr(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat());
#endif
}

// Bitwise XOR of the float bit patterns
Vec3 Vec3::sXor(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_xor_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
	return Vec3(UVec4::sXor(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat());
#endif
}

// Bitwise AND of the float bit patterns
Vec3 Vec3::sAnd(Vec3Arg inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_and_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
	return Vec3(UVec4::sAnd(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat());
#endif
}

// Unit vector from spherical coordinates: (sin(theta) cos(phi), sin(theta) sin(phi), cos(theta))
Vec3 Vec3::sUnitSpherical(float inTheta, float inPhi)
{
	Vec4 s, c;
	Vec4(inTheta, inPhi, 0, 0).SinCos(s, c);
	return Vec3(s.GetX() * c.GetY(), s.GetX() * s.GetY(), c.GetX());
}
// Random unit vector
// NOTE(review): theta is sampled uniformly in [0, pi], which biases points towards the poles
// compared to an area-uniform sphere distribution (that would need cos(theta) uniform) — confirm this is intended
template <class Random>
Vec3 Vec3::sRandom(Random &inRandom)
{
	std::uniform_real_distribution<float> zero_to_one(0.0f, 1.0f);
	float theta = JPH_PI * zero_to_one(inRandom);
	float phi = 2.0f * JPH_PI * zero_to_one(inRandom);
	return sUnitSpherical(theta, phi);
}

// True when all of X, Y and Z are equal (exact float compare)
bool Vec3::operator == (Vec3Arg inV2) const
{
	return sEquals(*this, inV2).TestAllXYZTrue();
}

// True when the squared distance between the vectors is at most inMaxDistSq
bool Vec3::IsClose(Vec3Arg inV2, float inMaxDistSq) const
{
	return (inV2 - *this).LengthSq() <= inMaxDistSq;
}

// True when the squared length is at most inMaxDistSq
bool Vec3::IsNearZero(float inMaxDistSq) const
{
	return LengthSq() <= inMaxDistSq;
}
// Component wise multiply
Vec3 Vec3::operator * (Vec3Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmulq_f32(mValue, inV2.mValue);
#else
	return Vec3(mF32[0] * inV2.mF32[0], mF32[1] * inV2.mF32[1], mF32[2] * inV2.mF32[2]);
#endif
}

// Scale by a scalar
Vec3 Vec3::operator * (float inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	return vmulq_n_f32(mValue, inV2);
#else
	return Vec3(mF32[0] * inV2, mF32[1] * inV2, mF32[2] * inV2);
#endif
}

// Scale by a scalar (scalar on the left)
Vec3 operator * (float inV1, Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(_mm_set1_ps(inV1), inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmulq_n_f32(inV2.mValue, inV1);
#else
	return Vec3(inV1 * inV2.mF32[0], inV1 * inV2.mF32[1], inV1 * inV2.mF32[2]);
#endif
}

// Divide by a scalar
Vec3 Vec3::operator / (float inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	return vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
	return Vec3(mF32[0] / inV2, mF32[1] / inV2, mF32[2] / inV2);
#endif
}
// In-place scale by a scalar
Vec3 &Vec3::operator *= (float inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	mValue = vmulq_n_f32(mValue, inV2);
#else
	for (int i = 0; i < 3; ++i)
		mF32[i] *= inV2;
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF32[3] = mF32[2]; // Keep W in sync with Z
#endif
#endif
	return *this;
}

// In-place component wise multiply
Vec3 &Vec3::operator *= (Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vmulq_f32(mValue, inV2.mValue);
#else
	for (int i = 0; i < 3; ++i)
		mF32[i] *= inV2.mF32[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF32[3] = mF32[2]; // Keep W in sync with Z
#endif
#endif
	return *this;
}

// In-place divide by a scalar
Vec3 &Vec3::operator /= (float inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	mValue = vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
	for (int i = 0; i < 3; ++i)
		mF32[i] /= inV2;
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF32[3] = mF32[2]; // Keep W in sync with Z
#endif
#endif
	return *this;
}
// Component wise add
Vec3 Vec3::operator + (Vec3Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vaddq_f32(mValue, inV2.mValue);
#else
	return Vec3(mF32[0] + inV2.mF32[0], mF32[1] + inV2.mF32[1], mF32[2] + inV2.mF32[2]);
#endif
}

// In-place component wise add
Vec3 &Vec3::operator += (Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vaddq_f32(mValue, inV2.mValue);
#else
	for (int i = 0; i < 3; ++i)
		mF32[i] += inV2.mF32[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF32[3] = mF32[2]; // Keep W in sync with Z
#endif
#endif
	return *this;
}

// Negation; the deterministic variant uses 0 - x so that negating 0 yields +0 on all platforms
Vec3 Vec3::operator - () const
{
#if defined(JPH_USE_SSE)
	return _mm_sub_ps(_mm_setzero_ps(), mValue);
#elif defined(JPH_USE_NEON)
	#ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
	return vsubq_f32(vdupq_n_f32(0), mValue);
	#else
	return vnegq_f32(mValue);
	#endif
#else
	#ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
	return Vec3(0.0f - mF32[0], 0.0f - mF32[1], 0.0f - mF32[2]);
	#else
	return Vec3(-mF32[0], -mF32[1], -mF32[2]);
	#endif
#endif
}
// Component wise subtract
Vec3 Vec3::operator - (Vec3Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vsubq_f32(mValue, inV2.mValue);
#else
	return Vec3(mF32[0] - inV2.mF32[0], mF32[1] - inV2.mF32[1], mF32[2] - inV2.mF32[2]);
#endif
}

// In-place component wise subtract
Vec3 &Vec3::operator -= (Vec3Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vsubq_f32(mValue, inV2.mValue);
#else
	for (int i = 0; i < 3; ++i)
		mF32[i] -= inV2.mF32[i];
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	mF32[3] = mF32[2]; // Keep W in sync with Z
#endif
#endif
	return *this;
}

// Component wise divide; the W lanes are also divided, hence the CheckW guard on the divisor
Vec3 Vec3::operator / (Vec3Arg inV2) const
{
	inV2.CheckW(); // Check W equals Z to avoid div by zero
#if defined(JPH_USE_SSE)
	return _mm_div_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vdivq_f32(mValue, inV2.mValue);
#else
	return Vec3(mF32[0] / inV2.mF32[0], mF32[1] / inV2.mF32[1], mF32[2] / inV2.mF32[2]);
#endif
}
// Broadcast the X component to all 4 lanes of a Vec4
Vec4 Vec3::SplatX() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 0);
#else
	return Vec4(mF32[0], mF32[0], mF32[0], mF32[0]);
#endif
}

// Broadcast the Y component to all 4 lanes of a Vec4
Vec4 Vec3::SplatY() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 1);
#else
	return Vec4(mF32[1], mF32[1], mF32[1], mF32[1]);
#endif
}

// Broadcast the Z component to all 4 lanes of a Vec4
Vec4 Vec3::SplatZ() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 2);
#else
	return Vec4(mF32[2], mF32[2], mF32[2], mF32[2]);
#endif
}
int Vec3::GetLowestComponentIndex() const
{
return GetX() < GetY() ? (GetZ() < GetX() ? 2 : 0) : (GetZ() < GetY() ? 2 : 1);
}
int Vec3::GetHighestComponentIndex() const
{
return GetX() > GetY() ? (GetZ() > GetX() ? 2 : 0) : (GetZ() > GetY() ? 2 : 1);
}
// Component wise absolute value
Vec3 Vec3::Abs() const
{
#if defined(JPH_USE_AVX512)
	return _mm_range_ps(mValue, mValue, 0b1000);
#elif defined(JPH_USE_SSE)
	// max(-x, x) clears the sign bit for all finite values
	return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), mValue), mValue);
#elif defined(JPH_USE_NEON)
	return vabsq_f32(mValue);
#else
	return Vec3(abs(mF32[0]), abs(mF32[1]), abs(mF32[2]));
#endif
}

// Component wise 1 / value; relies on W mirroring Z so the W lane never divides by zero
Vec3 Vec3::Reciprocal() const
{
	return sOne() / mValue;
}
// Cross product, computed via the standard shuffle trick: (a.yzx * b - a * b.yzx).yzx
Vec3 Vec3::Cross(Vec3Arg inV2) const
{
#if defined(JPH_USE_SSE)
	Type t1 = _mm_shuffle_ps(inV2.mValue, inV2.mValue, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
	t1 = _mm_mul_ps(t1, mValue);
	Type t2 = _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
	t2 = _mm_mul_ps(t2, inV2.mValue);
	Type t3 = _mm_sub_ps(t1, t2);
	return _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(0, 0, 2, 1)); // Assure Z and W are the same
#elif defined(JPH_USE_NEON)
	Type t1 = JPH_NEON_SHUFFLE_F32x4(inV2.mValue, inV2.mValue, 1, 2, 0, 0); // Assure Z and W are the same
	t1 = vmulq_f32(t1, mValue);
	Type t2 = JPH_NEON_SHUFFLE_F32x4(mValue, mValue, 1, 2, 0, 0); // Assure Z and W are the same
	t2 = vmulq_f32(t2, inV2.mValue);
	Type t3 = vsubq_f32(t1, t2);
	return JPH_NEON_SHUFFLE_F32x4(t3, t3, 1, 2, 0, 0); // Assure Z and W are the same
#else
	return Vec3(mF32[1] * inV2.mF32[2] - mF32[2] * inV2.mF32[1],
				mF32[2] * inV2.mF32[0] - mF32[0] * inV2.mF32[2],
				mF32[0] * inV2.mF32[1] - mF32[1] * inV2.mF32[0]);
#endif
}
// 3-component dot product, broadcast into all components of a Vec3
Vec3 Vec3::DotV(Vec3Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_dp_ps(mValue, inV2.mValue, 0x7f); // Mask 0x7f: multiply XYZ lanes only, write result to all lanes
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	mul = vsetq_lane_f32(0, mul, 3); // Zero the W lane so it doesn't contribute to the sum
	return vdupq_n_f32(vaddvq_f32(mul));
#else
	float dot = 0.0f;
	for (int i = 0; i < 3; i++)
		dot += mF32[i] * inV2.mF32[i];
	return Vec3::sReplicate(dot);
#endif
}

// 3-component dot product, broadcast into all components of a Vec4
Vec4 Vec3::DotV4(Vec3Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_dp_ps(mValue, inV2.mValue, 0x7f);
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	mul = vsetq_lane_f32(0, mul, 3);
	return vdupq_n_f32(vaddvq_f32(mul));
#else
	float dot = 0.0f;
	for (int i = 0; i < 3; i++)
		dot += mF32[i] * inV2.mF32[i];
	return Vec4::sReplicate(dot);
#endif
}

// 3-component dot product as a scalar
float Vec3::Dot(Vec3Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0x7f));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	mul = vsetq_lane_f32(0, mul, 3);
	return vaddvq_f32(mul);
#else
	float dot = 0.0f;
	for (int i = 0; i < 3; i++)
		dot += mF32[i] * inV2.mF32[i];
	return dot;
#endif
}
// Squared length (dot product with itself)
float Vec3::LengthSq() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_dp_ps(mValue, mValue, 0x7f));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	mul = vsetq_lane_f32(0, mul, 3); // Zero the W lane so it doesn't contribute to the sum
	return vaddvq_f32(mul);
#else
	float len_sq = 0.0f;
	for (int i = 0; i < 3; i++)
		len_sq += mF32[i] * mF32[i];
	return len_sq;
#endif
}

// Length: sqrt of the squared length
float Vec3::Length() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(mValue, mValue, 0x7f)));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	mul = vsetq_lane_f32(0, mul, 3);
	float32x2_t sum = vdup_n_f32(vaddvq_f32(mul));
	return vget_lane_f32(vsqrt_f32(sum), 0);
#else
	return sqrt(LengthSq());
#endif
}
// Component wise square root
Vec3 Vec3::Sqrt() const
{
#if defined(JPH_USE_SSE)
	return _mm_sqrt_ps(mValue);
#elif defined(JPH_USE_NEON)
	return vsqrtq_f32(mValue);
#else
	return Vec3(sqrt(mF32[0]), sqrt(mF32[1]), sqrt(mF32[2]));
#endif
}

// Normalize; result is undefined (NaN/Inf) for a zero length vector, use NormalizedOr in that case
Vec3 Vec3::Normalized() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_div_ps(mValue, _mm_sqrt_ps(_mm_dp_ps(mValue, mValue, 0x7f)))
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	mul = vsetq_lane_f32(0, mul, 3);
	float32x4_t sum = vdupq_n_f32(vaddvq_f32(mul));
	return vdivq_f32(mValue, vsqrtq_f32(sum));
#else
	return *this / Length();
#endif
}
// Normalize, returning inZeroValue when the length is (near) zero
Vec3 Vec3::NormalizedOr(Vec3Arg inZeroValue) const
{
#if defined(JPH_USE_SSE4_1) && !defined(JPH_PLATFORM_WASM) // _mm_blendv_ps has problems on FireFox
	Type len_sq = _mm_dp_ps(mValue, mValue, 0x7f);
	// clang with '-ffast-math' (which you should not use!) can generate _mm_rsqrt_ps
	// instructions which produce INFs/NaNs when they get a denormal float as input.
	// We therefore treat denormals as zero here.
	Type is_zero = _mm_cmple_ps(len_sq, _mm_set1_ps(FLT_MIN));
#ifdef JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
	// With FP exceptions enabled we may not execute the division when the length is zero, so branch instead of blending
	if (_mm_movemask_ps(is_zero) == 0xf)
		return inZeroValue;
	else
		return _mm_div_ps(mValue, _mm_sqrt_ps(len_sq));
#else
	return _mm_blendv_ps(_mm_div_ps(mValue, _mm_sqrt_ps(len_sq)), inZeroValue.mValue, is_zero);
#endif // JPH_FLOATING_POINT_EXCEPTIONS_ENABLED
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	mul = vsetq_lane_f32(0, mul, 3);
	float32x4_t len_sq = vdupq_n_f32(vaddvq_f32(mul));
	uint32x4_t is_zero = vcleq_f32(len_sq, vdupq_n_f32(FLT_MIN));
	return vbslq_f32(is_zero, inZeroValue.mValue, vdivq_f32(mValue, vsqrtq_f32(len_sq)));
#else
	float len_sq = LengthSq();
	if (len_sq <= FLT_MIN) // Treat denormals as zero, see SSE path for rationale
		return inZeroValue;
	else
		return *this / sqrt(len_sq);
#endif
}
// True when the squared length is within inTolerance of 1
bool Vec3::IsNormalized(float inTolerance) const
{
	return abs(LengthSq() - 1.0f) <= inTolerance;
}

// True when any of X, Y or Z is NaN (the W lane is ignored)
bool Vec3::IsNaN() const
{
#if defined(JPH_USE_AVX512)
	return (_mm_fpclass_ps_mask(mValue, 0b10000001) & 0x7) != 0;
#elif defined(JPH_USE_SSE)
	return (_mm_movemask_ps(_mm_cmpunord_ps(mValue, mValue)) & 0x7) != 0;
#elif defined(JPH_USE_NEON)
	uint32x4_t mask = JPH_NEON_UINT32x4(1, 1, 1, 0);
	uint32x4_t is_equal = vceqq_f32(mValue, mValue); // If a number is not equal to itself it's a NaN
	return vaddvq_u32(vandq_u32(is_equal, mask)) != 3;
#else
	return isnan(mF32[0]) || isnan(mF32[1]) || isnan(mF32[2]);
#endif
}
// Store X, Y and Z to memory (writes exactly 12 bytes)
void Vec3::StoreFloat3(Float3 *outV) const
{
#if defined(JPH_USE_SSE)
	_mm_store_ss(&outV->x, mValue);
	Vec3 t = Swizzle<SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_UNUSED>();
	_mm_store_ss(&outV->y, t.mValue);
	t = t.Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_UNUSED>();
	_mm_store_ss(&outV->z, t.mValue);
#elif defined(JPH_USE_NEON)
	float32x2_t xy = vget_low_f32(mValue);
	vst1_f32(&outV->x, xy);
	vst1q_lane_f32(&outV->z, mValue, 2);
#else
	outV->x = mF32[0];
	outV->y = mF32[1];
	outV->z = mF32[2];
#endif
}

// Truncate each component to an integer
UVec4 Vec3::ToInt() const
{
#if defined(JPH_USE_SSE)
	return _mm_cvttps_epi32(mValue);
#elif defined(JPH_USE_NEON)
	return vcvtq_u32_f32(mValue);
#else
	return UVec4(uint32(mF32[0]), uint32(mF32[1]), uint32(mF32[2]), uint32(mF32[3]));
#endif
}
UVec4 Vec3::ReinterpretAsInt() const
{
	// Bitwise reinterpretation of the 128-bit register, no numeric conversion takes place
#if defined(JPH_USE_SSE)
	return UVec4(_mm_castps_si128(mValue));
#elif defined(JPH_USE_NEON)
	return vreinterpretq_u32_f32(mValue);
#else
	return *reinterpret_cast<const UVec4 *>(this);
#endif
}
float Vec3::ReduceMin() const
{
	// Fold Y into X, then Z into X, then read the X lane: min(min(x, y), z)
	Vec3 v = sMin(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_Z>());
	v = sMin(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
	return v.GetX();
}
float Vec3::ReduceMax() const
{
	// Fold Y into X, then Z into X, then read the X lane: max(max(x, y), z)
	Vec3 v = sMax(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_Z>());
	v = sMax(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
	return v.GetX();
}
Vec3 Vec3::GetNormalizedPerpendicular() const
{
	// Build a perpendicular by zeroing one component and swapping the other two with a sign flip
	// (dot product with *this is zero by construction), branching on the larger of |X| and |Y|
	// to avoid dividing by a tiny length.
	// NOTE(review): a zero input vector makes len 0 and the division produce NaN - callers must pass a non-zero vector.
	if (abs(mF32[0]) > abs(mF32[1]))
	{
		float len = sqrt(mF32[0] * mF32[0] + mF32[2] * mF32[2]);
		return Vec3(mF32[2], 0.0f, -mF32[0]) / len;
	}
	else
	{
		float len = sqrt(mF32[1] * mF32[1] + mF32[2] * mF32[2]);
		return Vec3(0.0f, mF32[2], -mF32[1]) / len;
	}
}
Vec3 Vec3::GetSign() const
{
	// Per component: 1.0f when the sign bit is clear (incl. +0), -1.0f when it is set (incl. -0)
#if defined(JPH_USE_AVX512)
	// Fixup lookup table maps each float class directly onto +1 / -1
	return _mm_fixupimm_ps(mValue, mValue, _mm_set1_epi32(0xA9A90A00), 0);
#elif defined(JPH_USE_SSE)
	// AND with the bit pattern of -1.0f keeps the sign bit, OR with 1.0f forces the magnitude to one
	Type minus_one = _mm_set1_ps(-1.0f);
	Type one = _mm_set1_ps(1.0f);
	return _mm_or_ps(_mm_and_ps(mValue, minus_one), one);
#elif defined(JPH_USE_NEON)
	// Same sign-bit trick as the SSE path, done through integer reinterpretation
	Type minus_one = vdupq_n_f32(-1.0f);
	Type one = vdupq_n_f32(1.0f);
	return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(mValue), vreinterpretq_u32_f32(minus_one)), vreinterpretq_u32_f32(one)));
#else
	return Vec3(std::signbit(mF32[0])? -1.0f : 1.0f,
	std::signbit(mF32[1])? -1.0f : 1.0f,
	std::signbit(mF32[2])? -1.0f : 1.0f);
#endif
}
JPH_NAMESPACE_END

286
thirdparty/jolt_physics/Jolt/Math/Vec4.h vendored Normal file
View File

@@ -0,0 +1,286 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
#include <Jolt/Math/Float4.h>
#include <Jolt/Math/Swizzle.h>
#include <Jolt/Math/MathTypes.h>
JPH_NAMESPACE_BEGIN
/// A 4 component vector of floats, backed by a 16-byte aligned SIMD register where available
class [[nodiscard]] alignas(JPH_VECTOR_ALIGNMENT) Vec4
{
public:
	JPH_OVERRIDE_NEW_DELETE

	// Underlying vector type (native SIMD register when SSE/NEON is enabled, plain floats otherwise)
#if defined(JPH_USE_SSE)
	using Type = __m128;
#elif defined(JPH_USE_NEON)
	using Type = float32x4_t;
#else
	using Type = struct { float mData[4]; };
#endif

	/// Constructor
	Vec4() = default; ///< Intentionally not initialized for performance reasons
	Vec4(const Vec4 &inRHS) = default;
	Vec4 & operator = (const Vec4 &inRHS) = default;
	explicit JPH_INLINE Vec4(Vec3Arg inRHS); ///< WARNING: W component undefined!
	JPH_INLINE Vec4(Vec3Arg inRHS, float inW);
	JPH_INLINE Vec4(Type inRHS) : mValue(inRHS) { }

	/// Create a vector from 4 components
	JPH_INLINE Vec4(float inX, float inY, float inZ, float inW);

	/// Vector with all zeros
	static JPH_INLINE Vec4 sZero();

	/// Vector with all ones
	static JPH_INLINE Vec4 sOne();

	/// Vector with all NaN's
	static JPH_INLINE Vec4 sNaN();

	/// Replicate inV across all components
	static JPH_INLINE Vec4 sReplicate(float inV);

	/// Load 4 floats from memory
	static JPH_INLINE Vec4 sLoadFloat4(const Float4 *inV);

	/// Load 4 floats from memory, 16 bytes aligned
	static JPH_INLINE Vec4 sLoadFloat4Aligned(const Float4 *inV);

	/// Gather 4 floats from memory at inBase + inOffsets[i] * Scale
	template <const int Scale>
	static JPH_INLINE Vec4 sGatherFloat4(const float *inBase, UVec4Arg inOffsets);

	/// Return the minimum value of each of the components
	static JPH_INLINE Vec4 sMin(Vec4Arg inV1, Vec4Arg inV2);

	/// Return the maximum of each of the components
	static JPH_INLINE Vec4 sMax(Vec4Arg inV1, Vec4Arg inV2);

	/// Equals (component wise)
	static JPH_INLINE UVec4 sEquals(Vec4Arg inV1, Vec4Arg inV2);

	/// Less than (component wise)
	static JPH_INLINE UVec4 sLess(Vec4Arg inV1, Vec4Arg inV2);

	/// Less than or equal (component wise)
	static JPH_INLINE UVec4 sLessOrEqual(Vec4Arg inV1, Vec4Arg inV2);

	/// Greater than (component wise)
	static JPH_INLINE UVec4 sGreater(Vec4Arg inV1, Vec4Arg inV2);

	/// Greater than or equal (component wise)
	static JPH_INLINE UVec4 sGreaterOrEqual(Vec4Arg inV1, Vec4Arg inV2);

	/// Calculates inMul1 * inMul2 + inAdd
	static JPH_INLINE Vec4 sFusedMultiplyAdd(Vec4Arg inMul1, Vec4Arg inMul2, Vec4Arg inAdd);

	/// Component wise select, returns inNotSet when highest bit of inControl = 0 and inSet when highest bit of inControl = 1
	static JPH_INLINE Vec4 sSelect(Vec4Arg inNotSet, Vec4Arg inSet, UVec4Arg inControl);

	/// Logical or (component wise)
	static JPH_INLINE Vec4 sOr(Vec4Arg inV1, Vec4Arg inV2);

	/// Logical xor (component wise)
	static JPH_INLINE Vec4 sXor(Vec4Arg inV1, Vec4Arg inV2);

	/// Logical and (component wise)
	static JPH_INLINE Vec4 sAnd(Vec4Arg inV1, Vec4Arg inV2);

	/// Sort the four elements of ioValue and sort ioIndex at the same time.
	/// Based on a sorting network: http://en.wikipedia.org/wiki/Sorting_network
	static JPH_INLINE void sSort4(Vec4 &ioValue, UVec4 &ioIndex);

	/// Reverse sort the four elements of ioValue (highest first) and sort ioIndex at the same time.
	/// Based on a sorting network: http://en.wikipedia.org/wiki/Sorting_network
	static JPH_INLINE void sSort4Reverse(Vec4 &ioValue, UVec4 &ioIndex);

	/// Get individual components
#if defined(JPH_USE_SSE)
	JPH_INLINE float GetX() const { return _mm_cvtss_f32(mValue); }
	JPH_INLINE float GetY() const { return mF32[1]; }
	JPH_INLINE float GetZ() const { return mF32[2]; }
	JPH_INLINE float GetW() const { return mF32[3]; }
#elif defined(JPH_USE_NEON)
	JPH_INLINE float GetX() const { return vgetq_lane_f32(mValue, 0); }
	JPH_INLINE float GetY() const { return vgetq_lane_f32(mValue, 1); }
	JPH_INLINE float GetZ() const { return vgetq_lane_f32(mValue, 2); }
	JPH_INLINE float GetW() const { return vgetq_lane_f32(mValue, 3); }
#else
	JPH_INLINE float GetX() const { return mF32[0]; }
	JPH_INLINE float GetY() const { return mF32[1]; }
	JPH_INLINE float GetZ() const { return mF32[2]; }
	JPH_INLINE float GetW() const { return mF32[3]; }
#endif

	/// Set individual components
	JPH_INLINE void SetX(float inX) { mF32[0] = inX; }
	JPH_INLINE void SetY(float inY) { mF32[1] = inY; }
	JPH_INLINE void SetZ(float inZ) { mF32[2] = inZ; }
	JPH_INLINE void SetW(float inW) { mF32[3] = inW; }

	/// Set all components
	JPH_INLINE void Set(float inX, float inY, float inZ, float inW) { *this = Vec4(inX, inY, inZ, inW); }

	/// Get float component by index
	JPH_INLINE float operator [] (uint inCoordinate) const { JPH_ASSERT(inCoordinate < 4); return mF32[inCoordinate]; }
	JPH_INLINE float & operator [] (uint inCoordinate) { JPH_ASSERT(inCoordinate < 4); return mF32[inCoordinate]; }

	/// Comparison
	JPH_INLINE bool operator == (Vec4Arg inV2) const;
	JPH_INLINE bool operator != (Vec4Arg inV2) const { return !(*this == inV2); }

	/// Test if two vectors are close
	JPH_INLINE bool IsClose(Vec4Arg inV2, float inMaxDistSq = 1.0e-12f) const;

	/// Test if vector is normalized
	JPH_INLINE bool IsNormalized(float inTolerance = 1.0e-6f) const;

	/// Test if vector contains NaN elements
	JPH_INLINE bool IsNaN() const;

	/// Multiply two float vectors (component wise)
	JPH_INLINE Vec4 operator * (Vec4Arg inV2) const;

	/// Multiply vector with float
	JPH_INLINE Vec4 operator * (float inV2) const;

	/// Multiply vector with float
	friend JPH_INLINE Vec4 operator * (float inV1, Vec4Arg inV2);

	/// Divide vector by float
	JPH_INLINE Vec4 operator / (float inV2) const;

	/// Multiply vector with float
	JPH_INLINE Vec4 & operator *= (float inV2);

	/// Multiply vector with vector
	JPH_INLINE Vec4 & operator *= (Vec4Arg inV2);

	/// Divide vector by float
	JPH_INLINE Vec4 & operator /= (float inV2);

	/// Add two float vectors (component wise)
	JPH_INLINE Vec4 operator + (Vec4Arg inV2) const;

	/// Add two float vectors (component wise)
	JPH_INLINE Vec4 & operator += (Vec4Arg inV2);

	/// Negate
	JPH_INLINE Vec4 operator - () const;

	/// Subtract two float vectors (component wise)
	JPH_INLINE Vec4 operator - (Vec4Arg inV2) const;

	/// Subtract two float vectors (component wise)
	JPH_INLINE Vec4 & operator -= (Vec4Arg inV2);

	/// Divide (component wise)
	JPH_INLINE Vec4 operator / (Vec4Arg inV2) const;

	/// Swizzle the elements in inV
	template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
	JPH_INLINE Vec4 Swizzle() const;

	/// Replicate the X component to all components
	JPH_INLINE Vec4 SplatX() const;

	/// Replicate the Y component to all components
	JPH_INLINE Vec4 SplatY() const;

	/// Replicate the Z component to all components
	JPH_INLINE Vec4 SplatZ() const;

	/// Replicate the W component to all components
	JPH_INLINE Vec4 SplatW() const;

	/// Return the absolute value of each of the components
	JPH_INLINE Vec4 Abs() const;

	/// Reciprocal vector (1 / value) for each of the components
	JPH_INLINE Vec4 Reciprocal() const;

	/// Dot product, returns the dot product in X, Y and Z components
	JPH_INLINE Vec4 DotV(Vec4Arg inV2) const;

	/// Dot product
	JPH_INLINE float Dot(Vec4Arg inV2) const;

	/// Squared length of vector
	JPH_INLINE float LengthSq() const;

	/// Length of vector
	JPH_INLINE float Length() const;

	/// Normalize vector
	JPH_INLINE Vec4 Normalized() const;

	/// Store 4 floats to memory
	JPH_INLINE void StoreFloat4(Float4 *outV) const;

	/// Convert each component from a float to an int
	JPH_INLINE UVec4 ToInt() const;

	/// Reinterpret Vec4 as a UVec4 (doesn't change the bits)
	JPH_INLINE UVec4 ReinterpretAsInt() const;

	/// Store if X is negative in bit 0, Y in bit 1, Z in bit 2 and W in bit 3
	JPH_INLINE int GetSignBits() const;

	/// Get the minimum of X, Y, Z and W
	JPH_INLINE float ReduceMin() const;

	/// Get the maximum of X, Y, Z and W
	JPH_INLINE float ReduceMax() const;

	/// Component wise square root
	JPH_INLINE Vec4 Sqrt() const;

	/// Get vector that contains the sign of each element (returns 1.0f if positive, -1.0f if negative)
	JPH_INLINE Vec4 GetSign() const;

	/// Calculate the sine and cosine for each element of this vector (input in radians)
	inline void SinCos(Vec4 &outSin, Vec4 &outCos) const;

	/// Calculate the tangent for each element of this vector (input in radians)
	inline Vec4 Tan() const;

	/// Calculate the arc sine for each element of this vector (returns value in the range [-PI / 2, PI / 2])
	/// Note that all input values will be clamped to the range [-1, 1] and this function will not return NaNs like std::asin
	inline Vec4 ASin() const;

	/// Calculate the arc cosine for each element of this vector (returns value in the range [0, PI])
	/// Note that all input values will be clamped to the range [-1, 1] and this function will not return NaNs like std::acos
	inline Vec4 ACos() const;

	/// Calculate the arc tangent for each element of this vector (returns value in the range [-PI / 2, PI / 2])
	inline Vec4 ATan() const;

	/// Calculate the arc tangent of y / x using the signs of the arguments to determine the correct quadrant (returns value in the range [-PI, PI])
	inline static Vec4 sATan2(Vec4Arg inY, Vec4Arg inX);

	/// To String
	friend ostream & operator << (ostream &inStream, Vec4Arg inV)
	{
		inStream << inV.mF32[0] << ", " << inV.mF32[1] << ", " << inV.mF32[2] << ", " << inV.mF32[3];
		return inStream;
	}

	// The same storage viewed either as the native SIMD register or as 4 individual floats
	union
	{
		Type mValue;
		float mF32[4];
	};
};
static_assert(std::is_trivial<Vec4>(), "Is supposed to be a trivial type!");
JPH_NAMESPACE_END
#include "Vec4.inl"

View File

@@ -0,0 +1,986 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#include <Jolt/Math/Trigonometry.h>
#include <Jolt/Math/Vec3.h>
#include <Jolt/Math/UVec4.h>
JPH_NAMESPACE_BEGIN
// Constructor
// Reuses the Vec3 register as-is; W is undefined (matches the warning in the header declaration)
Vec4::Vec4(Vec3Arg inRHS) :
	mValue(inRHS.mValue)
{
}
// Construct from a Vec3 plus an explicit W component
Vec4::Vec4(Vec3Arg inRHS, float inW)
{
#if defined(JPH_USE_SSE4_1)
	// Blend mask 8 replaces only lane 3 (W) with inW
	mValue = _mm_blend_ps(inRHS.mValue, _mm_set1_ps(inW), 8);
#elif defined(JPH_USE_NEON)
	mValue = vsetq_lane_f32(inW, inRHS.mValue, 3);
#else
	for (int i = 0; i < 3; i++)
		mF32[i] = inRHS.mF32[i];
	mF32[3] = inW;
#endif
}
// Construct from 4 individual components
Vec4::Vec4(float inX, float inY, float inZ, float inW)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_set_ps(inW, inZ, inY, inX);
#elif defined(JPH_USE_NEON)
	// Pack the components pairwise into 64-bit halves through their bit patterns, then combine
	uint32x2_t xy = vcreate_u32(static_cast<uint64>(BitCast<uint32>(inX)) | (static_cast<uint64>(BitCast<uint32>(inY)) << 32));
	uint32x2_t zw = vcreate_u32(static_cast<uint64>(BitCast<uint32>(inZ)) | (static_cast<uint64>(BitCast<uint32>(inW)) << 32));
	mValue = vreinterpretq_f32_u32(vcombine_u32(xy, zw));
#else
	mF32[0] = inX;
	mF32[1] = inY;
	mF32[2] = inZ;
	mF32[3] = inW;
#endif
}
// Rearrange components: result lane i takes source lane Swizzle<i> (compile-time, 0..3 each)
template<uint32 SwizzleX, uint32 SwizzleY, uint32 SwizzleZ, uint32 SwizzleW>
Vec4 Vec4::Swizzle() const
{
	static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
	static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
	static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
	static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");

#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
#elif defined(JPH_USE_NEON)
	return JPH_NEON_SHUFFLE_F32x4(mValue, mValue, SwizzleX, SwizzleY, SwizzleZ, SwizzleW);
#else
	return Vec4(mF32[SwizzleX], mF32[SwizzleY], mF32[SwizzleZ], mF32[SwizzleW]);
#endif
}
// All-zero vector
Vec4 Vec4::sZero()
{
#if defined(JPH_USE_SSE)
	return _mm_setzero_ps();
#elif defined(JPH_USE_NEON)
	return vdupq_n_f32(0);
#else
	return Vec4(0, 0, 0, 0);
#endif
}
// Broadcast inV into all 4 lanes
Vec4 Vec4::sReplicate(float inV)
{
#if defined(JPH_USE_SSE)
	return _mm_set1_ps(inV);
#elif defined(JPH_USE_NEON)
	return vdupq_n_f32(inV);
#else
	return Vec4(inV, inV, inV, inV);
#endif
}
// All-ones vector (1.0f in every lane)
Vec4 Vec4::sOne()
{
	return sReplicate(1.0f);
}
// Vector with a quiet NaN in every lane
Vec4 Vec4::sNaN()
{
	return sReplicate(numeric_limits<float>::quiet_NaN());
}
// Unaligned load of 4 floats
Vec4 Vec4::sLoadFloat4(const Float4 *inV)
{
#if defined(JPH_USE_SSE)
	return _mm_loadu_ps(&inV->x);
#elif defined(JPH_USE_NEON)
	return vld1q_f32(&inV->x);
#else
	return Vec4(inV->x, inV->y, inV->z, inV->w);
#endif
}
// Load of 4 floats; on SSE the pointer must be 16-byte aligned (_mm_load_ps)
Vec4 Vec4::sLoadFloat4Aligned(const Float4 *inV)
{
#if defined(JPH_USE_SSE)
	return _mm_load_ps(&inV->x);
#elif defined(JPH_USE_NEON)
	return vld1q_f32(&inV->x);
#else
	return Vec4(inV->x, inV->y, inV->z, inV->w);
#endif
}
// Gather 4 floats from inBase + inOffsets[i] * Scale, where Scale is a byte multiplier
template <const int Scale>
Vec4 Vec4::sGatherFloat4(const float *inBase, UVec4Arg inOffsets)
{
#if defined(JPH_USE_SSE)
	#ifdef JPH_USE_AVX2
	// Hardware gather
	return _mm_i32gather_ps(inBase, inOffsets.mValue, Scale);
	#else
	// Emulate the gather with 4 scalar loads, then interleave the lanes back together
	const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
	Type x = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale));
	Type y = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale));
	Type xy = _mm_unpacklo_ps(x, y);
	Type z = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale));
	Type w = _mm_load_ss(reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale));
	Type zw = _mm_unpacklo_ps(z, w);
	return _mm_movelh_ps(xy, zw);
	#endif
#else
	const uint8 *base = reinterpret_cast<const uint8 *>(inBase);
	float x = *reinterpret_cast<const float *>(base + inOffsets.GetX() * Scale);
	float y = *reinterpret_cast<const float *>(base + inOffsets.GetY() * Scale);
	float z = *reinterpret_cast<const float *>(base + inOffsets.GetZ() * Scale);
	float w = *reinterpret_cast<const float *>(base + inOffsets.GetW() * Scale);
	return Vec4(x, y, z, w);
#endif
}
// Component wise minimum
Vec4 Vec4::sMin(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_min_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vminq_f32(inV1.mValue, inV2.mValue);
#else
	return Vec4(min(inV1.mF32[0], inV2.mF32[0]),
	min(inV1.mF32[1], inV2.mF32[1]),
	min(inV1.mF32[2], inV2.mF32[2]),
	min(inV1.mF32[3], inV2.mF32[3]));
#endif
}
// Component wise maximum
Vec4 Vec4::sMax(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_max_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmaxq_f32(inV1.mValue, inV2.mValue);
#else
	return Vec4(max(inV1.mF32[0], inV2.mF32[0]),
	max(inV1.mF32[1], inV2.mF32[1]),
	max(inV1.mF32[2], inV2.mF32[2]),
	max(inV1.mF32[3], inV2.mF32[3]));
#endif
}
// Component wise equality; a true lane is all ones (0xffffffff), a false lane all zeros
UVec4 Vec4::sEquals(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpeq_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vceqq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] == inV2.mF32[0]? 0xffffffffu : 0,
	inV1.mF32[1] == inV2.mF32[1]? 0xffffffffu : 0,
	inV1.mF32[2] == inV2.mF32[2]? 0xffffffffu : 0,
	inV1.mF32[3] == inV2.mF32[3]? 0xffffffffu : 0);
#endif
}
// Component wise less-than; a true lane is all ones, a false lane all zeros
UVec4 Vec4::sLess(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmplt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcltq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] < inV2.mF32[0]? 0xffffffffu : 0,
	inV1.mF32[1] < inV2.mF32[1]? 0xffffffffu : 0,
	inV1.mF32[2] < inV2.mF32[2]? 0xffffffffu : 0,
	inV1.mF32[3] < inV2.mF32[3]? 0xffffffffu : 0);
#endif
}
// Component wise less-than-or-equal; a true lane is all ones, a false lane all zeros
UVec4 Vec4::sLessOrEqual(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmple_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcleq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] <= inV2.mF32[0]? 0xffffffffu : 0,
	inV1.mF32[1] <= inV2.mF32[1]? 0xffffffffu : 0,
	inV1.mF32[2] <= inV2.mF32[2]? 0xffffffffu : 0,
	inV1.mF32[3] <= inV2.mF32[3]? 0xffffffffu : 0);
#endif
}
// Component wise greater-than; a true lane is all ones, a false lane all zeros
UVec4 Vec4::sGreater(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpgt_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcgtq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] > inV2.mF32[0]? 0xffffffffu : 0,
	inV1.mF32[1] > inV2.mF32[1]? 0xffffffffu : 0,
	inV1.mF32[2] > inV2.mF32[2]? 0xffffffffu : 0,
	inV1.mF32[3] > inV2.mF32[3]? 0xffffffffu : 0);
#endif
}
// Component wise greater-than-or-equal; a true lane is all ones, a false lane all zeros
UVec4 Vec4::sGreaterOrEqual(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_castps_si128(_mm_cmpge_ps(inV1.mValue, inV2.mValue));
#elif defined(JPH_USE_NEON)
	return vcgeq_f32(inV1.mValue, inV2.mValue);
#else
	return UVec4(inV1.mF32[0] >= inV2.mF32[0]? 0xffffffffu : 0,
	inV1.mF32[1] >= inV2.mF32[1]? 0xffffffffu : 0,
	inV1.mF32[2] >= inV2.mF32[2]? 0xffffffffu : 0,
	inV1.mF32[3] >= inV2.mF32[3]? 0xffffffffu : 0);
#endif
}
// inMul1 * inMul2 + inAdd per component (uses a hardware FMA instruction only when JPH_USE_FMADD is defined)
Vec4 Vec4::sFusedMultiplyAdd(Vec4Arg inMul1, Vec4Arg inMul2, Vec4Arg inAdd)
{
#if defined(JPH_USE_SSE)
	#ifdef JPH_USE_FMADD
	return _mm_fmadd_ps(inMul1.mValue, inMul2.mValue, inAdd.mValue);
	#else
	return _mm_add_ps(_mm_mul_ps(inMul1.mValue, inMul2.mValue), inAdd.mValue);
	#endif
#elif defined(JPH_USE_NEON)
	// Multiply-accumulate
	return vmlaq_f32(inAdd.mValue, inMul1.mValue, inMul2.mValue);
#else
	return Vec4(inMul1.mF32[0] * inMul2.mF32[0] + inAdd.mF32[0],
	inMul1.mF32[1] * inMul2.mF32[1] + inAdd.mF32[1],
	inMul1.mF32[2] * inMul2.mF32[2] + inAdd.mF32[2],
	inMul1.mF32[3] * inMul2.mF32[3] + inAdd.mF32[3]);
#endif
}
// Per component: pick inSet when the highest bit of the control lane is 1, else inNotSet.
// Only bit 31 of each control lane matters; the fallback paths broadcast it with an arithmetic shift right.
Vec4 Vec4::sSelect(Vec4Arg inNotSet, Vec4Arg inSet, UVec4Arg inControl)
{
#if defined(JPH_USE_SSE4_1) && !defined(JPH_PLATFORM_WASM) // _mm_blendv_ps has problems on FireFox
	return _mm_blendv_ps(inNotSet.mValue, inSet.mValue, _mm_castsi128_ps(inControl.mValue))
;
#elif defined(JPH_USE_SSE)
	__m128 is_set = _mm_castsi128_ps(_mm_srai_epi32(inControl.mValue, 31));
	return _mm_or_ps(_mm_and_ps(is_set, inSet.mValue), _mm_andnot_ps(is_set, inNotSet.mValue));
#elif defined(JPH_USE_NEON)
	return vbslq_f32(vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(inControl.mValue), 31)), inSet.mValue, inNotSet.mValue);
#else
	Vec4 result;
	for (int i = 0; i < 4; i++)
		result.mF32[i] = (inControl.mU32[i] & 0x80000000u) ? inSet.mF32[i] : inNotSet.mF32[i];
	return result;
#endif
}
// Bitwise OR of the two vectors' bit patterns
Vec4 Vec4::sOr(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_or_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
	return UVec4::sOr(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat();
#endif
}
// Bitwise XOR of the two vectors' bit patterns
Vec4 Vec4::sXor(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_xor_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
	return UVec4::sXor(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat();
#endif
}
// Bitwise AND of the two vectors' bit patterns
Vec4 Vec4::sAnd(Vec4Arg inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_and_ps(inV1.mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(inV1.mValue), vreinterpretq_u32_f32(inV2.mValue)));
#else
	return UVec4::sAnd(inV1.ReinterpretAsInt(), inV2.ReinterpretAsInt()).ReinterpretAsFloat();
#endif
}
// Sort the 4 lanes of ioValue ascending with a 3-pass sorting network, swapping the
// corresponding lanes of ioIndex in lockstep so indices keep tracking their values
void Vec4::sSort4(Vec4 &ioValue, UVec4 &ioIndex)
{
	// Pass 1, test 1st vs 3rd, 2nd vs 4th
	Vec4 v1 = ioValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	UVec4 i1 = ioIndex.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	// The comparison result is swizzled so each pair of lanes gets a consistent swap decision
	UVec4 c1 = sLess(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v1, c1);
	ioIndex = UVec4::sSelect(ioIndex, i1, c1);

	// Pass 2, test 1st vs 2nd, 3rd vs 4th
	Vec4 v2 = ioValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 i2 = ioIndex.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 c2 = sLess(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v2, c2);
	ioIndex = UVec4::sSelect(ioIndex, i2, c2);

	// Pass 3, test 2nd vs 3rd component
	Vec4 v3 = ioValue.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 i3 = ioIndex.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 c3 = sLess(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v3, c3);
	ioIndex = UVec4::sSelect(ioIndex, i3, c3);
}
// Sort the 4 lanes of ioValue descending (highest first) with the same 3-pass network as
// sSort4, only with the comparison direction flipped; ioIndex is swapped in lockstep
void Vec4::sSort4Reverse(Vec4 &ioValue, UVec4 &ioIndex)
{
	// Pass 1, test 1st vs 3rd, 2nd vs 4th
	Vec4 v1 = ioValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	UVec4 i1 = ioIndex.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>();
	UVec4 c1 = sGreater(ioValue, v1).Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v1, c1);
	ioIndex = UVec4::sSelect(ioIndex, i1, c1);

	// Pass 2, test 1st vs 2nd, 3rd vs 4th
	Vec4 v2 = ioValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 i2 = ioIndex.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>();
	UVec4 c2 = sGreater(ioValue, v2).Swizzle<SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_W, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v2, c2);
	ioIndex = UVec4::sSelect(ioIndex, i2, c2);

	// Pass 3, test 2nd vs 3rd component
	Vec4 v3 = ioValue.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 i3 = ioIndex.Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_W>();
	UVec4 c3 = sGreater(ioValue, v3).Swizzle<SWIZZLE_X, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_W>();
	ioValue = sSelect(ioValue, v3, c3);
	ioIndex = UVec4::sSelect(ioIndex, i3, c3);
}
// All four lanes must compare equal (a NaN lane never compares equal, so any NaN makes this false)
bool Vec4::operator == (Vec4Arg inV2) const
{
	return sEquals(*this, inV2).TestAllTrue();
}
bool Vec4::IsClose(Vec4Arg inV2, float inMaxDistSq) const
{
	// Two vectors are close when the squared distance between them is within inMaxDistSq
	Vec4 diff = inV2 - *this;
	return diff.LengthSq() <= inMaxDistSq;
}
bool Vec4::IsNormalized(float inTolerance) const
{
	// A vector counts as normalized when its squared length deviates from 1 by at most inTolerance
	float deviation = LengthSq() - 1.0f;
	return abs(deviation) <= inTolerance;
}
// True when any of the 4 lanes is a NaN
bool Vec4::IsNaN() const
{
#if defined(JPH_USE_AVX512)
	// Class mask 0b10000001 selects quiet and signaling NaNs
	return _mm_fpclass_ps_mask(mValue, 0b10000001) != 0;
#elif defined(JPH_USE_SSE)
	// cmpunord is true only when a lane compares unordered with itself, i.e. it is a NaN
	return _mm_movemask_ps(_mm_cmpunord_ps(mValue, mValue)) != 0;
#elif defined(JPH_USE_NEON)
	uint32x4_t is_equal = vceqq_f32(mValue, mValue); // If a number is not equal to itself it's a NaN
	// Each non-NaN lane contributes 1 after the shift; a horizontal sum != 4 means at least one NaN
	return vaddvq_u32(vshrq_n_u32(is_equal, 31)) != 4;
#else
	return isnan(mF32[0]) || isnan(mF32[1]) || isnan(mF32[2]) || isnan(mF32[3]);
#endif
}
// Component wise multiply
Vec4 Vec4::operator * (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmulq_f32(mValue, inV2.mValue);
#else
	return Vec4(mF32[0] * inV2.mF32[0],
	mF32[1] * inV2.mF32[1],
	mF32[2] * inV2.mF32[2],
	mF32[3] * inV2.mF32[3]);
#endif
}
// Multiply each component by the scalar inV2
Vec4 Vec4::operator * (float inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	return vmulq_n_f32(mValue, inV2);
#else
	return Vec4(mF32[0] * inV2, mF32[1] * inV2, mF32[2] * inV2, mF32[3] * inV2);
#endif
}
/// Multiply vector with float (free function so the scalar can appear on the left hand side)
Vec4 operator * (float inV1, Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	return _mm_mul_ps(_mm_set1_ps(inV1), inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vmulq_n_f32(inV2.mValue, inV1);
#else
	return Vec4(inV1 * inV2.mF32[0],
	inV1 * inV2.mF32[1],
	inV1 * inV2.mF32[2],
	inV1 * inV2.mF32[3]);
#endif
}
// Divide each component by the scalar inV2 (no zero check; inV2 == 0 yields inf/NaN lanes)
Vec4 Vec4::operator / (float inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	return vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
	return Vec4(mF32[0] / inV2, mF32[1] / inV2, mF32[2] / inV2, mF32[3] / inV2);
#endif
}
// In-place multiply by scalar
Vec4 &Vec4::operator *= (float inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_mul_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	mValue = vmulq_n_f32(mValue, inV2);
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] *= inV2;
#endif
	return *this;
}
// In-place component wise multiply
Vec4 &Vec4::operator *= (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_mul_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vmulq_f32(mValue, inV2.mValue);
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] *= inV2.mF32[i];
#endif
	return *this;
}
// In-place divide by scalar (no zero check)
Vec4 &Vec4::operator /= (float inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_div_ps(mValue, _mm_set1_ps(inV2));
#elif defined(JPH_USE_NEON)
	mValue = vdivq_f32(mValue, vdupq_n_f32(inV2));
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] /= inV2;
#endif
	return *this;
}
// Component wise add
Vec4 Vec4::operator + (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vaddq_f32(mValue, inV2.mValue);
#else
	return Vec4(mF32[0] + inV2.mF32[0],
	mF32[1] + inV2.mF32[1],
	mF32[2] + inV2.mF32[2],
	mF32[3] + inV2.mF32[3]);
#endif
}
// In-place component wise add
Vec4 &Vec4::operator += (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_add_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vaddq_f32(mValue, inV2.mValue);
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] += inV2.mF32[i];
#endif
	return *this;
}
// Negate all components.
// When JPH_CROSS_PLATFORM_DETERMINISTIC is defined, 0 - x is used instead of a hardware negate
// so the result (notably for signed zeros) is bit-identical across platforms.
Vec4 Vec4::operator - () const
{
#if defined(JPH_USE_SSE)
	return _mm_sub_ps(_mm_setzero_ps(), mValue);
#elif defined(JPH_USE_NEON)
	#ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
	return vsubq_f32(vdupq_n_f32(0), mValue);
	#else
	return vnegq_f32(mValue);
	#endif
#else
	#ifdef JPH_CROSS_PLATFORM_DETERMINISTIC
	return Vec4(0.0f - mF32[0], 0.0f - mF32[1], 0.0f - mF32[2], 0.0f - mF32[3]);
	#else
	return Vec4(-mF32[0], -mF32[1], -mF32[2], -mF32[3]);
	#endif
#endif
}
// Component wise subtract
Vec4 Vec4::operator - (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vsubq_f32(mValue, inV2.mValue);
#else
	return Vec4(mF32[0] - inV2.mF32[0],
	mF32[1] - inV2.mF32[1],
	mF32[2] - inV2.mF32[2],
	mF32[3] - inV2.mF32[3]);
#endif
}
// In-place component wise subtract
Vec4 &Vec4::operator -= (Vec4Arg inV2)
{
#if defined(JPH_USE_SSE)
	mValue = _mm_sub_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	mValue = vsubq_f32(mValue, inV2.mValue);
#else
	for (int i = 0; i < 4; ++i)
		mF32[i] -= inV2.mF32[i];
#endif
	return *this;
}
// Component wise divide (no zero check; zero divisor lanes yield inf/NaN)
Vec4 Vec4::operator / (Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE)
	return _mm_div_ps(mValue, inV2.mValue);
#elif defined(JPH_USE_NEON)
	return vdivq_f32(mValue, inV2.mValue);
#else
	return Vec4(mF32[0] / inV2.mF32[0],
	mF32[1] / inV2.mF32[1],
	mF32[2] / inV2.mF32[2],
	mF32[3] / inV2.mF32[3]);
#endif
}
// Broadcast the X lane to all 4 lanes
Vec4 Vec4::SplatX() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(0, 0, 0, 0));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 0);
#else
	return Vec4(mF32[0], mF32[0], mF32[0], mF32[0]);
#endif
}
// Broadcast the Y lane to all 4 lanes
Vec4 Vec4::SplatY() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(1, 1, 1, 1));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 1);
#else
	return Vec4(mF32[1], mF32[1], mF32[1], mF32[1]);
#endif
}
// Broadcast the Z lane to all 4 lanes
Vec4 Vec4::SplatZ() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(2, 2, 2, 2));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 2);
#else
	return Vec4(mF32[2], mF32[2], mF32[2], mF32[2]);
#endif
}
// Broadcast the W lane to all 4 lanes
Vec4 Vec4::SplatW() const
{
#if defined(JPH_USE_SSE)
	return _mm_shuffle_ps(mValue, mValue, _MM_SHUFFLE(3, 3, 3, 3));
#elif defined(JPH_USE_NEON)
	return vdupq_laneq_f32(mValue, 3);
#else
	return Vec4(mF32[3], mF32[3], mF32[3], mF32[3]);
#endif
}
// Component wise absolute value
Vec4 Vec4::Abs() const
{
#if defined(JPH_USE_AVX512)
	// Range op 0b1000 returns |a| (absolute maximum of a and a)
	return _mm_range_ps(mValue, mValue, 0b1000);
#elif defined(JPH_USE_SSE)
	// max(-x, x) == |x|
	return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), mValue), mValue);
#elif defined(JPH_USE_NEON)
	return vabsq_f32(mValue);
#else
	return Vec4(abs(mF32[0]), abs(mF32[1]), abs(mF32[2]), abs(mF32[3]));
#endif
}
// 1 / value per component, computed as a full-precision divide (zero lanes yield inf)
Vec4 Vec4::Reciprocal() const
{
	return sOne() / mValue;
}
// 4-component dot product, replicated across all lanes of the result
Vec4 Vec4::DotV(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	// Mask 0xff: multiply all 4 lanes, broadcast the sum to all 4 lanes
	return _mm_dp_ps(mValue, inV2.mValue, 0xff);
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	return vdupq_n_f32(vaddvq_f32(mul));
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return Vec4::sReplicate((mF32[0] * inV2.mF32[0] + mF32[1] * inV2.mF32[1]) + (mF32[2] * inV2.mF32[2] + mF32[3] * inV2.mF32[3]));
#endif
}
// 4-component dot product, returned as a scalar
float Vec4::Dot(Vec4Arg inV2) const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_dp_ps(mValue, inV2.mValue, 0xff));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, inV2.mValue);
	return vaddvq_f32(mul);
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return (mF32[0] * inV2.mF32[0] + mF32[1] * inV2.mF32[1]) + (mF32[2] * inV2.mF32[2] + mF32[3] * inV2.mF32[3]);
#endif
}
// Squared length: dot product of the vector with itself
float Vec4::LengthSq() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_dp_ps(mValue, mValue, 0xff));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	return vaddvq_f32(mul);
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return (mF32[0] * mF32[0] + mF32[1] * mF32[1]) + (mF32[2] * mF32[2] + mF32[3] * mF32[3]);
#endif
}
// Length: sqrt of the squared length
float Vec4::Length() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	// Take the square root on a 64-bit vector and extract the scalar
	float32x2_t sum = vdup_n_f32(vaddvq_f32(mul));
	return vget_lane_f32(vsqrt_f32(sum), 0);
#else
	// Brackets placed so that the order is consistent with the vectorized version
	return sqrt((mF32[0] * mF32[0] + mF32[1] * mF32[1]) + (mF32[2] * mF32[2] + mF32[3] * mF32[3]));
#endif
}
// Component wise square root (negative lanes yield NaN)
Vec4 Vec4::Sqrt() const
{
#if defined(JPH_USE_SSE)
	return _mm_sqrt_ps(mValue);
#elif defined(JPH_USE_NEON)
	return vsqrtq_f32(mValue);
#else
	return Vec4(sqrt(mF32[0]), sqrt(mF32[1]), sqrt(mF32[2]), sqrt(mF32[3]));
#endif
}
// Per component: 1.0f when the sign bit is clear (incl. +0), -1.0f when it is set (incl. -0)
Vec4 Vec4::GetSign() const
{
#if defined(JPH_USE_AVX512)
	// Fixup lookup table maps each float class directly onto +1 / -1
	return _mm_fixupimm_ps(mValue, mValue, _mm_set1_epi32(0xA9A90A00), 0);
#elif defined(JPH_USE_SSE)
	// AND with the bit pattern of -1.0f keeps the sign bit, OR with 1.0f forces the magnitude to one
	Type minus_one = _mm_set1_ps(-1.0f);
	Type one = _mm_set1_ps(1.0f);
	return _mm_or_ps(_mm_and_ps(mValue, minus_one), one);
#elif defined(JPH_USE_NEON)
	// Same sign-bit trick as the SSE path, done through integer reinterpretation
	Type minus_one = vdupq_n_f32(-1.0f);
	Type one = vdupq_n_f32(1.0f);
	return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(mValue), vreinterpretq_u32_f32(minus_one)), vreinterpretq_u32_f32(one)));
#else
	return Vec4(std::signbit(mF32[0])? -1.0f : 1.0f,
	std::signbit(mF32[1])? -1.0f : 1.0f,
	std::signbit(mF32[2])? -1.0f : 1.0f,
	std::signbit(mF32[3])? -1.0f : 1.0f);
#endif
}
// Return this vector scaled to length 1.
// NOTE(review): a zero vector divides by zero and yields NaN lanes - callers must pass a non-zero vector.
Vec4 Vec4::Normalized() const
{
#if defined(JPH_USE_SSE4_1)
	return _mm_div_ps(mValue, _mm_sqrt_ps(_mm_dp_ps(mValue, mValue, 0xff)));
#elif defined(JPH_USE_NEON)
	float32x4_t mul = vmulq_f32(mValue, mValue);
	float32x4_t sum = vdupq_n_f32(vaddvq_f32(mul));
	return vdivq_f32(mValue, vsqrtq_f32(sum));
#else
	return *this / Length();
#endif
}
// Store all 4 components to memory (unaligned store)
void Vec4::StoreFloat4(Float4 *outV) const
{
#if defined(JPH_USE_SSE)
	_mm_storeu_ps(&outV->x, mValue);
#elif defined(JPH_USE_NEON)
	vst1q_f32(&outV->x, mValue);
#else
	for (int i = 0; i < 4; ++i)
		(&outV->x)[i] = mF32[i];
#endif
}
// Convert each component from float to integer.
// NOTE(review): SSE truncates to *signed* int32, NEON converts to *unsigned*, scalar casts to uint32 -
// results for negative or out-of-range input differ per platform; confirm callers only pass in-range values.
UVec4 Vec4::ToInt() const
{
#if defined(JPH_USE_SSE)
	return _mm_cvttps_epi32(mValue);
#elif defined(JPH_USE_NEON)
	return vcvtq_u32_f32(mValue);
#else
	return UVec4(uint32(mF32[0]), uint32(mF32[1]), uint32(mF32[2]), uint32(mF32[3]));
#endif
}
/// Reinterpret the 128 bits of this vector as 4 unsigned integers (pure bit cast, no numeric conversion)
UVec4 Vec4::ReinterpretAsInt() const
{
#if defined(JPH_USE_SSE)
return UVec4(_mm_castps_si128(mValue)); // Free: only changes the register's type
#elif defined(JPH_USE_NEON)
return vreinterpretq_u32_f32(mValue);
#else
// NOTE(review): relies on Vec4 and UVec4 sharing the same layout; strictly a strict-aliasing violation but presumably tolerated by the supported compilers
return *reinterpret_cast<const UVec4 *>(this);
#endif
}
/// Get a 4-bit mask with the sign bits of the components: bit 0 = sign of X, bit 1 = Y, bit 2 = Z, bit 3 = W
int Vec4::GetSignBits() const
{
#if defined(JPH_USE_SSE)
return _mm_movemask_ps(mValue);
#elif defined(JPH_USE_NEON)
// Shift each sign bit down to bit 0, shift it back up to its lane index, then sum the lanes horizontally to combine them into one mask
int32x4_t shift = JPH_NEON_INT32x4(0, 1, 2, 3);
return vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(mValue), 31), shift));
#else
// Scalar fallback: signbit matches the vectorized paths (sign bit only, so -0.0 counts as negative)
return (std::signbit(mF32[0])? 1 : 0) | (std::signbit(mF32[1])? 2 : 0) | (std::signbit(mF32[2])? 4 : 0) | (std::signbit(mF32[3])? 8 : 0);
#endif
}
float Vec4::ReduceMin() const
{
Vec4 v = sMin(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_W, SWIZZLE_UNUSED>());
v = sMin(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
return v.GetX();
}
float Vec4::ReduceMax() const
{
Vec4 v = sMax(mValue, Swizzle<SWIZZLE_Y, SWIZZLE_UNUSED, SWIZZLE_W, SWIZZLE_UNUSED>());
v = sMax(v, v.Swizzle<SWIZZLE_Z, SWIZZLE_UNUSED, SWIZZLE_UNUSED, SWIZZLE_UNUSED>());
return v.GetX();
}
/// Calculate the sine and cosine of each component at the same time (input angle in radians)
void Vec4::SinCos(Vec4 &outSin, Vec4 &outCos) const
{
// Implementation based on sinf.c from the cephes library, combines sinf and cosf in a single function, changes octants to quadrants and vectorizes it
// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)
// Make argument positive and remember sign for sin only since cos is symmetric around x (highest bit of a float is the sign bit)
UVec4 sin_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
Vec4 x = Vec4::sXor(*this, sin_sign.ReinterpretAsFloat());
// x / (PI / 2) rounded to nearest int gives us the quadrant closest to x (0.6366197723675814 = 2 / PI)
UVec4 quadrant = (0.6366197723675814f * x + Vec4::sReplicate(0.5f)).ToInt();
// Make x relative to the closest quadrant.
// This does x = x - quadrant * PI / 2 using a two step Cody-Waite argument reduction.
// This improves the accuracy of the result by avoiding loss of significant bits in the subtraction.
// We start with x = x - quadrant * PI / 2, PI / 2 in hexadecimal notation is 0x3fc90fdb, we remove the lowest 16 bits to
// get 0x3fc90000 (= 1.5703125) this means we can now multiply with a number of up to 2^16 without losing any bits.
// This leaves us with: x = (x - quadrant * 1.5703125) - quadrant * (PI / 2 - 1.5703125).
// PI / 2 - 1.5703125 in hexadecimal is 0x39fdaa22, stripping the lowest 12 bits we get 0x39fda000 (= 0.0004837512969970703125)
// This leaves us with: x = ((x - quadrant * 1.5703125) - quadrant * 0.0004837512969970703125) - quadrant * (PI / 2 - 1.5703125 - 0.0004837512969970703125)
// See: https://stackoverflow.com/questions/42455143/sine-cosine-modular-extended-precision-arithmetic
// After this we have x in the range [-PI / 4, PI / 4].
Vec4 float_quadrant = quadrant.ToFloat();
x = ((x - float_quadrant * 1.5703125f) - float_quadrant * 0.0004837512969970703125f) - float_quadrant * 7.549789948768648e-8f;
// Calculate x2 = x^2
Vec4 x2 = x * x;
// Taylor expansion:
// Cos(x) = 1 - x^2/2! + x^4/4! - x^6/6! + x^8/8! + ... = (((x2/8!- 1/6!) * x2 + 1/4!) * x2 - 1/2!) * x2 + 1
Vec4 taylor_cos = ((2.443315711809948e-5f * x2 - Vec4::sReplicate(1.388731625493765e-3f)) * x2 + Vec4::sReplicate(4.166664568298827e-2f)) * x2 * x2 - 0.5f * x2 + Vec4::sOne();
// Sin(x) = x - x^3/3! + x^5/5! - x^7/7! + ... = ((-x2/7! + 1/5!) * x2 - 1/3!) * x2 * x + x
Vec4 taylor_sin = ((-1.9515295891e-4f * x2 + Vec4::sReplicate(8.3321608736e-3f)) * x2 - Vec4::sReplicate(1.6666654611e-1f)) * x2 * x + x;
// The lowest 2 bits of quadrant indicate the quadrant that we are in.
// Let x be the original input value and x' our value that has been mapped to the range [-PI / 4, PI / 4].
// since cos(x) = sin(x - PI / 2) and since we want to use the Taylor expansion as close as possible to 0,
// we can alternate between using the Taylor expansion for sin and cos according to the following table:
//
// quadrant	sin(x)		cos(x)
// XXX00b	sin(x')		cos(x')
// XXX01b	cos(x')		-sin(x')
// XXX10b	-sin(x')	-cos(x')
// XXX11b	-cos(x')	sin(x')
//
// So: sin_sign = bit2, cos_sign = bit1 ^ bit2, bit1 determines if we use sin or cos Taylor expansion
// (both bits are moved into the float sign-bit position so they can be applied with a single xor)
UVec4 bit1 = quadrant.LogicalShiftLeft<31>();
UVec4 bit2 = UVec4::sAnd(quadrant.LogicalShiftLeft<30>(), UVec4::sReplicate(0x80000000U));
// Select which one of the results is sin and which one is cos
Vec4 s = Vec4::sSelect(taylor_sin, taylor_cos, bit1);
Vec4 c = Vec4::sSelect(taylor_cos, taylor_sin, bit1);
// Update the signs
sin_sign = UVec4::sXor(sin_sign, bit2);
UVec4 cos_sign = UVec4::sXor(bit1, bit2);
// Correct the signs
outSin = Vec4::sXor(s, sin_sign.ReinterpretAsFloat());
outCos = Vec4::sXor(c, cos_sign.ReinterpretAsFloat());
}
/// Calculate the tangent of each component (input angle in radians)
Vec4 Vec4::Tan() const
{
// Implementation based on tanf.c from the cephes library, see Vec4::SinCos for further details
// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)
// Make argument positive (tan is an odd function, so we can put the sign back at the end)
UVec4 tan_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
Vec4 x = Vec4::sXor(*this, tan_sign.ReinterpretAsFloat());
// x / (PI / 2) rounded to nearest int gives us the quadrant closest to x (0.6366197723675814 = 2 / PI)
UVec4 quadrant = (0.6366197723675814f * x + Vec4::sReplicate(0.5f)).ToInt();
// Remap x to range [-PI / 4, PI / 4] using the same two step Cody-Waite argument reduction, see Vec4::SinCos
Vec4 float_quadrant = quadrant.ToFloat();
x = ((x - float_quadrant * 1.5703125f) - float_quadrant * 0.0004837512969970703125f) - float_quadrant * 7.549789948768648e-8f;
// Calculate x2 = x^2
Vec4 x2 = x * x;
// Roughly equivalent to the Taylor expansion:
// Tan(x) = x + x^3/3 + 2*x^5/15 + 17*x^7/315 + 62*x^9/2835 + ...
Vec4 tan =
(((((9.38540185543e-3f * x2 + Vec4::sReplicate(3.11992232697e-3f)) * x2 + Vec4::sReplicate(2.44301354525e-2f)) * x2
+ Vec4::sReplicate(5.34112807005e-2f)) * x2 + Vec4::sReplicate(1.33387994085e-1f)) * x2 + Vec4::sReplicate(3.33331568548e-1f)) * x2 * x + x;
// For the 2nd and 4th quadrant we need to invert the value (tan(x) = -1 / tan(x - PI / 2)); bit1 moves the quadrant's lowest bit into the sign-bit position for the select
UVec4 bit1 = quadrant.LogicalShiftLeft<31>();
tan = Vec4::sSelect(tan, Vec4::sReplicate(-1.0f) / (tan JPH_IF_FLOATING_POINT_EXCEPTIONS_ENABLED(+ Vec4::sReplicate(FLT_MIN))), bit1); // Add small epsilon to prevent div by zero, works because tan is always positive
// Put the sign back
return Vec4::sXor(tan, tan_sign.ReinterpretAsFloat());
}
/// Calculate the arc sine of each component; the input is clamped to [-1, 1], the result is in the range [-PI / 2, PI / 2]
Vec4 Vec4::ASin() const
{
// Implementation based on asinf.c from the cephes library
// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)
// Make argument positive (asin is an odd function, so we can put the sign back at the end)
UVec4 asin_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
Vec4 a = Vec4::sXor(*this, asin_sign.ReinterpretAsFloat());
// ASin is not defined outside the range [-1, 1] but it often happens that a value is slightly above 1 so we just clamp here
a = Vec4::sMin(a, Vec4::sOne());
// When |x| <= 0.5 we use the asin approximation as is (z = x^2, evaluate the polynomial at x directly)
Vec4 z1 = a * a;
Vec4 x1 = a;
// When |x| > 0.5 we use the identity asin(x) = PI / 2 - 2 * asin(sqrt((1 - x) / 2))
Vec4 z2 = 0.5f * (Vec4::sOne() - a);
Vec4 x2 = z2.Sqrt();
// Select which of the two situations we have
UVec4 greater = Vec4::sGreater(a, Vec4::sReplicate(0.5f));
Vec4 z = Vec4::sSelect(z1, z2, greater);
Vec4 x = Vec4::sSelect(x1, x2, greater);
// Polynomial approximation of asin
z = ((((4.2163199048e-2f * z + Vec4::sReplicate(2.4181311049e-2f)) * z + Vec4::sReplicate(4.5470025998e-2f)) * z + Vec4::sReplicate(7.4953002686e-2f)) * z + Vec4::sReplicate(1.6666752422e-1f)) * z * x + x;
// If |x| > 0.5 we need to apply the remainder of the identity above: PI / 2 - 2 * z
z = Vec4::sSelect(z, Vec4::sReplicate(0.5f * JPH_PI) - (z + z), greater);
// Put the sign back
return Vec4::sXor(z, asin_sign.ReinterpretAsFloat());
}
/// Calculate the arc cosine of each component (result in range [0, PI])
Vec4 Vec4::ACos() const
{
	// Not the most accurate, but simple: uses the identity acos(x) = PI / 2 - asin(x)
	Vec4 half_pi = Vec4::sReplicate(0.5f * JPH_PI);
	return half_pi - ASin();
}
/// Calculate the arc tangent of each component (result in range [-PI / 2, PI / 2])
Vec4 Vec4::ATan() const
{
// Implementation based on atanf.c from the cephes library
// Original implementation by Stephen L. Moshier (See: http://www.moshier.net/)
// Make argument positive (atan is an odd function, so we can put the sign back at the end)
UVec4 atan_sign = UVec4::sAnd(ReinterpretAsInt(), UVec4::sReplicate(0x80000000U));
Vec4 x = Vec4::sXor(*this, atan_sign.ReinterpretAsFloat());
Vec4 y = Vec4::sZero();
// If x > Tan(PI / 8): use atan(x) = PI / 4 + atan((x - 1) / (x + 1))
UVec4 greater1 = Vec4::sGreater(x, Vec4::sReplicate(0.4142135623730950f));
Vec4 x1 = (x - Vec4::sOne()) / (x + Vec4::sOne());
// If x > Tan(3 * PI / 8): use atan(x) = PI / 2 + atan(-1 / x)
UVec4 greater2 = Vec4::sGreater(x, Vec4::sReplicate(2.414213562373095f));
Vec4 x2 = Vec4::sReplicate(-1.0f) / (x JPH_IF_FLOATING_POINT_EXCEPTIONS_ENABLED(+ Vec4::sReplicate(FLT_MIN))); // Add small epsilon to prevent div by zero, works because x is always positive
// Apply first if
x = Vec4::sSelect(x, x1, greater1);
y = Vec4::sSelect(y, Vec4::sReplicate(0.25f * JPH_PI), greater1);
// Apply second if (overrides the first when both conditions hold, since greater2 implies greater1)
x = Vec4::sSelect(x, x2, greater2);
y = Vec4::sSelect(y, Vec4::sReplicate(0.5f * JPH_PI), greater2);
// Polynomial approximation on the reduced argument, added to the quadrant offset in y
Vec4 z = x * x;
y += (((8.05374449538e-2f * z - Vec4::sReplicate(1.38776856032e-1f)) * z + Vec4::sReplicate(1.99777106478e-1f)) * z - Vec4::sReplicate(3.33329491539e-1f)) * z * x + x;
// Put the sign back
return Vec4::sXor(y, atan_sign.ReinterpretAsFloat());
}
/// Calculate the arc tangent of inY / inX using the signs of both arguments to determine the correct quadrant (result in range [-PI, PI])
Vec4 Vec4::sATan2(Vec4Arg inY, Vec4Arg inX)
{
UVec4 sign_mask = UVec4::sReplicate(0x80000000U);
// Determine absolute value and sign of y
UVec4 y_sign = UVec4::sAnd(inY.ReinterpretAsInt(), sign_mask);
Vec4 y_abs = Vec4::sXor(inY, y_sign.ReinterpretAsFloat());
// Determine absolute value and sign of x
UVec4 x_sign = UVec4::sAnd(inX.ReinterpretAsInt(), sign_mask);
Vec4 x_abs = Vec4::sXor(inX, x_sign.ReinterpretAsFloat());
// Always divide smallest / largest to avoid dividing by zero
UVec4 x_is_numerator = Vec4::sLess(x_abs, y_abs);
Vec4 numerator = Vec4::sSelect(y_abs, x_abs, x_is_numerator);
Vec4 denominator = Vec4::sSelect(x_abs, y_abs, x_is_numerator);
Vec4 atan = (numerator / denominator).ATan();
// If we calculated x / y instead of y / x the result is PI / 2 - result (note that this is true because we know the result is positive because the input was positive)
atan = Vec4::sSelect(atan, Vec4::sReplicate(0.5f * JPH_PI) - atan, x_is_numerator);
// Now we need to map to the correct quadrant
// x_sign y_sign result
// +1 +1 atan
// -1 +1 -atan + PI
// -1 -1 atan - PI
// +1 -1 -atan
// This can be written as: x_sign * y_sign * (atan - (x_sign < 0? PI : 0))
// ArithmeticShiftRight<31> turns the sign bit into an all-ones/all-zeros mask, so PI is subtracted only where x was negative
atan -= Vec4::sAnd(x_sign.ArithmeticShiftRight<31>().ReinterpretAsFloat(), Vec4::sReplicate(JPH_PI));
// Multiplying by x_sign * y_sign is done by xor-ing the combined sign bit into the result
atan = Vec4::sXor(atan, UVec4::sXor(x_sign, y_sign).ReinterpretAsFloat());
return atan;
}
JPH_NAMESPACE_END

View File

@@ -0,0 +1,211 @@
// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT
#pragma once
JPH_NAMESPACE_BEGIN
/// Templatized fixed-size vector class of Rows floats.
/// General purpose (not SIMD optimized), used e.g. for small linear algebra problems of arbitrary dimension.
template <uint Rows>
class [[nodiscard]] Vector
{
public:
	/// Constructor, intentionally not initialized for performance reasons
	inline Vector() = default;
	inline Vector(const Vector &) = default;

	/// Number of rows in this vector
	inline uint GetRows() const { return Rows; }

	/// Set all components to zero
	inline void SetZero()
	{
		for (uint r = 0; r < Rows; ++r)
			mF32[r] = 0.0f;
	}

	/// Vector with all zeros
	inline static Vector sZero() { Vector v; v.SetZero(); return v; }

	/// Copy a (part) of another vector into this vector
	/// @param inV Vector to read from (anything that supports operator [])
	/// @param inSourceRow Index of the first row to read from inV
	/// @param inNumRows Number of rows to copy
	/// @param inDestRow Index of the first row to write in this vector
	template <class OtherVector>
	void CopyPart(const OtherVector &inV, uint inSourceRow, uint inNumRows, uint inDestRow)
	{
		for (uint r = 0; r < inNumRows; ++r)
			mF32[inDestRow + r] = inV[inSourceRow + r];
	}

	/// Get float component by index
	inline float operator [] (uint inCoordinate) const
	{
		JPH_ASSERT(inCoordinate < Rows);
		return mF32[inCoordinate];
	}

	inline float & operator [] (uint inCoordinate)
	{
		JPH_ASSERT(inCoordinate < Rows);
		return mF32[inCoordinate];
	}

	/// Comparison (exact, no epsilon)
	inline bool operator == (const Vector &inV2) const
	{
		for (uint r = 0; r < Rows; ++r)
			if (mF32[r] != inV2.mF32[r])
				return false;
		return true;
	}

	inline bool operator != (const Vector &inV2) const
	{
		// Defined in terms of operator == so the two can never get out of sync
		return !(*this == inV2);
	}

	/// Test if vector consists of all zeros
	inline bool IsZero() const
	{
		for (uint r = 0; r < Rows; ++r)
			if (mF32[r] != 0.0f)
				return false;
		return true;
	}

	/// Test if two vectors are close to each other (squared distance at most inMaxDistSq)
	inline bool IsClose(const Vector &inV2, float inMaxDistSq = 1.0e-12f) const
	{
		return (inV2 - *this).LengthSq() <= inMaxDistSq;
	}

	/// Assignment
	inline Vector & operator = (const Vector &) = default;

	/// Multiply vector with float
	inline Vector operator * (const float inV2) const
	{
		Vector v;
		for (uint r = 0; r < Rows; ++r)
			v.mF32[r] = mF32[r] * inV2;
		return v;
	}

	inline Vector & operator *= (const float inV2)
	{
		for (uint r = 0; r < Rows; ++r)
			mF32[r] *= inV2;
		return *this;
	}

	/// Multiply vector with float
	inline friend Vector operator * (const float inV1, const Vector &inV2)
	{
		return inV2 * inV1;
	}

	/// Divide vector by float
	inline Vector operator / (float inV2) const
	{
		Vector v;
		for (uint r = 0; r < Rows; ++r)
			v.mF32[r] = mF32[r] / inV2;
		return v;
	}

	inline Vector & operator /= (float inV2)
	{
		for (uint r = 0; r < Rows; ++r)
			mF32[r] /= inV2;
		return *this;
	}

	/// Add two float vectors (component wise)
	inline Vector operator + (const Vector &inV2) const
	{
		Vector v;
		for (uint r = 0; r < Rows; ++r)
			v.mF32[r] = mF32[r] + inV2.mF32[r];
		return v;
	}

	inline Vector & operator += (const Vector &inV2)
	{
		for (uint r = 0; r < Rows; ++r)
			mF32[r] += inV2.mF32[r];
		return *this;
	}

	/// Negate
	inline Vector operator - () const
	{
		Vector v;
		for (uint r = 0; r < Rows; ++r)
			v.mF32[r] = -mF32[r];
		return v;
	}

	/// Subtract two float vectors (component wise)
	inline Vector operator - (const Vector &inV2) const
	{
		Vector v;
		for (uint r = 0; r < Rows; ++r)
			v.mF32[r] = mF32[r] - inV2.mF32[r];
		return v;
	}

	inline Vector & operator -= (const Vector &inV2)
	{
		for (uint r = 0; r < Rows; ++r)
			mF32[r] -= inV2.mF32[r];
		return *this;
	}

	/// Dot product
	inline float Dot(const Vector &inV2) const
	{
		float dot = 0.0f;
		for (uint r = 0; r < Rows; ++r)
			dot += mF32[r] * inV2.mF32[r];
		return dot;
	}

	/// Squared length of vector
	inline float LengthSq() const
	{
		return Dot(*this);
	}

	/// Length of vector
	inline float Length() const
	{
		return sqrt(LengthSq());
	}

	/// Check if vector is normalized (length 1 within inToleranceSq).
	/// Marked const (fix): this is a pure query and must be callable on a const Vector, consistent with IsZero / IsClose.
	inline bool IsNormalized(float inToleranceSq = 1.0e-6f) const
	{
		return abs(LengthSq() - 1.0f) <= inToleranceSq;
	}

	/// Normalize vector (divides by the length, so the result is not finite for a zero vector)
	inline Vector Normalized() const
	{
		return *this / Length();
	}

	/// To String
	friend ostream & operator << (ostream &inStream, const Vector &inV)
	{
		inStream << "[";
		for (uint i = 0; i < Rows - 1; ++i)
			inStream << inV.mF32[i] << ", ";
		inStream << inV.mF32[Rows - 1] << "]";
		return inStream;
	}

	/// Components of this vector
	float mF32[Rows];
};
JPH_NAMESPACE_END