initial commit, 4.5 stable
Some checks failed
🔗 GHA / 📊 Static checks (push) Has been cancelled
🔗 GHA / 🤖 Android (push) Has been cancelled
🔗 GHA / 🍏 iOS (push) Has been cancelled
🔗 GHA / 🐧 Linux (push) Has been cancelled
🔗 GHA / 🍎 macOS (push) Has been cancelled
🔗 GHA / 🏁 Windows (push) Has been cancelled
🔗 GHA / 🌐 Web (push) Has been cancelled

2025-09-16 20:46:46 -04:00
commit 9d30169a8d
13378 changed files with 7050105 additions and 0 deletions

1247
thirdparty/README.md vendored Normal file

File diff suppressed because it is too large

23
thirdparty/accesskit/LICENSE-MIT vendored Normal file

@@ -0,0 +1,23 @@
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

2443
thirdparty/accesskit/include/accesskit.h vendored Normal file

File diff suppressed because it is too large

2656
thirdparty/amd-fsr/ffx_a.h vendored Normal file

File diff suppressed because it is too large

1199
thirdparty/amd-fsr/ffx_fsr1.h vendored Normal file

File diff suppressed because it is too large

19
thirdparty/amd-fsr/license.txt vendored Normal file

@@ -0,0 +1,19 @@
Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

21
thirdparty/amd-fsr2/LICENSE.txt vendored Normal file

@@ -0,0 +1,21 @@
FidelityFX Super Resolution 2.2
=================================
Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

81
thirdparty/amd-fsr2/ffx_assert.cpp vendored Normal file

@@ -0,0 +1,81 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include "ffx_assert.h"
#include <stdlib.h> // for malloc()
#ifdef _WIN32
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h> // required for OutputDebugString()
#include <stdio.h> // required for sprintf_s
#endif // #ifdef _WIN32
static FfxAssertCallback s_assertCallback;
// set the printing callback function
void ffxAssertSetPrintingCallback(FfxAssertCallback callback)
{
s_assertCallback = callback;
return;
}
// implementation of assert reporting
bool ffxAssertReport(const char* file, int32_t line, const char* condition, const char* message)
{
if (!file) {
return true;
}
#ifdef _WIN32
// form the final assertion string and output to the TTY.
const size_t bufferSize = static_cast<size_t>(snprintf(nullptr, 0, "%s(%d): ASSERTION FAILED. %s\n", file, line, message ? message : condition)) + 1;
char* tempBuf = static_cast<char*>(malloc(bufferSize));
if (!tempBuf) {
return true;
}
if (!message) {
sprintf_s(tempBuf, bufferSize, "%s(%d): ASSERTION FAILED. %s\n", file, line, condition);
} else {
sprintf_s(tempBuf, bufferSize, "%s(%d): ASSERTION FAILED. %s\n", file, line, message);
}
if (!s_assertCallback) {
OutputDebugStringA(tempBuf);
} else {
s_assertCallback(tempBuf);
}
// free the buffer.
free(tempBuf);
#else
FFX_UNUSED(line);
FFX_UNUSED(condition);
FFX_UNUSED(message);
#endif
return true;
}

132
thirdparty/amd-fsr2/ffx_assert.h vendored Normal file

@@ -0,0 +1,132 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#pragma once
#include "ffx_types.h"
#include "ffx_util.h"
#ifdef __cplusplus
extern "C" {
#endif // #ifdef __cplusplus
#ifdef _DEBUG
#ifdef _WIN32
#ifdef DISABLE_FFX_DEBUG_BREAK
#define FFX_DEBUG_BREAK \
{ \
}
#else
/// Macro to force the debugger to break at this point in the code.
#define FFX_DEBUG_BREAK __debugbreak();
#endif
#else
#define FFX_DEBUG_BREAK \
{ \
}
#endif
#else
// don't allow debug break in release builds.
#define FFX_DEBUG_BREAK
#endif
/// A typedef for the callback function for assert printing.
///
/// This can be used to re-route printing of assert messages from the FFX backend
/// to another destination. For example instead of the default behaviour of printing
/// the assert messages to the debugger's TTY the message can be re-routed to a
/// MessageBox in a GUI application.
///
/// @param [in] message The message generated by the assert.
///
typedef void (*FfxAssertCallback)(const char* message);
/// Function to report an assert.
///
/// @param [in] file The name of the file as a string.
/// @param [in] line The index of the line in the file.
/// @param [in] condition The boolean condition that was tested.
/// @param [in] msg The optional message to print.
///
/// @returns
/// Always returns true.
///
FFX_API bool ffxAssertReport(const char* file, int32_t line, const char* condition, const char* msg);
/// Provides the ability to set a callback for assert messages.
///
/// @param [in] callback The callback function that will receive assert messages.
///
FFX_API void ffxAssertSetPrintingCallback(FfxAssertCallback callback);
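// A minimal usage sketch (illustrative only; myAssertPrinter is a
// hypothetical callback): re-routing assert messages to stderr instead of
// the debugger's TTY.
//
//     static void myAssertPrinter(const char* message)
//     {
//         fprintf(stderr, "%s", message);
//     }
//
//     // Early in application startup:
//     ffxAssertSetPrintingCallback(myAssertPrinter);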
#ifdef _DEBUG
/// Standard assert macro.
#define FFX_ASSERT(condition) \
do \
{ \
if (!(condition) && ffxAssertReport(__FILE__, __LINE__, #condition, NULL)) \
FFX_DEBUG_BREAK \
} while (0)
/// Assert macro with message.
#define FFX_ASSERT_MESSAGE(condition, msg) \
do \
{ \
if (!(condition) && ffxAssertReport(__FILE__, __LINE__, #condition, msg)) \
FFX_DEBUG_BREAK \
} while (0)
/// Assert macro that always fails.
#define FFX_ASSERT_FAIL(message) \
do \
{ \
ffxAssertReport(__FILE__, __LINE__, NULL, message); \
FFX_DEBUG_BREAK \
} while (0)
#else
// asserts disabled
#define FFX_ASSERT(condition) \
do \
{ \
FFX_UNUSED(condition); \
} while (0)
#define FFX_ASSERT_MESSAGE(condition, message) \
do \
{ \
FFX_UNUSED(condition); \
FFX_UNUSED(message); \
} while (0)
#define FFX_ASSERT_FAIL(message) \
do \
{ \
FFX_UNUSED(message); \
} while (0)
#endif // #ifdef _DEBUG
/// Simple static assert.
#define FFX_STATIC_ASSERT(condition) static_assert(condition, #condition)
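// Illustrative usage of the assert macros (a sketch; the tested values are
// hypothetical):
//
//     FFX_ASSERT(contextDescription != NULL);
//     FFX_ASSERT_MESSAGE(renderWidth > 0, "render width must be non-zero");
//     FFX_STATIC_ASSERT(sizeof(uint32_t) == 4);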
#ifdef __cplusplus
}
#endif // #ifdef __cplusplus

59
thirdparty/amd-fsr2/ffx_error.h vendored Normal file

@@ -0,0 +1,59 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#pragma once
#include "ffx_types.h"
/// Typedef for error codes returned from functions in the FidelityFX SDK.
typedef int32_t FfxErrorCode;
static const FfxErrorCode FFX_OK = 0; ///< The operation completed successfully.
static const FfxErrorCode FFX_ERROR_INVALID_POINTER = 0x80000000; ///< The operation failed due to an invalid pointer.
static const FfxErrorCode FFX_ERROR_INVALID_ALIGNMENT = 0x80000001; ///< The operation failed due to an invalid alignment.
static const FfxErrorCode FFX_ERROR_INVALID_SIZE = 0x80000002; ///< The operation failed due to an invalid size.
static const FfxErrorCode FFX_EOF = 0x80000003; ///< The end of the file was encountered.
static const FfxErrorCode FFX_ERROR_INVALID_PATH = 0x80000004; ///< The operation failed because the specified path was invalid.
static const FfxErrorCode FFX_ERROR_EOF = 0x80000005; ///< The operation failed because end of file was reached.
static const FfxErrorCode FFX_ERROR_MALFORMED_DATA = 0x80000006; ///< The operation failed because of some malformed data.
static const FfxErrorCode FFX_ERROR_OUT_OF_MEMORY = 0x80000007; ///< The operation failed because it ran out of memory.
static const FfxErrorCode FFX_ERROR_INCOMPLETE_INTERFACE = 0x80000008; ///< The operation failed because the interface was not fully configured.
static const FfxErrorCode FFX_ERROR_INVALID_ENUM = 0x80000009; ///< The operation failed because of an invalid enumeration value.
static const FfxErrorCode FFX_ERROR_INVALID_ARGUMENT = 0x8000000a; ///< The operation failed because an argument was invalid.
static const FfxErrorCode FFX_ERROR_OUT_OF_RANGE = 0x8000000b; ///< The operation failed because a value was out of range.
static const FfxErrorCode FFX_ERROR_NULL_DEVICE = 0x8000000c; ///< The operation failed because a device was null.
static const FfxErrorCode FFX_ERROR_BACKEND_API_ERROR = 0x8000000d; ///< The operation failed because the backend API returned an error code.
static const FfxErrorCode FFX_ERROR_INSUFFICIENT_MEMORY = 0x8000000e; ///< The operation failed because there was not enough memory.
/// Helper macro to return error code y from a function when a specific condition, x, is not met.
#define FFX_RETURN_ON_ERROR(x, y) \
if (!(x)) \
{ \
return (y); \
}
/// Helper macro to return error code x from a function when it is not FFX_OK.
#define FFX_VALIDATE(x) \
{ \
FfxErrorCode ret = x; \
FFX_RETURN_ON_ERROR(ret == FFX_OK, ret); \
}
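// Illustrative usage of the helper macros (a sketch; ffxSomeOperation is a
// hypothetical function returning an FfxErrorCode):
//
//     FfxErrorCode myFunction(void* ptr)
//     {
//         // Bail out with a specific code when a precondition fails.
//         FFX_RETURN_ON_ERROR(ptr, FFX_ERROR_INVALID_POINTER);
//
//         // Forward any non-FFX_OK code from a nested call.
//         FFX_VALIDATE(ffxSomeOperation(ptr));
//         return FFX_OK;
//     }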

1369
thirdparty/amd-fsr2/ffx_fsr2.cpp vendored Normal file

File diff suppressed because it is too large

455
thirdparty/amd-fsr2/ffx_fsr2.h vendored Normal file

@@ -0,0 +1,455 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
// @defgroup FSR2
#pragma once
// Include the interface for the backend of the FSR2 API.
#include "ffx_fsr2_interface.h"
/// FidelityFX Super Resolution 2 major version.
///
/// @ingroup FSR2
#define FFX_FSR2_VERSION_MAJOR (2)
/// FidelityFX Super Resolution 2 minor version.
///
/// @ingroup FSR2
#define FFX_FSR2_VERSION_MINOR (2)
/// FidelityFX Super Resolution 2 patch version.
///
/// @ingroup FSR2
#define FFX_FSR2_VERSION_PATCH (1)
/// The size of the context specified in 32bit values.
///
/// @ingroup FSR2
#define FFX_FSR2_CONTEXT_SIZE (16536)
#if defined(__cplusplus)
extern "C" {
#endif // #if defined(__cplusplus)
/// An enumeration of all the quality modes supported by FidelityFX Super
/// Resolution 2 upscaling.
///
/// In order to provide a consistent user experience across multiple
/// applications which implement FSR2, it is strongly recommended that the
/// following preset scaling factors are made available through your
/// application's user interface.
///
/// If your application does not expose the notion of preset scaling factors
/// for upscaling algorithms (perhaps instead implementing a fixed ratio which
/// is immutable) or implements a more dynamic scaling scheme (such as
/// dynamic resolution scaling), then there is no need to use these presets.
///
/// Please note that <c><i>FFX_FSR2_QUALITY_MODE_ULTRA_PERFORMANCE</i></c> is
/// an optional mode which may introduce significant quality degradation in the
/// final image. As such it is recommended that you evaluate the final results
/// of using this scaling mode before deciding if you should include it in your
/// application.
///
/// @ingroup FSR2
typedef enum FfxFsr2QualityMode {
FFX_FSR2_QUALITY_MODE_QUALITY = 1, ///< Perform upscaling with a per-dimension upscaling ratio of 1.5x.
FFX_FSR2_QUALITY_MODE_BALANCED = 2, ///< Perform upscaling with a per-dimension upscaling ratio of 1.7x.
FFX_FSR2_QUALITY_MODE_PERFORMANCE = 3, ///< Perform upscaling with a per-dimension upscaling ratio of 2.0x.
FFX_FSR2_QUALITY_MODE_ULTRA_PERFORMANCE = 4 ///< Perform upscaling with a per-dimension upscaling ratio of 3.0x.
} FfxFsr2QualityMode;
/// An enumeration of bit flags used when creating a
/// <c><i>FfxFsr2Context</i></c>. See <c><i>FfxFsr2ContextDescription</i></c>.
///
/// @ingroup FSR2
typedef enum FfxFsr2InitializationFlagBits {
FFX_FSR2_ENABLE_HIGH_DYNAMIC_RANGE = (1<<0), ///< A bit indicating if the input color data provided is using a high-dynamic range.
FFX_FSR2_ENABLE_DISPLAY_RESOLUTION_MOTION_VECTORS = (1<<1), ///< A bit indicating if the motion vectors are rendered at display resolution.
FFX_FSR2_ENABLE_MOTION_VECTORS_JITTER_CANCELLATION = (1<<2), ///< A bit indicating that the motion vectors have the jittering pattern applied to them.
FFX_FSR2_ENABLE_DEPTH_INVERTED = (1<<3), ///< A bit indicating that the input depth buffer data provided is inverted [1..0].
FFX_FSR2_ENABLE_DEPTH_INFINITE = (1<<4), ///< A bit indicating that the input depth buffer data provided is using an infinite far plane.
FFX_FSR2_ENABLE_AUTO_EXPOSURE = (1<<5), ///< A bit indicating if automatic exposure should be applied to input color data.
FFX_FSR2_ENABLE_DYNAMIC_RESOLUTION = (1<<6), ///< A bit indicating that the application uses dynamic resolution scaling.
FFX_FSR2_ENABLE_TEXTURE1D_USAGE = (1<<7), ///< A bit indicating that the backend should use 1D textures.
FFX_FSR2_ENABLE_DEBUG_CHECKING = (1<<8), ///< A bit indicating that the runtime should check some API values and report issues.
} FfxFsr2InitializationFlagBits;
/// A structure encapsulating the parameters required to initialize FidelityFX
/// Super Resolution 2 upscaling.
///
/// @ingroup FSR2
typedef struct FfxFsr2ContextDescription {
uint32_t flags; ///< A collection of <c><i>FfxFsr2InitializationFlagBits</i></c>.
FfxDimensions2D maxRenderSize; ///< The maximum size that rendering will be performed at.
FfxDimensions2D displaySize; ///< The size of the presentation resolution targeted by the upscaling process.
FfxFsr2Interface callbacks; ///< A set of pointers to the backend implementation for FSR 2.0.
FfxDevice device; ///< The abstracted device which is passed to some callback functions.
FfxFsr2Message fpMessage; ///< A pointer to a function that can receive messages from the runtime.
} FfxFsr2ContextDescription;
/// A structure encapsulating the parameters for dispatching the various passes
/// of FidelityFX Super Resolution 2.
///
/// @ingroup FSR2
typedef struct FfxFsr2DispatchDescription {
FfxCommandList commandList; ///< The <c><i>FfxCommandList</i></c> to record FSR2 rendering commands into.
FfxResource color; ///< A <c><i>FfxResource</i></c> containing the color buffer for the current frame (at render resolution).
FfxResource depth; ///< A <c><i>FfxResource</i></c> containing 32bit depth values for the current frame (at render resolution).
FfxResource motionVectors; ///< A <c><i>FfxResource</i></c> containing 2-dimensional motion vectors (at render resolution if <c><i>FFX_FSR2_ENABLE_DISPLAY_RESOLUTION_MOTION_VECTORS</i></c> is not set).
FfxResource exposure; ///< An optional <c><i>FfxResource</i></c> containing a 1x1 exposure value.
FfxResource reactive; ///< An optional <c><i>FfxResource</i></c> containing the alpha value of reactive objects in the scene.
FfxResource transparencyAndComposition; ///< An optional <c><i>FfxResource</i></c> containing the alpha value of special objects in the scene.
FfxResource output; ///< A <c><i>FfxResource</i></c> containing the output color buffer for the current frame (at presentation resolution).
FfxFloatCoords2D jitterOffset; ///< The subpixel jitter offset applied to the camera.
FfxFloatCoords2D motionVectorScale; ///< The scale factor to apply to motion vectors.
FfxDimensions2D renderSize; ///< The resolution that was used for rendering the input resources.
bool enableSharpening; ///< Enable an additional sharpening pass.
float sharpness; ///< The sharpness value between 0 and 1, where 0 is no additional sharpness and 1 is maximum additional sharpness.
float frameTimeDelta; ///< The time elapsed since the last frame (expressed in milliseconds).
float preExposure; ///< The pre-exposure value (must be > 0.0f).
bool reset; ///< A boolean value which when set to true, indicates the camera has moved discontinuously.
float cameraNear; ///< The distance to the near plane of the camera.
float cameraFar; ///< The distance to the far plane of the camera.
float cameraFovAngleVertical; ///< The camera angle field of view in the vertical direction (expressed in radians).
float viewSpaceToMetersFactor; ///< The scale factor to convert view space units to meters.
// EXPERIMENTAL reactive mask generation parameters
bool enableAutoReactive; ///< A boolean value to indicate internal reactive autogeneration should be used.
FfxResource colorOpaqueOnly; ///< A <c><i>FfxResource</i></c> containing the opaque-only color buffer for the current frame (at render resolution).
float autoTcThreshold; ///< Cutoff value for the transparency and composition (TC) mask.
float autoTcScale; ///< A value to scale the transparency and composition mask.
float autoReactiveScale; ///< A value to scale the reactive mask.
float autoReactiveMax; ///< A value to clamp the reactive mask.
float reprojectionMatrix[16]; ///< The matrix used for reprojecting pixels with invalid motion vectors by using the depth.
} FfxFsr2DispatchDescription;
/// A structure encapsulating the parameters for automatic generation of a reactive mask
///
/// @ingroup FSR2
typedef struct FfxFsr2GenerateReactiveDescription {
FfxCommandList commandList; ///< The <c><i>FfxCommandList</i></c> to record FSR2 rendering commands into.
FfxResource colorOpaqueOnly; ///< A <c><i>FfxResource</i></c> containing the opaque only color buffer for the current frame (at render resolution).
FfxResource colorPreUpscale; ///< A <c><i>FfxResource</i></c> containing the opaque+translucent color buffer for the current frame (at render resolution).
FfxResource outReactive; ///< A <c><i>FfxResource</i></c> containing the surface to generate the reactive mask into.
FfxDimensions2D renderSize; ///< The resolution that was used for rendering the input resources.
float scale; ///< A value to scale the output.
float cutoffThreshold; ///< A threshold value to generate a binary reactive mask.
float binaryValue; ///< A value to set for the binary reactive mask.
uint32_t flags; ///< Flags to determine how to generate the reactive mask.
} FfxFsr2GenerateReactiveDescription;
/// A structure encapsulating the FidelityFX Super Resolution 2 context.
///
/// This sets up an object which contains all persistent internal data and
/// resources that are required by FSR2.
///
/// The <c><i>FfxFsr2Context</i></c> object should have a lifetime matching
/// your use of FSR2. Before destroying the FSR2 context care should be taken
/// to ensure the GPU is not accessing the resources created or used by FSR2.
/// It is therefore recommended that the GPU is idle before destroying the
/// FSR2 context.
///
/// @ingroup FSR2
typedef struct FfxFsr2Context {
uint32_t data[FFX_FSR2_CONTEXT_SIZE]; ///< An opaque set of <c>uint32_t</c> which contain the data for the context.
} FfxFsr2Context;
/// Create a FidelityFX Super Resolution 2 context from the parameters
/// programmed to the <c><i>FfxFsr2ContextDescription</i></c> structure.
///
/// The context structure is the main object used to interact with the FSR2
/// API, and is responsible for the management of the internal resources used
/// by the FSR2 algorithm. When this API is called, multiple calls will be
/// made via the pointers contained in the <c><i>callbacks</i></c> structure.
/// These callbacks will attempt to retrieve the device capabilities, and
/// create the internal resources and pipelines required by FSR2's
/// frame-to-frame function. Depending on the precise configuration used when
/// creating the <c><i>FfxFsr2Context</i></c> a different set of resources and
/// pipelines might be requested via the callback functions.
///
/// The flags included in the <c><i>flags</i></c> field of
/// <c><i>FfxFsr2ContextDescription</i></c> should match the configuration of your
/// application as well as the intended use of FSR2. It is important that these
/// flags are set correctly (as well as a correctly programmed
/// <c><i>FfxFsr2DispatchDescription</i></c>) to ensure correct operation. It is
/// recommended to consult the overview documentation for further details on
/// how FSR2 should be integrated into an application.
///
/// When the <c><i>FfxFsr2Context</i></c> is created, you should use the
/// <c><i>ffxFsr2ContextDispatch</i></c> function each frame where FSR2
/// upscaling should be applied. See the documentation of
/// <c><i>ffxFsr2ContextDispatch</i></c> for more details.
///
/// The <c><i>FfxFsr2Context</i></c> should be destroyed when use of it is
/// completed, typically when an application is unloaded or FSR2 upscaling is
/// disabled by a user. To destroy the FSR2 context you should call
/// <c><i>ffxFsr2ContextDestroy</i></c>.
///
/// @param [out] context A pointer to a <c><i>FfxFsr2Context</i></c> structure to populate.
/// @param [in] contextDescription A pointer to a <c><i>FfxFsr2ContextDescription</i></c> structure.
///
/// @retval
/// FFX_OK The operation completed successfully.
/// @retval
/// FFX_ERROR_CODE_NULL_POINTER The operation failed because either <c><i>context</i></c> or <c><i>contextDescription</i></c> was <c><i>NULL</i></c>.
/// @retval
/// FFX_ERROR_INCOMPLETE_INTERFACE The operation failed because the <c><i>FfxFsr2ContextDescription.callbacks</i></c> was not fully specified.
/// @retval
/// FFX_ERROR_BACKEND_API_ERROR The operation failed because of an error returned from the backend.
///
/// @ingroup FSR2
FFX_API FfxErrorCode ffxFsr2ContextCreate(FfxFsr2Context* context, const FfxFsr2ContextDescription* contextDescription);
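// A minimal creation sketch, assuming the DirectX 12 backend (the
// ffxGetDeviceDX12 and ffxFsr2GetInterfaceDX12 helpers live in that backend,
// not in this header; the sizes and flags used here are examples only):
//
//     const size_t scratchSize = ffxFsr2GetScratchMemorySizeDX12();
//     void* scratch = malloc(scratchSize);
//
//     FfxFsr2ContextDescription desc = {};
//     ffxFsr2GetInterfaceDX12(&desc.callbacks, d3d12Device, scratch, scratchSize);
//     desc.device = ffxGetDeviceDX12(d3d12Device);
//     desc.flags = FFX_FSR2_ENABLE_AUTO_EXPOSURE;
//     desc.maxRenderSize.width = renderWidth;
//     desc.maxRenderSize.height = renderHeight;
//     desc.displaySize.width = displayWidth;
//     desc.displaySize.height = displayHeight;
//
//     FfxFsr2Context context;
//     FFX_VALIDATE(ffxFsr2ContextCreate(&context, &desc));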
/// Dispatch the various passes that constitute FidelityFX Super Resolution 2.
///
/// FSR2 is a composite effect, meaning that it is comprised of multiple
/// constituent passes (implemented as one or more clears, copies and compute
/// dispatches). The <c><i>ffxFsr2ContextDispatch</i></c> function is the
/// function which (via the use of the functions contained in the
/// <c><i>callbacks</i></c> field of the <c><i>FfxFsr2Context</i></c>
/// structure) ultimately generates the sequence of graphics API calls required
/// each frame.
///
/// As with the creation of the <c><i>FfxFsr2Context</i></c> correctly
/// programming the <c><i>FfxFsr2DispatchDescription</i></c> is key to ensuring
/// the correct operation of FSR2. It is particularly important to ensure that
/// camera jitter is correctly applied to your application's projection matrix
/// (or camera origin for raytraced applications). FSR2 provides the
/// <c><i>ffxFsr2GetJitterPhaseCount</i></c> and
/// <c><i>ffxFsr2GetJitterOffset</i></c> entry points to help applications
/// correctly compute the camera jitter. Whatever jitter pattern is used by the
/// application it should be correctly programmed to the
/// <c><i>jitterOffset</i></c> field of the <c><i>dispatchDescription</i></c>
/// structure. For more guidance on camera jitter please consult the
/// documentation for <c><i>ffxFsr2GetJitterOffset</i></c> as well as the
/// accompanying overview documentation for FSR2.
///
/// @param [in] context A pointer to a <c><i>FfxFsr2Context</i></c> structure.
/// @param [in] dispatchDescription A pointer to a <c><i>FfxFsr2DispatchDescription</i></c> structure.
///
/// @retval
/// FFX_OK The operation completed successfully.
/// @retval
/// FFX_ERROR_CODE_NULL_POINTER The operation failed because either <c><i>context</i></c> or <c><i>dispatchDescription</i></c> was <c><i>NULL</i></c>.
/// @retval
/// FFX_ERROR_OUT_OF_RANGE The operation failed because <c><i>dispatchDescription.renderSize</i></c> was larger than the maximum render resolution.
/// @retval
/// FFX_ERROR_NULL_DEVICE The operation failed because the device inside the context was <c><i>NULL</i></c>.
/// @retval
/// FFX_ERROR_BACKEND_API_ERROR The operation failed because of an error returned from the backend.
///
/// @ingroup FSR2
FFX_API FfxErrorCode ffxFsr2ContextDispatch(FfxFsr2Context* context, const FfxFsr2DispatchDescription* dispatchDescription);
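// A per-frame dispatch sketch (illustrative; the ffxGetResourceDX12 and
// ffxGetCommandListDX12 helpers belong to the DirectX 12 backend, and the
// input variables are hypothetical):
//
//     FfxFsr2DispatchDescription dispatch = {};
//     dispatch.commandList = ffxGetCommandListDX12(d3d12CommandList);
//     dispatch.color = ffxGetResourceDX12(&context, colorTexture);
//     dispatch.depth = ffxGetResourceDX12(&context, depthTexture);
//     dispatch.motionVectors = ffxGetResourceDX12(&context, velocityTexture);
//     dispatch.output = ffxGetResourceDX12(&context, upscaledTexture);
//     dispatch.jitterOffset.x = jitterX;
//     dispatch.jitterOffset.y = jitterY;
//     dispatch.motionVectorScale.x = (float)renderWidth;
//     dispatch.motionVectorScale.y = (float)renderHeight;
//     dispatch.renderSize.width = renderWidth;
//     dispatch.renderSize.height = renderHeight;
//     dispatch.frameTimeDelta = deltaTimeMilliseconds;
//     dispatch.preExposure = 1.0f;
//     dispatch.cameraNear = nearPlane;
//     dispatch.cameraFar = farPlane;
//     dispatch.cameraFovAngleVertical = verticalFovRadians;
//     dispatch.reset = false;
//
//     FFX_VALIDATE(ffxFsr2ContextDispatch(&context, &dispatch));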
/// A helper function to generate a reactive mask from an opaque-only texture and one containing translucent objects.
///
/// @param [in] context A pointer to a <c><i>FfxFsr2Context</i></c> structure.
/// @param [in] params A pointer to a <c><i>FfxFsr2GenerateReactiveDescription</i></c> structure.
///
/// @retval
/// FFX_OK The operation completed successfully.
///
/// @ingroup FSR2
FFX_API FfxErrorCode ffxFsr2ContextGenerateReactiveMask(FfxFsr2Context* context, const FfxFsr2GenerateReactiveDescription* params);
/// Destroy the FidelityFX Super Resolution context.
///
/// @param [out] context A pointer to a <c><i>FfxFsr2Context</i></c> structure to destroy.
///
/// @retval
/// FFX_OK The operation completed successfully.
/// @retval
/// FFX_ERROR_CODE_NULL_POINTER The operation failed because <c><i>context</i></c> was <c><i>NULL</i></c>.
///
/// @ingroup FSR2
FFX_API FfxErrorCode ffxFsr2ContextDestroy(FfxFsr2Context* context);
/// Get the upscale ratio from the quality mode.
///
/// The following table enumerates the mapping of the quality modes to
/// per-dimension scaling ratios.
///
/// Quality preset | Scale factor
/// ----------------------------------------------------- | -------------
/// <c><i>FFX_FSR2_QUALITY_MODE_QUALITY</i></c> | 1.5x
/// <c><i>FFX_FSR2_QUALITY_MODE_BALANCED</i></c> | 1.7x
/// <c><i>FFX_FSR2_QUALITY_MODE_PERFORMANCE</i></c> | 2.0x
/// <c><i>FFX_FSR2_QUALITY_MODE_ULTRA_PERFORMANCE</i></c> | 3.0x
///
/// Passing an invalid <c><i>qualityMode</i></c> will return 0.0f.
///
/// @param [in] qualityMode The quality mode preset.
///
/// @returns
/// The per-dimension upscaling ratio for
/// <c><i>qualityMode</i></c> according to the table above.
///
/// @ingroup FSR2
FFX_API float ffxFsr2GetUpscaleRatioFromQualityMode(FfxFsr2QualityMode qualityMode);
/// A helper function to calculate the rendering resolution from a target
/// resolution and desired quality level.
///
/// This function applies the scaling factor returned by
/// <c><i>ffxFsr2GetUpscaleRatioFromQualityMode</i></c> to each dimension.
///
/// @param [out] renderWidth A pointer to a <c>uint32_t</c> which will hold the calculated render resolution width.
/// @param [out] renderHeight A pointer to a <c>uint32_t</c> which will hold the calculated render resolution height.
/// @param [in] displayWidth The target display resolution width.
/// @param [in] displayHeight The target display resolution height.
/// @param [in] qualityMode The desired quality mode for FSR 2 upscaling.
///
/// @retval
/// FFX_OK The operation completed successfully.
/// @retval
/// FFX_ERROR_INVALID_POINTER Either <c><i>renderWidth</i></c> or <c><i>renderHeight</i></c> was <c>NULL</c>.
/// @retval
/// FFX_ERROR_INVALID_ENUM An invalid quality mode was specified.
///
/// @ingroup FSR2
FFX_API FfxErrorCode ffxFsr2GetRenderResolutionFromQualityMode(
uint32_t* renderWidth,
uint32_t* renderHeight,
uint32_t displayWidth,
uint32_t displayHeight,
FfxFsr2QualityMode qualityMode);
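// An illustrative call computing the render resolution for a 3840x2160
// target at the "Quality" preset (1.5x per dimension):
//
//     uint32_t renderWidth = 0;
//     uint32_t renderHeight = 0;
//     FfxErrorCode err = ffxFsr2GetRenderResolutionFromQualityMode(
//         &renderWidth, &renderHeight, 3840, 2160, FFX_FSR2_QUALITY_MODE_QUALITY);
//     // On FFX_OK: renderWidth == 2560, renderHeight == 1440.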
/// A helper function to calculate the jitter phase count from display
/// resolution.
///
/// For more detailed information about the application of camera jitter to
/// your application's rendering please refer to the
/// <c><i>ffxFsr2GetJitterOffset</i></c> function.
///
/// The table below shows the jitter phase count which this function
/// would return for each of the quality presets.
///
/// Quality preset | Scale factor | Phase count
/// ----------------------------------------------------- | ------------- | ---------------
/// <c><i>FFX_FSR2_QUALITY_MODE_QUALITY</i></c> | 1.5x | 18
/// <c><i>FFX_FSR2_QUALITY_MODE_BALANCED</i></c> | 1.7x | 23
/// <c><i>FFX_FSR2_QUALITY_MODE_PERFORMANCE</i></c> | 2.0x | 32
/// <c><i>FFX_FSR2_QUALITY_MODE_ULTRA_PERFORMANCE</i></c> | 3.0x | 72
/// Custom | [1..n]x | ceil(8*n^2)
///
/// @param [in] renderWidth The render resolution width.
/// @param [in] displayWidth The display resolution width.
///
/// @returns
/// The jitter phase count for the scaling factor between <c><i>renderWidth</i></c> and <c><i>displayWidth</i></c>.
///
/// @ingroup FSR2
FFX_API int32_t ffxFsr2GetJitterPhaseCount(int32_t renderWidth, int32_t displayWidth);
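// A worked example of the formula in the table's custom row: at the
// "Performance" preset the per-dimension scale factor n is 2.0, so the phase
// count is ceil(8 * 2.0^2) = 32, matching the table above.
//
//     const int32_t phaseCount = ffxFsr2GetJitterPhaseCount(1920, 3840); // n = 2.0
//     // phaseCount == 32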
/// A helper function to calculate the subpixel jitter offset.
///
/// FSR2 relies on the application to apply sub-pixel jittering while rendering.
/// This is typically included in the projection matrix of the camera. To make
/// the application of camera jitter simple, the FSR2 API provides a small set
/// of utility functions which compute the sub-pixel jitter offset for a
/// particular frame within a sequence of separate jitter offsets. To begin, the
/// index within the jitter phase must be computed. To calculate the
/// sequence's length, you can call the <c><i>ffxFsr2GetJitterPhaseCount</i></c>
/// function. The index should be a value which is incremented each frame modulo
/// the length of the sequence computed by <c><i>ffxFsr2GetJitterPhaseCount</i></c>.
/// The index within the jitter phase is passed to
/// <c><i>ffxFsr2GetJitterOffset</i></c> via the <c><i>index</i></c> parameter.
///
/// This function uses a Halton(2,3) sequence to compute the jitter offset.
/// The ultimate index used for the sequence is <c><i>index</i></c> %
/// <c><i>phaseCount</i></c>.
///
/// It is important to understand that the values returned from the
/// <c><i>ffxFsr2GetJitterOffset</i></c> function are in unit pixel space, and
/// in order to composite this correctly into a projection matrix we must
/// convert them into projection offsets. This is done as per the pseudo code
/// listing which is shown below.
///
/// const int32_t jitterPhaseCount = ffxFsr2GetJitterPhaseCount(renderWidth, displayWidth);
///
/// float jitterX = 0;
/// float jitterY = 0;
/// ffxFsr2GetJitterOffset(&jitterX, &jitterY, index, jitterPhaseCount);
///
/// const float projJitterX = 2.0f * jitterX / (float)renderWidth;
/// const float projJitterY = -2.0f * jitterY / (float)renderHeight;
/// const Matrix4 jitterTranslationMatrix = translateMatrix(Matrix3::identity, Vector3(projJitterX, projJitterY, 0));
/// const Matrix4 jitteredProjectionMatrix = jitterTranslationMatrix * projectionMatrix;
///
/// Jitter should be applied to all rendering. This includes opaque, alpha
/// transparent, and raytraced objects. For rasterized objects, the sub-pixel
/// jittering values calculated by the <c><i>ffxFsr2GetJitterOffset</i></c>
/// function can be applied to the camera projection matrix which is ultimately
/// used to perform transformations during vertex shading. For raytraced
/// rendering, the sub-pixel jitter should be applied to the ray's origin,
/// often the camera's position.
///
/// Whether you elect to use the <c><i>ffxFsr2GetJitterOffset</i></c> function
/// or your own sequence generator, you must program the
/// <c><i>jitterOffset</i></c> field of the
/// <c><i>FfxFsr2DispatchDescription</i></c> structure in order to inform FSR2
/// of the jitter offset that has been applied in order to render each frame.
///
/// If not using the recommended <c><i>ffxFsr2GetJitterOffset</i></c> function,
/// care should be taken that your jitter sequence never generates a null vector;
/// that is, a value of 0 in both the X and Y dimensions.
///
/// @param [out] outX A pointer to a <c>float</c> which will contain the subpixel jitter offset for the x dimension.
/// @param [out] outY A pointer to a <c>float</c> which will contain the subpixel jitter offset for the y dimension.
/// @param [in] index The index within the jitter sequence.
/// @param [in] phaseCount The length of the jitter phase. See <c><i>ffxFsr2GetJitterPhaseCount</i></c>.
///
/// @retval
/// FFX_OK The operation completed successfully.
/// @retval
/// FFX_ERROR_INVALID_POINTER Either <c><i>outX</i></c> or <c><i>outY</i></c> was <c>NULL</c>.
/// @retval
/// FFX_ERROR_INVALID_ARGUMENT Argument <c><i>phaseCount</i></c> must be greater than 0.
///
/// @ingroup FSR2
FFX_API FfxErrorCode ffxFsr2GetJitterOffset(float* outX, float* outY, int32_t index, int32_t phaseCount);
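// A per-frame sketch tying the helpers together (illustrative; frameIndex is
// a hypothetical counter incremented once per rendered frame):
//
//     const int32_t phaseCount = ffxFsr2GetJitterPhaseCount(renderWidth, displayWidth);
//     float jitterX = 0.0f;
//     float jitterY = 0.0f;
//     ffxFsr2GetJitterOffset(&jitterX, &jitterY, frameIndex % phaseCount, phaseCount);
//
//     // Program the same offset into the dispatch parameters so FSR2 can
//     // cancel the jitter during upscaling.
//     dispatchDescription.jitterOffset.x = jitterX;
//     dispatchDescription.jitterOffset.y = jitterY;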
/// A helper function to check if a resource is
/// <c><i>FFX_FSR2_RESOURCE_IDENTIFIER_NULL</i></c>.
///
/// @param [in] resource A <c><i>FfxResource</i></c>.
///
/// @returns
/// true The <c><i>resource</i></c> was not <c><i>FFX_FSR2_RESOURCE_IDENTIFIER_NULL</i></c>.
/// @returns
/// false The <c><i>resource</i></c> was <c><i>FFX_FSR2_RESOURCE_IDENTIFIER_NULL</i></c>.
///
/// @ingroup FSR2
FFX_API bool ffxFsr2ResourceIsNull(FfxResource resource);
#if defined(__cplusplus)
}
#endif // #if defined(__cplusplus)

395
thirdparty/amd-fsr2/ffx_fsr2_interface.h vendored Normal file

@@ -0,0 +1,395 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#pragma once
#include "ffx_assert.h"
#include "ffx_types.h"
#include "ffx_error.h"
// Include the FSR2 resources defined in the HLSL code. This is shared here to avoid getting out of sync.
#define FFX_CPU
#include "shaders/ffx_fsr2_resources.h"
#include "shaders/ffx_fsr2_common.h"
#if defined(__cplusplus)
extern "C" {
#endif // #if defined(__cplusplus)
FFX_FORWARD_DECLARE(FfxFsr2Interface);
/// An enumeration of all the passes which constitute the FSR2 algorithm.
///
/// FSR2 is implemented as a composite of several compute passes each
/// computing a key part of the final result. Each call to the
/// <c><i>FfxFsr2ScheduleGpuJobFunc</i></c> callback function will
/// correspond to a single pass included in <c><i>FfxFsr2Pass</i></c>. For a
/// more comprehensive description of each pass, please refer to the FSR2
/// reference documentation.
///
/// Please note in some cases e.g.: <c><i>FFX_FSR2_PASS_ACCUMULATE</i></c>
/// and <c><i>FFX_FSR2_PASS_ACCUMULATE_SHARPEN</i></c> either one pass or the
/// other will be used (they are mutually exclusive). The choice of which will
/// depend on the way the <c><i>FfxFsr2Context</i></c> is created and the
/// precise contents of <c><i>FfxFsr2DispatchDescription</i></c> each time a call
/// is made to <c><i>ffxFsr2ContextDispatch</i></c>.
///
/// @ingroup FSR2
typedef enum FfxFsr2Pass {
FFX_FSR2_PASS_DEPTH_CLIP = 0, ///< A pass which performs depth clipping.
FFX_FSR2_PASS_RECONSTRUCT_PREVIOUS_DEPTH = 1, ///< A pass which performs reconstruction of previous frame's depth.
FFX_FSR2_PASS_LOCK = 2, ///< A pass which calculates pixel locks.
FFX_FSR2_PASS_ACCUMULATE = 3, ///< A pass which performs upscaling.
FFX_FSR2_PASS_ACCUMULATE_SHARPEN = 4, ///< A pass which performs upscaling when sharpening is used.
FFX_FSR2_PASS_RCAS = 5, ///< A pass which performs sharpening.
FFX_FSR2_PASS_COMPUTE_LUMINANCE_PYRAMID = 6, ///< A pass which generates the luminance mipmap chain for the current frame.
FFX_FSR2_PASS_GENERATE_REACTIVE = 7, ///< An optional pass to generate a reactive mask.
FFX_FSR2_PASS_TCR_AUTOGENERATE = 8, ///< An optional pass to autogenerate the transparency & composition and reactive masks.
FFX_FSR2_PASS_COUNT ///< The number of passes performed by FSR2.
} FfxFsr2Pass;
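// A backend's pipeline-creation callback will typically branch on the pass,
// for example (a sketch; the shader selection is hypothetical):
//
//     switch (pass) {
//         case FFX_FSR2_PASS_DEPTH_CLIP:
//             // Select the depth-clip compute shader.
//             break;
//         case FFX_FSR2_PASS_ACCUMULATE:
//         case FFX_FSR2_PASS_ACCUMULATE_SHARPEN:
//             // Mutually exclusive accumulation passes; only one is used per
//             // dispatch, depending on whether sharpening is enabled.
//             break;
//         default:
//             break;
//     }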
typedef enum FfxFsr2MsgType {
FFX_FSR2_MESSAGE_TYPE_ERROR = 0,
FFX_FSR2_MESSAGE_TYPE_WARNING = 1,
FFX_FSR2_MESSAGE_TYPE_COUNT
} FfxFsr2MsgType;
/// Create and initialize the backend context.
///
/// The callback function sets up the backend context for rendering.
/// It will create or reference the device and create required internal data structures.
///
/// @param [in] backendInterface A pointer to the backend interface.
/// @param [in] device The FfxDevice obtained by ffxGetDevice(DX12/VK/...).
///
/// @retval
/// FFX_OK The operation completed successfully.
/// @retval
/// Anything else The operation failed.
///
/// @ingroup FSR2
typedef FfxErrorCode (*FfxFsr2CreateBackendContextFunc)(
FfxFsr2Interface* backendInterface,
FfxDevice device);
/// Get a list of capabilities of the device.
///
/// When creating an <c><i>FfxFsr2Context</i></c> it is desirable for the FSR2
/// core implementation to be aware of certain characteristics of the platform
/// that is being targeted. This is because some optimizations which FSR2
/// attempts to perform are more effective on certain classes of hardware than
/// others, or are not supported by older hardware. In order to avoid cases
/// where optimizations actually have the effect of decreasing performance, or
/// reduce the breadth of support provided by FSR2, FSR2 queries the
/// capabilities of the device to make such decisions.
///
/// For target platforms with fixed hardware support you need not implement
/// this callback function by querying the device, but instead may hardcode
/// what features are available on the platform.
///
/// @param [in] backendInterface A pointer to the backend interface.
/// @param [out] outDeviceCapabilities The device capabilities structure to fill out.
/// @param [in] device The device to query for capabilities.
///
/// @retval
/// FFX_OK The operation completed successfully.
/// @retval
/// Anything else The operation failed.
///
/// @ingroup FSR2
typedef FfxErrorCode(*FfxFsr2GetDeviceCapabilitiesFunc)(
FfxFsr2Interface* backendInterface,
FfxDeviceCapabilities* outDeviceCapabilities,
FfxDevice device);
/// Destroy the backend context and dereference the device.
///
/// This function is called when the <c><i>FfxFsr2Context</i></c> is destroyed.
///
/// @param [in] backendInterface A pointer to the backend interface.
///
/// @retval
/// FFX_OK The operation completed successfully.
/// @retval
/// Anything else The operation failed.
///
/// @ingroup FSR2
typedef FfxErrorCode(*FfxFsr2DestroyBackendContextFunc)(
FfxFsr2Interface* backendInterface);
/// Create a resource.
///
/// This callback is intended for the backend to create internal resources.
///
/// Please note: It is also possible that the creation of resources might
/// itself cause additional resources to be created by simply calling the
/// <c><i>FfxFsr2CreateResourceFunc</i></c> function pointer again. This is
/// useful when handling the initial creation of resources which must be
/// initialized. The flow in such a case would be an initial call to create the
/// CPU-side resource, another to create the GPU-side resource, and then a call
/// to schedule a copy render job to move the data between the two. Typically
/// this type of function call flow is only seen during the creation of an
/// <c><i>FfxFsr2Context</i></c>.
///
/// @param [in] backendInterface A pointer to the backend interface.
/// @param [in] createResourceDescription A pointer to a <c><i>FfxCreateResourceDescription</i></c>.
/// @param [out] outResource A pointer to a <c><i>FfxResource</i></c> object.
///
/// @retval
/// FFX_OK The operation completed successfully.
/// @retval
/// Anything else The operation failed.
///
/// @ingroup FSR2
typedef FfxErrorCode (*FfxFsr2CreateResourceFunc)(
FfxFsr2Interface* backendInterface,
const FfxCreateResourceDescription* createResourceDescription,
FfxResourceInternal* outResource);
/// Register a resource in the backend for the current frame.
///
/// Since FSR2 and the backend are not aware of how many different
/// resources will get passed to FSR2 over time, it's not safe
/// to register all resources simultaneously in the backend.
/// Also, passed resources may not be valid after the dispatch call.
/// As a result it's safest to register them as FfxResourceInternal
/// and clear them at the end of the dispatch call.
///
/// @param [in] backendInterface A pointer to the backend interface.
/// @param [in] inResource A pointer to a <c><i>FfxResource</i></c>.
/// @param [out] outResource A pointer to a <c><i>FfxResourceInternal</i></c> object.
///
/// @retval
/// FFX_OK The operation completed successfully.
/// @retval
/// Anything else The operation failed.
///
/// @ingroup FSR2
typedef FfxErrorCode(*FfxFsr2RegisterResourceFunc)(
FfxFsr2Interface* backendInterface,
const FfxResource* inResource,
FfxResourceInternal* outResource);
/// Unregister all temporary FfxResourceInternal from the backend.
///
/// Unregister FfxResourceInternal referencing resources passed to
/// a function as a parameter.
///
/// @param [in] backendInterface A pointer to the backend interface.
///
/// @retval
/// FFX_OK The operation completed successfully.
/// @retval
/// Anything else The operation failed.
///
/// @ingroup FSR2
typedef FfxErrorCode(*FfxFsr2UnregisterResourcesFunc)(
FfxFsr2Interface* backendInterface);
/// Retrieve a <c><i>FfxResourceDescription</i></c> matching a
/// <c><i>FfxResource</i></c> structure.
///
/// @param [in] backendInterface A pointer to the backend interface.
/// @param [in] resource A pointer to a <c><i>FfxResource</i></c> object.
///
/// @returns
/// A description of the resource.
///
/// @ingroup FSR2
typedef FfxResourceDescription (*FfxFsr2GetResourceDescriptionFunc)(
FfxFsr2Interface* backendInterface,
FfxResourceInternal resource);
/// Destroy a resource.
///
/// This callback is intended for the backend to release an internal resource.
///
/// @param [in] backendInterface A pointer to the backend interface.
/// @param [in] resource A pointer to a <c><i>FfxResource</i></c> object.
///
/// @retval
/// FFX_OK The operation completed successfully.
/// @retval
/// Anything else The operation failed.
///
/// @ingroup FSR2
typedef FfxErrorCode (*FfxFsr2DestroyResourceFunc)(
FfxFsr2Interface* backendInterface,
FfxResourceInternal resource);
/// Create a render pipeline.
///
/// A rendering pipeline contains the shader as well as resource bindpoints
/// and samplers.
///
/// @param [in] backendInterface A pointer to the backend interface.
/// @param [in] pass The identifier for the pass.
/// @param [in] pipelineDescription A pointer to a <c><i>FfxPipelineDescription</i></c> describing the pipeline to be created.
/// @param [out] outPipeline A pointer to a <c><i>FfxPipelineState</i></c> structure which should be populated.
///
/// @retval
/// FFX_OK The operation completed successfully.
/// @retval
/// Anything else The operation failed.
///
/// @ingroup FSR2
typedef FfxErrorCode (*FfxFsr2CreatePipelineFunc)(
FfxFsr2Interface* backendInterface,
FfxFsr2Pass pass,
const FfxPipelineDescription* pipelineDescription,
FfxPipelineState* outPipeline);
/// Destroy a render pipeline.
///
/// @param [in] backendInterface A pointer to the backend interface.
/// @param [out] pipeline A pointer to a <c><i>FfxPipelineState</i></c> structure which should be released.
///
/// @retval
/// FFX_OK The operation completed successfully.
/// @retval
/// Anything else The operation failed.
///
/// @ingroup FSR2
typedef FfxErrorCode (*FfxFsr2DestroyPipelineFunc)(
FfxFsr2Interface* backendInterface,
FfxPipelineState* pipeline);
/// Schedule a render job to be executed on the next call of
/// <c><i>FfxFsr2ExecuteGpuJobsFunc</i></c>.
///
/// Render jobs can perform one of three different tasks: clear, copy or
/// compute dispatches.
///
/// @param [in] backendInterface A pointer to the backend interface.
/// @param [in] job A pointer to a <c><i>FfxGpuJobDescription</i></c> structure.
///
/// @retval
/// FFX_OK The operation completed successfully.
/// @retval
/// Anything else The operation failed.
///
/// @ingroup FSR2
typedef FfxErrorCode (*FfxFsr2ScheduleGpuJobFunc)(
FfxFsr2Interface* backendInterface,
const FfxGpuJobDescription* job);
/// Execute scheduled render jobs on the <c><i>commandList</i></c> provided.
///
/// The recording of the graphics API commands should take place in this
/// callback function, the render jobs which were previously enqueued (via
/// callbacks made to <c><i>FfxFsr2ScheduleGpuJobFunc</i></c>) should be
/// processed in the order they were received. Advanced users might choose to
/// reorder the rendering jobs, but should do so with care to respect the
/// resource dependencies.
///
/// Depending on the precise contents of <c><i>FfxFsr2DispatchDescription</i></c> a
/// different number of render jobs might have previously been enqueued (for
/// example if sharpening is toggled on and off).
///
/// @param [in] backendInterface A pointer to the backend interface.
/// @param [in] commandList A pointer to a <c><i>FfxCommandList</i></c> structure.
///
/// @retval
/// FFX_OK The operation completed successfully.
/// @retval
/// Anything else The operation failed.
///
/// @ingroup FSR2
typedef FfxErrorCode (*FfxFsr2ExecuteGpuJobsFunc)(
FfxFsr2Interface* backendInterface,
FfxCommandList commandList);
/// Pass a string message.
///
/// Used for debug messages.
///
/// @param [in] type The type of message.
/// @param [in] message A string message to pass.
///
///
/// @ingroup FSR2
typedef void(*FfxFsr2Message)(
FfxFsr2MsgType type,
const wchar_t* message);
/// A structure encapsulating the interface between the core implementation of
/// the FSR2 algorithm and any graphics API that it should ultimately call.
///
/// This set of functions serves as an abstraction layer between FSR2 and the
/// API used to implement it. While FSR2 ships with backends for DirectX12 and
/// Vulkan, it is possible to implement your own backend for other platforms or
/// which sits ontop of your engine's own abstraction layer. For details on the
/// expectations of what each function should do you should refer the
/// description of the following function pointer types:
///
/// <c><i>FfxFsr2CreateBackendContextFunc</i></c>
/// <c><i>FfxFsr2GetDeviceCapabilitiesFunc</i></c>
/// <c><i>FfxFsr2DestroyBackendContextFunc</i></c>
/// <c><i>FfxFsr2CreateResourceFunc</i></c>
/// <c><i>FfxFsr2RegisterResourceFunc</i></c>
/// <c><i>FfxFsr2UnregisterResourcesFunc</i></c>
/// <c><i>FfxFsr2GetResourceDescriptionFunc</i></c>
/// <c><i>FfxFsr2DestroyResourceFunc</i></c>
/// <c><i>FfxFsr2CreatePipelineFunc</i></c>
/// <c><i>FfxFsr2DestroyPipelineFunc</i></c>
/// <c><i>FfxFsr2ScheduleGpuJobFunc</i></c>
/// <c><i>FfxFsr2ExecuteGpuJobsFunc</i></c>
///
/// Depending on the graphics API that is abstracted by the backend, it may be
/// required that the backend is to some extent stateful. To ensure that
/// applications retain full control to manage the memory used by FSR2, the
/// <c><i>scratchBuffer</i></c> and <c><i>scratchBufferSize</i></c> fields are
/// provided. A backend should provide a means of specifying how much scratch
/// memory is required for its internal implementation (e.g: via a function
/// or constant value). The application is then responsible for allocating that
/// memory and providing it when setting up the FSR2 backend. Backends provided
/// with FSR2 do not perform dynamic memory allocations, and instead
/// suballocate all memory from the scratch buffers provided.
///
/// The <c><i>scratchBuffer</i></c> and <c><i>scratchBufferSize</i></c> fields
/// should be populated according to the requirements of each backend. For
/// example, if using the DirectX 12 backend you should call the
/// <c><i>ffxFsr2GetScratchMemorySizeDX12</i></c> function. It is not required
/// that custom backend implementations use a scratch buffer.
///
/// @ingroup FSR2
typedef struct FfxFsr2Interface {
FfxFsr2CreateBackendContextFunc fpCreateBackendContext; ///< A callback function to create and initialize the backend context.
FfxFsr2GetDeviceCapabilitiesFunc fpGetDeviceCapabilities; ///< A callback function to query device capabilities.
FfxFsr2DestroyBackendContextFunc fpDestroyBackendContext; ///< A callback function to destroy the backend context. This also dereferences the device.
FfxFsr2CreateResourceFunc fpCreateResource; ///< A callback function to create a resource.
FfxFsr2RegisterResourceFunc fpRegisterResource; ///< A callback function to register an external resource.
FfxFsr2UnregisterResourcesFunc fpUnregisterResources; ///< A callback function to unregister external resources.
FfxFsr2GetResourceDescriptionFunc fpGetResourceDescription; ///< A callback function to retrieve a resource description.
FfxFsr2DestroyResourceFunc fpDestroyResource; ///< A callback function to destroy a resource.
FfxFsr2CreatePipelineFunc fpCreatePipeline; ///< A callback function to create a render or compute pipeline.
FfxFsr2DestroyPipelineFunc fpDestroyPipeline; ///< A callback function to destroy a render or compute pipeline.
FfxFsr2ScheduleGpuJobFunc fpScheduleGpuJob; ///< A callback function to schedule a render job.
FfxFsr2ExecuteGpuJobsFunc fpExecuteGpuJobs; ///< A callback function to execute all queued render jobs.
void* scratchBuffer; ///< A preallocated buffer for memory utilized internally by the backend.
size_t scratchBufferSize; ///< Size of the buffer pointed to by <c><i>scratchBuffer</i></c>.
} FfxFsr2Interface;
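// A wiring sketch for a custom backend (illustrative; the myBackend*
// functions are hypothetical implementations of the callback typedefs above):
//
//     FfxFsr2Interface backend = {};
//     backend.fpCreateBackendContext = myBackendCreateContext;
//     backend.fpGetDeviceCapabilities = myBackendGetDeviceCapabilities;
//     backend.fpDestroyBackendContext = myBackendDestroyContext;
//     backend.fpCreateResource = myBackendCreateResource;
//     backend.fpRegisterResource = myBackendRegisterResource;
//     backend.fpUnregisterResources = myBackendUnregisterResources;
//     backend.fpGetResourceDescription = myBackendGetResourceDescription;
//     backend.fpDestroyResource = myBackendDestroyResource;
//     backend.fpCreatePipeline = myBackendCreatePipeline;
//     backend.fpDestroyPipeline = myBackendDestroyPipeline;
//     backend.fpScheduleGpuJob = myBackendScheduleGpuJob;
//     backend.fpExecuteGpuJobs = myBackendExecuteGpuJobs;
//     backend.scratchBuffer = myScratchMemory;
//     backend.scratchBufferSize = myScratchMemorySize;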
#if defined(__cplusplus)
}
#endif // #if defined(__cplusplus)


@@ -0,0 +1,46 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
// @internal
#pragma once
static const int FFX_FSR2_MAXIMUM_BIAS_TEXTURE_WIDTH = 16;
static const int FFX_FSR2_MAXIMUM_BIAS_TEXTURE_HEIGHT = 16;
static const float ffxFsr2MaximumBias[] = {
2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.876f, 1.809f, 1.772f, 1.753f, 1.748f,
2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.869f, 1.801f, 1.764f, 1.745f, 1.739f,
2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.976f, 1.841f, 1.774f, 1.737f, 1.716f, 1.71f,
2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.914f, 1.784f, 1.716f, 1.673f, 1.649f, 1.641f,
2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.793f, 1.676f, 1.604f, 1.562f, 1.54f, 1.533f,
2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.802f, 1.619f, 1.536f, 1.492f, 1.467f, 1.454f, 1.449f,
2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.812f, 1.575f, 1.496f, 1.456f, 1.432f, 1.416f, 1.408f, 1.405f,
2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.555f, 1.479f, 1.438f, 1.413f, 1.398f, 1.387f, 1.381f, 1.379f,
2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.812f, 1.555f, 1.474f, 1.43f, 1.404f, 1.387f, 1.376f, 1.368f, 1.363f, 1.362f,
2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.802f, 1.575f, 1.479f, 1.43f, 1.401f, 1.382f, 1.369f, 1.36f, 1.354f, 1.351f, 1.35f,
2.0f, 2.0f, 1.976f, 1.914f, 1.793f, 1.619f, 1.496f, 1.438f, 1.404f, 1.382f, 1.367f, 1.357f, 1.349f, 1.344f, 1.341f, 1.34f,
1.876f, 1.869f, 1.841f, 1.784f, 1.676f, 1.536f, 1.456f, 1.413f, 1.387f, 1.369f, 1.357f, 1.347f, 1.341f, 1.336f, 1.333f, 1.332f,
1.809f, 1.801f, 1.774f, 1.716f, 1.604f, 1.492f, 1.432f, 1.398f, 1.376f, 1.36f, 1.349f, 1.341f, 1.335f, 1.33f, 1.328f, 1.327f,
1.772f, 1.764f, 1.737f, 1.673f, 1.562f, 1.467f, 1.416f, 1.387f, 1.368f, 1.354f, 1.344f, 1.336f, 1.33f, 1.326f, 1.323f, 1.323f,
1.753f, 1.745f, 1.716f, 1.649f, 1.54f, 1.454f, 1.408f, 1.381f, 1.363f, 1.351f, 1.341f, 1.333f, 1.328f, 1.323f, 1.321f, 1.32f,
1.748f, 1.739f, 1.71f, 1.641f, 1.533f, 1.449f, 1.405f, 1.379f, 1.362f, 1.35f, 1.34f, 1.332f, 1.327f, 1.323f, 1.32f, 1.319f,
};

84
thirdparty/amd-fsr2/ffx_fsr2_private.h vendored Normal file
View File

@@ -0,0 +1,84 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#pragma once
// Constants for FSR2 DX12 dispatches. Must be kept in sync with cbFSR2 in ffx_fsr2_callbacks_hlsl.h
typedef struct Fsr2Constants {
int32_t renderSize[2];
int32_t maxRenderSize[2];
int32_t displaySize[2];
int32_t inputColorResourceDimensions[2];
int32_t lumaMipDimensions[2];
int32_t lumaMipLevelToUse;
int32_t frameIndex;
float deviceToViewDepth[4];
float jitterOffset[2];
float motionVectorScale[2];
float downscaleFactor[2];
float motionVectorJitterCancellation[2];
float preExposure;
float previousFramePreExposure;
float tanHalfFOV;
float jitterPhaseCount;
float deltaTime;
float dynamicResChangeFactor;
float viewSpaceToMetersFactor;
float pad;
float reprojectionMatrix[16];
} Fsr2Constants;
struct FfxFsr2ContextDescription;
struct FfxDeviceCapabilities;
struct FfxPipelineState;
struct FfxResource;
// FfxFsr2Context_Private
// The private implementation of the FSR2 context.
typedef struct FfxFsr2Context_Private {
FfxFsr2ContextDescription contextDescription;
Fsr2Constants constants;
FfxDevice device;
FfxDeviceCapabilities deviceCapabilities;
FfxPipelineState pipelineDepthClip;
FfxPipelineState pipelineReconstructPreviousDepth;
FfxPipelineState pipelineLock;
FfxPipelineState pipelineAccumulate;
FfxPipelineState pipelineAccumulateSharpen;
FfxPipelineState pipelineRCAS;
FfxPipelineState pipelineComputeLuminancePyramid;
FfxPipelineState pipelineGenerateReactive;
FfxPipelineState pipelineTcrAutogenerate;
// 2 arrays of resources, as e.g. FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS will use different resources when bound as SRV vs when bound as UAV
FfxResourceInternal srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_COUNT];
FfxResourceInternal uavResources[FFX_FSR2_RESOURCE_IDENTIFIER_COUNT];
bool firstExecution;
bool refreshPipelineStates;
uint32_t resourceFrameIndex;
float previousJitterOffset[2];
int32_t jitterPhaseCountRemaining;
} FfxFsr2Context_Private;

365
thirdparty/amd-fsr2/ffx_types.h vendored Normal file
View File

@@ -0,0 +1,365 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#pragma once
#include <stdint.h>
#include <stdlib.h>
#if defined (FFX_GCC)
/// FidelityFX exported functions
#define FFX_API
#else
/// FidelityFX exported functions
#define FFX_API __declspec(dllexport)
#endif // #if defined (FFX_GCC)
/// Maximum supported number of simultaneously bound SRVs.
#define FFX_MAX_NUM_SRVS 16
/// Maximum supported number of simultaneously bound UAVs.
#define FFX_MAX_NUM_UAVS 8
/// Maximum number of constant buffers bound.
#define FFX_MAX_NUM_CONST_BUFFERS 2
/// Maximum size of bound constant buffers.
#define FFX_MAX_CONST_SIZE 64
/// Off by default warnings
#if defined(_MSC_VER)
#pragma warning(disable : 4365 4710 4820 5039)
#elif defined(__clang__)
#pragma clang diagnostic ignored "-Wunused-parameter"
#pragma clang diagnostic ignored "-Wmissing-field-initializers"
#pragma clang diagnostic ignored "-Wsign-compare"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wignored-qualifiers"
#elif defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wunused-function"
#endif
#ifdef __cplusplus
extern "C" {
#endif // #ifdef __cplusplus
/// An enumeration of surface formats.
typedef enum FfxSurfaceFormat {
FFX_SURFACE_FORMAT_UNKNOWN, ///< Unknown format
FFX_SURFACE_FORMAT_R32G32B32A32_TYPELESS, ///< 32 bit per channel, 4 channel typeless format
FFX_SURFACE_FORMAT_R32G32B32A32_FLOAT, ///< 32 bit per channel, 4 channel float format
FFX_SURFACE_FORMAT_R16G16B16A16_FLOAT, ///< 16 bit per channel, 4 channel float format
FFX_SURFACE_FORMAT_R16G16B16A16_UNORM, ///< 16 bit per channel, 4 channel unsigned normalized format
FFX_SURFACE_FORMAT_R32G32_FLOAT, ///< 32 bit per channel, 2 channel float format
FFX_SURFACE_FORMAT_R32_UINT, ///< 32 bit per channel, 1 channel unsigned int format
FFX_SURFACE_FORMAT_R8G8B8A8_TYPELESS, ///< 8 bit per channel, 4 channel typeless format
FFX_SURFACE_FORMAT_R8G8B8A8_UNORM, ///< 8 bit per channel, 4 channel unsigned normalized format
FFX_SURFACE_FORMAT_R11G11B10_FLOAT, ///< 32 bit 3 channel float format
FFX_SURFACE_FORMAT_R16G16_FLOAT, ///< 16 bit per channel, 2 channel float format
FFX_SURFACE_FORMAT_R16G16_UINT, ///< 16 bit per channel, 2 channel unsigned int format
FFX_SURFACE_FORMAT_R16_FLOAT, ///< 16 bit per channel, 1 channel float format
FFX_SURFACE_FORMAT_R16_UINT, ///< 16 bit per channel, 1 channel unsigned int format
FFX_SURFACE_FORMAT_R16_UNORM, ///< 16 bit per channel, 1 channel unsigned normalized format
FFX_SURFACE_FORMAT_R16_SNORM, ///< 16 bit per channel, 1 channel signed normalized format
FFX_SURFACE_FORMAT_R8_UNORM, ///< 8 bit per channel, 1 channel unsigned normalized format
FFX_SURFACE_FORMAT_R8_UINT, ///< 8 bit per channel, 1 channel unsigned int format
FFX_SURFACE_FORMAT_R8G8_UNORM, ///< 8 bit per channel, 2 channel unsigned normalized format
FFX_SURFACE_FORMAT_R32_FLOAT ///< 32 bit per channel, 1 channel float format
} FfxSurfaceFormat;
/// An enumeration of resource usage.
typedef enum FfxResourceUsage {
FFX_RESOURCE_USAGE_READ_ONLY = 0, ///< No usage flags indicate a resource is read only.
FFX_RESOURCE_USAGE_RENDERTARGET = (1<<0), ///< Indicates a resource will be used as render target.
FFX_RESOURCE_USAGE_UAV = (1<<1), ///< Indicates a resource will be used as UAV.
} FfxResourceUsage;
/// An enumeration of resource states.
typedef enum FfxResourceStates {
FFX_RESOURCE_STATE_UNORDERED_ACCESS = (1<<0), ///< Indicates a resource is in the state to be used as UAV.
FFX_RESOURCE_STATE_COMPUTE_READ = (1 << 1), ///< Indicates a resource is in the state to be read by compute shaders.
FFX_RESOURCE_STATE_COPY_SRC = (1 << 2), ///< Indicates a resource is in the state to be used as source in a copy command.
FFX_RESOURCE_STATE_COPY_DEST = (1 << 3), ///< Indicates a resource is in the state to be used as destination in a copy command.
FFX_RESOURCE_STATE_GENERIC_READ = (FFX_RESOURCE_STATE_COPY_SRC | FFX_RESOURCE_STATE_COMPUTE_READ), ///< Indicates a resource is in generic (slow) read state.
} FfxResourceStates;
/// An enumeration of surface dimensions.
typedef enum FfxResourceDimension {
FFX_RESOURCE_DIMENSION_TEXTURE_1D, ///< A resource with a single dimension.
FFX_RESOURCE_DIMENSION_TEXTURE_2D, ///< A resource with two dimensions.
} FfxResourceDimension;
/// An enumeration of resource flags.
typedef enum FfxResourceFlags {
FFX_RESOURCE_FLAGS_NONE = 0, ///< No flags.
FFX_RESOURCE_FLAGS_ALIASABLE = (1<<0), ///< A bit indicating a resource does not need to persist across frames.
} FfxResourceFlags;
/// An enumeration of all resource view types.
typedef enum FfxResourceViewType {
FFX_RESOURCE_VIEW_UNORDERED_ACCESS, ///< The resource view is an unordered access view (UAV).
FFX_RESOURCE_VIEW_SHADER_READ, ///< The resource view is a shader resource view (SRV).
} FfxResourceViewType;
/// The type of filtering to perform when reading a texture.
typedef enum FfxFilterType {
FFX_FILTER_TYPE_POINT, ///< Point sampling.
FFX_FILTER_TYPE_LINEAR ///< Sampling with interpolation.
} FfxFilterType;
/// An enumeration of all supported shader models.
typedef enum FfxShaderModel {
FFX_SHADER_MODEL_5_1, ///< Shader model 5.1.
FFX_SHADER_MODEL_6_0, ///< Shader model 6.0.
FFX_SHADER_MODEL_6_1, ///< Shader model 6.1.
FFX_SHADER_MODEL_6_2, ///< Shader model 6.2.
FFX_SHADER_MODEL_6_3, ///< Shader model 6.3.
FFX_SHADER_MODEL_6_4, ///< Shader model 6.4.
FFX_SHADER_MODEL_6_5, ///< Shader model 6.5.
FFX_SHADER_MODEL_6_6, ///< Shader model 6.6.
FFX_SHADER_MODEL_6_7, ///< Shader model 6.7.
} FfxShaderModel;
/// An enumeration for different resource types
typedef enum FfxResourceType {
FFX_RESOURCE_TYPE_BUFFER, ///< The resource is a buffer.
FFX_RESOURCE_TYPE_TEXTURE1D, ///< The resource is a 1-dimensional texture.
FFX_RESOURCE_TYPE_TEXTURE2D, ///< The resource is a 2-dimensional texture.
FFX_RESOURCE_TYPE_TEXTURE3D, ///< The resource is a 3-dimensional texture.
} FfxResourceType;
/// An enumeration for different heap types
typedef enum FfxHeapType {
FFX_HEAP_TYPE_DEFAULT = 0, ///< Local memory.
FFX_HEAP_TYPE_UPLOAD ///< Heap used for uploading resources.
} FfxHeapType;
/// An enumeration for different render job types
typedef enum FfxGpuJobType {
FFX_GPU_JOB_CLEAR_FLOAT = 0, ///< The GPU job is performing a floating-point clear.
FFX_GPU_JOB_COPY = 1, ///< The GPU job is performing a copy.
FFX_GPU_JOB_COMPUTE = 2, ///< The GPU job is performing a compute dispatch.
} FfxGpuJobType;
/// A typedef representing the graphics device.
typedef void* FfxDevice;
/// A typedef representing a command list or command buffer.
typedef void* FfxCommandList;
/// A typedef for a root signature.
typedef void* FfxRootSignature;
/// A typedef for a pipeline state object.
typedef void* FfxPipeline;
/// A structure encapsulating a collection of device capabilities.
typedef struct FfxDeviceCapabilities {
FfxShaderModel minimumSupportedShaderModel; ///< The minimum shader model supported by the device.
uint32_t waveLaneCountMin; ///< The minimum supported wavefront width.
uint32_t waveLaneCountMax; ///< The maximum supported wavefront width.
bool fp16Supported; ///< The device supports FP16 in hardware.
bool raytracingSupported; ///< The device supports raytracing.
} FfxDeviceCapabilities;
/// A structure encapsulating a 2-dimensional point, using 32bit unsigned integers.
typedef struct FfxDimensions2D {
uint32_t width; ///< The width of a 2-dimensional range.
uint32_t height; ///< The height of a 2-dimensional range.
} FfxDimensions2D;
/// A structure encapsulating a 2-dimensional point, using 32bit signed integers.
typedef struct FfxIntCoords2D {
int32_t x; ///< The x coordinate of a 2-dimensional point.
int32_t y; ///< The y coordinate of a 2-dimensional point.
} FfxIntCoords2D;
/// A structure encapsulating a 2-dimensional set of floating point coordinates.
typedef struct FfxFloatCoords2D {
float x; ///< The x coordinate of a 2-dimensional point.
float y; ///< The y coordinate of a 2-dimensional point.
} FfxFloatCoords2D;
/// A structure describing a resource.
typedef struct FfxResourceDescription {
FfxResourceType type; ///< The type of the resource.
FfxSurfaceFormat format; ///< The surface format.
uint32_t width; ///< The width of the resource.
uint32_t height; ///< The height of the resource.
uint32_t depth; ///< The depth of the resource.
uint32_t mipCount; ///< Number of mips (or 0 for full mipchain).
FfxResourceFlags flags; ///< A set of <c><i>FfxResourceFlags</i></c> flags.
} FfxResourceDescription;
/// An outward facing structure containing a resource
typedef struct FfxResource {
void* resource; ///< pointer to the resource.
wchar_t name[64]; ///< Name of the resource, used for debug purposes.
FfxResourceDescription description; ///< A description of the resource.
FfxResourceStates state; ///< The state the resource is currently in.
bool isDepth; ///< Set if the resource is a depth buffer.
uint64_t descriptorData; ///< Opaque descriptor data for the backend.
} FfxResource;
/// An internal structure containing a handle to a resource and resource views
typedef struct FfxResourceInternal {
int32_t internalIndex; ///< The index of the resource.
} FfxResourceInternal;
/// A structure defining a resource bind point
typedef struct FfxResourceBinding
{
uint32_t slotIndex; ///< The binding slot index.
uint32_t resourceIdentifier; ///< The identifier of the resource bound at this slot.
wchar_t name[64]; ///< Name of the binding, used to match resources by name.
} FfxResourceBinding;
/// A structure encapsulating a single pass of an algorithm.
typedef struct FfxPipelineState {
FfxRootSignature rootSignature; ///< The pipeline's root signature.
FfxPipeline pipeline; ///< The pipeline object
uint32_t uavCount; ///< Count of UAVs used in this pipeline
uint32_t srvCount; ///< Count of SRVs used in this pipeline
uint32_t constCount; ///< Count of constant buffers used in this pipeline
FfxResourceBinding uavResourceBindings[FFX_MAX_NUM_UAVS]; ///< Array of ResourceIdentifiers bound as UAVs
FfxResourceBinding srvResourceBindings[FFX_MAX_NUM_SRVS]; ///< Array of ResourceIdentifiers bound as SRVs
FfxResourceBinding cbResourceBindings[FFX_MAX_NUM_CONST_BUFFERS]; ///< Array of ResourceIdentifiers bound as CBs
} FfxPipelineState;
/// A structure containing the data required to create a resource.
typedef struct FfxCreateResourceDescription {
FfxHeapType heapType; ///< The heap type to hold the resource, typically <c><i>FFX_HEAP_TYPE_DEFAULT</i></c>.
FfxResourceDescription resourceDescription; ///< A resource description.
FfxResourceStates initalState; ///< The initial resource state.
uint32_t initDataSize; ///< Size of initial data buffer.
void* initData; ///< Buffer containing data to fill the resource.
const wchar_t* name; ///< Name of the resource.
FfxResourceUsage usage; ///< Resource usage flags.
uint32_t id; ///< Internal resource ID.
} FfxCreateResourceDescription;
/// A structure containing the description used to create a
/// <c><i>FfxPipeline</i></c> structure.
///
/// A pipeline is the name given to a shader and the collection of state that
/// is required to dispatch it. In the context of FSR2 and its architecture
/// this means that a <c><i>FfxPipelineDescription</i></c> will map to either a
/// monolithic object in an explicit API (such as a
/// <c><i>PipelineStateObject</i></c> in DirectX 12), or a shader and some
/// ancillary API objects (in something like DirectX 11).
///
/// The <c><i>contextFlags</i></c> field contains a copy of the flags passed
/// to <c><i>ffxFsr2ContextCreate</i></c> via the <c><i>flags</i></c> field of
/// the <c><i>FfxFsr2InitializationParams</i></c> structure. These flags are
/// used to determine which permutation of a pipeline for a specific
/// <c><i>FfxFsr2Pass</i></c> should be used to implement the features required
/// by each application, as well as to achieve the best performance on specific
/// target hardware configurations.
///
/// When using one of the provided backends for FSR2 (such as DirectX 12 or
/// Vulkan) the data required to create a pipeline is compiled offline and
/// included into the backend library that you are using. For cases where the
/// backend interface is overridden by providing custom callback function
/// implementations, care should be taken to respect the contents of the
/// <c><i>contextFlags</i></c> field in order to correctly support the options
/// provided by FSR2, and achieve the best performance.
///
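/// As an illustrative sketch (the callback shape follows
/// <c><i>FfxFsr2CreatePipelineFunc</i></c> from the FSR2 interface header;
/// the HDR permutation selection shown here is a hypothetical example):
///
///     FfxErrorCode myCreatePipeline(FfxFsr2Interface* backendInterface,
///         FfxFsr2Pass pass, const FfxPipelineDescription* desc,
///         FfxPipelineState* outPipeline)
///     {
///         const bool hdr = (desc->contextFlags & FFX_FSR2_ENABLE_HIGH_DYNAMIC_RANGE) != 0;
///         // ... select and create the shader permutation matching 'hdr' ...
///         return FFX_OK;
///     }
///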
/// @ingroup FSR2
typedef struct FfxPipelineDescription {
uint32_t contextFlags; ///< A collection of <c><i>FfxFsr2InitializationFlagBits</i></c> which were passed to the context.
FfxFilterType* samplers; ///< Array of static samplers.
size_t samplerCount; ///< The number of samplers contained inside <c><i>samplers</i></c>.
const uint32_t* rootConstantBufferSizes; ///< Array containing the sizes of the root constant buffers (count of 32 bit elements).
uint32_t rootConstantBufferCount; ///< The number of root constant buffers contained within <c><i>rootConstantBufferSizes</i></c>.
} FfxPipelineDescription;
/// A structure containing a constant buffer.
typedef struct FfxConstantBuffer {
uint32_t uint32Size; ///< Size of the constant buffer data, in 32 bit chunks.
uint32_t data[FFX_MAX_CONST_SIZE]; ///< Constant buffer data.
} FfxConstantBuffer;
/// A structure describing a clear render job.
typedef struct FfxClearFloatJobDescription {
float color[4]; ///< The clear color of the resource.
FfxResourceInternal target; ///< The resource to be cleared.
} FfxClearFloatJobDescription;
/// A structure describing a compute render job.
typedef struct FfxComputeJobDescription {
FfxPipelineState pipeline; ///< Compute pipeline for the render job.
uint32_t dimensions[3]; ///< Dispatch dimensions.
FfxResourceInternal srvs[FFX_MAX_NUM_SRVS]; ///< SRV resources to be bound in the compute job.
wchar_t srvNames[FFX_MAX_NUM_SRVS][64]; ///< Names of the SRV resources to be bound.
FfxResourceInternal uavs[FFX_MAX_NUM_UAVS]; ///< UAV resources to be bound in the compute job.
uint32_t uavMip[FFX_MAX_NUM_UAVS]; ///< Mip level of UAV resources to be bound in the compute job.
wchar_t uavNames[FFX_MAX_NUM_UAVS][64]; ///< Names of the UAV resources to be bound.
FfxConstantBuffer cbs[FFX_MAX_NUM_CONST_BUFFERS]; ///< Constant buffers to be bound in the compute job.
wchar_t cbNames[FFX_MAX_NUM_CONST_BUFFERS][64]; ///< Names of the constant buffers to be bound.
uint32_t cbSlotIndex[FFX_MAX_NUM_CONST_BUFFERS]; ///< Slot index in the descriptor table
} FfxComputeJobDescription;
/// A structure describing a copy render job.
typedef struct FfxCopyJobDescription
{
FfxResourceInternal src; ///< Source resource for the copy.
FfxResourceInternal dst; ///< Destination resource for the copy.
} FfxCopyJobDescription;
/// A structure describing a single render job.
typedef struct FfxGpuJobDescription {
FfxGpuJobType jobType; ///< Type of the job.
union {
FfxClearFloatJobDescription clearJobDescriptor; ///< Clear job descriptor. Valid when <c><i>jobType</i></c> is <c><i>FFX_GPU_JOB_CLEAR_FLOAT</i></c>.
FfxCopyJobDescription copyJobDescriptor; ///< Copy job descriptor. Valid when <c><i>jobType</i></c> is <c><i>FFX_GPU_JOB_COPY</i></c>.
FfxComputeJobDescription computeJobDescriptor; ///< Compute job descriptor. Valid when <c><i>jobType</i></c> is <c><i>FFX_GPU_JOB_COMPUTE</i></c>.
};
} FfxGpuJobDescription;
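/// As an illustrative sketch (the resource handle and color values are
/// hypothetical): filling a clear job before submitting it through the
/// backend's <c><i>fpScheduleGpuJob</i></c> callback.
///
///     FfxGpuJobDescription job = {};
///     job.jobType = FFX_GPU_JOB_CLEAR_FLOAT;
///     job.clearJobDescriptor.target = lockStatusResource; // an FfxResourceInternal
///     job.clearJobDescriptor.color[0] = 0.0f;
///     job.clearJobDescriptor.color[1] = 0.0f;
///     job.clearJobDescriptor.color[2] = 0.0f;
///     job.clearJobDescriptor.color[3] = 0.0f;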
#ifdef __cplusplus
}
#endif // #ifdef __cplusplus

78
thirdparty/amd-fsr2/ffx_util.h vendored Normal file
View File

@@ -0,0 +1,78 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#pragma once
#include "ffx_types.h"
/// The value of Pi.
const float FFX_PI = 3.141592653589793f;
/// An epsilon value for floating point numbers.
const float FFX_EPSILON = 1e-06f;
/// Helper macro to create the version number.
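/// (For example, FFX_MAKE_VERSION(2, 2, 1) evaluates to 0x00802001.)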
#define FFX_MAKE_VERSION(major, minor, patch) ((major << 22) | (minor << 12) | patch)
/// Use this to specify no version.
#define FFX_UNSPECIFIED_VERSION 0xFFFFAD00
/// Helper macro to avoid warnings about unused variables.
#define FFX_UNUSED(x) ((void)(x))
/// Helper macro to align an integer to the specified power of 2 boundary
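/// (For example, FFX_ALIGN_UP(13, 8) evaluates to 16; <c><i>y</i></c> is assumed to be a power of two.)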
#define FFX_ALIGN_UP(x, y) (((x) + ((y)-1)) & ~((y)-1))
/// Helper macro to check if a value is aligned (i.e. a nonzero power of two).
#define FFX_IS_ALIGNED(x) (((x) != 0) && !((x) & ((x)-1)))
/// Helper macro to stringify a value.
#define FFX_STR(s) FFX_XSTR(s)
#define FFX_XSTR(s) #s
/// Helper macro to forward declare a structure.
#define FFX_FORWARD_DECLARE(x) typedef struct x x
/// Helper macro to return the maximum of two values.
#define FFX_MAXIMUM(x, y) (((x) > (y)) ? (x) : (y))
/// Helper macro to return the minimum of two values.
#define FFX_MINIMUM(x, y) (((x) < (y)) ? (x) : (y))
/// Helper macro to do safe free on a pointer.
#define FFX_SAFE_FREE(x) \
do {                     \
if (x)                   \
free(x);                 \
} while (0)
/// Helper macro to return the abs of an integer value.
#define FFX_ABSOLUTE(x) (((x) < 0) ? (-(x)) : (x))
/// Helper macro to return sign of a value.
#define FFX_SIGN(x) (((x) < 0) ? -1 : 1)
/// Helper macro to work out the number of elements in an array.
#define FFX_ARRAY_ELEMENTS(x) (int32_t)((sizeof(x) / sizeof(0 [x])) / ((size_t)(!(sizeof(x) % sizeof(0 [x])))))
/// The maximum length of a path that can be specified to the FidelityFX API.
#define FFX_MAXIMUM_PATH (260)
/// Helper macro to check if the specified key is set in a bitfield.
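/// (For example, FFX_CONTAINS_FLAG(FFX_RESOURCE_STATE_GENERIC_READ, FFX_RESOURCE_STATE_COMPUTE_READ) evaluates to true.)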
#define FFX_CONTAINS_FLAG(options, key) ((options & key) == key)

View File

@@ -0,0 +1,136 @@
diff --git a/thirdparty/amd-fsr2/ffx_fsr2.cpp b/thirdparty/amd-fsr2/ffx_fsr2.cpp
index 051018e437..3970aa7f5b 100644
--- a/thirdparty/amd-fsr2/ffx_fsr2.cpp
+++ b/thirdparty/amd-fsr2/ffx_fsr2.cpp
@@ -36,6 +36,15 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#endif
+#ifndef _countof
+#define _countof(array) (sizeof(array) / sizeof(array[0]))
+#endif
+
+#ifndef _MSC_VER
+#include <wchar.h>
+#define wcscpy_s wcscpy
+#endif
+
// max queued frames for descriptor management
static const uint32_t FSR2_MAX_QUEUED_FRAMES = 16;
diff --git a/thirdparty/amd-fsr2/ffx_types.h b/thirdparty/amd-fsr2/ffx_types.h
index 74edd192c4..f71b259cce 100644
--- a/thirdparty/amd-fsr2/ffx_types.h
+++ b/thirdparty/amd-fsr2/ffx_types.h
@@ -22,6 +22,7 @@
#pragma once
#include <stdint.h>
+#include <stdlib.h>
#if defined (FFX_GCC)
/// FidelityFX exported functions
diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_accumulate_pass.glsl b/thirdparty/amd-fsr2/shaders/ffx_fsr2_accumulate_pass.glsl
index ebbe610ffa..31d68292d4 100644
--- a/thirdparty/amd-fsr2/shaders/ffx_fsr2_accumulate_pass.glsl
+++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_accumulate_pass.glsl
@@ -19,7 +19,7 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
-#version 450
+//#version 450
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_samplerless_texture_functions : require
diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_autogen_reactive_pass.glsl b/thirdparty/amd-fsr2/shaders/ffx_fsr2_autogen_reactive_pass.glsl
index 7ae41cf0c1..3b86c17d4d 100644
--- a/thirdparty/amd-fsr2/shaders/ffx_fsr2_autogen_reactive_pass.glsl
+++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_autogen_reactive_pass.glsl
@@ -19,7 +19,7 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
-#version 450
+//#version 450
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_samplerless_texture_functions : require
diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_compute_luminance_pyramid_pass.glsl b/thirdparty/amd-fsr2/shaders/ffx_fsr2_compute_luminance_pyramid_pass.glsl
index 15186e3bb6..8439c4e9d4 100644
--- a/thirdparty/amd-fsr2/shaders/ffx_fsr2_compute_luminance_pyramid_pass.glsl
+++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_compute_luminance_pyramid_pass.glsl
@@ -19,7 +19,7 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
-#version 450
+//#version 450
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_samplerless_texture_functions : require
diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_depth_clip_pass.glsl b/thirdparty/amd-fsr2/shaders/ffx_fsr2_depth_clip_pass.glsl
index fcb2b76528..45ec5bdb86 100644
--- a/thirdparty/amd-fsr2/shaders/ffx_fsr2_depth_clip_pass.glsl
+++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_depth_clip_pass.glsl
@@ -19,7 +19,7 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
-#version 450
+//#version 450
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_samplerless_texture_functions : require
diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_lock_pass.glsl b/thirdparty/amd-fsr2/shaders/ffx_fsr2_lock_pass.glsl
index f7cad59c20..7c3a4c2740 100644
--- a/thirdparty/amd-fsr2/shaders/ffx_fsr2_lock_pass.glsl
+++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_lock_pass.glsl
@@ -19,7 +19,7 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
-#version 450
+//#version 450
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_samplerless_texture_functions : require
diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_rcas_pass.glsl b/thirdparty/amd-fsr2/shaders/ffx_fsr2_rcas_pass.glsl
index f0823c2bc8..8b4ebc6afc 100644
--- a/thirdparty/amd-fsr2/shaders/ffx_fsr2_rcas_pass.glsl
+++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_rcas_pass.glsl
@@ -19,7 +19,7 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
-#version 450
+//#version 450
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_samplerless_texture_functions : require
diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_reconstruct_previous_depth_pass.glsl b/thirdparty/amd-fsr2/shaders/ffx_fsr2_reconstruct_previous_depth_pass.glsl
index 20e17eef8c..be4395aaed 100644
--- a/thirdparty/amd-fsr2/shaders/ffx_fsr2_reconstruct_previous_depth_pass.glsl
+++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_reconstruct_previous_depth_pass.glsl
@@ -19,7 +19,7 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
-#version 450
+//#version 450
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_samplerless_texture_functions : require
diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_tcr_autogen_pass.glsl b/thirdparty/amd-fsr2/shaders/ffx_fsr2_tcr_autogen_pass.glsl
index bebca91099..7d6a66b8ac 100644
--- a/thirdparty/amd-fsr2/shaders/ffx_fsr2_tcr_autogen_pass.glsl
+++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_tcr_autogen_pass.glsl
@@ -19,7 +19,7 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
-#version 450
+//#version 450
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_samplerless_texture_functions : require

View File

@@ -0,0 +1,121 @@
diff --git a/thirdparty/amd-fsr2/ffx_fsr2.cpp b/thirdparty/amd-fsr2/ffx_fsr2.cpp
index 3970aa7f5b..ec571b9cd2 100644
--- a/thirdparty/amd-fsr2/ffx_fsr2.cpp
+++ b/thirdparty/amd-fsr2/ffx_fsr2.cpp
@@ -952,6 +952,8 @@ static FfxErrorCode fsr2Dispatch(FfxFsr2Context_Private* context, const FfxFsr2D
context->constants.lumaMipDimensions[0] = uint32_t(context->constants.maxRenderSize[0] / mipDiv);
context->constants.lumaMipDimensions[1] = uint32_t(context->constants.maxRenderSize[1] / mipDiv);
+ memcpy(context->constants.reprojectionMatrix, params->reprojectionMatrix, sizeof(context->constants.reprojectionMatrix));
+
// reactive mask bias
const int32_t threadGroupWorkRegionDim = 8;
const int32_t dispatchSrcX = (context->constants.renderSize[0] + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim;
diff --git a/thirdparty/amd-fsr2/ffx_fsr2.h b/thirdparty/amd-fsr2/ffx_fsr2.h
index 2a1c74abb1..dfcd4caf35 100644
--- a/thirdparty/amd-fsr2/ffx_fsr2.h
+++ b/thirdparty/amd-fsr2/ffx_fsr2.h
@@ -146,6 +146,7 @@ typedef struct FfxFsr2DispatchDescription {
float autoReactiveScale; ///< A value to scale the reactive mask
float autoReactiveMax; ///< A value to clamp the reactive mask
+ float reprojectionMatrix[16]; ///< The matrix used for reprojecting pixels with invalid motion vectors by using the depth.
} FfxFsr2DispatchDescription;
/// A structure encapsulating the parameters for automatic generation of a reactive mask
diff --git a/thirdparty/amd-fsr2/ffx_fsr2_private.h b/thirdparty/amd-fsr2/ffx_fsr2_private.h
index 6b5fbc5117..8a9aec5778 100644
--- a/thirdparty/amd-fsr2/ffx_fsr2_private.h
+++ b/thirdparty/amd-fsr2/ffx_fsr2_private.h
@@ -44,6 +44,9 @@ typedef struct Fsr2Constants {
float deltaTime;
float dynamicResChangeFactor;
float viewSpaceToMetersFactor;
+
+ float pad;
+ float reprojectionMatrix[16];
} Fsr2Constants;
struct FfxFsr2ContextDescription;
diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_accumulate_pass.glsl b/thirdparty/amd-fsr2/shaders/ffx_fsr2_accumulate_pass.glsl
index 31d68292d4..2e98c8a6c5 100644
--- a/thirdparty/amd-fsr2/shaders/ffx_fsr2_accumulate_pass.glsl
+++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_accumulate_pass.glsl
@@ -35,7 +35,7 @@
#endif
#define FSR2_BIND_SRV_INTERNAL_UPSCALED 3
#define FSR2_BIND_SRV_LOCK_STATUS 4
-#define FSR2_BIND_SRV_INPUT_DEPTH_CLIP 5
+//#define FSR2_BIND_SRV_INPUT_DEPTH_CLIP 5
#define FSR2_BIND_SRV_PREPARED_INPUT_COLOR 6
#define FSR2_BIND_SRV_LUMA_INSTABILITY 7
#define FSR2_BIND_SRV_LANCZOS_LUT 8
@@ -52,6 +52,10 @@
#define FSR2_BIND_CB_FSR2 18
+#if FFX_FSR2_OPTION_GODOT_DERIVE_INVALID_MOTION_VECTORS
+#define FSR2_BIND_SRV_INPUT_DEPTH 5
+#endif
+
#include "ffx_fsr2_callbacks_glsl.h"
#include "ffx_fsr2_common.h"
#include "ffx_fsr2_sample.h"
diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_callbacks_glsl.h b/thirdparty/amd-fsr2/shaders/ffx_fsr2_callbacks_glsl.h
index 10da13fb81..b610037cc6 100644
--- a/thirdparty/amd-fsr2/shaders/ffx_fsr2_callbacks_glsl.h
+++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_callbacks_glsl.h
@@ -52,6 +52,9 @@
FfxFloat32 fDeltaTime;
FfxFloat32 fDynamicResChangeFactor;
FfxFloat32 fViewSpaceToMetersFactor;
+
+ FfxFloat32 fPad;
+ mat4 mReprojectionMatrix;
} cbFSR2;
#endif
@@ -317,7 +320,11 @@ FfxFloat32 LoadInputDepth(FfxInt32x2 iPxPos)
#if defined(FSR2_BIND_SRV_REACTIVE_MASK)
FfxFloat32 LoadReactiveMask(FfxInt32x2 iPxPos)
{
+#if FFX_FSR2_OPTION_GODOT_REACTIVE_MASK_CLAMP
+ return min(texelFetch(r_reactive_mask, FfxInt32x2(iPxPos), 0).r, 0.9f);
+#else
return texelFetch(r_reactive_mask, FfxInt32x2(iPxPos), 0).r;
+#endif
}
#endif
@@ -354,6 +361,16 @@ FfxFloat32x2 LoadInputMotionVector(FfxInt32x2 iPxDilatedMotionVectorPos)
{
FfxFloat32x2 fSrcMotionVector = texelFetch(r_input_motion_vectors, iPxDilatedMotionVectorPos, 0).xy;
+#if FFX_FSR2_OPTION_GODOT_DERIVE_INVALID_MOTION_VECTORS
+ bool bInvalidMotionVector = all(lessThanEqual(fSrcMotionVector, vec2(-1.0f, -1.0f)));
+ if (bInvalidMotionVector)
+ {
+ FfxFloat32 fSrcDepth = LoadInputDepth(iPxDilatedMotionVectorPos);
+ FfxFloat32x2 fUv = (iPxDilatedMotionVectorPos + FfxFloat32(0.5)) / RenderSize();
+ fSrcMotionVector = FFX_FSR2_OPTION_GODOT_DERIVE_INVALID_MOTION_VECTORS_FUNCTION(fUv, fSrcDepth, cbFSR2.mReprojectionMatrix);
+ }
+#endif
+
FfxFloat32x2 fUvMotionVector = fSrcMotionVector * MotionVectorScale();
#if FFX_FSR2_OPTION_JITTERED_MOTION_VECTORS
diff --git a/thirdparty/amd-fsr2/shaders/ffx_fsr2_tcr_autogen_pass.glsl b/thirdparty/amd-fsr2/shaders/ffx_fsr2_tcr_autogen_pass.glsl
index 7d6a66b8ac..5c042c332a 100644
--- a/thirdparty/amd-fsr2/shaders/ffx_fsr2_tcr_autogen_pass.glsl
+++ b/thirdparty/amd-fsr2/shaders/ffx_fsr2_tcr_autogen_pass.glsl
@@ -40,6 +40,10 @@
#define FSR2_BIND_CB_FSR2 11
#define FSR2_BIND_CB_REACTIVE 12
+#if FFX_FSR2_OPTION_GODOT_DERIVE_INVALID_MOTION_VECTORS
+#define FSR2_BIND_SRV_INPUT_DEPTH 13
+#endif
+
#include "ffx_fsr2_callbacks_glsl.h"
#include "ffx_fsr2_common.h"

429
thirdparty/amd-fsr2/shaders/ffx_common_types.h vendored Normal file
View File

@@ -0,0 +1,429 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef FFX_COMMON_TYPES_H
#define FFX_COMMON_TYPES_H
#if defined(FFX_CPU)
#define FFX_PARAMETER_IN
#define FFX_PARAMETER_OUT
#define FFX_PARAMETER_INOUT
#elif defined(FFX_HLSL)
#define FFX_PARAMETER_IN in
#define FFX_PARAMETER_OUT out
#define FFX_PARAMETER_INOUT inout
#elif defined(FFX_GLSL)
#define FFX_PARAMETER_IN in
#define FFX_PARAMETER_OUT out
#define FFX_PARAMETER_INOUT inout
#endif // #if defined(FFX_CPU)
#if defined(FFX_CPU)
/// A typedef for a boolean value.
///
/// @ingroup CPU
typedef bool FfxBoolean;
/// A typedef for an unsigned 8bit integer.
///
/// @ingroup CPU
typedef uint8_t FfxUInt8;
/// A typedef for an unsigned 16bit integer.
///
/// @ingroup CPU
typedef uint16_t FfxUInt16;
/// A typedef for an unsigned 32bit integer.
///
/// @ingroup CPU
typedef uint32_t FfxUInt32;
/// A typedef for an unsigned 64bit integer.
///
/// @ingroup CPU
typedef uint64_t FfxUInt64;
/// A typedef for a signed 8bit integer.
///
/// @ingroup CPU
typedef int8_t FfxInt8;
/// A typedef for a signed 16bit integer.
///
/// @ingroup CPU
typedef int16_t FfxInt16;
/// A typedef for a signed 32bit integer.
///
/// @ingroup CPU
typedef int32_t FfxInt32;
/// A typedef for a signed 64bit integer.
///
/// @ingroup CPU
typedef int64_t FfxInt64;
/// A typedef for a floating point value.
///
/// @ingroup CPU
typedef float FfxFloat32;
/// A typedef for a 2-dimensional floating point value.
///
/// @ingroup CPU
typedef float FfxFloat32x2[2];
/// A typedef for a 3-dimensional floating point value.
///
/// @ingroup CPU
typedef float FfxFloat32x3[3];
/// A typedef for a 4-dimensional floating point value.
///
/// @ingroup CPU
typedef float FfxFloat32x4[4];
/// A typedef for a 2-dimensional 32bit unsigned integer.
///
/// @ingroup CPU
typedef uint32_t FfxUInt32x2[2];
/// A typedef for a 3-dimensional 32bit unsigned integer.
///
/// @ingroup CPU
typedef uint32_t FfxUInt32x3[3];
/// A typedef for a 4-dimensional 32bit unsigned integer.
///
/// @ingroup CPU
typedef uint32_t FfxUInt32x4[4];
#endif // #if defined(FFX_CPU)
#if defined(FFX_HLSL)
/// A typedef for a boolean value.
///
/// @ingroup GPU
typedef bool FfxBoolean;
#if FFX_HLSL_6_2
typedef float32_t FfxFloat32;
typedef float32_t2 FfxFloat32x2;
typedef float32_t3 FfxFloat32x3;
typedef float32_t4 FfxFloat32x4;
/// A typedef for an unsigned 32bit integer.
///
/// @ingroup GPU
typedef uint32_t FfxUInt32;
typedef uint32_t2 FfxUInt32x2;
typedef uint32_t3 FfxUInt32x3;
typedef uint32_t4 FfxUInt32x4;
typedef int32_t FfxInt32;
typedef int32_t2 FfxInt32x2;
typedef int32_t3 FfxInt32x3;
typedef int32_t4 FfxInt32x4;
#else
#define FfxFloat32 float
#define FfxFloat32x2 float2
#define FfxFloat32x3 float3
#define FfxFloat32x4 float4
/// A typedef for an unsigned 32bit integer.
///
/// @ingroup GPU
typedef uint FfxUInt32;
typedef uint2 FfxUInt32x2;
typedef uint3 FfxUInt32x3;
typedef uint4 FfxUInt32x4;
typedef int FfxInt32;
typedef int2 FfxInt32x2;
typedef int3 FfxInt32x3;
typedef int4 FfxInt32x4;
#endif // #if defined(FFX_HLSL_6_2)
#if FFX_HALF
#if FFX_HLSL_6_2
typedef float16_t FfxFloat16;
typedef float16_t2 FfxFloat16x2;
typedef float16_t3 FfxFloat16x3;
typedef float16_t4 FfxFloat16x4;
/// A typedef for an unsigned 16bit integer.
///
/// @ingroup GPU
typedef uint16_t FfxUInt16;
typedef uint16_t2 FfxUInt16x2;
typedef uint16_t3 FfxUInt16x3;
typedef uint16_t4 FfxUInt16x4;
/// A typedef for a signed 16bit integer.
///
/// @ingroup GPU
typedef int16_t FfxInt16;
typedef int16_t2 FfxInt16x2;
typedef int16_t3 FfxInt16x3;
typedef int16_t4 FfxInt16x4;
#else
typedef min16float FfxFloat16;
typedef min16float2 FfxFloat16x2;
typedef min16float3 FfxFloat16x3;
typedef min16float4 FfxFloat16x4;
/// A typedef for an unsigned 16bit integer.
///
/// @ingroup GPU
typedef min16uint FfxUInt16;
typedef min16uint2 FfxUInt16x2;
typedef min16uint3 FfxUInt16x3;
typedef min16uint4 FfxUInt16x4;
/// A typedef for a signed 16bit integer.
///
/// @ingroup GPU
typedef min16int FfxInt16;
typedef min16int2 FfxInt16x2;
typedef min16int3 FfxInt16x3;
typedef min16int4 FfxInt16x4;
#endif // FFX_HLSL_6_2
#endif // FFX_HALF
#endif // #if defined(FFX_HLSL)
#if defined(FFX_GLSL)
/// A typedef for a boolean value.
///
/// @ingroup GPU
#define FfxBoolean bool
#define FfxFloat32 float
#define FfxFloat32x2 vec2
#define FfxFloat32x3 vec3
#define FfxFloat32x4 vec4
#define FfxUInt32 uint
#define FfxUInt32x2 uvec2
#define FfxUInt32x3 uvec3
#define FfxUInt32x4 uvec4
#define FfxInt32 int
#define FfxInt32x2 ivec2
#define FfxInt32x3 ivec3
#define FfxInt32x4 ivec4
#if FFX_HALF
#define FfxFloat16 float16_t
#define FfxFloat16x2 f16vec2
#define FfxFloat16x3 f16vec3
#define FfxFloat16x4 f16vec4
#define FfxUInt16 uint16_t
#define FfxUInt16x2 u16vec2
#define FfxUInt16x3 u16vec3
#define FfxUInt16x4 u16vec4
#define FfxInt16 int16_t
#define FfxInt16x2 i16vec2
#define FfxInt16x3 i16vec3
#define FfxInt16x4 i16vec4
#endif // FFX_HALF
#endif // #if defined(FFX_GLSL)
// Global toggles:
// #define FFX_HALF (1)
// #define FFX_HLSL_6_2 (1)
#if FFX_HALF
#if FFX_HLSL_6_2
#define FFX_MIN16_SCALAR( TypeName, BaseComponentType ) typedef BaseComponentType##16_t TypeName;
#define FFX_MIN16_VECTOR( TypeName, BaseComponentType, COL ) typedef vector<BaseComponentType##16_t, COL> TypeName;
#define FFX_MIN16_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix<BaseComponentType##16_t, ROW, COL> TypeName;
#define FFX_16BIT_SCALAR( TypeName, BaseComponentType ) typedef BaseComponentType##16_t TypeName;
#define FFX_16BIT_VECTOR( TypeName, BaseComponentType, COL ) typedef vector<BaseComponentType##16_t, COL> TypeName;
#define FFX_16BIT_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix<BaseComponentType##16_t, ROW, COL> TypeName;
#else //FFX_HLSL_6_2
#define FFX_MIN16_SCALAR( TypeName, BaseComponentType ) typedef min16##BaseComponentType TypeName;
#define FFX_MIN16_VECTOR( TypeName, BaseComponentType, COL ) typedef vector<min16##BaseComponentType, COL> TypeName;
#define FFX_MIN16_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix<min16##BaseComponentType, ROW, COL> TypeName;
#define FFX_16BIT_SCALAR( TypeName, BaseComponentType ) FFX_MIN16_SCALAR( TypeName, BaseComponentType );
#define FFX_16BIT_VECTOR( TypeName, BaseComponentType, COL ) FFX_MIN16_VECTOR( TypeName, BaseComponentType, COL );
#define FFX_16BIT_MATRIX( TypeName, BaseComponentType, ROW, COL ) FFX_MIN16_MATRIX( TypeName, BaseComponentType, ROW, COL );
#endif //FFX_HLSL_6_2
#else //FFX_HALF
#define FFX_MIN16_SCALAR( TypeName, BaseComponentType ) typedef BaseComponentType TypeName;
#define FFX_MIN16_VECTOR( TypeName, BaseComponentType, COL ) typedef vector<BaseComponentType, COL> TypeName;
#define FFX_MIN16_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix<BaseComponentType, ROW, COL> TypeName;
#define FFX_16BIT_SCALAR( TypeName, BaseComponentType ) typedef BaseComponentType TypeName;
#define FFX_16BIT_VECTOR( TypeName, BaseComponentType, COL ) typedef vector<BaseComponentType, COL> TypeName;
#define FFX_16BIT_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix<BaseComponentType, ROW, COL> TypeName;
#endif //FFX_HALF
#if defined(FFX_GPU)
// Common typedefs:
#if defined(FFX_HLSL)
FFX_MIN16_SCALAR( FFX_MIN16_F , float );
FFX_MIN16_VECTOR( FFX_MIN16_F2, float, 2 );
FFX_MIN16_VECTOR( FFX_MIN16_F3, float, 3 );
FFX_MIN16_VECTOR( FFX_MIN16_F4, float, 4 );
FFX_MIN16_SCALAR( FFX_MIN16_I, int );
FFX_MIN16_VECTOR( FFX_MIN16_I2, int, 2 );
FFX_MIN16_VECTOR( FFX_MIN16_I3, int, 3 );
FFX_MIN16_VECTOR( FFX_MIN16_I4, int, 4 );
FFX_MIN16_SCALAR( FFX_MIN16_U, uint );
FFX_MIN16_VECTOR( FFX_MIN16_U2, uint, 2 );
FFX_MIN16_VECTOR( FFX_MIN16_U3, uint, 3 );
FFX_MIN16_VECTOR( FFX_MIN16_U4, uint, 4 );
FFX_16BIT_SCALAR( FFX_F16_t , float );
FFX_16BIT_VECTOR( FFX_F16_t2, float, 2 );
FFX_16BIT_VECTOR( FFX_F16_t3, float, 3 );
FFX_16BIT_VECTOR( FFX_F16_t4, float, 4 );
FFX_16BIT_SCALAR( FFX_I16_t, int );
FFX_16BIT_VECTOR( FFX_I16_t2, int, 2 );
FFX_16BIT_VECTOR( FFX_I16_t3, int, 3 );
FFX_16BIT_VECTOR( FFX_I16_t4, int, 4 );
FFX_16BIT_SCALAR( FFX_U16_t, uint );
FFX_16BIT_VECTOR( FFX_U16_t2, uint, 2 );
FFX_16BIT_VECTOR( FFX_U16_t3, uint, 3 );
FFX_16BIT_VECTOR( FFX_U16_t4, uint, 4 );
#define TYPEDEF_MIN16_TYPES(Prefix) \
typedef FFX_MIN16_F Prefix##_F; \
typedef FFX_MIN16_F2 Prefix##_F2; \
typedef FFX_MIN16_F3 Prefix##_F3; \
typedef FFX_MIN16_F4 Prefix##_F4; \
typedef FFX_MIN16_I Prefix##_I; \
typedef FFX_MIN16_I2 Prefix##_I2; \
typedef FFX_MIN16_I3 Prefix##_I3; \
typedef FFX_MIN16_I4 Prefix##_I4; \
typedef FFX_MIN16_U Prefix##_U; \
typedef FFX_MIN16_U2 Prefix##_U2; \
typedef FFX_MIN16_U3 Prefix##_U3; \
typedef FFX_MIN16_U4 Prefix##_U4;
#define TYPEDEF_16BIT_TYPES(Prefix) \
typedef FFX_16BIT_F Prefix##_F; \
typedef FFX_16BIT_F2 Prefix##_F2; \
typedef FFX_16BIT_F3 Prefix##_F3; \
typedef FFX_16BIT_F4 Prefix##_F4; \
typedef FFX_16BIT_I Prefix##_I; \
typedef FFX_16BIT_I2 Prefix##_I2; \
typedef FFX_16BIT_I3 Prefix##_I3; \
typedef FFX_16BIT_I4 Prefix##_I4; \
typedef FFX_16BIT_U Prefix##_U; \
typedef FFX_16BIT_U2 Prefix##_U2; \
typedef FFX_16BIT_U3 Prefix##_U3; \
typedef FFX_16BIT_U4 Prefix##_U4;
#define TYPEDEF_FULL_PRECISION_TYPES(Prefix) \
typedef FfxFloat32 Prefix##_F; \
typedef FfxFloat32x2 Prefix##_F2; \
typedef FfxFloat32x3 Prefix##_F3; \
typedef FfxFloat32x4 Prefix##_F4; \
typedef FfxInt32 Prefix##_I; \
typedef FfxInt32x2 Prefix##_I2; \
typedef FfxInt32x3 Prefix##_I3; \
typedef FfxInt32x4 Prefix##_I4; \
typedef FfxUInt32 Prefix##_U; \
typedef FfxUInt32x2 Prefix##_U2; \
typedef FfxUInt32x3 Prefix##_U3; \
typedef FfxUInt32x4 Prefix##_U4;
#endif // #if defined(FFX_HLSL)
#if defined(FFX_GLSL)
#if FFX_HALF
#define FFX_MIN16_F float16_t
#define FFX_MIN16_F2 f16vec2
#define FFX_MIN16_F3 f16vec3
#define FFX_MIN16_F4 f16vec4
#define FFX_MIN16_I int16_t
#define FFX_MIN16_I2 i16vec2
#define FFX_MIN16_I3 i16vec3
#define FFX_MIN16_I4 i16vec4
#define FFX_MIN16_U uint16_t
#define FFX_MIN16_U2 u16vec2
#define FFX_MIN16_U3 u16vec3
#define FFX_MIN16_U4 u16vec4
#define FFX_16BIT_F float16_t
#define FFX_16BIT_F2 f16vec2
#define FFX_16BIT_F3 f16vec3
#define FFX_16BIT_F4 f16vec4
#define FFX_16BIT_I int16_t
#define FFX_16BIT_I2 i16vec2
#define FFX_16BIT_I3 i16vec3
#define FFX_16BIT_I4 i16vec4
#define FFX_16BIT_U uint16_t
#define FFX_16BIT_U2 u16vec2
#define FFX_16BIT_U3 u16vec3
#define FFX_16BIT_U4 u16vec4
#else // FFX_HALF
#define FFX_MIN16_F float
#define FFX_MIN16_F2 vec2
#define FFX_MIN16_F3 vec3
#define FFX_MIN16_F4 vec4
#define FFX_MIN16_I int
#define FFX_MIN16_I2 ivec2
#define FFX_MIN16_I3 ivec3
#define FFX_MIN16_I4 ivec4
#define FFX_MIN16_U uint
#define FFX_MIN16_U2 uvec2
#define FFX_MIN16_U3 uvec3
#define FFX_MIN16_U4 uvec4
#define FFX_16BIT_F float
#define FFX_16BIT_F2 vec2
#define FFX_16BIT_F3 vec3
#define FFX_16BIT_F4 vec4
#define FFX_16BIT_I int
#define FFX_16BIT_I2 ivec2
#define FFX_16BIT_I3 ivec3
#define FFX_16BIT_I4 ivec4
#define FFX_16BIT_U uint
#define FFX_16BIT_U2 uvec2
#define FFX_16BIT_U3 uvec3
#define FFX_16BIT_U4 uvec4
#endif // FFX_HALF
#endif // #if defined(FFX_GLSL)
#endif // #if defined(FFX_GPU)
#endif // #ifndef FFX_COMMON_TYPES_H

52
thirdparty/amd-fsr2/shaders/ffx_core.h vendored Normal file
View File

@@ -0,0 +1,52 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
/// @defgroup Core
/// @defgroup HLSL
/// @defgroup GLSL
/// @defgroup GPU
/// @defgroup CPU
/// @defgroup CAS
/// @defgroup FSR1
#if !defined(FFX_CORE_H)
#define FFX_CORE_H
#include "ffx_common_types.h"
#if defined(FFX_CPU)
#include "ffx_core_cpu.h"
#endif // #if defined(FFX_CPU)
#if defined(FFX_GLSL) && defined(FFX_GPU)
#include "ffx_core_glsl.h"
#endif // #if defined(FFX_GLSL) && defined(FFX_GPU)
#if defined(FFX_HLSL) && defined(FFX_GPU)
#include "ffx_core_hlsl.h"
#endif // #if defined(FFX_HLSL) && defined(FFX_GPU)
#if defined(FFX_GPU)
#include "ffx_core_gpu_common.h"
#include "ffx_core_gpu_common_half.h"
#include "ffx_core_portability.h"
#endif // #if defined(FFX_GPU)
#endif // #if !defined(FFX_CORE_H)

332
thirdparty/amd-fsr2/shaders/ffx_core_cpu.h vendored Normal file
View File

@@ -0,0 +1,332 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
/// A define for a true value in a boolean expression.
///
/// @ingroup CPU
#define FFX_TRUE (1)
/// A define for a false value in a boolean expression.
///
/// @ingroup CPU
#define FFX_FALSE (0)
#if !defined(FFX_STATIC)
/// A define to abstract declaration of static variables and functions.
///
/// @ingroup CPU
#define FFX_STATIC static
#endif // #if !defined(FFX_STATIC)
#ifdef __clang__
#pragma clang diagnostic ignored "-Wunused-variable"
#endif
/// Interpret the bit layout of an IEEE-754 floating point value as an unsigned integer.
///
/// @param [in] x A 32bit floating value.
///
/// @returns
/// An unsigned 32bit integer value containing the bit pattern of <c><i>x</i></c>.
///
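/// For example, ffxAsUInt32(1.0f) returns 0x3f800000.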
/// @ingroup CPU
FFX_STATIC FfxUInt32 ffxAsUInt32(FfxFloat32 x)
{
union
{
FfxFloat32 f;
FfxUInt32 u;
} bits;
bits.f = x;
return bits.u;
}
/// Compute the dot product of two 2-dimensional vectors.
FFX_STATIC FfxFloat32 ffxDot2(FfxFloat32x2 a, FfxFloat32x2 b)
{
return a[0] * b[0] + a[1] * b[1];
}
/// Compute the dot product of two 3-dimensional vectors.
FFX_STATIC FfxFloat32 ffxDot3(FfxFloat32x3 a, FfxFloat32x3 b)
{
return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}
/// Compute the dot product of two 4-dimensional vectors.
FFX_STATIC FfxFloat32 ffxDot4(FfxFloat32x4 a, FfxFloat32x4 b)
{
return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
}
/// Compute the linear interpolation between two values.
///
/// Matches the behavior of the GLSL <c><i>mix</i></c> intrinsic function. Implements the
/// following math:
///
/// (1 - t) * x + t * y
///
/// @param [in] x The first value to lerp between.
/// @param [in] y The second value to lerp between.
/// @param [in] t The interpolation amount, determining how much of <c><i>x</i></c> and how much of <c><i>y</i></c> contribute to the result.
///
/// @returns
/// A linearly interpolated value between <c><i>x</i></c> and <c><i>y</i></c> according to <c><i>t</i></c>.
///
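/// For example, ffxLerp(0.0f, 10.0f, 0.25f) returns 2.5f.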
/// @ingroup CPU
FFX_STATIC FfxFloat32 ffxLerp(FfxFloat32 x, FfxFloat32 y, FfxFloat32 t)
{
return y * t + (-x * t + x);
}
/// Compute the reciprocal of a value.
///
/// @param [in] a The value to compute the reciprocal for.
///
/// @returns
/// The reciprocal value of <c><i>a</i></c>.
///
/// @ingroup CPU
FFX_STATIC FfxFloat32 ffxReciprocal(FfxFloat32 a)
{
return 1.0f / a;
}
/// Compute the square root of a value.
///
/// @param [in] x The value to compute the square root of.
///
/// @returns
/// The square root of <c><i>x</i></c>.
///
/// @ingroup CPU
FFX_STATIC FfxFloat32 ffxSqrt(FfxFloat32 x)
{
return sqrt(x);
}
/// Arithmetic shift right: shifts <c><i>a</i></c> right by <c><i>b</i></c> bits with sign extension.
FFX_STATIC FfxUInt32 AShrSU1(FfxUInt32 a, FfxUInt32 b)
{
return FfxUInt32(FfxInt32(a) >> FfxInt32(b));
}
/// Compute the fractional part of a decimal value.
///
/// This function calculates <c><i>a - floor(a)</i></c>.
///
/// @param [in] a The value to compute the fractional part from.
///
/// @returns
/// The fractional part of <c><i>a</i></c>.
///
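/// For example, ffxFract(1.25f) returns 0.25f.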
/// @ingroup CPU
FFX_STATIC FfxFloat32 ffxFract(FfxFloat32 a)
{
return a - floor(a);
}
/// Compute the reciprocal square root of a value.
///
/// @param [in] a The value to compute the reciprocal square root of.
///
/// @returns
/// The reciprocal square root value of <c><i>a</i></c>.
///
/// @ingroup CPU
FFX_STATIC FfxFloat32 rsqrt(FfxFloat32 a)
{
return ffxReciprocal(ffxSqrt(a));
}
FFX_STATIC FfxFloat32 ffxMin(FfxFloat32 x, FfxFloat32 y)
{
return x < y ? x : y;
}
FFX_STATIC FfxUInt32 ffxMin(FfxUInt32 x, FfxUInt32 y)
{
return x < y ? x : y;
}
FFX_STATIC FfxFloat32 ffxMax(FfxFloat32 x, FfxFloat32 y)
{
return x > y ? x : y;
}
FFX_STATIC FfxUInt32 ffxMax(FfxUInt32 x, FfxUInt32 y)
{
return x > y ? x : y;
}
/// Clamp a value to a [0..1] range.
///
/// @param [in] a The value to clamp to [0..1] range.
///
/// @returns
/// The clamped version of <c><i>a</i></c>.
///
/// @ingroup CPU
FFX_STATIC FfxFloat32 ffxSaturate(FfxFloat32 a)
{
return ffxMin(1.0f, ffxMax(0.0f, a));
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
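// Component-wise helpers operating on FfxFloat32x3 arrays; the first argument 'd' receives the result.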
FFX_STATIC void opAAddOneF3(FfxFloat32x3 d, FfxFloat32x3 a, FfxFloat32 b)
{
d[0] = a[0] + b;
d[1] = a[1] + b;
d[2] = a[2] + b;
return;
}
FFX_STATIC void opACpyF3(FfxFloat32x3 d, FfxFloat32x3 a)
{
d[0] = a[0];
d[1] = a[1];
d[2] = a[2];
return;
}
FFX_STATIC void opAMulF3(FfxFloat32x3 d, FfxFloat32x3 a, FfxFloat32x3 b)
{
d[0] = a[0] * b[0];
d[1] = a[1] * b[1];
d[2] = a[2] * b[2];
return;
}
FFX_STATIC void opAMulOneF3(FfxFloat32x3 d, FfxFloat32x3 a, FfxFloat32 b)
{
d[0] = a[0] * b;
d[1] = a[1] * b;
d[2] = a[2] * b;
return;
}
FFX_STATIC void opARcpF3(FfxFloat32x3 d, FfxFloat32x3 a)
{
d[0] = ffxReciprocal(a[0]);
d[1] = ffxReciprocal(a[1]);
d[2] = ffxReciprocal(a[2]);
return;
}
/// Convert FfxFloat32 to half (in lower 16-bits of output).
///
/// This function implements the same fast technique that is documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
///
/// The function supports denormals.
///
/// Some conversion rules are chosen to make computations possibly "safer" on the GPU:
/// -INF & -NaN -> -65504
/// +INF & +NaN -> +65504
///
/// @param [in] f The 32bit floating point value to convert.
///
/// @returns
/// The closest 16bit floating point value to <c><i>f</i></c>.
///
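/// For example, f32tof16(1.0f) returns 0x3c00, the IEEE-754 binary16 encoding of 1.0.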
/// @ingroup CPU
FFX_STATIC FfxUInt32 f32tof16(FfxFloat32 f)
{
static FfxUInt16 base[512] = {
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400,
0x0800, 0x0c00, 0x1000, 0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, 0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000,
0x5400, 0x5800, 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,
0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,
0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,
0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,
0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,
0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff,
0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x7bff, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002,
0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, 0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400, 0xa800, 0xac00,
0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00, 0xf000, 0xf400, 0xf800,
0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,
0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,
0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,
0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,
0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff,
0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff, 0xfbff
};
static FfxUInt8 shift[512] = {
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18
};
union
{
FfxFloat32 f;
FfxUInt32 u;
} bits;
bits.f = f;
FfxUInt32 u = bits.u;
FfxUInt32 i = u >> 23;
return (FfxUInt32)(base[i]) + ((u & 0x7fffff) >> shift[i]);
}
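// Illustrative sketch (not part of the SDK): spot-check f32tof16() against
// well-known IEEE 754 binary16 encodings, assuming the Ffx* typedefs above are
// in scope. 1.0f encodes as 0x3c00, -2.0f as 0xc000, and out-of-range
// magnitudes clamp to 65504 (0x7bff) per the conversion rules documented above.
#include <assert.h>
int main(void)
{
    assert(f32tof16(1.0f) == 0x3c00u);
    assert(f32tof16(-2.0f) == 0xc000u);
    assert(f32tof16(0.0f) == 0x0000u);
    assert(f32tof16(1e9f) == 0x7bffu); // clamped to +65504
    return 0;
}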
/// Pack 2x 32-bit floating point values into a single 32-bit value.
///
/// This function first converts each component of <c><i>a</i></c> into its nearest 16-bit floating
/// point representation, and then stores the X and Y components in the lower and upper 16 bits of the
/// 32-bit unsigned integer respectively.
///
/// @param [in] a A 2-dimensional floating point value to convert and pack.
///
/// @returns
/// A packed 32bit value containing 2 16bit floating point values.
///
/// @ingroup CPU
FFX_STATIC FfxUInt32 packHalf2x16(FfxFloat32x2 a)
{
return f32tof16(a[0]) + (f32tof16(a[1]) << 16);
}
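// Illustrative sketch (not part of the SDK), assuming FfxFloat32x2 is the
// CPU-side two-element float array typedef: X lands in the low 16 bits and
// Y in the high 16 bits of the packed result.
FFX_STATIC FfxUInt32 examplePackHalf2x16(void)
{
    FfxFloat32x2 v = { 1.0f, -2.0f };
    return packHalf2x16(v); // 0x3c00 | (0xc000 << 16) == 0xc0003c00u
}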

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff


@@ -0,0 +1,50 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
FfxFloat32x3 opAAddOneF3(FfxFloat32x3 d, FfxFloat32x3 a, FfxFloat32 b)
{
d = a + ffxBroadcast3(b);
return d;
}
FfxFloat32x3 opACpyF3(FfxFloat32x3 d, FfxFloat32x3 a)
{
d = a;
return d;
}
FfxFloat32x3 opAMulF3(FfxFloat32x3 d, FfxFloat32x3 a, FfxFloat32x3 b)
{
d = a * b;
return d;
}
FfxFloat32x3 opAMulOneF3(FfxFloat32x3 d, FfxFloat32x3 a, FfxFloat32 b)
{
d = a * ffxBroadcast3(b);
return d;
}
FfxFloat32x3 opARcpF3(FfxFloat32x3 d, FfxFloat32x3 a)
{
d = rcp(a);
return d;
}

1250
thirdparty/amd-fsr2/shaders/ffx_fsr1.h vendored Normal file

File diff suppressed because it is too large Load Diff


@@ -0,0 +1,295 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef FFX_FSR2_ACCUMULATE_H
#define FFX_FSR2_ACCUMULATE_H
FfxFloat32 GetPxHrVelocity(FfxFloat32x2 fMotionVector)
{
return length(fMotionVector * DisplaySize());
}
#if FFX_HALF
FFX_MIN16_F GetPxHrVelocity(FFX_MIN16_F2 fMotionVector)
{
return length(fMotionVector * FFX_MIN16_F2(DisplaySize()));
}
#endif
void Accumulate(const AccumulationPassCommonParams params, FFX_PARAMETER_INOUT FfxFloat32x3 fHistoryColor, FfxFloat32x3 fAccumulation, FFX_PARAMETER_IN FfxFloat32x4 fUpsampledColorAndWeight)
{
// Avoid invalid values when accumulation and upsampled weight are 0
fAccumulation = ffxMax(FSR2_EPSILON.xxx, fAccumulation + fUpsampledColorAndWeight.www);
#if FFX_FSR2_OPTION_HDR_COLOR_INPUT
// YCoCg -> RGB -> Tonemap -> YCoCg (use the RGB tonemapper to avoid color desaturation)
fUpsampledColorAndWeight.xyz = RGBToYCoCg(Tonemap(YCoCgToRGB(fUpsampledColorAndWeight.xyz)));
fHistoryColor = RGBToYCoCg(Tonemap(YCoCgToRGB(fHistoryColor)));
#endif
const FfxFloat32x3 fAlpha = fUpsampledColorAndWeight.www / fAccumulation;
fHistoryColor = ffxLerp(fHistoryColor, fUpsampledColorAndWeight.xyz, fAlpha);
fHistoryColor = YCoCgToRGB(fHistoryColor);
#if FFX_FSR2_OPTION_HDR_COLOR_INPUT
fHistoryColor = InverseTonemap(fHistoryColor);
#endif
}
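// Illustrative scalar sketch (not part of the SDK) of the blend above: with
// accumulated weight A and a new sample of weight w, history moves toward the
// upsampled color by w / (A + w), i.e. a running weighted average.
FfxFloat32 AccumulateScalar(FfxFloat32 fHistory, FfxFloat32 fSample, FFX_PARAMETER_INOUT FfxFloat32 fAccumulation, FfxFloat32 fWeight)
{
    fAccumulation = ffxMax(FSR2_EPSILON, fAccumulation + fWeight);
    const FfxFloat32 fAlpha = fWeight / fAccumulation;
    return ffxLerp(fHistory, fSample, fAlpha);
}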
void RectifyHistory(
const AccumulationPassCommonParams params,
RectificationBox clippingBox,
FFX_PARAMETER_INOUT FfxFloat32x3 fHistoryColor,
FFX_PARAMETER_INOUT FfxFloat32x3 fAccumulation,
FfxFloat32 fLockContributionThisFrame,
FfxFloat32 fTemporalReactiveFactor,
FfxFloat32 fLumaInstabilityFactor)
{
FfxFloat32 fScaleFactorInfluence = ffxMin(20.0f, ffxPow(FfxFloat32(1.0f / length(DownscaleFactor().x * DownscaleFactor().y)), 3.0f));
const FfxFloat32 fVelocityFactor = ffxSaturate(params.fHrVelocity / 20.0f);
const FfxFloat32 fBoxScaleT = ffxMax(params.fDepthClipFactor, ffxMax(params.fAccumulationMask, fVelocityFactor));
FfxFloat32 fBoxScale = ffxLerp(fScaleFactorInfluence, 1.0f, fBoxScaleT);
FfxFloat32x3 fScaledBoxVec = clippingBox.boxVec * fBoxScale;
FfxFloat32x3 boxMin = clippingBox.boxCenter - fScaledBoxVec;
FfxFloat32x3 boxMax = clippingBox.boxCenter + fScaledBoxVec;
FfxFloat32x3 boxCenter = clippingBox.boxCenter;
FfxFloat32 boxVecSize = length(clippingBox.boxVec);
boxMin = ffxMax(clippingBox.aabbMin, boxMin);
boxMax = ffxMin(clippingBox.aabbMax, boxMax);
if (any(FFX_GREATER_THAN(boxMin, fHistoryColor)) || any(FFX_GREATER_THAN(fHistoryColor, boxMax))) {
const FfxFloat32x3 fClampedHistoryColor = clamp(fHistoryColor, boxMin, boxMax);
FfxFloat32x3 fHistoryContribution = ffxMax(fLumaInstabilityFactor, fLockContributionThisFrame).xxx;
const FfxFloat32 fReactiveFactor = params.fDilatedReactiveFactor;
const FfxFloat32 fReactiveContribution = 1.0f - ffxPow(fReactiveFactor, 1.0f / 2.0f);
fHistoryContribution *= fReactiveContribution;
// Scale history color using rectification info, also using accumulation mask to avoid potential invalid color protection
fHistoryColor = ffxLerp(fClampedHistoryColor, fHistoryColor, ffxSaturate(fHistoryContribution));
// Scale accumulation using rectification info
const FfxFloat32x3 fAccumulationMin = ffxMin(fAccumulation, FFX_BROADCAST_FLOAT32X3(0.1f));
fAccumulation = ffxLerp(fAccumulationMin, fAccumulation, ffxSaturate(fHistoryContribution));
}
}
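// Illustrative scalar sketch (not part of the SDK) of the clamp-then-blend
// above: pull history inside the neighborhood box, then blend back toward the
// unclamped value in proportion to how much the history is trusted.
FfxFloat32 RectifyScalar(FfxFloat32 fHistory, FfxFloat32 fBoxMin, FfxFloat32 fBoxMax, FfxFloat32 fContribution)
{
    const FfxFloat32 fClamped = clamp(fHistory, fBoxMin, fBoxMax);
    return ffxLerp(fClamped, fHistory, ffxSaturate(fContribution));
}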
void WriteUpscaledOutput(FfxInt32x2 iPxHrPos, FfxFloat32x3 fUpscaledColor)
{
StoreUpscaledOutput(iPxHrPos, fUpscaledColor);
}
void FinalizeLockStatus(const AccumulationPassCommonParams params, FfxFloat32x2 fLockStatus, FfxFloat32 fUpsampledWeight)
{
// We expect similar motion for the next frame,
// so kill the lock if that location falls outside the screen, to avoid locks being clamped to screen borders.
FfxFloat32x2 fEstimatedUvNextFrame = params.fHrUv - params.fMotionVector;
if (IsUvInside(fEstimatedUvNextFrame) == false) {
KillLock(fLockStatus);
}
else {
// Decrease lock lifetime
const FfxFloat32 fLifetimeDecreaseLanczosMax = FfxFloat32(JitterSequenceLength()) * FfxFloat32(fAverageLanczosWeightPerFrame);
const FfxFloat32 fLifetimeDecrease = FfxFloat32(fUpsampledWeight / fLifetimeDecreaseLanczosMax);
fLockStatus[LOCK_LIFETIME_REMAINING] = ffxMax(FfxFloat32(0), fLockStatus[LOCK_LIFETIME_REMAINING] - fLifetimeDecrease);
}
StoreLockStatus(params.iPxHrPos, fLockStatus);
}
FfxFloat32x3 ComputeBaseAccumulationWeight(const AccumulationPassCommonParams params, FfxFloat32 fThisFrameReactiveFactor, FfxBoolean bInMotionLastFrame, FfxFloat32 fUpsampledWeight, LockState lockState)
{
// Always assume max accumulation was reached
FfxFloat32 fBaseAccumulation = fMaxAccumulationLanczosWeight * FfxFloat32(params.bIsExistingSample) * (1.0f - fThisFrameReactiveFactor) * (1.0f - params.fDepthClipFactor);
fBaseAccumulation = ffxMin(fBaseAccumulation, ffxLerp(fBaseAccumulation, fUpsampledWeight * 10.0f, ffxMax(FfxFloat32(bInMotionLastFrame), ffxSaturate(params.fHrVelocity * FfxFloat32(10)))));
fBaseAccumulation = ffxMin(fBaseAccumulation, ffxLerp(fBaseAccumulation, fUpsampledWeight, ffxSaturate(params.fHrVelocity / FfxFloat32(20))));
return fBaseAccumulation.xxx;
}
FfxFloat32 ComputeLumaInstabilityFactor(const AccumulationPassCommonParams params, RectificationBox clippingBox, FfxFloat32 fThisFrameReactiveFactor, FfxFloat32 fLuminanceDiff)
{
const FfxFloat32 fUnormThreshold = 1.0f / 255.0f;
const FfxInt32 N_MINUS_1 = 0;
const FfxInt32 N_MINUS_2 = 1;
const FfxInt32 N_MINUS_3 = 2;
const FfxInt32 N_MINUS_4 = 3;
FfxFloat32 fCurrentFrameLuma = clippingBox.boxCenter.x;
#if FFX_FSR2_OPTION_HDR_COLOR_INPUT
fCurrentFrameLuma = fCurrentFrameLuma / (1.0f + ffxMax(0.0f, fCurrentFrameLuma));
#endif
fCurrentFrameLuma = round(fCurrentFrameLuma * 255.0f) / 255.0f;
const FfxBoolean bSampleLumaHistory = (ffxMax(ffxMax(params.fDepthClipFactor, params.fAccumulationMask), fLuminanceDiff) < 0.1f) && (params.bIsNewSample == false);
FfxFloat32x4 fCurrentFrameLumaHistory = bSampleLumaHistory ? SampleLumaHistory(params.fReprojectedHrUv) : FFX_BROADCAST_FLOAT32X4(0.0f);
FfxFloat32 fLumaInstability = 0.0f;
FfxFloat32 fDiffs0 = (fCurrentFrameLuma - fCurrentFrameLumaHistory[N_MINUS_1]);
FfxFloat32 fMin = abs(fDiffs0);
if (fMin >= fUnormThreshold)
{
for (int i = N_MINUS_2; i <= N_MINUS_4; i++) {
FfxFloat32 fDiffs1 = (fCurrentFrameLuma - fCurrentFrameLumaHistory[i]);
if (sign(fDiffs0) == sign(fDiffs1)) {
// Scale difference to protect historically similar values
const FfxFloat32 fMinBias = 1.0f;
fMin = ffxMin(fMin, abs(fDiffs1) * fMinBias);
}
}
const FfxFloat32 fBoxSize = clippingBox.boxVec.x;
const FfxFloat32 fBoxSizeFactor = ffxPow(ffxSaturate(fBoxSize / 0.1f), 6.0f);
fLumaInstability = FfxFloat32(fMin != abs(fDiffs0)) * fBoxSizeFactor;
fLumaInstability = FfxFloat32(fLumaInstability > fUnormThreshold);
fLumaInstability *= 1.0f - ffxMax(params.fAccumulationMask, ffxPow(fThisFrameReactiveFactor, 1.0f / 6.0f));
}
// Shift the luma history; N_MINUS_1 holds the most recent frame.
fCurrentFrameLumaHistory[N_MINUS_4] = fCurrentFrameLumaHistory[N_MINUS_3];
fCurrentFrameLumaHistory[N_MINUS_3] = fCurrentFrameLumaHistory[N_MINUS_2];
fCurrentFrameLumaHistory[N_MINUS_2] = fCurrentFrameLumaHistory[N_MINUS_1];
fCurrentFrameLumaHistory[N_MINUS_1] = fCurrentFrameLuma;
StoreLumaHistory(params.iPxHrPos, fCurrentFrameLumaHistory);
return fLumaInstability * FfxFloat32(fCurrentFrameLumaHistory[N_MINUS_4] != 0);
}
FfxFloat32 ComputeTemporalReactiveFactor(const AccumulationPassCommonParams params, FfxFloat32 fTemporalReactiveFactor)
{
FfxFloat32 fNewFactor = ffxMin(0.99f, fTemporalReactiveFactor);
fNewFactor = ffxMax(fNewFactor, ffxLerp(fNewFactor, 0.4f, ffxSaturate(params.fHrVelocity)));
fNewFactor = ffxMax(fNewFactor * fNewFactor, ffxMax(params.fDepthClipFactor * 0.1f, params.fDilatedReactiveFactor));
// Force reactive factor for new samples
fNewFactor = params.bIsNewSample ? 1.0f : fNewFactor;
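// Hedged note: the sign doubles as a flag here. High-velocity pixels store a
// negated (but non-zero, thanks to FSR2_EPSILON) factor, which lets the next
// frame's reprojection detect that the pixel was in motion last frame.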
if (ffxSaturate(params.fHrVelocity * 10.0f) >= 1.0f) {
fNewFactor = ffxMax(FSR2_EPSILON, fNewFactor) * -1.0f;
}
return fNewFactor;
}
AccumulationPassCommonParams InitParams(FfxInt32x2 iPxHrPos)
{
AccumulationPassCommonParams params;
params.iPxHrPos = iPxHrPos;
const FfxFloat32x2 fHrUv = (iPxHrPos + 0.5f) / DisplaySize();
params.fHrUv = fHrUv;
const FfxFloat32x2 fLrUvJittered = fHrUv + Jitter() / RenderSize();
params.fLrUv_HwSampler = ClampUv(fLrUvJittered, RenderSize(), MaxRenderSize());
params.fMotionVector = GetMotionVector(iPxHrPos, fHrUv);
params.fHrVelocity = GetPxHrVelocity(params.fMotionVector);
ComputeReprojectedUVs(params, params.fReprojectedHrUv, params.bIsExistingSample);
params.fDepthClipFactor = ffxSaturate(SampleDepthClip(params.fLrUv_HwSampler));
const FfxFloat32x2 fDilatedReactiveMasks = SampleDilatedReactiveMasks(params.fLrUv_HwSampler);
params.fDilatedReactiveFactor = fDilatedReactiveMasks.x;
params.fAccumulationMask = fDilatedReactiveMasks.y;
params.bIsResetFrame = (0 == FrameIndex());
params.bIsNewSample = (params.bIsExistingSample == false || params.bIsResetFrame);
return params;
}
void Accumulate(FfxInt32x2 iPxHrPos)
{
const AccumulationPassCommonParams params = InitParams(iPxHrPos);
FfxFloat32x3 fHistoryColor = FfxFloat32x3(0, 0, 0);
FfxFloat32x2 fLockStatus;
InitializeNewLockSample(fLockStatus);
FfxFloat32 fTemporalReactiveFactor = 0.0f;
FfxBoolean bInMotionLastFrame = FFX_FALSE;
LockState lockState = { FFX_FALSE , FFX_FALSE };
if (params.bIsExistingSample && !params.bIsResetFrame) {
ReprojectHistoryColor(params, fHistoryColor, fTemporalReactiveFactor, bInMotionLastFrame);
lockState = ReprojectHistoryLockStatus(params, fLockStatus);
}
FfxFloat32 fThisFrameReactiveFactor = ffxMax(params.fDilatedReactiveFactor, fTemporalReactiveFactor);
FfxFloat32 fLuminanceDiff = 0.0f;
FfxFloat32 fLockContributionThisFrame = 0.0f;
UpdateLockStatus(params, fThisFrameReactiveFactor, lockState, fLockStatus, fLockContributionThisFrame, fLuminanceDiff);
// Load upsampled input color
RectificationBox clippingBox;
FfxFloat32x4 fUpsampledColorAndWeight = ComputeUpsampledColorAndWeight(params, clippingBox, fThisFrameReactiveFactor);
const FfxFloat32 fLumaInstabilityFactor = ComputeLumaInstabilityFactor(params, clippingBox, fThisFrameReactiveFactor, fLuminanceDiff);
FfxFloat32x3 fAccumulation = ComputeBaseAccumulationWeight(params, fThisFrameReactiveFactor, bInMotionLastFrame, fUpsampledColorAndWeight.w, lockState);
if (params.bIsNewSample) {
fHistoryColor = YCoCgToRGB(fUpsampledColorAndWeight.xyz);
}
else {
RectifyHistory(params, clippingBox, fHistoryColor, fAccumulation, fLockContributionThisFrame, fThisFrameReactiveFactor, fLumaInstabilityFactor);
Accumulate(params, fHistoryColor, fAccumulation, fUpsampledColorAndWeight);
}
fHistoryColor = UnprepareRgb(fHistoryColor, Exposure());
FinalizeLockStatus(params, fLockStatus, fUpsampledColorAndWeight.w);
// Get new temporal reactive factor
fTemporalReactiveFactor = ComputeTemporalReactiveFactor(params, fThisFrameReactiveFactor);
StoreInternalColorAndWeight(iPxHrPos, FfxFloat32x4(fHistoryColor, fTemporalReactiveFactor));
// Output final color when RCAS is disabled
#if FFX_FSR2_OPTION_APPLY_SHARPENING == 0
WriteUpscaledOutput(iPxHrPos, fHistoryColor);
#endif
StoreNewLocks(iPxHrPos, 0);
}
#endif // FFX_FSR2_ACCUMULATE_H


@@ -0,0 +1,91 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//#version 450
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_samplerless_texture_functions : require
// Needed for rw_upscaled_output declaration
#extension GL_EXT_shader_image_load_formatted : require
#define FSR2_BIND_SRV_INPUT_EXPOSURE 0
#define FSR2_BIND_SRV_DILATED_REACTIVE_MASKS 1
#if FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS
#define FSR2_BIND_SRV_DILATED_MOTION_VECTORS 2
#else
#define FSR2_BIND_SRV_INPUT_MOTION_VECTORS 2
#endif
#define FSR2_BIND_SRV_INTERNAL_UPSCALED 3
#define FSR2_BIND_SRV_LOCK_STATUS 4
//#define FSR2_BIND_SRV_INPUT_DEPTH_CLIP 5
#define FSR2_BIND_SRV_PREPARED_INPUT_COLOR 6
#define FSR2_BIND_SRV_LUMA_INSTABILITY 7
#define FSR2_BIND_SRV_LANCZOS_LUT 8
#define FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT 9
#define FSR2_BIND_SRV_SCENE_LUMINANCE_MIPS 10
#define FSR2_BIND_SRV_AUTO_EXPOSURE 11
#define FSR2_BIND_SRV_LUMA_HISTORY 12
#define FSR2_BIND_UAV_INTERNAL_UPSCALED 13
#define FSR2_BIND_UAV_LOCK_STATUS 14
#define FSR2_BIND_UAV_UPSCALED_OUTPUT 15
#define FSR2_BIND_UAV_NEW_LOCKS 16
#define FSR2_BIND_UAV_LUMA_HISTORY 17
#define FSR2_BIND_CB_FSR2 18
#if FFX_FSR2_OPTION_GODOT_DERIVE_INVALID_MOTION_VECTORS
#define FSR2_BIND_SRV_INPUT_DEPTH 5
#endif
#include "ffx_fsr2_callbacks_glsl.h"
#include "ffx_fsr2_common.h"
#include "ffx_fsr2_sample.h"
#include "ffx_fsr2_upsample.h"
#include "ffx_fsr2_postprocess_lock_status.h"
#include "ffx_fsr2_reproject.h"
#include "ffx_fsr2_accumulate.h"
#ifndef FFX_FSR2_THREAD_GROUP_WIDTH
#define FFX_FSR2_THREAD_GROUP_WIDTH 8
#endif // #ifndef FFX_FSR2_THREAD_GROUP_WIDTH
#ifndef FFX_FSR2_THREAD_GROUP_HEIGHT
#define FFX_FSR2_THREAD_GROUP_HEIGHT 8
#endif // FFX_FSR2_THREAD_GROUP_HEIGHT
#ifndef FFX_FSR2_THREAD_GROUP_DEPTH
#define FFX_FSR2_THREAD_GROUP_DEPTH 1
#endif // #ifndef FFX_FSR2_THREAD_GROUP_DEPTH
#ifndef FFX_FSR2_NUM_THREADS
#define FFX_FSR2_NUM_THREADS layout (local_size_x = FFX_FSR2_THREAD_GROUP_WIDTH, local_size_y = FFX_FSR2_THREAD_GROUP_HEIGHT, local_size_z = FFX_FSR2_THREAD_GROUP_DEPTH) in;
#endif // #ifndef FFX_FSR2_NUM_THREADS
FFX_FSR2_NUM_THREADS
void main()
{
uvec2 uGroupId = gl_WorkGroupID.xy;
const uint GroupRows = (uint(DisplaySize().y) + FFX_FSR2_THREAD_GROUP_HEIGHT - 1) / FFX_FSR2_THREAD_GROUP_HEIGHT;
uGroupId.y = GroupRows - uGroupId.y - 1;
uvec2 uDispatchThreadId = uGroupId * uvec2(FFX_FSR2_THREAD_GROUP_WIDTH, FFX_FSR2_THREAD_GROUP_HEIGHT) + gl_LocalInvocationID.xy;
Accumulate(ivec2(uDispatchThreadId));
}


@@ -0,0 +1,93 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//#version 450
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_samplerless_texture_functions : require
#define FSR2_BIND_SRV_INPUT_OPAQUE_ONLY 0
#define FSR2_BIND_SRV_INPUT_COLOR 1
#define FSR2_BIND_UAV_AUTOREACTIVE 2
#define FSR2_BIND_CB_REACTIVE 3
#define FSR2_BIND_CB_FSR2 4
#include "ffx_fsr2_callbacks_glsl.h"
#include "ffx_fsr2_common.h"
// layout (set = 1, binding = FSR2_BIND_SRV_PRE_ALPHA_COLOR) uniform texture2D r_input_color_pre_alpha;
// layout (set = 1, binding = FSR2_BIND_SRV_POST_ALPHA_COLOR) uniform texture2D r_input_color_post_alpha;
// layout (set = 1, binding = FSR2_BIND_UAV_REACTIVE, r8) uniform image2D rw_output_reactive_mask;
#ifndef FFX_FSR2_THREAD_GROUP_WIDTH
#define FFX_FSR2_THREAD_GROUP_WIDTH 8
#endif // #ifndef FFX_FSR2_THREAD_GROUP_WIDTH
#ifndef FFX_FSR2_THREAD_GROUP_HEIGHT
#define FFX_FSR2_THREAD_GROUP_HEIGHT 8
#endif // FFX_FSR2_THREAD_GROUP_HEIGHT
#ifndef FFX_FSR2_THREAD_GROUP_DEPTH
#define FFX_FSR2_THREAD_GROUP_DEPTH 1
#endif // #ifndef FFX_FSR2_THREAD_GROUP_DEPTH
#ifndef FFX_FSR2_NUM_THREADS
#define FFX_FSR2_NUM_THREADS layout (local_size_x = FFX_FSR2_THREAD_GROUP_WIDTH, local_size_y = FFX_FSR2_THREAD_GROUP_HEIGHT, local_size_z = FFX_FSR2_THREAD_GROUP_DEPTH) in;
#endif // #ifndef FFX_FSR2_NUM_THREADS
#if defined(FSR2_BIND_CB_REACTIVE)
layout (set = 1, binding = FSR2_BIND_CB_REACTIVE, std140) uniform cbGenerateReactive_t
{
float scale;
float threshold;
float binaryValue;
uint flags;
} cbGenerateReactive;
#endif
FFX_FSR2_NUM_THREADS
void main()
{
FfxUInt32x2 uDispatchThreadId = gl_GlobalInvocationID.xy;
FfxFloat32x3 ColorPreAlpha = LoadOpaqueOnly(FFX_MIN16_I2(uDispatchThreadId)).rgb;
FfxFloat32x3 ColorPostAlpha = LoadInputColor(FFX_MIN16_I2(uDispatchThreadId)).rgb;
if ((cbGenerateReactive.flags & FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_TONEMAP) != 0)
{
ColorPreAlpha = Tonemap(ColorPreAlpha);
ColorPostAlpha = Tonemap(ColorPostAlpha);
}
if ((cbGenerateReactive.flags & FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_INVERSETONEMAP) != 0)
{
ColorPreAlpha = InverseTonemap(ColorPreAlpha);
ColorPostAlpha = InverseTonemap(ColorPostAlpha);
}
FfxFloat32 out_reactive_value = 0.f;
FfxFloat32x3 delta = abs(ColorPostAlpha - ColorPreAlpha);
out_reactive_value = ((cbGenerateReactive.flags & FFX_FSR2_AUTOREACTIVEFLAGS_USE_COMPONENTS_MAX)!=0) ? max(delta.x, max(delta.y, delta.z)) : length(delta);
out_reactive_value *= cbGenerateReactive.scale;
out_reactive_value = ((cbGenerateReactive.flags & FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_THRESHOLD)!=0) ? ((out_reactive_value < cbGenerateReactive.threshold) ? 0 : cbGenerateReactive.binaryValue) : out_reactive_value;
imageStore(rw_output_autoreactive, FfxInt32x2(uDispatchThreadId), vec4(out_reactive_value));
}
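// Illustrative sketch (not part of the SDK) of the per-pixel heuristic above:
// the reactive value is the pre/post alpha color delta (max component or
// Euclidean length), scaled, and optionally binarized by a threshold. fDelta
// is assumed to hold abs(post - pre) per channel, as computed in main() above.
FfxFloat32 ReactiveValue(FfxFloat32x3 fDelta, FfxFloat32 fScale, FfxBoolean bUseMaxComponent, FfxBoolean bApplyThreshold, FfxFloat32 fThreshold, FfxFloat32 fBinaryValue)
{
    FfxFloat32 fReactive = bUseMaxComponent ? max(fDelta.x, max(fDelta.y, fDelta.z)) : length(fDelta);
    fReactive *= fScale;
    return bApplyThreshold ? ((fReactive < fThreshold) ? 0.0f : fBinaryValue) : fReactive;
}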


@@ -0,0 +1,698 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include "ffx_fsr2_resources.h"
#if defined(FFX_GPU)
#include "ffx_core.h"
#endif // #if defined(FFX_GPU)
#if defined(FFX_GPU)
#ifndef FFX_FSR2_PREFER_WAVE64
#define FFX_FSR2_PREFER_WAVE64
#endif // #ifndef FFX_FSR2_PREFER_WAVE64
#if defined(FSR2_BIND_CB_FSR2)
layout (set = 1, binding = FSR2_BIND_CB_FSR2, std140) uniform cbFSR2_t
{
FfxInt32x2 iRenderSize;
FfxInt32x2 iMaxRenderSize;
FfxInt32x2 iDisplaySize;
FfxInt32x2 iInputColorResourceDimensions;
FfxInt32x2 iLumaMipDimensions;
FfxInt32 iLumaMipLevelToUse;
FfxInt32 iFrameIndex;
FfxFloat32x4 fDeviceToViewDepth;
FfxFloat32x2 fJitter;
FfxFloat32x2 fMotionVectorScale;
FfxFloat32x2 fDownscaleFactor;
FfxFloat32x2 fMotionVectorJitterCancellation;
FfxFloat32 fPreExposure;
FfxFloat32 fPreviousFramePreExposure;
FfxFloat32 fTanHalfFOV;
FfxFloat32 fJitterSequenceLength;
FfxFloat32 fDeltaTime;
FfxFloat32 fDynamicResChangeFactor;
FfxFloat32 fViewSpaceToMetersFactor;
FfxFloat32 fPad;
mat4 mReprojectionMatrix;
} cbFSR2;
#endif
FfxInt32x2 RenderSize()
{
return cbFSR2.iRenderSize;
}
FfxInt32x2 MaxRenderSize()
{
return cbFSR2.iMaxRenderSize;
}
FfxInt32x2 DisplaySize()
{
return cbFSR2.iDisplaySize;
}
FfxInt32x2 InputColorResourceDimensions()
{
return cbFSR2.iInputColorResourceDimensions;
}
FfxInt32x2 LumaMipDimensions()
{
return cbFSR2.iLumaMipDimensions;
}
FfxInt32 LumaMipLevelToUse()
{
return cbFSR2.iLumaMipLevelToUse;
}
FfxInt32 FrameIndex()
{
return cbFSR2.iFrameIndex;
}
FfxFloat32x4 DeviceToViewSpaceTransformFactors()
{
return cbFSR2.fDeviceToViewDepth;
}
FfxFloat32x2 Jitter()
{
return cbFSR2.fJitter;
}
FfxFloat32x2 MotionVectorScale()
{
return cbFSR2.fMotionVectorScale;
}
FfxFloat32x2 DownscaleFactor()
{
return cbFSR2.fDownscaleFactor;
}
FfxFloat32x2 MotionVectorJitterCancellation()
{
return cbFSR2.fMotionVectorJitterCancellation;
}
FfxFloat32 PreExposure()
{
return cbFSR2.fPreExposure;
}
FfxFloat32 PreviousFramePreExposure()
{
return cbFSR2.fPreviousFramePreExposure;
}
FfxFloat32 TanHalfFoV()
{
return cbFSR2.fTanHalfFOV;
}
FfxFloat32 JitterSequenceLength()
{
return cbFSR2.fJitterSequenceLength;
}
FfxFloat32 DeltaTime()
{
return cbFSR2.fDeltaTime;
}
FfxFloat32 DynamicResChangeFactor()
{
return cbFSR2.fDynamicResChangeFactor;
}
FfxFloat32 ViewSpaceToMetersFactor()
{
return cbFSR2.fViewSpaceToMetersFactor;
}
layout (set = 0, binding = 0) uniform sampler s_PointClamp;
layout (set = 0, binding = 1) uniform sampler s_LinearClamp;
// SRVs
#if defined(FSR2_BIND_SRV_INPUT_OPAQUE_ONLY)
layout (set = 1, binding = FSR2_BIND_SRV_INPUT_OPAQUE_ONLY) uniform texture2D r_input_opaque_only;
#endif
#if defined(FSR2_BIND_SRV_INPUT_COLOR)
layout (set = 1, binding = FSR2_BIND_SRV_INPUT_COLOR) uniform texture2D r_input_color_jittered;
#endif
#if defined(FSR2_BIND_SRV_INPUT_MOTION_VECTORS)
layout (set = 1, binding = FSR2_BIND_SRV_INPUT_MOTION_VECTORS) uniform texture2D r_input_motion_vectors;
#endif
#if defined(FSR2_BIND_SRV_INPUT_DEPTH)
layout (set = 1, binding = FSR2_BIND_SRV_INPUT_DEPTH) uniform texture2D r_input_depth;
#endif
#if defined(FSR2_BIND_SRV_INPUT_EXPOSURE)
layout (set = 1, binding = FSR2_BIND_SRV_INPUT_EXPOSURE) uniform texture2D r_input_exposure;
#endif
#if defined(FSR2_BIND_SRV_AUTO_EXPOSURE)
layout(set = 1, binding = FSR2_BIND_SRV_AUTO_EXPOSURE) uniform texture2D r_auto_exposure;
#endif
#if defined(FSR2_BIND_SRV_REACTIVE_MASK)
layout (set = 1, binding = FSR2_BIND_SRV_REACTIVE_MASK) uniform texture2D r_reactive_mask;
#endif
#if defined(FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK)
layout (set = 1, binding = FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK) uniform texture2D r_transparency_and_composition_mask;
#endif
#if defined(FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH)
layout (set = 1, binding = FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH) uniform utexture2D r_reconstructed_previous_nearest_depth;
#endif
#if defined(FSR2_BIND_SRV_DILATED_MOTION_VECTORS)
layout (set = 1, binding = FSR2_BIND_SRV_DILATED_MOTION_VECTORS) uniform texture2D r_dilated_motion_vectors;
#endif
#if defined (FSR2_BIND_SRV_PREVIOUS_DILATED_MOTION_VECTORS)
layout(set = 1, binding = FSR2_BIND_SRV_PREVIOUS_DILATED_MOTION_VECTORS) uniform texture2D r_previous_dilated_motion_vectors;
#endif
#if defined(FSR2_BIND_SRV_DILATED_DEPTH)
layout (set = 1, binding = FSR2_BIND_SRV_DILATED_DEPTH) uniform texture2D r_dilatedDepth;
#endif
#if defined(FSR2_BIND_SRV_INTERNAL_UPSCALED)
layout (set = 1, binding = FSR2_BIND_SRV_INTERNAL_UPSCALED) uniform texture2D r_internal_upscaled_color;
#endif
#if defined(FSR2_BIND_SRV_LOCK_STATUS)
layout (set = 1, binding = FSR2_BIND_SRV_LOCK_STATUS) uniform texture2D r_lock_status;
#endif
#if defined(FSR2_BIND_SRV_LOCK_INPUT_LUMA)
layout (set = 1, binding = FSR2_BIND_SRV_LOCK_INPUT_LUMA) uniform texture2D r_lock_input_luma;
#endif
#if defined(FSR2_BIND_SRV_NEW_LOCKS)
layout(set = 1, binding = FSR2_BIND_SRV_NEW_LOCKS) uniform texture2D r_new_locks;
#endif
#if defined(FSR2_BIND_SRV_PREPARED_INPUT_COLOR)
layout (set = 1, binding = FSR2_BIND_SRV_PREPARED_INPUT_COLOR) uniform texture2D r_prepared_input_color;
#endif
#if defined(FSR2_BIND_SRV_LUMA_HISTORY)
layout (set = 1, binding = FSR2_BIND_SRV_LUMA_HISTORY) uniform texture2D r_luma_history;
#endif
#if defined(FSR2_BIND_SRV_RCAS_INPUT)
layout (set = 1, binding = FSR2_BIND_SRV_RCAS_INPUT) uniform texture2D r_rcas_input;
#endif
#if defined(FSR2_BIND_SRV_LANCZOS_LUT)
layout (set = 1, binding = FSR2_BIND_SRV_LANCZOS_LUT) uniform texture2D r_lanczos_lut;
#endif
#if defined(FSR2_BIND_SRV_SCENE_LUMINANCE_MIPS)
layout (set = 1, binding = FSR2_BIND_SRV_SCENE_LUMINANCE_MIPS) uniform texture2D r_imgMips;
#endif
#if defined(FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT)
layout (set = 1, binding = FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT) uniform texture2D r_upsample_maximum_bias_lut;
#endif
#if defined(FSR2_BIND_SRV_DILATED_REACTIVE_MASKS)
layout (set = 1, binding = FSR2_BIND_SRV_DILATED_REACTIVE_MASKS) uniform texture2D r_dilated_reactive_masks;
#endif
#if defined(FSR2_BIND_SRV_PREV_PRE_ALPHA_COLOR)
layout(set = 1, binding = FSR2_BIND_SRV_PREV_PRE_ALPHA_COLOR) uniform texture2D r_input_prev_color_pre_alpha;
#endif
#if defined(FSR2_BIND_SRV_PREV_POST_ALPHA_COLOR)
layout(set = 1, binding = FSR2_BIND_SRV_PREV_POST_ALPHA_COLOR) uniform texture2D r_input_prev_color_post_alpha;
#endif
// UAV
#if defined FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH
layout (set = 1, binding = FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH, r32ui) uniform uimage2D rw_reconstructed_previous_nearest_depth;
#endif
#if defined FSR2_BIND_UAV_DILATED_MOTION_VECTORS
layout (set = 1, binding = FSR2_BIND_UAV_DILATED_MOTION_VECTORS, rg16f) writeonly uniform image2D rw_dilated_motion_vectors;
#endif
#if defined FSR2_BIND_UAV_DILATED_DEPTH
layout (set = 1, binding = FSR2_BIND_UAV_DILATED_DEPTH, r16f) writeonly uniform image2D rw_dilatedDepth;
#endif
#if defined FSR2_BIND_UAV_INTERNAL_UPSCALED
layout (set = 1, binding = FSR2_BIND_UAV_INTERNAL_UPSCALED, rgba16f) writeonly uniform image2D rw_internal_upscaled_color;
#endif
#if defined FSR2_BIND_UAV_LOCK_STATUS
layout (set = 1, binding = FSR2_BIND_UAV_LOCK_STATUS, rg16f) uniform image2D rw_lock_status;
#endif
#if defined(FSR2_BIND_UAV_LOCK_INPUT_LUMA)
layout(set = 1, binding = FSR2_BIND_UAV_LOCK_INPUT_LUMA, r16f) writeonly uniform image2D rw_lock_input_luma;
#endif
#if defined FSR2_BIND_UAV_NEW_LOCKS
layout(set = 1, binding = FSR2_BIND_UAV_NEW_LOCKS, r8) uniform image2D rw_new_locks;
#endif
#if defined FSR2_BIND_UAV_PREPARED_INPUT_COLOR
layout (set = 1, binding = FSR2_BIND_UAV_PREPARED_INPUT_COLOR, rgba16) writeonly uniform image2D rw_prepared_input_color;
#endif
#if defined FSR2_BIND_UAV_LUMA_HISTORY
layout (set = 1, binding = FSR2_BIND_UAV_LUMA_HISTORY, rgba8) uniform image2D rw_luma_history;
#endif
#if defined FSR2_BIND_UAV_UPSCALED_OUTPUT
layout (set = 1, binding = FSR2_BIND_UAV_UPSCALED_OUTPUT /* app controlled format */) writeonly uniform image2D rw_upscaled_output;
#endif
#if defined FSR2_BIND_UAV_EXPOSURE_MIP_LUMA_CHANGE
layout (set = 1, binding = FSR2_BIND_UAV_EXPOSURE_MIP_LUMA_CHANGE, r16f) coherent uniform image2D rw_img_mip_shading_change;
#endif
#if defined FSR2_BIND_UAV_EXPOSURE_MIP_5
layout (set = 1, binding = FSR2_BIND_UAV_EXPOSURE_MIP_5, r16f) coherent uniform image2D rw_img_mip_5;
#endif
#if defined FSR2_BIND_UAV_DILATED_REACTIVE_MASKS
layout (set = 1, binding = FSR2_BIND_UAV_DILATED_REACTIVE_MASKS, rg8) writeonly uniform image2D rw_dilated_reactive_masks;
#endif
#if defined FSR2_BIND_UAV_EXPOSURE
layout (set = 1, binding = FSR2_BIND_UAV_EXPOSURE, rg32f) uniform image2D rw_exposure;
#endif
#if defined FSR2_BIND_UAV_AUTO_EXPOSURE
layout(set = 1, binding = FSR2_BIND_UAV_AUTO_EXPOSURE, rg32f) uniform image2D rw_auto_exposure;
#endif
#if defined FSR2_BIND_UAV_SPD_GLOBAL_ATOMIC
layout (set = 1, binding = FSR2_BIND_UAV_SPD_GLOBAL_ATOMIC, r32ui) coherent uniform uimage2D rw_spd_global_atomic;
#endif
#if defined FSR2_BIND_UAV_AUTOREACTIVE
layout(set = 1, binding = FSR2_BIND_UAV_AUTOREACTIVE, r32f) uniform image2D rw_output_autoreactive;
#endif
#if defined FSR2_BIND_UAV_AUTOCOMPOSITION
layout(set = 1, binding = FSR2_BIND_UAV_AUTOCOMPOSITION, r32f) uniform image2D rw_output_autocomposition;
#endif
#if defined FSR2_BIND_UAV_PREV_PRE_ALPHA_COLOR
layout(set = 1, binding = FSR2_BIND_UAV_PREV_PRE_ALPHA_COLOR, r11f_g11f_b10f) uniform image2D rw_output_prev_color_pre_alpha;
#endif
#if defined FSR2_BIND_UAV_PREV_POST_ALPHA_COLOR
layout(set = 1, binding = FSR2_BIND_UAV_PREV_POST_ALPHA_COLOR, r11f_g11f_b10f) uniform image2D rw_output_prev_color_post_alpha;
#endif
#if defined(FSR2_BIND_SRV_SCENE_LUMINANCE_MIPS)
FfxFloat32 LoadMipLuma(FfxInt32x2 iPxPos, FfxInt32 mipLevel)
{
return texelFetch(r_imgMips, iPxPos, FfxInt32(mipLevel)).r;
}
#endif
#if defined(FSR2_BIND_SRV_SCENE_LUMINANCE_MIPS)
FfxFloat32 SampleMipLuma(FfxFloat32x2 fUV, FfxInt32 mipLevel)
{
return textureLod(sampler2D(r_imgMips, s_LinearClamp), fUV, FfxFloat32(mipLevel)).r;
}
#endif
#if defined(FSR2_BIND_SRV_INPUT_DEPTH)
FfxFloat32 LoadInputDepth(FfxInt32x2 iPxPos)
{
return texelFetch(r_input_depth, iPxPos, 0).r;
}
#endif
#if defined(FSR2_BIND_SRV_REACTIVE_MASK)
FfxFloat32 LoadReactiveMask(FfxInt32x2 iPxPos)
{
#if FFX_FSR2_OPTION_GODOT_REACTIVE_MASK_CLAMP
return min(texelFetch(r_reactive_mask, FfxInt32x2(iPxPos), 0).r, 0.9f);
#else
return texelFetch(r_reactive_mask, FfxInt32x2(iPxPos), 0).r;
#endif
}
#endif
#if defined(FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK)
FfxFloat32 LoadTransparencyAndCompositionMask(FfxUInt32x2 iPxPos)
{
return texelFetch(r_transparency_and_composition_mask, FfxInt32x2(iPxPos), 0).r;
}
#endif
#if defined(FSR2_BIND_SRV_INPUT_COLOR)
FfxFloat32x3 LoadInputColor(FfxInt32x2 iPxPos)
{
return texelFetch(r_input_color_jittered, iPxPos, 0).rgb;
}
#endif
#if defined(FSR2_BIND_SRV_INPUT_COLOR)
FfxFloat32x3 SampleInputColor(FfxFloat32x2 fUV)
{
return textureLod(sampler2D(r_input_color_jittered, s_LinearClamp), fUV, 0.0f).rgb;
}
#endif
#if defined(FSR2_BIND_SRV_PREPARED_INPUT_COLOR)
FfxFloat32x3 LoadPreparedInputColor(FfxInt32x2 iPxPos)
{
return texelFetch(r_prepared_input_color, iPxPos, 0).xyz;
}
#endif
#if defined(FSR2_BIND_SRV_INPUT_MOTION_VECTORS)
FfxFloat32x2 LoadInputMotionVector(FfxInt32x2 iPxDilatedMotionVectorPos)
{
FfxFloat32x2 fSrcMotionVector = texelFetch(r_input_motion_vectors, iPxDilatedMotionVectorPos, 0).xy;
#if FFX_FSR2_OPTION_GODOT_DERIVE_INVALID_MOTION_VECTORS
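// Godot-specific path (hedged reading): motion vectors at or below (-1, -1)
// act as an "invalid" sentinel and are re-derived from depth using the
// frame-to-frame reprojection matrix supplied in the constant buffer.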
bool bInvalidMotionVector = all(lessThanEqual(fSrcMotionVector, vec2(-1.0f, -1.0f)));
if (bInvalidMotionVector)
{
FfxFloat32 fSrcDepth = LoadInputDepth(iPxDilatedMotionVectorPos);
FfxFloat32x2 fUv = (iPxDilatedMotionVectorPos + FfxFloat32(0.5)) / RenderSize();
fSrcMotionVector = FFX_FSR2_OPTION_GODOT_DERIVE_INVALID_MOTION_VECTORS_FUNCTION(fUv, fSrcDepth, cbFSR2.mReprojectionMatrix);
}
#endif
FfxFloat32x2 fUvMotionVector = fSrcMotionVector * MotionVectorScale();
#if FFX_FSR2_OPTION_JITTERED_MOTION_VECTORS
fUvMotionVector -= MotionVectorJitterCancellation();
#endif
return fUvMotionVector;
}
#endif
#if defined(FSR2_BIND_SRV_INTERNAL_UPSCALED)
FfxFloat32x4 LoadHistory(FfxInt32x2 iPxHistory)
{
return texelFetch(r_internal_upscaled_color, iPxHistory, 0);
}
#endif
#if defined(FSR2_BIND_UAV_LUMA_HISTORY)
void StoreLumaHistory(FfxInt32x2 iPxPos, FfxFloat32x4 fLumaHistory)
{
imageStore(rw_luma_history, FfxInt32x2(iPxPos), fLumaHistory);
}
#endif
#if defined(FSR2_BIND_SRV_LUMA_HISTORY)
FfxFloat32x4 SampleLumaHistory(FfxFloat32x2 fUV)
{
return textureLod(sampler2D(r_luma_history, s_LinearClamp), fUV, 0.0f);
}
#endif
#if defined(FSR2_BIND_UAV_INTERNAL_UPSCALED)
void StoreReprojectedHistory(FfxInt32x2 iPxHistory, FfxFloat32x4 fHistory)
{
imageStore(rw_internal_upscaled_color, iPxHistory, fHistory);
}
#endif
#if defined(FSR2_BIND_UAV_INTERNAL_UPSCALED)
void StoreInternalColorAndWeight(FfxInt32x2 iPxPos, FfxFloat32x4 fColorAndWeight)
{
imageStore(rw_internal_upscaled_color, FfxInt32x2(iPxPos), fColorAndWeight);
}
#endif
#if defined(FSR2_BIND_UAV_UPSCALED_OUTPUT)
void StoreUpscaledOutput(FfxInt32x2 iPxPos, FfxFloat32x3 fColor)
{
imageStore(rw_upscaled_output, FfxInt32x2(iPxPos), FfxFloat32x4(fColor, 1.f));
}
#endif
#if defined(FSR2_BIND_SRV_LOCK_STATUS)
FfxFloat32x2 LoadLockStatus(FfxInt32x2 iPxPos)
{
FfxFloat32x2 fLockStatus = texelFetch(r_lock_status, iPxPos, 0).rg;
return fLockStatus;
}
#endif
#if defined(FSR2_BIND_UAV_LOCK_STATUS)
void StoreLockStatus(FfxInt32x2 iPxPos, FfxFloat32x2 fLockstatus)
{
imageStore(rw_lock_status, iPxPos, vec4(fLockstatus, 0.0f, 0.0f));
}
#endif
#if defined(FSR2_BIND_SRV_LOCK_INPUT_LUMA)
FfxFloat32 LoadLockInputLuma(FfxInt32x2 iPxPos)
{
return texelFetch(r_lock_input_luma, iPxPos, 0).r;
}
#endif
#if defined(FSR2_BIND_UAV_LOCK_INPUT_LUMA)
void StoreLockInputLuma(FfxInt32x2 iPxPos, FfxFloat32 fLuma)
{
imageStore(rw_lock_input_luma, iPxPos, vec4(fLuma, 0, 0, 0));
}
#endif
#if defined(FSR2_BIND_SRV_NEW_LOCKS)
FfxFloat32 LoadNewLocks(FfxInt32x2 iPxPos)
{
return texelFetch(r_new_locks, iPxPos, 0).r;
}
#endif
#if defined(FSR2_BIND_UAV_NEW_LOCKS)
FfxFloat32 LoadRwNewLocks(FfxInt32x2 iPxPos)
{
return imageLoad(rw_new_locks, iPxPos).r;
}
#endif
#if defined(FSR2_BIND_UAV_NEW_LOCKS)
void StoreNewLocks(FfxInt32x2 iPxPos, FfxFloat32 newLock)
{
imageStore(rw_new_locks, iPxPos, vec4(newLock, 0, 0, 0));
}
#endif
#if defined(FSR2_BIND_UAV_PREPARED_INPUT_COLOR)
void StorePreparedInputColor(FFX_PARAMETER_IN FfxInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32x4 fTonemapped)
{
imageStore(rw_prepared_input_color, iPxPos, fTonemapped);
}
#endif
#if defined(FSR2_BIND_SRV_PREPARED_INPUT_COLOR)
FfxFloat32 SampleDepthClip(FfxFloat32x2 fUV)
{
return textureLod(sampler2D(r_prepared_input_color, s_LinearClamp), fUV, 0.0f).w;
}
#endif
#if defined(FSR2_BIND_SRV_LOCK_STATUS)
FfxFloat32x2 SampleLockStatus(FfxFloat32x2 fUV)
{
FfxFloat32x2 fLockStatus = textureLod(sampler2D(r_lock_status, s_LinearClamp), fUV, 0.0f).rg;
return fLockStatus;
}
#endif
#if defined(FSR2_BIND_SRV_DEPTH)
FfxFloat32 LoadSceneDepth(FfxInt32x2 iPxInput)
{
return texelFetch(r_input_depth, iPxInput, 0).r;
}
#endif
#if defined(FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH)
FfxFloat32 LoadReconstructedPrevDepth(FfxInt32x2 iPxPos)
{
return uintBitsToFloat(texelFetch(r_reconstructed_previous_nearest_depth, iPxPos, 0).r);
}
#endif
#if defined(FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH)
void StoreReconstructedDepth(FfxInt32x2 iPxSample, FfxFloat32 fDepth)
{
FfxUInt32 uDepth = floatBitsToUint(fDepth);
#if FFX_FSR2_OPTION_INVERTED_DEPTH
imageAtomicMax(rw_reconstructed_previous_nearest_depth, iPxSample, uDepth);
#else
imageAtomicMin(rw_reconstructed_previous_nearest_depth, iPxSample, uDepth); // min for standard, max for inverted depth
#endif
}
#endif
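// Note: comparing depth through integer atomics (above) is valid because
// non-negative IEEE-754 floats preserve their ordering when their bit patterns
// are compared as unsigned integers.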
#if defined(FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH)
void SetReconstructedDepth(FfxInt32x2 iPxSample, FfxUInt32 uValue)
{
imageStore(rw_reconstructed_previous_nearest_depth, iPxSample, uvec4(uValue, 0, 0, 0));
}
#endif
#if defined(FSR2_BIND_UAV_DILATED_DEPTH)
void StoreDilatedDepth(FFX_PARAMETER_IN FfxInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32 fDepth)
{
//FfxUInt32 uDepth = f32tof16(fDepth);
imageStore(rw_dilatedDepth, iPxPos, vec4(fDepth, 0.0f, 0.0f, 0.0f));
}
#endif
#if defined(FSR2_BIND_UAV_DILATED_MOTION_VECTORS)
void StoreDilatedMotionVector(FFX_PARAMETER_IN FfxInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32x2 fMotionVector)
{
imageStore(rw_dilated_motion_vectors, iPxPos, vec4(fMotionVector, 0.0f, 0.0f));
}
#endif
#if defined(FSR2_BIND_SRV_DILATED_MOTION_VECTORS)
FfxFloat32x2 LoadDilatedMotionVector(FfxInt32x2 iPxInput)
{
return texelFetch(r_dilated_motion_vectors, iPxInput, 0).rg;
}
#endif
#if defined(FSR2_BIND_SRV_DILATED_MOTION_VECTORS)
FfxFloat32x2 SampleDilatedMotionVector(FfxFloat32x2 fUV)
{
return textureLod(sampler2D(r_dilated_motion_vectors, s_LinearClamp), fUV, 0.0f).rg;
}
#endif
#if defined(FSR2_BIND_SRV_PREVIOUS_DILATED_MOTION_VECTORS)
FfxFloat32x2 LoadPreviousDilatedMotionVector(FfxInt32x2 iPxInput)
{
return texelFetch(r_previous_dilated_motion_vectors, iPxInput, 0).rg;
}
FfxFloat32x2 SamplePreviousDilatedMotionVector(FfxFloat32x2 fUV)
{
return textureLod(sampler2D(r_previous_dilated_motion_vectors, s_LinearClamp), fUV, 0.0f).xy;
}
#endif
#if defined(FSR2_BIND_SRV_DILATED_DEPTH)
FfxFloat32 LoadDilatedDepth(FfxInt32x2 iPxInput)
{
return texelFetch(r_dilatedDepth, iPxInput, 0).r;
}
#endif
#if defined(FSR2_BIND_SRV_INPUT_EXPOSURE)
FfxFloat32 Exposure()
{
FfxFloat32 exposure = texelFetch(r_input_exposure, FfxInt32x2(0, 0), 0).x;
if (exposure == 0.0f) {
exposure = 1.0f;
}
return exposure;
}
#endif
#if defined(FSR2_BIND_SRV_AUTO_EXPOSURE)
FfxFloat32 AutoExposure()
{
FfxFloat32 exposure = texelFetch(r_auto_exposure, FfxInt32x2(0, 0), 0).x;
if (exposure == 0.0f) {
exposure = 1.0f;
}
return exposure;
}
#endif
FfxFloat32 SampleLanczos2Weight(FfxFloat32 x)
{
#if defined(FSR2_BIND_SRV_LANCZOS_LUT)
return textureLod(sampler2D(r_lanczos_lut, s_LinearClamp), FfxFloat32x2(x / 2.0f, 0.5f), 0.0f).x;
#else
return 0.f;
#endif
}
#if defined(FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT)
FfxFloat32 SampleUpsampleMaximumBias(FfxFloat32x2 uv)
{
// Stored as a SNORM, so make sure to multiply by 2 to retrieve the actual expected range.
return FfxFloat32(2.0f) * FfxFloat32(textureLod(sampler2D(r_upsample_maximum_bias_lut, s_LinearClamp), abs(uv) * 2.0f, 0.0f).r);
}
#endif
#if defined(FSR2_BIND_SRV_DILATED_REACTIVE_MASKS)
FfxFloat32x2 SampleDilatedReactiveMasks(FfxFloat32x2 fUV)
{
return textureLod(sampler2D(r_dilated_reactive_masks, s_LinearClamp), fUV, 0.0f).rg;
}
#endif
#if defined(FSR2_BIND_SRV_DILATED_REACTIVE_MASKS)
FfxFloat32x2 LoadDilatedReactiveMasks(FFX_PARAMETER_IN FfxInt32x2 iPxPos)
{
return texelFetch(r_dilated_reactive_masks, iPxPos, 0).rg;
}
#endif
#if defined(FSR2_BIND_UAV_DILATED_REACTIVE_MASKS)
void StoreDilatedReactiveMasks(FFX_PARAMETER_IN FfxInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32x2 fDilatedReactiveMasks)
{
imageStore(rw_dilated_reactive_masks, iPxPos, vec4(fDilatedReactiveMasks, 0.0f, 0.0f));
}
#endif
#if defined(FFX_INTERNAL)
FfxFloat32x4 SampleDebug(FfxFloat32x2 fUV)
{
return textureLod(sampler2D(r_debug_out, s_LinearClamp), fUV, 0.0f).rgba;
}
#endif
#if defined(FSR2_BIND_SRV_INPUT_OPAQUE_ONLY)
FfxFloat32x3 LoadOpaqueOnly(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos)
{
return texelFetch(r_input_opaque_only, iPxPos, 0).xyz;
}
#endif
#if defined(FSR2_BIND_SRV_PREV_PRE_ALPHA_COLOR)
FfxFloat32x3 LoadPrevPreAlpha(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos)
{
return texelFetch(r_input_prev_color_pre_alpha, iPxPos, 0).xyz;
}
#endif
#if defined(FSR2_BIND_SRV_PREV_POST_ALPHA_COLOR)
FfxFloat32x3 LoadPrevPostAlpha(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos)
{
return texelFetch(r_input_prev_color_post_alpha, iPxPos, 0).xyz;
}
#endif
#if defined(FSR2_BIND_UAV_AUTOREACTIVE)
#if defined(FSR2_BIND_UAV_AUTOCOMPOSITION)
void StoreAutoReactive(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FFX_MIN16_F2 fReactive)
{
imageStore(rw_output_autoreactive, iPxPos, vec4(FfxFloat32(fReactive.x), 0.0f, 0.0f, 0.0f));
imageStore(rw_output_autocomposition, iPxPos, vec4(FfxFloat32(fReactive.y), 0.0f, 0.0f, 0.0f));
}
#endif
#endif
#if defined(FSR2_BIND_UAV_PREV_PRE_ALPHA_COLOR)
void StorePrevPreAlpha(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FFX_MIN16_F3 color)
{
imageStore(rw_output_prev_color_pre_alpha, iPxPos, vec4(color, 0.0f));
}
#endif
#if defined(FSR2_BIND_UAV_PREV_POST_ALPHA_COLOR)
void StorePrevPostAlpha(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FFX_MIN16_F3 color)
{
imageStore(rw_output_prev_color_post_alpha, iPxPos, vec4(color, 0.0f));
}
#endif
#endif // #if defined(FFX_GPU)


@@ -0,0 +1,799 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include "ffx_fsr2_resources.h"
#if defined(FFX_GPU)
#ifdef __hlsl_dx_compiler
#pragma dxc diagnostic push
#pragma dxc diagnostic ignored "-Wambig-lit-shift"
#endif //__hlsl_dx_compiler
#include "ffx_core.h"
#ifdef __hlsl_dx_compiler
#pragma dxc diagnostic pop
#endif //__hlsl_dx_compiler
#endif // #if defined(FFX_GPU)
#if defined(FFX_GPU)
#ifndef FFX_FSR2_PREFER_WAVE64
#define FFX_FSR2_PREFER_WAVE64
#endif // #ifndef FFX_FSR2_PREFER_WAVE64
#if defined(FFX_GPU)
#pragma warning(disable: 3205) // conversion from larger type to smaller
#endif // #if defined(FFX_GPU)
#define DECLARE_SRV_REGISTER(regIndex) t##regIndex
#define DECLARE_UAV_REGISTER(regIndex) u##regIndex
#define DECLARE_CB_REGISTER(regIndex) b##regIndex
#define FFX_FSR2_DECLARE_SRV(regIndex) register(DECLARE_SRV_REGISTER(regIndex))
#define FFX_FSR2_DECLARE_UAV(regIndex) register(DECLARE_UAV_REGISTER(regIndex))
#define FFX_FSR2_DECLARE_CB(regIndex) register(DECLARE_CB_REGISTER(regIndex))
#if defined(FSR2_BIND_CB_FSR2) || defined(FFX_INTERNAL)
cbuffer cbFSR2 : FFX_FSR2_DECLARE_CB(FSR2_BIND_CB_FSR2)
{
FfxInt32x2 iRenderSize;
FfxInt32x2 iMaxRenderSize;
FfxInt32x2 iDisplaySize;
FfxInt32x2 iInputColorResourceDimensions;
FfxInt32x2 iLumaMipDimensions;
FfxInt32 iLumaMipLevelToUse;
FfxInt32 iFrameIndex;
FfxFloat32x4 fDeviceToViewDepth;
FfxFloat32x2 fJitter;
FfxFloat32x2 fMotionVectorScale;
FfxFloat32x2 fDownscaleFactor;
FfxFloat32x2 fMotionVectorJitterCancellation;
FfxFloat32 fPreExposure;
FfxFloat32 fPreviousFramePreExposure;
FfxFloat32 fTanHalfFOV;
FfxFloat32 fJitterSequenceLength;
FfxFloat32 fDeltaTime;
FfxFloat32 fDynamicResChangeFactor;
FfxFloat32 fViewSpaceToMetersFactor;
};
#define FFX_FSR2_CONSTANT_BUFFER_1_SIZE (sizeof(cbFSR2) / 4) // Number of 32-bit values. This must be kept in sync with the cbFSR2 size.
#endif
#if defined(FFX_GPU)
#define FFX_FSR2_ROOTSIG_STRINGIFY(p) FFX_FSR2_ROOTSIG_STR(p)
#define FFX_FSR2_ROOTSIG_STR(p) #p
#define FFX_FSR2_ROOTSIG [RootSignature( "DescriptorTable(UAV(u0, numDescriptors = " FFX_FSR2_ROOTSIG_STRINGIFY(FFX_FSR2_RESOURCE_IDENTIFIER_COUNT) ")), " \
"DescriptorTable(SRV(t0, numDescriptors = " FFX_FSR2_ROOTSIG_STRINGIFY(FFX_FSR2_RESOURCE_IDENTIFIER_COUNT) ")), " \
"RootConstants(num32BitConstants=" FFX_FSR2_ROOTSIG_STRINGIFY(FFX_FSR2_CONSTANT_BUFFER_1_SIZE) ", b0), " \
"StaticSampler(s0, filter = FILTER_MIN_MAG_MIP_POINT, " \
"addressU = TEXTURE_ADDRESS_CLAMP, " \
"addressV = TEXTURE_ADDRESS_CLAMP, " \
"addressW = TEXTURE_ADDRESS_CLAMP, " \
"comparisonFunc = COMPARISON_NEVER, " \
"borderColor = STATIC_BORDER_COLOR_TRANSPARENT_BLACK), " \
"StaticSampler(s1, filter = FILTER_MIN_MAG_MIP_LINEAR, " \
"addressU = TEXTURE_ADDRESS_CLAMP, " \
"addressV = TEXTURE_ADDRESS_CLAMP, " \
"addressW = TEXTURE_ADDRESS_CLAMP, " \
"comparisonFunc = COMPARISON_NEVER, " \
"borderColor = STATIC_BORDER_COLOR_TRANSPARENT_BLACK)" )]
#define FFX_FSR2_CONSTANT_BUFFER_2_SIZE 6 // Number of 32-bit values. This must be kept in sync with max( cbRCAS , cbSPD) size.
#define FFX_FSR2_CB2_ROOTSIG [RootSignature( "DescriptorTable(UAV(u0, numDescriptors = " FFX_FSR2_ROOTSIG_STRINGIFY(FFX_FSR2_RESOURCE_IDENTIFIER_COUNT) ")), " \
"DescriptorTable(SRV(t0, numDescriptors = " FFX_FSR2_ROOTSIG_STRINGIFY(FFX_FSR2_RESOURCE_IDENTIFIER_COUNT) ")), " \
"RootConstants(num32BitConstants=" FFX_FSR2_ROOTSIG_STRINGIFY(FFX_FSR2_CONSTANT_BUFFER_1_SIZE) ", b0), " \
"RootConstants(num32BitConstants=" FFX_FSR2_ROOTSIG_STRINGIFY(FFX_FSR2_CONSTANT_BUFFER_2_SIZE) ", b1), " \
"StaticSampler(s0, filter = FILTER_MIN_MAG_MIP_POINT, " \
"addressU = TEXTURE_ADDRESS_CLAMP, " \
"addressV = TEXTURE_ADDRESS_CLAMP, " \
"addressW = TEXTURE_ADDRESS_CLAMP, " \
"comparisonFunc = COMPARISON_NEVER, " \
"borderColor = STATIC_BORDER_COLOR_TRANSPARENT_BLACK), " \
"StaticSampler(s1, filter = FILTER_MIN_MAG_MIP_LINEAR, " \
"addressU = TEXTURE_ADDRESS_CLAMP, " \
"addressV = TEXTURE_ADDRESS_CLAMP, " \
"addressW = TEXTURE_ADDRESS_CLAMP, " \
"comparisonFunc = COMPARISON_NEVER, " \
"borderColor = STATIC_BORDER_COLOR_TRANSPARENT_BLACK)" )]
#if defined(FFX_FSR2_EMBED_ROOTSIG)
#define FFX_FSR2_EMBED_ROOTSIG_CONTENT FFX_FSR2_ROOTSIG
#define FFX_FSR2_EMBED_CB2_ROOTSIG_CONTENT FFX_FSR2_CB2_ROOTSIG
#else
#define FFX_FSR2_EMBED_ROOTSIG_CONTENT
#define FFX_FSR2_EMBED_CB2_ROOTSIG_CONTENT
#endif // #if FFX_FSR2_EMBED_ROOTSIG
#endif // #if defined(FFX_GPU)
/* Define getter functions in the order they are defined in the CB! */
FfxInt32x2 RenderSize()
{
return iRenderSize;
}
FfxInt32x2 MaxRenderSize()
{
return iMaxRenderSize;
}
FfxInt32x2 DisplaySize()
{
return iDisplaySize;
}
FfxInt32x2 InputColorResourceDimensions()
{
return iInputColorResourceDimensions;
}
FfxInt32x2 LumaMipDimensions()
{
return iLumaMipDimensions;
}
FfxInt32 LumaMipLevelToUse()
{
return iLumaMipLevelToUse;
}
FfxInt32 FrameIndex()
{
return iFrameIndex;
}
FfxFloat32x2 Jitter()
{
return fJitter;
}
FfxFloat32x4 DeviceToViewSpaceTransformFactors()
{
return fDeviceToViewDepth;
}
FfxFloat32x2 MotionVectorScale()
{
return fMotionVectorScale;
}
FfxFloat32x2 DownscaleFactor()
{
return fDownscaleFactor;
}
FfxFloat32x2 MotionVectorJitterCancellation()
{
return fMotionVectorJitterCancellation;
}
FfxFloat32 PreExposure()
{
return fPreExposure;
}
FfxFloat32 PreviousFramePreExposure()
{
return fPreviousFramePreExposure;
}
FfxFloat32 TanHalfFoV()
{
return fTanHalfFOV;
}
FfxFloat32 JitterSequenceLength()
{
return fJitterSequenceLength;
}
FfxFloat32 DeltaTime()
{
return fDeltaTime;
}
FfxFloat32 DynamicResChangeFactor()
{
return fDynamicResChangeFactor;
}
FfxFloat32 ViewSpaceToMetersFactor()
{
return fViewSpaceToMetersFactor;
}
SamplerState s_PointClamp : register(s0);
SamplerState s_LinearClamp : register(s1);
// SRVs
#if defined(FFX_INTERNAL)
Texture2D<FfxFloat32x4> r_input_opaque_only : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_OPAQUE_ONLY);
Texture2D<FfxFloat32x4> r_input_color_jittered : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_COLOR);
Texture2D<FfxFloat32x4> r_input_motion_vectors : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_MOTION_VECTORS);
Texture2D<FfxFloat32> r_input_depth : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_DEPTH);
Texture2D<FfxFloat32x2> r_input_exposure : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_EXPOSURE);
Texture2D<FfxFloat32x2> r_auto_exposure : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE);
Texture2D<FfxFloat32> r_reactive_mask : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_REACTIVE_MASK);
Texture2D<FfxFloat32> r_transparency_and_composition_mask : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_TRANSPARENCY_AND_COMPOSITION_MASK);
Texture2D<FfxUInt32> r_reconstructed_previous_nearest_depth : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_RECONSTRUCTED_PREVIOUS_NEAREST_DEPTH);
Texture2D<FfxFloat32x2> r_dilated_motion_vectors : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS);
Texture2D<FfxFloat32x2> r_previous_dilated_motion_vectors : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_PREVIOUS_DILATED_MOTION_VECTORS);
Texture2D<FfxFloat32> r_dilatedDepth : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_DEPTH);
Texture2D<FfxFloat32x4> r_internal_upscaled_color : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR);
Texture2D<unorm FfxFloat32x2> r_lock_status : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS);
Texture2D<FfxFloat32> r_lock_input_luma : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_INPUT_LUMA);
Texture2D<unorm FfxFloat32> r_new_locks : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_NEW_LOCKS);
Texture2D<FfxFloat32x4> r_prepared_input_color : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR);
Texture2D<FfxFloat32x4> r_luma_history : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY);
Texture2D<FfxFloat32x4> r_rcas_input : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_RCAS_INPUT);
Texture2D<FfxFloat32> r_lanczos_lut : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_LANCZOS_LUT);
Texture2D<FfxFloat32> r_imgMips : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE);
Texture2D<FfxFloat32> r_upsample_maximum_bias_lut : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTITIER_UPSAMPLE_MAXIMUM_BIAS_LUT);
Texture2D<unorm FfxFloat32x2> r_dilated_reactive_masks : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_REACTIVE_MASKS);
Texture2D<float3> r_input_prev_color_pre_alpha : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR);
Texture2D<float3> r_input_prev_color_post_alpha : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR);
Texture2D<FfxFloat32x4> r_debug_out : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_DEBUG_OUTPUT);
// UAV declarations
RWTexture2D<FfxUInt32> rw_reconstructed_previous_nearest_depth : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_RECONSTRUCTED_PREVIOUS_NEAREST_DEPTH);
RWTexture2D<FfxFloat32x2> rw_dilated_motion_vectors : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS);
RWTexture2D<FfxFloat32> rw_dilatedDepth : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_DEPTH);
RWTexture2D<FfxFloat32x4> rw_internal_upscaled_color : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR);
RWTexture2D<unorm FfxFloat32x2> rw_lock_status : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS);
RWTexture2D<FfxFloat32> rw_lock_input_luma : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_INPUT_LUMA);
RWTexture2D<unorm FfxFloat32> rw_new_locks : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_NEW_LOCKS);
RWTexture2D<FfxFloat32x4> rw_prepared_input_color : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR);
RWTexture2D<FfxFloat32x4> rw_luma_history : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY);
RWTexture2D<FfxFloat32x4> rw_upscaled_output : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_UPSCALED_OUTPUT);
globallycoherent RWTexture2D<FfxFloat32> rw_img_mip_shading_change : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_SHADING_CHANGE);
globallycoherent RWTexture2D<FfxFloat32> rw_img_mip_5 : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_5);
RWTexture2D<unorm FfxFloat32x2> rw_dilated_reactive_masks : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_REACTIVE_MASKS);
RWTexture2D<FfxFloat32x2> rw_auto_exposure : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE);
globallycoherent RWTexture2D<FfxUInt32> rw_spd_global_atomic : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_SPD_ATOMIC_COUNT);
RWTexture2D<FfxFloat32x4> rw_debug_out : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_DEBUG_OUTPUT);
RWTexture2D<float> rw_output_autoreactive : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_AUTOREACTIVE);
RWTexture2D<float> rw_output_autocomposition : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_AUTOCOMPOSITION);
RWTexture2D<float3> rw_output_prev_color_pre_alpha : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR);
RWTexture2D<float3> rw_output_prev_color_post_alpha : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR);
#else // #if defined(FFX_INTERNAL)
#if defined FSR2_BIND_SRV_INPUT_COLOR
Texture2D<FfxFloat32x4> r_input_color_jittered : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_INPUT_COLOR);
#endif
#if defined FSR2_BIND_SRV_INPUT_OPAQUE_ONLY
Texture2D<FfxFloat32x4> r_input_opaque_only : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_INPUT_OPAQUE_ONLY);
#endif
#if defined FSR2_BIND_SRV_INPUT_MOTION_VECTORS
Texture2D<FfxFloat32x4> r_input_motion_vectors : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_INPUT_MOTION_VECTORS);
#endif
#if defined FSR2_BIND_SRV_INPUT_DEPTH
Texture2D<FfxFloat32> r_input_depth : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_INPUT_DEPTH);
#endif
#if defined FSR2_BIND_SRV_INPUT_EXPOSURE
Texture2D<FfxFloat32x2> r_input_exposure : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_INPUT_EXPOSURE);
#endif
#if defined FSR2_BIND_SRV_AUTO_EXPOSURE
Texture2D<FfxFloat32x2> r_auto_exposure : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_AUTO_EXPOSURE);
#endif
#if defined FSR2_BIND_SRV_REACTIVE_MASK
Texture2D<FfxFloat32> r_reactive_mask : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_REACTIVE_MASK);
#endif
#if defined FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK
Texture2D<FfxFloat32> r_transparency_and_composition_mask : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK);
#endif
#if defined FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH
Texture2D<FfxUInt32> r_reconstructed_previous_nearest_depth : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH);
#endif
#if defined FSR2_BIND_SRV_DILATED_MOTION_VECTORS
Texture2D<FfxFloat32x2> r_dilated_motion_vectors : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_DILATED_MOTION_VECTORS);
#endif
#if defined FSR2_BIND_SRV_PREVIOUS_DILATED_MOTION_VECTORS
Texture2D<FfxFloat32x2> r_previous_dilated_motion_vectors : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_PREVIOUS_DILATED_MOTION_VECTORS);
#endif
#if defined FSR2_BIND_SRV_DILATED_DEPTH
Texture2D<FfxFloat32> r_dilatedDepth : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_DILATED_DEPTH);
#endif
#if defined FSR2_BIND_SRV_INTERNAL_UPSCALED
Texture2D<FfxFloat32x4> r_internal_upscaled_color : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_INTERNAL_UPSCALED);
#endif
#if defined FSR2_BIND_SRV_LOCK_STATUS
Texture2D<unorm FfxFloat32x2> r_lock_status : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_LOCK_STATUS);
#endif
#if defined FSR2_BIND_SRV_LOCK_INPUT_LUMA
Texture2D<FfxFloat32> r_lock_input_luma : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_LOCK_INPUT_LUMA);
#endif
#if defined FSR2_BIND_SRV_NEW_LOCKS
Texture2D<unorm FfxFloat32> r_new_locks : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_NEW_LOCKS);
#endif
#if defined FSR2_BIND_SRV_PREPARED_INPUT_COLOR
Texture2D<FfxFloat32x4> r_prepared_input_color : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_PREPARED_INPUT_COLOR);
#endif
#if defined FSR2_BIND_SRV_LUMA_HISTORY
Texture2D<unorm FfxFloat32x4> r_luma_history : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_LUMA_HISTORY);
#endif
#if defined FSR2_BIND_SRV_RCAS_INPUT
Texture2D<FfxFloat32x4> r_rcas_input : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_RCAS_INPUT);
#endif
#if defined FSR2_BIND_SRV_LANCZOS_LUT
Texture2D<FfxFloat32> r_lanczos_lut : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_LANCZOS_LUT);
#endif
#if defined FSR2_BIND_SRV_SCENE_LUMINANCE_MIPS
Texture2D<FfxFloat32> r_imgMips : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_SCENE_LUMINANCE_MIPS);
#endif
#if defined FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT
Texture2D<FfxFloat32> r_upsample_maximum_bias_lut : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT);
#endif
#if defined FSR2_BIND_SRV_DILATED_REACTIVE_MASKS
Texture2D<unorm FfxFloat32x2> r_dilated_reactive_masks : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_DILATED_REACTIVE_MASKS);
#endif
#if defined FSR2_BIND_SRV_PREV_PRE_ALPHA_COLOR
Texture2D<float3> r_input_prev_color_pre_alpha : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR);
#endif
#if defined FSR2_BIND_SRV_PREV_POST_ALPHA_COLOR
Texture2D<float3> r_input_prev_color_post_alpha : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR);
#endif
// UAV declarations
#if defined FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH
RWTexture2D<FfxUInt32> rw_reconstructed_previous_nearest_depth : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH);
#endif
#if defined FSR2_BIND_UAV_DILATED_MOTION_VECTORS
RWTexture2D<FfxFloat32x2> rw_dilated_motion_vectors : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_DILATED_MOTION_VECTORS);
#endif
#if defined FSR2_BIND_UAV_DILATED_DEPTH
RWTexture2D<FfxFloat32> rw_dilatedDepth : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_DILATED_DEPTH);
#endif
#if defined FSR2_BIND_UAV_INTERNAL_UPSCALED
RWTexture2D<FfxFloat32x4> rw_internal_upscaled_color : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_INTERNAL_UPSCALED);
#endif
#if defined FSR2_BIND_UAV_LOCK_STATUS
RWTexture2D<unorm FfxFloat32x2> rw_lock_status : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_LOCK_STATUS);
#endif
#if defined FSR2_BIND_UAV_LOCK_INPUT_LUMA
RWTexture2D<FfxFloat32> rw_lock_input_luma : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_LOCK_INPUT_LUMA);
#endif
#if defined FSR2_BIND_UAV_NEW_LOCKS
RWTexture2D<unorm FfxFloat32> rw_new_locks : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_NEW_LOCKS);
#endif
#if defined FSR2_BIND_UAV_PREPARED_INPUT_COLOR
RWTexture2D<FfxFloat32x4> rw_prepared_input_color : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_PREPARED_INPUT_COLOR);
#endif
#if defined FSR2_BIND_UAV_LUMA_HISTORY
RWTexture2D<FfxFloat32x4> rw_luma_history : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_LUMA_HISTORY);
#endif
#if defined FSR2_BIND_UAV_UPSCALED_OUTPUT
RWTexture2D<FfxFloat32x4> rw_upscaled_output : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_UPSCALED_OUTPUT);
#endif
#if defined FSR2_BIND_UAV_EXPOSURE_MIP_LUMA_CHANGE
globallycoherent RWTexture2D<FfxFloat32> rw_img_mip_shading_change : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_EXPOSURE_MIP_LUMA_CHANGE);
#endif
#if defined FSR2_BIND_UAV_EXPOSURE_MIP_5
globallycoherent RWTexture2D<FfxFloat32> rw_img_mip_5 : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_EXPOSURE_MIP_5);
#endif
#if defined FSR2_BIND_UAV_DILATED_REACTIVE_MASKS
RWTexture2D<unorm FfxFloat32x2> rw_dilated_reactive_masks : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_DILATED_REACTIVE_MASKS);
#endif
#if defined FSR2_BIND_UAV_EXPOSURE
RWTexture2D<FfxFloat32x2> rw_exposure : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_EXPOSURE);
#endif
#if defined FSR2_BIND_UAV_AUTO_EXPOSURE
RWTexture2D<FfxFloat32x2> rw_auto_exposure : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_AUTO_EXPOSURE);
#endif
#if defined FSR2_BIND_UAV_SPD_GLOBAL_ATOMIC
globallycoherent RWTexture2D<FfxUInt32> rw_spd_global_atomic : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_SPD_GLOBAL_ATOMIC);
#endif
#if defined FSR2_BIND_UAV_AUTOREACTIVE
RWTexture2D<float> rw_output_autoreactive : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_AUTOREACTIVE);
#endif
#if defined FSR2_BIND_UAV_AUTOCOMPOSITION
RWTexture2D<float> rw_output_autocomposition : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_AUTOCOMPOSITION);
#endif
#if defined FSR2_BIND_UAV_PREV_PRE_ALPHA_COLOR
RWTexture2D<float3> rw_output_prev_color_pre_alpha : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_PREV_PRE_ALPHA_COLOR);
#endif
#if defined FSR2_BIND_UAV_PREV_POST_ALPHA_COLOR
RWTexture2D<float3> rw_output_prev_color_post_alpha : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_PREV_POST_ALPHA_COLOR);
#endif
#endif // #if defined(FFX_INTERNAL)
#if defined(FSR2_BIND_SRV_SCENE_LUMINANCE_MIPS) || defined(FFX_INTERNAL)
FfxFloat32 LoadMipLuma(FfxUInt32x2 iPxPos, FfxUInt32 mipLevel)
{
return r_imgMips.mips[mipLevel][iPxPos];
}
#endif
#if defined(FSR2_BIND_SRV_SCENE_LUMINANCE_MIPS) || defined(FFX_INTERNAL)
FfxFloat32 SampleMipLuma(FfxFloat32x2 fUV, FfxUInt32 mipLevel)
{
return r_imgMips.SampleLevel(s_LinearClamp, fUV, mipLevel);
}
#endif
#if defined(FSR2_BIND_SRV_INPUT_DEPTH) || defined(FFX_INTERNAL)
FfxFloat32 LoadInputDepth(FfxUInt32x2 iPxPos)
{
return r_input_depth[iPxPos];
}
#endif
#if defined(FSR2_BIND_SRV_INPUT_DEPTH) || defined(FFX_INTERNAL)
FfxFloat32 SampleInputDepth(FfxFloat32x2 fUV)
{
return r_input_depth.SampleLevel(s_LinearClamp, fUV, 0).x;
}
#endif
#if defined(FSR2_BIND_SRV_REACTIVE_MASK) || defined(FFX_INTERNAL)
FfxFloat32 LoadReactiveMask(FfxUInt32x2 iPxPos)
{
return r_reactive_mask[iPxPos];
}
#endif
#if defined(FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK) || defined(FFX_INTERNAL)
FfxFloat32 LoadTransparencyAndCompositionMask(FfxUInt32x2 iPxPos)
{
return r_transparency_and_composition_mask[iPxPos];
}
#endif
#if defined(FSR2_BIND_SRV_INPUT_COLOR) || defined(FFX_INTERNAL)
FfxFloat32x3 LoadInputColor(FfxUInt32x2 iPxPos)
{
return r_input_color_jittered[iPxPos].rgb;
}
#endif
#if defined(FSR2_BIND_SRV_INPUT_COLOR) || defined(FFX_INTERNAL)
FfxFloat32x3 SampleInputColor(FfxFloat32x2 fUV)
{
return r_input_color_jittered.SampleLevel(s_LinearClamp, fUV, 0).rgb;
}
#endif
#if defined(FSR2_BIND_SRV_PREPARED_INPUT_COLOR) || defined(FFX_INTERNAL)
FfxFloat32x3 LoadPreparedInputColor(FfxUInt32x2 iPxPos)
{
return r_prepared_input_color[iPxPos].xyz;
}
#endif
#if defined(FSR2_BIND_SRV_INPUT_MOTION_VECTORS) || defined(FFX_INTERNAL)
FfxFloat32x2 LoadInputMotionVector(FfxUInt32x2 iPxDilatedMotionVectorPos)
{
FfxFloat32x2 fSrcMotionVector = r_input_motion_vectors[iPxDilatedMotionVectorPos].xy;
FfxFloat32x2 fUvMotionVector = fSrcMotionVector * MotionVectorScale();
#if FFX_FSR2_OPTION_JITTERED_MOTION_VECTORS
fUvMotionVector -= MotionVectorJitterCancellation();
#endif
return fUvMotionVector;
}
#endif
#if defined(FSR2_BIND_SRV_INTERNAL_UPSCALED) || defined(FFX_INTERNAL)
FfxFloat32x4 LoadHistory(FfxUInt32x2 iPxHistory)
{
return r_internal_upscaled_color[iPxHistory];
}
#endif
#if defined(FSR2_BIND_UAV_LUMA_HISTORY) || defined(FFX_INTERNAL)
void StoreLumaHistory(FfxUInt32x2 iPxPos, FfxFloat32x4 fLumaHistory)
{
rw_luma_history[iPxPos] = fLumaHistory;
}
#endif
#if defined(FSR2_BIND_SRV_LUMA_HISTORY) || defined(FFX_INTERNAL)
FfxFloat32x4 SampleLumaHistory(FfxFloat32x2 fUV)
{
return r_luma_history.SampleLevel(s_LinearClamp, fUV, 0);
}
#endif
#if defined(FFX_INTERNAL)
FfxFloat32x4 SampleDebug(FfxFloat32x2 fUV)
{
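// Note: .w below yields a scalar; HLSL implicitly splats it across all four components of the FfxFloat32x4 return value.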
return r_debug_out.SampleLevel(s_LinearClamp, fUV, 0).w;
}
#endif
#if defined(FSR2_BIND_UAV_INTERNAL_UPSCALED) || defined(FFX_INTERNAL)
void StoreReprojectedHistory(FfxUInt32x2 iPxHistory, FfxFloat32x4 fHistory)
{
rw_internal_upscaled_color[iPxHistory] = fHistory;
}
#endif
#if defined(FSR2_BIND_UAV_INTERNAL_UPSCALED) || defined(FFX_INTERNAL)
void StoreInternalColorAndWeight(FfxUInt32x2 iPxPos, FfxFloat32x4 fColorAndWeight)
{
rw_internal_upscaled_color[iPxPos] = fColorAndWeight;
}
#endif
#if defined(FSR2_BIND_UAV_UPSCALED_OUTPUT) || defined(FFX_INTERNAL)
void StoreUpscaledOutput(FfxUInt32x2 iPxPos, FfxFloat32x3 fColor)
{
rw_upscaled_output[iPxPos] = FfxFloat32x4(fColor, 1.f);
}
#endif
// LOCK_LIFETIME_REMAINING == 0
// TODO: make LockInitialLifetime() return a const 1.0f.
#if defined(FSR2_BIND_SRV_LOCK_STATUS) || defined(FFX_INTERNAL)
FfxFloat32x2 LoadLockStatus(FfxUInt32x2 iPxPos)
{
return r_lock_status[iPxPos];
}
#endif
#if defined(FSR2_BIND_UAV_LOCK_STATUS) || defined(FFX_INTERNAL)
void StoreLockStatus(FfxUInt32x2 iPxPos, FfxFloat32x2 fLockStatus)
{
rw_lock_status[iPxPos] = fLockStatus;
}
#endif
#if defined(FSR2_BIND_SRV_LOCK_INPUT_LUMA) || defined(FFX_INTERNAL)
FfxFloat32 LoadLockInputLuma(FfxUInt32x2 iPxPos)
{
return r_lock_input_luma[iPxPos];
}
#endif
#if defined(FSR2_BIND_UAV_LOCK_INPUT_LUMA) || defined(FFX_INTERNAL)
void StoreLockInputLuma(FfxUInt32x2 iPxPos, FfxFloat32 fLuma)
{
rw_lock_input_luma[iPxPos] = fLuma;
}
#endif
#if defined(FSR2_BIND_SRV_NEW_LOCKS) || defined(FFX_INTERNAL)
FfxFloat32 LoadNewLocks(FfxUInt32x2 iPxPos)
{
return r_new_locks[iPxPos];
}
#endif
#if defined(FSR2_BIND_UAV_NEW_LOCKS) || defined(FFX_INTERNAL)
FfxFloat32 LoadRwNewLocks(FfxUInt32x2 iPxPos)
{
return rw_new_locks[iPxPos];
}
#endif
#if defined(FSR2_BIND_UAV_NEW_LOCKS) || defined(FFX_INTERNAL)
void StoreNewLocks(FfxUInt32x2 iPxPos, FfxFloat32 newLock)
{
rw_new_locks[iPxPos] = newLock;
}
#endif
#if defined(FSR2_BIND_UAV_PREPARED_INPUT_COLOR) || defined(FFX_INTERNAL)
void StorePreparedInputColor(FFX_PARAMETER_IN FfxUInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32x4 fTonemapped)
{
rw_prepared_input_color[iPxPos] = fTonemapped;
}
#endif
#if defined(FSR2_BIND_SRV_PREPARED_INPUT_COLOR) || defined(FFX_INTERNAL)
FfxFloat32 SampleDepthClip(FfxFloat32x2 fUV)
{
return r_prepared_input_color.SampleLevel(s_LinearClamp, fUV, 0).w;
}
#endif
#if defined(FSR2_BIND_SRV_LOCK_STATUS) || defined(FFX_INTERNAL)
FfxFloat32x2 SampleLockStatus(FfxFloat32x2 fUV)
{
FfxFloat32x2 fLockStatus = r_lock_status.SampleLevel(s_LinearClamp, fUV, 0);
return fLockStatus;
}
#endif
#if defined(FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH) || defined(FFX_INTERNAL)
FfxFloat32 LoadReconstructedPrevDepth(FfxUInt32x2 iPxPos)
{
return asfloat(r_reconstructed_previous_nearest_depth[iPxPos]);
}
#endif
#if defined(FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH) || defined(FFX_INTERNAL)
void StoreReconstructedDepth(FfxUInt32x2 iPxSample, FfxFloat32 fDepth)
{
FfxUInt32 uDepth = asuint(fDepth);
#if FFX_FSR2_OPTION_INVERTED_DEPTH
InterlockedMax(rw_reconstructed_previous_nearest_depth[iPxSample], uDepth); // max for inverted depth
#else
InterlockedMin(rw_reconstructed_previous_nearest_depth[iPxSample], uDepth); // min for standard depth
#endif
}
#endif
#if defined(FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH) || defined(FFX_INTERNAL)
void SetReconstructedDepth(FfxUInt32x2 iPxSample, const FfxUInt32 uValue)
{
rw_reconstructed_previous_nearest_depth[iPxSample] = uValue;
}
#endif
#if defined(FSR2_BIND_UAV_DILATED_DEPTH) || defined(FFX_INTERNAL)
void StoreDilatedDepth(FFX_PARAMETER_IN FfxUInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32 fDepth)
{
rw_dilatedDepth[iPxPos] = fDepth;
}
#endif
#if defined(FSR2_BIND_UAV_DILATED_MOTION_VECTORS) || defined(FFX_INTERNAL)
void StoreDilatedMotionVector(FFX_PARAMETER_IN FfxUInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32x2 fMotionVector)
{
rw_dilated_motion_vectors[iPxPos] = fMotionVector;
}
#endif
#if defined(FSR2_BIND_SRV_DILATED_MOTION_VECTORS) || defined(FFX_INTERNAL)
FfxFloat32x2 LoadDilatedMotionVector(FfxUInt32x2 iPxInput)
{
return r_dilated_motion_vectors[iPxInput].xy;
}
#endif
#if defined(FSR2_BIND_SRV_PREVIOUS_DILATED_MOTION_VECTORS) || defined(FFX_INTERNAL)
FfxFloat32x2 LoadPreviousDilatedMotionVector(FfxUInt32x2 iPxInput)
{
return r_previous_dilated_motion_vectors[iPxInput].xy;
}
FfxFloat32x2 SamplePreviousDilatedMotionVector(FfxFloat32x2 uv)
{
return r_previous_dilated_motion_vectors.SampleLevel(s_LinearClamp, uv, 0).xy;
}
#endif
#if defined(FSR2_BIND_SRV_DILATED_DEPTH) || defined(FFX_INTERNAL)
FfxFloat32 LoadDilatedDepth(FfxUInt32x2 iPxInput)
{
return r_dilatedDepth[iPxInput];
}
#endif
#if defined(FSR2_BIND_SRV_INPUT_EXPOSURE) || defined(FFX_INTERNAL)
FfxFloat32 Exposure()
{
FfxFloat32 exposure = r_input_exposure[FfxUInt32x2(0, 0)].x;
if (exposure == 0.0f) {
exposure = 1.0f;
}
return exposure;
}
#endif
#if defined(FSR2_BIND_SRV_AUTO_EXPOSURE) || defined(FFX_INTERNAL)
FfxFloat32 AutoExposure()
{
FfxFloat32 exposure = r_auto_exposure[FfxUInt32x2(0, 0)].x;
if (exposure == 0.0f) {
exposure = 1.0f;
}
return exposure;
}
#endif
FfxFloat32 SampleLanczos2Weight(FfxFloat32 x)
{
#if defined(FSR2_BIND_SRV_LANCZOS_LUT) || defined(FFX_INTERNAL)
return r_lanczos_lut.SampleLevel(s_LinearClamp, FfxFloat32x2(x / 2, 0.5f), 0);
#else
return 0.f;
#endif
}
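// Background: the LUT sampled above stores the Lanczos-2 kernel over x in [0, 2].
// For reference, a direct analytic evaluation is sketched below (illustrative
// helper, not part of the upstream FSR2 callbacks; no pass calls it).
FfxFloat32 Lanczos2AnalyticWeight(FfxFloat32 x)
{
const FfxFloat32 fPi = 3.14159265358979f;
x = ffxMax(abs(x), 1e-6f); // avoid 0/0 at the kernel center, where the weight is 1
const FfxFloat32 a = fPi * x; // sinc(x) argument
const FfxFloat32 b = 0.5f * a; // sinc(x / 2) argument
return (sin(a) / a) * (sin(b) / b); // lanczos2(x) = sinc(x) * sinc(x / 2)
}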
#if defined(FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT) || defined(FFX_INTERNAL)
FfxFloat32 SampleUpsampleMaximumBias(FfxFloat32x2 uv)
{
// Stored as a SNORM, so make sure to multiply by 2 to retrieve the actual expected range.
return FfxFloat32(2.0) * r_upsample_maximum_bias_lut.SampleLevel(s_LinearClamp, abs(uv) * 2.0, 0);
}
#endif
#if defined(FSR2_BIND_SRV_DILATED_REACTIVE_MASKS) || defined(FFX_INTERNAL)
FfxFloat32x2 SampleDilatedReactiveMasks(FfxFloat32x2 fUV)
{
return r_dilated_reactive_masks.SampleLevel(s_LinearClamp, fUV, 0);
}
#endif
#if defined(FSR2_BIND_SRV_DILATED_REACTIVE_MASKS) || defined(FFX_INTERNAL)
FfxFloat32x2 LoadDilatedReactiveMasks(FFX_PARAMETER_IN FfxUInt32x2 iPxPos)
{
return r_dilated_reactive_masks[iPxPos];
}
#endif
#if defined(FSR2_BIND_UAV_DILATED_REACTIVE_MASKS) || defined(FFX_INTERNAL)
void StoreDilatedReactiveMasks(FFX_PARAMETER_IN FfxUInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32x2 fDilatedReactiveMasks)
{
rw_dilated_reactive_masks[iPxPos] = fDilatedReactiveMasks;
}
#endif
#if defined(FSR2_BIND_SRV_INPUT_OPAQUE_ONLY) || defined(FFX_INTERNAL)
FfxFloat32x3 LoadOpaqueOnly(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos)
{
return r_input_opaque_only[iPxPos].xyz;
}
#endif
#if defined(FSR2_BIND_SRV_PREV_PRE_ALPHA_COLOR) || defined(FFX_INTERNAL)
FfxFloat32x3 LoadPrevPreAlpha(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos)
{
return r_input_prev_color_pre_alpha[iPxPos];
}
#endif
#if defined(FSR2_BIND_SRV_PREV_POST_ALPHA_COLOR) || defined(FFX_INTERNAL)
FfxFloat32x3 LoadPrevPostAlpha(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos)
{
return r_input_prev_color_post_alpha[iPxPos];
}
#endif
#if defined(FSR2_BIND_UAV_AUTOREACTIVE) || defined(FFX_INTERNAL)
#if defined(FSR2_BIND_UAV_AUTOCOMPOSITION) || defined(FFX_INTERNAL)
void StoreAutoReactive(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FFX_MIN16_F2 fReactive)
{
rw_output_autoreactive[iPxPos] = fReactive.x;
rw_output_autocomposition[iPxPos] = fReactive.y;
}
#endif
#endif
#if defined(FSR2_BIND_UAV_PREV_PRE_ALPHA_COLOR) || defined(FFX_INTERNAL)
void StorePrevPreAlpha(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FFX_MIN16_F3 color)
{
rw_output_prev_color_pre_alpha[iPxPos] = color;
}
#endif
#if defined(FSR2_BIND_UAV_PREV_POST_ALPHA_COLOR) || defined(FFX_INTERNAL)
void StorePrevPostAlpha(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FFX_MIN16_F3 color)
{
rw_output_prev_color_post_alpha[iPxPos] = color;
}
#endif
#endif // #if defined(FFX_GPU)

View File

@@ -0,0 +1,565 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#if !defined(FFX_FSR2_COMMON_H)
#define FFX_FSR2_COMMON_H
#if defined(FFX_CPU) || defined(FFX_GPU)
//Locks
#define LOCK_LIFETIME_REMAINING 0
#define LOCK_TEMPORAL_LUMA 1
#endif // #if defined(FFX_CPU) || defined(FFX_GPU)
#if defined(FFX_GPU)
FFX_STATIC const FfxFloat32 FSR2_FP16_MIN = 6.10e-05f;
FFX_STATIC const FfxFloat32 FSR2_FP16_MAX = 65504.0f;
FFX_STATIC const FfxFloat32 FSR2_EPSILON = 1e-03f;
FFX_STATIC const FfxFloat32 FSR2_TONEMAP_EPSILON = 1.0f / FSR2_FP16_MAX;
FFX_STATIC const FfxFloat32 FSR2_FLT_MAX = 3.402823466e+38f;
FFX_STATIC const FfxFloat32 FSR2_FLT_MIN = 1.175494351e-38f;
// treat vector truncation warnings as errors
#pragma warning(error: 3206)
// suppress warnings
#pragma warning(disable: 3205) // conversion from larger type to smaller
#pragma warning(disable: 3571) // in ffxPow(f, e), f could be negative
// Reconstructed depth usage
FFX_STATIC const FfxFloat32 fReconstructedDepthBilinearWeightThreshold = 0.01f;
// Accumulation
FFX_STATIC const FfxFloat32 fUpsampleLanczosWeightScale = 1.0f / 12.0f;
FFX_STATIC const FfxFloat32 fMaxAccumulationLanczosWeight = 1.0f;
FFX_STATIC const FfxFloat32 fAverageLanczosWeightPerFrame = 0.74f * fUpsampleLanczosWeightScale; // Average Lanczos weight for jitter-accumulated samples
FFX_STATIC const FfxFloat32 fAccumulationMaxOnMotion = 3.0f * fUpsampleLanczosWeightScale;
// Auto exposure
FFX_STATIC const FfxFloat32 resetAutoExposureAverageSmoothing = 1e8f;
struct AccumulationPassCommonParams
{
FfxInt32x2 iPxHrPos;
FfxFloat32x2 fHrUv;
FfxFloat32x2 fLrUv_HwSampler;
FfxFloat32x2 fMotionVector;
FfxFloat32x2 fReprojectedHrUv;
FfxFloat32 fHrVelocity;
FfxFloat32 fDepthClipFactor;
FfxFloat32 fDilatedReactiveFactor;
FfxFloat32 fAccumulationMask;
FfxBoolean bIsResetFrame;
FfxBoolean bIsExistingSample;
FfxBoolean bIsNewSample;
};
struct LockState
{
FfxBoolean NewLock; // Set for both brand-new and re-established (relocked) locks
FfxBoolean WasLockedPrevFrame; // Set if the pixel was already locked in the previous frame (relock)
};
void InitializeNewLockSample(FFX_PARAMETER_OUT FfxFloat32x2 fLockStatus)
{
fLockStatus = FfxFloat32x2(0, 0);
}
#if FFX_HALF
void InitializeNewLockSample(FFX_PARAMETER_OUT FFX_MIN16_F2 fLockStatus)
{
fLockStatus = FFX_MIN16_F2(0, 0);
}
#endif
void KillLock(FFX_PARAMETER_INOUT FfxFloat32x2 fLockStatus)
{
fLockStatus[LOCK_LIFETIME_REMAINING] = 0;
}
#if FFX_HALF
void KillLock(FFX_PARAMETER_INOUT FFX_MIN16_F2 fLockStatus)
{
fLockStatus[LOCK_LIFETIME_REMAINING] = FFX_MIN16_F(0);
}
#endif
struct RectificationBox
{
FfxFloat32x3 boxCenter;
FfxFloat32x3 boxVec;
FfxFloat32x3 aabbMin;
FfxFloat32x3 aabbMax;
FfxFloat32 fBoxCenterWeight;
};
#if FFX_HALF
struct RectificationBoxMin16
{
FFX_MIN16_F3 boxCenter;
FFX_MIN16_F3 boxVec;
FFX_MIN16_F3 aabbMin;
FFX_MIN16_F3 aabbMax;
FFX_MIN16_F fBoxCenterWeight;
};
#endif
void RectificationBoxReset(FFX_PARAMETER_INOUT RectificationBox rectificationBox)
{
rectificationBox.fBoxCenterWeight = FfxFloat32(0);
rectificationBox.boxCenter = FfxFloat32x3(0, 0, 0);
rectificationBox.boxVec = FfxFloat32x3(0, 0, 0);
rectificationBox.aabbMin = FfxFloat32x3(FSR2_FLT_MAX, FSR2_FLT_MAX, FSR2_FLT_MAX);
rectificationBox.aabbMax = -FfxFloat32x3(FSR2_FLT_MAX, FSR2_FLT_MAX, FSR2_FLT_MAX);
}
#if FFX_HALF
void RectificationBoxReset(FFX_PARAMETER_INOUT RectificationBoxMin16 rectificationBox)
{
rectificationBox.fBoxCenterWeight = FFX_MIN16_F(0);
rectificationBox.boxCenter = FFX_MIN16_F3(0, 0, 0);
rectificationBox.boxVec = FFX_MIN16_F3(0, 0, 0);
rectificationBox.aabbMin = FFX_MIN16_F3(FSR2_FP16_MAX, FSR2_FP16_MAX, FSR2_FP16_MAX);
rectificationBox.aabbMax = -FFX_MIN16_F3(FSR2_FP16_MAX, FSR2_FP16_MAX, FSR2_FP16_MAX);
}
#endif
void RectificationBoxAddInitialSample(FFX_PARAMETER_INOUT RectificationBox rectificationBox, const FfxFloat32x3 colorSample, const FfxFloat32 fSampleWeight)
{
rectificationBox.aabbMin = colorSample;
rectificationBox.aabbMax = colorSample;
FfxFloat32x3 weightedSample = colorSample * fSampleWeight;
rectificationBox.boxCenter = weightedSample;
rectificationBox.boxVec = colorSample * weightedSample;
rectificationBox.fBoxCenterWeight = fSampleWeight;
}
void RectificationBoxAddSample(FfxBoolean bInitialSample, FFX_PARAMETER_INOUT RectificationBox rectificationBox, const FfxFloat32x3 colorSample, const FfxFloat32 fSampleWeight)
{
if (bInitialSample) {
RectificationBoxAddInitialSample(rectificationBox, colorSample, fSampleWeight);
} else {
rectificationBox.aabbMin = ffxMin(rectificationBox.aabbMin, colorSample);
rectificationBox.aabbMax = ffxMax(rectificationBox.aabbMax, colorSample);
FfxFloat32x3 weightedSample = colorSample * fSampleWeight;
rectificationBox.boxCenter += weightedSample;
rectificationBox.boxVec += colorSample * weightedSample;
rectificationBox.fBoxCenterWeight += fSampleWeight;
}
}
#if FFX_HALF
void RectificationBoxAddInitialSample(FFX_PARAMETER_INOUT RectificationBoxMin16 rectificationBox, const FFX_MIN16_F3 colorSample, const FFX_MIN16_F fSampleWeight)
{
rectificationBox.aabbMin = colorSample;
rectificationBox.aabbMax = colorSample;
FFX_MIN16_F3 weightedSample = colorSample * fSampleWeight;
rectificationBox.boxCenter = weightedSample;
rectificationBox.boxVec = colorSample * weightedSample;
rectificationBox.fBoxCenterWeight = fSampleWeight;
}
void RectificationBoxAddSample(FfxBoolean bInitialSample, FFX_PARAMETER_INOUT RectificationBoxMin16 rectificationBox, const FFX_MIN16_F3 colorSample, const FFX_MIN16_F fSampleWeight)
{
if (bInitialSample) {
RectificationBoxAddInitialSample(rectificationBox, colorSample, fSampleWeight);
} else {
rectificationBox.aabbMin = ffxMin(rectificationBox.aabbMin, colorSample);
rectificationBox.aabbMax = ffxMax(rectificationBox.aabbMax, colorSample);
FFX_MIN16_F3 weightedSample = colorSample * fSampleWeight;
rectificationBox.boxCenter += weightedSample;
rectificationBox.boxVec += colorSample * weightedSample;
rectificationBox.fBoxCenterWeight += fSampleWeight;
}
}
#endif
void RectificationBoxComputeVarianceBoxData(FFX_PARAMETER_INOUT RectificationBox rectificationBox)
{
rectificationBox.fBoxCenterWeight = (abs(rectificationBox.fBoxCenterWeight) > FfxFloat32(FSR2_EPSILON) ? rectificationBox.fBoxCenterWeight : FfxFloat32(1.f));
rectificationBox.boxCenter /= rectificationBox.fBoxCenterWeight;
rectificationBox.boxVec /= rectificationBox.fBoxCenterWeight;
FfxFloat32x3 stdDev = sqrt(abs(rectificationBox.boxVec - rectificationBox.boxCenter * rectificationBox.boxCenter));
rectificationBox.boxVec = stdDev;
}
#if FFX_HALF
void RectificationBoxComputeVarianceBoxData(FFX_PARAMETER_INOUT RectificationBoxMin16 rectificationBox)
{
rectificationBox.fBoxCenterWeight = (abs(rectificationBox.fBoxCenterWeight) > FFX_MIN16_F(FSR2_EPSILON) ? rectificationBox.fBoxCenterWeight : FFX_MIN16_F(1.f));
rectificationBox.boxCenter /= rectificationBox.fBoxCenterWeight;
rectificationBox.boxVec /= rectificationBox.fBoxCenterWeight;
FFX_MIN16_F3 stdDev = sqrt(abs(rectificationBox.boxVec - rectificationBox.boxCenter * rectificationBox.boxCenter));
rectificationBox.boxVec = stdDev;
}
#endif
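// Background: boxCenter and boxVec accumulate sum(w * c) and sum(w * c * c), so
// after the divisions boxCenter is the weighted mean E[c] and boxVec is E[c * c];
// the rewrite boxVec = sqrt(|E[c * c] - E[c]^2|) stores one standard deviation per
// channel, with abs() guarding against small negative values from floating-point
// cancellation, and the epsilon test replaces a near-zero total weight with 1 to
// keep the divisions stable.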
FfxFloat32x3 SafeRcp3(FfxFloat32x3 v)
{
return (all(FFX_NOT_EQUAL(v, FfxFloat32x3(0, 0, 0)))) ? (FfxFloat32x3(1, 1, 1) / v) : FfxFloat32x3(0, 0, 0);
}
#if FFX_HALF
FFX_MIN16_F3 SafeRcp3(FFX_MIN16_F3 v)
{
return (all(FFX_NOT_EQUAL(v, FFX_MIN16_F3(0, 0, 0)))) ? (FFX_MIN16_F3(1, 1, 1) / v) : FFX_MIN16_F3(0, 0, 0);
}
#endif
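// Note: SafeRcp3 is all-or-nothing; it returns the zero vector when any
// component of v is zero, not a per-component safe reciprocal.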
FfxFloat32 MinDividedByMax(const FfxFloat32 v0, const FfxFloat32 v1)
{
const FfxFloat32 m = ffxMax(v0, v1);
return m != 0 ? ffxMin(v0, v1) / m : 0;
}
#if FFX_HALF
FFX_MIN16_F MinDividedByMax(const FFX_MIN16_F v0, const FFX_MIN16_F v1)
{
const FFX_MIN16_F m = ffxMax(v0, v1);
return m != FFX_MIN16_F(0) ? ffxMin(v0, v1) / m : FFX_MIN16_F(0);
}
#endif
FfxFloat32x3 YCoCgToRGB(FfxFloat32x3 fYCoCg)
{
FfxFloat32x3 fRgb;
fRgb = FfxFloat32x3(
fYCoCg.x + fYCoCg.y - fYCoCg.z,
fYCoCg.x + fYCoCg.z,
fYCoCg.x - fYCoCg.y - fYCoCg.z);
return fRgb;
}
#if FFX_HALF
FFX_MIN16_F3 YCoCgToRGB(FFX_MIN16_F3 fYCoCg)
{
FFX_MIN16_F3 fRgb;
fRgb = FFX_MIN16_F3(
fYCoCg.x + fYCoCg.y - fYCoCg.z,
fYCoCg.x + fYCoCg.z,
fYCoCg.x - fYCoCg.y - fYCoCg.z);
return fRgb;
}
#endif
FfxFloat32x3 RGBToYCoCg(FfxFloat32x3 fRgb)
{
FfxFloat32x3 fYCoCg;
fYCoCg = FfxFloat32x3(
0.25f * fRgb.r + 0.5f * fRgb.g + 0.25f * fRgb.b,
0.5f * fRgb.r - 0.5f * fRgb.b,
-0.25f * fRgb.r + 0.5f * fRgb.g - 0.25f * fRgb.b);
return fYCoCg;
}
#if FFX_HALF
FFX_MIN16_F3 RGBToYCoCg(FFX_MIN16_F3 fRgb)
{
FFX_MIN16_F3 fYCoCg;
fYCoCg = FFX_MIN16_F3(
0.25 * fRgb.r + 0.5 * fRgb.g + 0.25 * fRgb.b,
0.5 * fRgb.r - 0.5 * fRgb.b,
-0.25 * fRgb.r + 0.5 * fRgb.g - 0.25 * fRgb.b);
return fYCoCg;
}
#endif
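// Background: the pair above is the standard YCoCg color transform and the two
// functions are exact inverses, e.g. RGB (1, 0, 0) maps to YCoCg
// (0.25, 0.5, -0.25) and back.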
FfxFloat32 RGBToLuma(FfxFloat32x3 fLinearRgb)
{
return dot(fLinearRgb, FfxFloat32x3(0.2126f, 0.7152f, 0.0722f));
}
#if FFX_HALF
FFX_MIN16_F RGBToLuma(FFX_MIN16_F3 fLinearRgb)
{
return dot(fLinearRgb, FFX_MIN16_F3(0.2126f, 0.7152f, 0.0722f));
}
#endif
FfxFloat32 RGBToPerceivedLuma(FfxFloat32x3 fLinearRgb)
{
FfxFloat32 fLuminance = RGBToLuma(fLinearRgb);
FfxFloat32 fPerceivedLuminance = 0;
if (fLuminance <= 216.0f / 24389.0f) {
fPerceivedLuminance = fLuminance * (24389.0f / 27.0f);
}
else {
fPerceivedLuminance = ffxPow(fLuminance, 1.0f / 3.0f) * 116.0f - 16.0f;
}
return fPerceivedLuminance * 0.01f;
}
#if FFX_HALF
FFX_MIN16_F RGBToPerceivedLuma(FFX_MIN16_F3 fLinearRgb)
{
FFX_MIN16_F fLuminance = RGBToLuma(fLinearRgb);
FFX_MIN16_F fPerceivedLuminance = FFX_MIN16_F(0);
if (fLuminance <= FFX_MIN16_F(216.0f / 24389.0f)) {
fPerceivedLuminance = fLuminance * FFX_MIN16_F(24389.0f / 27.0f);
}
else {
fPerceivedLuminance = ffxPow(fLuminance, FFX_MIN16_F(1.0f / 3.0f)) * FFX_MIN16_F(116.0f) - FFX_MIN16_F(16.0f);
}
return fPerceivedLuminance * FFX_MIN16_F(0.01f);
}
#endif
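// Background: this is the CIE 1976 lightness function L* scaled to [0, 1]:
// below the threshold (6 / 29)^3 = 216 / 24389 the response is linear with
// slope (29 / 3)^3 = 24389 / 27, above it L* = 116 * Y^(1/3) - 16, and the
// final * 0.01 maps L* from [0, 100] to [0, 1].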
FfxFloat32x3 Tonemap(FfxFloat32x3 fRgb)
{
return fRgb / (ffxMax(ffxMax(0.f, fRgb.r), ffxMax(fRgb.g, fRgb.b)) + 1.f).xxx;
}
FfxFloat32x3 InverseTonemap(FfxFloat32x3 fRgb)
{
return fRgb / ffxMax(FSR2_TONEMAP_EPSILON, 1.f - ffxMax(fRgb.r, ffxMax(fRgb.g, fRgb.b))).xxx;
}
#if FFX_HALF
FFX_MIN16_F3 Tonemap(FFX_MIN16_F3 fRgb)
{
return fRgb / (ffxMax(ffxMax(FFX_MIN16_F(0.f), fRgb.r), ffxMax(fRgb.g, fRgb.b)) + FFX_MIN16_F(1.f)).xxx;
}
FFX_MIN16_F3 InverseTonemap(FFX_MIN16_F3 fRgb)
{
return fRgb / ffxMax(FFX_MIN16_F(FSR2_TONEMAP_EPSILON), FFX_MIN16_F(1.f) - ffxMax(fRgb.r, ffxMax(fRgb.g, fRgb.b))).xxx;
}
#endif
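// Background: Tonemap() is the reversible "max component" operator
// c / (maxComp + 1), mapping [0, inf) into [0, 1) per channel, and
// InverseTonemap() undoes it up to the FSR2_TONEMAP_EPSILON clamp;
// e.g. (3, 3, 3) tonemaps to (0.75, 0.75, 0.75) and inverts back to (3, 3, 3).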
FfxInt32x2 ClampLoad(FfxInt32x2 iPxSample, FfxInt32x2 iPxOffset, FfxInt32x2 iTextureSize)
{
FfxInt32x2 result = iPxSample + iPxOffset;
result.x = (iPxOffset.x < 0) ? ffxMax(result.x, 0) : result.x;
result.x = (iPxOffset.x > 0) ? ffxMin(result.x, iTextureSize.x - 1) : result.x;
result.y = (iPxOffset.y < 0) ? ffxMax(result.y, 0) : result.y;
result.y = (iPxOffset.y > 0) ? ffxMin(result.y, iTextureSize.y - 1) : result.y;
return result;
// return ffxMed3(iPxSample + iPxOffset, FfxInt32x2(0, 0), iTextureSize - FfxInt32x2(1, 1));
}
#if FFX_HALF
FFX_MIN16_I2 ClampLoad(FFX_MIN16_I2 iPxSample, FFX_MIN16_I2 iPxOffset, FFX_MIN16_I2 iTextureSize)
{
FFX_MIN16_I2 result = iPxSample + iPxOffset;
result.x = (iPxOffset.x < 0) ? ffxMax(result.x, FFX_MIN16_I(0)) : result.x;
result.x = (iPxOffset.x > 0) ? ffxMin(result.x, iTextureSize.x - FFX_MIN16_I(1)) : result.x;
result.y = (iPxOffset.y < 0) ? ffxMax(result.y, FFX_MIN16_I(0)) : result.y;
result.y = (iPxOffset.y > 0) ? ffxMin(result.y, iTextureSize.y - FFX_MIN16_I(1)) : result.y;
return result;
// return ffxMed3Half(iPxSample + iPxOffset, FFX_MIN16_I2(0, 0), iTextureSize - FFX_MIN16_I2(1, 1));
}
#endif
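// Background: ClampLoad clamps only in the direction of the offset (the base
// position is assumed to already be in range); the commented-out ffxMed3 form
// is a branchless variant that clamps unconditionally.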
FfxFloat32x2 ClampUv(FfxFloat32x2 fUv, FfxInt32x2 iTextureSize, FfxInt32x2 iResourceSize)
{
const FfxFloat32x2 fSampleLocation = fUv * iTextureSize;
const FfxFloat32x2 fClampedLocation = ffxMax(FfxFloat32x2(0.5f, 0.5f), ffxMin(fSampleLocation, FfxFloat32x2(iTextureSize) - FfxFloat32x2(0.5f, 0.5f)));
const FfxFloat32x2 fClampedUv = fClampedLocation / FfxFloat32x2(iResourceSize);
return fClampedUv;
}
FfxBoolean IsOnScreen(FfxInt32x2 pos, FfxInt32x2 size)
{
return all(FFX_LESS_THAN(FfxUInt32x2(pos), FfxUInt32x2(size)));
}
#if FFX_HALF
FfxBoolean IsOnScreen(FFX_MIN16_I2 pos, FFX_MIN16_I2 size)
{
return all(FFX_LESS_THAN(FFX_MIN16_U2(pos), FFX_MIN16_U2(size)));
}
#endif
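// Background: the cast to unsigned makes negative coordinates wrap to very
// large values, so the single less-than comparison rejects both pos < 0 and
// pos >= size in one test.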
FfxFloat32 ComputeAutoExposureFromLavg(FfxFloat32 Lavg)
{
Lavg = exp(Lavg);
const FfxFloat32 S = 100.0f; //ISO arithmetic speed
const FfxFloat32 K = 12.5f;
FfxFloat32 ExposureISO100 = log2((Lavg * S) / K);
const FfxFloat32 q = 0.65f;
FfxFloat32 Lmax = (78.0f / (q * S)) * ffxPow(2.0f, ExposureISO100);
return 1 / Lmax;
}
#if FFX_HALF
FFX_MIN16_F ComputeAutoExposureFromLavg(FFX_MIN16_F Lavg)
{
Lavg = exp(Lavg);
const FFX_MIN16_F S = FFX_MIN16_F(100.0f); //ISO arithmetic speed
const FFX_MIN16_F K = FFX_MIN16_F(12.5f);
const FFX_MIN16_F ExposureISO100 = log2((Lavg * S) / K);
const FFX_MIN16_F q = FFX_MIN16_F(0.65f);
const FFX_MIN16_F Lmax = (FFX_MIN16_F(78.0f) / (q * S)) * ffxPow(FFX_MIN16_F(2.0f), ExposureISO100);
return FFX_MIN16_F(1) / Lmax;
}
#endif
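// Background on the model above (standard saturation-based photometry):
// ExposureISO100 is EV100 = log2(Lavg * S / K) with ISO speed S = 100 and
// reflected-light meter constant K = 12.5; Lmax = (78 / (q * S)) * 2^EV100 is
// the scene luminance that saturates the sensor given lens factor q = 0.65, so
// scaling the scene by 1 / Lmax exposes the geometric-average luminance
// (Lavg = exp of the mean log-luminance) at a nominal middle gray.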
FfxInt32x2 ComputeHrPosFromLrPos(FfxInt32x2 iPxLrPos)
{
FfxFloat32x2 fSrcJitteredPos = FfxFloat32x2(iPxLrPos) + 0.5f - Jitter();
FfxFloat32x2 fLrPosInHr = (fSrcJitteredPos / RenderSize()) * DisplaySize();
FfxInt32x2 iPxHrPos = FfxInt32x2(floor(fLrPosInHr));
return iPxHrPos;
}
#if FFX_HALF
FFX_MIN16_I2 ComputeHrPosFromLrPos(FFX_MIN16_I2 iPxLrPos)
{
FFX_MIN16_F2 fSrcJitteredPos = FFX_MIN16_F2(iPxLrPos) + FFX_MIN16_F(0.5f) - FFX_MIN16_F2(Jitter());
FFX_MIN16_F2 fLrPosInHr = (fSrcJitteredPos / FFX_MIN16_F2(RenderSize())) * FFX_MIN16_F2(DisplaySize());
FFX_MIN16_I2 iPxHrPos = FFX_MIN16_I2(floor(fLrPosInHr));
return iPxHrPos;
}
#endif
FfxFloat32x2 ComputeNdc(FfxFloat32x2 fPxPos, FfxInt32x2 iSize)
{
return fPxPos / FfxFloat32x2(iSize) * FfxFloat32x2(2.0f, -2.0f) + FfxFloat32x2(-1.0f, 1.0f);
}
FfxFloat32 GetViewSpaceDepth(FfxFloat32 fDeviceDepth)
{
const FfxFloat32x4 fDeviceToViewDepth = DeviceToViewSpaceTransformFactors();
// fDeviceToViewDepth details found in ffx_fsr2.cpp
return (fDeviceToViewDepth[1] / (fDeviceDepth - fDeviceToViewDepth[0]));
}
FfxFloat32 GetViewSpaceDepthInMeters(FfxFloat32 fDeviceDepth)
{
return GetViewSpaceDepth(fDeviceDepth) * ViewSpaceToMetersFactor();
}
FfxFloat32x3 GetViewSpacePosition(FfxInt32x2 iViewportPos, FfxInt32x2 iViewportSize, FfxFloat32 fDeviceDepth)
{
const FfxFloat32x4 fDeviceToViewDepth = DeviceToViewSpaceTransformFactors();
const FfxFloat32 Z = GetViewSpaceDepth(fDeviceDepth);
const FfxFloat32x2 fNdcPos = ComputeNdc(iViewportPos, iViewportSize);
const FfxFloat32 X = fDeviceToViewDepth[2] * fNdcPos.x * Z;
const FfxFloat32 Y = fDeviceToViewDepth[3] * fNdcPos.y * Z;
return FfxFloat32x3(X, Y, Z);
}
FfxFloat32x3 GetViewSpacePositionInMeters(FfxInt32x2 iViewportPos, FfxInt32x2 iViewportSize, FfxFloat32 fDeviceDepth)
{
return GetViewSpacePosition(iViewportPos, iViewportSize, fDeviceDepth) * ViewSpaceToMetersFactor();
}
FfxFloat32 GetMaxDistanceInMeters()
{
#if FFX_FSR2_OPTION_INVERTED_DEPTH
return GetViewSpaceDepth(0.0f) * ViewSpaceToMetersFactor();
#else
return GetViewSpaceDepth(1.0f) * ViewSpaceToMetersFactor();
#endif
}
FfxFloat32x3 PrepareRgb(FfxFloat32x3 fRgb, FfxFloat32 fExposure, FfxFloat32 fPreExposure)
{
fRgb /= fPreExposure;
fRgb *= fExposure;
fRgb = clamp(fRgb, 0.0f, FSR2_FP16_MAX);
return fRgb;
}
FfxFloat32x3 UnprepareRgb(FfxFloat32x3 fRgb, FfxFloat32 fExposure)
{
fRgb /= fExposure;
fRgb *= PreExposure();
return fRgb;
}
struct BilinearSamplingData
{
FfxInt32x2 iOffsets[4];
FfxFloat32 fWeights[4];
FfxInt32x2 iBasePos;
};
BilinearSamplingData GetBilinearSamplingData(FfxFloat32x2 fUv, FfxInt32x2 iSize)
{
BilinearSamplingData data;
FfxFloat32x2 fPxSample = (fUv * iSize) - FfxFloat32x2(0.5f, 0.5f);
data.iBasePos = FfxInt32x2(floor(fPxSample));
FfxFloat32x2 fPxFrac = ffxFract(fPxSample);
data.iOffsets[0] = FfxInt32x2(0, 0);
data.iOffsets[1] = FfxInt32x2(1, 0);
data.iOffsets[2] = FfxInt32x2(0, 1);
data.iOffsets[3] = FfxInt32x2(1, 1);
data.fWeights[0] = (1 - fPxFrac.x) * (1 - fPxFrac.y);
data.fWeights[1] = (fPxFrac.x) * (1 - fPxFrac.y);
data.fWeights[2] = (1 - fPxFrac.x) * (fPxFrac.y);
data.fWeights[3] = (fPxFrac.x) * (fPxFrac.y);
return data;
}
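// Background: iBasePos is the top-left texel of the 2x2 quad around the sample
// point and fWeights are the standard bilinear weights, which sum to 1 for any
// fractional offset; e.g. fPxFrac = (0.5, 0.5) yields four weights of 0.25.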
struct PlaneData
{
FfxFloat32x3 fNormal;
FfxFloat32 fDistanceFromOrigin;
};
PlaneData GetPlaneFromPoints(FfxFloat32x3 fP0, FfxFloat32x3 fP1, FfxFloat32x3 fP2)
{
PlaneData plane;
FfxFloat32x3 v0 = fP0 - fP1;
FfxFloat32x3 v1 = fP0 - fP2;
plane.fNormal = normalize(cross(v0, v1));
plane.fDistanceFromOrigin = -dot(fP0, plane.fNormal);
return plane;
}
FfxFloat32 PointToPlaneDistance(PlaneData plane, FfxFloat32x3 fPoint)
{
return abs(dot(plane.fNormal, fPoint) + plane.fDistanceFromOrigin);
}
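// Background: plane.fNormal is unit length by construction, so
// dot(fNormal, fPoint) + fDistanceFromOrigin is the signed distance in Hesse
// normal form and PointToPlaneDistance returns its absolute value.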
#endif // #if defined(FFX_GPU)
#endif //!defined(FFX_FSR2_COMMON_H)

View File

@@ -0,0 +1,189 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
FFX_GROUPSHARED FfxUInt32 spdCounter;
#ifndef SPD_PACKED_ONLY
FFX_GROUPSHARED FfxFloat32 spdIntermediateR[16][16];
FFX_GROUPSHARED FfxFloat32 spdIntermediateG[16][16];
FFX_GROUPSHARED FfxFloat32 spdIntermediateB[16][16];
FFX_GROUPSHARED FfxFloat32 spdIntermediateA[16][16];
FfxFloat32x4 SpdLoadSourceImage(FfxFloat32x2 tex, FfxUInt32 slice)
{
FfxFloat32x2 fUv = (tex + 0.5f + Jitter()) / RenderSize();
fUv = ClampUv(fUv, RenderSize(), InputColorResourceDimensions());
FfxFloat32x3 fRgb = SampleInputColor(fUv);
fRgb /= PreExposure();
// Compute log luma
const FfxFloat32 fLogLuma = log(ffxMax(FSR2_EPSILON, RGBToLuma(fRgb)));
// Make sure out-of-screen pixels contribute no value to the end result
const FfxFloat32 result = all(FFX_LESS_THAN(tex, RenderSize())) ? fLogLuma : 0.0f;
return FfxFloat32x4(result, 0, 0, 0);
}
FfxFloat32x4 SpdLoad(FfxInt32x2 tex, FfxUInt32 slice)
{
return SPD_LoadMipmap5(tex);
}
void SpdStore(FfxInt32x2 pix, FfxFloat32x4 outValue, FfxUInt32 index, FfxUInt32 slice)
{
if (index == LumaMipLevelToUse() || index == 5)
{
SPD_SetMipmap(pix, index, outValue.r);
}
if (index == MipCount() - 1) { //accumulate on 1x1 level
if (all(FFX_EQUAL(pix, FfxInt32x2(0, 0))))
{
FfxFloat32 prev = SPD_LoadExposureBuffer().y;
FfxFloat32 result = outValue.r;
if (prev < resetAutoExposureAverageSmoothing) // A valid previous Lavg exists (below the reset sentinel), so smooth towards the new value
{
FfxFloat32 rate = 1.0f;
result = prev + (result - prev) * (1 - exp(-DeltaTime() * rate));
}
FfxFloat32x2 spdOutput = FfxFloat32x2(ComputeAutoExposureFromLavg(result), result);
SPD_SetExposureBuffer(spdOutput);
}
}
}
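// Background: the accumulation above, prev + (result - prev) * (1 - exp(-dt * rate)),
// is a frame-rate-independent exponential moving average; with rate = 1.0 the
// stored log-luminance average converges with a time constant of one second
// regardless of frame rate.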
void SpdIncreaseAtomicCounter(FfxUInt32 slice)
{
SPD_IncreaseAtomicCounter(spdCounter);
}
FfxUInt32 SpdGetAtomicCounter()
{
return spdCounter;
}
void SpdResetAtomicCounter(FfxUInt32 slice)
{
SPD_ResetAtomicCounter();
}
FfxFloat32x4 SpdLoadIntermediate(FfxUInt32 x, FfxUInt32 y)
{
return FfxFloat32x4(
spdIntermediateR[x][y],
spdIntermediateG[x][y],
spdIntermediateB[x][y],
spdIntermediateA[x][y]);
}
void SpdStoreIntermediate(FfxUInt32 x, FfxUInt32 y, FfxFloat32x4 value)
{
spdIntermediateR[x][y] = value.x;
spdIntermediateG[x][y] = value.y;
spdIntermediateB[x][y] = value.z;
spdIntermediateA[x][y] = value.w;
}
FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3)
{
return (v0 + v1 + v2 + v3) * 0.25f;
}
#endif
// define fetch and store functions Packed
#if FFX_HALF
#error Callback must be implemented
FFX_GROUPSHARED FfxFloat16x2 spdIntermediateRG[16][16];
FFX_GROUPSHARED FfxFloat16x2 spdIntermediateBA[16][16];
FfxFloat16x4 SpdLoadSourceImageH(FfxFloat32x2 tex, FfxUInt32 slice)
{
return FfxFloat16x4(imgDst[0][FfxFloat32x3(tex, slice)]);
}
FfxFloat16x4 SpdLoadH(FfxInt32x2 p, FfxUInt32 slice)
{
return FfxFloat16x4(imgDst6[FfxUInt32x3(p, slice)]);
}
void SpdStoreH(FfxInt32x2 p, FfxFloat16x4 value, FfxUInt32 mip, FfxUInt32 slice)
{
if (mip == LumaMipLevelToUse() || mip == 5)
{
imgDst6[FfxUInt32x3(p, slice)] = FfxFloat32x4(value);
return;
}
imgDst[mip + 1][FfxUInt32x3(p, slice)] = FfxFloat32x4(value);
}
void SpdIncreaseAtomicCounter(FfxUInt32 slice)
{
InterlockedAdd(rw_spd_global_atomic[FfxInt16x2(0, 0)].counter[slice], 1, spdCounter);
}
FfxUInt32 SpdGetAtomicCounter()
{
return spdCounter;
}
void SpdResetAtomicCounter(FfxUInt32 slice)
{
rw_spd_global_atomic[FfxInt16x2(0, 0)].counter[slice] = 0;
}
FfxFloat16x4 SpdLoadIntermediateH(FfxUInt32 x, FfxUInt32 y)
{
return FfxFloat16x4(
spdIntermediateRG[x][y].x,
spdIntermediateRG[x][y].y,
spdIntermediateBA[x][y].x,
spdIntermediateBA[x][y].y);
}
void SpdStoreIntermediateH(FfxUInt32 x, FfxUInt32 y, FfxFloat16x4 value)
{
spdIntermediateRG[x][y] = value.xy;
spdIntermediateBA[x][y] = value.zw;
}
FfxFloat16x4 SpdReduce4H(FfxFloat16x4 v0, FfxFloat16x4 v1, FfxFloat16x4 v2, FfxFloat16x4 v3)
{
return (v0 + v1 + v2 + v3) * FfxFloat16(0.25);
}
#endif
#include "ffx_spd.h"
void ComputeAutoExposure(FfxUInt32x3 WorkGroupId, FfxUInt32 LocalThreadIndex)
{
#if FFX_HALF
SpdDownsampleH(
FfxUInt32x2(WorkGroupId.xy),
FfxUInt32(LocalThreadIndex),
FfxUInt32(MipCount()),
FfxUInt32(NumWorkGroups()),
FfxUInt32(WorkGroupId.z),
FfxUInt32x2(WorkGroupOffset()));
#else
SpdDownsample(
FfxUInt32x2(WorkGroupId.xy),
FfxUInt32(LocalThreadIndex),
FfxUInt32(MipCount()),
FfxUInt32(NumWorkGroups()),
FfxUInt32(WorkGroupId.z),
FfxUInt32x2(WorkGroupOffset()));
#endif
}

View File

@@ -0,0 +1,134 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//#version 450
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_samplerless_texture_functions : require
#define FSR2_BIND_SRV_INPUT_COLOR 0
#define FSR2_BIND_UAV_SPD_GLOBAL_ATOMIC 1
#define FSR2_BIND_UAV_EXPOSURE_MIP_LUMA_CHANGE 2
#define FSR2_BIND_UAV_EXPOSURE_MIP_5 3
#define FSR2_BIND_UAV_AUTO_EXPOSURE 4
#define FSR2_BIND_CB_FSR2 5
#define FSR2_BIND_CB_SPD 6
#include "ffx_fsr2_callbacks_glsl.h"
#include "ffx_fsr2_common.h"
#if defined(FSR2_BIND_CB_SPD)
layout (set = 1, binding = FSR2_BIND_CB_SPD, std140) uniform cbSPD_t
{
uint mips;
uint numWorkGroups;
uvec2 workGroupOffset;
uvec2 renderSize;
} cbSPD;
uint MipCount()
{
return cbSPD.mips;
}
uint NumWorkGroups()
{
return cbSPD.numWorkGroups;
}
uvec2 WorkGroupOffset()
{
return cbSPD.workGroupOffset;
}
uvec2 SPD_RenderSize()
{
return cbSPD.renderSize;
}
#endif
vec2 SPD_LoadExposureBuffer()
{
return imageLoad(rw_auto_exposure, ivec2(0,0)).xy;
}
void SPD_SetExposureBuffer(vec2 value)
{
imageStore(rw_auto_exposure, ivec2(0,0), vec4(value, 0.0f, 0.0f));
}
vec4 SPD_LoadMipmap5(ivec2 iPxPos)
{
return vec4(imageLoad(rw_img_mip_5, iPxPos).x, 0.0f, 0.0f, 0.0f);
}
void SPD_SetMipmap(ivec2 iPxPos, uint slice, float value)
{
switch (slice)
{
case FFX_FSR2_SHADING_CHANGE_MIP_LEVEL:
imageStore(rw_img_mip_shading_change, iPxPos, vec4(value, 0.0f, 0.0f, 0.0f));
break;
case 5:
imageStore(rw_img_mip_5, iPxPos, vec4(value, 0.0f, 0.0f, 0.0f));
break;
default:
// Avoid a stray write if the compiler flattens the switch: re-store the texel's current value.
#if defined(FSR2_BIND_UAV_EXPOSURE_MIP_LUMA_CHANGE)
imageStore(rw_img_mip_shading_change, iPxPos, vec4(imageLoad(rw_img_mip_shading_change, iPxPos).x, 0.0f, 0.0f, 0.0f));
#elif defined(FSR2_BIND_UAV_EXPOSURE_MIP_5)
imageStore(rw_img_mip_5, iPxPos, vec4(imageLoad(rw_img_mip_5, iPxPos).x, 0.0f, 0.0f, 0.0f));
#endif
break;
}
}
void SPD_IncreaseAtomicCounter(inout uint spdCounter)
{
spdCounter = imageAtomicAdd(rw_spd_global_atomic, ivec2(0,0), 1);
}
void SPD_ResetAtomicCounter()
{
imageStore(rw_spd_global_atomic, ivec2(0,0), uvec4(0));
}
#include "ffx_fsr2_compute_luminance_pyramid.h"
#ifndef FFX_FSR2_THREAD_GROUP_WIDTH
#define FFX_FSR2_THREAD_GROUP_WIDTH 256
#endif // #ifndef FFX_FSR2_THREAD_GROUP_WIDTH
#ifndef FFX_FSR2_THREAD_GROUP_HEIGHT
#define FFX_FSR2_THREAD_GROUP_HEIGHT 1
#endif // #ifndef FFX_FSR2_THREAD_GROUP_HEIGHT
#ifndef FFX_FSR2_THREAD_GROUP_DEPTH
#define FFX_FSR2_THREAD_GROUP_DEPTH 1
#endif // #ifndef FFX_FSR2_THREAD_GROUP_DEPTH
#ifndef FFX_FSR2_NUM_THREADS
#define FFX_FSR2_NUM_THREADS layout (local_size_x = FFX_FSR2_THREAD_GROUP_WIDTH, local_size_y = FFX_FSR2_THREAD_GROUP_HEIGHT, local_size_z = FFX_FSR2_THREAD_GROUP_DEPTH) in;
#endif // #ifndef FFX_FSR2_NUM_THREADS
FFX_FSR2_NUM_THREADS
void main()
{
ComputeAutoExposure(gl_WorkGroupID.xyz, gl_LocalInvocationIndex);
}

View File

@@ -0,0 +1,258 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef FFX_FSR2_DEPTH_CLIP_H
#define FFX_FSR2_DEPTH_CLIP_H
FFX_STATIC const FfxFloat32 DepthClipBaseScale = 4.0f;
FfxFloat32 ComputeDepthClip(FfxFloat32x2 fUvSample, FfxFloat32 fCurrentDepthSample)
{
FfxFloat32 fCurrentDepthViewSpace = GetViewSpaceDepth(fCurrentDepthSample);
BilinearSamplingData bilinearInfo = GetBilinearSamplingData(fUvSample, RenderSize());
FfxFloat32 fDilatedSum = 0.0f;
FfxFloat32 fDepth = 0.0f;
FfxFloat32 fWeightSum = 0.0f;
for (FfxInt32 iSampleIndex = 0; iSampleIndex < 4; iSampleIndex++) {
const FfxInt32x2 iOffset = bilinearInfo.iOffsets[iSampleIndex];
const FfxInt32x2 iSamplePos = bilinearInfo.iBasePos + iOffset;
if (IsOnScreen(iSamplePos, RenderSize())) {
const FfxFloat32 fWeight = bilinearInfo.fWeights[iSampleIndex];
if (fWeight > fReconstructedDepthBilinearWeightThreshold) {
const FfxFloat32 fPrevDepthSample = LoadReconstructedPrevDepth(iSamplePos);
const FfxFloat32 fPrevNearestDepthViewSpace = GetViewSpaceDepth(fPrevDepthSample);
const FfxFloat32 fDepthDiff = fCurrentDepthViewSpace - fPrevNearestDepthViewSpace;
if (fDepthDiff > 0.0f) {
#if FFX_FSR2_OPTION_INVERTED_DEPTH
const FfxFloat32 fPlaneDepth = ffxMin(fPrevDepthSample, fCurrentDepthSample);
#else
const FfxFloat32 fPlaneDepth = ffxMax(fPrevDepthSample, fCurrentDepthSample);
#endif
const FfxFloat32x3 fCenter = GetViewSpacePosition(FfxInt32x2(RenderSize() * 0.5f), RenderSize(), fPlaneDepth);
const FfxFloat32x3 fCorner = GetViewSpacePosition(FfxInt32x2(0, 0), RenderSize(), fPlaneDepth);
const FfxFloat32 fHalfViewportWidth = length(FfxFloat32x2(RenderSize()));
const FfxFloat32 fDepthThreshold = ffxMax(fCurrentDepthViewSpace, fPrevNearestDepthViewSpace);
const FfxFloat32 Ksep = 1.37e-05f;
const FfxFloat32 Kfov = length(fCorner) / length(fCenter);
const FfxFloat32 fRequiredDepthSeparation = Ksep * Kfov * fHalfViewportWidth * fDepthThreshold;
const FfxFloat32 fResolutionFactor = ffxSaturate(length(FfxFloat32x2(RenderSize())) / length(FfxFloat32x2(1920.0f, 1080.0f)));
const FfxFloat32 fPower = ffxLerp(1.0f, 3.0f, fResolutionFactor);
fDepth += ffxPow(ffxSaturate(FfxFloat32(fRequiredDepthSeparation / fDepthDiff)), fPower) * fWeight;
fWeightSum += fWeight;
}
}
}
}
return (fWeightSum > 0) ? ffxSaturate(1.0f - fDepth / fWeightSum) : 0.0f;
}
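// Background: the factor returned above approaches 1 where the current
// view-space depth lies far behind the reconstructed previous depth (a
// disocclusion) and 0 where the depths agree within the required separation
// Ksep * Kfov * viewportDiagonal * maxDepth, a threshold that grows with
// field of view, resolution and distance.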
FfxFloat32 ComputeMotionDivergence(FfxInt32x2 iPxPos, FfxInt32x2 iPxInputMotionVectorSize)
{
FfxFloat32 minconvergence = 1.0f;
FfxFloat32x2 fMotionVectorNucleus = LoadInputMotionVector(iPxPos);
FfxFloat32 fNucleusVelocityLr = length(fMotionVectorNucleus * RenderSize());
FfxFloat32 fMaxVelocityUv = length(fMotionVectorNucleus);
const FfxFloat32 MotionVectorVelocityEpsilon = 1e-02f;
if (fNucleusVelocityLr > MotionVectorVelocityEpsilon) {
for (FfxInt32 y = -1; y <= 1; ++y) {
for (FfxInt32 x = -1; x <= 1; ++x) {
FfxInt32x2 sp = ClampLoad(iPxPos, FfxInt32x2(x, y), iPxInputMotionVectorSize);
FfxFloat32x2 fMotionVector = LoadInputMotionVector(sp);
FfxFloat32 fVelocityUv = length(fMotionVector);
fMaxVelocityUv = ffxMax(fVelocityUv, fMaxVelocityUv);
fVelocityUv = ffxMax(fVelocityUv, fMaxVelocityUv); // fVelocityUv now equals fMaxVelocityUv (> 0 in this branch), guarding the divisions below
minconvergence = ffxMin(minconvergence, dot(fMotionVector / fVelocityUv, fMotionVectorNucleus / fVelocityUv));
}
}
}
return ffxSaturate(1.0f - minconvergence) * ffxSaturate(fMaxVelocityUv / 0.01f);
}
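// Background: minconvergence tracks the smallest agreement between the center
// motion vector and its 3x3 neighborhood (a dot product normalized by the
// running maximum speed), so the return value rises towards 1 where neighbors
// point in conflicting directions and is scaled towards 0 for near-static regions.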
FfxFloat32 ComputeDepthDivergence(FfxInt32x2 iPxPos)
{
const FfxFloat32 fMaxDistInMeters = GetMaxDistanceInMeters();
FfxFloat32 fDepthMax = 0.0f;
FfxFloat32 fDepthMin = fMaxDistInMeters;
FfxInt32 iMaxDistFound = 0;
for (FfxInt32 y = -1; y < 2; y++) {
for (FfxInt32 x = -1; x < 2; x++) {
const FfxInt32x2 iOffset = FfxInt32x2(x, y);
const FfxInt32x2 iSamplePos = iPxPos + iOffset;
const FfxFloat32 fOnScreenFactor = IsOnScreen(iSamplePos, RenderSize()) ? 1.0f : 0.0f;
FfxFloat32 fDepth = GetViewSpaceDepthInMeters(LoadDilatedDepth(iSamplePos)) * fOnScreenFactor;
iMaxDistFound |= FfxInt32(fMaxDistInMeters == fDepth);
fDepthMin = ffxMin(fDepthMin, fDepth);
fDepthMax = ffxMax(fDepthMax, fDepth);
}
}
return (1.0f - fDepthMin / fDepthMax) * (FfxBoolean(iMaxDistFound) ? 0.0f : 1.0f);
}
FfxFloat32 ComputeTemporalMotionDivergence(FfxInt32x2 iPxPos)
{
const FfxFloat32x2 fUv = FfxFloat32x2(iPxPos + 0.5f) / RenderSize();
FfxFloat32x2 fMotionVector = LoadDilatedMotionVector(iPxPos);
FfxFloat32x2 fReprojectedUv = fUv + fMotionVector;
fReprojectedUv = ClampUv(fReprojectedUv, RenderSize(), MaxRenderSize());
FfxFloat32x2 fPrevMotionVector = SamplePreviousDilatedMotionVector(fReprojectedUv);
float fPxDistance = length(fMotionVector * DisplaySize());
return fPxDistance > 1.0f ? ffxLerp(0.0f, 1.0f - ffxSaturate(length(fPrevMotionVector) / length(fMotionVector)), ffxSaturate(ffxPow(fPxDistance / 20.0f, 3.0f))) : 0;
}
void PreProcessReactiveMasks(FfxInt32x2 iPxLrPos, FfxFloat32 fMotionDivergence)
{
// Compensate for bilinear sampling in accumulation pass
FfxFloat32x3 fReferenceColor = LoadInputColor(iPxLrPos).xyz;
FfxFloat32x2 fReactiveFactor = FfxFloat32x2(0.0f, fMotionDivergence);
float fMasksSum = 0.0f;
FfxFloat32x3 fColorSamples[9];
FfxFloat32 fReactiveSamples[9];
FfxFloat32 fTransparencyAndCompositionSamples[9];
FFX_UNROLL
for (FfxInt32 y = -1; y < 2; y++) {
FFX_UNROLL
for (FfxInt32 x = -1; x < 2; x++) {
const FfxInt32x2 sampleCoord = ClampLoad(iPxLrPos, FfxInt32x2(x, y), FfxInt32x2(RenderSize()));
FfxInt32 sampleIdx = (y + 1) * 3 + x + 1;
FfxFloat32x3 fColorSample = LoadInputColor(sampleCoord).xyz;
FfxFloat32 fReactiveSample = LoadReactiveMask(sampleCoord);
FfxFloat32 fTransparencyAndCompositionSample = LoadTransparencyAndCompositionMask(sampleCoord);
fColorSamples[sampleIdx] = fColorSample;
fReactiveSamples[sampleIdx] = fReactiveSample;
fTransparencyAndCompositionSamples[sampleIdx] = fTransparencyAndCompositionSample;
fMasksSum += (fReactiveSample + fTransparencyAndCompositionSample);
}
}
if (fMasksSum > 0)
{
for (FfxInt32 sampleIdx = 0; sampleIdx < 9; sampleIdx++)
{
FfxFloat32x3 fColorSample = fColorSamples[sampleIdx];
FfxFloat32 fReactiveSample = fReactiveSamples[sampleIdx];
FfxFloat32 fTransparencyAndCompositionSample = fTransparencyAndCompositionSamples[sampleIdx];
const FfxFloat32 fMaxLenSq = ffxMax(dot(fReferenceColor, fReferenceColor), dot(fColorSample, fColorSample));
const FfxFloat32 fSimilarity = dot(fReferenceColor, fColorSample) / fMaxLenSq;
// Increase power for non-similar samples
const FfxFloat32 fPowerBiasMax = 6.0f;
const FfxFloat32 fSimilarityPower = 1.0f + (fPowerBiasMax - fSimilarity * fPowerBiasMax);
const FfxFloat32 fWeightedReactiveSample = ffxPow(fReactiveSample, fSimilarityPower);
const FfxFloat32 fWeightedTransparencyAndCompositionSample = ffxPow(fTransparencyAndCompositionSample, fSimilarityPower);
fReactiveFactor = ffxMax(fReactiveFactor, FfxFloat32x2(fWeightedReactiveSample, fWeightedTransparencyAndCompositionSample));
}
}
StoreDilatedReactiveMasks(iPxLrPos, fReactiveFactor);
}
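// Illustrative numbers for the similarity weighting above: a sample whose color
// matches the centre (fSimilarity ~= 1) keeps fSimilarityPower ~= 1 and its mask
// value passes through unchanged, while a dissimilar sample (fSimilarity ~= 0)
// is raised to the 7th power, e.g. ffxPow(0.5, 7.0) ~= 0.008, strongly
// suppressing mask contributions from unrelated pixels.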
FfxFloat32x3 ComputePreparedInputColor(FfxInt32x2 iPxLrPos)
{
//We assume linear input data. If the input is non-linear (sRGB, ...),
//it should be converted to linear first and back to sRGB on output.
FfxFloat32x3 fRgb = ffxMax(FfxFloat32x3(0, 0, 0), LoadInputColor(iPxLrPos));
fRgb = PrepareRgb(fRgb, Exposure(), PreExposure());
const FfxFloat32x3 fPreparedYCoCg = RGBToYCoCg(fRgb);
return fPreparedYCoCg;
}
FfxFloat32 EvaluateSurface(FfxInt32x2 iPxPos, FfxFloat32x2 fMotionVector)
{
FfxFloat32 d0 = GetViewSpaceDepth(LoadReconstructedPrevDepth(iPxPos + FfxInt32x2(0, -1)));
FfxFloat32 d1 = GetViewSpaceDepth(LoadReconstructedPrevDepth(iPxPos + FfxInt32x2(0, 0)));
FfxFloat32 d2 = GetViewSpaceDepth(LoadReconstructedPrevDepth(iPxPos + FfxInt32x2(0, 1)));
return 1.0f - FfxFloat32(((d0 - d1) > (d1 * 0.01f)) && ((d1 - d2) > (d2 * 0.01f)));
}
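// Note (interpretation, not from the source): the three vertical taps test for
// a monotonic depth step larger than 1% of the local depth on both sides of the
// pixel. Such a profile suggests a steep slope or disocclusion edge rather than
// a continuous surface, so the function returns 0 to cancel the depth-clip
// contribution there.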
void DepthClip(FfxInt32x2 iPxPos)
{
FfxFloat32x2 fDepthUv = (iPxPos + 0.5f) / RenderSize();
FfxFloat32x2 fMotionVector = LoadDilatedMotionVector(iPxPos);
// Discard tiny motion vectors
fMotionVector *= FfxFloat32(length(fMotionVector * DisplaySize()) > 0.01f);
const FfxFloat32x2 fDilatedUv = fDepthUv + fMotionVector;
const FfxFloat32 fDilatedDepth = LoadDilatedDepth(iPxPos);
const FfxFloat32 fCurrentDepthViewSpace = GetViewSpaceDepth(LoadInputDepth(iPxPos));
// Compute prepared input color and depth clip
FfxFloat32 fDepthClip = ComputeDepthClip(fDilatedUv, fDilatedDepth) * EvaluateSurface(iPxPos, fMotionVector);
FfxFloat32x3 fPreparedYCoCg = ComputePreparedInputColor(iPxPos);
StorePreparedInputColor(iPxPos, FfxFloat32x4(fPreparedYCoCg, fDepthClip));
// Compute dilated reactive mask
#if FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS
FfxInt32x2 iSamplePos = iPxPos;
#else
FfxInt32x2 iSamplePos = ComputeHrPosFromLrPos(iPxPos);
#endif
FfxFloat32 fMotionDivergence = ComputeMotionDivergence(iSamplePos, RenderSize());
FfxFloat32 fTemporalMotionDifference = ffxSaturate(ComputeTemporalMotionDivergence(iPxPos) - ComputeDepthDivergence(iPxPos));
PreProcessReactiveMasks(iPxPos, ffxMax(fTemporalMotionDifference, fMotionDivergence));
}
#endif //!defined( FFX_FSR2_DEPTH_CLIPH )


@@ -0,0 +1,66 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//#version 450
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_samplerless_texture_functions : require
#define FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH 0
#define FSR2_BIND_SRV_DILATED_MOTION_VECTORS 1
#define FSR2_BIND_SRV_DILATED_DEPTH 2
#define FSR2_BIND_SRV_REACTIVE_MASK 3
#define FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK 4
#define FSR2_BIND_SRV_PREVIOUS_DILATED_MOTION_VECTORS 6
#define FSR2_BIND_SRV_INPUT_MOTION_VECTORS 7
#define FSR2_BIND_SRV_INPUT_COLOR 8
#define FSR2_BIND_SRV_INPUT_DEPTH 9
#define FSR2_BIND_SRV_INPUT_EXPOSURE 10
#define FSR2_BIND_UAV_DEPTH_CLIP 11
#define FSR2_BIND_UAV_DILATED_REACTIVE_MASKS 12
#define FSR2_BIND_UAV_PREPARED_INPUT_COLOR 13
#define FSR2_BIND_CB_FSR2 14
#include "ffx_fsr2_callbacks_glsl.h"
#include "ffx_fsr2_common.h"
#include "ffx_fsr2_sample.h"
#include "ffx_fsr2_depth_clip.h"
#ifndef FFX_FSR2_THREAD_GROUP_WIDTH
#define FFX_FSR2_THREAD_GROUP_WIDTH 8
#endif // #ifndef FFX_FSR2_THREAD_GROUP_WIDTH
#ifndef FFX_FSR2_THREAD_GROUP_HEIGHT
#define FFX_FSR2_THREAD_GROUP_HEIGHT 8
#endif // #ifndef FFX_FSR2_THREAD_GROUP_HEIGHT
#ifndef FFX_FSR2_THREAD_GROUP_DEPTH
#define FFX_FSR2_THREAD_GROUP_DEPTH 1
#endif // #ifndef FFX_FSR2_THREAD_GROUP_DEPTH
#ifndef FFX_FSR2_NUM_THREADS
#define FFX_FSR2_NUM_THREADS layout (local_size_x = FFX_FSR2_THREAD_GROUP_WIDTH, local_size_y = FFX_FSR2_THREAD_GROUP_HEIGHT, local_size_z = FFX_FSR2_THREAD_GROUP_DEPTH) in;
#endif // #ifndef FFX_FSR2_NUM_THREADS
FFX_FSR2_NUM_THREADS
void main()
{
DepthClip(ivec2(gl_GlobalInvocationID.xy));
}


@@ -0,0 +1 @@
// This file doesn't exist in this version of FSR.


@@ -0,0 +1 @@
// This file doesn't exist in this version of FSR.


@@ -0,0 +1,115 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef FFX_FSR2_LOCK_H
#define FFX_FSR2_LOCK_H
void ClearResourcesForNextFrame(in FfxInt32x2 iPxHrPos)
{
if (all(FFX_LESS_THAN(iPxHrPos, FfxInt32x2(RenderSize()))))
{
#if FFX_FSR2_OPTION_INVERTED_DEPTH
const FfxUInt32 farZ = 0x0;
#else
const FfxUInt32 farZ = 0x3f800000;
#endif
SetReconstructedDepth(iPxHrPos, farZ);
}
}
FfxBoolean ComputeThinFeatureConfidence(FfxInt32x2 pos)
{
const FfxInt32 RADIUS = 1;
FfxFloat32 fNucleus = LoadLockInputLuma(pos);
FfxFloat32 similar_threshold = 1.05f;
FfxFloat32 dissimilarLumaMin = FSR2_FLT_MAX;
FfxFloat32 dissimilarLumaMax = 0;
/*
0 1 2
3 4 5
6 7 8
*/
#define SETBIT(x) (1U << x)
FfxUInt32 mask = SETBIT(4); //flag fNucleus as similar
const FfxUInt32 uNumRejectionMasks = 4;
const FfxUInt32 uRejectionMasks[uNumRejectionMasks] = {
SETBIT(0) | SETBIT(1) | SETBIT(3) | SETBIT(4), //Upper left
SETBIT(1) | SETBIT(2) | SETBIT(4) | SETBIT(5), //Upper right
SETBIT(3) | SETBIT(4) | SETBIT(6) | SETBIT(7), //Lower left
SETBIT(4) | SETBIT(5) | SETBIT(7) | SETBIT(8), //Lower right
};
FfxInt32 idx = 0;
FFX_UNROLL
for (FfxInt32 y = -RADIUS; y <= RADIUS; y++) {
FFX_UNROLL
for (FfxInt32 x = -RADIUS; x <= RADIUS; x++, idx++) {
if (x == 0 && y == 0) continue;
FfxInt32x2 samplePos = ClampLoad(pos, FfxInt32x2(x, y), FfxInt32x2(RenderSize()));
FfxFloat32 sampleLuma = LoadLockInputLuma(samplePos);
FfxFloat32 difference = ffxMax(sampleLuma, fNucleus) / ffxMin(sampleLuma, fNucleus);
if (difference > 0 && (difference < similar_threshold)) {
mask |= SETBIT(idx);
} else {
dissimilarLumaMin = ffxMin(dissimilarLumaMin, sampleLuma);
dissimilarLumaMax = ffxMax(dissimilarLumaMax, sampleLuma);
}
}
}
FfxBoolean isRidge = fNucleus > dissimilarLumaMax || fNucleus < dissimilarLumaMin;
if (FFX_FALSE == isRidge) {
return false;
}
FFX_UNROLL
for (FfxInt32 i = 0; i < 4; i++) {
if ((mask & uRejectionMasks[i]) == uRejectionMasks[i]) {
return false;
}
}
return true;
}
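#if 0
// Illustrative (disabled) sketch, not part of the source: a one-pixel-wide
// bright vertical line marks only the centre column as similar, i.e. bits
// 1, 4 and 7. None of the four 2x2 quadrant masks is fully contained in that
// pattern, and the nucleus is brighter than every dissimilar neighbour, so the
// pixel is classified as a thin feature and receives a lock.
FfxUInt32 uExampleThinLineMask = SETBIT(1) | SETBIT(4) | SETBIT(7);
#endif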
void ComputeLock(FfxInt32x2 iPxLrPos)
{
if (ComputeThinFeatureConfidence(iPxLrPos))
{
StoreNewLocks(ComputeHrPosFromLrPos(iPxLrPos), 1.f);
}
ClearResourcesForNextFrame(iPxLrPos);
}
#endif // FFX_FSR2_LOCK_H


@@ -0,0 +1,56 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//#version 450
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_samplerless_texture_functions : require
#define FSR2_BIND_SRV_LOCK_INPUT_LUMA 0
#define FSR2_BIND_UAV_NEW_LOCKS 1
#define FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH 2
#define FSR2_BIND_CB_FSR2 3
#include "ffx_fsr2_callbacks_glsl.h"
#include "ffx_fsr2_common.h"
#include "ffx_fsr2_sample.h"
#include "ffx_fsr2_lock.h"
#ifndef FFX_FSR2_THREAD_GROUP_WIDTH
#define FFX_FSR2_THREAD_GROUP_WIDTH 8
#endif // #ifndef FFX_FSR2_THREAD_GROUP_WIDTH
#ifndef FFX_FSR2_THREAD_GROUP_HEIGHT
#define FFX_FSR2_THREAD_GROUP_HEIGHT 8
#endif // #ifndef FFX_FSR2_THREAD_GROUP_HEIGHT
#ifndef FFX_FSR2_THREAD_GROUP_DEPTH
#define FFX_FSR2_THREAD_GROUP_DEPTH 1
#endif // #ifndef FFX_FSR2_THREAD_GROUP_DEPTH
#ifndef FFX_FSR2_NUM_THREADS
#define FFX_FSR2_NUM_THREADS layout (local_size_x = FFX_FSR2_THREAD_GROUP_WIDTH, local_size_y = FFX_FSR2_THREAD_GROUP_HEIGHT, local_size_z = FFX_FSR2_THREAD_GROUP_DEPTH) in;
#endif // #ifndef FFX_FSR2_NUM_THREADS
FFX_FSR2_NUM_THREADS
void main()
{
uvec2 uDispatchThreadId = gl_WorkGroupID.xy * uvec2(FFX_FSR2_THREAD_GROUP_WIDTH, FFX_FSR2_THREAD_GROUP_HEIGHT) + gl_LocalInvocationID.xy;
ComputeLock(ivec2(uDispatchThreadId));
}


@@ -0,0 +1,106 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef FFX_FSR2_POSTPROCESS_LOCK_STATUS_H
#define FFX_FSR2_POSTPROCESS_LOCK_STATUS_H
FfxFloat32x4 WrapShadingChangeLuma(FfxInt32x2 iPxSample)
{
return FfxFloat32x4(LoadMipLuma(iPxSample, LumaMipLevelToUse()), 0, 0, 0);
}
#if FFX_HALF
FFX_MIN16_F4 WrapShadingChangeLuma(FFX_MIN16_I2 iPxSample)
{
return FFX_MIN16_F4(LoadMipLuma(iPxSample, LumaMipLevelToUse()), 0, 0, 0);
}
#endif
#if FFX_FSR2_OPTION_POSTPROCESSLOCKSTATUS_SAMPLERS_USE_DATA_HALF && FFX_HALF
DeclareCustomFetchBilinearSamplesMin16(FetchShadingChangeLumaSamples, WrapShadingChangeLuma)
#else
DeclareCustomFetchBicubicSamples(FetchShadingChangeLumaSamples, WrapShadingChangeLuma)
#endif
DeclareCustomTextureSample(ShadingChangeLumaSample, Lanczos2, FetchShadingChangeLumaSamples)
FfxFloat32 GetShadingChangeLuma(FfxInt32x2 iPxHrPos, FfxFloat32x2 fUvCoord)
{
FfxFloat32 fShadingChangeLuma = 0;
#if 0
fShadingChangeLuma = Exposure() * exp(ShadingChangeLumaSample(fUvCoord, LumaMipDimensions()).x);
#else
const FfxFloat32 fDiv = FfxFloat32(2 << LumaMipLevelToUse());
FfxInt32x2 iMipRenderSize = FfxInt32x2(RenderSize() / fDiv);
fUvCoord = ClampUv(fUvCoord, iMipRenderSize, LumaMipDimensions());
fShadingChangeLuma = Exposure() * exp(FfxFloat32(SampleMipLuma(fUvCoord, LumaMipLevelToUse())));
#endif
fShadingChangeLuma = ffxPow(fShadingChangeLuma, 1.0f / 6.0f);
return fShadingChangeLuma;
}
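// Note (assumption): exp() undoes the log-space luma stored in the scene
// luminance mip chain, and the 1/6 power matches the perceptual curve used by
// ComputeLockInputLuma, so shading-change luma and lock input luma are compared
// in the same domain.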
void UpdateLockStatus(AccumulationPassCommonParams params,
FFX_PARAMETER_INOUT FfxFloat32 fReactiveFactor, LockState state,
FFX_PARAMETER_INOUT FfxFloat32x2 fLockStatus,
FFX_PARAMETER_OUT FfxFloat32 fLockContributionThisFrame,
FFX_PARAMETER_OUT FfxFloat32 fLuminanceDiff) {
const FfxFloat32 fShadingChangeLuma = GetShadingChangeLuma(params.iPxHrPos, params.fHrUv);
//Initialize the temporal shading change factor on first use. TODO: initialize to -1 (or similar) in reproject to detect truly new samples?
fLockStatus[LOCK_TEMPORAL_LUMA] = (fLockStatus[LOCK_TEMPORAL_LUMA] == FfxFloat32(0.0f)) ? fShadingChangeLuma : fLockStatus[LOCK_TEMPORAL_LUMA];
FfxFloat32 fPreviousShadingChangeLuma = fLockStatus[LOCK_TEMPORAL_LUMA];
fLuminanceDiff = 1.0f - MinDividedByMax(fPreviousShadingChangeLuma, fShadingChangeLuma);
if (state.NewLock) {
fLockStatus[LOCK_TEMPORAL_LUMA] = fShadingChangeLuma;
fLockStatus[LOCK_LIFETIME_REMAINING] = (fLockStatus[LOCK_LIFETIME_REMAINING] != 0.0f) ? 2.0f : 1.0f;
}
else if(fLockStatus[LOCK_LIFETIME_REMAINING] <= 1.0f) {
fLockStatus[LOCK_TEMPORAL_LUMA] = ffxLerp(fLockStatus[LOCK_TEMPORAL_LUMA], FfxFloat32(fShadingChangeLuma), 0.5f);
}
else {
if (fLuminanceDiff > 0.1f) {
KillLock(fLockStatus);
}
}
fReactiveFactor = ffxMax(fReactiveFactor, ffxSaturate((fLuminanceDiff - 0.1f) * 10.0f));
fLockStatus[LOCK_LIFETIME_REMAINING] *= (1.0f - fReactiveFactor);
fLockStatus[LOCK_LIFETIME_REMAINING] *= ffxSaturate(1.0f - params.fAccumulationMask);
fLockStatus[LOCK_LIFETIME_REMAINING] *= FfxFloat32(params.fDepthClipFactor < 0.1f);
// Compute this frame lock contribution
const FfxFloat32 fLifetimeContribution = ffxSaturate(fLockStatus[LOCK_LIFETIME_REMAINING] - 1.0f);
const FfxFloat32 fShadingChangeContribution = ffxSaturate(MinDividedByMax(fLockStatus[LOCK_TEMPORAL_LUMA], fShadingChangeLuma));
fLockContributionThisFrame = ffxSaturate(ffxSaturate(fLifetimeContribution * 4.0f) * fShadingChangeContribution);
}
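// Illustrative trace (example values): a brand-new lock starts with
// LOCK_LIFETIME_REMAINING = 1 and a re-established one with 2, so
// fLifetimeContribution = ffxSaturate(lifetime - 1) is non-zero only for
// re-established locks. A reactive factor of 0.5 halves the remaining lifetime,
// and a luminance ratio change above 10% on an aged lock kills it outright.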
#endif //!defined( FFX_FSR2_POSTPROCESS_LOCK_STATUS_H )


@@ -0,0 +1,67 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#define GROUP_SIZE 8
#define FSR_RCAS_DENOISE 1
void WriteUpscaledOutput(FFX_MIN16_U2 iPxHrPos, FfxFloat32x3 fUpscaledColor)
{
StoreUpscaledOutput(FFX_MIN16_I2(iPxHrPos), fUpscaledColor);
}
#define FSR_RCAS_F
FfxFloat32x4 FsrRcasLoadF(FfxInt32x2 p)
{
FfxFloat32x4 fColor = LoadRCAS_Input(p);
fColor.rgb = PrepareRgb(fColor.rgb, Exposure(), PreExposure());
return fColor;
}
void FsrRcasInputF(inout FfxFloat32 r, inout FfxFloat32 g, inout FfxFloat32 b) {}
#include "ffx_fsr1.h"
void CurrFilter(FFX_MIN16_U2 pos)
{
FfxFloat32x3 c;
FsrRcasF(c.r, c.g, c.b, pos, RCASConfig());
c = UnprepareRgb(c, Exposure());
WriteUpscaledOutput(pos, c);
}
void RCAS(FfxUInt32x3 LocalThreadId, FfxUInt32x3 WorkGroupId, FfxUInt32x3 Dtid)
{
// Do remapping of local xy in workgroup for a more PS-like swizzle pattern.
FfxUInt32x2 gxy = ffxRemapForQuad(LocalThreadId.x) + FfxUInt32x2(WorkGroupId.x << 4u, WorkGroupId.y << 4u);
CurrFilter(FFX_MIN16_U2(gxy));
gxy.x += 8u;
CurrFilter(FFX_MIN16_U2(gxy));
gxy.y += 8u;
CurrFilter(FFX_MIN16_U2(gxy));
gxy.x -= 8u;
CurrFilter(FFX_MIN16_U2(gxy));
}
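// Dispatch note (illustrative): each 64-thread group covers a 16x16 pixel tile.
// ffxRemapForQuad() swizzles the linear thread index into an 8x8 quad layout,
// and the four CurrFilter() calls walk that 8x8 block across the tile in the
// order (+0,+0) -> (+8,+0) -> (+8,+8) -> (+0,+8).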


@@ -0,0 +1,80 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//#version 450
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_samplerless_texture_functions : require
// Needed for rw_upscaled_output declaration
#extension GL_EXT_shader_image_load_formatted : require
#define FSR2_BIND_SRV_INPUT_EXPOSURE 0
#define FSR2_BIND_SRV_RCAS_INPUT 1
#define FSR2_BIND_UAV_UPSCALED_OUTPUT 2
#define FSR2_BIND_CB_FSR2 3
#define FSR2_BIND_CB_RCAS 4
#include "ffx_fsr2_callbacks_glsl.h"
#include "ffx_fsr2_common.h"
//Move to prototype shader!
#if defined(FSR2_BIND_CB_RCAS)
layout (set = 1, binding = FSR2_BIND_CB_RCAS, std140) uniform cbRCAS_t
{
uvec4 rcasConfig;
} cbRCAS;
uvec4 RCASConfig()
{
return cbRCAS.rcasConfig;
}
#else
uvec4 RCASConfig()
{
return uvec4(0);
}
#endif
vec4 LoadRCAS_Input(FfxInt32x2 iPxPos)
{
return texelFetch(r_rcas_input, iPxPos, 0);
}
#include "ffx_fsr2_rcas.h"
#ifndef FFX_FSR2_THREAD_GROUP_WIDTH
#define FFX_FSR2_THREAD_GROUP_WIDTH 64
#endif // #ifndef FFX_FSR2_THREAD_GROUP_WIDTH
#ifndef FFX_FSR2_THREAD_GROUP_HEIGHT
#define FFX_FSR2_THREAD_GROUP_HEIGHT 1
#endif // #ifndef FFX_FSR2_THREAD_GROUP_HEIGHT
#ifndef FFX_FSR2_THREAD_GROUP_DEPTH
#define FFX_FSR2_THREAD_GROUP_DEPTH 1
#endif // #ifndef FFX_FSR2_THREAD_GROUP_DEPTH
#ifndef FFX_FSR2_NUM_THREADS
#define FFX_FSR2_NUM_THREADS layout (local_size_x = FFX_FSR2_THREAD_GROUP_WIDTH, local_size_y = FFX_FSR2_THREAD_GROUP_HEIGHT, local_size_z = FFX_FSR2_THREAD_GROUP_DEPTH) in;
#endif // #ifndef FFX_FSR2_NUM_THREADS
FFX_FSR2_NUM_THREADS
void main()
{
RCAS(gl_LocalInvocationID.xyz, gl_WorkGroupID.xyz, gl_GlobalInvocationID.xyz);
}


@@ -0,0 +1,145 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef FFX_FSR2_RECONSTRUCT_DILATED_VELOCITY_AND_PREVIOUS_DEPTH_H
#define FFX_FSR2_RECONSTRUCT_DILATED_VELOCITY_AND_PREVIOUS_DEPTH_H
void ReconstructPrevDepth(FfxInt32x2 iPxPos, FfxFloat32 fDepth, FfxFloat32x2 fMotionVector, FfxInt32x2 iPxDepthSize)
{
fMotionVector *= FfxFloat32(length(fMotionVector * DisplaySize()) > 0.1f);
FfxFloat32x2 fUv = (iPxPos + FfxFloat32(0.5)) / iPxDepthSize;
FfxFloat32x2 fReprojectedUv = fUv + fMotionVector;
BilinearSamplingData bilinearInfo = GetBilinearSamplingData(fReprojectedUv, RenderSize());
// Project the current depth into previous-frame locations.
// Push to all pixels that receive some contribution, since reprojection uses bilinear logic.
for (FfxInt32 iSampleIndex = 0; iSampleIndex < 4; iSampleIndex++) {
const FfxInt32x2 iOffset = bilinearInfo.iOffsets[iSampleIndex];
FfxFloat32 fWeight = bilinearInfo.fWeights[iSampleIndex];
if (fWeight > fReconstructedDepthBilinearWeightThreshold) {
FfxInt32x2 iStorePos = bilinearInfo.iBasePos + iOffset;
if (IsOnScreen(iStorePos, iPxDepthSize)) {
StoreReconstructedDepth(iStorePos, fDepth);
}
}
}
}
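// Worked example (illustrative): a reprojected uv landing exactly between four
// texels has bilinear weights of 0.25 each, all above the threshold, so one
// pixel may seed up to four reconstructed texels. The store callback is
// expected to resolve collisions so that the nearest depth wins, matching the
// farZ clear done in ClearResourcesForNextFrame().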
void FindNearestDepth(FFX_PARAMETER_IN FfxInt32x2 iPxPos, FFX_PARAMETER_IN FfxInt32x2 iPxSize, FFX_PARAMETER_OUT FfxFloat32 fNearestDepth, FFX_PARAMETER_OUT FfxInt32x2 fNearestDepthCoord)
{
const FfxInt32 iSampleCount = 9;
const FfxInt32x2 iSampleOffsets[iSampleCount] = {
FfxInt32x2(+0, +0),
FfxInt32x2(+1, +0),
FfxInt32x2(+0, +1),
FfxInt32x2(+0, -1),
FfxInt32x2(-1, +0),
FfxInt32x2(-1, +1),
FfxInt32x2(+1, +1),
FfxInt32x2(-1, -1),
FfxInt32x2(+1, -1),
};
// Pull the depth loads out of the search loop so the shader compiler can batch them
FfxFloat32 depth[9];
FfxInt32 iSampleIndex = 0;
FFX_UNROLL
for (iSampleIndex = 0; iSampleIndex < iSampleCount; ++iSampleIndex) {
FfxInt32x2 iPos = iPxPos + iSampleOffsets[iSampleIndex];
depth[iSampleIndex] = LoadInputDepth(iPos);
}
// find closest depth
fNearestDepthCoord = iPxPos;
fNearestDepth = depth[0];
FFX_UNROLL
for (iSampleIndex = 1; iSampleIndex < iSampleCount; ++iSampleIndex) {
FfxInt32x2 iPos = iPxPos + iSampleOffsets[iSampleIndex];
if (IsOnScreen(iPos, iPxSize)) {
FfxFloat32 fNdDepth = depth[iSampleIndex];
#if FFX_FSR2_OPTION_INVERTED_DEPTH
if (fNdDepth > fNearestDepth) {
#else
if (fNdDepth < fNearestDepth) {
#endif
fNearestDepthCoord = iPos;
fNearestDepth = fNdDepth;
}
}
}
}
FfxFloat32 ComputeLockInputLuma(FfxInt32x2 iPxLrPos)
{
//We assume linear input data. If the input is non-linear (sRGB, ...),
//it should be converted to linear first and back to sRGB on output.
FfxFloat32x3 fRgb = ffxMax(FfxFloat32x3(0, 0, 0), LoadInputColor(iPxLrPos));
// Use internal auto exposure for locking logic
fRgb /= PreExposure();
fRgb *= Exposure();
#if FFX_FSR2_OPTION_HDR_COLOR_INPUT
fRgb = Tonemap(fRgb);
#endif
//Compute the luma used to lock pixels. If this value is reused elsewhere, the ffxPow must be moved!
const FfxFloat32 fLockInputLuma = ffxPow(RGBToPerceivedLuma(fRgb), FfxFloat32(1.0 / 6.0));
return fLockInputLuma;
}
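// Example (illustrative): a mid-grey perceived luma of 0.5 maps to
// ffxPow(0.5, 1.0 / 6.0) ~= 0.89, lifting dark values so lock decisions respond
// to relative rather than absolute brightness.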
void ReconstructAndDilate(FfxInt32x2 iPxLrPos)
{
FfxFloat32 fDilatedDepth;
FfxInt32x2 iNearestDepthCoord;
FindNearestDepth(iPxLrPos, RenderSize(), fDilatedDepth, iNearestDepthCoord);
#if FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS
FfxInt32x2 iSamplePos = iPxLrPos;
FfxInt32x2 iMotionVectorPos = iNearestDepthCoord;
#else
FfxInt32x2 iSamplePos = ComputeHrPosFromLrPos(iPxLrPos);
FfxInt32x2 iMotionVectorPos = ComputeHrPosFromLrPos(iNearestDepthCoord);
#endif
FfxFloat32x2 fDilatedMotionVector = LoadInputMotionVector(iMotionVectorPos);
StoreDilatedDepth(iPxLrPos, fDilatedDepth);
StoreDilatedMotionVector(iPxLrPos, fDilatedMotionVector);
ReconstructPrevDepth(iPxLrPos, fDilatedDepth, fDilatedMotionVector, RenderSize());
FfxFloat32 fLockInputLuma = ComputeLockInputLuma(iPxLrPos);
StoreLockInputLuma(iPxLrPos, fLockInputLuma);
}
#endif //!defined( FFX_FSR2_RECONSTRUCT_DILATED_VELOCITY_AND_PREVIOUS_DEPTH_H )


@@ -0,0 +1,65 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//#version 450
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_samplerless_texture_functions : require
#define FSR2_BIND_SRV_INPUT_MOTION_VECTORS 0
#define FSR2_BIND_SRV_INPUT_DEPTH 1
#define FSR2_BIND_SRV_INPUT_COLOR 2
#define FSR2_BIND_SRV_INPUT_EXPOSURE 3
#define FSR2_BIND_SRV_LUMA_HISTORY 4
#define FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH 5
#define FSR2_BIND_UAV_DILATED_MOTION_VECTORS 6
#define FSR2_BIND_UAV_DILATED_DEPTH 7
#define FSR2_BIND_UAV_PREPARED_INPUT_COLOR 8
#define FSR2_BIND_UAV_LUMA_HISTORY 9
#define FSR2_BIND_UAV_LUMA_INSTABILITY 10
#define FSR2_BIND_UAV_LOCK_INPUT_LUMA 11
#define FSR2_BIND_CB_FSR2 12
#include "ffx_fsr2_callbacks_glsl.h"
#include "ffx_fsr2_common.h"
#include "ffx_fsr2_sample.h"
#include "ffx_fsr2_reconstruct_dilated_velocity_and_previous_depth.h"
#ifndef FFX_FSR2_THREAD_GROUP_WIDTH
#define FFX_FSR2_THREAD_GROUP_WIDTH 8
#endif // #ifndef FFX_FSR2_THREAD_GROUP_WIDTH
#ifndef FFX_FSR2_THREAD_GROUP_HEIGHT
#define FFX_FSR2_THREAD_GROUP_HEIGHT 8
#endif // #ifndef FFX_FSR2_THREAD_GROUP_HEIGHT
#ifndef FFX_FSR2_THREAD_GROUP_DEPTH
#define FFX_FSR2_THREAD_GROUP_DEPTH 1
#endif // #ifndef FFX_FSR2_THREAD_GROUP_DEPTH
#ifndef FFX_FSR2_NUM_THREADS
#define FFX_FSR2_NUM_THREADS layout (local_size_x = FFX_FSR2_THREAD_GROUP_WIDTH, local_size_y = FFX_FSR2_THREAD_GROUP_HEIGHT, local_size_z = FFX_FSR2_THREAD_GROUP_DEPTH) in;
#endif // #ifndef FFX_FSR2_NUM_THREADS
FFX_FSR2_NUM_THREADS
void main()
{
ReconstructAndDilate(FFX_MIN16_I2(gl_GlobalInvocationID.xy));
}


@@ -0,0 +1,136 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef FFX_FSR2_REPROJECT_H
#define FFX_FSR2_REPROJECT_H
#ifndef FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE
#define FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE 0 // Reference
#endif
FfxFloat32x4 WrapHistory(FfxInt32x2 iPxSample)
{
return LoadHistory(iPxSample);
}
#if FFX_HALF
FFX_MIN16_F4 WrapHistory(FFX_MIN16_I2 iPxSample)
{
return FFX_MIN16_F4(LoadHistory(iPxSample));
}
#endif
#if FFX_FSR2_OPTION_REPROJECT_SAMPLERS_USE_DATA_HALF && FFX_HALF
DeclareCustomFetchBicubicSamplesMin16(FetchHistorySamples, WrapHistory)
DeclareCustomTextureSampleMin16(HistorySample, FFX_FSR2_GET_LANCZOS_SAMPLER1D(FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE), FetchHistorySamples)
#else
DeclareCustomFetchBicubicSamples(FetchHistorySamples, WrapHistory)
DeclareCustomTextureSample(HistorySample, FFX_FSR2_GET_LANCZOS_SAMPLER1D(FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE), FetchHistorySamples)
#endif
FfxFloat32x4 WrapLockStatus(FfxInt32x2 iPxSample)
{
FfxFloat32x4 fSample = FfxFloat32x4(LoadLockStatus(iPxSample), 0.0f, 0.0f);
return fSample;
}
#if FFX_HALF
FFX_MIN16_F4 WrapLockStatus(FFX_MIN16_I2 iPxSample)
{
FFX_MIN16_F4 fSample = FFX_MIN16_F4(LoadLockStatus(iPxSample), 0.0, 0.0);
return fSample;
}
#endif
#if 1
#if FFX_FSR2_OPTION_REPROJECT_SAMPLERS_USE_DATA_HALF && FFX_HALF
DeclareCustomFetchBilinearSamplesMin16(FetchLockStatusSamples, WrapLockStatus)
DeclareCustomTextureSampleMin16(LockStatusSample, Bilinear, FetchLockStatusSamples)
#else
DeclareCustomFetchBilinearSamples(FetchLockStatusSamples, WrapLockStatus)
DeclareCustomTextureSample(LockStatusSample, Bilinear, FetchLockStatusSamples)
#endif
#else
#if FFX_FSR2_OPTION_REPROJECT_SAMPLERS_USE_DATA_HALF && FFX_HALF
DeclareCustomFetchBicubicSamplesMin16(FetchLockStatusSamples, WrapLockStatus)
DeclareCustomTextureSampleMin16(LockStatusSample, FFX_FSR2_GET_LANCZOS_SAMPLER1D(FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE), FetchLockStatusSamples)
#else
DeclareCustomFetchBicubicSamples(FetchLockStatusSamples, WrapLockStatus)
DeclareCustomTextureSample(LockStatusSample, FFX_FSR2_GET_LANCZOS_SAMPLER1D(FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE), FetchLockStatusSamples)
#endif
#endif
FfxFloat32x2 GetMotionVector(FfxInt32x2 iPxHrPos, FfxFloat32x2 fHrUv)
{
#if FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS
FfxFloat32x2 fDilatedMotionVector = LoadDilatedMotionVector(FFX_MIN16_I2(fHrUv * RenderSize()));
#else
FfxFloat32x2 fDilatedMotionVector = LoadInputMotionVector(iPxHrPos);
#endif
return fDilatedMotionVector;
}
FfxBoolean IsUvInside(FfxFloat32x2 fUv)
{
return (fUv.x >= 0.0f && fUv.x <= 1.0f) && (fUv.y >= 0.0f && fUv.y <= 1.0f);
}
void ComputeReprojectedUVs(const AccumulationPassCommonParams params, FFX_PARAMETER_OUT FfxFloat32x2 fReprojectedHrUv, FFX_PARAMETER_OUT FfxBoolean bIsExistingSample)
{
fReprojectedHrUv = params.fHrUv + params.fMotionVector;
bIsExistingSample = IsUvInside(fReprojectedHrUv);
}
void ReprojectHistoryColor(const AccumulationPassCommonParams params, FFX_PARAMETER_OUT FfxFloat32x3 fHistoryColor, FFX_PARAMETER_OUT FfxFloat32 fTemporalReactiveFactor, FFX_PARAMETER_OUT FfxBoolean bInMotionLastFrame)
{
FfxFloat32x4 fHistory = HistorySample(params.fReprojectedHrUv, DisplaySize());
fHistoryColor = PrepareRgb(fHistory.rgb, Exposure(), PreviousFramePreExposure());
fHistoryColor = RGBToYCoCg(fHistoryColor);
//Compute temporal reactivity info
fTemporalReactiveFactor = ffxSaturate(abs(fHistory.w));
bInMotionLastFrame = (fHistory.w < 0.0f);
}
LockState ReprojectHistoryLockStatus(const AccumulationPassCommonParams params, FFX_PARAMETER_OUT FfxFloat32x2 fReprojectedLockStatus)
{
LockState state = { FFX_FALSE, FFX_FALSE };
const FfxFloat32 fNewLockIntensity = LoadRwNewLocks(params.iPxHrPos);
state.NewLock = fNewLockIntensity > (127.0f / 255.0f);
FfxFloat32 fInPlaceLockLifetime = state.NewLock ? fNewLockIntensity : 0;
fReprojectedLockStatus = SampleLockStatus(params.fReprojectedHrUv);
if (fReprojectedLockStatus[LOCK_LIFETIME_REMAINING] != FfxFloat32(0.0f)) {
state.WasLockedPrevFrame = true;
}
return state;
}
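// Encoding note (interpretation): the lock pass stores new locks as 1.0, so any
// intensity above 127/255 marks a lock created this frame, while a non-zero
// reprojected LOCK_LIFETIME_REMAINING means the pixel was already locked in the
// previous frame.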
#endif //!defined( FFX_FSR2_REPROJECT_H )


@@ -0,0 +1,105 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef FFX_FSR2_RESOURCES_H
#define FFX_FSR2_RESOURCES_H
#if defined(FFX_CPU) || defined(FFX_GPU)
#define FFX_FSR2_RESOURCE_IDENTIFIER_NULL 0
#define FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_OPAQUE_ONLY 1
#define FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_COLOR 2
#define FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_MOTION_VECTORS 3
#define FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_DEPTH 4
#define FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_EXPOSURE 5
#define FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_REACTIVE_MASK 6
#define FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_TRANSPARENCY_AND_COMPOSITION_MASK 7
#define FFX_FSR2_RESOURCE_IDENTIFIER_RECONSTRUCTED_PREVIOUS_NEAREST_DEPTH 8
#define FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS 9
#define FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_DEPTH 10
#define FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR 11
#define FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS 12
#define FFX_FSR2_RESOURCE_IDENTIFIER_NEW_LOCKS 13
#define FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR 14
#define FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY 15
#define FFX_FSR2_RESOURCE_IDENTIFIER_DEBUG_OUTPUT 16
#define FFX_FSR2_RESOURCE_IDENTIFIER_LANCZOS_LUT 17
#define FFX_FSR2_RESOURCE_IDENTIFIER_SPD_ATOMIC_COUNT 18
#define FFX_FSR2_RESOURCE_IDENTIFIER_UPSCALED_OUTPUT 19
#define FFX_FSR2_RESOURCE_IDENTIFIER_RCAS_INPUT 20
#define FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS_1 21
#define FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS_2 22
#define FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR_1 23
#define FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR_2 24
#define FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DEFAULT_REACTIVITY 25
#define FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DEFAULT_TRANSPARENCY_AND_COMPOSITION 26
#define FFX_FSR2_RESOURCE_IDENTITIER_UPSAMPLE_MAXIMUM_BIAS_LUT 27
#define FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_REACTIVE_MASKS 28
#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE 29 // same as FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_0
#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_0 29
#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_1 30
#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_2 31
#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_3 32
#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_4 33
#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_5 34
#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_6 35
#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_7 36
#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_8 37
#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_9 38
#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_10 39
#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_11 40
#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_12 41
#define FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DEFAULT_EXPOSURE 42
#define FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE 43
#define FFX_FSR2_RESOURCE_IDENTIFIER_AUTOREACTIVE 44
#define FFX_FSR2_RESOURCE_IDENTIFIER_AUTOCOMPOSITION 45
#define FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR 46
#define FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR 47
#define FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR_1 48
#define FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR_1 49
#define FFX_FSR2_RESOURCE_IDENTIFIER_PREV_PRE_ALPHA_COLOR_2 50
#define FFX_FSR2_RESOURCE_IDENTIFIER_PREV_POST_ALPHA_COLOR_2 51
#define FFX_FSR2_RESOURCE_IDENTIFIER_PREVIOUS_DILATED_MOTION_VECTORS 52
#define FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DILATED_MOTION_VECTORS_1 53
#define FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DILATED_MOTION_VECTORS_2 54
#define FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY_1 55
#define FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY_2 56
#define FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_INPUT_LUMA 57
// Shading change detection mip level setting, value must be in the range [FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_0, FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_12]
#define FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_SHADING_CHANGE FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_4
#define FFX_FSR2_SHADING_CHANGE_MIP_LEVEL (FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE_MIPMAP_SHADING_CHANGE - FFX_FSR2_RESOURCE_IDENTIFIER_SCENE_LUMINANCE)
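// Example: with the default setting above this evaluates to 33 - 29 = 4, i.e.
// shading change detection reads mip 4 of the scene luminance pyramid
// (1/16th of the render resolution per axis).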
#define FFX_FSR2_RESOURCE_IDENTIFIER_COUNT 58
#define FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_FSR2 0
#define FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_SPD 1
#define FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_RCAS 2
#define FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_GENREACTIVE 3
#define FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_TONEMAP 1
#define FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_INVERSETONEMAP 2
#define FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_THRESHOLD 4
#define FFX_FSR2_AUTOREACTIVEFLAGS_USE_COMPONENTS_MAX 8
#endif // #if defined(FFX_CPU) || defined(FFX_GPU)
#endif //!defined( FFX_FSR2_RESOURCES_H )

View File

@@ -0,0 +1,605 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef FFX_FSR2_SAMPLE_H
#define FFX_FSR2_SAMPLE_H
// suppress warnings
#ifdef FFX_HLSL
#pragma warning(disable: 4008) // potentially divide by zero
#endif //FFX_HLSL
struct FetchedBilinearSamples {
FfxFloat32x4 fColor00;
FfxFloat32x4 fColor10;
FfxFloat32x4 fColor01;
FfxFloat32x4 fColor11;
};
struct FetchedBicubicSamples {
FfxFloat32x4 fColor00;
FfxFloat32x4 fColor10;
FfxFloat32x4 fColor20;
FfxFloat32x4 fColor30;
FfxFloat32x4 fColor01;
FfxFloat32x4 fColor11;
FfxFloat32x4 fColor21;
FfxFloat32x4 fColor31;
FfxFloat32x4 fColor02;
FfxFloat32x4 fColor12;
FfxFloat32x4 fColor22;
FfxFloat32x4 fColor32;
FfxFloat32x4 fColor03;
FfxFloat32x4 fColor13;
FfxFloat32x4 fColor23;
FfxFloat32x4 fColor33;
};
#if FFX_HALF
struct FetchedBilinearSamplesMin16 {
FFX_MIN16_F4 fColor00;
FFX_MIN16_F4 fColor10;
FFX_MIN16_F4 fColor01;
FFX_MIN16_F4 fColor11;
};
struct FetchedBicubicSamplesMin16 {
FFX_MIN16_F4 fColor00;
FFX_MIN16_F4 fColor10;
FFX_MIN16_F4 fColor20;
FFX_MIN16_F4 fColor30;
FFX_MIN16_F4 fColor01;
FFX_MIN16_F4 fColor11;
FFX_MIN16_F4 fColor21;
FFX_MIN16_F4 fColor31;
FFX_MIN16_F4 fColor02;
FFX_MIN16_F4 fColor12;
FFX_MIN16_F4 fColor22;
FFX_MIN16_F4 fColor32;
FFX_MIN16_F4 fColor03;
FFX_MIN16_F4 fColor13;
FFX_MIN16_F4 fColor23;
FFX_MIN16_F4 fColor33;
};
#else //FFX_HALF
#define FetchedBicubicSamplesMin16 FetchedBicubicSamples
#define FetchedBilinearSamplesMin16 FetchedBilinearSamples
#endif //FFX_HALF
FfxFloat32x4 Linear(FfxFloat32x4 A, FfxFloat32x4 B, FfxFloat32 t)
{
return A + (B - A) * t;
}
FfxFloat32x4 Bilinear(FetchedBilinearSamples BilinearSamples, FfxFloat32x2 fPxFrac)
{
FfxFloat32x4 fColorX0 = Linear(BilinearSamples.fColor00, BilinearSamples.fColor10, fPxFrac.x);
FfxFloat32x4 fColorX1 = Linear(BilinearSamples.fColor01, BilinearSamples.fColor11, fPxFrac.x);
FfxFloat32x4 fColorXY = Linear(fColorX0, fColorX1, fPxFrac.y);
return fColorXY;
}
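// Example: fPxFrac = (0.5, 0.5) weights all four samples equally at 0.25;
// fPxFrac = (0, 0) returns fColor00 exactly.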
#if FFX_HALF
FFX_MIN16_F4 Linear(FFX_MIN16_F4 A, FFX_MIN16_F4 B, FFX_MIN16_F t)
{
return A + (B - A) * t;
}
FFX_MIN16_F4 Bilinear(FetchedBilinearSamplesMin16 BilinearSamples, FFX_MIN16_F2 fPxFrac)
{
FFX_MIN16_F4 fColorX0 = Linear(BilinearSamples.fColor00, BilinearSamples.fColor10, fPxFrac.x);
FFX_MIN16_F4 fColorX1 = Linear(BilinearSamples.fColor01, BilinearSamples.fColor11, fPxFrac.x);
FFX_MIN16_F4 fColorXY = Linear(fColorX0, fColorX1, fPxFrac.y);
return fColorXY;
}
#endif
FfxFloat32 Lanczos2NoClamp(FfxFloat32 x)
{
const FfxFloat32 PI = 3.141592653589793f; // TODO: share SDK constants
return abs(x) < FSR2_EPSILON ? 1.f : (sin(PI * x) / (PI * x)) * (sin(0.5f * PI * x) / (0.5f * PI * x));
}
FfxFloat32 Lanczos2(FfxFloat32 x)
{
x = ffxMin(abs(x), 2.0f);
return Lanczos2NoClamp(x);
}
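// Worked example (illustrative): at phase t = 0.5 the four taps of the 4-tap
// combiners below sit at offsets {-1.5, -0.5, +0.5, +1.5}, giving raw kernel
// values of roughly {-0.064, 0.573, 0.573, -0.064}; normalized by their sum the
// weights come out close to {-1/16, 9/16, 9/16, -1/16}. The negative lobes are
// what sharpen edges, and also why the bicubic paths clamp ("dering") against
// the inner 2x2 samples.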
#if FFX_HALF
#if 0
FFX_MIN16_F Lanczos2NoClamp(FFX_MIN16_F x)
{
const FFX_MIN16_F PI = FFX_MIN16_F(3.141592653589793f); // TODO: share SDK constants
return abs(x) < FFX_MIN16_F(FSR2_EPSILON) ? FFX_MIN16_F(1.f) : (sin(PI * x) / (PI * x)) * (sin(FFX_MIN16_F(0.5f) * PI * x) / (FFX_MIN16_F(0.5f) * PI * x));
}
#endif
FFX_MIN16_F Lanczos2(FFX_MIN16_F x)
{
x = ffxMin(abs(x), FFX_MIN16_F(2.0f));
return FFX_MIN16_F(Lanczos2NoClamp(x));
}
#endif //FFX_HALF
// FSR1 Lanczos approximation. Input is x*x and must be <= 4.
FfxFloat32 Lanczos2ApproxSqNoClamp(FfxFloat32 x2)
{
FfxFloat32 a = (2.0f / 5.0f) * x2 - 1;
FfxFloat32 b = (1.0f / 4.0f) * x2 - 1;
return ((25.0f / 16.0f) * a * a - (25.0f / 16.0f - 1)) * (b * b);
}
#if FFX_HALF
FFX_MIN16_F Lanczos2ApproxSqNoClamp(FFX_MIN16_F x2)
{
FFX_MIN16_F a = FFX_MIN16_F(2.0f / 5.0f) * x2 - FFX_MIN16_F(1);
FFX_MIN16_F b = FFX_MIN16_F(1.0f / 4.0f) * x2 - FFX_MIN16_F(1);
return (FFX_MIN16_F(25.0f / 16.0f) * a * a - FFX_MIN16_F(25.0f / 16.0f - 1)) * (b * b);
}
#endif //FFX_HALF
FfxFloat32 Lanczos2ApproxSq(FfxFloat32 x2)
{
x2 = ffxMin(x2, 4.0f);
return Lanczos2ApproxSqNoClamp(x2);
}
#if FFX_HALF
FFX_MIN16_F Lanczos2ApproxSq(FFX_MIN16_F x2)
{
x2 = ffxMin(x2, FFX_MIN16_F(4.0f));
return Lanczos2ApproxSqNoClamp(x2);
}
#endif //FFX_HALF
FfxFloat32 Lanczos2ApproxNoClamp(FfxFloat32 x)
{
return Lanczos2ApproxSqNoClamp(x * x);
}
#if FFX_HALF
FFX_MIN16_F Lanczos2ApproxNoClamp(FFX_MIN16_F x)
{
return Lanczos2ApproxSqNoClamp(x * x);
}
#endif //FFX_HALF
FfxFloat32 Lanczos2Approx(FfxFloat32 x)
{
return Lanczos2ApproxSq(x * x);
}
#if FFX_HALF
FFX_MIN16_F Lanczos2Approx(FFX_MIN16_F x)
{
return Lanczos2ApproxSq(x * x);
}
#endif //FFX_HALF
FfxFloat32 Lanczos2_UseLUT(FfxFloat32 x)
{
return SampleLanczos2Weight(abs(x));
}
#if FFX_HALF
FFX_MIN16_F Lanczos2_UseLUT(FFX_MIN16_F x)
{
return FFX_MIN16_F(SampleLanczos2Weight(abs(x)));
}
#endif //FFX_HALF
FfxFloat32x4 Lanczos2_UseLUT(FfxFloat32x4 fColor0, FfxFloat32x4 fColor1, FfxFloat32x4 fColor2, FfxFloat32x4 fColor3, FfxFloat32 t)
{
FfxFloat32 fWeight0 = Lanczos2_UseLUT(-1.f - t);
FfxFloat32 fWeight1 = Lanczos2_UseLUT(-0.f - t);
FfxFloat32 fWeight2 = Lanczos2_UseLUT(+1.f - t);
FfxFloat32 fWeight3 = Lanczos2_UseLUT(+2.f - t);
return (fWeight0 * fColor0 + fWeight1 * fColor1 + fWeight2 * fColor2 + fWeight3 * fColor3) / (fWeight0 + fWeight1 + fWeight2 + fWeight3);
}
#if FFX_HALF
FFX_MIN16_F4 Lanczos2_UseLUT(FFX_MIN16_F4 fColor0, FFX_MIN16_F4 fColor1, FFX_MIN16_F4 fColor2, FFX_MIN16_F4 fColor3, FFX_MIN16_F t)
{
FFX_MIN16_F fWeight0 = Lanczos2_UseLUT(FFX_MIN16_F(-1.f) - t);
FFX_MIN16_F fWeight1 = Lanczos2_UseLUT(FFX_MIN16_F(-0.f) - t);
FFX_MIN16_F fWeight2 = Lanczos2_UseLUT(FFX_MIN16_F(+1.f) - t);
FFX_MIN16_F fWeight3 = Lanczos2_UseLUT(FFX_MIN16_F(+2.f) - t);
return (fWeight0 * fColor0 + fWeight1 * fColor1 + fWeight2 * fColor2 + fWeight3 * fColor3) / (fWeight0 + fWeight1 + fWeight2 + fWeight3);
}
#endif
FfxFloat32x4 Lanczos2(FfxFloat32x4 fColor0, FfxFloat32x4 fColor1, FfxFloat32x4 fColor2, FfxFloat32x4 fColor3, FfxFloat32 t)
{
FfxFloat32 fWeight0 = Lanczos2(-1.f - t);
FfxFloat32 fWeight1 = Lanczos2(-0.f - t);
FfxFloat32 fWeight2 = Lanczos2(+1.f - t);
FfxFloat32 fWeight3 = Lanczos2(+2.f - t);
return (fWeight0 * fColor0 + fWeight1 * fColor1 + fWeight2 * fColor2 + fWeight3 * fColor3) / (fWeight0 + fWeight1 + fWeight2 + fWeight3);
}
FfxFloat32x4 Lanczos2(FetchedBicubicSamples Samples, FfxFloat32x2 fPxFrac)
{
FfxFloat32x4 fColorX0 = Lanczos2(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x);
FfxFloat32x4 fColorX1 = Lanczos2(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x);
FfxFloat32x4 fColorX2 = Lanczos2(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x);
FfxFloat32x4 fColorX3 = Lanczos2(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x);
FfxFloat32x4 fColorXY = Lanczos2(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y);
// Deringing
// TODO: only use 4 by checking jitter
const FfxInt32 iDeringingSampleCount = 4;
const FfxFloat32x4 fDeringingSamples[4] = {
Samples.fColor11,
Samples.fColor21,
Samples.fColor12,
Samples.fColor22,
};
FfxFloat32x4 fDeringingMin = fDeringingSamples[0];
FfxFloat32x4 fDeringingMax = fDeringingSamples[0];
FFX_UNROLL
for (FfxInt32 iSampleIndex = 1; iSampleIndex < iDeringingSampleCount; ++iSampleIndex) {
fDeringingMin = ffxMin(fDeringingMin, fDeringingSamples[iSampleIndex]);
fDeringingMax = ffxMax(fDeringingMax, fDeringingSamples[iSampleIndex]);
}
fColorXY = clamp(fColorXY, fDeringingMin, fDeringingMax);
return fColorXY;
}
#if FFX_HALF
FFX_MIN16_F4 Lanczos2(FFX_MIN16_F4 fColor0, FFX_MIN16_F4 fColor1, FFX_MIN16_F4 fColor2, FFX_MIN16_F4 fColor3, FFX_MIN16_F t)
{
FFX_MIN16_F fWeight0 = Lanczos2(FFX_MIN16_F(-1.f) - t);
FFX_MIN16_F fWeight1 = Lanczos2(FFX_MIN16_F(-0.f) - t);
FFX_MIN16_F fWeight2 = Lanczos2(FFX_MIN16_F(+1.f) - t);
FFX_MIN16_F fWeight3 = Lanczos2(FFX_MIN16_F(+2.f) - t);
return (fWeight0 * fColor0 + fWeight1 * fColor1 + fWeight2 * fColor2 + fWeight3 * fColor3) / (fWeight0 + fWeight1 + fWeight2 + fWeight3);
}
FFX_MIN16_F4 Lanczos2(FetchedBicubicSamplesMin16 Samples, FFX_MIN16_F2 fPxFrac)
{
FFX_MIN16_F4 fColorX0 = Lanczos2(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x);
FFX_MIN16_F4 fColorX1 = Lanczos2(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x);
FFX_MIN16_F4 fColorX2 = Lanczos2(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x);
FFX_MIN16_F4 fColorX3 = Lanczos2(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x);
FFX_MIN16_F4 fColorXY = Lanczos2(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y);
// Deringing
// TODO: only use 4 by checking jitter
const FfxInt32 iDeringingSampleCount = 4;
const FFX_MIN16_F4 fDeringingSamples[4] = {
Samples.fColor11,
Samples.fColor21,
Samples.fColor12,
Samples.fColor22,
};
FFX_MIN16_F4 fDeringingMin = fDeringingSamples[0];
FFX_MIN16_F4 fDeringingMax = fDeringingSamples[0];
FFX_UNROLL
for (FfxInt32 iSampleIndex = 1; iSampleIndex < iDeringingSampleCount; ++iSampleIndex)
{
fDeringingMin = ffxMin(fDeringingMin, fDeringingSamples[iSampleIndex]);
fDeringingMax = ffxMax(fDeringingMax, fDeringingSamples[iSampleIndex]);
}
fColorXY = clamp(fColorXY, fDeringingMin, fDeringingMax);
return fColorXY;
}
#endif //FFX_HALF
FfxFloat32x4 Lanczos2LUT(FetchedBicubicSamples Samples, FfxFloat32x2 fPxFrac)
{
FfxFloat32x4 fColorX0 = Lanczos2_UseLUT(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x);
FfxFloat32x4 fColorX1 = Lanczos2_UseLUT(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x);
FfxFloat32x4 fColorX2 = Lanczos2_UseLUT(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x);
FfxFloat32x4 fColorX3 = Lanczos2_UseLUT(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x);
FfxFloat32x4 fColorXY = Lanczos2_UseLUT(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y);
// Deringing
// TODO: only use 4 by checking jitter
const FfxInt32 iDeringingSampleCount = 4;
const FfxFloat32x4 fDeringingSamples[4] = {
Samples.fColor11,
Samples.fColor21,
Samples.fColor12,
Samples.fColor22,
};
FfxFloat32x4 fDeringingMin = fDeringingSamples[0];
FfxFloat32x4 fDeringingMax = fDeringingSamples[0];
FFX_UNROLL
for (FfxInt32 iSampleIndex = 1; iSampleIndex < iDeringingSampleCount; ++iSampleIndex) {
fDeringingMin = ffxMin(fDeringingMin, fDeringingSamples[iSampleIndex]);
fDeringingMax = ffxMax(fDeringingMax, fDeringingSamples[iSampleIndex]);
}
fColorXY = clamp(fColorXY, fDeringingMin, fDeringingMax);
return fColorXY;
}
#if FFX_HALF
FFX_MIN16_F4 Lanczos2LUT(FetchedBicubicSamplesMin16 Samples, FFX_MIN16_F2 fPxFrac)
{
FFX_MIN16_F4 fColorX0 = Lanczos2_UseLUT(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x);
FFX_MIN16_F4 fColorX1 = Lanczos2_UseLUT(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x);
FFX_MIN16_F4 fColorX2 = Lanczos2_UseLUT(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x);
FFX_MIN16_F4 fColorX3 = Lanczos2_UseLUT(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x);
FFX_MIN16_F4 fColorXY = Lanczos2_UseLUT(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y);
// Deringing
// TODO: only use 4 by checking jitter
const FfxInt32 iDeringingSampleCount = 4;
const FFX_MIN16_F4 fDeringingSamples[4] = {
Samples.fColor11,
Samples.fColor21,
Samples.fColor12,
Samples.fColor22,
};
FFX_MIN16_F4 fDeringingMin = fDeringingSamples[0];
FFX_MIN16_F4 fDeringingMax = fDeringingSamples[0];
FFX_UNROLL
for (FfxInt32 iSampleIndex = 1; iSampleIndex < iDeringingSampleCount; ++iSampleIndex)
{
fDeringingMin = ffxMin(fDeringingMin, fDeringingSamples[iSampleIndex]);
fDeringingMax = ffxMax(fDeringingMax, fDeringingSamples[iSampleIndex]);
}
fColorXY = clamp(fColorXY, fDeringingMin, fDeringingMax);
return fColorXY;
}
#endif //FFX_HALF
FfxFloat32x4 Lanczos2Approx(FfxFloat32x4 fColor0, FfxFloat32x4 fColor1, FfxFloat32x4 fColor2, FfxFloat32x4 fColor3, FfxFloat32 t)
{
FfxFloat32 fWeight0 = Lanczos2ApproxNoClamp(-1.f - t);
FfxFloat32 fWeight1 = Lanczos2ApproxNoClamp(-0.f - t);
FfxFloat32 fWeight2 = Lanczos2ApproxNoClamp(+1.f - t);
FfxFloat32 fWeight3 = Lanczos2ApproxNoClamp(+2.f - t);
return (fWeight0 * fColor0 + fWeight1 * fColor1 + fWeight2 * fColor2 + fWeight3 * fColor3) / (fWeight0 + fWeight1 + fWeight2 + fWeight3);
}
#if FFX_HALF
FFX_MIN16_F4 Lanczos2Approx(FFX_MIN16_F4 fColor0, FFX_MIN16_F4 fColor1, FFX_MIN16_F4 fColor2, FFX_MIN16_F4 fColor3, FFX_MIN16_F t)
{
FFX_MIN16_F fWeight0 = Lanczos2ApproxNoClamp(FFX_MIN16_F(-1.f) - t);
FFX_MIN16_F fWeight1 = Lanczos2ApproxNoClamp(FFX_MIN16_F(-0.f) - t);
FFX_MIN16_F fWeight2 = Lanczos2ApproxNoClamp(FFX_MIN16_F(+1.f) - t);
FFX_MIN16_F fWeight3 = Lanczos2ApproxNoClamp(FFX_MIN16_F(+2.f) - t);
return (fWeight0 * fColor0 + fWeight1 * fColor1 + fWeight2 * fColor2 + fWeight3 * fColor3) / (fWeight0 + fWeight1 + fWeight2 + fWeight3);
}
#endif //FFX_HALF
FfxFloat32x4 Lanczos2Approx(FetchedBicubicSamples Samples, FfxFloat32x2 fPxFrac)
{
FfxFloat32x4 fColorX0 = Lanczos2Approx(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x);
FfxFloat32x4 fColorX1 = Lanczos2Approx(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x);
FfxFloat32x4 fColorX2 = Lanczos2Approx(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x);
FfxFloat32x4 fColorX3 = Lanczos2Approx(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x);
FfxFloat32x4 fColorXY = Lanczos2Approx(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y);
// Deringing
// TODO: only use 4 by checking jitter
const FfxInt32 iDeringingSampleCount = 4;
const FfxFloat32x4 fDeringingSamples[4] = {
Samples.fColor11,
Samples.fColor21,
Samples.fColor12,
Samples.fColor22,
};
FfxFloat32x4 fDeringingMin = fDeringingSamples[0];
FfxFloat32x4 fDeringingMax = fDeringingSamples[0];
FFX_UNROLL
for (FfxInt32 iSampleIndex = 1; iSampleIndex < iDeringingSampleCount; ++iSampleIndex)
{
fDeringingMin = ffxMin(fDeringingMin, fDeringingSamples[iSampleIndex]);
fDeringingMax = ffxMax(fDeringingMax, fDeringingSamples[iSampleIndex]);
}
fColorXY = clamp(fColorXY, fDeringingMin, fDeringingMax);
return fColorXY;
}
#if FFX_HALF
FFX_MIN16_F4 Lanczos2Approx(FetchedBicubicSamplesMin16 Samples, FFX_MIN16_F2 fPxFrac)
{
FFX_MIN16_F4 fColorX0 = Lanczos2Approx(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x);
FFX_MIN16_F4 fColorX1 = Lanczos2Approx(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x);
FFX_MIN16_F4 fColorX2 = Lanczos2Approx(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x);
FFX_MIN16_F4 fColorX3 = Lanczos2Approx(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x);
FFX_MIN16_F4 fColorXY = Lanczos2Approx(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y);
// Deringing
// TODO: only use 4 by checking jitter
const FfxInt32 iDeringingSampleCount = 4;
const FFX_MIN16_F4 fDeringingSamples[4] = {
Samples.fColor11,
Samples.fColor21,
Samples.fColor12,
Samples.fColor22,
};
FFX_MIN16_F4 fDeringingMin = fDeringingSamples[0];
FFX_MIN16_F4 fDeringingMax = fDeringingSamples[0];
FFX_UNROLL
for (FfxInt32 iSampleIndex = 1; iSampleIndex < iDeringingSampleCount; ++iSampleIndex)
{
fDeringingMin = ffxMin(fDeringingMin, fDeringingSamples[iSampleIndex]);
fDeringingMax = ffxMax(fDeringingMax, fDeringingSamples[iSampleIndex]);
}
fColorXY = clamp(fColorXY, fDeringingMin, fDeringingMax);
return fColorXY;
}
#endif
// Clamp by offset direction. Assumes iPxSample is already in range and iPxOffset is a compile-time constant.
FfxInt32x2 ClampCoord(FfxInt32x2 iPxSample, FfxInt32x2 iPxOffset, FfxInt32x2 iTextureSize)
{
FfxInt32x2 result = iPxSample + iPxOffset;
result.x = (iPxOffset.x < 0) ? ffxMax(result.x, 0) : result.x;
result.x = (iPxOffset.x > 0) ? ffxMin(result.x, iTextureSize.x - 1) : result.x;
result.y = (iPxOffset.y < 0) ? ffxMax(result.y, 0) : result.y;
result.y = (iPxOffset.y > 0) ? ffxMin(result.y, iTextureSize.y - 1) : result.y;
return result;
}
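// Worked example (illustrative): iPxSample = (0, 0), iPxOffset = (-1, -1),
// iTextureSize = (128, 128): result starts at (-1, -1); both offsets are
// negative, so each component is clamped with ffxMax(..., 0), yielding (0, 0).
// Positive offsets are clamped against iTextureSize - 1 instead, so a sample is
// never read outside the texture in the direction of the offset.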
#if FFX_HALF
FFX_MIN16_I2 ClampCoord(FFX_MIN16_I2 iPxSample, FFX_MIN16_I2 iPxOffset, FFX_MIN16_I2 iTextureSize)
{
FFX_MIN16_I2 result = iPxSample + iPxOffset;
result.x = (iPxOffset.x < FFX_MIN16_I(0)) ? ffxMax(result.x, FFX_MIN16_I(0)) : result.x;
result.x = (iPxOffset.x > FFX_MIN16_I(0)) ? ffxMin(result.x, iTextureSize.x - FFX_MIN16_I(1)) : result.x;
result.y = (iPxOffset.y < FFX_MIN16_I(0)) ? ffxMax(result.y, FFX_MIN16_I(0)) : result.y;
result.y = (iPxOffset.y > FFX_MIN16_I(0)) ? ffxMin(result.y, iTextureSize.y - FFX_MIN16_I(1)) : result.y;
return result;
}
#endif //FFX_HALF
#define DeclareCustomFetchBicubicSamplesWithType(SampleType, TextureType, AddrType, Name, LoadTexture) \
SampleType Name(AddrType iPxSample, AddrType iTextureSize) \
{ \
SampleType Samples; \
\
Samples.fColor00 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(-1, -1), iTextureSize))); \
Samples.fColor10 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+0, -1), iTextureSize))); \
Samples.fColor20 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+1, -1), iTextureSize))); \
Samples.fColor30 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+2, -1), iTextureSize))); \
\
Samples.fColor01 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(-1, +0), iTextureSize))); \
Samples.fColor11 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+0, +0), iTextureSize))); \
Samples.fColor21 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+1, +0), iTextureSize))); \
Samples.fColor31 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+2, +0), iTextureSize))); \
\
Samples.fColor02 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(-1, +1), iTextureSize))); \
Samples.fColor12 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+0, +1), iTextureSize))); \
Samples.fColor22 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+1, +1), iTextureSize))); \
Samples.fColor32 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+2, +1), iTextureSize))); \
\
Samples.fColor03 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(-1, +2), iTextureSize))); \
Samples.fColor13 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+0, +2), iTextureSize))); \
Samples.fColor23 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+1, +2), iTextureSize))); \
Samples.fColor33 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+2, +2), iTextureSize))); \
\
return Samples; \
}
#define DeclareCustomFetchBicubicSamples(Name, LoadTexture) \
DeclareCustomFetchBicubicSamplesWithType(FetchedBicubicSamples, FfxFloat32x4, FfxInt32x2, Name, LoadTexture)
#define DeclareCustomFetchBicubicSamplesMin16(Name, LoadTexture) \
DeclareCustomFetchBicubicSamplesWithType(FetchedBicubicSamplesMin16, FFX_MIN16_F4, FfxInt32x2, Name, LoadTexture)
#define DeclareCustomFetchBilinearSamplesWithType(SampleType, TextureType,AddrType, Name, LoadTexture) \
SampleType Name(AddrType iPxSample, AddrType iTextureSize) \
{ \
SampleType Samples; \
Samples.fColor00 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+0, +0), iTextureSize))); \
Samples.fColor10 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+1, +0), iTextureSize))); \
Samples.fColor01 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+0, +1), iTextureSize))); \
Samples.fColor11 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+1, +1), iTextureSize))); \
return Samples; \
}
#define DeclareCustomFetchBilinearSamples(Name, LoadTexture) \
DeclareCustomFetchBilinearSamplesWithType(FetchedBilinearSamples, FfxFloat32x4, FfxInt32x2, Name, LoadTexture)
#define DeclareCustomFetchBilinearSamplesMin16(Name, LoadTexture) \
DeclareCustomFetchBilinearSamplesWithType(FetchedBilinearSamplesMin16, FFX_MIN16_F4, FfxInt32x2, Name, LoadTexture)
// BE CAREFUL: there are precision issues; e.g. (3253, 125) coming out as (3252.9989778, 125.001102)
// is common, so iPxSample can "jitter"
#define DeclareCustomTextureSample(Name, InterpolateSamples, FetchSamples) \
FfxFloat32x4 Name(FfxFloat32x2 fUvSample, FfxInt32x2 iTextureSize) \
{ \
FfxFloat32x2 fPxSample = (fUvSample * FfxFloat32x2(iTextureSize)) - FfxFloat32x2(0.5f, 0.5f); \
/* Clamp base coords */ \
fPxSample.x = ffxMax(0.0f, ffxMin(FfxFloat32(iTextureSize.x), fPxSample.x)); \
fPxSample.y = ffxMax(0.0f, ffxMin(FfxFloat32(iTextureSize.y), fPxSample.y)); \
/* */ \
FfxInt32x2 iPxSample = FfxInt32x2(floor(fPxSample)); \
FfxFloat32x2 fPxFrac = ffxFract(fPxSample); \
FfxFloat32x4 fColorXY = FfxFloat32x4(InterpolateSamples(FetchSamples(iPxSample, iTextureSize), fPxFrac)); \
return fColorXY; \
}
#define DeclareCustomTextureSampleMin16(Name, InterpolateSamples, FetchSamples) \
FFX_MIN16_F4 Name(FfxFloat32x2 fUvSample, FfxInt32x2 iTextureSize) \
{ \
FfxFloat32x2 fPxSample = (fUvSample * FfxFloat32x2(iTextureSize)) - FfxFloat32x2(0.5f, 0.5f); \
/* Clamp base coords */ \
fPxSample.x = ffxMax(0.0f, ffxMin(FfxFloat32(iTextureSize.x), fPxSample.x)); \
fPxSample.y = ffxMax(0.0f, ffxMin(FfxFloat32(iTextureSize.y), fPxSample.y)); \
/* */ \
FfxInt32x2 iPxSample = FfxInt32x2(floor(fPxSample)); \
FFX_MIN16_F2 fPxFrac = FFX_MIN16_F2(ffxFract(fPxSample)); \
FFX_MIN16_F4 fColorXY = FFX_MIN16_F4(InterpolateSamples(FetchSamples(iPxSample, iTextureSize), fPxFrac)); \
return fColorXY; \
}
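// Illustrative use of the declaration macros above (FetchHistorySamples,
// SampleHistory and LoadHistory are hypothetical names, not defined here):
//
// DeclareCustomFetchBicubicSamples(FetchHistorySamples, LoadHistory)
// DeclareCustomTextureSample(SampleHistory, Lanczos2Approx, FetchHistorySamples)
//
// ...after which: FfxFloat32x4 fColor = SampleHistory(fUv, RenderSize());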
#define FFX_FSR2_CONCAT_ID(x, y) x ## y
#define FFX_FSR2_CONCAT(x, y) FFX_FSR2_CONCAT_ID(x, y)
#define FFX_FSR2_SAMPLER_1D_0 Lanczos2
#define FFX_FSR2_SAMPLER_1D_1 Lanczos2LUT
#define FFX_FSR2_SAMPLER_1D_2 Lanczos2Approx
#define FFX_FSR2_GET_LANCZOS_SAMPLER1D(x) FFX_FSR2_CONCAT(FFX_FSR2_SAMPLER_1D_, x)
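// E.g. FFX_FSR2_GET_LANCZOS_SAMPLER1D(2) expands to Lanczos2Approx; the
// two-level concatenation forces macro arguments to be expanded before pasting.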
#endif //!defined( FFX_FSR2_SAMPLE_H )


@@ -0,0 +1,250 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#define USE_YCOCG 1
#define fAutogenEpsilon 0.01f
// EXPERIMENTAL
FFX_MIN16_F ComputeAutoTC_01(FFX_MIN16_I2 uDispatchThreadId, FFX_MIN16_I2 iPrevIdx)
{
FfxFloat32x3 colorPreAlpha = LoadOpaqueOnly(uDispatchThreadId);
FfxFloat32x3 colorPostAlpha = LoadInputColor(uDispatchThreadId);
FfxFloat32x3 colorPrevPreAlpha = LoadPrevPreAlpha(iPrevIdx);
FfxFloat32x3 colorPrevPostAlpha = LoadPrevPostAlpha(iPrevIdx);
#if USE_YCOCG
colorPreAlpha = RGBToYCoCg(colorPreAlpha);
colorPostAlpha = RGBToYCoCg(colorPostAlpha);
colorPrevPreAlpha = RGBToYCoCg(colorPrevPreAlpha);
colorPrevPostAlpha = RGBToYCoCg(colorPrevPostAlpha);
#endif
FfxFloat32x3 colorDeltaCurr = colorPostAlpha - colorPreAlpha;
FfxFloat32x3 colorDeltaPrev = colorPrevPostAlpha - colorPrevPreAlpha;
bool hasAlpha = any(FFX_GREATER_THAN(abs(colorDeltaCurr), FfxFloat32x3(fAutogenEpsilon, fAutogenEpsilon, fAutogenEpsilon)));
bool hadAlpha = any(FFX_GREATER_THAN(abs(colorDeltaPrev), FfxFloat32x3(fAutogenEpsilon, fAutogenEpsilon, fAutogenEpsilon)));
FfxFloat32x3 X = colorPreAlpha;
FfxFloat32x3 Y = colorPostAlpha;
FfxFloat32x3 Z = colorPrevPreAlpha;
FfxFloat32x3 W = colorPrevPostAlpha;
FFX_MIN16_F retVal = FFX_MIN16_F(ffxSaturate(dot(abs(abs(Y - X) - abs(W - Z)), FfxFloat32x3(1, 1, 1))));
// clean up very small values
retVal = (retVal < getTcThreshold()) ? FFX_MIN16_F(0.0f) : FFX_MIN16_F(1.f);
return retVal;
}
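// Note: the dot with (1, 1, 1) sums | |Y - X| - |W - Z| | over the channels,
// i.e. how much the alpha-blend contribution changed between this frame and the
// last; a pixel that is fully opaque in both frames yields 0 and is not marked.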
// works ok: thin edges
FFX_MIN16_F ComputeAutoTC_02(FFX_MIN16_I2 uDispatchThreadId, FFX_MIN16_I2 iPrevIdx)
{
FfxFloat32x3 colorPreAlpha = LoadOpaqueOnly(uDispatchThreadId);
FfxFloat32x3 colorPostAlpha = LoadInputColor(uDispatchThreadId);
FfxFloat32x3 colorPrevPreAlpha = LoadPrevPreAlpha(iPrevIdx);
FfxFloat32x3 colorPrevPostAlpha = LoadPrevPostAlpha(iPrevIdx);
#if USE_YCOCG
colorPreAlpha = RGBToYCoCg(colorPreAlpha);
colorPostAlpha = RGBToYCoCg(colorPostAlpha);
colorPrevPreAlpha = RGBToYCoCg(colorPrevPreAlpha);
colorPrevPostAlpha = RGBToYCoCg(colorPrevPostAlpha);
#endif
FfxFloat32x3 colorDelta = colorPostAlpha - colorPreAlpha;
FfxFloat32x3 colorPrevDelta = colorPrevPostAlpha - colorPrevPreAlpha;
bool hasAlpha = any(FFX_GREATER_THAN(abs(colorDelta), FfxFloat32x3(fAutogenEpsilon, fAutogenEpsilon, fAutogenEpsilon)));
bool hadAlpha = any(FFX_GREATER_THAN(abs(colorPrevDelta), FfxFloat32x3(fAutogenEpsilon, fAutogenEpsilon, fAutogenEpsilon)));
FfxFloat32x3 delta = colorPostAlpha - colorPreAlpha; //prev+1*d = post => d = color, alpha =
FfxFloat32x3 deltaPrev = colorPrevPostAlpha - colorPrevPreAlpha;
FfxFloat32x3 X = colorPrevPreAlpha;
FfxFloat32x3 N = colorPreAlpha - colorPrevPreAlpha;
FfxFloat32x3 YAminusXA = colorPrevPostAlpha - colorPrevPreAlpha;
FfxFloat32x3 NminusNA = colorPostAlpha - colorPrevPostAlpha;
FfxFloat32x3 A = (hasAlpha || hadAlpha) ? NminusNA / max(FfxFloat32x3(fAutogenEpsilon, fAutogenEpsilon, fAutogenEpsilon), N) : FfxFloat32x3(0, 0, 0);
FFX_MIN16_F retVal = FFX_MIN16_F( max(max(A.x, A.y), A.z) );
// only pixels that have significantly changed in color should be considered
retVal = ffxSaturate(retVal * FFX_MIN16_F(length(colorPostAlpha - colorPrevPostAlpha)) );
return retVal;
}
// This function computes the TransparencyAndComposition mask:
// This mask indicates pixels that should discard locks and apply color clamping.
//
// Typically this is the case for translucent pixels (that don't write depth values) or pixels where the correctness of
// the MVs cannot be guaranteed (e.g. procedural movement, or vegetation that does not output MVs to reduce rasterization cost)
// Also, large changes in color due to changed lighting should be marked to remove locks on pixels with "old" lighting.
//
// This function takes an opaque-only and a final texture and uses internal copies of those textures from the last frame.
// The function tries to determine where the color changes between the opaque-only and the final image to find the pixels that use transparency.
// It also compares against the previous frame and detects where the use of transparency changed, to mark those pixels.
// Additionally it marks pixels where the color changed significantly in the opaque only image, e.g. due to lighting or texture animation.
//
// In the final step it stores the current textures in internal textures for the next frame
FFX_MIN16_F ComputeTransparencyAndComposition(FFX_MIN16_I2 uDispatchThreadId, FFX_MIN16_I2 iPrevIdx)
{
FFX_MIN16_F retVal = ComputeAutoTC_02(uDispatchThreadId, iPrevIdx);
// [branch]
if (retVal > FFX_MIN16_F(0.01f))
{
retVal = ComputeAutoTC_01(uDispatchThreadId, iPrevIdx);
}
return retVal;
}
float computeSolidEdge(FFX_MIN16_I2 curPos, FFX_MIN16_I2 prevPos)
{
float lum[9];
int i = 0;
for (int y = -1; y < 2; ++y)
{
for (int x = -1; x < 2; ++x)
{
FfxFloat32x3 curCol = LoadOpaqueOnly(curPos + FFX_MIN16_I2(x, y)).rgb;
FfxFloat32x3 prevCol = LoadPrevPreAlpha(prevPos + FFX_MIN16_I2(x, y)).rgb;
lum[i++] = length(curCol - prevCol);
}
}
//float gradX = abs(lum[3] - lum[4]) + abs(lum[5] - lum[4]);
//float gradY = abs(lum[1] - lum[4]) + abs(lum[7] - lum[4]);
//return sqrt(gradX * gradX + gradY * gradY);
float gradX = abs(lum[3] - lum[4]) * abs(lum[5] - lum[4]);
float gradY = abs(lum[1] - lum[4]) * abs(lum[7] - lum[4]);
return sqrt(sqrt(gradX * gradY));
}
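// Note on the gradient form above: sqrt(sqrt(gradX * gradY)) is the fourth root
// of the product of the four neighbor-to-center differences, i.e. their
// geometric mean, so the metric only responds where the temporal difference
// changes on both sides of the center in both axes.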
float computeAlphaEdge(FFX_MIN16_I2 curPos, FFX_MIN16_I2 prevPos)
{
float lum[9];
int i = 0;
for (int y = -1; y < 2; ++y)
{
for (int x = -1; x < 2; ++x)
{
FfxFloat32x3 curCol = abs(LoadInputColor(curPos + FFX_MIN16_I2(x, y)).rgb - LoadOpaqueOnly(curPos + FFX_MIN16_I2(x, y)).rgb);
FfxFloat32x3 prevCol = abs(LoadPrevPostAlpha(prevPos + FFX_MIN16_I2(x, y)).rgb - LoadPrevPreAlpha(prevPos + FFX_MIN16_I2(x, y)).rgb);
lum[i++] = length(curCol - prevCol);
}
}
//float gradX = abs(lum[3] - lum[4]) + abs(lum[5] - lum[4]);
//float gradY = abs(lum[1] - lum[4]) + abs(lum[7] - lum[4]);
//return sqrt(gradX * gradX + gradY * gradY);
float gradX = abs(lum[3] - lum[4]) * abs(lum[5] - lum[4]);
float gradY = abs(lum[1] - lum[4]) * abs(lum[7] - lum[4]);
return sqrt(sqrt(gradX * gradY));
}
FFX_MIN16_F ComputeAabbOverlap(FFX_MIN16_I2 uDispatchThreadId, FFX_MIN16_I2 iPrevIdx)
{
FFX_MIN16_F retVal = FFX_MIN16_F(0.f);
FfxFloat32x2 fMotionVector = LoadInputMotionVector(uDispatchThreadId);
FfxFloat32x3 colorPreAlpha = LoadOpaqueOnly(uDispatchThreadId);
FfxFloat32x3 colorPostAlpha = LoadInputColor(uDispatchThreadId);
FfxFloat32x3 colorPrevPreAlpha = LoadPrevPreAlpha(iPrevIdx);
FfxFloat32x3 colorPrevPostAlpha = LoadPrevPostAlpha(iPrevIdx);
#if USE_YCOCG
colorPreAlpha = RGBToYCoCg(colorPreAlpha);
colorPostAlpha = RGBToYCoCg(colorPostAlpha);
colorPrevPreAlpha = RGBToYCoCg(colorPrevPreAlpha);
colorPrevPostAlpha = RGBToYCoCg(colorPrevPostAlpha);
#endif
FfxFloat32x3 minPrev = FfxFloat32x3(+1000.f, +1000.f, +1000.f);
FfxFloat32x3 maxPrev = FfxFloat32x3(-1000.f, -1000.f, -1000.f);
for (int y = -1; y < 2; ++y)
{
for (int x = -1; x < 2; ++x)
{
FfxFloat32x3 W = LoadPrevPostAlpha(iPrevIdx + FFX_MIN16_I2(x, y));
#if USE_YCOCG
W = RGBToYCoCg(W);
#endif
minPrev = min(minPrev, W);
maxPrev = max(maxPrev, W);
}
}
// instead of computing the overlap, simply count how many samples fall outside
// and set the reactive value based on that
FFX_MIN16_F count = FFX_MIN16_F(0.f);
for (int y = -1; y < 2; ++y)
{
for (int x = -1; x < 2; ++x)
{
FfxFloat32x3 Y = LoadInputColor(uDispatchThreadId + FFX_MIN16_I2(x, y));
#if USE_YCOCG
Y = RGBToYCoCg(Y);
#endif
count += ((Y.x < minPrev.x) || (Y.x > maxPrev.x)) ? FFX_MIN16_F(1.f) : FFX_MIN16_F(0.f);
count += ((Y.y < minPrev.y) || (Y.y > maxPrev.y)) ? FFX_MIN16_F(1.f) : FFX_MIN16_F(0.f);
count += ((Y.z < minPrev.z) || (Y.z > maxPrev.z)) ? FFX_MIN16_F(1.f) : FFX_MIN16_F(0.f);
}
}
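// Normalize by the maximum possible count: 3x3 neighborhood x 3 channels = 27.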
retVal = count / FFX_MIN16_F(27.f);
return retVal;
}
// This function computes the Reactive mask:
// We want pixels marked where the alpha portion of the frame changes a lot between neighbours
// Those pixels are expected to change quickly between frames, too. (e.g. small particles, reflections on curved surfaces...)
// As a result history would not be trustworthy.
// On the other hand we don't want pixels marked where pre-alpha has a large difference, since those would profit from accumulation.
// For mirrors we may assume the pre-alpha is a fairly uniform color.
//
// This works well generally, but also marks edge pixels
FFX_MIN16_F ComputeReactive(FFX_MIN16_I2 uDispatchThreadId, FFX_MIN16_I2 iPrevIdx)
{
// we only get here if alpha has a significant contribution and has changed since last frame.
FFX_MIN16_F retVal = FFX_MIN16_F(0.f);
// mark pixels with huge variance in alpha as reactive
FFX_MIN16_F alphaEdge = FFX_MIN16_F(computeAlphaEdge(uDispatchThreadId, iPrevIdx));
FFX_MIN16_F opaqueEdge = FFX_MIN16_F(computeSolidEdge(uDispatchThreadId, iPrevIdx));
retVal = ffxSaturate(alphaEdge - opaqueEdge);
// the above also marks edge pixels due to jitter, so we need to cancel those out
return retVal;
}


@@ -0,0 +1,120 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//#version 450
#extension GL_GOOGLE_include_directive : require
#extension GL_EXT_samplerless_texture_functions : require
#define FSR2_BIND_SRV_INPUT_OPAQUE_ONLY 0
#define FSR2_BIND_SRV_INPUT_COLOR 1
#define FSR2_BIND_SRV_INPUT_MOTION_VECTORS 2
#define FSR2_BIND_SRV_PREV_PRE_ALPHA_COLOR 3
#define FSR2_BIND_SRV_PREV_POST_ALPHA_COLOR 4
#define FSR2_BIND_SRV_REACTIVE_MASK 5
#define FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK 6
#define FSR2_BIND_UAV_AUTOREACTIVE 7
#define FSR2_BIND_UAV_AUTOCOMPOSITION 8
#define FSR2_BIND_UAV_PREV_PRE_ALPHA_COLOR 9
#define FSR2_BIND_UAV_PREV_POST_ALPHA_COLOR 10
#define FSR2_BIND_CB_FSR2 11
#define FSR2_BIND_CB_REACTIVE 12
#if FFX_FSR2_OPTION_GODOT_DERIVE_INVALID_MOTION_VECTORS
#define FSR2_BIND_SRV_INPUT_DEPTH 13
#endif
#include "ffx_fsr2_callbacks_glsl.h"
#include "ffx_fsr2_common.h"
#ifdef FSR2_BIND_CB_REACTIVE
layout (set = 1, binding = FSR2_BIND_CB_REACTIVE, std140) uniform cbGenerateReactive_t
{
float fTcThreshold; // 0.1 is a good starting value, lower will result in more TC pixels
float fTcScale;
float fReactiveScale;
float fReactiveMax;
} cbGenerateReactive;
float getTcThreshold()
{
return cbGenerateReactive.fTcThreshold;
}
#else
float getTcThreshold()
{
return 0.05f;
}
#endif
#include "ffx_fsr2_tcr_autogen.h"
#ifndef FFX_FSR2_THREAD_GROUP_WIDTH
#define FFX_FSR2_THREAD_GROUP_WIDTH 8
#endif // #ifndef FFX_FSR2_THREAD_GROUP_WIDTH
#ifndef FFX_FSR2_THREAD_GROUP_HEIGHT
#define FFX_FSR2_THREAD_GROUP_HEIGHT 8
#endif // FFX_FSR2_THREAD_GROUP_HEIGHT
#ifndef FFX_FSR2_THREAD_GROUP_DEPTH
#define FFX_FSR2_THREAD_GROUP_DEPTH 1
#endif // #ifndef FFX_FSR2_THREAD_GROUP_DEPTH
#ifndef FFX_FSR2_NUM_THREADS
#define FFX_FSR2_NUM_THREADS layout (local_size_x = FFX_FSR2_THREAD_GROUP_WIDTH, local_size_y = FFX_FSR2_THREAD_GROUP_HEIGHT, local_size_z = FFX_FSR2_THREAD_GROUP_DEPTH) in;
#endif // #ifndef FFX_FSR2_NUM_THREADS
FFX_FSR2_NUM_THREADS
void main()
{
FFX_MIN16_I2 uDispatchThreadId = FFX_MIN16_I2(gl_GlobalInvocationID.xy);
// TODO: take jitter into account (i.e. add the delta of the previous and current jitter to the previous UV).
// fetch pre- and post-alpha color values
FFX_MIN16_F2 fUv = ( FFX_MIN16_F2(uDispatchThreadId) + FFX_MIN16_F2(0.5f, 0.5f) ) / FFX_MIN16_F2( RenderSize() );
FFX_MIN16_F2 fPrevUV = fUv + FFX_MIN16_F2( LoadInputMotionVector(uDispatchThreadId) );
FFX_MIN16_I2 iPrevIdx = FFX_MIN16_I2(fPrevUV * FFX_MIN16_F2(RenderSize()) - 0.5f);
FFX_MIN16_F3 colorPreAlpha = FFX_MIN16_F3( LoadOpaqueOnly( uDispatchThreadId ) );
FFX_MIN16_F3 colorPostAlpha = FFX_MIN16_F3( LoadInputColor( uDispatchThreadId ) );
FFX_MIN16_F2 outReactiveMask = FFX_MIN16_F2( 0.f, 0.f );
outReactiveMask.y = ComputeTransparencyAndComposition(uDispatchThreadId, iPrevIdx);
if (outReactiveMask.y > 0.5f)
{
outReactiveMask.x = ComputeReactive(uDispatchThreadId, iPrevIdx);
outReactiveMask.x *= FFX_MIN16_F(cbGenerateReactive.fReactiveScale);
outReactiveMask.x = outReactiveMask.x < cbGenerateReactive.fReactiveMax ? outReactiveMask.x : FFX_MIN16_F( cbGenerateReactive.fReactiveMax );
}
outReactiveMask.y *= FFX_MIN16_F(cbGenerateReactive.fTcScale);
outReactiveMask.x = ffxMax(outReactiveMask.x, FFX_MIN16_F(LoadReactiveMask(uDispatchThreadId)));
outReactiveMask.y = ffxMax(outReactiveMask.y, FFX_MIN16_F(LoadTransparencyAndCompositionMask(uDispatchThreadId)));
StoreAutoReactive(uDispatchThreadId, outReactiveMask);
StorePrevPreAlpha(uDispatchThreadId, colorPreAlpha);
StorePrevPostAlpha(uDispatchThreadId, colorPostAlpha);
}


@@ -0,0 +1,194 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef FFX_FSR2_UPSAMPLE_H
#define FFX_FSR2_UPSAMPLE_H
FFX_STATIC const FfxUInt32 iLanczos2SampleCount = 16;
void Deringing(RectificationBox clippingBox, FFX_PARAMETER_INOUT FfxFloat32x3 fColor)
{
fColor = clamp(fColor, clippingBox.aabbMin, clippingBox.aabbMax);
}
#if FFX_HALF
void Deringing(RectificationBoxMin16 clippingBox, FFX_PARAMETER_INOUT FFX_MIN16_F3 fColor)
{
fColor = clamp(fColor, clippingBox.aabbMin, clippingBox.aabbMax);
}
#endif
#ifndef FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE
#define FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE 2 // Approximate
#endif
FfxFloat32 GetUpsampleLanczosWeight(FfxFloat32x2 fSrcSampleOffset, FfxFloat32 fKernelWeight)
{
FfxFloat32x2 fSrcSampleOffsetBiased = fSrcSampleOffset * fKernelWeight.xx;
#if FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 0 // LANCZOS_TYPE_REFERENCE
FfxFloat32 fSampleWeight = Lanczos2(length(fSrcSampleOffsetBiased));
#elif FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 1 // LANCZOS_TYPE_LUT
FfxFloat32 fSampleWeight = Lanczos2_UseLUT(length(fSrcSampleOffsetBiased));
#elif FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 2 // LANCZOS_TYPE_APPROXIMATE
FfxFloat32 fSampleWeight = Lanczos2ApproxSq(dot(fSrcSampleOffsetBiased, fSrcSampleOffsetBiased));
#else
#error "Invalid Lanczos type"
#endif
return fSampleWeight;
}
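// For reference (standard definition, not used directly here), the exact
// 2-lobe Lanczos kernel that the variants above approximate is:
//   lanczos2(x) = sinc(x) * sinc(x / 2) for |x| < 2, and 0 otherwise,
// with sinc(x) = sin(pi * x) / (pi * x).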
#if FFX_HALF
FFX_MIN16_F GetUpsampleLanczosWeight(FFX_MIN16_F2 fSrcSampleOffset, FFX_MIN16_F fKernelWeight)
{
FFX_MIN16_F2 fSrcSampleOffsetBiased = fSrcSampleOffset * fKernelWeight.xx;
#if FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 0 // LANCZOS_TYPE_REFERENCE
FFX_MIN16_F fSampleWeight = Lanczos2(length(fSrcSampleOffsetBiased));
#elif FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 1 // LANCZOS_TYPE_LUT
FFX_MIN16_F fSampleWeight = Lanczos2_UseLUT(length(fSrcSampleOffsetBiased));
#elif FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 2 // LANCZOS_TYPE_APPROXIMATE
FFX_MIN16_F fSampleWeight = Lanczos2ApproxSq(dot(fSrcSampleOffsetBiased, fSrcSampleOffsetBiased));
// To test: save the reciprocal sqrt compute
// FfxFloat32 fSampleWeight = Lanczos2Sq_UseLUT(dot(fSrcSampleOffsetBiased, fSrcSampleOffsetBiased));
#else
#error "Invalid Lanczos type"
#endif
return fSampleWeight;
}
#endif
FfxFloat32 ComputeMaxKernelWeight() {
const FfxFloat32 fKernelSizeBias = 1.0f;
FfxFloat32 fKernelWeight = FfxFloat32(1) + (FfxFloat32(1.0f) / FfxFloat32x2(DownscaleFactor()) - FfxFloat32(1)).x * FfxFloat32(fKernelSizeBias);
return ffxMin(FfxFloat32(1.99f), fKernelWeight);
}
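// Worked example (illustrative): at 1.5x scaling, DownscaleFactor() is
// (1/1.5, 1/1.5), so fKernelWeight = 1 + (1.5 - 1) * 1 = 1.5; at 2x scaling it
// would be 2.0, which the ffxMin clamps to 1.99.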
FfxFloat32x4 ComputeUpsampledColorAndWeight(const AccumulationPassCommonParams params,
FFX_PARAMETER_INOUT RectificationBox clippingBox, FfxFloat32 fReactiveFactor)
{
#if FFX_FSR2_OPTION_UPSAMPLE_SAMPLERS_USE_DATA_HALF && FFX_HALF
#include "ffx_fsr2_force16_begin.h"
#endif
// We compute a sliced Lanczos filter with 2 lobes (other slices are accumulated temporally)
FfxFloat32x2 fDstOutputPos = FfxFloat32x2(params.iPxHrPos) + FFX_BROADCAST_FLOAT32X2(0.5f); // Destination resolution output pixel center position
FfxFloat32x2 fSrcOutputPos = fDstOutputPos * DownscaleFactor(); // Source resolution output pixel center position
FfxInt32x2 iSrcInputPos = FfxInt32x2(floor(fSrcOutputPos)); // TODO: what about weird upscale factors...
#if FFX_FSR2_OPTION_UPSAMPLE_SAMPLERS_USE_DATA_HALF && FFX_HALF
#include "ffx_fsr2_force16_end.h"
#endif
FfxFloat32x3 fSamples[iLanczos2SampleCount];
FfxFloat32x2 fSrcUnjitteredPos = (FfxFloat32x2(iSrcInputPos) + FfxFloat32x2(0.5f, 0.5f)) - Jitter(); // This is the un-jittered position of the sample at offset 0,0
FfxInt32x2 offsetTL;
offsetTL.x = (fSrcUnjitteredPos.x > fSrcOutputPos.x) ? FfxInt32(-2) : FfxInt32(-1);
offsetTL.y = (fSrcUnjitteredPos.y > fSrcOutputPos.y) ? FfxInt32(-2) : FfxInt32(-1);
// Load samples
// If fSrcUnjitteredPos.y > fSrcOutputPos.y, then offsetTL.y = -2, the sample offsets in Y will be [-2, 1], and the clip box will use rows [1, 3].
// Flip the row index for the sampling offset in this case, so rows 0~2 of the sampled array can always be used for computing the clip box.
// This avoids branches or cmovs on the sampled colors, moving that overhead to the sample position / weight calculation, which touches fewer values.
const FfxBoolean bFlipRow = fSrcUnjitteredPos.y > fSrcOutputPos.y;
const FfxBoolean bFlipCol = fSrcUnjitteredPos.x > fSrcOutputPos.x;
FfxFloat32x2 fOffsetTL = FfxFloat32x2(offsetTL);
FFX_UNROLL
for (FfxInt32 row = 0; row < 3; row++) {
FFX_UNROLL
for (FfxInt32 col = 0; col < 3; col++) {
FfxInt32 iSampleIndex = col + (row << 2);
FfxInt32x2 sampleColRow = FfxInt32x2(bFlipCol ? (3 - col) : col, bFlipRow ? (3 - row) : row);
FfxInt32x2 iSrcSamplePos = FfxInt32x2(iSrcInputPos) + offsetTL + sampleColRow;
const FfxInt32x2 sampleCoord = ClampLoad(iSrcSamplePos, FfxInt32x2(0, 0), FfxInt32x2(RenderSize()));
fSamples[iSampleIndex] = LoadPreparedInputColor(FfxInt32x2(sampleCoord));
}
}
FfxFloat32x4 fColorAndWeight = FfxFloat32x4(0.0f, 0.0f, 0.0f, 0.0f);
FfxFloat32x2 fBaseSampleOffset = FfxFloat32x2(fSrcUnjitteredPos - fSrcOutputPos);
// Determine how much of each upsampled color should be used for this frame
const FfxFloat32 fKernelReactiveFactor = ffxMax(fReactiveFactor, FfxFloat32(params.bIsNewSample));
const FfxFloat32 fKernelBiasMax = ComputeMaxKernelWeight() * (1.0f - fKernelReactiveFactor);
const FfxFloat32 fKernelBiasMin = ffxMax(1.0f, ((1.0f + fKernelBiasMax) * 0.3f));
const FfxFloat32 fKernelBiasFactor = ffxMax(0.0f, ffxMax(0.25f * params.fDepthClipFactor, fKernelReactiveFactor));
const FfxFloat32 fKernelBias = ffxLerp(fKernelBiasMax, fKernelBiasMin, fKernelBiasFactor);
const FfxFloat32 fRectificationCurveBias = ffxLerp(-2.0f, -3.0f, ffxSaturate(params.fHrVelocity / 50.0f));
FFX_UNROLL
for (FfxInt32 row = 0; row < 3; row++) {
FFX_UNROLL
for (FfxInt32 col = 0; col < 3; col++) {
FfxInt32 iSampleIndex = col + (row << 2);
const FfxInt32x2 sampleColRow = FfxInt32x2(bFlipCol ? (3 - col) : col, bFlipRow ? (3 - row) : row);
const FfxFloat32x2 fOffset = fOffsetTL + FfxFloat32x2(sampleColRow);
FfxFloat32x2 fSrcSampleOffset = fBaseSampleOffset + fOffset;
FfxInt32x2 iSrcSamplePos = FfxInt32x2(iSrcInputPos) + FfxInt32x2(offsetTL) + sampleColRow;
const FfxFloat32 fOnScreenFactor = FfxFloat32(IsOnScreen(FfxInt32x2(iSrcSamplePos), FfxInt32x2(RenderSize())));
FfxFloat32 fSampleWeight = fOnScreenFactor * FfxFloat32(GetUpsampleLanczosWeight(fSrcSampleOffset, fKernelBias));
fColorAndWeight += FfxFloat32x4(fSamples[iSampleIndex] * fSampleWeight, fSampleWeight);
// Update rectification box
{
const FfxFloat32 fSrcSampleOffsetSq = dot(fSrcSampleOffset, fSrcSampleOffset);
const FfxFloat32 fBoxSampleWeight = exp(fRectificationCurveBias * fSrcSampleOffsetSq);
const FfxBoolean bInitialSample = (row == 0) && (col == 0);
RectificationBoxAddSample(bInitialSample, clippingBox, fSamples[iSampleIndex], fBoxSampleWeight);
}
}
}
RectificationBoxComputeVarianceBoxData(clippingBox);
fColorAndWeight.w *= FfxFloat32(fColorAndWeight.w > FSR2_EPSILON);
if (fColorAndWeight.w > FSR2_EPSILON) {
// Normalize for deringing (we need to compare colors)
fColorAndWeight.xyz = fColorAndWeight.xyz / fColorAndWeight.w;
fColorAndWeight.w *= fUpsampleLanczosWeightScale;
Deringing(clippingBox, fColorAndWeight.xyz);
}
#if FFX_FSR2_OPTION_UPSAMPLE_SAMPLERS_USE_DATA_HALF && FFX_HALF
#include "ffx_fsr2_force16_end.h"
#endif
return fColorAndWeight;
}
#endif //!defined( FFX_FSR2_UPSAMPLE_H )

936
thirdparty/amd-fsr2/shaders/ffx_spd.h vendored Normal file

@@ -0,0 +1,936 @@
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifdef FFX_CPU
FFX_STATIC void SpdSetup(FfxUInt32x2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy
FfxUInt32x2 workGroupOffset, // GPU side: pass in as constant
FfxUInt32x2 numWorkGroupsAndMips, // GPU side: pass in as constant
FfxUInt32x4 rectInfo, // left, top, width, height
FfxInt32 mips) // optional: if -1, calculate based on rect width and height
{
workGroupOffset[0] = rectInfo[0] / 64; // rectInfo[0] = left
workGroupOffset[1] = rectInfo[1] / 64; // rectInfo[1] = top
FfxUInt32 endIndexX = (rectInfo[0] + rectInfo[2] - 1) / 64; // rectInfo[0] = left, rectInfo[2] = width
FfxUInt32 endIndexY = (rectInfo[1] + rectInfo[3] - 1) / 64; // rectInfo[1] = top, rectInfo[3] = height
dispatchThreadGroupCountXY[0] = endIndexX + 1 - workGroupOffset[0];
dispatchThreadGroupCountXY[1] = endIndexY + 1 - workGroupOffset[1];
numWorkGroupsAndMips[0] = (dispatchThreadGroupCountXY[0]) * (dispatchThreadGroupCountXY[1]);
if (mips >= 0)
{
numWorkGroupsAndMips[1] = FfxUInt32(mips);
}
else
{
// calculate based on rect width and height
FfxUInt32 resolution = ffxMax(rectInfo[2], rectInfo[3]);
numWorkGroupsAndMips[1] = FfxUInt32((ffxMin(floor(log2(FfxFloat32(resolution))), FfxFloat32(12))));
}
}
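// Worked example (illustrative): rectInfo = (0, 0, 1920, 1080), mips = -1:
// workGroupOffset = (0, 0); endIndexX = 1919 / 64 = 29, endIndexY = 1079 / 64 = 16,
// so dispatchThreadGroupCountXY = (30, 17), numWorkGroupsAndMips[0] = 510 and
// numWorkGroupsAndMips[1] = min(floor(log2(1920)), 12) = 10.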
FFX_STATIC void SpdSetup(FfxUInt32x2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy
FfxUInt32x2 workGroupOffset, // GPU side: pass in as constant
FfxUInt32x2 numWorkGroupsAndMips, // GPU side: pass in as constant
FfxUInt32x4 rectInfo) // left, top, width, height
{
SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo, -1);
}
#endif // #ifdef FFX_CPU
//==============================================================================================================================
// NON-PACKED VERSION
//==============================================================================================================================
#ifdef FFX_GPU
#ifdef SPD_PACKED_ONLY
// Avoid compiler error
FfxFloat32x4 SpdLoadSourceImage(FfxInt32x2 p, FfxUInt32 slice)
{
return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}
FfxFloat32x4 SpdLoad(FfxInt32x2 p, FfxUInt32 slice)
{
return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}
void SpdStore(FfxInt32x2 p, FfxFloat32x4 value, FfxUInt32 mip, FfxUInt32 slice)
{
}
FfxFloat32x4 SpdLoadIntermediate(FfxUInt32 x, FfxUInt32 y)
{
return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}
void SpdStoreIntermediate(FfxUInt32 x, FfxUInt32 y, FfxFloat32x4 value)
{
}
FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3)
{
return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}
#endif // #ifdef SPD_PACKED_ONLY
//_____________________________________________________________/\_______________________________________________________________
#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
#extension GL_KHR_shader_subgroup_quad:require
#endif
void SpdWorkgroupShuffleBarrier()
{
#ifdef FFX_GLSL
barrier();
#endif
#ifdef FFX_HLSL
GroupMemoryBarrierWithGroupSync();
#endif
}
// Only last active workgroup should proceed
bool SpdExitWorkgroup(FfxUInt32 numWorkGroups, FfxUInt32 localInvocationIndex, FfxUInt32 slice)
{
// global atomic counter
if (localInvocationIndex == 0)
{
SpdIncreaseAtomicCounter(slice);
}
SpdWorkgroupShuffleBarrier();
return (SpdGetAtomicCounter() != (numWorkGroups - 1));
}
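// Note (assuming the canonical SPD callbacks, where SpdIncreaseAtomicCounter
// stores the pre-increment counter value): every workgroup bumps the global
// counter once, so only the last group to arrive observes numWorkGroups - 1,
// returns false here, and continues with the remaining mips.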
// User defined: FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3);
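// A typical user-provided reducer averages the four values (illustrative
// sketch; min or max reducers are equally common, e.g. for depth pyramids):
//
// FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3)
// {
//     return (v0 + v1 + v2 + v3) * 0.25;
// }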
FfxFloat32x4 SpdReduceQuad(FfxFloat32x4 v)
{
#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
FfxFloat32x4 v0 = v;
FfxFloat32x4 v1 = subgroupQuadSwapHorizontal(v);
FfxFloat32x4 v2 = subgroupQuadSwapVertical(v);
FfxFloat32x4 v3 = subgroupQuadSwapDiagonal(v);
return SpdReduce4(v0, v1, v2, v3);
#elif defined(FFX_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
// requires SM6.0
FfxUInt32 quad = WaveGetLaneIndex() & (~0x3);
FfxFloat32x4 v0 = v;
FfxFloat32x4 v1 = WaveReadLaneAt(v, quad | 1);
FfxFloat32x4 v2 = WaveReadLaneAt(v, quad | 2);
FfxFloat32x4 v3 = WaveReadLaneAt(v, quad | 3);
return SpdReduce4(v0, v1, v2, v3);
/*
// if SM6.0 is not available, you can use the AMD shader intrinsics
// the AMD shader intrinsics are available in AMD GPU Services (AGS) library:
// https://gpuopen.com/amd-gpu-services-ags-library/
// works for DX11
FfxFloat32x4 v0 = v;
FfxFloat32x4 v1;
v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
FfxFloat32x4 v2;
v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
FfxFloat32x4 v3;
v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
return SpdReduce4(v0, v1, v2, v3);
*/
#endif
return v;
}
FfxFloat32x4 SpdReduceIntermediate(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3)
{
FfxFloat32x4 v0 = SpdLoadIntermediate(i0.x, i0.y);
FfxFloat32x4 v1 = SpdLoadIntermediate(i1.x, i1.y);
FfxFloat32x4 v2 = SpdLoadIntermediate(i2.x, i2.y);
FfxFloat32x4 v3 = SpdLoadIntermediate(i3.x, i3.y);
return SpdReduce4(v0, v1, v2, v3);
}
FfxFloat32x4 SpdReduceLoad4(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
FfxFloat32x4 v0 = SpdLoad(FfxInt32x2(i0), slice);
FfxFloat32x4 v1 = SpdLoad(FfxInt32x2(i1), slice);
FfxFloat32x4 v2 = SpdLoad(FfxInt32x2(i2), slice);
FfxFloat32x4 v3 = SpdLoad(FfxInt32x2(i3), slice);
return SpdReduce4(v0, v1, v2, v3);
}
FfxFloat32x4 SpdReduceLoad4(FfxUInt32x2 base, FfxUInt32 slice)
{
return SpdReduceLoad4(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
}
FfxFloat32x4 SpdReduceLoadSourceImage4(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
FfxFloat32x4 v0 = SpdLoadSourceImage(FfxInt32x2(i0), slice);
FfxFloat32x4 v1 = SpdLoadSourceImage(FfxInt32x2(i1), slice);
FfxFloat32x4 v2 = SpdLoadSourceImage(FfxInt32x2(i2), slice);
FfxFloat32x4 v3 = SpdLoadSourceImage(FfxInt32x2(i3), slice);
return SpdReduce4(v0, v1, v2, v3);
}
FfxFloat32x4 SpdReduceLoadSourceImage(FfxUInt32x2 base, FfxUInt32 slice)
{
#ifdef SPD_LINEAR_SAMPLER
return SpdLoadSourceImage(FfxInt32x2(base), slice);
#else
return SpdReduceLoadSourceImage4(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
#endif
}
void SpdDownsampleMips_0_1_Intrinsics(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
FfxFloat32x4 v[4];
FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
v[0] = SpdReduceLoadSourceImage(tex, slice);
SpdStore(pix, v[0], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
v[1] = SpdReduceLoadSourceImage(tex, slice);
SpdStore(pix, v[1], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
v[2] = SpdReduceLoadSourceImage(tex, slice);
SpdStore(pix, v[2], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
v[3] = SpdReduceLoadSourceImage(tex, slice);
SpdStore(pix, v[3], 0, slice);
if (mip <= 1)
return;
v[0] = SpdReduceQuad(v[0]);
v[1] = SpdReduceQuad(v[1]);
v[2] = SpdReduceQuad(v[2]);
v[3] = SpdReduceQuad(v[3]);
if ((localInvocationIndex % 4) == 0)
{
SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2), v[0], 1, slice);
SpdStoreIntermediate(x / 2, y / 2, v[0]);
SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2), v[1], 1, slice);
SpdStoreIntermediate(x / 2 + 8, y / 2, v[1]);
SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2 + 8), v[2], 1, slice);
SpdStoreIntermediate(x / 2, y / 2 + 8, v[2]);
SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2 + 8), v[3], 1, slice);
SpdStoreIntermediate(x / 2 + 8, y / 2 + 8, v[3]);
}
}
void SpdDownsampleMips_0_1_LDS(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
FfxFloat32x4 v[4];
FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
v[0] = SpdReduceLoadSourceImage(tex, slice);
SpdStore(pix, v[0], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
v[1] = SpdReduceLoadSourceImage(tex, slice);
SpdStore(pix, v[1], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
v[2] = SpdReduceLoadSourceImage(tex, slice);
SpdStore(pix, v[2], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
v[3] = SpdReduceLoadSourceImage(tex, slice);
SpdStore(pix, v[3], 0, slice);
if (mip <= 1)
return;
for (FfxUInt32 i = 0; i < 4; i++)
{
SpdStoreIntermediate(x, y, v[i]);
SpdWorkgroupShuffleBarrier();
if (localInvocationIndex < 64)
{
v[i] = SpdReduceIntermediate(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
}
SpdWorkgroupShuffleBarrier();
}
if (localInvocationIndex < 64)
{
SpdStoreIntermediate(x + 0, y + 0, v[0]);
SpdStoreIntermediate(x + 8, y + 0, v[1]);
SpdStoreIntermediate(x + 0, y + 8, v[2]);
SpdStoreIntermediate(x + 8, y + 8, v[3]);
}
}
void SpdDownsampleMips_0_1(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
SpdDownsampleMips_0_1_LDS(x, y, workGroupID, localInvocationIndex, mip, slice);
#else
SpdDownsampleMips_0_1_Intrinsics(x, y, workGroupID, localInvocationIndex, mip, slice);
#endif
}
void SpdDownsampleMip_2(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
if (localInvocationIndex < 64)
{
FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
SpdStore(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x, y), v, mip, slice);
// store to LDS, try to reduce bank conflicts
// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
// ...
// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
SpdStoreIntermediate(x * 2 + y % 2, y * 2, v);
}
#else
FfxFloat32x4 v = SpdLoadIntermediate(x, y);
v = SpdReduceQuad(v);
// quad index 0 stores result
if (localInvocationIndex % 4 == 0)
{
SpdStore(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
SpdStoreIntermediate(x + (y / 2) % 2, y, v);
}
#endif
}
void SpdDownsampleMip_3(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
if (localInvocationIndex < 16)
{
// x 0 x 0
// 0 0 0 0
// 0 x 0 x
// 0 0 0 0
FfxFloat32x4 v =
SpdReduceIntermediate(FfxUInt32x2(x * 4 + 0 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 2 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 0 + 1, y * 4 + 2), FfxUInt32x2(x * 4 + 2 + 1, y * 4 + 2));
SpdStore(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x, y), v, mip, slice);
// store to LDS
// x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
// ...
// 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
// ...
// 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
// ...
SpdStoreIntermediate(x * 4 + y, y * 4, v);
}
#else
if (localInvocationIndex < 64)
{
FfxFloat32x4 v = SpdLoadIntermediate(x * 2 + y % 2, y * 2);
v = SpdReduceQuad(v);
// quad index 0 stores result
if (localInvocationIndex % 4 == 0)
{
SpdStore(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
SpdStoreIntermediate(x * 2 + y / 2, y * 2, v);
}
}
#endif
}
void SpdDownsampleMip_4(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
if (localInvocationIndex < 4)
{
// x 0 0 0 x 0 0 0
// ...
// 0 x 0 0 0 x 0 0
FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(x * 8 + 0 + 0 + y * 2, y * 8 + 0),
FfxUInt32x2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
FfxUInt32x2(x * 8 + 0 + 1 + y * 2, y * 8 + 4),
FfxUInt32x2(x * 8 + 4 + 1 + y * 2, y * 8 + 4));
SpdStore(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x, y), v, mip, slice);
// store to LDS
// x x x x 0 ...
// 0 ...
SpdStoreIntermediate(x + y * 2, 0, v);
}
#else
if (localInvocationIndex < 16)
{
FfxFloat32x4 v = SpdLoadIntermediate(x * 4 + y, y * 4);
v = SpdReduceQuad(v);
// quad index 0 stores result
if (localInvocationIndex % 4 == 0)
{
SpdStore(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
SpdStoreIntermediate(x / 2 + y, 0, v);
}
}
#endif
}
void SpdDownsampleMip_5(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
if (localInvocationIndex < 1)
{
// x x x x 0 ...
// 0 ...
FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(0, 0), FfxUInt32x2(1, 0), FfxUInt32x2(2, 0), FfxUInt32x2(3, 0));
SpdStore(FfxInt32x2(workGroupID.xy), v, mip, slice);
}
#else
if (localInvocationIndex < 4)
{
FfxFloat32x4 v = SpdLoadIntermediate(localInvocationIndex, 0);
v = SpdReduceQuad(v);
// quad index 0 stores result
if (localInvocationIndex % 4 == 0)
{
SpdStore(FfxInt32x2(workGroupID.xy), v, mip, slice);
}
}
#endif
}
void SpdDownsampleMips_6_7(FfxUInt32 x, FfxUInt32 y, FfxUInt32 mips, FfxUInt32 slice)
{
FfxInt32x2 tex = FfxInt32x2(x * 4 + 0, y * 4 + 0);
FfxInt32x2 pix = FfxInt32x2(x * 2 + 0, y * 2 + 0);
FfxFloat32x4 v0 = SpdReduceLoad4(tex, slice);
SpdStore(pix, v0, 6, slice);
tex = FfxInt32x2(x * 4 + 2, y * 4 + 0);
pix = FfxInt32x2(x * 2 + 1, y * 2 + 0);
FfxFloat32x4 v1 = SpdReduceLoad4(tex, slice);
SpdStore(pix, v1, 6, slice);
tex = FfxInt32x2(x * 4 + 0, y * 4 + 2);
pix = FfxInt32x2(x * 2 + 0, y * 2 + 1);
FfxFloat32x4 v2 = SpdReduceLoad4(tex, slice);
SpdStore(pix, v2, 6, slice);
tex = FfxInt32x2(x * 4 + 2, y * 4 + 2);
pix = FfxInt32x2(x * 2 + 1, y * 2 + 1);
FfxFloat32x4 v3 = SpdReduceLoad4(tex, slice);
SpdStore(pix, v3, 6, slice);
if (mips <= 7)
return;
// no barrier needed, working on values only from the same thread
FfxFloat32x4 v = SpdReduce4(v0, v1, v2, v3);
SpdStore(FfxInt32x2(x, y), v, 7, slice);
SpdStoreIntermediate(x, y, v);
}
void SpdDownsampleNextFour(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 baseMip, FfxUInt32 mips, FfxUInt32 slice)
{
if (mips <= baseMip)
return;
SpdWorkgroupShuffleBarrier();
SpdDownsampleMip_2(x, y, workGroupID, localInvocationIndex, baseMip, slice);
if (mips <= baseMip + 1)
return;
SpdWorkgroupShuffleBarrier();
SpdDownsampleMip_3(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);
if (mips <= baseMip + 2)
return;
SpdWorkgroupShuffleBarrier();
SpdDownsampleMip_4(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);
if (mips <= baseMip + 3)
return;
SpdWorkgroupShuffleBarrier();
SpdDownsampleMip_5(workGroupID, localInvocationIndex, baseMip + 3, slice);
}
void SpdDownsample(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice)
{
FfxUInt32x2 sub_xy = ffxRemapForWaveReduction(localInvocationIndex % 64);
FfxUInt32 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2);
FfxUInt32 y = sub_xy.y + 8 * ((localInvocationIndex >> 7));
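// The remap above arranges the 256 threads as four 8x8 tiles of a 16x16 grid:
// ffxRemapForWaveReduction swizzles indices 0..63 so 2x2 texel quads stay on
// adjacent lanes, bit 6 of the index selects the tile column and bit 7 the
// tile row.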
SpdDownsampleMips_0_1(x, y, workGroupID, localInvocationIndex, mips, slice);
SpdDownsampleNextFour(x, y, workGroupID, localInvocationIndex, 2, mips, slice);
if (mips <= 6)
return;
if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
return;
SpdResetAtomicCounter(slice);
// After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels.
SpdDownsampleMips_6_7(x, y, mips, slice);
SpdDownsampleNextFour(x, y, FfxUInt32x2(0, 0), localInvocationIndex, 8, mips, slice);
}
void SpdDownsample(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice, FfxUInt32x2 workGroupOffset)
{
SpdDownsample(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);
}
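// Illustrative entry-point wiring (binding and constant names are hypothetical;
// see the SPD sample shaders for the real ones):
//
// void main()
// {
//     SpdDownsample(FfxUInt32x2(gl_WorkGroupID.xy), gl_LocalInvocationIndex,
//                   numWorkGroupsAndMips[1], numWorkGroupsAndMips[0],
//                   gl_WorkGroupID.z, workGroupOffset);
// }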
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//==============================================================================================================================
// PACKED VERSION
//==============================================================================================================================
#if FFX_HALF
#ifdef FFX_GLSL
#extension GL_EXT_shader_subgroup_extended_types_float16:require
#endif
FfxFloat16x4 SpdReduceQuadH(FfxFloat16x4 v)
{
#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
FfxFloat16x4 v0 = v;
FfxFloat16x4 v1 = subgroupQuadSwapHorizontal(v);
FfxFloat16x4 v2 = subgroupQuadSwapVertical(v);
FfxFloat16x4 v3 = subgroupQuadSwapDiagonal(v);
return SpdReduce4H(v0, v1, v2, v3);
#elif defined(FFX_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
// requires SM6.0
FfxUInt32 quad = WaveGetLaneIndex() & (~0x3);
FfxFloat16x4 v0 = v;
FfxFloat16x4 v1 = WaveReadLaneAt(v, quad | 1);
FfxFloat16x4 v2 = WaveReadLaneAt(v, quad | 2);
FfxFloat16x4 v3 = WaveReadLaneAt(v, quad | 3);
return SpdReduce4H(v0, v1, v2, v3);
/*
// if SM6.0 is not available, you can use the AMD shader intrinsics
// the AMD shader intrinsics are available in AMD GPU Services (AGS) library:
// https://gpuopen.com/amd-gpu-services-ags-library/
// works for DX11
FfxFloat16x4 v0 = v;
FfxFloat16x4 v1;
v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
FfxFloat16x4 v2;
v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
FfxFloat16x4 v3;
v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
return SpdReduce4H(v0, v1, v2, v3);
*/
#endif
return FfxFloat16x4(0.0, 0.0, 0.0, 0.0);
}
FfxFloat16x4 SpdReduceIntermediateH(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3)
{
FfxFloat16x4 v0 = SpdLoadIntermediateH(i0.x, i0.y);
FfxFloat16x4 v1 = SpdLoadIntermediateH(i1.x, i1.y);
FfxFloat16x4 v2 = SpdLoadIntermediateH(i2.x, i2.y);
FfxFloat16x4 v3 = SpdLoadIntermediateH(i3.x, i3.y);
return SpdReduce4H(v0, v1, v2, v3);
}
FfxFloat16x4 SpdReduceLoad4H(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
FfxFloat16x4 v0 = SpdLoadH(FfxInt32x2(i0), slice);
FfxFloat16x4 v1 = SpdLoadH(FfxInt32x2(i1), slice);
FfxFloat16x4 v2 = SpdLoadH(FfxInt32x2(i2), slice);
FfxFloat16x4 v3 = SpdLoadH(FfxInt32x2(i3), slice);
return SpdReduce4H(v0, v1, v2, v3);
}
FfxFloat16x4 SpdReduceLoad4H(FfxUInt32x2 base, FfxUInt32 slice)
{
return SpdReduceLoad4H(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
}
FfxFloat16x4 SpdReduceLoadSourceImage4H(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
FfxFloat16x4 v0 = SpdLoadSourceImageH(FfxInt32x2(i0), slice);
FfxFloat16x4 v1 = SpdLoadSourceImageH(FfxInt32x2(i1), slice);
FfxFloat16x4 v2 = SpdLoadSourceImageH(FfxInt32x2(i2), slice);
FfxFloat16x4 v3 = SpdLoadSourceImageH(FfxInt32x2(i3), slice);
return SpdReduce4H(v0, v1, v2, v3);
}
FfxFloat16x4 SpdReduceLoadSourceImageH(FfxUInt32x2 base, FfxUInt32 slice)
{
#ifdef SPD_LINEAR_SAMPLER
return SpdLoadSourceImageH(FfxInt32x2(base), slice);
#else
return SpdReduceLoadSourceImage4H(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
#endif
}
void SpdDownsampleMips_0_1_IntrinsicsH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
{
FfxFloat16x4 v[4];
FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
v[0] = SpdReduceLoadSourceImageH(tex, slice);
SpdStoreH(pix, v[0], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
v[1] = SpdReduceLoadSourceImageH(tex, slice);
SpdStoreH(pix, v[1], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
v[2] = SpdReduceLoadSourceImageH(tex, slice);
SpdStoreH(pix, v[2], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
v[3] = SpdReduceLoadSourceImageH(tex, slice);
SpdStoreH(pix, v[3], 0, slice);
if (mips <= 1)
return;
v[0] = SpdReduceQuadH(v[0]);
v[1] = SpdReduceQuadH(v[1]);
v[2] = SpdReduceQuadH(v[2]);
v[3] = SpdReduceQuadH(v[3]);
if ((localInvocationIndex % 4) == 0)
{
SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2), v[0], 1, slice);
SpdStoreIntermediateH(x / 2, y / 2, v[0]);
SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2), v[1], 1, slice);
SpdStoreIntermediateH(x / 2 + 8, y / 2, v[1]);
SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2 + 8), v[2], 1, slice);
SpdStoreIntermediateH(x / 2, y / 2 + 8, v[2]);
SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2 + 8), v[3], 1, slice);
SpdStoreIntermediateH(x / 2 + 8, y / 2 + 8, v[3]);
}
}
void SpdDownsampleMips_0_1_LDSH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
{
FfxFloat16x4 v[4];
FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
v[0] = SpdReduceLoadSourceImageH(tex, slice);
SpdStoreH(pix, v[0], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
v[1] = SpdReduceLoadSourceImageH(tex, slice);
SpdStoreH(pix, v[1], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
v[2] = SpdReduceLoadSourceImageH(tex, slice);
SpdStoreH(pix, v[2], 0, slice);
tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
v[3] = SpdReduceLoadSourceImageH(tex, slice);
SpdStoreH(pix, v[3], 0, slice);
if (mips <= 1)
return;
for (FfxInt32 i = 0; i < 4; i++)
{
SpdStoreIntermediateH(x, y, v[i]);
SpdWorkgroupShuffleBarrier();
if (localInvocationIndex < 64)
{
v[i] = SpdReduceIntermediateH(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
}
SpdWorkgroupShuffleBarrier();
}
if (localInvocationIndex < 64)
{
SpdStoreIntermediateH(x + 0, y + 0, v[0]);
SpdStoreIntermediateH(x + 8, y + 0, v[1]);
SpdStoreIntermediateH(x + 0, y + 8, v[2]);
SpdStoreIntermediateH(x + 8, y + 8, v[3]);
}
}
void SpdDownsampleMips_0_1H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
SpdDownsampleMips_0_1_LDSH(x, y, workGroupID, localInvocationIndex, mips, slice);
#else
SpdDownsampleMips_0_1_IntrinsicsH(x, y, workGroupID, localInvocationIndex, mips, slice);
#endif
}
void SpdDownsampleMip_2H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
if (localInvocationIndex < 64)
{
FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
SpdStoreH(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x, y), v, mip, slice);
// store to LDS, try to reduce bank conflicts
// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
// ...
// x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
SpdStoreIntermediateH(x * 2 + y % 2, y * 2, v);
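// e.g. (x, y) = (3, 0) stores to LDS (6, 0) while (3, 1) stores to (7, 2):
// even rows land in even columns and odd rows in odd columns, matching the
// checkerboard sketched above and spreading accesses across LDS banks.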
}
#else
FfxFloat16x4 v = SpdLoadIntermediateH(x, y);
v = SpdReduceQuadH(v);
// quad index 0 stores result
if (localInvocationIndex % 4 == 0)
{
SpdStoreH(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
SpdStoreIntermediateH(x + (y / 2) % 2, y, v);
}
#endif
}
void SpdDownsampleMip_3H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
if (localInvocationIndex < 16)
{
// x 0 x 0
// 0 0 0 0
// 0 x 0 x
// 0 0 0 0
FfxFloat16x4 v =
SpdReduceIntermediateH(FfxUInt32x2(x * 4 + 0 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 2 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 0 + 1, y * 4 + 2), FfxUInt32x2(x * 4 + 2 + 1, y * 4 + 2));
SpdStoreH(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x, y), v, mip, slice);
// store to LDS
// x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
// ...
// 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
// ...
// 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
// ...
SpdStoreIntermediateH(x * 4 + y, y * 4, v);
}
#else
if (localInvocationIndex < 64)
{
FfxFloat16x4 v = SpdLoadIntermediateH(x * 2 + y % 2, y * 2);
v = SpdReduceQuadH(v);
// quad index 0 stores result
if (localInvocationIndex % 4 == 0)
{
SpdStoreH(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
SpdStoreIntermediateH(x * 2 + y / 2, y * 2, v);
}
}
#endif
}
void SpdDownsampleMip_4H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
if (localInvocationIndex < 4)
{
// x 0 0 0 x 0 0 0
// ...
// 0 x 0 0 0 x 0 0
FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(x * 8 + 0 + 0 + y * 2, y * 8 + 0),
FfxUInt32x2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
FfxUInt32x2(x * 8 + 0 + 1 + y * 2, y * 8 + 4),
FfxUInt32x2(x * 8 + 4 + 1 + y * 2, y * 8 + 4));
SpdStoreH(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x, y), v, mip, slice);
// store to LDS
// x x x x 0 ...
// 0 ...
SpdStoreIntermediateH(x + y * 2, 0, v);
}
#else
if (localInvocationIndex < 16)
{
FfxFloat16x4 v = SpdLoadIntermediateH(x * 4 + y, y * 4);
v = SpdReduceQuadH(v);
// quad index 0 stores result
if (localInvocationIndex % 4 == 0)
{
SpdStoreH(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
SpdStoreIntermediateH(x / 2 + y, 0, v);
}
}
#endif
}
void SpdDownsampleMip_5H(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
if (localInvocationIndex < 1)
{
// x x x x 0 ...
// 0 ...
FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(0, 0), FfxUInt32x2(1, 0), FfxUInt32x2(2, 0), FfxUInt32x2(3, 0));
SpdStoreH(FfxInt32x2(workGroupID.xy), v, mip, slice);
}
#else
if (localInvocationIndex < 4)
{
FfxFloat16x4 v = SpdLoadIntermediateH(localInvocationIndex, 0);
v = SpdReduceQuadH(v);
// quad index 0 stores result
if (localInvocationIndex % 4 == 0)
{
SpdStoreH(FfxInt32x2(workGroupID.xy), v, mip, slice);
}
}
#endif
}
void SpdDownsampleMips_6_7H(FfxUInt32 x, FfxUInt32 y, FfxUInt32 mips, FfxUInt32 slice)
{
FfxInt32x2 tex = FfxInt32x2(x * 4 + 0, y * 4 + 0);
FfxInt32x2 pix = FfxInt32x2(x * 2 + 0, y * 2 + 0);
FfxFloat16x4 v0 = SpdReduceLoad4H(tex, slice);
SpdStoreH(pix, v0, 6, slice);
tex = FfxInt32x2(x * 4 + 2, y * 4 + 0);
pix = FfxInt32x2(x * 2 + 1, y * 2 + 0);
FfxFloat16x4 v1 = SpdReduceLoad4H(tex, slice);
SpdStoreH(pix, v1, 6, slice);
tex = FfxInt32x2(x * 4 + 0, y * 4 + 2);
pix = FfxInt32x2(x * 2 + 0, y * 2 + 1);
FfxFloat16x4 v2 = SpdReduceLoad4H(tex, slice);
SpdStoreH(pix, v2, 6, slice);
tex = FfxInt32x2(x * 4 + 2, y * 4 + 2);
pix = FfxInt32x2(x * 2 + 1, y * 2 + 1);
FfxFloat16x4 v3 = SpdReduceLoad4H(tex, slice);
SpdStoreH(pix, v3, 6, slice);
if (mips < 8)
return;
// no barrier needed, working on values only from the same thread
FfxFloat16x4 v = SpdReduce4H(v0, v1, v2, v3);
SpdStoreH(FfxInt32x2(x, y), v, 7, slice);
SpdStoreIntermediateH(x, y, v);
}
void SpdDownsampleNextFourH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 baseMip, FfxUInt32 mips, FfxUInt32 slice)
{
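// Produces up to four further mips (baseMip .. baseMip + 3) with a workgroup
// barrier between levels: mips 2..5 on the first pass, mips 8..11 once the
// last surviving workgroup has rebuilt mips 6 and 7.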
if (mips <= baseMip)
return;
SpdWorkgroupShuffleBarrier();
SpdDownsampleMip_2H(x, y, workGroupID, localInvocationIndex, baseMip, slice);
if (mips <= baseMip + 1)
return;
SpdWorkgroupShuffleBarrier();
SpdDownsampleMip_3H(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);
if (mips <= baseMip + 2)
return;
SpdWorkgroupShuffleBarrier();
SpdDownsampleMip_4H(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);
if (mips <= baseMip + 3)
return;
SpdWorkgroupShuffleBarrier();
SpdDownsampleMip_5H(workGroupID, localInvocationIndex, baseMip + 3, slice);
}
void SpdDownsampleH(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice)
{
FfxUInt32x2 sub_xy = ffxRemapForWaveReduction(localInvocationIndex % 64);
FfxUInt32 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2);
FfxUInt32 y = sub_xy.y + 8 * ((localInvocationIndex >> 7));
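// ffxRemapForWaveReduction swizzles each group of 64 threads onto an 8x8
// tile so that consecutive quads stay spatially contiguous (threads 0..3 map
// to (0,0), (1,0), (0,1), (1,1)), which the SpdReduceQuadH wave reductions
// rely on; the top two bits of localInvocationIndex then select one of the
// four 8x8 tiles of the workgroup's 16x16 footprint.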
SpdDownsampleMips_0_1H(x, y, workGroupID, localInvocationIndex, mips, slice);
SpdDownsampleNextFourH(x, y, workGroupID, localInvocationIndex, 2, mips, slice);
if (mips < 7)
return;
if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
return;
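// Only the last workgroup to finish mip 6 passes the SpdExitWorkgroup test;
// it resets the global atomic counter so the next dispatch starts from zero.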
SpdResetAtomicCounter(slice);
// After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels.
SpdDownsampleMips_6_7H(x, y, mips, slice);
SpdDownsampleNextFourH(x, y, FfxUInt32x2(0, 0), localInvocationIndex, 8, mips, slice);
}
void SpdDownsampleH(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice, FfxUInt32x2 workGroupOffset)
{
SpdDownsampleH(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);
}
#endif // #if FFX_HALF
#endif // #ifdef FFX_GPU
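// Usage sketch (illustrative, not part of this header): with the SpdLoad*/
// SpdStore*/SpdReduce* callbacks defined by the including shader, the
// half-precision downsampler is typically driven from a 256-thread compute
// entry point, one workgroup per 64x64 source tile, e.g. in HLSL:
//
//     [numthreads(256, 1, 1)]
//     void main(uint3 workGroupID : SV_GroupID, uint localInvocationIndex : SV_GroupIndex)
//     {
//         SpdDownsampleH(workGroupID.xy, localInvocationIndex, mips, numWorkGroups, workGroupID.z);
//     }
//
// where mips and numWorkGroups come from constants filled in on the CPU; the
// binding names here are placeholders, not part of the FFX API.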

32
thirdparty/angle/LICENSE vendored Normal file
View File

@@ -0,0 +1,32 @@
// Copyright 2018 The ANGLE Project Authors.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
//
// Neither the name of TransGaming Inc., Google Inc., 3DLabs Inc.
// Ltd., nor the names of their contributors may be used to endorse
// or promote products derived from this software without specific
// prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

342
thirdparty/angle/include/EGL/egl.h vendored Normal file
View File

@@ -0,0 +1,342 @@
#ifndef __egl_h_
#define __egl_h_ 1
#ifdef __cplusplus
extern "C" {
#endif
/*
** Copyright 2013-2020 The Khronos Group Inc.
** SPDX-License-Identifier: Apache-2.0
**
** This header is generated from the Khronos EGL XML API Registry.
** The current version of the Registry, generator scripts
** used to make the header, and the header can be found at
** http://www.khronos.org/registry/egl
**
** Khronos $Git commit SHA1: 6fb1daea15 $ on $Git commit date: 2022-05-25 09:41:13 -0600 $
*/
#include <EGL/eglplatform.h>
#ifndef EGL_EGL_PROTOTYPES
#define EGL_EGL_PROTOTYPES 1
#endif
/* Generated on date 20220525 */
/* Generated C header for:
* API: egl
* Versions considered: .*
* Versions emitted: .*
* Default extensions included: None
* Additional extensions included: _nomatch_^
* Extensions removed: _nomatch_^
*/
#ifndef EGL_VERSION_1_0
#define EGL_VERSION_1_0 1
typedef unsigned int EGLBoolean;
typedef void *EGLDisplay;
#include <KHR/khrplatform.h>
#include <EGL/eglplatform.h>
typedef void *EGLConfig;
typedef void *EGLSurface;
typedef void *EGLContext;
typedef void (*__eglMustCastToProperFunctionPointerType)(void);
#define EGL_ALPHA_SIZE 0x3021
#define EGL_BAD_ACCESS 0x3002
#define EGL_BAD_ALLOC 0x3003
#define EGL_BAD_ATTRIBUTE 0x3004
#define EGL_BAD_CONFIG 0x3005
#define EGL_BAD_CONTEXT 0x3006
#define EGL_BAD_CURRENT_SURFACE 0x3007
#define EGL_BAD_DISPLAY 0x3008
#define EGL_BAD_MATCH 0x3009
#define EGL_BAD_NATIVE_PIXMAP 0x300A
#define EGL_BAD_NATIVE_WINDOW 0x300B
#define EGL_BAD_PARAMETER 0x300C
#define EGL_BAD_SURFACE 0x300D
#define EGL_BLUE_SIZE 0x3022
#define EGL_BUFFER_SIZE 0x3020
#define EGL_CONFIG_CAVEAT 0x3027
#define EGL_CONFIG_ID 0x3028
#define EGL_CORE_NATIVE_ENGINE 0x305B
#define EGL_DEPTH_SIZE 0x3025
#define EGL_DONT_CARE EGL_CAST(EGLint,-1)
#define EGL_DRAW 0x3059
#define EGL_EXTENSIONS 0x3055
#define EGL_FALSE 0
#define EGL_GREEN_SIZE 0x3023
#define EGL_HEIGHT 0x3056
#define EGL_LARGEST_PBUFFER 0x3058
#define EGL_LEVEL 0x3029
#define EGL_MAX_PBUFFER_HEIGHT 0x302A
#define EGL_MAX_PBUFFER_PIXELS 0x302B
#define EGL_MAX_PBUFFER_WIDTH 0x302C
#define EGL_NATIVE_RENDERABLE 0x302D
#define EGL_NATIVE_VISUAL_ID 0x302E
#define EGL_NATIVE_VISUAL_TYPE 0x302F
#define EGL_NONE 0x3038
#define EGL_NON_CONFORMANT_CONFIG 0x3051
#define EGL_NOT_INITIALIZED 0x3001
#define EGL_NO_CONTEXT EGL_CAST(EGLContext,0)
#define EGL_NO_DISPLAY EGL_CAST(EGLDisplay,0)
#define EGL_NO_SURFACE EGL_CAST(EGLSurface,0)
#define EGL_PBUFFER_BIT 0x0001
#define EGL_PIXMAP_BIT 0x0002
#define EGL_READ 0x305A
#define EGL_RED_SIZE 0x3024
#define EGL_SAMPLES 0x3031
#define EGL_SAMPLE_BUFFERS 0x3032
#define EGL_SLOW_CONFIG 0x3050
#define EGL_STENCIL_SIZE 0x3026
#define EGL_SUCCESS 0x3000
#define EGL_SURFACE_TYPE 0x3033
#define EGL_TRANSPARENT_BLUE_VALUE 0x3035
#define EGL_TRANSPARENT_GREEN_VALUE 0x3036
#define EGL_TRANSPARENT_RED_VALUE 0x3037
#define EGL_TRANSPARENT_RGB 0x3052
#define EGL_TRANSPARENT_TYPE 0x3034
#define EGL_TRUE 1
#define EGL_VENDOR 0x3053
#define EGL_VERSION 0x3054
#define EGL_WIDTH 0x3057
#define EGL_WINDOW_BIT 0x0004
typedef EGLBoolean (EGLAPIENTRYP PFNEGLCHOOSECONFIGPROC) (EGLDisplay dpy, const EGLint *attrib_list, EGLConfig *configs, EGLint config_size, EGLint *num_config);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLCOPYBUFFERSPROC) (EGLDisplay dpy, EGLSurface surface, EGLNativePixmapType target);
typedef EGLContext (EGLAPIENTRYP PFNEGLCREATECONTEXTPROC) (EGLDisplay dpy, EGLConfig config, EGLContext share_context, const EGLint *attrib_list);
typedef EGLSurface (EGLAPIENTRYP PFNEGLCREATEPBUFFERSURFACEPROC) (EGLDisplay dpy, EGLConfig config, const EGLint *attrib_list);
typedef EGLSurface (EGLAPIENTRYP PFNEGLCREATEPIXMAPSURFACEPROC) (EGLDisplay dpy, EGLConfig config, EGLNativePixmapType pixmap, const EGLint *attrib_list);
typedef EGLSurface (EGLAPIENTRYP PFNEGLCREATEWINDOWSURFACEPROC) (EGLDisplay dpy, EGLConfig config, EGLNativeWindowType win, const EGLint *attrib_list);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLDESTROYCONTEXTPROC) (EGLDisplay dpy, EGLContext ctx);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLDESTROYSURFACEPROC) (EGLDisplay dpy, EGLSurface surface);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLGETCONFIGATTRIBPROC) (EGLDisplay dpy, EGLConfig config, EGLint attribute, EGLint *value);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLGETCONFIGSPROC) (EGLDisplay dpy, EGLConfig *configs, EGLint config_size, EGLint *num_config);
typedef EGLDisplay (EGLAPIENTRYP PFNEGLGETCURRENTDISPLAYPROC) (void);
typedef EGLSurface (EGLAPIENTRYP PFNEGLGETCURRENTSURFACEPROC) (EGLint readdraw);
typedef EGLDisplay (EGLAPIENTRYP PFNEGLGETDISPLAYPROC) (EGLNativeDisplayType display_id);
typedef EGLint (EGLAPIENTRYP PFNEGLGETERRORPROC) (void);
typedef __eglMustCastToProperFunctionPointerType (EGLAPIENTRYP PFNEGLGETPROCADDRESSPROC) (const char *procname);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLINITIALIZEPROC) (EGLDisplay dpy, EGLint *major, EGLint *minor);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLMAKECURRENTPROC) (EGLDisplay dpy, EGLSurface draw, EGLSurface read, EGLContext ctx);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYCONTEXTPROC) (EGLDisplay dpy, EGLContext ctx, EGLint attribute, EGLint *value);
typedef const char *(EGLAPIENTRYP PFNEGLQUERYSTRINGPROC) (EGLDisplay dpy, EGLint name);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYSURFACEPROC) (EGLDisplay dpy, EGLSurface surface, EGLint attribute, EGLint *value);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLSWAPBUFFERSPROC) (EGLDisplay dpy, EGLSurface surface);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLTERMINATEPROC) (EGLDisplay dpy);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLWAITGLPROC) (void);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLWAITNATIVEPROC) (EGLint engine);
#if EGL_EGL_PROTOTYPES
EGLAPI EGLBoolean EGLAPIENTRY eglChooseConfig (EGLDisplay dpy, const EGLint *attrib_list, EGLConfig *configs, EGLint config_size, EGLint *num_config);
EGLAPI EGLBoolean EGLAPIENTRY eglCopyBuffers (EGLDisplay dpy, EGLSurface surface, EGLNativePixmapType target);
EGLAPI EGLContext EGLAPIENTRY eglCreateContext (EGLDisplay dpy, EGLConfig config, EGLContext share_context, const EGLint *attrib_list);
EGLAPI EGLSurface EGLAPIENTRY eglCreatePbufferSurface (EGLDisplay dpy, EGLConfig config, const EGLint *attrib_list);
EGLAPI EGLSurface EGLAPIENTRY eglCreatePixmapSurface (EGLDisplay dpy, EGLConfig config, EGLNativePixmapType pixmap, const EGLint *attrib_list);
EGLAPI EGLSurface EGLAPIENTRY eglCreateWindowSurface (EGLDisplay dpy, EGLConfig config, EGLNativeWindowType win, const EGLint *attrib_list);
EGLAPI EGLBoolean EGLAPIENTRY eglDestroyContext (EGLDisplay dpy, EGLContext ctx);
EGLAPI EGLBoolean EGLAPIENTRY eglDestroySurface (EGLDisplay dpy, EGLSurface surface);
EGLAPI EGLBoolean EGLAPIENTRY eglGetConfigAttrib (EGLDisplay dpy, EGLConfig config, EGLint attribute, EGLint *value);
EGLAPI EGLBoolean EGLAPIENTRY eglGetConfigs (EGLDisplay dpy, EGLConfig *configs, EGLint config_size, EGLint *num_config);
EGLAPI EGLDisplay EGLAPIENTRY eglGetCurrentDisplay (void);
EGLAPI EGLSurface EGLAPIENTRY eglGetCurrentSurface (EGLint readdraw);
EGLAPI EGLDisplay EGLAPIENTRY eglGetDisplay (EGLNativeDisplayType display_id);
EGLAPI EGLint EGLAPIENTRY eglGetError (void);
EGLAPI __eglMustCastToProperFunctionPointerType EGLAPIENTRY eglGetProcAddress (const char *procname);
EGLAPI EGLBoolean EGLAPIENTRY eglInitialize (EGLDisplay dpy, EGLint *major, EGLint *minor);
EGLAPI EGLBoolean EGLAPIENTRY eglMakeCurrent (EGLDisplay dpy, EGLSurface draw, EGLSurface read, EGLContext ctx);
EGLAPI EGLBoolean EGLAPIENTRY eglQueryContext (EGLDisplay dpy, EGLContext ctx, EGLint attribute, EGLint *value);
EGLAPI const char *EGLAPIENTRY eglQueryString (EGLDisplay dpy, EGLint name);
EGLAPI EGLBoolean EGLAPIENTRY eglQuerySurface (EGLDisplay dpy, EGLSurface surface, EGLint attribute, EGLint *value);
EGLAPI EGLBoolean EGLAPIENTRY eglSwapBuffers (EGLDisplay dpy, EGLSurface surface);
EGLAPI EGLBoolean EGLAPIENTRY eglTerminate (EGLDisplay dpy);
EGLAPI EGLBoolean EGLAPIENTRY eglWaitGL (void);
EGLAPI EGLBoolean EGLAPIENTRY eglWaitNative (EGLint engine);
#endif
#endif /* EGL_VERSION_1_0 */
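/* Illustrative only (not part of the Khronos header): a minimal EGL 1.0
 * bring-up using just the entry points declared above. native_display and
 * native_window stand for whatever the windowing system provides; error
 * handling is omitted.
 *
 *     EGLDisplay dpy = eglGetDisplay(native_display);
 *     EGLint major, minor;
 *     eglInitialize(dpy, &major, &minor);
 *
 *     const EGLint attribs[] = { EGL_RED_SIZE, 8, EGL_GREEN_SIZE, 8,
 *                                EGL_BLUE_SIZE, 8, EGL_SURFACE_TYPE,
 *                                EGL_WINDOW_BIT, EGL_NONE };
 *     EGLConfig config;
 *     EGLint num_config;
 *     eglChooseConfig(dpy, attribs, &config, 1, &num_config);
 *
 *     EGLSurface surf = eglCreateWindowSurface(dpy, config, native_window, NULL);
 *     EGLContext ctx = eglCreateContext(dpy, config, EGL_NO_CONTEXT, NULL);
 *     eglMakeCurrent(dpy, surf, surf, ctx);
 *     ... draw ...
 *     eglSwapBuffers(dpy, surf);
 */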
#ifndef EGL_VERSION_1_1
#define EGL_VERSION_1_1 1
#define EGL_BACK_BUFFER 0x3084
#define EGL_BIND_TO_TEXTURE_RGB 0x3039
#define EGL_BIND_TO_TEXTURE_RGBA 0x303A
#define EGL_CONTEXT_LOST 0x300E
#define EGL_MIN_SWAP_INTERVAL 0x303B
#define EGL_MAX_SWAP_INTERVAL 0x303C
#define EGL_MIPMAP_TEXTURE 0x3082
#define EGL_MIPMAP_LEVEL 0x3083
#define EGL_NO_TEXTURE 0x305C
#define EGL_TEXTURE_2D 0x305F
#define EGL_TEXTURE_FORMAT 0x3080
#define EGL_TEXTURE_RGB 0x305D
#define EGL_TEXTURE_RGBA 0x305E
#define EGL_TEXTURE_TARGET 0x3081
typedef EGLBoolean (EGLAPIENTRYP PFNEGLBINDTEXIMAGEPROC) (EGLDisplay dpy, EGLSurface surface, EGLint buffer);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLRELEASETEXIMAGEPROC) (EGLDisplay dpy, EGLSurface surface, EGLint buffer);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLSURFACEATTRIBPROC) (EGLDisplay dpy, EGLSurface surface, EGLint attribute, EGLint value);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLSWAPINTERVALPROC) (EGLDisplay dpy, EGLint interval);
#if EGL_EGL_PROTOTYPES
EGLAPI EGLBoolean EGLAPIENTRY eglBindTexImage (EGLDisplay dpy, EGLSurface surface, EGLint buffer);
EGLAPI EGLBoolean EGLAPIENTRY eglReleaseTexImage (EGLDisplay dpy, EGLSurface surface, EGLint buffer);
EGLAPI EGLBoolean EGLAPIENTRY eglSurfaceAttrib (EGLDisplay dpy, EGLSurface surface, EGLint attribute, EGLint value);
EGLAPI EGLBoolean EGLAPIENTRY eglSwapInterval (EGLDisplay dpy, EGLint interval);
#endif
#endif /* EGL_VERSION_1_1 */
#ifndef EGL_VERSION_1_2
#define EGL_VERSION_1_2 1
typedef unsigned int EGLenum;
typedef void *EGLClientBuffer;
#define EGL_ALPHA_FORMAT 0x3088
#define EGL_ALPHA_FORMAT_NONPRE 0x308B
#define EGL_ALPHA_FORMAT_PRE 0x308C
#define EGL_ALPHA_MASK_SIZE 0x303E
#define EGL_BUFFER_PRESERVED 0x3094
#define EGL_BUFFER_DESTROYED 0x3095
#define EGL_CLIENT_APIS 0x308D
#define EGL_COLORSPACE 0x3087
#define EGL_COLORSPACE_sRGB 0x3089
#define EGL_COLORSPACE_LINEAR 0x308A
#define EGL_COLOR_BUFFER_TYPE 0x303F
#define EGL_CONTEXT_CLIENT_TYPE 0x3097
#define EGL_DISPLAY_SCALING 10000
#define EGL_HORIZONTAL_RESOLUTION 0x3090
#define EGL_LUMINANCE_BUFFER 0x308F
#define EGL_LUMINANCE_SIZE 0x303D
#define EGL_OPENGL_ES_BIT 0x0001
#define EGL_OPENVG_BIT 0x0002
#define EGL_OPENGL_ES_API 0x30A0
#define EGL_OPENVG_API 0x30A1
#define EGL_OPENVG_IMAGE 0x3096
#define EGL_PIXEL_ASPECT_RATIO 0x3092
#define EGL_RENDERABLE_TYPE 0x3040
#define EGL_RENDER_BUFFER 0x3086
#define EGL_RGB_BUFFER 0x308E
#define EGL_SINGLE_BUFFER 0x3085
#define EGL_SWAP_BEHAVIOR 0x3093
#define EGL_UNKNOWN EGL_CAST(EGLint,-1)
#define EGL_VERTICAL_RESOLUTION 0x3091
typedef EGLBoolean (EGLAPIENTRYP PFNEGLBINDAPIPROC) (EGLenum api);
typedef EGLenum (EGLAPIENTRYP PFNEGLQUERYAPIPROC) (void);
typedef EGLSurface (EGLAPIENTRYP PFNEGLCREATEPBUFFERFROMCLIENTBUFFERPROC) (EGLDisplay dpy, EGLenum buftype, EGLClientBuffer buffer, EGLConfig config, const EGLint *attrib_list);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLRELEASETHREADPROC) (void);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLWAITCLIENTPROC) (void);
#if EGL_EGL_PROTOTYPES
EGLAPI EGLBoolean EGLAPIENTRY eglBindAPI (EGLenum api);
EGLAPI EGLenum EGLAPIENTRY eglQueryAPI (void);
EGLAPI EGLSurface EGLAPIENTRY eglCreatePbufferFromClientBuffer (EGLDisplay dpy, EGLenum buftype, EGLClientBuffer buffer, EGLConfig config, const EGLint *attrib_list);
EGLAPI EGLBoolean EGLAPIENTRY eglReleaseThread (void);
EGLAPI EGLBoolean EGLAPIENTRY eglWaitClient (void);
#endif
#endif /* EGL_VERSION_1_2 */
#ifndef EGL_VERSION_1_3
#define EGL_VERSION_1_3 1
#define EGL_CONFORMANT 0x3042
#define EGL_CONTEXT_CLIENT_VERSION 0x3098
#define EGL_MATCH_NATIVE_PIXMAP 0x3041
#define EGL_OPENGL_ES2_BIT 0x0004
#define EGL_VG_ALPHA_FORMAT 0x3088
#define EGL_VG_ALPHA_FORMAT_NONPRE 0x308B
#define EGL_VG_ALPHA_FORMAT_PRE 0x308C
#define EGL_VG_ALPHA_FORMAT_PRE_BIT 0x0040
#define EGL_VG_COLORSPACE 0x3087
#define EGL_VG_COLORSPACE_sRGB 0x3089
#define EGL_VG_COLORSPACE_LINEAR 0x308A
#define EGL_VG_COLORSPACE_LINEAR_BIT 0x0020
#endif /* EGL_VERSION_1_3 */
#ifndef EGL_VERSION_1_4
#define EGL_VERSION_1_4 1
#define EGL_DEFAULT_DISPLAY EGL_CAST(EGLNativeDisplayType,0)
#define EGL_MULTISAMPLE_RESOLVE_BOX_BIT 0x0200
#define EGL_MULTISAMPLE_RESOLVE 0x3099
#define EGL_MULTISAMPLE_RESOLVE_DEFAULT 0x309A
#define EGL_MULTISAMPLE_RESOLVE_BOX 0x309B
#define EGL_OPENGL_API 0x30A2
#define EGL_OPENGL_BIT 0x0008
#define EGL_SWAP_BEHAVIOR_PRESERVED_BIT 0x0400
typedef EGLContext (EGLAPIENTRYP PFNEGLGETCURRENTCONTEXTPROC) (void);
#if EGL_EGL_PROTOTYPES
EGLAPI EGLContext EGLAPIENTRY eglGetCurrentContext (void);
#endif
#endif /* EGL_VERSION_1_4 */
#ifndef EGL_VERSION_1_5
#define EGL_VERSION_1_5 1
typedef void *EGLSync;
typedef intptr_t EGLAttrib;
typedef khronos_utime_nanoseconds_t EGLTime;
typedef void *EGLImage;
#define EGL_CONTEXT_MAJOR_VERSION 0x3098
#define EGL_CONTEXT_MINOR_VERSION 0x30FB
#define EGL_CONTEXT_OPENGL_PROFILE_MASK 0x30FD
#define EGL_CONTEXT_OPENGL_RESET_NOTIFICATION_STRATEGY 0x31BD
#define EGL_NO_RESET_NOTIFICATION 0x31BE
#define EGL_LOSE_CONTEXT_ON_RESET 0x31BF
#define EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT 0x00000001
#define EGL_CONTEXT_OPENGL_COMPATIBILITY_PROFILE_BIT 0x00000002
#define EGL_CONTEXT_OPENGL_DEBUG 0x31B0
#define EGL_CONTEXT_OPENGL_FORWARD_COMPATIBLE 0x31B1
#define EGL_CONTEXT_OPENGL_ROBUST_ACCESS 0x31B2
#define EGL_OPENGL_ES3_BIT 0x00000040
#define EGL_CL_EVENT_HANDLE 0x309C
#define EGL_SYNC_CL_EVENT 0x30FE
#define EGL_SYNC_CL_EVENT_COMPLETE 0x30FF
#define EGL_SYNC_PRIOR_COMMANDS_COMPLETE 0x30F0
#define EGL_SYNC_TYPE 0x30F7
#define EGL_SYNC_STATUS 0x30F1
#define EGL_SYNC_CONDITION 0x30F8
#define EGL_SIGNALED 0x30F2
#define EGL_UNSIGNALED 0x30F3
#define EGL_SYNC_FLUSH_COMMANDS_BIT 0x0001
#define EGL_FOREVER 0xFFFFFFFFFFFFFFFFull
#define EGL_TIMEOUT_EXPIRED 0x30F5
#define EGL_CONDITION_SATISFIED 0x30F6
#define EGL_NO_SYNC EGL_CAST(EGLSync,0)
#define EGL_SYNC_FENCE 0x30F9
#define EGL_GL_COLORSPACE 0x309D
#define EGL_GL_COLORSPACE_SRGB 0x3089
#define EGL_GL_COLORSPACE_LINEAR 0x308A
#define EGL_GL_RENDERBUFFER 0x30B9
#define EGL_GL_TEXTURE_2D 0x30B1
#define EGL_GL_TEXTURE_LEVEL 0x30BC
#define EGL_GL_TEXTURE_3D 0x30B2
#define EGL_GL_TEXTURE_ZOFFSET 0x30BD
#define EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_X 0x30B3
#define EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_X 0x30B4
#define EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Y 0x30B5
#define EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Y 0x30B6
#define EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Z 0x30B7
#define EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Z 0x30B8
#define EGL_IMAGE_PRESERVED 0x30D2
#define EGL_NO_IMAGE EGL_CAST(EGLImage,0)
typedef EGLSync (EGLAPIENTRYP PFNEGLCREATESYNCPROC) (EGLDisplay dpy, EGLenum type, const EGLAttrib *attrib_list);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLDESTROYSYNCPROC) (EGLDisplay dpy, EGLSync sync);
typedef EGLint (EGLAPIENTRYP PFNEGLCLIENTWAITSYNCPROC) (EGLDisplay dpy, EGLSync sync, EGLint flags, EGLTime timeout);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLGETSYNCATTRIBPROC) (EGLDisplay dpy, EGLSync sync, EGLint attribute, EGLAttrib *value);
typedef EGLImage (EGLAPIENTRYP PFNEGLCREATEIMAGEPROC) (EGLDisplay dpy, EGLContext ctx, EGLenum target, EGLClientBuffer buffer, const EGLAttrib *attrib_list);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLDESTROYIMAGEPROC) (EGLDisplay dpy, EGLImage image);
typedef EGLDisplay (EGLAPIENTRYP PFNEGLGETPLATFORMDISPLAYPROC) (EGLenum platform, void *native_display, const EGLAttrib *attrib_list);
typedef EGLSurface (EGLAPIENTRYP PFNEGLCREATEPLATFORMWINDOWSURFACEPROC) (EGLDisplay dpy, EGLConfig config, void *native_window, const EGLAttrib *attrib_list);
typedef EGLSurface (EGLAPIENTRYP PFNEGLCREATEPLATFORMPIXMAPSURFACEPROC) (EGLDisplay dpy, EGLConfig config, void *native_pixmap, const EGLAttrib *attrib_list);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLWAITSYNCPROC) (EGLDisplay dpy, EGLSync sync, EGLint flags);
#if EGL_EGL_PROTOTYPES
EGLAPI EGLSync EGLAPIENTRY eglCreateSync (EGLDisplay dpy, EGLenum type, const EGLAttrib *attrib_list);
EGLAPI EGLBoolean EGLAPIENTRY eglDestroySync (EGLDisplay dpy, EGLSync sync);
EGLAPI EGLint EGLAPIENTRY eglClientWaitSync (EGLDisplay dpy, EGLSync sync, EGLint flags, EGLTime timeout);
EGLAPI EGLBoolean EGLAPIENTRY eglGetSyncAttrib (EGLDisplay dpy, EGLSync sync, EGLint attribute, EGLAttrib *value);
EGLAPI EGLImage EGLAPIENTRY eglCreateImage (EGLDisplay dpy, EGLContext ctx, EGLenum target, EGLClientBuffer buffer, const EGLAttrib *attrib_list);
EGLAPI EGLBoolean EGLAPIENTRY eglDestroyImage (EGLDisplay dpy, EGLImage image);
EGLAPI EGLDisplay EGLAPIENTRY eglGetPlatformDisplay (EGLenum platform, void *native_display, const EGLAttrib *attrib_list);
EGLAPI EGLSurface EGLAPIENTRY eglCreatePlatformWindowSurface (EGLDisplay dpy, EGLConfig config, void *native_window, const EGLAttrib *attrib_list);
EGLAPI EGLSurface EGLAPIENTRY eglCreatePlatformPixmapSurface (EGLDisplay dpy, EGLConfig config, void *native_pixmap, const EGLAttrib *attrib_list);
EGLAPI EGLBoolean EGLAPIENTRY eglWaitSync (EGLDisplay dpy, EGLSync sync, EGLint flags);
#endif
#endif /* EGL_VERSION_1_5 */
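/* Illustrative only: the EGL 1.5 fence-sync flow using the declarations
 * above. EGL_SYNC_FENCE takes no attributes; the wait flushes pending
 * commands and blocks until the fence signals or the timeout expires.
 *
 *     EGLSync sync = eglCreateSync(dpy, EGL_SYNC_FENCE, NULL);
 *     EGLint r = eglClientWaitSync(dpy, sync, EGL_SYNC_FLUSH_COMMANDS_BIT, EGL_FOREVER);
 *     eglDestroySync(dpy, sync);
 *
 * where r is EGL_CONDITION_SATISFIED on success or EGL_TIMEOUT_EXPIRED on
 * timeout.
 */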
#ifdef __cplusplus
}
#endif
#endif

1486
thirdparty/angle/include/EGL/eglext.h vendored Normal file

File diff suppressed because it is too large Load Diff

428
thirdparty/angle/include/EGL/eglext_angle.h vendored Normal file
View File

@@ -0,0 +1,428 @@
//
// Copyright 2017 The ANGLE Project Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// eglext_angle.h: ANGLE modifications to the eglext.h header file.
// Currently we don't include this file directly; instead, we patch eglext.h
// to include it implicitly so it is visible throughout our code.
#ifndef INCLUDE_EGL_EGLEXT_ANGLE_
#define INCLUDE_EGL_EGLEXT_ANGLE_
// clang-format off
#ifndef EGL_ANGLE_robust_resource_initialization
#define EGL_ANGLE_robust_resource_initialization 1
#define EGL_ROBUST_RESOURCE_INITIALIZATION_ANGLE 0x3453
#endif /* EGL_ANGLE_robust_resource_initialization */
#ifndef EGL_ANGLE_keyed_mutex
#define EGL_ANGLE_keyed_mutex 1
#define EGL_DXGI_KEYED_MUTEX_ANGLE 0x33A2
#endif /* EGL_ANGLE_keyed_mutex */
#ifndef EGL_ANGLE_d3d_texture_client_buffer
#define EGL_ANGLE_d3d_texture_client_buffer 1
#define EGL_D3D_TEXTURE_ANGLE 0x33A3
#define EGL_TEXTURE_OFFSET_X_ANGLE 0x3490
#define EGL_TEXTURE_OFFSET_Y_ANGLE 0x3491
#define EGL_D3D11_TEXTURE_PLANE_ANGLE 0x3492
#define EGL_D3D11_TEXTURE_ARRAY_SLICE_ANGLE 0x3493
#endif /* EGL_ANGLE_d3d_texture_client_buffer */
#ifndef EGL_ANGLE_software_display
#define EGL_ANGLE_software_display 1
#define EGL_SOFTWARE_DISPLAY_ANGLE ((EGLNativeDisplayType)-1)
#endif /* EGL_ANGLE_software_display */
#ifndef EGL_ANGLE_direct3d_display
#define EGL_ANGLE_direct3d_display 1
#define EGL_D3D11_ELSE_D3D9_DISPLAY_ANGLE ((EGLNativeDisplayType)-2)
#define EGL_D3D11_ONLY_DISPLAY_ANGLE ((EGLNativeDisplayType)-3)
#endif /* EGL_ANGLE_direct3d_display */
#ifndef EGL_ANGLE_direct_composition
#define EGL_ANGLE_direct_composition 1
#define EGL_DIRECT_COMPOSITION_ANGLE 0x33A5
#endif /* EGL_ANGLE_direct_composition */
#ifndef EGL_ANGLE_platform_angle
#define EGL_ANGLE_platform_angle 1
#define EGL_PLATFORM_ANGLE_ANGLE 0x3202
#define EGL_PLATFORM_ANGLE_TYPE_ANGLE 0x3203
#define EGL_PLATFORM_ANGLE_MAX_VERSION_MAJOR_ANGLE 0x3204
#define EGL_PLATFORM_ANGLE_MAX_VERSION_MINOR_ANGLE 0x3205
#define EGL_PLATFORM_ANGLE_TYPE_DEFAULT_ANGLE 0x3206
#define EGL_PLATFORM_ANGLE_DEBUG_LAYERS_ENABLED_ANGLE 0x3451
#define EGL_PLATFORM_ANGLE_DEVICE_TYPE_ANGLE 0x3209
#define EGL_PLATFORM_ANGLE_DEVICE_TYPE_HARDWARE_ANGLE 0x320A
#define EGL_PLATFORM_ANGLE_DEVICE_TYPE_NULL_ANGLE 0x345E
#define EGL_PLATFORM_ANGLE_NATIVE_PLATFORM_TYPE_ANGLE 0x348F
#endif /* EGL_ANGLE_platform_angle */
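// Illustrative only (not part of the ANGLE header): an application selects an
// ANGLE backend by passing these attributes to the EGL 1.5 (or
// EGL_EXT_platform_base) display query, e.g.:
//
//     EGLAttrib attribs[] = { EGL_PLATFORM_ANGLE_TYPE_ANGLE,
//                             EGL_PLATFORM_ANGLE_TYPE_DEFAULT_ANGLE,
//                             EGL_NONE };
//     EGLDisplay dpy = eglGetPlatformDisplay(EGL_PLATFORM_ANGLE_ANGLE,
//                                            EGL_DEFAULT_DISPLAY, attribs);
//
// Backend-specific type values (D3D11, Vulkan, Metal, ...) are defined in the
// extension blocks below.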
#ifndef EGL_ANGLE_platform_angle_d3d
#define EGL_ANGLE_platform_angle_d3d 1
#define EGL_PLATFORM_ANGLE_TYPE_D3D9_ANGLE 0x3207
#define EGL_PLATFORM_ANGLE_TYPE_D3D11_ANGLE 0x3208
#define EGL_PLATFORM_ANGLE_DEVICE_TYPE_D3D_WARP_ANGLE 0x320B
#define EGL_PLATFORM_ANGLE_DEVICE_TYPE_D3D_REFERENCE_ANGLE 0x320C
#define EGL_PLATFORM_ANGLE_ENABLE_AUTOMATIC_TRIM_ANGLE 0x320F
#endif /* EGL_ANGLE_platform_angle_d3d */
#ifndef EGL_ANGLE_platform_angle_d3d_luid
#define EGL_ANGLE_platform_angle_d3d_luid 1
#define EGL_PLATFORM_ANGLE_D3D_LUID_HIGH_ANGLE 0x34A0
#define EGL_PLATFORM_ANGLE_D3D_LUID_LOW_ANGLE 0x34A1
#endif /* EGL_ANGLE_platform_angle_d3d_luid */
#ifndef EGL_ANGLE_platform_angle_d3d11on12
#define EGL_ANGLE_platform_angle_d3d11on12 1
#define EGL_PLATFORM_ANGLE_D3D11ON12_ANGLE 0x3488
#endif /* EGL_ANGLE_platform_angle_d3d11on12 */
#ifndef EGL_ANGLE_platform_angle_opengl
#define EGL_ANGLE_platform_angle_opengl 1
#define EGL_PLATFORM_ANGLE_TYPE_OPENGL_ANGLE 0x320D
#define EGL_PLATFORM_ANGLE_TYPE_OPENGLES_ANGLE 0x320E
#define EGL_PLATFORM_ANGLE_EGL_HANDLE_ANGLE 0x3480
#endif /* EGL_ANGLE_platform_angle_opengl */
#ifndef EGL_ANGLE_platform_angle_null
#define EGL_ANGLE_platform_angle_null 1
#define EGL_PLATFORM_ANGLE_TYPE_NULL_ANGLE 0x33AE
#endif /* EGL_ANGLE_platform_angle_null */
#ifndef EGL_ANGLE_platform_angle_vulkan
#define EGL_ANGLE_platform_angle_vulkan 1
#define EGL_PLATFORM_ANGLE_TYPE_VULKAN_ANGLE 0x3450
#define EGL_PLATFORM_VULKAN_DISPLAY_MODE_SIMPLE_ANGLE 0x34A4
#define EGL_PLATFORM_VULKAN_DISPLAY_MODE_HEADLESS_ANGLE 0x34A5
#endif /* EGL_ANGLE_platform_angle_vulkan */
#ifndef EGL_ANGLE_platform_angle_metal
#define EGL_ANGLE_platform_angle_metal 1
#define EGL_PLATFORM_ANGLE_TYPE_METAL_ANGLE 0x3489
#endif /* EGL_ANGLE_platform_angle_metal */
#ifndef EGL_ANGLE_platform_angle_device_type_swiftshader
#define EGL_ANGLE_platform_angle_device_type_swiftshader
#define EGL_PLATFORM_ANGLE_DEVICE_TYPE_SWIFTSHADER_ANGLE 0x3487
#endif /* EGL_ANGLE_platform_angle_device_type_swiftshader */
#ifndef EGL_ANGLE_platform_angle_device_type_egl_angle
#define EGL_ANGLE_platform_angle_device_type_egl_angle
#define EGL_PLATFORM_ANGLE_DEVICE_TYPE_EGL_ANGLE 0x348E
#endif /* EGL_ANGLE_platform_angle_device_type_egl_angle */
#ifndef EGL_ANGLE_context_virtualization
#define EGL_ANGLE_context_virtualization 1
#define EGL_CONTEXT_VIRTUALIZATION_GROUP_ANGLE 0x3481
#endif /* EGL_ANGLE_context_virtualization */
#ifndef EGL_ANGLE_platform_angle_device_context_volatile_eagl
#define EGL_ANGLE_platform_angle_device_context_volatile_eagl 1
#define EGL_PLATFORM_ANGLE_DEVICE_CONTEXT_VOLATILE_EAGL_ANGLE 0x34A2
#endif /* EGL_ANGLE_platform_angle_device_context_volatile_eagl */
#ifndef EGL_ANGLE_platform_angle_device_context_volatile_cgl
#define EGL_ANGLE_platform_angle_device_context_volatile_cgl 1
#define EGL_PLATFORM_ANGLE_DEVICE_CONTEXT_VOLATILE_CGL_ANGLE 0x34A3
#endif /* EGL_ANGLE_platform_angle_device_context_volatile_cgl */
#ifndef EGL_ANGLE_platform_angle_device_id
#define EGL_ANGLE_platform_angle_device_id
#define EGL_PLATFORM_ANGLE_DEVICE_ID_HIGH_ANGLE 0x34D6
#define EGL_PLATFORM_ANGLE_DEVICE_ID_LOW_ANGLE 0x34D7
#define EGL_PLATFORM_ANGLE_DISPLAY_KEY_ANGLE 0x34DC
#endif /* EGL_ANGLE_platform_angle_device_id */
#ifndef EGL_ANGLE_x11_visual
#define EGL_ANGLE_x11_visual
#define EGL_X11_VISUAL_ID_ANGLE 0x33A3
#endif /* EGL_ANGLE_x11_visual */
#ifndef EGL_ANGLE_surface_orientation
#define EGL_ANGLE_surface_orientation
#define EGL_OPTIMAL_SURFACE_ORIENTATION_ANGLE 0x33A7
#define EGL_SURFACE_ORIENTATION_ANGLE 0x33A8
#define EGL_SURFACE_ORIENTATION_INVERT_X_ANGLE 0x0001
#define EGL_SURFACE_ORIENTATION_INVERT_Y_ANGLE 0x0002
#endif /* EGL_ANGLE_surface_orientation */
#ifndef EGL_ANGLE_experimental_present_path
#define EGL_ANGLE_experimental_present_path
#define EGL_EXPERIMENTAL_PRESENT_PATH_ANGLE 0x33A4
#define EGL_EXPERIMENTAL_PRESENT_PATH_FAST_ANGLE 0x33A9
#define EGL_EXPERIMENTAL_PRESENT_PATH_COPY_ANGLE 0x33AA
#endif /* EGL_ANGLE_experimental_present_path */
#ifndef EGL_ANGLE_stream_producer_d3d_texture
#define EGL_ANGLE_stream_producer_d3d_texture
#define EGL_D3D_TEXTURE_SUBRESOURCE_ID_ANGLE 0x33AB
typedef EGLBoolean(EGLAPIENTRYP PFNEGLCREATESTREAMPRODUCERD3DTEXTUREANGLEPROC)(EGLDisplay dpy, EGLStreamKHR stream, const EGLAttrib *attrib_list);
typedef EGLBoolean(EGLAPIENTRYP PFNEGLSTREAMPOSTD3DTEXTUREANGLEPROC)(EGLDisplay dpy, EGLStreamKHR stream, void *texture, const EGLAttrib *attrib_list);
#ifdef EGL_EGLEXT_PROTOTYPES
EGLAPI EGLBoolean EGLAPIENTRY eglCreateStreamProducerD3DTextureANGLE(EGLDisplay dpy, EGLStreamKHR stream, const EGLAttrib *attrib_list);
EGLAPI EGLBoolean EGLAPIENTRY eglStreamPostD3DTextureANGLE(EGLDisplay dpy, EGLStreamKHR stream, void *texture, const EGLAttrib *attrib_list);
#endif
#endif /* EGL_ANGLE_stream_producer_d3d_texture */
#ifndef EGL_ANGLE_create_context_webgl_compatibility
#define EGL_ANGLE_create_context_webgl_compatibility 1
#define EGL_CONTEXT_WEBGL_COMPATIBILITY_ANGLE 0x33AC
#endif /* EGL_ANGLE_create_context_webgl_compatibility */
#ifndef EGL_ANGLE_display_texture_share_group
#define EGL_ANGLE_display_texture_share_group 1
#define EGL_DISPLAY_TEXTURE_SHARE_GROUP_ANGLE 0x33AF
#endif /* EGL_ANGLE_display_texture_share_group */
#ifndef EGL_CHROMIUM_create_context_bind_generates_resource
#define EGL_CHROMIUM_create_context_bind_generates_resource 1
#define EGL_CONTEXT_BIND_GENERATES_RESOURCE_CHROMIUM 0x33AD
#endif /* EGL_CHROMIUM_create_context_bind_generates_resource */
#ifndef EGL_ANGLE_metal_create_context_ownership_identity
#define EGL_ANGLE_metal_create_context_ownership_identity 1
#define EGL_CONTEXT_METAL_OWNERSHIP_IDENTITY_ANGLE 0x34D2
#endif /* EGL_ANGLE_metal_create_context_ownership_identity */
#ifndef EGL_ANGLE_create_context_client_arrays
#define EGL_ANGLE_create_context_client_arrays 1
#define EGL_CONTEXT_CLIENT_ARRAYS_ENABLED_ANGLE 0x3452
#endif /* EGL_ANGLE_create_context_client_arrays */
#ifndef EGL_ANGLE_device_creation
#define EGL_ANGLE_device_creation 1
typedef EGLDeviceEXT(EGLAPIENTRYP PFNEGLCREATEDEVICEANGLEPROC) (EGLint device_type, void *native_device, const EGLAttrib *attrib_list);
typedef EGLBoolean(EGLAPIENTRYP PFNEGLRELEASEDEVICEANGLEPROC) (EGLDeviceEXT device);
#ifdef EGL_EGLEXT_PROTOTYPES
EGLAPI EGLDeviceEXT EGLAPIENTRY eglCreateDeviceANGLE(EGLint device_type, void *native_device, const EGLAttrib *attrib_list);
EGLAPI EGLBoolean EGLAPIENTRY eglReleaseDeviceANGLE(EGLDeviceEXT device);
#endif
#endif /* EGL_ANGLE_device_creation */
#ifndef EGL_ANGLE_program_cache_control
#define EGL_ANGLE_program_cache_control 1
#define EGL_PROGRAM_CACHE_SIZE_ANGLE 0x3455
#define EGL_PROGRAM_CACHE_KEY_LENGTH_ANGLE 0x3456
#define EGL_PROGRAM_CACHE_RESIZE_ANGLE 0x3457
#define EGL_PROGRAM_CACHE_TRIM_ANGLE 0x3458
#define EGL_CONTEXT_PROGRAM_BINARY_CACHE_ENABLED_ANGLE 0x3459
typedef EGLint (EGLAPIENTRYP PFNEGLPROGRAMCACHEGETATTRIBANGLEPROC) (EGLDisplay dpy, EGLenum attrib);
typedef void (EGLAPIENTRYP PFNEGLPROGRAMCACHEQUERYANGLEPROC) (EGLDisplay dpy, EGLint index, void *key, EGLint *keysize, void *binary, EGLint *binarysize);
typedef void (EGLAPIENTRYP PFNEGLPROGRAMCACHEPOPULATEANGLEPROC) (EGLDisplay dpy, const void *key, EGLint keysize, const void *binary, EGLint binarysize);
typedef EGLint (EGLAPIENTRYP PFNEGLPROGRAMCACHERESIZEANGLEPROC) (EGLDisplay dpy, EGLint limit, EGLint mode);
#ifdef EGL_EGLEXT_PROTOTYPES
EGLAPI EGLint EGLAPIENTRY eglProgramCacheGetAttribANGLE(EGLDisplay dpy, EGLenum attrib);
EGLAPI void EGLAPIENTRY eglProgramCacheQueryANGLE(EGLDisplay dpy, EGLint index, void *key, EGLint *keysize, void *binary, EGLint *binarysize);
EGLAPI void EGLAPIENTRY eglProgramCachePopulateANGLE(EGLDisplay dpy, const void *key, EGLint keysize, const void *binary, EGLint binarysize);
EGLAPI EGLint EGLAPIENTRY eglProgramCacheResizeANGLE(EGLDisplay dpy, EGLint limit, EGLint mode);
#endif
#endif /* EGL_ANGLE_program_cache_control */
#ifndef EGL_ANGLE_iosurface_client_buffer
#define EGL_ANGLE_iosurface_client_buffer 1
#define EGL_IOSURFACE_ANGLE 0x3454
#define EGL_IOSURFACE_PLANE_ANGLE 0x345A
#define EGL_TEXTURE_RECTANGLE_ANGLE 0x345B
#define EGL_TEXTURE_TYPE_ANGLE 0x345C
#define EGL_TEXTURE_INTERNAL_FORMAT_ANGLE 0x345D
#define EGL_IOSURFACE_USAGE_HINT_ANGLE 0x348A
#define EGL_IOSURFACE_READ_HINT_ANGLE 0x0001
#define EGL_IOSURFACE_WRITE_HINT_ANGLE 0x0002
#define EGL_BIND_TO_TEXTURE_TARGET_ANGLE 0x348D
#endif /* EGL_ANGLE_iosurface_client_buffer */
#ifndef ANGLE_metal_texture_client_buffer
#define ANGLE_metal_texture_client_buffer 1
#define EGL_METAL_TEXTURE_ANGLE 0x34A7
#endif /* ANGLE_metal_texture_client_buffer */
#ifndef EGL_ANGLE_create_context_extensions_enabled
#define EGL_ANGLE_create_context_extensions_enabled 1
#define EGL_EXTENSIONS_ENABLED_ANGLE 0x345F
#endif /* EGL_ANGLE_create_context_extensions_enabled */
#ifndef EGL_CHROMIUM_sync_control
#define EGL_CHROMIUM_sync_control 1
typedef EGLBoolean (EGLAPIENTRYP PFNEGLGETSYNCVALUESCHROMIUMPROC) (EGLDisplay dpy,
EGLSurface surface,
EGLuint64KHR *ust,
EGLuint64KHR *msc,
EGLuint64KHR *sbc);
#ifdef EGL_EGLEXT_PROTOTYPES
EGLAPI EGLBoolean EGLAPIENTRY eglGetSyncValuesCHROMIUM(EGLDisplay dpy,
EGLSurface surface,
EGLuint64KHR *ust,
EGLuint64KHR *msc,
EGLuint64KHR *sbc);
#endif
#endif /* EGL_CHROMIUM_sync_control */
#ifndef EGL_ANGLE_sync_control_rate
#define EGL_ANGLE_sync_control_rate 1
typedef EGLBoolean (EGLAPIENTRYP PFNEGLGETMSCRATEANGLEPROC) (EGLDisplay dpy,
EGLSurface surface,
EGLint *numerator,
EGLint *denominator);
#ifdef EGL_EGLEXT_PROTOTYPES
EGLAPI EGLBoolean EGLAPIENTRY eglGetMscRateANGLE(EGLDisplay dpy,
EGLSurface surface,
EGLint *numerator,
EGLint *denominator);
#endif
#endif /* EGL_ANGLE_sync_control_rate */
#ifndef EGL_ANGLE_power_preference
#define EGL_ANGLE_power_preference 1
#define EGL_POWER_PREFERENCE_ANGLE 0x3482
#define EGL_LOW_POWER_ANGLE 0x0001
#define EGL_HIGH_POWER_ANGLE 0x0002
typedef void(EGLAPIENTRYP PFNEGLRELEASEHIGHPOWERGPUANGLEPROC) (EGLDisplay dpy, EGLContext ctx);
typedef void(EGLAPIENTRYP PFNEGLREACQUIREHIGHPOWERGPUANGLEPROC) (EGLDisplay dpy, EGLContext ctx);
typedef void(EGLAPIENTRYP PFNEGLHANDLEGPUSWITCHANGLEPROC) (EGLDisplay dpy);
typedef void(EGLAPIENTRYP PFNEGLFORCEGPUSWITCHANGLEPROC) (EGLDisplay dpy, EGLint gpuIDHigh, EGLint gpuIDLow);
#ifdef EGL_EGLEXT_PROTOTYPES
EGLAPI void EGLAPIENTRY eglReleaseHighPowerGPUANGLE(EGLDisplay dpy, EGLContext ctx);
EGLAPI void EGLAPIENTRY eglReacquireHighPowerGPUANGLE(EGLDisplay dpy, EGLContext ctx);
EGLAPI void EGLAPIENTRY eglHandleGPUSwitchANGLE(EGLDisplay dpy);
EGLAPI void EGLAPIENTRY eglForceGPUSwitchANGLE(EGLDisplay dpy, EGLint gpuIDHigh, EGLint gpuIDLow);
#endif
#endif /* EGL_ANGLE_power_preference */
#ifndef EGL_ANGLE_wait_until_work_scheduled
#define EGL_ANGLE_wait_until_work_scheduled 1
typedef void(EGLAPIENTRYP PFNEGLWAITUNTILWORKSCHEDULEDANGLEPROC) (EGLDisplay dpy);
#ifdef EGL_EGLEXT_PROTOTYPES
EGLAPI void EGLAPIENTRY eglWaitUntilWorkScheduledANGLE(EGLDisplay dpy);
#endif
#endif /* EGL_ANGLE_wait_until_work_scheduled */
#ifndef EGL_ANGLE_feature_control
#define EGL_ANGLE_feature_control 1
#define EGL_FEATURE_NAME_ANGLE 0x3460
#define EGL_FEATURE_CATEGORY_ANGLE 0x3461
#define EGL_FEATURE_DESCRIPTION_ANGLE 0x3462
#define EGL_FEATURE_BUG_ANGLE 0x3463
#define EGL_FEATURE_STATUS_ANGLE 0x3464
#define EGL_FEATURE_COUNT_ANGLE 0x3465
#define EGL_FEATURE_OVERRIDES_ENABLED_ANGLE 0x3466
#define EGL_FEATURE_OVERRIDES_DISABLED_ANGLE 0x3467
#define EGL_FEATURE_CONDITION_ANGLE 0x3468
#define EGL_FEATURE_ALL_DISABLED_ANGLE 0x3469
typedef const char *(EGLAPIENTRYP PFNEGLQUERYSTRINGIANGLEPROC) (EGLDisplay dpy, EGLint name, EGLint index);
typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYDISPLAYATTRIBANGLEPROC) (EGLDisplay dpy, EGLint attribute, EGLAttrib *value);
#ifdef EGL_EGLEXT_PROTOTYPES
EGLAPI const char *EGLAPIENTRY eglQueryStringiANGLE(EGLDisplay dpy, EGLint name, EGLint index);
EGLAPI EGLBoolean EGLAPIENTRY eglQueryDisplayAttribANGLE(EGLDisplay dpy, EGLint attribute, EGLAttrib *value);
#endif
#endif /* EGL_ANGLE_feature_control */
#ifndef EGL_ANGLE_image_d3d11_texture
#define EGL_D3D11_TEXTURE_ANGLE 0x3484
#define EGL_TEXTURE_INTERNAL_FORMAT_ANGLE 0x345D
#endif /* EGL_ANGLE_image_d3d11_texture */
#ifndef EGL_ANGLE_create_context_backwards_compatible
#define EGL_ANGLE_create_context_backwards_compatible 1
#define EGL_CONTEXT_OPENGL_BACKWARDS_COMPATIBLE_ANGLE 0x3483
#endif /* EGL_ANGLE_create_context_backwards_compatible */
#ifndef EGL_ANGLE_device_cgl
#define EGL_ANGLE_device_cgl 1
#define EGL_CGL_CONTEXT_ANGLE 0x3485
#define EGL_CGL_PIXEL_FORMAT_ANGLE 0x3486
#endif
#ifndef EGL_ANGLE_ggp_stream_descriptor
#define EGL_ANGLE_ggp_stream_descriptor 1
#define EGL_GGP_STREAM_DESCRIPTOR_ANGLE 0x348B
#endif /* EGL_ANGLE_ggp_stream_descriptor */
#ifndef EGL_ANGLE_swap_with_frame_token
#define EGL_ANGLE_swap_with_frame_token 1
typedef khronos_uint64_t EGLFrameTokenANGLE;
typedef EGLBoolean (EGLAPIENTRYP PFNEGLSWAPBUFFERSWITHFRAMETOKENANGLEPROC)(EGLDisplay dpy, EGLSurface surface, EGLFrameTokenANGLE frametoken);
#ifdef EGL_EGLEXT_PROTOTYPES
EGLAPI EGLBoolean EGLAPIENTRY eglSwapBuffersWithFrameTokenANGLE(EGLDisplay dpy, EGLSurface surface, EGLFrameTokenANGLE frametoken);
#endif
#endif /* EGL_ANGLE_swap_with_frame_token */
#ifndef EGL_ANGLE_prepare_swap_buffers
#define EGL_ANGLE_prepare_swap_buffers 1
typedef EGLBoolean (EGLAPIENTRYP PFNEGLPREPARESWAPBUFFERSANGLEPROC)(EGLDisplay dpy, EGLSurface surface);
#ifdef EGL_EGLEXT_PROTOTYPES
EGLAPI EGLBoolean EGLAPIENTRY eglPrepareSwapBuffersANGLE(EGLDisplay dpy, EGLSurface surface);
#endif
#endif /* EGL_ANGLE_prepare_swap_buffers */
#ifndef EGL_ANGLE_device_eagl
#define EGL_ANGLE_device_eagl 1
#define EGL_EAGL_CONTEXT_ANGLE 0x348C
#endif
#ifndef EGL_ANGLE_device_metal
#define EGL_ANGLE_device_metal 1
#define EGL_METAL_DEVICE_ANGLE 0x34A6
#endif /* EGL_ANGLE_device_metal */
#ifndef EGL_ANGLE_display_semaphore_share_group
#define EGL_ANGLE_display_semaphore_share_group 1
#define EGL_DISPLAY_SEMAPHORE_SHARE_GROUP_ANGLE 0x348D
#endif /* EGL_ANGLE_display_semaphore_share_group */
#ifndef EGL_ANGLE_external_context_and_surface
#define EGL_ANGLE_external_context_and_surface 1
#define EGL_EXTERNAL_CONTEXT_ANGLE 0x348E
#define EGL_EXTERNAL_SURFACE_ANGLE 0x348F
#define EGL_EXTERNAL_CONTEXT_SAVE_STATE_ANGLE 0x3490
#endif /* EGL_ANGLE_external_context_and_surface */
#ifndef EGL_ANGLE_create_surface_swap_interval
#define EGL_ANGLE_create_surface_swap_interval 1
#define EGL_SWAP_INTERVAL_ANGLE 0x322F
#endif /* EGL_ANGLE_create_surface_swap_interval */
#ifndef EGL_ANGLE_device_vulkan
#define EGL_ANGLE_device_vulkan 1
#define EGL_VULKAN_VERSION_ANGLE 0x34A8
#define EGL_VULKAN_INSTANCE_ANGLE 0x34A9
#define EGL_VULKAN_INSTANCE_EXTENSIONS_ANGLE 0x34AA
#define EGL_VULKAN_PHYSICAL_DEVICE_ANGLE 0x34AB
#define EGL_VULKAN_DEVICE_ANGLE 0x34AC
#define EGL_VULKAN_DEVICE_EXTENSIONS_ANGLE 0x34AD
#define EGL_VULKAN_FEATURES_ANGLE 0x34AE
#define EGL_VULKAN_QUEUE_ANGLE 0x34AF
#define EGL_VULKAN_QUEUE_FAMILIY_INDEX_ANGLE 0x34D0
#define EGL_VULKAN_GET_INSTANCE_PROC_ADDR 0x34D1
#endif /* EGL_ANGLE_device_vulkan */
#ifndef EGL_ANGLE_vulkan_image
#define EGL_ANGLE_vulkan_image
#define EGL_VULKAN_IMAGE_ANGLE 0x34D3
#define EGL_VULKAN_IMAGE_CREATE_INFO_HI_ANGLE 0x34D4
#define EGL_VULKAN_IMAGE_CREATE_INFO_LO_ANGLE 0x34D5
typedef EGLBoolean (EGLAPIENTRYP PFNEGLEXPORTVKIMAGEANGLEPROC)(EGLDisplay dpy, EGLImage image, void* vk_image, void* vk_image_create_info);
#ifdef EGL_EGLEXT_PROTOTYPES
EGLAPI EGLBoolean EGLAPIENTRY eglExportVkImageANGLE(EGLDisplay dpy, EGLImage image, void* vk_image, void* vk_image_create_info);
#endif
#endif /* EGL_ANGLE_vulkan_image */
#ifndef EGL_ANGLE_metal_shared_event_sync
#define EGL_ANGLE_metal_shared_event_sync 1
#define EGL_SYNC_METAL_SHARED_EVENT_ANGLE 0x34D8
#define EGL_SYNC_METAL_SHARED_EVENT_OBJECT_ANGLE 0x34D9
#define EGL_SYNC_METAL_SHARED_EVENT_SIGNAL_VALUE_LO_ANGLE 0x34DA
#define EGL_SYNC_METAL_SHARED_EVENT_SIGNAL_VALUE_HI_ANGLE 0x34DB
#define EGL_SYNC_METAL_SHARED_EVENT_SIGNALED_ANGLE 0x34DC
typedef void* (EGLAPIENTRYP PFNEGLCOPYMETALSHAREDEVENTANGLEPROC)(EGLDisplay dpy, EGLSync sync);
#ifdef EGL_EGLEXT_PROTOTYPES
EGLAPI void *EGLAPIENTRY eglCopyMetalSharedEventANGLE(EGLDisplay dpy, EGLSync sync);
#endif
#endif /* EGL_ANGLE_metal_shared_event_sync */
// clang-format on
#endif // INCLUDE_EGL_EGLEXT_ANGLE_

175
thirdparty/angle/include/EGL/eglplatform.h vendored Normal file
View File

@@ -0,0 +1,175 @@
#ifndef __eglplatform_h_
#define __eglplatform_h_
/*
** Copyright 2007-2020 The Khronos Group Inc.
** SPDX-License-Identifier: Apache-2.0
*/
/* Platform-specific types and definitions for egl.h
*
* Adopters may modify khrplatform.h and this file to suit their platform.
* You are encouraged to submit all modifications to the Khronos group so that
* they can be included in future versions of this file. Please submit changes
* by filing an issue or pull request on the public Khronos EGL Registry, at
* https://www.github.com/KhronosGroup/EGL-Registry/
*/
#include <KHR/khrplatform.h>
/* Macros used in EGL function prototype declarations.
*
* EGL functions should be prototyped as:
*
* EGLAPI return-type EGLAPIENTRY eglFunction(arguments);
* typedef return-type (EXPAPIENTRYP PFNEGLFUNCTIONPROC) (arguments);
*
* KHRONOS_APICALL and KHRONOS_APIENTRY are defined in KHR/khrplatform.h
*/
#ifndef EGLAPI
#define EGLAPI KHRONOS_APICALL
#endif
#ifndef EGLAPIENTRY
#define EGLAPIENTRY KHRONOS_APIENTRY
#endif
#define EGLAPIENTRYP EGLAPIENTRY*
/* The types NativeDisplayType, NativeWindowType, and NativePixmapType
* are aliases of window-system-dependent types, such as X Display * or
* Windows Device Context. They must be defined in platform-specific
* code below. The EGL-prefixed versions of Native*Type are the same
* types, renamed in EGL 1.3 so all types in the API start with "EGL".
*
* Khronos STRONGLY RECOMMENDS that you use the default definitions
* provided below, since these changes affect both binary and source
* portability of applications using EGL running on different EGL
* implementations.
*/
#if defined(EGL_NO_PLATFORM_SPECIFIC_TYPES)
typedef void *EGLNativeDisplayType;
typedef void *EGLNativePixmapType;
typedef void *EGLNativeWindowType;
#elif defined(_WIN32) || defined(__VC32__) && !defined(__CYGWIN__) && !defined(__SCITECH_SNAP__) /* Win32 and WinCE */
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN 1
#endif
#include <windows.h>
typedef HDC EGLNativeDisplayType;
typedef HBITMAP EGLNativePixmapType;
#if !defined(WINAPI_FAMILY) || (WINAPI_FAMILY == WINAPI_FAMILY_DESKTOP_APP) /* Windows Desktop */
typedef HWND EGLNativeWindowType;
#else /* Windows Store */
#include <inspectable.h>
typedef IInspectable* EGLNativeWindowType;
#endif
#elif defined(__EMSCRIPTEN__)
typedef int EGLNativeDisplayType;
typedef int EGLNativePixmapType;
typedef int EGLNativeWindowType;
#elif defined(__WINSCW__) || defined(__SYMBIAN32__) /* Symbian */
typedef int EGLNativeDisplayType;
typedef void *EGLNativePixmapType;
typedef void *EGLNativeWindowType;
#elif defined(WL_EGL_PLATFORM)
typedef struct wl_display *EGLNativeDisplayType;
typedef struct wl_egl_pixmap *EGLNativePixmapType;
typedef struct wl_egl_window *EGLNativeWindowType;
#elif defined(__GBM__)
typedef struct gbm_device *EGLNativeDisplayType;
typedef struct gbm_bo *EGLNativePixmapType;
typedef void *EGLNativeWindowType;
#elif defined(__ANDROID__) || defined(ANDROID)
struct ANativeWindow;
struct egl_native_pixmap_t;
typedef void* EGLNativeDisplayType;
typedef struct egl_native_pixmap_t* EGLNativePixmapType;
typedef struct ANativeWindow* EGLNativeWindowType;
#elif defined(USE_OZONE)
typedef intptr_t EGLNativeDisplayType;
typedef intptr_t EGLNativePixmapType;
typedef intptr_t EGLNativeWindowType;
#elif defined(USE_X11)
/* X11 (tentative) */
#include <X11/Xlib.h>
#include <X11/Xutil.h>
typedef Display *EGLNativeDisplayType;
typedef Pixmap EGLNativePixmapType;
typedef Window EGLNativeWindowType;
#elif defined(__unix__)
typedef void *EGLNativeDisplayType;
typedef khronos_uintptr_t EGLNativePixmapType;
typedef khronos_uintptr_t EGLNativeWindowType;
#elif defined(__APPLE__)
typedef int EGLNativeDisplayType;
typedef void *EGLNativePixmapType;
typedef void *EGLNativeWindowType;
#elif defined(__HAIKU__)
#include <kernel/image.h>
typedef void *EGLNativeDisplayType;
typedef khronos_uintptr_t EGLNativePixmapType;
typedef khronos_uintptr_t EGLNativeWindowType;
#elif defined(__Fuchsia__)
typedef void *EGLNativeDisplayType;
typedef khronos_uintptr_t EGLNativePixmapType;
typedef khronos_uintptr_t EGLNativeWindowType;
#else
#error "Platform not recognized"
#endif
/* EGL 1.2 types, renamed for consistency in EGL 1.3 */
typedef EGLNativeDisplayType NativeDisplayType;
typedef EGLNativePixmapType NativePixmapType;
typedef EGLNativeWindowType NativeWindowType;
/* Define EGLint. This must be a signed integral type large enough to contain
* all legal attribute names and values passed into and out of EGL, whether
* their type is boolean, bitmask, enumerant (symbolic constant), integer,
* handle, or other. While in general a 32-bit integer will suffice, if
* handles are 64 bit types, then EGLint should be defined as a signed 64-bit
* integer type.
*/
typedef khronos_int32_t EGLint;
/* C++ / C typecast macros for special EGL handle values */
#if defined(__cplusplus)
#define EGL_CAST(type, value) (static_cast<type>(value))
#else
#define EGL_CAST(type, value) ((type) (value))
#endif
#endif /* __eglplatform_h */

290
thirdparty/angle/include/KHR/khrplatform.h vendored Normal file
View File

@@ -0,0 +1,290 @@
#ifndef __khrplatform_h_
#define __khrplatform_h_
/*
** Copyright (c) 2008-2018 The Khronos Group Inc.
**
** Permission is hereby granted, free of charge, to any person obtaining a
** copy of this software and/or associated documentation files (the
** "Materials"), to deal in the Materials without restriction, including
** without limitation the rights to use, copy, modify, merge, publish,
** distribute, sublicense, and/or sell copies of the Materials, and to
** permit persons to whom the Materials are furnished to do so, subject to
** the following conditions:
**
** The above copyright notice and this permission notice shall be included
** in all copies or substantial portions of the Materials.
**
** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
** MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
*/
/* Khronos platform-specific types and definitions.
*
* The master copy of khrplatform.h is maintained in the Khronos EGL
* Registry repository at https://github.com/KhronosGroup/EGL-Registry
* The last semantic modification to khrplatform.h was at commit ID:
* 67a3e0864c2d75ea5287b9f3d2eb74a745936692
*
* Adopters may modify this file to suit their platform. Adopters are
* encouraged to submit platform specific modifications to the Khronos
* group so that they can be included in future versions of this file.
* Please submit changes by filing pull requests or issues on
* the EGL Registry repository linked above.
*
*
* See the Implementer's Guidelines for information about where this file
* should be located on your system and for more details of its use:
* http://www.khronos.org/registry/implementers_guide.pdf
*
* This file should be included as
* #include <KHR/khrplatform.h>
* by Khronos client API header files that use its types and defines.
*
* The types in khrplatform.h should only be used to define API-specific types.
*
* Types defined in khrplatform.h:
* khronos_int8_t signed 8 bit
* khronos_uint8_t unsigned 8 bit
* khronos_int16_t signed 16 bit
* khronos_uint16_t unsigned 16 bit
* khronos_int32_t signed 32 bit
* khronos_uint32_t unsigned 32 bit
* khronos_int64_t signed 64 bit
* khronos_uint64_t unsigned 64 bit
* khronos_intptr_t signed same number of bits as a pointer
* khronos_uintptr_t unsigned same number of bits as a pointer
* khronos_ssize_t signed size
* khronos_usize_t unsigned size
* khronos_float_t signed 32 bit floating point
* khronos_time_ns_t unsigned 64 bit time in nanoseconds
* khronos_utime_nanoseconds_t unsigned time interval or absolute time in
* nanoseconds
* khronos_stime_nanoseconds_t signed time interval in nanoseconds
* khronos_boolean_enum_t enumerated boolean type. This should
* only be used as a base type when a client API's boolean type is
* an enum. Client APIs which use an integer or other type for
* booleans cannot use this as the base type for their boolean.
*
* Tokens defined in khrplatform.h:
*
* KHRONOS_FALSE, KHRONOS_TRUE Enumerated boolean false/true values.
*
* KHRONOS_SUPPORT_INT64 is 1 if 64 bit integers are supported; otherwise 0.
* KHRONOS_SUPPORT_FLOAT is 1 if floats are supported; otherwise 0.
*
* Calling convention macros defined in this file:
* KHRONOS_APICALL
* KHRONOS_APIENTRY
* KHRONOS_APIATTRIBUTES
*
* These may be used in function prototypes as:
*
* KHRONOS_APICALL void KHRONOS_APIENTRY funcname(
* int arg1,
* int arg2) KHRONOS_APIATTRIBUTES;
*/
#if defined(__SCITECH_SNAP__) && !defined(KHRONOS_STATIC)
# define KHRONOS_STATIC 1
#endif
/*-------------------------------------------------------------------------
* Definition of KHRONOS_APICALL
*-------------------------------------------------------------------------
* This precedes the return type of the function in the function prototype.
*/
#if defined(KHRONOS_STATIC)
/* If the preprocessor constant KHRONOS_STATIC is defined, make the
* header compatible with static linking. */
# define KHRONOS_APICALL
#elif defined(_WIN32)
# define KHRONOS_APICALL __declspec(dllimport)
#elif defined (__SYMBIAN32__)
# define KHRONOS_APICALL IMPORT_C
#elif defined(__ANDROID__)
# define KHRONOS_APICALL __attribute__((visibility("default")))
#else
# define KHRONOS_APICALL
#endif
/*-------------------------------------------------------------------------
* Definition of KHRONOS_APIENTRY
*-------------------------------------------------------------------------
* This follows the return type of the function and precedes the function
* name in the function prototype.
*/
#if defined(_WIN32) && !defined(_WIN32_WCE) && !defined(__SCITECH_SNAP__)
/* Win32 but not WinCE */
# define KHRONOS_APIENTRY __stdcall
#else
# define KHRONOS_APIENTRY
#endif
/*-------------------------------------------------------------------------
* Definition of KHRONOS_APIATTRIBUTES
*-------------------------------------------------------------------------
* This follows the closing parenthesis of the function prototype arguments.
*/
#if defined (__ARMCC_2__)
#define KHRONOS_APIATTRIBUTES __softfp
#else
#define KHRONOS_APIATTRIBUTES
#endif
/*-------------------------------------------------------------------------
* basic type definitions
*-----------------------------------------------------------------------*/
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || defined(__GNUC__) || defined(__SCO__) || defined(__USLC__)
/*
* Using <stdint.h>
*/
#include <stdint.h>
typedef int32_t khronos_int32_t;
typedef uint32_t khronos_uint32_t;
typedef int64_t khronos_int64_t;
typedef uint64_t khronos_uint64_t;
#define KHRONOS_SUPPORT_INT64 1
#define KHRONOS_SUPPORT_FLOAT 1
#elif defined(__VMS ) || defined(__sgi)
/*
* Using <inttypes.h>
*/
#include <inttypes.h>
typedef int32_t khronos_int32_t;
typedef uint32_t khronos_uint32_t;
typedef int64_t khronos_int64_t;
typedef uint64_t khronos_uint64_t;
#define KHRONOS_SUPPORT_INT64 1
#define KHRONOS_SUPPORT_FLOAT 1
#elif defined(_WIN32) && !defined(__SCITECH_SNAP__)
/*
* Win32
*/
typedef __int32 khronos_int32_t;
typedef unsigned __int32 khronos_uint32_t;
typedef __int64 khronos_int64_t;
typedef unsigned __int64 khronos_uint64_t;
#define KHRONOS_SUPPORT_INT64 1
#define KHRONOS_SUPPORT_FLOAT 1
#elif defined(__sun__) || defined(__digital__)
/*
* Sun or Digital
*/
typedef int khronos_int32_t;
typedef unsigned int khronos_uint32_t;
#if defined(__arch64__) || defined(_LP64)
typedef long int khronos_int64_t;
typedef unsigned long int khronos_uint64_t;
#else
typedef long long int khronos_int64_t;
typedef unsigned long long int khronos_uint64_t;
#endif /* __arch64__ */
#define KHRONOS_SUPPORT_INT64 1
#define KHRONOS_SUPPORT_FLOAT 1
#elif 0
/*
* Hypothetical platform with no float or int64 support
*/
typedef int khronos_int32_t;
typedef unsigned int khronos_uint32_t;
#define KHRONOS_SUPPORT_INT64 0
#define KHRONOS_SUPPORT_FLOAT 0
#else
/*
* Generic fallback
*/
#include <stdint.h>
typedef int32_t khronos_int32_t;
typedef uint32_t khronos_uint32_t;
typedef int64_t khronos_int64_t;
typedef uint64_t khronos_uint64_t;
#define KHRONOS_SUPPORT_INT64 1
#define KHRONOS_SUPPORT_FLOAT 1
#endif
/*
* Types that are (so far) the same on all platforms
*/
typedef signed char khronos_int8_t;
typedef unsigned char khronos_uint8_t;
typedef signed short int khronos_int16_t;
typedef unsigned short int khronos_uint16_t;
/*
* Types that differ between LLP64 and LP64 architectures - in LLP64,
* pointers are 64 bits, but 'long' is still 32 bits. Win64 appears
* to be the only LLP64 architecture in current use.
*/
#ifdef _WIN64
typedef signed long long int khronos_intptr_t;
typedef unsigned long long int khronos_uintptr_t;
typedef signed long long int khronos_ssize_t;
typedef unsigned long long int khronos_usize_t;
#else
typedef signed long int khronos_intptr_t;
typedef unsigned long int khronos_uintptr_t;
typedef signed long int khronos_ssize_t;
typedef unsigned long int khronos_usize_t;
#endif
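/*
 * Editorial sketch (not part of the original header): khronos_uintptr_t is
 * intended to be pointer-sized on both LLP64 and LP64 systems, which a
 * C89-compatible compile-time check can verify:
 *
 *     typedef char khronos_uintptr_size_check[
 *         sizeof(khronos_uintptr_t) == sizeof(void*) ? 1 : -1];
 */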
#if KHRONOS_SUPPORT_FLOAT
/*
* Float type
*/
typedef float khronos_float_t;
#endif
#if KHRONOS_SUPPORT_INT64
/* Time types
*
* These types can be used to represent a time interval in nanoseconds or
* an absolute Unadjusted System Time. Unadjusted System Time is the number
* of nanoseconds since some arbitrary system event (e.g. since the last
* time the system booted). The Unadjusted System Time is an unsigned
* 64 bit value that wraps back to 0 every 584 years. Time intervals
* may be either signed or unsigned.
*/
typedef khronos_uint64_t khronos_utime_nanoseconds_t;
typedef khronos_int64_t khronos_stime_nanoseconds_t;
#endif
/*
* Dummy value used to pad enum types to 32 bits.
*/
#ifndef KHRONOS_MAX_ENUM
#define KHRONOS_MAX_ENUM 0x7FFFFFFF
#endif
/*
* Enumerated boolean type
*
* Values other than zero should be considered to be true. Therefore
* comparisons should not be made against KHRONOS_TRUE.
*/
typedef enum {
KHRONOS_FALSE = 0,
KHRONOS_TRUE = 1,
KHRONOS_BOOLEAN_ENUM_FORCE_SIZE = KHRONOS_MAX_ENUM
} khronos_boolean_enum_t;
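/*
 * Usage sketch (an editorial illustration of the note above): because any
 * non-zero value is true, test against KHRONOS_FALSE rather than
 * KHRONOS_TRUE:
 *
 *     if (flag != KHRONOS_FALSE) { ... }  // robust
 *     if (flag == KHRONOS_TRUE) { ... }   // fragile; avoid
 */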
#endif /* __khrplatform_h_ */

175
thirdparty/astcenc/LICENSE.txt vendored Normal file
View File

@@ -0,0 +1,175 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

874
thirdparty/astcenc/astcenc.h vendored Normal file
View File

@@ -0,0 +1,874 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2020-2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief The core astcenc codec library interface.
*
* This interface is the entry point to the core astcenc codec. It aims to be easy to use for
* non-experts, but also to allow experts to have fine control over the compressor heuristics if
* needed. The core codec only handles compression and decompression, transferring all inputs and
* outputs via memory buffers. To catch obvious input/output buffer sizing issues, which can cause
* security and stability problems, all transfer buffers are explicitly sized.
*
* While the aim is that we keep this interface mostly stable, it should be viewed as a mutable
* interface tied to a specific source version. We are not trying to maintain backwards
* compatibility across codec versions.
*
* The API state management is based around an explicit context object, which is the context for all
* allocated memory resources needed to compress and decompress a single image. A context can be
* used to sequentially compress multiple images using the same configuration, allowing setup
* overheads to be amortized over multiple images, which is particularly important when images are
* small.
*
* Multi-threading can be used two ways.
*
* * An application wishing to process multiple images in parallel can allocate multiple
* contexts and assign each context to a thread.
* * An application wishing to process a single image using multiple threads can configure
* contexts for multi-threaded use, and invoke astcenc_compress/decompress() once per thread
* for faster processing. The caller is responsible for creating the worker threads, and
* synchronizing between images.
*
* Extended instruction set support
* ================================
*
* This library supports use of extended instruction sets, such as SSE4.1 and AVX2. These are
* enabled at compile time when building the library. There is no runtime checking in the core
* library that the instruction sets used are actually available. Checking compatibility is the
* responsibility of the calling code.
*
* Threading
* =========
*
* In pseudo-code, the usage for manual user threading looks like this:
*
* // Configure the compressor run
* astcenc_config my_config;
* astcenc_config_init(..., &my_config);
*
* // Power users can tweak <my_config> settings here ...
*
* // Allocate working state given config and thread_count
* astcenc_context* my_context;
* astcenc_context_alloc(&my_config, thread_count, &my_context);
*
* // Compress each image using these config settings
* foreach image:
* // For each thread in the thread pool
* for i in range(0, thread_count):
* astcenc_compress_image(my_context, &my_input, my_output, i);
*
* astcenc_compress_reset(my_context);
*
* // Clean up
* astcenc_context_free(my_context);
*
* Images
* ======
*
* The codec supports compressing single images, which can be either 2D images or volumetric 3D
* images. Calling code is responsible for any handling of aggregate types, such as mipmap chains,
* texture arrays, or sliced 3D textures.
*
* Images are passed in as an astcenc_image structure. Inputs can be either 8-bit unorm, 16-bit
* half-float, or 32-bit float, as indicated by the data_type field.
*
* Images can be any dimension; there is no requirement to be a multiple of the ASTC block size.
*
* Data is always passed in as 4 color components, and accessed as an array of 2D image slices. Data
* within an image slice is always tightly packed without padding. Addressing looks like this:
*
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 ] // Red
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 1] // Green
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 2] // Blue
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 3] // Alpha
*
* Common compressor usage
* =======================
*
* One of the most important things for coding image quality is to align the input data component
* count with the ASTC color endpoint mode. This avoids wasting bits encoding components you don't
* actually need in the endpoint colors.
*
* | Input data | Encoding swizzle | Sampling swizzle |
* | ------------ | ---------------- | ---------------- |
* | 1 component | RRR1 | .[rgb] |
* | 2 components | RRRG | .[rgb]a |
* | 3 components | RGB1 | .rgb |
* | 4 components | RGBA | .rgba |
*
* The 1 and 2 component modes recommend sampling from "g" to recover the luminance value, as this
* provides the best compatibility with other texture formats where the green component may be stored at
* higher precision than the others, such as RGB565. For ASTC any of the RGB components can be used;
* the luminance endpoint component will be returned for all three.
*
* When using the normal map compression mode, ASTC will store normals as a two component X+Y map.
* Input images must contain unit-length normalized vectors and should be passed in using a two
* component swizzle. The astcenc command line tool defaults to an RRRG swizzle, but some developers
* prefer to use GGGR for compatibility with BC5n, which will work just as well. The Z component can
* be recovered programmatically in shader code, using knowledge that the vector is unit length and
* that Z must be positive for a tangent-space normal map.
*
* Decompress-only usage
* =====================
*
* For some use cases it is useful to have a cut-down context and/or library which supports
* decompression but not compression.
*
* A context can be made decompress-only using the ASTCENC_FLG_DECOMPRESS_ONLY flag when the context
* is allocated. These contexts have lower dynamic memory footprint than a full context.
*
* The entire library can be made decompress-only by building the files with the define
* ASTCENC_DECOMPRESS_ONLY set. In this build the context will be smaller, and the library will
* exclude the functionality which is only needed for compression. This reduces the binary size by
* ~180KB. For these builds contexts must be created with the ASTCENC_FLG_DECOMPRESS_ONLY flag.
*
* Note that context structures returned by a library built as decompress-only are incompatible with
* a library built with compression included, and vice versa, as they have different sizes and
* memory layout.
*
* Self-decompress-only usage
* ==========================
*
* ASTC is a complex format with a large search space. The parts of this search space that are
* searched are determined by heuristics that are, in part, tied to the quality level used when
* creating the context.
*
* A normal context is capable of decompressing any ASTC texture, including those generated by other
* compressors with unknown heuristics. This is the most flexible implementation, but forces the
* data tables used by the codec to include entries that are not needed during compression. This
* can slow down context creation by a significant amount, especially for the faster compression
* modes where few data table entries are actually used. To optimize this use case the context can
* be created with the ASTCENC_FLG_SELF_DECOMPRESS_ONLY flag. This tells the compressor that it will
* only be asked to decompress images that it compressed itself, allowing the data tables to
* exclude entries that are not needed by the current compression configuration. This reduces the
* size of the context data tables in memory and improves context creation performance. Note that,
* as of the 3.6 release, this flag no longer affects compression performance.
*
* Using this flag while attempting to decompress a valid image which was created by another
* compressor, or even another astcenc compressor version or configuration, may result in blocks
* returning as solid magenta or NaN value error blocks.
*/
#ifndef ASTCENC_INCLUDED
#define ASTCENC_INCLUDED
#include <cstddef>
#include <cstdint>
#if defined(ASTCENC_DYNAMIC_LIBRARY)
#if defined(_MSC_VER)
#define ASTCENC_PUBLIC extern "C" __declspec(dllexport)
#else
#define ASTCENC_PUBLIC extern "C" __attribute__ ((visibility ("default")))
#endif
#else
#define ASTCENC_PUBLIC
#endif
/* ============================================================================
Data declarations
============================================================================ */
/**
* @brief An opaque structure; see astcenc_internal.h for definition.
*/
struct astcenc_context;
/**
* @brief A codec API error code.
*/
enum astcenc_error {
/** @brief The call was successful. */
ASTCENC_SUCCESS = 0,
/** @brief The call failed due to low memory, or undersized I/O buffers. */
ASTCENC_ERR_OUT_OF_MEM,
/** @brief The call failed due to the build using fast math. */
ASTCENC_ERR_BAD_CPU_FLOAT,
/** @brief The call failed due to an out-of-spec parameter. */
ASTCENC_ERR_BAD_PARAM,
/** @brief The call failed due to an out-of-spec block size. */
ASTCENC_ERR_BAD_BLOCK_SIZE,
/** @brief The call failed due to an out-of-spec color profile. */
ASTCENC_ERR_BAD_PROFILE,
/** @brief The call failed due to an out-of-spec quality value. */
ASTCENC_ERR_BAD_QUALITY,
/** @brief The call failed due to an out-of-spec component swizzle. */
ASTCENC_ERR_BAD_SWIZZLE,
/** @brief The call failed due to an out-of-spec flag set. */
ASTCENC_ERR_BAD_FLAGS,
/** @brief The call failed due to the context not supporting the operation. */
ASTCENC_ERR_BAD_CONTEXT,
/** @brief The call failed due to unimplemented functionality. */
ASTCENC_ERR_NOT_IMPLEMENTED,
/** @brief The call failed due to an out-of-spec decode mode flag set. */
ASTCENC_ERR_BAD_DECODE_MODE,
#if defined(ASTCENC_DIAGNOSTICS)
/** @brief The call failed due to an issue with diagnostic tracing. */
ASTCENC_ERR_DTRACE_FAILURE,
#endif
};
/**
* @brief A codec color profile.
*/
enum astcenc_profile {
/** @brief The LDR sRGB color profile. */
ASTCENC_PRF_LDR_SRGB = 0,
/** @brief The LDR linear color profile. */
ASTCENC_PRF_LDR,
/** @brief The HDR RGB with LDR alpha color profile. */
ASTCENC_PRF_HDR_RGB_LDR_A,
/** @brief The HDR RGBA color profile. */
ASTCENC_PRF_HDR
};
/** @brief The fastest, lowest quality, search preset. */
static const float ASTCENC_PRE_FASTEST = 0.0f;
/** @brief The fast search preset. */
static const float ASTCENC_PRE_FAST = 10.0f;
/** @brief The medium quality search preset. */
static const float ASTCENC_PRE_MEDIUM = 60.0f;
/** @brief The thorough quality search preset. */
static const float ASTCENC_PRE_THOROUGH = 98.0f;
/** @brief The very thorough quality search preset. */
static const float ASTCENC_PRE_VERYTHOROUGH = 99.0f;
/** @brief The exhaustive, highest quality, search preset. */
static const float ASTCENC_PRE_EXHAUSTIVE = 100.0f;
/**
* @brief A codec component swizzle selector.
*/
enum astcenc_swz
{
/** @brief Select the red component. */
ASTCENC_SWZ_R = 0,
/** @brief Select the green component. */
ASTCENC_SWZ_G = 1,
/** @brief Select the blue component. */
ASTCENC_SWZ_B = 2,
/** @brief Select the alpha component. */
ASTCENC_SWZ_A = 3,
/** @brief Use a constant zero component. */
ASTCENC_SWZ_0 = 4,
/** @brief Use a constant one component. */
ASTCENC_SWZ_1 = 5,
/** @brief Use a reconstructed normal vector Z component. */
ASTCENC_SWZ_Z = 6
};
/**
* @brief A texel component swizzle.
*/
struct astcenc_swizzle
{
/** @brief The red component selector. */
astcenc_swz r;
/** @brief The green component selector. */
astcenc_swz g;
/** @brief The blue component selector. */
astcenc_swz b;
/** @brief The alpha component selector. */
astcenc_swz a;
};
/**
* @brief A texel component data format.
*/
enum astcenc_type
{
/** @brief Unorm 8-bit data per component. */
ASTCENC_TYPE_U8 = 0,
/** @brief 16-bit float per component. */
ASTCENC_TYPE_F16 = 1,
/** @brief 32-bit float per component. */
ASTCENC_TYPE_F32 = 2
};
/**
* @brief Function pointer type for compression progress reporting callback.
*/
extern "C" typedef void (*astcenc_progress_callback)(float);
/**
* @brief Enable normal map compression.
*
* Input data will be treated as a two component normal map, storing X and Y, and the codec will
* optimize for angular error rather than simple linear PSNR. In this mode the input swizzle should
* be e.g. rrrg (the default ordering for ASTC normals on the command line) or gggr (the ordering
* used by BC5n).
*/
static const unsigned int ASTCENC_FLG_MAP_NORMAL = 1 << 0;
/**
* @brief Enable compression heuristics that assume use of decode_unorm8 decode mode.
*
* The decode_unorm8 decode mode rounds differently to the decode_fp16 decode mode, so enabling this
* flag during compression will allow the compressor to use the correct rounding when selecting
* encodings. This will improve the compressed image quality if your application is using the
* decode_unorm8 decode mode, but will reduce image quality if using decode_fp16.
*
* Note that LDR_SRGB images will always use decode_unorm8 for the RGB channels, irrespective of
* this setting.
*/
static const unsigned int ASTCENC_FLG_USE_DECODE_UNORM8 = 1 << 1;
/**
* @brief Enable alpha weighting.
*
* The input alpha value is used for transparency, so errors in the RGB components are weighted by
* the transparency level. This allows the codec to more accurately encode the alpha value in areas
* where the color value is less significant.
*/
static const unsigned int ASTCENC_FLG_USE_ALPHA_WEIGHT = 1 << 2;
/**
* @brief Enable perceptual error metrics.
*
* This mode enables perceptual compression mode, which will optimize for perceptual error rather
* than best PSNR. Only some input modes support perceptual error metrics.
*/
static const unsigned int ASTCENC_FLG_USE_PERCEPTUAL = 1 << 3;
/**
* @brief Create a decompression-only context.
*
* This mode disables support for compression. This enables context allocation to skip some
* transient buffer allocation, resulting in lower memory usage.
*/
static const unsigned int ASTCENC_FLG_DECOMPRESS_ONLY = 1 << 4;
/**
* @brief Create a self-decompression context.
*
* This mode configures the compressor so that it is only guaranteed to be able to decompress images
* that were actually created using the current context. This is the common case for compression use
* cases, and setting this flag enables additional optimizations, but does mean that the context
* cannot reliably decompress arbitrary ASTC images.
*/
static const unsigned int ASTCENC_FLG_SELF_DECOMPRESS_ONLY = 1 << 5;
/**
* @brief Enable RGBM map compression.
*
* Input data will be treated as HDR data that has been stored in an LDR RGBM-encoded wrapper
* format. Data must be preprocessed by the user to be in LDR RGBM format before calling the
* compression function; this flag is only used to control the use of RGBM-specific heuristics and
* error metrics.
*
* IMPORTANT: The ASTC format is prone to bad failure modes with unconstrained RGBM data; very small
* M values can round to zero due to quantization and result in black or white pixels. It is highly
* recommended that the minimum value of M used in the encoding is kept above a lower threshold (try
* 16 or 32). Applying this threshold reduces the number of very dark colors that can be
* represented, but is still higher precision than 8-bit LDR.
*
* When this flag is set the value of @c rgbm_m_scale in the context must be set to the RGBM scale
* factor used during reconstruction. This defaults to 5 when in RGBM mode.
*
* It is recommended that the value of @c cw_a_weight is set to twice the value of the multiplier
* scale, ensuring that the M value is accurately encoded. This defaults to 10 when in RGBM mode,
* matching the default scale factor.
*/
static const unsigned int ASTCENC_FLG_MAP_RGBM = 1 << 6;
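/*
 * Configuration sketch (an editorial illustration of the guidance above,
 * setting the documented RGBM defaults explicitly):
 *
 *     config.flags |= ASTCENC_FLG_MAP_RGBM;
 *     config.rgbm_m_scale = 5.0f;                      // reconstruction scale M
 *     config.cw_a_weight = 2.0f * config.rgbm_m_scale; // keep M accurately encoded
 */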
/**
* @brief The bit mask of all valid flags.
*/
static const unsigned int ASTCENC_ALL_FLAGS =
ASTCENC_FLG_MAP_NORMAL |
ASTCENC_FLG_MAP_RGBM |
ASTCENC_FLG_USE_ALPHA_WEIGHT |
ASTCENC_FLG_USE_PERCEPTUAL |
ASTCENC_FLG_USE_DECODE_UNORM8 |
ASTCENC_FLG_DECOMPRESS_ONLY |
ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
/**
* @brief The config structure.
*
* This structure will initially be populated by a call to astcenc_config_init, but power users may
* modify it before calling astcenc_context_alloc. See astcenccli_toplevel_help.cpp for full user
* documentation of the power-user settings.
*
* Note for any settings which are associated with a specific color component, the value in the
* config applies to the component that exists after any compression data swizzle is applied.
*/
struct astcenc_config
{
/** @brief The color profile. */
astcenc_profile profile;
/** @brief The set of set flags. */
unsigned int flags;
/** @brief The ASTC block size X dimension. */
unsigned int block_x;
/** @brief The ASTC block size Y dimension. */
unsigned int block_y;
/** @brief The ASTC block size Z dimension. */
unsigned int block_z;
/** @brief The red component weight scale for error weighting (-cw). */
float cw_r_weight;
/** @brief The green component weight scale for error weighting (-cw). */
float cw_g_weight;
/** @brief The blue component weight scale for error weighting (-cw). */
float cw_b_weight;
/** @brief The alpha component weight scale for error weighting (-cw). */
float cw_a_weight;
/**
* @brief The radius for any alpha-weight scaling (-a).
*
* It is recommended that this is set to 1 when using FLG_USE_ALPHA_WEIGHT on a texture that
* will be sampled using linear texture filtering to minimize color bleed out of transparent
* texels that are adjacent to non-transparent texels.
*/
unsigned int a_scale_radius;
/** @brief The RGBM scale factor for the shared multiplier (-rgbm). */
float rgbm_m_scale;
/**
* @brief The maximum number of partitions searched (-partitioncountlimit).
*
* Valid values are between 1 and 4.
*/
unsigned int tune_partition_count_limit;
/**
* @brief The maximum number of partitions searched (-2partitionindexlimit).
*
* Valid values are between 1 and 1024.
*/
unsigned int tune_2partition_index_limit;
/**
* @brief The maximum number of partitions searched (-3partitionindexlimit).
*
* Valid values are between 1 and 1024.
*/
unsigned int tune_3partition_index_limit;
/**
* @brief The maximum number of partitions searched (-4partitionindexlimit).
*
* Valid values are between 1 and 1024.
*/
unsigned int tune_4partition_index_limit;
/**
* @brief The maximum centile for block modes searched (-blockmodelimit).
*
* Valid values are between 1 and 100.
*/
unsigned int tune_block_mode_limit;
/**
* @brief The maximum iterative refinements applied (-refinementlimit).
*
* Valid values are between 1 and N; there is no technical upper limit
* but little benefit is expected after N=4.
*/
unsigned int tune_refinement_limit;
/**
* @brief The number of trial candidates per mode search (-candidatelimit).
*
* Valid values are between 1 and TUNE_MAX_TRIAL_CANDIDATES.
*/
unsigned int tune_candidate_limit;
/**
* @brief The number of trial partitionings per search (-2partitioncandidatelimit).
*
* Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
*/
unsigned int tune_2partitioning_candidate_limit;
/**
* @brief The number of trial partitionings per search (-3partitioncandidatelimit).
*
* Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
*/
unsigned int tune_3partitioning_candidate_limit;
/**
* @brief The number of trial partitionings per search (-4partitioncandidatelimit).
*
* Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
*/
unsigned int tune_4partitioning_candidate_limit;
/**
* @brief The dB threshold for stopping block search (-dblimit).
*
* This option is ineffective for HDR textures.
*/
float tune_db_limit;
/**
* @brief The amount of MSE overshoot needed to early-out trials.
*
* The first early-out is for 1 partition, 1 plane trials, where we try a minimal encode using
* the high probability block modes. This can short-cut compression for simple blocks.
*
* The second early-out is for refinement trials, where we can exit refinement once quality is
* reached.
*/
float tune_mse_overshoot;
/**
* @brief The threshold for skipping 3.1/4.1 trials (-2partitionlimitfactor).
*
* This option is further scaled for normal maps, so it skips less often.
*/
float tune_2partition_early_out_limit_factor;
/**
* @brief The threshold for skipping 4.1 trials (-3partitionlimitfactor).
*
* This option is further scaled for normal maps, so it skips less often.
*/
float tune_3partition_early_out_limit_factor;
/**
* @brief The threshold for skipping two weight planes (-2planelimitcorrelation).
*
* This option is ineffective for normal maps.
*/
float tune_2plane_early_out_limit_correlation;
/**
* @brief The config enable for the mode0 fast-path search.
*
* If this is set to TUNE_MIN_TEXELS_MODE0 or higher then the early-out fast mode0
* search is enabled. This option is ineffective for 3D block sizes.
*/
float tune_search_mode0_enable;
/**
* @brief The progress callback, can be @c nullptr.
*
* If this is specified, the codec will periodically report progress for
* compression as a percentage between 0 and 100. The callback is called from one
* of the compressor threads, so doing significant work in the callback will
* reduce compression performance.
*/
astcenc_progress_callback progress_callback;
#if defined(ASTCENC_DIAGNOSTICS)
/**
* @brief The path to save the diagnostic trace data to.
*
* This option is not part of the public API, and requires special builds
* of the library.
*/
const char* trace_file_path;
#endif
};
/**
* @brief An uncompressed 2D or 3D image.
*
* 3D images are passed in as an array of 2D slices. Each slice has identical
* size and color format.
*/
struct astcenc_image
{
/** @brief The X dimension of the image, in texels. */
unsigned int dim_x;
/** @brief The Y dimension of the image, in texels. */
unsigned int dim_y;
/** @brief The Z dimension of the image, in texels. */
unsigned int dim_z;
/** @brief The data type per component. */
astcenc_type data_type;
/** @brief The array of 2D slices, of length @c dim_z. */
void** data;
};
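/*
 * Population sketch (an editorial illustration, assuming a tightly packed
 * 8-bit RGBA source buffer named "pixels" and a single 2D slice):
 *
 *     astcenc_image image;
 *     image.dim_x = width;
 *     image.dim_y = height;
 *     image.dim_z = 1;
 *     image.data_type = ASTCENC_TYPE_U8;
 *     void* slices = pixels;
 *     image.data = &slices;
 */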
/**
* @brief A block encoding metadata query result.
*
* If the block is an error block or a constant color block, all fields other than the profile,
* block dimensions, and error/constant indicator will be zero.
*/
struct astcenc_block_info
{
/** @brief The block encoding color profile. */
astcenc_profile profile;
/** @brief The number of texels in the X dimension. */
unsigned int block_x;
/** @brief The number of texels in the Y dimension. */
unsigned int block_y;
/** @brief The number of texels in the Z dimension. */
unsigned int block_z;
/** @brief The number of texels in the block. */
unsigned int texel_count;
/** @brief True if this block is an error block. */
bool is_error_block;
/** @brief True if this block is a constant color block. */
bool is_constant_block;
/** @brief True if this block is an HDR block. */
bool is_hdr_block;
/** @brief True if this block uses two weight planes. */
bool is_dual_plane_block;
/** @brief The number of partitions if not constant color. */
unsigned int partition_count;
/** @brief The partition index if 2 to 4 partitions are used. */
unsigned int partition_index;
/** @brief The component index of the second plane if dual plane. */
unsigned int dual_plane_component;
/** @brief The color endpoint encoding mode for each partition. */
unsigned int color_endpoint_modes[4];
/** @brief The number of color endpoint quantization levels. */
unsigned int color_level_count;
/** @brief The number of weight quantization levels. */
unsigned int weight_level_count;
/** @brief The number of weights in the X dimension. */
unsigned int weight_x;
/** @brief The number of weights in the Y dimension. */
unsigned int weight_y;
/** @brief The number of weights in the Z dimension. */
unsigned int weight_z;
/** @brief The unpacked color endpoints for each partition. */
float color_endpoints[4][2][4];
/** @brief The per-texel interpolation weights for the block. */
float weight_values_plane1[216];
/** @brief The per-texel interpolation weights for the block. */
float weight_values_plane2[216];
/** @brief The per-texel partition assignments for the block. */
uint8_t partition_assignment[216];
};
/**
* Populate a codec config based on default settings.
*
* Power users can edit the returned config struct to fine tune before allocating the context.
*
* @param profile Color profile.
* @param block_x ASTC block size X dimension.
* @param block_y ASTC block size Y dimension.
* @param block_z ASTC block size Z dimension.
* @param quality Search quality preset / effort level. Either an
* @c ASTCENC_PRE_* value, or an effort level between 0
* and 100. Performance is not linear between 0 and 100.
* @param flags A valid set of @c ASTCENC_FLG_* flag bits.
* @param[out] config Output config struct to populate.
*
* @return @c ASTCENC_SUCCESS on success, or an error if the inputs are invalid
* either individually, or in combination.
*/
ASTCENC_PUBLIC astcenc_error astcenc_config_init(
astcenc_profile profile,
unsigned int block_x,
unsigned int block_y,
unsigned int block_z,
float quality,
unsigned int flags,
astcenc_config* config);
/**
* @brief Allocate a new codec context based on a config.
*
* This function allocates all of the memory resources and threads needed by the codec. This can be
* slow, so it is recommended that contexts are reused to serially compress or decompress multiple
* images to amortize setup cost.
*
* Contexts can be allocated to support only decompression using the @c ASTCENC_FLG_DECOMPRESS_ONLY
* flag when creating the configuration. The compression functions will fail if invoked. For a
* decompress-only library build the @c ASTCENC_FLG_DECOMPRESS_ONLY flag must be set when creating
* any context.
*
* @param[in] config Codec config.
* @param thread_count Thread count to configure for.
* @param[out] context Location to store an opaque context pointer.
*
* @return @c ASTCENC_SUCCESS on success, or an error if context creation failed.
*/
ASTCENC_PUBLIC astcenc_error astcenc_context_alloc(
const astcenc_config* config,
unsigned int thread_count,
astcenc_context** context);
/**
* @brief Compress an image.
*
* A single context can only compress or decompress a single image at a time.
*
* For a context configured for multi-threading, any set of the N threads can call this function.
* Work will be dynamically scheduled across the threads available. Each thread must have a unique
* @c thread_index.
*
* @param context Codec context.
* @param[in,out] image An input image, in 2D slices.
* @param swizzle Compression data swizzle, applied before compression.
* @param[out] data_out Pointer to output data array.
* @param data_len Length of the output data array.
* @param thread_index Thread index [0..N-1] of calling thread.
*
* @return @c ASTCENC_SUCCESS on success, or an error if compression failed.
*/
ASTCENC_PUBLIC astcenc_error astcenc_compress_image(
astcenc_context* context,
astcenc_image* image,
const astcenc_swizzle* swizzle,
uint8_t* data_out,
size_t data_len,
unsigned int thread_index);
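/*
 * Single-threaded usage sketch (an editorial illustration; error handling is
 * omitted, "buffer" is assumed to point at data_len bytes of output storage,
 * "image" is populated as in the earlier sketch, and output sizing assumes
 * 4x4 blocks at 16 bytes per compressed block):
 *
 *     astcenc_config config;
 *     astcenc_config_init(ASTCENC_PRF_LDR, 4, 4, 1, ASTCENC_PRE_MEDIUM, 0, &config);
 *
 *     astcenc_context* context;
 *     astcenc_context_alloc(&config, 1, &context);
 *
 *     astcenc_swizzle swizzle { ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A };
 *     size_t blocks_x = (image.dim_x + 3) / 4;
 *     size_t blocks_y = (image.dim_y + 3) / 4;
 *     size_t data_len = blocks_x * blocks_y * 16;
 *     astcenc_compress_image(context, &image, &swizzle, buffer, data_len, 0);
 */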
/**
* @brief Reset the codec state for a new compression.
*
* The caller is responsible for synchronizing threads in the worker thread pool. This function must
* only be called when all threads have exited the @c astcenc_compress_image() function for image N,
* but before any thread enters it for image N + 1.
*
* Calling this is not required (but won't hurt) if the context is created for single-threaded use.
*
* @param context Codec context.
*
* @return @c ASTCENC_SUCCESS on success, or an error if reset failed.
*/
ASTCENC_PUBLIC astcenc_error astcenc_compress_reset(
astcenc_context* context);
/**
* @brief Cancel any pending compression operation.
*
* The caller must behave as if the compression completed normally, even though the data will be
* undefined. They are still responsible for synchronizing threads in the worker thread pool, and
* must call reset before starting another compression.
*
* @param context Codec context.
*
* @return @c ASTCENC_SUCCESS on success, or an error if cancellation failed.
*/
ASTCENC_PUBLIC astcenc_error astcenc_compress_cancel(
astcenc_context* context);
/**
* @brief Decompress an image.
*
* @param context Codec context.
* @param[in] data Pointer to compressed data.
* @param data_len Length of the compressed data, in bytes.
* @param[in,out] image_out Output image.
* @param swizzle Decompression data swizzle, applied after decompression.
* @param thread_index Thread index [0..N-1] of calling thread.
*
* @return @c ASTCENC_SUCCESS on success, or an error if decompression failed.
*/
ASTCENC_PUBLIC astcenc_error astcenc_decompress_image(
astcenc_context* context,
const uint8_t* data,
size_t data_len,
astcenc_image* image_out,
const astcenc_swizzle* swizzle,
unsigned int thread_index);
/**
* @brief Reset the codec state for a new decompression.
*
* The caller is responsible for synchronizing threads in the worker thread pool. This function must
* only be called when all threads have exited the @c astcenc_decompress_image() function for image
* N, but before any thread enters it for image N + 1.
*
* Calling this is not required (but won't hurt) if the context is created for single-threaded use.
*
* @param context Codec context.
*
* @return @c ASTCENC_SUCCESS on success, or an error if reset failed.
*/
ASTCENC_PUBLIC astcenc_error astcenc_decompress_reset(
astcenc_context* context);
/**
* Free the compressor context.
*
* @param context The codec context.
*/
ASTCENC_PUBLIC void astcenc_context_free(
astcenc_context* context);
/**
* @brief Provide a high level summary of a block's encoding.
*
* This feature is primarily useful for codec developers but may be useful for developers building
* advanced content packaging pipelines.
*
* @param context Codec context.
* @param data One block of compressed ASTC data.
* @param info The output info structure to populate.
*
* @return @c ASTCENC_SUCCESS if the block was decoded, or an error otherwise. Note that this
* function will return success even if the block itself was an error block encoding, as the
* decode was correctly handled.
*/
ASTCENC_PUBLIC astcenc_error astcenc_get_block_info(
astcenc_context* context,
const uint8_t data[16],
astcenc_block_info* info);
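/*
 * Query sketch (an editorial illustration, assuming "payload" points at one
 * 16-byte compressed block and "context" is an allocated codec context):
 *
 *     astcenc_block_info info;
 *     if (astcenc_get_block_info(context, payload, &info) == ASTCENC_SUCCESS)
 *     {
 *         printf("partitions: %u\n", info.partition_count);
 *     }
 */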
/**
* @brief Get a printable string for specific status code.
*
* @param status The status value.
*
* @return A human readable nul-terminated string.
*/
ASTCENC_PUBLIC const char* astcenc_get_error_string(
astcenc_error status);
#endif

948
thirdparty/astcenc/astcenc_averages_and_directions.cpp vendored Normal file
View File

@@ -0,0 +1,948 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for finding dominant direction of a set of colors.
*/
#if !defined(ASTCENC_DECOMPRESS_ONLY)
#include "astcenc_internal.h"
#include <cassert>
/**
* @brief Compute the average RGB color of each partition.
*
* The algorithm here uses a vectorized sequential scan and per-partition
* color accumulators, using select() to mask texel lanes in other partitions.
*
* We only accumulate sums for N-1 partitions during the scan; the value for
* the last partition can be computed given that we know the block-wide average
* already.
*
* Because of this we could reduce the loop iteration count so it "just" spans
* the max texel index needed for the N-1 partitions, which could need fewer
* iterations than the full block texel count. However, this makes the loop
* count erratic and causes more branch mispredictions so is a net loss.
*
* @param pi The partitioning to use.
* @param blk The block data to process.
* @param[out] averages The output averages. Unused partition indices will
* not be initialized, and lane<3> will be zero.
*/
static void compute_partition_averages_rgb(
const partition_info& pi,
const image_block& blk,
vfloat4 averages[BLOCK_MAX_PARTITIONS]
) {
unsigned int partition_count = pi.partition_count;
size_t texel_count = blk.texel_count;
promise(texel_count > 0);
// For 1 partition just use the precomputed mean
if (partition_count == 1)
{
averages[0] = blk.data_mean.swz<0, 1, 2>();
}
// For 2 partitions scan results for partition 0, compute partition 1
else if (partition_count == 2)
{
vfloatacc pp_avg_rgb[3] {};
vint lane_id = vint::lane_id();
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint texel_partition(pi.partition_of_texel + i);
vmask lane_mask = lane_id < vint_from_size(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vmask p0_mask = lane_mask & (texel_partition == vint(0));
vfloat data_r = loada(blk.data_r + i);
haccumulate(pp_avg_rgb[0], data_r, p0_mask);
vfloat data_g = loada(blk.data_g + i);
haccumulate(pp_avg_rgb[1], data_g, p0_mask);
vfloat data_b = loada(blk.data_b + i);
haccumulate(pp_avg_rgb[2], data_b, p0_mask);
}
vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0]),
hadd_s(pp_avg_rgb[1]),
hadd_s(pp_avg_rgb[2]));
vfloat4 p1_total = block_total - p0_total;
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
}
// For 3 partitions scan results for partition 0/1, compute partition 2
else if (partition_count == 3)
{
vfloatacc pp_avg_rgb[2][3] {};
vint lane_id = vint::lane_id();
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint texel_partition(pi.partition_of_texel + i);
vmask lane_mask = lane_id < vint_from_size(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vmask p0_mask = lane_mask & (texel_partition == vint(0));
vmask p1_mask = lane_mask & (texel_partition == vint(1));
vfloat data_r = loada(blk.data_r + i);
haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
vfloat data_g = loada(blk.data_g + i);
haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
vfloat data_b = loada(blk.data_b + i);
haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
}
vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
hadd_s(pp_avg_rgb[0][1]),
hadd_s(pp_avg_rgb[0][2]));
vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
hadd_s(pp_avg_rgb[1][1]),
hadd_s(pp_avg_rgb[1][2]));
vfloat4 p2_total = block_total - p0_total - p1_total;
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
}
else
{
// For 4 partitions scan results for partition 0/1/2, compute partition 3
vfloatacc pp_avg_rgb[3][3] {};
vint lane_id = vint::lane_id();
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint texel_partition(pi.partition_of_texel + i);
vmask lane_mask = lane_id < vint_from_size(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vmask p0_mask = lane_mask & (texel_partition == vint(0));
vmask p1_mask = lane_mask & (texel_partition == vint(1));
vmask p2_mask = lane_mask & (texel_partition == vint(2));
vfloat data_r = loada(blk.data_r + i);
haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
haccumulate(pp_avg_rgb[2][0], data_r, p2_mask);
vfloat data_g = loada(blk.data_g + i);
haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
haccumulate(pp_avg_rgb[2][1], data_g, p2_mask);
vfloat data_b = loada(blk.data_b + i);
haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
haccumulate(pp_avg_rgb[2][2], data_b, p2_mask);
}
vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
hadd_s(pp_avg_rgb[0][1]),
hadd_s(pp_avg_rgb[0][2]));
vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
hadd_s(pp_avg_rgb[1][1]),
hadd_s(pp_avg_rgb[1][2]));
vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[2][0]),
hadd_s(pp_avg_rgb[2][1]),
hadd_s(pp_avg_rgb[2][2]));
vfloat4 p3_total = block_total - p0_total - p1_total - p2_total;
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
}
}
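/*
 * Editorial note: the "compute the last partition from the block average"
 * step used above is, in scalar form, simply:
 *
 *     last_total = block_mean * texel_count - sum_of_scanned_partition_totals;
 *     last_avg   = last_total / last_partition_texel_count;
 */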
/**
* @brief Compute the average RGBA color of each partition.
*
* The algorithm here uses a vectorized sequential scan and per-partition
* color accumulators, using select() to mask texel lanes in other partitions.
*
* We only accumulate sums for N-1 partitions during the scan; the value for
* the last partition can be computed given that we know the block-wide average
* already.
*
* Because of this we could reduce the loop iteration count so it "just" spans
* the max texel index needed for the N-1 partitions, which could need fewer
* iterations than the full block texel count. However, this makes the loop
* count erratic and causes more branch mispredictions so is a net loss.
*
* @param pi The partitioning to use.
* @param blk The block data to process.
* @param[out] averages The output averages. Unused partition indices will
* not be initialized.
*/
static void compute_partition_averages_rgba(
const partition_info& pi,
const image_block& blk,
vfloat4 averages[BLOCK_MAX_PARTITIONS]
) {
unsigned int partition_count = pi.partition_count;
size_t texel_count = blk.texel_count;
promise(texel_count > 0);
// For 1 partition just use the precomputed mean
if (partition_count == 1)
{
averages[0] = blk.data_mean;
}
// For 2 partitions scan results for partition 0, compute partition 1
else if (partition_count == 2)
{
vfloat4 pp_avg_rgba[4] {};
vint lane_id = vint::lane_id();
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint texel_partition(pi.partition_of_texel + i);
vmask lane_mask = lane_id < vint_from_size(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vmask p0_mask = lane_mask & (texel_partition == vint(0));
vfloat data_r = loada(blk.data_r + i);
haccumulate(pp_avg_rgba[0], data_r, p0_mask);
vfloat data_g = loada(blk.data_g + i);
haccumulate(pp_avg_rgba[1], data_g, p0_mask);
vfloat data_b = loada(blk.data_b + i);
haccumulate(pp_avg_rgba[2], data_b, p0_mask);
vfloat data_a = loada(blk.data_a + i);
haccumulate(pp_avg_rgba[3], data_a, p0_mask);
}
vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0]),
hadd_s(pp_avg_rgba[1]),
hadd_s(pp_avg_rgba[2]),
hadd_s(pp_avg_rgba[3]));
vfloat4 p1_total = block_total - p0_total;
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
}
// For 3 partitions scan results for partition 0/1, compute partition 2
else if (partition_count == 3)
{
vfloat4 pp_avg_rgba[2][4] {};
vint lane_id = vint::lane_id();
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint texel_partition(pi.partition_of_texel + i);
vmask lane_mask = lane_id < vint_from_size(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vmask p0_mask = lane_mask & (texel_partition == vint(0));
vmask p1_mask = lane_mask & (texel_partition == vint(1));
vfloat data_r = loada(blk.data_r + i);
haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
vfloat data_g = loada(blk.data_g + i);
haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
vfloat data_b = loada(blk.data_b + i);
haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
vfloat data_a = loada(blk.data_a + i);
haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
}
vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
hadd_s(pp_avg_rgba[0][1]),
hadd_s(pp_avg_rgba[0][2]),
hadd_s(pp_avg_rgba[0][3]));
vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
hadd_s(pp_avg_rgba[1][1]),
hadd_s(pp_avg_rgba[1][2]),
hadd_s(pp_avg_rgba[1][3]));
vfloat4 p2_total = block_total - p0_total - p1_total;
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
}
else
{
// For 4 partitions scan results for partition 0/1/2, compute partition 3
vfloat4 pp_avg_rgba[3][4] {};
vint lane_id = vint::lane_id();
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint texel_partition(pi.partition_of_texel + i);
vmask lane_mask = lane_id < vint_from_size(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vmask p0_mask = lane_mask & (texel_partition == vint(0));
vmask p1_mask = lane_mask & (texel_partition == vint(1));
vmask p2_mask = lane_mask & (texel_partition == vint(2));
vfloat data_r = loada(blk.data_r + i);
haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
haccumulate(pp_avg_rgba[2][0], data_r, p2_mask);
vfloat data_g = loada(blk.data_g + i);
haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
haccumulate(pp_avg_rgba[2][1], data_g, p2_mask);
vfloat data_b = loada(blk.data_b + i);
haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
haccumulate(pp_avg_rgba[2][2], data_b, p2_mask);
vfloat data_a = loada(blk.data_a + i);
haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
haccumulate(pp_avg_rgba[2][3], data_a, p2_mask);
}
vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
hadd_s(pp_avg_rgba[0][1]),
hadd_s(pp_avg_rgba[0][2]),
hadd_s(pp_avg_rgba[0][3]));
vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
hadd_s(pp_avg_rgba[1][1]),
hadd_s(pp_avg_rgba[1][2]),
hadd_s(pp_avg_rgba[1][3]));
vfloat4 p2_total = vfloat4(hadd_s(pp_avg_rgba[2][0]),
hadd_s(pp_avg_rgba[2][1]),
hadd_s(pp_avg_rgba[2][2]),
hadd_s(pp_avg_rgba[2][3]));
vfloat4 p3_total = block_total - p0_total - p1_total - p2_total;
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
}
}
/* See header for documentation. */
void compute_avgs_and_dirs_4_comp(
const partition_info& pi,
const image_block& blk,
partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
size_t partition_count = pi.partition_count;
promise(partition_count > 0);
// Pre-compute partition_averages
vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
compute_partition_averages_rgba(pi, blk, partition_averages);
for (size_t partition = 0; partition < partition_count; partition++)
{
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
size_t texel_count = pi.partition_texel_count[partition];
promise(texel_count > 0);
vfloat4 average = partition_averages[partition];
pm[partition].avg = average;
vfloat4 sum_xp = vfloat4::zero();
vfloat4 sum_yp = vfloat4::zero();
vfloat4 sum_zp = vfloat4::zero();
vfloat4 sum_wp = vfloat4::zero();
for (size_t i = 0; i < texel_count; i++)
{
unsigned int iwt = texel_indexes[i];
vfloat4 texel_datum = blk.texel(iwt);
texel_datum = texel_datum - average;
vfloat4 zero = vfloat4::zero();
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
sum_xp += select(zero, texel_datum, tdm0);
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
sum_yp += select(zero, texel_datum, tdm1);
vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
sum_zp += select(zero, texel_datum, tdm2);
vmask4 tdm3 = texel_datum.swz<3,3,3,3>() > zero;
sum_wp += select(zero, texel_datum, tdm3);
}
vfloat4 prod_xp = dot(sum_xp, sum_xp);
vfloat4 prod_yp = dot(sum_yp, sum_yp);
vfloat4 prod_zp = dot(sum_zp, sum_zp);
vfloat4 prod_wp = dot(sum_wp, sum_wp);
vfloat4 best_vector = sum_xp;
vfloat4 best_sum = prod_xp;
vmask4 mask = prod_yp > best_sum;
best_vector = select(best_vector, sum_yp, mask);
best_sum = select(best_sum, prod_yp, mask);
mask = prod_zp > best_sum;
best_vector = select(best_vector, sum_zp, mask);
best_sum = select(best_sum, prod_zp, mask);
mask = prod_wp > best_sum;
best_vector = select(best_vector, sum_wp, mask);
pm[partition].dir = best_vector;
}
}
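/*
 * Editorial note on the selection above: each sum_* vector accumulates the
 * texel deviations that are positive in one component, so it approximates
 * the dominant direction as seen from that axis. The candidate with the
 * largest squared length (its self dot product) is kept as the partition
 * direction, avoiding a full eigenvector computation.
 */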
/* See header for documentation. */
void compute_avgs_and_dirs_3_comp(
const partition_info& pi,
const image_block& blk,
unsigned int omitted_component,
partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
// Pre-compute partition_averages
vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
compute_partition_averages_rgba(pi, blk, partition_averages);
const float* data_vr = blk.data_r;
const float* data_vg = blk.data_g;
const float* data_vb = blk.data_b;
// TODO: Data-driven permute would be useful to avoid this ...
if (omitted_component == 0)
{
partition_averages[0] = partition_averages[0].swz<1, 2, 3>();
partition_averages[1] = partition_averages[1].swz<1, 2, 3>();
partition_averages[2] = partition_averages[2].swz<1, 2, 3>();
partition_averages[3] = partition_averages[3].swz<1, 2, 3>();
data_vr = blk.data_g;
data_vg = blk.data_b;
data_vb = blk.data_a;
}
else if (omitted_component == 1)
{
partition_averages[0] = partition_averages[0].swz<0, 2, 3>();
partition_averages[1] = partition_averages[1].swz<0, 2, 3>();
partition_averages[2] = partition_averages[2].swz<0, 2, 3>();
partition_averages[3] = partition_averages[3].swz<0, 2, 3>();
data_vg = blk.data_b;
data_vb = blk.data_a;
}
else if (omitted_component == 2)
{
partition_averages[0] = partition_averages[0].swz<0, 1, 3>();
partition_averages[1] = partition_averages[1].swz<0, 1, 3>();
partition_averages[2] = partition_averages[2].swz<0, 1, 3>();
partition_averages[3] = partition_averages[3].swz<0, 1, 3>();
data_vb = blk.data_a;
}
else
{
partition_averages[0] = partition_averages[0].swz<0, 1, 2>();
partition_averages[1] = partition_averages[1].swz<0, 1, 2>();
partition_averages[2] = partition_averages[2].swz<0, 1, 2>();
partition_averages[3] = partition_averages[3].swz<0, 1, 2>();
}
size_t partition_count = pi.partition_count;
promise(partition_count > 0);
for (size_t partition = 0; partition < partition_count; partition++)
{
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
size_t texel_count = pi.partition_texel_count[partition];
promise(texel_count > 0);
vfloat4 average = partition_averages[partition];
pm[partition].avg = average;
vfloat4 sum_xp = vfloat4::zero();
vfloat4 sum_yp = vfloat4::zero();
vfloat4 sum_zp = vfloat4::zero();
for (size_t i = 0; i < texel_count; i++)
{
unsigned int iwt = texel_indexes[i];
vfloat4 texel_datum = vfloat3(data_vr[iwt],
data_vg[iwt],
data_vb[iwt]);
texel_datum = texel_datum - average;
vfloat4 zero = vfloat4::zero();
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
sum_xp += select(zero, texel_datum, tdm0);
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
sum_yp += select(zero, texel_datum, tdm1);
vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
sum_zp += select(zero, texel_datum, tdm2);
}
vfloat4 prod_xp = dot(sum_xp, sum_xp);
vfloat4 prod_yp = dot(sum_yp, sum_yp);
vfloat4 prod_zp = dot(sum_zp, sum_zp);
vfloat4 best_vector = sum_xp;
vfloat4 best_sum = prod_xp;
vmask4 mask = prod_yp > best_sum;
best_vector = select(best_vector, sum_yp, mask);
best_sum = select(best_sum, prod_yp, mask);
mask = prod_zp > best_sum;
best_vector = select(best_vector, sum_zp, mask);
pm[partition].dir = best_vector;
}
}
/* See header for documentation. */
void compute_avgs_and_dirs_3_comp_rgb(
const partition_info& pi,
const image_block& blk,
partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
size_t partition_count = pi.partition_count;
promise(partition_count > 0);
// Pre-compute partition_averages
vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
compute_partition_averages_rgb(pi, blk, partition_averages);
for (size_t partition = 0; partition < partition_count; partition++)
{
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
size_t texel_count = pi.partition_texel_count[partition];
promise(texel_count > 0);
vfloat4 average = partition_averages[partition];
pm[partition].avg = average;
vfloat4 sum_xp = vfloat4::zero();
vfloat4 sum_yp = vfloat4::zero();
vfloat4 sum_zp = vfloat4::zero();
for (size_t i = 0; i < texel_count; i++)
{
unsigned int iwt = texel_indexes[i];
vfloat4 texel_datum = blk.texel3(iwt);
texel_datum = texel_datum - average;
vfloat4 zero = vfloat4::zero();
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
sum_xp += select(zero, texel_datum, tdm0);
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
sum_yp += select(zero, texel_datum, tdm1);
vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
sum_zp += select(zero, texel_datum, tdm2);
}
vfloat4 prod_xp = dot(sum_xp, sum_xp);
vfloat4 prod_yp = dot(sum_yp, sum_yp);
vfloat4 prod_zp = dot(sum_zp, sum_zp);
vfloat4 best_vector = sum_xp;
vfloat4 best_sum = prod_xp;
vmask4 mask = prod_yp > best_sum;
best_vector = select(best_vector, sum_yp, mask);
best_sum = select(best_sum, prod_yp, mask);
mask = prod_zp > best_sum;
best_vector = select(best_vector, sum_zp, mask);
pm[partition].dir = best_vector;
}
}
/* See header for documentation. */
void compute_avgs_and_dirs_2_comp(
const partition_info& pt,
const image_block& blk,
unsigned int component1,
unsigned int component2,
partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
vfloat4 average;
const float* data_vr = nullptr;
const float* data_vg = nullptr;
if (component1 == 0 && component2 == 1)
{
average = blk.data_mean.swz<0, 1>();
data_vr = blk.data_r;
data_vg = blk.data_g;
}
else if (component1 == 0 && component2 == 2)
{
average = blk.data_mean.swz<0, 2>();
data_vr = blk.data_r;
data_vg = blk.data_b;
}
else // (component1 == 1 && component2 == 2)
{
assert(component1 == 1 && component2 == 2);
average = blk.data_mean.swz<1, 2>();
data_vr = blk.data_g;
data_vg = blk.data_b;
}
size_t partition_count = pt.partition_count;
promise(partition_count > 0);
for (size_t partition = 0; partition < partition_count; partition++)
{
const uint8_t *texel_indexes = pt.texels_of_partition[partition];
size_t texel_count = pt.partition_texel_count[partition];
promise(texel_count > 0);
// Only compute a partition mean if more than one partition is present
if (partition_count > 1)
{
average = vfloat4::zero();
for (size_t i = 0; i < texel_count; i++)
{
unsigned int iwt = texel_indexes[i];
average += vfloat2(data_vr[iwt], data_vg[iwt]);
}
average = average / static_cast<float>(texel_count);
}
pm[partition].avg = average;
vfloat4 sum_xp = vfloat4::zero();
vfloat4 sum_yp = vfloat4::zero();
for (size_t i = 0; i < texel_count; i++)
{
unsigned int iwt = texel_indexes[i];
vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
texel_datum = texel_datum - average;
vfloat4 zero = vfloat4::zero();
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
sum_xp += select(zero, texel_datum, tdm0);
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
sum_yp += select(zero, texel_datum, tdm1);
}
vfloat4 prod_xp = dot(sum_xp, sum_xp);
vfloat4 prod_yp = dot(sum_yp, sum_yp);
vfloat4 best_vector = sum_xp;
vfloat4 best_sum = prod_xp;
vmask4 mask = prod_yp > best_sum;
best_vector = select(best_vector, sum_yp, mask);
pm[partition].dir = best_vector;
}
}
/* See header for documentation. */
void compute_error_squared_rgba(
const partition_info& pi,
const image_block& blk,
const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
float line_lengths[BLOCK_MAX_PARTITIONS],
float& uncor_error,
float& samec_error
) {
size_t partition_count = pi.partition_count;
promise(partition_count > 0);
vfloatacc uncor_errorsumv = vfloatacc::zero();
vfloatacc samec_errorsumv = vfloatacc::zero();
for (size_t partition = 0; partition < partition_count; partition++)
{
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
processed_line4 l_uncor = uncor_plines[partition];
processed_line4 l_samec = samec_plines[partition];
size_t texel_count = pi.partition_texel_count[partition];
promise(texel_count > 0);
// Vectorize some useful scalar inputs
vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
vfloat l_uncor_bs3(l_uncor.bs.lane<3>());
vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
vfloat l_uncor_amod3(l_uncor.amod.lane<3>());
vfloat l_samec_bs0(l_samec.bs.lane<0>());
vfloat l_samec_bs1(l_samec.bs.lane<1>());
vfloat l_samec_bs2(l_samec.bs.lane<2>());
vfloat l_samec_bs3(l_samec.bs.lane<3>());
assert(all(l_samec.amod == vfloat4(0.0f)));
vfloat uncor_loparamv(1e10f);
vfloat uncor_hiparamv(-1e10f);
vfloat ew_r(blk.channel_weight.lane<0>());
vfloat ew_g(blk.channel_weight.lane<1>());
vfloat ew_b(blk.channel_weight.lane<2>());
vfloat ew_a(blk.channel_weight.lane<3>());
// This implementation over-shoots, but this is safe as we initialize the texel_indexes
// array to extend the last value. This means min/max are not impacted, but we need to mask
// out the dummy values when we compute the line weighting.
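// Illustrative example (not from the upstream sources): with an 8-wide
// SIMD build and texel_count == 11, the second iteration loads lanes
// 8..15; lanes 11..15 repeat the last real texel, so they leave min/max
// unchanged and are masked out of the error accumulation.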
vint lane_ids = vint::lane_id();
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vmask mask = lane_ids < vint_from_size(texel_count);
const uint8_t* texel_idxs = texel_indexes + i;
vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, texel_idxs);
vfloat uncor_param = (data_r * l_uncor_bs0)
+ (data_g * l_uncor_bs1)
+ (data_b * l_uncor_bs2)
+ (data_a * l_uncor_bs3);
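// uncor_param is the texel's parametric position along the partition
// line; the dist terms below are the per-channel offsets between the
// texel and its reconstruction at that position on the line.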
uncor_loparamv = min(uncor_param, uncor_loparamv);
uncor_hiparamv = max(uncor_param, uncor_hiparamv);
vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
+ (uncor_param * l_uncor_bs0);
vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
+ (uncor_param * l_uncor_bs1);
vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
+ (uncor_param * l_uncor_bs2);
vfloat uncor_dist3 = (l_uncor_amod3 - data_a)
+ (uncor_param * l_uncor_bs3);
vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
+ (ew_g * uncor_dist1 * uncor_dist1)
+ (ew_b * uncor_dist2 * uncor_dist2)
+ (ew_a * uncor_dist3 * uncor_dist3);
haccumulate(uncor_errorsumv, uncor_err, mask);
// Process samechroma data
vfloat samec_param = (data_r * l_samec_bs0)
+ (data_g * l_samec_bs1)
+ (data_b * l_samec_bs2)
+ (data_a * l_samec_bs3);
vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a;
vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
+ (ew_g * samec_dist1 * samec_dist1)
+ (ew_b * samec_dist2 * samec_dist2)
+ (ew_a * samec_dist3 * samec_dist3);
haccumulate(samec_errorsumv, samec_err, mask);
lane_ids += vint(ASTCENC_SIMD_WIDTH);
}
// Turn very small numbers and NaNs into a small number
float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
line_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
}
uncor_error = hadd_s(uncor_errorsumv);
samec_error = hadd_s(samec_errorsumv);
}
/* See header for documentation. */
void compute_error_squared_rgb(
const partition_info& pi,
const image_block& blk,
partition_lines3 plines[BLOCK_MAX_PARTITIONS],
float& uncor_error,
float& samec_error
) {
size_t partition_count = pi.partition_count;
promise(partition_count > 0);
vfloatacc uncor_errorsumv = vfloatacc::zero();
vfloatacc samec_errorsumv = vfloatacc::zero();
for (size_t partition = 0; partition < partition_count; partition++)
{
partition_lines3& pl = plines[partition];
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
size_t texel_count = pi.partition_texel_count[partition];
promise(texel_count > 0);
processed_line3 l_uncor = pl.uncor_pline;
processed_line3 l_samec = pl.samec_pline;
// Vectorize some useful scalar inputs
vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
vfloat l_samec_bs0(l_samec.bs.lane<0>());
vfloat l_samec_bs1(l_samec.bs.lane<1>());
vfloat l_samec_bs2(l_samec.bs.lane<2>());
assert(all(l_samec.amod == vfloat4(0.0f)));
vfloat uncor_loparamv(1e10f);
vfloat uncor_hiparamv(-1e10f);
vfloat ew_r(blk.channel_weight.lane<0>());
vfloat ew_g(blk.channel_weight.lane<1>());
vfloat ew_b(blk.channel_weight.lane<2>());
// This implementation over-shoots, but this is safe as we initialize the weights array
// to extend the last value. This means min/max are not impacted, but we need to mask
// out the dummy values when we compute the line weighting.
vint lane_ids = vint::lane_id();
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vmask mask = lane_ids < vint_from_size(texel_count);
const uint8_t* texel_idxs = texel_indexes + i;
vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
vfloat uncor_param = (data_r * l_uncor_bs0)
+ (data_g * l_uncor_bs1)
+ (data_b * l_uncor_bs2);
uncor_loparamv = min(uncor_param, uncor_loparamv);
uncor_hiparamv = max(uncor_param, uncor_hiparamv);
vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
+ (uncor_param * l_uncor_bs0);
vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
+ (uncor_param * l_uncor_bs1);
vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
+ (uncor_param * l_uncor_bs2);
vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
+ (ew_g * uncor_dist1 * uncor_dist1)
+ (ew_b * uncor_dist2 * uncor_dist2);
haccumulate(uncor_errorsumv, uncor_err, mask);
// Process samechroma data
vfloat samec_param = (data_r * l_samec_bs0)
+ (data_g * l_samec_bs1)
+ (data_b * l_samec_bs2);
vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
+ (ew_g * samec_dist1 * samec_dist1)
+ (ew_b * samec_dist2 * samec_dist2);
haccumulate(samec_errorsumv, samec_err, mask);
lane_ids += vint(ASTCENC_SIMD_WIDTH);
}
// Turn very small numbers and NaNs into a small number
float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
pl.line_length = astc::max(uncor_linelen, 1e-7f);
}
uncor_error = hadd_s(uncor_errorsumv);
samec_error = hadd_s(samec_errorsumv);
}
#endif

1184
thirdparty/astcenc/astcenc_block_sizes.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

941
thirdparty/astcenc/astcenc_color_unquantize.cpp vendored Normal file
View File

@@ -0,0 +1,941 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
#include <utility>
/**
* @brief Functions for color unquantization.
*/
#include "astcenc_internal.h"
/**
* @brief Un-blue-contract a color.
*
* This function reverses any applied blue contraction.
*
* @param input The input color that has been blue-contracted.
*
* @return The uncontracted color.
*/
static ASTCENC_SIMD_INLINE vint4 uncontract_color(
vint4 input
) {
vmask4 mask(true, true, false, false);
vint4 bc0 = asr<1>(input + input.lane<2>());
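// Only the red and green lanes take the uncontracted value ((c + b) >> 1);
// blue and alpha pass through unchanged via the mask.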
return select(input, bc0, mask);
}
void rgba_delta_unpack(
vint4 input0,
vint4 input1,
vint4& output0,
vint4& output1
) {
// Apply bit transfer
bit_transfer_signed(input1, input0);
// Apply blue-uncontraction if needed
int rgb_sum = hadd_rgb_s(input1);
input1 = input1 + input0;
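// A negative delta sum indicates the endpoints were stored blue-contracted
// and in swapped order, so both transforms are undone here.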
if (rgb_sum < 0)
{
input0 = uncontract_color(input0);
input1 = uncontract_color(input1);
std::swap(input0, input1);
}
output0 = clamp(0, 255, input0);
output1 = clamp(0, 255, input1);
}
/**
* @brief Unpack an LDR RGB color that uses delta encoding.
*
* Output alpha set to 255.
*
* @param input0 The packed endpoint 0 color.
* @param input1 The packed endpoint 1 color deltas.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void rgb_delta_unpack(
vint4 input0,
vint4 input1,
vint4& output0,
vint4& output1
) {
rgba_delta_unpack(input0, input1, output0, output1);
output0.set_lane<3>(255);
output1.set_lane<3>(255);
}
void rgba_unpack(
vint4 input0,
vint4 input1,
vint4& output0,
vint4& output1
) {
// Apply blue-uncontraction if needed
if (hadd_rgb_s(input0) > hadd_rgb_s(input1))
{
input0 = uncontract_color(input0);
input1 = uncontract_color(input1);
std::swap(input0, input1);
}
output0 = input0;
output1 = input1;
}
/**
* @brief Unpack an LDR RGB color that uses direct encoding.
*
* Output alpha set to 255.
*
* @param input0 The packed endpoint 0 color.
* @param input1 The packed endpoint 1 color.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void rgb_unpack(
vint4 input0,
vint4 input1,
vint4& output0,
vint4& output1
) {
rgba_unpack(input0, input1, output0, output1);
output0.set_lane<3>(255);
output1.set_lane<3>(255);
}
/**
* @brief Unpack an LDR RGBA color that uses scaled encoding.
*
* Note only the RGB channels use the scaled encoding, alpha uses direct.
*
* @param input0 The packed endpoint 0 color.
* @param alpha1 The packed endpoint 1 alpha value.
* @param scale The packed quantized scale.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void rgb_scale_alpha_unpack(
vint4 input0,
uint8_t alpha1,
uint8_t scale,
vint4& output0,
vint4& output1
) {
output1 = input0;
output1.set_lane<3>(alpha1);
output0 = asr<8>(input0 * scale);
output0.set_lane<3>(input0.lane<3>());
}
/**
* @brief Unpack an LDR RGB color that uses scaled encoding.
*
* Output alpha is 255.
*
* @param input0 The packed endpoint 0 color.
* @param scale The packed scale.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void rgb_scale_unpack(
vint4 input0,
int scale,
vint4& output0,
vint4& output1
) {
output1 = input0;
output1.set_lane<3>(255);
output0 = asr<8>(input0 * scale);
output0.set_lane<3>(255);
}
/**
* @brief Unpack an LDR L color that uses direct encoding.
*
* Output alpha is 255.
*
* @param input The packed endpoints.
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void luminance_unpack(
const uint8_t input[2],
vint4& output0,
vint4& output1
) {
int lum0 = input[0];
int lum1 = input[1];
output0 = vint4(lum0, lum0, lum0, 255);
output1 = vint4(lum1, lum1, lum1, 255);
}
/**
* @brief Unpack an LDR L color that uses delta encoding.
*
* Output alpha is 255.
*
* @param input The packed endpoints (L0, L1).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void luminance_delta_unpack(
const uint8_t input[2],
vint4& output0,
vint4& output1
) {
int v0 = input[0];
int v1 = input[1];
int l0 = (v0 >> 2) | (v1 & 0xC0);
int l1 = l0 + (v1 & 0x3F);
l1 = astc::min(l1, 255);
output0 = vint4(l0, l0, l0, 255);
output1 = vint4(l1, l1, l1, 255);
}
/**
* @brief Unpack an LDR LA color that uses direct encoding.
*
* @param input The packed endpoints (L0, L1, A0, A1).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void luminance_alpha_unpack(
const uint8_t input[4],
vint4& output0,
vint4& output1
) {
int lum0 = input[0];
int lum1 = input[1];
int alpha0 = input[2];
int alpha1 = input[3];
output0 = vint4(lum0, lum0, lum0, alpha0);
output1 = vint4(lum1, lum1, lum1, alpha1);
}
/**
* @brief Unpack an LDR LA color that uses delta encoding.
*
* @param input The packed endpoints (L0, L1, A0, A1).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void luminance_alpha_delta_unpack(
const uint8_t input[4],
vint4& output0,
vint4& output1
) {
int lum0 = input[0];
int lum1 = input[1];
int alpha0 = input[2];
int alpha1 = input[3];
lum0 |= (lum1 & 0x80) << 1;
alpha0 |= (alpha1 & 0x80) << 1;
lum1 &= 0x7F;
alpha1 &= 0x7F;
if (lum1 & 0x40)
{
lum1 -= 0x80;
}
if (alpha1 & 0x40)
{
alpha1 -= 0x80;
}
lum0 >>= 1;
lum1 >>= 1;
alpha0 >>= 1;
alpha1 >>= 1;
lum1 += lum0;
alpha1 += alpha0;
lum1 = astc::clamp(lum1, 0, 255);
alpha1 = astc::clamp(alpha1, 0, 255);
output0 = vint4(lum0, lum0, lum0, alpha0);
output1 = vint4(lum1, lum1, lum1, alpha1);
}
/**
* @brief Unpack an HDR RGB + offset encoding.
*
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_rgbo_unpack(
const uint8_t input[4],
vint4& output0,
vint4& output1
) {
int v0 = input[0];
int v1 = input[1];
int v2 = input[2];
int v3 = input[3];
int modeval = ((v0 & 0xC0) >> 6) | (((v1 & 0x80) >> 7) << 2) | (((v2 & 0x80) >> 7) << 3);
int majcomp;
int mode;
if ((modeval & 0xC) != 0xC)
{
majcomp = modeval >> 2;
mode = modeval & 3;
}
else if (modeval != 0xF)
{
majcomp = modeval & 3;
mode = 4;
}
else
{
majcomp = 0;
mode = 5;
}
int red = v0 & 0x3F;
int green = v1 & 0x1F;
int blue = v2 & 0x1F;
int scale = v3 & 0x1F;
int bit0 = (v1 >> 6) & 1;
int bit1 = (v1 >> 5) & 1;
int bit2 = (v2 >> 6) & 1;
int bit3 = (v2 >> 5) & 1;
int bit4 = (v3 >> 7) & 1;
int bit5 = (v3 >> 6) & 1;
int bit6 = (v3 >> 5) & 1;
int ohcomp = 1 << mode;
if (ohcomp & 0x30)
green |= bit0 << 6;
if (ohcomp & 0x3A)
green |= bit1 << 5;
if (ohcomp & 0x30)
blue |= bit2 << 6;
if (ohcomp & 0x3A)
blue |= bit3 << 5;
if (ohcomp & 0x3D)
scale |= bit6 << 5;
if (ohcomp & 0x2D)
scale |= bit5 << 6;
if (ohcomp & 0x04)
scale |= bit4 << 7;
if (ohcomp & 0x3B)
red |= bit4 << 6;
if (ohcomp & 0x04)
red |= bit3 << 6;
if (ohcomp & 0x10)
red |= bit5 << 7;
if (ohcomp & 0x0F)
red |= bit2 << 7;
if (ohcomp & 0x05)
red |= bit1 << 8;
if (ohcomp & 0x0A)
red |= bit0 << 8;
if (ohcomp & 0x05)
red |= bit0 << 9;
if (ohcomp & 0x02)
red |= bit6 << 9;
if (ohcomp & 0x01)
red |= bit3 << 10;
if (ohcomp & 0x02)
red |= bit5 << 10;
// expand to 12 bits.
static const int shamts[6] { 1, 1, 2, 3, 4, 5 };
int shamt = shamts[mode];
red <<= shamt;
green <<= shamt;
blue <<= shamt;
scale <<= shamt;
// on modes 0 to 4, the values stored for "green" and "blue" are differentials,
// not absolute values.
if (mode != 5)
{
green = red - green;
blue = red - blue;
}
// switch around components.
int temp;
switch (majcomp)
{
case 1:
temp = red;
red = green;
green = temp;
break;
case 2:
temp = red;
red = blue;
blue = temp;
break;
default:
break;
}
int red0 = red - scale;
int green0 = green - scale;
int blue0 = blue - scale;
// clamp to [0,0xFFF].
if (red < 0)
red = 0;
if (green < 0)
green = 0;
if (blue < 0)
blue = 0;
if (red0 < 0)
red0 = 0;
if (green0 < 0)
green0 = 0;
if (blue0 < 0)
blue0 = 0;
output0 = vint4(red0 << 4, green0 << 4, blue0 << 4, 0x7800);
output1 = vint4(red << 4, green << 4, blue << 4, 0x7800);
}
/**
* @brief Unpack an HDR RGB direct encoding.
*
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_rgb_unpack(
const uint8_t input[6],
vint4& output0,
vint4& output1
) {
int v0 = input[0];
int v1 = input[1];
int v2 = input[2];
int v3 = input[3];
int v4 = input[4];
int v5 = input[5];
// extract all the fixed-placement bitfields
int modeval = ((v1 & 0x80) >> 7) | (((v2 & 0x80) >> 7) << 1) | (((v3 & 0x80) >> 7) << 2);
int majcomp = ((v4 & 0x80) >> 7) | (((v5 & 0x80) >> 7) << 1);
if (majcomp == 3)
{
output0 = vint4(v0 << 8, v2 << 8, (v4 & 0x7F) << 9, 0x7800);
output1 = vint4(v1 << 8, v3 << 8, (v5 & 0x7F) << 9, 0x7800);
return;
}
int a = v0 | ((v1 & 0x40) << 2);
int b0 = v2 & 0x3f;
int b1 = v3 & 0x3f;
int c = v1 & 0x3f;
int d0 = v4 & 0x7f;
int d1 = v5 & 0x7f;
// get hold of the number of bits in 'd0' and 'd1'
static const int dbits_tab[8] { 7, 6, 7, 6, 5, 6, 5, 6 };
int dbits = dbits_tab[modeval];
// extract six variable-placement bits
int bit0 = (v2 >> 6) & 1;
int bit1 = (v3 >> 6) & 1;
int bit2 = (v4 >> 6) & 1;
int bit3 = (v5 >> 6) & 1;
int bit4 = (v4 >> 5) & 1;
int bit5 = (v5 >> 5) & 1;
// and prepend the variable-placement bits depending on mode.
int ohmod = 1 << modeval; // one-hot-mode
if (ohmod & 0xA4)
a |= bit0 << 9;
if (ohmod & 0x8)
a |= bit2 << 9;
if (ohmod & 0x50)
a |= bit4 << 9;
if (ohmod & 0x50)
a |= bit5 << 10;
if (ohmod & 0xA0)
a |= bit1 << 10;
if (ohmod & 0xC0)
a |= bit2 << 11;
if (ohmod & 0x4)
c |= bit1 << 6;
if (ohmod & 0xE8)
c |= bit3 << 6;
if (ohmod & 0x20)
c |= bit2 << 7;
if (ohmod & 0x5B)
{
b0 |= bit0 << 6;
b1 |= bit1 << 6;
}
if (ohmod & 0x12)
{
b0 |= bit2 << 7;
b1 |= bit3 << 7;
}
if (ohmod & 0xAF)
{
d0 |= bit4 << 5;
d1 |= bit5 << 5;
}
if (ohmod & 0x5)
{
d0 |= bit2 << 6;
d1 |= bit3 << 6;
}
// sign-extend 'd0' and 'd1'
// note: this code assumes that signed right-shift actually sign-fills, not zero-fills.
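// Illustrative example: with dbits == 6, d0 == 0x2F (binary 101111)
// sign-extends to -17. C++20 guarantees arithmetic right-shift of signed
// values; earlier standards leave it implementation-defined, but all
// mainstream compilers sign-fill.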
int32_t d0x = d0;
int32_t d1x = d1;
int sx_shamt = 32 - dbits;
d0x <<= sx_shamt;
d0x >>= sx_shamt;
d1x <<= sx_shamt;
d1x >>= sx_shamt;
d0 = d0x;
d1 = d1x;
// expand all values to 12 bits, with left-shift as needed.
int val_shamt = (modeval >> 1) ^ 3;
a <<= val_shamt;
b0 <<= val_shamt;
b1 <<= val_shamt;
c <<= val_shamt;
d0 <<= val_shamt;
d1 <<= val_shamt;
// then compute the actual color values.
int red1 = a;
int green1 = a - b0;
int blue1 = a - b1;
int red0 = a - c;
int green0 = a - b0 - c - d0;
int blue0 = a - b1 - c - d1;
// clamp the color components to [0,2^12 - 1]
red0 = astc::clamp(red0, 0, 4095);
green0 = astc::clamp(green0, 0, 4095);
blue0 = astc::clamp(blue0, 0, 4095);
red1 = astc::clamp(red1, 0, 4095);
green1 = astc::clamp(green1, 0, 4095);
blue1 = astc::clamp(blue1, 0, 4095);
// switch around the color components
int temp0, temp1;
switch (majcomp)
{
case 1: // switch around red and green
temp0 = red0;
temp1 = red1;
red0 = green0;
red1 = green1;
green0 = temp0;
green1 = temp1;
break;
case 2: // switch around red and blue
temp0 = red0;
temp1 = red1;
red0 = blue0;
red1 = blue1;
blue0 = temp0;
blue1 = temp1;
break;
case 0: // no switch
break;
}
output0 = vint4(red0 << 4, green0 << 4, blue0 << 4, 0x7800);
output1 = vint4(red1 << 4, green1 << 4, blue1 << 4, 0x7800);
}
/**
* @brief Unpack an HDR RGB + LDR A direct encoding.
*
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_rgb_ldr_alpha_unpack(
const uint8_t input[8],
vint4& output0,
vint4& output1
) {
hdr_rgb_unpack(input, output0, output1);
int v6 = input[6];
int v7 = input[7];
output0.set_lane<3>(v6);
output1.set_lane<3>(v7);
}
/**
* @brief Unpack an HDR L (small range) direct encoding.
*
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_luminance_small_range_unpack(
const uint8_t input[2],
vint4& output0,
vint4& output1
) {
int v0 = input[0];
int v1 = input[1];
int y0, y1;
if (v0 & 0x80)
{
y0 = ((v1 & 0xE0) << 4) | ((v0 & 0x7F) << 2);
y1 = (v1 & 0x1F) << 2;
}
else
{
y0 = ((v1 & 0xF0) << 4) | ((v0 & 0x7F) << 1);
y1 = (v1 & 0xF) << 1;
}
y1 += y0;
if (y1 > 0xFFF)
{
y1 = 0xFFF;
}
output0 = vint4(y0 << 4, y0 << 4, y0 << 4, 0x7800);
output1 = vint4(y1 << 4, y1 << 4, y1 << 4, 0x7800);
}
/**
* @brief Unpack an HDR L (large range) direct encoding.
*
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_luminance_large_range_unpack(
const uint8_t input[2],
vint4& output0,
vint4& output1
) {
int v0 = input[0];
int v1 = input[1];
int y0, y1;
if (v1 >= v0)
{
y0 = v0 << 4;
y1 = v1 << 4;
}
else
{
y0 = (v1 << 4) + 8;
y1 = (v0 << 4) - 8;
}
output0 = vint4(y0 << 4, y0 << 4, y0 << 4, 0x7800);
output1 = vint4(y1 << 4, y1 << 4, y1 << 4, 0x7800);
}
/**
* @brief Unpack an HDR A direct encoding.
*
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_alpha_unpack(
const uint8_t input[2],
int& output0,
int& output1
) {
int v6 = input[0];
int v7 = input[1];
int selector = ((v6 >> 7) & 1) | ((v7 >> 6) & 2);
v6 &= 0x7F;
v7 &= 0x7F;
if (selector == 3)
{
output0 = v6 << 5;
output1 = v7 << 5;
}
else
{
v6 |= (v7 << (selector + 1)) & 0x780;
v7 &= (0x3f >> selector);
v7 ^= 32 >> selector;
v7 -= 32 >> selector;
v6 <<= (4 - selector);
v7 <<= (4 - selector);
v7 += v6;
if (v7 < 0)
{
v7 = 0;
}
else if (v7 > 0xFFF)
{
v7 = 0xFFF;
}
output0 = v6;
output1 = v7;
}
output0 <<= 4;
output1 <<= 4;
}
/**
* @brief Unpack an HDR RGBA direct encoding.
*
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_rgb_hdr_alpha_unpack(
const uint8_t input[8],
vint4& output0,
vint4& output1
) {
hdr_rgb_unpack(input, output0, output1);
int alpha0, alpha1;
hdr_alpha_unpack(input + 6, alpha0, alpha1);
output0.set_lane<3>(alpha0);
output1.set_lane<3>(alpha1);
}
/* See header for documentation. */
void unpack_color_endpoints(
astcenc_profile decode_mode,
int format,
const uint8_t* input,
bool& rgb_hdr,
bool& alpha_hdr,
vint4& output0,
vint4& output1
) {
// Assume no NaNs and LDR endpoints unless set later
rgb_hdr = false;
alpha_hdr = false;
bool alpha_hdr_default = false;
switch (format)
{
case FMT_LUMINANCE:
luminance_unpack(input, output0, output1);
break;
case FMT_LUMINANCE_DELTA:
luminance_delta_unpack(input, output0, output1);
break;
case FMT_HDR_LUMINANCE_SMALL_RANGE:
rgb_hdr = true;
alpha_hdr_default = true;
hdr_luminance_small_range_unpack(input, output0, output1);
break;
case FMT_HDR_LUMINANCE_LARGE_RANGE:
rgb_hdr = true;
alpha_hdr_default = true;
hdr_luminance_large_range_unpack(input, output0, output1);
break;
case FMT_LUMINANCE_ALPHA:
luminance_alpha_unpack(input, output0, output1);
break;
case FMT_LUMINANCE_ALPHA_DELTA:
luminance_alpha_delta_unpack(input, output0, output1);
break;
case FMT_RGB_SCALE:
{
vint4 input0q(input[0], input[1], input[2], 0);
uint8_t scale = input[3];
rgb_scale_unpack(input0q, scale, output0, output1);
}
break;
case FMT_RGB_SCALE_ALPHA:
{
vint4 input0q(input[0], input[1], input[2], input[4]);
uint8_t alpha1q = input[5];
uint8_t scaleq = input[3];
rgb_scale_alpha_unpack(input0q, alpha1q, scaleq, output0, output1);
}
break;
case FMT_HDR_RGB_SCALE:
rgb_hdr = true;
alpha_hdr_default = true;
hdr_rgbo_unpack(input, output0, output1);
break;
case FMT_RGB:
{
vint4 input0q(input[0], input[2], input[4], 0);
vint4 input1q(input[1], input[3], input[5], 0);
rgb_unpack(input0q, input1q, output0, output1);
}
break;
case FMT_RGB_DELTA:
{
vint4 input0q(input[0], input[2], input[4], 0);
vint4 input1q(input[1], input[3], input[5], 0);
rgb_delta_unpack(input0q, input1q, output0, output1);
}
break;
case FMT_HDR_RGB:
rgb_hdr = true;
alpha_hdr_default = true;
hdr_rgb_unpack(input, output0, output1);
break;
case FMT_RGBA:
{
vint4 input0q(input[0], input[2], input[4], input[6]);
vint4 input1q(input[1], input[3], input[5], input[7]);
rgba_unpack(input0q, input1q, output0, output1);
}
break;
case FMT_RGBA_DELTA:
{
vint4 input0q(input[0], input[2], input[4], input[6]);
vint4 input1q(input[1], input[3], input[5], input[7]);
rgba_delta_unpack(input0q, input1q, output0, output1);
}
break;
case FMT_HDR_RGB_LDR_ALPHA:
rgb_hdr = true;
hdr_rgb_ldr_alpha_unpack(input, output0, output1);
break;
case FMT_HDR_RGBA:
rgb_hdr = true;
alpha_hdr = true;
hdr_rgb_hdr_alpha_unpack(input, output0, output1);
break;
}
// Assign a correct default alpha
if (alpha_hdr_default)
{
if (decode_mode == ASTCENC_PRF_HDR)
{
output0.set_lane<3>(0x7800);
output1.set_lane<3>(0x7800);
alpha_hdr = true;
}
else
{
output0.set_lane<3>(0x00FF);
output1.set_lane<3>(0x00FF);
alpha_hdr = false;
}
}
// Handle endpoint errors and expansion
// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
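// (Multiplying by 257 == 0x101 copies the 8-bit value into both bytes of
// the 16-bit result, e.g. 0xFF * 257 == 0xFFFF.)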
if (decode_mode == ASTCENC_PRF_LDR)
{
// Error color - HDR endpoint in an LDR encoding
if (rgb_hdr || alpha_hdr)
{
output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
rgb_hdr = false;
alpha_hdr = false;
}
output0 = output0 * 257;
output1 = output1 * 257;
}
// sRGB LDR 8-bit endpoints are expanded to 16 bit by:
// - RGB = shift left by 8 bits and OR with 0x80
// - A = replication
else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
{
// Error color - HDR endpoint in an LDR encoding
if (rgb_hdr || alpha_hdr)
{
output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
rgb_hdr = false;
alpha_hdr = false;
}
output0 = lsl<8>(output0) | vint4(0x80);
output1 = lsl<8>(output1) | vint4(0x80);
}
// An HDR profile decode, but may be using linear LDR endpoints
// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
// HDR endpoints are already 16-bit
else
{
vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr);
vint4 output_scale = select(vint4(257), vint4(1), hdr_lanes);
output0 = output0 * output_scale;
output1 = output1 * output_scale;
}
}

File diff suppressed because it is too large Load Diff

472
thirdparty/astcenc/astcenc_compute_variance.cpp vendored Normal file
View File

@@ -0,0 +1,472 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/**
* @brief Functions to calculate variance per component in a NxN footprint.
*
* We need N to be parametric, so the routine below uses summed area tables in order to execute in
* O(1) time independent of how big N is.
*
* The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
* perform a binary reduction, and then distributes the results. This method means that there is no
* serial dependency between a given element and the next one, and also significantly improves
* numerical stability allowing us to use floats rather than doubles.
*/
#include "astcenc_internal.h"
#include <cassert>
/**
* @brief Generate a prefix-sum array using the Brent-Kung algorithm.
*
* This will take an input array of the form:
* v0, v1, v2, ...
* ... and modify in-place to turn it into a prefix-sum array of the form:
* v0, v0+v1, v0+v1+v2, ...
*
* @param d The array to prefix-sum.
* @param items The number of items in the array.
* @param stride The item spacing in the array; i.e. dense arrays should use 1.
*/
static void brent_kung_prefix_sum(
vfloat4* d,
size_t items,
int stride
) {
if (items < 2)
return;
size_t lc_stride = 2;
size_t log2_stride = 1;
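// Illustrative trace (not from the upstream sources) for items == 4 and
// stride == 1: the reduction loop produces { v0, v0+v1, v2, v0+v1+v2+v3 },
// and the expansion loop fills in the remaining partial, d[2] = v0+v1+v2.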
// The reduction-tree loop
do {
size_t step = lc_stride >> 1;
size_t start = lc_stride - 1;
size_t iters = items >> log2_stride;
vfloat4 *da = d + (start * stride);
ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
size_t ofs_stride = stride << log2_stride;
while (iters)
{
*da = *da + da[ofs];
da += ofs_stride;
iters--;
}
log2_stride += 1;
lc_stride <<= 1;
} while (lc_stride <= items);
// The expansion-tree loop
do {
log2_stride -= 1;
lc_stride >>= 1;
size_t step = lc_stride >> 1;
size_t start = step + lc_stride - 1;
size_t iters = (items - step) >> log2_stride;
vfloat4 *da = d + (start * stride);
ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
size_t ofs_stride = stride << log2_stride;
while (iters)
{
*da = *da + da[ofs];
da += ofs_stride;
iters--;
}
} while (lc_stride > 2);
}
/* See header for documentation. */
void compute_pixel_region_variance(
astcenc_contexti& ctx,
const pixel_region_args& arg
) {
// Unpack the memory structure into local variables
const astcenc_image* img = arg.img;
astcenc_swizzle swz = arg.swz;
bool have_z = arg.have_z;
int size_x = arg.size_x;
int size_y = arg.size_y;
int size_z = arg.size_z;
int offset_x = arg.offset_x;
int offset_y = arg.offset_y;
int offset_z = arg.offset_z;
int alpha_kernel_radius = arg.alpha_kernel_radius;
float* input_alpha_averages = ctx.input_alpha_averages;
vfloat4* work_memory = arg.work_memory;
// Compute memory sizes and dimensions that we need
int kernel_radius = alpha_kernel_radius;
int kerneldim = 2 * kernel_radius + 1;
int kernel_radius_xy = kernel_radius;
int kernel_radius_z = have_z ? kernel_radius : 0;
int padsize_x = size_x + kerneldim;
int padsize_y = size_y + kerneldim;
int padsize_z = size_z + (have_z ? kerneldim : 0);
int sizeprod = padsize_x * padsize_y * padsize_z;
int zd_start = have_z ? 1 : 0;
vfloat4 *varbuf1 = work_memory;
vfloat4 *varbuf2 = work_memory + sizeprod;
// Scaling factors to apply to Y and Z for accesses into the work buffers
int yst = padsize_x;
int zst = padsize_x * padsize_y;
// Scaling factors to apply to Y and Z for accesses into result buffers
int ydt = img->dim_x;
int zdt = img->dim_x * img->dim_y;
// Macros to act as accessor functions for the work-memory
#define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]
#define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]
// Load N and N^2 values into the work buffers
if (img->data_type == ASTCENC_TYPE_U8)
{
// Swizzle data structure 4 = ZERO, 5 = ONE
uint8_t data[6];
data[ASTCENC_SWZ_0] = 0;
data[ASTCENC_SWZ_1] = 255;
for (int z = zd_start; z < padsize_z; z++)
{
int z_src = (z - zd_start) + offset_z - kernel_radius_z;
z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);
for (int y = 1; y < padsize_y; y++)
{
int y_src = (y - 1) + offset_y - kernel_radius_xy;
y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
for (int x = 1; x < padsize_x; x++)
{
int x_src = (x - 1) + offset_x - kernel_radius_xy;
x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src )];
data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
uint8_t r = data[swz.r];
uint8_t g = data[swz.g];
uint8_t b = data[swz.b];
uint8_t a = data[swz.a];
vfloat4 d = vfloat4 (r * (1.0f / 255.0f),
g * (1.0f / 255.0f),
b * (1.0f / 255.0f),
a * (1.0f / 255.0f));
VARBUF1(z, y, x) = d;
VARBUF2(z, y, x) = d * d;
}
}
}
}
else if (img->data_type == ASTCENC_TYPE_F16)
{
// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
uint16_t data[6];
data[ASTCENC_SWZ_0] = 0;
data[ASTCENC_SWZ_1] = 0x3C00;
for (int z = zd_start; z < padsize_z; z++)
{
int z_src = (z - zd_start) + offset_z - kernel_radius_z;
z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);
for (int y = 1; y < padsize_y; y++)
{
int y_src = (y - 1) + offset_y - kernel_radius_xy;
y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
for (int x = 1; x < padsize_x; x++)
{
int x_src = (x - 1) + offset_x - kernel_radius_xy;
x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src )];
data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
vfloat4 d = float16_to_float(di);
VARBUF1(z, y, x) = d;
VARBUF2(z, y, x) = d * d;
}
}
}
}
else // if (img->data_type == ASTCENC_TYPE_F32)
{
assert(img->data_type == ASTCENC_TYPE_F32);
// Swizzle data structure 4 = ZERO, 5 = ONE (in FP32)
float data[6];
data[ASTCENC_SWZ_0] = 0.0f;
data[ASTCENC_SWZ_1] = 1.0f;
for (int z = zd_start; z < padsize_z; z++)
{
int z_src = (z - zd_start) + offset_z - kernel_radius_z;
z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
float* data32 = static_cast<float*>(img->data[z_src]);
for (int y = 1; y < padsize_y; y++)
{
int y_src = (y - 1) + offset_y - kernel_radius_xy;
y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
for (int x = 1; x < padsize_x; x++)
{
int x_src = (x - 1) + offset_x - kernel_radius_xy;
x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src )];
data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
float r = data[swz.r];
float g = data[swz.g];
float b = data[swz.b];
float a = data[swz.a];
vfloat4 d(r, g, b, a);
VARBUF1(z, y, x) = d;
VARBUF2(z, y, x) = d * d;
}
}
}
}
// Pad with an extra layer of 0s; this forms the edge of the SAT tables
vfloat4 vbz = vfloat4::zero();
for (int z = 0; z < padsize_z; z++)
{
for (int y = 0; y < padsize_y; y++)
{
VARBUF1(z, y, 0) = vbz;
VARBUF2(z, y, 0) = vbz;
}
for (int x = 0; x < padsize_x; x++)
{
VARBUF1(z, 0, x) = vbz;
VARBUF2(z, 0, x) = vbz;
}
}
if (have_z)
{
for (int y = 0; y < padsize_y; y++)
{
for (int x = 0; x < padsize_x; x++)
{
VARBUF1(0, y, x) = vbz;
VARBUF2(0, y, x) = vbz;
}
}
}
// Generate summed-area tables for N and N^2; this is done in-place, using
// a Brent-Kung parallel-prefix based algorithm to minimize precision loss
for (int z = zd_start; z < padsize_z; z++)
{
for (int y = 1; y < padsize_y; y++)
{
brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
}
}
for (int z = zd_start; z < padsize_z; z++)
{
for (int x = 1; x < padsize_x; x++)
{
brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
}
}
if (have_z)
{
for (int y = 1; y < padsize_y; y++)
{
for (int x = 1; x < padsize_x; x++)
{
brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
}
}
}
// Compute a few constants used in the variance-calculation.
float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
float alpha_rsamples;
if (have_z)
{
alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
}
else
{
alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
}
// Use the summed-area tables to compute variance for each neighborhood
if (have_z)
{
for (int z = 0; z < size_z; z++)
{
int z_src = z + kernel_radius_z;
int z_dst = z + offset_z;
int z_low = z_src - alpha_kernel_radius;
int z_high = z_src + alpha_kernel_radius + 1;
for (int y = 0; y < size_y; y++)
{
int y_src = y + kernel_radius_xy;
int y_dst = y + offset_y;
int y_low = y_src - alpha_kernel_radius;
int y_high = y_src + alpha_kernel_radius + 1;
for (int x = 0; x < size_x; x++)
{
int x_src = x + kernel_radius_xy;
int x_dst = x + offset_x;
int x_low = x_src - alpha_kernel_radius;
int x_high = x_src + alpha_kernel_radius + 1;
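// 3D inclusion-exclusion: the sum over the kernel box is recovered from
// eight SAT corner samples with alternating signs (the 2D branch below
// needs only four corners).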
// Summed-area table lookups for alpha average
float vasum = ( VARBUF1(z_high, y_low, x_low).lane<3>()
- VARBUF1(z_high, y_low, x_high).lane<3>()
- VARBUF1(z_high, y_high, x_low).lane<3>()
+ VARBUF1(z_high, y_high, x_high).lane<3>()) -
( VARBUF1(z_low, y_low, x_low).lane<3>()
- VARBUF1(z_low, y_low, x_high).lane<3>()
- VARBUF1(z_low, y_high, x_low).lane<3>()
+ VARBUF1(z_low, y_high, x_high).lane<3>());
int out_index = z_dst * zdt + y_dst * ydt + x_dst;
input_alpha_averages[out_index] = (vasum * alpha_rsamples);
}
}
}
}
else
{
for (int y = 0; y < size_y; y++)
{
int y_src = y + kernel_radius_xy;
int y_dst = y + offset_y;
int y_low = y_src - alpha_kernel_radius;
int y_high = y_src + alpha_kernel_radius + 1;
for (int x = 0; x < size_x; x++)
{
int x_src = x + kernel_radius_xy;
int x_dst = x + offset_x;
int x_low = x_src - alpha_kernel_radius;
int x_high = x_src + alpha_kernel_radius + 1;
// Summed-area table lookups for alpha average
float vasum = VARBUF1(0, y_low, x_low).lane<3>()
- VARBUF1(0, y_low, x_high).lane<3>()
- VARBUF1(0, y_high, x_low).lane<3>()
+ VARBUF1(0, y_high, x_high).lane<3>();
int out_index = y_dst * ydt + x_dst;
input_alpha_averages[out_index] = (vasum * alpha_rsamples);
}
}
}
}
/* See header for documentation. */
unsigned int init_compute_averages(
const astcenc_image& img,
unsigned int alpha_kernel_radius,
const astcenc_swizzle& swz,
avg_args& ag
) {
unsigned int size_x = img.dim_x;
unsigned int size_y = img.dim_y;
unsigned int size_z = img.dim_z;
// Compute maximum block size and from that the working memory buffer size
unsigned int kernel_radius = alpha_kernel_radius;
unsigned int kerneldim = 2 * kernel_radius + 1;
bool have_z = (size_z > 1);
unsigned int max_blk_size_xy = have_z ? 16 : 32;
unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);
unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);
// Perform block-wise averages calculations across the image
// Initialize fields which are not populated until later
ag.arg.size_x = 0;
ag.arg.size_y = 0;
ag.arg.size_z = 0;
ag.arg.offset_x = 0;
ag.arg.offset_y = 0;
ag.arg.offset_z = 0;
ag.arg.work_memory = nullptr;
ag.arg.img = &img;
ag.arg.swz = swz;
ag.arg.have_z = have_z;
ag.arg.alpha_kernel_radius = alpha_kernel_radius;
ag.img_size_x = size_x;
ag.img_size_y = size_y;
ag.img_size_z = size_z;
ag.blk_size_xy = max_blk_size_xy;
ag.blk_size_z = max_blk_size_z;
ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;
// The parallel task count
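// Illustrative example: a 256x256 2D image with max_blk_size_xy == 32
// yields y_tasks == 8 and z_tasks == 1, i.e. eight independent tasks.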
unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;
unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;
return z_tasks * y_tasks;
}
#endif

622
thirdparty/astcenc/astcenc_decompress_symbolic.cpp vendored Normal file
View File

@@ -0,0 +1,622 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions to decompress a symbolic block.
*/
#include "astcenc_internal.h"
#include <stdio.h>
#include <assert.h>
/**
* @brief Compute the integer linear interpolation of two color endpoints.
*
* @param u8_mask The mask for lanes using decode_unorm8 rather than decode_f16.
* @param color0 The endpoint0 color.
* @param color1 The endpoint1 color.
* @param weights The interpolation weight (between 0 and 64).
*
* @return The interpolated color.
*/
static vint4 lerp_color_int(
vmask4 u8_mask,
vint4 color0,
vint4 color1,
vint4 weights
) {
vint4 weight1 = weights;
vint4 weight0 = vint4(64) - weight1;
vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
color = asr<6>(color);
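// Weights are 6-bit fixed point (0..64), so e.g. a weight of 16 blends
// 48/64 of color0 with 16/64 of color1; the +32 bias rounds the shifted
// result to nearest.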
// For decode_unorm8 values, force the codec to bit-replicate. This allows the
// rest of the codec to assume the full 0xFFFF range for everything and ignore
// the decode_mode setting.
vint4 color_u8 = asr<8>(color) * vint4(257);
color = select(color, color_u8, u8_mask);
return color;
}
/**
* @brief Convert integer color value into a float value for the decoder.
*
* @param data The integer color value post-interpolation.
* @param lns_mask If set treat lane as HDR (LNS) else LDR (unorm16).
*
* @return The float color value.
*/
static inline vfloat4 decode_texel(
vint4 data,
vmask4 lns_mask
) {
vint4 color_lns = vint4::zero();
vint4 color_unorm = vint4::zero();
if (any(lns_mask))
{
color_lns = lns_to_sf16(data);
}
if (!all(lns_mask))
{
color_unorm = unorm16_to_sf16(data);
}
// Pick components and then convert to FP16
vint4 datai = select(color_unorm, color_lns, lns_mask);
return float16_to_float(datai);
}
/* See header for documentation. */
void unpack_weights(
const block_size_descriptor& bsd,
const symbolic_compressed_block& scb,
const decimation_info& di,
bool is_dual_plane,
int weights_plane1[BLOCK_MAX_TEXELS],
int weights_plane2[BLOCK_MAX_TEXELS]
) {
// Safe to overshoot as all arrays are allocated to full size
if (!is_dual_plane)
{
// Build full 64-entry weight lookup table
vtable_64x8 table;
vtable_prepare(table, scb.weights);
for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint summed_value(8);
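// The initial 8 is a rounding bias: the per-texel weight contributions
// are 4-bit fixed point, so biasing by half before the final lsr<4>
// rounds to nearest.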
vint weight_count(di.texel_weight_count + i);
int max_weight_count = hmax_s(weight_count);
promise(max_weight_count > 0);
for (int j = 0; j < max_weight_count; j++)
{
vint texel_weights(di.texel_weights_tr[j] + i);
vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
summed_value += vtable_lookup_32bit(table, texel_weights) * texel_weights_int;
}
store(lsr<4>(summed_value), weights_plane1 + i);
}
}
else
{
// Build a 32-entry weight lookup table per plane
// Plane 1
vtable_32x8 tab_plane1;
vtable_prepare(tab_plane1, scb.weights);
// Plane 2
vtable_32x8 tab_plane2;
vtable_prepare(tab_plane2, scb.weights + 32);
for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint sum_plane1(8);
vint sum_plane2(8);
vint weight_count(di.texel_weight_count + i);
int max_weight_count = hmax_s(weight_count);
promise(max_weight_count > 0);
for (int j = 0; j < max_weight_count; j++)
{
vint texel_weights(di.texel_weights_tr[j] + i);
vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
sum_plane1 += vtable_lookup_32bit(tab_plane1, texel_weights) * texel_weights_int;
sum_plane2 += vtable_lookup_32bit(tab_plane2, texel_weights) * texel_weights_int;
}
store(lsr<4>(sum_plane1), weights_plane1 + i);
store(lsr<4>(sum_plane2), weights_plane2 + i);
}
}
}
/**
* @brief Return an FP32 NaN value for use in error colors.
*
* This NaN encoding will turn into 0xFFFF when converted to an FP16 NaN.
*
* @return The float color value.
*/
static float error_color_nan()
{
if32 v;
v.u = 0xFFFFE000U;
return v.f;
}
/* See header for documentation. */
void decompress_symbolic_block(
astcenc_profile decode_mode,
const block_size_descriptor& bsd,
int xpos,
int ypos,
int zpos,
const symbolic_compressed_block& scb,
image_block& blk
) {
blk.xpos = xpos;
blk.ypos = ypos;
blk.zpos = zpos;
blk.data_min = vfloat4::zero();
blk.data_mean = vfloat4::zero();
blk.data_max = vfloat4::zero();
blk.grayscale = false;
// If we detected an error-block, blow up immediately.
if (scb.block_type == SYM_BTYPE_ERROR)
{
for (unsigned int i = 0; i < bsd.texel_count; i++)
{
blk.data_r[i] = error_color_nan();
blk.data_g[i] = error_color_nan();
blk.data_b[i] = error_color_nan();
blk.data_a[i] = error_color_nan();
blk.rgb_lns[i] = 0;
blk.alpha_lns[i] = 0;
}
return;
}
if ((scb.block_type == SYM_BTYPE_CONST_F16) ||
(scb.block_type == SYM_BTYPE_CONST_U16))
{
vfloat4 color;
uint8_t use_lns = 0;
// UNORM16 constant color block
if (scb.block_type == SYM_BTYPE_CONST_U16)
{
vint4 colori(scb.constant_color);
// Determine the UNORM8 rounding on the decode
vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
// The real decoder would just use the top 8 bits, but we rescale
// in to a 16-bit value that rounds correctly.
vint4 colori_u8 = asr<8>(colori) * 257;
colori = select(colori, colori_u8, u8_mask);
vint4 colorf16 = unorm16_to_sf16(colori);
color = float16_to_float(colorf16);
}
// FLOAT16 constant color block
else
{
switch (decode_mode)
{
case ASTCENC_PRF_LDR_SRGB:
case ASTCENC_PRF_LDR:
color = vfloat4(error_color_nan());
break;
case ASTCENC_PRF_HDR_RGB_LDR_A:
case ASTCENC_PRF_HDR:
// Constant-color block; unpack from FP16 to FP32.
color = float16_to_float(vint4(scb.constant_color));
use_lns = 1;
break;
}
}
for (unsigned int i = 0; i < bsd.texel_count; i++)
{
blk.data_r[i] = color.lane<0>();
blk.data_g[i] = color.lane<1>();
blk.data_b[i] = color.lane<2>();
blk.data_a[i] = color.lane<3>();
blk.rgb_lns[i] = use_lns;
blk.alpha_lns[i] = use_lns;
}
return;
}
// Get the appropriate partition-table entry
int partition_count = scb.partition_count;
const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
// Get the appropriate block descriptors
const auto& bm = bsd.get_block_mode(scb.block_mode);
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
bool is_dual_plane = static_cast<bool>(bm.is_dual_plane);
// Unquantize and undecimate the weights
int plane1_weights[BLOCK_MAX_TEXELS];
int plane2_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights);
// Now that we have endpoint colors and weights, we can unpack texel colors
int plane2_component = scb.plane2_component;
vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
for (int i = 0; i < partition_count; i++)
{
// Decode the color endpoints for this partition
vint4 ep0;
vint4 ep1;
bool rgb_lns;
bool a_lns;
unpack_color_endpoints(decode_mode,
scb.color_formats[i],
scb.color_values[i],
rgb_lns, a_lns,
ep0, ep1);
vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);
int texel_count = pi.partition_texel_count[i];
for (int j = 0; j < texel_count; j++)
{
int tix = pi.texels_of_partition[i][j];
vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight);
vfloat4 colorf = decode_texel(color, lns_mask);
blk.data_r[tix] = colorf.lane<0>();
blk.data_g[tix] = colorf.lane<1>();
blk.data_b[tix] = colorf.lane<2>();
blk.data_a[tix] = colorf.lane<3>();
}
}
}
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/* See header for documentation. */
float compute_symbolic_block_difference_2plane(
const astcenc_config& config,
const block_size_descriptor& bsd,
const symbolic_compressed_block& scb,
const image_block& blk
) {
// If we detected an error-block, blow up immediately.
if (scb.block_type == SYM_BTYPE_ERROR)
{
return ERROR_CALC_DEFAULT;
}
assert(scb.block_mode >= 0);
assert(scb.partition_count == 1);
assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1);
// Get the appropriate block descriptor
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
// Unquantize and undecimate the weights
int plane1_weights[BLOCK_MAX_TEXELS];
int plane2_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights);
vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component);
vfloat4 summa = vfloat4::zero();
// Decode the color endpoints for this partition
vint4 ep0;
vint4 ep1;
bool rgb_lns;
bool a_lns;
unpack_color_endpoints(config.profile,
scb.color_formats[0],
scb.color_values[0],
rgb_lns, a_lns,
ep0, ep1);
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
// Unpack and compute error for each texel in the partition
unsigned int texel_count = bsd.texel_count;
for (unsigned int i = 0; i < texel_count; i++)
{
vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);
vfloat4 color = int_to_float(colori);
vfloat4 oldColor = blk.texel(i);
// Compare error using a perceptual decode metric for RGBM textures
if (config.flags & ASTCENC_FLG_MAP_RGBM)
{
// Fail encodings that result in zero weight M pixels. Note that this can cause
// "interesting" artifacts if we reject all useful encodings - we typically get max
// brightness encodings instead which look just as bad. We recommend users apply a
// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
// getting small M values post-quantization, but we can't prove it would never
// happen, especially at low bit rates ...
if (color.lane<3>() == 0.0f)
{
return -ERROR_CALC_DEFAULT;
}
// Compute error based on decoded RGBM color
color = vfloat4(
color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
1.0f
);
oldColor = vfloat4(
oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
1.0f
);
}
vfloat4 error = oldColor - color;
error = min(abs(error), 1e15f);
error = error * error;
summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
}
return summa.lane<0>();
}
/* See header for documentation. */
float compute_symbolic_block_difference_1plane(
const astcenc_config& config,
const block_size_descriptor& bsd,
const symbolic_compressed_block& scb,
const image_block& blk
) {
assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0);
// If we detected an error-block, blow up immediately.
if (scb.block_type == SYM_BTYPE_ERROR)
{
return ERROR_CALC_DEFAULT;
}
assert(scb.block_mode >= 0);
// Get the appropriate partition-table entry
unsigned int partition_count = scb.partition_count;
const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
// Get the appropriate block descriptor
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
// Unquantize and undecimate the weights
int plane1_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
vfloat4 summa = vfloat4::zero();
for (unsigned int i = 0; i < partition_count; i++)
{
// Decode the color endpoints for this partition
vint4 ep0;
vint4 ep1;
bool rgb_lns;
bool a_lns;
unpack_color_endpoints(config.profile,
scb.color_formats[i],
scb.color_values[i],
rgb_lns, a_lns,
ep0, ep1);
// Unpack and compute error for each texel in the partition
unsigned int texel_count = pi.partition_texel_count[i];
for (unsigned int j = 0; j < texel_count; j++)
{
unsigned int tix = pi.texels_of_partition[i][j];
vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
vint4(plane1_weights[tix]));
vfloat4 color = int_to_float(colori);
vfloat4 oldColor = blk.texel(tix);
// Compare error using a perceptual decode metric for RGBM textures
if (config.flags & ASTCENC_FLG_MAP_RGBM)
{
// Fail encodings that result in zero weight M pixels. Note that this can cause
// "interesting" artifacts if we reject all useful encodings - we typically get max
// brightness encodings instead which look just as bad. We recommend users apply a
// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
// getting small M values post-quantization, but we can't prove it would never
// happen, especially at low bit rates ...
if (color.lane<3>() == 0.0f)
{
return -ERROR_CALC_DEFAULT;
}
// Compute error based on decoded RGBM color
color = vfloat4(
color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
1.0f
);
oldColor = vfloat4(
oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
1.0f
);
}
vfloat4 error = oldColor - color;
error = min(abs(error), 1e15f);
error = error * error;
summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
}
}
return summa.lane<0>();
}
/* See header for documentation. */
float compute_symbolic_block_difference_1plane_1partition(
const astcenc_config& config,
const block_size_descriptor& bsd,
const symbolic_compressed_block& scb,
const image_block& blk
) {
// If we detected an error-block, blow up immediately.
if (scb.block_type == SYM_BTYPE_ERROR)
{
return ERROR_CALC_DEFAULT;
}
assert(scb.block_mode >= 0);
assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1);
// Get the appropriate block descriptor
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
// Unquantize and undecimate the weights
ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
// Decode the color endpoints for this partition
vint4 ep0;
vint4 ep1;
bool rgb_lns;
bool a_lns;
unpack_color_endpoints(config.profile,
scb.color_formats[0],
scb.color_values[0],
rgb_lns, a_lns,
ep0, ep1);
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
// Unpack and compute error for each texel in the partition
vfloatacc summav = vfloatacc::zero();
vint lane_id = vint::lane_id();
unsigned int texel_count = bsd.texel_count;
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
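// Weights are 6-bit fixed point (0..64), so per channel the decode is
// color = (ep0 * (64 - w) + ep1 * w + 32) >> 6, with round-to-nearest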
// Compute EP1 contribution
vint weight1 = vint::loada(plane1_weights + i);
vint ep1_r = vint(ep1.lane<0>()) * weight1;
vint ep1_g = vint(ep1.lane<1>()) * weight1;
vint ep1_b = vint(ep1.lane<2>()) * weight1;
vint ep1_a = vint(ep1.lane<3>()) * weight1;
// Compute EP0 contribution
vint weight0 = vint(64) - weight1;
vint ep0_r = vint(ep0.lane<0>()) * weight0;
vint ep0_g = vint(ep0.lane<1>()) * weight0;
vint ep0_b = vint(ep0.lane<2>()) * weight0;
vint ep0_a = vint(ep0.lane<3>()) * weight0;
// Combine contributions
vint colori_r = asr<6>(ep0_r + ep1_r + vint(32));
vint colori_g = asr<6>(ep0_g + ep1_g + vint(32));
vint colori_b = asr<6>(ep0_b + ep1_b + vint(32));
vint colori_a = asr<6>(ep0_a + ep1_a + vint(32));
// If using a U8 decode mode, bit replicate the top 8 bits
// so the rest of the codec can assume 0xFFFF max range everywhere
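// (asr<8>(v) * 257 replicates the top byte, e.g. 0xAB00 -> 0xAB -> 0xABAB)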
vint colori_r8 = asr<8>(colori_r) * vint(257);
colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>()));
vint colori_g8 = asr<8>(colori_g) * vint(257);
colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>()));
vint colori_b8 = asr<8>(colori_b) * vint(257);
colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>()));
vint colori_a8 = asr<8>(colori_a) * vint(257);
colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>()));
// Compute color diff
vfloat color_r = int_to_float(colori_r);
vfloat color_g = int_to_float(colori_g);
vfloat color_b = int_to_float(colori_b);
vfloat color_a = int_to_float(colori_a);
vfloat color_orig_r = loada(blk.data_r + i);
vfloat color_orig_g = loada(blk.data_g + i);
vfloat color_orig_b = loada(blk.data_b + i);
vfloat color_orig_a = loada(blk.data_a + i);
vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f));
vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f));
vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f));
vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f));
// Compute squared error metric
color_error_r = color_error_r * color_error_r;
color_error_g = color_error_g * color_error_g;
color_error_b = color_error_b * color_error_b;
color_error_a = color_error_a * color_error_a;
vfloat metric = color_error_r * blk.channel_weight.lane<0>()
+ color_error_g * blk.channel_weight.lane<1>()
+ color_error_b * blk.channel_weight.lane<2>()
+ color_error_a * blk.channel_weight.lane<3>();
// Mask off bad lanes
vmask mask = lane_id < vint(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
haccumulate(summav, metric, mask);
}
return hadd_s(summav);
}
#endif

245
thirdparty/astcenc/astcenc_diagnostic_trace.cpp vendored Normal file

@@ -0,0 +1,245 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2021-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for the diagnostic trace output.
*/
#if defined(ASTCENC_DIAGNOSTICS)
#include <cassert>
#include <cstdarg>
#include <cstdio>
#include <cmath>
#include <limits>
#include <string>
#include "astcenc_diagnostic_trace.h"
/** @brief The global trace logger. */
static TraceLog* g_TraceLog = nullptr;
/** @brief The JSON per-level indentation width, in spaces. */
static const size_t g_trace_indent = 2;
TraceLog::TraceLog(
const char* file_name):
m_file(file_name, std::ofstream::out | std::ofstream::binary)
{
assert(!g_TraceLog);
g_TraceLog = this;
m_root = new TraceNode("root");
}
/* See header for documentation. */
TraceNode* TraceLog::get_current_leaf()
{
if (m_stack.size())
{
return m_stack.back();
}
return nullptr;
}
/* See header for documentation. */
size_t TraceLog::get_depth()
{
return m_stack.size();
}
/* See header for documentation. */
TraceLog::~TraceLog()
{
assert(g_TraceLog == this);
delete m_root;
g_TraceLog = nullptr;
}
/* See header for documentation. */
TraceNode::TraceNode(
const char* format,
...
) {
// Format the name string
constexpr size_t bufsz = 256;
char buffer[bufsz];
va_list args;
va_start (args, format);
vsnprintf (buffer, bufsz, format, args);
va_end (args);
// Guarantee there is a nul terminator
buffer[bufsz - 1] = 0;
// Generate the node
TraceNode* parent = g_TraceLog->get_current_leaf();
size_t depth = g_TraceLog->get_depth();
g_TraceLog->m_stack.push_back(this);
bool comma = parent && parent->m_attrib_count;
auto& out = g_TraceLog->m_file;
if (parent)
{
parent->m_attrib_count++;
}
if (comma)
{
out << ',';
}
if (depth)
{
out << '\n';
}
size_t out_indent = (depth * 2) * g_trace_indent;
size_t in_indent = (depth * 2 + 1) * g_trace_indent;
std::string out_indents("");
if (out_indent)
{
out_indents = std::string(out_indent, ' ');
}
std::string in_indents(in_indent, ' ');
out << out_indents << "[ \"node\", \"" << buffer << "\",\n";
out << in_indents << "[";
}
/* See header for documentation. */
void TraceNode::add_attrib(
std::string type,
std::string key,
std::string value
) {
(void)type;
size_t depth = g_TraceLog->get_depth();
size_t indent = (depth * 2) * g_trace_indent;
auto& out = g_TraceLog->m_file;
bool comma = m_attrib_count;
m_attrib_count++;
if (comma)
{
out << ',';
}
out << '\n';
out << std::string(indent, ' ') << "[ "
<< "\"" << key << "\", "
<< value << " ]";
}
/* See header for documentation. */
TraceNode::~TraceNode()
{
g_TraceLog->m_stack.pop_back();
auto& out = g_TraceLog->m_file;
size_t depth = g_TraceLog->get_depth();
size_t out_indent = (depth * 2) * g_trace_indent;
size_t in_indent = (depth * 2 + 1) * g_trace_indent;
std::string out_indents("");
if (out_indent)
{
out_indents = std::string(out_indent, ' ');
}
std::string in_indents(in_indent, ' ');
if (m_attrib_count)
{
out << "\n" << in_indents;
}
out << "]\n";
out << out_indents << "]";
}
/* See header for documentation. */
void trace_add_data(
const char* key,
const char* format,
...
) {
constexpr size_t bufsz = 256;
char buffer[bufsz];
va_list args;
va_start (args, format);
vsnprintf (buffer, bufsz, format, args);
va_end (args);
// Guarantee there is a nul terminator
buffer[bufsz - 1] = 0;
std::string value = "\"" + std::string(buffer) + "\"";
TraceNode* node = g_TraceLog->get_current_leaf();
node->add_attrib("str", key, value);
}
/* See header for documentation. */
void trace_add_data(
const char* key,
float value
) {
// Turn infinities into parseable values
if (std::isinf(value))
{
if (value > 0.0f)
{
value = std::numeric_limits<float>::max();
}
else
{
value = -std::numeric_limits<float>::max();
}
}
char buffer[256];
snprintf(buffer, sizeof(buffer), "%.20g", (double)value);
TraceNode* node = g_TraceLog->get_current_leaf();
node->add_attrib("float", key, buffer);
}
/* See header for documentation. */
void trace_add_data(
const char* key,
int value
) {
TraceNode* node = g_TraceLog->get_current_leaf();
node->add_attrib("int", key, std::to_string(value));
}
/* See header for documentation. */
void trace_add_data(
const char* key,
unsigned int value
) {
TraceNode* node = g_TraceLog->get_current_leaf();
node->add_attrib("int", key, std::to_string(value));
}
#endif

219
thirdparty/astcenc/astcenc_diagnostic_trace.h vendored Normal file

@@ -0,0 +1,219 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2021-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief This module provides a set of diagnostic tracing utilities.
*
* Overview
* ========
*
* The built-in diagnostic trace tool generates a hierarchical JSON tree structure. The tree
* hierarchy contains three levels:
*
* - block
* - pass
* - candidate
*
* One block node exists for each compressed block in the image. One pass node exists for each major
* pass (N partitions, M planes, O components) applied to a block. One candidate node exists for each
* encoding candidate trialed for a pass.
*
* Each node contains both the hierarchy and a number of attributes which explain the behavior.
* For example, the block node contains the block coordinates in the image, the pass explains the
* pass configuration, and the candidate will explain the candidate encoding such as weight
* decimation, refinement error, etc.
*
* Trace Nodes are designed as scope-managed C++ objects with stack-like push/pop behavior.
* Constructing a trace node on the stack will automatically add it to the current node as a child,
* and then make it the current node. Destroying the current node will pop the stack and set the
* parent to the current node. This provides a robust mechanism for ensuring reliable nesting in the
* tree structure.
*
* A set of utility macros are provided to add attribute annotations to the current trace node.
*
* Usage
* =====
*
* Create Trace Nodes on the stack using the @c TRACE_NODE() macro. This will compile-out completely
* in builds with diagnostics disabled.
*
* Add annotations to the current trace node using the @c trace_add_data() macro. This will
* similarly compile out completely in builds with diagnostics disabled.
*
* If you need to add additional code to support diagnostics-only behavior wrap
* it in preprocessor guards:
*
* #if defined(ASTCENC_DIAGNOSTICS)
* #endif
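*
* A minimal usage sketch (the file name and attribute values here are
* illustrative, not part of the library API):
*
*     TraceLog log("trace.json");       // global log; only one may exist
*     {
*         TRACE_NODE(node, "block");    // push a child of the current node
*         trace_add_data("pos_x", 13u); // annotate the current node
*     }                                 // scope exit pops back to the parent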
*/
#ifndef ASTCENC_DIAGNOSTIC_TRACE_INCLUDED
#define ASTCENC_DIAGNOSTIC_TRACE_INCLUDED
#if defined(ASTCENC_DIAGNOSTICS)
#include <iostream>
#include <fstream>
#include <vector>
/**
* @brief Class representing a single node in the trace hierarchy.
*/
class TraceNode
{
public:
/**
* @brief Construct a new node.
*
* Constructing a node will push to the top of the stack, automatically making it a child of
* the current node, and then setting it to become the current node.
*
* @param format The format template for the node name.
* @param ... The format parameters.
*/
TraceNode(const char* format, ...);
/**
* @brief Add an attribute to this node.
*
* Note that no quoting is applied to the @c value, so if quoting is needed it must be done by
* the caller.
*
* @param type The type of the attribute.
* @param key The key of the attribute.
* @param value The value of the attribute.
*/
void add_attrib(std::string type, std::string key, std::string value);
/**
* @brief Destroy this node.
*
* Destroying a node will pop it from the top of the stack, making its parent the current node.
* It is invalid behavior to destroy a node that is not the current node; usage must conform to
* stack push-pop semantics.
*/
~TraceNode();
/**
* @brief The number of attributes and child nodes in this node.
*/
unsigned int m_attrib_count { 0 };
};
/**
* @brief Class representing the trace log file being written.
*/
class TraceLog
{
public:
/**
* @brief Create a new trace log.
*
* The trace log is global; there can be only one at a time.
*
* @param file_name The name of the file to write.
*/
TraceLog(const char* file_name);
/**
* @brief Destroy the trace log.
*
* Trace logs MUST be cleanly destroyed to ensure the file gets written.
*/
~TraceLog();
/**
* @brief Get the current child node.
*
* @return The current leaf node.
*/
TraceNode* get_current_leaf();
/**
* @brief Get the stack depth of the current child node.
*
* @return The current leaf node stack depth.
*/
size_t get_depth();
/**
* @brief The file stream to write to.
*/
std::ofstream m_file;
/**
* @brief The stack of nodes (newest at the back).
*/
std::vector<TraceNode*> m_stack;
private:
/**
* @brief The root node in the JSON file.
*/
TraceNode* m_root;
};
/**
* @brief Utility macro to create a trace node on the stack.
*
* @param name The variable name to use.
* @param ... The name template and format parameters.
*/
#define TRACE_NODE(name, ...) TraceNode name(__VA_ARGS__);
/**
* @brief Add a string annotation to the current node.
*
* @param key The name of the attribute.
* @param format The format template for the attribute value.
* @param ... The format parameters.
*/
void trace_add_data(const char* key, const char* format, ...);
/**
* @brief Add a float annotation to the current node.
*
* @param key The name of the attribute.
* @param value The value of the attribute.
*/
void trace_add_data(const char* key, float value);
/**
* @brief Add an integer annotation to the current node.
*
* @param key The name of the attribute.
* @param value The value of the attribute.
*/
void trace_add_data(const char* key, int value);
/**
* @brief Add an unsigned integer annotation to the current node.
*
* @param key The name of the attribute.
* @param value The value of the attribute.
*/
void trace_add_data(const char* key, unsigned int value);
#else
#define TRACE_NODE(name, ...)
#define trace_add_data(...)
#endif
#endif

1413
thirdparty/astcenc/astcenc_entry.cpp vendored Normal file

File diff suppressed because it is too large.

781
thirdparty/astcenc/astcenc_find_best_partitioning.cpp vendored Normal file

@@ -0,0 +1,781 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/**
* @brief Functions for finding best partition for a block.
*
* The partition search operates in two stages. The first pass uses kmeans clustering to group
* texels into an ideal partitioning for the requested partition count, and then compares that
* against the 1024 partitionings generated by the ASTC partition hash function. The generated
* partitions are then ranked by the number of texels in the wrong partition, compared to the ideal
* clustering. All 1024 partitionings are ranked this way, skipping duplicates and partitionings
* that generate fewer than the requested partition count; only the top
* N candidates are then put through a more detailed search. N is determined by the compressor
* quality preset.
*
* For the detailed search, each candidate is checked against two possible encoding methods:
*
* - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints).
* - The best partitioning assuming same chroma colors (RGB + scale endpoints).
*
* This is implemented by computing the mean color and dominant direction for each
* partition. This defines two lines, both of which go through the mean color value.
*
* - One line has a direction defined by the dominant direction; this is used to assess the error
* from using an uncorrelated color representation.
* - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma
* (RGB + scale) color representation.
*
* The best candidate is selected by computing the squared-errors that result from using these
* lines for endpoint selection.
*/
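// As an illustrative sketch (not a function in this file), the channel-weighted
// squared distance from a color c to a processed line {amod, bs}, with bs unit
// length, is computed as:
//
//   vfloat4 p = pline.amod + pline.bs * dot(c, pline.bs); // closest point on line
//   vfloat4 d = c - p;
//   float sq_err = dot_s(d * d, blk.channel_weight);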
#include <limits>
#include "astcenc_internal.h"
/**
* @brief Pick some initial kmeans cluster centers.
*
* @param blk The image block color data to compress.
* @param texel_count The number of texels in the block.
* @param partition_count The number of partitions in the block.
* @param[out] cluster_centers The initial partition cluster center colors.
*/
static void kmeans_init(
const image_block& blk,
unsigned int texel_count,
unsigned int partition_count,
vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]
) {
promise(texel_count > 0);
promise(partition_count > 0);
unsigned int clusters_selected = 0;
float distances[BLOCK_MAX_TEXELS];
// Pick a random sample as the first cluster center; 145897 from random.org
unsigned int sample = 145897 % texel_count;
vfloat4 center_color = blk.texel(sample);
cluster_centers[clusters_selected] = center_color;
clusters_selected++;
// Compute the distance to the first cluster center
float distance_sum = 0.0f;
for (unsigned int i = 0; i < texel_count; i++)
{
vfloat4 color = blk.texel(i);
vfloat4 diff = color - center_color;
float distance = dot_s(diff * diff, blk.channel_weight);
distance_sum += distance;
distances[i] = distance;
}
// More numbers from random.org for weighted-random center selection
const float cluster_cutoffs[9] {
0.626220f, 0.932770f, 0.275454f,
0.318558f, 0.240113f, 0.009190f,
0.347661f, 0.731960f, 0.156391f
};
unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2);
// Pick the remaining samples as needed
while (true)
{
// Pick the next center in a weighted-random fashion.
float summa = 0.0f;
float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++];
for (sample = 0; sample < texel_count; sample++)
{
summa += distances[sample];
if (summa >= distance_cutoff)
{
break;
}
}
// Clamp to a valid range and store the selected cluster center
sample = astc::min(sample, texel_count - 1);
center_color = blk.texel(sample);
cluster_centers[clusters_selected++] = center_color;
if (clusters_selected >= partition_count)
{
break;
}
// Compute the distance to the new cluster center, keep the min dist
distance_sum = 0.0f;
for (unsigned int i = 0; i < texel_count; i++)
{
vfloat4 color = blk.texel(i);
vfloat4 diff = color - center_color;
float distance = dot_s(diff * diff, blk.channel_weight);
distance = astc::min(distance, distances[i]);
distance_sum += distance;
distances[i] = distance;
}
}
}
/**
* @brief Assign texels to clusters, based on a set of chosen center points.
*
* @param blk The image block color data to compress.
* @param texel_count The number of texels in the block.
* @param partition_count The number of partitions in the block.
* @param cluster_centers The partition cluster center colors.
* @param[out] partition_of_texel The partition assigned for each texel.
*/
static void kmeans_assign(
const image_block& blk,
unsigned int texel_count,
unsigned int partition_count,
const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
) {
promise(texel_count > 0);
promise(partition_count > 0);
uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
// Find the best partition for every texel
for (unsigned int i = 0; i < texel_count; i++)
{
float best_distance = std::numeric_limits<float>::max();
unsigned int best_partition = 0;
vfloat4 color = blk.texel(i);
for (unsigned int j = 0; j < partition_count; j++)
{
vfloat4 diff = color - cluster_centers[j];
float distance = dot_s(diff * diff, blk.channel_weight);
if (distance < best_distance)
{
best_distance = distance;
best_partition = j;
}
}
partition_of_texel[i] = static_cast<uint8_t>(best_partition);
partition_texel_count[best_partition]++;
}
// It is possible to get a situation where a partition ends up without any texels. In this case,
// assign texel N to partition N. This is silly, but ensures that every partition retains at
// least one texel. Reassigning a texel in this manner may cause another partition to go empty,
// so if we actually did a reassignment, run the whole loop over again.
bool problem_case;
do
{
problem_case = false;
for (unsigned int i = 0; i < partition_count; i++)
{
if (partition_texel_count[i] == 0)
{
partition_texel_count[partition_of_texel[i]]--;
partition_texel_count[i]++;
partition_of_texel[i] = static_cast<uint8_t>(i);
problem_case = true;
}
}
} while (problem_case);
}
/**
* @brief Compute new cluster centers based on their center of gravity.
*
* @param blk The image block color data to compress.
* @param texel_count The number of texels in the block.
* @param partition_count The number of partitions in the block.
* @param[out] cluster_centers The new cluster center colors.
* @param partition_of_texel The partition assigned for each texel.
*/
static void kmeans_update(
const image_block& blk,
unsigned int texel_count,
unsigned int partition_count,
vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
const uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
) {
promise(texel_count > 0);
promise(partition_count > 0);
vfloat4 color_sum[BLOCK_MAX_PARTITIONS] {
vfloat4::zero(),
vfloat4::zero(),
vfloat4::zero(),
vfloat4::zero()
};
uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
// Find the center of gravity in each cluster
for (unsigned int i = 0; i < texel_count; i++)
{
uint8_t partition = partition_of_texel[i];
color_sum[partition] += blk.texel(i);
partition_texel_count[partition]++;
}
// Set the center of gravity to be the new cluster center
for (unsigned int i = 0; i < partition_count; i++)
{
float scale = 1.0f / static_cast<float>(partition_texel_count[i]);
cluster_centers[i] = color_sum[i] * scale;
}
}
/**
* @brief Compute bit-mismatch for partitioning in 2-partition mode.
*
* @param a The texel assignment bitvector for the block.
* @param b The texel assignment bitvector for the partition table.
*
* @return The number of bit mismatches.
*/
static inline uint8_t partition_mismatch2(
const uint64_t a[2],
const uint64_t b[2]
) {
int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);
int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);
// Divide by 2 because XOR always counts errors twice, once when missing
// in the expected position, and again when present in the wrong partition
return static_cast<uint8_t>(astc::min(v1, v2) / 2);
}
/**
* @brief Compute bit-mismatch for partitioning in 3-partition mode.
*
* @param a The texel assignment bitvector for the block.
* @param b The texel assignment bitvector for the partition table.
*
* @return The number of bit mismatches.
*/
static inline uint8_t partition_mismatch3(
const uint64_t a[3],
const uint64_t b[3]
) {
int p00 = popcount(a[0] ^ b[0]);
int p01 = popcount(a[0] ^ b[1]);
int p02 = popcount(a[0] ^ b[2]);
int p10 = popcount(a[1] ^ b[0]);
int p11 = popcount(a[1] ^ b[1]);
int p12 = popcount(a[1] ^ b[2]);
int p20 = popcount(a[2] ^ b[0]);
int p21 = popcount(a[2] ^ b[1]);
int p22 = popcount(a[2] ^ b[2]);
int s0 = p11 + p22;
int s1 = p12 + p21;
int v0 = astc::min(s0, s1) + p00;
int s2 = p10 + p22;
int s3 = p12 + p20;
int v1 = astc::min(s2, s3) + p01;
int s4 = p10 + p21;
int s5 = p11 + p20;
int v2 = astc::min(s4, s5) + p02;
// Divide by 2 because XOR always counts errors twice, once when missing
// in the expected position, and again when present in the wrong partition
return static_cast<uint8_t>(astc::min(v0, v1, v2) / 2);
}
/**
* @brief Compute bit-mismatch for partitioning in 4-partition mode.
*
* @param a The texel assignment bitvector for the block.
* @param b The texel assignment bitvector for the partition table.
*
* @return The number of bit mismatches.
*/
static inline uint8_t partition_mismatch4(
const uint64_t a[4],
const uint64_t b[4]
) {
int p00 = popcount(a[0] ^ b[0]);
int p01 = popcount(a[0] ^ b[1]);
int p02 = popcount(a[0] ^ b[2]);
int p03 = popcount(a[0] ^ b[3]);
int p10 = popcount(a[1] ^ b[0]);
int p11 = popcount(a[1] ^ b[1]);
int p12 = popcount(a[1] ^ b[2]);
int p13 = popcount(a[1] ^ b[3]);
int p20 = popcount(a[2] ^ b[0]);
int p21 = popcount(a[2] ^ b[1]);
int p22 = popcount(a[2] ^ b[2]);
int p23 = popcount(a[2] ^ b[3]);
int p30 = popcount(a[3] ^ b[0]);
int p31 = popcount(a[3] ^ b[1]);
int p32 = popcount(a[3] ^ b[2]);
int p33 = popcount(a[3] ^ b[3]);
int mx23 = astc::min(p22 + p33, p23 + p32);
int mx13 = astc::min(p21 + p33, p23 + p31);
int mx12 = astc::min(p21 + p32, p22 + p31);
int mx03 = astc::min(p20 + p33, p23 + p30);
int mx02 = astc::min(p20 + p32, p22 + p30);
int mx01 = astc::min(p21 + p30, p20 + p31);
int v0 = p00 + astc::min(p11 + mx23, p12 + mx13, p13 + mx12);
int v1 = p01 + astc::min(p10 + mx23, p12 + mx03, p13 + mx02);
int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01);
int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12);
// Divide by 2 because XOR always counts errors twice, once when missing
// in the expected position, and again when present in the wrong partition
return static_cast<uint8_t>(astc::min(v0, v1, v2, v3) / 2);
}
using mismatch_dispatch = uint8_t (*)(const uint64_t*, const uint64_t*);
/**
* @brief Count the partition table mismatches vs the data clustering.
*
* @param bsd The block size information.
* @param partition_count The number of partitions in the block.
* @param bitmaps The block texel partition assignment patterns.
* @param[out] mismatch_counts The array storing per partitioning mismatch counts.
*/
static void count_partition_mismatch_bits(
const block_size_descriptor& bsd,
unsigned int partition_count,
const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS]
) {
unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
promise(active_count > 0);
if (partition_count == 2)
{
for (unsigned int i = 0; i < active_count; i++)
{
mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
assert(mismatch_counts[i] < bsd.texel_count);
}
}
else if (partition_count == 3)
{
for (unsigned int i = 0; i < active_count; i++)
{
mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
assert(mismatch_counts[i] < bsd.texel_count);
}
}
else
{
for (unsigned int i = 0; i < active_count; i++)
{
mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
assert(mismatch_counts[i] < bsd.texel_count);
}
}
}
/**
* @brief Use counting sort on the mismatch array to sort partition candidates.
*
* @param texel_count The number of texels used to compute the mismatch counts.
* @param partitioning_count The number of packed partitionings.
* @param mismatch_count Partitioning mismatch counts, in index order.
* @param[out] partition_ordering Partition index values, in mismatch order.
*
* @return The number of active partitions in this selection.
*/
static unsigned int get_partition_ordering_by_mismatch_bits(
unsigned int texel_count,
unsigned int partitioning_count,
const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS],
uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
) {
promise(partitioning_count > 0);
uint16_t mscount[BLOCK_MAX_KMEANS_TEXELS] { 0 };
// Create the histogram of mismatch counts
for (unsigned int i = 0; i < partitioning_count; i++)
{
mscount[mismatch_count[i]]++;
}
// Create a running sum from the histogram array
// Indices store previous values only; i.e. exclude self after sum
uint16_t sum = 0;
for (unsigned int i = 0; i < texel_count; i++)
{
uint16_t cnt = mscount[i];
mscount[i] = sum;
sum += cnt;
}
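// Worked example: mismatch counts {2, 0, 2, 1} give the histogram {1, 1, 2},
// exclusive prefix sums {0, 1, 2}, and hence the ordering {1, 3, 0, 2} below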
// Use the running sum as the index, incrementing after read to allow
// sequential entries with the same count
for (unsigned int i = 0; i < partitioning_count; i++)
{
unsigned int idx = mscount[mismatch_count[i]]++;
partition_ordering[idx] = static_cast<uint16_t>(i);
}
return partitioning_count;
}
/**
* @brief Use k-means clustering to compute a partition ordering for a block.
*
* @param bsd The block size information.
* @param blk The image block color data to compress.
* @param partition_count The desired number of partitions in the block.
* @param[out] partition_ordering The list of recommended partition indices, in priority order.
*
* @return The number of active partitionings in this selection.
*/
static unsigned int compute_kmeans_partition_ordering(
const block_size_descriptor& bsd,
const image_block& blk,
unsigned int partition_count,
uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
) {
vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS];
uint8_t texel_partitions[BLOCK_MAX_TEXELS];
// Use three passes of k-means clustering to partition the block data
for (unsigned int i = 0; i < 3; i++)
{
if (i == 0)
{
kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers);
}
else
{
kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
}
kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
}
// Construct the block bitmaps of texel assignments to each partition
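// Each partition gets one 64-bit coverage mask; bit i is set when kmeans
// texel i was assigned to that partition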
uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 };
unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
promise(texels_to_process > 0);
for (unsigned int i = 0; i < texels_to_process; i++)
{
unsigned int idx = bsd.kmeans_texels[i];
bitmaps[texel_partitions[idx]] |= 1ULL << i;
}
// Count the mismatch between the block and the format's partition tables
uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS];
count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);
// Sort the partitions based on the number of mismatched bits
return get_partition_ordering_by_mismatch_bits(
texels_to_process,
bsd.partitioning_count_selected[partition_count - 1],
mismatch_counts, partition_ordering);
}
/**
* @brief Insert a partitioning into an ordered list of results, sorted by error.
*
* @param max_values The max number of entries in the best result arrays.
* @param this_error The error of the new entry.
* @param this_partition The partition ID of the new entry.
* @param[out] best_errors The array of best error values.
* @param[out] best_partitions The array of best partition values.
*/
static void insert_result(
unsigned int max_values,
float this_error,
unsigned int this_partition,
float* best_errors,
unsigned int* best_partitions)
{
promise(max_values > 0);
// Don't bother searching if the current worst error beats the new error
if (this_error >= best_errors[max_values - 1])
{
return;
}
// Else insert into the list in error-order
for (unsigned int i = 0; i < max_values; i++)
{
// Existing result is better - move on ...
if (this_error > best_errors[i])
{
continue;
}
// Move existing results down one
for (unsigned int j = max_values - 1; j > i; j--)
{
best_errors[j] = best_errors[j - 1];
best_partitions[j] = best_partitions[j - 1];
}
// Insert new result
best_errors[i] = this_error;
best_partitions[i] = this_partition;
break;
}
}
/* See header for documentation. */
unsigned int find_best_partition_candidates(
const block_size_descriptor& bsd,
const image_block& blk,
unsigned int partition_count,
unsigned int partition_search_limit,
unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
unsigned int requested_candidates
) {
// Constant used to estimate quantization error for a given partitioning; the optimal value for
// this depends on bitrate. These values have been determined empirically.
unsigned int texels_per_block = bsd.texel_count;
float weight_imprecision_estim = 0.055f;
if (texels_per_block <= 20)
{
weight_imprecision_estim = 0.03f;
}
else if (texels_per_block <= 31)
{
weight_imprecision_estim = 0.04f;
}
else if (texels_per_block <= 41)
{
weight_imprecision_estim = 0.05f;
}
promise(partition_count > 0);
promise(partition_search_limit > 0);
weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
uint16_t partition_sequence[BLOCK_MAX_PARTITIONINGS];
unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
partition_search_limit = astc::min(partition_search_limit, sequence_len);
requested_candidates = astc::min(partition_search_limit, requested_candidates);
bool uses_alpha = !blk.is_constant_channel(3);
// Partitioning errors assuming uncorrelated-chrominance endpoints
float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
// Partitioning errors assuming same-chrominance endpoints
float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
for (unsigned int i = 0; i < requested_candidates; i++)
{
uncor_best_errors[i] = ERROR_CALC_DEFAULT;
samec_best_errors[i] = ERROR_CALC_DEFAULT;
}
if (uses_alpha)
{
for (unsigned int i = 0; i < partition_search_limit; i++)
{
unsigned int partition = partition_sequence[i];
const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
// Compute weighting to give to each component in each partition
partition_metrics pms[BLOCK_MAX_PARTITIONS];
compute_avgs_and_dirs_4_comp(pi, blk, pms);
line4 uncor_lines[BLOCK_MAX_PARTITIONS];
line4 samec_lines[BLOCK_MAX_PARTITIONS];
processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS];
processed_line4 samec_plines[BLOCK_MAX_PARTITIONS];
float line_lengths[BLOCK_MAX_PARTITIONS];
for (unsigned int j = 0; j < partition_count; j++)
{
partition_metrics& pm = pms[j];
uncor_lines[j].a = pm.avg;
uncor_lines[j].b = normalize_safe(pm.dir, unit4());
uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b);
uncor_plines[j].bs = uncor_lines[j].b;
samec_lines[j].a = vfloat4::zero();
samec_lines[j].b = normalize_safe(pm.avg, unit4());
samec_plines[j].amod = vfloat4::zero();
samec_plines[j].bs = samec_lines[j].b;
}
float uncor_error = 0.0f;
float samec_error = 0.0f;
compute_error_squared_rgba(pi,
blk,
uncor_plines,
samec_plines,
line_lengths,
uncor_error,
samec_error);
// Compute an estimate of error introduced by weight quantization imprecision.
// This error is computed as follows, for each partition
// 1: compute the principal-axis vector (full length) in error-space
// 2: convert the principal-axis vector to regular RGB-space
// 3: scale the vector by a constant that estimates average quantization error
// 4: for each texel, square the vector, then do a dot-product with the texel's
// error weight; sum up the results across all texels.
// 4(optimized): square the vector once, then do a dot-product with the average
// texel error, then multiply by the number of texels.
for (unsigned int j = 0; j < partition_count; j++)
{
float tpp = static_cast<float>(pi.partition_texel_count[j]);
vfloat4 error_weights(tpp * weight_imprecision_estim);
vfloat4 uncor_vector = uncor_lines[j].b * line_lengths[j];
vfloat4 samec_vector = samec_lines[j].b * line_lengths[j];
uncor_error += dot_s(uncor_vector * uncor_vector, error_weights);
samec_error += dot_s(samec_vector * samec_vector, error_weights);
}
insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
}
}
else
{
for (unsigned int i = 0; i < partition_search_limit; i++)
{
unsigned int partition = partition_sequence[i];
const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
// Compute weighting to give to each component in each partition
partition_metrics pms[BLOCK_MAX_PARTITIONS];
compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
partition_lines3 plines[BLOCK_MAX_PARTITIONS];
for (unsigned int j = 0; j < partition_count; j++)
{
partition_metrics& pm = pms[j];
partition_lines3& pl = plines[j];
pl.uncor_line.a = pm.avg;
pl.uncor_line.b = normalize_safe(pm.dir, unit3());
pl.samec_line.a = vfloat4::zero();
pl.samec_line.b = normalize_safe(pm.avg, unit3());
pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b);
pl.uncor_pline.bs = pl.uncor_line.b;
pl.samec_pline.amod = vfloat4::zero();
pl.samec_pline.bs = pl.samec_line.b;
}
float uncor_error = 0.0f;
float samec_error = 0.0f;
compute_error_squared_rgb(pi,
blk,
plines,
uncor_error,
samec_error);
// Compute an estimate of error introduced by weight quantization imprecision.
// This error is computed as follows, for each partition
// 1: compute the principal-axis vector (full length) in error-space
// 2: convert the principal-axis vector to regular RGB-space
// 3: scale the vector by a constant that estimates average quantization error
// 4: for each texel, square the vector, then do a dot-product with the texel's
// error weight; sum up the results across all texels.
// 4(optimized): square the vector once, then do a dot-product with the average
// texel error, then multiply by the number of texels.
for (unsigned int j = 0; j < partition_count; j++)
{
partition_lines3& pl = plines[j];
float tpp = static_cast<float>(pi.partition_texel_count[j]);
vfloat4 error_weights(tpp * weight_imprecision_estim);
vfloat4 uncor_vector = pl.uncor_line.b * pl.line_length;
vfloat4 samec_vector = pl.samec_line.b * pl.line_length;
uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights);
samec_error += dot3_s(samec_vector * samec_vector, error_weights);
}
insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
}
}
unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES];
for (unsigned int i = 0; i < requested_candidates; i++)
{
interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
}
uint64_t bitmasks[1024/64] { 0 };
unsigned int emitted = 0;
// Deduplicate the first "requested" entries
for (unsigned int i = 0; i < requested_candidates * 2; i++)
{
unsigned int partition = interleave[i];
unsigned int word = partition / 64;
unsigned int bit = partition % 64;
bool written = bitmasks[word] & (1ull << bit);
if (!written)
{
best_partitions[emitted] = partition;
bitmasks[word] |= 1ull << bit;
emitted++;
if (emitted == requested_candidates)
{
break;
}
}
}
return emitted;
}
#endif

File diff suppressed because it is too large.

558
thirdparty/astcenc/astcenc_image.cpp vendored Normal file

@@ -0,0 +1,558 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for creating in-memory ASTC image structures.
*/
#include <cassert>
#include <cstring>
#include "astcenc_internal.h"
/**
* @brief Loader pipeline function type for data fetch from memory.
*/
using pixel_loader = vfloat4(*)(const void*, int);
/**
* @brief Loader pipeline function type for swizzling data in a vector.
*/
using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&);
/**
* @brief Loader pipeline function type for converting data in a vector to LNS.
*/
using pixel_converter = vfloat4(*)(vfloat4, vmask4);
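// The three stages compose per texel, in order: fetch the raw data, apply any
// component swizzle, then convert to the internal 16-bit UNORM / LNS encoding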
/**
* @brief Load an 8-bit UNORM texel from a data array.
*
* @param data The data pointer.
* @param base_offset The index offset to the start of the pixel.
*/
static vfloat4 load_texel_u8(
const void* data,
int base_offset
) {
const uint8_t* data8 = static_cast<const uint8_t*>(data);
return int_to_float(vint4(data8 + base_offset)) / 255.0f;
}
/**
* @brief Load a 16-bit fp16 texel from a data array.
*
* @param data The data pointer.
* @param base_offset The index offset to the start of the pixel.
*/
static vfloat4 load_texel_f16(
const void* data,
int base_offset
) {
const uint16_t* data16 = static_cast<const uint16_t*>(data);
int r = data16[base_offset ];
int g = data16[base_offset + 1];
int b = data16[base_offset + 2];
int a = data16[base_offset + 3];
return float16_to_float(vint4(r, g, b, a));
}
/**
* @brief Load a 32-bit float texel from a data array.
*
* @param data The data pointer.
* @param base_offset The index offset to the start of the pixel.
*/
static vfloat4 load_texel_f32(
const void* data,
int base_offset
) {
const float* data32 = static_cast<const float*>(data);
return vfloat4(data32 + base_offset);
}
/**
* @brief Dummy no-op swizzle function.
*
* @param data The source RGBA vector to swizzle.
* @param swz The swizzle to use.
*/
static vfloat4 swz_texel_skip(
vfloat4 data,
const astcenc_swizzle& swz
) {
(void)swz;
return data;
}
/**
* @brief Swizzle a texel into a new arrangement.
*
* @param data The source RGBA vector to swizzle.
* @param swz The swizzle to use.
*/
static vfloat4 swz_texel(
vfloat4 data,
const astcenc_swizzle& swz
) {
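// Slots 0-3 hold the RGBA lanes; slots 4 and 5 hold the constant selectors
// used by ASTCENC_SWZ_0 and ASTCENC_SWZ_1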
ASTCENC_ALIGNAS float datas[6];
storea(data, datas);
datas[ASTCENC_SWZ_0] = 0.0f;
datas[ASTCENC_SWZ_1] = 1.0f;
return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]);
}
/**
* @brief Encode a texel that is entirely LDR linear.
*
* @param data The RGBA data to encode.
* @param lns_mask The mask for the HDR channels that need LNS encoding.
*/
static vfloat4 encode_texel_unorm(
vfloat4 data,
vmask4 lns_mask
) {
(void)lns_mask;
return data * 65535.0f;
}
/**
* @brief Encode a texel that includes at least some HDR LNS texels.
*
* @param data The RGBA data to encode.
* @param lns_mask The mask for the HDR channels that need LNS encoding.
*/
static vfloat4 encode_texel_lns(
vfloat4 data,
vmask4 lns_mask
) {
vfloat4 datav_unorm = data * 65535.0f;
vfloat4 datav_lns = float_to_lns(data);
return select(datav_unorm, datav_lns, lns_mask);
}
/* See header for documentation. */
void load_image_block(
astcenc_profile decode_mode,
const astcenc_image& img,
image_block& blk,
const block_size_descriptor& bsd,
unsigned int xpos,
unsigned int ypos,
unsigned int zpos,
const astcenc_swizzle& swz
) {
unsigned int xsize = img.dim_x;
unsigned int ysize = img.dim_y;
unsigned int zsize = img.dim_z;
blk.xpos = xpos;
blk.ypos = ypos;
blk.zpos = zpos;
// True if any non-identity swizzle
bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
(swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
int idx = 0;
vfloat4 data_min(1e38f);
vfloat4 data_mean(0.0f);
vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count));
vfloat4 data_max(-1e38f);
vmask4 grayscalev(true);
// This works because we impose the same choice everywhere during encode
uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) ||
(decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0;
uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 1 : 0;
vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns);
vmask4 lns_mask = use_lns != vint4::zero();
// Set up the function pointers for loading pipeline as needed
pixel_loader loader = load_texel_u8;
if (img.data_type == ASTCENC_TYPE_F16)
{
loader = load_texel_f16;
}
else if (img.data_type == ASTCENC_TYPE_F32)
{
loader = load_texel_f32;
}
pixel_swizzler swizzler = swz_texel_skip;
if (needs_swz)
{
swizzler = swz_texel;
}
pixel_converter converter = encode_texel_unorm;
if (any(lns_mask))
{
converter = encode_texel_lns;
}
for (unsigned int z = 0; z < bsd.zdim; z++)
{
unsigned int zi = astc::min(zpos + z, zsize - 1);
void* plane = img.data[zi];
for (unsigned int y = 0; y < bsd.ydim; y++)
{
unsigned int yi = astc::min(ypos + y, ysize - 1);
for (unsigned int x = 0; x < bsd.xdim; x++)
{
unsigned int xi = astc::min(xpos + x, xsize - 1);
vfloat4 datav = loader(plane, (4 * xsize * yi) + (4 * xi));
datav = swizzler(datav, swz);
datav = converter(datav, lns_mask);
// Compute block metadata
data_min = min(data_min, datav);
data_mean += datav * data_mean_scale;
data_max = max(data_max, datav);
grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
blk.data_r[idx] = datav.lane<0>();
blk.data_g[idx] = datav.lane<1>();
blk.data_b[idx] = datav.lane<2>();
blk.data_a[idx] = datav.lane<3>();
blk.rgb_lns[idx] = rgb_lns;
blk.alpha_lns[idx] = a_lns;
idx++;
}
}
}
// Reverse the encoding so we store the origin texel in the original input format
vfloat4 data_enc = blk.texel(0);
vfloat4 data_enc_unorm = data_enc / 65535.0f;
vfloat4 data_enc_lns = vfloat4::zero();
if (rgb_lns || a_lns)
{
data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc)));
}
blk.origin_texel = select(data_enc_unorm, data_enc_lns, lns_mask);
// Store block metadata
blk.data_min = data_min;
blk.data_mean = data_mean;
blk.data_max = data_max;
blk.grayscale = all(grayscalev);
}
/* See header for documentation. */
void load_image_block_fast_ldr(
astcenc_profile decode_mode,
const astcenc_image& img,
image_block& blk,
const block_size_descriptor& bsd,
unsigned int xpos,
unsigned int ypos,
unsigned int zpos,
const astcenc_swizzle& swz
) {
(void)swz;
(void)decode_mode;
unsigned int xsize = img.dim_x;
unsigned int ysize = img.dim_y;
blk.xpos = xpos;
blk.ypos = ypos;
blk.zpos = zpos;
vfloat4 data_min(1e38f);
vfloat4 data_mean = vfloat4::zero();
vfloat4 data_max(-1e38f);
vmask4 grayscalev(true);
int idx = 0;
const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]);
for (unsigned int y = ypos; y < ypos + bsd.ydim; y++)
{
unsigned int yi = astc::min(y, ysize - 1);
for (unsigned int x = xpos; x < xpos + bsd.xdim; x++)
{
unsigned int xi = astc::min(x, xsize - 1);
vint4 datavi = vint4(plane + (4 * xsize * yi) + (4 * xi));
vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f);
// Compute block metadata
data_min = min(data_min, datav);
data_mean += datav;
data_max = max(data_max, datav);
grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
blk.data_r[idx] = datav.lane<0>();
blk.data_g[idx] = datav.lane<1>();
blk.data_b[idx] = datav.lane<2>();
blk.data_a[idx] = datav.lane<3>();
idx++;
}
}
// Reverse the encoding so we store the origin texel in the original input format
blk.origin_texel = blk.texel(0) / 65535.0f;
// Store block metadata
blk.rgb_lns[0] = 0;
blk.alpha_lns[0] = 0;
blk.data_min = data_min;
blk.data_mean = data_mean / static_cast<float>(bsd.texel_count);
blk.data_max = data_max;
blk.grayscale = all(grayscalev);
}
/* See header for documentation. */
void store_image_block(
astcenc_image& img,
const image_block& blk,
const block_size_descriptor& bsd,
unsigned int xpos,
unsigned int ypos,
unsigned int zpos,
const astcenc_swizzle& swz
) {
unsigned int x_size = img.dim_x;
unsigned int x_start = xpos;
unsigned int x_end = astc::min(x_size, xpos + bsd.xdim);
unsigned int x_count = x_end - x_start;
unsigned int x_nudge = bsd.xdim - x_count;
unsigned int y_size = img.dim_y;
unsigned int y_start = ypos;
unsigned int y_end = astc::min(y_size, ypos + bsd.ydim);
unsigned int y_count = y_end - y_start;
unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim;
unsigned int z_size = img.dim_z;
unsigned int z_start = zpos;
unsigned int z_end = astc::min(z_size, zpos + bsd.zdim);
// True if any non-identity swizzle
bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
(swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
// True if any swizzle uses Z reconstruct
bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
(swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);
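// Z reconstruct rebuilds a unit normal's Z from X (stored in R) and Y (stored
// in A): z = sqrt(max(1 - x^2 - y^2, 0)), remapped between [-1, 1] and [0, 1]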
int idx = 0;
if (img.data_type == ASTCENC_TYPE_U8)
{
for (unsigned int z = z_start; z < z_end; z++)
{
// Fetch the image plane
uint8_t* data8 = static_cast<uint8_t*>(img.data[z]);
for (unsigned int y = y_start; y < y_end; y++)
{
uint8_t* data8_row = data8 + (4 * x_size * y) + (4 * x_start);
for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
{
unsigned int max_texels = ASTCENC_SIMD_WIDTH;
unsigned int used_texels = astc::min(x_count - x, max_texels);
// Unaligned load as rows are not always SIMD_WIDTH long
vfloat data_r(blk.data_r + idx);
vfloat data_g(blk.data_g + idx);
vfloat data_b(blk.data_b + idx);
vfloat data_a(blk.data_a + idx);
vint data_ri = float_to_int_rtn(min(data_r, 1.0f) * 255.0f);
vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f);
vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f);
vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f);
if (needs_swz)
{
vint swizzle_table[7];
swizzle_table[ASTCENC_SWZ_0] = vint(0);
swizzle_table[ASTCENC_SWZ_1] = vint(255);
swizzle_table[ASTCENC_SWZ_R] = data_ri;
swizzle_table[ASTCENC_SWZ_G] = data_gi;
swizzle_table[ASTCENC_SWZ_B] = data_bi;
swizzle_table[ASTCENC_SWZ_A] = data_ai;
if (needs_z)
{
vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
data_z = max(data_z, 0.0f);
data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);
swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
}
data_ri = swizzle_table[swz.r];
data_gi = swizzle_table[swz.g];
data_bi = swizzle_table[swz.b];
data_ai = swizzle_table[swz.a];
}
// Errors are NaN encoded - convert to magenta error color
// Branch is OK here - it is almost never true so predicts well
vmask nan_mask = data_r != data_r;
if (any(nan_mask))
{
data_ri = select(data_ri, vint(0xFF), nan_mask);
data_gi = select(data_gi, vint(0x00), nan_mask);
data_bi = select(data_bi, vint(0xFF), nan_mask);
data_ai = select(data_ai, vint(0xFF), nan_mask);
}
vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
vmask store_mask = vint::lane_id() < vint(used_texels);
store_lanes_masked(data8_row, data_rgbai, store_mask);
data8_row += ASTCENC_SIMD_WIDTH * 4;
idx += used_texels;
}
idx += x_nudge;
}
idx += y_nudge;
}
}
else if (img.data_type == ASTCENC_TYPE_F16)
{
for (unsigned int z = z_start; z < z_end; z++)
{
// Fetch the image plane
uint16_t* data16 = static_cast<uint16_t*>(img.data[z]);
for (unsigned int y = y_start; y < y_end; y++)
{
uint16_t* data16_row = data16 + (4 * x_size * y) + (4 * x_start);
for (unsigned int x = 0; x < x_count; x++)
{
vint4 color;
// NaNs are handled inline - no need to special case
if (needs_swz)
{
float data[7];
data[ASTCENC_SWZ_0] = 0.0f;
data[ASTCENC_SWZ_1] = 1.0f;
data[ASTCENC_SWZ_R] = blk.data_r[idx];
data[ASTCENC_SWZ_G] = blk.data_g[idx];
data[ASTCENC_SWZ_B] = blk.data_b[idx];
data[ASTCENC_SWZ_A] = blk.data_a[idx];
if (needs_z)
{
float xN = (data[0] * 2.0f) - 1.0f;
float yN = (data[3] * 2.0f) - 1.0f;
float zN = 1.0f - xN * xN - yN * yN;
if (zN < 0.0f)
{
zN = 0.0f;
}
data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
}
vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
color = float_to_float16(colorf);
}
else
{
vfloat4 colorf = blk.texel(idx);
color = float_to_float16(colorf);
}
// TODO: Vectorize with store N shorts?
data16_row[0] = static_cast<uint16_t>(color.lane<0>());
data16_row[1] = static_cast<uint16_t>(color.lane<1>());
data16_row[2] = static_cast<uint16_t>(color.lane<2>());
data16_row[3] = static_cast<uint16_t>(color.lane<3>());
data16_row += 4;
idx++;
}
idx += x_nudge;
}
idx += y_nudge;
}
}
else // if (img.data_type == ASTCENC_TYPE_F32)
{
assert(img.data_type == ASTCENC_TYPE_F32);
for (unsigned int z = z_start; z < z_end; z++)
{
// Fetch the image plane
float* data32 = static_cast<float*>(img.data[z]);
for (unsigned int y = y_start; y < y_end; y++)
{
float* data32_row = data32 + (4 * x_size * y) + (4 * x_start);
for (unsigned int x = 0; x < x_count; x++)
{
vfloat4 color = blk.texel(idx);
// NaNs are handled inline - no need to special case
if (needs_swz)
{
float data[7];
data[ASTCENC_SWZ_0] = 0.0f;
data[ASTCENC_SWZ_1] = 1.0f;
data[ASTCENC_SWZ_R] = color.lane<0>();
data[ASTCENC_SWZ_G] = color.lane<1>();
data[ASTCENC_SWZ_B] = color.lane<2>();
data[ASTCENC_SWZ_A] = color.lane<3>();
if (needs_z)
{
float xN = (data[0] * 2.0f) - 1.0f;
float yN = (data[3] * 2.0f) - 1.0f;
float zN = 1.0f - xN * xN - yN * yN;
if (zN < 0.0f)
{
zN = 0.0f;
}
data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
}
color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
}
store(color, data32_row);
data32_row += 4;
idx++;
}
idx += x_nudge;
}
idx += y_nudge;
}
}
}

739
thirdparty/astcenc/astcenc_integer_sequence.cpp vendored Normal file

@@ -0,0 +1,739 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for encoding/decoding Bounded Integer Sequence Encoding.
*/
#include "astcenc_internal.h"
#include <array>
/** @brief Unpacked quint triplets <low,middle,high> for each packed value */
// TODO: Bitpack these into a uint16_t?
static const uint8_t quints_of_integer[128][3] {
{0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0},
{4, 0, 0}, {0, 4, 0}, {4, 4, 0}, {4, 4, 4},
{0, 1, 0}, {1, 1, 0}, {2, 1, 0}, {3, 1, 0},
{4, 1, 0}, {1, 4, 0}, {4, 4, 1}, {4, 4, 4},
{0, 2, 0}, {1, 2, 0}, {2, 2, 0}, {3, 2, 0},
{4, 2, 0}, {2, 4, 0}, {4, 4, 2}, {4, 4, 4},
{0, 3, 0}, {1, 3, 0}, {2, 3, 0}, {3, 3, 0},
{4, 3, 0}, {3, 4, 0}, {4, 4, 3}, {4, 4, 4},
{0, 0, 1}, {1, 0, 1}, {2, 0, 1}, {3, 0, 1},
{4, 0, 1}, {0, 4, 1}, {4, 0, 4}, {0, 4, 4},
{0, 1, 1}, {1, 1, 1}, {2, 1, 1}, {3, 1, 1},
{4, 1, 1}, {1, 4, 1}, {4, 1, 4}, {1, 4, 4},
{0, 2, 1}, {1, 2, 1}, {2, 2, 1}, {3, 2, 1},
{4, 2, 1}, {2, 4, 1}, {4, 2, 4}, {2, 4, 4},
{0, 3, 1}, {1, 3, 1}, {2, 3, 1}, {3, 3, 1},
{4, 3, 1}, {3, 4, 1}, {4, 3, 4}, {3, 4, 4},
{0, 0, 2}, {1, 0, 2}, {2, 0, 2}, {3, 0, 2},
{4, 0, 2}, {0, 4, 2}, {2, 0, 4}, {3, 0, 4},
{0, 1, 2}, {1, 1, 2}, {2, 1, 2}, {3, 1, 2},
{4, 1, 2}, {1, 4, 2}, {2, 1, 4}, {3, 1, 4},
{0, 2, 2}, {1, 2, 2}, {2, 2, 2}, {3, 2, 2},
{4, 2, 2}, {2, 4, 2}, {2, 2, 4}, {3, 2, 4},
{0, 3, 2}, {1, 3, 2}, {2, 3, 2}, {3, 3, 2},
{4, 3, 2}, {3, 4, 2}, {2, 3, 4}, {3, 3, 4},
{0, 0, 3}, {1, 0, 3}, {2, 0, 3}, {3, 0, 3},
{4, 0, 3}, {0, 4, 3}, {0, 0, 4}, {1, 0, 4},
{0, 1, 3}, {1, 1, 3}, {2, 1, 3}, {3, 1, 3},
{4, 1, 3}, {1, 4, 3}, {0, 1, 4}, {1, 1, 4},
{0, 2, 3}, {1, 2, 3}, {2, 2, 3}, {3, 2, 3},
{4, 2, 3}, {2, 4, 3}, {0, 2, 4}, {1, 2, 4},
{0, 3, 3}, {1, 3, 3}, {2, 3, 3}, {3, 3, 3},
{4, 3, 3}, {3, 4, 3}, {0, 3, 4}, {1, 3, 4}
};
/** @brief Packed quint values for each unpacked value, indexed [hi][mid][lo]. */
static const uint8_t integer_of_quints[5][5][5] {
{
{0, 1, 2, 3, 4},
{8, 9, 10, 11, 12},
{16, 17, 18, 19, 20},
{24, 25, 26, 27, 28},
{5, 13, 21, 29, 6}
},
{
{32, 33, 34, 35, 36},
{40, 41, 42, 43, 44},
{48, 49, 50, 51, 52},
{56, 57, 58, 59, 60},
{37, 45, 53, 61, 14}
},
{
{64, 65, 66, 67, 68},
{72, 73, 74, 75, 76},
{80, 81, 82, 83, 84},
{88, 89, 90, 91, 92},
{69, 77, 85, 93, 22}
},
{
{96, 97, 98, 99, 100},
{104, 105, 106, 107, 108},
{112, 113, 114, 115, 116},
{120, 121, 122, 123, 124},
{101, 109, 117, 125, 30}
},
{
{102, 103, 70, 71, 38},
{110, 111, 78, 79, 46},
{118, 119, 86, 87, 54},
{126, 127, 94, 95, 62},
{39, 47, 55, 63, 31}
}
};
/** @brief Unpacked trit quintuplets <low,...,high> for each packed value */
// TODO: Bitpack these into a uint16_t?
static const uint8_t trits_of_integer[256][5] {
{0, 0, 0, 0, 0}, {1, 0, 0, 0, 0}, {2, 0, 0, 0, 0}, {0, 0, 2, 0, 0},
{0, 1, 0, 0, 0}, {1, 1, 0, 0, 0}, {2, 1, 0, 0, 0}, {1, 0, 2, 0, 0},
{0, 2, 0, 0, 0}, {1, 2, 0, 0, 0}, {2, 2, 0, 0, 0}, {2, 0, 2, 0, 0},
{0, 2, 2, 0, 0}, {1, 2, 2, 0, 0}, {2, 2, 2, 0, 0}, {2, 0, 2, 0, 0},
{0, 0, 1, 0, 0}, {1, 0, 1, 0, 0}, {2, 0, 1, 0, 0}, {0, 1, 2, 0, 0},
{0, 1, 1, 0, 0}, {1, 1, 1, 0, 0}, {2, 1, 1, 0, 0}, {1, 1, 2, 0, 0},
{0, 2, 1, 0, 0}, {1, 2, 1, 0, 0}, {2, 2, 1, 0, 0}, {2, 1, 2, 0, 0},
{0, 0, 0, 2, 2}, {1, 0, 0, 2, 2}, {2, 0, 0, 2, 2}, {0, 0, 2, 2, 2},
{0, 0, 0, 1, 0}, {1, 0, 0, 1, 0}, {2, 0, 0, 1, 0}, {0, 0, 2, 1, 0},
{0, 1, 0, 1, 0}, {1, 1, 0, 1, 0}, {2, 1, 0, 1, 0}, {1, 0, 2, 1, 0},
{0, 2, 0, 1, 0}, {1, 2, 0, 1, 0}, {2, 2, 0, 1, 0}, {2, 0, 2, 1, 0},
{0, 2, 2, 1, 0}, {1, 2, 2, 1, 0}, {2, 2, 2, 1, 0}, {2, 0, 2, 1, 0},
{0, 0, 1, 1, 0}, {1, 0, 1, 1, 0}, {2, 0, 1, 1, 0}, {0, 1, 2, 1, 0},
{0, 1, 1, 1, 0}, {1, 1, 1, 1, 0}, {2, 1, 1, 1, 0}, {1, 1, 2, 1, 0},
{0, 2, 1, 1, 0}, {1, 2, 1, 1, 0}, {2, 2, 1, 1, 0}, {2, 1, 2, 1, 0},
{0, 1, 0, 2, 2}, {1, 1, 0, 2, 2}, {2, 1, 0, 2, 2}, {1, 0, 2, 2, 2},
{0, 0, 0, 2, 0}, {1, 0, 0, 2, 0}, {2, 0, 0, 2, 0}, {0, 0, 2, 2, 0},
{0, 1, 0, 2, 0}, {1, 1, 0, 2, 0}, {2, 1, 0, 2, 0}, {1, 0, 2, 2, 0},
{0, 2, 0, 2, 0}, {1, 2, 0, 2, 0}, {2, 2, 0, 2, 0}, {2, 0, 2, 2, 0},
{0, 2, 2, 2, 0}, {1, 2, 2, 2, 0}, {2, 2, 2, 2, 0}, {2, 0, 2, 2, 0},
{0, 0, 1, 2, 0}, {1, 0, 1, 2, 0}, {2, 0, 1, 2, 0}, {0, 1, 2, 2, 0},
{0, 1, 1, 2, 0}, {1, 1, 1, 2, 0}, {2, 1, 1, 2, 0}, {1, 1, 2, 2, 0},
{0, 2, 1, 2, 0}, {1, 2, 1, 2, 0}, {2, 2, 1, 2, 0}, {2, 1, 2, 2, 0},
{0, 2, 0, 2, 2}, {1, 2, 0, 2, 2}, {2, 2, 0, 2, 2}, {2, 0, 2, 2, 2},
{0, 0, 0, 0, 2}, {1, 0, 0, 0, 2}, {2, 0, 0, 0, 2}, {0, 0, 2, 0, 2},
{0, 1, 0, 0, 2}, {1, 1, 0, 0, 2}, {2, 1, 0, 0, 2}, {1, 0, 2, 0, 2},
{0, 2, 0, 0, 2}, {1, 2, 0, 0, 2}, {2, 2, 0, 0, 2}, {2, 0, 2, 0, 2},
{0, 2, 2, 0, 2}, {1, 2, 2, 0, 2}, {2, 2, 2, 0, 2}, {2, 0, 2, 0, 2},
{0, 0, 1, 0, 2}, {1, 0, 1, 0, 2}, {2, 0, 1, 0, 2}, {0, 1, 2, 0, 2},
{0, 1, 1, 0, 2}, {1, 1, 1, 0, 2}, {2, 1, 1, 0, 2}, {1, 1, 2, 0, 2},
{0, 2, 1, 0, 2}, {1, 2, 1, 0, 2}, {2, 2, 1, 0, 2}, {2, 1, 2, 0, 2},
{0, 2, 2, 2, 2}, {1, 2, 2, 2, 2}, {2, 2, 2, 2, 2}, {2, 0, 2, 2, 2},
{0, 0, 0, 0, 1}, {1, 0, 0, 0, 1}, {2, 0, 0, 0, 1}, {0, 0, 2, 0, 1},
{0, 1, 0, 0, 1}, {1, 1, 0, 0, 1}, {2, 1, 0, 0, 1}, {1, 0, 2, 0, 1},
{0, 2, 0, 0, 1}, {1, 2, 0, 0, 1}, {2, 2, 0, 0, 1}, {2, 0, 2, 0, 1},
{0, 2, 2, 0, 1}, {1, 2, 2, 0, 1}, {2, 2, 2, 0, 1}, {2, 0, 2, 0, 1},
{0, 0, 1, 0, 1}, {1, 0, 1, 0, 1}, {2, 0, 1, 0, 1}, {0, 1, 2, 0, 1},
{0, 1, 1, 0, 1}, {1, 1, 1, 0, 1}, {2, 1, 1, 0, 1}, {1, 1, 2, 0, 1},
{0, 2, 1, 0, 1}, {1, 2, 1, 0, 1}, {2, 2, 1, 0, 1}, {2, 1, 2, 0, 1},
{0, 0, 1, 2, 2}, {1, 0, 1, 2, 2}, {2, 0, 1, 2, 2}, {0, 1, 2, 2, 2},
{0, 0, 0, 1, 1}, {1, 0, 0, 1, 1}, {2, 0, 0, 1, 1}, {0, 0, 2, 1, 1},
{0, 1, 0, 1, 1}, {1, 1, 0, 1, 1}, {2, 1, 0, 1, 1}, {1, 0, 2, 1, 1},
{0, 2, 0, 1, 1}, {1, 2, 0, 1, 1}, {2, 2, 0, 1, 1}, {2, 0, 2, 1, 1},
{0, 2, 2, 1, 1}, {1, 2, 2, 1, 1}, {2, 2, 2, 1, 1}, {2, 0, 2, 1, 1},
{0, 0, 1, 1, 1}, {1, 0, 1, 1, 1}, {2, 0, 1, 1, 1}, {0, 1, 2, 1, 1},
{0, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 1, 1}, {1, 1, 2, 1, 1},
{0, 2, 1, 1, 1}, {1, 2, 1, 1, 1}, {2, 2, 1, 1, 1}, {2, 1, 2, 1, 1},
{0, 1, 1, 2, 2}, {1, 1, 1, 2, 2}, {2, 1, 1, 2, 2}, {1, 1, 2, 2, 2},
{0, 0, 0, 2, 1}, {1, 0, 0, 2, 1}, {2, 0, 0, 2, 1}, {0, 0, 2, 2, 1},
{0, 1, 0, 2, 1}, {1, 1, 0, 2, 1}, {2, 1, 0, 2, 1}, {1, 0, 2, 2, 1},
{0, 2, 0, 2, 1}, {1, 2, 0, 2, 1}, {2, 2, 0, 2, 1}, {2, 0, 2, 2, 1},
{0, 2, 2, 2, 1}, {1, 2, 2, 2, 1}, {2, 2, 2, 2, 1}, {2, 0, 2, 2, 1},
{0, 0, 1, 2, 1}, {1, 0, 1, 2, 1}, {2, 0, 1, 2, 1}, {0, 1, 2, 2, 1},
{0, 1, 1, 2, 1}, {1, 1, 1, 2, 1}, {2, 1, 1, 2, 1}, {1, 1, 2, 2, 1},
{0, 2, 1, 2, 1}, {1, 2, 1, 2, 1}, {2, 2, 1, 2, 1}, {2, 1, 2, 2, 1},
{0, 2, 1, 2, 2}, {1, 2, 1, 2, 2}, {2, 2, 1, 2, 2}, {2, 1, 2, 2, 2},
{0, 0, 0, 1, 2}, {1, 0, 0, 1, 2}, {2, 0, 0, 1, 2}, {0, 0, 2, 1, 2},
{0, 1, 0, 1, 2}, {1, 1, 0, 1, 2}, {2, 1, 0, 1, 2}, {1, 0, 2, 1, 2},
{0, 2, 0, 1, 2}, {1, 2, 0, 1, 2}, {2, 2, 0, 1, 2}, {2, 0, 2, 1, 2},
{0, 2, 2, 1, 2}, {1, 2, 2, 1, 2}, {2, 2, 2, 1, 2}, {2, 0, 2, 1, 2},
{0, 0, 1, 1, 2}, {1, 0, 1, 1, 2}, {2, 0, 1, 1, 2}, {0, 1, 2, 1, 2},
{0, 1, 1, 1, 2}, {1, 1, 1, 1, 2}, {2, 1, 1, 1, 2}, {1, 1, 2, 1, 2},
{0, 2, 1, 1, 2}, {1, 2, 1, 1, 2}, {2, 2, 1, 1, 2}, {2, 1, 2, 1, 2},
{0, 2, 2, 2, 2}, {1, 2, 2, 2, 2}, {2, 2, 2, 2, 2}, {2, 1, 2, 2, 2}
};
/** @brief Packed trit values for each unpacked value, indexed [hi][][][][lo]. */
static const uint8_t integer_of_trits[3][3][3][3][3] {
{
{
{
{0, 1, 2},
{4, 5, 6},
{8, 9, 10}
},
{
{16, 17, 18},
{20, 21, 22},
{24, 25, 26}
},
{
{3, 7, 15},
{19, 23, 27},
{12, 13, 14}
}
},
{
{
{32, 33, 34},
{36, 37, 38},
{40, 41, 42}
},
{
{48, 49, 50},
{52, 53, 54},
{56, 57, 58}
},
{
{35, 39, 47},
{51, 55, 59},
{44, 45, 46}
}
},
{
{
{64, 65, 66},
{68, 69, 70},
{72, 73, 74}
},
{
{80, 81, 82},
{84, 85, 86},
{88, 89, 90}
},
{
{67, 71, 79},
{83, 87, 91},
{76, 77, 78}
}
}
},
{
{
{
{128, 129, 130},
{132, 133, 134},
{136, 137, 138}
},
{
{144, 145, 146},
{148, 149, 150},
{152, 153, 154}
},
{
{131, 135, 143},
{147, 151, 155},
{140, 141, 142}
}
},
{
{
{160, 161, 162},
{164, 165, 166},
{168, 169, 170}
},
{
{176, 177, 178},
{180, 181, 182},
{184, 185, 186}
},
{
{163, 167, 175},
{179, 183, 187},
{172, 173, 174}
}
},
{
{
{192, 193, 194},
{196, 197, 198},
{200, 201, 202}
},
{
{208, 209, 210},
{212, 213, 214},
{216, 217, 218}
},
{
{195, 199, 207},
{211, 215, 219},
{204, 205, 206}
}
}
},
{
{
{
{96, 97, 98},
{100, 101, 102},
{104, 105, 106}
},
{
{112, 113, 114},
{116, 117, 118},
{120, 121, 122}
},
{
{99, 103, 111},
{115, 119, 123},
{108, 109, 110}
}
},
{
{
{224, 225, 226},
{228, 229, 230},
{232, 233, 234}
},
{
{240, 241, 242},
{244, 245, 246},
{248, 249, 250}
},
{
{227, 231, 239},
{243, 247, 251},
{236, 237, 238}
}
},
{
{
{28, 29, 30},
{60, 61, 62},
{92, 93, 94}
},
{
{156, 157, 158},
{188, 189, 190},
{220, 221, 222}
},
{
{31, 63, 127},
{159, 191, 255},
{252, 253, 254}
}
}
}
};
/**
* @brief The number of bits, trits, and quints needed for a quant level.
*/
struct btq_count
{
/** @brief The number of bits. */
uint8_t bits:6;
/** @brief The number of trits. */
uint8_t trits:1;
/** @brief The number of quints. */
uint8_t quints:1;
};
/**
* @brief The table of bits, trits, and quints needed for a quant encode.
*/
static const std::array<btq_count, 21> btq_counts {{
{ 1, 0, 0 }, // QUANT_2
{ 0, 1, 0 }, // QUANT_3
{ 2, 0, 0 }, // QUANT_4
{ 0, 0, 1 }, // QUANT_5
{ 1, 1, 0 }, // QUANT_6
{ 3, 0, 0 }, // QUANT_8
{ 1, 0, 1 }, // QUANT_10
{ 2, 1, 0 }, // QUANT_12
{ 4, 0, 0 }, // QUANT_16
{ 2, 0, 1 }, // QUANT_20
{ 3, 1, 0 }, // QUANT_24
{ 5, 0, 0 }, // QUANT_32
{ 3, 0, 1 }, // QUANT_40
{ 4, 1, 0 }, // QUANT_48
{ 6, 0, 0 }, // QUANT_64
{ 4, 0, 1 }, // QUANT_80
{ 5, 1, 0 }, // QUANT_96
{ 7, 0, 0 }, // QUANT_128
{ 5, 0, 1 }, // QUANT_160
{ 6, 1, 0 }, // QUANT_192
{ 8, 0, 0 } // QUANT_256
}};
/**
* @brief The sequence scale, round, and divisors needed to compute sizing.
*
 * The length of a quantized sequence in bits is:
 *     (scale * <sequence_len> + round) / divisor
 * where round = divisor - 1 (a ceiling division), and the actual divisor is
 * reconstructed from the stored field as (divisor << 1) + 1.
*/
struct ise_size
{
/** @brief The scaling parameter. */
uint8_t scale:6;
/** @brief The divisor parameter. */
uint8_t divisor:2;
};
/**
* @brief The table of scale, round, and divisors needed for quant sizing.
*/
static const std::array<ise_size, 21> ise_sizes {{
{ 1, 0 }, // QUANT_2
{ 8, 2 }, // QUANT_3
{ 2, 0 }, // QUANT_4
{ 7, 1 }, // QUANT_5
{ 13, 2 }, // QUANT_6
{ 3, 0 }, // QUANT_8
{ 10, 1 }, // QUANT_10
{ 18, 2 }, // QUANT_12
{ 4, 0 }, // QUANT_16
{ 13, 1 }, // QUANT_20
{ 23, 2 }, // QUANT_24
{ 5, 0 }, // QUANT_32
{ 16, 1 }, // QUANT_40
{ 28, 2 }, // QUANT_48
{ 6, 0 }, // QUANT_64
{ 19, 1 }, // QUANT_80
{ 33, 2 }, // QUANT_96
{ 7, 0 }, // QUANT_128
{ 22, 1 }, // QUANT_160
{ 38, 2 }, // QUANT_192
{ 8, 0 } // QUANT_256
}};
/* See header for documentation. */
unsigned int get_ise_sequence_bitcount(
unsigned int character_count,
quant_method quant_level
) {
	// Cope with out-of-bounds values - input might be invalid
if (static_cast<size_t>(quant_level) >= ise_sizes.size())
{
		// Arbitrarily large number that's more than an ASTC block can hold
return 1024;
}
auto& entry = ise_sizes[quant_level];
unsigned int divisor = (entry.divisor << 1) + 1;
return (entry.scale * character_count + divisor - 1) / divisor;
}
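/* A worked example (illustrative): QUANT_3 stores one trit per value and has
 * the table entry { 8, 2 }, so divisor = (2 << 1) + 1 = 5 and a sequence of
 * 10 values needs (8 * 10 + 4) / 5 = 16 bits. This matches direct packing:
 * 10 trits form two blocks of 5, and each block packs into 8 bits.
 */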
/**
* @brief Write up to 8 bits at an arbitrary bit offset.
*
* The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may
* span two separate bytes in memory.
*
* @param value The value to write.
* @param bitcount The number of bits to write, starting from LSB.
* @param bitoffset The bit offset to store at, between 0 and 7.
* @param[in,out] ptr The data pointer to write to.
*/
static inline void write_bits(
unsigned int value,
unsigned int bitcount,
unsigned int bitoffset,
uint8_t ptr[2]
) {
unsigned int mask = (1 << bitcount) - 1;
value &= mask;
ptr += bitoffset >> 3;
bitoffset &= 7;
value <<= bitoffset;
mask <<= bitoffset;
mask = ~mask;
ptr[0] &= mask;
ptr[0] |= value;
ptr[1] &= mask >> 8;
ptr[1] |= value >> 8;
}
/**
* @brief Read up to 16 bits from two bytes.
*
* This function reads a packed N-bit field from two bytes in memory. The stored value must exist
 * within the two bytes, but can start at an arbitrary bit offset and span the two bytes in memory.
*
* @param bitcount The number of bits to read.
* @param bitoffset The bit offset to read from, between 0 and 7.
 * @param ptr           The data pointer to read from.
*
* @return The read value.
*/
static inline unsigned int read_bits(
unsigned int bitcount,
unsigned int bitoffset,
const uint8_t* ptr
) {
unsigned int mask = (1 << bitcount) - 1;
ptr += bitoffset >> 3;
bitoffset &= 7;
unsigned int value = ptr[0] | (ptr[1] << 8);
value >>= bitoffset;
value &= mask;
return value;
}
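/* A minimal round-trip sketch for the two helpers above (illustrative, not
 * part of the upstream API): a 5-bit value written at bit offset 6 spans
 * both bytes, and read_bits() recovers it exactly.
 *
 *     uint8_t buf[2] { 0, 0 };
 *     write_bits(0x1B, 5, 6, buf);             // buf[0] == 0xC0, buf[1] == 0x06
 *     unsigned int v = read_bits(5, 6, buf);   // v == 0x1B
 */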
/* See header for documentation. */
void encode_ise(
quant_method quant_level,
unsigned int character_count,
const uint8_t* input_data,
uint8_t* output_data,
unsigned int bit_offset
) {
promise(character_count > 0);
unsigned int bits = btq_counts[quant_level].bits;
unsigned int trits = btq_counts[quant_level].trits;
unsigned int quints = btq_counts[quant_level].quints;
unsigned int mask = (1 << bits) - 1;
// Write out trits and bits
if (trits)
{
unsigned int i = 0;
unsigned int full_trit_blocks = character_count / 5;
for (unsigned int j = 0; j < full_trit_blocks; j++)
{
unsigned int i4 = input_data[i + 4] >> bits;
unsigned int i3 = input_data[i + 3] >> bits;
unsigned int i2 = input_data[i + 2] >> bits;
unsigned int i1 = input_data[i + 1] >> bits;
unsigned int i0 = input_data[i + 0] >> bits;
uint8_t T = integer_of_trits[i4][i3][i2][i1][i0];
// The max size of a trit bit count is 6, so we can always safely
// pack a single MX value with the following 1 or 2 T bits.
uint8_t pack;
// Element 0 + T0 + T1
pack = (input_data[i++] & mask) | (((T >> 0) & 0x3) << bits);
write_bits(pack, bits + 2, bit_offset, output_data);
bit_offset += bits + 2;
// Element 1 + T2 + T3
pack = (input_data[i++] & mask) | (((T >> 2) & 0x3) << bits);
write_bits(pack, bits + 2, bit_offset, output_data);
bit_offset += bits + 2;
// Element 2 + T4
pack = (input_data[i++] & mask) | (((T >> 4) & 0x1) << bits);
write_bits(pack, bits + 1, bit_offset, output_data);
bit_offset += bits + 1;
// Element 3 + T5 + T6
pack = (input_data[i++] & mask) | (((T >> 5) & 0x3) << bits);
write_bits(pack, bits + 2, bit_offset, output_data);
bit_offset += bits + 2;
// Element 4 + T7
pack = (input_data[i++] & mask) | (((T >> 7) & 0x1) << bits);
write_bits(pack, bits + 1, bit_offset, output_data);
bit_offset += bits + 1;
}
// Loop tail for a partial block
if (i != character_count)
{
// i4 cannot be present - we know the block is partial
// i0 must be present - we know the block isn't empty
unsigned int i4 = 0;
unsigned int i3 = i + 3 >= character_count ? 0 : input_data[i + 3] >> bits;
unsigned int i2 = i + 2 >= character_count ? 0 : input_data[i + 2] >> bits;
unsigned int i1 = i + 1 >= character_count ? 0 : input_data[i + 1] >> bits;
unsigned int i0 = input_data[i + 0] >> bits;
uint8_t T = integer_of_trits[i4][i3][i2][i1][i0];
for (unsigned int j = 0; i < character_count; i++, j++)
{
			// Truncated table as this iteration is always partial
static const uint8_t tbits[4] { 2, 2, 1, 2 };
static const uint8_t tshift[4] { 0, 2, 4, 5 };
uint8_t pack = (input_data[i] & mask) |
(((T >> tshift[j]) & ((1 << tbits[j]) - 1)) << bits);
write_bits(pack, bits + tbits[j], bit_offset, output_data);
bit_offset += bits + tbits[j];
}
}
}
// Write out quints and bits
else if (quints)
{
unsigned int i = 0;
unsigned int full_quint_blocks = character_count / 3;
for (unsigned int j = 0; j < full_quint_blocks; j++)
{
unsigned int i2 = input_data[i + 2] >> bits;
unsigned int i1 = input_data[i + 1] >> bits;
unsigned int i0 = input_data[i + 0] >> bits;
uint8_t T = integer_of_quints[i2][i1][i0];
// The max size of a quint bit count is 5, so we can always safely
// pack a single M value with the following 2 or 3 T bits.
uint8_t pack;
// Element 0
pack = (input_data[i++] & mask) | (((T >> 0) & 0x7) << bits);
write_bits(pack, bits + 3, bit_offset, output_data);
bit_offset += bits + 3;
// Element 1
pack = (input_data[i++] & mask) | (((T >> 3) & 0x3) << bits);
write_bits(pack, bits + 2, bit_offset, output_data);
bit_offset += bits + 2;
// Element 2
pack = (input_data[i++] & mask) | (((T >> 5) & 0x3) << bits);
write_bits(pack, bits + 2, bit_offset, output_data);
bit_offset += bits + 2;
}
// Loop tail for a partial block
if (i != character_count)
{
// i2 cannot be present - we know the block is partial
// i0 must be present - we know the block isn't empty
unsigned int i2 = 0;
unsigned int i1 = i + 1 >= character_count ? 0 : input_data[i + 1] >> bits;
unsigned int i0 = input_data[i + 0] >> bits;
uint8_t T = integer_of_quints[i2][i1][i0];
for (unsigned int j = 0; i < character_count; i++, j++)
{
			// Truncated table as this iteration is always partial
static const uint8_t tbits[2] { 3, 2 };
static const uint8_t tshift[2] { 0, 3 };
uint8_t pack = (input_data[i] & mask) |
(((T >> tshift[j]) & ((1 << tbits[j]) - 1)) << bits);
write_bits(pack, bits + tbits[j], bit_offset, output_data);
bit_offset += bits + tbits[j];
}
}
}
// Write out just bits
else
{
for (unsigned int i = 0; i < character_count; i++)
{
write_bits(input_data[i], bits, bit_offset, output_data);
bit_offset += bits;
}
}
}
/* See header for documentation. */
void decode_ise(
quant_method quant_level,
unsigned int character_count,
const uint8_t* input_data,
uint8_t* output_data,
unsigned int bit_offset
) {
promise(character_count > 0);
// Note: due to how the trit/quint-block unpacking is done in this function, we may write more
	// temporary results than the number of outputs. The maximum actual number of results is 64,
	// but we keep 4 additional entries of padding.
uint8_t results[68];
uint8_t tq_blocks[22] { 0 }; // Trit-blocks or quint-blocks, must be zeroed
unsigned int bits = btq_counts[quant_level].bits;
unsigned int trits = btq_counts[quant_level].trits;
unsigned int quints = btq_counts[quant_level].quints;
unsigned int lcounter = 0;
unsigned int hcounter = 0;
// Collect bits for each element, as well as bits for any trit-blocks and quint-blocks.
for (unsigned int i = 0; i < character_count; i++)
{
results[i] = static_cast<uint8_t>(read_bits(bits, bit_offset, input_data));
bit_offset += bits;
if (trits)
{
static const uint8_t bits_to_read[5] { 2, 2, 1, 2, 1 };
static const uint8_t block_shift[5] { 0, 2, 4, 5, 7 };
static const uint8_t next_lcounter[5] { 1, 2, 3, 4, 0 };
static const uint8_t hcounter_incr[5] { 0, 0, 0, 0, 1 };
unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
bit_offset += bits_to_read[lcounter];
tq_blocks[hcounter] |= tdata << block_shift[lcounter];
hcounter += hcounter_incr[lcounter];
lcounter = next_lcounter[lcounter];
}
if (quints)
{
static const uint8_t bits_to_read[3] { 3, 2, 2 };
static const uint8_t block_shift[3] { 0, 3, 5 };
static const uint8_t next_lcounter[3] { 1, 2, 0 };
static const uint8_t hcounter_incr[3] { 0, 0, 1 };
unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
bit_offset += bits_to_read[lcounter];
tq_blocks[hcounter] |= tdata << block_shift[lcounter];
hcounter += hcounter_incr[lcounter];
lcounter = next_lcounter[lcounter];
}
}
// Unpack trit-blocks or quint-blocks as needed
if (trits)
{
unsigned int trit_blocks = (character_count + 4) / 5;
promise(trit_blocks > 0);
for (unsigned int i = 0; i < trit_blocks; i++)
{
const uint8_t *tritptr = trits_of_integer[tq_blocks[i]];
results[5 * i ] |= tritptr[0] << bits;
results[5 * i + 1] |= tritptr[1] << bits;
results[5 * i + 2] |= tritptr[2] << bits;
results[5 * i + 3] |= tritptr[3] << bits;
results[5 * i + 4] |= tritptr[4] << bits;
}
}
if (quints)
{
unsigned int quint_blocks = (character_count + 2) / 3;
promise(quint_blocks > 0);
for (unsigned int i = 0; i < quint_blocks; i++)
{
const uint8_t *quintptr = quints_of_integer[tq_blocks[i]];
results[3 * i ] |= quintptr[0] << bits;
results[3 * i + 1] |= quintptr[1] << bits;
results[3 * i + 2] |= quintptr[2] << bits;
}
}
for (unsigned int i = 0; i < character_count; i++)
{
output_data[i] = results[i];
}
}
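/* A round-trip usage sketch (illustrative, not part of the upstream API):
 * pack five QUANT_3 weights and decode them back. QUANT_3 uses no raw bits,
 * so the five trits pack into a single 8-bit trit block.
 *
 *     const uint8_t weights[5] { 0, 1, 2, 1, 0 };   // values in [0, 2]
 *     uint8_t packed[16] { 0 };
 *     uint8_t decoded[5];
 *     encode_ise(QUANT_3, 5, weights, packed, 0);
 *     decode_ise(QUANT_3, 5, packed, decoded, 0);   // decoded == weights
 */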

2221
thirdparty/astcenc/astcenc_internal.h vendored Normal file

File diff suppressed because it is too large Load Diff

346
thirdparty/astcenc/astcenc_internal_entry.h vendored Normal file
View File

@@ -0,0 +1,346 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions and data declarations for the outer context.
*
* The outer context includes thread-pool management, which is slower to
* compile due to increased use of C++ stdlib. The inner context used in the
* majority of the codec library does not include this.
*/
#ifndef ASTCENC_INTERNAL_ENTRY_INCLUDED
#define ASTCENC_INTERNAL_ENTRY_INCLUDED
#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include "astcenc_internal.h"
/* ============================================================================
Parallel execution control
============================================================================ */
/**
* @brief A simple counter-based manager for parallel task execution.
*
* The task processing execution consists of:
*
* * A single-threaded init stage.
* * A multi-threaded processing stage.
* * A condition variable so threads can wait for processing completion.
*
 * The init stage will be executed by the first thread to arrive in the critical section; there is
* no main thread in the thread pool.
*
* The processing stage uses dynamic dispatch to assign task tickets to threads on an on-demand
 * basis. Threads may therefore each execute different numbers of tasks, depending on their
* processing complexity. The task queue and the task tickets are just counters; the caller must map
* these integers to an actual processing partition in a specific problem domain.
*
* The exit wait condition is needed to ensure processing has finished before a worker thread can
* progress to the next stage of the pipeline. Specifically a worker may exit the processing stage
* because there are no new tasks to assign to it while other worker threads are still processing.
 * Calling @c wait() will ensure that all other workers have finished before the thread can proceed.
*
* The basic usage model:
*
* // --------- From single-threaded code ---------
*
* // Reset the tracker state
* manager->reset()
*
* // --------- From multi-threaded code ---------
*
* // Run the stage init; only first thread actually runs the lambda
* manager->init(<lambda>)
*
* do
* {
* // Request a task assignment
* uint task_count;
 *     uint base_index = manager->get_task_assignment(<granule>, task_count);
*
* // Process any tasks we were given (task_count <= granule size)
* if (task_count)
* {
* // Run the user task processing code for N tasks here
* ...
*
* // Flag these tasks as complete
 *             manager->complete_task_assignment(task_count);
* }
* } while (task_count);
*
* // Wait for all threads to complete tasks before progressing
* manager->wait()
*
* // Run the stage term; only first thread actually runs the lambda
* manager->term(<lambda>)
*/
class ParallelManager
{
private:
/** @brief Lock used for critical section and condition synchronization. */
std::mutex m_lock;
/** @brief True if the current operation is cancelled. */
std::atomic<bool> m_is_cancelled;
/** @brief True if the stage init() step has been executed. */
bool m_init_done;
/** @brief True if the stage term() step has been executed. */
bool m_term_done;
/** @brief Condition variable for tracking stage processing completion. */
std::condition_variable m_complete;
/** @brief Number of tasks started, but not necessarily finished. */
std::atomic<unsigned int> m_start_count;
/** @brief Number of tasks finished. */
unsigned int m_done_count;
/** @brief Number of tasks that need to be processed. */
unsigned int m_task_count;
/** @brief Progress callback (optional). */
astcenc_progress_callback m_callback;
/** @brief Lock used for callback synchronization. */
std::mutex m_callback_lock;
/** @brief Minimum progress before making a callback. */
float m_callback_min_diff;
/** @brief Last progress callback value. */
float m_callback_last_value;
public:
/** @brief Create a new ParallelManager. */
ParallelManager()
{
reset();
}
/**
* @brief Reset the tracker for a new processing batch.
*
* This must be called from single-threaded code before starting the multi-threaded processing
* operations.
*/
void reset()
{
m_init_done = false;
m_term_done = false;
m_is_cancelled = false;
m_start_count = 0;
m_done_count = 0;
m_task_count = 0;
m_callback = nullptr;
m_callback_last_value = 0.0f;
m_callback_min_diff = 1.0f;
}
/**
* @brief Clear the tracker and stop new tasks being assigned.
*
* Note, all in-flight tasks in a worker will still complete normally.
*/
void cancel()
{
m_is_cancelled = true;
}
/**
* @brief Trigger the pipeline stage init step.
*
* This can be called from multi-threaded code. The first thread to hit this will process the
* initialization. Other threads will block and wait for it to complete.
*
* @param init_func Callable which executes the stage initialization. It must return the
* total number of tasks in the stage.
*/
void init(std::function<unsigned int(void)> init_func)
{
std::lock_guard<std::mutex> lck(m_lock);
if (!m_init_done)
{
m_task_count = init_func();
m_init_done = true;
}
}
/**
* @brief Trigger the pipeline stage init step.
*
* This can be called from multi-threaded code. The first thread to hit this will process the
* initialization. Other threads will block and wait for it to complete.
*
* @param task_count Total number of tasks needing processing.
* @param callback Function pointer for progress status callbacks.
*/
void init(unsigned int task_count, astcenc_progress_callback callback)
{
std::lock_guard<std::mutex> lck(m_lock);
if (!m_init_done)
{
m_callback = callback;
m_task_count = task_count;
m_init_done = true;
// Report every 1% or 4096 blocks, whichever is larger, to avoid callback overhead
float min_diff = (4096.0f / static_cast<float>(task_count)) * 100.0f;
m_callback_min_diff = astc::max(min_diff, 1.0f);
}
}
/**
* @brief Request a task assignment.
*
* Assign up to @c granule tasks to the caller for processing.
*
* @param granule Maximum number of tasks that can be assigned.
* @param[out] count Actual number of tasks assigned, or zero if no tasks were assigned.
*
* @return Task index of the first assigned task; assigned tasks increment from this.
*/
unsigned int get_task_assignment(unsigned int granule, unsigned int& count)
{
unsigned int base = m_start_count.fetch_add(granule, std::memory_order_relaxed);
if (m_is_cancelled || base >= m_task_count)
{
count = 0;
return 0;
}
count = astc::min(m_task_count - base, granule);
return base;
}
/**
* @brief Complete a task assignment.
*
* Mark @c count tasks as complete. This will notify all threads blocked on @c wait() if this
* completes the processing of the stage.
*
* @param count The number of completed tasks.
*/
void complete_task_assignment(unsigned int count)
{
// Note: m_done_count cannot use an atomic without the mutex; this has a race between the
// update here and the wait() for other threads
unsigned int local_count;
float local_last_value;
{
std::unique_lock<std::mutex> lck(m_lock);
m_done_count += count;
local_count = m_done_count;
local_last_value = m_callback_last_value;
// Ensure the progress bar hits 100%
if (m_callback && m_done_count == m_task_count)
{
std::unique_lock<std::mutex> cblck(m_callback_lock);
m_callback(100.0f);
m_callback_last_value = 100.0f;
}
// Notify if nothing left to do
if (m_is_cancelled || m_done_count == m_task_count)
{
lck.unlock();
m_complete.notify_all();
}
}
// Process progress callback if we have one
if (m_callback)
{
// Initial lockless test - have we progressed enough to emit?
float num = static_cast<float>(local_count);
float den = static_cast<float>(m_task_count);
float this_value = (num / den) * 100.0f;
bool report_test = (this_value - local_last_value) > m_callback_min_diff;
// Recheck under lock, because another thread might report first
if (report_test)
{
std::unique_lock<std::mutex> cblck(m_callback_lock);
bool report_retest = (this_value - m_callback_last_value) > m_callback_min_diff;
if (report_retest)
{
m_callback(this_value);
m_callback_last_value = this_value;
}
}
}
}
/**
* @brief Wait for stage processing to complete.
*/
void wait()
{
std::unique_lock<std::mutex> lck(m_lock);
m_complete.wait(lck, [this]{ return m_is_cancelled || m_done_count == m_task_count; });
}
/**
* @brief Trigger the pipeline stage term step.
*
* This can be called from multi-threaded code. The first thread to hit this will process the
* work pool termination. Caller must have called @c wait() prior to calling this function to
* ensure that processing is complete.
*
* @param term_func Callable which executes the stage termination.
*/
void term(std::function<void(void)> term_func)
{
std::lock_guard<std::mutex> lck(m_lock);
if (!m_term_done)
{
term_func();
m_term_done = true;
}
}
};
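/* A concrete worker-body sketch for this class (illustrative; the granule of
 * 16 and the fixed task count are assumptions, not part of the API):
 *
 *     manager->init([] { return 64u; });   // Stage has 64 tasks
 *
 *     unsigned int count = 0;
 *     do
 *     {
 *         unsigned int base = manager->get_task_assignment(16, count);
 *         for (unsigned int i = base; i < base + count; i++)
 *         {
 *             // ... process task i ...
 *         }
 *
 *         if (count)
 *         {
 *             manager->complete_task_assignment(count);
 *         }
 *     } while (count);
 *
 *     manager->wait();
 */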
/**
* @brief The astcenc compression context.
*/
struct astcenc_context
{
/** @brief The context internal state. */
astcenc_contexti context;
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/** @brief The parallel manager for averages computation. */
ParallelManager manage_avg;
/** @brief The parallel manager for compression. */
ParallelManager manage_compress;
#endif
/** @brief The parallel manager for decompression. */
ParallelManager manage_decompress;
};
#endif

48
thirdparty/astcenc/astcenc_mathlib.cpp vendored Normal file
View File

@@ -0,0 +1,48 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
#include "astcenc_mathlib.h"
/**
* @brief 64-bit rotate left.
*
* @param val The value to rotate.
* @param count The rotation, in bits.
*/
static inline uint64_t rotl(uint64_t val, int count)
{
return (val << count) | (val >> (64 - count));
}
/* See header for documentation. */
void astc::rand_init(uint64_t state[2])
{
state[0] = 0xfaf9e171cea1ec6bULL;
state[1] = 0xf1b318cc06af5d71ULL;
}
/* See header for documentation. */
uint64_t astc::rand(uint64_t state[2])
{
uint64_t s0 = state[0];
uint64_t s1 = state[1];
uint64_t res = s0 + s1;
s1 ^= s0;
state[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16);
state[1] = rotl(s1, 37);
return res;
}
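/* A usage sketch (illustrative): every state seeded by rand_init() yields the
 * same deterministic sequence, which keeps compressor output stable across
 * threads and platforms.
 *
 *     uint64_t state[2];
 *     astc::rand_init(state);
 *     uint64_t bits = astc::rand(state);   // Identical on every platform
 */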

505
thirdparty/astcenc/astcenc_mathlib.h vendored Normal file
View File

@@ -0,0 +1,505 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/*
* This module implements a variety of mathematical data types and library
* functions used by the codec.
*/
#ifndef ASTC_MATHLIB_H_INCLUDED
#define ASTC_MATHLIB_H_INCLUDED
#include <cassert>
#include <cstdint>
#include <cmath>
#ifndef ASTCENC_POPCNT
#if defined(__POPCNT__)
#define ASTCENC_POPCNT 1
#else
#define ASTCENC_POPCNT 0
#endif
#endif
#ifndef ASTCENC_F16C
#if defined(__F16C__)
#define ASTCENC_F16C 1
#else
#define ASTCENC_F16C 0
#endif
#endif
#ifndef ASTCENC_SSE
#if defined(__SSE4_2__)
#define ASTCENC_SSE 42
#elif defined(__SSE4_1__)
#define ASTCENC_SSE 41
#elif defined(__SSE2__) || (defined(_M_AMD64) && !defined(_M_ARM64EC))
#define ASTCENC_SSE 20
#else
#define ASTCENC_SSE 0
#endif
#endif
#ifndef ASTCENC_AVX
#if defined(__AVX2__)
#define ASTCENC_AVX 2
#define ASTCENC_X86_GATHERS 1
#elif defined(__AVX__)
#define ASTCENC_AVX 1
#define ASTCENC_X86_GATHERS 1
#else
#define ASTCENC_AVX 0
#endif
#endif
#ifndef ASTCENC_NEON
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#define ASTCENC_NEON 1
#else
#define ASTCENC_NEON 0
#endif
#endif
#ifndef ASTCENC_SVE
#if defined(__ARM_FEATURE_SVE)
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
#define ASTCENC_SVE 8
// Auto-detected SVE can only assume vector width of 4 is available, but
// must also allow for hardware being longer and so all use of intrinsics
// must explicitly use predicate masks to limit to 4-wide.
#else
#define ASTCENC_SVE 4
#endif
#else
#define ASTCENC_SVE 0
#endif
#endif
// Force vector-sized SIMD alignment
#if ASTCENC_AVX || ASTCENC_SVE == 8
#define ASTCENC_VECALIGN 32
#elif ASTCENC_SSE || ASTCENC_NEON || ASTCENC_SVE == 4
#define ASTCENC_VECALIGN 16
// Use default alignment for non-SIMD builds
#else
#define ASTCENC_VECALIGN 0
#endif
// C++11 states that alignas(0) should be ignored but GCC doesn't do
// this on some versions, so workaround and avoid emitting alignas(0)
#if ASTCENC_VECALIGN > 0
#define ASTCENC_ALIGNAS alignas(ASTCENC_VECALIGN)
#else
#define ASTCENC_ALIGNAS
#endif
#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0
#include <immintrin.h>
#endif
/* ============================================================================
Fast math library; note that many of the higher-order functions in this set
use approximations which are less accurate, but faster, than <cmath> standard
library equivalents.
Note: Many of these are not necessarily faster than simple C versions when
used on a single scalar value, but are included for testing purposes as most
have an option based on SSE intrinsics and therefore provide an obvious route
to future vectorization.
============================================================================ */
// Union for manipulation of float bit patterns
typedef union
{
uint32_t u;
int32_t s;
float f;
} if32;
// These are namespaced to avoid colliding with C standard library functions.
namespace astc
{
static const float PI = 3.14159265358979323846f;
static const float PI_OVER_TWO = 1.57079632679489661923f;
/**
* @brief SP float absolute value.
*
* @param v The value to make absolute.
*
* @return The absolute value.
*/
static inline float fabs(float v)
{
return std::fabs(v);
}
/**
* @brief Test if a float value is a nan.
*
* @param v The value test.
*
* @return Zero is not a NaN, non-zero otherwise.
*/
static inline bool isnan(float v)
{
return v != v;
}
/**
* @brief Return the minimum of two values.
*
* For floats, NaNs are turned into @c q.
*
* @param p The first value to compare.
* @param q The second value to compare.
*
* @return The smallest value.
*/
template<typename T>
static inline T min(T p, T q)
{
return p < q ? p : q;
}
/**
* @brief Return the minimum of three values.
*
* For floats, NaNs are turned into @c r.
*
* @param p The first value to compare.
* @param q The second value to compare.
* @param r The third value to compare.
*
* @return The smallest value.
*/
template<typename T>
static inline T min(T p, T q, T r)
{
return min(min(p, q), r);
}
/**
* @brief Return the minimum of four values.
*
* For floats, NaNs are turned into @c s.
*
* @param p The first value to compare.
* @param q The second value to compare.
* @param r The third value to compare.
* @param s The fourth value to compare.
*
* @return The smallest value.
*/
template<typename T>
static inline T min(T p, T q, T r, T s)
{
return min(min(p, q), min(r, s));
}
/**
* @brief Return the maximum of two values.
*
* For floats, NaNs are turned into @c q.
*
* @param p The first value to compare.
* @param q The second value to compare.
*
* @return The largest value.
*/
template<typename T>
static inline T max(T p, T q)
{
return p > q ? p : q;
}
/**
* @brief Return the maximum of three values.
*
* For floats, NaNs are turned into @c r.
*
* @param p The first value to compare.
* @param q The second value to compare.
* @param r The third value to compare.
*
* @return The largest value.
*/
template<typename T>
static inline T max(T p, T q, T r)
{
return max(max(p, q), r);
}
/**
* @brief Return the maximum of four values.
*
* For floats, NaNs are turned into @c s.
*
* @param p The first value to compare.
* @param q The second value to compare.
* @param r The third value to compare.
* @param s The fourth value to compare.
*
* @return The largest value.
*/
template<typename T>
static inline T max(T p, T q, T r, T s)
{
return max(max(p, q), max(r, s));
}
/**
 * @brief Clamp a value between @c mn and @c mx.
*
* For floats, NaNs are turned into @c mn.
*
* @param v The value to clamp.
* @param mn The min value (inclusive).
* @param mx The max value (inclusive).
*
* @return The clamped value.
*/
template<typename T>
inline T clamp(T v, T mn, T mx)
{
// Do not reorder; correct NaN handling relies on the fact that comparison
	// with NaN returns false and will fall-through to the "min" value.
if (v > mx) return mx;
if (v > mn) return v;
return mn;
}
/**
* @brief Clamp a float value between 0.0f and 1.0f.
*
* NaNs are turned into 0.0f.
*
* @param v The value to clamp.
*
* @return The clamped value.
*/
static inline float clamp1f(float v)
{
return astc::clamp(v, 0.0f, 1.0f);
}
/**
* @brief Clamp a float value between 0.0f and 255.0f.
*
* NaNs are turned into 0.0f.
*
* @param v The value to clamp.
*
* @return The clamped value.
*/
static inline float clamp255f(float v)
{
return astc::clamp(v, 0.0f, 255.0f);
}
/**
* @brief SP float round-down.
*
* @param v The value to round.
*
* @return The rounded value.
*/
static inline float flt_rd(float v)
{
return std::floor(v);
}
/**
* @brief SP float round-to-nearest and convert to integer.
*
* @param v The value to round.
*
* @return The rounded value.
*/
static inline int flt2int_rtn(float v)
{
return static_cast<int>(v + 0.5f);
}
/**
* @brief SP float round down and convert to integer.
*
* @param v The value to round.
*
* @return The rounded value.
*/
static inline int flt2int_rd(float v)
{
return static_cast<int>(v);
}
/**
* @brief SP float bit-interpreted as an integer.
*
* @param v The value to bitcast.
*
* @return The converted value.
*/
static inline int float_as_int(float v)
{
union { int a; float b; } u;
u.b = v;
return u.a;
}
/**
* @brief Integer bit-interpreted as an SP float.
*
* @param v The value to bitcast.
*
* @return The converted value.
*/
static inline float int_as_float(int v)
{
union { int a; float b; } u;
u.a = v;
return u.b;
}
/**
* @brief Fast approximation of 1.0 / sqrt(val).
*
* @param v The input value.
*
* @return The approximated result.
*/
static inline float rsqrt(float v)
{
return 1.0f / std::sqrt(v);
}
/**
* @brief Fast approximation of sqrt(val).
*
* @param v The input value.
*
* @return The approximated result.
*/
static inline float sqrt(float v)
{
return std::sqrt(v);
}
/**
* @brief Extract mantissa and exponent of a float value.
*
* @param v The input value.
* @param[out] expo The output exponent.
*
* @return The mantissa.
*/
static inline float frexp(float v, int* expo)
{
if32 p;
p.f = v;
*expo = ((p.u >> 23) & 0xFF) - 126;
p.u = (p.u & 0x807fffff) | 0x3f000000;
return p.f;
}
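/* Worked example (illustrative): 6.0f is 1.5 * 2^2, so its biased exponent
 * field is 129 and this returns *expo = 129 - 126 = 3 with a mantissa of
 * 0.75, satisfying 0.75 * 2^3 == 6.0 and matching std::frexp conventions.
 */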
/**
* @brief Initialize the seed structure for a random number generator.
*
* Important note: For the purposes of ASTC we want sets of random numbers to
 * use in the codec, but we want the same seed value across instances and threads
* to ensure that image output is stable across compressor runs and across
* platforms. Every PRNG created by this call will therefore return the same
* sequence of values ...
*
* @param state The state structure to initialize.
*/
void rand_init(uint64_t state[2]);
/**
* @brief Return the next random number from the generator.
*
 * This RNG is an implementation of the "xoroshiro128+ 1.0" PRNG, based on the
* public-domain implementation given by David Blackman & Sebastiano Vigna at
* http://vigna.di.unimi.it/xorshift/xoroshiro128plus.c
*
* @param state The state structure to use/update.
*/
uint64_t rand(uint64_t state[2]);
}
/* ============================================================================
Softfloat library with fp32 and fp16 conversion functionality.
============================================================================ */
#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
/* narrowing float->float conversions */
uint16_t float_to_sf16(float val);
float sf16_to_float(uint16_t val);
#endif
/*********************************
Vector library
*********************************/
#include "astcenc_vecmathlib.h"
/*********************************
Declaration of line types
*********************************/
// parametric line, 2D: The line is given by line = a + b * t.
struct line2
{
vfloat4 a;
vfloat4 b;
};
// parametric line, 3D
struct line3
{
vfloat4 a;
vfloat4 b;
};
struct line4
{
vfloat4 a;
vfloat4 b;
};
struct processed_line2
{
vfloat4 amod;
vfloat4 bs;
};
struct processed_line3
{
vfloat4 amod;
vfloat4 bs;
};
struct processed_line4
{
vfloat4 amod;
vfloat4 bs;
};
#endif

411
thirdparty/astcenc/astcenc_mathlib_softfloat.cpp vendored Normal file
View File

@@ -0,0 +1,411 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Soft-float library for IEEE-754.
*/
#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
#include "astcenc_mathlib.h"
/* sized soft-float types. These are mapped to the sized integer
types of C99, instead of C's floating-point types; this is because
the library needs to maintain exact, bit-level control on all
operations on these data types. */
typedef uint16_t sf16;
typedef uint32_t sf32;
/******************************************
helper functions and their lookup tables
******************************************/
/* count leading zeros functions. Only used when the input is nonzero. */
#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
#elif defined(__arm__) && defined(__ARMCC_VERSION)
#elif defined(__arm__) && defined(__GNUC__)
#else
/* table used for the slow default versions. */
static const uint8_t clz_table[256] =
{
8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
#endif
/*
   32-bit count-leading-zeros function: use the assembly instruction whenever possible. */
static uint32_t clz32(uint32_t inp)
{
#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
uint32_t bsr;
__asm__("bsrl %1, %0": "=r"(bsr):"r"(inp | 1));
return 31 - bsr;
#else
#if defined(__arm__) && defined(__ARMCC_VERSION)
return __clz(inp); /* armcc builtin */
#else
#if defined(__arm__) && defined(__GNUC__)
uint32_t lz;
__asm__("clz %0, %1": "=r"(lz):"r"(inp));
return lz;
#else
/* slow default version */
uint32_t summa = 24;
if (inp >= UINT32_C(0x10000))
{
inp >>= 16;
summa -= 16;
}
if (inp >= UINT32_C(0x100))
{
inp >>= 8;
summa -= 8;
}
return summa + clz_table[inp];
#endif
#endif
#endif
}
/* the five rounding modes that IEEE-754r defines */
typedef enum
{
SF_UP = 0, /* round towards positive infinity */
SF_DOWN = 1, /* round towards negative infinity */
SF_TOZERO = 2, /* round towards zero */
SF_NEARESTEVEN = 3, /* round toward nearest value; if mid-between, round to even value */
SF_NEARESTAWAY = 4 /* round toward nearest value; if mid-between, round away from zero */
} roundmode;
static uint32_t rtne_shift32(uint32_t inp, uint32_t shamt)
{
uint32_t vl1 = UINT32_C(1) << shamt;
uint32_t inp2 = inp + (vl1 >> 1); /* added 0.5 ULP */
uint32_t msk = (inp | UINT32_C(1)) & vl1; /* nonzero if odd. '| 1' forces it to 1 if the shamt is 0. */
msk--; /* negative if even, nonnegative if odd. */
inp2 -= (msk >> 31); /* subtract epsilon before shift if even. */
inp2 >>= shamt;
return inp2;
}
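/* Worked example (illustrative): rtne_shift32(10, 2) computes 10 / 4 = 2.5,
 * a tie, and rounds to the even quotient 2; rtne_shift32(14, 2) computes
 * 14 / 4 = 3.5 and rounds to the even quotient 4.
 */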
static uint32_t rtna_shift32(uint32_t inp, uint32_t shamt)
{
uint32_t vl1 = (UINT32_C(1) << shamt) >> 1;
inp += vl1;
inp >>= shamt;
return inp;
}
static uint32_t rtup_shift32(uint32_t inp, uint32_t shamt)
{
uint32_t vl1 = UINT32_C(1) << shamt;
inp += vl1;
inp--;
inp >>= shamt;
return inp;
}
/* convert from FP16 to FP32. */
static sf32 sf16_to_sf32(sf16 inp)
{
uint32_t inpx = inp;
/*
This table contains, for every FP16 sign/exponent value combination,
the difference between the input FP16 value and the value obtained
by shifting the correct FP32 result right by 13 bits.
This table allows us to handle every case except denormals and NaN
with just 1 table lookup, 2 shifts and 1 add.
*/
#define WITH_MSB(a) (UINT32_C(a) | (1u << 31))
static const uint32_t tbl[64] =
{
WITH_MSB(0x00000), 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,
0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,
0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,
0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, WITH_MSB(0x38000),
WITH_MSB(0x38000), 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,
0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,
0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,
0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, WITH_MSB(0x70000)
};
uint32_t res = tbl[inpx >> 10];
res += inpx;
/* Normal cases: MSB of 'res' not set. */
if ((res & WITH_MSB(0)) == 0)
{
return res << 13;
}
/* Infinity and Zero: 10 LSB of 'res' not set. */
if ((res & 0x3FF) == 0)
{
return res << 13;
}
/* NaN: the exponent field of 'inp' is non-zero. */
if ((inpx & 0x7C00) != 0)
{
/* All NaNs are quietened. */
return (res << 13) | 0x400000;
}
/* Denormal cases */
uint32_t sign = (inpx & 0x8000) << 16;
uint32_t mskval = inpx & 0x7FFF;
uint32_t leadingzeroes = clz32(mskval);
mskval <<= leadingzeroes;
return (mskval >> 8) + ((0x85 - leadingzeroes) << 23) + sign;
}
/* Conversion routine that converts from FP32 to FP16. It supports denormals and all rounding modes. If a NaN is given as input, it is quietened. */
static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
{
/* for each possible sign/exponent combination, store a case index. This gives a 512-byte table */
static const uint8_t tab[512] {
0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 50,
5, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
25, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 55,
};
/* many of the cases below use a case-dependent magic constant. So we look up a magic constant before actually performing the switch. This table allows us to group cases, thereby minimizing code
size. */
static const uint32_t tabx[60] {
UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x80000000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
UINT32_C(1), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8001), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
UINT32_C(0xC8001FFF), UINT32_C(0xC8000000), UINT32_C(0xC8000000), UINT32_C(0xC8000FFF), UINT32_C(0xC8001000),
UINT32_C(0x58000000), UINT32_C(0x38001FFF), UINT32_C(0x58000000), UINT32_C(0x58000FFF), UINT32_C(0x58001000),
UINT32_C(0x7C00), UINT32_C(0x7BFF), UINT32_C(0x7BFF), UINT32_C(0x7C00), UINT32_C(0x7C00),
UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFC00),
UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000),
UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000)
};
uint32_t p;
uint32_t idx = rmode + tab[inp >> 23];
uint32_t vlx = tabx[idx];
switch (idx)
{
/*
Positive number which may be Infinity or NaN.
We need to check whether it is NaN; if it is, quieten it by setting the top bit of the mantissa.
(If we don't do this quieting, then a NaN that is distinguished only by having
its low-order bits set, would be turned into an INF. */
case 50:
case 51:
case 52:
case 53:
case 54:
case 55:
case 56:
case 57:
case 58:
case 59:
/*
the input value is 0x7F800000 or 0xFF800000 if it is INF.
By subtracting 1, we get 7F7FFFFF or FF7FFFFF, that is, bit 23 becomes zero.
For NaNs, however, this operation will keep bit 23 with the value 1.
We can then extract bit 23, and logical-OR bit 9 of the result with this
bit in order to quieten the NaN (a Quiet NaN is a NaN where the top bit
of the mantissa is set.)
*/
p = (inp - 1) & UINT32_C(0x800000); /* zero if INF, nonzero if NaN. */
return static_cast<sf16>(((inp + vlx) >> 13) | (p >> 14));
/*
positive, exponent = 0, round-mode == UP; need to check whether number actually is 0.
If it is, then return 0, else return 1 (the smallest representable nonzero number)
*/
case 0:
/*
-inp will set the MSB if the input number is nonzero.
Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise.
*/
return static_cast<sf16>(static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31);
/*
	   negative, exponent = 0, round-mode == DOWN; need to check whether the number is
	   actually 0. If it is, return 0x8000 (float -0.0),
	   else return the smallest representable negative number (0x8001). */
case 6:
/*
in this case 'vlx' is 0x80000000. By subtracting the input value from it,
we obtain a value that is 0 if the input value is in fact zero and has
the MSB set if it isn't. We then right-shift the value by 31 places to
get a value that is 0 if the input is -0.0 and 1 otherwise.
*/
return static_cast<sf16>(((vlx - inp) >> 31) + UINT32_C(0x8000));
/*
for all other cases involving underflow/overflow, we don't need to
do actual tests; we just return 'vlx'.
*/
case 1:
case 2:
case 3:
case 4:
case 5:
case 7:
case 8:
case 9:
case 10:
case 11:
case 12:
case 13:
case 14:
case 15:
case 16:
case 17:
case 18:
case 19:
case 40:
case 41:
case 42:
case 43:
case 44:
case 45:
case 46:
case 47:
case 48:
case 49:
return static_cast<sf16>(vlx);
/*
for normal numbers, 'vlx' is the difference between the FP32 value of a number and the
FP16 representation of the same number left-shifted by 13 places. In addition, a rounding constant is
	   baked into 'vlx': for rounding away from zero, the constant is 2^13 - 1, causing roundoff away
	   from zero. For round-to-nearest-away, the constant is 2^12, causing roundoff away from zero.
	   For round-to-nearest-even, the constant is 2^12 - 1. This causes correct round-to-nearest-even
except for odd input numbers. For odd input numbers, we need to add 1 to the constant. */
/* normal number, all rounding modes except round-to-nearest-even: */
case 30:
case 31:
case 32:
case 34:
case 35:
case 36:
case 37:
case 39:
return static_cast<sf16>((inp + vlx) >> 13);
/* normal number, round-to-nearest-even. */
case 33:
case 38:
p = inp + vlx;
p += (inp >> 13) & 1;
return static_cast<sf16>(p >> 13);
/*
the various denormal cases. These are not expected to be common, so their performance is a bit
less important. For each of these cases, we need to extract an exponent and a mantissa
(including the implicit '1'!), and then right-shift the mantissa by a shift-amount that
depends on the exponent. The shift must apply the correct rounding mode. 'vlx' is used to supply the
sign of the resulting denormal number.
*/
case 21:
case 22:
case 25:
case 27:
/* denormal, round towards zero. */
p = 126 - ((inp >> 23) & 0xFF);
return static_cast<sf16>((((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx);
case 20:
case 26:
/* denormal, round away from zero. */
p = 126 - ((inp >> 23) & 0xFF);
return static_cast<sf16>(rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
case 24:
case 29:
/* denormal, round to nearest-away */
p = 126 - ((inp >> 23) & 0xFF);
return static_cast<sf16>(rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
case 23:
case 28:
/* denormal, round to nearest-even. */
p = 126 - ((inp >> 23) & 0xFF);
return static_cast<sf16>(rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
}
return 0;
}
/* convert from soft-float to native-float */
float sf16_to_float(uint16_t p)
{
if32 i;
i.u = sf16_to_sf32(p);
return i.f;
}
/* convert from native-float to soft-float */
uint16_t float_to_sf16(float p)
{
if32 i;
i.f = p;
return sf32_to_sf16(i.u, SF_NEARESTEVEN);
}
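/* A round-trip sketch (illustrative): FP16 1.0 is 0x3C00, so
 * float_to_sf16(1.0f) == 0x3C00 and sf16_to_float(0x3C00) == 1.0f.
 */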
#endif

481
thirdparty/astcenc/astcenc_partition_tables.cpp vendored Normal file
View File

@@ -0,0 +1,481 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for generating partition tables on demand.
*/
#include "astcenc_internal.h"
/** @brief The number of 64-bit words needed to represent a canonical partition bit pattern. */
#define BIT_PATTERN_WORDS (((ASTCENC_BLOCK_MAX_TEXELS * 2) + 63) / 64)
/**
* @brief Generate a canonical representation of a partition pattern.
*
* The returned value stores two bits per texel, for up to 6x6x6 texels, where the two bits store
* the remapped texel index. Remapping ensures that we only match on the partition pattern,
* independent of the partition order generated by the hash.
*
* @param texel_count The number of texels in the block.
* @param partition_of_texel The partition assignments, in hash order.
* @param[out] bit_pattern The output bit pattern representation.
*/
static void generate_canonical_partitioning(
unsigned int texel_count,
const uint8_t* partition_of_texel,
uint64_t bit_pattern[BIT_PATTERN_WORDS]
) {
// Clear the pattern
for (unsigned int i = 0; i < BIT_PATTERN_WORDS; i++)
{
bit_pattern[i] = 0;
}
// Store a mapping that reorders the raw partitions so that the lowest texel index in
// partition N is smaller than the lowest texel index in partition N + 1.
int mapped_index[BLOCK_MAX_PARTITIONS];
int map_weight_count = 0;
for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONS; i++)
{
mapped_index[i] = -1;
}
for (unsigned int i = 0; i < texel_count; i++)
{
int index = partition_of_texel[i];
if (mapped_index[index] < 0)
{
mapped_index[index] = map_weight_count++;
}
uint64_t xlat_index = mapped_index[index];
bit_pattern[i >> 5] |= xlat_index << (2 * (i & 0x1F));
}
}
/**
* @brief Compare two canonical patterns to see if they are the same.
*
* @param part1 The first canonical bit pattern to check.
* @param part2 The second canonical bit pattern to check.
*
* @return @c true if the patterns are the same, @c false otherwise.
*/
static bool compare_canonical_partitionings(
const uint64_t part1[BIT_PATTERN_WORDS],
const uint64_t part2[BIT_PATTERN_WORDS]
) {
return (part1[0] == part2[0])
#if BIT_PATTERN_WORDS > 1
&& (part1[1] == part2[1])
#endif
#if BIT_PATTERN_WORDS > 2
&& (part1[2] == part2[2])
#endif
#if BIT_PATTERN_WORDS > 3
&& (part1[3] == part2[3])
#endif
#if BIT_PATTERN_WORDS > 4
&& (part1[4] == part2[4])
#endif
#if BIT_PATTERN_WORDS > 5
&& (part1[5] == part2[5])
#endif
#if BIT_PATTERN_WORDS > 6
&& (part1[6] == part2[6])
#endif
;
}
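/*
   Illustration (not in the upstream source): the remapping performed by
   generate_canonical_partitioning() makes the canonical form independent of
   the labels assigned by the hash. The assignments {1, 0, 0, 1} and
   {0, 1, 1, 0} describe the same two-partition shape, so they canonicalize
   to identical bit patterns and compare as equal.
*/
static inline bool canonical_remap_sketch()
{
	const uint8_t a[4] { 1, 0, 0, 1 };
	const uint8_t b[4] { 0, 1, 1, 0 };
	uint64_t pat_a[BIT_PATTERN_WORDS];
	uint64_t pat_b[BIT_PATTERN_WORDS];
	generate_canonical_partitioning(4, a, pat_a);
	generate_canonical_partitioning(4, b, pat_b);
	return compare_canonical_partitionings(pat_a, pat_b);   // true
}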
/**
* @brief Hash function used for procedural partition assignment.
*
* @param inp The hash seed.
*
* @return The hashed value.
*/
static uint32_t hash52(
uint32_t inp
) {
inp ^= inp >> 15;
// (2^4 + 1) * (2^7 + 1) * (2^17 - 1)
inp *= 0xEEDE0891;
inp ^= inp >> 5;
inp += inp << 16;
inp ^= inp >> 7;
inp ^= inp >> 3;
inp ^= inp << 6;
inp ^= inp >> 17;
return inp;
}
/**
* @brief Select texel assignment for a single coordinate.
*
* @param seed The seed - the partition index from the block.
* @param x The texel X coordinate in the block.
* @param y The texel Y coordinate in the block.
* @param z The texel Z coordinate in the block.
* @param partition_count The total partition count of this encoding.
* @param small_block @c true if the block has fewer than 32 texels.
*
* @return The assigned partition index for this texel.
*/
static uint8_t select_partition(
int seed,
int x,
int y,
int z,
int partition_count,
bool small_block
) {
// For small blocks bias the coordinates to get better distribution
if (small_block)
{
x <<= 1;
y <<= 1;
z <<= 1;
}
seed += (partition_count - 1) * 1024;
uint32_t rnum = hash52(seed);
uint8_t seed1 = rnum & 0xF;
uint8_t seed2 = (rnum >> 4) & 0xF;
uint8_t seed3 = (rnum >> 8) & 0xF;
uint8_t seed4 = (rnum >> 12) & 0xF;
uint8_t seed5 = (rnum >> 16) & 0xF;
uint8_t seed6 = (rnum >> 20) & 0xF;
uint8_t seed7 = (rnum >> 24) & 0xF;
uint8_t seed8 = (rnum >> 28) & 0xF;
uint8_t seed9 = (rnum >> 18) & 0xF;
uint8_t seed10 = (rnum >> 22) & 0xF;
uint8_t seed11 = (rnum >> 26) & 0xF;
uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF;
// Squaring all the seeds in order to bias their distribution towards lower values.
seed1 *= seed1;
seed2 *= seed2;
seed3 *= seed3;
seed4 *= seed4;
seed5 *= seed5;
seed6 *= seed6;
seed7 *= seed7;
seed8 *= seed8;
seed9 *= seed9;
seed10 *= seed10;
seed11 *= seed11;
seed12 *= seed12;
int sh1, sh2;
if (seed & 1)
{
sh1 = (seed & 2 ? 4 : 5);
sh2 = (partition_count == 3 ? 6 : 5);
}
else
{
sh1 = (partition_count == 3 ? 6 : 5);
sh2 = (seed & 2 ? 4 : 5);
}
int sh3 = (seed & 0x10) ? sh1 : sh2;
seed1 >>= sh1;
seed2 >>= sh2;
seed3 >>= sh1;
seed4 >>= sh2;
seed5 >>= sh1;
seed6 >>= sh2;
seed7 >>= sh1;
seed8 >>= sh2;
seed9 >>= sh3;
seed10 >>= sh3;
seed11 >>= sh3;
seed12 >>= sh3;
int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
// Apply the saw
a &= 0x3F;
b &= 0x3F;
c &= 0x3F;
d &= 0x3F;
// Remove some of the components if we are to output < 4 partitions.
if (partition_count <= 3)
{
d = 0;
}
if (partition_count <= 2)
{
c = 0;
}
if (partition_count <= 1)
{
b = 0;
}
uint8_t partition;
if (a >= b && a >= c && a >= d)
{
partition = 0;
}
else if (b >= c && b >= d)
{
partition = 1;
}
else if (c >= d)
{
partition = 2;
}
else
{
partition = 3;
}
return partition;
}
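/*
   Usage sketch (illustration only): enumerating the procedural assignment for
   a 4x4 block with seed 23 and two partitions. Blocks with fewer than 32
   texels pass small_block == true so the coordinates are biased as above.
   The exact counts depend on the hash; this only shows the calling convention.
*/
static inline int select_partition_sketch()
{
	int zeros = 0;
	for (int y = 0; y < 4; y++)
	{
		for (int x = 0; x < 4; x++)
		{
			zeros += select_partition(23, x, y, 0, 2, true) == 0;
		}
	}
	return zeros;   // number of texels assigned to partition 0
}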
/**
* @brief Generate a single partition info structure.
*
* @param[out] bsd The block size information.
* @param partition_count The partition count of this partitioning.
* @param partition_index The partition index / seed of this partitioning.
* @param partition_remap_index The remapped partition index of this partitioning.
* @param[out] pi The partition info structure to populate.
*
* @return @c true if this is a useful partition index, @c false if we can skip it.
*/
static bool generate_one_partition_info_entry(
block_size_descriptor& bsd,
unsigned int partition_count,
unsigned int partition_index,
unsigned int partition_remap_index,
partition_info& pi
) {
int texels_per_block = bsd.texel_count;
bool small_block = texels_per_block < 32;
uint8_t *partition_of_texel = pi.partition_of_texel;
// Assign texels to partitions
int texel_idx = 0;
int counts[BLOCK_MAX_PARTITIONS] { 0 };
for (unsigned int z = 0; z < bsd.zdim; z++)
{
for (unsigned int y = 0; y < bsd.ydim; y++)
{
for (unsigned int x = 0; x < bsd.xdim; x++)
{
uint8_t part = select_partition(partition_index, x, y, z, partition_count, small_block);
pi.texels_of_partition[part][counts[part]++] = static_cast<uint8_t>(texel_idx++);
*partition_of_texel++ = part;
}
}
}
// Fill loop tail so we can overfetch later
for (unsigned int i = 0; i < partition_count; i++)
{
size_t ptex_count = counts[i];
size_t ptex_count_simd = round_up_to_simd_multiple_vla(ptex_count);
for (size_t j = ptex_count; j < ptex_count_simd; j++)
{
pi.texels_of_partition[i][j] = pi.texels_of_partition[i][ptex_count - 1];
}
}
// Populate the actual procedural partition count
if (counts[0] == 0)
{
pi.partition_count = 0;
}
else if (counts[1] == 0)
{
pi.partition_count = 1;
}
else if (counts[2] == 0)
{
pi.partition_count = 2;
}
else if (counts[3] == 0)
{
pi.partition_count = 3;
}
else
{
pi.partition_count = 4;
}
// Populate the partition index
pi.partition_index = static_cast<uint16_t>(partition_index);
// Populate the coverage bitmaps for 2/3/4 partitions
uint64_t* bitmaps { nullptr };
if (partition_count == 2)
{
bitmaps = bsd.coverage_bitmaps_2[partition_remap_index];
}
else if (partition_count == 3)
{
bitmaps = bsd.coverage_bitmaps_3[partition_remap_index];
}
else if (partition_count == 4)
{
bitmaps = bsd.coverage_bitmaps_4[partition_remap_index];
}
for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONS; i++)
{
pi.partition_texel_count[i] = static_cast<uint8_t>(counts[i]);
}
// Valid partitionings have texels in all of the requested partitions
bool valid = pi.partition_count == partition_count;
if (bitmaps)
{
// Populate the partition coverage bitmap
for (unsigned int i = 0; i < partition_count; i++)
{
bitmaps[i] = 0ULL;
}
unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
for (unsigned int i = 0; i < texels_to_process; i++)
{
unsigned int idx = bsd.kmeans_texels[i];
bitmaps[pi.partition_of_texel[idx]] |= 1ULL << i;
}
}
return valid;
}
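/*
   Sketch of the tail-fill idiom used above (not upstream code): padding an
   array out to a SIMD-width multiple with copies of its last valid element
   lets a vectorized consumer read whole vectors without a scalar remainder
   loop; the duplicated entries are harmless to reprocess. Assumes count > 0.
*/
static inline void pad_tail_sketch(uint8_t* data, size_t count, size_t simd_width)
{
	size_t padded = (count + simd_width - 1) / simd_width * simd_width;
	for (size_t i = count; i < padded; i++)
	{
		data[i] = data[count - 1];   // duplicate the last valid entry
	}
}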
static void build_partition_table_for_one_partition_count(
block_size_descriptor& bsd,
bool can_omit_partitionings,
unsigned int partition_count_cutoff,
unsigned int partition_count,
partition_info* ptab,
uint64_t* canonical_patterns
) {
unsigned int next_index = 0;
bsd.partitioning_count_selected[partition_count - 1] = 0;
bsd.partitioning_count_all[partition_count - 1] = 0;
// Skip tables larger than config max partition count if we can omit modes
if (can_omit_partitionings && (partition_count > partition_count_cutoff))
{
return;
}
// Iterate through twice
// - Pass 0: Keep selected partitionings
// - Pass 1: Keep non-selected partitionings (skip if in omit mode)
unsigned int max_iter = can_omit_partitionings ? 1 : 2;
// Tracker for things we built in the first iteration
uint8_t build[BLOCK_MAX_PARTITIONINGS] { 0 };
for (unsigned int x = 0; x < max_iter; x++)
{
for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONINGS; i++)
{
// Don't include things we built in the first pass
if ((x == 1) && build[i])
{
continue;
}
bool keep_useful = generate_one_partition_info_entry(bsd, partition_count, i, next_index, ptab[next_index]);
if ((x == 0) && !keep_useful)
{
continue;
}
generate_canonical_partitioning(bsd.texel_count, ptab[next_index].partition_of_texel, canonical_patterns + next_index * BIT_PATTERN_WORDS);
bool keep_canonical = true;
for (unsigned int j = 0; j < next_index; j++)
{
bool match = compare_canonical_partitionings(canonical_patterns + next_index * BIT_PATTERN_WORDS, canonical_patterns + j * BIT_PATTERN_WORDS);
if (match)
{
keep_canonical = false;
break;
}
}
if (keep_useful && keep_canonical)
{
if (x == 0)
{
bsd.partitioning_packed_index[partition_count - 2][i] = static_cast<uint16_t>(next_index);
bsd.partitioning_count_selected[partition_count - 1]++;
bsd.partitioning_count_all[partition_count - 1]++;
build[i] = 1;
next_index++;
}
}
else
{
if (x == 1)
{
bsd.partitioning_packed_index[partition_count - 2][i] = static_cast<uint16_t>(next_index);
bsd.partitioning_count_all[partition_count - 1]++;
next_index++;
}
}
}
}
}
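/*
   The two-pass packing above, in miniature (illustration only): pass 0
   appends entries satisfying a predicate, pass 1 appends the remainder, so
   "selected" entries occupy a dense prefix while every entry still gets a
   packed slot. The real code layers canonical de-duplication on top of this.
*/
static inline size_t two_pass_pack_sketch(const int* in, size_t count, int* out)
{
	size_t next = 0;
	for (int pass = 0; pass < 2; pass++)
	{
		for (size_t i = 0; i < count; i++)
		{
			bool selected = (in[i] % 2) == 0;   // stand-in predicate
			if ((pass == 0) == selected)
			{
				out[next++] = in[i];
			}
		}
	}
	return next;
}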
/* See header for documentation. */
void init_partition_tables(
block_size_descriptor& bsd,
bool can_omit_partitionings,
unsigned int partition_count_cutoff
) {
partition_info* par_tab2 = bsd.partitionings;
partition_info* par_tab3 = par_tab2 + BLOCK_MAX_PARTITIONINGS;
partition_info* par_tab4 = par_tab3 + BLOCK_MAX_PARTITIONINGS;
partition_info* par_tab1 = par_tab4 + BLOCK_MAX_PARTITIONINGS;
generate_one_partition_info_entry(bsd, 1, 0, 0, *par_tab1);
bsd.partitioning_count_selected[0] = 1;
bsd.partitioning_count_all[0] = 1;
uint64_t* canonical_patterns = new uint64_t[BLOCK_MAX_PARTITIONINGS * BIT_PATTERN_WORDS];
build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 2, par_tab2, canonical_patterns);
build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 3, par_tab3, canonical_patterns);
build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 4, par_tab4, canonical_patterns);
delete[] canonical_patterns;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,903 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions and data tables for numeric quantization.
*/
#include "astcenc_internal.h"
#if !defined(ASTCENC_DECOMPRESS_ONLY)
// Not scrambled, starts from QUANT_6
const uint8_t color_unquant_to_uquant_tables[17][512] {
{ // QUANT_6
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51,
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51,
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51,
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 102, 102, 102, 102, 102, 102,
102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
153, 153, 153, 153, 153, 153, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{ // QUANT_8
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73,
73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73,
73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 109, 109, 109, 109, 109, 109, 109, 109, 109,
109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146,
146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146,
146, 146, 146, 146, 146, 146, 146, 146, 146, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219,
219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219,
219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{ // QUANT_10
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 28, 28,
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56,
56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56,
56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142,
142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 171, 171, 171, 171, 171, 171,
171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171,
171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199,
199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199,
199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227,
227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227,
227, 227, 227, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{ // QUANT_12
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46,
46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
69, 69, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92,
92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116,
116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116,
139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139,
139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163,
163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 186, 186,
186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186,
186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209,
209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 232, 232, 232, 232, 232, 232,
232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
232, 232, 232, 232, 232, 232, 232, 232, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{ // QUANT_16
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51,
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 68, 68, 68, 68, 68, 68, 68, 68,
68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 85, 85, 85, 85, 85, 85,
85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 102, 102, 102, 102,
102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 119, 119,
119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119,
136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136,
136, 136, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
153, 153, 153, 153, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170,
170, 170, 170, 170, 170, 170, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 187, 187, 187, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221,
221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238,
238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{ // QUANT_20
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
13, 13, 13, 13, 13, 13, 13, 13, 13, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
27, 27, 27, 27, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 54,
54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 67, 67, 67, 67, 67, 67,
67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94,
94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
107, 107, 107, 107, 107, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121,
134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 148, 148, 148, 148, 148,
148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161,
161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175,
175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188,
188, 188, 188, 188, 188, 188, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201,
201, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 228, 228, 228, 228,
228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 242, 242, 242, 242, 242, 242, 242, 242, 242,
242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{ // QUANT_24
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 33, 33,
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
44, 44, 44, 44, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 66, 66, 66, 66, 66, 66,
66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77,
77, 77, 77, 77, 77, 77, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 99, 99, 99, 99,
99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110,
110, 110, 110, 110, 110, 110, 110, 110, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121,
134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 145, 145, 145, 145, 145, 145, 145, 145,
145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156,
156, 156, 156, 156, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 178, 178, 178, 178, 178, 178,
178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189,
189, 189, 189, 189, 189, 189, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 211, 211, 211, 211,
211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222,
222, 222, 222, 222, 222, 222, 222, 222, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 244, 244,
244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{ // QUANT_32
0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 33, 33, 33, 33, 33, 33,
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 49, 49, 49, 49, 49,
49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 66, 66, 66, 66,
66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 82, 82, 82,
82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 99, 99,
99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 115,
115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123,
132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140,
140, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156,
156, 156, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173,
173, 173, 173, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189,
189, 189, 189, 189, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206,
206, 206, 206, 206, 206, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222,
222, 222, 222, 222, 222, 222, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 239, 239, 239, 239, 239, 239, 239, 239, 239,
239, 239, 239, 239, 239, 239, 239, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{ // QUANT_40
0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
13, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 32, 32, 32, 32, 32,
32, 32, 32, 32, 32, 32, 32, 32, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 65, 65, 65, 65,
65, 65, 65, 65, 65, 65, 65, 65, 65, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
78, 78, 78, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 97, 97, 97,
97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 110, 110, 110, 110, 110, 110, 110, 110, 110,
110, 110, 110, 110, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123,
132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 145, 145, 145, 145,
145, 145, 145, 145, 145, 145, 145, 145, 145, 151, 151, 151, 151, 151, 151, 151, 151, 151, 151, 151, 151, 151, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158,
158, 158, 158, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 177, 177, 177,
177, 177, 177, 177, 177, 177, 177, 177, 177, 177, 184, 184, 184, 184, 184, 184, 184, 184, 184, 184, 184, 184, 184, 190, 190, 190, 190, 190, 190, 190, 190, 190,
190, 190, 190, 190, 197, 197, 197, 197, 197, 197, 197, 197, 197, 197, 197, 197, 197, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 210, 210,
210, 210, 210, 210, 210, 210, 210, 210, 210, 210, 210, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 223, 223, 223, 223, 223, 223, 223, 223,
223, 223, 223, 223, 223, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 242,
242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 255, 255, 255, 255, 255, 255, 255
},
{ // QUANT_48
0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 32, 32, 32, 32,
32, 32, 32, 32, 32, 32, 32, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 65, 65, 65,
65, 65, 65, 65, 65, 65, 65, 65, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 81, 81,
81, 81, 81, 81, 81, 81, 81, 81, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 97, 97,
97, 97, 97, 97, 97, 97, 97, 97, 97, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 113, 113,
113, 113, 113, 113, 113, 113, 113, 113, 113, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 142, 142, 142, 142, 142, 142, 142, 142, 142,
142, 142, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 158, 158, 158, 158, 158, 158, 158, 158, 158,
158, 158, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 169, 169, 169, 169, 169, 169, 169, 169, 169, 169, 169, 174, 174, 174, 174, 174, 174, 174, 174,
174, 174, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 185, 185, 185, 185, 185, 185, 185, 185, 185, 185, 185, 190, 190, 190, 190, 190, 190, 190, 190,
190, 190, 190, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 207, 207, 207, 207, 207, 207, 207,
207, 207, 207, 207, 212, 212, 212, 212, 212, 212, 212, 212, 212, 212, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 223, 223, 223, 223, 223, 223, 223,
223, 223, 223, 223, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 239, 239, 239, 239, 239, 239,
239, 239, 239, 239, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 255, 255, 255, 255, 255, 255
},
{ // QUANT_64
0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 12, 12, 12, 12, 12, 12, 12, 12, 16, 16, 16,
16, 16, 16, 16, 16, 20, 20, 20, 20, 20, 20, 20, 20, 24, 24, 24, 24, 24, 24, 24, 24, 28, 28, 28, 28, 28, 28, 28, 28, 32, 32, 32,
32, 32, 32, 32, 32, 36, 36, 36, 36, 36, 36, 36, 36, 40, 40, 40, 40, 40, 40, 40, 40, 44, 44, 44, 44, 44, 44, 44, 44, 48, 48, 48,
48, 48, 48, 48, 48, 52, 52, 52, 52, 52, 52, 52, 52, 56, 56, 56, 56, 56, 56, 56, 56, 60, 60, 60, 60, 60, 60, 60, 60, 60, 65, 65,
65, 65, 65, 65, 65, 65, 65, 69, 69, 69, 69, 69, 69, 69, 69, 73, 73, 73, 73, 73, 73, 73, 73, 77, 77, 77, 77, 77, 77, 77, 77, 81,
81, 81, 81, 81, 81, 81, 81, 85, 85, 85, 85, 85, 85, 85, 85, 89, 89, 89, 89, 89, 89, 89, 89, 93, 93, 93, 93, 93, 93, 93, 93, 97,
97, 97, 97, 97, 97, 97, 97, 101, 101, 101, 101, 101, 101, 101, 101, 105, 105, 105, 105, 105, 105, 105, 105, 109, 109, 109, 109, 109, 109, 109, 109, 113,
113, 113, 113, 113, 113, 113, 113, 117, 117, 117, 117, 117, 117, 117, 117, 121, 121, 121, 121, 121, 121, 121, 121, 125, 125, 125, 125, 125, 125, 125, 125, 125,
130, 130, 130, 130, 130, 130, 130, 130, 130, 134, 134, 134, 134, 134, 134, 134, 134, 138, 138, 138, 138, 138, 138, 138, 138, 142, 142, 142, 142, 142, 142, 142,
142, 146, 146, 146, 146, 146, 146, 146, 146, 150, 150, 150, 150, 150, 150, 150, 150, 154, 154, 154, 154, 154, 154, 154, 154, 158, 158, 158, 158, 158, 158, 158,
158, 162, 162, 162, 162, 162, 162, 162, 162, 166, 166, 166, 166, 166, 166, 166, 166, 170, 170, 170, 170, 170, 170, 170, 170, 174, 174, 174, 174, 174, 174, 174,
174, 178, 178, 178, 178, 178, 178, 178, 178, 182, 182, 182, 182, 182, 182, 182, 182, 186, 186, 186, 186, 186, 186, 186, 186, 190, 190, 190, 190, 190, 190, 190,
190, 190, 195, 195, 195, 195, 195, 195, 195, 195, 195, 199, 199, 199, 199, 199, 199, 199, 199, 203, 203, 203, 203, 203, 203, 203, 203, 207, 207, 207, 207, 207,
207, 207, 207, 211, 211, 211, 211, 211, 211, 211, 211, 215, 215, 215, 215, 215, 215, 215, 215, 219, 219, 219, 219, 219, 219, 219, 219, 223, 223, 223, 223, 223,
223, 223, 223, 227, 227, 227, 227, 227, 227, 227, 227, 231, 231, 231, 231, 231, 231, 231, 231, 235, 235, 235, 235, 235, 235, 235, 235, 239, 239, 239, 239, 239,
239, 239, 239, 243, 243, 243, 243, 243, 243, 243, 243, 247, 247, 247, 247, 247, 247, 247, 247, 251, 251, 251, 251, 251, 251, 251, 251, 255, 255, 255, 255, 255
},
{ // QUANT_80
0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 9, 9, 9, 9, 9, 9, 9, 13, 13, 13, 13, 13, 13, 13, 16, 16,
16, 16, 16, 16, 19, 19, 19, 19, 19, 19, 22, 22, 22, 22, 22, 22, 25, 25, 25, 25, 25, 25, 25, 29, 29, 29, 29, 29, 29, 29, 32, 32,
32, 32, 32, 32, 35, 35, 35, 35, 35, 35, 38, 38, 38, 38, 38, 38, 38, 42, 42, 42, 42, 42, 42, 42, 45, 45, 45, 45, 45, 45, 48, 48,
48, 48, 48, 48, 51, 51, 51, 51, 51, 51, 54, 54, 54, 54, 54, 54, 54, 58, 58, 58, 58, 58, 58, 58, 61, 61, 61, 61, 61, 61, 64, 64,
64, 64, 64, 64, 67, 67, 67, 67, 67, 67, 67, 71, 71, 71, 71, 71, 71, 71, 74, 74, 74, 74, 74, 74, 77, 77, 77, 77, 77, 77, 80, 80,
80, 80, 80, 80, 83, 83, 83, 83, 83, 83, 83, 87, 87, 87, 87, 87, 87, 87, 90, 90, 90, 90, 90, 90, 93, 93, 93, 93, 93, 93, 96, 96,
96, 96, 96, 96, 96, 100, 100, 100, 100, 100, 100, 100, 103, 103, 103, 103, 103, 103, 106, 106, 106, 106, 106, 106, 109, 109, 109, 109, 109, 109, 112, 112,
112, 112, 112, 112, 112, 116, 116, 116, 116, 116, 116, 116, 119, 119, 119, 119, 119, 119, 122, 122, 122, 122, 122, 122, 125, 125, 125, 125, 125, 125, 125, 125,
130, 130, 130, 130, 130, 130, 130, 130, 133, 133, 133, 133, 133, 133, 136, 136, 136, 136, 136, 136, 139, 139, 139, 139, 139, 139, 139, 143, 143, 143, 143, 143,
143, 143, 146, 146, 146, 146, 146, 146, 149, 149, 149, 149, 149, 149, 152, 152, 152, 152, 152, 152, 155, 155, 155, 155, 155, 155, 155, 159, 159, 159, 159, 159,
159, 159, 162, 162, 162, 162, 162, 162, 165, 165, 165, 165, 165, 165, 168, 168, 168, 168, 168, 168, 168, 172, 172, 172, 172, 172, 172, 172, 175, 175, 175, 175,
175, 175, 178, 178, 178, 178, 178, 178, 181, 181, 181, 181, 181, 181, 184, 184, 184, 184, 184, 184, 184, 188, 188, 188, 188, 188, 188, 188, 191, 191, 191, 191,
191, 191, 194, 194, 194, 194, 194, 194, 197, 197, 197, 197, 197, 197, 197, 201, 201, 201, 201, 201, 201, 201, 204, 204, 204, 204, 204, 204, 207, 207, 207, 207,
207, 207, 210, 210, 210, 210, 210, 210, 213, 213, 213, 213, 213, 213, 213, 217, 217, 217, 217, 217, 217, 217, 220, 220, 220, 220, 220, 220, 223, 223, 223, 223,
223, 223, 226, 226, 226, 226, 226, 226, 226, 230, 230, 230, 230, 230, 230, 230, 233, 233, 233, 233, 233, 233, 236, 236, 236, 236, 236, 236, 239, 239, 239, 239,
239, 239, 242, 242, 242, 242, 242, 242, 242, 246, 246, 246, 246, 246, 246, 246, 249, 249, 249, 249, 249, 249, 252, 252, 252, 252, 252, 252, 255, 255, 255, 255
},
{ // QUANT_96
0, 0, 0, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 13, 13, 13, 13, 13, 13, 16, 16,
16, 16, 16, 18, 18, 18, 18, 18, 21, 21, 21, 21, 21, 21, 24, 24, 24, 24, 24, 26, 26, 26, 26, 26, 29, 29, 29, 29, 29, 29, 32, 32,
32, 32, 32, 32, 35, 35, 35, 35, 35, 37, 37, 37, 37, 37, 40, 40, 40, 40, 40, 40, 43, 43, 43, 43, 43, 45, 45, 45, 45, 45, 48, 48,
48, 48, 48, 48, 51, 51, 51, 51, 51, 53, 53, 53, 53, 53, 56, 56, 56, 56, 56, 56, 59, 59, 59, 59, 59, 61, 61, 61, 61, 61, 64, 64,
64, 64, 64, 64, 67, 67, 67, 67, 67, 67, 70, 70, 70, 70, 70, 72, 72, 72, 72, 72, 75, 75, 75, 75, 75, 75, 78, 78, 78, 78, 78, 80,
80, 80, 80, 80, 83, 83, 83, 83, 83, 83, 86, 86, 86, 86, 86, 88, 88, 88, 88, 88, 91, 91, 91, 91, 91, 91, 94, 94, 94, 94, 94, 96,
96, 96, 96, 96, 99, 99, 99, 99, 99, 99, 102, 102, 102, 102, 102, 104, 104, 104, 104, 104, 107, 107, 107, 107, 107, 107, 110, 110, 110, 110, 110, 112,
112, 112, 112, 112, 115, 115, 115, 115, 115, 115, 118, 118, 118, 118, 118, 120, 120, 120, 120, 120, 123, 123, 123, 123, 123, 123, 126, 126, 126, 126, 126, 126,
129, 129, 129, 129, 129, 129, 132, 132, 132, 132, 132, 132, 135, 135, 135, 135, 135, 137, 137, 137, 137, 137, 140, 140, 140, 140, 140, 140, 143, 143, 143, 143,
143, 145, 145, 145, 145, 145, 148, 148, 148, 148, 148, 148, 151, 151, 151, 151, 151, 153, 153, 153, 153, 153, 156, 156, 156, 156, 156, 156, 159, 159, 159, 159,
159, 161, 161, 161, 161, 161, 164, 164, 164, 164, 164, 164, 167, 167, 167, 167, 167, 169, 169, 169, 169, 169, 172, 172, 172, 172, 172, 172, 175, 175, 175, 175,
175, 177, 177, 177, 177, 177, 180, 180, 180, 180, 180, 180, 183, 183, 183, 183, 183, 185, 185, 185, 185, 185, 188, 188, 188, 188, 188, 188, 191, 191, 191, 191,
191, 191, 194, 194, 194, 194, 194, 196, 196, 196, 196, 196, 199, 199, 199, 199, 199, 199, 202, 202, 202, 202, 202, 204, 204, 204, 204, 204, 207, 207, 207, 207,
207, 207, 210, 210, 210, 210, 210, 212, 212, 212, 212, 212, 215, 215, 215, 215, 215, 215, 218, 218, 218, 218, 218, 220, 220, 220, 220, 220, 223, 223, 223, 223,
223, 223, 226, 226, 226, 226, 226, 226, 229, 229, 229, 229, 229, 231, 231, 231, 231, 231, 234, 234, 234, 234, 234, 234, 237, 237, 237, 237, 237, 239, 239, 239,
239, 239, 242, 242, 242, 242, 242, 242, 245, 245, 245, 245, 245, 247, 247, 247, 247, 247, 250, 250, 250, 250, 250, 250, 253, 253, 253, 253, 253, 255, 255, 255
},
{ // QUANT_128
0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8, 10, 10, 10, 10, 12, 12, 12, 12, 14, 14, 14, 14, 16,
16, 16, 16, 18, 18, 18, 18, 20, 20, 20, 20, 22, 22, 22, 22, 24, 24, 24, 24, 26, 26, 26, 26, 28, 28, 28, 28, 30, 30, 30, 30, 32,
32, 32, 32, 34, 34, 34, 34, 36, 36, 36, 36, 38, 38, 38, 38, 40, 40, 40, 40, 42, 42, 42, 42, 44, 44, 44, 44, 46, 46, 46, 46, 48,
48, 48, 48, 50, 50, 50, 50, 52, 52, 52, 52, 54, 54, 54, 54, 56, 56, 56, 56, 58, 58, 58, 58, 60, 60, 60, 60, 62, 62, 62, 62, 64,
64, 64, 64, 66, 66, 66, 66, 68, 68, 68, 68, 70, 70, 70, 70, 72, 72, 72, 72, 74, 74, 74, 74, 76, 76, 76, 76, 78, 78, 78, 78, 80,
80, 80, 80, 82, 82, 82, 82, 84, 84, 84, 84, 86, 86, 86, 86, 88, 88, 88, 88, 90, 90, 90, 90, 92, 92, 92, 92, 94, 94, 94, 94, 96,
96, 96, 96, 98, 98, 98, 98, 100, 100, 100, 100, 102, 102, 102, 102, 104, 104, 104, 104, 106, 106, 106, 106, 108, 108, 108, 108, 110, 110, 110, 110, 112,
112, 112, 112, 114, 114, 114, 114, 116, 116, 116, 116, 118, 118, 118, 118, 120, 120, 120, 120, 122, 122, 122, 122, 124, 124, 124, 124, 126, 126, 126, 126, 126,
129, 129, 129, 129, 129, 131, 131, 131, 131, 133, 133, 133, 133, 135, 135, 135, 135, 137, 137, 137, 137, 139, 139, 139, 139, 141, 141, 141, 141, 143, 143, 143,
143, 145, 145, 145, 145, 147, 147, 147, 147, 149, 149, 149, 149, 151, 151, 151, 151, 153, 153, 153, 153, 155, 155, 155, 155, 157, 157, 157, 157, 159, 159, 159,
159, 161, 161, 161, 161, 163, 163, 163, 163, 165, 165, 165, 165, 167, 167, 167, 167, 169, 169, 169, 169, 171, 171, 171, 171, 173, 173, 173, 173, 175, 175, 175,
175, 177, 177, 177, 177, 179, 179, 179, 179, 181, 181, 181, 181, 183, 183, 183, 183, 185, 185, 185, 185, 187, 187, 187, 187, 189, 189, 189, 189, 191, 191, 191,
191, 193, 193, 193, 193, 195, 195, 195, 195, 197, 197, 197, 197, 199, 199, 199, 199, 201, 201, 201, 201, 203, 203, 203, 203, 205, 205, 205, 205, 207, 207, 207,
207, 209, 209, 209, 209, 211, 211, 211, 211, 213, 213, 213, 213, 215, 215, 215, 215, 217, 217, 217, 217, 219, 219, 219, 219, 221, 221, 221, 221, 223, 223, 223,
223, 225, 225, 225, 225, 227, 227, 227, 227, 229, 229, 229, 229, 231, 231, 231, 231, 233, 233, 233, 233, 235, 235, 235, 235, 237, 237, 237, 237, 239, 239, 239,
239, 241, 241, 241, 241, 243, 243, 243, 243, 245, 245, 245, 245, 247, 247, 247, 247, 249, 249, 249, 249, 251, 251, 251, 251, 253, 253, 253, 253, 255, 255, 255
},
{ // QUANT_160
0, 0, 1, 1, 1, 3, 3, 3, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 9, 9, 9, 11, 11, 11, 12, 12, 12, 14, 14, 14, 14, 16,
16, 16, 17, 17, 17, 19, 19, 19, 20, 20, 20, 22, 22, 22, 22, 24, 24, 24, 25, 25, 25, 27, 27, 27, 28, 28, 28, 30, 30, 30, 30, 32,
32, 32, 33, 33, 33, 35, 35, 35, 36, 36, 36, 38, 38, 38, 38, 40, 40, 40, 41, 41, 41, 43, 43, 43, 44, 44, 44, 46, 46, 46, 46, 48,
48, 48, 49, 49, 49, 51, 51, 51, 52, 52, 52, 54, 54, 54, 54, 56, 56, 56, 57, 57, 57, 59, 59, 59, 60, 60, 60, 62, 62, 62, 62, 64,
64, 64, 65, 65, 65, 67, 67, 67, 68, 68, 68, 70, 70, 70, 70, 72, 72, 72, 73, 73, 73, 75, 75, 75, 76, 76, 76, 78, 78, 78, 78, 80,
80, 80, 81, 81, 81, 83, 83, 83, 84, 84, 84, 86, 86, 86, 86, 88, 88, 88, 89, 89, 89, 91, 91, 91, 92, 92, 92, 94, 94, 94, 94, 96,
96, 96, 97, 97, 97, 99, 99, 99, 100, 100, 100, 102, 102, 102, 102, 104, 104, 104, 105, 105, 105, 107, 107, 107, 108, 108, 108, 110, 110, 110, 110, 112,
112, 112, 113, 113, 113, 115, 115, 115, 116, 116, 116, 118, 118, 118, 118, 120, 120, 120, 121, 121, 121, 123, 123, 123, 124, 124, 124, 126, 126, 126, 126, 126,
129, 129, 129, 129, 129, 131, 131, 131, 132, 132, 132, 134, 134, 134, 135, 135, 135, 137, 137, 137, 137, 139, 139, 139, 140, 140, 140, 142, 142, 142, 143, 143,
143, 145, 145, 145, 145, 147, 147, 147, 148, 148, 148, 150, 150, 150, 151, 151, 151, 153, 153, 153, 153, 155, 155, 155, 156, 156, 156, 158, 158, 158, 159, 159,
159, 161, 161, 161, 161, 163, 163, 163, 164, 164, 164, 166, 166, 166, 167, 167, 167, 169, 169, 169, 169, 171, 171, 171, 172, 172, 172, 174, 174, 174, 175, 175,
175, 177, 177, 177, 177, 179, 179, 179, 180, 180, 180, 182, 182, 182, 183, 183, 183, 185, 185, 185, 185, 187, 187, 187, 188, 188, 188, 190, 190, 190, 191, 191,
191, 193, 193, 193, 193, 195, 195, 195, 196, 196, 196, 198, 198, 198, 199, 199, 199, 201, 201, 201, 201, 203, 203, 203, 204, 204, 204, 206, 206, 206, 207, 207,
207, 209, 209, 209, 209, 211, 211, 211, 212, 212, 212, 214, 214, 214, 215, 215, 215, 217, 217, 217, 217, 219, 219, 219, 220, 220, 220, 222, 222, 222, 223, 223,
223, 225, 225, 225, 225, 227, 227, 227, 228, 228, 228, 230, 230, 230, 231, 231, 231, 233, 233, 233, 233, 235, 235, 235, 236, 236, 236, 238, 238, 238, 239, 239,
239, 241, 241, 241, 241, 243, 243, 243, 244, 244, 244, 246, 246, 246, 247, 247, 247, 249, 249, 249, 249, 251, 251, 251, 252, 252, 252, 254, 254, 254, 255, 255
},
{ // QUANT_192
0, 0, 1, 1, 2, 2, 2, 4, 4, 4, 5, 5, 6, 6, 6, 8, 8, 8, 9, 9, 10, 10, 10, 12, 12, 12, 13, 13, 14, 14, 14, 16,
16, 16, 17, 17, 18, 18, 18, 20, 20, 20, 21, 21, 22, 22, 22, 24, 24, 24, 25, 25, 26, 26, 26, 28, 28, 28, 29, 29, 30, 30, 30, 32,
32, 32, 33, 33, 34, 34, 34, 36, 36, 36, 37, 37, 38, 38, 38, 40, 40, 40, 41, 41, 42, 42, 42, 44, 44, 44, 45, 45, 46, 46, 46, 48,
48, 48, 49, 49, 50, 50, 50, 52, 52, 52, 53, 53, 54, 54, 54, 56, 56, 56, 57, 57, 58, 58, 58, 60, 60, 60, 61, 61, 62, 62, 62, 64,
64, 64, 65, 65, 66, 66, 66, 68, 68, 68, 69, 69, 70, 70, 70, 72, 72, 72, 73, 73, 74, 74, 74, 76, 76, 76, 77, 77, 78, 78, 78, 80,
80, 80, 81, 81, 82, 82, 82, 84, 84, 84, 85, 85, 86, 86, 86, 88, 88, 88, 89, 89, 90, 90, 90, 92, 92, 92, 93, 93, 94, 94, 94, 96,
96, 96, 97, 97, 98, 98, 98, 100, 100, 100, 101, 101, 102, 102, 102, 104, 104, 104, 105, 105, 106, 106, 106, 108, 108, 108, 109, 109, 110, 110, 110, 112,
112, 112, 113, 113, 114, 114, 114, 116, 116, 116, 117, 117, 118, 118, 118, 120, 120, 120, 121, 121, 122, 122, 122, 124, 124, 124, 125, 125, 126, 126, 126, 126,
129, 129, 129, 129, 130, 130, 131, 131, 131, 133, 133, 133, 134, 134, 135, 135, 135, 137, 137, 137, 138, 138, 139, 139, 139, 141, 141, 141, 142, 142, 143, 143,
143, 145, 145, 145, 146, 146, 147, 147, 147, 149, 149, 149, 150, 150, 151, 151, 151, 153, 153, 153, 154, 154, 155, 155, 155, 157, 157, 157, 158, 158, 159, 159,
159, 161, 161, 161, 162, 162, 163, 163, 163, 165, 165, 165, 166, 166, 167, 167, 167, 169, 169, 169, 170, 170, 171, 171, 171, 173, 173, 173, 174, 174, 175, 175,
175, 177, 177, 177, 178, 178, 179, 179, 179, 181, 181, 181, 182, 182, 183, 183, 183, 185, 185, 185, 186, 186, 187, 187, 187, 189, 189, 189, 190, 190, 191, 191,
191, 193, 193, 193, 194, 194, 195, 195, 195, 197, 197, 197, 198, 198, 199, 199, 199, 201, 201, 201, 202, 202, 203, 203, 203, 205, 205, 205, 206, 206, 207, 207,
207, 209, 209, 209, 210, 210, 211, 211, 211, 213, 213, 213, 214, 214, 215, 215, 215, 217, 217, 217, 218, 218, 219, 219, 219, 221, 221, 221, 222, 222, 223, 223,
223, 225, 225, 225, 226, 226, 227, 227, 227, 229, 229, 229, 230, 230, 231, 231, 231, 233, 233, 233, 234, 234, 235, 235, 235, 237, 237, 237, 238, 238, 239, 239,
239, 241, 241, 241, 242, 242, 243, 243, 243, 245, 245, 245, 246, 246, 247, 247, 247, 249, 249, 249, 250, 250, 251, 251, 251, 253, 253, 253, 254, 254, 255, 255
},
{ // QUANT_256
0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15,
16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31,
32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40, 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47,
48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56, 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63,
64, 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 72, 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79,
80, 80, 81, 81, 82, 82, 83, 83, 84, 84, 85, 85, 86, 86, 87, 87, 88, 88, 89, 89, 90, 90, 91, 91, 92, 92, 93, 93, 94, 94, 95, 95,
96, 96, 97, 97, 98, 98, 99, 99, 100, 100, 101, 101, 102, 102, 103, 103, 104, 104, 105, 105, 106, 106, 107, 107, 108, 108, 109, 109, 110, 110, 111, 111,
112, 112, 113, 113, 114, 114, 115, 115, 116, 116, 117, 117, 118, 118, 119, 119, 120, 120, 121, 121, 122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127,
128, 128, 129, 129, 130, 130, 131, 131, 132, 132, 133, 133, 134, 134, 135, 135, 136, 136, 137, 137, 138, 138, 139, 139, 140, 140, 141, 141, 142, 142, 143, 143,
144, 144, 145, 145, 146, 146, 147, 147, 148, 148, 149, 149, 150, 150, 151, 151, 152, 152, 153, 153, 154, 154, 155, 155, 156, 156, 157, 157, 158, 158, 159, 159,
160, 160, 161, 161, 162, 162, 163, 163, 164, 164, 165, 165, 166, 166, 167, 167, 168, 168, 169, 169, 170, 170, 171, 171, 172, 172, 173, 173, 174, 174, 175, 175,
176, 176, 177, 177, 178, 178, 179, 179, 180, 180, 181, 181, 182, 182, 183, 183, 184, 184, 185, 185, 186, 186, 187, 187, 188, 188, 189, 189, 190, 190, 191, 191,
192, 192, 193, 193, 194, 194, 195, 195, 196, 196, 197, 197, 198, 198, 199, 199, 200, 200, 201, 201, 202, 202, 203, 203, 204, 204, 205, 205, 206, 206, 207, 207,
208, 208, 209, 209, 210, 210, 211, 211, 212, 212, 213, 213, 214, 214, 215, 215, 216, 216, 217, 217, 218, 218, 219, 219, 220, 220, 221, 221, 222, 222, 223, 223,
224, 224, 225, 225, 226, 226, 227, 227, 228, 228, 229, 229, 230, 230, 231, 231, 232, 232, 233, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, 239, 239,
240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255, 255
},
};
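/*
   Lookup sketch (an assumption about the intended indexing, not upstream
   code): each 512-entry row samples the 0..255 domain at half steps, so
   indexing with 2 * v + 1 yields a round-to-nearest quantize-then-unquantize
   of an 8-bit value v in a single load. For example, in the QUANT_6 row
   (table index 0) the levels are {0, 51, 102, 153, 204, 255}, and v = 100
   maps to 102, the nearest representable level.
*/
static inline uint8_t quantize_unquantize_sketch(unsigned int quant_table_index, uint8_t v)
{
	return color_unquant_to_uquant_tables[quant_table_index][2 * v + 1];
}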
// Starts from QUANT_6
// Scrambled
const uint8_t color_uquant_to_scrambled_pquant_tables[17][256] {
{ // QUANT_6
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
},
{ // QUANT_8
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
},
{ // QUANT_10
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
},
{ // QUANT_12
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
},
{ // QUANT_16
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15
},
{ // QUANT_20
0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
17, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1
},
{ // QUANT_24
0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 14, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 13, 13, 13,
13, 13, 13, 13, 13, 13, 13, 13, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1
},
{ // QUANT_32
0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4,
4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6,
6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8,
8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10,
10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12,
12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 14,
14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17,
18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
19, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21,
21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23,
23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25,
25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29,
29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31
},
{ // QUANT_40
0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16,
24, 24, 24, 24, 24, 24, 24, 32, 32, 32, 32, 32, 32, 32, 2, 2,
2, 2, 2, 2, 10, 10, 10, 10, 10, 10, 18, 18, 18, 18, 18, 18,
18, 26, 26, 26, 26, 26, 26, 34, 34, 34, 34, 34, 34, 34, 4, 4,
4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 12, 20, 20, 20, 20, 20,
20, 28, 28, 28, 28, 28, 28, 28, 36, 36, 36, 36, 36, 36, 36, 6,
6, 6, 6, 6, 6, 14, 14, 14, 14, 14, 14, 22, 22, 22, 22, 22,
22, 22, 30, 30, 30, 30, 30, 30, 38, 38, 38, 38, 38, 38, 38, 38,
39, 39, 39, 39, 39, 39, 39, 39, 31, 31, 31, 31, 31, 31, 23, 23,
23, 23, 23, 23, 23, 15, 15, 15, 15, 15, 15, 7, 7, 7, 7, 7,
7, 37, 37, 37, 37, 37, 37, 37, 29, 29, 29, 29, 29, 29, 29, 21,
21, 21, 21, 21, 21, 13, 13, 13, 13, 13, 13, 13, 5, 5, 5, 5,
5, 5, 35, 35, 35, 35, 35, 35, 35, 27, 27, 27, 27, 27, 27, 19,
19, 19, 19, 19, 19, 19, 11, 11, 11, 11, 11, 11, 3, 3, 3, 3,
3, 3, 33, 33, 33, 33, 33, 33, 33, 25, 25, 25, 25, 25, 25, 25,
17, 17, 17, 17, 17, 17, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1
},
{ // QUANT_48
0, 0, 0, 16, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 2, 2,
2, 2, 2, 18, 18, 18, 18, 18, 34, 34, 34, 34, 34, 34, 4, 4,
4, 4, 4, 20, 20, 20, 20, 20, 20, 36, 36, 36, 36, 36, 6, 6,
6, 6, 6, 22, 22, 22, 22, 22, 22, 38, 38, 38, 38, 38, 38, 8,
8, 8, 8, 8, 24, 24, 24, 24, 24, 40, 40, 40, 40, 40, 40, 10,
10, 10, 10, 10, 26, 26, 26, 26, 26, 42, 42, 42, 42, 42, 42, 12,
12, 12, 12, 12, 28, 28, 28, 28, 28, 28, 44, 44, 44, 44, 44, 14,
14, 14, 14, 14, 30, 30, 30, 30, 30, 30, 46, 46, 46, 46, 46, 46,
47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 15, 15, 15, 15,
15, 45, 45, 45, 45, 45, 29, 29, 29, 29, 29, 29, 13, 13, 13, 13,
13, 43, 43, 43, 43, 43, 43, 27, 27, 27, 27, 27, 11, 11, 11, 11,
11, 41, 41, 41, 41, 41, 41, 25, 25, 25, 25, 25, 9, 9, 9, 9,
9, 39, 39, 39, 39, 39, 39, 23, 23, 23, 23, 23, 23, 7, 7, 7,
7, 7, 37, 37, 37, 37, 37, 21, 21, 21, 21, 21, 21, 5, 5, 5,
5, 5, 35, 35, 35, 35, 35, 35, 19, 19, 19, 19, 19, 3, 3, 3,
3, 3, 33, 33, 33, 33, 33, 33, 17, 17, 17, 17, 17, 1, 1, 1
},
{ // QUANT_64
0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4,
4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8,
8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12,
12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 15, 16,
16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20,
20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24,
24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, 28,
28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31,
32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35,
36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 39, 39,
40, 40, 40, 40, 41, 41, 41, 41, 42, 42, 42, 42, 43, 43, 43, 43,
44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47,
47, 48, 48, 48, 48, 49, 49, 49, 49, 50, 50, 50, 50, 51, 51, 51,
51, 52, 52, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 55,
55, 56, 56, 56, 56, 57, 57, 57, 57, 58, 58, 58, 58, 59, 59, 59,
59, 60, 60, 60, 60, 61, 61, 61, 61, 62, 62, 62, 62, 63, 63, 63
},
{ // QUANT_80
0, 0, 16, 16, 16, 32, 32, 32, 48, 48, 48, 64, 64, 64, 64, 2,
2, 2, 18, 18, 18, 34, 34, 34, 50, 50, 50, 66, 66, 66, 66, 4,
4, 4, 20, 20, 20, 36, 36, 36, 52, 52, 52, 52, 68, 68, 68, 6,
6, 6, 22, 22, 22, 38, 38, 38, 54, 54, 54, 54, 70, 70, 70, 8,
8, 8, 24, 24, 24, 40, 40, 40, 40, 56, 56, 56, 72, 72, 72, 10,
10, 10, 26, 26, 26, 42, 42, 42, 42, 58, 58, 58, 74, 74, 74, 12,
12, 12, 28, 28, 28, 28, 44, 44, 44, 60, 60, 60, 76, 76, 76, 14,
14, 14, 30, 30, 30, 30, 46, 46, 46, 62, 62, 62, 78, 78, 78, 78,
79, 79, 79, 79, 63, 63, 63, 47, 47, 47, 31, 31, 31, 31, 15, 15,
15, 77, 77, 77, 61, 61, 61, 45, 45, 45, 29, 29, 29, 29, 13, 13,
13, 75, 75, 75, 59, 59, 59, 43, 43, 43, 43, 27, 27, 27, 11, 11,
11, 73, 73, 73, 57, 57, 57, 41, 41, 41, 41, 25, 25, 25, 9, 9,
9, 71, 71, 71, 55, 55, 55, 55, 39, 39, 39, 23, 23, 23, 7, 7,
7, 69, 69, 69, 53, 53, 53, 53, 37, 37, 37, 21, 21, 21, 5, 5,
5, 67, 67, 67, 67, 51, 51, 51, 35, 35, 35, 19, 19, 19, 3, 3,
3, 65, 65, 65, 65, 49, 49, 49, 33, 33, 33, 17, 17, 17, 1, 1
},
{ // QUANT_96
0, 32, 32, 32, 64, 64, 64, 2, 2, 34, 34, 34, 66, 66, 66, 4,
4, 36, 36, 36, 68, 68, 68, 6, 6, 38, 38, 38, 70, 70, 70, 8,
8, 8, 40, 40, 72, 72, 72, 10, 10, 10, 42, 42, 74, 74, 74, 12,
12, 12, 44, 44, 76, 76, 76, 14, 14, 14, 46, 46, 78, 78, 78, 16,
16, 16, 48, 48, 48, 80, 80, 80, 18, 18, 50, 50, 50, 82, 82, 82,
20, 20, 52, 52, 52, 84, 84, 84, 22, 22, 54, 54, 54, 86, 86, 86,
24, 24, 56, 56, 56, 88, 88, 88, 26, 26, 58, 58, 58, 90, 90, 90,
28, 28, 60, 60, 60, 92, 92, 92, 30, 30, 62, 62, 62, 94, 94, 94,
95, 95, 95, 63, 63, 63, 31, 31, 93, 93, 93, 61, 61, 61, 29, 29,
91, 91, 91, 59, 59, 59, 27, 27, 89, 89, 89, 57, 57, 57, 25, 25,
87, 87, 87, 55, 55, 55, 23, 23, 85, 85, 85, 53, 53, 53, 21, 21,
83, 83, 83, 51, 51, 51, 19, 19, 81, 81, 81, 49, 49, 49, 17, 17,
17, 79, 79, 79, 47, 47, 15, 15, 15, 77, 77, 77, 45, 45, 13, 13,
13, 75, 75, 75, 43, 43, 11, 11, 11, 73, 73, 73, 41, 41, 9, 9,
9, 71, 71, 71, 39, 39, 39, 7, 7, 69, 69, 69, 37, 37, 37, 5,
5, 67, 67, 67, 35, 35, 35, 3, 3, 65, 65, 65, 33, 33, 33, 1
},
{ // QUANT_128
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16,
16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24,
24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32,
32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40,
40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48,
48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56,
56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 63,
64, 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71,
72, 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79,
80, 80, 81, 81, 82, 82, 83, 83, 84, 84, 85, 85, 86, 86, 87, 87,
88, 88, 89, 89, 90, 90, 91, 91, 92, 92, 93, 93, 94, 94, 95, 95,
96, 96, 97, 97, 98, 98, 99, 99, 100, 100, 101, 101, 102, 102, 103, 103,
104, 104, 105, 105, 106, 106, 107, 107, 108, 108, 109, 109, 110, 110, 111, 111,
112, 112, 113, 113, 114, 114, 115, 115, 116, 116, 117, 117, 118, 118, 119, 119,
120, 120, 121, 121, 122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127
},
{ // QUANT_160
0, 32, 64, 64, 96, 128, 128, 128, 2, 34, 66, 66, 98, 130, 130, 130,
4, 36, 68, 68, 100, 132, 132, 132, 6, 38, 70, 70, 102, 134, 134, 134,
8, 40, 72, 72, 104, 136, 136, 136, 10, 42, 74, 74, 106, 138, 138, 138,
12, 44, 76, 76, 108, 140, 140, 140, 14, 46, 78, 78, 110, 142, 142, 142,
16, 48, 80, 80, 112, 144, 144, 144, 18, 50, 82, 82, 114, 146, 146, 146,
20, 52, 84, 84, 116, 148, 148, 148, 22, 54, 86, 86, 118, 150, 150, 150,
24, 56, 88, 88, 120, 152, 152, 152, 26, 58, 90, 90, 122, 154, 154, 154,
28, 60, 92, 92, 124, 156, 156, 156, 30, 62, 94, 94, 126, 158, 158, 158,
159, 159, 159, 127, 95, 95, 63, 31, 157, 157, 157, 125, 93, 93, 61, 29,
155, 155, 155, 123, 91, 91, 59, 27, 153, 153, 153, 121, 89, 89, 57, 25,
151, 151, 151, 119, 87, 87, 55, 23, 149, 149, 149, 117, 85, 85, 53, 21,
147, 147, 147, 115, 83, 83, 51, 19, 145, 145, 145, 113, 81, 81, 49, 17,
143, 143, 143, 111, 79, 79, 47, 15, 141, 141, 141, 109, 77, 77, 45, 13,
139, 139, 139, 107, 75, 75, 43, 11, 137, 137, 137, 105, 73, 73, 41, 9,
135, 135, 135, 103, 71, 71, 39, 7, 133, 133, 133, 101, 69, 69, 37, 5,
131, 131, 131, 99, 67, 67, 35, 3, 129, 129, 129, 97, 65, 65, 33, 1
},
{ // QUANT_192
0, 64, 128, 128, 2, 66, 130, 130, 4, 68, 132, 132, 6, 70, 134, 134,
8, 72, 136, 136, 10, 74, 138, 138, 12, 76, 140, 140, 14, 78, 142, 142,
16, 80, 144, 144, 18, 82, 146, 146, 20, 84, 148, 148, 22, 86, 150, 150,
24, 88, 152, 152, 26, 90, 154, 154, 28, 92, 156, 156, 30, 94, 158, 158,
32, 96, 160, 160, 34, 98, 162, 162, 36, 100, 164, 164, 38, 102, 166, 166,
40, 104, 168, 168, 42, 106, 170, 170, 44, 108, 172, 172, 46, 110, 174, 174,
48, 112, 176, 176, 50, 114, 178, 178, 52, 116, 180, 180, 54, 118, 182, 182,
56, 120, 184, 184, 58, 122, 186, 186, 60, 124, 188, 188, 62, 126, 190, 190,
191, 191, 127, 63, 189, 189, 125, 61, 187, 187, 123, 59, 185, 185, 121, 57,
183, 183, 119, 55, 181, 181, 117, 53, 179, 179, 115, 51, 177, 177, 113, 49,
175, 175, 111, 47, 173, 173, 109, 45, 171, 171, 107, 43, 169, 169, 105, 41,
167, 167, 103, 39, 165, 165, 101, 37, 163, 163, 99, 35, 161, 161, 97, 33,
159, 159, 95, 31, 157, 157, 93, 29, 155, 155, 91, 27, 153, 153, 89, 25,
151, 151, 87, 23, 149, 149, 85, 21, 147, 147, 83, 19, 145, 145, 81, 17,
143, 143, 79, 15, 141, 141, 77, 13, 139, 139, 75, 11, 137, 137, 73, 9,
135, 135, 71, 7, 133, 133, 69, 5, 131, 131, 67, 3, 129, 129, 65, 1
},
{ // QUANT_256
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
}
};
#endif
// Scrambled pquant to uquant tables; starts from QUANT_6
static const uint8_t color_scrambled_pquant_to_uquant_q6[6] {
0, 255, 51, 204, 102, 153
};
static const uint8_t color_scrambled_pquant_to_uquant_q8[8] {
0, 36, 73, 109, 146, 182, 219, 255
};
static const uint8_t color_scrambled_pquant_to_uquant_q10[10] {
0, 255, 28, 227, 56, 199, 84, 171, 113, 142
};
static const uint8_t color_scrambled_pquant_to_uquant_q12[12] {
0, 255, 69, 186, 23, 232, 92, 163, 46, 209, 116, 139
};
static const uint8_t color_scrambled_pquant_to_uquant_q16[16] {
0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255
};
static const uint8_t color_scrambled_pquant_to_uquant_q20[20] {
0, 255, 67, 188, 13, 242, 80, 175, 27, 228, 94, 161, 40, 215, 107, 148,
54, 201, 121, 134
};
static const uint8_t color_scrambled_pquant_to_uquant_q24[24] {
0, 255, 33, 222, 66, 189, 99, 156, 11, 244, 44, 211, 77, 178, 110, 145,
22, 233, 55, 200, 88, 167, 121, 134
};
static const uint8_t color_scrambled_pquant_to_uquant_q32[32] {
0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123,
132, 140, 148, 156, 165, 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255
};
static const uint8_t color_scrambled_pquant_to_uquant_q40[40] {
0, 255, 32, 223, 65, 190, 97, 158, 6, 249, 39, 216, 71, 184, 104, 151,
13, 242, 45, 210, 78, 177, 110, 145, 19, 236, 52, 203, 84, 171, 117, 138,
26, 229, 58, 197, 91, 164, 123, 132
};
static const uint8_t color_scrambled_pquant_to_uquant_q48[48] {
0, 255, 16, 239, 32, 223, 48, 207, 65, 190, 81, 174, 97, 158, 113, 142,
5, 250, 21, 234, 38, 217, 54, 201, 70, 185, 86, 169, 103, 152, 119, 136,
11, 244, 27, 228, 43, 212, 59, 196, 76, 179, 92, 163, 108, 147, 124, 131
};
static const uint8_t color_scrambled_pquant_to_uquant_q64[64] {
0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
65, 69, 73, 77, 81, 85, 89, 93, 97, 101, 105, 109, 113, 117, 121, 125,
130, 134, 138, 142, 146, 150, 154, 158, 162, 166, 170, 174, 178, 182, 186, 190,
195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239, 243, 247, 251, 255
};
static const uint8_t color_scrambled_pquant_to_uquant_q80[80] {
0, 255, 16, 239, 32, 223, 48, 207, 64, 191, 80, 175, 96, 159, 112, 143,
3, 252, 19, 236, 35, 220, 51, 204, 67, 188, 83, 172, 100, 155, 116, 139,
6, 249, 22, 233, 38, 217, 54, 201, 71, 184, 87, 168, 103, 152, 119, 136,
9, 246, 25, 230, 42, 213, 58, 197, 74, 181, 90, 165, 106, 149, 122, 133,
13, 242, 29, 226, 45, 210, 61, 194, 77, 178, 93, 162, 109, 146, 125, 130
};
static const uint8_t color_scrambled_pquant_to_uquant_q96[96] {
0, 255, 8, 247, 16, 239, 24, 231, 32, 223, 40, 215, 48, 207, 56, 199,
64, 191, 72, 183, 80, 175, 88, 167, 96, 159, 104, 151, 112, 143, 120, 135,
2, 253, 10, 245, 18, 237, 26, 229, 35, 220, 43, 212, 51, 204, 59, 196,
67, 188, 75, 180, 83, 172, 91, 164, 99, 156, 107, 148, 115, 140, 123, 132,
5, 250, 13, 242, 21, 234, 29, 226, 37, 218, 45, 210, 53, 202, 61, 194,
70, 185, 78, 177, 86, 169, 94, 161, 102, 153, 110, 145, 118, 137, 126, 129
};
static const uint8_t color_scrambled_pquant_to_uquant_q128[128] {
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94,
96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126,
129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159,
161, 163, 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191,
193, 195, 197, 199, 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223,
225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255
};
static const uint8_t color_scrambled_pquant_to_uquant_q160[160] {
0, 255, 8, 247, 16, 239, 24, 231, 32, 223, 40, 215, 48, 207, 56, 199,
64, 191, 72, 183, 80, 175, 88, 167, 96, 159, 104, 151, 112, 143, 120, 135,
1, 254, 9, 246, 17, 238, 25, 230, 33, 222, 41, 214, 49, 206, 57, 198,
65, 190, 73, 182, 81, 174, 89, 166, 97, 158, 105, 150, 113, 142, 121, 134,
3, 252, 11, 244, 19, 236, 27, 228, 35, 220, 43, 212, 51, 204, 59, 196,
67, 188, 75, 180, 83, 172, 91, 164, 99, 156, 107, 148, 115, 140, 123, 132,
4, 251, 12, 243, 20, 235, 28, 227, 36, 219, 44, 211, 52, 203, 60, 195,
68, 187, 76, 179, 84, 171, 92, 163, 100, 155, 108, 147, 116, 139, 124, 131,
6, 249, 14, 241, 22, 233, 30, 225, 38, 217, 46, 209, 54, 201, 62, 193,
70, 185, 78, 177, 86, 169, 94, 161, 102, 153, 110, 145, 118, 137, 126, 129
};
static const uint8_t color_scrambled_pquant_to_uquant_q192[192] {
0, 255, 4, 251, 8, 247, 12, 243, 16, 239, 20, 235, 24, 231, 28, 227,
32, 223, 36, 219, 40, 215, 44, 211, 48, 207, 52, 203, 56, 199, 60, 195,
64, 191, 68, 187, 72, 183, 76, 179, 80, 175, 84, 171, 88, 167, 92, 163,
96, 159, 100, 155, 104, 151, 108, 147, 112, 143, 116, 139, 120, 135, 124, 131,
1, 254, 5, 250, 9, 246, 13, 242, 17, 238, 21, 234, 25, 230, 29, 226,
33, 222, 37, 218, 41, 214, 45, 210, 49, 206, 53, 202, 57, 198, 61, 194,
65, 190, 69, 186, 73, 182, 77, 178, 81, 174, 85, 170, 89, 166, 93, 162,
97, 158, 101, 154, 105, 150, 109, 146, 113, 142, 117, 138, 121, 134, 125, 130,
2, 253, 6, 249, 10, 245, 14, 241, 18, 237, 22, 233, 26, 229, 30, 225,
34, 221, 38, 217, 42, 213, 46, 209, 50, 205, 54, 201, 58, 197, 62, 193,
66, 189, 70, 185, 74, 181, 78, 177, 82, 173, 86, 169, 90, 165, 94, 161,
98, 157, 102, 153, 106, 149, 110, 145, 114, 141, 118, 137, 122, 133, 126, 129
};
static const uint8_t color_scrambled_pquant_to_uquant_q256[256] {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
};
const uint8_t* color_scrambled_pquant_to_uquant_tables[17] {
color_scrambled_pquant_to_uquant_q6,
color_scrambled_pquant_to_uquant_q8,
color_scrambled_pquant_to_uquant_q10,
color_scrambled_pquant_to_uquant_q12,
color_scrambled_pquant_to_uquant_q16,
color_scrambled_pquant_to_uquant_q20,
color_scrambled_pquant_to_uquant_q24,
color_scrambled_pquant_to_uquant_q32,
color_scrambled_pquant_to_uquant_q40,
color_scrambled_pquant_to_uquant_q48,
color_scrambled_pquant_to_uquant_q64,
color_scrambled_pquant_to_uquant_q80,
color_scrambled_pquant_to_uquant_q96,
color_scrambled_pquant_to_uquant_q128,
color_scrambled_pquant_to_uquant_q160,
color_scrambled_pquant_to_uquant_q192,
color_scrambled_pquant_to_uquant_q256
};
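// Usage sketch (illustrative, not part of the upstream sources): unpacking a
// scrambled physical color value is a single lookup in the table selected by
// the active quantization mode, e.g. for QUANT_12:
//
//     const uint8_t* unpack = color_scrambled_pquant_to_uquant_tables[QUANT_12 - QUANT_6];
//     uint8_t uq = unpack[3];   // -> 186 for scrambled pquant value 3
//
// This assumes the quant_method enumeration is contiguous from QUANT_6; it is
// the same indexing idiom used by physical_to_symbolic() later in this commit.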
// The quant_mode_table[integer_count/2][bits] gives us the quantization level for a given integer
// count and the number of bits that the integers must fit into.
const int8_t quant_mode_table[10][128] {
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
},
{
-1, -1, 0, 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 6, 7,
8, 8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 1, 1, 1,
2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 7,
8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13,
14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 19, 19, 19,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5,
5, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 10, 10,
10, 10, 11, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14,
15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 19, 19, 19, 19,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11,
12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15,
16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2,
2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6,
6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9,
9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13,
13, 13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16,
16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7,
8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10,
11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13,
14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19
},
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4,
4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6,
6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9,
9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11,
12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14,
14, 14, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 17, 17
}
};
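// Usage sketch (illustrative): eight color integers that must fit into 40
// bits of storage select their quantization level via
//
//     int level = quant_mode_table[8 / 2][40];   // -> 11
//
// which is the highest level whose ISE encoding of eight values still fits
// (index 11 corresponds to QUANT_32, assuming the usual QUANT_2..QUANT_256
// enumeration order); a result of -1 means no level fits.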


@@ -0,0 +1,544 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for converting between symbolic and physical encodings.
*/
#include "astcenc_internal.h"
#include <cassert>
/**
* @brief Reverse bits in a byte.
*
* @param p The value to reverse.
*
* @return The reversed result.
*/
static inline int bitrev8(int p)
{
p = ((p & 0x0F) << 4) | ((p >> 4) & 0x0F);
p = ((p & 0x33) << 2) | ((p >> 2) & 0x33);
p = ((p & 0x55) << 1) | ((p >> 1) & 0x55);
return p;
}
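// Worked example (illustrative): bitrev8(0xB0) swaps nibbles, then bit pairs,
// then adjacent bits, fully reversing the byte: 0b10110000 -> 0b00001101,
// i.e. bitrev8(0xB0) == 0x0D.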
/**
* @brief Read up to 8 bits at an arbitrary bit offset.
*
* The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may
* span two separate bytes in memory.
*
* @param bitcount The number of bits to read.
* @param bitoffset The bit offset to read from, between 0 and 7.
* @param[in,out] ptr The data pointer to read from.
*
* @return The read value.
*/
static inline int read_bits(
int bitcount,
int bitoffset,
const uint8_t* ptr
) {
int mask = (1 << bitcount) - 1;
ptr += bitoffset >> 3;
bitoffset &= 7;
int value = ptr[0] | (ptr[1] << 8);
value >>= bitoffset;
value &= mask;
return value;
}
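// Worked example (illustrative): for ptr[] = { 0xAB, 0xCD } the call
// read_bits(8, 4, ptr) assembles 0xCDAB, shifts right by 4 to 0x0CDA, and
// masks to 8 bits, returning 0xDA -- the high nibble of byte 0 joined with
// the low nibble of byte 1.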
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/**
* @brief Write up to 8 bits at an arbitrary bit offset.
*
* The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so
* may span two separate bytes in memory.
*
* @param value The value to write.
* @param bitcount The number of bits to write, starting from LSB.
* @param bitoffset The bit offset to store at, between 0 and 7.
* @param[in,out] ptr The data pointer to write to.
*/
static inline void write_bits(
int value,
int bitcount,
int bitoffset,
uint8_t* ptr
) {
int mask = (1 << bitcount) - 1;
value &= mask;
ptr += bitoffset >> 3;
bitoffset &= 7;
value <<= bitoffset;
mask <<= bitoffset;
mask = ~mask;
ptr[0] &= mask;
ptr[0] |= value;
ptr[1] &= mask >> 8;
ptr[1] |= value >> 8;
}
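// Worked example (illustrative): write_bits(0xDA, 8, 4, ptr) clears bit
// positions 4..11 across ptr[0] and ptr[1] and stores the value there; for a
// zeroed buffer this leaves ptr[0] == 0xA0 and ptr[1] == 0x0D, the exact
// inverse of the read_bits() example above.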
/* See header for documentation. */
void symbolic_to_physical(
const block_size_descriptor& bsd,
const symbolic_compressed_block& scb,
uint8_t pcb[16]
) {
assert(scb.block_type != SYM_BTYPE_ERROR);
// Constant color block using UNORM16 colors
if (scb.block_type == SYM_BTYPE_CONST_U16)
{
// There is currently no attempt to coalesce larger void-extents
static const uint8_t cbytes[8] { 0xFC, 0xFD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
for (unsigned int i = 0; i < 8; i++)
{
pcb[i] = cbytes[i];
}
for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
{
pcb[2 * i + 8] = scb.constant_color[i] & 0xFF;
pcb[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
}
return;
}
// Constant color block using FP16 colors
if (scb.block_type == SYM_BTYPE_CONST_F16)
{
// There is currently no attempt to coalesce larger void-extents
static const uint8_t cbytes[8] { 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
for (unsigned int i = 0; i < 8; i++)
{
pcb[i] = cbytes[i];
}
for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
{
pcb[2 * i + 8] = scb.constant_color[i] & 0xFF;
pcb[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
}
return;
}
unsigned int partition_count = scb.partition_count;
// Compress the weights.
// They are encoded as an ordinary integer-sequence, then bit-reversed
uint8_t weightbuf[16] { 0 };
const auto& bm = bsd.get_block_mode(scb.block_mode);
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
int weight_count = di.weight_count;
quant_method weight_quant_method = bm.get_weight_quant_mode();
float weight_quant_levels = static_cast<float>(get_quant_level(weight_quant_method));
int is_dual_plane = bm.is_dual_plane;
const auto& qat = quant_and_xfer_tables[weight_quant_method];
int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
uint8_t weights[64];
if (is_dual_plane)
{
for (int i = 0; i < weight_count; i++)
{
float uqw = static_cast<float>(scb.weights[i]);
float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
int qwi = static_cast<int>(qw + 0.5f);
weights[2 * i] = qat.scramble_map[qwi];
uqw = static_cast<float>(scb.weights[i + WEIGHTS_PLANE2_OFFSET]);
qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
qwi = static_cast<int>(qw + 0.5f);
weights[2 * i + 1] = qat.scramble_map[qwi];
}
}
else
{
for (int i = 0; i < weight_count; i++)
{
float uqw = static_cast<float>(scb.weights[i]);
float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
int qwi = static_cast<int>(qw + 0.5f);
weights[i] = qat.scramble_map[qwi];
}
}
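// Quantization arithmetic sketch (illustrative): an unquantized weight of 32
// at a five-level quant mode maps as (32 / 64.0) * (5 - 1) = 2.0, so qwi == 2
// after rounding, and the scramble_map lookup then remaps it into the
// scrambled value order expected by the ISE encoder.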
encode_ise(weight_quant_method, real_weight_count, weights, weightbuf, 0);
for (int i = 0; i < 16; i++)
{
pcb[i] = static_cast<uint8_t>(bitrev8(weightbuf[15 - i]));
}
write_bits(scb.block_mode, 11, 0, pcb);
write_bits(partition_count - 1, 2, 11, pcb);
int below_weights_pos = 128 - bits_for_weights;
// Encode partition index and color endpoint types for blocks with 2+ partitions
if (partition_count > 1)
{
write_bits(scb.partition_index, 6, 13, pcb);
write_bits(scb.partition_index >> 6, PARTITION_INDEX_BITS - 6, 19, pcb);
if (scb.color_formats_matched)
{
write_bits(scb.color_formats[0] << 2, 6, 13 + PARTITION_INDEX_BITS, pcb);
}
else
{
// Check endpoint types for each partition to determine the lowest class present
int low_class = 4;
for (unsigned int i = 0; i < partition_count; i++)
{
int class_of_format = scb.color_formats[i] >> 2;
low_class = astc::min(class_of_format, low_class);
}
if (low_class == 3)
{
low_class = 2;
}
int encoded_type = low_class + 1;
int bitpos = 2;
for (unsigned int i = 0; i < partition_count; i++)
{
int classbit_of_format = (scb.color_formats[i] >> 2) - low_class;
encoded_type |= classbit_of_format << bitpos;
bitpos++;
}
for (unsigned int i = 0; i < partition_count; i++)
{
int lowbits_of_format = scb.color_formats[i] & 3;
encoded_type |= lowbits_of_format << bitpos;
bitpos += 2;
}
int encoded_type_lowpart = encoded_type & 0x3F;
int encoded_type_highpart = encoded_type >> 6;
int encoded_type_highpart_size = (3 * partition_count) - 4;
int encoded_type_highpart_pos = 128 - bits_for_weights - encoded_type_highpart_size;
write_bits(encoded_type_lowpart, 6, 13 + PARTITION_INDEX_BITS, pcb);
write_bits(encoded_type_highpart, encoded_type_highpart_size, encoded_type_highpart_pos, pcb);
below_weights_pos -= encoded_type_highpart_size;
}
}
else
{
write_bits(scb.color_formats[0], 4, 13, pcb);
}
// In dual-plane mode, encode the color component of the second plane of weights
if (is_dual_plane)
{
write_bits(scb.plane2_component, 2, below_weights_pos - 2, pcb);
}
// Encode the color components
uint8_t values_to_encode[32];
int valuecount_to_encode = 0;
const uint8_t* pack_table = color_uquant_to_scrambled_pquant_tables[scb.quant_mode - QUANT_6];
for (unsigned int i = 0; i < scb.partition_count; i++)
{
int vals = 2 * (scb.color_formats[i] >> 2) + 2;
assert(vals <= 8);
for (int j = 0; j < vals; j++)
{
values_to_encode[j + valuecount_to_encode] = pack_table[scb.color_values[i][j]];
}
valuecount_to_encode += vals;
}
encode_ise(scb.get_color_quant_mode(), valuecount_to_encode, values_to_encode, pcb,
scb.partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS);
}
#endif
/* See header for documentation. */
void physical_to_symbolic(
const block_size_descriptor& bsd,
const uint8_t pcb[16],
symbolic_compressed_block& scb
) {
uint8_t bswapped[16];
scb.block_type = SYM_BTYPE_NONCONST;
// Extract header fields
int block_mode = read_bits(11, 0, pcb);
if ((block_mode & 0x1FF) == 0x1FC)
{
// Constant color block
// Check what format the data has
if (block_mode & 0x200)
{
scb.block_type = SYM_BTYPE_CONST_F16;
}
else
{
scb.block_type = SYM_BTYPE_CONST_U16;
}
scb.partition_count = 0;
for (int i = 0; i < 4; i++)
{
scb.constant_color[i] = pcb[2 * i + 8] | (pcb[2 * i + 9] << 8);
}
// Additionally, check that the void-extent coordinates are valid
if (bsd.zdim == 1)
{
// 2D void-extent
int rsvbits = read_bits(2, 10, pcb);
if (rsvbits != 3)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
// Low values span 3 bytes, so they need two read_bits calls
int vx_low_s = read_bits(8, 12, pcb) | (read_bits(5, 12 + 8, pcb) << 8);
int vx_high_s = read_bits(13, 25, pcb);
int vx_low_t = read_bits(8, 38, pcb) | (read_bits(5, 38 + 8, pcb) << 8);
int vx_high_t = read_bits(13, 51, pcb);
int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF &&
vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t) && !all_ones)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
}
else
{
// 3D void-extent
int vx_low_s = read_bits(9, 10, pcb);
int vx_high_s = read_bits(9, 19, pcb);
int vx_low_t = read_bits(9, 28, pcb);
int vx_high_t = read_bits(9, 37, pcb);
int vx_low_r = read_bits(9, 46, pcb);
int vx_high_r = read_bits(9, 55, pcb);
int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF &&
vx_low_t == 0x1FF && vx_high_t == 0x1FF &&
vx_low_r == 0x1FF && vx_high_r == 0x1FF;
if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_r >= vx_high_r) && !all_ones)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
}
return;
}
unsigned int packed_index = bsd.block_mode_packed_index[block_mode];
if (packed_index == BLOCK_BAD_BLOCK_MODE)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
const auto& bm = bsd.get_block_mode(block_mode);
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
int weight_count = di.weight_count;
promise(weight_count > 0);
quant_method weight_quant_method = static_cast<quant_method>(bm.quant_mode);
int is_dual_plane = bm.is_dual_plane;
int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
int partition_count = read_bits(2, 11, pcb) + 1;
promise(partition_count > 0);
scb.block_mode = static_cast<uint16_t>(block_mode);
scb.partition_count = static_cast<uint8_t>(partition_count);
for (int i = 0; i < 16; i++)
{
bswapped[i] = static_cast<uint8_t>(bitrev8(pcb[15 - i]));
}
int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
int below_weights_pos = 128 - bits_for_weights;
uint8_t indices[64];
const auto& qat = quant_and_xfer_tables[weight_quant_method];
decode_ise(weight_quant_method, real_weight_count, bswapped, indices, 0);
if (is_dual_plane)
{
for (int i = 0; i < weight_count; i++)
{
scb.weights[i] = qat.unscramble_and_unquant_map[indices[2 * i]];
scb.weights[i + WEIGHTS_PLANE2_OFFSET] = qat.unscramble_and_unquant_map[indices[2 * i + 1]];
}
}
else
{
for (int i = 0; i < weight_count; i++)
{
scb.weights[i] = qat.unscramble_and_unquant_map[indices[i]];
}
}
if (is_dual_plane && partition_count == 4)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
scb.color_formats_matched = 0;
// Determine the format of each endpoint pair
int color_formats[BLOCK_MAX_PARTITIONS];
int encoded_type_highpart_size = 0;
if (partition_count == 1)
{
color_formats[0] = read_bits(4, 13, pcb);
scb.partition_index = 0;
}
else
{
encoded_type_highpart_size = (3 * partition_count) - 4;
below_weights_pos -= encoded_type_highpart_size;
int encoded_type = read_bits(6, 13 + PARTITION_INDEX_BITS, pcb) |
(read_bits(encoded_type_highpart_size, below_weights_pos, pcb) << 6);
int baseclass = encoded_type & 0x3;
if (baseclass == 0)
{
for (int i = 0; i < partition_count; i++)
{
color_formats[i] = (encoded_type >> 2) & 0xF;
}
below_weights_pos += encoded_type_highpart_size;
scb.color_formats_matched = 1;
encoded_type_highpart_size = 0;
}
else
{
int bitpos = 2;
baseclass--;
for (int i = 0; i < partition_count; i++)
{
color_formats[i] = (((encoded_type >> bitpos) & 1) + baseclass) << 2;
bitpos++;
}
for (int i = 0; i < partition_count; i++)
{
color_formats[i] |= (encoded_type >> bitpos) & 3;
bitpos += 2;
}
}
scb.partition_index = static_cast<uint16_t>(read_bits(10, 13, pcb));
}
for (int i = 0; i < partition_count; i++)
{
scb.color_formats[i] = static_cast<uint8_t>(color_formats[i]);
}
// Determine number of color endpoint integers
int color_integer_count = 0;
for (int i = 0; i < partition_count; i++)
{
int endpoint_class = color_formats[i] >> 2;
color_integer_count += (endpoint_class + 1) * 2;
}
if (color_integer_count > 18)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
// Determine the color endpoint format to use
static const int color_bits_arr[5] { -1, 115 - 4, 113 - 4 - PARTITION_INDEX_BITS, 113 - 4 - PARTITION_INDEX_BITS, 113 - 4 - PARTITION_INDEX_BITS };
int color_bits = color_bits_arr[partition_count] - bits_for_weights - encoded_type_highpart_size;
if (is_dual_plane)
{
color_bits -= 2;
}
if (color_bits < 0)
{
color_bits = 0;
}
int color_quant_level = quant_mode_table[color_integer_count >> 1][color_bits];
if (color_quant_level < QUANT_6)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
// Unpack the integer color values and assign to endpoints
scb.quant_mode = static_cast<quant_method>(color_quant_level);
uint8_t values_to_decode[32];
decode_ise(static_cast<quant_method>(color_quant_level), color_integer_count, pcb,
values_to_decode, (partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS));
int valuecount_to_decode = 0;
const uint8_t* unpack_table = color_scrambled_pquant_to_uquant_tables[scb.quant_mode - QUANT_6];
for (int i = 0; i < partition_count; i++)
{
int vals = 2 * (color_formats[i] >> 2) + 2;
for (int j = 0; j < vals; j++)
{
scb.color_values[i][j] = unpack_table[values_to_decode[j + valuecount_to_decode]];
}
valuecount_to_decode += vals;
}
// Fetch the component used by the second plane of weights in dual-plane mode.
scb.plane2_component = -1;
if (is_dual_plane)
{
scb.plane2_component = static_cast<int8_t>(read_bits(2, below_weights_pos - 2, pcb));
}
}

608
thirdparty/astcenc/astcenc_vecmathlib.h vendored Normal file

@@ -0,0 +1,608 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2025 Arm Limited
// Copyright 2008 Jose Fonseca
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/*
* This module implements vector support for floats, ints, and vector lane
* control masks. It provides access to both explicit vector width types, and
* flexible N-wide types where N can be determined at compile time.
*
* The design of this module encourages use of vector length agnostic code, via
* the vint, vfloat, and vmask types. These will take on the widest SIMD vector
 * width that is available at compile time. The current vector width is
* accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
*
* Explicit scalar types are accessible via the vint1, vfloat1, vmask1 types.
* These are provided primarily for prototyping and algorithm debug of VLA
* implementations.
*
* Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
* types. These are provided for use by VLA code, but are also expected to be
 * used as a fixed-width type, and are supported by a reference C++ fallback for
* use on platforms without SIMD intrinsics.
*
* Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
 * types. These are provided for use by VLA code, and are not expected to be
* used as a fixed-width type in normal code. No reference C implementation is
* provided on platforms without underlying SIMD intrinsics.
*
* With the current implementation ISA support is provided for:
*
* * 1-wide for scalar reference
* * 4-wide for Armv8-A NEON
* * 4-wide for x86-64 SSE2
* * 4-wide for x86-64 SSE4.1
* * 8-wide for Armv8-A SVE
* * 8-wide for x86-64 AVX2
*/
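/* Usage sketch (illustrative; src, dst, and count are hypothetical, and the
 * loada()/storea() helpers are assumed from the ISA headers selected below):
 * vector length agnostic code strides by ASTCENC_SIMD_WIDTH, so one loop
 * body compiles to 4-wide or 8-wide code depending on the chosen ISA.
 *
 *     size_t bound = round_down_to_simd_multiple_vla(count);
 *     for (size_t i = 0; i < bound; i += ASTCENC_SIMD_WIDTH)
 *     {
 *         vfloat a = loada(src + i);    // aligned vector load
 *         storea(a * 2.0f, dst + i);    // aligned vector store
 *     }
 */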
#ifndef ASTC_VECMATHLIB_H_INCLUDED
#define ASTC_VECMATHLIB_H_INCLUDED
#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
#include <immintrin.h>
#endif
#if ASTCENC_SVE != 0
#include <arm_sve.h>
#include <arm_neon_sve_bridge.h>
#endif
#if ASTCENC_NEON != 0
#include <arm_neon.h>
#endif
#if !defined(__clang__) && defined(_MSC_VER)
#define ASTCENC_SIMD_INLINE __forceinline
#define ASTCENC_NO_INLINE
#elif defined(__GNUC__) && !defined(__clang__)
#define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
#define ASTCENC_NO_INLINE __attribute__ ((noinline))
#else
#define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
#define ASTCENC_NO_INLINE __attribute__ ((noinline))
#endif
template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indices);
#if ASTCENC_AVX >= 2
// If we have AVX2 expose 8-wide VLA.
#include "astcenc_vecmathlib_sse_4.h"
#include "astcenc_vecmathlib_common_4.h"
#include "astcenc_vecmathlib_avx2_8.h"
#define ASTCENC_SIMD_WIDTH 8
using vfloat = vfloat8;
#if defined(ASTCENC_NO_INVARIANCE)
using vfloatacc = vfloat8;
#else
using vfloatacc = vfloat4;
#endif
using vint = vint8;
using vmask = vmask8;
using vtable_16x8 = vtable8_16x8;
using vtable_32x8 = vtable8_32x8;
using vtable_64x8 = vtable8_64x8;
constexpr auto loada = vfloat8::loada;
constexpr auto load1 = vfloat8::load1;
constexpr auto vint_from_size = vint8_from_size;
#elif ASTCENC_SSE >= 20
// If we have SSE expose 4-wide VLA, and 4-wide fixed width.
#include "astcenc_vecmathlib_sse_4.h"
#include "astcenc_vecmathlib_common_4.h"
#define ASTCENC_SIMD_WIDTH 4
using vfloat = vfloat4;
using vfloatacc = vfloat4;
using vint = vint4;
using vmask = vmask4;
using vtable_16x8 = vtable4_16x8;
using vtable_32x8 = vtable4_32x8;
using vtable_64x8 = vtable4_64x8;
constexpr auto loada = vfloat4::loada;
constexpr auto load1 = vfloat4::load1;
constexpr auto vint_from_size = vint4_from_size;
#elif ASTCENC_SVE == 8
// Check the compiler is configured with fixed-length 256-bit SVE.
#if !defined(__ARM_FEATURE_SVE_BITS) || (__ARM_FEATURE_SVE_BITS != 256)
#error "__ARM_FEATURE_SVE_BITS is not set to 256 bits"
#endif
// If we have SVE configured as 8-wide, expose 8-wide VLA.
#include "astcenc_vecmathlib_neon_4.h"
#include "astcenc_vecmathlib_common_4.h"
#include "astcenc_vecmathlib_sve_8.h"
#define ASTCENC_SIMD_WIDTH 8
using vfloat = vfloat8;
#if defined(ASTCENC_NO_INVARIANCE)
using vfloatacc = vfloat8;
#else
using vfloatacc = vfloat4;
#endif
using vint = vint8;
using vmask = vmask8;
using vtable_16x8 = vtable8_16x8;
using vtable_32x8 = vtable8_32x8;
using vtable_64x8 = vtable8_64x8;
constexpr auto loada = vfloat8::loada;
constexpr auto load1 = vfloat8::load1;
constexpr auto vint_from_size = vint8_from_size;
#elif ASTCENC_NEON > 0
// If we have NEON expose 4-wide VLA.
#include "astcenc_vecmathlib_neon_4.h"
#include "astcenc_vecmathlib_common_4.h"
#define ASTCENC_SIMD_WIDTH 4
using vfloat = vfloat4;
using vfloatacc = vfloat4;
using vint = vint4;
using vmask = vmask4;
using vtable_16x8 = vtable4_16x8;
using vtable_32x8 = vtable4_32x8;
using vtable_64x8 = vtable4_64x8;
constexpr auto loada = vfloat4::loada;
constexpr auto load1 = vfloat4::load1;
constexpr auto vint_from_size = vint4_from_size;
#else
// If we have nothing expose 4-wide VLA, and 4-wide fixed width.
// Note: We no longer expose the 1-wide scalar fallback because it is not
// invariant with the 4-wide path due to algorithms that use horizontal
// operations that accumulate a local vector sum before accumulating into
// a running sum.
//
// For 4 items adding into an accumulator using 1-wide vectors the sum is:
//
// result = ((((sum + l0) + l1) + l2) + l3)
//
// ... whereas the accumulator for a 4-wide vector sum is:
//
// result = sum + ((l0 + l2) + (l1 + l3))
//
// In "normal maths" this is the same, but the floating point reassociation
// differences mean that these will not produce the same result.
#include "astcenc_vecmathlib_none_4.h"
#include "astcenc_vecmathlib_common_4.h"
#define ASTCENC_SIMD_WIDTH 4
using vfloat = vfloat4;
using vfloatacc = vfloat4;
using vint = vint4;
using vmask = vmask4;
using vtable_16x8 = vtable4_16x8;
using vtable_32x8 = vtable4_32x8;
using vtable_64x8 = vtable4_64x8;
constexpr auto loada = vfloat4::loada;
constexpr auto load1 = vfloat4::load1;
constexpr auto vint_from_size = vint4_from_size;
#endif
/**
* @brief Round a count down to the largest multiple of the SIMD width.
*
 * Assumes that the vector width is a power of two.
*
* @param count The unrounded value.
*
* @return The rounded value.
*/
ASTCENC_SIMD_INLINE size_t round_down_to_simd_multiple_vla(size_t count)
{
return count & static_cast<size_t>(~(ASTCENC_SIMD_WIDTH - 1));
}
/**
 * @brief Round a count up to the smallest multiple of the SIMD width.
*
 * Assumes that the vector width is a power of two.
*
* @param count The unrounded value.
*
* @return The rounded value.
*/
ASTCENC_SIMD_INLINE size_t round_up_to_simd_multiple_vla(size_t count)
{
size_t multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
return multiples * ASTCENC_SIMD_WIDTH;
}
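// Worked example (illustrative): with an 8-wide SIMD configuration a count of
// 13 rounds down to 8 and up to 16, while a count that is already a multiple
// of the width (e.g. 16) is returned unchanged by both helpers.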
/**
* @brief Return @c a with lanes negated if the @c b lane is negative.
*/
ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
{
vint ia = float_as_int(a);
vint ib = float_as_int(b);
vint sign_mask(static_cast<int>(0x80000000));
vint r = ia ^ (ib & sign_mask);
return int_as_float(r);
}
/**
* @brief Return fast, but approximate, vector atan(x).
*
* Max error of this implementation is 0.004883.
*/
ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
{
vmask c = abs(x) > vfloat(1.0f);
vfloat z = change_sign(vfloat(astc::PI_OVER_TWO), x);
vfloat y = select(x, vfloat(1.0f) / x, c);
y = y / (y * y * vfloat(0.28f) + vfloat(1.0f));
return select(y, z - y, c);
}
/**
* @brief Return fast, but approximate, vector atan2(x, y).
*/
ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
{
vfloat z = atan(abs(y / x));
vmask xmask = x < vfloat::zero();
return change_sign(select(z, vfloat(astc::PI) - z, xmask), y);
}
/**
* @brief Factory that returns a unit length 4 component vfloat4.
*/
static ASTCENC_SIMD_INLINE vfloat4 unit4()
{
return vfloat4(0.5f);
}
/**
* @brief Factory that returns a unit length 3 component vfloat4.
*/
static ASTCENC_SIMD_INLINE vfloat4 unit3()
{
float val = 0.577350258827209473f;
return vfloat4(val, val, val, 0.0f);
}
/**
* @brief Factory that returns a unit length 2 component vfloat4.
*/
static ASTCENC_SIMD_INLINE vfloat4 unit2()
{
float val = 0.707106769084930420f;
return vfloat4(val, val, 0.0f, 0.0f);
}
/**
* @brief Factory that returns a 3 component vfloat4.
*/
static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
{
return vfloat4(a, b, c, 0.0f);
}
/**
* @brief Factory that returns a 2 component vfloat4.
*/
static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
{
return vfloat4(a, b, 0.0f, 0.0f);
}
/**
* @brief Normalize a non-zero length vector to unit length.
*/
static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
{
vfloat4 length = dot(a, a);
return a / sqrt(length);
}
/**
* @brief Normalize a vector, returning @c safe if len is zero.
*/
static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
{
vfloat4 length = dot(a, a);
if (length.lane<0>() != 0.0f)
{
return a / sqrt(length);
}
return safe;
}
#define POLY0(x, c0) ( c0)
#define POLY1(x, c0, c1) ((POLY0(x, c1) * x) + c0)
#define POLY2(x, c0, c1, c2) ((POLY1(x, c1, c2) * x) + c0)
#define POLY3(x, c0, c1, c2, c3) ((POLY2(x, c1, c2, c3) * x) + c0)
#define POLY4(x, c0, c1, c2, c3, c4) ((POLY3(x, c1, c2, c3, c4) * x) + c0)
#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * x) + c0)
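// These macros expand to a Horner-form polynomial evaluation; for example,
// POLY2(x, c0, c1, c2) expands to ((c2 * x + c1) * x) + c0, which is
// c2*x^2 + c1*x + c0 evaluated with one multiply-add per coefficient.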
/**
* @brief Compute an approximate exp2(x) for each lane in the vector.
*
* Based on 5th degree minimax polynomials, ported from this blog
* https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
*/
static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
{
x = clamp(-126.99999f, 129.0f, x);
vint4 ipart = float_to_int(x - 0.5f);
vfloat4 fpart = x - int_to_float(ipart);
// Integer contrib, using 1 << ipart
vfloat4 iexp = int_as_float(lsl<23>(ipart + 127));
// Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5)
vfloat4 fexp = POLY5(fpart,
9.9999994e-1f,
6.9315308e-1f,
2.4015361e-1f,
5.5826318e-2f,
8.9893397e-3f,
1.8775767e-3f);
return iexp * fexp;
}
/**
* @brief Compute an approximate log2(x) for each lane in the vector.
*
* Based on 5th degree minimax polynomials, ported from this blog
* https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
*/
static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
{
vint4 exp(0x7F800000);
vint4 mant(0x007FFFFF);
vint4 one(0x3F800000);
vint4 i = float_as_int(x);
vfloat4 e = int_to_float(lsr<23>(i & exp) - 127);
vfloat4 m = int_as_float((i & mant) | one);
// Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
vfloat4 p = POLY4(m,
2.8882704548164776201f,
-2.52074962577807006663f,
1.48116647521213171641f,
-0.465725644288844778798f,
0.0596515482674574969533f);
// Increases the polynomial degree, but ensures that log2(1) == 0
p = p * (m - 1.0f);
return p + e;
}
/**
* @brief Compute an approximate pow(x, y) for each lane in the vector.
*
* Power function based on the exp2(log2(x) * y) transform.
*/
static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
{
vmask4 zero_mask = y == vfloat4(0.0f);
vfloat4 estimate = exp2(log2(x) * y);
// Guarantee that y == 0 returns exactly 1.0f
return select(estimate, vfloat4(1.0f), zero_mask);
}
/**
* @brief Count the leading zeros for each lane in @c a.
*
* Valid for all data values of @c a; will return a per-lane value [0, 32].
*/
static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
{
// This function is a horrible abuse of floating point exponents to convert
// the original integer value into a 2^N encoding we can recover easily.
// Convert to float without risk of rounding up by keeping only top 8 bits.
// This trick is guaranteed to keep the top 8 bits and clear the 9th.
a = (~lsr<8>(a)) & a;
a = float_as_int(int_to_float(a));
// Extract and unbias exponent
a = vint4(127 + 31) - lsr<23>(a);
// Clamp result to a valid 32-bit range
return clamp(0, 32, a);
}
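// Worked examples (illustrative): clz(vint4(1)) yields 31 in every lane
// (float 1.0f has biased exponent 127, so 158 - 127 == 31), clz(vint4(0))
// clamps to 32, and clz(vint4(1 << 16)) yields 15.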
/**
* @brief Return lanewise 2^a for each lane in @c a.
*
* Use of signed int means that this is only valid for values in range [0, 31].
*/
static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
{
// 2^30 is the largest power of two that can be represented in a signed int
assert(all(a < vint4(31)));
// This function is a horrible abuse of floating point to use the exponent
// and float conversion to generate a 2^N multiple.
// Bias the exponent
vint4 exp = a + 127;
exp = lsl<23>(exp);
// Reinterpret the bits as a float, and then convert to an int
vfloat4 f = int_as_float(exp);
return float_to_int(f);
}
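// Worked example (illustrative): two_to_the_n(vint4(5)) biases the exponent
// to 132, reinterprets the bits as the float 32.0f, and converts back to an
// integer, yielding 32 in every lane.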
/**
* @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
*/
static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
{
vint4 fp16_one = vint4(0x3C00);
vint4 fp16_small = lsl<8>(p);
vmask4 is_one = p == vint4(0xFFFF);
vmask4 is_small = p < vint4(4);
// Manually inline clz() on Visual Studio to avoid release build codegen bug
// see https://github.com/ARM-software/astc-encoder/issues/259
#if !defined(__clang__) && defined(_MSC_VER)
vint4 a = (~lsr<8>(p)) & p;
a = float_as_int(int_to_float(a));
a = vint4(127 + 31) - lsr<23>(a);
vint4 lz = clamp(0, 32, a) - 16;
#else
vint4 lz = clz(p) - 16;
#endif
p = p * two_to_the_n(lz + 1);
p = p & vint4(0xFFFF);
p = lsr<6>(p);
p = p | lsl<10>(vint4(14) - lz);
vint4 r = select(p, fp16_one, is_one);
r = select(r, fp16_small, is_small);
return r;
}
/**
* @brief Convert 16-bit LNS to float16.
*/
static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
{
vint4 mc = p & 0x7FF;
vint4 ec = lsr<11>(p);
vint4 mc_512 = mc * 3;
vmask4 mask_512 = mc < vint4(512);
vint4 mc_1536 = mc * 4 - 512;
vmask4 mask_1536 = mc < vint4(1536);
vint4 mc_else = mc * 5 - 2048;
vint4 mt = mc_else;
mt = select(mt, mc_1536, mask_1536);
mt = select(mt, mc_512, mask_512);
vint4 res = lsl<10>(ec) | lsr<3>(mt);
return min(res, vint4(0x7BFF));
}
/**
* @brief Extract mantissa and exponent of a float value.
*
* @param a The input value.
* @param[out] exp The output exponent.
*
* @return The mantissa.
*/
static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
{
// Interpret the bits as an integer
vint4 ai = float_as_int(a);
// Extract and unbias the exponent
exp = (lsr<23>(ai) & 0xFF) - 126;
// Extract and unbias the mantissa
vint4 manti = (ai & static_cast<int>(0x807FFFFF)) | 0x3F000000;
return int_as_float(manti);
}
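// Worked example (illustrative): frexp(vfloat4(6.0f), exp) returns a mantissa
// of 0.75f with exp == 3 in every lane, since 6.0 == 0.75 * 2^3, matching the
// C library frexp() convention of a mantissa in [0.5, 1).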
/**
* @brief Convert float to 16-bit LNS.
*/
static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
{
vint4 exp;
vfloat4 mant = frexp(a, exp);
// Do these early before we start messing about ...
vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
vmask4 mask_infinity = a >= vfloat4(65536.0f);
// If input is smaller than 2^-14, multiply by 2^25 and don't bias.
vmask4 exp_lt_m13 = exp < vint4(-13);
vfloat4 a1a = a * 33554432.0f;
vint4 expa = vint4::zero();
vfloat4 a1b = (mant - 0.5f) * 4096;
vint4 expb = exp + 14;
a = select(a1b, a1a, exp_lt_m13);
exp = select(expb, expa, exp_lt_m13);
vmask4 a_lt_384 = a < vfloat4(384.0f);
vmask4 a_lt_1408 = a <= vfloat4(1408.0f);
vfloat4 a2a = a * (4.0f / 3.0f);
vfloat4 a2b = a + 128.0f;
vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);
a = a2c;
a = select(a, a2b, a_lt_1408);
a = select(a, a2a, a_lt_384);
a = a + (int_to_float(exp) * 2048.0f) + 1.0f;
a = select(a, vfloat4(65535.0f), mask_infinity);
a = select(a, vfloat4::zero(), mask_underflow_nan);
return a;
}
namespace astc
{
static ASTCENC_SIMD_INLINE float pow(float x, float y)
{
return pow(vfloat4(x), vfloat4(y)).lane<0>();
}
}
#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED

File diff suppressed because it is too large


@@ -0,0 +1,421 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2020-2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Generic 4x32-bit vector functions.
*
* This module implements generic 4-wide vector functions that are valid for
* all instruction sets, typically implemented using lower level 4-wide
* operations that are ISA-specific.
*/
#ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
#define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
#ifndef ASTCENC_SIMD_INLINE
#error "Include astcenc_vecmathlib.h, do not include directly"
#endif
#include <cstdio>
#include <limits>
// ============================================================================
// vint4 operators and functions
// ============================================================================
/**
* @brief Overload: vector by scalar addition.
*/
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b)
{
return a + vint4(b);
}
/**
* @brief Overload: vector by vector incremental addition.
*/
ASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b)
{
a = a + b;
return a;
}
/**
* @brief Overload: vector by scalar subtraction.
*/
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b)
{
return a - vint4(b);
}
/**
* @brief Overload: vector by scalar multiplication.
*/
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b)
{
return a * vint4(b);
}
/**
* @brief Overload: vector by scalar bitwise or.
*/
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b)
{
return a | vint4(b);
}
/**
* @brief Overload: vector by scalar bitwise and.
*/
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b)
{
return a & vint4(b);
}
/**
* @brief Overload: vector by scalar bitwise xor.
*/
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b)
{
return a ^ vint4(b);
}
/**
* @brief Return the clamped value between min and max.
*/
ASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a)
{
return min(max(a, vint4(minv)), vint4(maxv));
}
/**
* @brief Return the horizontal sum of RGB vector lanes as a scalar.
*/
ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
{
return a.lane<0>() + a.lane<1>() + a.lane<2>();
}
/**
* @brief Return the horizontal minimum of a vector.
*/
ASTCENC_SIMD_INLINE int hmin_s(vint4 a)
{
return hmin(a).lane<0>();
}
/**
* @brief Generate a vint4 from a size_t.
*/
ASTCENC_SIMD_INLINE vint4 vint4_from_size(size_t a)
{
assert(a <= static_cast<size_t>(std::numeric_limits<int>::max()));
return vint4(static_cast<int>(a));
}
/**
* @brief Return the horizontal maximum of a vector.
*/
ASTCENC_SIMD_INLINE int hmax_s(vint4 a)
{
return hmax(a).lane<0>();
}
// ============================================================================
// vfloat4 operators and functions
// ============================================================================
/**
* @brief Overload: vector by vector incremental addition.
*/
ASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b)
{
a = a + b;
return a;
}
/**
* @brief Overload: vector by scalar addition.
*/
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b)
{
return a + vfloat4(b);
}
/**
* @brief Overload: vector by scalar subtraction.
*/
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b)
{
return a - vfloat4(b);
}
/**
* @brief Overload: vector by scalar multiplication.
*/
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b)
{
return a * vfloat4(b);
}
/**
* @brief Overload: scalar by vector multiplication.
*/
ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b)
{
return vfloat4(a) * b;
}
/**
* @brief Overload: vector by scalar division.
*/
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b)
{
return a / vfloat4(b);
}
/**
* @brief Overload: scalar by vector division.
*/
ASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b)
{
return vfloat4(a) / b;
}
/**
* @brief Return the min vector of a vector and a scalar.
*
* If either lane value is NaN, @c b will be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b)
{
return min(a, vfloat4(b));
}
/**
* @brief Return the max vector of a vector and a scalar.
*
* If either lane value is NaN, @c b will be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b)
{
return max(a, vfloat4(b));
}
/**
* @brief Return the clamped value between min and max.
*
* It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
* then @c min will be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
{
// Do not reorder - second operand will return if either is NaN
return min(max(a, minv), maxv);
}
/**
* @brief Return the clamped value between 0.0f and 1.0f.
*
* If @c a is NaN then zero will be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)
{
// Do not reorder - second operand will return if either is NaN
return min(max(a, vfloat4::zero()), 1.0f);
}
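// Editorial worked example: given the min/max semantics noted above (the
// second operand is returned when a lane is NaN), clamp(0.0f, 1.0f, NaN)
// first evaluates max(NaN, 0.0f) -> 0.0f, then min(0.0f, 1.0f) -> 0.0f, so
// NaN lanes deterministically collapse to the lower bound.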
/**
* @brief Return the horizontal minimum of a vector.
*/
ASTCENC_SIMD_INLINE float hmin_s(vfloat4 a)
{
return hmin(a).lane<0>();
}
/**
* @brief Return the horizontal min of RGB vector lanes as a scalar.
*/
ASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a)
{
a.set_lane<3>(a.lane<0>());
return hmin_s(a);
}
/**
* @brief Return the horizontal maximum of a vector.
*/
ASTCENC_SIMD_INLINE float hmax_s(vfloat4 a)
{
return hmax(a).lane<0>();
}
/**
* @brief Accumulate lane-wise sums for a vector.
*/
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a)
{
accum = accum + a;
}
/**
* @brief Accumulate lane-wise sums for a masked vector.
*/
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m)
{
a = select(vfloat4::zero(), a, m);
haccumulate(accum, a);
}
/**
* @brief Return the horizontal sum of RGB vector lanes as a scalar.
*/
ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a)
{
return a.lane<0>() + a.lane<1>() + a.lane<2>();
}
#if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT)
/**
* @brief Return the dot product for the full 4 lanes, returning scalar.
*/
ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)
{
vfloat4 m = a * b;
return hadd_s(m);
}
/**
* @brief Return the dot product for the full 4 lanes, returning vector.
*/
ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
{
vfloat4 m = a * b;
return vfloat4(hadd_s(m));
}
/**
* @brief Return the dot product for the bottom 3 lanes, returning scalar.
*/
ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b)
{
vfloat4 m = a * b;
return hadd_rgb_s(m);
}
/**
* @brief Return the dot product for the bottom 3 lanes, returning vector.
*/
ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
{
vfloat4 m = a * b;
float d3 = hadd_rgb_s(m);
return vfloat4(d3, d3, d3, 0.0f);
}
#endif
#if !defined(ASTCENC_USE_NATIVE_POPCOUNT)
/**
* @brief Population bit count.
*
* @param v The value to population count.
*
* @return The number of 1 bits.
*/
static inline int popcount(uint64_t v)
{
uint64_t mask1 = 0x5555555555555555ULL;
uint64_t mask2 = 0x3333333333333333ULL;
uint64_t mask3 = 0x0F0F0F0F0F0F0F0FULL;
// Fold adjacent bits into per-2-bit counts
v -= (v >> 1) & mask1;
// Fold adjacent 2-bit counts into per-4-bit counts
v = (v & mask2) + ((v >> 2) & mask2);
// Fold adjacent 4-bit counts into per-byte counts
v += v >> 4;
v &= mask3;
// The multiply accumulates all byte counts into the top byte
v *= 0x0101010101010101ULL;
v >>= 56;
return static_cast<int>(v);
}
#endif
/**
* @brief Apply signed bit transfer.
*
* @param input0 The first encoded endpoint.
* @param input1 The second encoded endpoint.
*/
static ASTCENC_SIMD_INLINE void bit_transfer_signed(
vint4& input0,
vint4& input1
) {
// Transfer the top bit of input0 into the top bit of input1
input1 = lsr<1>(input1) | (input0 & 0x80);
// Keep the remaining 6 payload bits of input0
input0 = lsr<1>(input0) & 0x3F;
// Sign-extend input0 as a 6-bit two's complement value
vmask4 mask = (input0 & 0x20) != vint4::zero();
input0 = select(input0, input0 - 0x40, mask);
}
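// Editorial worked example: input0 = 0xF0, input1 = 0x50 gives
// input1 = (0x50 >> 1) | 0x80 = 0xA8 and input0 = (0xF0 >> 1) & 0x3F = 0x38;
// bit 5 of 0x38 is set, so input0 becomes 0x38 - 0x40 = -8.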
/**
* @brief Debug function to print a vector of ints.
*/
ASTCENC_SIMD_INLINE void print(vint4 a)
{
ASTCENC_ALIGNAS int v[4];
storea(a, v);
printf("v4_i32:\n %8d %8d %8d %8d\n",
v[0], v[1], v[2], v[3]);
}
/**
* @brief Debug function to print a vector of ints, in hexadecimal.
*/
ASTCENC_SIMD_INLINE void printx(vint4 a)
{
ASTCENC_ALIGNAS int v[4];
storea(a, v);
unsigned int uv[4];
std::memcpy(uv, v, sizeof(int) * 4);
printf("v4_i32:\n %08x %08x %08x %08x\n",
uv[0], uv[1], uv[2], uv[3]);
}
/**
* @brief Debug function to print a vector of floats.
*/
ASTCENC_SIMD_INLINE void print(vfloat4 a)
{
ASTCENC_ALIGNAS float v[4];
storea(a, v);
printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n",
static_cast<double>(v[0]), static_cast<double>(v[1]),
static_cast<double>(v[2]), static_cast<double>(v[3]));
}
/**
* @brief Debug function to print a vector of masks.
*/
ASTCENC_SIMD_INLINE void print(vmask4 a)
{
print(select(vint4(0), vint4(1), a));
}
#endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,496 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/**
* @brief Functions for angular-sum algorithm for weight alignment.
*
* This algorithm works as follows:
* - we compute a complex number P as (cos s*i, sin s*i) for each weight,
* where i is the input value and s is a scaling factor based on the spacing between the weights.
* - we then add together complex numbers for all the weights.
* - we then compute the length and angle of the resulting sum.
*
* This should produce the following results:
* - perfect alignment results in a vector whose length is equal to the sum of lengths of all inputs
* - even distribution results in a vector of length 0.
* - all samples identical results in perfect alignment for every scaling.
*
* For each scaling factor within a given set, we compute an alignment factor from 0 to 1. This
* should then result in some scalings standing out as having particularly good alignment factors;
* we can use this to produce a set of candidate scale/shift values for various quantization levels;
* we should then actually try them and see what happens.
*/
#include "astcenc_internal.h"
#include "astcenc_vecmathlib.h"
#include <stdio.h>
#include <cassert>
#include <cstring>
#include <cfloat>
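/**
 * Editorial sketch (not part of upstream astcenc; assumes <cmath> is visible
 * via astcenc_internal.h): a scalar reference for the alignment factor
 * described in the file comment above. It returns a value in [0, 1], where
 * 1.0 means all weights are perfectly aligned to a common grid of spacing
 * 1/s, and values near 0 indicate an even spread.
 */
static inline float sketch_alignment_factor(const float* weights, unsigned int count, float s)
{
float x = 0.0f;
float y = 0.0f;
for (unsigned int i = 0; i < count; i++)
{
// Each weight contributes a unit vector at angle 2 * pi * s * w
x += cosf(2.0f * astc::PI * s * weights[i]);
y += sinf(2.0f * astc::PI * s * weights[i]);
}
// Perfect alignment sums to length == count; an even spread sums to ~0
return sqrtf(x * x + y * y) / static_cast<float>(count);
}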
static constexpr unsigned int ANGULAR_STEPS { 32 };
static_assert((ANGULAR_STEPS % ASTCENC_SIMD_WIDTH) == 0,
"ANGULAR_STEPS must be multiple of ASTCENC_SIMD_WIDTH");
static_assert(ANGULAR_STEPS >= 32,
"ANGULAR_STEPS must be at least max(steps_for_quant_level)");
// Store a reduced sin/cos table for 64 possible weight values; this causes
// slight quality loss compared to using sin() and cos() directly. Must be 2^N.
static constexpr unsigned int SINCOS_STEPS { 64 };
static const uint8_t steps_for_quant_level[12] {
2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
};
ASTCENC_ALIGNAS static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
ASTCENC_ALIGNAS static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
#if defined(ASTCENC_DIAGNOSTICS)
static bool print_once { true };
#endif
/* See header for documentation. */
void prepare_angular_tables()
{
for (unsigned int i = 0; i < ANGULAR_STEPS; i++)
{
float angle_step = static_cast<float>(i + 1);
for (unsigned int j = 0; j < SINCOS_STEPS; j++)
{
sin_table[j][i] = static_cast<float>(sinf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
cos_table[j][i] = static_cast<float>(cosf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
}
}
}
/**
* @brief Compute the angular alignment factors and offsets.
*
* @param weight_count The number of (decimated) weights.
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
* @param max_angular_steps The maximum number of steps to be tested.
* @param[out] offsets The output angular offsets array.
*/
static void compute_angular_offsets(
unsigned int weight_count,
const float* dec_weight_ideal_value,
unsigned int max_angular_steps,
float* offsets
) {
promise(weight_count > 0);
promise(max_angular_steps > 0);
ASTCENC_ALIGNAS int isamplev[BLOCK_MAX_WEIGHTS];
// Precompute isample; arrays are always allocated 64 elements long
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
{
// Ideal weight can be outside [0, 1] range, so clamp to fit table
vfloat ideal_weight = clampzo(loada(dec_weight_ideal_value + i));
// Convert a weight to a sincos table index
vfloat sample = ideal_weight * (SINCOS_STEPS - 1.0f);
vint isample = float_to_int_rtn(sample);
storea(isample, isamplev + i);
}
// Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max
vfloat mult(1.0f / (2.0f * astc::PI));
for (unsigned int i = 0; i < max_angular_steps; i += ASTCENC_SIMD_WIDTH)
{
vfloat anglesum_x = vfloat::zero();
vfloat anglesum_y = vfloat::zero();
for (unsigned int j = 0; j < weight_count; j++)
{
int isample = isamplev[j];
anglesum_x += loada(cos_table[isample] + i);
anglesum_y += loada(sin_table[isample] + i);
}
vfloat angle = atan2(anglesum_y, anglesum_x);
vfloat ofs = angle * mult;
storea(ofs, offsets + i);
}
}
/**
* @brief For a given step size compute the lowest and highest weight.
*
* Compute the lowest and highest weight that results from quantizing using the given stepsize and
* offset, and then compute the resulting error. The cut errors indicate the error that results from
* forcing samples that should have had one weight value one step up or down.
*
* @param weight_count The number of (decimated) weights.
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
* @param max_angular_steps The maximum number of steps to be tested.
* @param max_quant_steps The maximum quantization level to be tested.
* @param offsets The angular offsets array.
* @param[out] lowest_weight Per angular step, the lowest weight.
* @param[out] weight_span Per angular step, the span between lowest and highest weight.
* @param[out] error Per angular step, the error.
* @param[out] cut_low_weight_error Per angular step, the low weight cut error.
* @param[out] cut_high_weight_error Per angular step, the high weight cut error.
*/
static void compute_lowest_and_highest_weight(
unsigned int weight_count,
const float* dec_weight_ideal_value,
unsigned int max_angular_steps,
unsigned int max_quant_steps,
const float* offsets,
float* lowest_weight,
int* weight_span,
float* error,
float* cut_low_weight_error,
float* cut_high_weight_error
) {
promise(weight_count > 0);
promise(max_angular_steps > 0);
vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
// Compute minimum/maximum weights in the weight array. Our remapping
// is monotonic, so the min/max rounded weights relate to the min/max
// unrounded weights in a straightforward way.
vfloat min_weight(FLT_MAX);
vfloat max_weight(-FLT_MAX);
vint lane_id = vint::lane_id();
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
{
vmask active = lane_id < vint(weight_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vfloat weights = loada(dec_weight_ideal_value + i);
min_weight = min(min_weight, select(min_weight, weights, active));
max_weight = max(max_weight, select(max_weight, weights, active));
}
min_weight = hmin(min_weight);
max_weight = hmax(max_weight);
// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
{
vfloat errval = vfloat::zero();
vfloat cut_low_weight_err = vfloat::zero();
vfloat cut_high_weight_err = vfloat::zero();
vfloat offset = loada(offsets + sp);
// We know the min and max weight values, so we can figure out
// the corresponding indices before we enter the loop.
vfloat minidx = round(min_weight * rcp_stepsize - offset);
vfloat maxidx = round(max_weight * rcp_stepsize - offset);
for (unsigned int j = 0; j < weight_count; j++)
{
vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
vfloat svalrte = round(sval);
vfloat diff = sval - svalrte;
errval += diff * diff;
// Accumulate errors for minimum index
vmask mask = svalrte == minidx;
vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
cut_low_weight_err = select(cut_low_weight_err, accum, mask);
// Accumulate errors for maximum index
mask = svalrte == maxidx;
accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
cut_high_weight_err = select(cut_high_weight_err, accum, mask);
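// Derivation (editorial): forcing a sample at the min/max index one step
// inward changes diff by -/+1, so its squared error moves from diff^2 to
// (diff -/+ 1)^2 == diff^2 + 1 -/+ 2 * diff; the (1 - 2*diff) and
// (1 + 2*diff) terms accumulated above are exactly those increments.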
}
// Write out min weight and weight span; clamp span to a usable range
vint span = float_to_int(maxidx - minidx + vfloat(1));
span = min(span, vint(max_quant_steps + 3));
span = max(span, vint(2));
storea(minidx, lowest_weight + sp);
storea(span, weight_span + sp);
// The cut_(lowest/highest)_weight_error indicate the error that results from forcing
// samples that should have had the weight value one step (up/down).
vfloat ssize = 1.0f / rcp_stepsize;
vfloat errscale = ssize * ssize;
storea(errval * errscale, error + sp);
storea(cut_low_weight_err * errscale, cut_low_weight_error + sp);
storea(cut_high_weight_err * errscale, cut_high_weight_error + sp);
rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH);
}
}
/**
* @brief The main function for the angular algorithm.
*
* @param weight_count The number of (decimated) weights.
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
* @param max_quant_level The maximum quantization level to be tested.
* @param[out] low_value Per angular step, the lowest weight value.
* @param[out] high_value Per angular step, the highest weight value.
*/
static void compute_angular_endpoints_for_quant_levels(
unsigned int weight_count,
const float* dec_weight_ideal_value,
unsigned int max_quant_level,
float low_value[TUNE_MAX_ANGULAR_QUANT + 1],
float high_value[TUNE_MAX_ANGULAR_QUANT + 1]
) {
unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
ASTCENC_ALIGNAS float angular_offsets[ANGULAR_STEPS];
compute_angular_offsets(weight_count, dec_weight_ideal_value,
max_angular_steps, angular_offsets);
ASTCENC_ALIGNAS float lowest_weight[ANGULAR_STEPS];
ASTCENC_ALIGNAS int32_t weight_span[ANGULAR_STEPS];
ASTCENC_ALIGNAS float error[ANGULAR_STEPS];
ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS];
ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS];
compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
max_angular_steps, max_quant_steps,
angular_offsets, lowest_weight, weight_span, error,
cut_low_weight_error, cut_high_weight_error);
// For each quantization level, find the best error terms. Use packed vectors so data-dependent
// branches can become selects. This involves some integer to float casts, but the values are
// small enough so they never round the wrong way.
vfloat4 best_results[36];
// Initialize the array to some safe defaults
promise(max_quant_steps > 0);
for (unsigned int i = 0; i < (max_quant_steps + 4); i++)
{
// Lane<0> = Best error
// Lane<1> = Best scale; -1 indicates no solution found
// Lane<2> = Cut low weight
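// Lane<3> = Unused (editorial note: always zero, padding to fill the vfloat4)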
best_results[i] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f);
}
promise(max_angular_steps > 0);
for (unsigned int i = 0; i < max_angular_steps; i++)
{
float i_flt = static_cast<float>(i);
int idx_span = weight_span[i];
float error_cut_low = error[i] + cut_low_weight_error[i];
float error_cut_high = error[i] + cut_high_weight_error[i];
float error_cut_low_high = error[i] + cut_low_weight_error[i] + cut_high_weight_error[i];
// Check best error against record N
vfloat4 best_result = best_results[idx_span];
vfloat4 new_result = vfloat4(error[i], i_flt, 0.0f, 0.0f);
vmask4 mask = vfloat4(best_result.lane<0>()) > vfloat4(error[i]);
best_results[idx_span] = select(best_result, new_result, mask);
// Check best error against record N-1 with either cut low or cut high
best_result = best_results[idx_span - 1];
new_result = vfloat4(error_cut_low, i_flt, 1.0f, 0.0f);
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low);
best_result = select(best_result, new_result, mask);
new_result = vfloat4(error_cut_high, i_flt, 0.0f, 0.0f);
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_high);
best_results[idx_span - 1] = select(best_result, new_result, mask);
// Check best error against record N-2 with both cut low and high
best_result = best_results[idx_span - 2];
new_result = vfloat4(error_cut_low_high, i_flt, 1.0f, 0.0f);
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low_high);
best_results[idx_span - 2] = select(best_result, new_result, mask);
}
for (unsigned int i = 0; i <= max_quant_level; i++)
{
unsigned int q = steps_for_quant_level[i];
int bsi = static_cast<int>(best_results[q].lane<1>());
// Did we find anything?
#if defined(ASTCENC_DIAGNOSTICS)
if ((bsi < 0) && print_once)
{
print_once = false;
printf("INFO: Unable to find full encoding within search error limit.\n\n");
}
#endif
bsi = astc::max(0, bsi);
float lwi = lowest_weight[bsi] + best_results[q].lane<2>();
float hwi = lwi + static_cast<float>(q) - 1.0f;
float stepsize = 1.0f / (1.0f + static_cast<float>(bsi));
low_value[i] = (angular_offsets[bsi] + lwi) * stepsize;
high_value[i] = (angular_offsets[bsi] + hwi) * stepsize;
}
}
/* See header for documentation. */
void compute_angular_endpoints_1plane(
bool only_always,
const block_size_descriptor& bsd,
const float* dec_weight_ideal_value,
unsigned int max_weight_quant,
compression_working_buffers& tmpbuf
) {
float (&low_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
float (&high_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
: bsd.decimation_mode_count_selected;
promise(max_decimation_modes > 0);
for (unsigned int i = 0; i < max_decimation_modes; i++)
{
const decimation_mode& dm = bsd.decimation_modes[i];
if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
{
continue;
}
unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
unsigned int max_precision = dm.maxprec_1plane;
if (max_precision > TUNE_MAX_ANGULAR_QUANT)
{
max_precision = TUNE_MAX_ANGULAR_QUANT;
}
if (max_precision > max_weight_quant)
{
max_precision = max_weight_quant;
}
compute_angular_endpoints_for_quant_levels(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
max_precision, low_values[i], high_values[i]);
}
unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
: bsd.block_mode_count_1plane_selected;
promise(max_block_modes > 0);
for (unsigned int i = 0; i < max_block_modes; i++)
{
const block_mode& bm = bsd.block_modes[i];
assert(!bm.is_dual_plane);
unsigned int quant_mode = bm.quant_mode;
unsigned int decim_mode = bm.decimation_mode;
if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
{
low_value[i] = low_values[decim_mode][quant_mode];
high_value[i] = high_values[decim_mode][quant_mode];
}
else
{
low_value[i] = 0.0f;
high_value[i] = 1.0f;
}
}
}
/* See header for documentation. */
void compute_angular_endpoints_2planes(
const block_size_descriptor& bsd,
const float* dec_weight_ideal_value,
unsigned int max_weight_quant,
compression_working_buffers& tmpbuf
) {
float (&low_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
float (&high_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
float (&low_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value2;
float (&high_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value2;
float (&low_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
float (&high_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values2;
float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values2;
promise(bsd.decimation_mode_count_selected > 0);
for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
{
const decimation_mode& dm = bsd.decimation_modes[i];
if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
{
continue;
}
unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
unsigned int max_precision = dm.maxprec_2planes;
if (max_precision > TUNE_MAX_ANGULAR_QUANT)
{
max_precision = TUNE_MAX_ANGULAR_QUANT;
}
if (max_precision > max_weight_quant)
{
max_precision = max_weight_quant;
}
compute_angular_endpoints_for_quant_levels(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
max_precision, low_values1[i], high_values1[i]);
compute_angular_endpoints_for_quant_levels(
weight_count,
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
max_precision, low_values2[i], high_values2[i]);
}
unsigned int start = bsd.block_mode_count_1plane_selected;
unsigned int end = bsd.block_mode_count_1plane_2plane_selected;
for (unsigned int i = start; i < end; i++)
{
const block_mode& bm = bsd.block_modes[i];
unsigned int quant_mode = bm.quant_mode;
unsigned int decim_mode = bm.decimation_mode;
if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
{
low_value1[i] = low_values1[decim_mode][quant_mode];
high_value1[i] = high_values1[decim_mode][quant_mode];
low_value2[i] = low_values2[decim_mode][quant_mode];
high_value2[i] = high_values2[decim_mode][quant_mode];
}
else
{
low_value1[i] = 0.0f;
high_value1[i] = 1.0f;
low_value2[i] = 0.0f;
high_value2[i] = 1.0f;
}
}
}
#endif


@@ -0,0 +1,147 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Data tables for quantization transfer.
*/
#include "astcenc_internal.h"
#define _ 0 // Using _ to indicate an entry that will not be used.
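// Editorial note on the entry layout (per the quant_and_transfer_table
// declaration in astcenc_internal.h): each entry below stores the
// quant-to-unquant values, the scramble map, the unscramble-and-unquant map,
// and a 65-entry previous/next table indexed by the current unquantized
// value, packed as (next << 8) | prev. For QUANT_2, entry 0 is 0x4000:
// previous = 0x00 (0), next = 0x40 (64).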
const quant_and_transfer_table quant_and_xfer_tables[12] {
// QUANT2, range 0..1
{
{0, 64},
{0, 1},
{0, 64},
{0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
0x4000}
},
// QUANT_3, range 0..2
{
{0, 32, 64},
{0, 1, 2},
{0, 32, 64},
{0x2000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,_,_,0x4020}
},
// QUANT_4, range 0..3
{
{0, 21, 43, 64},
{0, 1, 2, 3},
{0, 21, 43, 64},
{0x1500,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2b00,_,_,_,_,
_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4015,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,_,_,_,_,_,_,0x402b}
},
//QUANT_5, range 0..4
{
{0, 16, 32, 48, 64},
{0, 1, 2, 3, 4},
{0, 16, 32, 48, 64},
{0x1000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2000,_,_,_,_,_,_,_,_,_,
_,_,_,_,_,_,0x3010,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4020,_,_,_,
_,_,_,_,_,_,_,_,_,_,_,_,0x4030}
},
// QUANT_6, range 0..5
{
{0, 12, 25, 39, 52, 64},
{0, 2, 4, 5, 3, 1},
{0, 64, 12, 52, 25, 39},
{0x0c00,_,_,_,_,_,_,_,_,_,_,_,0x1900,_,_,_,_,_,_,_,_,_,_,_,_,
0x270c,_,_,_,_,_,_,_,_,_,_,_,_,_,0x3419,_,_,_,_,_,_,_,_,_,_,
_,_,0x4027,_,_,_,_,_,_,_,_,_,_,_,0x4034}
},
// QUANT_8, range 0..7
{
{0, 9, 18, 27, 37, 46, 55, 64},
{0, 1, 2, 3, 4, 5, 6, 7},
{0, 9, 18, 27, 37, 46, 55, 64},
{0x0900,_,_,_,_,_,_,_,_,0x1200,_,_,_,_,_,_,_,_,0x1b09,_,_,
_,_,_,_,_,_,0x2512,_,_,_,_,_,_,_,_,_,0x2e1b,_,_,_,_,_,_,_,_,
0x3725,_,_,_,_,_,_,_,_,0x402e,_,_,_,_,_,_,_,_,0x4037}
},
// QUANT_10, range 0..9
{
{0, 7, 14, 21, 28, 36, 43, 50, 57, 64},
{0, 2, 4, 6, 8, 9, 7, 5, 3, 1},
{0, 64, 7, 57, 14, 50, 21, 43, 28, 36},
{0x0700,_,_,_,_,_,_,0x0e00,_,_,_,_,_,_,0x1507,_,_,_,_,_,_,
0x1c0e,_,_,_,_,_,_,0x2415,_,_,_,_,_,_,_,0x2b1c,_,_,_,_,_,
_,0x3224,_,_,_,_,_,_,0x392b,_,_,_,_,_,_,0x4032,_,_,_,_,_,
_,0x4039}
},
// QUANT_12, range 0..11
{
{0, 5, 11, 17, 23, 28, 36, 41, 47, 53, 59, 64},
{0, 4, 8, 2, 6, 10, 11, 7, 3, 9, 5, 1},
{0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36},
{0x0500,_,_,_,_,0x0b00,_,_,_,_,_,0x1105,_,_,_,_,_,
0x170b,_,_,_,_,_,0x1c11,_,_,_,_,0x2417,_,_,_,_,_,_,_,
0x291c,_,_,_,_,0x2f24,_,_,_,_,_,0x3529,_,_,_,_,_,
0x3b2f,_,_,_,_,_,0x4035,_,_,_,_,0x403b}
},
// QUANT_16, range 0..15
{
{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
{0x0400,_,_,_,0x0800,_,_,_,0x0c04,_,_,_,0x1108,_,_,_,_,
0x150c,_,_,_,0x1911,_,_,_,0x1d15,_,_,_,0x2319,_,_,_,_,
_,0x271d,_,_,_,0x2b23,_,_,_,0x2f27,_,_,_,0x342b,_,_,_,
_,0x382f,_,_,_,0x3c34,_,_,_,0x4038,_,_,_,0x403c}
},
// QUANT_20, range 0..19
{
{0, 3, 6, 9, 13, 16, 19, 23, 26, 29, 35, 38, 41, 45, 48, 51, 55, 58, 61, 64},
{0, 4, 8, 12, 16, 2, 6, 10, 14, 18, 19, 15, 11, 7, 3, 17, 13, 9, 5, 1},
{0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51, 29, 35},
{0x0300,_,_,0x0600,_,_,0x0903,_,_,0x0d06,_,_,_,
0x1009,_,_,0x130d,_,_,0x1710,_,_,_,0x1a13,_,_,
0x1d17,_,_,0x231a,_,_,_,_,_,0x261d,_,_,0x2923,_,_,
0x2d26,_,_,_,0x3029,_,_,0x332d,_,_,0x3730,_,_,_,
0x3a33,_,_,0x3d37,_,_,0x403a,_,_,0x403d}
},
// QUANT_24, range 0..23
{
{0, 2, 5, 8, 11, 13, 16, 19, 22, 24, 27, 30, 34, 37, 40, 42, 45, 48, 51, 53, 56, 59, 62, 64},
{0, 8, 16, 2, 10, 18, 4, 12, 20, 6, 14, 22, 23, 15, 7, 21, 13, 5, 19, 11, 3, 17, 9, 1},
{0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59, 13, 51, 22, 42, 30, 34},
{0x0200,_,0x0500,_,_,0x0802,_,_,0x0b05,_,_,0x0d08,
_,0x100b,_,_,0x130d,_,_,0x1610,_,_,0x1813,_,
0x1b16,_,_,0x1e18,_,_,0x221b,_,_,_,0x251e,_,_,
0x2822,_,_,0x2a25,_,0x2d28,_,_,0x302a,_,_,0x332d,
_,_,0x3530,_,0x3833,_,_,0x3b35,_,_,0x3e38,_,_,
0x403b,_,0x403e}
},
// QUANT_32, range 0..31
{
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
{0x0200,_,0x0400,_,0x0602,_,0x0804,_,0x0a06,_,
0x0c08,_,0x0e0a,_,0x100c,_,0x120e,_,0x1410,_,
0x1612,_,0x1814,_,0x1a16,_,0x1c18,_,0x1e1a,_,
0x221c,_,_,_,0x241e,_,0x2622,_,0x2824,_,0x2a26,_,
0x2c28,_,0x2e2a,_,0x302c,_,0x322e,_,0x3430,_,
0x3632,_,0x3834,_,0x3a36,_,0x3c38,_,0x3e3a,_,
0x403c,_,0x403e}
}
};

201
thirdparty/basis_universal/LICENSE vendored Normal file

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

File diff suppressed because it is too large


@@ -0,0 +1,45 @@
// File: android_astc_decomp.h
#ifndef _TCUASTCUTIL_HPP
#define _TCUASTCUTIL_HPP
/*-------------------------------------------------------------------------
* drawElements Quality Program Tester Core
* ----------------------------------------
*
* Copyright 2016 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*//*!
* \file
* \brief ASTC Utilities.
*//*--------------------------------------------------------------------*/
#include <vector>
#include <stdint.h>
namespace basisu_astc
{
namespace astc
{
// Unpacks a single ASTC block to pDst
// If isSRGB is true, the spec requires the decoder to scale the LDR 8-bit endpoints to 16-bit before interpolation
// in a slightly different way, which leads to different outputs. Be sure to set it correctly (ideally it should match whatever the encoder did).
bool decompress_ldr(uint8_t* pDst, const uint8_t* data, bool isSRGB, int blockWidth, int blockHeight);
bool decompress_hdr(float* pDstRGBA, const uint8_t* data, int blockWidth, int blockHeight);
bool is_hdr(const uint8_t* data, int blockWidth, int blockHeight, bool& is_hdr);
} // astc
} // basisu_astc
#endif
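// Editorial usage sketch (assumptions, not stated in this header: pDst
// receives blockWidth * blockHeight RGBA8 texels at 4 bytes each, and the
// input is one 16-byte ASTC physical block).
static bool sketch_decode_one_6x6_block(const uint8_t block[16], uint8_t out_rgba[6 * 6 * 4])
{
// false == linear (non-sRGB) endpoint scaling; match what the encoder used
return basisu_astc::astc::decompress_ldr(out_rgba, block, false, 6, 6);
}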

File diff suppressed because it is too large


@@ -0,0 +1,129 @@
// File: basisu_astc_hdr_6x6_enc.h
#pragma once
#include "basisu_enc.h"
#include "../transcoder/basisu_astc_hdr_core.h"
namespace astc_6x6_hdr
{
const uint32_t ASTC_HDR_6X6_MAX_USER_COMP_LEVEL = 12;
const uint32_t ASTC_HDR_6X6_MAX_COMP_LEVEL = 4;
const float LDR_BLACK_BIAS = 0.0f; // .49f
// Note: This struct is copied several times, so do not place any heavyweight objects in here.
struct astc_hdr_6x6_global_config
{
// Important: The Delta ITP colorspace error metric used internally makes several assumptions about the nature of the HDR RGB inputs supplied to the encoder.
// This encoder computes colorspace error in the ICtCp colorspace (more precisely delta ITP, where Ct is scaled by 0.5 vs. ICtCp to become T), so getting this correct is important.
// By default the encoder assumes the input is in absolute luminance (in nits, i.e. candela per square meter, cd/m²), specified as positive-only linear light RGB, using the REC 709 colorspace gamut (but NOT the sRGB transfer function, i.e. linear light).
// If the m_rec2020_bt2100_color_gamut flag is true, the input colorspace is instead treated as REC 2020/BT.2100 (which is wider than 709).
// For SDR/LDR->HDR upconversion, the REC 709 sRGB input should be converted to linear light (sRGB->linear) and the resulting normalized linear RGB values scaled by either 80 or 100 nits (the luminance of a typical SDR monitor); a sketch follows this struct.
// SDR upconversion to normalized [0,1] (i.e. non-absolute) luminances may work but is not supported, because ITP errors will not be predicted correctly.
bool m_rec2020_bt2100_color_gamut = false;
// levels 0-3 normal levels, 4=exhaustive
uint32_t m_master_comp_level = 0;
uint32_t m_highest_comp_level = 1;
float m_lambda = 0.0f;
bool m_extra_patterns_flag = false; // def to false, works in comp levels [1,4]
bool m_brute_force_partition_matching = false; // def to false
bool m_jnd_optimization = false; // defaults to false for HDR inputs; for SDR-upconverted images this can default to enabled
float m_jnd_delta_itp_thresh = .75f;
bool m_force_one_strip = false;
bool m_gaussian1_fallback = true; // def to true, if this is disabled m_gaussian2_fallback should be disabled too
float m_gaussian1_strength = 1.45f;
bool m_gaussian2_fallback = true; // def to true, hopefully rarely kicks in
float m_gaussian2_strength = 1.83f;
// m_disable_delta_endpoint_usage may give a slight increase in RDO ASTC encoding efficiency. It's also faster.
bool m_disable_delta_endpoint_usage = false;
// Scale up Delta ITP errors for very dark pixels, assuming they will be brightly exposed > 1.0x.
// We don't know if the output will be exposed, or not. If heavily exposed, our JND calculations will not be conservative enough.
bool m_delta_itp_dark_adjustment = true;
bool m_debug_images = false;
std::string m_debug_image_prefix = "dbg_astc_hdr_6x6_devel_";
bool m_output_images = false;
std::string m_output_image_prefix = "dbg_astc_hdr_6x6_output_";
bool m_debug_output = false;
bool m_image_stats = false;
bool m_status_output = false;
//-------------------------------------------------------------------------------------
// Very low level/devel parameters - intended for development. Best not to change them.
//-------------------------------------------------------------------------------------
bool m_deblocking_flag = true;
float m_deblock_penalty_weight = .03f;
bool m_disable_twothree_subsets = false; // def to false
bool m_use_solid_blocks = true; // def to true
bool m_use_runs = true; // def to true
bool m_block_stat_optimizations_flag = true; // def to true
bool m_rdo_candidate_diversity_boost = true; // def to true
float m_rdo_candidate_diversity_boost_bit_window_weight = 1.2f;
bool m_favor_higher_compression = true; // utilize all modes
uint32_t m_num_reuse_xy_deltas = basist::astc_6x6_hdr::NUM_REUSE_XY_DELTAS;
void print() const
{
basisu::fmt_debug_printf("m_master_comp_level: {}, m_highest_comp_level: {}\n", m_master_comp_level, m_highest_comp_level);
basisu::fmt_debug_printf("m_lambda: {}\n", m_lambda);
basisu::fmt_debug_printf("m_rec2020_bt2100_color_gamut: {}\n", m_rec2020_bt2100_color_gamut);
basisu::fmt_debug_printf("m_extra_patterns_flag: {}, m_brute_force_partition_matching: {}\n", m_extra_patterns_flag, m_brute_force_partition_matching);
basisu::fmt_debug_printf("m_jnd_optimization: {}, m_jnd_delta_itp_thresh: {}\n", m_jnd_optimization, m_jnd_delta_itp_thresh);
basisu::fmt_debug_printf("m_force_one_strip: {}\n", m_force_one_strip);
basisu::fmt_debug_printf("m_gaussian1_fallback: {}, m_gaussian1_strength: {}\n", m_gaussian1_fallback, m_gaussian1_strength);
basisu::fmt_debug_printf("m_gaussian2_fallback: {}, m_gaussian2_strength: {}\n", m_gaussian2_fallback, m_gaussian2_strength);
basisu::fmt_debug_printf("m_disable_delta_endpoint_usage: {}\n", m_disable_delta_endpoint_usage);
basisu::fmt_debug_printf("m_delta_itp_dark_adjustment: {}\n", m_delta_itp_dark_adjustment);
basisu::fmt_debug_printf("m_debug_images: {}, m_debug_image_prefix: {}\n", m_debug_images, m_debug_image_prefix);
basisu::fmt_debug_printf("m_output_images: {}, m_output_image_prefix: {}\n", m_output_images, m_output_image_prefix);
basisu::fmt_debug_printf("m_image_stats: {}, m_status_output: {}\n", m_image_stats, m_status_output);
basisu::fmt_debug_printf("m_deblocking_flag: {}, m_deblock_penalty_weight: {}\n", m_deblocking_flag, m_deblock_penalty_weight);
basisu::fmt_debug_printf("m_disable_twothree_subsets: {}, m_use_solid_blocks: {}\n", m_disable_twothree_subsets, m_use_solid_blocks);
basisu::fmt_debug_printf("m_use_runs: {}, m_block_stat_optimizations_flag: {}\n", m_use_runs, m_block_stat_optimizations_flag);
basisu::fmt_debug_printf("m_rdo_candidate_diversity_boost: {}, m_rdo_candidate_diversity_boost_bit_window_weight: {}\n", m_rdo_candidate_diversity_boost, m_rdo_candidate_diversity_boost_bit_window_weight);
basisu::fmt_debug_printf("m_favor_higher_compression: {}, m_num_reuse_xy_deltas: {}\n", m_favor_higher_compression, m_num_reuse_xy_deltas);
}
astc_hdr_6x6_global_config()
{
}
void clear()
{
astc_hdr_6x6_global_config def;
std::swap(*this, def);
}
// Max level is ASTC_HDR_6X6_MAX_USER_COMP_LEVEL
void set_user_level(int level);
};
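// Editorial sketch of the SDR->HDR preparation described in the config
// comments above (an assumed helper, not part of this API; assumes <cmath>
// is visible via basisu_enc.h): sRGB -> linear light, then scale to absolute
// luminance for a nominal 100-nit SDR display.
static inline float sketch_srgb_to_nits(float srgb, float sdr_white_nits = 100.0f)
{
// Standard sRGB EOTF (IEC 61966-2-1)
float lin = (srgb <= 0.04045f) ? (srgb / 12.92f)
: std::pow((srgb + 0.055f) / 1.055f, 2.4f);
return lin * sdr_white_nits;
}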
void global_init();
struct result_metrics
{
basisu::image_metrics m_im_astc_log2;
basisu::image_metrics m_im_astc_half;
basisu::image_metrics m_im_bc6h_log2;
basisu::image_metrics m_im_bc6h_half;
};
// The input image should not be padded to 6x6 block boundaries, i.e. pass the original unexpanded image.
bool compress_photo(const basisu::imagef& orig_src_img, const astc_hdr_6x6_global_config& global_cfg, basisu::job_pool* pJob_pool,
basisu::uint8_vec& intermediate_tex_data, basisu::uint8_vec& astc_tex_data, result_metrics& metrics);
} // namespace astc_6x6_hdr

Some files were not shown because too many files have changed in this diff