initial commit, 4.5 stable

2025-09-16 20:46:46 -04:00
commit 9d30169a8d
13378 changed files with 7050105 additions and 0 deletions
--- a/modules/betsy/LICENSE.Betsy.md
+++ b/modules/betsy/LICENSE.Betsy.md
@@ -0,0 +1,18 @@
+Copyright 2020-2022 Matias N. Goldberg
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+This software uses code from:
+
+* [GPURealTimeBC6H](https://github.com/knarkowicz/GPURealTimeBC6H), under public domain. Modifications by Matias N. Goldberg
+* [rg-etc1](https://github.com/richgel999/rg-etc1/), Copyright (c) 2012 Rich Geldreich, zlib license. Extensive modifications by Matias N. Goldberg to adapt it as a compute shader
+* [stb_dxt](https://github.com/nothings/stb/blob/master/stb_dxt.h), under dual-license: A. MIT License
+Copyright (c) 2017 Sean Barrett, B. Public Domain (www.unlicense.org). Original by fabian "ryg" giesen - ported to C by stb. Modifications by Matias N. Goldberg to adapt it as a compute shader
+* EAC loosely inspired on [etc2_encoder](https://github.com/titilambert/packaging-efl/blob/master/src/static_libs/rg_etc/etc2_encoder.c), Copyright (C) 2014 Jean-Philippe ANDRE, 2-clause BSD license
+* ETC2 T & H modes based on [etc2_encoder](https://github.com/titilambert/packaging-efl/blob/master/src/static_libs/rg_etc/etc2_encoder.c), Copyright (C) 2014 Jean-Philippe ANDRE, 2-clause BSD license. A couple minor bugfixes applied by Matias N. Goldberg. Modifications made by Matias N. Goldberg to adapt it as a compute shader
+* ETC2 P very loosely based on [etc2_encoder](https://github.com/titilambert/packaging-efl/blob/master/src/static_libs/rg_etc/etc2_encoder.c), Copyright (C) 2014 Jean-Philippe ANDRE, 2-clause BSD license. Considerable rewrite by Matias N. Goldberg to enhance its quality.
--- a/modules/betsy/SCsub
+++ b/modules/betsy/SCsub
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+from misc.utility.scons_hints import *
+
+Import("env")
+Import("env_modules")
+
+env_betsy = env_modules.Clone()  # Work on a clone so the calls below go through this module's own environment.
+
+# Betsy shaders, originally from https://github.com/darksylinc/betsy; each file is handed to the GLSL_HEADER builder.
+env_betsy.GLSL_HEADER("bc6h.glsl")
+env_betsy.GLSL_HEADER("bc1.glsl")
+env_betsy.GLSL_HEADER("bc4.glsl")
+env_betsy.GLSL_HEADER("alpha_stitch.glsl")
+
+env_betsy.Depends(Glob("*.glsl.gen.h"), ["#glsl_builders.py"])  # The generated headers depend on the builder script.
+
+# Godot source files: every *.cpp in this directory is added to the shared modules source list.
+env_betsy.add_source_files(env.modules_sources, "*.cpp")
--- a/modules/betsy/alpha_stitch.glsl
+++ b/modules/betsy/alpha_stitch.glsl
@@ -0,0 +1,21 @@
+// RGB and Alpha components of ETC2 RGBA/DXT5 are computed separately.
+// This compute shader merely stitches them together to form the final result
+// It's also used by RG11/BC4 driver to stitch two R11/BC4 into one RG11/BC5
+
+#[compute]
+#version 450
+
+layout(local_size_x = 8, //
+		local_size_y = 8, //
+		local_size_z = 1) in;
+
+layout(binding = 0) uniform usampler2D srcRGB;
+layout(binding = 1) uniform usampler2D srcAlpha;
+layout(binding = 2, rgba32ui) uniform restrict writeonly uimage2D dstTexture;
+
+void main() {
+	// Each invocation handles the texel at gl_GlobalInvocationID.xy: the .xy words of
+	// srcRGB land in the destination's .xy and the .xy words of srcAlpha land in .zw.
+	const ivec2 texelCoord = ivec2(gl_GlobalInvocationID.xy);
+
+	const uvec2 firstHalf = texelFetch(srcRGB, texelCoord, 0).xy;
+	const uvec2 secondHalf = texelFetch(srcAlpha, texelCoord, 0).xy;
+
+	imageStore(dstTexture, texelCoord, uvec4(firstHalf.xy, secondHalf.xy));
+}
--- a/modules/betsy/bc1.glsl
+++ b/modules/betsy/bc1.glsl
@@ -0,0 +1,491 @@
+#[versions]
+
+standard = "";
+dithered = "#define BC1_DITHER";
+
+#[compute]
+#version 450
+
+#VERSION_DEFINES
+
+#define FLT_MAX 340282346638528859811704183484516925440.0f
+
+layout(binding = 0) uniform sampler2D srcTex;
+layout(binding = 1, rg32ui) uniform restrict writeonly uimage2D dstTexture;
+
+layout(std430, binding = 2) readonly restrict buffer globalBuffer {
+	vec2 c_oMatch5[256];
+	vec2 c_oMatch6[256];
+};
+
+layout(push_constant, std430) uniform Params {
+	uint p_numRefinements;
+	uint p_padding[3];
+}
+params;
+
+layout(local_size_x = 8, //
+		local_size_y = 8, //
+		local_size_z = 1) in;
+
+vec3 rgb565to888(float rgb565) {
+	// Pull the three fields out of the packed value with float arithmetic:
+	// R = value / 2048, G = (value mod 2048) / 32, B = value mod 32.
+	const vec3 fields = vec3(
+			floor(rgb565 / 2048.0f),
+			floor(mod(rgb565, 2048.0f) / 32.0f),
+			floor(mod(rgb565, 32.0f)));
+
+	// The exact 565 -> 888 expansion would be:
+	//		rgb = floor( rgb * ( 255.0f / vec3( 31.0f, 63.0f, 31.0f ) ) + 0.5f )
+	//
+	// stb_dxt, however, expands like this:
+	//		rb = floor( rb * ( 256 / 32 + 8 / 32 ) );
+	//		g  = floor( g  * ( 256 / 64 + 4 / 64 ) );
+	//
+	// The reason is unclear; it may be how S3TC specifies decoding, see:
+	//		http://www.ludicon.com/castano/blog/2009/03/gpu-dxt-decompression/
+	//
+	// It may also simply be that this form is cheap with integer shifts.
+	// We mirror stb_dxt just in case (the two agree except for a difference
+	// of 1 or -1 on a very few values).
+	//
+	// For an 888 -> 565 -> 888 round trip it possibly makes no difference,
+	// since the values end up mapping back to the original number.
+	const vec3 expandScale = vec3(8.25f, 4.0625f, 8.25f);
+	return floor(fields * expandScale);
+}
+
+float rgb888to565(vec3 rgbValue) {
+	// Quantize each channel to its field width (5/6/5 bits), rounding to nearest.
+	const vec2 redBlue5 = floor(rgbValue.rb * 31.0f / 255.0f + 0.5f);
+	const float green6 = floor(rgbValue.g * 63.0f / 255.0f + 0.5f);
+
+	// Pack as R * 2^11 + G * 2^5 + B; the result is kept in a float.
+	return redBlue5.x * 2048.0f + green6 * 32.0f + redBlue5.y;
+}
+
+// Returns the color one third of the way from a to b. The rounding flavour is
+// selected at compile time through STB_DXT_USE_ROUNDING_BIAS.
+vec3 lerp13(vec3 a, vec3 b) {
+#ifndef STB_DXT_USE_ROUNDING_BIAS
+	// default: plain floor of the weighted average, no rounding bias
+	return floor((a * 2.0f + b) / 3.0f);
+#else
+	// biased variant: round the offset from a to the nearest integer
+	return a + floor((b - a) * (1.0f / 3.0f) + 0.5f);
+#endif
+}
+
+/// Builds the 4-entry palette of a block from its two RGB565 endpoints:
+/// entries 0 and 1 are the expanded endpoints, entry 2 lies one third of the
+/// way from c0 to c1 and entry 3 one third of the way from c1 to c0.
+void EvalColors(out vec3 colors[4], float c0, float c1) {
+	const vec3 endpointA = rgb565to888(c0);
+	const vec3 endpointB = rgb565to888(c1);
+
+	colors[0] = endpointA;
+	colors[1] = endpointB;
+	colors[2] = lerp13(endpointA, endpointB);
+	colors[3] = lerp13(endpointB, endpointA);
+}
+
+/** The color optimization function. (Clever code, part 1)
+Estimates the principal axis of the block's colors with a few power iterations over the
+covariance matrix, then returns the two source pixels whose projections onto that axis
+are the smallest and the largest, each quantized to RGB565.
+@param srcPixelsBlock
+	The 16 pixels of the block; each is read with unpackUnorm4x8() and only .xyz is used
+@param outMinEndp16 [out]
+	Minimum endpoint, in RGB565
+@param outMaxEndp16 [out]
+	Maximum endpoint, in RGB565
+*/
+void OptimizeColorsBlock(const uint srcPixelsBlock[16], out float outMinEndp16, out float outMaxEndp16) {
+	// determine color distribution (sum, per-channel minimum and maximum)
+	vec3 avgColor;
+	vec3 minColor;
+	vec3 maxColor;
+
+	avgColor = minColor = maxColor = unpackUnorm4x8(srcPixelsBlock[0]).xyz;
+	for (int i = 1; i < 16; ++i) {
+		const vec3 currColorUnorm = unpackUnorm4x8(srcPixelsBlock[i]).xyz;
+		avgColor += currColorUnorm;
+		minColor = min(minColor, currColorUnorm);
+		maxColor = max(maxColor, currColorUnorm);
+	}
+
+	avgColor = round(avgColor * 255.0f / 16.0f); // sum -> mean, rescaled to the 0..255 range and rounded
+	maxColor *= 255.0f;
+	minColor *= 255.0f;
+
+	// determine covariance matrix (symmetric, so only 6 entries: rr, rg, rb, gg, gb, bb)
+	float cov[6];
+	for (int i = 0; i < 6; ++i) {
+		cov[i] = 0;
+	}
+
+	for (int i = 0; i < 16; ++i) {
+		const vec3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;
+		vec3 rgbDiff = currColor - avgColor;
+
+		cov[0] += rgbDiff.r * rgbDiff.r;
+		cov[1] += rgbDiff.r * rgbDiff.g;
+		cov[2] += rgbDiff.r * rgbDiff.b;
+		cov[3] += rgbDiff.g * rgbDiff.g;
+		cov[4] += rgbDiff.g * rgbDiff.b;
+		cov[5] += rgbDiff.b * rgbDiff.b;
+	}
+
+	// convert covariance matrix to float, find principal axis via power iter
+	for (int i = 0; i < 6; ++i) {
+		cov[i] /= 255.0f;
+	}
+
+	vec3 vF = maxColor - minColor; // starting vector for the iteration: the bounding-box diagonal
+
+	const int nIterPower = 4;
+	for (int iter = 0; iter < nIterPower; ++iter) {
+		const float r = vF.r * cov[0] + vF.g * cov[1] + vF.b * cov[2]; // vF = cov * vF, written out row by row
+		const float g = vF.r * cov[1] + vF.g * cov[3] + vF.b * cov[4];
+		const float b = vF.r * cov[2] + vF.g * cov[4] + vF.b * cov[5];
+
+		vF.r = r;
+		vF.g = g;
+		vF.b = b;
+	}
+
+	float magn = max(abs(vF.r), max(abs(vF.g), abs(vF.b))); // largest absolute component of the iterated vector
+	vec3 v;
+
+	if (magn < 4.0f) { // too small, default to luminance
+		v.r = 299.0f; // JPEG YCbCr luma coefs, scaled by 1000.
+		v.g = 587.0f;
+		v.b = 114.0f;
+	} else {
+		v = trunc(vF * (512.0f / magn)); // rescale so the largest component has magnitude 512, then drop the fraction
+	}
+
+	// Pick colors at extreme points (smallest and largest dot product with v)
+	vec3 minEndpoint, maxEndpoint;
+	float minDot = FLT_MAX;
+	float maxDot = -FLT_MAX;
+	for (int i = 0; i < 16; ++i) {
+		const vec3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;
+		const float dotValue = dot(currColor, v);
+
+		if (dotValue < minDot) {
+			minDot = dotValue;
+			minEndpoint = currColor;
+		}
+
+		if (dotValue > maxDot) {
+			maxDot = dotValue;
+			maxEndpoint = currColor;
+		}
+	}
+
+	outMinEndp16 = rgb888to565(minEndpoint);
+	outMaxEndp16 = rgb888to565(maxEndpoint);
+}
+
+// The color matching function.
+// Assigns every pixel of the block the index of one of the four palette colors by
+// projecting the pixel onto the line through color[0] and color[1].
+//
+// srcPixelsBlock: the 16 pixels of the block (read with unpackUnorm4x8(), .xyz only).
+// color:          the palette, as filled in by EvalColors().
+// Returns the 32-bit index mask: 2 bits per pixel, pixel 0 in the lowest two bits.
+// When BC1_DITHER is defined the projection error is diffused to neighbouring
+// pixels (Floyd-Steinberg weights 7/3/5/1) before the index is chosen.
+uint MatchColorsBlock(const uint srcPixelsBlock[16], vec3 color[4]) {
+	uint mask = 0u;
+	vec3 dir = color[0] - color[1];
+	float stops[4];
+
+	// Position of each palette entry along the projection direction.
+	for (int i = 0; i < 4; ++i) {
+		stops[i] = dot(color[i], dir);
+	}
+
+	// think of the colors as arranged on a line; project point onto that line, then choose
+	// next color out of available ones. we compute the crossover points for "best color in top
+	// half"/"best in bottom half" and then the same inside that subinterval.
+	//
+	// relying on this 1d approximation isn't always optimal in terms of euclidean distance,
+	// but it's very close and a lot faster.
+	// http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html
+
+	float c0Point = trunc((stops[1] + stops[3]) * 0.5f);
+	float halfPoint = trunc((stops[3] + stops[2]) * 0.5f);
+	float c3Point = trunc((stops[2] + stops[0]) * 0.5f);
+
+#ifndef BC1_DITHER
+	// the version without dithering is straightforward
+	// (pixels are visited last-to-first so pixel 0 ends up in the lowest bits)
+	for (uint i = 16u; i-- > 0u;) {
+		const vec3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;
+
+		const float dotValue = dot(currColor, dir);
+		mask <<= 2u;
+
+		if (dotValue < halfPoint) {
+			mask |= ((dotValue < c0Point) ? 1u : 3u);
+		} else {
+			mask |= ((dotValue < c3Point) ? 2u : 0u);
+		}
+	}
+#else
+	// with floyd-steinberg dithering
+	// ep1 collects the projection error of the row being processed, ep2 holds the
+	// error of the row above it.
+	vec4 ep1 = vec4(0, 0, 0, 0);
+	vec4 ep2 = vec4(0, 0, 0, 0);
+
+	// The diffused error is accumulated in 1/16 units, so the thresholds are scaled to match.
+	c0Point *= 16.0f;
+	halfPoint *= 16.0f;
+	c3Point *= 16.0f;
+
+	for (uint y = 0u; y < 4u; ++y) {
+		float ditherDot;
+		uint lmask, step;
+
+		vec3 currColor;
+		float dotValue;
+
+		// column 0
+		currColor = unpackUnorm4x8(srcPixelsBlock[y * 4u + 0u]).xyz * 255.0f;
+		dotValue = dot(currColor, dir);
+
+		ditherDot = (dotValue * 16.0f) + (3 * ep2[1] + 5 * ep2[0]);
+		if (ditherDot < halfPoint) {
+			step = (ditherDot < c0Point) ? 1u : 3u;
+		} else {
+			step = (ditherDot < c3Point) ? 2u : 0u;
+		}
+		ep1[0] = dotValue - stops[step];
+		lmask = step;
+
+		// column 1
+		currColor = unpackUnorm4x8(srcPixelsBlock[y * 4u + 1u]).xyz * 255.0f;
+		dotValue = dot(currColor, dir);
+
+		ditherDot = (dotValue * 16.0f) + (7 * ep1[0] + 3 * ep2[2] + 5 * ep2[1] + ep2[0]);
+		if (ditherDot < halfPoint) {
+			step = (ditherDot < c0Point) ? 1u : 3u;
+		} else {
+			step = (ditherDot < c3Point) ? 2u : 0u;
+		}
+		ep1[1] = dotValue - stops[step];
+		lmask |= step << 2u;
+
+		// column 2
+		currColor = unpackUnorm4x8(srcPixelsBlock[y * 4u + 2u]).xyz * 255.0f;
+		dotValue = dot(currColor, dir);
+
+		ditherDot = (dotValue * 16.0f) + (7 * ep1[1] + 3 * ep2[3] + 5 * ep2[2] + ep2[1]);
+		if (ditherDot < halfPoint) {
+			step = (ditherDot < c0Point) ? 1u : 3u;
+		} else {
+			step = (ditherDot < c3Point) ? 2u : 0u;
+		}
+		ep1[2] = dotValue - stops[step];
+		lmask |= step << 4u;
+
+		// column 3 (this previously re-read column 2, so the last pixel of every
+		// row was matched against its left neighbour's color)
+		currColor = unpackUnorm4x8(srcPixelsBlock[y * 4u + 3u]).xyz * 255.0f;
+		dotValue = dot(currColor, dir);
+
+		ditherDot = (dotValue * 16.0f) + (7 * ep1[2] + 5 * ep2[3] + ep2[2]);
+		if (ditherDot < halfPoint) {
+			step = (ditherDot < c0Point) ? 1u : 3u;
+		} else {
+			step = (ditherDot < c3Point) ? 2u : 0u;
+		}
+		ep1[3] = dotValue - stops[step];
+		lmask |= step << 6u;
+
+		// Each row contributes 8 bits (4 pixels x 2 bits).
+		mask |= lmask << (y * 8u);
+		{
+			vec4 tmp = ep1;
+			ep1 = ep2;
+			ep2 = tmp;
+		} // swap
+	}
+#endif
+
+	return mask;
+}
+
+// The refinement function. (Clever code, part 2)
+// Tries to optimize colors to suit block contents better.
+// (By solving a least squares system via normal equations+Cramer's rule)
+//
+// srcPixelsBlock: the 16 pixels of the block (read with unpackUnorm4x8(), .xyz only).
+// mask:           current 2-bit-per-pixel index mask, pixel 0 in the lowest bits.
+// inOutMinEndp16 / inOutMaxEndp16: RGB565 endpoints; overwritten with the refined ones.
+// Returns true when either endpoint differs from the value passed in.
+bool RefineBlock(const uint srcPixelsBlock[16], uint mask, inout float inOutMinEndp16,
+		inout float inOutMaxEndp16) {
+	float newMin16, newMax16;
+	const float oldMin = inOutMinEndp16;
+	const float oldMax = inOutMaxEndp16;
+
+	if ((mask ^ (mask << 2u)) < 4u) // all pixels have the same index?
+	{
+		// yes, linear system would be singular; solve using optimal
+		// single-color match on average color
+		// (the 8/255 start value makes the final floor() round to nearest)
+		vec3 rgbVal = vec3(8.0f / 255.0f, 8.0f / 255.0f, 8.0f / 255.0f);
+		for (int i = 0; i < 16; ++i) {
+			rgbVal += unpackUnorm4x8(srcPixelsBlock[i]).xyz;
+		}
+
+		rgbVal = floor(rgbVal * (255.0f / 16.0f));
+
+		newMax16 = c_oMatch5[uint(rgbVal.r)][0] * 2048.0f + //
+				c_oMatch6[uint(rgbVal.g)][0] * 32.0f + //
+				c_oMatch5[uint(rgbVal.b)][0];
+		newMin16 = c_oMatch5[uint(rgbVal.r)][1] * 2048.0f + //
+				c_oMatch6[uint(rgbVal.g)][1] * 32.0f + //
+				c_oMatch5[uint(rgbVal.b)][1];
+	} else {
+		const float w1Tab[4] = { 3, 0, 2, 1 };
+		const float prods[4] = { 589824.0f, 2304.0f, 262402.0f, 66562.0f };
+		// ^some magic to save a lot of multiplies in the accumulating loop...
+		// (precomputed products of weights for least squares system, accumulated inside one 32-bit
+		// register)
+		// Each entry is w1*w1 * 65536 + w2*w2 * 256 + w1*w2 with w2 = 3 - w1.
+
+		float akku = 0.0f;
+		uint cm = mask;
+		vec3 at1 = vec3(0, 0, 0);
+		vec3 at2 = vec3(0, 0, 0);
+		for (int i = 0; i < 16; ++i, cm >>= 2u) {
+			const vec3 currColor = unpackUnorm4x8(srcPixelsBlock[i]).xyz * 255.0f;
+
+			const uint step = cm & 3u;
+			const float w1 = w1Tab[step];
+			akku += prods[step];
+			at1 += currColor * w1;
+			at2 += currColor;
+		}
+
+		at2 = 3.0f * at2 - at1;
+
+		// extract solutions and decide solvability
+		// The three sums sit at bit offsets 16, 8 and 0 of akku, so they are unpacked
+		// with 65536 and 256. (The previous divisor of 65535 only produced the right
+		// fields because the packed sums are small.)
+		const float xx = floor(akku / 65536.0f);
+		const float yy = floor(mod(akku, 65536.0f) / 256.0f);
+		const float xy = mod(akku, 256.0f);
+
+		// Common factor 3 / determinant, pre-scaled to the 5-bit (x) and 6-bit (y) ranges.
+		vec2 f_rb_g;
+		f_rb_g.x = 3.0f * 31.0f / 255.0f / (xx * yy - xy * xy);
+		f_rb_g.y = f_rb_g.x * 63.0f / 31.0f;
+
+		// solve.
+		const vec3 newMaxVal = clamp(floor((at1 * yy - at2 * xy) * f_rb_g.xyx + 0.5f),
+				vec3(0.0f, 0.0f, 0.0f), vec3(31, 63, 31));
+		newMax16 = newMaxVal.x * 2048.0f + newMaxVal.y * 32.0f + newMaxVal.z;
+
+		const vec3 newMinVal = clamp(floor((at2 * xx - at1 * xy) * f_rb_g.xyx + 0.5f),
+				vec3(0.0f, 0.0f, 0.0f), vec3(31, 63, 31));
+		newMin16 = newMinVal.x * 2048.0f + newMinVal.y * 32.0f + newMinVal.z;
+	}
+
+	inOutMinEndp16 = newMin16;
+	inOutMaxEndp16 = newMax16;
+
+	return oldMin != newMin16 || oldMax != newMax16;
+}
+
+#ifdef BC1_DITHER
+/// Snaps an 888 color (full 0..255 range) onto the set of values representable
+/// in 565: the input is clamped, converted to 565 and expanded back to 888.
+vec3 quant(vec3 srcValue) {
+	const vec3 inRange = clamp(srcValue, 0.0f, 255.0f);
+
+	// 888 -> 565, rounding to nearest
+	const vec3 to565 = vec3(31.0f / 255.0f, 63.0f / 255.0f, 31.0f / 255.0f);
+	const vec3 fields565 = floor(inRange * to565 + 0.5f);
+
+	// 565 -> 888, same multipliers as rgb565to888()
+	return floor(fields565 * vec3(8.25f, 4.0625f, 8.25f));
+}
+
+/// Writes into dthPixBlck a copy of srcPixBlck in which every pixel has been snapped
+/// to a 565-representable color with quant(), diffusing the quantization error to
+/// the following pixels with the 7/3/5/1 (sixteenths) weights. Alpha is written as 1.
+void DitherBlock(const uint srcPixBlck[16], out uint dthPixBlck[16]) {
+	// rowErr: error of the row being processed; aboveErr: error of the row above it.
+	vec3 rowErr[4] = { vec3(0, 0, 0), vec3(0, 0, 0), vec3(0, 0, 0), vec3(0, 0, 0) };
+	vec3 aboveErr[4] = { vec3(0, 0, 0), vec3(0, 0, 0), vec3(0, 0, 0), vec3(0, 0, 0) };
+
+	for (uint rowStart = 0u; rowStart < 16u; rowStart += 4u) {
+		vec3 wanted, snapped;
+
+		// column 0
+		wanted = unpackUnorm4x8(srcPixBlck[rowStart + 0u]).xyz * 255.0f;
+		snapped = quant(wanted + trunc((3 * aboveErr[1] + 5 * aboveErr[0]) * (1.0f / 16.0f)));
+		rowErr[0] = wanted - snapped;
+		dthPixBlck[rowStart + 0u] = packUnorm4x8(vec4(snapped * (1.0f / 255.0f), 1.0f));
+
+		// column 1
+		wanted = unpackUnorm4x8(srcPixBlck[rowStart + 1u]).xyz * 255.0f;
+		snapped = quant(
+				wanted + trunc((7 * rowErr[0] + 3 * aboveErr[2] + 5 * aboveErr[1] + aboveErr[0]) * (1.0f / 16.0f)));
+		rowErr[1] = wanted - snapped;
+		dthPixBlck[rowStart + 1u] = packUnorm4x8(vec4(snapped * (1.0f / 255.0f), 1.0f));
+
+		// column 2
+		wanted = unpackUnorm4x8(srcPixBlck[rowStart + 2u]).xyz * 255.0f;
+		snapped = quant(
+				wanted + trunc((7 * rowErr[1] + 3 * aboveErr[3] + 5 * aboveErr[2] + aboveErr[1]) * (1.0f / 16.0f)));
+		rowErr[2] = wanted - snapped;
+		dthPixBlck[rowStart + 2u] = packUnorm4x8(vec4(snapped * (1.0f / 255.0f), 1.0f));
+
+		// column 3
+		wanted = unpackUnorm4x8(srcPixBlck[rowStart + 3u]).xyz * 255.0f;
+		snapped = quant(wanted + trunc((7 * rowErr[2] + 5 * aboveErr[3] + aboveErr[2]) * (1.0f / 16.0f)));
+		rowErr[3] = wanted - snapped;
+		dthPixBlck[rowStart + 3u] = packUnorm4x8(vec4(snapped * (1.0f / 255.0f), 1.0f));
+
+		// The row just finished becomes the "row above" for the next iteration.
+		vec3 finishedRow[4] = rowErr;
+		rowErr = aboveErr;
+		aboveErr = finishedRow;
+	}
+}
+#endif
+
+// Entry point: each invocation compresses the 4x4 pixel block whose block coordinate is
+// gl_GlobalInvocationID.xy and stores the result (two 32-bit words) at that same
+// coordinate of dstTexture.
+void main() {
+	uint srcPixelsBlock[16];
+
+	bool bAllColorsEqual = true;
+
+	// Load the whole 4x4 block (alpha is forced to 1 when packing)
+	const uvec2 pixelsToLoadBase = gl_GlobalInvocationID.xy << 2u;
+	for (uint i = 0u; i < 16u; ++i) {
+		const uvec2 pixelsToLoad = pixelsToLoadBase + uvec2(i & 0x03u, i >> 2u);
+		const vec3 srcPixels0 = texelFetch(srcTex, ivec2(pixelsToLoad), 0).xyz;
+		srcPixelsBlock[i] = packUnorm4x8(vec4(srcPixels0, 1.0f));
+		bAllColorsEqual = bAllColorsEqual && srcPixelsBlock[0] == srcPixelsBlock[i];
+	}
+
+	float maxEndp16, minEndp16;
+	uint mask = 0u;
+
+	if (bAllColorsEqual) {
+		// Single-color block: look the endpoints up in the precomputed match tables.
+		// The + 0.5f makes the float -> uint conversion round to nearest; a plain
+		// truncation could select entry k-1 whenever the unorm round trip yields a
+		// value marginally below the integer k.
+		const uvec3 rgbVal = uvec3(unpackUnorm4x8(srcPixelsBlock[0]).xyz * 255.0f + 0.5f);
+		mask = 0xAAAAAAAAu;
+		maxEndp16 =
+				c_oMatch5[rgbVal.r][0] * 2048.0f + c_oMatch6[rgbVal.g][0] * 32.0f + c_oMatch5[rgbVal.b][0];
+		minEndp16 =
+				c_oMatch5[rgbVal.r][1] * 2048.0f + c_oMatch6[rgbVal.g][1] * 32.0f + c_oMatch5[rgbVal.b][1];
+	} else {
+#ifdef BC1_DITHER
+		uint ditherPixelsBlock[16];
+		// first step: compute dithered version for PCA if desired
+		DitherBlock(srcPixelsBlock, ditherPixelsBlock);
+#else
+#define ditherPixelsBlock srcPixelsBlock
+#endif
+
+		// second step: pca+map along principal axis
+		OptimizeColorsBlock(ditherPixelsBlock, minEndp16, maxEndp16);
+		if (minEndp16 != maxEndp16) {
+			vec3 colors[4];
+			EvalColors(colors, maxEndp16, minEndp16); // Note min/max are inverted
+			mask = MatchColorsBlock(srcPixelsBlock, colors);
+		}
+
+		// third step: refine (multiple times if requested)
+		// Stops early once the endpoints collapse or the mask stops changing.
+		bool bStopRefinement = false;
+		for (uint i = 0u; i < params.p_numRefinements && !bStopRefinement; ++i) {
+			const uint lastMask = mask;
+
+			if (RefineBlock(ditherPixelsBlock, mask, minEndp16, maxEndp16)) {
+				if (minEndp16 != maxEndp16) {
+					vec3 colors[4];
+					EvalColors(colors, maxEndp16, minEndp16); // Note min/max are inverted
+					mask = MatchColorsBlock(srcPixelsBlock, colors);
+				} else {
+					mask = 0u;
+					bStopRefinement = true;
+				}
+			}
+
+			bStopRefinement = mask == lastMask || bStopRefinement;
+		}
+	}
+
+	// write the color block
+	// Keep the larger endpoint first; swapping the endpoints swaps palette entries
+	// 0<->1 and 2<->3, which is what flipping the low bit of every index does.
+	if (maxEndp16 < minEndp16) {
+		const float tmpValue = minEndp16;
+		minEndp16 = maxEndp16;
+		maxEndp16 = tmpValue;
+		mask ^= 0x55555555u;
+	}
+
+	uvec2 outputBytes;
+	outputBytes.x = uint(maxEndp16) | (uint(minEndp16) << 16u);
+	outputBytes.y = mask;
+
+	uvec2 dstUV = gl_GlobalInvocationID.xy;
+	imageStore(dstTexture, ivec2(dstUV), uvec4(outputBytes.xy, 0u, 0u));
+}
--- a/modules/betsy/bc4.glsl
+++ b/modules/betsy/bc4.glsl
@@ -0,0 +1,151 @@
+#[versions]
+
+unsigned = "";
+signed = "#define SNORM";
+
+#[compute]
+#version 450
+
+#VERSION_DEFINES
+
+shared vec2 g_minMaxValues[4u * 4u * 4u];
+shared uvec2 g_mask[4u * 4u];
+
+layout(binding = 0) uniform sampler2D srcTex;
+layout(binding = 1, rg32ui) uniform restrict writeonly uimage2D dstTexture;
+
+layout(push_constant, std430) uniform Params {
+	uint p_channelIdx;
+	uint p_padding[3];
+}
+params;
+
+layout(local_size_x = 4, //
+		local_size_y = 4, //
+		local_size_z = 4) in;
+
+/// Each block is 16 pixels
+/// Each thread works on 4 pixels
+/// Therefore each block needs 4 threads, generating 8 masks
+/// At the end these 8 masks get merged into 2 and results written to output
+///
+/// **Q: Why 4 pixels per thread? Why not 1 pixel per thread? Why not 2? Why not 16?**
+///
+/// A: It's a sweetspot.
+///  - Very short threads cannot fill expensive GPUs with enough work (dispatch bound)
+///  - Lots of threads means lots of synchronization (e.g. evaluating min/max, merging masks)
+///    overhead, and also more LDS usage which reduces occupancy.
+///  - Long threads (e.g. 1 thread per block) miss parallelism opportunities
+void main() { // Entry point: four invocations (local x = 0..3) cooperate on one 4x4 block, one pixel row each.
+	float minVal, maxVal;
+	vec4 srcPixel; // the 4 channel values this invocation loaded, scaled by 255
+
+	const uint blockThreadId = gl_LocalInvocationID.x; // index of this invocation within its block; also the row it loads
+
+	const uvec2 pixelsToLoadBase = gl_GlobalInvocationID.yz << 2u; // top-left pixel of the block; the block coordinate is taken from .yz
+
+	for (uint i = 0u; i < 4u; ++i) {
+		const uvec2 pixelsToLoad = pixelsToLoadBase + uvec2(i, blockThreadId);
+
+		const vec4 value = texelFetch(srcTex, ivec2(pixelsToLoad), 0).xyzw;
+		srcPixel[i] = params.p_channelIdx == 0 ? value.x : (params.p_channelIdx == 1 ? value.y : value.w); // 0 -> x, 1 -> y, anything else -> w
+		srcPixel[i] *= 255.0f;
+	}
+
+	minVal = min(srcPixel.x, min(srcPixel.y, srcPixel.z)); // min/max over this invocation's own 4 pixels
+	maxVal = max(srcPixel.x, max(srcPixel.y, srcPixel.z));
+	minVal = min(minVal, srcPixel.w);
+	maxVal = max(maxVal, srcPixel.w);
+
+	const uint minMaxIdxBase = (gl_LocalInvocationID.z << 4u) + (gl_LocalInvocationID.y << 2u); // first of the 4 g_minMaxValues slots of this block
+	const uint maskIdxBase = (gl_LocalInvocationID.z << 2u) + gl_LocalInvocationID.y; // the single g_mask slot of this block
+
+	g_minMaxValues[minMaxIdxBase + blockThreadId] = vec2(minVal, maxVal);
+	g_mask[maskIdxBase] = uvec2(0u, 0u); // cleared here, before the barrier, so the atomicOr calls below start from zero
+
+	memoryBarrierShared();
+	barrier();
+
+	// Have all 4 threads in the block grab the min/max value by comparing what all 4 threads uploaded
+	for (uint i = 0u; i < 4u; ++i) {
+		minVal = min(g_minMaxValues[minMaxIdxBase + i].x, minVal);
+		maxVal = max(g_minMaxValues[minMaxIdxBase + i].y, maxVal);
+	}
+
+	// determine bias and emit color indices
+	// given the choice of maxVal/minVal, these indices are optimal:
+	// http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
+	float dist = maxVal - minVal;
+	float dist4 = dist * 4.0f;
+	float dist2 = dist * 2.0f;
+	float bias = (dist < 8) ? (dist - 1) : (trunc(dist * 0.5f) + 2);
+	bias -= minVal * 7;
+
+	uint mask0 = 0u, mask1 = 0u; // this invocation's share of the low / high output word
+
+	for (uint i = 0u; i < 4u; ++i) {
+		float a = srcPixel[i] * 7.0f + bias;
+
+		int ind = 0;
+
+		// select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max).
+		if (a >= dist4) {
+			ind = 4;
+			a -= dist4;
+		}
+
+		if (a >= dist2) {
+			ind += 2;
+			a -= dist2;
+		}
+
+		if (a >= dist) {
+			ind += 1;
+		}
+
+		// turn linear scale into DXT index (0/1 are extremal pts)
+		ind = -ind & 7;
+		ind ^= (2 > ind) ? 1 : 0;
+
+		// write index: 3 bits per pixel, starting at bit 16 of the 64-bit block
+		const uint bits = 16u + ((blockThreadId << 2u) + i) * 3u;
+		if (bits < 32u) {
+			mask0 |= uint(ind) << bits;
+			if (bits + 3u > 32u) { // the index straddles the two words: its upper bits go to the bottom of the high word
+				mask1 |= uint(ind) >> (32u - bits);
+			}
+		} else {
+			mask1 |= uint(ind) << (bits - 32u);
+		}
+	}
+
+	if (mask0 != 0u) { // merge this invocation's bits into the block's shared mask
+		atomicOr(g_mask[maskIdxBase].x, mask0);
+	}
+	if (mask1 != 0u) {
+		atomicOr(g_mask[maskIdxBase].y, mask1);
+	}
+
+	memoryBarrierShared();
+	barrier();
+
+	if (blockThreadId == 0u) { // only one invocation per block writes the result
+		// Save data
+		uvec2 outputBytes;
+
+#ifdef SNORM
+		outputBytes.x =
+				packSnorm4x8(vec4(maxVal * (1.0f / 255.0f) * 2.0f - 1.0f,
+						minVal * (1.0f / 255.0f) * 2.0f - 1.0f, 0.0f, 0.0f));
+#else
+		outputBytes.x = packUnorm4x8(
+				vec4(maxVal * (1.0f / 255.0f), minVal * (1.0f / 255.0f), 0.0f, 0.0f));
+#endif
+
+		outputBytes.x |= g_mask[maskIdxBase].x; // endpoints occupy the low 16 bits, indices start at bit 16
+		outputBytes.y = g_mask[maskIdxBase].y;
+
+		uvec2 dstUV = gl_GlobalInvocationID.yz;
+		imageStore(dstTexture, ivec2(dstUV), uvec4(outputBytes.xy, 0u, 0u));
+	}
+}
--- a/modules/betsy/bc6h.glsl
+++ b/modules/betsy/bc6h.glsl
@@ -0,0 +1,741 @@
+#[versions]
+
+signed = "#define SIGNED";
+unsigned = "#define QUALITY"; // The "Quality" preset causes artifacting on signed data, so for now it's exclusive to unsigned.
+
+#[compute]
+#version 450
+
+#VERSION_DEFINES
+
+// Per component: packs the value as a 16-bit half float and returns that bit
+// pattern converted to a float number (not reinterpreted).
+vec3 f32tof16(vec3 value) {
+	const uint halfBitsX = packHalf2x16(vec2(value.x, 0.0));
+	const uint halfBitsY = packHalf2x16(vec2(value.y, 0.0));
+	const uint halfBitsZ = packHalf2x16(vec2(value.z, 0.0));
+	return vec3(halfBitsX, halfBitsY, halfBitsZ);
+}
+
+// Per component: decodes the low 16 bits of the input as a half float.
+vec3 f16tof32(uvec3 value) {
+	vec3 decoded;
+	decoded.x = unpackHalf2x16(value.x).x;
+	decoded.y = unpackHalf2x16(value.y).x;
+	decoded.z = unpackHalf2x16(value.z).x;
+	return decoded;
+}
+
+// Scalar overload: half-float bit pattern of the value, converted to a float number.
+float f32tof16(float value) {
+	const uint halfBits = packHalf2x16(vec2(value, 0.0));
+	return float(halfBits);
+}
+
+// Scalar overload: decodes the low 16 bits of the input as a half float.
+float f16tof32(uint value) {
+	const vec2 bothHalves = unpackHalf2x16(value);
+	return bothHalves.x;
+}
+
+layout(binding = 0) uniform sampler2D srcTexture;
+layout(binding = 1, rgba32ui) uniform restrict writeonly uimage2D dstTexture;
+
+layout(push_constant, std430) uniform Params {
+	vec2 p_textureSizeRcp;
+	uint padding0;
+	uint padding1;
+}
+params;
+
+const float HALF_MAX = 65504.0f;
+const uint PATTERN_NUM = 32u;
+
+#ifdef SIGNED
+const float HALF_MIN = -65504.0f;
+#else
+const float HALF_MIN = 0.0f;
+#endif
+
+#ifdef SIGNED
+// https://github.com/godotengine/godot/pull/96377#issuecomment-2323488254
+// https://github.com/godotengine/godot/pull/96377#issuecomment-2323450950
+// True when the value is strictly below zero.
+bool isNegative(float a) {
+	return 0.0f > a;
+}
+
+// Squared log2 difference between (b + 1) and (a + 1).
+float CalcSignlessMSLE(float a, float b) {
+	const float logRatio = log2((b + 1.0f) / (a + 1.0f));
+	return logRatio * logRatio;
+}
+
+// Error used when a and b have opposite signs: each magnitude is measured
+// against zero and the two errors are added.
+float CrossCalcMSLE(float a, float b) {
+	const float errA = CalcSignlessMSLE(0.0f, abs(a));
+	const float errB = CalcSignlessMSLE(0.0f, abs(b));
+	return errA + errB;
+}
+
+// Sum over the three channels of the squared-log error between a and b.
+// Channels whose signs agree are compared by magnitude; channels whose signs
+// differ use CrossCalcMSLE().
+float CalcMSLE(vec3 a, vec3 b) {
+	float total = 0.0f;
+	for (int ch = 0; ch < 3; ++ch) {
+		if (isNegative(a[ch]) != isNegative(b[ch])) {
+			total += CrossCalcMSLE(a[ch], b[ch]);
+		} else {
+			total += CalcSignlessMSLE(abs(a[ch]), abs(b[ch]));
+		}
+	}
+
+	return total;
+}
+
+// log2(a + 1) extended to negative inputs: a negative component yields
+// -log2(-a + 1).
+vec3 customLog2(vec3 a) {
+	vec3 result;
+	for (int ch = 0; ch < 3; ++ch) {
+		result[ch] = a[ch] >= 0 ? log2(a[ch] + 1.0f) : -log2(-a[ch] + 1.0f);
+	}
+	return result;
+}
+
+// Undoes customLog2(): exp2(a) - 1 for non-negative components,
+// -(exp2(-a) - 1) for negative ones.
+vec3 customExp2(vec3 a) {
+	vec3 result;
+	for (int ch = 0; ch < 3; ++ch) {
+		result[ch] = a[ch] >= 0 ? exp2(a[ch]) - 1.0f : -(exp2(-a[ch]) - 1.0f);
+	}
+	return result;
+}
+#else
+// Sum over the three channels of the squared log2 difference between
+// (b + 1) and (a + 1).
+float CalcMSLE(vec3 a, vec3 b) {
+	const vec3 logRatio = log2((b + 1.0f) / (a + 1.0f));
+	const vec3 squared = logRatio * logRatio;
+	return squared.x + squared.y + squared.z;
+}
+
+// log2 of (a + 1), per component.
+vec3 customLog2(vec3 a) {
+	const vec3 shifted = a + 1.0f;
+	return log2(shifted);
+}
+
+// exp2(a) - 1, per component.
+vec3 customExp2(vec3 a) {
+	const vec3 raised = exp2(a);
+	return raised - 1.0f;
+}
+#endif
+
+// Returns 8, 2 or 15 depending on which of two bit tables has bit i set;
+// the table that yields 8 takes precedence, and 15 is the fallback.
+uint PatternFixupID(uint i) {
+	if (((845414400u >> i) & 0x1u) != 0) {
+		return 8u;
+	}
+	if (((3441033216u >> i) & 0x1u) != 0) {
+		return 2u;
+	}
+	return 15u;
+}
+
+// Returns bit i of the 16-bit mask selected by p. Masks are stored two per
+// 32-bit word: even p uses the low half of word p / 2, odd p the high half.
+// A p of 32 or more selects an all-zero word.
+uint Pattern(uint p, uint i) {
+	const uint packedMasks[16] = uint[16](
+			2290666700u, 3972591342u, 4276930688u, 3967876808u,
+			4293707776u, 3892379264u, 4278255592u, 4026597360u,
+			9369360u, 147747072u, 1930428556u, 2362323200u,
+			823134348u, 913073766u, 267393000u, 966553998u);
+
+	const uint wordIdx = p / 2u;
+	const bool useHighHalf = (p - wordIdx * 2u) != 0u;
+
+	uint enc = wordIdx < 16u ? packedMasks[wordIdx] : 0u;
+	if (useHighHalf) {
+		enc = enc >> 16u;
+	}
+
+	return (enc >> i) & 0x1u;
+}
+
+#ifndef SIGNED
+//UF
+// Maps the half-float bit pattern of x onto a 7-bit scale (factor 128 / 0x7c00).
+vec3 Quantize7(vec3 x) {
+	const vec3 scaledBits = f32tof16(x) * 128.0f;
+	return scaledBits / (0x7bff + 1.0f);
+}
+
+// Maps the half-float bit pattern of x onto a 9-bit scale (factor 512 / 0x7c00).
+vec3 Quantize9(vec3 x) {
+	const vec3 scaledBits = f32tof16(x) * 512.0f;
+	return scaledBits / (0x7bff + 1.0f);
+}
+
+// Maps the half-float bit pattern of x onto a 10-bit scale (factor 1024 / 0x7c00).
+vec3 Quantize10(vec3 x) {
+	const vec3 scaledBits = f32tof16(x) * 1024.0f;
+	return scaledBits / (0x7bff + 1.0f);
+}
+
+// Expands a 7-bit quantized value to the 16-bit range: (x * 65536 + 0x8000) / 128.
+vec3 Unquantize7(vec3 x) {
+	const vec3 widened = x * 65536.0f + 0x8000;
+	return widened / 128.0f;
+}
+
+// Expands a 9-bit quantized value to the 16-bit range: (x * 65536 + 0x8000) / 512.
+vec3 Unquantize9(vec3 x) {
+	const vec3 widened = x * 65536.0f + 0x8000;
+	return widened / 512.0f;
+}
+
+// Expands a 10-bit quantized value to the 16-bit range: (x * 65536 + 0x8000) / 1024.
+vec3 Unquantize10(vec3 x) {
+	const vec3 widened = x * 65536.0f + 0x8000;
+	return widened / 1024.0f;
+}
+
+// Blends two unquantized endpoints with a weight in 0..64, rescales the result
+// by 31 / 4096, and decodes the resulting integer as a half-float bit pattern.
+vec3 FinishUnquantize(vec3 endpoint0Unq, vec3 endpoint1Unq, float weight) {
+	const vec3 blended = endpoint0Unq * (64.0f - weight) + endpoint1Unq * weight + 32.0f;
+	const vec3 halfBits = blended * (31.0f / 4096.0f);
+	return f16tof32(uvec3(halfBits));
+}
+#else
+//SF
+
+// Per component: +1 for values >= 0, -1 otherwise (never 0, unlike sign()).
+vec3 cmpSign(vec3 value) {
+	return vec3(
+			value.x >= 0.0f ? 1.0f : -1.0f,
+			value.y >= 0.0f ? 1.0f : -1.0f,
+			value.z >= 0.0f ? 1.0f : -1.0f);
+}
+
+// Signed variant: quantizes the magnitude of x (factor 64 / 0x7c00) and
+// re-applies the sign of x.
+vec3 Quantize7(vec3 x) {
+	const vec3 scaledMagnitude = f32tof16(abs(x)) * 64.0f;
+	return cmpSign(x) * scaledMagnitude / (0x7bff + 1.0f);
+}
+
+// Signed variant: quantizes the magnitude of x (factor 256 / 0x7c00) and
+// re-applies the sign of x.
+vec3 Quantize9(vec3 x) {
+	const vec3 scaledMagnitude = f32tof16(abs(x)) * 256.0f;
+	return cmpSign(x) * scaledMagnitude / (0x7bff + 1.0f);
+}
+
+// Signed variant: quantizes the magnitude of x (factor 512 / 0x7c00) and
+// re-applies the sign of x.
+vec3 Quantize10(vec3 x) {
+	const vec3 scaledMagnitude = f32tof16(abs(x)) * 512.0f;
+	return cmpSign(x) * scaledMagnitude / (0x7bff + 1.0f);
+}
+
+// Signed variant: expands a quantized value whose magnitude is on the 64-step
+// scale to the signed 16-bit range, (|x| * 32768 + 0x4000) / 64 with the sign of
+// x re-applied. Magnitudes of 64 or more saturate to 32767 and keep their sign
+// (the saturated value used to come out positive even for negative inputs).
+vec3 Unquantize7(vec3 x) {
+	const vec3 signVal = sign(x);
+	const vec3 magnitude = abs(x);
+	vec3 finalVal = signVal * (magnitude * 32768.0f + 0x4000) / 64.0f;
+	finalVal.x = magnitude.x >= 64.0f ? signVal.x * 32767.0 : finalVal.x;
+	finalVal.y = magnitude.y >= 64.0f ? signVal.y * 32767.0 : finalVal.y;
+	finalVal.z = magnitude.z >= 64.0f ? signVal.z * 32767.0 : finalVal.z;
+	return finalVal;
+}
+
+// Signed variant: expands a quantized value whose magnitude is on the 256-step
+// scale to the signed 16-bit range, (|x| * 32768 + 0x4000) / 256 with the sign of
+// x re-applied. Magnitudes of 256 or more saturate to 32767 and keep their sign
+// (the saturated value used to come out positive even for negative inputs).
+vec3 Unquantize9(vec3 x) {
+	const vec3 signVal = sign(x);
+	const vec3 magnitude = abs(x);
+	vec3 finalVal = signVal * (magnitude * 32768.0f + 0x4000) / 256.0f;
+	finalVal.x = magnitude.x >= 256.0f ? signVal.x * 32767.0 : finalVal.x;
+	finalVal.y = magnitude.y >= 256.0f ? signVal.y * 32767.0 : finalVal.y;
+	finalVal.z = magnitude.z >= 256.0f ? signVal.z * 32767.0 : finalVal.z;
+	return finalVal;
+}
+
+// Signed variant: expands a quantized value whose magnitude is on the 512-step
+// scale to the signed 16-bit range, (|x| * 32768 + 0x4000) / 512 with the sign of
+// x re-applied. Magnitudes of 512 or more saturate to 32767 and keep their sign
+// (the saturated value used to come out positive even for negative inputs).
+vec3 Unquantize10(vec3 x) {
+	const vec3 signVal = sign(x);
+	const vec3 magnitude = abs(x);
+	vec3 finalVal = signVal * (magnitude * 32768.0f + 0x4000) / 512.0f;
+	finalVal.x = magnitude.x >= 512.0f ? signVal.x * 32767.0 : finalVal.x;
+	finalVal.y = magnitude.y >= 512.0f ? signVal.y * 32767.0 : finalVal.y;
+	finalVal.z = magnitude.z >= 512.0f ? signVal.z * 32767.0 : finalVal.z;
+	return finalVal;
+}
+
+// Signed variant: blends two unquantized endpoints with a weight in 0..64,
+// rescales the result by 31 / 2048, and decodes the resulting integer as a
+// half-float bit pattern.
+vec3 FinishUnquantize(vec3 endpoint0Unq, vec3 endpoint1Unq, float weight) {
+	const vec3 blended = endpoint0Unq * (64.0f - weight) + endpoint1Unq * weight + 32.0f;
+	const vec3 halfBits = blended * (31.0f / 2048.0f);
+	return f16tof32(uvec3(halfBits));
+}
+#endif
+
+// Exchanges the two vectors.
+void Swap(inout vec3 a, inout vec3 b) {
+	const vec3 previousA = a;
+	a = b;
+	b = previousA;
+}
+
+// Exchanges the two scalars.
+void Swap(inout float a, inout float b) {
+	const float previousA = a;
+	a = b;
+	b = previousA;
+}
+
+// Maps a position between two endpoints onto an index in 0..7 (3 bits).
+uint ComputeIndex3(float texelPos, float endPoint0Pos, float endPoint1Pos) {
+	const float span = endPoint1Pos - endPoint0Pos;
+	const float ratio = (texelPos - endPoint0Pos) / span;
+	const float scaled = ratio * 6.98182f + 0.00909f + 0.5f;
+	return uint(clamp(scaled, 0.0f, 7.0f));
+}
+
+// Maps a position between two endpoints onto an index in 0..15 (4 bits).
+uint ComputeIndex4(float texelPos, float endPoint0Pos, float endPoint1Pos) {
+	const float span = endPoint1Pos - endPoint0Pos;
+	const float ratio = (texelPos - endPoint0Pos) / span;
+	const float scaled = ratio * 14.93333f + 0.03333f + 0.5f;
+	return uint(clamp(scaled, 0.0f, 15.0f));
+}
+
+// Converts each component to an integer, keeps only the bits selected by mask,
+// and sets signFlag on the components that were negative.
+void SignExtend(inout vec3 v1, uint mask, uint signFlag) {
+	const ivec3 asInt = ivec3(v1);
+	const int keepBits = int(mask);
+	const int negativeBit = int(signFlag);
+
+	ivec3 tagged;
+	for (int ch = 0; ch < 3; ++ch) {
+		tagged[ch] = (asInt[ch] & keepBits) | (asInt[ch] < 0 ? negativeBit : 0);
+	}
+	v1 = vec3(tagged);
+}
+
+// Encodes a block with mode 11 (2x 10-bit endpoints).
+//
+// blockMSLE is always overwritten with the summed CalcMSLE() error of this encoding.
+// block.x is assigned, while block.y/z/w are only OR-ed into, so the caller is expected to
+// pass the block in zeroed (as main() does).
+//   block     - in/out: the 128-bit compressed block.
+//   blockMSLE - out: error of the encoded block, accumulated over all 16 texels.
+//   texels    - the 16 source texels of the 4x4 block.
+void EncodeP1(inout uvec4 block, inout float blockMSLE, vec3 texels[16]) {
+	// compute endpoints (min/max RGB bbox)
+	vec3 blockMin = texels[0];
+	vec3 blockMax = texels[0];
+	for (uint i = 1u; i < 16u; ++i) {
+		blockMin = min(blockMin, texels[i]);
+		blockMax = max(blockMax, texels[i]);
+	}
+
+	// refine endpoints in log2 RGB space
+	// NOTE(review): in GLSL `==` on vec3 compares whole vectors and yields a single bool, so a
+	// texel is only skipped here when all three channels match the bbox corner. Confirm this
+	// is the intended behavior (a per-channel select would need equal()/mix()).
+	vec3 refinedBlockMin = blockMax;
+	vec3 refinedBlockMax = blockMin;
+	for (uint i = 0u; i < 16u; ++i) {
+		refinedBlockMin = min(refinedBlockMin, texels[i] == blockMin ? refinedBlockMin : texels[i]);
+		refinedBlockMax = max(refinedBlockMax, texels[i] == blockMax ? refinedBlockMax : texels[i]);
+	}
+
+	vec3 logBlockMax = customLog2(blockMax);
+	vec3 logBlockMin = customLog2(blockMin);
+	vec3 logRefinedBlockMax = customLog2(refinedBlockMax);
+	vec3 logRefinedBlockMin = customLog2(refinedBlockMin);
+	// Each end may be pulled inwards by at most 1/32 of the log-space extent.
+	vec3 logBlockMaxExt = (logBlockMax - logBlockMin) * (1.0f / 32.0f);
+
+	logBlockMin += min(logRefinedBlockMin - logBlockMin, logBlockMaxExt);
+	logBlockMax -= min(logBlockMax - logRefinedBlockMax, logBlockMaxExt);
+	blockMin = customExp2(logBlockMin);
+	blockMax = customExp2(logBlockMax);
+
+	// Projection axis: the bbox diagonal scaled so that its components sum to 1.
+	// NOTE(review): the divisor is zero when all texels are identical — confirm how the
+	// resulting non-finite values are expected to behave downstream.
+	vec3 blockDir = blockMax - blockMin;
+	blockDir = blockDir / (blockDir.x + blockDir.y + blockDir.z);
+
+	vec3 endpoint0 = Quantize10(blockMin);
+	vec3 endpoint1 = Quantize10(blockMax);
+	float endPoint0Pos = f32tof16(dot(blockMin, blockDir));
+	float endPoint1Pos = f32tof16(dot(blockMax, blockDir));
+
+#ifdef SIGNED
+	int maxVal10 = 0x1FF;
+	endpoint0 = clamp(endpoint0, -maxVal10, maxVal10);
+	endpoint1 = clamp(endpoint1, -maxVal10, maxVal10);
+#endif
+
+	// check if endpoint swap is required
+	// Texel 0 is stored with only 3 index bits below, so its index must stay in 0..7; if it
+	// would fall in the upper half, the endpoints are swapped to mirror every index.
+	float fixupTexelPos = f32tof16(dot(texels[0], blockDir));
+	uint fixupIndex = ComputeIndex4(fixupTexelPos, endPoint0Pos, endPoint1Pos);
+	if (fixupIndex > 7) {
+		Swap(endPoint0Pos, endPoint1Pos);
+		Swap(endpoint0, endpoint1);
+	}
+
+	// compute indices
+	uint indices[16] = { 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u };
+	for (uint i = 0u; i < 16u; ++i) {
+		float texelPos = f32tof16(dot(texels[i], blockDir));
+		indices[i] = ComputeIndex4(texelPos, endPoint0Pos, endPoint1Pos);
+	}
+
+	// compute compression error (MSLE)
+	vec3 endpoint0Unq = Unquantize10(endpoint0);
+	vec3 endpoint1Unq = Unquantize10(endpoint1);
+	float msle = 0.0f;
+	for (uint i = 0u; i < 16u; ++i) {
+		// Convert the 4-bit index into a 0..64 interpolation weight.
+		float weight = floor((indices[i] * 64.0f) / 15.0f + 0.5f);
+		vec3 texelUnc = FinishUnquantize(endpoint0Unq, endpoint1Unq, weight);
+
+		msle += CalcMSLE(texels[i], texelUnc);
+	}
+
+#ifdef SIGNED
+	// Done after the error computation, which needs the numeric (not bit-packed) endpoints.
+	SignExtend(endpoint0, 0x1FF, 0x200);
+	SignExtend(endpoint1, 0x1FF, 0x200);
+#endif
+
+	// encode block for mode 11
+	blockMSLE = msle;
+	block.x = 0x03;
+
+	// endpoints
+	// Six 10-bit fields packed back to back starting at bit 5; fields that straddle a 32-bit
+	// word are split with a matching right shift into the next word.
+	block.x |= uint(endpoint0.x) << 5u;
+	block.x |= uint(endpoint0.y) << 15u;
+	block.x |= uint(endpoint0.z) << 25u;
+	block.y |= uint(endpoint0.z) >> 7u;
+	block.y |= uint(endpoint1.x) << 3u;
+	block.y |= uint(endpoint1.y) << 13u;
+	block.y |= uint(endpoint1.z) << 23u;
+	block.z |= uint(endpoint1.z) >> 9u;
+
+	// indices
+	// Index 0 occupies 3 bits (bits 1..3 of block.z); every other index occupies 4 bits.
+	block.z |= indices[0] << 1u;
+	block.z |= indices[1] << 4u;
+	block.z |= indices[2] << 8u;
+	block.z |= indices[3] << 12u;
+	block.z |= indices[4] << 16u;
+	block.z |= indices[5] << 20u;
+	block.z |= indices[6] << 24u;
+	block.z |= indices[7] << 28u;
+	block.w |= indices[8] << 0u;
+	block.w |= indices[9] << 4u;
+	block.w |= indices[10] << 8u;
+	block.w |= indices[11] << 12u;
+	block.w |= indices[12] << 16u;
+	block.w |= indices[13] << 20u;
+	block.w |= indices[14] << 24u;
+	block.w |= indices[15] << 28u;
+}
+
+// Returns the squared length of the component of (Point - PointOnLine) that remains after
+// removing its projection onto LineDirection. This equals the squared point-to-line distance
+// when LineDirection is unit length.
+float DistToLineSq(vec3 PointOnLine, vec3 LineDirection, vec3 Point) {
+	vec3 offset = Point - PointOnLine;
+	float along = dot(offset, LineDirection);
+	vec3 perpendicular = offset - along * LineDirection;
+	return dot(perpendicular, perpendicular);
+}
+
+// Scores how well a two-subset pattern fits the block (smaller is better).
+// Each subset's texels are bounded by an RGB box; the score is the sum, over all 16 texels,
+// of the squared distance from the texel to its subset's box diagonal.
+float EvaluateP2Pattern(uint pattern, vec3 texels[16]) {
+	vec3 lo0 = vec3(HALF_MAX, HALF_MAX, HALF_MAX);
+	vec3 hi0 = vec3(HALF_MIN, HALF_MIN, HALF_MIN);
+	vec3 lo1 = vec3(HALF_MAX, HALF_MAX, HALF_MAX);
+	vec3 hi1 = vec3(HALF_MIN, HALF_MIN, HALF_MIN);
+
+	// Pass 1: grow the bounding box of whichever subset each texel belongs to.
+	for (uint t = 0; t < 16; ++t) {
+		bool inSubset0 = Pattern(pattern, t) == 0;
+		vec3 texel = texels[t];
+		lo0 = inSubset0 ? min(lo0, texel) : lo0;
+		hi0 = inSubset0 ? max(hi0, texel) : hi0;
+		lo1 = inSubset0 ? lo1 : min(lo1, texel);
+		hi1 = inSubset0 ? hi1 : max(hi1, texel);
+	}
+
+	vec3 dir0 = normalize(hi0 - lo0);
+	vec3 dir1 = normalize(hi1 - lo1);
+
+	// Pass 2: accumulate each texel's squared distance to its subset's diagonal.
+	float total = 0.0f;
+	for (uint t = 0; t < 16; ++t) {
+		bool inSubset0 = Pattern(pattern, t) == 0;
+		vec3 origin = inSubset0 ? lo0 : lo1;
+		vec3 axis = inSubset0 ? dir0 : dir1;
+		total += DistToLineSq(origin, axis, texels[t]);
+	}
+
+	return total;
+}
+
+// Encodes a block with either mode 2 (7-bit base, 3x 6-bit delta), or mode 6 (9-bit base, 3x 5-bit delta). Both use pattern encoding.
+//
+// Both variants are evaluated; the one with the lower summed CalcMSLE() error is kept, and
+// `block`/`blockMSLE` are only overwritten when that error beats the incoming blockMSLE.
+//   block     - in/out: current best 128-bit block.
+//   blockMSLE - in/out: error of the current best block.
+//   pattern   - two-subset pattern ID (see Pattern() / PatternFixupID()).
+//   texels    - the 16 source texels of the 4x4 block.
+void EncodeP2Pattern(inout uvec4 block, inout float blockMSLE, uint pattern, vec3 texels[16]) {
+	// Per-subset RGB bounding boxes (p0 = palette ID 0, p1 = everything else).
+	vec3 p0BlockMin = vec3(HALF_MAX, HALF_MAX, HALF_MAX);
+	vec3 p0BlockMax = vec3(HALF_MIN, HALF_MIN, HALF_MIN);
+	vec3 p1BlockMin = vec3(HALF_MAX, HALF_MAX, HALF_MAX);
+	vec3 p1BlockMax = vec3(HALF_MIN, HALF_MIN, HALF_MIN);
+
+	for (uint i = 0u; i < 16u; ++i) {
+		uint paletteID = Pattern(pattern, i);
+		if (paletteID == 0) {
+			p0BlockMin = min(p0BlockMin, texels[i]);
+			p0BlockMax = max(p0BlockMax, texels[i]);
+		} else {
+			p1BlockMin = min(p1BlockMin, texels[i]);
+			p1BlockMax = max(p1BlockMax, texels[i]);
+		}
+	}
+
+	// Projection axes: each bbox diagonal scaled so that its components sum to 1.
+	// NOTE(review): the divisor is zero when a subset's texels are all identical — confirm how
+	// the resulting non-finite values are expected to behave downstream.
+	vec3 p0BlockDir = p0BlockMax - p0BlockMin;
+	vec3 p1BlockDir = p1BlockMax - p1BlockMin;
+	p0BlockDir = p0BlockDir / (p0BlockDir.x + p0BlockDir.y + p0BlockDir.z);
+	p1BlockDir = p1BlockDir / (p1BlockDir.x + p1BlockDir.y + p1BlockDir.z);
+
+	float p0Endpoint0Pos = f32tof16(dot(p0BlockMin, p0BlockDir));
+	float p0Endpoint1Pos = f32tof16(dot(p0BlockMax, p0BlockDir));
+	float p1Endpoint0Pos = f32tof16(dot(p1BlockMin, p1BlockDir));
+	float p1Endpoint1Pos = f32tof16(dot(p1BlockMax, p1BlockDir));
+
+	// The fix-up texels (texel 0 for subset 0, texel `fixupID` for subset 1) are stored with
+	// only 2 index bits below, so their index must stay in 0..3; otherwise that subset's
+	// endpoints are swapped to mirror its indices.
+	uint fixupID = PatternFixupID(pattern);
+	float p0FixupTexelPos = f32tof16(dot(texels[0], p0BlockDir));
+	float p1FixupTexelPos = f32tof16(dot(texels[fixupID], p1BlockDir));
+	uint p0FixupIndex = ComputeIndex3(p0FixupTexelPos, p0Endpoint0Pos, p0Endpoint1Pos);
+	uint p1FixupIndex = ComputeIndex3(p1FixupTexelPos, p1Endpoint0Pos, p1Endpoint1Pos);
+	if (p0FixupIndex > 3u) {
+		Swap(p0Endpoint0Pos, p0Endpoint1Pos);
+		Swap(p0BlockMin, p0BlockMax);
+	}
+	if (p1FixupIndex > 3u) {
+		Swap(p1Endpoint0Pos, p1Endpoint1Pos);
+		Swap(p1BlockMin, p1BlockMax);
+	}
+
+	// 3-bit index per texel, computed against the endpoints of the texel's own subset.
+	uint indices[16] = { 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u };
+	for (uint i = 0u; i < 16u; ++i) {
+		float p0TexelPos = f32tof16(dot(texels[i], p0BlockDir));
+		float p1TexelPos = f32tof16(dot(texels[i], p1BlockDir));
+		uint p0Index = ComputeIndex3(p0TexelPos, p0Endpoint0Pos, p0Endpoint1Pos);
+		uint p1Index = ComputeIndex3(p1TexelPos, p1Endpoint0Pos, p1Endpoint1Pos);
+
+		uint paletteID = Pattern(pattern, i);
+		indices[i] = paletteID == 0u ? p0Index : p1Index;
+	}
+
+	// Naming: endpoint76N / endpoint95N is endpoint N (0..3) of the 7.6 / 9.5 variant.
+	// Endpoints 0/1 belong to subset 0, endpoints 2/3 to subset 1.
+	vec3 endpoint760 = floor(Quantize7(p0BlockMin));
+	vec3 endpoint761 = floor(Quantize7(p0BlockMax));
+	vec3 endpoint762 = floor(Quantize7(p1BlockMin));
+	vec3 endpoint763 = floor(Quantize7(p1BlockMax));
+
+	vec3 endpoint950 = floor(Quantize9(p0BlockMin));
+	vec3 endpoint951 = floor(Quantize9(p0BlockMax));
+	vec3 endpoint952 = floor(Quantize9(p1BlockMin));
+	vec3 endpoint953 = floor(Quantize9(p1BlockMax));
+
+	// Endpoints 1..3 are stored as deltas relative to endpoint 0.
+	endpoint761 = endpoint761 - endpoint760;
+	endpoint762 = endpoint762 - endpoint760;
+	endpoint763 = endpoint763 - endpoint760;
+
+	endpoint951 = endpoint951 - endpoint950;
+	endpoint952 = endpoint952 - endpoint950;
+	endpoint953 = endpoint953 - endpoint950;
+
+	// Clamp the deltas to the magnitude that fits next to their sign bit (6-bit / 5-bit fields).
+	int maxVal76 = 0x1F;
+	endpoint761 = clamp(endpoint761, -maxVal76, maxVal76);
+	endpoint762 = clamp(endpoint762, -maxVal76, maxVal76);
+	endpoint763 = clamp(endpoint763, -maxVal76, maxVal76);
+
+	int maxVal95 = 0xF;
+	endpoint951 = clamp(endpoint951, -maxVal95, maxVal95);
+	endpoint952 = clamp(endpoint952, -maxVal95, maxVal95);
+	endpoint953 = clamp(endpoint953, -maxVal95, maxVal95);
+
+#ifdef SIGNED
+	int maxVal7 = 0x3F;
+	int maxVal9 = 0xFF;
+	endpoint760 = clamp(endpoint760, -maxVal7, maxVal7);
+	endpoint950 = clamp(endpoint950, -maxVal9, maxVal9);
+#endif
+
+	// Reconstruct the absolute endpoints (base + clamped delta) for the error estimate.
+	vec3 endpoint760Unq = Unquantize7(endpoint760);
+	vec3 endpoint761Unq = Unquantize7(endpoint760 + endpoint761);
+	vec3 endpoint762Unq = Unquantize7(endpoint760 + endpoint762);
+	vec3 endpoint763Unq = Unquantize7(endpoint760 + endpoint763);
+	vec3 endpoint950Unq = Unquantize9(endpoint950);
+	vec3 endpoint951Unq = Unquantize9(endpoint950 + endpoint951);
+	vec3 endpoint952Unq = Unquantize9(endpoint950 + endpoint952);
+	vec3 endpoint953Unq = Unquantize9(endpoint950 + endpoint953);
+
+	// Accumulate the error of both variants using the same indices.
+	float msle76 = 0.0f;
+	float msle95 = 0.0f;
+	for (uint i = 0u; i < 16u; ++i) {
+		uint paletteID = Pattern(pattern, i);
+
+		vec3 tmp760Unq = paletteID == 0u ? endpoint760Unq : endpoint762Unq;
+		vec3 tmp761Unq = paletteID == 0u ? endpoint761Unq : endpoint763Unq;
+		vec3 tmp950Unq = paletteID == 0u ? endpoint950Unq : endpoint952Unq;
+		vec3 tmp951Unq = paletteID == 0u ? endpoint951Unq : endpoint953Unq;
+
+		// Convert the 3-bit index into a 0..64 interpolation weight.
+		float weight = floor((indices[i] * 64.0f) / 7.0f + 0.5f);
+		vec3 texelUnc76 = FinishUnquantize(tmp760Unq, tmp761Unq, weight);
+		vec3 texelUnc95 = FinishUnquantize(tmp950Unq, tmp951Unq, weight);
+
+		msle76 += CalcMSLE(texels[i], texelUnc76);
+		msle95 += CalcMSLE(texels[i], texelUnc95);
+	}
+
+	// Deltas are signed in both the signed and unsigned shader variants, so they are always
+	// converted to their bit-field form; the base endpoint only in the SIGNED variant.
+	SignExtend(endpoint761, 0x1F, 0x20);
+	SignExtend(endpoint762, 0x1F, 0x20);
+	SignExtend(endpoint763, 0x1F, 0x20);
+
+	SignExtend(endpoint951, 0xF, 0x10);
+	SignExtend(endpoint952, 0xF, 0x10);
+	SignExtend(endpoint953, 0xF, 0x10);
+
+#ifdef SIGNED
+	SignExtend(endpoint760, 0x3F, 0x40);
+	SignExtend(endpoint950, 0xFF, 0x100);
+#endif
+
+	// encode block
+	float p2MSLE = min(msle76, msle95);
+	if (p2MSLE < blockMSLE) {
+		blockMSLE = p2MSLE;
+		block = uvec4(0u, 0u, 0u, 0u);
+
+		// The endpoint bits of these modes are interleaved, hence the per-bit masks and shifts.
+		if (p2MSLE == msle76) {
+			// 7.6
+			block.x = 0x1u;
+			block.x |= (uint(endpoint762.y) & 0x20u) >> 3u;
+			block.x |= (uint(endpoint763.y) & 0x10u) >> 1u;
+			block.x |= (uint(endpoint763.y) & 0x20u) >> 1u;
+			block.x |= uint(endpoint760.x) << 5u;
+			block.x |= (uint(endpoint763.z) & 0x01u) << 12u;
+			block.x |= (uint(endpoint763.z) & 0x02u) << 12u;
+			block.x |= (uint(endpoint762.z) & 0x10u) << 10u;
+			block.x |= uint(endpoint760.y) << 15u;
+			block.x |= (uint(endpoint762.z) & 0x20u) << 17u;
+			block.x |= (uint(endpoint763.z) & 0x04u) << 21u;
+			block.x |= (uint(endpoint762.y) & 0x10u) << 20u;
+			block.x |= uint(endpoint760.z) << 25u;
+			block.y |= (uint(endpoint763.z) & 0x08u) >> 3u;
+			block.y |= (uint(endpoint763.z) & 0x20u) >> 4u;
+			block.y |= (uint(endpoint763.z) & 0x10u) >> 2u;
+			block.y |= uint(endpoint761.x) << 3u;
+			block.y |= (uint(endpoint762.y) & 0x0Fu) << 9u;
+			block.y |= uint(endpoint761.y) << 13u;
+			block.y |= (uint(endpoint763.y) & 0x0Fu) << 19u;
+			block.y |= uint(endpoint761.z) << 23u;
+			block.y |= (uint(endpoint762.z) & 0x07u) << 29u;
+			block.z |= (uint(endpoint762.z) & 0x08u) >> 3u;
+			block.z |= uint(endpoint762.x) << 1u;
+			block.z |= uint(endpoint763.x) << 7u;
+		} else {
+			// 9.5
+			block.x = 0xEu;
+			block.x |= uint(endpoint950.x) << 5u;
+			block.x |= (uint(endpoint952.z) & 0x10u) << 10u;
+			block.x |= uint(endpoint950.y) << 15u;
+			block.x |= (uint(endpoint952.y) & 0x10u) << 20u;
+			block.x |= uint(endpoint950.z) << 25u;
+			block.y |= uint(endpoint950.z) >> 7u;
+			block.y |= (uint(endpoint953.z) & 0x10u) >> 2u;
+			block.y |= uint(endpoint951.x) << 3u;
+			block.y |= (uint(endpoint953.y) & 0x10u) << 4u;
+			block.y |= (uint(endpoint952.y) & 0x0Fu) << 9u;
+			block.y |= uint(endpoint951.y) << 13u;
+			block.y |= (uint(endpoint953.z) & 0x01u) << 18u;
+			block.y |= (uint(endpoint953.y) & 0x0Fu) << 19u;
+			block.y |= uint(endpoint951.z) << 23u;
+			block.y |= (uint(endpoint953.z) & 0x02u) << 27u;
+			block.y |= uint(endpoint952.z) << 29u;
+			block.z |= (uint(endpoint952.z) & 0x08u) >> 3u;
+			block.z |= uint(endpoint952.x) << 1u;
+			block.z |= (uint(endpoint953.z) & 0x04u) << 4u;
+			block.z |= uint(endpoint953.x) << 7u;
+			block.z |= (uint(endpoint953.z) & 0x08u) << 9u;
+		}
+
+		// Pattern ID, then the indices starting at bit 18 of block.z. Index 0 and the
+		// subset-1 fix-up texel take 2 bits, all others 3 bits, so the shift table depends on
+		// which texel is the fix-up (15, 2, or otherwise 8 as implied by the shifts below).
+		block.z |= pattern << 13u;
+		uint blockFixupID = PatternFixupID(pattern);
+		if (blockFixupID == 15u) {
+			block.z |= indices[0] << 18u;
+			block.z |= indices[1] << 20u;
+			block.z |= indices[2] << 23u;
+			block.z |= indices[3] << 26u;
+			block.z |= indices[4] << 29u;
+			block.w |= indices[5] << 0u;
+			block.w |= indices[6] << 3u;
+			block.w |= indices[7] << 6u;
+			block.w |= indices[8] << 9u;
+			block.w |= indices[9] << 12u;
+			block.w |= indices[10] << 15u;
+			block.w |= indices[11] << 18u;
+			block.w |= indices[12] << 21u;
+			block.w |= indices[13] << 24u;
+			block.w |= indices[14] << 27u;
+			block.w |= indices[15] << 30u;
+		} else if (blockFixupID == 2u) {
+			block.z |= indices[0] << 18u;
+			block.z |= indices[1] << 20u;
+			block.z |= indices[2] << 23u;
+			block.z |= indices[3] << 25u;
+			block.z |= indices[4] << 28u;
+			// Index 5 straddles the block.z / block.w boundary.
+			block.z |= indices[5] << 31u;
+			block.w |= indices[5] >> 1u;
+			block.w |= indices[6] << 2u;
+			block.w |= indices[7] << 5u;
+			block.w |= indices[8] << 8u;
+			block.w |= indices[9] << 11u;
+			block.w |= indices[10] << 14u;
+			block.w |= indices[11] << 17u;
+			block.w |= indices[12] << 20u;
+			block.w |= indices[13] << 23u;
+			block.w |= indices[14] << 26u;
+			block.w |= indices[15] << 29u;
+		} else {
+			block.z |= indices[0] << 18u;
+			block.z |= indices[1] << 20u;
+			block.z |= indices[2] << 23u;
+			block.z |= indices[3] << 26u;
+			block.z |= indices[4] << 29u;
+			block.w |= indices[5] << 0u;
+			block.w |= indices[6] << 3u;
+			block.w |= indices[7] << 6u;
+			block.w |= indices[8] << 9u;
+			block.w |= indices[9] << 11u;
+			block.w |= indices[10] << 14u;
+			block.w |= indices[11] << 17u;
+			block.w |= indices[12] << 20u;
+			block.w |= indices[13] << 23u;
+			block.w |= indices[14] << 26u;
+			block.w |= indices[15] << 29u;
+		}
+	}
+}
+
+// 8x8 invocations per workgroup; main() encodes one 4x4 texel block per invocation.
+layout(local_size_x = 8,
+		local_size_y = 8,
+		local_size_z = 1) in;
+
+// Entry point: each invocation reads one 4x4 texel block from srcTexture, encodes it and
+// stores the resulting 128-bit block at its own invocation coordinate in dstTexture.
+void main() {
+	// gather texels for current 4x4 block
+	// 0 1 2 3
+	// 4 5 6 7
+	// 8 9 10 11
+	// 12 13 14 15
+	// Assuming p_textureSizeRcp is 1 / texture size, `uv` is the texel corner at (1, 1) inside
+	// the block, so the four gathers below cover the block's four 2x2 quadrants.
+	vec2 uv = gl_GlobalInvocationID.xy * params.p_textureSizeRcp * 4.0f + params.p_textureSizeRcp;
+	vec2 block0UV = uv;
+	vec2 block1UV = uv + vec2(2.0f * params.p_textureSizeRcp.x, 0.0f);
+	vec2 block2UV = uv + vec2(0.0f, 2.0f * params.p_textureSizeRcp.y);
+	vec2 block3UV = uv + vec2(2.0f * params.p_textureSizeRcp.x, 2.0f * params.p_textureSizeRcp.y);
+	// One gather per quadrant and per color channel (0 = R, 1 = G, 2 = B).
+	vec4 block0X = textureGather(srcTexture, block0UV, 0);
+	vec4 block1X = textureGather(srcTexture, block1UV, 0);
+	vec4 block2X = textureGather(srcTexture, block2UV, 0);
+	vec4 block3X = textureGather(srcTexture, block3UV, 0);
+	vec4 block0Y = textureGather(srcTexture, block0UV, 1);
+	vec4 block1Y = textureGather(srcTexture, block1UV, 1);
+	vec4 block2Y = textureGather(srcTexture, block2UV, 1);
+	vec4 block3Y = textureGather(srcTexture, block3UV, 1);
+	vec4 block0Z = textureGather(srcTexture, block0UV, 2);
+	vec4 block1Z = textureGather(srcTexture, block1UV, 2);
+	vec4 block2Z = textureGather(srcTexture, block2UV, 2);
+	vec4 block3Z = textureGather(srcTexture, block3UV, 2);
+
+	// Rearrange the gathered components into the row-major order drawn above: within each
+	// quadrant .w/.z form the upper row (left, right) and .x/.y the lower row.
+	vec3 texels[16];
+	texels[0] = vec3(block0X.w, block0Y.w, block0Z.w);
+	texels[1] = vec3(block0X.z, block0Y.z, block0Z.z);
+	texels[2] = vec3(block1X.w, block1Y.w, block1Z.w);
+	texels[3] = vec3(block1X.z, block1Y.z, block1Z.z);
+	texels[4] = vec3(block0X.x, block0Y.x, block0Z.x);
+	texels[5] = vec3(block0X.y, block0Y.y, block0Z.y);
+	texels[6] = vec3(block1X.x, block1Y.x, block1Z.x);
+	texels[7] = vec3(block1X.y, block1Y.y, block1Z.y);
+	texels[8] = vec3(block2X.w, block2Y.w, block2Z.w);
+	texels[9] = vec3(block2X.z, block2Y.z, block2Z.z);
+	texels[10] = vec3(block3X.w, block3Y.w, block3Z.w);
+	texels[11] = vec3(block3X.z, block3Y.z, block3Z.z);
+	texels[12] = vec3(block2X.x, block2Y.x, block2Z.x);
+	texels[13] = vec3(block2X.y, block2Y.y, block2Z.y);
+	texels[14] = vec3(block3X.x, block3Y.x, block3Z.x);
+	texels[15] = vec3(block3X.y, block3Y.y, block3Z.y);
+
+	uvec4 block = uvec4(0u, 0u, 0u, 0u);
+	float blockMSLE = 0.0f;
+
+	// The single-subset encoding always runs and sets the baseline block and error.
+	EncodeP1(block, blockMSLE, texels);
+
+#ifdef QUALITY
+	// Pick the two-subset pattern with the lowest line-fit score, then let it replace the
+	// baseline only if it achieves a lower error.
+	float bestScore = EvaluateP2Pattern(0, texels);
+	uint bestPattern = 0;
+
+	for (uint i = 1u; i < PATTERN_NUM; ++i) {
+		float score = EvaluateP2Pattern(i, texels);
+
+		if (score < bestScore) {
+			bestPattern = i;
+			bestScore = score;
+		}
+	}
+
+	EncodeP2Pattern(block, blockMSLE, bestPattern, texels);
+#endif
+
+	imageStore(dstTexture, ivec2(gl_GlobalInvocationID.xy), block);
+}
--- a/modules/betsy/betsy_bc1.h
+++ b/modules/betsy/betsy_bc1.h
--- a/modules/betsy/config.py
+++ b/modules/betsy/config.py
@@ -0,0 +1,6 @@
+def can_build(env, platform):
+    return env.editor_build
+
+
+def configure(env):
+    pass
--- a/modules/betsy/image_compress_betsy.cpp
+++ b/modules/betsy/image_compress_betsy.cpp
@@ -0,0 +1,775 @@
+/**************************************************************************/
+/*  image_compress_betsy.cpp                                              */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#include "image_compress_betsy.h"
+
+#include "core/config/project_settings.h"
+
+#include "betsy_bc1.h"
+
+#include "alpha_stitch.glsl.gen.h"
+#include "bc1.glsl.gen.h"
+#include "bc4.glsl.gen.h"
+#include "bc6h.glsl.gen.h"
+#include "servers/display_server.h"
+
+// NOTE(review): presumably serializes access to the shared `betsy` instance below — confirm at the use sites.
+static Mutex betsy_mutex;
+// File-local compressor instance; null until one is created.
+static BetsyCompressor *betsy = nullptr;
+
+// Compute shader used for each BetsyFormat; indexed by the format value (see _compress()).
+// NOTE(review): the entry order must match the BetsyFormat enum declaration, which is not
+// visible here — keep in sync with BETSY_TO_RD_FORMAT and BETSY_TO_IMAGE_FORMAT.
+static const BetsyShaderType FORMAT_TO_TYPE[BETSY_FORMAT_MAX] = {
+	BETSY_SHADER_BC1_STANDARD,
+	BETSY_SHADER_BC1_DITHER,
+	BETSY_SHADER_BC1_STANDARD,
+	BETSY_SHADER_BC4_SIGNED,
+	BETSY_SHADER_BC4_UNSIGNED,
+	BETSY_SHADER_BC4_SIGNED,
+	BETSY_SHADER_BC4_UNSIGNED,
+	BETSY_SHADER_BC6_SIGNED,
+	BETSY_SHADER_BC6_UNSIGNED,
+};
+
+// RenderingDevice format of the shader's destination texture for each BetsyFormat; indexed by
+// the format value (see _compress()). One destination texel holds one compressed block:
+// R32G32_UINT is 8 bytes, R32G32B32A32_UINT is 16 bytes.
+// NOTE(review): the entry order must match the BetsyFormat enum declaration, which is not visible here.
+static const RD::DataFormat BETSY_TO_RD_FORMAT[BETSY_FORMAT_MAX] = {
+	RD::DATA_FORMAT_R32G32_UINT,
+	RD::DATA_FORMAT_R32G32_UINT,
+	RD::DATA_FORMAT_R32G32_UINT,
+	RD::DATA_FORMAT_R32G32_UINT,
+	RD::DATA_FORMAT_R32G32_UINT,
+	RD::DATA_FORMAT_R32G32_UINT,
+	RD::DATA_FORMAT_R32G32_UINT,
+	RD::DATA_FORMAT_R32G32B32A32_UINT,
+	RD::DATA_FORMAT_R32G32B32A32_UINT,
+};
+
+// Image format produced for each BetsyFormat; indexed by the format value (see _compress()).
+// NOTE(review): the entry order must match the BetsyFormat enum declaration, which is not visible here.
+static const Image::Format BETSY_TO_IMAGE_FORMAT[BETSY_FORMAT_MAX] = {
+	Image::FORMAT_DXT1,
+	Image::FORMAT_DXT1,
+	Image::FORMAT_DXT5,
+	Image::FORMAT_RGTC_R,
+	Image::FORMAT_RGTC_R,
+	Image::FORMAT_RGTC_RG,
+	Image::FORMAT_RGTC_RG,
+	Image::FORMAT_BPTC_RGBF,
+	Image::FORMAT_BPTC_RGBFU,
+};
+
+// Sets up everything the compressor needs on the GPU side: a local RenderingDevice, the source
+// sampler, and a compiled shader + compute pipeline for each shader variant currently in use.
+// Returns early, without assigning compress_rd, when no rendering device can be created;
+// _compress() reports ERR_CANT_CREATE while compress_rd is null.
+void BetsyCompressor::_init() {
+	if (!DisplayServer::can_create_rendering_device()) {
+		return;
+	}
+
+	// Create local RD.
+	RenderingContextDriver *rcd = nullptr;
+	RenderingDevice *rd = RenderingServer::get_singleton()->create_local_rendering_device();
+
+	// Fallback: no local device from the RenderingServer, so create our own context driver and
+	// device. Metal is tried first when enabled; Vulkan only if no driver was created yet.
+	if (rd == nullptr) {
+#if defined(RD_ENABLED)
+#if defined(METAL_ENABLED)
+		rcd = memnew(RenderingContextDriverMetal);
+		rd = memnew(RenderingDevice);
+#endif
+#if defined(VULKAN_ENABLED)
+		if (rcd == nullptr) {
+			rcd = memnew(RenderingContextDriverVulkan);
+			rd = memnew(RenderingDevice);
+		}
+#endif
+#endif
+		if (rcd != nullptr && rd != nullptr) {
+			Error err = rcd->initialize();
+			if (err == OK) {
+				err = rd->initialize(rcd);
+			}
+
+			// Either initialization failed: discard both so the null check below reports it.
+			if (err != OK) {
+				memdelete(rd);
+				memdelete(rcd);
+				rd = nullptr;
+				rcd = nullptr;
+			}
+		}
+	}
+
+	ERR_FAIL_NULL_MSG(rd, "Unable to create a local RenderingDevice.");
+
+	// rcd stays null when the device came from the RenderingServer; _thread_exit() only
+	// deletes the context driver when it is non-null.
+	compress_rd = rd;
+	compress_rcd = rcd;
+
+	// Create the sampler state.
+	RD::SamplerState src_sampler_state;
+	{
+		src_sampler_state.repeat_u = RD::SAMPLER_REPEAT_MODE_CLAMP_TO_EDGE;
+		src_sampler_state.repeat_v = RD::SAMPLER_REPEAT_MODE_CLAMP_TO_EDGE;
+		src_sampler_state.mag_filter = RD::SAMPLER_FILTER_NEAREST;
+		src_sampler_state.min_filter = RD::SAMPLER_FILTER_NEAREST;
+		src_sampler_state.mip_filter = RD::SAMPLER_FILTER_NEAREST;
+	}
+
+	src_sampler = compress_rd->sampler_create(src_sampler_state);
+
+	// Initialize RDShaderFiles.
+	// In each section below a parse error is only printed; a missing variant is then caught by
+	// the ERR_FAIL_COND checks, which abort the rest of the initialization.
+	{
+		Ref<RDShaderFile> bc1_shader;
+		bc1_shader.instantiate();
+		Error err = bc1_shader->parse_versions_from_text(bc1_shader_glsl);
+
+		if (err != OK) {
+			bc1_shader->print_errors("Betsy BC1 compress shader");
+		}
+
+		// Standard BC1 compression.
+		cached_shaders[BETSY_SHADER_BC1_STANDARD].compiled = compress_rd->shader_create_from_spirv(bc1_shader->get_spirv_stages("standard"));
+		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC1_STANDARD].compiled.is_null());
+
+		cached_shaders[BETSY_SHADER_BC1_STANDARD].pipeline = compress_rd->compute_pipeline_create(cached_shaders[BETSY_SHADER_BC1_STANDARD].compiled);
+		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC1_STANDARD].pipeline.is_null());
+
+		// Dither BC1 variant. Unused, so comment out for now.
+		//cached_shaders[BETSY_SHADER_BC1_DITHER].compiled = compress_rd->shader_create_from_spirv(bc1_shader->get_spirv_stages("dithered"));
+		//ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC1_DITHER].compiled.is_null());
+
+		//cached_shaders[BETSY_SHADER_BC1_DITHER].pipeline = compress_rd->compute_pipeline_create(cached_shaders[BETSY_SHADER_BC1_DITHER].compiled);
+		//ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC1_DITHER].pipeline.is_null());
+	}
+
+	{
+		Ref<RDShaderFile> bc4_shader;
+		bc4_shader.instantiate();
+		Error err = bc4_shader->parse_versions_from_text(bc4_shader_glsl);
+
+		if (err != OK) {
+			bc4_shader->print_errors("Betsy BC4 compress shader");
+		}
+
+		// Signed BC4 compression. Unused, so comment out for now.
+		//cached_shaders[BETSY_SHADER_BC4_SIGNED].compiled = compress_rd->shader_create_from_spirv(bc4_shader->get_spirv_stages("signed"));
+		//ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC4_SIGNED].compiled.is_null());
+
+		//cached_shaders[BETSY_SHADER_BC4_SIGNED].pipeline = compress_rd->compute_pipeline_create(cached_shaders[BETSY_SHADER_BC4_SIGNED].compiled);
+		//ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC4_SIGNED].pipeline.is_null());
+
+		// Unsigned BC4 compression.
+		cached_shaders[BETSY_SHADER_BC4_UNSIGNED].compiled = compress_rd->shader_create_from_spirv(bc4_shader->get_spirv_stages("unsigned"));
+		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC4_UNSIGNED].compiled.is_null());
+
+		cached_shaders[BETSY_SHADER_BC4_UNSIGNED].pipeline = compress_rd->compute_pipeline_create(cached_shaders[BETSY_SHADER_BC4_UNSIGNED].compiled);
+		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC4_UNSIGNED].pipeline.is_null());
+	}
+
+	{
+		Ref<RDShaderFile> bc6h_shader;
+		bc6h_shader.instantiate();
+		Error err = bc6h_shader->parse_versions_from_text(bc6h_shader_glsl);
+
+		if (err != OK) {
+			bc6h_shader->print_errors("Betsy BC6 compress shader");
+		}
+
+		// Signed BC6 compression.
+		cached_shaders[BETSY_SHADER_BC6_SIGNED].compiled = compress_rd->shader_create_from_spirv(bc6h_shader->get_spirv_stages("signed"));
+		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC6_SIGNED].compiled.is_null());
+
+		cached_shaders[BETSY_SHADER_BC6_SIGNED].pipeline = compress_rd->compute_pipeline_create(cached_shaders[BETSY_SHADER_BC6_SIGNED].compiled);
+		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC6_SIGNED].pipeline.is_null());
+
+		// Unsigned BC6 compression.
+		cached_shaders[BETSY_SHADER_BC6_UNSIGNED].compiled = compress_rd->shader_create_from_spirv(bc6h_shader->get_spirv_stages("unsigned"));
+		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC6_UNSIGNED].compiled.is_null());
+
+		cached_shaders[BETSY_SHADER_BC6_UNSIGNED].pipeline = compress_rd->compute_pipeline_create(cached_shaders[BETSY_SHADER_BC6_UNSIGNED].compiled);
+		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_BC6_UNSIGNED].pipeline.is_null());
+	}
+
+	{
+		// Alpha stitch shader; _compress() selects it for formats that need a separate alpha block.
+		Ref<RDShaderFile> alpha_stitch_shader;
+		alpha_stitch_shader.instantiate();
+		Error err = alpha_stitch_shader->parse_versions_from_text(alpha_stitch_shader_glsl);
+
+		if (err != OK) {
+			alpha_stitch_shader->print_errors("Betsy alpha stitch shader");
+		}
+		cached_shaders[BETSY_SHADER_ALPHA_STITCH].compiled = compress_rd->shader_create_from_spirv(alpha_stitch_shader->get_spirv_stages());
+		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_ALPHA_STITCH].compiled.is_null());
+
+		cached_shaders[BETSY_SHADER_ALPHA_STITCH].pipeline = compress_rd->compute_pipeline_create(cached_shaders[BETSY_SHADER_ALPHA_STITCH].compiled);
+		ERR_FAIL_COND(cached_shaders[BETSY_SHADER_ALPHA_STITCH].pipeline.is_null());
+	}
+}
+
+// Starts the worker task that runs _thread_loop(), registers it as the command queue's pump
+// task, and then runs _init() through that queue.
+void BetsyCompressor::init() {
+	// NOTE(review): the meaning of the two boolean arguments is defined by
+	// WorkerThreadPool::add_task() and is not visible here — confirm before changing them.
+	WorkerThreadPool::TaskID tid = WorkerThreadPool::get_singleton()->add_task(callable_mp(this, &BetsyCompressor::_thread_loop), true, "Betsy pump task", true);
+	command_queue.set_pump_task_id(tid);
+	// Queue order matters: task_id is recorded first, then _init() runs; push_and_sync()
+	// presumably blocks until _init() has completed, which the assert below relies on.
+	command_queue.push(this, &BetsyCompressor::_assign_mt_ids, tid);
+	command_queue.push_and_sync(this, &BetsyCompressor::_init);
+	DEV_ASSERT(task_id == tid);
+}
+
+// Records the ID of the pump task; queued by init() so the assignment runs through the command queue.
+void BetsyCompressor::_assign_mt_ids(WorkerThreadPool::TaskID p_pump_task_id) {
+	task_id = p_pump_task_id;
+}
+
+// Body of the pump task. Between batches of commands the thread is yielded back to the
+// WorkerThreadPool so it can run other tasks; it regains control as soon as a task is pushed
+// to the command queue. The loop ends once `exit` has been set (see _thread_exit()).
+void BetsyCompressor::_thread_loop() {
+	while (true) {
+		if (exit) {
+			break;
+		}
+
+		WorkerThreadPool::get_singleton()->yield();
+		command_queue.flush_all();
+	}
+}
+
+// Queued by finish(): tells _thread_loop() to stop and releases every resource created on the
+// local RenderingDevice, followed by the device itself and, if this compressor created one,
+// its context driver.
+void BetsyCompressor::_thread_exit() {
+	exit = true;
+
+	if (compress_rd != nullptr) {
+		if (dxt1_encoding_table_buffer.is_valid()) {
+			compress_rd->free(dxt1_encoding_table_buffer);
+		}
+
+		// sampler_create() may have failed in _init(); guard the sampler like the other RIDs
+		// so an invalid RID is never handed to free().
+		if (src_sampler.is_valid()) {
+			compress_rd->free(src_sampler);
+		}
+
+		// Clear the shader cache, pipelines will be unreferenced automatically.
+		// Variants that were never compiled have no valid shader RID and are skipped.
+		for (int i = 0; i < BETSY_SHADER_MAX; i++) {
+			if (cached_shaders[i].compiled.is_valid()) {
+				compress_rd->free(cached_shaders[i].compiled);
+			}
+		}
+
+		// Free the RD (and RCD if necessary).
+		memdelete(compress_rd);
+		compress_rd = nullptr;
+		if (compress_rcd != nullptr) {
+			memdelete(compress_rcd);
+			compress_rcd = nullptr;
+		}
+	}
+}
+
+// Shuts the compressor down: queues _thread_exit() and, if a pump task was started, waits for
+// it to complete before forgetting its ID.
+void BetsyCompressor::finish() {
+	command_queue.push(this, &BetsyCompressor::_thread_exit);
+
+	if (task_id == WorkerThreadPool::INVALID_TASK_ID) {
+		return;
+	}
+
+	WorkerThreadPool::get_singleton()->wait_for_task_completion(task_id);
+	task_id = WorkerThreadPool::INVALID_TASK_ID;
+}
+
+// Helper functions.
+
+// Returns the smallest multiple of `m` that is strictly greater than `n` (for non-negative n).
+// Note that a value which is already a multiple is advanced by a full step, e.g. (8, 4) -> 12.
+// NOTE(review): confirm callers want "strictly greater" rather than "round up".
+static int get_next_multiple(int n, int m) {
+	const int remainder = n % m;
+	return n - remainder + m;
+}
+
+// Selects the RenderingDevice data format matching r_img's pixel format and writes it to r_format.
+// L8, LA8, RGB8, RGBH and RGBF have no direct match here, so the image is first converted in
+// place to RGBA8 / RGBAH / RGBAF respectively.
+// Returns OK on success, or ERR_UNAVAILABLE (leaving r_format untouched) for any other format.
+static Error get_src_texture_format(Image *r_img, RD::DataFormat &r_format) {
+	switch (r_img->get_format()) {
+		// 8-bit formats that are expanded to RGBA8 first.
+		case Image::FORMAT_L8:
+		case Image::FORMAT_LA8:
+		case Image::FORMAT_RGB8: {
+			r_img->convert(Image::FORMAT_RGBA8);
+			r_format = RD::DATA_FORMAT_R8G8B8A8_UNORM;
+		} break;
+
+		// 8-bit formats used as-is.
+		case Image::FORMAT_R8: {
+			r_format = RD::DATA_FORMAT_R8_UNORM;
+		} break;
+		case Image::FORMAT_RG8: {
+			r_format = RD::DATA_FORMAT_R8G8_UNORM;
+		} break;
+		case Image::FORMAT_RGBA8: {
+			r_format = RD::DATA_FORMAT_R8G8B8A8_UNORM;
+		} break;
+
+		// Half-float formats; RGBH gains an alpha channel.
+		case Image::FORMAT_RH: {
+			r_format = RD::DATA_FORMAT_R16_SFLOAT;
+		} break;
+		case Image::FORMAT_RGH: {
+			r_format = RD::DATA_FORMAT_R16G16_SFLOAT;
+		} break;
+		case Image::FORMAT_RGBH: {
+			r_img->convert(Image::FORMAT_RGBAH);
+			r_format = RD::DATA_FORMAT_R16G16B16A16_SFLOAT;
+		} break;
+		case Image::FORMAT_RGBAH: {
+			r_format = RD::DATA_FORMAT_R16G16B16A16_SFLOAT;
+		} break;
+
+		// Full-float formats; RGBF gains an alpha channel.
+		case Image::FORMAT_RF: {
+			r_format = RD::DATA_FORMAT_R32_SFLOAT;
+		} break;
+		case Image::FORMAT_RGF: {
+			r_format = RD::DATA_FORMAT_R32G32_SFLOAT;
+		} break;
+		case Image::FORMAT_RGBF: {
+			r_img->convert(Image::FORMAT_RGBAF);
+			r_format = RD::DATA_FORMAT_R32G32B32A32_SFLOAT;
+		} break;
+		case Image::FORMAT_RGBAF: {
+			r_format = RD::DATA_FORMAT_R32G32B32A32_SFLOAT;
+		} break;
+
+		// Shared-exponent format.
+		case Image::FORMAT_RGBE9995: {
+			r_format = RD::DATA_FORMAT_E5B9G9R9_UFLOAT_PACK32;
+		} break;
+
+		default: {
+			return ERR_UNAVAILABLE;
+		}
+	}
+
+	return OK;
+}
+
+Error BetsyCompressor::_compress(BetsyFormat p_format, Image *r_img) {
+	uint64_t start_time = OS::get_singleton()->get_ticks_msec();
+
+	// Return an error so that the compression can fall back to cpu compression
+	if (compress_rd == nullptr) {
+		return ERR_CANT_CREATE;
+	}
+
+	if (r_img->is_compressed()) {
+		return ERR_INVALID_DATA;
+	}
+
+	int img_width = r_img->get_width();
+	int img_height = r_img->get_height();
+	if (img_width % 4 != 0 || img_height % 4 != 0) {
+		img_width = img_width <= 2 ? img_width : (img_width + 3) & ~3;
+		img_height = img_height <= 2 ? img_height : (img_height + 3) & ~3;
+	}
+
+	Error err = OK;
+
+	// Destination format.
+	Image::Format dest_format = BETSY_TO_IMAGE_FORMAT[p_format];
+	RD::DataFormat dst_rd_format = BETSY_TO_RD_FORMAT[p_format];
+
+	BetsyShaderType shader_type = FORMAT_TO_TYPE[p_format];
+	BetsyShader shader = cached_shaders[shader_type];
+	BetsyShader secondary_shader; // The secondary shader is used for alpha blocks. For BC it's BC4U and for ETC it's ETC2_RU (8-bit variant).
+	BetsyShader stitch_shader;
+	bool needs_alpha_block = false;
+
+	switch (p_format) {
+		case BETSY_FORMAT_BC3:
+		case BETSY_FORMAT_BC5_UNSIGNED:
+			needs_alpha_block = true;
+			secondary_shader = cached_shaders[BETSY_SHADER_BC4_UNSIGNED];
+			stitch_shader = cached_shaders[BETSY_SHADER_ALPHA_STITCH];
+			break;
+		default:
+			break;
+	}
+
+	// src_texture format information.
+	RD::TextureFormat src_texture_format;
+	{
+		src_texture_format.array_layers = 1;
+		src_texture_format.depth = 1;
+		src_texture_format.mipmaps = 1;
+		src_texture_format.texture_type = RD::TEXTURE_TYPE_2D;
+		src_texture_format.usage_bits = RD::TEXTURE_USAGE_SAMPLING_BIT | RD::TEXTURE_USAGE_CAN_UPDATE_BIT | RD::TEXTURE_USAGE_CAN_COPY_TO_BIT;
+	}
+
+	err = get_src_texture_format(r_img, src_texture_format.format);
+
+	if (err != OK) {
+		return err;
+	}
+
+	// For the destination format just copy the source format and change the usage bits.
+	RD::TextureFormat dst_texture_format = src_texture_format;
+	dst_texture_format.usage_bits = RD::TEXTURE_USAGE_COLOR_ATTACHMENT_BIT | RD::TEXTURE_USAGE_STORAGE_BIT | RD::TEXTURE_USAGE_CAN_COPY_FROM_BIT | RD::TEXTURE_USAGE_CAN_COPY_TO_BIT | RD::TEXTURE_USAGE_CAN_UPDATE_BIT;
+	dst_texture_format.format = dst_rd_format;
+
+	RD::TextureFormat dst_texture_format_alpha;
+	RD::TextureFormat dst_texture_format_combined;
+
+	if (needs_alpha_block) {
+		dst_texture_format_combined = dst_texture_format;
+		dst_texture_format_combined.format = RD::DATA_FORMAT_R32G32B32A32_UINT;
+
+		dst_texture_format.usage_bits |= RD::TEXTURE_USAGE_SAMPLING_BIT;
+
+		dst_texture_format_alpha = dst_texture_format;
+		dst_texture_format_alpha.format = RD::DATA_FORMAT_R32G32_UINT;
+	}
+
+	// Encoding table setup.
+	if ((dest_format == Image::FORMAT_DXT1 || dest_format == Image::FORMAT_DXT5) && dxt1_encoding_table_buffer.is_null()) {
+		dxt1_encoding_table_buffer = compress_rd->storage_buffer_create(1024 * 4, Span(dxt1_encoding_table).reinterpret<uint8_t>());
+	}
+
+	const int mip_count = r_img->get_mipmap_count() + 1;
+
+	// Container for the compressed data.
+	Vector<uint8_t> dst_data;
+	dst_data.resize(Image::get_image_data_size(img_width, img_height, dest_format, r_img->has_mipmaps()));
+	uint8_t *dst_data_ptr = dst_data.ptrw();
+
+	Vector<Vector<uint8_t>> src_images;
+	src_images.push_back(Vector<uint8_t>());
+	Vector<uint8_t> *src_image_ptr = src_images.ptrw();
+
+	// Compress each mipmap.
+	for (int i = 0; i < mip_count; i++) {
+		int width, height;
+		Image::get_image_mipmap_offset_and_dimensions(img_width, img_height, dest_format, i, width, height);
+
+		int64_t src_mip_ofs, src_mip_size;
+		int src_mip_w, src_mip_h;
+		r_img->get_mipmap_offset_size_and_dimensions(i, src_mip_ofs, src_mip_size, src_mip_w, src_mip_h);
+
+		// Set the source texture width and size.
+		src_texture_format.height = height;
+		src_texture_format.width = width;
+
+		// Set the destination texture width and size.
+		dst_texture_format.height = (height + 3) >> 2;
+		dst_texture_format.width = (width + 3) >> 2;
+
+		// Pad textures to nearest block by smearing.
+		if (width != src_mip_w || height != src_mip_h) {
+			const uint8_t *src_mip_read = r_img->ptr() + src_mip_ofs;
+
+			// Reserve the buffer for padded image data.
+			int px_size = Image::get_format_pixel_size(r_img->get_format());
+			src_image_ptr[0].resize(width * height * px_size);
+			uint8_t *ptrw = src_image_ptr[0].ptrw();
+
+			int x = 0, y = 0;
+			for (y = 0; y < src_mip_h; y++) {
+				for (x = 0; x < src_mip_w; x++) {
+					memcpy(ptrw + (width * y + x) * px_size, src_mip_read + (src_mip_w * y + x) * px_size, px_size);
+				}
+
+				// First, smear in x.
+				for (; x < width; x++) {
+					memcpy(ptrw + (width * y + x) * px_size, ptrw + (width * y + x - 1) * px_size, px_size);
+				}
+			}
+
+			// Then, smear in y.
+			for (; y < height; y++) {
+				for (x = 0; x < width; x++) {
+					memcpy(ptrw + (width * y + x) * px_size, ptrw + (width * y + x - width) * px_size, px_size);
+				}
+			}
+		} else {
+			// Create a buffer filled with the source mip layer data.
+			src_image_ptr[0].resize(src_mip_size);
+			memcpy(src_image_ptr[0].ptrw(), r_img->ptr() + src_mip_ofs, src_mip_size);
+		}
+
+		// Create the textures on the GPU.
+		RID src_texture = compress_rd->texture_create(src_texture_format, RD::TextureView(), src_images);
+		RID dst_texture_primary = compress_rd->texture_create(dst_texture_format, RD::TextureView());
+
+		{
+			Vector<RD::Uniform> uniforms;
+			{
+				{
+					RD::Uniform u;
+					u.uniform_type = RD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE;
+					u.binding = 0;
+					u.append_id(src_sampler);
+					u.append_id(src_texture);
+					uniforms.push_back(u);
+				}
+				{
+					RD::Uniform u;
+					u.uniform_type = RD::UNIFORM_TYPE_IMAGE;
+					u.binding = 1;
+					u.append_id(dst_texture_primary);
+					uniforms.push_back(u);
+				}
+
+				if (dest_format == Image::FORMAT_DXT1 || dest_format == Image::FORMAT_DXT5) {
+					RD::Uniform u;
+					u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
+					u.binding = 2;
+					u.append_id(dxt1_encoding_table_buffer);
+					uniforms.push_back(u);
+				}
+			}
+
+			RID uniform_set = compress_rd->uniform_set_create(uniforms, shader.compiled, 0);
+			RD::ComputeListID compute_list = compress_rd->compute_list_begin();
+
+			compress_rd->compute_list_bind_compute_pipeline(compute_list, shader.pipeline);
+			compress_rd->compute_list_bind_uniform_set(compute_list, uniform_set, 0);
+
+			switch (shader_type) {
+				case BETSY_SHADER_BC6_SIGNED:
+				case BETSY_SHADER_BC6_UNSIGNED: {
+					BC6PushConstant push_constant;
+					push_constant.sizeX = 1.0f / width;
+					push_constant.sizeY = 1.0f / height;
+
+					compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC6PushConstant));
+					compress_rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1);
+				} break;
+
+				case BETSY_SHADER_BC1_STANDARD: {
+					BC1PushConstant push_constant;
+					push_constant.num_refines = 2;
+
+					compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC1PushConstant));
+					compress_rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1);
+				} break;
+
+				case BETSY_SHADER_BC4_UNSIGNED: {
+					BC4PushConstant push_constant;
+					push_constant.channel_idx = 0;
+
+					compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC4PushConstant));
+					compress_rd->compute_list_dispatch(compute_list, 1, get_next_multiple(width, 16) / 16, get_next_multiple(height, 16) / 16);
+				} break;
+
+				default: {
+				} break;
+			}
+
+			compress_rd->compute_list_end();
+
+			if (!needs_alpha_block) {
+				compress_rd->submit();
+				compress_rd->sync();
+			}
+		}
+
+		RID dst_texture_rid = dst_texture_primary;
+
+		if (needs_alpha_block) {
+			// Set the destination texture width and size.
+			dst_texture_format_alpha.height = (height + 3) >> 2;
+			dst_texture_format_alpha.width = (width + 3) >> 2;
+
+			RID dst_texture_alpha = compress_rd->texture_create(dst_texture_format_alpha, RD::TextureView());
+
+			{
+				Vector<RD::Uniform> uniforms;
+				{
+					{
+						RD::Uniform u;
+						u.uniform_type = RD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE;
+						u.binding = 0;
+						u.append_id(src_sampler);
+						u.append_id(src_texture);
+						uniforms.push_back(u);
+					}
+					{
+						RD::Uniform u;
+						u.uniform_type = RD::UNIFORM_TYPE_IMAGE;
+						u.binding = 1;
+						u.append_id(dst_texture_alpha);
+						uniforms.push_back(u);
+					}
+				}
+
+				RID uniform_set = compress_rd->uniform_set_create(uniforms, secondary_shader.compiled, 0);
+				RD::ComputeListID compute_list = compress_rd->compute_list_begin();
+
+				compress_rd->compute_list_bind_compute_pipeline(compute_list, secondary_shader.pipeline);
+				compress_rd->compute_list_bind_uniform_set(compute_list, uniform_set, 0);
+
+				BC4PushConstant push_constant;
+				push_constant.channel_idx = dest_format == Image::FORMAT_DXT5 ? 3 : 1;
+
+				compress_rd->compute_list_set_push_constant(compute_list, &push_constant, sizeof(BC4PushConstant));
+				compress_rd->compute_list_dispatch(compute_list, 1, get_next_multiple(width, 16) / 16, get_next_multiple(height, 16) / 16);
+
+				compress_rd->compute_list_end();
+			}
+
+			// Stitching
+
+			// Set the destination texture width and size.
+			dst_texture_format_combined.height = (height + 3) >> 2;
+			dst_texture_format_combined.width = (width + 3) >> 2;
+
+			RID dst_texture_combined = compress_rd->texture_create(dst_texture_format_combined, RD::TextureView());
+
+			{
+				Vector<RD::Uniform> uniforms;
+				{
+					{
+						RD::Uniform u;
+						u.uniform_type = RD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE;
+						u.binding = 0;
+						u.append_id(src_sampler);
+						u.append_id(dest_format == Image::FORMAT_DXT5 ? dst_texture_alpha : dst_texture_primary);
+						uniforms.push_back(u);
+					}
+					{
+						RD::Uniform u;
+						u.uniform_type = RD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE;
+						u.binding = 1;
+						u.append_id(src_sampler);
+						u.append_id(dest_format == Image::FORMAT_DXT5 ? dst_texture_primary : dst_texture_alpha);
+						uniforms.push_back(u);
+					}
+					{
+						RD::Uniform u;
+						u.uniform_type = RD::UNIFORM_TYPE_IMAGE;
+						u.binding = 2;
+						u.append_id(dst_texture_combined);
+						uniforms.push_back(u);
+					}
+				}
+
+				RID uniform_set = compress_rd->uniform_set_create(uniforms, stitch_shader.compiled, 0);
+				RD::ComputeListID compute_list = compress_rd->compute_list_begin();
+
+				compress_rd->compute_list_bind_compute_pipeline(compute_list, stitch_shader.pipeline);
+				compress_rd->compute_list_bind_uniform_set(compute_list, uniform_set, 0);
+				compress_rd->compute_list_dispatch(compute_list, get_next_multiple(width, 32) / 32, get_next_multiple(height, 32) / 32, 1);
+
+				compress_rd->compute_list_end();
+
+				compress_rd->submit();
+				compress_rd->sync();
+			}
+
+			dst_texture_rid = dst_texture_combined;
+
+			compress_rd->free(dst_texture_primary);
+			compress_rd->free(dst_texture_alpha);
+		}
+
+		// Copy data from the GPU to the buffer.
+		const Vector<uint8_t> texture_data = compress_rd->texture_get_data(dst_texture_rid, 0);
+		int64_t dst_ofs = Image::get_image_mipmap_offset(img_width, img_height, dest_format, i);
+
+		memcpy(dst_data_ptr + dst_ofs, texture_data.ptr(), texture_data.size());
+
+		// Free the source and dest texture.
+		compress_rd->free(src_texture);
+		compress_rd->free(dst_texture_rid);
+	}
+
+	src_images.clear();
+
+	// Set the compressed data to the image.
+	r_img->set_data(img_width, img_height, r_img->has_mipmaps(), dest_format, dst_data);
+
+	print_verbose(
+			vformat("Betsy: Encoding a %dx%d image with %d mipmaps as %s took %d ms.",
+					img_width,
+					img_height,
+					r_img->get_mipmap_count(),
+					Image::get_format_name(dest_format),
+					OS::get_singleton()->get_ticks_msec() - start_time));
+
+	return OK;
+}
+
+// Creates and initializes the shared BetsyCompressor when `betsy` is null.
+// Both the null check and the construction run while betsy_mutex is held.
+void ensure_betsy_exists() {
+	betsy_mutex.lock();
+
+	const bool already_created = betsy != nullptr;
+	if (!already_created) {
+		betsy = memnew(BetsyCompressor);
+		betsy->init();
+	}
+
+	betsy_mutex.unlock();
+}
+
+// Compresses r_img in place as BC6 (signed or unsigned, chosen through
+// Image::detect_signed()). Only source formats from Image::FORMAT_RF up to and
+// including Image::FORMAT_RGBE9995 are encoded; any other format is left alone
+// and ERR_UNAVAILABLE is returned. p_channels is not consulted here.
+// Unless the "cache_gpu_compressor" project setting is enabled, the shared
+// compressor is released again before returning.
+Error _betsy_compress_bptc(Image *r_img, Image::UsedChannels p_channels) {
+	ensure_betsy_exists();
+
+	const Image::Format src_format = r_img->get_format();
+	const bool in_supported_range = src_format >= Image::FORMAT_RF && src_format <= Image::FORMAT_RGBE9995;
+
+	Error result = ERR_UNAVAILABLE;
+	if (in_supported_range) {
+		const BetsyFormat target = r_img->detect_signed() ? BETSY_FORMAT_BC6_SIGNED : BETSY_FORMAT_BC6_UNSIGNED;
+		result = betsy->compress(target, r_img);
+	}
+
+	const bool keep_compressor = GLOBAL_GET("rendering/textures/vram_compression/cache_gpu_compressor");
+	if (!keep_compressor) {
+		free_device();
+	}
+
+	return result;
+}
+
+// Compresses r_img in place with the block format selected by p_channels:
+//   RGB / L   -> BC1
+//   RGBA / LA -> BC3
+//   R         -> BC4 (unsigned)
+//   RG        -> BC5 (unsigned)
+// Any other value skips compression and returns ERR_UNAVAILABLE.
+// Unless the "cache_gpu_compressor" project setting is enabled, the shared
+// compressor is released again before returning.
+Error _betsy_compress_s3tc(Image *r_img, Image::UsedChannels p_channels) {
+	ensure_betsy_exists();
+
+	// BETSY_FORMAT_MAX is used locally as the "no matching format" marker.
+	BetsyFormat target = BETSY_FORMAT_MAX;
+	switch (p_channels) {
+		case Image::USED_CHANNELS_L:
+		case Image::USED_CHANNELS_RGB: {
+			target = BETSY_FORMAT_BC1;
+		} break;
+
+		case Image::USED_CHANNELS_LA:
+		case Image::USED_CHANNELS_RGBA: {
+			target = BETSY_FORMAT_BC3;
+		} break;
+
+		case Image::USED_CHANNELS_R: {
+			target = BETSY_FORMAT_BC4_UNSIGNED;
+		} break;
+
+		case Image::USED_CHANNELS_RG: {
+			target = BETSY_FORMAT_BC5_UNSIGNED;
+		} break;
+
+		default: {
+		} break;
+	}
+
+	Error result = ERR_UNAVAILABLE;
+	if (target != BETSY_FORMAT_MAX) {
+		result = betsy->compress(target, r_img);
+	}
+
+	const bool keep_compressor = GLOBAL_GET("rendering/textures/vram_compression/cache_gpu_compressor");
+	if (!keep_compressor) {
+		free_device();
+	}
+
+	return result;
+}
+
+// Shuts down and destroys the shared BetsyCompressor, if one exists.
+//
+// The global pointer is reset to nullptr after deletion. Without that reset
+// `betsy` would keep pointing at freed memory: ensure_betsy_exists() would
+// skip re-creation and the next compress call would use a dead object, and a
+// later free_device() call (e.g. at module uninitialization) would delete it
+// a second time. With the reset, repeated calls are harmless no-ops.
+void free_device() {
+	if (betsy == nullptr) {
+		return;
+	}
+
+	betsy->finish();
+	memdelete(betsy);
+	betsy = nullptr;
+}
--- a/modules/betsy/image_compress_betsy.h
+++ b/modules/betsy/image_compress_betsy.h
@@ -0,0 +1,129 @@
+/**************************************************************************/
+/*  image_compress_betsy.h                                                */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#pragma once
+
+#include "core/io/image.h"
+#include "core/object/worker_thread_pool.h"
+#include "core/os/thread.h"
+#include "core/templates/command_queue_mt.h"
+
+#include "servers/rendering/rendering_device_binds.h"
+#include "servers/rendering/rendering_server_default.h"
+
+#if defined(VULKAN_ENABLED)
+#include "drivers/vulkan/rendering_context_driver_vulkan.h"
+#endif
+#if defined(METAL_ENABLED)
+#include "drivers/metal/rendering_context_driver_metal.h"
+#endif
+
+// Output formats that can be requested from BetsyCompressor::compress().
+// The compressor uses these values as array indices into its per-format
+// lookup tables, so the enumerator order has to stay in sync with those tables.
+enum BetsyFormat {
+	BETSY_FORMAT_BC1,
+	BETSY_FORMAT_BC1_DITHER,
+	BETSY_FORMAT_BC3, // Encoded as a colour block plus a BC4 alpha block that are stitched together.
+	BETSY_FORMAT_BC4_SIGNED,
+	BETSY_FORMAT_BC4_UNSIGNED,
+	BETSY_FORMAT_BC5_SIGNED,
+	BETSY_FORMAT_BC5_UNSIGNED, // Encoded as two BC4 blocks that are stitched together.
+	BETSY_FORMAT_BC6_SIGNED,
+	BETSY_FORMAT_BC6_UNSIGNED,
+	BETSY_FORMAT_MAX, // Sentinel; presumably the number of formats (verify against the lookup tables).
+};
+
+// Identifies one compute shader held in BetsyCompressor::cached_shaders;
+// the values are used directly as indices into that array.
+enum BetsyShaderType {
+	BETSY_SHADER_BC1_STANDARD,
+	BETSY_SHADER_BC1_DITHER,
+	BETSY_SHADER_BC4_SIGNED,
+	BETSY_SHADER_BC4_UNSIGNED, // Also used as the secondary (alpha / second channel) shader for BC3 and BC5.
+	BETSY_SHADER_BC6_SIGNED,
+	BETSY_SHADER_BC6_UNSIGNED,
+	BETSY_SHADER_ALPHA_STITCH, // Combines the two partial results produced for BC3 / BC5 into one texture.
+	BETSY_SHADER_MAX, // Number of shader slots; sizes BetsyCompressor::cached_shaders.
+};
+
+// Push-constant block sent with the BC6 (signed and unsigned) shader dispatch.
+// The whole struct is uploaded by sizeof(), so its layout must not change.
+struct BC6PushConstant {
+	float sizeX; // Set by the compressor to 1.0f / width of the mip level being encoded.
+	float sizeY; // Set by the compressor to 1.0f / height of the mip level being encoded.
+	uint32_t padding[2] = { 0 }; // Zero-filled; rounds the struct up to 16 bytes (presumably a push-constant size requirement — TODO confirm).
+};
+
+// Push-constant block sent with the BC1 shader dispatch.
+// The whole struct is uploaded by sizeof(), so its layout must not change.
+struct BC1PushConstant {
+	uint32_t num_refines; // The compressor sets this to 2; presumably the number of refinement passes the shader runs — TODO confirm in the shader.
+	uint32_t padding[3] = { 0 }; // Zero-filled; rounds the struct up to 16 bytes.
+};
+
+// Push-constant block sent with the BC4 shader dispatch, both when BC4 is the
+// final format and when it produces the second block for BC3 / BC5.
+// The whole struct is uploaded by sizeof(), so its layout must not change.
+struct BC4PushConstant {
+	// Presumably the index of the source channel to encode (TODO confirm in the shader).
+	// The compressor passes 0 for plain BC4, 3 for the DXT5 alpha block and 1 otherwise.
+	uint32_t channel_idx;
+	uint32_t padding[3] = { 0 }; // Zero-filled; rounds the struct up to 16 bytes.
+};
+
+// Shuts down and destroys the shared BetsyCompressor instance, if one exists.
+void free_device();
+
+// Image compression hooks (installed by initialize_betsy_module()).
+// Both compress r_img in place and return ERR_UNAVAILABLE when the image
+// cannot be handled.
+// BPTC: encodes images whose format lies in FORMAT_RF..FORMAT_RGBE9995 as BC6.
+Error _betsy_compress_bptc(Image *r_img, Image::UsedChannels p_channels);
+// S3TC/BC: picks BC1, BC3, BC4 or BC5 from p_channels.
+Error _betsy_compress_s3tc(Image *r_img, Image::UsedChannels p_channels);
+
+// Encodes images into block-compressed formats by dispatching compute shaders
+// on a RenderingDevice (compress_rd). Public compress() calls are forwarded
+// through command_queue to the private _compress() implementation.
+class BetsyCompressor : public Object {
+	// Queue through which compress() hands work to _compress() and gets its result back.
+	mutable CommandQueueMT command_queue;
+	// NOTE(review): presumably tells _thread_loop() to stop — confirm in the .cpp.
+	bool exit = false;
+	// NOTE(review): presumably the WorkerThreadPool task that runs _thread_loop() — confirm.
+	WorkerThreadPool::TaskID task_id = WorkerThreadPool::INVALID_TASK_ID;
+
+	// Handles for one compute shader. _compress() passes `compiled` to
+	// uniform_set_create() and `pipeline` to compute_list_bind_compute_pipeline().
+	struct BetsyShader {
+		RID compiled;
+		RID pipeline;
+	};
+
+	// Resources shared by all compression formats.
+	RenderingDevice *compress_rd = nullptr; // Device on which all textures, uniform sets and compute lists are created.
+	RenderingContextDriver *compress_rcd = nullptr; // NOTE(review): presumably the context driver backing compress_rd — confirm.
+	BetsyShader cached_shaders[BETSY_SHADER_MAX]; // Indexed by BetsyShaderType.
+	RID src_sampler; // Sampler bound together with the source texture when encoding.
+
+	// Format-specific resources.
+	RID dxt1_encoding_table_buffer; // Storage buffer created on first DXT1/DXT5 encode and bound at binding 2.
+
+	void _init();
+	void _assign_mt_ids(WorkerThreadPool::TaskID p_pump_task_id);
+	void _thread_loop();
+	void _thread_exit();
+
+	Error _get_shader(BetsyFormat p_format, const String &p_version, BetsyShader &r_shader);
+	// Does the actual encoding of r_img (all mip levels) and replaces its data with the compressed result.
+	Error _compress(BetsyFormat p_format, Image *r_img);
+
+public:
+	void init();
+	void finish();
+
+	// Compresses r_img in place to p_format by running _compress() through
+	// command_queue, and returns the Error that _compress() reported.
+	// NOTE(review): relies on push_and_ret() not returning before `err` has been written — confirm.
+	Error compress(BetsyFormat p_format, Image *r_img) {
+		Error err;
+		command_queue.push_and_ret(this, &BetsyCompressor::_compress, &err, p_format, r_img);
+		return err;
+	}
+};
--- a/modules/betsy/register_types.cpp
+++ b/modules/betsy/register_types.cpp
@@ -0,0 +1,50 @@
+/**************************************************************************/
+/*  register_types.cpp                                                    */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#include "register_types.h"
+
+#include "image_compress_betsy.h"
+
+// Module initialization hook. At the SCENE level it installs the Betsy
+// compressors as Image's `_image_compress_bptc_rd_func` and
+// `_image_compress_bc_rd_func` callbacks; every other level is ignored.
+void initialize_betsy_module(ModuleInitializationLevel p_level) {
+	if (p_level == MODULE_INITIALIZATION_LEVEL_SCENE) {
+		Image::_image_compress_bptc_rd_func = _betsy_compress_bptc;
+		Image::_image_compress_bc_rd_func = _betsy_compress_s3tc;
+	}
+}
+
+// Module teardown hook. At the SCENE level it releases the shared compressor
+// through free_device(); every other level is ignored.
+void uninitialize_betsy_module(ModuleInitializationLevel p_level) {
+	if (p_level == MODULE_INITIALIZATION_LEVEL_SCENE) {
+		free_device();
+	}
+}
--- a/modules/betsy/register_types.h
+++ b/modules/betsy/register_types.h
@@ -0,0 +1,36 @@
+/**************************************************************************/
+/*  register_types.h                                                      */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             GODOT ENGINE                               */
+/*                        https://godotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#pragma once
+
+#include "modules/register_module_types.h"
+
+// Module lifecycle hooks; both act only at MODULE_INITIALIZATION_LEVEL_SCENE.
+// Installs the Betsy functions as Image's BPTC / BC compression callbacks.
+void initialize_betsy_module(ModuleInitializationLevel p_level);
+// Releases the shared compressor via free_device().
+void uninitialize_betsy_module(ModuleInitializationLevel p_level);