initial commit, 4.5 stable

2025-09-16 20:46:46 -04:00
commit 9d30169a8d
13378 changed files with 7050105 additions and 0 deletions
--- a/thirdparty/etcpak/AUTHORS.txt
+++ b/thirdparty/etcpak/AUTHORS.txt
@@ -0,0 +1,5 @@
+Bartosz Taudul <wolf@nereid.pl>
+Daniel Jungmann <el.3d.source@gmail.com>
+Florian Penzkofer <fp@nullptr.de>
+Jae-Ho Nah <nahjaeho@gmail.com>
+Marcin Ławicki <marcin.lawicki@gmail.com>
--- a/thirdparty/etcpak/DecodeRGB.cpp
+++ b/thirdparty/etcpak/DecodeRGB.cpp
@@ -0,0 +1,797 @@
+#include "DecodeRGB.hpp"
+#include "Tables.hpp"
+#include "Math.hpp"
+
+#include <string.h>
+
+#ifdef __ARM_NEON
+#  include <arm_neon.h>
+#endif
+
+#if defined __SSE4_1__ || defined __AVX2__ || defined _MSC_VER
+#  ifdef _MSC_VER
+#    include <intrin.h>
+#    include <Windows.h>
+#    define _bswap(x) _byteswap_ulong(x)
+#    define _bswap64(x) _byteswap_uint64(x)
+#  else
+#    include <x86intrin.h>
+#  endif
+#endif
+
+#ifndef _bswap
+#  define _bswap(x) __builtin_bswap32(x)
+#  define _bswap64(x) __builtin_bswap64(x)
+#endif
+
+// Distance table shared by the T- and H-mode decoders below; indexed by the
+// 3-bit codeword extracted from the block. Read-only, hence const.
+static const uint8_t table59T58H[8] = { 3,6,11,16,23,32,41,64 };
+
+namespace
+{
+
+// Widens a 6-bit value to 8 bits by appending its two top bits below the
+// original six (bit replication).
+static etcpak_force_inline int32_t expand6(uint32_t value)
+{
+    const uint32_t shiftedUp = value << 2;
+    const uint32_t topBits = value >> 4;
+    return shiftedUp | topBits;
+}
+
+// Widens a 7-bit value to 8 bits by appending its top bit below the
+// original seven (bit replication).
+static etcpak_force_inline int32_t expand7(uint32_t value)
+{
+    const uint32_t shiftedUp = value << 1;
+    const uint32_t topBit = value >> 6;
+    return shiftedUp | topBit;
+}
+
+// Decodes one 4x4 block in "T" mode to opaque 32-bit pixels.
+//   block - 64-bit block word; DecodeRGBPart() passes it after ConvertByteOrder()
+//   dst   - destination of the top-left pixel; pixel (column i, row j) is stored at dst[j * w + i]
+//   w     - row stride of dst, counted in 32-bit pixels
+// Each pixel is packed as r | g << 8 | b << 16 | 0xFF000000.
+static etcpak_force_inline void DecodeT( uint64_t block, uint32_t* dst, uint32_t w )
+{
+    // First base colour. Mask 0x1B keeps bits 0-1 and 3-4 of ( block >> 24 ):
+    // the 4-bit red value is stored as two separate 2-bit halves.
+    const auto r0 = ( block >> 24 ) & 0x1B;
+    const auto rh0 = ( r0 >> 3 ) & 0x3;
+    const auto rl0 = r0 & 0x3;
+    const auto g0 = ( block >> 20 ) & 0xF;
+    const auto b0 = ( block >> 16 ) & 0xF;
+
+    // Second base colour: three contiguous 4-bit fields.
+    const auto r1 = ( block >> 12 ) & 0xF;
+    const auto g1 = ( block >> 8 ) & 0xF;
+    const auto b1 = ( block >> 4 ) & 0xF;
+
+    // Expand every 4-bit channel to 8 bits by repeating the nibble.
+    const auto cr0 = ( ( rh0 << 6 ) | ( rl0 << 4 ) | ( rh0 << 2 ) | rl0);
+    const auto cg0 = ( g0 << 4 ) | g0;
+    const auto cb0 = ( b0 << 4 ) | b0;
+
+    const auto cr1 = ( r1 << 4 ) | r1;
+    const auto cg1 = ( g1 << 4 ) | g1;
+    const auto cb1 = ( b1 << 4 ) | b1;
+
+    // 3-bit index into table59T58H, assembled from block bits 3..2 and bit 0.
+    const auto codeword_hi = ( block >> 2 ) & 0x3;
+    const auto codeword_lo = block & 0x1;
+    const auto codeword = ( codeword_hi << 1 ) | codeword_lo;
+
+    // Second colour plus / minus the table distance, passed through clampu8().
+    // NOTE(review): cr1 etc. are deduced as 64-bit unsigned, so the subtraction
+    // wraps instead of going negative; this relies on clampu8()'s parameter
+    // conversion to recover the sign - confirm against Math.hpp.
+    const auto c2r = clampu8( cr1 + table59T58H[codeword] );
+    const auto c2g = clampu8( cg1 + table59T58H[codeword] );
+    const auto c2b = clampu8( cb1 + table59T58H[codeword] );
+
+    const auto c3r = clampu8( cr1 - table59T58H[codeword] );
+    const auto c3g = clampu8( cg1 - table59T58H[codeword] );
+    const auto c3b = clampu8( cb1 - table59T58H[codeword] );
+
+    // Four-entry palette: first colour, second + distance, second, second - distance.
+    const uint32_t col_tab[4] = {
+        uint32_t( cr0 | ( cg0 << 8 ) | ( cb0 << 16 ) | 0xFF000000 ),
+        uint32_t( c2r | ( c2g << 8 ) | ( c2b << 16 ) | 0xFF000000 ),
+        uint32_t( cr1 | ( cg1 << 8 ) | ( cb1 << 16 ) | 0xFF000000 ),
+        uint32_t( c3r | ( c3g << 8 ) | ( c3b << 16 ) | 0xFF000000 )
+    };
+
+    // Upper 32 bits of the block hold the per-pixel palette selectors.
+    const uint32_t indexes = ( block >> 32 ) & 0xFFFFFFFF;
+    for( uint8_t j = 0; j < 4; j++ )
+    {
+        for( uint8_t i = 0; i < 4; i++ )
+        {
+            // Each 2-bit selector is split across the two 16-bit halves of
+            // `indexes`: the low bit is at position j + i * 4, the high bit
+            // 16 positions above it.
+            const uint8_t index = ( ( ( indexes >> ( j + i * 4 + 16 ) ) & 0x1 ) << 1) | ( ( indexes >> ( j + i * 4 ) ) & 0x1);
+            dst[j * w + i] = col_tab[index];
+        }
+    }
+}
+
+// Decodes one 4x4 block in "T" mode together with a separate 64-bit alpha block.
+//   block - 64-bit colour word; DecodeRGBAPart() passes it after ConvertByteOrder()
+//   alpha - 64-bit alpha word; DecodeRGBAPart() passes it after _bswap64()
+//   dst   - destination of the top-left pixel; pixel (column i, row j) is stored at dst[j * w + i]
+//   w     - row stride of dst, counted in 32-bit pixels
+// Each pixel is packed as r | g << 8 | b << 16 | a << 24.
+static etcpak_force_inline void DecodeTAlpha( uint64_t block, uint64_t alpha, uint32_t* dst, uint32_t w )
+{
+    // First base colour. Mask 0x1B keeps bits 0-1 and 3-4 of ( block >> 24 ):
+    // the 4-bit red value is stored as two separate 2-bit halves.
+    const auto r0 = ( block >> 24 ) & 0x1B;
+    const auto rh0 = ( r0 >> 3 ) & 0x3;
+    const auto rl0 = r0 & 0x3;
+    const auto g0 = ( block >> 20 ) & 0xF;
+    const auto b0 = ( block >> 16 ) & 0xF;
+
+    // Second base colour: three contiguous 4-bit fields.
+    const auto r1 = ( block >> 12 ) & 0xF;
+    const auto g1 = ( block >> 8 ) & 0xF;
+    const auto b1 = ( block >> 4 ) & 0xF;
+
+    // Expand every 4-bit channel to 8 bits by repeating the nibble.
+    const auto cr0 = ( ( rh0 << 6 ) | ( rl0 << 4 ) | ( rh0 << 2 ) | rl0);
+    const auto cg0 = ( g0 << 4 ) | g0;
+    const auto cb0 = ( b0 << 4 ) | b0;
+
+    const auto cr1 = ( r1 << 4 ) | r1;
+    const auto cg1 = ( g1 << 4 ) | g1;
+    const auto cb1 = ( b1 << 4 ) | b1;
+
+    // 3-bit index into table59T58H, assembled from block bits 3..2 and bit 0.
+    const auto codeword_hi = ( block >> 2 ) & 0x3;
+    const auto codeword_lo = block & 0x1;
+    const auto codeword = (codeword_hi << 1) | codeword_lo;
+
+    // Alpha header: 8-bit base, 4-bit multiplier, 4-bit row selector into g_alpha.
+    const int32_t base = alpha >> 56;
+    const int32_t mul = ( alpha >> 52 ) & 0xF;
+    const auto tbl = g_alpha[( alpha >> 48 ) & 0xF];
+
+    // Second colour plus / minus the table distance, passed through clampu8().
+    // NOTE(review): the operands are 64-bit unsigned, so the subtraction wraps;
+    // this relies on clampu8()'s parameter conversion - confirm against Math.hpp.
+    const auto c2r = clampu8( cr1 + table59T58H[codeword] );
+    const auto c2g = clampu8( cg1 + table59T58H[codeword] );
+    const auto c2b = clampu8( cb1 + table59T58H[codeword] );
+
+    const auto c3r = clampu8( cr1 - table59T58H[codeword] );
+    const auto c3g = clampu8( cg1 - table59T58H[codeword] );
+    const auto c3b = clampu8( cb1 - table59T58H[codeword] );
+
+    // Four-entry palette without alpha; alpha is OR-ed in per pixel below.
+    const uint32_t col_tab[4] = {
+        uint32_t( cr0 | ( cg0 << 8 ) | ( cb0 << 16 ) ),
+        uint32_t( c2r | ( c2g << 8 ) | ( c2b << 16 ) ),
+        uint32_t( cr1 | ( cg1 << 8 ) | ( cb1 << 16 ) ),
+        uint32_t( c3r | ( c3g << 8 ) | ( c3b << 16 ) )
+    };
+
+    // Upper 32 bits of the block hold the per-pixel palette selectors.
+    const uint32_t indexes = ( block >> 32 ) & 0xFFFFFFFF;
+    for( uint8_t j = 0; j < 4; j++ )
+    {
+        for( uint8_t i = 0; i < 4; i++ )
+        {
+            // Each 2-bit selector is split across the two 16-bit halves of `indexes`.
+            const uint8_t index = ( ( ( indexes >> ( j + i * 4 + 16 ) ) & 0x1 ) << 1 ) | ( ( indexes >> ( j + i * 4 ) ) & 0x1 );
+            // 3-bit alpha selector per pixel; pixel (0,0) sits at bit 45, pixel (3,3) at bit 0.
+            const auto amod = tbl[( alpha >> ( 45 - j * 3 - i * 12 ) ) & 0x7];
+            const uint32_t a = clampu8( base + amod * mul );
+            dst[j * w + i] = col_tab[index] | ( a << 24 );
+        }
+    }
+}
+
+// Decodes one 4x4 block in "H" mode to opaque 32-bit pixels.
+//   block - 64-bit block word; DecodeRGBPart() passes it after ConvertByteOrder()
+//   dst   - destination of the top-left pixel; pixel (column i, row j) is stored at dst[j * w + i]
+//   w     - row stride of dst, counted in 32-bit pixels
+// Each pixel is packed as r | g << 8 | b << 16 | 0xFF000000.
+static etcpak_force_inline void DecodeH( uint64_t block, uint32_t* dst, uint32_t w )
+{
+    // Upper 32 bits of the block hold the per-pixel palette selectors.
+    const uint32_t indexes = ( block >> 32 ) & 0xFFFFFFFF;
+
+    // First base colour (4 bits per channel); green and blue are each split
+    // over two non-adjacent bit fields and stitched together here.
+    const auto r0444 = ( block >> 27 ) & 0xF;
+    const auto g0444 = ( ( block >> 20 ) & 0x1 ) | ( ( ( block >> 24 ) & 0x7 ) << 1 );
+    const auto b0444 = ( ( block >> 15 ) & 0x7 ) | ( ( ( block >> 19 ) & 0x1 ) << 3 );
+
+    // Second base colour: three contiguous 4-bit fields.
+    const auto r1444 = ( block >> 11 ) & 0xF;
+    const auto g1444 = ( block >> 7 ) & 0xF;
+    const auto b1444 = ( block >> 3 ) & 0xF;
+
+    // Expand every 4-bit channel to 8 bits by repeating the nibble.
+    const auto r0 = ( r0444 << 4 ) | r0444;
+    const auto g0 = ( g0444 << 4 ) | g0444;
+    const auto b0 = ( b0444 << 4 ) | b0444;
+
+    const auto r1 = ( r1444 << 4 ) | r1444;
+    const auto g1 = ( g1444 << 4 ) | g1444;
+    const auto b1 = ( b1444 << 4 ) | b1444;
+
+    // The two upper codeword bits come from block bits 2 and 0. The lowest bit
+    // is not stored: it is 1 when the first colour, read as a 12-bit number,
+    // is >= the second (c1 is the same 12 bits as r1444/g1444/b1444).
+    const auto codeword_hi = ( ( block & 0x1 ) << 1 ) | ( ( block & 0x4 ) );
+    const auto c0 = ( r0444 << 8 ) | ( g0444 << 4 ) | ( b0444 << 0 );
+    const auto c1 = ( block >> 3 ) & ( ( 1 << 12 ) - 1 );
+    const auto codeword_lo = ( c0 >= c1 ) ? 1 : 0;
+    const auto codeword = codeword_hi | codeword_lo;
+
+    // Palette: each base colour plus and minus the table distance. Alpha is
+    // added when the pixel is stored.
+    // NOTE(review): operands are 64-bit unsigned, so the subtraction wraps; this
+    // relies on clampu8()'s parameter conversion - confirm against Math.hpp.
+    const uint32_t col_tab[] = {
+        uint32_t( clampu8( r0 + table59T58H[codeword] ) | ( clampu8( g0 + table59T58H[codeword] ) << 8 ) | ( clampu8( b0 + table59T58H[codeword] ) << 16 ) ),
+        uint32_t( clampu8( r0 - table59T58H[codeword] ) | ( clampu8( g0 - table59T58H[codeword] ) << 8 ) | ( clampu8( b0 - table59T58H[codeword] ) << 16 ) ),
+        uint32_t( clampu8( r1 + table59T58H[codeword] ) | ( clampu8( g1 + table59T58H[codeword] ) << 8 ) | ( clampu8( b1 + table59T58H[codeword] ) << 16 ) ),
+        uint32_t( clampu8( r1 - table59T58H[codeword] ) | ( clampu8( g1 - table59T58H[codeword] ) << 8 ) | ( clampu8( b1 - table59T58H[codeword] ) << 16 ) )
+    };
+
+    for( uint8_t j = 0; j < 4; j++ )
+    {
+        for( uint8_t i = 0; i < 4; i++ )
+        {
+            // Each 2-bit selector is split across the two 16-bit halves of `indexes`.
+            const uint8_t index = ( ( ( indexes >> ( j + i * 4 + 16 ) ) & 0x1 ) << 1 ) | ( ( indexes >> ( j + i * 4 ) ) & 0x1 );
+            dst[j * w + i] = col_tab[index] | 0xFF000000;
+        }
+    }
+}
+
+// Decodes one 4x4 block in "H" mode together with a separate 64-bit alpha block.
+//   block - 64-bit colour word; DecodeRGBAPart() passes it after ConvertByteOrder()
+//   alpha - 64-bit alpha word; DecodeRGBAPart() passes it after _bswap64()
+//   dst   - destination of the top-left pixel; pixel (column i, row j) is stored at dst[j * w + i]
+//   w     - row stride of dst, counted in 32-bit pixels
+// Each pixel is packed as r | g << 8 | b << 16 | a << 24.
+static etcpak_force_inline void DecodeHAlpha( uint64_t block, uint64_t alpha, uint32_t* dst, uint32_t w )
+{
+    // Upper 32 bits of the block hold the per-pixel palette selectors.
+    const uint32_t indexes = ( block >> 32 ) & 0xFFFFFFFF;
+
+    // First base colour (4 bits per channel); green and blue are each split
+    // over two non-adjacent bit fields and stitched together here.
+    const auto r0444 = ( block >> 27 ) & 0xF;
+    const auto g0444 = ( ( block >> 20 ) & 0x1 ) | ( ( ( block >> 24 ) & 0x7 ) << 1 );
+    const auto b0444 = ( ( block >> 15 ) & 0x7 ) | ( ( ( block >> 19 ) & 0x1 ) << 3 );
+
+    // Second base colour: three contiguous 4-bit fields.
+    const auto r1444 = ( block >> 11 ) & 0xF;
+    const auto g1444 = ( block >> 7 ) & 0xF;
+    const auto b1444 = ( block >> 3 ) & 0xF;
+
+    // Expand every 4-bit channel to 8 bits by repeating the nibble.
+    const auto r0 = ( r0444 << 4 ) | r0444;
+    const auto g0 = ( g0444 << 4 ) | g0444;
+    const auto b0 = ( b0444 << 4 ) | b0444;
+
+    const auto r1 = ( r1444 << 4 ) | r1444;
+    const auto g1 = ( g1444 << 4 ) | g1444;
+    const auto b1 = ( b1444 << 4 ) | b1444;
+
+    // The two upper codeword bits come from block bits 2 and 0. The lowest bit
+    // is not stored: it is 1 when the first colour, read as a 12-bit number,
+    // is >= the second.
+    const auto codeword_hi = ( ( block & 0x1 ) << 1 ) | ( ( block & 0x4 ) );
+    const auto c0 = ( r0444 << 8 ) | ( g0444 << 4 ) | ( b0444 << 0 );
+    const auto c1 = ( block >> 3 ) & ( ( 1 << 12 ) - 1 );
+    const auto codeword_lo = ( c0 >= c1 ) ? 1 : 0;
+    const auto codeword = codeword_hi | codeword_lo;
+
+    // Alpha header: 8-bit base, 4-bit multiplier, 4-bit row selector into g_alpha.
+    const int32_t base = alpha >> 56;
+    const int32_t mul = ( alpha >> 52 ) & 0xF;
+    const auto tbl = g_alpha[(alpha >> 48) & 0xF];
+
+    // Palette: each base colour plus and minus the table distance, no alpha yet.
+    // NOTE(review): operands are 64-bit unsigned, so the subtraction wraps; this
+    // relies on clampu8()'s parameter conversion - confirm against Math.hpp.
+    const uint32_t col_tab[] = {
+        uint32_t( clampu8( r0 + table59T58H[codeword] ) | ( clampu8( g0 + table59T58H[codeword] ) << 8 ) | ( clampu8( b0 + table59T58H[codeword] ) << 16 ) ),
+        uint32_t( clampu8( r0 - table59T58H[codeword] ) | ( clampu8( g0 - table59T58H[codeword] ) << 8 ) | ( clampu8( b0 - table59T58H[codeword] ) << 16 ) ),
+        uint32_t( clampu8( r1 + table59T58H[codeword] ) | ( clampu8( g1 + table59T58H[codeword] ) << 8 ) | ( clampu8( b1 + table59T58H[codeword] ) << 16 ) ),
+        uint32_t( clampu8( r1 - table59T58H[codeword] ) | ( clampu8( g1 - table59T58H[codeword] ) << 8 ) | ( clampu8( b1 - table59T58H[codeword] ) << 16 ) )
+    };
+
+    for( uint8_t j = 0; j < 4; j++ )
+    {
+        for( uint8_t i = 0; i < 4; i++ )
+        {
+            // Each 2-bit selector is split across the two 16-bit halves of `indexes`.
+            const uint8_t index = ( ( ( indexes >> ( j + i * 4 + 16 ) ) & 0x1 ) << 1 ) | ( ( indexes >> ( j + i * 4 ) ) & 0x1 );
+            // 3-bit alpha selector per pixel; pixel (0,0) sits at bit 45, pixel (3,3) at bit 0.
+            const auto amod = tbl[( alpha >> ( 45 - j * 3 - i * 12) ) & 0x7];
+            const uint32_t a = clampu8( base + amod * mul );
+            dst[j * w + i] = col_tab[index] | ( a << 24 );
+        }
+    }
+}
+
+// Decodes one 4x4 block in planar ("P") mode to opaque 32-bit pixels.
+//   block - 64-bit block word; DecodeRGBPart() passes it after ConvertByteOrder()
+//   dst   - destination of the top-left pixel; pixel (column i, row j) is stored at dst[j*w+i]
+//   w     - row stride of dst, counted in 32-bit pixels
+// Three colours are unpacked (suffixes o, h, v) and every channel of pixel
+// (i, j) is ( i * (h - o) + j * (v - o) + 4 * o + 2 ) >> 2, limited to 0..255.
+// The NEON, AVX2 and SSE4.1 paths evaluate that expression incrementally; the
+// final #else branch is the plain scalar form.
+static etcpak_force_inline void DecodePlanar( uint64_t block, uint32_t* dst, uint32_t w )
+{
+    // "v" colour and the blue/green part of "h": red and blue are 6-bit,
+    // green is 7-bit, all taken from the upper 32 bits of the block.
+    const auto bv = expand6((block >> ( 0 + 32)) & 0x3F);
+    const auto gv = expand7((block >> ( 6 + 32)) & 0x7F);
+    const auto rv = expand6((block >> (13 + 32)) & 0x3F);
+
+    const auto bh = expand6((block >> (19 + 32)) & 0x3F);
+    const auto gh = expand7((block >> (25 + 32)) & 0x7F);
+
+    // Red of "h" straddles the two halves: its low bit is block bit 0 and the
+    // other five bits start at block bit 2.
+    const auto rh0 = (block >> (32 - 32)) & 0x01;
+    const auto rh1 = ((block >> (34 - 32)) & 0x1F) << 1;
+    const auto rh = expand6(rh0 | rh1);
+
+    // "o" colour, from the lower 32 bits; blue and green are assembled from
+    // several non-adjacent fields.
+    const auto bo0 = (block >> (39 - 32)) & 0x07;
+    const auto bo1 = ((block >> (43 - 32)) & 0x3) << 3;
+    const auto bo2 = ((block >> (48 - 32)) & 0x1) << 5;
+    const auto bo = expand6(bo0 | bo1 | bo2);
+    const auto go0 = (block >> (49 - 32)) & 0x3F;
+    const auto go1 = ((block >> (56 - 32)) & 0x01) << 6;
+    const auto go = expand7(go0 | go1);
+    const auto ro = expand6((block >> (57 - 32)) & 0x3F);
+
+#ifdef __ARM_NEON
+    // 16-bit lanes hold r, g, b and a fourth lane for alpha. chco is the
+    // per-column step; cvco is the per-row step minus the four column steps
+    // already applied while walking the row.
+    uint64_t init = uint64_t(uint16_t(rh-ro)) | ( uint64_t(uint16_t(gh-go)) << 16 ) | ( uint64_t(uint16_t(bh-bo)) << 32 );
+    int16x8_t chco = vreinterpretq_s16_u64( vdupq_n_u64( init ) );
+    init = uint64_t(uint16_t( (rv-ro) - 4 * (rh-ro) )) | ( uint64_t(uint16_t( (gv-go) - 4 * (gh-go) )) << 16 ) | ( uint64_t(uint16_t( (bv-bo) - 4 * (bh-bo) )) << 32 );
+    int16x8_t cvco = vreinterpretq_s16_u64( vdupq_n_u64( init ) );
+    // Alpha lane starts at 0xFFF and is never stepped, so the saturating
+    // narrowing shift below always produces 0xFF for it.
+    init = uint64_t(4*ro+2) | ( uint64_t(4*go+2) << 16 ) | ( uint64_t(4*bo+2) << 32 ) | ( uint64_t(0xFFF) << 48 );
+    int16x8_t col = vreinterpretq_s16_u64( vdupq_n_u64( init ) );
+
+    for( int j=0; j<4; j++ )
+    {
+        for( int i=0; i<4; i++ )
+        {
+            // Shift right by 2 with unsigned saturation to 8 bits, then store
+            // the first four bytes as one pixel.
+            uint8x8_t c = vqshrun_n_s16( col, 2 );
+            vst1_lane_u32( dst+j*w+i, vreinterpret_u32_u8( c ), 0 );
+            col = vaddq_s16( col, chco );
+        }
+        col = vaddq_s16( col, cvco );
+    }
+#elif defined __AVX2__
+    const auto R0 = 4*ro+2;
+    const auto G0 = 4*go+2;
+    const auto B0 = 4*bo+2;
+    const auto RHO = rh-ro;
+    const auto GHO = gh-go;
+    const auto BHO = bh-bo;
+
+    // One 256-bit register holds the four pixels of a row (r, g, b, alpha per
+    // pixel); cvco advances all four by one row.
+    __m256i cvco = _mm256_setr_epi16( rv - ro, gv - go, bv - bo, 0, rv - ro, gv - go, bv - bo, 0, rv - ro, gv - go, bv - bo, 0, rv - ro, gv - go, bv - bo, 0 );
+    __m256i col = _mm256_setr_epi16( R0, G0, B0, 0xFFF, R0+RHO, G0+GHO, B0+BHO, 0xFFF, R0+2*RHO, G0+2*GHO, B0+2*BHO, 0xFFF, R0+3*RHO, G0+3*GHO, B0+3*BHO, 0xFFF );
+
+    for( int j=0; j<4; j++ )
+    {
+        // Arithmetic shift, pack with unsigned saturation, and store the whole
+        // row (4 pixels, 16 bytes) with an unaligned store.
+        __m256i c = _mm256_srai_epi16( col, 2 );
+        __m128i s = _mm_packus_epi16( _mm256_castsi256_si128( c ), _mm256_extracti128_si256( c, 1 ) );
+        _mm_storeu_si128( (__m128i*)(dst+j*w), s );
+        col = _mm256_add_epi16( col, cvco );
+    }
+#elif defined __SSE4_1__
+    // Same scheme as the NEON path: one pixel per iteration in the low four
+    // 16-bit lanes, alpha lane fixed at 0xFFF.
+    __m128i chco = _mm_setr_epi16( rh - ro, gh - go, bh - bo, 0, 0, 0, 0, 0 );
+    __m128i cvco = _mm_setr_epi16( (rv - ro) - 4 * (rh - ro), (gv - go) - 4 * (gh - go), (bv - bo) - 4 * (bh - bo), 0, 0, 0, 0, 0 );
+    __m128i col = _mm_setr_epi16( 4*ro+2, 4*go+2, 4*bo+2, 0xFFF, 0, 0, 0, 0 );
+
+    for( int j=0; j<4; j++ )
+    {
+        for( int i=0; i<4; i++ )
+        {
+            __m128i c = _mm_srai_epi16( col, 2 );
+            __m128i s = _mm_packus_epi16( c, c );
+            dst[j*w+i] = _mm_cvtsi128_si32( s );
+            col = _mm_add_epi16( col, chco );
+        }
+        col = _mm_add_epi16( col, cvco );
+    }
+#else
+    for( int j=0; j<4; j++ )
+    {
+        for( int i=0; i<4; i++ )
+        {
+            const uint32_t r = (i * (rh - ro) + j * (rv - ro) + 4 * ro + 2) >> 2;
+            const uint32_t g = (i * (gh - go) + j * (gv - go) + 4 * go + 2) >> 2;
+            const uint32_t b = (i * (bh - bo) + j * (bv - bo) + 4 * bo + 2) >> 2;
+            // Fast path: all three channels already fit in 8 bits.
+            if( ( ( r | g | b ) & ~0xFF ) == 0 )
+            {
+                dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | 0xFF000000;
+            }
+            else
+            {
+                const auto rc = clampu8( r );
+                const auto gc = clampu8( g );
+                const auto bc = clampu8( b );
+                dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | 0xFF000000;
+            }
+        }
+    }
+#endif
+}
+
+// Decodes one 4x4 block in planar ("P") mode together with a separate 64-bit
+// alpha block.
+//   block - 64-bit colour word; DecodeRGBAPart() passes it after ConvertByteOrder()
+//   alpha - 64-bit alpha word; DecodeRGBAPart() passes it after _bswap64()
+//   dst   - destination of the top-left pixel; pixel (column i, row j) is stored at dst[j*w+i]
+//   w     - row stride of dst, counted in 32-bit pixels
+// Colour is computed as in DecodePlanar(); alpha is clampu8( base + amod * mul )
+// and is OR-ed into bits 24..31. There is no AVX2 path here: NEON, SSE4.1 and
+// scalar only.
+static etcpak_force_inline void DecodePlanarAlpha( uint64_t block, uint64_t alpha, uint32_t* dst, uint32_t w )
+{
+    // "v" colour and the blue/green part of "h", from the upper 32 bits.
+    const auto bv = expand6((block >> ( 0 + 32)) & 0x3F);
+    const auto gv = expand7((block >> ( 6 + 32)) & 0x7F);
+    const auto rv = expand6((block >> (13 + 32)) & 0x3F);
+
+    const auto bh = expand6((block >> (19 + 32)) & 0x3F);
+    const auto gh = expand7((block >> (25 + 32)) & 0x7F);
+
+    // Red of "h" straddles the two halves: its low bit is block bit 0 and the
+    // other five bits start at block bit 2.
+    const auto rh0 = (block >> (32 - 32)) & 0x01;
+    const auto rh1 = ((block >> (34 - 32)) & 0x1F) << 1;
+    const auto rh = expand6(rh0 | rh1);
+
+    // "o" colour, from the lower 32 bits.
+    const auto bo0 = (block >> (39 - 32)) & 0x07;
+    const auto bo1 = ((block >> (43 - 32)) & 0x3) << 3;
+    const auto bo2 = ((block >> (48 - 32)) & 0x1) << 5;
+    const auto bo = expand6(bo0 | bo1 | bo2);
+    const auto go0 = (block >> (49 - 32)) & 0x3F;
+    const auto go1 = ((block >> (56 - 32)) & 0x01) << 6;
+    const auto go = expand7(go0 | go1);
+    const auto ro = expand6((block >> (57 - 32)) & 0x3F);
+
+    // Alpha header: 8-bit base, 4-bit multiplier, 4-bit row selector into g_alpha.
+    const int32_t base = alpha >> 56;
+    const int32_t mul = ( alpha >> 52 ) & 0xF;
+    const auto tbl = g_alpha[( alpha >> 48 ) & 0xF];
+
+#ifdef __ARM_NEON
+    // chco is the per-column step; cvco is the per-row step minus the four
+    // column steps already applied. The fourth lane stays 0 so the alpha byte
+    // of the narrowed result is free for the OR below.
+    uint64_t init = uint64_t(uint16_t(rh-ro)) | ( uint64_t(uint16_t(gh-go)) << 16 ) | ( uint64_t(uint16_t(bh-bo)) << 32 );
+    int16x8_t chco = vreinterpretq_s16_u64( vdupq_n_u64( init ) );
+    init = uint64_t(uint16_t( (rv-ro) - 4 * (rh-ro) )) | ( uint64_t(uint16_t( (gv-go) - 4 * (gh-go) )) << 16 ) | ( uint64_t(uint16_t( (bv-bo) - 4 * (bh-bo) )) << 32 );
+    int16x8_t cvco = vreinterpretq_s16_u64( vdupq_n_u64( init ) );
+    init = uint64_t(4*ro+2) | ( uint64_t(4*go+2) << 16 ) | ( uint64_t(4*bo+2) << 32 );
+    int16x8_t col = vreinterpretq_s16_u64( vdupq_n_u64( init ) );
+
+    for( int j=0; j<4; j++ )
+    {
+        for( int i=0; i<4; i++ )
+        {
+            // 3-bit alpha selector per pixel; pixel (0,0) sits at bit 45, pixel (3,3) at bit 0.
+            const auto amod = tbl[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
+            const uint32_t a = clampu8( base + amod * mul );
+            uint8x8_t c = vqshrun_n_s16( col, 2 );
+            dst[j*w+i] = vget_lane_u32( vreinterpret_u32_u8( c ), 0 ) | ( a << 24 );
+            col = vaddq_s16( col, chco );
+        }
+        col = vaddq_s16( col, cvco );
+    }
+#elif defined __SSE4_1__
+    // Same incremental scheme, one pixel per iteration; alpha lane is 0.
+    __m128i chco = _mm_setr_epi16( rh - ro, gh - go, bh - bo, 0, 0, 0, 0, 0 );
+    __m128i cvco = _mm_setr_epi16( (rv - ro) - 4 * (rh - ro), (gv - go) - 4 * (gh - go), (bv - bo) - 4 * (bh - bo), 0, 0, 0, 0, 0 );
+    __m128i col = _mm_setr_epi16( 4*ro+2, 4*go+2, 4*bo+2, 0, 0, 0, 0, 0 );
+
+    for( int j=0; j<4; j++ )
+    {
+        for( int i=0; i<4; i++ )
+        {
+            const auto amod = tbl[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
+            const uint32_t a = clampu8( base + amod * mul );
+            __m128i c = _mm_srai_epi16( col, 2 );
+            __m128i s = _mm_packus_epi16( c, c );
+            dst[j*w+i] = _mm_cvtsi128_si32( s ) | ( a << 24 );
+            col = _mm_add_epi16( col, chco );
+        }
+        col = _mm_add_epi16( col, cvco );
+    }
+#else
+    for (auto j = 0; j < 4; j++)
+    {
+        for (auto i = 0; i < 4; i++)
+        {
+            const uint32_t r = (i * (rh - ro) + j * (rv - ro) + 4 * ro + 2) >> 2;
+            const uint32_t g = (i * (gh - go) + j * (gv - go) + 4 * go + 2) >> 2;
+            const uint32_t b = (i * (bh - bo) + j * (bv - bo) + 4 * bo + 2) >> 2;
+            const auto amod = tbl[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
+            const uint32_t a = clampu8( base + amod * mul );
+            // Fast path: all three colour channels already fit in 8 bits.
+            if( ( ( r | g | b ) & ~0xFF ) == 0 )
+            {
+                dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | ( a << 24 );
+            }
+            else
+            {
+                const auto rc = clampu8( r );
+                const auto gc = clampu8( g );
+                const auto bc = clampu8( b );
+                dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | ( a << 24 );
+            }
+        }
+    }
+#endif
+}
+}
+
+// Reverses the byte order inside each 32-bit half of d; the two halves keep
+// their positions relative to each other.
+static etcpak_force_inline uint64_t ConvertByteOrder( uint64_t d )
+{
+    uint32_t halves[2];
+    memcpy( halves, &d, 8 );
+
+    const uint32_t first = _bswap( halves[0] );
+    const uint32_t second = _bswap( halves[1] );
+    halves[0] = first;
+    halves[1] = second;
+
+    uint64_t swapped;
+    memcpy( &swapped, halves, 8 );
+    return swapped;
+}
+
+// Decodes one 8-byte RGB block into a 4x4 tile of opaque 32-bit pixels.
+//   d   - the block's 8 bytes as loaded from memory; byte order is fixed up here
+//   dst - destination of the top-left pixel; pixel (column i, row j) is stored at dst[j*w+i]
+//   w   - row stride of dst, counted in 32-bit pixels
+// Pixels are packed as r | g << 8 | b << 16 | 0xFF000000.
+// Blocks whose differential colour leaves the 0..31 range are handed to the
+// T-, H- or planar-mode decoder instead.
+static etcpak_force_inline void DecodeRGBPart( uint64_t d, uint32_t* dst, uint32_t w )
+{
+    d = ConvertByteOrder( d );
+
+    // Base colours of the two sub-blocks, expanded to 8 bits per channel.
+    uint32_t br[2], bg[2], bb[2];
+
+    if( d & 0x2 )
+    {
+        // Differential layout: a 5-bit base colour plus a signed 3-bit delta
+        // per channel.
+        const uint32_t lo = uint32_t( d );
+
+        const uint32_t r0 = ( d & 0xF8000000 ) >> 27;
+        const uint32_t g0 = ( d & 0x00F80000 ) >> 19;
+        const uint32_t b0 = ( d & 0x0000F800 ) >> 11;
+
+        // Sign-extend each delta. The left shift is done on the unsigned word
+        // because left-shifting a negative signed value is undefined before
+        // C++20; the right shift on the signed result then replicates the
+        // sign bit.
+        const int32_t dr = int32_t( lo << 5 ) >> 29;
+        const int32_t dg = int32_t( lo << 13 ) >> 29;
+        const int32_t db = int32_t( lo << 21 ) >> 29;
+
+        const int32_t r1 = int32_t(r0) + dr;
+        const int32_t g1 = int32_t(g0) + dg;
+        const int32_t b1 = int32_t(b0) + db;
+
+        // T mode
+        if ( (r1 < 0) || (r1 > 31) )
+        {
+            DecodeT( d, dst, w );
+            return;
+        }
+
+        // H mode
+        if ((g1 < 0) || (g1 > 31))
+        {
+            DecodeH( d, dst, w );
+            return;
+        }
+
+        // P mode
+        if( (b1 < 0) || (b1 > 31) )
+        {
+            DecodePlanar( d, dst, w );
+            return;
+        }
+
+        // 5 -> 8 bit expansion by replicating the top three bits.
+        br[0] = ( r0 << 3 ) | ( r0 >> 2 );
+        br[1] = ( r1 << 3 ) | ( r1 >> 2 );
+        bg[0] = ( g0 << 3 ) | ( g0 >> 2 );
+        bg[1] = ( g1 << 3 ) | ( g1 >> 2 );
+        bb[0] = ( b0 << 3 ) | ( b0 >> 2 );
+        bb[1] = ( b1 << 3 ) | ( b1 >> 2 );
+    }
+    else
+    {
+        // Individual layout: two independent 4-bit colours, each nibble
+        // repeated to fill 8 bits.
+        br[0] = ( ( d & 0xF0000000 ) >> 24 ) | ( ( d & 0xF0000000 ) >> 28 );
+        br[1] = ( ( d & 0x0F000000 ) >> 20 ) | ( ( d & 0x0F000000 ) >> 24 );
+        bg[0] = ( ( d & 0x00F00000 ) >> 16 ) | ( ( d & 0x00F00000 ) >> 20 );
+        bg[1] = ( ( d & 0x000F0000 ) >> 12 ) | ( ( d & 0x000F0000 ) >> 16 );
+        bb[0] = ( ( d & 0x0000F000 ) >> 8  ) | ( ( d & 0x0000F000 ) >> 12 );
+        bb[1] = ( ( d & 0x00000F00 ) >> 4  ) | ( ( d & 0x00000F00 ) >> 8  );
+    }
+
+    // Modifier-table selector for each sub-block.
+    unsigned int tcw[2];
+    tcw[0] = ( d & 0xE0 ) >> 5;
+    tcw[1] = ( d & 0x1C ) >> 2;
+
+    // The two 16-bit selector planes are bit-interleaved so that every pixel's
+    // 2-bit selector becomes contiguous in idx (plane from bits 32..47 in the
+    // even positions, plane from bits 48..63 in the odd positions).
+    uint32_t lsb = ( d >> 32 ) & 0xFFFF;
+    uint32_t msb = ( d >> 48 );
+
+    lsb = ( lsb | ( lsb << 8 ) ) & 0x00FF00FF;
+    lsb = ( lsb | ( lsb << 4 ) ) & 0x0F0F0F0F;
+    lsb = ( lsb | ( lsb << 2 ) ) & 0x33333333;
+    lsb = ( lsb | ( lsb << 1 ) ) & 0x55555555;
+
+    msb = ( msb | ( msb << 8 ) ) & 0x00FF00FF;
+    msb = ( msb | ( msb << 4 ) ) & 0x0F0F0F0F;
+    msb = ( msb | ( msb << 2 ) ) & 0x33333333;
+    msb = ( msb | ( msb << 1 ) ) & 0x55555555;
+
+    uint32_t idx = lsb | ( msb << 1 );
+
+    if( d & 0x1 )
+    {
+        // Sub-block is chosen by row: rows 0-1 use entry 0, rows 2-3 entry 1.
+        for( int i=0; i<4; i++ )
+        {
+            for( int j=0; j<4; j++ )
+            {
+                const auto mod = g_table[tcw[j/2]][idx & 0x3];
+                const auto r = br[j/2] + mod;
+                const auto g = bg[j/2] + mod;
+                const auto b = bb[j/2] + mod;
+                // Fast path: all three channels already fit in 8 bits.
+                if( ( ( r | g | b ) & ~0xFF ) == 0 )
+                {
+                    dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | 0xFF000000;
+                }
+                else
+                {
+                    const auto rc = clampu8( r );
+                    const auto gc = clampu8( g );
+                    const auto bc = clampu8( b );
+                    dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | 0xFF000000;
+                }
+                idx >>= 2;
+            }
+        }
+    }
+    else
+    {
+        // Sub-block is chosen by column: columns 0-1 use entry 0, columns 2-3
+        // entry 1, so the per-column values are hoisted out of the row loop.
+        for( int i=0; i<4; i++ )
+        {
+            const auto tbl = g_table[tcw[i/2]];
+            const auto cr = br[i/2];
+            const auto cg = bg[i/2];
+            const auto cb = bb[i/2];
+
+            for( int j=0; j<4; j++ )
+            {
+                const auto mod = tbl[idx & 0x3];
+                const auto r = cr + mod;
+                const auto g = cg + mod;
+                const auto b = cb + mod;
+                if( ( ( r | g | b ) & ~0xFF ) == 0 )
+                {
+                    dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | 0xFF000000;
+                }
+                else
+                {
+                    const auto rc = clampu8( r );
+                    const auto gc = clampu8( g );
+                    const auto bc = clampu8( b );
+                    dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | 0xFF000000;
+                }
+                idx >>= 2;
+            }
+        }
+    }
+}
+
+// Decodes one 8-byte RGB block plus one 8-byte alpha block into a 4x4 tile of
+// 32-bit pixels.
+//   d     - the colour block's 8 bytes as loaded from memory; byte order is fixed up here
+//   alpha - the alpha block's 8 bytes as loaded from memory; byte-swapped here
+//   dst   - destination of the top-left pixel; pixel (column i, row j) is stored at dst[j*w+i]
+//   w     - row stride of dst, counted in 32-bit pixels
+// Pixels are packed as r | g << 8 | b << 16 | a << 24.
+// Blocks whose differential colour leaves the 0..31 range are handed to the
+// T-, H- or planar-mode alpha decoder instead.
+static etcpak_force_inline void DecodeRGBAPart( uint64_t d, uint64_t alpha, uint32_t* dst, uint32_t w )
+{
+    d = ConvertByteOrder( d );
+    alpha = _bswap64( alpha );
+
+    // Base colours of the two sub-blocks, expanded to 8 bits per channel.
+    uint32_t br[2], bg[2], bb[2];
+
+    if( d & 0x2 )
+    {
+        // Differential layout: a 5-bit base colour plus a signed 3-bit delta
+        // per channel.
+        const uint32_t lo = uint32_t( d );
+
+        const uint32_t r0 = ( d & 0xF8000000 ) >> 27;
+        const uint32_t g0 = ( d & 0x00F80000 ) >> 19;
+        const uint32_t b0 = ( d & 0x0000F800 ) >> 11;
+
+        // Sign-extend each delta. The left shift is done on the unsigned word
+        // because left-shifting a negative signed value is undefined before
+        // C++20; the right shift on the signed result then replicates the
+        // sign bit.
+        const int32_t dr = int32_t( lo << 5 ) >> 29;
+        const int32_t dg = int32_t( lo << 13 ) >> 29;
+        const int32_t db = int32_t( lo << 21 ) >> 29;
+
+        const int32_t r1 = int32_t(r0) + dr;
+        const int32_t g1 = int32_t(g0) + dg;
+        const int32_t b1 = int32_t(b0) + db;
+
+        // T mode
+        if ( (r1 < 0) || (r1 > 31) )
+        {
+            DecodeTAlpha( d, alpha, dst, w );
+            return;
+        }
+
+        // H mode
+        if ( (g1 < 0) || (g1 > 31) )
+        {
+            DecodeHAlpha( d, alpha, dst, w );
+            return;
+        }
+
+        // P mode
+        if ( (b1 < 0) || (b1 > 31) )
+        {
+            DecodePlanarAlpha( d, alpha, dst, w );
+            return;
+        }
+
+        // 5 -> 8 bit expansion by replicating the top three bits.
+        br[0] = ( r0 << 3 ) | ( r0 >> 2 );
+        br[1] = ( r1 << 3 ) | ( r1 >> 2 );
+        bg[0] = ( g0 << 3 ) | ( g0 >> 2 );
+        bg[1] = ( g1 << 3 ) | ( g1 >> 2 );
+        bb[0] = ( b0 << 3 ) | ( b0 >> 2 );
+        bb[1] = ( b1 << 3 ) | ( b1 >> 2 );
+    }
+    else
+    {
+        // Individual layout: two independent 4-bit colours, each nibble
+        // repeated to fill 8 bits.
+        br[0] = ( ( d & 0xF0000000 ) >> 24 ) | ( ( d & 0xF0000000 ) >> 28 );
+        br[1] = ( ( d & 0x0F000000 ) >> 20 ) | ( ( d & 0x0F000000 ) >> 24 );
+        bg[0] = ( ( d & 0x00F00000 ) >> 16 ) | ( ( d & 0x00F00000 ) >> 20 );
+        bg[1] = ( ( d & 0x000F0000 ) >> 12 ) | ( ( d & 0x000F0000 ) >> 16 );
+        bb[0] = ( ( d & 0x0000F000 ) >> 8  ) | ( ( d & 0x0000F000 ) >> 12 );
+        bb[1] = ( ( d & 0x00000F00 ) >> 4  ) | ( ( d & 0x00000F00 ) >> 8  );
+    }
+
+    // Modifier-table selector for each sub-block.
+    unsigned int tcw[2];
+    tcw[0] = ( d & 0xE0 ) >> 5;
+    tcw[1] = ( d & 0x1C ) >> 2;
+
+    // The two 16-bit selector planes are bit-interleaved so that every pixel's
+    // 2-bit selector becomes contiguous in idx.
+    uint32_t lsb = ( d >> 32 ) & 0xFFFF;
+    uint32_t msb = ( d >> 48 );
+
+    lsb = ( lsb | ( lsb << 8 ) ) & 0x00FF00FF;
+    lsb = ( lsb | ( lsb << 4 ) ) & 0x0F0F0F0F;
+    lsb = ( lsb | ( lsb << 2 ) ) & 0x33333333;
+    lsb = ( lsb | ( lsb << 1 ) ) & 0x55555555;
+
+    msb = ( msb | ( msb << 8 ) ) & 0x00FF00FF;
+    msb = ( msb | ( msb << 4 ) ) & 0x0F0F0F0F;
+    msb = ( msb | ( msb << 2 ) ) & 0x33333333;
+    msb = ( msb | ( msb << 1 ) ) & 0x55555555;
+
+    uint32_t idx = lsb | ( msb << 1 );
+
+    // Alpha header: 8-bit base, 4-bit multiplier, 4-bit row selector into g_alpha.
+    const int32_t base = alpha >> 56;
+    const int32_t mul = ( alpha >> 52 ) & 0xF;
+    const auto atbl = g_alpha[( alpha >> 48 ) & 0xF];
+
+    if( d & 0x1 )
+    {
+        // Sub-block is chosen by row: rows 0-1 use entry 0, rows 2-3 entry 1.
+        for( int i=0; i<4; i++ )
+        {
+            for( int j=0; j<4; j++ )
+            {
+                const auto mod = g_table[tcw[j/2]][idx & 0x3];
+                const auto r = br[j/2] + mod;
+                const auto g = bg[j/2] + mod;
+                const auto b = bb[j/2] + mod;
+                // 3-bit alpha selector per pixel; pixel (0,0) sits at bit 45, pixel (3,3) at bit 0.
+                const auto amod = atbl[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
+                const uint32_t a = clampu8( base + amod * mul );
+                // Fast path: all three colour channels already fit in 8 bits.
+                if( ( ( r | g | b ) & ~0xFF ) == 0 )
+                {
+                    dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | ( a << 24 );
+                }
+                else
+                {
+                    const auto rc = clampu8( r );
+                    const auto gc = clampu8( g );
+                    const auto bc = clampu8( b );
+                    dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | ( a << 24 );
+                }
+                idx >>= 2;
+            }
+        }
+    }
+    else
+    {
+        // Sub-block is chosen by column: columns 0-1 use entry 0, columns 2-3
+        // entry 1, so the per-column values are hoisted out of the row loop.
+        for( int i=0; i<4; i++ )
+        {
+            const auto tbl = g_table[tcw[i/2]];
+            const auto cr = br[i/2];
+            const auto cg = bg[i/2];
+            const auto cb = bb[i/2];
+
+            for( int j=0; j<4; j++ )
+            {
+                const auto mod = tbl[idx & 0x3];
+                const auto r = cr + mod;
+                const auto g = cg + mod;
+                const auto b = cb + mod;
+                const auto amod = atbl[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
+                const uint32_t a = clampu8( base + amod * mul );
+                if( ( ( r | g | b ) & ~0xFF ) == 0 )
+                {
+                    dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | ( a << 24 );
+                }
+                else
+                {
+                    const auto rc = clampu8( r );
+                    const auto gc = clampu8( g );
+                    const auto bc = clampu8( b );
+                    dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | ( a << 24 );
+                }
+                idx >>= 2;
+            }
+        }
+    }
+}
+
+// Decodes one 8-byte single-channel block into a 4x4 tile of 32-bit pixels.
+// The decoded value lands in bits 0..7 and bits 24..31 are set to 0xFF.
+//   r   - the block's 8 bytes as loaded from memory; byte-swapped here
+//   dst - destination of the top-left pixel; pixel (column x, row y) is stored at dst[y*w+x]
+//   w   - row stride of dst, counted in 32-bit pixels
+static etcpak_force_inline void DecodeRPart( uint64_t r, uint32_t* dst, uint32_t w )
+{
+    const uint64_t bits = _bswap64( r );
+
+    // Header: 8-bit base (scaled by 8, plus 4), 4-bit multiplier index,
+    // 4-bit row selector into g_alpha.
+    const int32_t base = ( bits >> 56 )*8+4;
+    const int32_t mulIndex = ( bits >> 52 ) & 0xF;
+    const auto modifiers = g_alpha[( bits >> 48 ) & 0xF];
+
+    for( int x=0; x<4; x++ )
+    {
+        for ( int y=0; y<4; y++ )
+        {
+            // 3-bit selector per pixel; pixel (0,0) sits at bit 45, pixel (3,3) at bit 0.
+            const int shift = 45 - y*3 - x*12;
+            const auto modifier = modifiers[(bits >> shift) & 0x7];
+            const uint32_t value = clampu8( ( base + modifier * g_alpha11Mul[mulIndex] )/8 );
+            dst[y*w+x] = value | 0xFF000000;
+        }
+    }
+}
+
+// Decodes two 8-byte single-channel blocks into a 4x4 tile of 32-bit pixels.
+// The first channel lands in bits 0..7, the second in bits 8..15, and
+// bits 24..31 are set to 0xFF.
+//   r, g - the two blocks' bytes as loaded from memory; byte-swapped here
+//   dst  - destination of the top-left pixel; pixel (column x, row y) is stored at dst[y*w+x]
+//   w    - row stride of dst, counted in 32-bit pixels
+static etcpak_force_inline void DecodeRGPart( uint64_t r, uint64_t g, uint32_t* dst, uint32_t w )
+{
+    const uint64_t redBits = _bswap64( r );
+    const uint64_t greenBits = _bswap64( g );
+
+    // Per-channel header: 8-bit base (scaled by 8, plus 4), 4-bit multiplier
+    // index, 4-bit row selector into g_alpha.
+    const int32_t redBase = ( redBits >> 56 )*8+4;
+    const int32_t redMul = ( redBits >> 52 ) & 0xF;
+    const auto redTable = g_alpha[( redBits >> 48 ) & 0xF];
+
+    const int32_t greenBase = ( greenBits >> 56 )*8+4;
+    const int32_t greenMul = ( greenBits >> 52 ) & 0xF;
+    const auto greenTable = g_alpha[( greenBits >> 48 ) & 0xF];
+
+    for( int x=0; x<4; x++ )
+    {
+        for( int y=0; y<4; y++ )
+        {
+            // Both channels use the same selector position for a given pixel.
+            const int shift = 45 - y*3 - x*12;
+
+            const auto redMod = redTable[(redBits >> shift) & 0x7];
+            const uint32_t red = clampu8( ( redBase + redMod * g_alpha11Mul[redMul] )/8 );
+
+            const auto greenMod = greenTable[(greenBits >> shift) & 0x7];
+            const uint32_t green = clampu8( ( greenBase + greenMod * g_alpha11Mul[greenMul] )/8 );
+
+            dst[y*w+x] = red | (green << 8) | 0xFF000000;
+        }
+    }
+}
+
+// Decodes one 8-byte single-channel block at src into a 4x4 tile of 32-bit
+// pixels at dst.
+//   src   - 8 readable bytes; no alignment is required
+//   dst   - written through as uint32_t, so it must be suitably aligned
+//   width - row stride of dst, counted in 32-bit pixels
+void DecodeRBlock( const void* src, void* dst, size_t width )
+{
+    // memcpy instead of a uint64_t* dereference: keeps src const and avoids
+    // aliasing / alignment assumptions about the input buffer.
+    uint64_t r;
+    memcpy( &r, src, sizeof( r ) );
+    DecodeRPart( r, (uint32_t*)dst, uint32_t( width ) );
+}
+
+// Decodes two consecutive 8-byte single-channel blocks at src (first channel,
+// then second) into a 4x4 tile of 32-bit pixels at dst.
+//   src   - 16 readable bytes; no alignment is required
+//   dst   - written through as uint32_t, so it must be suitably aligned
+//   width - row stride of dst, counted in 32-bit pixels
+void DecodeRGBlock( const void* src, void* dst, size_t width )
+{
+    // memcpy instead of a uint64_t* dereference: keeps src const and avoids
+    // aliasing / alignment assumptions about the input buffer.
+    const uint8_t* bytes = (const uint8_t*)src;
+    uint64_t r;
+    uint64_t g;
+    memcpy( &r, bytes, sizeof( r ) );
+    memcpy( &g, bytes + sizeof( r ), sizeof( g ) );
+    DecodeRGPart( r, g, (uint32_t*)dst, uint32_t( width ) );
+}
+
+// Decodes one 8-byte RGB block at src into a 4x4 tile of opaque 32-bit pixels
+// at dst.
+//   src   - 8 readable bytes; no alignment is required
+//   dst   - written through as uint32_t, so it must be suitably aligned
+//   width - row stride of dst, counted in 32-bit pixels
+void DecodeRGBBlock( const void* src, void* dst, size_t width )
+{
+    // memcpy instead of a uint64_t* dereference: keeps src const and avoids
+    // aliasing / alignment assumptions about the input buffer.
+    uint64_t d;
+    memcpy( &d, src, sizeof( d ) );
+    DecodeRGBPart( d, (uint32_t*)dst, uint32_t( width ) );
+}
+
+// Decodes a 16-byte RGBA block at src into a 4x4 tile of 32-bit pixels at dst.
+// The first 8 bytes are the alpha block, the following 8 the colour block.
+//   src   - 16 readable bytes; no alignment is required
+//   dst   - written through as uint32_t, so it must be suitably aligned
+//   width - row stride of dst, counted in 32-bit pixels
+void DecodeRGBABlock( const void* src, void* dst, size_t width )
+{
+    // memcpy instead of a uint64_t* dereference: keeps src const and avoids
+    // aliasing / alignment assumptions about the input buffer.
+    const uint8_t* bytes = (const uint8_t*)src;
+    uint64_t a;
+    uint64_t d;
+    memcpy( &a, bytes, sizeof( a ) );
+    memcpy( &d, bytes + sizeof( a ), sizeof( d ) );
+    DecodeRGBAPart( d, a, (uint32_t*)dst, uint32_t( width ) );
+}
--- a/thirdparty/etcpak/DecodeRGB.hpp
+++ b/thirdparty/etcpak/DecodeRGB.hpp
@@ -0,0 +1,12 @@
+#ifndef __DECODERGB_HPP__
+#define __DECODERGB_HPP__
+
+#include <stddef.h>
+#include <stdint.h>
+
+// All four functions decode one compressed block into a 4x4 tile of 32-bit
+// pixels. `dst` points at the tile's top-left pixel and `width` is the row
+// stride of the destination, counted in 32-bit pixels.
+
+// Reads 8 bytes from src; the decoded channel goes to bits 0..7, bits 24..31 are 0xFF.
+void DecodeRBlock( const void* src, void* dst, size_t width );
+// Reads 16 bytes from src (two channel blocks); channels go to bits 0..7 and 8..15, bits 24..31 are 0xFF.
+void DecodeRGBlock( const void* src, void* dst, size_t width );
+// Reads 8 bytes from src; output is r | g << 8 | b << 16 with bits 24..31 set to 0xFF.
+void DecodeRGBBlock( const void* src, void* dst, size_t width );
+// Reads 16 bytes from src (alpha block first, then colour block); output is r | g << 8 | b << 16 | a << 24.
+void DecodeRGBABlock( const void* src, void* dst, size_t width );
+
+#endif
--- a/thirdparty/etcpak/Dither.cpp
+++ b/thirdparty/etcpak/Dither.cpp
@@ -0,0 +1,120 @@
+#include <algorithm>
+#include <string.h>
+
+#include "Dither.hpp"
+#include "Math.hpp"
+#ifdef __SSE4_1__
+#  ifdef _MSC_VER
+#    include <intrin.h>
+#    include <Windows.h>
+#  else
+#    include <x86intrin.h>
+#  endif
+#endif
+
+#ifdef __AVX2__
+// Dithers 16 four-byte pixels supplied in four 128-bit registers and stores
+// the 64 resulting bytes to `data` (unaligned stores).
+// Within every 4-byte pixel, bytes 0 and 2 use the "31" offsets, byte 1 uses
+// the "63" offsets, and byte 3 gets a zero offset (left unchanged).
+void DitherAvx2( uint8_t* data, __m128i px0, __m128i px1, __m128i px2, __m128i px3 )
+{
+    // a31/a63 hold the amounts to add, s31/s63 the amounts to subtract, one
+    // entry per pixel. No position is non-zero in both an add and a sub table.
+    static constexpr uint8_t a31[] = { 0, 0, 0, 1, 2, 0, 4, 0, 0, 2, 0, 0, 4, 0, 3, 0 };
+    static constexpr uint8_t a63[] = { 0, 0, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 2, 0, 1, 0 };
+    static constexpr uint8_t s31[] = { 5, 0, 4, 0, 0, 2, 0, 1, 3, 0, 4, 0, 0, 0, 0, 2 };
+    static constexpr uint8_t s63[] = { 2, 0, 2, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 1 };
+
+    // Expand the per-pixel tables to per-byte vectors: pixels 0-7 ...
+    const __m256i BayerAdd0 = _mm256_setr_epi8(
+        a31[0], a63[0], a31[0], 0, a31[1], a63[1], a31[1], 0, a31[2], a63[2], a31[2], 0, a31[3], a63[3], a31[3], 0,
+        a31[4], a63[4], a31[4], 0, a31[5], a63[5], a31[5], 0, a31[6], a63[6], a31[6], 0, a31[7], a63[7], a31[7], 0
+    );
+    // ... and pixels 8-15.
+    const __m256i BayerAdd1 = _mm256_setr_epi8(
+        a31[8],  a63[8],  a31[8],  0, a31[9],  a63[9],  a31[9],  0, a31[10], a63[10], a31[10], 0, a31[11], a63[11], a31[11], 0,
+        a31[12], a63[12], a31[12], 0, a31[13], a63[13], a31[13], 0, a31[14], a63[14], a31[14], 0, a31[15], a63[15], a31[15], 0
+    );
+    const __m256i BayerSub0 = _mm256_setr_epi8(
+        s31[0], s63[0], s31[0], 0, s31[1], s63[1], s31[1], 0, s31[2], s63[2], s31[2], 0, s31[3], s63[3], s31[3], 0,
+        s31[4], s63[4], s31[4], 0, s31[5], s63[5], s31[5], 0, s31[6], s63[6], s31[6], 0, s31[7], s63[7], s31[7], 0
+    );
+    const __m256i BayerSub1 = _mm256_setr_epi8(
+        s31[8],  s63[8],  s31[8],  0, s31[9],  s63[9],  s31[9],  0, s31[10], s63[10], s31[10], 0, s31[11], s63[11], s31[11], 0,
+        s31[12], s63[12], s31[12], 0, s31[13], s63[13], s31[13], 0, s31[14], s63[14], s31[14], 0, s31[15], s63[15], s31[15], 0
+    );
+
+    // Pack px0|px1 and px2|px3 into two 256-bit registers (low|high halves).
+    __m256i l0 = _mm256_inserti128_si256( _mm256_castsi128_si256( px0 ), px1, 1 );
+    __m256i l1 = _mm256_inserti128_si256( _mm256_castsi128_si256( px2 ), px3, 1 );
+
+    // Saturating unsigned add, then saturating unsigned subtract.
+    __m256i a0 = _mm256_adds_epu8( l0, BayerAdd0 );
+    __m256i a1 = _mm256_adds_epu8( l1, BayerAdd1 );
+    __m256i s0 = _mm256_subs_epu8( a0, BayerSub0 );
+    __m256i s1 = _mm256_subs_epu8( a1, BayerSub1 );
+
+    _mm256_storeu_si256( (__m256i*)(data   ), s0 );
+    _mm256_storeu_si256( (__m256i*)(data+32), s1 );
+
+}
+#endif
+
+// Applies a 4x4 ordered-dither pattern in place to the 16 four-byte pixels
+// (64 bytes) at `data`. In every pixel, bytes 0 and 2 use the "31" offsets,
+// byte 1 uses the "63" offsets, and byte 3 is left unchanged.
+void Dither( uint8_t* data )
+{
+#ifdef __AVX2__
+    // a31/a63 hold the amounts to add, s31/s63 the amounts to subtract, one
+    // entry per pixel. No position is non-zero in both an add and a sub table,
+    // so saturating add followed by saturating subtract equals a clamped
+    // signed offset.
+    static constexpr uint8_t a31[] = { 0, 0, 0, 1, 2, 0, 4, 0, 0, 2, 0, 0, 4, 0, 3, 0 };
+    static constexpr uint8_t a63[] = { 0, 0, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 2, 0, 1, 0 };
+    static constexpr uint8_t s31[] = { 5, 0, 4, 0, 0, 2, 0, 1, 3, 0, 4, 0, 0, 0, 0, 2 };
+    static constexpr uint8_t s63[] = { 2, 0, 2, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 1 };
+
+    const __m256i BayerAdd0 = _mm256_setr_epi8(
+        a31[0], a63[0], a31[0], 0, a31[1], a63[1], a31[1], 0, a31[2], a63[2], a31[2], 0, a31[3], a63[3], a31[3], 0,
+        a31[4], a63[4], a31[4], 0, a31[5], a63[5], a31[5], 0, a31[6], a63[6], a31[6], 0, a31[7], a63[7], a31[7], 0
+    );
+    const __m256i BayerAdd1 = _mm256_setr_epi8(
+        a31[8],  a63[8],  a31[8],  0, a31[9],  a63[9],  a31[9],  0, a31[10], a63[10], a31[10], 0, a31[11], a63[11], a31[11], 0,
+        a31[12], a63[12], a31[12], 0, a31[13], a63[13], a31[13], 0, a31[14], a63[14], a31[14], 0, a31[15], a63[15], a31[15], 0
+    );
+    const __m256i BayerSub0 = _mm256_setr_epi8(
+        s31[0], s63[0], s31[0], 0, s31[1], s63[1], s31[1], 0, s31[2], s63[2], s31[2], 0, s31[3], s63[3], s31[3], 0,
+        s31[4], s63[4], s31[4], 0, s31[5], s63[5], s31[5], 0, s31[6], s63[6], s31[6], 0, s31[7], s63[7], s31[7], 0
+    );
+    const __m256i BayerSub1 = _mm256_setr_epi8(
+        s31[8],  s63[8],  s31[8],  0, s31[9],  s63[9],  s31[9],  0, s31[10], s63[10], s31[10], 0, s31[11], s63[11], s31[11], 0,
+        s31[12], s63[12], s31[12], 0, s31[13], s63[13], s31[13], 0, s31[14], s63[14], s31[14], 0, s31[15], s63[15], s31[15], 0
+    );
+
+    __m256i px0 = _mm256_loadu_si256( (__m256i*)(data   ) );
+    __m256i px1 = _mm256_loadu_si256( (__m256i*)(data+32) );
+
+    __m256i a0 = _mm256_adds_epu8( px0, BayerAdd0 );
+    __m256i a1 = _mm256_adds_epu8( px1, BayerAdd1 );
+    __m256i s0 = _mm256_subs_epu8( a0, BayerSub0 );
+    __m256i s1 = _mm256_subs_epu8( a1, BayerSub1 );
+
+    _mm256_storeu_si256( (__m256i*)(data   ), s0 );
+    _mm256_storeu_si256( (__m256i*)(data+32), s1 );
+#else
+    // Signed per-pixel offsets; integer division truncates toward zero, which
+    // yields the same values as the add/sub tables of the AVX2 path.
+    static constexpr int8_t Bayer31[16] = {
+        ( 0-8)*2/3, ( 8-8)*2/3, ( 2-8)*2/3, (10-8)*2/3,
+        (12-8)*2/3, ( 4-8)*2/3, (14-8)*2/3, ( 6-8)*2/3,
+        ( 3-8)*2/3, (11-8)*2/3, ( 1-8)*2/3, ( 9-8)*2/3,
+        (15-8)*2/3, ( 7-8)*2/3, (13-8)*2/3, ( 5-8)*2/3
+    };
+    static constexpr int8_t Bayer63[16] = {
+        ( 0-8)*2/6, ( 8-8)*2/6, ( 2-8)*2/6, (10-8)*2/6,
+        (12-8)*2/6, ( 4-8)*2/6, (14-8)*2/6, ( 6-8)*2/6,
+        ( 3-8)*2/6, (11-8)*2/6, ( 1-8)*2/6, ( 9-8)*2/6,
+        (15-8)*2/6, ( 7-8)*2/6, (13-8)*2/6, ( 5-8)*2/6
+    };
+
+    for( int i=0; i<16; i++ )
+    {
+        // memcpy keeps the access valid for any alignment of `data`.
+        uint32_t col;
+        memcpy( &col, data, 4 );
+        uint8_t r = col & 0xFF;
+        uint8_t g = ( col >> 8 ) & 0xFF;
+        uint8_t b = ( col >> 16 ) & 0xFF;
+
+        r = clampu8( r + Bayer31[i] );
+        g = clampu8( g + Bayer63[i] );
+        b = clampu8( b + Bayer31[i] );
+
+        // Keep the top 8 bits of the pixel so this path produces the same
+        // bytes as the AVX2 path, whose tables hold 0 for the fourth byte.
+        col = ( col & 0xFF000000 ) | r | ( g << 8 ) | ( b << 16 );
+        memcpy( data, &col, 4 );
+        data += 4;
+    }
+#endif
+}
--- a/thirdparty/etcpak/Dither.hpp
+++ b/thirdparty/etcpak/Dither.hpp
@@ -0,0 +1,21 @@
+#ifndef __DITHER_HPP__
+#define __DITHER_HPP__
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __AVX2__
+#  ifdef _MSC_VER
+#    include <intrin.h>
+#  else
+#    include <x86intrin.h>
+#  endif
+#endif
+
+// Dithers, in place, the 64 bytes (16 four-byte pixels) starting at `data`.
+void Dither( uint8_t* data );
+
+#ifdef __AVX2__
+// AVX2 variant: takes the 16 pixels in four 128-bit registers (px0..px3) and
+// writes the 64 dithered bytes to `data`.
+void DitherAvx2( uint8_t* data, __m128i px0, __m128i px1, __m128i px2, __m128i px3 );
+#endif
+
+#endif
--- a/thirdparty/etcpak/ForceInline.hpp
+++ b/thirdparty/etcpak/ForceInline.hpp
@@ -0,0 +1,20 @@
+#ifndef __FORCEINLINE_HPP__
+#define __FORCEINLINE_HPP__
+
+// etcpak_force_inline: ask the compiler to always inline the function.
+// Uses the GCC-style attribute or MSVC's __forceinline; on any other
+// compiler it degrades to plain `inline`.
+#if defined(__GNUC__)
+#  define etcpak_force_inline __attribute__((always_inline)) inline
+#elif defined(_MSC_VER)
+#  define etcpak_force_inline __forceinline
+#else
+#  define etcpak_force_inline inline
+#endif
+
+// etcpak_no_inline: ask the compiler never to inline the function.
+// Expands to nothing on compilers other than GCC-compatible ones and MSVC.
+#if defined(__GNUC__)
+#  define etcpak_no_inline __attribute__((noinline))
+#elif defined(_MSC_VER)
+#  define etcpak_no_inline __declspec(noinline)
+#else
+#  define etcpak_no_inline
+#endif
+
+#endif
--- a/thirdparty/etcpak/LICENSE.txt
+++ b/thirdparty/etcpak/LICENSE.txt
@@ -0,0 +1,26 @@
+etcpak, an extremely fast ETC compression utility (https://github.com/wolfpld/etcpak)
+
+Copyright (c) 2013-2022, Bartosz Taudul <wolf@nereid.pl>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the <organization> nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/thirdparty/etcpak/Math.hpp
+++ b/thirdparty/etcpak/Math.hpp
@@ -0,0 +1,92 @@
+#ifndef __DARKRL__MATH_HPP__
+#define __DARKRL__MATH_HPP__
+
+#include <algorithm>
+#include <cmath>
+#include <stdint.h>
+
+#include "ForceInline.hpp"
+
+// Rounds `val` up to a power of two: 0 yields 1, and a value that already is
+// a power of two is returned unchanged. Works by decrementing, smearing the
+// highest set bit into every lower bit, then incrementing again.
+template<typename T>
+static etcpak_force_inline T AlignPOT( T val )
+{
+    if( val == 0 )
+    {
+        return 1;
+    }
+
+    val--;
+    const unsigned int bitCount = sizeof( T ) * 8;
+    unsigned int shift = 1;
+    while( shift < bitCount )
+    {
+        val |= val >> shift;
+        shift <<= 1;
+    }
+    return val + 1;
+}
+
+// Returns the number of 1 bits in `val`, computed with parallel partial sums
+// (2-bit, 4-bit, then 8-bit fields) that are finally folded together.
+static etcpak_force_inline int CountSetBits( uint32_t val )
+{
+    // Per-pair counts.
+    const uint32_t pairs = val - ( ( val >> 1 ) & 0x55555555 );
+    // Per-nibble counts.
+    const uint32_t nibbles = ( ( pairs >> 2 ) & 0x33333333 ) + ( pairs & 0x33333333 );
+    // Per-byte counts, then fold the four bytes into the low byte.
+    uint32_t folded = ( ( nibbles >> 4 ) + nibbles ) & 0x0f0f0f0f;
+    folded += folded >> 8;
+    folded += folded >> 16;
+    return folded & 0x0000003f;
+}
+
+// Returns the number of zero bits above the highest set bit of `val`
+// (32 when `val` is 0). The highest set bit is smeared into all lower bits,
+// after which the population count equals 32 minus the leading zeros.
+static etcpak_force_inline int CountLeadingZeros( uint32_t val )
+{
+    uint32_t smeared = val;
+    for( int shift = 1; shift <= 16; shift *= 2 )
+    {
+        smeared |= smeared >> shift;
+    }
+    return 32 - CountSetBits( smeared );
+}
+
+// Piecewise sRGB -> linear transfer function: a linear segment (v / 12.92)
+// for v <= 0.04045 and a 2.4 power curve above it.
+static etcpak_force_inline float sRGB2linear( float v )
+{
+    if( v <= 0.04045f )
+    {
+        return v / 12.92f;
+    }
+
+    const float offset = 0.055f;
+    return pow( ( v + offset ) / ( 1 + offset ), 2.4f );
+}
+
+// Piecewise linear -> sRGB transfer function: a linear segment (12.92 * v)
+// for v <= 0.0031308 and a 1/2.4 power curve above it.
+static etcpak_force_inline float linear2sRGB( float v )
+{
+    if( v <= 0.0031308f )
+    {
+        return 12.92f * v;
+    }
+
+    const float offset = 0.055f;
+    return ( 1 + offset ) * pow( v, 1/2.4f ) - offset;
+}
+
+// Evaluates the cubic x^2 * (3 - 2x). No clamping of `x` is performed.
+template<class T>
+static etcpak_force_inline T SmoothStep( T x )
+{
+    // `auto` keeps the promoted arithmetic type of each factor.
+    const auto squared = x*x;
+    const auto slope = 3-2*x;
+    return squared*slope;
+}
+
+// Clamps a signed 32-bit value to the range [0, 255].
+static etcpak_force_inline uint8_t clampu8( int32_t val )
+{
+    // No bits outside the low byte means the value is already in range.
+    const bool fitsInByte = ( val & ~0xFF ) == 0;
+    // Otherwise the sign bit of ~val decides: it is set for val > 255
+    // (giving 0xFF) and clear for negative val (giving 0). This relies on
+    // >> of a negative int being an arithmetic shift.
+    return fitsInByte ? val : ( ( ( ~val ) >> 31 ) & 0xFF );
+}
+
+// Returns `val` multiplied by itself, converted to T.
+template<class T>
+static etcpak_force_inline T sq( T val )
+{
+    const auto squared = val * val;
+    return squared;
+}
+
+// Computes ( t + ( t >> 8 ) ) >> 8 with t = a*b + 128.
+// NOTE(review): this looks like the usual rounded a*b/255 for operands in
+// [0, 255] - confirm the intended operand range with the callers.
+static etcpak_force_inline int mul8bit( int a, int b )
+{
+    const int biased = a*b + 128;
+    const int corrected = biased + ( biased >> 8 );
+    return corrected >> 8;
+}
+
+#endif
--- a/thirdparty/etcpak/ProcessCommon.hpp
+++ b/thirdparty/etcpak/ProcessCommon.hpp
@@ -0,0 +1,50 @@
+#ifndef __PROCESSCOMMON_HPP__
+#define __PROCESSCOMMON_HPP__
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+// Returns the index of the smallest of the first `num` entries of `err`.
+// Ties resolve to the lowest index; for num <= 1 the result is 0 and `err`
+// is not read.
+template<class T>
+static size_t GetLeastError( const T* err, size_t num )
+{
+    size_t best = 0;
+    size_t candidate = 1;
+    while( candidate < num )
+    {
+        if( err[candidate] < err[best] ) best = candidate;
+        candidate++;
+    }
+    return best;
+}
+
+// Reverses the order of the four bytes in the upper 32 bits of `d`; the
+// lower 32 bits are returned unchanged.
+static uint64_t FixByteOrder( uint64_t d )
+{
+    const uint64_t low  = d & 0x00000000FFFFFFFF;
+    const uint64_t b4   = ( d >> 32 ) & 0xFF;
+    const uint64_t b5   = ( d >> 40 ) & 0xFF;
+    const uint64_t b6   = ( d >> 48 ) & 0xFF;
+    const uint64_t b7   = ( d >> 56 ) & 0xFF;
+
+    return low | ( b7 << 32 ) | ( b6 << 40 ) | ( b5 << 48 ) | ( b4 << 56 );
+}
+
+template<class T, class S>
+static uint64_t EncodeSelectors( uint64_t d, const T terr[2][8], const S tsel[16][8], const uint32_t* id )
+{
+    size_t tidx[2];
+    tidx[0] = GetLeastError( terr[0], 8 );
+    tidx[1] = GetLeastError( terr[1], 8 );
+
+    d |= tidx[0] << 26;
+    d |= tidx[1] << 29;
+    for( int i=0; i<16; i++ )
+    {
+        uint64_t t = tsel[i][tidx[id[i]%2]];
+        d |= ( t & 0x1 ) << ( i + 32 );
+        d |= ( t & 0x2 ) << ( i + 47 );
+    }
+
+    return d;
+}
+
+#endif
--- a/thirdparty/etcpak/ProcessDxtc.cpp
+++ b/thirdparty/etcpak/ProcessDxtc.cpp
--- a/thirdparty/etcpak/ProcessDxtc.hpp
+++ b/thirdparty/etcpak/ProcessDxtc.hpp
@@ -0,0 +1,14 @@
+#ifndef __PROCESSDXT1_HPP__
+#define __PROCESSDXT1_HPP__
+
+#include <stddef.h>
+#include <stdint.h>
+
+// Block compressors writing 64-bit words to `dst`.
+// NOTE(review): the implementations are not part of this header; `blocks` is
+// presumably the number of 4x4 blocks to encode and `width` the source image
+// width in pixels - confirm against ProcessDxtc.cpp.
+void CompressBc1( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+void CompressBc1Dither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+void CompressBc3( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+
+void CompressBc4( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+void CompressBc5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+
+#endif
--- a/thirdparty/etcpak/ProcessRGB.cpp
+++ b/thirdparty/etcpak/ProcessRGB.cpp
--- a/thirdparty/etcpak/ProcessRGB.hpp
+++ b/thirdparty/etcpak/ProcessRGB.hpp
@@ -0,0 +1,14 @@
+#ifndef __PROCESSRGB_HPP__
+#define __PROCESSRGB_HPP__
+
+// <stddef.h> supplies size_t, which the declarations below use; without it
+// this header only compiles when something else has already declared it.
+#include <stddef.h>
+#include <stdint.h>
+
+// Block compressors writing 64-bit words to `dst`.
+// NOTE(review): the implementations are not part of this header; `blocks` is
+// presumably the number of 4x4 blocks to encode and `width` the source image
+// width in pixels - confirm against ProcessRGB.cpp.
+void CompressEtc1Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+void CompressEtc1RgbDither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+void CompressEtc2Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics );
+void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, bool useHeuristics );
+
+void CompressEacR( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+void CompressEacRg( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+
+#endif
--- a/thirdparty/etcpak/Tables.cpp
+++ b/thirdparty/etcpak/Tables.cpp
@@ -0,0 +1,223 @@
+#include "Tables.hpp"
+
+// Eight rows of four signed offsets; every row has the form { a, b, -a, -b }.
+// NOTE(review): presumably the ETC intensity modifier tables - confirm.
+const int32_t g_table[8][4] = {
+    {  2,  8,   -2,   -8 },
+    {  5, 17,   -5,  -17 },
+    {  9, 29,   -9,  -29 },
+    { 13, 42,  -13,  -42 },
+    { 18, 60,  -18,  -60 },
+    { 24, 80,  -24,  -80 },
+    { 33, 106, -33, -106 },
+    { 47, 183, -47, -183 }
+};
+
+// Same entries as g_table, each multiplied by 256 and stored as int64_t.
+const int64_t g_table256[8][4] = {
+    {  2*256,  8*256,   -2*256,   -8*256 },
+    {  5*256, 17*256,   -5*256,  -17*256 },
+    {  9*256, 29*256,   -9*256,  -29*256 },
+    { 13*256, 42*256,  -13*256,  -42*256 },
+    { 18*256, 60*256,  -18*256,  -60*256 },
+    { 24*256, 80*256,  -24*256,  -80*256 },
+    { 33*256, 106*256, -33*256, -106*256 },
+    { 47*256, 183*256, -47*256, -183*256 }
+};
+
+// Four rows of 16 ids; row r contains only the values 2r and 2r+1, so
+// `id % 2` distinguishes two groups within a row.
+// NOTE(review): presumably one row per block layout, mapping each of the 16
+// pixels to its sub-block - confirm against the encoder.
+const uint32_t g_id[4][16] = {
+    { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2 },
+    { 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 },
+    { 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6 }
+};
+
+// Entry i is i * 0x11, i.e. the 4-bit value i replicated into both nibbles
+// of a byte (0x0 -> 0x00, 0xF -> 0xFF).
+const uint32_t g_avg2[16] = {
+    0x00,
+    0x11,
+    0x22,
+    0x33,
+    0x44,
+    0x55,
+    0x66,
+    0x77,
+    0x88,
+    0x99,
+    0xAA,
+    0xBB,
+    0xCC,
+    0xDD,
+    0xEE,
+    0xFF
+};
+
+// 64 flag words. Each entry combines bit 31 and bit 23 (set or clear, in 16
+// entry groups) with a low half of either 0x0402 or 0xE002.
+// NOTE(review): the meaning of the index and of the individual bits is not
+// evident from this file - confirm against the code that reads g_flags.
+const uint32_t g_flags[64] = {
+    0x80800402, 0x80800402, 0x80800402, 0x80800402,
+    0x80800402, 0x80800402, 0x80800402, 0x8080E002,
+    0x80800402, 0x80800402, 0x8080E002, 0x8080E002,
+    0x80800402, 0x8080E002, 0x8080E002, 0x8080E002,
+    0x80000402, 0x80000402, 0x80000402, 0x80000402,
+    0x80000402, 0x80000402, 0x80000402, 0x8000E002,
+    0x80000402, 0x80000402, 0x8000E002, 0x8000E002,
+    0x80000402, 0x8000E002, 0x8000E002, 0x8000E002,
+    0x00800402, 0x00800402, 0x00800402, 0x00800402,
+    0x00800402, 0x00800402, 0x00800402, 0x0080E002,
+    0x00800402, 0x00800402, 0x0080E002, 0x0080E002,
+    0x00800402, 0x0080E002, 0x0080E002, 0x0080E002,
+    0x00000402, 0x00000402, 0x00000402, 0x00000402,
+    0x00000402, 0x00000402, 0x00000402, 0x0000E002,
+    0x00000402, 0x00000402, 0x0000E002, 0x0000E002,
+    0x00000402, 0x0000E002, 0x0000E002, 0x0000E002
+};
+
+// 16 modifier rows of 8 entries. In every row, entries 0-3 are negative and
+// decreasing, and entry k+4 equals -entry[k] - 1, so entry 3 is the row
+// minimum and entry 7 the row maximum.
+// The decoder selects a row with a 4-bit field and an entry with a 3-bit
+// per-pixel selector.
+const int32_t g_alpha[16][8] = {
+    { -3, -6,  -9, -15, 2, 5, 8, 14 },
+    { -3, -7, -10, -13, 2, 6, 9, 12 },
+    { -2, -5,  -8, -13, 1, 4, 7, 12 },
+    { -2, -4,  -6, -13, 1, 3, 5, 12 },
+    { -3, -6,  -8, -12, 2, 5, 7, 11 },
+    { -3, -7,  -9, -11, 2, 6, 8, 10 },
+    { -4, -7,  -8, -11, 3, 6, 7, 10 },
+    { -3, -5,  -8, -11, 2, 4, 7, 10 },
+    { -2, -6,  -8, -10, 1, 5, 7,  9 },
+    { -2, -5,  -8, -10, 1, 4, 7,  9 },
+    { -2, -4,  -8, -10, 1, 3, 7,  9 },
+    { -2, -5,  -7, -10, 1, 4, 6,  9 },
+    { -3, -4,  -7, -10, 2, 3, 6,  9 },
+    { -1, -2,  -3, -10, 0, 1, 2,  9 },
+    { -4, -6,  -8,  -9, 3, 5, 7,  8 },
+    { -3, -5,  -7,  -9, 2, 4, 6,  8 }
+};
+
+// Multiplier lookup indexed by a 4-bit field: entry 0 is 1, entry i > 0 is 8*i.
+const int32_t g_alpha11Mul[16] = { 1, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120 };
+
+// Per-row scale factor: 0x100FF divided by (1 + max - min) of the matching
+// g_alpha row, where entry 7 is the row maximum and entry 3 the row minimum.
+// The divisors are 30, 26, 26, 26, 24, 22, 22, 22, 20 (rows 8-13), 18, 18.
+const int32_t g_alphaRange[16] = {
+    0x100FF / ( 1 + g_alpha[0][7] - g_alpha[0][3] ),
+    0x100FF / ( 1 + g_alpha[1][7] - g_alpha[1][3] ),
+    0x100FF / ( 1 + g_alpha[2][7] - g_alpha[2][3] ),
+    0x100FF / ( 1 + g_alpha[3][7] - g_alpha[3][3] ),
+    0x100FF / ( 1 + g_alpha[4][7] - g_alpha[4][3] ),
+    0x100FF / ( 1 + g_alpha[5][7] - g_alpha[5][3] ),
+    0x100FF / ( 1 + g_alpha[6][7] - g_alpha[6][3] ),
+    0x100FF / ( 1 + g_alpha[7][7] - g_alpha[7][3] ),
+    0x100FF / ( 1 + g_alpha[8][7] - g_alpha[8][3] ),
+    0x100FF / ( 1 + g_alpha[9][7] - g_alpha[9][3] ),
+    0x100FF / ( 1 + g_alpha[10][7] - g_alpha[10][3] ),
+    0x100FF / ( 1 + g_alpha[11][7] - g_alpha[11][3] ),
+    0x100FF / ( 1 + g_alpha[12][7] - g_alpha[12][3] ),
+    0x100FF / ( 1 + g_alpha[13][7] - g_alpha[13][3] ),
+    0x100FF / ( 1 + g_alpha[14][7] - g_alpha[14][3] ),
+    0x100FF / ( 1 + g_alpha[15][7] - g_alpha[15][3] ),
+};
+
+#ifdef __SSE4_1__
+// Columns 0 and 1 of g_table, one 16-bit lane per table row (rows 0-7).
+const __m128i g_table_SIMD[2] =
+{
+    _mm_setr_epi16(   2,   5,   9,  13,  18,  24,  33,  47),
+    _mm_setr_epi16(   8,  17,  29,  42,  60,  80, 106, 183)
+};
+// Columns 0 and 1 of g_table scaled by 128, one 16-bit lane per table row.
+const __m128i g_table128_SIMD[2] =
+{
+    _mm_setr_epi16(   2*128,   5*128,   9*128,  13*128,  18*128,  24*128,  33*128,  47*128),
+    _mm_setr_epi16(   8*128,  17*128,  29*128,  42*128,  60*128,  80*128, 106*128, 183*128)
+};
+// Columns 0 and 1 of g_table scaled by 256 in 32-bit lanes:
+// [0] = column 0, rows 0-3; [1] = column 1, rows 0-3;
+// [2] = column 0, rows 4-7; [3] = column 1, rows 4-7.
+const __m128i g_table256_SIMD[4] =
+{
+    _mm_setr_epi32(  2*256,   5*256,   9*256,  13*256),
+    _mm_setr_epi32(  8*256,  17*256,  29*256,  42*256),
+    _mm_setr_epi32( 18*256,  24*256,  33*256,  47*256),
+    _mm_setr_epi32( 60*256,  80*256, 106*256, 183*256)
+};
+
+// Row-major copy of g_alpha: element r packs the eight entries of row r into
+// 16-bit lanes.
+const __m128i g_alpha_SIMD[16] = {
+    _mm_setr_epi16( g_alpha[ 0][0], g_alpha[ 0][1], g_alpha[ 0][2], g_alpha[ 0][3], g_alpha[ 0][4], g_alpha[ 0][5], g_alpha[ 0][6], g_alpha[ 0][7] ),
+    _mm_setr_epi16( g_alpha[ 1][0], g_alpha[ 1][1], g_alpha[ 1][2], g_alpha[ 1][3], g_alpha[ 1][4], g_alpha[ 1][5], g_alpha[ 1][6], g_alpha[ 1][7] ),
+    _mm_setr_epi16( g_alpha[ 2][0], g_alpha[ 2][1], g_alpha[ 2][2], g_alpha[ 2][3], g_alpha[ 2][4], g_alpha[ 2][5], g_alpha[ 2][6], g_alpha[ 2][7] ),
+    _mm_setr_epi16( g_alpha[ 3][0], g_alpha[ 3][1], g_alpha[ 3][2], g_alpha[ 3][3], g_alpha[ 3][4], g_alpha[ 3][5], g_alpha[ 3][6], g_alpha[ 3][7] ),
+    _mm_setr_epi16( g_alpha[ 4][0], g_alpha[ 4][1], g_alpha[ 4][2], g_alpha[ 4][3], g_alpha[ 4][4], g_alpha[ 4][5], g_alpha[ 4][6], g_alpha[ 4][7] ),
+    _mm_setr_epi16( g_alpha[ 5][0], g_alpha[ 5][1], g_alpha[ 5][2], g_alpha[ 5][3], g_alpha[ 5][4], g_alpha[ 5][5], g_alpha[ 5][6], g_alpha[ 5][7] ),
+    _mm_setr_epi16( g_alpha[ 6][0], g_alpha[ 6][1], g_alpha[ 6][2], g_alpha[ 6][3], g_alpha[ 6][4], g_alpha[ 6][5], g_alpha[ 6][6], g_alpha[ 6][7] ),
+    _mm_setr_epi16( g_alpha[ 7][0], g_alpha[ 7][1], g_alpha[ 7][2], g_alpha[ 7][3], g_alpha[ 7][4], g_alpha[ 7][5], g_alpha[ 7][6], g_alpha[ 7][7] ),
+    _mm_setr_epi16( g_alpha[ 8][0], g_alpha[ 8][1], g_alpha[ 8][2], g_alpha[ 8][3], g_alpha[ 8][4], g_alpha[ 8][5], g_alpha[ 8][6], g_alpha[ 8][7] ),
+    _mm_setr_epi16( g_alpha[ 9][0], g_alpha[ 9][1], g_alpha[ 9][2], g_alpha[ 9][3], g_alpha[ 9][4], g_alpha[ 9][5], g_alpha[ 9][6], g_alpha[ 9][7] ),
+    _mm_setr_epi16( g_alpha[10][0], g_alpha[10][1], g_alpha[10][2], g_alpha[10][3], g_alpha[10][4], g_alpha[10][5], g_alpha[10][6], g_alpha[10][7] ),
+    _mm_setr_epi16( g_alpha[11][0], g_alpha[11][1], g_alpha[11][2], g_alpha[11][3], g_alpha[11][4], g_alpha[11][5], g_alpha[11][6], g_alpha[11][7] ),
+    _mm_setr_epi16( g_alpha[12][0], g_alpha[12][1], g_alpha[12][2], g_alpha[12][3], g_alpha[12][4], g_alpha[12][5], g_alpha[12][6], g_alpha[12][7] ),
+    _mm_setr_epi16( g_alpha[13][0], g_alpha[13][1], g_alpha[13][2], g_alpha[13][3], g_alpha[13][4], g_alpha[13][5], g_alpha[13][6], g_alpha[13][7] ),
+    _mm_setr_epi16( g_alpha[14][0], g_alpha[14][1], g_alpha[14][2], g_alpha[14][3], g_alpha[14][4], g_alpha[14][5], g_alpha[14][6], g_alpha[14][7] ),
+    _mm_setr_epi16( g_alpha[15][0], g_alpha[15][1], g_alpha[15][2], g_alpha[15][3], g_alpha[15][4], g_alpha[15][5], g_alpha[15][6], g_alpha[15][7] ),
+};
+
+// The six distinct g_alphaRange values in 16-bit lanes, padded with two zero
+// lanes. Indices 0, 1, 4, 5, 8 and 14 are the first rows of each group that
+// shares a range (rows 1-3, 5-7, 8-13 and 14-15 have equal values).
+const __m128i g_alphaRange_SIMD = _mm_setr_epi16(
+    g_alphaRange[0],
+    g_alphaRange[1],
+    g_alphaRange[4],
+    g_alphaRange[5],
+    g_alphaRange[8],
+    g_alphaRange[14],
+    0,
+    0 );
+#endif
+
+#ifdef __AVX2__
+// Column-major copy of g_alpha: element k packs entry k of all 16 rows into
+// 16-bit lanes (lane r holds g_alpha[r][k]).
+const __m256i g_alpha_AVX[8] = {
+    _mm256_setr_epi16( g_alpha[ 0][0], g_alpha[ 1][0], g_alpha[ 2][0], g_alpha[ 3][0], g_alpha[ 4][0], g_alpha[ 5][0], g_alpha[ 6][0], g_alpha[ 7][0], g_alpha[ 8][0], g_alpha[ 9][0], g_alpha[10][0], g_alpha[11][0], g_alpha[12][0], g_alpha[13][0], g_alpha[14][0], g_alpha[15][0] ),
+    _mm256_setr_epi16( g_alpha[ 0][1], g_alpha[ 1][1], g_alpha[ 2][1], g_alpha[ 3][1], g_alpha[ 4][1], g_alpha[ 5][1], g_alpha[ 6][1], g_alpha[ 7][1], g_alpha[ 8][1], g_alpha[ 9][1], g_alpha[10][1], g_alpha[11][1], g_alpha[12][1], g_alpha[13][1], g_alpha[14][1], g_alpha[15][1] ),
+    _mm256_setr_epi16( g_alpha[ 0][2], g_alpha[ 1][2], g_alpha[ 2][2], g_alpha[ 3][2], g_alpha[ 4][2], g_alpha[ 5][2], g_alpha[ 6][2], g_alpha[ 7][2], g_alpha[ 8][2], g_alpha[ 9][2], g_alpha[10][2], g_alpha[11][2], g_alpha[12][2], g_alpha[13][2], g_alpha[14][2], g_alpha[15][2] ),
+    _mm256_setr_epi16( g_alpha[ 0][3], g_alpha[ 1][3], g_alpha[ 2][3], g_alpha[ 3][3], g_alpha[ 4][3], g_alpha[ 5][3], g_alpha[ 6][3], g_alpha[ 7][3], g_alpha[ 8][3], g_alpha[ 9][3], g_alpha[10][3], g_alpha[11][3], g_alpha[12][3], g_alpha[13][3], g_alpha[14][3], g_alpha[15][3] ),
+    _mm256_setr_epi16( g_alpha[ 0][4], g_alpha[ 1][4], g_alpha[ 2][4], g_alpha[ 3][4], g_alpha[ 4][4], g_alpha[ 5][4], g_alpha[ 6][4], g_alpha[ 7][4], g_alpha[ 8][4], g_alpha[ 9][4], g_alpha[10][4], g_alpha[11][4], g_alpha[12][4], g_alpha[13][4], g_alpha[14][4], g_alpha[15][4] ),
+    _mm256_setr_epi16( g_alpha[ 0][5], g_alpha[ 1][5], g_alpha[ 2][5], g_alpha[ 3][5], g_alpha[ 4][5], g_alpha[ 5][5], g_alpha[ 6][5], g_alpha[ 7][5], g_alpha[ 8][5], g_alpha[ 9][5], g_alpha[10][5], g_alpha[11][5], g_alpha[12][5], g_alpha[13][5], g_alpha[14][5], g_alpha[15][5] ),
+    _mm256_setr_epi16( g_alpha[ 0][6], g_alpha[ 1][6], g_alpha[ 2][6], g_alpha[ 3][6], g_alpha[ 4][6], g_alpha[ 5][6], g_alpha[ 6][6], g_alpha[ 7][6], g_alpha[ 8][6], g_alpha[ 9][6], g_alpha[10][6], g_alpha[11][6], g_alpha[12][6], g_alpha[13][6], g_alpha[14][6], g_alpha[15][6] ),
+    _mm256_setr_epi16( g_alpha[ 0][7], g_alpha[ 1][7], g_alpha[ 2][7], g_alpha[ 3][7], g_alpha[ 4][7], g_alpha[ 5][7], g_alpha[ 6][7], g_alpha[ 7][7], g_alpha[ 8][7], g_alpha[ 9][7], g_alpha[10][7], g_alpha[11][7], g_alpha[12][7], g_alpha[13][7], g_alpha[14][7], g_alpha[15][7] ),
+};
+
+// All 16 g_alphaRange values in order, one per 16-bit lane.
+const __m256i g_alphaRange_AVX = _mm256_setr_epi16(
+    g_alphaRange[ 0], g_alphaRange[ 1], g_alphaRange[ 2], g_alphaRange[ 3], g_alphaRange[ 4], g_alphaRange[ 5], g_alphaRange[ 6], g_alphaRange[ 7],
+    g_alphaRange[ 8], g_alphaRange[ 9], g_alphaRange[10], g_alphaRange[11], g_alphaRange[12], g_alphaRange[13], g_alphaRange[14], g_alphaRange[15]
+);
+#endif
+
+#ifdef __ARM_NEON
+// Columns 0 and 1 of g_table scaled by 128, one 16-bit lane per table row.
+const int16x8_t g_table128_NEON[2] =
+{
+    { 2*128,   5*128,   9*128,  13*128,  18*128,  24*128,  33*128,  47*128 },
+    { 8*128,  17*128,  29*128,  42*128,  60*128,  80*128, 106*128, 183*128 }
+};
+
+// Columns 0 and 1 of g_table scaled by 256 in 32-bit lanes:
+// [0] = column 0, rows 0-3; [1] = column 1, rows 0-3;
+// [2] = column 0, rows 4-7; [3] = column 1, rows 4-7.
+const int32x4_t g_table256_NEON[4] =
+{
+    {  2*256,   5*256,   9*256,  13*256 },
+    {  8*256,  17*256,  29*256,  42*256 },
+    { 18*256,  24*256,  33*256,  47*256 },
+    { 60*256,  80*256, 106*256, 183*256 }
+};
+
+// Row-major copy of g_alpha with 16-bit lanes. The values are written out
+// literally here, so they must be kept in sync with g_alpha by hand.
+const int16x8_t g_alpha_NEON[16] =
+{
+    { -3, -6,  -9, -15, 2, 5, 8, 14 },
+    { -3, -7, -10, -13, 2, 6, 9, 12 },
+    { -2, -5,  -8, -13, 1, 4, 7, 12 },
+    { -2, -4,  -6, -13, 1, 3, 5, 12 },
+    { -3, -6,  -8, -12, 2, 5, 7, 11 },
+    { -3, -7,  -9, -11, 2, 6, 8, 10 },
+    { -4, -7,  -8, -11, 3, 6, 7, 10 },
+    { -3, -5,  -8, -11, 2, 4, 7, 10 },
+    { -2, -6,  -8, -10, 1, 5, 7,  9 },
+    { -2, -5,  -8, -10, 1, 4, 7,  9 },
+    { -2, -4,  -8, -10, 1, 3, 7,  9 },
+    { -2, -5,  -7, -10, 1, 4, 6,  9 },
+    { -3, -4,  -7, -10, 2, 3, 6,  9 },
+    { -1, -2,  -3, -10, 0, 1, 2,  9 },
+    { -4, -6,  -8,  -9, 3, 5, 7,  8 },
+    { -3, -5,  -7,  -9, 2, 4, 6,  8 }
+};
+
+// The six distinct g_alphaRange values narrowed to 16 bits, padded with two
+// zero lanes. Indices 0, 1, 4, 5, 8 and 14 are the first rows of each group
+// that shares a range (rows 1-3, 5-7, 8-13 and 14-15 have equal values).
+const int16x8_t g_alphaRange_NEON =
+{
+    (int16_t)g_alphaRange[0],
+    (int16_t)g_alphaRange[1],
+    (int16_t)g_alphaRange[4],
+    (int16_t)g_alphaRange[5],
+    (int16_t)g_alphaRange[8],
+    (int16_t)g_alphaRange[14],
+    0,
+    0
+};
+#endif
--- a/thirdparty/etcpak/Tables.hpp
+++ b/thirdparty/etcpak/Tables.hpp
@@ -0,0 +1,50 @@
+#ifndef __TABLES_HPP__
+#define __TABLES_HPP__
+
+#include <stdint.h>
+
+#ifdef __AVX2__
+#  include <immintrin.h>
+#endif
+#ifdef __SSE4_1__
+#  include <smmintrin.h>
+#endif
+#ifdef __ARM_NEON
+#  include <arm_neon.h>
+#endif
+
+// Lookup tables shared by the encoders and decoders; all of them are
+// defined in Tables.cpp.
+
+// Signed offset tables: g_table256 holds the g_table entries times 256.
+extern const int32_t g_table[8][4];
+extern const int64_t g_table256[8][4];
+
+// Per-entry ids, four rows of 16.
+extern const uint32_t g_id[4][16];
+
+// 4-bit value replicated into both nibbles of a byte.
+extern const uint32_t g_avg2[16];
+
+extern const uint32_t g_flags[64];
+
+// Modifier rows, their multiplier lookup and the per-row range scale factors.
+extern const int32_t g_alpha[16][8];
+extern const int32_t g_alpha11Mul[16];
+extern const int32_t g_alphaRange[16];
+
+// SSE4.1 packed copies of the tables above.
+#ifdef __SSE4_1__
+extern const __m128i g_table_SIMD[2];
+extern const __m128i g_table128_SIMD[2];
+extern const __m128i g_table256_SIMD[4];
+
+extern const __m128i g_alpha_SIMD[16];
+extern const __m128i g_alphaRange_SIMD;
+#endif
+
+// AVX2 packed copies (g_alpha is stored column-major here).
+#ifdef __AVX2__
+extern const __m256i g_alpha_AVX[8];
+extern const __m256i g_alphaRange_AVX;
+#endif
+
+// NEON packed copies of the tables above.
+#ifdef __ARM_NEON
+extern const int16x8_t g_table128_NEON[2];
+extern const int32x4_t g_table256_NEON[4];
+extern const int16x8_t g_alpha_NEON[16];
+extern const int16x8_t g_alphaRange_NEON;
+#endif
+#endif
+
+#endif
--- a/thirdparty/etcpak/Vector.hpp
+++ b/thirdparty/etcpak/Vector.hpp
@@ -0,0 +1,222 @@
+#ifndef __DARKRL__VECTOR_HPP__
+#define __DARKRL__VECTOR_HPP__
+
+#include <assert.h>
+#include <algorithm>
+#include <math.h>
+#include <stdint.h>
+
+#include "Math.hpp"
+
+// Two-component vector with component-wise comparison and compound
+// assignment. Default construction zeroes both components; the single
+// argument constructor broadcasts one value to both.
+template<class T>
+struct Vector2
+{
+    Vector2() : x( 0 ), y( 0 ) {}
+    Vector2( T v ) : x( v ), y( v ) {}
+    Vector2( T _x, T _y ) : x( _x ), y( _y ) {}
+
+    bool operator==( const Vector2<T>& rhs ) const
+    {
+        return x == rhs.x && y == rhs.y;
+    }
+    bool operator!=( const Vector2<T>& rhs ) const
+    {
+        const bool same = ( *this == rhs );
+        return !same;
+    }
+
+    // Component-wise add; returns *this.
+    Vector2<T>& operator+=( const Vector2<T>& rhs )
+    {
+        x += rhs.x; y += rhs.y;
+        return *this;
+    }
+    // Component-wise subtract; returns *this.
+    Vector2<T>& operator-=( const Vector2<T>& rhs )
+    {
+        x -= rhs.x; y -= rhs.y;
+        return *this;
+    }
+    // Component-wise multiply; returns *this.
+    Vector2<T>& operator*=( const Vector2<T>& rhs )
+    {
+        x *= rhs.x; y *= rhs.y;
+        return *this;
+    }
+
+    T x;
+    T y;
+};
+
+// Component-wise sum of two Vector2 values.
+template<class T>
+Vector2<T> operator+( const Vector2<T>& lhs, const Vector2<T>& rhs )
+{
+    const auto sx = lhs.x + rhs.x;
+    const auto sy = lhs.y + rhs.y;
+    return Vector2<T>( sx, sy );
+}
+
+// Component-wise difference of two Vector2 values.
+template<class T>
+Vector2<T> operator-( const Vector2<T>& lhs, const Vector2<T>& rhs )
+{
+    const auto dx = lhs.x - rhs.x;
+    const auto dy = lhs.y - rhs.y;
+    return Vector2<T>( dx, dy );
+}
+
+// Scales both components by a float; the products are converted back to T.
+template<class T>
+Vector2<T> operator*( const Vector2<T>& lhs, const float& rhs )
+{
+    const auto px = lhs.x * rhs;
+    const auto py = lhs.y * rhs;
+    return Vector2<T>( px, py );
+}
+
+// Divides both components by a scalar of the component type.
+template<class T>
+Vector2<T> operator/( const Vector2<T>& lhs, const T& rhs )
+{
+    const auto qx = lhs.x / rhs;
+    const auto qy = lhs.y / rhs;
+    return Vector2<T>( qx, qy );
+}
+
+
+typedef Vector2<int32_t> v2i;
+typedef Vector2<float> v2f;
+
+
+// Three-component vector. The struct carries a fourth member, `padding`,
+// which no constructor initializes, so it occupies four T values.
+template<class T>
+struct Vector3
+{
+    Vector3() : x( 0 ), y( 0 ), z( 0 ) {}
+    Vector3( T v ) : x( v ), y( v ), z( v ) {}
+    Vector3( T _x, T _y, T _z ) : x( _x ), y( _y ), z( _z ) {}
+    // Converts component-wise from a Vector3 of another component type.
+    template<class Y>
+    Vector3( const Vector3<Y>& v ) : x( T( v.x ) ), y( T( v.y ) ), z( T( v.z ) ) {}
+
+    // Weighted sum 0.3*x + 0.59*y + 0.11*z, converted back to T.
+    T Luminance() const { return T( x * 0.3f + y * 0.59f + z * 0.11f ); }
+    // Clamps every component to the range [0, 1].
+    void Clamp()
+    {
+        x = std::min( T(1), std::max( T(0), x ) );
+        y = std::min( T(1), std::max( T(0), y ) );
+        z = std::min( T(1), std::max( T(0), z ) );
+    }
+
+    bool operator==( const Vector3<T>& rhs ) const { return x == rhs.x && y == rhs.y && z == rhs.z; }
+    // Must take a Vector3 so that it can forward to operator== above.
+    bool operator!=( const Vector3<T>& rhs ) const { return !( *this == rhs ); }
+
+    // Index 0/1/2 selects x/y/z; relies on x, y, z being stored contiguously
+    // at the start of the object.
+    T& operator[]( unsigned int idx ) { assert( idx < 3 ); return ((T*)this)[idx]; }
+    const T& operator[]( unsigned int idx ) const { assert( idx < 3 ); return ((const T*)this)[idx]; }
+
+    // Compound assignments return *this by reference, like Vector2, so that
+    // chained expressions operate on this object rather than on a copy.
+    Vector3<T>& operator+=( const Vector3<T>& rhs )
+    {
+        x += rhs.x;
+        y += rhs.y;
+        z += rhs.z;
+        return *this;
+    }
+
+    Vector3<T>& operator*=( const Vector3<T>& rhs )
+    {
+        x *= rhs.x;
+        y *= rhs.y;
+        z *= rhs.z;
+        return *this;
+    }
+
+    Vector3<T>& operator*=( const float& rhs )
+    {
+        x *= rhs;
+        y *= rhs;
+        z *= rhs;
+        return *this;
+    }
+
+    T x, y, z;
+    T padding;
+};
+
+// Component-wise sum of two Vector3 values.
+template<class T>
+Vector3<T> operator+( const Vector3<T>& lhs, const Vector3<T>& rhs )
+{
+    const auto sx = lhs.x + rhs.x;
+    const auto sy = lhs.y + rhs.y;
+    const auto sz = lhs.z + rhs.z;
+    return Vector3<T>( sx, sy, sz );
+}
+
+// Component-wise difference of two Vector3 values.
+template<class T>
+Vector3<T> operator-( const Vector3<T>& lhs, const Vector3<T>& rhs )
+{
+    const auto dx = lhs.x - rhs.x;
+    const auto dy = lhs.y - rhs.y;
+    const auto dz = lhs.z - rhs.z;
+    return Vector3<T>( dx, dy, dz );
+}
+
+// Component-wise product of two Vector3 values.
+template<class T>
+Vector3<T> operator*( const Vector3<T>& lhs, const Vector3<T>& rhs )
+{
+    const auto px = lhs.x * rhs.x;
+    const auto py = lhs.y * rhs.y;
+    const auto pz = lhs.z * rhs.z;
+    return Vector3<T>( px, py, pz );
+}
+
+// Scales every component by a float; each product is cast back to T.
+template<class T>
+Vector3<T> operator*( const Vector3<T>& lhs, const float& rhs )
+{
+    const T sx = T( lhs.x * rhs );
+    const T sy = T( lhs.y * rhs );
+    const T sz = T( lhs.z * rhs );
+    return Vector3<T>( sx, sy, sz );
+}
+
+// Divides every component by a scalar of the component type.
+template<class T>
+Vector3<T> operator/( const Vector3<T>& lhs, const T& rhs )
+{
+    const auto qx = lhs.x / rhs;
+    const auto qy = lhs.y / rhs;
+    const auto qz = lhs.z / rhs;
+    return Vector3<T>( qx, qy, qz );
+}
+
+// Orders vectors by their Luminance() value.
+template<class T>
+bool operator<( const Vector3<T>& lhs, const Vector3<T>& rhs )
+{
+    const T left = lhs.Luminance();
+    const T right = rhs.Luminance();
+    return left < right;
+}
+
+typedef Vector3<int32_t> v3i;
+typedef Vector3<float> v3f;
+typedef Vector3<uint8_t> v3b;
+
+
+// Converts a float vector to bytes: each component is capped at 1.0, scaled
+// by 255 and truncated to uint8_t. There is no lower clamp.
+static inline v3b v3f_to_v3b( const v3f& v )
+{
+    const uint8_t r = uint8_t( std::min( 1.f, v.x ) * 255 );
+    const uint8_t g = uint8_t( std::min( 1.f, v.y ) * 255 );
+    const uint8_t b = uint8_t( std::min( 1.f, v.z ) * 255 );
+    return v3b( r, g, b );
+}
+
+// Linear interpolation: returns v1 + (v2 - v1) * amount.
+template<class T>
+Vector3<T> Mix( const Vector3<T>& v1, const Vector3<T>& v2, float amount )
+{
+    const Vector3<T> delta = v2 - v1;
+    return v1 + delta * amount;
+}
+
+// Byte specialization: interpolates in float and converts the result back to
+// bytes through the converting constructor.
+template<>
+inline v3b Mix( const v3b& v1, const v3b& v2, float amount )
+{
+    const v3f from( v1 );
+    const v3f to( v2 );
+    return v3b( from + ( to - from ) * amount );
+}
+
+// Returns a grey vector whose three components all equal v.Luminance().
+template<class T>
+Vector3<T> Desaturate( const Vector3<T>& v )
+{
+    const T grey = v.Luminance();
+    return Vector3<T>( grey, grey, grey );
+}
+
+// Same as above, with the luminance multiplied by `mul` before it is
+// converted back to T.
+template<class T>
+Vector3<T> Desaturate( const Vector3<T>& v, float mul )
+{
+    const T grey = T( v.Luminance() * mul );
+    return Vector3<T>( grey, grey, grey );
+}
+
+// Raises every component of `base` to `exponent`.
+template<class T>
+Vector3<T> pow( const Vector3<T>& base, float exponent )
+{
+    const T px = pow( base.x, exponent );
+    const T py = pow( base.y, exponent );
+    const T pz = pow( base.z, exponent );
+    return Vector3<T>( px, py, pz );
+}
+
+// Applies the scalar sRGB2linear to every component.
+template<class T>
+Vector3<T> sRGB2linear( const Vector3<T>& v )
+{
+    const T lx = sRGB2linear( v.x );
+    const T ly = sRGB2linear( v.y );
+    const T lz = sRGB2linear( v.z );
+    return Vector3<T>( lx, ly, lz );
+}
+
+// Applies the scalar linear2sRGB to every component.
+template<class T>
+Vector3<T> linear2sRGB( const Vector3<T>& v )
+{
+    const T sx = linear2sRGB( v.x );
+    const T sy = linear2sRGB( v.y );
+    const T sz = linear2sRGB( v.z );
+    return Vector3<T>( sx, sy, sz );
+}
+
+#endif
--- a/thirdparty/etcpak/patches/0001-remove-bc7enc.patch
+++ b/thirdparty/etcpak/patches/0001-remove-bc7enc.patch
@@ -0,0 +1,52 @@
+diff --git a/thirdparty/etcpak/ProcessDxtc.cpp b/thirdparty/etcpak/ProcessDxtc.cpp
+index 5373b75cdc..e1bc6a5cb6 100644
+--- a/thirdparty/etcpak/ProcessDxtc.cpp
+++ b/thirdparty/etcpak/ProcessDxtc.cpp
+@@ -1,4 +1,3 @@
+-#include "bc7enc.h"
+ #include "Dither.hpp"
+ #include "ForceInline.hpp"
+ #include "ProcessDxtc.hpp"
+@@ -1085,29 +1084,3 @@ void CompressBc5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t wi
+ #endif
+     } while( --blocks );
+ }
+-
+-void CompressBc7( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, const bc7enc_compress_block_params* params )
+-{
+-    int i = 0;
+-    auto ptr = dst;
+-    do
+-    {
+-        uint32_t rgba[4*4];
+-
+-        auto tmp = (char*)rgba;
+-        memcpy( tmp,        src + width * 0, 4*4 );
+-        memcpy( tmp + 4*4,  src + width * 1, 4*4 );
+-        memcpy( tmp + 8*4,  src + width * 2, 4*4 );
+-        memcpy( tmp + 12*4, src + width * 3, 4*4 );
+-        src += 4;
+-        if( ++i == width/4 )
+-        {
+-            src += width * 3;
+-            i = 0;
+-        }
+-
+-        bc7enc_compress_block( ptr, rgba, params );
+-        ptr += 2;
+-    }
+-    while( --blocks );
+-}
+diff --git a/thirdparty/etcpak/ProcessDxtc.hpp b/thirdparty/etcpak/ProcessDxtc.hpp
+index 7655bb33be..8145493872 100644
+--- a/thirdparty/etcpak/ProcessDxtc.hpp
+++ b/thirdparty/etcpak/ProcessDxtc.hpp
+@@ -11,8 +11,4 @@ void CompressBc3( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t wi
+ void CompressBc4( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+ void CompressBc5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+ 
+-struct bc7enc_compress_block_params;
+-
+-void CompressBc7( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width, const bc7enc_compress_block_params* params );
+-
+ #endif