initial commit, 4.5 stable
Some checks failed
🔗 GHA / 📊 Static checks (push) Has been cancelled
🔗 GHA / 🤖 Android (push) Has been cancelled
🔗 GHA / 🍏 iOS (push) Has been cancelled
🔗 GHA / 🐧 Linux (push) Has been cancelled
🔗 GHA / 🍎 macOS (push) Has been cancelled
🔗 GHA / 🏁 Windows (push) Has been cancelled
🔗 GHA / 🌐 Web (push) Has been cancelled
Some checks failed
🔗 GHA / 📊 Static checks (push) Has been cancelled
🔗 GHA / 🤖 Android (push) Has been cancelled
🔗 GHA / 🍏 iOS (push) Has been cancelled
🔗 GHA / 🐧 Linux (push) Has been cancelled
🔗 GHA / 🍎 macOS (push) Has been cancelled
🔗 GHA / 🏁 Windows (push) Has been cancelled
🔗 GHA / 🌐 Web (push) Has been cancelled
This commit is contained in:
622
thirdparty/astcenc/astcenc_decompress_symbolic.cpp
vendored
Normal file
622
thirdparty/astcenc/astcenc_decompress_symbolic.cpp
vendored
Normal file
@@ -0,0 +1,622 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Functions to decompress a symbolic block.
|
||||
*/
|
||||
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
|
||||
/**
|
||||
* @brief Compute the integer linear interpolation of two color endpoints.
|
||||
*
|
||||
* @param u8_mask The mask for lanes using decode_unorm8 rather than decode_f16.
|
||||
* @param color0 The endpoint0 color.
|
||||
* @param color1 The endpoint1 color.
|
||||
* @param weights The interpolation weight (between 0 and 64).
|
||||
*
|
||||
* @return The interpolated color.
|
||||
*/
|
||||
static vint4 lerp_color_int(
|
||||
vmask4 u8_mask,
|
||||
vint4 color0,
|
||||
vint4 color1,
|
||||
vint4 weights
|
||||
) {
|
||||
vint4 weight1 = weights;
|
||||
vint4 weight0 = vint4(64) - weight1;
|
||||
|
||||
vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
|
||||
color = asr<6>(color);
|
||||
|
||||
// For decode_unorm8 values force the codec to bit replicate. This allows the
|
||||
// rest of the codec to assume the full 0xFFFF range for everything and ignore
|
||||
// the decode_mode setting
|
||||
vint4 color_u8 = asr<8>(color) * vint4(257);
|
||||
color = select(color, color_u8, u8_mask);
|
||||
|
||||
return color;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Convert integer color value into a float value for the decoder.
|
||||
*
|
||||
* @param data The integer color value post-interpolation.
|
||||
* @param lns_mask If set treat lane as HDR (LNS) else LDR (unorm16).
|
||||
*
|
||||
* @return The float color value.
|
||||
*/
|
||||
static inline vfloat4 decode_texel(
|
||||
vint4 data,
|
||||
vmask4 lns_mask
|
||||
) {
|
||||
vint4 color_lns = vint4::zero();
|
||||
vint4 color_unorm = vint4::zero();
|
||||
|
||||
if (any(lns_mask))
|
||||
{
|
||||
color_lns = lns_to_sf16(data);
|
||||
}
|
||||
|
||||
if (!all(lns_mask))
|
||||
{
|
||||
color_unorm = unorm16_to_sf16(data);
|
||||
}
|
||||
|
||||
// Pick components and then convert to FP16
|
||||
vint4 datai = select(color_unorm, color_lns, lns_mask);
|
||||
return float16_to_float(datai);
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void unpack_weights(
|
||||
const block_size_descriptor& bsd,
|
||||
const symbolic_compressed_block& scb,
|
||||
const decimation_info& di,
|
||||
bool is_dual_plane,
|
||||
int weights_plane1[BLOCK_MAX_TEXELS],
|
||||
int weights_plane2[BLOCK_MAX_TEXELS]
|
||||
) {
|
||||
// Safe to overshoot as all arrays are allocated to full size
|
||||
if (!is_dual_plane)
|
||||
{
|
||||
// Build full 64-entry weight lookup table
|
||||
vtable_64x8 table;
|
||||
vtable_prepare(table, scb.weights);
|
||||
|
||||
for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vint summed_value(8);
|
||||
vint weight_count(di.texel_weight_count + i);
|
||||
int max_weight_count = hmax_s(weight_count);
|
||||
|
||||
promise(max_weight_count > 0);
|
||||
for (int j = 0; j < max_weight_count; j++)
|
||||
{
|
||||
vint texel_weights(di.texel_weights_tr[j] + i);
|
||||
vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
|
||||
|
||||
summed_value += vtable_lookup_32bit(table, texel_weights) * texel_weights_int;
|
||||
}
|
||||
|
||||
store(lsr<4>(summed_value), weights_plane1 + i);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Build a 32-entry weight lookup table per plane
|
||||
// Plane 1
|
||||
vtable_32x8 tab_plane1;
|
||||
vtable_prepare(tab_plane1, scb.weights);
|
||||
|
||||
// Plane 2
|
||||
vtable_32x8 tab_plane2;
|
||||
vtable_prepare(tab_plane2, scb.weights + 32);
|
||||
|
||||
for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vint sum_plane1(8);
|
||||
vint sum_plane2(8);
|
||||
|
||||
vint weight_count(di.texel_weight_count + i);
|
||||
int max_weight_count = hmax_s(weight_count);
|
||||
|
||||
promise(max_weight_count > 0);
|
||||
for (int j = 0; j < max_weight_count; j++)
|
||||
{
|
||||
vint texel_weights(di.texel_weights_tr[j] + i);
|
||||
vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
|
||||
|
||||
sum_plane1 += vtable_lookup_32bit(tab_plane1, texel_weights) * texel_weights_int;
|
||||
sum_plane2 += vtable_lookup_32bit(tab_plane2, texel_weights) * texel_weights_int;
|
||||
}
|
||||
|
||||
store(lsr<4>(sum_plane1), weights_plane1 + i);
|
||||
store(lsr<4>(sum_plane2), weights_plane2 + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return an FP32 NaN value for use in error colors.
|
||||
*
|
||||
* This NaN encoding will turn into 0xFFFF when converted to an FP16 NaN.
|
||||
*
|
||||
* @return The float color value.
|
||||
*/
|
||||
static float error_color_nan()
|
||||
{
|
||||
if32 v;
|
||||
v.u = 0xFFFFE000U;
|
||||
return v.f;
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void decompress_symbolic_block(
|
||||
astcenc_profile decode_mode,
|
||||
const block_size_descriptor& bsd,
|
||||
int xpos,
|
||||
int ypos,
|
||||
int zpos,
|
||||
const symbolic_compressed_block& scb,
|
||||
image_block& blk
|
||||
) {
|
||||
blk.xpos = xpos;
|
||||
blk.ypos = ypos;
|
||||
blk.zpos = zpos;
|
||||
|
||||
blk.data_min = vfloat4::zero();
|
||||
blk.data_mean = vfloat4::zero();
|
||||
blk.data_max = vfloat4::zero();
|
||||
blk.grayscale = false;
|
||||
|
||||
// If we detected an error-block, blow up immediately.
|
||||
if (scb.block_type == SYM_BTYPE_ERROR)
|
||||
{
|
||||
for (unsigned int i = 0; i < bsd.texel_count; i++)
|
||||
{
|
||||
blk.data_r[i] = error_color_nan();
|
||||
blk.data_g[i] = error_color_nan();
|
||||
blk.data_b[i] = error_color_nan();
|
||||
blk.data_a[i] = error_color_nan();
|
||||
blk.rgb_lns[i] = 0;
|
||||
blk.alpha_lns[i] = 0;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if ((scb.block_type == SYM_BTYPE_CONST_F16) ||
|
||||
(scb.block_type == SYM_BTYPE_CONST_U16))
|
||||
{
|
||||
vfloat4 color;
|
||||
uint8_t use_lns = 0;
|
||||
|
||||
// UNORM16 constant color block
|
||||
if (scb.block_type == SYM_BTYPE_CONST_U16)
|
||||
{
|
||||
vint4 colori(scb.constant_color);
|
||||
|
||||
// Determine the UNORM8 rounding on the decode
|
||||
vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
|
||||
|
||||
// The real decoder would just use the top 8 bits, but we rescale
|
||||
// in to a 16-bit value that rounds correctly.
|
||||
vint4 colori_u8 = asr<8>(colori) * 257;
|
||||
colori = select(colori, colori_u8, u8_mask);
|
||||
|
||||
vint4 colorf16 = unorm16_to_sf16(colori);
|
||||
color = float16_to_float(colorf16);
|
||||
}
|
||||
// FLOAT16 constant color block
|
||||
else
|
||||
{
|
||||
switch (decode_mode)
|
||||
{
|
||||
case ASTCENC_PRF_LDR_SRGB:
|
||||
case ASTCENC_PRF_LDR:
|
||||
color = vfloat4(error_color_nan());
|
||||
break;
|
||||
case ASTCENC_PRF_HDR_RGB_LDR_A:
|
||||
case ASTCENC_PRF_HDR:
|
||||
// Constant-color block; unpack from FP16 to FP32.
|
||||
color = float16_to_float(vint4(scb.constant_color));
|
||||
use_lns = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < bsd.texel_count; i++)
|
||||
{
|
||||
blk.data_r[i] = color.lane<0>();
|
||||
blk.data_g[i] = color.lane<1>();
|
||||
blk.data_b[i] = color.lane<2>();
|
||||
blk.data_a[i] = color.lane<3>();
|
||||
blk.rgb_lns[i] = use_lns;
|
||||
blk.alpha_lns[i] = use_lns;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Get the appropriate partition-table entry
|
||||
int partition_count = scb.partition_count;
|
||||
const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
|
||||
|
||||
// Get the appropriate block descriptors
|
||||
const auto& bm = bsd.get_block_mode(scb.block_mode);
|
||||
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
|
||||
|
||||
bool is_dual_plane = static_cast<bool>(bm.is_dual_plane);
|
||||
|
||||
// Unquantize and undecimate the weights
|
||||
int plane1_weights[BLOCK_MAX_TEXELS];
|
||||
int plane2_weights[BLOCK_MAX_TEXELS];
|
||||
unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights);
|
||||
|
||||
// Now that we have endpoint colors and weights, we can unpack texel colors
|
||||
int plane2_component = scb.plane2_component;
|
||||
vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
|
||||
|
||||
vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
|
||||
|
||||
for (int i = 0; i < partition_count; i++)
|
||||
{
|
||||
// Decode the color endpoints for this partition
|
||||
vint4 ep0;
|
||||
vint4 ep1;
|
||||
bool rgb_lns;
|
||||
bool a_lns;
|
||||
|
||||
unpack_color_endpoints(decode_mode,
|
||||
scb.color_formats[i],
|
||||
scb.color_values[i],
|
||||
rgb_lns, a_lns,
|
||||
ep0, ep1);
|
||||
|
||||
vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);
|
||||
|
||||
int texel_count = pi.partition_texel_count[i];
|
||||
for (int j = 0; j < texel_count; j++)
|
||||
{
|
||||
int tix = pi.texels_of_partition[i][j];
|
||||
vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
|
||||
vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight);
|
||||
vfloat4 colorf = decode_texel(color, lns_mask);
|
||||
|
||||
blk.data_r[tix] = colorf.lane<0>();
|
||||
blk.data_g[tix] = colorf.lane<1>();
|
||||
blk.data_b[tix] = colorf.lane<2>();
|
||||
blk.data_a[tix] = colorf.lane<3>();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(ASTCENC_DECOMPRESS_ONLY)
|
||||
|
||||
/* See header for documentation. */
|
||||
float compute_symbolic_block_difference_2plane(
|
||||
const astcenc_config& config,
|
||||
const block_size_descriptor& bsd,
|
||||
const symbolic_compressed_block& scb,
|
||||
const image_block& blk
|
||||
) {
|
||||
// If we detected an error-block, blow up immediately.
|
||||
if (scb.block_type == SYM_BTYPE_ERROR)
|
||||
{
|
||||
return ERROR_CALC_DEFAULT;
|
||||
}
|
||||
|
||||
assert(scb.block_mode >= 0);
|
||||
assert(scb.partition_count == 1);
|
||||
assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1);
|
||||
|
||||
// Get the appropriate block descriptor
|
||||
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
|
||||
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
|
||||
|
||||
// Unquantize and undecimate the weights
|
||||
int plane1_weights[BLOCK_MAX_TEXELS];
|
||||
int plane2_weights[BLOCK_MAX_TEXELS];
|
||||
unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights);
|
||||
|
||||
vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component);
|
||||
|
||||
vfloat4 summa = vfloat4::zero();
|
||||
|
||||
// Decode the color endpoints for this partition
|
||||
vint4 ep0;
|
||||
vint4 ep1;
|
||||
bool rgb_lns;
|
||||
bool a_lns;
|
||||
|
||||
unpack_color_endpoints(config.profile,
|
||||
scb.color_formats[0],
|
||||
scb.color_values[0],
|
||||
rgb_lns, a_lns,
|
||||
ep0, ep1);
|
||||
|
||||
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
|
||||
|
||||
// Unpack and compute error for each texel in the partition
|
||||
unsigned int texel_count = bsd.texel_count;
|
||||
for (unsigned int i = 0; i < texel_count; i++)
|
||||
{
|
||||
vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
|
||||
vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);
|
||||
|
||||
vfloat4 color = int_to_float(colori);
|
||||
vfloat4 oldColor = blk.texel(i);
|
||||
|
||||
// Compare error using a perceptual decode metric for RGBM textures
|
||||
if (config.flags & ASTCENC_FLG_MAP_RGBM)
|
||||
{
|
||||
// Fail encodings that result in zero weight M pixels. Note that this can cause
|
||||
// "interesting" artifacts if we reject all useful encodings - we typically get max
|
||||
// brightness encodings instead which look just as bad. We recommend users apply a
|
||||
// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
|
||||
// getting small M values post-quantization, but we can't prove it would never
|
||||
// happen, especially at low bit rates ...
|
||||
if (color.lane<3>() == 0.0f)
|
||||
{
|
||||
return -ERROR_CALC_DEFAULT;
|
||||
}
|
||||
|
||||
// Compute error based on decoded RGBM color
|
||||
color = vfloat4(
|
||||
color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
|
||||
color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
|
||||
color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
|
||||
1.0f
|
||||
);
|
||||
|
||||
oldColor = vfloat4(
|
||||
oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
|
||||
oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
|
||||
oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
|
||||
1.0f
|
||||
);
|
||||
}
|
||||
|
||||
vfloat4 error = oldColor - color;
|
||||
error = min(abs(error), 1e15f);
|
||||
error = error * error;
|
||||
|
||||
summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
|
||||
}
|
||||
|
||||
return summa.lane<0>();
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
float compute_symbolic_block_difference_1plane(
|
||||
const astcenc_config& config,
|
||||
const block_size_descriptor& bsd,
|
||||
const symbolic_compressed_block& scb,
|
||||
const image_block& blk
|
||||
) {
|
||||
assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0);
|
||||
|
||||
// If we detected an error-block, blow up immediately.
|
||||
if (scb.block_type == SYM_BTYPE_ERROR)
|
||||
{
|
||||
return ERROR_CALC_DEFAULT;
|
||||
}
|
||||
|
||||
assert(scb.block_mode >= 0);
|
||||
|
||||
// Get the appropriate partition-table entry
|
||||
unsigned int partition_count = scb.partition_count;
|
||||
const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
|
||||
|
||||
// Get the appropriate block descriptor
|
||||
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
|
||||
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
|
||||
|
||||
// Unquantize and undecimate the weights
|
||||
int plane1_weights[BLOCK_MAX_TEXELS];
|
||||
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
|
||||
|
||||
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
|
||||
|
||||
vfloat4 summa = vfloat4::zero();
|
||||
for (unsigned int i = 0; i < partition_count; i++)
|
||||
{
|
||||
// Decode the color endpoints for this partition
|
||||
vint4 ep0;
|
||||
vint4 ep1;
|
||||
bool rgb_lns;
|
||||
bool a_lns;
|
||||
|
||||
unpack_color_endpoints(config.profile,
|
||||
scb.color_formats[i],
|
||||
scb.color_values[i],
|
||||
rgb_lns, a_lns,
|
||||
ep0, ep1);
|
||||
|
||||
// Unpack and compute error for each texel in the partition
|
||||
unsigned int texel_count = pi.partition_texel_count[i];
|
||||
for (unsigned int j = 0; j < texel_count; j++)
|
||||
{
|
||||
unsigned int tix = pi.texels_of_partition[i][j];
|
||||
vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
|
||||
vint4(plane1_weights[tix]));
|
||||
|
||||
vfloat4 color = int_to_float(colori);
|
||||
vfloat4 oldColor = blk.texel(tix);
|
||||
|
||||
// Compare error using a perceptual decode metric for RGBM textures
|
||||
if (config.flags & ASTCENC_FLG_MAP_RGBM)
|
||||
{
|
||||
// Fail encodings that result in zero weight M pixels. Note that this can cause
|
||||
// "interesting" artifacts if we reject all useful encodings - we typically get max
|
||||
// brightness encodings instead which look just as bad. We recommend users apply a
|
||||
// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
|
||||
// getting small M values post-quantization, but we can't prove it would never
|
||||
// happen, especially at low bit rates ...
|
||||
if (color.lane<3>() == 0.0f)
|
||||
{
|
||||
return -ERROR_CALC_DEFAULT;
|
||||
}
|
||||
|
||||
// Compute error based on decoded RGBM color
|
||||
color = vfloat4(
|
||||
color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
|
||||
color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
|
||||
color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
|
||||
1.0f
|
||||
);
|
||||
|
||||
oldColor = vfloat4(
|
||||
oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
|
||||
oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
|
||||
oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
|
||||
1.0f
|
||||
);
|
||||
}
|
||||
|
||||
vfloat4 error = oldColor - color;
|
||||
error = min(abs(error), 1e15f);
|
||||
error = error * error;
|
||||
|
||||
summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
|
||||
}
|
||||
}
|
||||
|
||||
return summa.lane<0>();
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
float compute_symbolic_block_difference_1plane_1partition(
|
||||
const astcenc_config& config,
|
||||
const block_size_descriptor& bsd,
|
||||
const symbolic_compressed_block& scb,
|
||||
const image_block& blk
|
||||
) {
|
||||
// If we detected an error-block, blow up immediately.
|
||||
if (scb.block_type == SYM_BTYPE_ERROR)
|
||||
{
|
||||
return ERROR_CALC_DEFAULT;
|
||||
}
|
||||
|
||||
assert(scb.block_mode >= 0);
|
||||
assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1);
|
||||
|
||||
// Get the appropriate block descriptor
|
||||
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
|
||||
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
|
||||
|
||||
// Unquantize and undecimate the weights
|
||||
ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS];
|
||||
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
|
||||
|
||||
// Decode the color endpoints for this partition
|
||||
vint4 ep0;
|
||||
vint4 ep1;
|
||||
bool rgb_lns;
|
||||
bool a_lns;
|
||||
|
||||
unpack_color_endpoints(config.profile,
|
||||
scb.color_formats[0],
|
||||
scb.color_values[0],
|
||||
rgb_lns, a_lns,
|
||||
ep0, ep1);
|
||||
|
||||
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
|
||||
|
||||
// Unpack and compute error for each texel in the partition
|
||||
vfloatacc summav = vfloatacc::zero();
|
||||
|
||||
vint lane_id = vint::lane_id();
|
||||
|
||||
unsigned int texel_count = bsd.texel_count;
|
||||
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
// Compute EP1 contribution
|
||||
vint weight1 = vint::loada(plane1_weights + i);
|
||||
vint ep1_r = vint(ep1.lane<0>()) * weight1;
|
||||
vint ep1_g = vint(ep1.lane<1>()) * weight1;
|
||||
vint ep1_b = vint(ep1.lane<2>()) * weight1;
|
||||
vint ep1_a = vint(ep1.lane<3>()) * weight1;
|
||||
|
||||
// Compute EP0 contribution
|
||||
vint weight0 = vint(64) - weight1;
|
||||
vint ep0_r = vint(ep0.lane<0>()) * weight0;
|
||||
vint ep0_g = vint(ep0.lane<1>()) * weight0;
|
||||
vint ep0_b = vint(ep0.lane<2>()) * weight0;
|
||||
vint ep0_a = vint(ep0.lane<3>()) * weight0;
|
||||
|
||||
// Combine contributions
|
||||
vint colori_r = asr<6>(ep0_r + ep1_r + vint(32));
|
||||
vint colori_g = asr<6>(ep0_g + ep1_g + vint(32));
|
||||
vint colori_b = asr<6>(ep0_b + ep1_b + vint(32));
|
||||
vint colori_a = asr<6>(ep0_a + ep1_a + vint(32));
|
||||
|
||||
// If using a U8 decode mode bit replicate top 8 bits
|
||||
// so rest of codec can assume 0xFFFF max range everywhere
|
||||
vint colori_r8 = asr<8>(colori_r) * vint(257);
|
||||
colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>()));
|
||||
|
||||
vint colori_g8 = asr<8>(colori_g) * vint(257);
|
||||
colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>()));
|
||||
|
||||
vint colori_b8 = asr<8>(colori_b) * vint(257);
|
||||
colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>()));
|
||||
|
||||
vint colori_a8 = asr<8>(colori_a) * vint(257);
|
||||
colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>()));
|
||||
|
||||
// Compute color diff
|
||||
vfloat color_r = int_to_float(colori_r);
|
||||
vfloat color_g = int_to_float(colori_g);
|
||||
vfloat color_b = int_to_float(colori_b);
|
||||
vfloat color_a = int_to_float(colori_a);
|
||||
|
||||
vfloat color_orig_r = loada(blk.data_r + i);
|
||||
vfloat color_orig_g = loada(blk.data_g + i);
|
||||
vfloat color_orig_b = loada(blk.data_b + i);
|
||||
vfloat color_orig_a = loada(blk.data_a + i);
|
||||
|
||||
vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f));
|
||||
vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f));
|
||||
vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f));
|
||||
vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f));
|
||||
|
||||
// Compute squared error metric
|
||||
color_error_r = color_error_r * color_error_r;
|
||||
color_error_g = color_error_g * color_error_g;
|
||||
color_error_b = color_error_b * color_error_b;
|
||||
color_error_a = color_error_a * color_error_a;
|
||||
|
||||
vfloat metric = color_error_r * blk.channel_weight.lane<0>()
|
||||
+ color_error_g * blk.channel_weight.lane<1>()
|
||||
+ color_error_b * blk.channel_weight.lane<2>()
|
||||
+ color_error_a * blk.channel_weight.lane<3>();
|
||||
|
||||
// Mask off bad lanes
|
||||
vmask mask = lane_id < vint(texel_count);
|
||||
lane_id += vint(ASTCENC_SIMD_WIDTH);
|
||||
haccumulate(summav, metric, mask);
|
||||
}
|
||||
|
||||
return hadd_s(summav);
|
||||
}
|
||||
|
||||
#endif
|
Reference in New Issue
Block a user