initial commit, 4.5 stable
Some checks failed
🔗 GHA / 📊 Static checks (push) Has been cancelled
🔗 GHA / 🤖 Android (push) Has been cancelled
🔗 GHA / 🍏 iOS (push) Has been cancelled
🔗 GHA / 🐧 Linux (push) Has been cancelled
🔗 GHA / 🍎 macOS (push) Has been cancelled
🔗 GHA / 🏁 Windows (push) Has been cancelled
🔗 GHA / 🌐 Web (push) Has been cancelled

This commit is contained in:
2025-09-16 20:46:46 -04:00
commit 9d30169a8d
13378 changed files with 7050105 additions and 0 deletions

575
thirdparty/libwebp/sharpyuv/sharpyuv.c vendored Normal file
View File

@@ -0,0 +1,575 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Sharp RGB to YUV conversion.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "sharpyuv/sharpyuv.h"
#include <assert.h>
#include <limits.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include "src/webp/types.h"
#include "sharpyuv/sharpyuv_cpu.h"
#include "sharpyuv/sharpyuv_dsp.h"
#include "sharpyuv/sharpyuv_gamma.h"
//------------------------------------------------------------------------------
int SharpYuvGetVersion(void) {
return SHARPYUV_VERSION;
}
//------------------------------------------------------------------------------
// Sharp RGB->YUV conversion
static const int kNumIterations = 4;
#define YUV_FIX 16 // fixed-point precision for RGB->YUV
static const int kYuvHalf = 1 << (YUV_FIX - 1);
// Max bit depth so that intermediate calculations fit in 16 bits.
static const int kMaxBitDepth = 14;
// Returns the precision shift to use based on the input rgb_bit_depth.
static int GetPrecisionShift(int rgb_bit_depth) {
// Try to add 2 bits of precision if it fits in kMaxBitDepth. Otherwise remove
// bits if needed.
return ((rgb_bit_depth + 2) <= kMaxBitDepth) ? 2
: (kMaxBitDepth - rgb_bit_depth);
}
typedef int16_t fixed_t; // signed type with extra precision for UV
typedef uint16_t fixed_y_t; // unsigned type with extra precision for W
//------------------------------------------------------------------------------
static uint8_t clip_8b(fixed_t v) {
return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
}
static uint16_t clip(fixed_t v, int max) {
return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
}
static fixed_y_t clip_bit_depth(int y, int bit_depth) {
const int max = (1 << bit_depth) - 1;
return (!(y & ~max)) ? (fixed_y_t)y : (y < 0) ? 0 : max;
}
//------------------------------------------------------------------------------
static int RGBToGray(int64_t r, int64_t g, int64_t b) {
const int64_t luma = 13933 * r + 46871 * g + 4732 * b + kYuvHalf;
return (int)(luma >> YUV_FIX);
}
static uint32_t ScaleDown(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
int rgb_bit_depth,
SharpYuvTransferFunctionType transfer_type) {
const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
const uint32_t A = SharpYuvGammaToLinear(a, bit_depth, transfer_type);
const uint32_t B = SharpYuvGammaToLinear(b, bit_depth, transfer_type);
const uint32_t C = SharpYuvGammaToLinear(c, bit_depth, transfer_type);
const uint32_t D = SharpYuvGammaToLinear(d, bit_depth, transfer_type);
return SharpYuvLinearToGamma((A + B + C + D + 2) >> 2, bit_depth,
transfer_type);
}
static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w,
int rgb_bit_depth,
SharpYuvTransferFunctionType transfer_type) {
const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
int i = 0;
do {
const uint32_t R =
SharpYuvGammaToLinear(src[0 * w + i], bit_depth, transfer_type);
const uint32_t G =
SharpYuvGammaToLinear(src[1 * w + i], bit_depth, transfer_type);
const uint32_t B =
SharpYuvGammaToLinear(src[2 * w + i], bit_depth, transfer_type);
const uint32_t Y = RGBToGray(R, G, B);
dst[i] = (fixed_y_t)SharpYuvLinearToGamma(Y, bit_depth, transfer_type);
} while (++i < w);
}
static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
fixed_t* dst, int uv_w, int rgb_bit_depth,
SharpYuvTransferFunctionType transfer_type) {
int i = 0;
do {
const int r =
ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1], src2[0 * uv_w + 0],
src2[0 * uv_w + 1], rgb_bit_depth, transfer_type);
const int g =
ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1], src2[2 * uv_w + 0],
src2[2 * uv_w + 1], rgb_bit_depth, transfer_type);
const int b =
ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1], src2[4 * uv_w + 0],
src2[4 * uv_w + 1], rgb_bit_depth, transfer_type);
const int W = RGBToGray(r, g, b);
dst[0 * uv_w] = (fixed_t)(r - W);
dst[1 * uv_w] = (fixed_t)(g - W);
dst[2 * uv_w] = (fixed_t)(b - W);
dst += 1;
src1 += 2;
src2 += 2;
} while (++i < uv_w);
}
static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
int i = 0;
assert(w > 0);
do {
y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
} while (++i < w);
}
//------------------------------------------------------------------------------
static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0, int bit_depth) {
const int v0 = (A * 3 + B + 2) >> 2;
return clip_bit_depth(v0 + W0, bit_depth);
}
//------------------------------------------------------------------------------
static WEBP_INLINE int Shift(int v, int shift) {
return (shift >= 0) ? (v << shift) : (v >> -shift);
}
static void ImportOneRow(const uint8_t* const r_ptr,
const uint8_t* const g_ptr,
const uint8_t* const b_ptr,
int rgb_step,
int rgb_bit_depth,
int pic_width,
fixed_y_t* const dst) {
// Convert the rgb_step from a number of bytes to a number of uint8_t or
// uint16_t values depending the bit depth.
const int step = (rgb_bit_depth > 8) ? rgb_step / 2 : rgb_step;
int i = 0;
const int w = (pic_width + 1) & ~1;
do {
const int off = i * step;
const int shift = GetPrecisionShift(rgb_bit_depth);
if (rgb_bit_depth == 8) {
dst[i + 0 * w] = Shift(r_ptr[off], shift);
dst[i + 1 * w] = Shift(g_ptr[off], shift);
dst[i + 2 * w] = Shift(b_ptr[off], shift);
} else {
dst[i + 0 * w] = Shift(((uint16_t*)r_ptr)[off], shift);
dst[i + 1 * w] = Shift(((uint16_t*)g_ptr)[off], shift);
dst[i + 2 * w] = Shift(((uint16_t*)b_ptr)[off], shift);
}
} while (++i < pic_width);
if (pic_width & 1) { // replicate rightmost pixel
dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
}
}
static void InterpolateTwoRows(const fixed_y_t* const best_y,
const fixed_t* prev_uv,
const fixed_t* cur_uv,
const fixed_t* next_uv,
int w,
fixed_y_t* out1,
fixed_y_t* out2,
int rgb_bit_depth) {
const int uv_w = w >> 1;
const int len = (w - 1) >> 1; // length to filter
int k = 3;
const int bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
while (k-- > 0) { // process each R/G/B segments in turn
// special boundary case for i==0
out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0], bit_depth);
out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w], bit_depth);
SharpYuvFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1,
bit_depth);
SharpYuvFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1,
bit_depth);
// special boundary case for i == w - 1 when w is even
if (!(w & 1)) {
out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
best_y[w - 1 + 0], bit_depth);
out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
best_y[w - 1 + w], bit_depth);
}
out1 += w;
out2 += w;
prev_uv += uv_w;
cur_uv += uv_w;
next_uv += uv_w;
}
}
static WEBP_INLINE int RGBToYUVComponent(int r, int g, int b,
const int coeffs[4], int sfix) {
const int srounder = 1 << (YUV_FIX + sfix - 1);
const int luma = coeffs[0] * r + coeffs[1] * g + coeffs[2] * b +
coeffs[3] + srounder;
return (luma >> (YUV_FIX + sfix));
}
static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
uint8_t* y_ptr, int y_stride, uint8_t* u_ptr,
int u_stride, uint8_t* v_ptr, int v_stride,
int rgb_bit_depth,
int yuv_bit_depth, int width, int height,
const SharpYuvConversionMatrix* yuv_matrix) {
int i, j;
const fixed_t* const best_uv_base = best_uv;
const int w = (width + 1) & ~1;
const int h = (height + 1) & ~1;
const int uv_w = w >> 1;
const int uv_h = h >> 1;
const int sfix = GetPrecisionShift(rgb_bit_depth);
const int yuv_max = (1 << yuv_bit_depth) - 1;
best_uv = best_uv_base;
j = 0;
do {
i = 0;
do {
const int off = (i >> 1);
const int W = best_y[i];
const int r = best_uv[off + 0 * uv_w] + W;
const int g = best_uv[off + 1 * uv_w] + W;
const int b = best_uv[off + 2 * uv_w] + W;
const int y = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_y, sfix);
if (yuv_bit_depth <= 8) {
y_ptr[i] = clip_8b(y);
} else {
((uint16_t*)y_ptr)[i] = clip(y, yuv_max);
}
} while (++i < width);
best_y += w;
best_uv += (j & 1) * 3 * uv_w;
y_ptr += y_stride;
} while (++j < height);
best_uv = best_uv_base;
j = 0;
do {
i = 0;
do {
// Note r, g and b values here are off by W, but a constant offset on all
// 3 components doesn't change the value of u and v with a YCbCr matrix.
const int r = best_uv[i + 0 * uv_w];
const int g = best_uv[i + 1 * uv_w];
const int b = best_uv[i + 2 * uv_w];
const int u = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_u, sfix);
const int v = RGBToYUVComponent(r, g, b, yuv_matrix->rgb_to_v, sfix);
if (yuv_bit_depth <= 8) {
u_ptr[i] = clip_8b(u);
v_ptr[i] = clip_8b(v);
} else {
((uint16_t*)u_ptr)[i] = clip(u, yuv_max);
((uint16_t*)v_ptr)[i] = clip(v, yuv_max);
}
} while (++i < uv_w);
best_uv += 3 * uv_w;
u_ptr += u_stride;
v_ptr += v_stride;
} while (++j < uv_h);
return 1;
}
//------------------------------------------------------------------------------
// Main function
static void* SafeMalloc(uint64_t nmemb, size_t size) {
const uint64_t total_size = nmemb * (uint64_t)size;
if (total_size != (size_t)total_size) return NULL;
return malloc((size_t)total_size);
}
#define SAFE_ALLOC(W, H, T) ((T*)SafeMalloc((uint64_t)(W) * (H), sizeof(T)))
static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
const uint8_t* b_ptr, int rgb_step, int rgb_stride,
int rgb_bit_depth, uint8_t* y_ptr, int y_stride,
uint8_t* u_ptr, int u_stride, uint8_t* v_ptr,
int v_stride, int yuv_bit_depth, int width,
int height,
const SharpYuvConversionMatrix* yuv_matrix,
SharpYuvTransferFunctionType transfer_type) {
// we expand the right/bottom border if needed
const int w = (width + 1) & ~1;
const int h = (height + 1) & ~1;
const int uv_w = w >> 1;
const int uv_h = h >> 1;
const int y_bit_depth = rgb_bit_depth + GetPrecisionShift(rgb_bit_depth);
uint64_t prev_diff_y_sum = ~0;
int j, iter;
// TODO(skal): allocate one big memory chunk. But for now, it's easier
// for valgrind debugging to have several chunks.
fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t); // scratch
fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
fixed_y_t* best_y = best_y_base;
fixed_y_t* target_y = target_y_base;
fixed_t* best_uv = best_uv_base;
fixed_t* target_uv = target_uv_base;
const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
int ok;
assert(w > 0);
assert(h > 0);
if (best_y_base == NULL || best_uv_base == NULL ||
target_y_base == NULL || target_uv_base == NULL ||
best_rgb_y == NULL || best_rgb_uv == NULL ||
tmp_buffer == NULL) {
ok = 0;
goto End;
}
// Import RGB samples to W/RGB representation.
for (j = 0; j < height; j += 2) {
const int is_last_row = (j == height - 1);
fixed_y_t* const src1 = tmp_buffer + 0 * w;
fixed_y_t* const src2 = tmp_buffer + 3 * w;
// prepare two rows of input
ImportOneRow(r_ptr, g_ptr, b_ptr, rgb_step, rgb_bit_depth, width,
src1);
if (!is_last_row) {
ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
rgb_step, rgb_bit_depth, width, src2);
} else {
memcpy(src2, src1, 3 * w * sizeof(*src2));
}
StoreGray(src1, best_y + 0, w);
StoreGray(src2, best_y + w, w);
UpdateW(src1, target_y, w, rgb_bit_depth, transfer_type);
UpdateW(src2, target_y + w, w, rgb_bit_depth, transfer_type);
UpdateChroma(src1, src2, target_uv, uv_w, rgb_bit_depth, transfer_type);
memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
best_y += 2 * w;
best_uv += 3 * uv_w;
target_y += 2 * w;
target_uv += 3 * uv_w;
r_ptr += 2 * rgb_stride;
g_ptr += 2 * rgb_stride;
b_ptr += 2 * rgb_stride;
}
// Iterate and resolve clipping conflicts.
for (iter = 0; iter < kNumIterations; ++iter) {
const fixed_t* cur_uv = best_uv_base;
const fixed_t* prev_uv = best_uv_base;
uint64_t diff_y_sum = 0;
best_y = best_y_base;
best_uv = best_uv_base;
target_y = target_y_base;
target_uv = target_uv_base;
j = 0;
do {
fixed_y_t* const src1 = tmp_buffer + 0 * w;
fixed_y_t* const src2 = tmp_buffer + 3 * w;
{
const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w,
src1, src2, rgb_bit_depth);
prev_uv = cur_uv;
cur_uv = next_uv;
}
UpdateW(src1, best_rgb_y + 0 * w, w, rgb_bit_depth, transfer_type);
UpdateW(src2, best_rgb_y + 1 * w, w, rgb_bit_depth, transfer_type);
UpdateChroma(src1, src2, best_rgb_uv, uv_w, rgb_bit_depth, transfer_type);
// update two rows of Y and one row of RGB
diff_y_sum +=
SharpYuvUpdateY(target_y, best_rgb_y, best_y, 2 * w, y_bit_depth);
SharpYuvUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);
best_y += 2 * w;
best_uv += 3 * uv_w;
target_y += 2 * w;
target_uv += 3 * uv_w;
j += 2;
} while (j < h);
// test exit condition
if (iter > 0) {
if (diff_y_sum < diff_y_threshold) break;
if (diff_y_sum > prev_diff_y_sum) break;
}
prev_diff_y_sum = diff_y_sum;
}
// final reconstruction
ok = ConvertWRGBToYUV(best_y_base, best_uv_base, y_ptr, y_stride, u_ptr,
u_stride, v_ptr, v_stride, rgb_bit_depth, yuv_bit_depth,
width, height, yuv_matrix);
End:
free(best_y_base);
free(best_uv_base);
free(target_y_base);
free(target_uv_base);
free(best_rgb_y);
free(best_rgb_uv);
free(tmp_buffer);
return ok;
}
#undef SAFE_ALLOC
#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
#include <pthread.h> // NOLINT
#define LOCK_ACCESS \
static pthread_mutex_t sharpyuv_lock = PTHREAD_MUTEX_INITIALIZER; \
if (pthread_mutex_lock(&sharpyuv_lock)) return
#define UNLOCK_ACCESS_AND_RETURN \
do { \
(void)pthread_mutex_unlock(&sharpyuv_lock); \
return; \
} while (0)
#else // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
#define LOCK_ACCESS do {} while (0)
#define UNLOCK_ACCESS_AND_RETURN return
#endif // defined(WEBP_USE_THREAD) && !defined(_WIN32)
// Hidden exported init function.
// By default SharpYuvConvert calls it with SharpYuvGetCPUInfo. If needed,
// users can declare it as extern and call it with an alternate VP8CPUInfo
// function.
extern VP8CPUInfo SharpYuvGetCPUInfo;
SHARPYUV_EXTERN void SharpYuvInit(VP8CPUInfo cpu_info_func);
void SharpYuvInit(VP8CPUInfo cpu_info_func) {
static volatile VP8CPUInfo sharpyuv_last_cpuinfo_used =
(VP8CPUInfo)&sharpyuv_last_cpuinfo_used;
LOCK_ACCESS;
// Only update SharpYuvGetCPUInfo when called from external code to avoid a
// race on reading the value in SharpYuvConvert().
if (cpu_info_func != (VP8CPUInfo)&SharpYuvGetCPUInfo) {
SharpYuvGetCPUInfo = cpu_info_func;
}
if (sharpyuv_last_cpuinfo_used == SharpYuvGetCPUInfo) {
UNLOCK_ACCESS_AND_RETURN;
}
SharpYuvInitDsp();
SharpYuvInitGammaTables();
sharpyuv_last_cpuinfo_used = SharpYuvGetCPUInfo;
UNLOCK_ACCESS_AND_RETURN;
}
int SharpYuvConvert(const void* r_ptr, const void* g_ptr, const void* b_ptr,
int rgb_step, int rgb_stride, int rgb_bit_depth,
void* y_ptr, int y_stride, void* u_ptr, int u_stride,
void* v_ptr, int v_stride, int yuv_bit_depth, int width,
int height, const SharpYuvConversionMatrix* yuv_matrix) {
SharpYuvOptions options;
options.yuv_matrix = yuv_matrix;
options.transfer_type = kSharpYuvTransferFunctionSrgb;
return SharpYuvConvertWithOptions(
r_ptr, g_ptr, b_ptr, rgb_step, rgb_stride, rgb_bit_depth, y_ptr, y_stride,
u_ptr, u_stride, v_ptr, v_stride, yuv_bit_depth, width, height, &options);
}
int SharpYuvOptionsInitInternal(const SharpYuvConversionMatrix* yuv_matrix,
SharpYuvOptions* options, int version) {
const int major = (version >> 24);
const int minor = (version >> 16) & 0xff;
if (options == NULL || yuv_matrix == NULL ||
(major == SHARPYUV_VERSION_MAJOR && major == 0 &&
minor != SHARPYUV_VERSION_MINOR) ||
(major != SHARPYUV_VERSION_MAJOR)) {
return 0;
}
options->yuv_matrix = yuv_matrix;
options->transfer_type = kSharpYuvTransferFunctionSrgb;
return 1;
}
int SharpYuvConvertWithOptions(const void* r_ptr, const void* g_ptr,
const void* b_ptr, int rgb_step, int rgb_stride,
int rgb_bit_depth, void* y_ptr, int y_stride,
void* u_ptr, int u_stride, void* v_ptr,
int v_stride, int yuv_bit_depth, int width,
int height, const SharpYuvOptions* options) {
const SharpYuvConversionMatrix* yuv_matrix = options->yuv_matrix;
SharpYuvTransferFunctionType transfer_type = options->transfer_type;
SharpYuvConversionMatrix scaled_matrix;
const int rgb_max = (1 << rgb_bit_depth) - 1;
const int rgb_round = 1 << (rgb_bit_depth - 1);
const int yuv_max = (1 << yuv_bit_depth) - 1;
const int sfix = GetPrecisionShift(rgb_bit_depth);
if (width < 1 || height < 1 || width == INT_MAX || height == INT_MAX ||
r_ptr == NULL || g_ptr == NULL || b_ptr == NULL || y_ptr == NULL ||
u_ptr == NULL || v_ptr == NULL) {
return 0;
}
if (rgb_bit_depth != 8 && rgb_bit_depth != 10 && rgb_bit_depth != 12 &&
rgb_bit_depth != 16) {
return 0;
}
if (yuv_bit_depth != 8 && yuv_bit_depth != 10 && yuv_bit_depth != 12) {
return 0;
}
if (rgb_bit_depth > 8 && (rgb_step % 2 != 0 || rgb_stride % 2 != 0)) {
// Step/stride should be even for uint16_t buffers.
return 0;
}
if (yuv_bit_depth > 8 &&
(y_stride % 2 != 0 || u_stride % 2 != 0 || v_stride % 2 != 0)) {
// Stride should be even for uint16_t buffers.
return 0;
}
// The address of the function pointer is used to avoid a read race.
SharpYuvInit((VP8CPUInfo)&SharpYuvGetCPUInfo);
// Add scaling factor to go from rgb_bit_depth to yuv_bit_depth, to the
// rgb->yuv conversion matrix.
if (rgb_bit_depth == yuv_bit_depth) {
memcpy(&scaled_matrix, yuv_matrix, sizeof(scaled_matrix));
} else {
int i;
for (i = 0; i < 3; ++i) {
scaled_matrix.rgb_to_y[i] =
(yuv_matrix->rgb_to_y[i] * yuv_max + rgb_round) / rgb_max;
scaled_matrix.rgb_to_u[i] =
(yuv_matrix->rgb_to_u[i] * yuv_max + rgb_round) / rgb_max;
scaled_matrix.rgb_to_v[i] =
(yuv_matrix->rgb_to_v[i] * yuv_max + rgb_round) / rgb_max;
}
}
// Also incorporate precision change scaling.
scaled_matrix.rgb_to_y[3] = Shift(yuv_matrix->rgb_to_y[3], sfix);
scaled_matrix.rgb_to_u[3] = Shift(yuv_matrix->rgb_to_u[3], sfix);
scaled_matrix.rgb_to_v[3] = Shift(yuv_matrix->rgb_to_v[3], sfix);
return DoSharpArgbToYuv(
(const uint8_t*)r_ptr, (const uint8_t*)g_ptr, (const uint8_t*)b_ptr,
rgb_step, rgb_stride, rgb_bit_depth, (uint8_t*)y_ptr, y_stride,
(uint8_t*)u_ptr, u_stride, (uint8_t*)v_ptr, v_stride, yuv_bit_depth,
width, height, &scaled_matrix, transfer_type);
}
//------------------------------------------------------------------------------

181
thirdparty/libwebp/sharpyuv/sharpyuv.h vendored Normal file
View File

@@ -0,0 +1,181 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Sharp RGB to YUV conversion.
#ifndef WEBP_SHARPYUV_SHARPYUV_H_
#define WEBP_SHARPYUV_SHARPYUV_H_
#ifdef __cplusplus
extern "C" {
#endif
#ifndef SHARPYUV_EXTERN
#ifdef WEBP_EXTERN
#define SHARPYUV_EXTERN WEBP_EXTERN
#else
// This explicitly marks library functions and allows for changing the
// signature for e.g., Windows DLL builds.
#if defined(_WIN32) && defined(WEBP_DLL)
#define SHARPYUV_EXTERN __declspec(dllexport)
#elif defined(__GNUC__) && __GNUC__ >= 4
#define SHARPYUV_EXTERN extern __attribute__((visibility("default")))
#else
#define SHARPYUV_EXTERN extern
#endif /* defined(_WIN32) && defined(WEBP_DLL) */
#endif /* WEBP_EXTERN */
#endif /* SHARPYUV_EXTERN */
#ifndef SHARPYUV_INLINE
#ifdef WEBP_INLINE
#define SHARPYUV_INLINE WEBP_INLINE
#else
#ifndef _MSC_VER
#if defined(__cplusplus) || !defined(__STRICT_ANSI__) || \
(defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
#define SHARPYUV_INLINE inline
#else
#define SHARPYUV_INLINE
#endif
#else
#define SHARPYUV_INLINE __forceinline
#endif /* _MSC_VER */
#endif /* WEBP_INLINE */
#endif /* SHARPYUV_INLINE */
// SharpYUV API version following the convention from semver.org
#define SHARPYUV_VERSION_MAJOR 0
#define SHARPYUV_VERSION_MINOR 4
#define SHARPYUV_VERSION_PATCH 1
// Version as a uint32_t. The major number is the high 8 bits.
// The minor number is the middle 8 bits. The patch number is the low 16 bits.
#define SHARPYUV_MAKE_VERSION(MAJOR, MINOR, PATCH) \
(((MAJOR) << 24) | ((MINOR) << 16) | (PATCH))
#define SHARPYUV_VERSION \
SHARPYUV_MAKE_VERSION(SHARPYUV_VERSION_MAJOR, SHARPYUV_VERSION_MINOR, \
SHARPYUV_VERSION_PATCH)
// Returns the library's version number, packed in hexadecimal. See
// SHARPYUV_VERSION.
SHARPYUV_EXTERN int SharpYuvGetVersion(void);
// RGB to YUV conversion matrix, in 16 bit fixed point.
// y_ = rgb_to_y[0] * r + rgb_to_y[1] * g + rgb_to_y[2] * b + rgb_to_y[3]
// u_ = rgb_to_u[0] * r + rgb_to_u[1] * g + rgb_to_u[2] * b + rgb_to_u[3]
// v_ = rgb_to_v[0] * r + rgb_to_v[1] * g + rgb_to_v[2] * b + rgb_to_v[3]
// Then the values are divided by 1<<16 and rounded.
// y = (y_ + (1 << 15)) >> 16
// u = (u_ + (1 << 15)) >> 16
// v = (v_ + (1 << 15)) >> 16
//
// Typically, the offset values rgb_to_y[3], rgb_to_u[3] and rgb_to_v[3] depend
// on the input's bit depth, e.g., rgb_to_u[3] = 1 << (rgb_bit_depth - 1 + 16).
// See also sharpyuv_csp.h to get a predefined matrix or generate a matrix.
typedef struct {
int rgb_to_y[4];
int rgb_to_u[4];
int rgb_to_v[4];
} SharpYuvConversionMatrix;
typedef struct SharpYuvOptions SharpYuvOptions;
// Enums for transfer functions, as defined in H.273,
// https://www.itu.int/rec/T-REC-H.273-202107-I/en
typedef enum SharpYuvTransferFunctionType {
// 0 is reserved
kSharpYuvTransferFunctionBt709 = 1,
// 2 is unspecified
// 3 is reserved
kSharpYuvTransferFunctionBt470M = 4,
kSharpYuvTransferFunctionBt470Bg = 5,
kSharpYuvTransferFunctionBt601 = 6,
kSharpYuvTransferFunctionSmpte240 = 7,
kSharpYuvTransferFunctionLinear = 8,
kSharpYuvTransferFunctionLog100 = 9,
kSharpYuvTransferFunctionLog100_Sqrt10 = 10,
kSharpYuvTransferFunctionIec61966 = 11,
kSharpYuvTransferFunctionBt1361 = 12,
kSharpYuvTransferFunctionSrgb = 13,
kSharpYuvTransferFunctionBt2020_10Bit = 14,
kSharpYuvTransferFunctionBt2020_12Bit = 15,
kSharpYuvTransferFunctionSmpte2084 = 16, // PQ
kSharpYuvTransferFunctionSmpte428 = 17,
kSharpYuvTransferFunctionHlg = 18,
kSharpYuvTransferFunctionNum
} SharpYuvTransferFunctionType;
// Converts RGB to YUV420 using a downsampling algorithm that minimizes
// artefacts caused by chroma subsampling.
// This is slower than standard downsampling (averaging of 4 UV values).
// Assumes that the image will be upsampled using a bilinear filter. If nearest
// neighbor is used instead, the upsampled image might look worse than with
// standard downsampling.
// r_ptr, g_ptr, b_ptr: pointers to the source r, g and b channels. Should point
// to uint8_t buffers if rgb_bit_depth is 8, or uint16_t buffers otherwise.
// rgb_step: distance in bytes between two horizontally adjacent pixels on the
// r, g and b channels. If rgb_bit_depth is > 8, it should be a
// multiple of 2.
// rgb_stride: distance in bytes between two vertically adjacent pixels on the
// r, g, and b channels. If rgb_bit_depth is > 8, it should be a
// multiple of 2.
// rgb_bit_depth: number of bits for each r/g/b value. One of: 8, 10, 12, 16.
// Note: 16 bit input is truncated to 14 bits before conversion to yuv.
// yuv_bit_depth: number of bits for each y/u/v value. One of: 8, 10, 12.
// y_ptr, u_ptr, v_ptr: pointers to the destination y, u and v channels. Should
// point to uint8_t buffers if yuv_bit_depth is 8, or uint16_t buffers
// otherwise.
// y_stride, u_stride, v_stride: distance in bytes between two vertically
// adjacent pixels on the y, u and v channels. If yuv_bit_depth > 8, they
// should be multiples of 2.
// width, height: width and height of the image in pixels
// yuv_matrix: RGB to YUV conversion matrix. The matrix values typically
// depend on the input's rgb_bit_depth.
// This function calls SharpYuvConvertWithOptions with a default transfer
// function of kSharpYuvTransferFunctionSrgb.
SHARPYUV_EXTERN int SharpYuvConvert(const void* r_ptr, const void* g_ptr,
const void* b_ptr, int rgb_step,
int rgb_stride, int rgb_bit_depth,
void* y_ptr, int y_stride, void* u_ptr,
int u_stride, void* v_ptr, int v_stride,
int yuv_bit_depth, int width, int height,
const SharpYuvConversionMatrix* yuv_matrix);
struct SharpYuvOptions {
// This matrix cannot be NULL and can be initialized by
// SharpYuvComputeConversionMatrix.
const SharpYuvConversionMatrix* yuv_matrix;
SharpYuvTransferFunctionType transfer_type;
};
// Internal, version-checked, entry point
SHARPYUV_EXTERN int SharpYuvOptionsInitInternal(const SharpYuvConversionMatrix*,
SharpYuvOptions*, int);
// Should always be called, to initialize a fresh SharpYuvOptions
// structure before modification. SharpYuvOptionsInit() must have succeeded
// before using the 'options' object.
static SHARPYUV_INLINE int SharpYuvOptionsInit(
const SharpYuvConversionMatrix* yuv_matrix, SharpYuvOptions* options) {
return SharpYuvOptionsInitInternal(yuv_matrix, options, SHARPYUV_VERSION);
}
SHARPYUV_EXTERN int SharpYuvConvertWithOptions(
const void* r_ptr, const void* g_ptr, const void* b_ptr, int rgb_step,
int rgb_stride, int rgb_bit_depth, void* y_ptr, int y_stride, void* u_ptr,
int u_stride, void* v_ptr, int v_stride, int yuv_bit_depth, int width,
int height, const SharpYuvOptions* options);
// TODO(b/194336375): Add YUV444 to YUV420 conversion. Maybe also add 422
// support (it's rarely used in practice, especially for images).
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_SHARPYUV_SHARPYUV_H_

View File

@@ -0,0 +1,14 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
#include "sharpyuv/sharpyuv_cpu.h"
// Include src/dsp/cpu.c to create SharpYuvGetCPUInfo from VP8GetCPUInfo. The
// function pointer is renamed in sharpyuv_cpu.h.
#include "src/dsp/cpu.c"

View File

@@ -0,0 +1,22 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
#ifndef WEBP_SHARPYUV_SHARPYUV_CPU_H_
#define WEBP_SHARPYUV_SHARPYUV_CPU_H_
#include "sharpyuv/sharpyuv.h"
// Avoid exporting SharpYuvGetCPUInfo in shared object / DLL builds.
// SharpYuvInit() replaces the use of the function pointer.
#undef WEBP_EXTERN
#define WEBP_EXTERN extern
#define VP8GetCPUInfo SharpYuvGetCPUInfo
#include "src/dsp/cpu.h"
#endif // WEBP_SHARPYUV_SHARPYUV_CPU_H_

View File

@@ -0,0 +1,114 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Colorspace utilities.
#include "sharpyuv/sharpyuv_csp.h"
#include <assert.h>
#include <math.h>
#include <stddef.h>
static int ToFixed16(float f) { return (int)floor(f * (1 << 16) + 0.5f); }
void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space,
SharpYuvConversionMatrix* matrix) {
const float kr = yuv_color_space->kr;
const float kb = yuv_color_space->kb;
const float kg = 1.0f - kr - kb;
const float cb = 0.5f / (1.0f - kb);
const float cr = 0.5f / (1.0f - kr);
const int shift = yuv_color_space->bit_depth - 8;
const float denom = (float)((1 << yuv_color_space->bit_depth) - 1);
float scale_y = 1.0f;
float add_y = 0.0f;
float scale_u = cb;
float scale_v = cr;
float add_uv = (float)(128 << shift);
assert(yuv_color_space->bit_depth >= 8);
if (yuv_color_space->range == kSharpYuvRangeLimited) {
scale_y *= (219 << shift) / denom;
scale_u *= (224 << shift) / denom;
scale_v *= (224 << shift) / denom;
add_y = (float)(16 << shift);
}
matrix->rgb_to_y[0] = ToFixed16(kr * scale_y);
matrix->rgb_to_y[1] = ToFixed16(kg * scale_y);
matrix->rgb_to_y[2] = ToFixed16(kb * scale_y);
matrix->rgb_to_y[3] = ToFixed16(add_y);
matrix->rgb_to_u[0] = ToFixed16(-kr * scale_u);
matrix->rgb_to_u[1] = ToFixed16(-kg * scale_u);
matrix->rgb_to_u[2] = ToFixed16((1 - kb) * scale_u);
matrix->rgb_to_u[3] = ToFixed16(add_uv);
matrix->rgb_to_v[0] = ToFixed16((1 - kr) * scale_v);
matrix->rgb_to_v[1] = ToFixed16(-kg * scale_v);
matrix->rgb_to_v[2] = ToFixed16(-kb * scale_v);
matrix->rgb_to_v[3] = ToFixed16(add_uv);
}
// Matrices are in YUV_FIX fixed point precision.
// WebP's matrix, similar but not identical to kRec601LimitedMatrix
// Derived using the following formulas:
// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
static const SharpYuvConversionMatrix kWebpMatrix = {
{16839, 33059, 6420, 16 << 16},
{-9719, -19081, 28800, 128 << 16},
{28800, -24116, -4684, 128 << 16},
};
// Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeLimited
static const SharpYuvConversionMatrix kRec601LimitedMatrix = {
{16829, 33039, 6416, 16 << 16},
{-9714, -19071, 28784, 128 << 16},
{28784, -24103, -4681, 128 << 16},
};
// Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeFull
static const SharpYuvConversionMatrix kRec601FullMatrix = {
{19595, 38470, 7471, 0},
{-11058, -21710, 32768, 128 << 16},
{32768, -27439, -5329, 128 << 16},
};
// Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeLimited
static const SharpYuvConversionMatrix kRec709LimitedMatrix = {
{11966, 40254, 4064, 16 << 16},
{-6596, -22189, 28784, 128 << 16},
{28784, -26145, -2639, 128 << 16},
};
// Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeFull
static const SharpYuvConversionMatrix kRec709FullMatrix = {
{13933, 46871, 4732, 0},
{-7509, -25259, 32768, 128 << 16},
{32768, -29763, -3005, 128 << 16},
};
const SharpYuvConversionMatrix* SharpYuvGetConversionMatrix(
SharpYuvMatrixType matrix_type) {
switch (matrix_type) {
case kSharpYuvMatrixWebp:
return &kWebpMatrix;
case kSharpYuvMatrixRec601Limited:
return &kRec601LimitedMatrix;
case kSharpYuvMatrixRec601Full:
return &kRec601FullMatrix;
case kSharpYuvMatrixRec709Limited:
return &kRec709LimitedMatrix;
case kSharpYuvMatrixRec709Full:
return &kRec709FullMatrix;
case kSharpYuvMatrixNum:
return NULL;
}
return NULL;
}

View File

@@ -0,0 +1,65 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Colorspace utilities.
#ifndef WEBP_SHARPYUV_SHARPYUV_CSP_H_
#define WEBP_SHARPYUV_SHARPYUV_CSP_H_
#include "sharpyuv/sharpyuv.h"
#ifdef __cplusplus
extern "C" {
#endif
// Range of YUV values.
typedef enum {
kSharpYuvRangeFull, // YUV values between [0;255] (for 8 bit)
kSharpYuvRangeLimited // Y in [16;235], YUV in [16;240] (for 8 bit)
} SharpYuvRange;
// Constants that define a YUV color space.
typedef struct {
// Kr and Kb are defined such that:
// Y = Kr * r + Kg * g + Kb * b where Kg = 1 - Kr - Kb.
float kr;
float kb;
int bit_depth; // 8, 10 or 12
SharpYuvRange range;
} SharpYuvColorSpace;
// Fills in 'matrix' for the given YUVColorSpace.
SHARPYUV_EXTERN void SharpYuvComputeConversionMatrix(
const SharpYuvColorSpace* yuv_color_space,
SharpYuvConversionMatrix* matrix);
// Enums for precomputed conversion matrices.
typedef enum {
// WebP's matrix, similar but not identical to kSharpYuvMatrixRec601Limited
kSharpYuvMatrixWebp = 0,
// Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeLimited
kSharpYuvMatrixRec601Limited,
// Kr=0.2990f Kb=0.1140f bit_depth=8 range=kSharpYuvRangeFull
kSharpYuvMatrixRec601Full,
// Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeLimited
kSharpYuvMatrixRec709Limited,
// Kr=0.2126f Kb=0.0722f bit_depth=8 range=kSharpYuvRangeFull
kSharpYuvMatrixRec709Full,
kSharpYuvMatrixNum
} SharpYuvMatrixType;
// Returns a pointer to a matrix for one of the predefined colorspaces.
SHARPYUV_EXTERN const SharpYuvConversionMatrix* SharpYuvGetConversionMatrix(
SharpYuvMatrixType matrix_type);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_SHARPYUV_SHARPYUV_CSP_H_

View File

@@ -0,0 +1,104 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical functions for Sharp YUV.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "sharpyuv/sharpyuv_dsp.h"
#include <assert.h>
#include <stdlib.h>
#include "sharpyuv/sharpyuv_cpu.h"
#include "src/webp/types.h"
//-----------------------------------------------------------------------------
#if !WEBP_NEON_OMIT_C_CODE
static uint16_t clip(int v, int max) {
return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
}
static uint64_t SharpYuvUpdateY_C(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len, int bit_depth) {
uint64_t diff = 0;
int i;
const int max_y = (1 << bit_depth) - 1;
for (i = 0; i < len; ++i) {
const int diff_y = ref[i] - src[i];
const int new_y = (int)dst[i] + diff_y;
dst[i] = clip(new_y, max_y);
diff += (uint64_t)abs(diff_y);
}
return diff;
}
static void SharpYuvUpdateRGB_C(const int16_t* ref, const int16_t* src,
int16_t* dst, int len) {
int i;
for (i = 0; i < len; ++i) {
const int diff_uv = ref[i] - src[i];
dst[i] += diff_uv;
}
}
static void SharpYuvFilterRow_C(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out,
int bit_depth) {
int i;
const int max_y = (1 << bit_depth) - 1;
for (i = 0; i < len; ++i, ++A, ++B) {
const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
out[2 * i + 0] = clip(best_y[2 * i + 0] + v0, max_y);
out[2 * i + 1] = clip(best_y[2 * i + 1] + v1, max_y);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
//-----------------------------------------------------------------------------
uint64_t (*SharpYuvUpdateY)(const uint16_t* src, const uint16_t* ref,
uint16_t* dst, int len, int bit_depth);
void (*SharpYuvUpdateRGB)(const int16_t* src, const int16_t* ref, int16_t* dst,
int len);
void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out, int bit_depth);
extern VP8CPUInfo SharpYuvGetCPUInfo;
extern void InitSharpYuvSSE2(void);
extern void InitSharpYuvNEON(void);
void SharpYuvInitDsp(void) {
#if !WEBP_NEON_OMIT_C_CODE
SharpYuvUpdateY = SharpYuvUpdateY_C;
SharpYuvUpdateRGB = SharpYuvUpdateRGB_C;
SharpYuvFilterRow = SharpYuvFilterRow_C;
#endif
if (SharpYuvGetCPUInfo != NULL) {
#if defined(WEBP_HAVE_SSE2)
if (SharpYuvGetCPUInfo(kSSE2)) {
InitSharpYuvSSE2();
}
#endif // WEBP_HAVE_SSE2
}
#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(SharpYuvGetCPUInfo != NULL && SharpYuvGetCPUInfo(kNEON))) {
InitSharpYuvNEON();
}
#endif // WEBP_HAVE_NEON
assert(SharpYuvUpdateY != NULL);
assert(SharpYuvUpdateRGB != NULL);
assert(SharpYuvFilterRow != NULL);
}

View File

@@ -0,0 +1,28 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical functions for Sharp YUV.
#ifndef WEBP_SHARPYUV_SHARPYUV_DSP_H_
#define WEBP_SHARPYUV_SHARPYUV_DSP_H_
#include "sharpyuv/sharpyuv_cpu.h"
#include "src/webp/types.h"
extern uint64_t (*SharpYuvUpdateY)(const uint16_t* src, const uint16_t* ref,
uint16_t* dst, int len, int bit_depth);
extern void (*SharpYuvUpdateRGB)(const int16_t* src, const int16_t* ref,
int16_t* dst, int len);
extern void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out,
int bit_depth);
void SharpYuvInitDsp(void);
#endif // WEBP_SHARPYUV_SHARPYUV_DSP_H_

View File

@@ -0,0 +1,423 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Gamma correction utilities.
#include "sharpyuv/sharpyuv_gamma.h"
#include <assert.h>
#include <float.h>
#include <math.h>
#include "src/webp/types.h"
// Gamma correction compensates loss of resolution during chroma subsampling.
// Size of pre-computed table for converting from gamma to linear.
#define GAMMA_TO_LINEAR_TAB_BITS 10
#define GAMMA_TO_LINEAR_TAB_SIZE (1 << GAMMA_TO_LINEAR_TAB_BITS)
static uint32_t kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE + 2];
#define LINEAR_TO_GAMMA_TAB_BITS 9
#define LINEAR_TO_GAMMA_TAB_SIZE (1 << LINEAR_TO_GAMMA_TAB_BITS)
static uint32_t kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE + 2];
#if defined(_MSC_VER)
static const double kGammaF = 2.222222222222222;
#else
static const double kGammaF = 1. / 0.45;
#endif
#define GAMMA_TO_LINEAR_BITS 16
static volatile int kGammaTablesSOk = 0;
void SharpYuvInitGammaTables(void) {
assert(GAMMA_TO_LINEAR_BITS <= 16);
if (!kGammaTablesSOk) {
int v;
const double a = 0.09929682680944;
const double thresh = 0.018053968510807;
const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
// Precompute gamma to linear table.
{
const double norm = 1. / GAMMA_TO_LINEAR_TAB_SIZE;
const double a_rec = 1. / (1. + a);
for (v = 0; v <= GAMMA_TO_LINEAR_TAB_SIZE; ++v) {
const double g = norm * v;
double value;
if (g <= thresh * 4.5) {
value = g / 4.5;
} else {
value = pow(a_rec * (g + a), kGammaF);
}
kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5);
}
// to prevent small rounding errors to cause read-overflow:
kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE + 1] =
kGammaToLinearTabS[GAMMA_TO_LINEAR_TAB_SIZE];
}
// Precompute linear to gamma table.
{
const double scale = 1. / LINEAR_TO_GAMMA_TAB_SIZE;
for (v = 0; v <= LINEAR_TO_GAMMA_TAB_SIZE; ++v) {
const double g = scale * v;
double value;
if (g <= thresh) {
value = 4.5 * g;
} else {
value = (1. + a) * pow(g, 1. / kGammaF) - a;
}
kLinearToGammaTabS[v] =
(uint32_t)(final_scale * value + 0.5);
}
// to prevent small rounding errors to cause read-overflow:
kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE + 1] =
kLinearToGammaTabS[LINEAR_TO_GAMMA_TAB_SIZE];
}
kGammaTablesSOk = 1;
}
}
static WEBP_INLINE int Shift(int v, int shift) {
return (shift >= 0) ? (v << shift) : (v >> -shift);
}
static WEBP_INLINE uint32_t FixedPointInterpolation(int v, uint32_t* tab,
int tab_pos_shift_right,
int tab_value_shift) {
const uint32_t tab_pos = Shift(v, -tab_pos_shift_right);
// fractional part, in 'tab_pos_shift' fixed-point precision
const uint32_t x = v - (tab_pos << tab_pos_shift_right); // fractional part
// v0 / v1 are in kGammaToLinearBits fixed-point precision (range [0..1])
const uint32_t v0 = Shift(tab[tab_pos + 0], tab_value_shift);
const uint32_t v1 = Shift(tab[tab_pos + 1], tab_value_shift);
// Final interpolation.
const uint32_t v2 = (v1 - v0) * x; // note: v1 >= v0.
const int half =
(tab_pos_shift_right > 0) ? 1 << (tab_pos_shift_right - 1) : 0;
const uint32_t result = v0 + ((v2 + half) >> tab_pos_shift_right);
return result;
}
static uint32_t ToLinearSrgb(uint16_t v, int bit_depth) {
const int shift = GAMMA_TO_LINEAR_TAB_BITS - bit_depth;
if (shift > 0) {
return kGammaToLinearTabS[v << shift];
}
return FixedPointInterpolation(v, kGammaToLinearTabS, -shift, 0);
}
static uint16_t FromLinearSrgb(uint32_t value, int bit_depth) {
return FixedPointInterpolation(
value, kLinearToGammaTabS,
(GAMMA_TO_LINEAR_BITS - LINEAR_TO_GAMMA_TAB_BITS),
bit_depth - GAMMA_TO_LINEAR_BITS);
}
////////////////////////////////////////////////////////////////////////////////
#define CLAMP(x, low, high) \
(((x) < (low)) ? (low) : (((high) < (x)) ? (high) : (x)))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
static WEBP_INLINE float Roundf(float x) {
if (x < 0)
return (float)ceil((double)(x - 0.5f));
else
return (float)floor((double)(x + 0.5f));
}
static WEBP_INLINE float Powf(float base, float exp) {
return (float)pow((double)base, (double)exp);
}
static WEBP_INLINE float Log10f(float x) { return (float)log10((double)x); }
static float ToLinear709(float gamma) {
if (gamma < 0.f) {
return 0.f;
} else if (gamma < 4.5f * 0.018053968510807f) {
return gamma / 4.5f;
} else if (gamma < 1.f) {
return Powf((gamma + 0.09929682680944f) / 1.09929682680944f, 1.f / 0.45f);
}
return 1.f;
}
static float FromLinear709(float linear) {
if (linear < 0.f) {
return 0.f;
} else if (linear < 0.018053968510807f) {
return linear * 4.5f;
} else if (linear < 1.f) {
return 1.09929682680944f * Powf(linear, 0.45f) - 0.09929682680944f;
}
return 1.f;
}
static float ToLinear470M(float gamma) {
return Powf(CLAMP(gamma, 0.f, 1.f), 2.2f);
}
static float FromLinear470M(float linear) {
return Powf(CLAMP(linear, 0.f, 1.f), 1.f / 2.2f);
}
static float ToLinear470Bg(float gamma) {
return Powf(CLAMP(gamma, 0.f, 1.f), 2.8f);
}
static float FromLinear470Bg(float linear) {
return Powf(CLAMP(linear, 0.f, 1.f), 1.f / 2.8f);
}
static float ToLinearSmpte240(float gamma) {
if (gamma < 0.f) {
return 0.f;
} else if (gamma < 4.f * 0.022821585529445f) {
return gamma / 4.f;
} else if (gamma < 1.f) {
return Powf((gamma + 0.111572195921731f) / 1.111572195921731f, 1.f / 0.45f);
}
return 1.f;
}
static float FromLinearSmpte240(float linear) {
if (linear < 0.f) {
return 0.f;
} else if (linear < 0.022821585529445f) {
return linear * 4.f;
} else if (linear < 1.f) {
return 1.111572195921731f * Powf(linear, 0.45f) - 0.111572195921731f;
}
return 1.f;
}
static float ToLinearLog100(float gamma) {
// The function is non-bijective so choose the middle of [0, 0.01].
const float mid_interval = 0.01f / 2.f;
return (gamma <= 0.0f) ? mid_interval
: Powf(10.0f, 2.f * (MIN(gamma, 1.f) - 1.0f));
}
static float FromLinearLog100(float linear) {
return (linear < 0.01f) ? 0.0f : 1.0f + Log10f(MIN(linear, 1.f)) / 2.0f;
}
static float ToLinearLog100Sqrt10(float gamma) {
// The function is non-bijective so choose the middle of [0, 0.00316227766f[.
const float mid_interval = 0.00316227766f / 2.f;
return (gamma <= 0.0f) ? mid_interval
: Powf(10.0f, 2.5f * (MIN(gamma, 1.f) - 1.0f));
}
static float FromLinearLog100Sqrt10(float linear) {
return (linear < 0.00316227766f) ? 0.0f
: 1.0f + Log10f(MIN(linear, 1.f)) / 2.5f;
}
static float ToLinearIec61966(float gamma) {
if (gamma <= -4.5f * 0.018053968510807f) {
return Powf((-gamma + 0.09929682680944f) / -1.09929682680944f, 1.f / 0.45f);
} else if (gamma < 4.5f * 0.018053968510807f) {
return gamma / 4.5f;
}
return Powf((gamma + 0.09929682680944f) / 1.09929682680944f, 1.f / 0.45f);
}
static float FromLinearIec61966(float linear) {
if (linear <= -0.018053968510807f) {
return -1.09929682680944f * Powf(-linear, 0.45f) + 0.09929682680944f;
} else if (linear < 0.018053968510807f) {
return linear * 4.5f;
}
return 1.09929682680944f * Powf(linear, 0.45f) - 0.09929682680944f;
}
static float ToLinearBt1361(float gamma) {
if (gamma < -0.25f) {
return -0.25f;
} else if (gamma < 0.f) {
return Powf((gamma - 0.02482420670236f) / -0.27482420670236f, 1.f / 0.45f) /
-4.f;
} else if (gamma < 4.5f * 0.018053968510807f) {
return gamma / 4.5f;
} else if (gamma < 1.f) {
return Powf((gamma + 0.09929682680944f) / 1.09929682680944f, 1.f / 0.45f);
}
return 1.f;
}
static float FromLinearBt1361(float linear) {
if (linear < -0.25f) {
return -0.25f;
} else if (linear < 0.f) {
return -0.27482420670236f * Powf(-4.f * linear, 0.45f) + 0.02482420670236f;
} else if (linear < 0.018053968510807f) {
return linear * 4.5f;
} else if (linear < 1.f) {
return 1.09929682680944f * Powf(linear, 0.45f) - 0.09929682680944f;
}
return 1.f;
}
static float ToLinearPq(float gamma) {
if (gamma > 0.f) {
const float pow_gamma = Powf(gamma, 32.f / 2523.f);
const float num = MAX(pow_gamma - 107.f / 128.f, 0.0f);
const float den = MAX(2413.f / 128.f - 2392.f / 128.f * pow_gamma, FLT_MIN);
return Powf(num / den, 4096.f / 653.f);
}
return 0.f;
}
static float FromLinearPq(float linear) {
if (linear > 0.f) {
const float pow_linear = Powf(linear, 653.f / 4096.f);
const float num = 107.f / 128.f + 2413.f / 128.f * pow_linear;
const float den = 1.0f + 2392.f / 128.f * pow_linear;
return Powf(num / den, 2523.f / 32.f);
}
return 0.f;
}
static float ToLinearSmpte428(float gamma) {
return Powf(MAX(gamma, 0.f), 2.6f) / 0.91655527974030934f;
}
static float FromLinearSmpte428(float linear) {
return Powf(0.91655527974030934f * MAX(linear, 0.f), 1.f / 2.6f);
}
// Conversion in BT.2100 requires RGB info. Simplify to gamma correction here.
static float ToLinearHlg(float gamma) {
if (gamma < 0.f) {
return 0.f;
} else if (gamma <= 0.5f) {
return Powf((gamma * gamma) * (1.f / 3.f), 1.2f);
}
return Powf((expf((gamma - 0.55991073f) / 0.17883277f) + 0.28466892f) / 12.0f,
1.2f);
}
static float FromLinearHlg(float linear) {
linear = Powf(linear, 1.f / 1.2f);
if (linear < 0.f) {
return 0.f;
} else if (linear <= (1.f / 12.f)) {
return sqrtf(3.f * linear);
}
return 0.17883277f * logf(12.f * linear - 0.28466892f) + 0.55991073f;
}
uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth,
SharpYuvTransferFunctionType transfer_type) {
float v_float, linear;
if (transfer_type == kSharpYuvTransferFunctionSrgb) {
return ToLinearSrgb(v, bit_depth);
}
v_float = (float)v / ((1 << bit_depth) - 1);
switch (transfer_type) {
case kSharpYuvTransferFunctionBt709:
case kSharpYuvTransferFunctionBt601:
case kSharpYuvTransferFunctionBt2020_10Bit:
case kSharpYuvTransferFunctionBt2020_12Bit:
linear = ToLinear709(v_float);
break;
case kSharpYuvTransferFunctionBt470M:
linear = ToLinear470M(v_float);
break;
case kSharpYuvTransferFunctionBt470Bg:
linear = ToLinear470Bg(v_float);
break;
case kSharpYuvTransferFunctionSmpte240:
linear = ToLinearSmpte240(v_float);
break;
case kSharpYuvTransferFunctionLinear:
return v;
case kSharpYuvTransferFunctionLog100:
linear = ToLinearLog100(v_float);
break;
case kSharpYuvTransferFunctionLog100_Sqrt10:
linear = ToLinearLog100Sqrt10(v_float);
break;
case kSharpYuvTransferFunctionIec61966:
linear = ToLinearIec61966(v_float);
break;
case kSharpYuvTransferFunctionBt1361:
linear = ToLinearBt1361(v_float);
break;
case kSharpYuvTransferFunctionSmpte2084:
linear = ToLinearPq(v_float);
break;
case kSharpYuvTransferFunctionSmpte428:
linear = ToLinearSmpte428(v_float);
break;
case kSharpYuvTransferFunctionHlg:
linear = ToLinearHlg(v_float);
break;
default:
assert(0);
linear = 0;
break;
}
return (uint32_t)Roundf(linear * ((1 << 16) - 1));
}
uint16_t SharpYuvLinearToGamma(uint32_t v, int bit_depth,
SharpYuvTransferFunctionType transfer_type) {
float v_float, linear;
if (transfer_type == kSharpYuvTransferFunctionSrgb) {
return FromLinearSrgb(v, bit_depth);
}
v_float = (float)v / ((1 << 16) - 1);
switch (transfer_type) {
case kSharpYuvTransferFunctionBt709:
case kSharpYuvTransferFunctionBt601:
case kSharpYuvTransferFunctionBt2020_10Bit:
case kSharpYuvTransferFunctionBt2020_12Bit:
linear = FromLinear709(v_float);
break;
case kSharpYuvTransferFunctionBt470M:
linear = FromLinear470M(v_float);
break;
case kSharpYuvTransferFunctionBt470Bg:
linear = FromLinear470Bg(v_float);
break;
case kSharpYuvTransferFunctionSmpte240:
linear = FromLinearSmpte240(v_float);
break;
case kSharpYuvTransferFunctionLinear:
return v;
case kSharpYuvTransferFunctionLog100:
linear = FromLinearLog100(v_float);
break;
case kSharpYuvTransferFunctionLog100_Sqrt10:
linear = FromLinearLog100Sqrt10(v_float);
break;
case kSharpYuvTransferFunctionIec61966:
linear = FromLinearIec61966(v_float);
break;
case kSharpYuvTransferFunctionBt1361:
linear = FromLinearBt1361(v_float);
break;
case kSharpYuvTransferFunctionSmpte2084:
linear = FromLinearPq(v_float);
break;
case kSharpYuvTransferFunctionSmpte428:
linear = FromLinearSmpte428(v_float);
break;
case kSharpYuvTransferFunctionHlg:
linear = FromLinearHlg(v_float);
break;
default:
assert(0);
linear = 0;
break;
}
return (uint16_t)Roundf(linear * ((1 << bit_depth) - 1));
}

View File

@@ -0,0 +1,38 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Gamma correction utilities.
#ifndef WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
#define WEBP_SHARPYUV_SHARPYUV_GAMMA_H_
#include "sharpyuv/sharpyuv.h"
#include "src/webp/types.h"
#ifdef __cplusplus
extern "C" {
#endif
// Initializes precomputed tables. Must be called once before calling
// SharpYuvGammaToLinear or SharpYuvLinearToGamma.
void SharpYuvInitGammaTables(void);
// Converts a 'bit_depth'-bit gamma color value to a 16-bit linear value.
uint32_t SharpYuvGammaToLinear(uint16_t v, int bit_depth,
SharpYuvTransferFunctionType transfer_type);
// Converts a 16-bit linear color value to a 'bit_depth'-bit gamma value.
uint16_t SharpYuvLinearToGamma(uint32_t value, int bit_depth,
SharpYuvTransferFunctionType transfer_type);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_SHARPYUV_SHARPYUV_GAMMA_H_

View File

@@ -0,0 +1,181 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical functions for Sharp YUV.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "sharpyuv/sharpyuv_dsp.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
#include <stdlib.h>
#include <arm_neon.h>
static uint16_t clip_NEON(int v, int max) {
return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
}
static uint64_t SharpYuvUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len, int bit_depth) {
const int max_y = (1 << bit_depth) - 1;
int i;
const int16x8_t zero = vdupq_n_s16(0);
const int16x8_t max = vdupq_n_s16(max_y);
uint64x2_t sum = vdupq_n_u64(0);
uint64_t diff;
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
const int16x8_t D = vsubq_s16(A, B); // diff_y
const int16x8_t F = vaddq_s16(C, D); // new_y
const uint16x8_t H =
vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
const int16x8_t I = vabsq_s16(D); // abs(diff_y)
vst1q_u16(dst + i, H);
sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
}
diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
for (; i < len; ++i) {
const int diff_y = ref[i] - src[i];
const int new_y = (int)(dst[i]) + diff_y;
dst[i] = clip_NEON(new_y, max_y);
diff += (uint64_t)(abs(diff_y));
}
return diff;
}
static void SharpYuvUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
int16_t* dst, int len) {
int i;
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t A = vld1q_s16(ref + i);
const int16x8_t B = vld1q_s16(src + i);
const int16x8_t C = vld1q_s16(dst + i);
const int16x8_t D = vsubq_s16(A, B); // diff_uv
const int16x8_t E = vaddq_s16(C, D); // new_uv
vst1q_s16(dst + i, E);
}
for (; i < len; ++i) {
const int diff_uv = ref[i] - src[i];
dst[i] += diff_uv;
}
}
static void SharpYuvFilterRow16_NEON(const int16_t* A, const int16_t* B,
int len, const uint16_t* best_y,
uint16_t* out, int bit_depth) {
const int max_y = (1 << bit_depth) - 1;
int i;
const int16x8_t max = vdupq_n_s16(max_y);
const int16x8_t zero = vdupq_n_s16(0);
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t a0 = vld1q_s16(A + i + 0);
const int16x8_t a1 = vld1q_s16(A + i + 1);
const int16x8_t b0 = vld1q_s16(B + i + 0);
const int16x8_t b1 = vld1q_s16(B + i + 1);
const int16x8_t a0b1 = vaddq_s16(a0, b1);
const int16x8_t a1b0 = vaddq_s16(a1, b0);
const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1
const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1)
const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0)
const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
const int16x8_t e0 = vrhaddq_s16(c1, a0);
const int16x8_t e1 = vrhaddq_s16(c0, a1);
const int16x8x2_t f = vzipq_s16(e0, e1);
const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
}
for (; i < len; ++i) {
const int a0b1 = A[i + 0] + B[i + 1];
const int a1b0 = A[i + 1] + B[i + 0];
const int a0a1b0b1 = a0b1 + a1b0 + 8;
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
out[2 * i + 0] = clip_NEON(best_y[2 * i + 0] + v0, max_y);
out[2 * i + 1] = clip_NEON(best_y[2 * i + 1] + v1, max_y);
}
}
static void SharpYuvFilterRow32_NEON(const int16_t* A, const int16_t* B,
int len, const uint16_t* best_y,
uint16_t* out, int bit_depth) {
const int max_y = (1 << bit_depth) - 1;
int i;
const uint16x8_t max = vdupq_n_u16(max_y);
for (i = 0; i + 4 <= len; i += 4) {
const int16x4_t a0 = vld1_s16(A + i + 0);
const int16x4_t a1 = vld1_s16(A + i + 1);
const int16x4_t b0 = vld1_s16(B + i + 0);
const int16x4_t b1 = vld1_s16(B + i + 1);
const int32x4_t a0b1 = vaddl_s16(a0, b1);
const int32x4_t a1b0 = vaddl_s16(a1, b0);
const int32x4_t a0a1b0b1 = vaddq_s32(a0b1, a1b0); // A0+A1+B0+B1
const int32x4_t a0b1_2 = vaddq_s32(a0b1, a0b1); // 2*(A0+B1)
const int32x4_t a1b0_2 = vaddq_s32(a1b0, a1b0); // 2*(A1+B0)
const int32x4_t c0 = vshrq_n_s32(vaddq_s32(a0b1_2, a0a1b0b1), 3);
const int32x4_t c1 = vshrq_n_s32(vaddq_s32(a1b0_2, a0a1b0b1), 3);
const int32x4_t e0 = vrhaddq_s32(c1, vmovl_s16(a0));
const int32x4_t e1 = vrhaddq_s32(c0, vmovl_s16(a1));
const int32x4x2_t f = vzipq_s32(e0, e1);
const int16x8_t g = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i));
const int32x4_t h0 = vaddw_s16(f.val[0], vget_low_s16(g));
const int32x4_t h1 = vaddw_s16(f.val[1], vget_high_s16(g));
const uint16x8_t i_16 = vcombine_u16(vqmovun_s32(h0), vqmovun_s32(h1));
const uint16x8_t i_clamped = vminq_u16(i_16, max);
vst1q_u16(out + 2 * i + 0, i_clamped);
}
for (; i < len; ++i) {
const int a0b1 = A[i + 0] + B[i + 1];
const int a1b0 = A[i + 1] + B[i + 0];
const int a0a1b0b1 = a0b1 + a1b0 + 8;
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
out[2 * i + 0] = clip_NEON(best_y[2 * i + 0] + v0, max_y);
out[2 * i + 1] = clip_NEON(best_y[2 * i + 1] + v1, max_y);
}
}
static void SharpYuvFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out,
int bit_depth) {
if (bit_depth <= 10) {
SharpYuvFilterRow16_NEON(A, B, len, best_y, out, bit_depth);
} else {
SharpYuvFilterRow32_NEON(A, B, len, best_y, out, bit_depth);
}
}
//------------------------------------------------------------------------------
extern void InitSharpYuvNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvNEON(void) {
SharpYuvUpdateY = SharpYuvUpdateY_NEON;
SharpYuvUpdateRGB = SharpYuvUpdateRGB_NEON;
SharpYuvFilterRow = SharpYuvFilterRow_NEON;
}
#else // !WEBP_USE_NEON
extern void InitSharpYuvNEON(void);
void InitSharpYuvNEON(void) {}
#endif // WEBP_USE_NEON

View File

@@ -0,0 +1,201 @@
// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical functions for Sharp YUV.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "sharpyuv/sharpyuv_dsp.h"
#if defined(WEBP_USE_SSE2)
#include <stdlib.h>
#include <emmintrin.h>
static uint16_t clip_SSE2(int v, int max) {
return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
}
static uint64_t SharpYuvUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len, int bit_depth) {
const int max_y = (1 << bit_depth) - 1;
uint64_t diff = 0;
uint32_t tmp[4];
int i;
const __m128i zero = _mm_setzero_si128();
const __m128i max = _mm_set1_epi16(max_y);
const __m128i one = _mm_set1_epi16(1);
__m128i sum = zero;
for (i = 0; i + 8 <= len; i += 8) {
const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
const __m128i D = _mm_sub_epi16(A, B); // diff_y
const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0)
const __m128i F = _mm_add_epi16(C, D); // new_y
const __m128i G = _mm_or_si128(E, one); // -1 or 1
const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...))
_mm_storeu_si128((__m128i*)(dst + i), H);
sum = _mm_add_epi32(sum, I);
}
_mm_storeu_si128((__m128i*)tmp, sum);
diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
for (; i < len; ++i) {
const int diff_y = ref[i] - src[i];
const int new_y = (int)dst[i] + diff_y;
dst[i] = clip_SSE2(new_y, max_y);
diff += (uint64_t)abs(diff_y);
}
return diff;
}
static void SharpYuvUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
int16_t* dst, int len) {
int i = 0;
for (i = 0; i + 8 <= len; i += 8) {
const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
const __m128i D = _mm_sub_epi16(A, B); // diff_uv
const __m128i E = _mm_add_epi16(C, D); // new_uv
_mm_storeu_si128((__m128i*)(dst + i), E);
}
for (; i < len; ++i) {
const int diff_uv = ref[i] - src[i];
dst[i] += diff_uv;
}
}
static void SharpYuvFilterRow16_SSE2(const int16_t* A, const int16_t* B,
int len, const uint16_t* best_y,
uint16_t* out, int bit_depth) {
const int max_y = (1 << bit_depth) - 1;
int i;
const __m128i kCst8 = _mm_set1_epi16(8);
const __m128i max = _mm_set1_epi16(max_y);
const __m128i zero = _mm_setzero_si128();
for (i = 0; i + 8 <= len; i += 8) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
const __m128i a0b1 = _mm_add_epi16(a0, b1);
const __m128i a1b0 = _mm_add_epi16(a1, b0);
const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1
const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1)
const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0)
const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
const __m128i d0 = _mm_add_epi16(c1, a0);
const __m128i d1 = _mm_add_epi16(c0, a1);
const __m128i e0 = _mm_srai_epi16(d0, 1);
const __m128i e1 = _mm_srai_epi16(d1, 1);
const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
const __m128i h0 = _mm_add_epi16(g0, f0);
const __m128i h1 = _mm_add_epi16(g1, f1);
const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
_mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
_mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
}
for (; i < len; ++i) {
// (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
// = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
// We reuse the common sub-expressions.
const int a0b1 = A[i + 0] + B[i + 1];
const int a1b0 = A[i + 1] + B[i + 0];
const int a0a1b0b1 = a0b1 + a1b0 + 8;
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y);
out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y);
}
}
static WEBP_INLINE __m128i s16_to_s32(__m128i in) {
return _mm_srai_epi32(_mm_unpacklo_epi16(in, in), 16);
}
static void SharpYuvFilterRow32_SSE2(const int16_t* A, const int16_t* B,
int len, const uint16_t* best_y,
uint16_t* out, int bit_depth) {
const int max_y = (1 << bit_depth) - 1;
int i;
const __m128i kCst8 = _mm_set1_epi32(8);
const __m128i max = _mm_set1_epi16(max_y);
const __m128i zero = _mm_setzero_si128();
for (i = 0; i + 4 <= len; i += 4) {
const __m128i a0 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + i + 0)));
const __m128i a1 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + i + 1)));
const __m128i b0 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + i + 0)));
const __m128i b1 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + i + 1)));
const __m128i a0b1 = _mm_add_epi32(a0, b1);
const __m128i a1b0 = _mm_add_epi32(a1, b0);
const __m128i a0a1b0b1 = _mm_add_epi32(a0b1, a1b0); // A0+A1+B0+B1
const __m128i a0a1b0b1_8 = _mm_add_epi32(a0a1b0b1, kCst8);
const __m128i a0b1_2 = _mm_add_epi32(a0b1, a0b1); // 2*(A0+B1)
const __m128i a1b0_2 = _mm_add_epi32(a1b0, a1b0); // 2*(A1+B0)
const __m128i c0 = _mm_srai_epi32(_mm_add_epi32(a0b1_2, a0a1b0b1_8), 3);
const __m128i c1 = _mm_srai_epi32(_mm_add_epi32(a1b0_2, a0a1b0b1_8), 3);
const __m128i d0 = _mm_add_epi32(c1, a0);
const __m128i d1 = _mm_add_epi32(c0, a1);
const __m128i e0 = _mm_srai_epi32(d0, 1);
const __m128i e1 = _mm_srai_epi32(d1, 1);
const __m128i f0 = _mm_unpacklo_epi32(e0, e1);
const __m128i f1 = _mm_unpackhi_epi32(e0, e1);
const __m128i g = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
const __m128i h_16 = _mm_add_epi16(g, _mm_packs_epi32(f0, f1));
const __m128i final = _mm_max_epi16(_mm_min_epi16(h_16, max), zero);
_mm_storeu_si128((__m128i*)(out + 2 * i + 0), final);
}
for (; i < len; ++i) {
// (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
// = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
// We reuse the common sub-expressions.
const int a0b1 = A[i + 0] + B[i + 1];
const int a1b0 = A[i + 1] + B[i + 0];
const int a0a1b0b1 = a0b1 + a1b0 + 8;
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y);
out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y);
}
}
static void SharpYuvFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out,
int bit_depth) {
if (bit_depth <= 10) {
SharpYuvFilterRow16_SSE2(A, B, len, best_y, out, bit_depth);
} else {
SharpYuvFilterRow32_SSE2(A, B, len, best_y, out, bit_depth);
}
}
//------------------------------------------------------------------------------
extern void InitSharpYuvSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvSSE2(void) {
SharpYuvUpdateY = SharpYuvUpdateY_SSE2;
SharpYuvUpdateRGB = SharpYuvUpdateRGB_SSE2;
SharpYuvFilterRow = SharpYuvFilterRow_SSE2;
}
#else // !WEBP_USE_SSE2
extern void InitSharpYuvSSE2(void);
void InitSharpYuvSSE2(void) {}
#endif // WEBP_USE_SSE2