initial commit, 4.5 stable

2025-09-16 20:46:46 -04:00
commit 9d30169a8d
13378 changed files with 7050105 additions and 0 deletions

thirdparty/libtheora/x86/mmxencfrag.c vendored Normal file

@@ -0,0 +1,903 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
#include <stddef.h>
#include "x86enc.h"
#if defined(OC_X86_ASM)
unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
const unsigned char *_ref,int _ystride){
ptrdiff_t ystride3;
ptrdiff_t ret;
__asm__ __volatile__(
/*Load the first 4 rows of each block.*/
"movq (%[src]),%%mm0\n\t"
"movq (%[ref]),%%mm1\n\t"
"movq (%[src],%[ystride]),%%mm2\n\t"
"movq (%[ref],%[ystride]),%%mm3\n\t"
"lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
"movq (%[src],%[ystride],2),%%mm4\n\t"
"movq (%[ref],%[ystride],2),%%mm5\n\t"
"movq (%[src],%[ystride3]),%%mm6\n\t"
"movq (%[ref],%[ystride3]),%%mm7\n\t"
/*Compute their SADs and add them in %%mm0*/
"psadbw %%mm1,%%mm0\n\t"
"psadbw %%mm3,%%mm2\n\t"
"lea (%[src],%[ystride],4),%[src]\n\t"
"paddw %%mm2,%%mm0\n\t"
"lea (%[ref],%[ystride],4),%[ref]\n\t"
/*Load the next 3 rows as registers become available.*/
"movq (%[src]),%%mm2\n\t"
"movq (%[ref]),%%mm3\n\t"
"psadbw %%mm5,%%mm4\n\t"
"psadbw %%mm7,%%mm6\n\t"
"paddw %%mm4,%%mm0\n\t"
"movq (%[ref],%[ystride]),%%mm5\n\t"
"movq (%[src],%[ystride]),%%mm4\n\t"
"paddw %%mm6,%%mm0\n\t"
"movq (%[ref],%[ystride],2),%%mm7\n\t"
"movq (%[src],%[ystride],2),%%mm6\n\t"
/*Start adding their SADs to %%mm0*/
"psadbw %%mm3,%%mm2\n\t"
"psadbw %%mm5,%%mm4\n\t"
"paddw %%mm2,%%mm0\n\t"
"psadbw %%mm7,%%mm6\n\t"
/*Load last row as registers become available.*/
"movq (%[src],%[ystride3]),%%mm2\n\t"
"movq (%[ref],%[ystride3]),%%mm3\n\t"
/*And finish adding up their SADs.*/
"paddw %%mm4,%%mm0\n\t"
"psadbw %%mm3,%%mm2\n\t"
"paddw %%mm6,%%mm0\n\t"
"paddw %%mm2,%%mm0\n\t"
"movd %%mm0,%[ret]\n\t"
:[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3)
:[ystride]"r"((ptrdiff_t)_ystride)
);
return (unsigned)ret;
}
unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
const unsigned char *_ref,int _ystride,unsigned _thresh){
/*Early termination is for suckers.*/
return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
}
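/*For reference, a portable scalar sketch of what oc_enc_frag_sad_mmxext
   computes (illustration only, not used by the build): the sum of absolute
   differences over an 8x8 block of bytes, with _ystride bytes between rows.
   The psadbw-based version above produces the same value two rows at a time.*/
#if 0
static unsigned oc_enc_frag_sad_ref(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned sad;
  int      i;
  int      j;
  sad=0;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int d;
      d=_src[j]-_ref[j];
      sad+=d<0?-d:d;
    }
    _src+=_ystride;
    _ref+=_ystride;
  }
  return sad;
}
#endif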
/*Assumes the first two rows of %[ref1] and %[ref2] are in %%mm0...%%mm3, the
first two rows of %[src] are in %%mm4,%%mm5, and {1}x8 is in %%mm7.
We pre-load the next two rows of data as registers become available.*/
#define OC_SAD2_LOOP \
"#OC_SAD2_LOOP\n\t" \
/*We want to compute (%%mm0+%%mm1>>1) on unsigned bytes without overflow, but \
pavgb computes (%%mm0+%%mm1+1>>1). \
The latter is exactly 1 too large when the low bit of two corresponding \
bytes is only set in one of them. \
Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
correct the output of pavgb. \
TODO: This should be rewritten to compute ~pavgb(~a,~b) instead, which \
schedules better; currently, however, this function is unused.*/ \
"movq %%mm0,%%mm6\n\t" \
"lea (%[ref1],%[ystride],2),%[ref1]\n\t" \
"pxor %%mm1,%%mm0\n\t" \
"pavgb %%mm1,%%mm6\n\t" \
"lea (%[ref2],%[ystride],2),%[ref2]\n\t" \
"movq %%mm2,%%mm1\n\t" \
"pand %%mm7,%%mm0\n\t" \
"pavgb %%mm3,%%mm2\n\t" \
"pxor %%mm3,%%mm1\n\t" \
"movq (%[ref2],%[ystride]),%%mm3\n\t" \
"psubb %%mm0,%%mm6\n\t" \
"movq (%[ref1]),%%mm0\n\t" \
"pand %%mm7,%%mm1\n\t" \
"psadbw %%mm6,%%mm4\n\t" \
"movd %[ret],%%mm6\n\t" \
"psubb %%mm1,%%mm2\n\t" \
"movq (%[ref2]),%%mm1\n\t" \
"lea (%[src],%[ystride],2),%[src]\n\t" \
"psadbw %%mm2,%%mm5\n\t" \
"movq (%[ref1],%[ystride]),%%mm2\n\t" \
"paddw %%mm4,%%mm5\n\t" \
"movq (%[src]),%%mm4\n\t" \
"paddw %%mm5,%%mm6\n\t" \
"movq (%[src],%[ystride]),%%mm5\n\t" \
"movd %%mm6,%[ret]\n\t" \
/*Same as above, but does not pre-load the next two rows.*/
#define OC_SAD2_TAIL \
"#OC_SAD2_TAIL\n\t" \
"movq %%mm0,%%mm6\n\t" \
"pavgb %%mm1,%%mm0\n\t" \
"pxor %%mm1,%%mm6\n\t" \
"movq %%mm2,%%mm1\n\t" \
"pand %%mm7,%%mm6\n\t" \
"pavgb %%mm3,%%mm2\n\t" \
"pxor %%mm3,%%mm1\n\t" \
"psubb %%mm6,%%mm0\n\t" \
"pand %%mm7,%%mm1\n\t" \
"psadbw %%mm0,%%mm4\n\t" \
"psubb %%mm1,%%mm2\n\t" \
"movd %[ret],%%mm6\n\t" \
"psadbw %%mm2,%%mm5\n\t" \
"paddw %%mm4,%%mm5\n\t" \
"paddw %%mm5,%%mm6\n\t" \
"movd %%mm6,%[ret]\n\t" \
unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
unsigned _thresh){
ptrdiff_t ret;
__asm__ __volatile__(
"movq (%[ref1]),%%mm0\n\t"
"movq (%[ref2]),%%mm1\n\t"
"movq (%[ref1],%[ystride]),%%mm2\n\t"
"movq (%[ref2],%[ystride]),%%mm3\n\t"
"xor %[ret],%[ret]\n\t"
"movq (%[src]),%%mm4\n\t"
"pxor %%mm7,%%mm7\n\t"
"pcmpeqb %%mm6,%%mm6\n\t"
"movq (%[src],%[ystride]),%%mm5\n\t"
"psubb %%mm6,%%mm7\n\t"
OC_SAD2_LOOP
OC_SAD2_LOOP
OC_SAD2_LOOP
OC_SAD2_TAIL
:[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+r"(_ref1),[ref2]"+r"(_ref2)
:[ystride]"r"((ptrdiff_t)_ystride)
);
return (unsigned)ret;
}
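/*For reference, a scalar sketch of the rounding trick used by OC_SAD2_LOOP
   and OC_SAD2_TAIL above (illustration only, not used by the build):
   pavgb computes (a+b+1)>>1, which is exactly one too large when the low bits
   of a and b differ, so subtracting (a^b)&1 yields the truncating average
   (a+b)>>1 without ever needing more than 8 bits.*/
#if 0
static unsigned char oc_avg_floor_ref(unsigned char _a,unsigned char _b){
  unsigned pavg;
  /*What pavgb computes.*/
  pavg=(unsigned)(_a+_b+1)>>1;
  /*Correct the rounding: the result is 1 too large iff the low bits differ.*/
  return (unsigned char)(pavg-((_a^_b)&1));
}
#endif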
/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their
16-bit difference in %%mm0...%%mm7.*/
#define OC_LOAD_SUB_8x4(_off) \
"#OC_LOAD_SUB_8x4\n\t" \
"movd "#_off"(%[src]),%%mm0\n\t" \
"movd "#_off"(%[ref]),%%mm4\n\t" \
"movd "#_off"(%[src],%[src_ystride]),%%mm1\n\t" \
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
"movd "#_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
"movd "#_off"(%[src]),%%mm2\n\t" \
"movd "#_off"(%[ref]),%%mm7\n\t" \
"movd "#_off"(%[src],%[src_ystride]),%%mm3\n\t" \
"movd "#_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \
"punpcklbw %%mm4,%%mm0\n\t" \
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
"punpcklbw %%mm4,%%mm4\n\t" \
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
"psubw %%mm4,%%mm0\n\t" \
"movd "#_off"(%[src]),%%mm4\n\t" \
"movq %%mm0,"OC_MEM_OFFS(_off*2,buf)"\n\t" \
"movd "#_off"(%[ref]),%%mm0\n\t" \
"punpcklbw %%mm5,%%mm1\n\t" \
"punpcklbw %%mm5,%%mm5\n\t" \
"psubw %%mm5,%%mm1\n\t" \
"movd "#_off"(%[src],%[src_ystride]),%%mm5\n\t" \
"punpcklbw %%mm7,%%mm2\n\t" \
"punpcklbw %%mm7,%%mm7\n\t" \
"psubw %%mm7,%%mm2\n\t" \
"movd "#_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \
"punpcklbw %%mm6,%%mm3\n\t" \
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
"punpcklbw %%mm6,%%mm6\n\t" \
"psubw %%mm6,%%mm3\n\t" \
"movd "#_off"(%[src]),%%mm6\n\t" \
"punpcklbw %%mm0,%%mm4\n\t" \
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
"punpcklbw %%mm0,%%mm0\n\t" \
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
"psubw %%mm0,%%mm4\n\t" \
"movd "#_off"(%[ref]),%%mm0\n\t" \
"punpcklbw %%mm7,%%mm5\n\t" \
"neg %[src_ystride]\n\t" \
"punpcklbw %%mm7,%%mm7\n\t" \
"psubw %%mm7,%%mm5\n\t" \
"movd "#_off"(%[src],%[src_ystride]),%%mm7\n\t" \
"punpcklbw %%mm0,%%mm6\n\t" \
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
"punpcklbw %%mm0,%%mm0\n\t" \
"neg %[ref_ystride]\n\t" \
"psubw %%mm0,%%mm6\n\t" \
"movd "#_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \
"lea (%[src],%[src_ystride],8),%[src]\n\t" \
"punpcklbw %%mm0,%%mm7\n\t" \
"neg %[src_ystride]\n\t" \
"punpcklbw %%mm0,%%mm0\n\t" \
"lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \
"psubw %%mm0,%%mm7\n\t" \
"neg %[ref_ystride]\n\t" \
"movq "OC_MEM_OFFS(_off*2,buf)",%%mm0\n\t" \
/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
#define OC_LOAD_8x4(_off) \
"#OC_LOAD_8x4\n\t" \
"movd "#_off"(%[src]),%%mm0\n\t" \
"movd "#_off"(%[src],%[ystride]),%%mm1\n\t" \
"movd "#_off"(%[src],%[ystride],2),%%mm2\n\t" \
"pxor %%mm7,%%mm7\n\t" \
"movd "#_off"(%[src],%[ystride3]),%%mm3\n\t" \
"punpcklbw %%mm7,%%mm0\n\t" \
"movd "#_off"(%[src4]),%%mm4\n\t" \
"punpcklbw %%mm7,%%mm1\n\t" \
"movd "#_off"(%[src4],%[ystride]),%%mm5\n\t" \
"punpcklbw %%mm7,%%mm2\n\t" \
"movd "#_off"(%[src4],%[ystride],2),%%mm6\n\t" \
"punpcklbw %%mm7,%%mm3\n\t" \
"movd "#_off"(%[src4],%[ystride3]),%%mm7\n\t" \
"punpcklbw %%mm4,%%mm4\n\t" \
"punpcklbw %%mm5,%%mm5\n\t" \
"psrlw $8,%%mm4\n\t" \
"psrlw $8,%%mm5\n\t" \
"punpcklbw %%mm6,%%mm6\n\t" \
"punpcklbw %%mm7,%%mm7\n\t" \
"psrlw $8,%%mm6\n\t" \
"psrlw $8,%%mm7\n\t" \
/*Performs the first two stages of an 8-point 1-D Hadamard transform.
The transform is performed in place, except that outputs 0-3 are swapped with
outputs 4-7.
Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
perform this stage in place with no temporary registers).*/
#define OC_HADAMARD_AB_8x4 \
"#OC_HADAMARD_AB_8x4\n\t" \
/*Stage A: \
Outputs 0-3 are swapped with 4-7 here.*/ \
"paddw %%mm1,%%mm5\n\t" \
"paddw %%mm2,%%mm6\n\t" \
"paddw %%mm1,%%mm1\n\t" \
"paddw %%mm2,%%mm2\n\t" \
"psubw %%mm5,%%mm1\n\t" \
"psubw %%mm6,%%mm2\n\t" \
"paddw %%mm3,%%mm7\n\t" \
"paddw %%mm0,%%mm4\n\t" \
"paddw %%mm3,%%mm3\n\t" \
"paddw %%mm0,%%mm0\n\t" \
"psubw %%mm7,%%mm3\n\t" \
"psubw %%mm4,%%mm0\n\t" \
/*Stage B:*/ \
"paddw %%mm2,%%mm0\n\t" \
"paddw %%mm3,%%mm1\n\t" \
"paddw %%mm6,%%mm4\n\t" \
"paddw %%mm7,%%mm5\n\t" \
"paddw %%mm2,%%mm2\n\t" \
"paddw %%mm3,%%mm3\n\t" \
"paddw %%mm6,%%mm6\n\t" \
"paddw %%mm7,%%mm7\n\t" \
"psubw %%mm0,%%mm2\n\t" \
"psubw %%mm1,%%mm3\n\t" \
"psubw %%mm4,%%mm6\n\t" \
"psubw %%mm5,%%mm7\n\t" \
/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
place with no temporary registers).*/
#define OC_HADAMARD_C_8x4 \
"#OC_HADAMARD_C_8x4\n\t" \
/*Stage C:*/ \
"paddw %%mm1,%%mm0\n\t" \
"paddw %%mm3,%%mm2\n\t" \
"paddw %%mm5,%%mm4\n\t" \
"paddw %%mm7,%%mm6\n\t" \
"paddw %%mm1,%%mm1\n\t" \
"paddw %%mm3,%%mm3\n\t" \
"paddw %%mm5,%%mm5\n\t" \
"paddw %%mm7,%%mm7\n\t" \
"psubw %%mm0,%%mm1\n\t" \
"psubw %%mm2,%%mm3\n\t" \
"psubw %%mm4,%%mm5\n\t" \
"psubw %%mm6,%%mm7\n\t" \
/*Performs an 8-point 1-D Hadamard transform.
The transform is performed in place, except that outputs 0-3 are swapped with
outputs 4-7.
Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
in place with no temporary registers).*/
#define OC_HADAMARD_8x4 \
OC_HADAMARD_AB_8x4 \
OC_HADAMARD_C_8x4 \
/*Performs the first part of the final stage of the Hadamard transform and
summing of absolute values.
At the end of this part, %%mm1 will contain the DC coefficient of the
transform.*/
#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
/*We use the fact that \
(abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
to merge the final butterfly with the abs and the first stage of \
accumulation. \
Thus we can avoid using pabsw, which is not available until SSSE3. \
Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
registers). \
Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
This implementation is only 26 (+4 for spilling registers).*/ \
"#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \
"movq %%mm7,"OC_MEM_OFFS(_r7,buf)"\n\t" \
"movq %%mm6,"OC_MEM_OFFS(_r6,buf)"\n\t" \
/*mm7={0x7FFF}x4 \
mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
"pcmpeqb %%mm7,%%mm7\n\t" \
"movq %%mm0,%%mm6\n\t" \
"psrlw $1,%%mm7\n\t" \
"paddw %%mm1,%%mm6\n\t" \
"pmaxsw %%mm1,%%mm0\n\t" \
"paddsw %%mm7,%%mm6\n\t" \
"psubw %%mm6,%%mm0\n\t" \
/*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
"movq %%mm2,%%mm6\n\t" \
"movq %%mm4,%%mm1\n\t" \
"pmaxsw %%mm3,%%mm2\n\t" \
"pmaxsw %%mm5,%%mm4\n\t" \
"paddw %%mm3,%%mm6\n\t" \
"paddw %%mm5,%%mm1\n\t" \
"movq "OC_MEM_OFFS(_r7,buf)",%%mm3\n\t" \
/*Performs the second part of the final stage of the Hadamard transform and
summing of absolute values.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
"#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \
"paddsw %%mm7,%%mm6\n\t" \
"movq "OC_MEM_OFFS(_r6,buf)",%%mm5\n\t" \
"paddsw %%mm7,%%mm1\n\t" \
"psubw %%mm6,%%mm2\n\t" \
"psubw %%mm1,%%mm4\n\t" \
/*mm7={1}x4 (needed for the horizontal add that follows) \
mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
"movq %%mm3,%%mm6\n\t" \
"pmaxsw %%mm5,%%mm3\n\t" \
"paddw %%mm2,%%mm0\n\t" \
"paddw %%mm5,%%mm6\n\t" \
"paddw %%mm4,%%mm0\n\t" \
"paddsw %%mm7,%%mm6\n\t" \
"paddw %%mm3,%%mm0\n\t" \
"psrlw $14,%%mm7\n\t" \
"psubw %%mm6,%%mm0\n\t" \
/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
absolute value of each component, and accumulates everything into mm0.
This is the only portion of SATD which requires MMXEXT (we could use plain
MMX, but it takes 4 instructions and an extra register to work around the
lack of a pmaxsw, which is a pretty serious penalty).*/
#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
component, and accumulates everything into mm0.
Note that mm0 will have an extra 4 added to each column, and that after
removing this value, the remainder will be half the conventional value.*/
#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) \
OC_HADAMARD_AB_8x4 \
OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7)
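/*For reference, a scalar sketch of the identity exploited by
   OC_HADAMARD_C_ABS_ACCUM_A_8x4 (illustration only, not used by the build):
   the final butterfly stage produces a+b and a-b, and
   |a+b|+|a-b|==2*max(|a|,|b|), so the butterfly, the absolute values, and the
   first accumulation step can all be fused into a single maximum.
   max(|a|,|b|) itself needs no abs instruction: it is max(a,b) when a+b>=0
   and max(a,b)-(a+b) otherwise, which is what the pmaxsw/paddsw/psubw
   sequence above evaluates (offset by 0x7FFF).*/
#if 0
static int oc_max_abs_ref(int _a,int _b){
  int m;
  m=_a>_b?_a:_b;
  return _a+_b>=0?m:m-(_a+_b);
}
#endif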
/*Performs two 4x4 transposes (mostly) in place.
On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
contains rows {a,b,c,d}.
On output, {0x40,0x50,0x60,0x70}+_off(%[buf]) contains {e,f,g,h}^T, and
{mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
#define OC_TRANSPOSE_4x4x2(_off) \
"#OC_TRANSPOSE_4x4x2\n\t" \
/*First 4x4 transpose:*/ \
"movq %%mm5,"OC_MEM_OFFS(0x10+(_off),buf)"\n\t" \
/*mm0 = e3 e2 e1 e0 \
mm1 = f3 f2 f1 f0 \
mm2 = g3 g2 g1 g0 \
mm3 = h3 h2 h1 h0*/ \
"movq %%mm2,%%mm5\n\t" \
"punpcklwd %%mm3,%%mm2\n\t" \
"punpckhwd %%mm3,%%mm5\n\t" \
"movq %%mm0,%%mm3\n\t" \
"punpcklwd %%mm1,%%mm0\n\t" \
"punpckhwd %%mm1,%%mm3\n\t" \
/*mm0 = f1 e1 f0 e0 \
mm3 = f3 e3 f2 e2 \
mm2 = h1 g1 h0 g0 \
mm5 = h3 g3 h2 g2*/ \
"movq %%mm0,%%mm1\n\t" \
"punpckldq %%mm2,%%mm0\n\t" \
"punpckhdq %%mm2,%%mm1\n\t" \
"movq %%mm3,%%mm2\n\t" \
"punpckhdq %%mm5,%%mm3\n\t" \
"movq %%mm0,"OC_MEM_OFFS(0x40+(_off),buf)"\n\t" \
"punpckldq %%mm5,%%mm2\n\t" \
/*mm0 = h0 g0 f0 e0 \
mm1 = h1 g1 f1 e1 \
mm2 = h2 g2 f2 e2 \
mm3 = h3 g3 f3 e3*/ \
"movq "OC_MEM_OFFS(0x10+(_off),buf)",%%mm5\n\t" \
/*Second 4x4 transpose:*/ \
/*mm4 = a3 a2 a1 a0 \
mm5 = b3 b2 b1 b0 \
mm6 = c3 c2 c1 c0 \
mm7 = d3 d2 d1 d0*/ \
"movq %%mm6,%%mm0\n\t" \
"punpcklwd %%mm7,%%mm6\n\t" \
"movq %%mm1,"OC_MEM_OFFS(0x50+(_off),buf)"\n\t" \
"punpckhwd %%mm7,%%mm0\n\t" \
"movq %%mm4,%%mm7\n\t" \
"punpcklwd %%mm5,%%mm4\n\t" \
"movq %%mm2,"OC_MEM_OFFS(0x60+(_off),buf)"\n\t" \
"punpckhwd %%mm5,%%mm7\n\t" \
/*mm4 = b1 a1 b0 a0 \
mm7 = b3 a3 b2 a2 \
mm6 = d1 c1 d0 c0 \
mm0 = d3 c3 d2 c2*/ \
"movq %%mm4,%%mm5\n\t" \
"punpckldq %%mm6,%%mm4\n\t" \
"movq %%mm3,"OC_MEM_OFFS(0x70+(_off),buf)"\n\t" \
"punpckhdq %%mm6,%%mm5\n\t" \
"movq %%mm7,%%mm6\n\t" \
"punpckhdq %%mm0,%%mm7\n\t" \
"punpckldq %%mm0,%%mm6\n\t" \
/*mm4 = d0 c0 b0 a0 \
mm5 = d1 c1 b1 a1 \
mm6 = d2 c2 b2 a2 \
mm7 = d3 c3 b3 a3*/ \
static unsigned oc_int_frag_satd_mmxext(int *_dc,
const unsigned char *_src,int _src_ystride,
const unsigned char *_ref,int _ref_ystride){
OC_ALIGN8(ogg_int16_t buf[64]);
unsigned ret;
unsigned ret2;
int dc;
__asm__ __volatile__(
OC_LOAD_SUB_8x4(0x00)
OC_HADAMARD_8x4
OC_TRANSPOSE_4x4x2(0x00)
/*Finish swapping out this 8x4 block to make room for the next one.
mm0...mm3 have been swapped out already.*/
"movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
"movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
"movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
"movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
OC_LOAD_SUB_8x4(0x04)
OC_HADAMARD_8x4
OC_TRANSPOSE_4x4x2(0x08)
/*Here the first 4x4 block of output from the last transpose is the second
4x4 block of input for the next transform.
We have cleverly arranged that it already be in the appropriate place, so
we only have to do half the loads.*/
"movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
"movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
"movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
"movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
/*We split out the stages here so we can save the DC coefficient in the
middle.*/
OC_HADAMARD_AB_8x4
OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
"movd %%mm1,%[dc]\n\t"
OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
for the factor of two we dropped + 3 for the vertical accumulation).
Now we finally have to promote things to dwords.
We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
latency of pmaddwd by starting the next series of loads now.*/
"pmaddwd %%mm7,%%mm0\n\t"
"movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
"movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
"movq %%mm0,%%mm4\n\t"
"movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
"punpckhdq %%mm0,%%mm0\n\t"
"movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
"paddd %%mm0,%%mm4\n\t"
"movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
"movd %%mm4,%[ret2]\n\t"
"movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
"movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
"movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
"pmaddwd %%mm7,%%mm0\n\t"
/*Subtract abs(dc) from 2*ret2.*/
"movsx %w[dc],%[dc]\n\t"
"cdq\n\t"
"lea (%[ret],%[ret2],2),%[ret2]\n\t"
"movq %%mm0,%%mm4\n\t"
"punpckhdq %%mm0,%%mm0\n\t"
"xor %[dc],%[ret]\n\t"
"paddd %%mm0,%%mm4\n\t"
/*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
added to them, a factor of two removed, and the DC value included;
correct the final sum here.*/
"sub %[ret],%[ret2]\n\t"
"movd %%mm4,%[ret]\n\t"
"lea -64(%[ret2],%[ret],2),%[ret]\n\t"
/*Although it looks like we're using 8 registers here, gcc can alias %[ret]
and %[ret2] with some of the inputs, since for once we don't write to
them until after we're done using everything but %[buf].*/
/*Note that _src_ystride and _ref_ystride must be given non-overlapping
constraints, otherwise if gcc can prove they're equal it will allocate
them to the same register (which is bad); _src and _ref face a similar
problem, though those are never actually the same.*/
:[ret]"=d"(ret),[ret2]"=r"(ret2),[dc]"=a"(dc),
[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
:[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
[ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
/*We have to use neg, so we actually clobber the condition codes for once
(not to mention cmp, sub, and add).*/
:"cc"
);
*_dc=dc;
return ret;
}
unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
const unsigned char *_ref,int _ystride){
return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
}
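/*For reference, a scalar sketch of the 8-point Hadamard butterflies that
   OC_HADAMARD_AB_8x4 and OC_HADAMARD_C_8x4 apply four columns at a time
   (illustration only, not used by the build; the MMX macros additionally swap
   and negate some outputs so the transform can stay entirely in registers).
   oc_enc_frag_satd_mmxext runs this transform along both dimensions of the
   8x8 source/reference difference and accumulates absolute values, with the
   scaling and DC handling described in the comments above.*/
#if 0
static void oc_hadamard8_ref(ogg_int16_t _t[8]){
  int stage;
  int i;
  /*Three butterfly stages, with strides 4, 2, and 1.*/
  for(stage=4;stage>0;stage>>=1){
    for(i=0;i<8;i++)if(!(i&stage)){
      ogg_int16_t a;
      ogg_int16_t b;
      a=_t[i];
      b=_t[i+stage];
      _t[i]=a+b;
      _t[i+stage]=a-b;
    }
  }
}
#endif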
/*Our internal implementation of frag_copy2 takes an extra stride parameter so
we can share code with oc_enc_frag_satd2_mmxext().*/
void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
__asm__ __volatile__(
/*Load the first 3 rows.*/
"movq (%[src1]),%%mm0\n\t"
"movq (%[src2]),%%mm1\n\t"
"movq (%[src1],%[src_ystride]),%%mm2\n\t"
"lea (%[src1],%[src_ystride],2),%[src1]\n\t"
"movq (%[src2],%[src_ystride]),%%mm3\n\t"
"lea (%[src2],%[src_ystride],2),%[src2]\n\t"
"pxor %%mm7,%%mm7\n\t"
"movq (%[src1]),%%mm4\n\t"
"pcmpeqb %%mm6,%%mm6\n\t"
"movq (%[src2]),%%mm5\n\t"
/*mm7={1}x8.*/
"psubb %%mm6,%%mm7\n\t"
/*Start averaging %%mm0 and %%mm1 into %%mm6.*/
"movq %%mm0,%%mm6\n\t"
"pxor %%mm1,%%mm0\n\t"
"pavgb %%mm1,%%mm6\n\t"
/*%%mm1 is free, start averaging %%mm3 into %%mm2 using %%mm1.*/
"movq %%mm2,%%mm1\n\t"
"pand %%mm7,%%mm0\n\t"
"pavgb %%mm3,%%mm2\n\t"
"pxor %%mm3,%%mm1\n\t"
/*%%mm3 is free.*/
"psubb %%mm0,%%mm6\n\t"
/*%%mm0 is free, start loading the next row.*/
"movq (%[src1],%[src_ystride]),%%mm0\n\t"
/*Start averaging %%mm5 and %%mm4 using %%mm3.*/
"movq %%mm4,%%mm3\n\t"
/*%%mm6 (row 0) is done; write it out.*/
"movq %%mm6,(%[dst])\n\t"
"pand %%mm7,%%mm1\n\t"
"pavgb %%mm5,%%mm4\n\t"
"psubb %%mm1,%%mm2\n\t"
/*%%mm1 is free, continue loading the next row.*/
"movq (%[src2],%[src_ystride]),%%mm1\n\t"
"pxor %%mm5,%%mm3\n\t"
"lea (%[src1],%[src_ystride],2),%[src1]\n\t"
/*%%mm2 (row 1) is done; write it out.*/
"movq %%mm2,(%[dst],%[dst_ystride])\n\t"
"pand %%mm7,%%mm3\n\t"
/*Start loading the next row.*/
"movq (%[src1]),%%mm2\n\t"
"lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
"psubb %%mm3,%%mm4\n\t"
"lea (%[src2],%[src_ystride],2),%[src2]\n\t"
/*%%mm4 (row 2) is done; write it out.*/
"movq %%mm4,(%[dst])\n\t"
/*Continue loading the next row.*/
"movq (%[src2]),%%mm3\n\t"
/*Start averaging %%mm0 and %%mm1 into %%mm6.*/
"movq %%mm0,%%mm6\n\t"
"pxor %%mm1,%%mm0\n\t"
/*Start loading the next row.*/
"movq (%[src1],%[src_ystride]),%%mm4\n\t"
"pavgb %%mm1,%%mm6\n\t"
/*%%mm1 is free; start averaging %%mm3 into %%mm2 using %%mm1.*/
"movq %%mm2,%%mm1\n\t"
"pand %%mm7,%%mm0\n\t"
/*Continue loading the next row.*/
"movq (%[src2],%[src_ystride]),%%mm5\n\t"
"pavgb %%mm3,%%mm2\n\t"
"lea (%[src1],%[src_ystride],2),%[src1]\n\t"
"pxor %%mm3,%%mm1\n\t"
/*%%mm3 is free.*/
"psubb %%mm0,%%mm6\n\t"
/*%%mm0 is free, start loading the next row.*/
"movq (%[src1]),%%mm0\n\t"
/*Start averaging %%mm5 into %%mm4 using %%mm3.*/
"movq %%mm4,%%mm3\n\t"
/*%%mm6 (row 3) is done; write it out.*/
"movq %%mm6,(%[dst],%[dst_ystride])\n\t"
"pand %%mm7,%%mm1\n\t"
"lea (%[src2],%[src_ystride],2),%[src2]\n\t"
"pavgb %%mm5,%%mm4\n\t"
"lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
"psubb %%mm1,%%mm2\n\t"
/*%%mm1 is free; continue loading the next row.*/
"movq (%[src2]),%%mm1\n\t"
"pxor %%mm5,%%mm3\n\t"
/*%%mm2 (row 4) is done; write it out.*/
"movq %%mm2,(%[dst])\n\t"
"pand %%mm7,%%mm3\n\t"
/*Start loading the next row.*/
"movq (%[src1],%[src_ystride]),%%mm2\n\t"
"psubb %%mm3,%%mm4\n\t"
/*Start averaging %%mm0 and %%mm1 into %%mm6.*/
"movq %%mm0,%%mm6\n\t"
/*Continue loading the next row.*/
"movq (%[src2],%[src_ystride]),%%mm3\n\t"
/*%%mm4 (row 5) is done; write it out.*/
"movq %%mm4,(%[dst],%[dst_ystride])\n\t"
"pxor %%mm1,%%mm0\n\t"
"pavgb %%mm1,%%mm6\n\t"
/*%%mm4 is free; start averaging %%mm3 into %%mm2 using %%mm4.*/
"movq %%mm2,%%mm4\n\t"
"pand %%mm7,%%mm0\n\t"
"pavgb %%mm3,%%mm2\n\t"
"pxor %%mm3,%%mm4\n\t"
"lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
"psubb %%mm0,%%mm6\n\t"
"pand %%mm7,%%mm4\n\t"
/*%%mm6 (row 6) is done, write it out.*/
"movq %%mm6,(%[dst])\n\t"
"psubb %%mm4,%%mm2\n\t"
/*%%mm2 (row 7) is done, write it out.*/
"movq %%mm2,(%[dst],%[dst_ystride])\n\t"
:[dst]"+r"(_dst),[src1]"+r"(_src1),[src2]"+r"(_src2)
:[dst_ystride]"r"((ptrdiff_t)_dst_ystride),
[src_ystride]"r"((ptrdiff_t)_src_ystride)
:"memory"
);
}
unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
OC_ALIGN8(unsigned char ref[64]);
oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
}
unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
const unsigned char *_src,int _ystride){
OC_ALIGN8(ogg_int16_t buf[64]);
unsigned ret;
unsigned ret2;
int dc;
__asm__ __volatile__(
OC_LOAD_8x4(0x00)
OC_HADAMARD_8x4
OC_TRANSPOSE_4x4x2(0x00)
/*Finish swapping out this 8x4 block to make room for the next one.
mm0...mm3 have been swapped out already.*/
"movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
"movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
"movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
"movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
OC_LOAD_8x4(0x04)
OC_HADAMARD_8x4
OC_TRANSPOSE_4x4x2(0x08)
/*Here the first 4x4 block of output from the last transpose is the second
4x4 block of input for the next transform.
We have cleverly arranged that it already be in the appropriate place, so
we only have to do half the loads.*/
"movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
"movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
"movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
"movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
/*We split out the stages here so we can save the DC coefficient in the
middle.*/
OC_HADAMARD_AB_8x4
OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
"movd %%mm1,%[dc]\n\t"
OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
for the factor of two we dropped + 3 for the vertical accumulation).
Now we finally have to promote things to dwords.
We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
latency of pmaddwd by starting the next series of loads now.*/
"pmaddwd %%mm7,%%mm0\n\t"
"movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
"movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
"movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
"movq %%mm0,%%mm4\n\t"
"movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
"punpckhdq %%mm0,%%mm0\n\t"
"movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
"paddd %%mm0,%%mm4\n\t"
"movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
"movd %%mm4,%[ret]\n\t"
"movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
"movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
"pmaddwd %%mm7,%%mm0\n\t"
/*We assume that the DC coefficient is always positive (which is true,
because the input to the INTRA transform was not a difference).*/
"movzx %w[dc],%[dc]\n\t"
"add %[ret],%[ret]\n\t"
"sub %[dc],%[ret]\n\t"
"movq %%mm0,%%mm4\n\t"
"punpckhdq %%mm0,%%mm0\n\t"
"paddd %%mm0,%%mm4\n\t"
"movd %%mm4,%[ret2]\n\t"
"lea -64(%[ret],%[ret2],2),%[ret]\n\t"
/*Although it looks like we're using 8 registers here, gcc can alias %[ret]
and %[ret2] with some of the inputs, since for once we don't write to
them until after we're done using everything but %[buf] (which is also
listed as an output to ensure gcc _doesn't_ alias them against it).*/
:[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=r"(dc),
[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
:[src]"r"(_src),[src4]"r"(_src+4*_ystride),
[ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
/*We have to use sub, so we actually clobber the condition codes for once
(not to mention add).*/
:"cc"
);
*_dc=dc;
return ret;
}
void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
const unsigned char *_src,const unsigned char *_ref,int _ystride){
int i;
__asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
for(i=4;i-->0;){
__asm__ __volatile__(
/*mm0=[src]*/
"movq (%[src]),%%mm0\n\t"
/*mm1=[ref]*/
"movq (%[ref]),%%mm1\n\t"
/*mm4=[src+ystride]*/
"movq (%[src],%[ystride]),%%mm4\n\t"
/*mm5=[ref+ystride]*/
"movq (%[ref],%[ystride]),%%mm5\n\t"
/*Compute [src]-[ref].*/
"movq %%mm0,%%mm2\n\t"
"punpcklbw %%mm7,%%mm0\n\t"
"movq %%mm1,%%mm3\n\t"
"punpckhbw %%mm7,%%mm2\n\t"
"punpcklbw %%mm7,%%mm1\n\t"
"punpckhbw %%mm7,%%mm3\n\t"
"psubw %%mm1,%%mm0\n\t"
"psubw %%mm3,%%mm2\n\t"
/*Compute [src+ystride]-[ref+ystride].*/
"movq %%mm4,%%mm1\n\t"
"punpcklbw %%mm7,%%mm4\n\t"
"movq %%mm5,%%mm3\n\t"
"punpckhbw %%mm7,%%mm1\n\t"
"lea (%[src],%[ystride],2),%[src]\n\t"
"punpcklbw %%mm7,%%mm5\n\t"
"lea (%[ref],%[ystride],2),%[ref]\n\t"
"punpckhbw %%mm7,%%mm3\n\t"
"psubw %%mm5,%%mm4\n\t"
"psubw %%mm3,%%mm1\n\t"
/*Write the answer out.*/
"movq %%mm0,0x00(%[residue])\n\t"
"movq %%mm2,0x08(%[residue])\n\t"
"movq %%mm4,0x10(%[residue])\n\t"
"movq %%mm1,0x18(%[residue])\n\t"
"lea 0x20(%[residue]),%[residue]\n\t"
:[residue]"+r"(_residue),[src]"+r"(_src),[ref]"+r"(_ref)
:[ystride]"r"((ptrdiff_t)_ystride)
:"memory"
);
}
}
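/*For reference, a portable scalar sketch of oc_enc_frag_sub_mmx (illustration
   only, not used by the build): the 8x8 source/reference difference is
   written to _residue in raster order as 16-bit values; the unrolled MMX loop
   above handles two rows per iteration, unpacking bytes to words against a
   zero register before subtracting.
   oc_enc_frag_sub_128_mmx below is the analogous routine with a constant
   reference value of 128.*/
#if 0
static void oc_enc_frag_sub_ref(ogg_int16_t _residue[64],
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_residue[i*8+j]=(ogg_int16_t)(_src[j]-_ref[j]);
    _src+=_ystride;
    _ref+=_ystride;
  }
}
#endif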
void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
const unsigned char *_src,int _ystride){
ptrdiff_t ystride3;
__asm__ __volatile__(
/*mm0=[src]*/
"movq (%[src]),%%mm0\n\t"
/*mm1=[src+ystride]*/
"movq (%[src],%[ystride]),%%mm1\n\t"
/*mm6={-1}x4*/
"pcmpeqw %%mm6,%%mm6\n\t"
/*mm2=[src+2*ystride]*/
"movq (%[src],%[ystride],2),%%mm2\n\t"
/*[ystride3]=3*[ystride]*/
"lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
/*mm6={32768}x4*/
"psllw $15,%%mm6\n\t"
/*mm3=[src+3*ystride]*/
"movq (%[src],%[ystride3]),%%mm3\n\t"
/*mm6={128}x4*/
"psrlw $8,%%mm6\n\t"
/*mm7=0*/
"pxor %%mm7,%%mm7\n\t"
/*[src]=[src]+4*[ystride]*/
"lea (%[src],%[ystride],4),%[src]\n\t"
/*Compute [src]-128 and [src+ystride]-128*/
"movq %%mm0,%%mm4\n\t"
"punpcklbw %%mm7,%%mm0\n\t"
"movq %%mm1,%%mm5\n\t"
"punpckhbw %%mm7,%%mm4\n\t"
"psubw %%mm6,%%mm0\n\t"
"punpcklbw %%mm7,%%mm1\n\t"
"psubw %%mm6,%%mm4\n\t"
"punpckhbw %%mm7,%%mm5\n\t"
"psubw %%mm6,%%mm1\n\t"
"psubw %%mm6,%%mm5\n\t"
/*Write the answer out.*/
"movq %%mm0,0x00(%[residue])\n\t"
"movq %%mm4,0x08(%[residue])\n\t"
"movq %%mm1,0x10(%[residue])\n\t"
"movq %%mm5,0x18(%[residue])\n\t"
/*mm0=[src+4*ystride]*/
"movq (%[src]),%%mm0\n\t"
/*mm1=[src+5*ystride]*/
"movq (%[src],%[ystride]),%%mm1\n\t"
/*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
"movq %%mm2,%%mm4\n\t"
"punpcklbw %%mm7,%%mm2\n\t"
"movq %%mm3,%%mm5\n\t"
"punpckhbw %%mm7,%%mm4\n\t"
"psubw %%mm6,%%mm2\n\t"
"punpcklbw %%mm7,%%mm3\n\t"
"psubw %%mm6,%%mm4\n\t"
"punpckhbw %%mm7,%%mm5\n\t"
"psubw %%mm6,%%mm3\n\t"
"psubw %%mm6,%%mm5\n\t"
/*Write the answer out.*/
"movq %%mm2,0x20(%[residue])\n\t"
"movq %%mm4,0x28(%[residue])\n\t"
"movq %%mm3,0x30(%[residue])\n\t"
"movq %%mm5,0x38(%[residue])\n\t"
/*mm2=[src+6*ystride]*/
"movq (%[src],%[ystride],2),%%mm2\n\t"
/*mm3=[src+7*ystride]*/
"movq (%[src],%[ystride3]),%%mm3\n\t"
/*Compute [src+4*ystride]-128 and [src+5*ystride]-128*/
"movq %%mm0,%%mm4\n\t"
"punpcklbw %%mm7,%%mm0\n\t"
"movq %%mm1,%%mm5\n\t"
"punpckhbw %%mm7,%%mm4\n\t"
"psubw %%mm6,%%mm0\n\t"
"punpcklbw %%mm7,%%mm1\n\t"
"psubw %%mm6,%%mm4\n\t"
"punpckhbw %%mm7,%%mm5\n\t"
"psubw %%mm6,%%mm1\n\t"
"psubw %%mm6,%%mm5\n\t"
/*Write the answer out.*/
"movq %%mm0,0x40(%[residue])\n\t"
"movq %%mm4,0x48(%[residue])\n\t"
"movq %%mm1,0x50(%[residue])\n\t"
"movq %%mm5,0x58(%[residue])\n\t"
/*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
"movq %%mm2,%%mm4\n\t"
"punpcklbw %%mm7,%%mm2\n\t"
"movq %%mm3,%%mm5\n\t"
"punpckhbw %%mm7,%%mm4\n\t"
"psubw %%mm6,%%mm2\n\t"
"punpcklbw %%mm7,%%mm3\n\t"
"psubw %%mm6,%%mm4\n\t"
"punpckhbw %%mm7,%%mm5\n\t"
"psubw %%mm6,%%mm3\n\t"
"psubw %%mm6,%%mm5\n\t"
/*Write the answer out.*/
"movq %%mm2,0x60(%[residue])\n\t"
"movq %%mm4,0x68(%[residue])\n\t"
"movq %%mm3,0x70(%[residue])\n\t"
"movq %%mm5,0x78(%[residue])\n\t"
:[src]"+r"(_src),[ystride3]"=&r"(ystride3)
:[residue]"r"(_residue),[ystride]"r"((ptrdiff_t)_ystride)
:"memory"
);
}
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
const unsigned char *_src1,const unsigned char *_src2,int _ystride){
oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
}
#endif

thirdparty/libtheora/x86/mmxfdct.c vendored Normal file

@@ -0,0 +1,678 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006 *
* by the Xiph.Org Foundation https://www.xiph.org/ *
* *
********************************************************************/
/*MMX fDCT implementation for x86_32*/
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
#include "x86enc.h"
#include "x86zigzag.h"
#if defined(OC_X86_ASM)
# define OC_FDCT_STAGE1_8x4 \
"#OC_FDCT_STAGE1_8x4\n\t" \
/*Stage 1:*/ \
/*mm0=t7'=t0-t7*/ \
"psubw %%mm7,%%mm0\n\t" \
"paddw %%mm7,%%mm7\n\t" \
/*mm1=t6'=t1-t6*/ \
"psubw %%mm6,%%mm1\n\t" \
"paddw %%mm6,%%mm6\n\t" \
/*mm2=t5'=t2-t5*/ \
"psubw %%mm5,%%mm2\n\t" \
"paddw %%mm5,%%mm5\n\t" \
/*mm3=t4'=t3-t4*/ \
"psubw %%mm4,%%mm3\n\t" \
"paddw %%mm4,%%mm4\n\t" \
/*mm7=t0'=t0+t7*/ \
"paddw %%mm0,%%mm7\n\t" \
/*mm6=t1'=t1+t6*/ \
"paddw %%mm1,%%mm6\n\t" \
/*mm5=t2'=t2+t5*/ \
"paddw %%mm2,%%mm5\n\t" \
/*mm4=t3'=t3+t4*/ \
"paddw %%mm3,%%mm4\n\t" \
# define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \
"#OC_FDCT8x4\n\t" \
/*Stage 2:*/ \
/*mm7=t3''=t0'-t3'*/ \
"psubw %%mm4,%%mm7\n\t" \
"paddw %%mm4,%%mm4\n\t" \
/*mm6=t2''=t1'-t2'*/ \
"psubw %%mm5,%%mm6\n\t" \
"movq %%mm7,"_r6"(%[y])\n\t" \
"paddw %%mm5,%%mm5\n\t" \
/*mm1=t5''=t6'-t5'*/ \
"psubw %%mm2,%%mm1\n\t" \
"movq %%mm6,"_r2"(%[y])\n\t" \
/*mm4=t0''=t0'+t3'*/ \
"paddw %%mm7,%%mm4\n\t" \
"paddw %%mm2,%%mm2\n\t" \
/*mm5=t1''=t1'+t2'*/ \
"movq %%mm4,"_r0"(%[y])\n\t" \
"paddw %%mm6,%%mm5\n\t" \
/*mm2=t6''=t6'+t5'*/ \
"paddw %%mm1,%%mm2\n\t" \
"movq %%mm5,"_r4"(%[y])\n\t" \
/*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \
/*mm4, mm5, mm6, mm7 are free.*/ \
/*Stage 3:*/ \
/*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \
"mov $0x5A806A0A,%[a]\n\t" \
"pcmpeqb %%mm6,%%mm6\n\t" \
"movd %[a],%%mm7\n\t" \
"psrlw $15,%%mm6\n\t" \
"punpckldq %%mm7,%%mm7\n\t" \
"paddw %%mm6,%%mm6\n\t" \
/*mm0=0, m2={-1}x4 \
mm5:mm4=t5''*27146+0xB500*/ \
"movq %%mm1,%%mm4\n\t" \
"movq %%mm1,%%mm5\n\t" \
"punpcklwd %%mm6,%%mm4\n\t" \
"movq %%mm2,"_r3"(%[y])\n\t" \
"pmaddwd %%mm7,%%mm4\n\t" \
"movq %%mm0,"_r7"(%[y])\n\t" \
"punpckhwd %%mm6,%%mm5\n\t" \
"pxor %%mm0,%%mm0\n\t" \
"pmaddwd %%mm7,%%mm5\n\t" \
"pcmpeqb %%mm2,%%mm2\n\t" \
/*mm2=t6'', mm1=t5''+(t5''!=0) \
mm4=(t5''*27146+0xB500>>16)*/ \
"pcmpeqw %%mm1,%%mm0\n\t" \
"psrad $16,%%mm4\n\t" \
"psubw %%mm2,%%mm0\n\t" \
"movq "_r3"(%[y]),%%mm2\n\t" \
"psrad $16,%%mm5\n\t" \
"paddw %%mm0,%%mm1\n\t" \
"packssdw %%mm5,%%mm4\n\t" \
/*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
"paddw %%mm1,%%mm4\n\t" \
"movq "_r7"(%[y]),%%mm0\n\t" \
"psraw $1,%%mm4\n\t" \
"movq %%mm3,%%mm1\n\t" \
/*mm3=t4''=t4'+s*/ \
"paddw %%mm4,%%mm3\n\t" \
/*mm1=t5'''=t4'-s*/ \
"psubw %%mm4,%%mm1\n\t" \
/*mm1=0, mm3={-1}x4 \
mm5:mm4=t6''*27146+0xB500*/ \
"movq %%mm2,%%mm4\n\t" \
"movq %%mm2,%%mm5\n\t" \
"punpcklwd %%mm6,%%mm4\n\t" \
"movq %%mm1,"_r5"(%[y])\n\t" \
"pmaddwd %%mm7,%%mm4\n\t" \
"movq %%mm3,"_r1"(%[y])\n\t" \
"punpckhwd %%mm6,%%mm5\n\t" \
"pxor %%mm1,%%mm1\n\t" \
"pmaddwd %%mm7,%%mm5\n\t" \
"pcmpeqb %%mm3,%%mm3\n\t" \
/*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \
"psrad $16,%%mm4\n\t" \
"pcmpeqw %%mm2,%%mm1\n\t" \
"psrad $16,%%mm5\n\t" \
"psubw %%mm3,%%mm1\n\t" \
"packssdw %%mm5,%%mm4\n\t" \
"paddw %%mm1,%%mm2\n\t" \
/*mm1=t1'' \
mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
"paddw %%mm2,%%mm4\n\t" \
"movq "_r4"(%[y]),%%mm1\n\t" \
"psraw $1,%%mm4\n\t" \
"movq %%mm0,%%mm2\n\t" \
/*mm7={54491-0x7FFF,0x7FFF}x2 \
mm0=t7''=t7'+s*/ \
"paddw %%mm4,%%mm0\n\t" \
/*mm2=t6'''=t7'-s*/ \
"psubw %%mm4,%%mm2\n\t" \
/*Stage 4:*/ \
/*mm0=0, mm2=t0'' \
mm5:mm4=t1''*27146+0xB500*/ \
"movq %%mm1,%%mm4\n\t" \
"movq %%mm1,%%mm5\n\t" \
"punpcklwd %%mm6,%%mm4\n\t" \
"movq %%mm2,"_r3"(%[y])\n\t" \
"pmaddwd %%mm7,%%mm4\n\t" \
"movq "_r0"(%[y]),%%mm2\n\t" \
"punpckhwd %%mm6,%%mm5\n\t" \
"movq %%mm0,"_r7"(%[y])\n\t" \
"pmaddwd %%mm7,%%mm5\n\t" \
"pxor %%mm0,%%mm0\n\t" \
/*mm7={27146,0x4000>>1}x2 \
mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
"psrad $16,%%mm4\n\t" \
"mov $0x20006A0A,%[a]\n\t" \
"pcmpeqw %%mm1,%%mm0\n\t" \
"movd %[a],%%mm7\n\t" \
"psrad $16,%%mm5\n\t" \
"psubw %%mm3,%%mm0\n\t" \
"packssdw %%mm5,%%mm4\n\t" \
"paddw %%mm1,%%mm0\n\t" \
"punpckldq %%mm7,%%mm7\n\t" \
"paddw %%mm4,%%mm0\n\t" \
/*mm6={0x00000E3D}x2 \
mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \
"movq %%mm2,%%mm4\n\t" \
"movq %%mm2,%%mm5\n\t" \
"punpcklwd %%mm6,%%mm4\n\t" \
"mov $0x0E3D,%[a]\n\t" \
"pmaddwd %%mm7,%%mm4\n\t" \
"punpckhwd %%mm6,%%mm5\n\t" \
"movd %[a],%%mm6\n\t" \
"pmaddwd %%mm7,%%mm5\n\t" \
"pxor %%mm1,%%mm1\n\t" \
"punpckldq %%mm6,%%mm6\n\t" \
"pcmpeqw %%mm2,%%mm1\n\t" \
/*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
"psrad $16,%%mm4\n\t" \
"psubw %%mm3,%%mm1\n\t" \
"psrad $16,%%mm5\n\t" \
"paddw %%mm1,%%mm2\n\t" \
"packssdw %%mm5,%%mm4\n\t" \
"movq "_r5"(%[y]),%%mm1\n\t" \
"paddw %%mm2,%%mm4\n\t" \
/*mm2=t6'', mm0=_y[0]=u=r+s>>1 \
The naive implementation could cause overflow, so we use \
u=(r&s)+((r^s)>>1).*/ \
"movq "_r3"(%[y]),%%mm2\n\t" \
"movq %%mm0,%%mm7\n\t" \
"pxor %%mm4,%%mm0\n\t" \
"pand %%mm4,%%mm7\n\t" \
"psraw $1,%%mm0\n\t" \
"mov $0x7FFF54DC,%[a]\n\t" \
"paddw %%mm7,%%mm0\n\t" \
"movd %[a],%%mm7\n\t" \
/*mm7={54491-0x7FFF,0x7FFF}x2 \
mm4=_y[4]=v=r-u*/ \
"psubw %%mm0,%%mm4\n\t" \
"punpckldq %%mm7,%%mm7\n\t" \
"movq %%mm4,"_r4"(%[y])\n\t" \
/*mm0=0, mm7={36410}x4 \
mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \
"movq %%mm1,%%mm4\n\t" \
"movq %%mm1,%%mm5\n\t" \
"punpcklwd %%mm1,%%mm4\n\t" \
"mov $0x8E3A8E3A,%[a]\n\t" \
"pmaddwd %%mm7,%%mm4\n\t" \
"movq %%mm0,"_r0"(%[y])\n\t" \
"punpckhwd %%mm1,%%mm5\n\t" \
"pxor %%mm0,%%mm0\n\t" \
"pmaddwd %%mm7,%%mm5\n\t" \
"pcmpeqw %%mm0,%%mm1\n\t" \
"movd %[a],%%mm7\n\t" \
"psubw %%mm3,%%mm1\n\t" \
"punpckldq %%mm7,%%mm7\n\t" \
"paddd %%mm6,%%mm4\n\t" \
"paddd %%mm6,%%mm5\n\t" \
/*mm0=0 \
mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \
"movq %%mm2,%%mm6\n\t" \
"movq %%mm2,%%mm3\n\t" \
"pmulhw %%mm7,%%mm6\n\t" \
"paddw %%mm2,%%mm1\n\t" \
"pmullw %%mm7,%%mm3\n\t" \
"pxor %%mm0,%%mm0\n\t" \
"paddw %%mm1,%%mm6\n\t" \
"movq %%mm3,%%mm1\n\t" \
"punpckhwd %%mm6,%%mm3\n\t" \
"punpcklwd %%mm6,%%mm1\n\t" \
/*mm3={-1}x4, mm6={1}x4 \
mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
"paddd %%mm3,%%mm5\n\t" \
"paddd %%mm1,%%mm4\n\t" \
"psrad $16,%%mm5\n\t" \
"pxor %%mm6,%%mm6\n\t" \
"psrad $16,%%mm4\n\t" \
"pcmpeqb %%mm3,%%mm3\n\t" \
"packssdw %%mm5,%%mm4\n\t" \
"psubw %%mm3,%%mm6\n\t" \
/*mm1=t7'', mm7={26568,0x3400}x2 \
mm2=s=t6'''-(36410*u>>16)*/ \
"movq %%mm4,%%mm1\n\t" \
"mov $0x340067C8,%[a]\n\t" \
"pmulhw %%mm7,%%mm4\n\t" \
"movd %[a],%%mm7\n\t" \
"movq %%mm1,"_r5"(%[y])\n\t" \
"punpckldq %%mm7,%%mm7\n\t" \
"paddw %%mm1,%%mm4\n\t" \
"movq "_r7"(%[y]),%%mm1\n\t" \
"psubw %%mm4,%%mm2\n\t" \
/*mm6={0x00007B1B}x2 \
mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \
"movq %%mm2,%%mm4\n\t" \
"movq %%mm2,%%mm5\n\t" \
"punpcklwd %%mm6,%%mm4\n\t" \
"pcmpeqw %%mm2,%%mm0\n\t" \
"pmaddwd %%mm7,%%mm4\n\t" \
"mov $0x7B1B,%[a]\n\t" \
"punpckhwd %%mm6,%%mm5\n\t" \
"movd %[a],%%mm6\n\t" \
"pmaddwd %%mm7,%%mm5\n\t" \
"psubw %%mm3,%%mm0\n\t" \
"punpckldq %%mm6,%%mm6\n\t" \
/*mm7={64277-0x7FFF,0x7FFF}x2 \
mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
"psrad $17,%%mm4\n\t" \
"paddw %%mm0,%%mm2\n\t" \
"psrad $17,%%mm5\n\t" \
"mov $0x7FFF7B16,%[a]\n\t" \
"packssdw %%mm5,%%mm4\n\t" \
"movd %[a],%%mm7\n\t" \
"paddw %%mm4,%%mm2\n\t" \
"punpckldq %%mm7,%%mm7\n\t" \
/*mm0=0, mm7={12785}x4 \
mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \
"movq %%mm1,%%mm4\n\t" \
"movq %%mm1,%%mm5\n\t" \
"movq %%mm2,"_r3"(%[y])\n\t" \
"punpcklwd %%mm1,%%mm4\n\t" \
"movq "_r1"(%[y]),%%mm2\n\t" \
"pmaddwd %%mm7,%%mm4\n\t" \
"mov $0x31F131F1,%[a]\n\t" \
"punpckhwd %%mm1,%%mm5\n\t" \
"pxor %%mm0,%%mm0\n\t" \
"pmaddwd %%mm7,%%mm5\n\t" \
"pcmpeqw %%mm0,%%mm1\n\t" \
"movd %[a],%%mm7\n\t" \
"psubw %%mm3,%%mm1\n\t" \
"punpckldq %%mm7,%%mm7\n\t" \
"paddd %%mm6,%%mm4\n\t" \
"paddd %%mm6,%%mm5\n\t" \
/*mm3:mm1=12785*t4'''+((t7''!=0)<<16)*/ \
"movq %%mm2,%%mm6\n\t" \
"movq %%mm2,%%mm3\n\t" \
"pmulhw %%mm7,%%mm6\n\t" \
"pmullw %%mm7,%%mm3\n\t" \
"paddw %%mm1,%%mm6\n\t" \
"movq %%mm3,%%mm1\n\t" \
"punpckhwd %%mm6,%%mm3\n\t" \
"punpcklwd %%mm6,%%mm1\n\t" \
/*mm3={-1}x4, mm6={1}x4 \
mm4=_y[1]=u=(12785*t4'''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
"paddd %%mm3,%%mm5\n\t" \
"paddd %%mm1,%%mm4\n\t" \
"psrad $16,%%mm5\n\t" \
"pxor %%mm6,%%mm6\n\t" \
"psrad $16,%%mm4\n\t" \
"pcmpeqb %%mm3,%%mm3\n\t" \
"packssdw %%mm5,%%mm4\n\t" \
"psubw %%mm3,%%mm6\n\t" \
/*mm1=t3'', mm7={20539,0x3000}x2 \
mm4=s=(12785*u>>16)-t4''*/ \
"movq %%mm4,"_r1"(%[y])\n\t" \
"pmulhw %%mm7,%%mm4\n\t" \
"mov $0x3000503B,%[a]\n\t" \
"movq "_r6"(%[y]),%%mm1\n\t" \
"movd %[a],%%mm7\n\t" \
"psubw %%mm2,%%mm4\n\t" \
"punpckldq %%mm7,%%mm7\n\t" \
/*mm6={0x00006CB7}x2 \
mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \
"movq %%mm4,%%mm5\n\t" \
"movq %%mm4,%%mm2\n\t" \
"punpcklwd %%mm6,%%mm4\n\t" \
"pcmpeqw %%mm2,%%mm0\n\t" \
"pmaddwd %%mm7,%%mm4\n\t" \
"mov $0x6CB7,%[a]\n\t" \
"punpckhwd %%mm6,%%mm5\n\t" \
"movd %[a],%%mm6\n\t" \
"pmaddwd %%mm7,%%mm5\n\t" \
"psubw %%mm3,%%mm0\n\t" \
"punpckldq %%mm6,%%mm6\n\t" \
/*mm7={60547-0x7FFF,0x7FFF}x2 \
mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
"psrad $20,%%mm4\n\t" \
"paddw %%mm0,%%mm2\n\t" \
"psrad $20,%%mm5\n\t" \
"mov $0x7FFF6C84,%[a]\n\t" \
"packssdw %%mm5,%%mm4\n\t" \
"movd %[a],%%mm7\n\t" \
"paddw %%mm4,%%mm2\n\t" \
"punpckldq %%mm7,%%mm7\n\t" \
/*mm0=0, mm7={25080}x4 \
mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \
"movq %%mm1,%%mm4\n\t" \
"movq %%mm1,%%mm5\n\t" \
"movq %%mm2,"_r7"(%[y])\n\t" \
"punpcklwd %%mm1,%%mm4\n\t" \
"movq "_r2"(%[y]),%%mm2\n\t" \
"pmaddwd %%mm7,%%mm4\n\t" \
"mov $0x61F861F8,%[a]\n\t" \
"punpckhwd %%mm1,%%mm5\n\t" \
"pxor %%mm0,%%mm0\n\t" \
"pmaddwd %%mm7,%%mm5\n\t" \
"movd %[a],%%mm7\n\t" \
"pcmpeqw %%mm0,%%mm1\n\t" \
"psubw %%mm3,%%mm1\n\t" \
"punpckldq %%mm7,%%mm7\n\t" \
"paddd %%mm6,%%mm4\n\t" \
"paddd %%mm6,%%mm5\n\t" \
/*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \
"movq %%mm2,%%mm6\n\t" \
"movq %%mm2,%%mm3\n\t" \
"pmulhw %%mm7,%%mm6\n\t" \
"pmullw %%mm7,%%mm3\n\t" \
"paddw %%mm1,%%mm6\n\t" \
"movq %%mm3,%%mm1\n\t" \
"punpckhwd %%mm6,%%mm3\n\t" \
"punpcklwd %%mm6,%%mm1\n\t" \
/*mm1={-1}x4 \
mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
"paddd %%mm3,%%mm5\n\t" \
"paddd %%mm1,%%mm4\n\t" \
"psrad $16,%%mm5\n\t" \
"mov $0x28005460,%[a]\n\t" \
"psrad $16,%%mm4\n\t" \
"pcmpeqb %%mm1,%%mm1\n\t" \
"packssdw %%mm5,%%mm4\n\t" \
/*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \
mm4=s=(25080*u>>16)-t2''*/ \
"movq %%mm4,%%mm6\n\t" \
"pmulhw %%mm7,%%mm4\n\t" \
"pxor %%mm5,%%mm5\n\t" \
"movd %[a],%%mm7\n\t" \
"psubw %%mm1,%%mm5\n\t" \
"punpckldq %%mm7,%%mm7\n\t" \
"psubw %%mm2,%%mm4\n\t" \
/*mm2=s+(s!=0) \
mm4:mm3=s*21600+0x2800*/ \
"movq %%mm4,%%mm3\n\t" \
"movq %%mm4,%%mm2\n\t" \
"punpckhwd %%mm5,%%mm4\n\t" \
"pcmpeqw %%mm2,%%mm0\n\t" \
"pmaddwd %%mm7,%%mm4\n\t" \
"psubw %%mm1,%%mm0\n\t" \
"punpcklwd %%mm5,%%mm3\n\t" \
"paddw %%mm0,%%mm2\n\t" \
"pmaddwd %%mm7,%%mm3\n\t" \
/*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \
mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
"movq "_r4"(%[y]),%%mm0\n\t" \
"psrad $18,%%mm4\n\t" \
"movq "_r5"(%[y]),%%mm5\n\t" \
"psrad $18,%%mm3\n\t" \
"movq "_r7"(%[y]),%%mm1\n\t" \
"packssdw %%mm4,%%mm3\n\t" \
"movq "_r0"(%[y]),%%mm4\n\t" \
"paddw %%mm2,%%mm3\n\t" \
/*On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7].
On output, {_y[4],mm1,mm2,mm3} contains the transpose of _y[4...7] and
{mm4,mm5,mm6,mm7} contains the transpose of _y[0...3].*/
# define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \
"#OC_TRANSPOSE8x4\n\t" \
/*First 4x4 transpose:*/ \
/*mm0 = e3 e2 e1 e0 \
mm5 = f3 f2 f1 f0 \
mm3 = g3 g2 g1 g0 \
mm1 = h3 h2 h1 h0*/ \
"movq %%mm0,%%mm2\n\t" \
"punpcklwd %%mm5,%%mm0\n\t" \
"punpckhwd %%mm5,%%mm2\n\t" \
"movq %%mm3,%%mm5\n\t" \
"punpcklwd %%mm1,%%mm3\n\t" \
"punpckhwd %%mm1,%%mm5\n\t" \
/*mm0 = f1 e1 f0 e0 \
mm2 = f3 e3 f2 e2 \
mm3 = h1 g1 h0 g0 \
mm5 = h3 g3 h2 g2*/ \
"movq %%mm0,%%mm1\n\t" \
"punpckldq %%mm3,%%mm0\n\t" \
"movq %%mm0,"_r4"(%[y])\n\t" \
"punpckhdq %%mm3,%%mm1\n\t" \
"movq "_r1"(%[y]),%%mm0\n\t" \
"movq %%mm2,%%mm3\n\t" \
"punpckldq %%mm5,%%mm2\n\t" \
"punpckhdq %%mm5,%%mm3\n\t" \
"movq "_r3"(%[y]),%%mm5\n\t" \
/*_y[4] = h0 g0 f0 e0 \
mm1 = h1 g1 f1 e1 \
mm2 = h2 g2 f2 e2 \
mm3 = h3 g3 f3 e3*/ \
/*Second 4x4 transpose:*/ \
/*mm4 = a3 a2 a1 a0 \
mm0 = b3 b2 b1 b0 \
mm6 = c3 c2 c1 c0 \
mm5 = d3 d2 d1 d0*/ \
"movq %%mm4,%%mm7\n\t" \
"punpcklwd %%mm0,%%mm4\n\t" \
"punpckhwd %%mm0,%%mm7\n\t" \
"movq %%mm6,%%mm0\n\t" \
"punpcklwd %%mm5,%%mm6\n\t" \
"punpckhwd %%mm5,%%mm0\n\t" \
/*mm4 = b1 a1 b0 a0 \
mm7 = b3 a3 b2 a2 \
mm6 = d1 c1 d0 c0 \
mm0 = d3 c3 d2 c2*/ \
"movq %%mm4,%%mm5\n\t" \
"punpckldq %%mm6,%%mm4\n\t" \
"punpckhdq %%mm6,%%mm5\n\t" \
"movq %%mm7,%%mm6\n\t" \
"punpckhdq %%mm0,%%mm7\n\t" \
"punpckldq %%mm0,%%mm6\n\t" \
/*mm4 = d0 c0 b0 a0 \
mm5 = d1 c1 b1 a1 \
mm6 = d2 c2 b2 a2 \
mm7 = d3 c3 b3 a3*/ \
/*MMX implementation of the fDCT.*/
void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
OC_ALIGN8(ogg_int16_t buf[64]);
ptrdiff_t a;
__asm__ __volatile__(
/*Add two extra bits of working precision to improve accuracy; any more and
we could overflow.*/
/*We also add biases to correct for some systematic error that remains in
the full fDCT->iDCT round trip.*/
"movq 0x00(%[x]),%%mm0\n\t"
"movq 0x10(%[x]),%%mm1\n\t"
"movq 0x20(%[x]),%%mm2\n\t"
"movq 0x30(%[x]),%%mm3\n\t"
"pcmpeqb %%mm4,%%mm4\n\t"
"pxor %%mm7,%%mm7\n\t"
"movq %%mm0,%%mm5\n\t"
"psllw $2,%%mm0\n\t"
"pcmpeqw %%mm7,%%mm5\n\t"
"movq 0x70(%[x]),%%mm7\n\t"
"psllw $2,%%mm1\n\t"
"psubw %%mm4,%%mm5\n\t"
"psllw $2,%%mm2\n\t"
"mov $1,%[a]\n\t"
"pslld $16,%%mm5\n\t"
"movd %[a],%%mm6\n\t"
"psllq $16,%%mm5\n\t"
"mov $0x10001,%[a]\n\t"
"psllw $2,%%mm3\n\t"
"movd %[a],%%mm4\n\t"
"punpckhwd %%mm6,%%mm5\n\t"
"psubw %%mm6,%%mm1\n\t"
"movq 0x60(%[x]),%%mm6\n\t"
"paddw %%mm5,%%mm0\n\t"
"movq 0x50(%[x]),%%mm5\n\t"
"paddw %%mm4,%%mm0\n\t"
"movq 0x40(%[x]),%%mm4\n\t"
/*We inline stage1 of the transform here so we can get better instruction
scheduling with the shifts.*/
/*mm0=t7'=t0-t7*/
"psllw $2,%%mm7\n\t"
"psubw %%mm7,%%mm0\n\t"
"psllw $2,%%mm6\n\t"
"paddw %%mm7,%%mm7\n\t"
/*mm1=t6'=t1-t6*/
"psllw $2,%%mm5\n\t"
"psubw %%mm6,%%mm1\n\t"
"psllw $2,%%mm4\n\t"
"paddw %%mm6,%%mm6\n\t"
/*mm2=t5'=t2-t5*/
"psubw %%mm5,%%mm2\n\t"
"paddw %%mm5,%%mm5\n\t"
/*mm3=t4'=t3-t4*/
"psubw %%mm4,%%mm3\n\t"
"paddw %%mm4,%%mm4\n\t"
/*mm7=t0'=t0+t7*/
"paddw %%mm0,%%mm7\n\t"
/*mm6=t1'=t1+t6*/
"paddw %%mm1,%%mm6\n\t"
/*mm5=t2'=t2+t5*/
"paddw %%mm2,%%mm5\n\t"
/*mm4=t3'=t3+t4*/
"paddw %%mm3,%%mm4\n\t"
OC_FDCT8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")
OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")
/*Swap out this 8x4 block for the next one.*/
"movq 0x08(%[x]),%%mm0\n\t"
"movq %%mm7,0x30(%[y])\n\t"
"movq 0x78(%[x]),%%mm7\n\t"
"movq %%mm1,0x50(%[y])\n\t"
"movq 0x18(%[x]),%%mm1\n\t"
"movq %%mm6,0x20(%[y])\n\t"
"movq 0x68(%[x]),%%mm6\n\t"
"movq %%mm2,0x60(%[y])\n\t"
"movq 0x28(%[x]),%%mm2\n\t"
"movq %%mm5,0x10(%[y])\n\t"
"movq 0x58(%[x]),%%mm5\n\t"
"movq %%mm3,0x70(%[y])\n\t"
"movq 0x38(%[x]),%%mm3\n\t"
/*And increase its working precision, too.*/
"psllw $2,%%mm0\n\t"
"movq %%mm4,0x00(%[y])\n\t"
"psllw $2,%%mm7\n\t"
"movq 0x48(%[x]),%%mm4\n\t"
/*We inline stage1 of the transform here so we can get better instruction
scheduling with the shifts.*/
/*mm0=t7'=t0-t7*/
"psubw %%mm7,%%mm0\n\t"
"psllw $2,%%mm1\n\t"
"paddw %%mm7,%%mm7\n\t"
"psllw $2,%%mm6\n\t"
/*mm1=t6'=t1-t6*/
"psubw %%mm6,%%mm1\n\t"
"psllw $2,%%mm2\n\t"
"paddw %%mm6,%%mm6\n\t"
"psllw $2,%%mm5\n\t"
/*mm2=t5'=t2-t5*/
"psubw %%mm5,%%mm2\n\t"
"psllw $2,%%mm3\n\t"
"paddw %%mm5,%%mm5\n\t"
"psllw $2,%%mm4\n\t"
/*mm3=t4'=t3-t4*/
"psubw %%mm4,%%mm3\n\t"
"paddw %%mm4,%%mm4\n\t"
/*mm7=t0'=t0+t7*/
"paddw %%mm0,%%mm7\n\t"
/*mm6=t1'=t1+t6*/
"paddw %%mm1,%%mm6\n\t"
/*mm5=t2'=t2+t5*/
"paddw %%mm2,%%mm5\n\t"
/*mm4=t3'=t3+t4*/
"paddw %%mm3,%%mm4\n\t"
OC_FDCT8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")
OC_TRANSPOSE8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")
/*Here the first 4x4 block of output from the last transpose is the second
4x4 block of input for the next transform.
We have cleverly arranged that it already be in the appropriate place,
so we only have to do half the stores and loads.*/
"movq 0x00(%[y]),%%mm0\n\t"
"movq %%mm1,0x58(%[y])\n\t"
"movq 0x10(%[y]),%%mm1\n\t"
"movq %%mm2,0x68(%[y])\n\t"
"movq 0x20(%[y]),%%mm2\n\t"
"movq %%mm3,0x78(%[y])\n\t"
"movq 0x30(%[y]),%%mm3\n\t"
OC_FDCT_STAGE1_8x4
OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
/*mm2={-2}x4*/
"pcmpeqw %%mm2,%%mm2\n\t"
"paddw %%mm2,%%mm2\n\t"
/*Round and store the results (no transpose).*/
"movq 0x10(%[y]),%%mm7\n\t"
"psubw %%mm2,%%mm4\n\t"
"psubw %%mm2,%%mm6\n\t"
"psraw $2,%%mm4\n\t"
"psubw %%mm2,%%mm0\n\t"
"movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
"movq 0x30(%[y]),%%mm4\n\t"
"psraw $2,%%mm6\n\t"
"psubw %%mm2,%%mm5\n\t"
"movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
"psraw $2,%%mm0\n\t"
"psubw %%mm2,%%mm3\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x40,buf)"\n\t"
"psraw $2,%%mm5\n\t"
"psubw %%mm2,%%mm1\n\t"
"movq %%mm5,"OC_MEM_OFFS(0x50,buf)"\n\t"
"psraw $2,%%mm3\n\t"
"psubw %%mm2,%%mm7\n\t"
"movq %%mm3,"OC_MEM_OFFS(0x60,buf)"\n\t"
"psraw $2,%%mm1\n\t"
"psubw %%mm2,%%mm4\n\t"
"movq %%mm1,"OC_MEM_OFFS(0x70,buf)"\n\t"
"psraw $2,%%mm7\n\t"
"movq %%mm7,"OC_MEM_OFFS(0x10,buf)"\n\t"
"psraw $2,%%mm4\n\t"
"movq %%mm4,"OC_MEM_OFFS(0x30,buf)"\n\t"
/*Load the next block.*/
"movq 0x40(%[y]),%%mm0\n\t"
"movq 0x78(%[y]),%%mm7\n\t"
"movq 0x50(%[y]),%%mm1\n\t"
"movq 0x68(%[y]),%%mm6\n\t"
"movq 0x60(%[y]),%%mm2\n\t"
"movq 0x58(%[y]),%%mm5\n\t"
"movq 0x70(%[y]),%%mm3\n\t"
"movq 0x48(%[y]),%%mm4\n\t"
OC_FDCT_STAGE1_8x4
OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
/*mm2={-2}x4*/
"pcmpeqw %%mm2,%%mm2\n\t"
"paddw %%mm2,%%mm2\n\t"
/*Round and store the results (no transpose).*/
"movq 0x50(%[y]),%%mm7\n\t"
"psubw %%mm2,%%mm4\n\t"
"psubw %%mm2,%%mm6\n\t"
"psraw $2,%%mm4\n\t"
"psubw %%mm2,%%mm0\n\t"
"movq %%mm4,"OC_MEM_OFFS(0x08,buf)"\n\t"
"movq 0x70(%[y]),%%mm4\n\t"
"psraw $2,%%mm6\n\t"
"psubw %%mm2,%%mm5\n\t"
"movq %%mm6,"OC_MEM_OFFS(0x28,buf)"\n\t"
"psraw $2,%%mm0\n\t"
"psubw %%mm2,%%mm3\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x48,buf)"\n\t"
"psraw $2,%%mm5\n\t"
"psubw %%mm2,%%mm1\n\t"
"movq %%mm5,"OC_MEM_OFFS(0x58,buf)"\n\t"
"psraw $2,%%mm3\n\t"
"psubw %%mm2,%%mm7\n\t"
"movq %%mm3,"OC_MEM_OFFS(0x68,buf)"\n\t"
"psraw $2,%%mm1\n\t"
"psubw %%mm2,%%mm4\n\t"
"movq %%mm1,"OC_MEM_OFFS(0x78,buf)"\n\t"
"psraw $2,%%mm7\n\t"
"movq %%mm7,"OC_MEM_OFFS(0x18,buf)"\n\t"
"psraw $2,%%mm4\n\t"
"movq %%mm4,"OC_MEM_OFFS(0x38,buf)"\n\t"
/*Final transpose and zig-zag.*/
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
"movq "OC_MEM_OFFS(16*_row,buf)","_reg"\n\t" \
#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
"movq "OC_MEM_OFFS(16*_row+8,buf)","_reg"\n\t" \
OC_TRANSPOSE_ZIG_ZAG_MMXEXT
#undef OC_ZZ_LOAD_ROW_LO
#undef OC_ZZ_LOAD_ROW_HI
:[a]"=&r"(a),[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
:[y]"r"(_y),[x]"r"(_x)
:"memory"
);
}
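/*For reference, a scalar sketch of the fixed-point scaling used in Stage 3
   of OC_FDCT8x4 above (illustration only, not used by the build):
   27146/65536 is approximately sqrt(2)-1, so
   ((t*27146+0xB500>>16)+t+(t!=0))>>1 is approximately t*cos(pi/4)=t/sqrt(2),
   evaluated without leaving 16-bit lanes; the 0xB500 bias and the +(t!=0)
   term adjust the rounding.*/
#if 0
static ogg_int16_t oc_mul_cos_pi_4_ref(ogg_int16_t _t){
  ogg_int32_t s;
  s=(_t*27146+0xB500)>>16;
  return (ogg_int16_t)((s+_t+(_t!=0))>>1);
}
#endif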
#endif

thirdparty/libtheora/x86/mmxfrag.c vendored Normal file

@@ -0,0 +1,368 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009,2025 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
/*MMX acceleration of fragment reconstruction for motion compensation.
Originally written by Rudolf Marek.
Additional optimization by Nils Pipenbrinck.
Note: Loops are unrolled for best performance.
The iteration each instruction belongs to is marked in the comments as #i.*/
#include <stddef.h>
#include "x86int.h"
#if defined(OC_X86_ASM)
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
between rows.*/
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
do{ \
const unsigned char *src; \
unsigned char *dst; \
ptrdiff_t ystride3; \
src=(_src); \
dst=(_dst); \
__asm__ __volatile__( \
/*src+0*ystride*/ \
"movq (%[src]),%%mm0\n\t" \
/*src+1*ystride*/ \
"movq (%[src],%[ystride]),%%mm1\n\t" \
/*ystride3=ystride*3*/ \
"lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
/*src+2*ystride*/ \
"movq (%[src],%[ystride],2),%%mm2\n\t" \
/*src+3*ystride*/ \
"movq (%[src],%[ystride3]),%%mm3\n\t" \
/*dst+0*ystride*/ \
"movq %%mm0,(%[dst])\n\t" \
/*dst+1*ystride*/ \
"movq %%mm1,(%[dst],%[ystride])\n\t" \
/*Pointer to next 4.*/ \
"lea (%[src],%[ystride],4),%[src]\n\t" \
/*dst+2*ystride*/ \
"movq %%mm2,(%[dst],%[ystride],2)\n\t" \
/*dst+3*ystride*/ \
"movq %%mm3,(%[dst],%[ystride3])\n\t" \
/*Pointer to next 4.*/ \
"lea (%[dst],%[ystride],4),%[dst]\n\t" \
/*src+0*ystride*/ \
"movq (%[src]),%%mm0\n\t" \
/*src+1*ystride*/ \
"movq (%[src],%[ystride]),%%mm1\n\t" \
/*src+2*ystride*/ \
"movq (%[src],%[ystride],2),%%mm2\n\t" \
/*src+3*ystride*/ \
"movq (%[src],%[ystride3]),%%mm3\n\t" \
/*dst+0*ystride*/ \
"movq %%mm0,(%[dst])\n\t" \
/*dst+1*ystride*/ \
"movq %%mm1,(%[dst],%[ystride])\n\t" \
/*dst+2*ystride*/ \
"movq %%mm2,(%[dst],%[ystride],2)\n\t" \
/*dst+3*ystride*/ \
"movq %%mm3,(%[dst],%[ystride3])\n\t" \
:[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
:[ystride]"r"((ptrdiff_t)(_ystride)) \
:"memory" \
); \
} \
while(0)
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
between rows.*/
void oc_frag_copy_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride){
OC_FRAG_COPY_MMX(_dst,_src,_ystride);
}
/*Copies the fragments specified by the lists of fragment indices from one
frame to another.
_dst_frame: The reference frame to copy to.
_src_frame: The reference frame to copy from.
_ystride: The row stride of the reference frames.
_fragis: A pointer to a list of fragment indices.
_nfragis: The number of fragment indices to copy.
_frag_buf_offs: The offsets of fragments in the reference frames.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
const unsigned char *_src_frame,int _ystride,
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
ptrdiff_t fragii;
for(fragii=0;fragii<_nfragis;fragii++){
ptrdiff_t frag_buf_off;
frag_buf_off=_frag_buf_offs[_fragis[fragii]];
OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
_src_frame+frag_buf_off,_ystride);
}
}
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
const ogg_int16_t *_residue){
__asm__ __volatile__(
/*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
"pcmpeqw %%mm0,%%mm0\n\t"
/*#0 Load low residue.*/
"movq 0*8(%[residue]),%%mm1\n\t"
/*#0 Load high residue.*/
"movq 1*8(%[residue]),%%mm2\n\t"
/*Set mm0 to 0x8000800080008000.*/
"psllw $15,%%mm0\n\t"
/*#1 Load low residue.*/
"movq 2*8(%[residue]),%%mm3\n\t"
/*#1 Load high residue.*/
"movq 3*8(%[residue]),%%mm4\n\t"
/*Set mm0 to 0x0080008000800080.*/
"psrlw $8,%%mm0\n\t"
/*#2 Load low residue.*/
"movq 4*8(%[residue]),%%mm5\n\t"
/*#2 Load high residue.*/
"movq 5*8(%[residue]),%%mm6\n\t"
/*#0 Bias low residue.*/
"paddsw %%mm0,%%mm1\n\t"
/*#0 Bias high residue.*/
"paddsw %%mm0,%%mm2\n\t"
/*#0 Pack to byte.*/
"packuswb %%mm2,%%mm1\n\t"
/*#1 Bias low residue.*/
"paddsw %%mm0,%%mm3\n\t"
/*#1 Bias high residue.*/
"paddsw %%mm0,%%mm4\n\t"
/*#1 Pack to byte.*/
"packuswb %%mm4,%%mm3\n\t"
/*#2 Bias low residue.*/
"paddsw %%mm0,%%mm5\n\t"
/*#2 Bias high residue.*/
"paddsw %%mm0,%%mm6\n\t"
/*#2 Pack to byte.*/
"packuswb %%mm6,%%mm5\n\t"
/*#0 Write row.*/
"movq %%mm1,(%[dst])\n\t"
/*#1 Write row.*/
"movq %%mm3,(%[dst],%[ystride])\n\t"
/*#2 Write row.*/
"movq %%mm5,(%[dst],%[ystride],2)\n\t"
/*#3 Load low residue.*/
"movq 6*8(%[residue]),%%mm1\n\t"
/*#3 Load high residue.*/
"movq 7*8(%[residue]),%%mm2\n\t"
/*#4 Load low residue.*/
"movq 8*8(%[residue]),%%mm3\n\t"
/*#4 Load high residue.*/
"movq 9*8(%[residue]),%%mm4\n\t"
/*#5 Load low residue.*/
"movq 10*8(%[residue]),%%mm5\n\t"
/*#5 Load high residue.*/
"movq 11*8(%[residue]),%%mm6\n\t"
/*#3 Bias low residue.*/
"paddsw %%mm0,%%mm1\n\t"
/*#3 Bias high residue.*/
"paddsw %%mm0,%%mm2\n\t"
/*#3 Pack to byte.*/
"packuswb %%mm2,%%mm1\n\t"
/*#4 Bias low residue.*/
"paddsw %%mm0,%%mm3\n\t"
/*#4 Bias high residue.*/
"paddsw %%mm0,%%mm4\n\t"
/*#4 Pack to byte.*/
"packuswb %%mm4,%%mm3\n\t"
/*#5 Bias low residue.*/
"paddsw %%mm0,%%mm5\n\t"
/*#5 Bias high residue.*/
"paddsw %%mm0,%%mm6\n\t"
/*#5 Pack to byte.*/
"packuswb %%mm6,%%mm5\n\t"
/*#3 Write row.*/
"movq %%mm1,(%[dst],%[ystride3])\n\t"
/*#4 Write row.*/
"movq %%mm3,(%[dst4])\n\t"
/*#5 Write row.*/
"movq %%mm5,(%[dst4],%[ystride])\n\t"
/*#6 Load low residue.*/
"movq 12*8(%[residue]),%%mm1\n\t"
/*#6 Load high residue.*/
"movq 13*8(%[residue]),%%mm2\n\t"
/*#7 Load low residue.*/
"movq 14*8(%[residue]),%%mm3\n\t"
/*#7 Load high residue.*/
"movq 15*8(%[residue]),%%mm4\n\t"
/*#6 Bias low residue.*/
"paddsw %%mm0,%%mm1\n\t"
/*#6 Bias high residue.*/
"paddsw %%mm0,%%mm2\n\t"
/*#6 Pack to byte.*/
"packuswb %%mm2,%%mm1\n\t"
/*#7 Bias low residue.*/
"paddsw %%mm0,%%mm3\n\t"
/*#7 Bias high residue.*/
"paddsw %%mm0,%%mm4\n\t"
/*#7 Pack to byte.*/
"packuswb %%mm4,%%mm3\n\t"
/*#6 Write row.*/
"movq %%mm1,(%[dst4],%[ystride],2)\n\t"
/*#7 Write row.*/
"movq %%mm3,(%[dst4],%[ystride3])\n\t"
:
:[residue]"r"(_residue),
[dst]"r"(_dst),
[dst4]"r"(_dst+(_ystride*4)),
[ystride]"r"((ptrdiff_t)_ystride),
[ystride3]"r"((ptrdiff_t)_ystride*3)
:"memory"
);
}
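/*For reference, a scalar sketch of oc_frag_recon_intra_mmx (illustration
   only, not used by the build): each output pixel is the corresponding
   residue value plus 128, clamped to [0,255].
   The MMX version above gets the clamping for free by adding a {128}x4 bias
   with signed saturation and packing to bytes with unsigned saturation.*/
#if 0
static void oc_frag_recon_intra_ref(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      p=_residue[i*8+j]+128;
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
  }
}
#endif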
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
int _ystride,const ogg_int16_t *_residue){
int i;
/*Zero mm0.*/
__asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
for(i=4;i-->0;){
__asm__ __volatile__(
/*#0 Load source.*/
"movq (%[src]),%%mm3\n\t"
/*#1 Load source.*/
"movq (%[src],%[ystride]),%%mm7\n\t"
/*#0 Get copy of src.*/
"movq %%mm3,%%mm4\n\t"
/*#0 Expand high source.*/
"punpckhbw %%mm0,%%mm4\n\t"
/*#0 Expand low source.*/
"punpcklbw %%mm0,%%mm3\n\t"
/*#0 Add residue high.*/
"paddsw 8(%[residue]),%%mm4\n\t"
/*#1 Get copy of src.*/
"movq %%mm7,%%mm2\n\t"
/*#0 Add residue low.*/
"paddsw (%[residue]), %%mm3\n\t"
/*#1 Expand high source.*/
"punpckhbw %%mm0,%%mm2\n\t"
/*#0 Pack final row pixels.*/
"packuswb %%mm4,%%mm3\n\t"
/*#1 Expand low source.*/
"punpcklbw %%mm0,%%mm7\n\t"
/*#1 Add residue low.*/
"paddsw 16(%[residue]),%%mm7\n\t"
/*#1 Add residue high.*/
"paddsw 24(%[residue]),%%mm2\n\t"
/*Advance residue.*/
"lea 32(%[residue]),%[residue]\n\t"
/*#1 Pack final row pixels.*/
"packuswb %%mm2,%%mm7\n\t"
/*Advance src.*/
"lea (%[src],%[ystride],2),%[src]\n\t"
/*#0 Write row.*/
"movq %%mm3,(%[dst])\n\t"
/*#1 Write row.*/
"movq %%mm7,(%[dst],%[ystride])\n\t"
/*Advance dst.*/
"lea (%[dst],%[ystride],2),%[dst]\n\t"
:[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src)
:[ystride]"r"((ptrdiff_t)_ystride)
:"memory"
);
}
}
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
int i;
/*Zero mm7.*/
__asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
for(i=4;i-->0;){
__asm__ __volatile__(
/*#0 Load src1.*/
"movq (%[src1]),%%mm0\n\t"
/*#0 Load src2.*/
"movq (%[src2]),%%mm2\n\t"
/*#0 Copy src1.*/
"movq %%mm0,%%mm1\n\t"
/*#0 Copy src2.*/
"movq %%mm2,%%mm3\n\t"
/*#1 Load src1.*/
"movq (%[src1],%[ystride]),%%mm4\n\t"
/*#0 Unpack lower src1.*/
"punpcklbw %%mm7,%%mm0\n\t"
/*#1 Load src2.*/
"movq (%[src2],%[ystride]),%%mm5\n\t"
/*#0 Unpack higher src1.*/
"punpckhbw %%mm7,%%mm1\n\t"
/*#0 Unpack lower src2.*/
"punpcklbw %%mm7,%%mm2\n\t"
/*#0 Unpack higher src2.*/
"punpckhbw %%mm7,%%mm3\n\t"
/*Advance src1 ptr.*/
"lea (%[src1],%[ystride],2),%[src1]\n\t"
/*Advance src2 ptr.*/
"lea (%[src2],%[ystride],2),%[src2]\n\t"
/*#0 Lower src1+src2.*/
"paddsw %%mm2,%%mm0\n\t"
/*#0 Higher src1+src2.*/
"paddsw %%mm3,%%mm1\n\t"
/*#1 Copy src1.*/
"movq %%mm4,%%mm2\n\t"
/*#0 Build lo average.*/
"psraw $1,%%mm0\n\t"
/*#1 Copy src2.*/
"movq %%mm5,%%mm3\n\t"
/*#1 Unpack lower src1.*/
"punpcklbw %%mm7,%%mm4\n\t"
/*#0 Build hi average.*/
"psraw $1,%%mm1\n\t"
/*#1 Unpack higher src1.*/
"punpckhbw %%mm7,%%mm2\n\t"
/*#0 low+=residue.*/
"paddsw (%[residue]),%%mm0\n\t"
/*#1 Unpack lower src2.*/
"punpcklbw %%mm7,%%mm5\n\t"
/*#0 high+=residue.*/
"paddsw 8(%[residue]),%%mm1\n\t"
/*#1 Unpack higher src2.*/
"punpckhbw %%mm7,%%mm3\n\t"
/*#1 Lower src1+src2.*/
"paddsw %%mm4,%%mm5\n\t"
/*#0 Pack and saturate.*/
"packuswb %%mm1,%%mm0\n\t"
/*#1 Higher src1+src2.*/
"paddsw %%mm2,%%mm3\n\t"
/*#0 Write row.*/
"movq %%mm0,(%[dst])\n\t"
/*#1 Build lo average.*/
"psraw $1,%%mm5\n\t"
/*#1 Build hi average.*/
"psraw $1,%%mm3\n\t"
/*#1 low+=residue.*/
"paddsw 16(%[residue]),%%mm5\n\t"
/*#1 high+=residue.*/
"paddsw 24(%[residue]),%%mm3\n\t"
/*#1 Pack and saturate.*/
"packuswb %%mm3,%%mm5\n\t"
/*#1 Write row ptr.*/
"movq %%mm5,(%[dst],%[ystride])\n\t"
/*Advance residue ptr.*/
"add $32,%[residue]\n\t"
/*Advance dest ptr.*/
"lea (%[dst],%[ystride],2),%[dst]\n\t"
:[dst]"+r"(_dst),[residue]"+r"(_residue),
[src1]"+r"(_src1),[src2]"+r"(_src2)
:[ystride]"r"((ptrdiff_t)_ystride)
:"memory"
);
}
}
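/*Editorial note: plain-C sketches of the two inter reconstruction routines
  above.
  The single-reference version adds the residue to the predictor; the
  two-reference version first averages the two predictors with truncation (the
  psraw by 1 above) before adding the residue.
  Both clamp to [0,255] as packuswb does.
  Illustrative only; not part of the codec's call graph.*/
static int oc_recon_clamp255_sketch(int _v){
  return _v<0?0:_v>255?255:_v;
}

static void oc_frag_recon_inter_c_sketch(unsigned char *_dst,
 const unsigned char *_src,int _ystride,const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++)for(j=0;j<8;j++){
    _dst[i*_ystride+j]=(unsigned char)oc_recon_clamp255_sketch(
     _src[i*_ystride+j]+_residue[i*8+j]);
  }
}

static void oc_frag_recon_inter2_c_sketch(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++)for(j=0;j<8;j++){
    int pred;
    pred=(_src1[i*_ystride+j]+_src2[i*_ystride+j])>>1;
    _dst[i*_ystride+j]=(unsigned char)oc_recon_clamp255_sketch(
     pred+_residue[i*8+j]);
  }
}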
void oc_restore_fpu_mmx(void){
__asm__ __volatile__("emms\n\t");
}
#endif

thirdparty/libtheora/x86/mmxidct.c vendored Normal file

@@ -0,0 +1,558 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009,2025 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
/*MMX acceleration of Theora's iDCT.
Originally written by Rudolf Marek, based on code from On2's VP3.*/
#include "x86int.h"
#include "../dct.h"
#if defined(OC_X86_ASM)
/*These are offsets into the table of constants below.*/
/*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
#define OC_COSINE_OFFSET (0)
/*A row of 8's.*/
#define OC_EIGHT_OFFSET (56)
/*38 cycles*/
#define OC_IDCT_BEGIN(_y,_x) \
"#OC_IDCT_BEGIN\n\t" \
"movq "OC_I(3,_x)",%%mm2\n\t" \
"movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
"movq %%mm2,%%mm4\n\t" \
"movq "OC_J(5,_x)",%%mm7\n\t" \
"pmulhw %%mm6,%%mm4\n\t" \
"movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
"pmulhw %%mm7,%%mm6\n\t" \
"movq %%mm1,%%mm5\n\t" \
"pmulhw %%mm2,%%mm1\n\t" \
"movq "OC_I(1,_x)",%%mm3\n\t" \
"pmulhw %%mm7,%%mm5\n\t" \
"movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
"paddw %%mm2,%%mm4\n\t" \
"paddw %%mm7,%%mm6\n\t" \
"paddw %%mm1,%%mm2\n\t" \
"movq "OC_J(7,_x)",%%mm1\n\t" \
"paddw %%mm5,%%mm7\n\t" \
"movq %%mm0,%%mm5\n\t" \
"pmulhw %%mm3,%%mm0\n\t" \
"paddw %%mm7,%%mm4\n\t" \
"pmulhw %%mm1,%%mm5\n\t" \
"movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \
"psubw %%mm2,%%mm6\n\t" \
"paddw %%mm3,%%mm0\n\t" \
"pmulhw %%mm7,%%mm3\n\t" \
"movq "OC_I(2,_x)",%%mm2\n\t" \
"pmulhw %%mm1,%%mm7\n\t" \
"paddw %%mm1,%%mm5\n\t" \
"movq %%mm2,%%mm1\n\t" \
"pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \
"psubw %%mm5,%%mm3\n\t" \
"movq "OC_J(6,_x)",%%mm5\n\t" \
"paddw %%mm7,%%mm0\n\t" \
"movq %%mm5,%%mm7\n\t" \
"psubw %%mm4,%%mm0\n\t" \
"pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
"paddw %%mm1,%%mm2\n\t" \
"pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
"paddw %%mm4,%%mm4\n\t" \
"paddw %%mm0,%%mm4\n\t" \
"psubw %%mm6,%%mm3\n\t" \
"paddw %%mm7,%%mm5\n\t" \
"paddw %%mm6,%%mm6\n\t" \
"pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
"paddw %%mm3,%%mm6\n\t" \
"movq %%mm4,"OC_I(1,_y)"\n\t" \
"psubw %%mm5,%%mm1\n\t" \
"movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
"movq %%mm3,%%mm5\n\t" \
"pmulhw %%mm4,%%mm3\n\t" \
"paddw %%mm2,%%mm7\n\t" \
"movq %%mm6,"OC_I(2,_y)"\n\t" \
"movq %%mm0,%%mm2\n\t" \
"movq "OC_I(0,_x)",%%mm6\n\t" \
"pmulhw %%mm4,%%mm0\n\t" \
"paddw %%mm3,%%mm5\n\t" \
"movq "OC_J(4,_x)",%%mm3\n\t" \
"psubw %%mm1,%%mm5\n\t" \
"paddw %%mm0,%%mm2\n\t" \
"psubw %%mm3,%%mm6\n\t" \
"movq %%mm6,%%mm0\n\t" \
"pmulhw %%mm4,%%mm6\n\t" \
"paddw %%mm3,%%mm3\n\t" \
"paddw %%mm1,%%mm1\n\t" \
"paddw %%mm0,%%mm3\n\t" \
"paddw %%mm5,%%mm1\n\t" \
"pmulhw %%mm3,%%mm4\n\t" \
"paddw %%mm0,%%mm6\n\t" \
"psubw %%mm2,%%mm6\n\t" \
"paddw %%mm2,%%mm2\n\t" \
"movq "OC_I(1,_y)",%%mm0\n\t" \
"paddw %%mm6,%%mm2\n\t" \
"paddw %%mm3,%%mm4\n\t" \
"psubw %%mm1,%%mm2\n\t" \
"#end OC_IDCT_BEGIN\n\t" \
/*38+8=46 cycles.*/
#define OC_ROW_IDCT(_y,_x) \
"#OC_ROW_IDCT\n" \
OC_IDCT_BEGIN(_y,_x) \
/*r3=D'*/ \
"movq "OC_I(2,_y)",%%mm3\n\t" \
/*r4=E'=E-G*/ \
"psubw %%mm7,%%mm4\n\t" \
/*r1=H'+H'*/ \
"paddw %%mm1,%%mm1\n\t" \
/*r7=G+G*/ \
"paddw %%mm7,%%mm7\n\t" \
/*r1=R1=A''+H'*/ \
"paddw %%mm2,%%mm1\n\t" \
/*r7=G'=E+G*/ \
"paddw %%mm4,%%mm7\n\t" \
/*r4=R4=E'-D'*/ \
"psubw %%mm3,%%mm4\n\t" \
"paddw %%mm3,%%mm3\n\t" \
/*r6=R6=F'-B''*/ \
"psubw %%mm5,%%mm6\n\t" \
"paddw %%mm5,%%mm5\n\t" \
/*r3=R3=E'+D'*/ \
"paddw %%mm4,%%mm3\n\t" \
/*r5=R5=F'+B''*/ \
"paddw %%mm6,%%mm5\n\t" \
/*r7=R7=G'-C'*/ \
"psubw %%mm0,%%mm7\n\t" \
"paddw %%mm0,%%mm0\n\t" \
/*Save R1.*/ \
"movq %%mm1,"OC_I(1,_y)"\n\t" \
/*r0=R0=G'+C'*/ \
"paddw %%mm7,%%mm0\n\t" \
"#end OC_ROW_IDCT\n\t" \
/*The following macro does two 4x4 transposes in place.
At entry, we assume:
r0 = a3 a2 a1 a0
I(1) = b3 b2 b1 b0
r2 = c3 c2 c1 c0
r3 = d3 d2 d1 d0
r4 = e3 e2 e1 e0
r5 = f3 f2 f1 f0
r6 = g3 g2 g1 g0
r7 = h3 h2 h1 h0
At exit, we have:
I(0) = d0 c0 b0 a0
I(1) = d1 c1 b1 a1
I(2) = d2 c2 b2 a2
I(3) = d3 c3 b3 a3
J(4) = h0 g0 f0 e0
J(5) = h1 g1 f1 e1
J(6) = h2 g2 f2 e2
J(7) = h3 g3 f3 e3
I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
Since r1 is free at entry, we calculate the Js first.*/
/*19 cycles.*/
#define OC_TRANSPOSE(_y) \
"#OC_TRANSPOSE\n\t" \
"movq %%mm4,%%mm1\n\t" \
"punpcklwd %%mm5,%%mm4\n\t" \
"movq %%mm0,"OC_I(0,_y)"\n\t" \
"punpckhwd %%mm5,%%mm1\n\t" \
"movq %%mm6,%%mm0\n\t" \
"punpcklwd %%mm7,%%mm6\n\t" \
"movq %%mm4,%%mm5\n\t" \
"punpckldq %%mm6,%%mm4\n\t" \
"punpckhdq %%mm6,%%mm5\n\t" \
"movq %%mm1,%%mm6\n\t" \
"movq %%mm4,"OC_J(4,_y)"\n\t" \
"punpckhwd %%mm7,%%mm0\n\t" \
"movq %%mm5,"OC_J(5,_y)"\n\t" \
"punpckhdq %%mm0,%%mm6\n\t" \
"movq "OC_I(0,_y)",%%mm4\n\t" \
"punpckldq %%mm0,%%mm1\n\t" \
"movq "OC_I(1,_y)",%%mm5\n\t" \
"movq %%mm4,%%mm0\n\t" \
"movq %%mm6,"OC_J(7,_y)"\n\t" \
"punpcklwd %%mm5,%%mm0\n\t" \
"movq %%mm1,"OC_J(6,_y)"\n\t" \
"punpckhwd %%mm5,%%mm4\n\t" \
"movq %%mm2,%%mm5\n\t" \
"punpcklwd %%mm3,%%mm2\n\t" \
"movq %%mm0,%%mm1\n\t" \
"punpckldq %%mm2,%%mm0\n\t" \
"punpckhdq %%mm2,%%mm1\n\t" \
"movq %%mm4,%%mm2\n\t" \
"movq %%mm0,"OC_I(0,_y)"\n\t" \
"punpckhwd %%mm3,%%mm5\n\t" \
"movq %%mm1,"OC_I(1,_y)"\n\t" \
"punpckhdq %%mm5,%%mm4\n\t" \
"punpckldq %%mm5,%%mm2\n\t" \
"movq %%mm4,"OC_I(3,_y)"\n\t" \
"movq %%mm2,"OC_I(2,_y)"\n\t" \
"#end OC_TRANSPOSE\n\t" \
/*38+19=57 cycles.*/
#define OC_COLUMN_IDCT(_y) \
"#OC_COLUMN_IDCT\n" \
OC_IDCT_BEGIN(_y,_y) \
"paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
/*r1=H'+H'*/ \
"paddw %%mm1,%%mm1\n\t" \
/*r1=R1=A''+H'*/ \
"paddw %%mm2,%%mm1\n\t" \
/*r2=NR2*/ \
"psraw $4,%%mm2\n\t" \
/*r4=E'=E-G*/ \
"psubw %%mm7,%%mm4\n\t" \
/*r1=NR1*/ \
"psraw $4,%%mm1\n\t" \
/*r3=D'*/ \
"movq "OC_I(2,_y)",%%mm3\n\t" \
/*r7=G+G*/ \
"paddw %%mm7,%%mm7\n\t" \
/*Store NR2 at I(2).*/ \
"movq %%mm2,"OC_I(2,_y)"\n\t" \
/*r7=G'=E+G*/ \
"paddw %%mm4,%%mm7\n\t" \
/*Store NR1 at I(1).*/ \
"movq %%mm1,"OC_I(1,_y)"\n\t" \
/*r4=R4=E'-D'*/ \
"psubw %%mm3,%%mm4\n\t" \
"paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
/*r3=D'+D'*/ \
"paddw %%mm3,%%mm3\n\t" \
/*r3=R3=E'+D'*/ \
"paddw %%mm4,%%mm3\n\t" \
/*r4=NR4*/ \
"psraw $4,%%mm4\n\t" \
/*r6=R6=F'-B''*/ \
"psubw %%mm5,%%mm6\n\t" \
/*r3=NR3*/ \
"psraw $4,%%mm3\n\t" \
"paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
/*r5=B''+B''*/ \
"paddw %%mm5,%%mm5\n\t" \
/*r5=R5=F'+B''*/ \
"paddw %%mm6,%%mm5\n\t" \
/*r6=NR6*/ \
"psraw $4,%%mm6\n\t" \
/*Store NR4 at J(4).*/ \
"movq %%mm4,"OC_J(4,_y)"\n\t" \
/*r5=NR5*/ \
"psraw $4,%%mm5\n\t" \
/*Store NR3 at I(3).*/ \
"movq %%mm3,"OC_I(3,_y)"\n\t" \
/*r7=R7=G'-C'*/ \
"psubw %%mm0,%%mm7\n\t" \
"paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
/*r0=C'+C'*/ \
"paddw %%mm0,%%mm0\n\t" \
/*r0=R0=G'+C'*/ \
"paddw %%mm7,%%mm0\n\t" \
/*r7=NR7*/ \
"psraw $4,%%mm7\n\t" \
/*Store NR6 at J(6).*/ \
"movq %%mm6,"OC_J(6,_y)"\n\t" \
/*r0=NR0*/ \
"psraw $4,%%mm0\n\t" \
/*Store NR5 at J(5).*/ \
"movq %%mm5,"OC_J(5,_y)"\n\t" \
/*Store NR7 at J(7).*/ \
"movq %%mm7,"OC_J(7,_y)"\n\t" \
/*Store NR0 at I(0).*/ \
"movq %%mm0,"OC_I(0,_y)"\n\t" \
"#end OC_COLUMN_IDCT\n\t" \
static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
int i;
/*This routine accepts an 8x8 matrix, but in partially transposed form.
Every 4x4 block is transposed.*/
__asm__ __volatile__(
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
OC_ROW_IDCT(y,x)
OC_TRANSPOSE(y)
#undef OC_I
#undef OC_J
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+64,_y)
#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+72,_y)
OC_ROW_IDCT(y,x)
OC_TRANSPOSE(y)
#undef OC_I
#undef OC_J
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
#define OC_J(_k,_y) OC_I(_k,_y)
OC_COLUMN_IDCT(y)
#undef OC_I
#undef OC_J
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
#define OC_J(_k,_y) OC_I(_k,_y)
OC_COLUMN_IDCT(y)
#undef OC_I
#undef OC_J
:[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
:[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
[c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64)
);
__asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
for(i=0;i<4;i++){
__asm__ __volatile__(
"movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
:[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
);
}
}
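/*Editorial note: a plain-C illustration of the "partially transposed" input
  layout described at the top of oc_idct8x8_slow_mmx() above, reading that
  comment as meaning that each 4x4 quadrant of the 8x8 block is stored
  transposed.
  Illustrative only; the codec itself never calls this helper.*/
static void oc_transpose_4x4_blocks_sketch(ogg_int16_t _m[64]){
  int bi;
  int bj;
  int i;
  int j;
  /*Transpose each of the four 4x4 sub-blocks of an 8x8 matrix in place.*/
  for(bi=0;bi<8;bi+=4)for(bj=0;bj<8;bj+=4){
    for(i=0;i<4;i++)for(j=0;j<i;j++){
      ogg_int16_t t;
      t=_m[(bi+i)*8+bj+j];
      _m[(bi+i)*8+bj+j]=_m[(bi+j)*8+bj+i];
      _m[(bi+j)*8+bj+i]=t;
    }
  }
}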
/*25 cycles.*/
#define OC_IDCT_BEGIN_10(_y,_x) \
"#OC_IDCT_BEGIN_10\n\t" \
"movq "OC_I(3,_x)",%%mm2\n\t" \
"nop\n\t" \
"movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
"movq %%mm2,%%mm4\n\t" \
"movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
"pmulhw %%mm6,%%mm4\n\t" \
"movq "OC_I(1,_x)",%%mm3\n\t" \
"pmulhw %%mm2,%%mm1\n\t" \
"movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
"paddw %%mm2,%%mm4\n\t" \
"pxor %%mm6,%%mm6\n\t" \
"paddw %%mm1,%%mm2\n\t" \
"movq "OC_I(2,_x)",%%mm5\n\t" \
"pmulhw %%mm3,%%mm0\n\t" \
"movq %%mm5,%%mm1\n\t" \
"paddw %%mm3,%%mm0\n\t" \
"pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
"psubw %%mm2,%%mm6\n\t" \
"pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
"psubw %%mm4,%%mm0\n\t" \
"movq "OC_I(2,_x)",%%mm7\n\t" \
"paddw %%mm4,%%mm4\n\t" \
"paddw %%mm5,%%mm7\n\t" \
"paddw %%mm0,%%mm4\n\t" \
"pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
"psubw %%mm6,%%mm3\n\t" \
"movq %%mm4,"OC_I(1,_y)"\n\t" \
"paddw %%mm6,%%mm6\n\t" \
"movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
"paddw %%mm3,%%mm6\n\t" \
"movq %%mm3,%%mm5\n\t" \
"pmulhw %%mm4,%%mm3\n\t" \
"movq %%mm6,"OC_I(2,_y)"\n\t" \
"movq %%mm0,%%mm2\n\t" \
"movq "OC_I(0,_x)",%%mm6\n\t" \
"pmulhw %%mm4,%%mm0\n\t" \
"paddw %%mm3,%%mm5\n\t" \
"paddw %%mm0,%%mm2\n\t" \
"psubw %%mm1,%%mm5\n\t" \
"pmulhw %%mm4,%%mm6\n\t" \
"paddw "OC_I(0,_x)",%%mm6\n\t" \
"paddw %%mm1,%%mm1\n\t" \
"movq %%mm6,%%mm4\n\t" \
"paddw %%mm5,%%mm1\n\t" \
"psubw %%mm2,%%mm6\n\t" \
"paddw %%mm2,%%mm2\n\t" \
"movq "OC_I(1,_y)",%%mm0\n\t" \
"paddw %%mm6,%%mm2\n\t" \
"psubw %%mm1,%%mm2\n\t" \
"nop\n\t" \
"#end OC_IDCT_BEGIN_10\n\t" \
/*25+8=33 cycles.*/
#define OC_ROW_IDCT_10(_y,_x) \
"#OC_ROW_IDCT_10\n\t" \
OC_IDCT_BEGIN_10(_y,_x) \
/*r3=D'*/ \
"movq "OC_I(2,_y)",%%mm3\n\t" \
/*r4=E'=E-G*/ \
"psubw %%mm7,%%mm4\n\t" \
/*r1=H'+H'*/ \
"paddw %%mm1,%%mm1\n\t" \
/*r7=G+G*/ \
"paddw %%mm7,%%mm7\n\t" \
/*r1=R1=A''+H'*/ \
"paddw %%mm2,%%mm1\n\t" \
/*r7=G'=E+G*/ \
"paddw %%mm4,%%mm7\n\t" \
/*r4=R4=E'-D'*/ \
"psubw %%mm3,%%mm4\n\t" \
"paddw %%mm3,%%mm3\n\t" \
/*r6=R6=F'-B''*/ \
"psubw %%mm5,%%mm6\n\t" \
"paddw %%mm5,%%mm5\n\t" \
/*r3=R3=E'+D'*/ \
"paddw %%mm4,%%mm3\n\t" \
/*r5=R5=F'+B''*/ \
"paddw %%mm6,%%mm5\n\t" \
/*r7=R7=G'-C'*/ \
"psubw %%mm0,%%mm7\n\t" \
"paddw %%mm0,%%mm0\n\t" \
/*Save R1.*/ \
"movq %%mm1,"OC_I(1,_y)"\n\t" \
/*r0=R0=G'+C'*/ \
"paddw %%mm7,%%mm0\n\t" \
"#end OC_ROW_IDCT_10\n\t" \
/*25+19=44 cycles.*/
#define OC_COLUMN_IDCT_10(_y) \
"#OC_COLUMN_IDCT_10\n\t" \
OC_IDCT_BEGIN_10(_y,_y) \
"paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
/*r1=H'+H'*/ \
"paddw %%mm1,%%mm1\n\t" \
/*r1=R1=A''+H'*/ \
"paddw %%mm2,%%mm1\n\t" \
/*r2=NR2*/ \
"psraw $4,%%mm2\n\t" \
/*r4=E'=E-G*/ \
"psubw %%mm7,%%mm4\n\t" \
/*r1=NR1*/ \
"psraw $4,%%mm1\n\t" \
/*r3=D'*/ \
"movq "OC_I(2,_y)",%%mm3\n\t" \
/*r7=G+G*/ \
"paddw %%mm7,%%mm7\n\t" \
/*Store NR2 at I(2).*/ \
"movq %%mm2,"OC_I(2,_y)"\n\t" \
/*r7=G'=E+G*/ \
"paddw %%mm4,%%mm7\n\t" \
/*Store NR1 at I(1).*/ \
"movq %%mm1,"OC_I(1,_y)"\n\t" \
/*r4=R4=E'-D'*/ \
"psubw %%mm3,%%mm4\n\t" \
"paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
/*r3=D'+D'*/ \
"paddw %%mm3,%%mm3\n\t" \
/*r3=R3=E'+D'*/ \
"paddw %%mm4,%%mm3\n\t" \
/*r4=NR4*/ \
"psraw $4,%%mm4\n\t" \
/*r6=R6=F'-B''*/ \
"psubw %%mm5,%%mm6\n\t" \
/*r3=NR3*/ \
"psraw $4,%%mm3\n\t" \
"paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
/*r5=B''+B''*/ \
"paddw %%mm5,%%mm5\n\t" \
/*r5=R5=F'+B''*/ \
"paddw %%mm6,%%mm5\n\t" \
/*r6=NR6*/ \
"psraw $4,%%mm6\n\t" \
/*Store NR4 at J(4).*/ \
"movq %%mm4,"OC_J(4,_y)"\n\t" \
/*r5=NR5*/ \
"psraw $4,%%mm5\n\t" \
/*Store NR3 at I(3).*/ \
"movq %%mm3,"OC_I(3,_y)"\n\t" \
/*r7=R7=G'-C'*/ \
"psubw %%mm0,%%mm7\n\t" \
"paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
/*r0=C'+C'*/ \
"paddw %%mm0,%%mm0\n\t" \
/*r0=R0=G'+C'*/ \
"paddw %%mm7,%%mm0\n\t" \
/*r7=NR7*/ \
"psraw $4,%%mm7\n\t" \
/*Store NR6 at J(6).*/ \
"movq %%mm6,"OC_J(6,_y)"\n\t" \
/*r0=NR0*/ \
"psraw $4,%%mm0\n\t" \
/*Store NR5 at J(5).*/ \
"movq %%mm5,"OC_J(5,_y)"\n\t" \
/*Store NR7 at J(7).*/ \
"movq %%mm7,"OC_J(7,_y)"\n\t" \
/*Store NR0 at I(0).*/ \
"movq %%mm0,"OC_I(0,_y)"\n\t" \
"#end OC_COLUMN_IDCT_10\n\t" \
static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
__asm__ __volatile__(
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
/*Done with dequant, descramble, and partial transpose.
Now do the iDCT itself.*/
OC_ROW_IDCT_10(y,x)
OC_TRANSPOSE(y)
#undef OC_I
#undef OC_J
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
#define OC_J(_k,_y) OC_I(_k,_y)
OC_COLUMN_IDCT_10(y)
#undef OC_I
#undef OC_J
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
#define OC_J(_k,_y) OC_I(_k,_y)
OC_COLUMN_IDCT_10(y)
#undef OC_I
#undef OC_J
:[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
:[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
[c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64)
);
__asm__ __volatile__(
"pxor %%mm0,%%mm0\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
:[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
);
}
/*Performs an inverse 8x8 Type-II DCT transform.
The input is assumed to be scaled by a factor of 4 relative to the orthonormal
version of the transform.*/
void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
/*_last_zzi is subtly different from an actual count of the number of
coefficients we decoded for this block.
It contains the value of zzi BEFORE the final token in the block was
decoded.
In most cases this is an EOB token (the continuation of an EOB run from a
previous block counts), and so this is the same as the coefficient count.
However, in the case that the last token was NOT an EOB token, but filled
the block up with exactly 64 coefficients, _last_zzi will be less than 64.
Provided the last token was not a pure zero run, the minimum value it can
be is 46, and so that doesn't affect any of the cases in this routine.
However, if the last token WAS a pure zero run of length 63, then _last_zzi
will be 1 while the number of coefficients decoded is 64.
Thus, we will trigger the following special case, where the real
coefficient count would not.
Note also that a zero run of length 64 will give _last_zzi a value of 0,
but we still process the DC coefficient, which might have a non-zero value
due to DC prediction.
Although convoluted, this is arguably the correct behavior: it allows us to
use a smaller transform when the block ends with a long zero run instead
of a normal EOB token.
It could be smarter... multiple separate zero runs at the end of a block
will fool it, but an encoder that generates these really deserves what it
gets.
Needless to say we inherited this approach from VP3.*/
/*Then perform the iDCT.*/
if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x);
else oc_idct8x8_slow_mmx(_y,_x);
}
#endif

thirdparty/libtheora/x86/mmxloop.h vendored Normal file

@@ -0,0 +1,318 @@
#if !defined(_x86_mmxloop_H)
# define _x86_mmxloop_H (1)
# include <stddef.h>
# include "x86int.h"
#if defined(OC_X86_ASM)
/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/
#define OC_LOOP_FILTER8_MMX \
"#OC_LOOP_FILTER8_MMX\n\t" \
/*mm7=0*/ \
"pxor %%mm7,%%mm7\n\t" \
/*mm6:mm0={a0,...,a7}*/ \
"movq %%mm0,%%mm6\n\t" \
"punpcklbw %%mm7,%%mm0\n\t" \
"punpckhbw %%mm7,%%mm6\n\t" \
/*mm3:mm5={d0,...,d7}*/ \
"movq %%mm3,%%mm5\n\t" \
"punpcklbw %%mm7,%%mm3\n\t" \
"punpckhbw %%mm7,%%mm5\n\t" \
/*mm6:mm0={a0-d0,...,a7-d7}*/ \
"psubw %%mm3,%%mm0\n\t" \
"psubw %%mm5,%%mm6\n\t" \
/*mm3:mm1={b0,...,b7}*/ \
"movq %%mm1,%%mm3\n\t" \
"punpcklbw %%mm7,%%mm1\n\t" \
"movq %%mm2,%%mm4\n\t" \
"punpckhbw %%mm7,%%mm3\n\t" \
/*mm5:mm4={c0,...,c7}*/ \
"movq %%mm2,%%mm5\n\t" \
"punpcklbw %%mm7,%%mm4\n\t" \
"punpckhbw %%mm7,%%mm5\n\t" \
/*mm7={3}x4 \
mm5:mm4={c0-b0,...,c7-b7}*/ \
"pcmpeqw %%mm7,%%mm7\n\t" \
"psubw %%mm1,%%mm4\n\t" \
"psrlw $14,%%mm7\n\t" \
"psubw %%mm3,%%mm5\n\t" \
/*Scale by 3.*/ \
"pmullw %%mm7,%%mm4\n\t" \
"pmullw %%mm7,%%mm5\n\t" \
/*mm7={4}x4 \
mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
"psrlw $1,%%mm7\n\t" \
"paddw %%mm0,%%mm4\n\t" \
"psllw $2,%%mm7\n\t" \
"movq (%[ll]),%%mm0\n\t" \
"paddw %%mm6,%%mm5\n\t" \
/*R_i has the range [-127,128], so we compute -R_i instead. \
mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
"psubw %%mm7,%%mm4\n\t" \
"psubw %%mm7,%%mm5\n\t" \
"psraw $3,%%mm4\n\t" \
"psraw $3,%%mm5\n\t" \
"pcmpeqb %%mm7,%%mm7\n\t" \
"packsswb %%mm5,%%mm4\n\t" \
"pxor %%mm6,%%mm6\n\t" \
"pxor %%mm7,%%mm4\n\t" \
"packuswb %%mm3,%%mm1\n\t" \
/*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \
/*There's no unsigned byte+signed byte with unsigned saturation op code, so \
we have to split things by sign (the other option is to work in 16 bits, \
but working in 8 bits gives much better parallelism). \
We compute abs(R_i), but save a mask of which terms were negative in mm6. \
Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
Finally, we split mm4 into positive and negative pieces using the mask in \
mm6, and add and subtract them as appropriate.*/ \
/*mm4=abs(-R_i)*/ \
/*mm7=255-2*L*/ \
"pcmpgtb %%mm4,%%mm6\n\t" \
"psubb %%mm0,%%mm7\n\t" \
"pxor %%mm6,%%mm4\n\t" \
"psubb %%mm0,%%mm7\n\t" \
"psubb %%mm6,%%mm4\n\t" \
/*mm7=255-max(2*L-abs(R_i),0)*/ \
"paddusb %%mm4,%%mm7\n\t" \
/*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
"paddusb %%mm7,%%mm4\n\t" \
"psubusb %%mm7,%%mm4\n\t" \
/*Now split mm4 by the original sign of -R_i.*/ \
"movq %%mm4,%%mm5\n\t" \
"pand %%mm6,%%mm4\n\t" \
"pandn %%mm5,%%mm6\n\t" \
/*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
/*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
"paddusb %%mm4,%%mm1\n\t" \
"psubusb %%mm4,%%mm2\n\t" \
"psubusb %%mm6,%%mm1\n\t" \
"paddusb %%mm6,%%mm2\n\t" \
/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}.
All other MMX registers are clobbered.*/
#define OC_LOOP_FILTER8_MMXEXT \
"#OC_LOOP_FILTER8_MMXEXT\n\t" \
/*R_i=(a_i-3*b_i+3*c_i-d_i+4>>3) has the range [-127,128], so we compute \
-R_i=(-a_i+3*b_i-3*c_i+d_i+3>>3) instead.*/ \
/*This first part is based on the transformation \
f = -(3*(c-b)+a-d+4>>3) \
= -(3*(c+255-b)+(a+255-d)+4-1020>>3) \
= -(3*(c+~b)+(a+~d)-1016>>3) \
= 127-(3*(c+~b)+(a+~d)>>3) \
= 128+~(3*(c+~b)+(a+~d)>>3) (mod 256). \
Although pavgb(a,b) = (a+b+1>>1) (biased up), we rely heavily on the \
fact that ~pavgb(~a,~b) = (a+b>>1) (biased down). \
Using this, the last expression above can be computed in 8 bits of working \
precision via: \
u = ~pavgb(~b,c); \
v = pavgb(b,~c); \
This mask is 0 or 0xFF, and controls whether t is biased up or down: \
m = u-v; \
t = m^pavgb(m^~a,m^d); \
f = 128+pavgb(pavgb(t,u),v); \
This required some careful analysis to ensure that carries are propagated \
correctly in all cases, but has been checked exhaustively.*/ \
/*input (a, b, c, d, ., ., ., .)*/ \
/*ff=0xFF; \
u=b; \
v=c; \
ll=255-2*L;*/ \
"pcmpeqb %%mm7,%%mm7\n\t" \
"movq %%mm1,%%mm4\n\t" \
"movq %%mm2,%%mm5\n\t" \
"movq (%[ll]),%%mm6\n\t" \
/*allocated u, v, ll, ff: (a, b, c, d, u, v, ll, ff)*/ \
/*u^=ff; \
v^=ff;*/ \
"pxor %%mm7,%%mm4\n\t" \
"pxor %%mm7,%%mm5\n\t" \
/*allocated ll: (a, b, c, d, u, v, ll, ff)*/ \
/*u=pavgb(u,c); \
v=pavgb(v,b);*/ \
"pavgb %%mm2,%%mm4\n\t" \
"pavgb %%mm1,%%mm5\n\t" \
/*u^=ff; \
a^=ff;*/ \
"pxor %%mm7,%%mm4\n\t" \
"pxor %%mm7,%%mm0\n\t" \
/*m=u-v;*/ \
"psubb %%mm5,%%mm4\n\t" \
/*freed u, allocated m: (a, b, c, d, m, v, ll, ff)*/ \
/*a^=m; \
d^=m;*/ \
"pxor %%mm4,%%mm0\n\t" \
"pxor %%mm4,%%mm3\n\t" \
/*t=pavgb(a,d);*/ \
"pavgb %%mm3,%%mm0\n\t" \
"psllw $7,%%mm7\n\t" \
/*freed a, d, ff, allocated t, of: (t, b, c, ., m, v, ll, of)*/ \
/*t^=m; \
u=m+v;*/ \
"pxor %%mm4,%%mm0\n\t" \
"paddb %%mm5,%%mm4\n\t" \
/*freed t, m, allocated f, u: (f, b, c, ., u, v, ll, of)*/ \
/*f=pavgb(f,u); \
of=128;*/ \
"pavgb %%mm4,%%mm0\n\t" \
"packsswb %%mm7,%%mm7\n\t" \
/*freed u, ff, allocated ll: (f, b, c, ., ll, v, ll, of)*/ \
/*f=pavgb(f,v);*/ \
"pavgb %%mm5,%%mm0\n\t" \
"movq %%mm7,%%mm3\n\t" \
"movq %%mm6,%%mm4\n\t" \
/*freed v, allocated of: (f, b, c, of, ll, ., ll, of)*/ \
/*Now compute lflim of R_i=-(128+mm0) cf. Section 7.10 of the spec.*/ \
/*There's no unsigned byte+signed byte with unsigned saturation op code, so \
we have to split things by sign (the other option is to work in 16 bits, \
but staying in 8 bits gives much better parallelism).*/ \
/*Instead of adding the offset of 128 in mm3, we use it to split mm0. \
This is the same number of instructions as computing a mask and splitting \
after the lflim computation, but has shorter dependency chains.*/ \
/*mm0=R_i<0?-R_i:0 (denoted abs(R_i<0))\
mm3=R_i>0?R_i:0 (denoted abs(R_i>0))*/ \
"psubusb %%mm0,%%mm3\n\t" \
"psubusb %%mm7,%%mm0\n\t" \
/*mm6=255-max(2*L-abs(R_i<0),0) \
mm4=255-max(2*L-abs(R_i>0),0)*/ \
"paddusb %%mm3,%%mm4\n\t" \
"paddusb %%mm0,%%mm6\n\t" \
/*mm0=min(abs(R_i<0),max(2*L-abs(R_i<0),0)) \
mm3=min(abs(R_i>0),max(2*L-abs(R_i>0),0))*/ \
"paddusb %%mm4,%%mm3\n\t" \
"paddusb %%mm6,%%mm0\n\t" \
"psubusb %%mm4,%%mm3\n\t" \
"psubusb %%mm6,%%mm0\n\t" \
/*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
/*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
"paddusb %%mm3,%%mm1\n\t" \
"psubusb %%mm3,%%mm2\n\t" \
"psubusb %%mm0,%%mm1\n\t" \
"paddusb %%mm0,%%mm2\n\t" \
#define OC_LOOP_FILTER_V(_filter,_pix,_ystride,_ll) \
do{ \
ptrdiff_t ystride3__; \
__asm__ __volatile__( \
/*mm0={a0,...,a7}*/ \
"movq (%[pix]),%%mm0\n\t" \
/*ystride3=_ystride*3*/ \
"lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
/*mm3={d0,...,d7}*/ \
"movq (%[pix],%[ystride3]),%%mm3\n\t" \
/*mm1={b0,...,b7}*/ \
"movq (%[pix],%[ystride]),%%mm1\n\t" \
/*mm2={c0,...,c7}*/ \
"movq (%[pix],%[ystride],2),%%mm2\n\t" \
_filter \
/*Write it back out.*/ \
"movq %%mm1,(%[pix],%[ystride])\n\t" \
"movq %%mm2,(%[pix],%[ystride],2)\n\t" \
:[ystride3]"=&r"(ystride3__) \
:[pix]"r"(_pix-_ystride*2),[ystride]"r"((ptrdiff_t)(_ystride)), \
[ll]"r"(_ll) \
:"memory" \
); \
} \
while(0)
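/*Editorial sketch: a plain-C version of the filter the OC_LOOP_FILTER8_*
  macros above implement for a single pixel position, following the comments
  above and Section 7.10 of the spec: R=(a-3*b+3*c-d+4>>3) is limited by
  lflim() and then added to b and subtracted from c, with both results clamped
  to [0,255].
  This helper is illustrative only and is not used by the codec.*/
static void oc_loop_filter_px_sketch(unsigned char *_pb,unsigned char *_pc,
 int _a,int _d,int _l){
  int b;
  int c;
  int r;
  int ar;
  int lim;
  int v;
  b=*_pb;
  c=*_pc;
  r=(_a-3*b+3*c-_d+4)>>3;
  /*lflim(R,L)=sign(R)*min(|R|,max(2*L-|R|,0)).*/
  ar=r<0?-r:r;
  lim=2*_l-ar;
  if(lim<0)lim=0;
  if(ar>lim)ar=lim;
  r=r<0?-ar:ar;
  v=b+r;
  *_pb=(unsigned char)(v<0?0:v>255?255:v);
  v=c-r;
  *_pc=(unsigned char)(v<0?0:v>255?255:v);
}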
#define OC_LOOP_FILTER_H(_filter,_pix,_ystride,_ll) \
do{ \
unsigned char *pix__; \
ptrdiff_t ystride3__; \
ptrdiff_t d__; \
pix__=(_pix)-2; \
__asm__ __volatile__( \
/*x x x x d0 c0 b0 a0*/ \
"movd (%[pix]),%%mm0\n\t" \
/*x x x x d1 c1 b1 a1*/ \
"movd (%[pix],%[ystride]),%%mm1\n\t" \
/*ystride3=_ystride*3*/ \
"lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
/*x x x x d2 c2 b2 a2*/ \
"movd (%[pix],%[ystride],2),%%mm2\n\t" \
/*x x x x d3 c3 b3 a3*/ \
"lea (%[pix],%[ystride],4),%[d]\n\t" \
"movd (%[pix],%[ystride3]),%%mm3\n\t" \
/*x x x x d4 c4 b4 a4*/ \
"movd (%[d]),%%mm4\n\t" \
/*x x x x d5 c5 b5 a5*/ \
"movd (%[d],%[ystride]),%%mm5\n\t" \
/*x x x x d6 c6 b6 a6*/ \
"movd (%[d],%[ystride],2),%%mm6\n\t" \
/*x x x x d7 c7 b7 a7*/ \
"movd (%[d],%[ystride3]),%%mm7\n\t" \
/*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \
"punpcklbw %%mm1,%%mm0\n\t" \
/*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \
"punpcklbw %%mm3,%%mm2\n\t" \
/*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \
"movq %%mm0,%%mm3\n\t" \
/*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \
"punpcklwd %%mm2,%%mm0\n\t" \
/*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \
"punpckhwd %%mm2,%%mm3\n\t" \
/*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \
"movq %%mm0,%%mm1\n\t" \
/*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \
"punpcklbw %%mm5,%%mm4\n\t" \
/*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \
"punpcklbw %%mm7,%%mm6\n\t" \
/*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \
"movq %%mm4,%%mm5\n\t" \
/*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \
"punpcklwd %%mm6,%%mm4\n\t" \
/*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \
"punpckhwd %%mm6,%%mm5\n\t" \
/*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \
"movq %%mm3,%%mm2\n\t" \
/*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \
"punpckldq %%mm4,%%mm0\n\t" \
/*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \
"punpckhdq %%mm4,%%mm1\n\t" \
/*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \
"punpckldq %%mm5,%%mm2\n\t" \
/*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
"punpckhdq %%mm5,%%mm3\n\t" \
_filter \
/*mm2={b0+R_0'',...,b7+R_7''}*/ \
"movq %%mm1,%%mm0\n\t" \
/*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
"punpcklbw %%mm2,%%mm1\n\t" \
/*mm2={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \
"punpckhbw %%mm2,%%mm0\n\t" \
/*[d]=c1 b1 c0 b0*/ \
"movd %%mm1,%[d]\n\t" \
"movw %w[d],1(%[pix])\n\t" \
"psrlq $32,%%mm1\n\t" \
"shr $16,%[d]\n\t" \
"movw %w[d],1(%[pix],%[ystride])\n\t" \
/*[d]=c3 b3 c2 b2*/ \
"movd %%mm1,%[d]\n\t" \
"movw %w[d],1(%[pix],%[ystride],2)\n\t" \
"shr $16,%[d]\n\t" \
"movw %w[d],1(%[pix],%[ystride3])\n\t" \
"lea (%[pix],%[ystride],4),%[pix]\n\t" \
/*[d]=c5 b5 c4 b4*/ \
"movd %%mm0,%[d]\n\t" \
"movw %w[d],1(%[pix])\n\t" \
"psrlq $32,%%mm0\n\t" \
"shr $16,%[d]\n\t" \
"movw %w[d],1(%[pix],%[ystride])\n\t" \
/*[d]=c7 b7 c6 b6*/ \
"movd %%mm0,%[d]\n\t" \
"movw %w[d],1(%[pix],%[ystride],2)\n\t" \
"shr $16,%[d]\n\t" \
"movw %w[d],1(%[pix],%[ystride3])\n\t" \
:[pix]"+r"(pix__),[ystride3]"=&r"(ystride3__),[d]"=&r"(d__) \
:[ystride]"r"((ptrdiff_t)(_ystride)),[ll]"r"(_ll) \
:"memory" \
); \
} \
while(0)
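/*Editorial note: a small sketch of the averaging identity the MMXEXT filter
  above relies on: for 8-bit values, ~pavgb(~a,~b)==(a+b)>>1 (a truncating
  average), whereas pavgb(a,b)==(a+b+1)>>1 rounds up.
  Illustrative only; not used by the codec.*/
static int oc_pavgb_identity_check_sketch(void){
  unsigned a;
  unsigned b;
  for(a=0;a<256;a++)for(b=0;b<256;b++){
    unsigned avg_down;
    /*Emulate ~pavgb(~a,~b) in 8 bits.*/
    avg_down=~(((~a&0xFF)+(~b&0xFF)+1)>>1)&0xFF;
    if(avg_down!=((a+b)>>1))return 0;
  }
  return 1;
}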
# endif
#endif

thirdparty/libtheora/x86/mmxstate.c vendored Normal file

@@ -0,0 +1,226 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
/*MMX acceleration of complete fragment reconstruction algorithm.
Originally written by Rudolf Marek.*/
#include <string.h>
#include "x86int.h"
#include "mmxloop.h"
#if defined(OC_X86_ASM)
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
unsigned char *dst;
ptrdiff_t frag_buf_off;
int ystride;
int refi;
/*Apply the inverse transform.*/
/*Special case only having a DC component.*/
if(_last_zzi<2){
/*Note that this value must be unsigned, to keep the __asm__ block from
sign-extending it when it puts it in a register.*/
ogg_uint16_t p;
int i;
/*We round this dequant product (and not any of the others) because there's
no iDCT rounding.*/
p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
/*Fill _dct_coeffs with p.*/
__asm__ __volatile__(
/*mm0=0000 0000 0000 AAAA*/
"movd %[p],%%mm0\n\t"
/*mm0=0000 0000 AAAA AAAA*/
"punpcklwd %%mm0,%%mm0\n\t"
/*mm0=AAAA AAAA AAAA AAAA*/
"punpckldq %%mm0,%%mm0\n\t"
:
:[p]"r"((unsigned)p)
);
for(i=0;i<4;i++){
__asm__ __volatile__(
"movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"
:[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16)
);
}
}
else{
/*Dequantize the DC coefficient.*/
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
}
/*Fill in the target buffer.*/
frag_buf_off=_state->frag_buf_offs[_fragi];
refi=_state->frags[_fragi].refi;
ystride=_state->ref_ystride[_pli];
dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
else{
const unsigned char *ref;
int mvoffsets[2];
ref=_state->ref_frame_data[refi]+frag_buf_off;
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
_state->frag_mvs[_fragi])>1){
oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
_dct_coeffs+64);
}
else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
}
}
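/*Editorial note: a plain-C sketch of the DC-only special case above: the
  single dequantized DC value is rounded (this is the only place rounding is
  applied, since the iDCT itself is skipped) and broadcast to all 64 output
  coefficients.
  Illustrative only; not part of the codec's call graph.*/
static void oc_dc_only_fill_sketch(ogg_int16_t _dct_coeffs[128],
 ogg_uint16_t _dc_quant){
  ogg_int16_t p;
  int i;
  p=(ogg_int16_t)((_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15)>>5);
  for(i=0;i<64;i++)_dct_coeffs[64+i]=p;
}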
/*We copy these entire functions to inline the actual MMX routines so that we
use only a single indirect call.*/
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
memset(_bv,_flimit,8);
}
/*Apply the loop filter to a given set of fragment rows in the given plane.
The filter may be run on the bottom edge, affecting pixels in the next row of
fragments, so this row also needs to be available.
_bv: The bounding values array.
_refi: The index of the frame buffer to filter.
_pli: The color plane to filter.
_fragy0: The Y coordinate of the first fragment row to filter.
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
OC_ALIGN8(unsigned char ll[8]);
const oc_fragment_plane *fplane;
const oc_fragment *frags;
const ptrdiff_t *frag_buf_offs;
unsigned char *ref_frame_data;
ptrdiff_t fragi_top;
ptrdiff_t fragi_bot;
ptrdiff_t fragi0;
ptrdiff_t fragi0_end;
int ystride;
int nhfrags;
memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
fplane=_state->fplanes+_pli;
nhfrags=fplane->nhfrags;
fragi_top=fplane->froffset;
fragi_bot=fragi_top+fplane->nfrags;
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
ystride=_state->ref_ystride[_pli];
frags=_state->frags;
frag_buf_offs=_state->frag_buf_offs;
ref_frame_data=_state->ref_frame_data[_refi];
/*The following loops are constructed somewhat non-intuitively on purpose.
The main idea is: if a block boundary has at least one coded fragment on
it, the filter is applied to it.
However, the order that the filters are applied in matters, and VP3 chose
the somewhat strange ordering used below.*/
while(fragi0<fragi0_end){
ptrdiff_t fragi;
ptrdiff_t fragi_end;
fragi=fragi0;
fragi_end=fragi+nhfrags;
while(fragi<fragi_end){
if(frags[fragi].coded){
unsigned char *ref;
ref=ref_frame_data+frag_buf_offs[fragi];
if(fragi>fragi0){
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
}
if(fragi0>fragi_top){
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
}
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll);
}
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride*8),ystride,ll);
}
}
fragi++;
}
fragi0+=nhfrags;
}
}
void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
memset(_bv,~(_flimit<<1),8);
}
/*Apply the loop filter to a given set of fragment rows in the given plane.
The filter may be run on the bottom edge, affecting pixels in the next row of
fragments, so this row also needs to be available.
_bv: The bounding values array.
_refi: The index of the frame buffer to filter.
_pli: The color plane to filter.
_fragy0: The Y coordinate of the first fragment row to filter.
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
const oc_fragment_plane *fplane;
const oc_fragment *frags;
const ptrdiff_t *frag_buf_offs;
unsigned char *ref_frame_data;
ptrdiff_t fragi_top;
ptrdiff_t fragi_bot;
ptrdiff_t fragi0;
ptrdiff_t fragi0_end;
int ystride;
int nhfrags;
fplane=_state->fplanes+_pli;
nhfrags=fplane->nhfrags;
fragi_top=fplane->froffset;
fragi_bot=fragi_top+fplane->nfrags;
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
ystride=_state->ref_ystride[_pli];
frags=_state->frags;
frag_buf_offs=_state->frag_buf_offs;
ref_frame_data=_state->ref_frame_data[_refi];
/*The following loops are constructed somewhat non-intuitively on purpose.
The main idea is: if a block boundary has at least one coded fragment on
it, the filter is applied to it.
However, the order that the filters are applied in matters, and VP3 chose
the somewhat strange ordering used below.*/
while(fragi0<fragi0_end){
ptrdiff_t fragi;
ptrdiff_t fragi_end;
fragi=fragi0;
fragi_end=fragi+nhfrags;
while(fragi<fragi_end){
if(frags[fragi].coded){
unsigned char *ref;
ref=ref_frame_data+frag_buf_offs[fragi];
if(fragi>fragi0){
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
}
if(fragi0>fragi_top){
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
}
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);
}
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride*8),ystride,_bv);
}
}
fragi++;
}
fragi0+=nhfrags;
}
}
#endif

thirdparty/libtheora/x86/sse2encfrag.c vendored Normal file

@@ -0,0 +1,500 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
#include <stddef.h>
#include "x86enc.h"
#include "sse2trans.h"
#if defined(OC_X86_ASM)
/*Load a 4x8 array of pixel values from %[src] and %[ref] and compute their
16-bit differences.
On output, these are stored in _m0, xmm1, xmm2, and xmm3.
xmm4 and xmm5 are clobbered.*/
#define OC_LOAD_SUB_4x8(_m0) \
"#OC_LOAD_SUB_4x8\n\t" \
/*Load the first three rows.*/ \
"movq (%[src]),"_m0"\n\t" \
"movq (%[ref]),%%xmm4\n\t" \
"movq (%[src],%[ystride]),%%xmm1\n\t" \
"movq (%[ref],%[ystride]),%%xmm3\n\t" \
"movq (%[src],%[ystride],2),%%xmm2\n\t" \
"movq (%[ref],%[ystride],2),%%xmm5\n\t" \
/*Unpack and subtract.*/ \
"punpcklbw %%xmm4,"_m0"\n\t" \
"punpcklbw %%xmm4,%%xmm4\n\t" \
"punpcklbw %%xmm3,%%xmm1\n\t" \
"punpcklbw %%xmm3,%%xmm3\n\t" \
"psubw %%xmm4,"_m0"\n\t" \
"psubw %%xmm3,%%xmm1\n\t" \
/*Load the last row.*/ \
"movq (%[src],%[ystride3]),%%xmm3\n\t" \
"movq (%[ref],%[ystride3]),%%xmm4\n\t" \
/*Unpack, subtract, and advance the pointers.*/ \
"punpcklbw %%xmm5,%%xmm2\n\t" \
"punpcklbw %%xmm5,%%xmm5\n\t" \
"lea (%[src],%[ystride],4),%[src]\n\t" \
"psubw %%xmm5,%%xmm2\n\t" \
"punpcklbw %%xmm4,%%xmm3\n\t" \
"punpcklbw %%xmm4,%%xmm4\n\t" \
"lea (%[ref],%[ystride],4),%[ref]\n\t" \
"psubw %%xmm4,%%xmm3\n\t" \
/*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3.
On output, _m0 contains the sum of two of the rows, and the other two are
added to xmm7.*/
#define OC_SSD_4x8(_m0) \
"pmaddwd "_m0","_m0"\n\t" \
"pmaddwd %%xmm1,%%xmm1\n\t" \
"pmaddwd %%xmm2,%%xmm2\n\t" \
"pmaddwd %%xmm3,%%xmm3\n\t" \
"paddd %%xmm1,"_m0"\n\t" \
"paddd %%xmm3,%%xmm2\n\t" \
"paddd %%xmm2,%%xmm7\n\t" \
unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
const unsigned char *_ref,int _ystride){
unsigned ret;
__asm__ __volatile__(
OC_LOAD_SUB_4x8("%%xmm7")
OC_SSD_4x8("%%xmm7")
OC_LOAD_SUB_4x8("%%xmm0")
OC_SSD_4x8("%%xmm0")
"paddd %%xmm0,%%xmm7\n\t"
"movdqa %%xmm7,%%xmm6\n\t"
"punpckhqdq %%xmm7,%%xmm7\n\t"
"paddd %%xmm6,%%xmm7\n\t"
"pshufd $1,%%xmm7,%%xmm6\n\t"
"paddd %%xmm6,%%xmm7\n\t"
"movd %%xmm7,%[ret]\n\t"
:[ret]"=a"(ret)
:[src]"r"(_src),[ref]"r"(_ref),[ystride]"r"((ptrdiff_t)_ystride),
[ystride3]"r"((ptrdiff_t)_ystride*3)
);
return ret;
}
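/*Editorial note: a plain-C sketch of the 8x8 sum of squared differences that
  oc_enc_frag_ssd_sse2() above computes.
  Illustrative only; not part of the encoder's call graph.*/
static unsigned oc_frag_ssd_c_sketch(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned ssd;
  int i;
  int j;
  ssd=0;
  for(i=0;i<8;i++)for(j=0;j<8;j++){
    int d;
    d=_src[i*_ystride+j]-_ref[i*_ystride+j];
    ssd+=d*d;
  }
  return ssd;
}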
static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
};
/*Load a 2x8 array of pixel values from %[src] and %[ref] and compute their
  16-bit differences subject to a mask.
  %%xmm6 must contain OC_MASK_CONSTS[0...7].*/
#define OC_LOAD_SUB_MASK_2x8 \
"#OC_LOAD_SUB_MASK_2x8\n\t" \
/*Start the loads and expand the next 8 bits of the mask.*/ \
"shl $8,%[m]\n\t" \
"movq (%[src]),%%xmm0\n\t" \
"mov %h[m],%b[m]\n\t" \
"movq (%[ref]),%%xmm2\n\t" \
"movd %[m],%%xmm4\n\t" \
"shr $8,%[m]\n\t" \
"pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
"mov %h[m],%b[m]\n\t" \
"pand %%xmm6,%%xmm4\n\t" \
"pcmpeqb %%xmm6,%%xmm4\n\t" \
/*Perform the masking.*/ \
"pand %%xmm4,%%xmm0\n\t" \
"pand %%xmm4,%%xmm2\n\t" \
/*Finish the loads while unpacking the first set of rows, and expand the next
8 bits of the mask.*/ \
"movd %[m],%%xmm4\n\t" \
"movq (%[src],%[ystride]),%%xmm1\n\t" \
"pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
"movq (%[ref],%[ystride]),%%xmm3\n\t" \
"pand %%xmm6,%%xmm4\n\t" \
"punpcklbw %%xmm2,%%xmm0\n\t" \
"pcmpeqb %%xmm6,%%xmm4\n\t" \
"punpcklbw %%xmm2,%%xmm2\n\t" \
/*Mask and unpack the second set of rows.*/ \
"pand %%xmm4,%%xmm1\n\t" \
"pand %%xmm4,%%xmm3\n\t" \
"punpcklbw %%xmm3,%%xmm1\n\t" \
"punpcklbw %%xmm3,%%xmm3\n\t" \
"psubw %%xmm2,%%xmm0\n\t" \
"psubw %%xmm3,%%xmm1\n\t" \
unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
ptrdiff_t ystride;
unsigned ret;
int i;
ystride=_ystride;
__asm__ __volatile__(
"pxor %%xmm7,%%xmm7\n\t"
"movq %[c],%%xmm6\n\t"
:
:[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
);
for(i=0;i<4;i++){
unsigned m;
m=_mask&0xFFFF;
_mask>>=16;
if(m){
__asm__ __volatile__(
OC_LOAD_SUB_MASK_2x8
"pmaddwd %%xmm0,%%xmm0\n\t"
"pmaddwd %%xmm1,%%xmm1\n\t"
"paddd %%xmm0,%%xmm7\n\t"
"paddd %%xmm1,%%xmm7\n\t"
:[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m)
);
}
_src+=2*ystride;
_ref+=2*ystride;
}
__asm__ __volatile__(
"movdqa %%xmm7,%%xmm6\n\t"
"punpckhqdq %%xmm7,%%xmm7\n\t"
"paddd %%xmm6,%%xmm7\n\t"
"pshufd $1,%%xmm7,%%xmm6\n\t"
"paddd %%xmm6,%%xmm7\n\t"
"movd %%xmm7,%[ret]\n\t"
:[ret]"=a"(ret)
);
return ret;
}
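/*Editorial note: a plain-C sketch of the masked ("border") SSD above.
  Only pixels whose bit is set in the 64-bit mask contribute; the bit order
  assumed here (bit 8*i+j covers row i, column j) follows from the
  16-bits-per-two-rows consumption in the loop above.
  Illustrative only; not part of the encoder's call graph.*/
static unsigned oc_frag_border_ssd_c_sketch(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
  unsigned ssd;
  int i;
  int j;
  ssd=0;
  for(i=0;i<8;i++)for(j=0;j<8;j++){
    /*Assumed bit order: bit 8*i+j of _mask selects pixel (i,j).*/
    if((_mask>>(i*8+j))&1){
      int d;
      d=_src[i*_ystride+j]-_ref[i*_ystride+j];
      ssd+=d*d;
    }
  }
  return ssd;
}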
/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their
16-bit difference in %%xmm0...%%xmm7.*/
#define OC_LOAD_SUB_8x8 \
"#OC_LOAD_SUB_8x8\n\t" \
"movq (%[src]),%%xmm0\n\t" \
"movq (%[ref]),%%xmm4\n\t" \
"movq (%[src],%[src_ystride]),%%xmm1\n\t" \
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
"movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
"movq (%[src]),%%xmm2\n\t" \
"movq (%[ref]),%%xmm7\n\t" \
"movq (%[src],%[src_ystride]),%%xmm3\n\t" \
"movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \
"punpcklbw %%xmm4,%%xmm0\n\t" \
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
"punpcklbw %%xmm4,%%xmm4\n\t" \
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
"psubw %%xmm4,%%xmm0\n\t" \
"movq (%[src]),%%xmm4\n\t" \
"movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
"movq (%[ref]),%%xmm0\n\t" \
"punpcklbw %%xmm5,%%xmm1\n\t" \
"punpcklbw %%xmm5,%%xmm5\n\t" \
"psubw %%xmm5,%%xmm1\n\t" \
"movq (%[src],%[src_ystride]),%%xmm5\n\t" \
"punpcklbw %%xmm7,%%xmm2\n\t" \
"punpcklbw %%xmm7,%%xmm7\n\t" \
"psubw %%xmm7,%%xmm2\n\t" \
"movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \
"punpcklbw %%xmm6,%%xmm3\n\t" \
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
"punpcklbw %%xmm6,%%xmm6\n\t" \
"psubw %%xmm6,%%xmm3\n\t" \
"movq (%[src]),%%xmm6\n\t" \
"punpcklbw %%xmm0,%%xmm4\n\t" \
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
"punpcklbw %%xmm0,%%xmm0\n\t" \
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
"psubw %%xmm0,%%xmm4\n\t" \
"movq (%[ref]),%%xmm0\n\t" \
"punpcklbw %%xmm7,%%xmm5\n\t" \
"neg %[src_ystride]\n\t" \
"punpcklbw %%xmm7,%%xmm7\n\t" \
"psubw %%xmm7,%%xmm5\n\t" \
"movq (%[src],%[src_ystride]),%%xmm7\n\t" \
"punpcklbw %%xmm0,%%xmm6\n\t" \
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
"punpcklbw %%xmm0,%%xmm0\n\t" \
"neg %[ref_ystride]\n\t" \
"psubw %%xmm0,%%xmm6\n\t" \
"movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \
"punpcklbw %%xmm0,%%xmm7\n\t" \
"punpcklbw %%xmm0,%%xmm0\n\t" \
"psubw %%xmm0,%%xmm7\n\t" \
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7.*/
#define OC_LOAD_8x8 \
"#OC_LOAD_8x8\n\t" \
"movq (%[src]),%%xmm0\n\t" \
"movq (%[src],%[ystride]),%%xmm1\n\t" \
"movq (%[src],%[ystride],2),%%xmm2\n\t" \
"pxor %%xmm7,%%xmm7\n\t" \
"movq (%[src],%[ystride3]),%%xmm3\n\t" \
"punpcklbw %%xmm7,%%xmm0\n\t" \
"movq (%[src4]),%%xmm4\n\t" \
"punpcklbw %%xmm7,%%xmm1\n\t" \
"movq (%[src4],%[ystride]),%%xmm5\n\t" \
"punpcklbw %%xmm7,%%xmm2\n\t" \
"movq (%[src4],%[ystride],2),%%xmm6\n\t" \
"punpcklbw %%xmm7,%%xmm3\n\t" \
"movq (%[src4],%[ystride3]),%%xmm7\n\t" \
"punpcklbw %%xmm4,%%xmm4\n\t" \
"punpcklbw %%xmm5,%%xmm5\n\t" \
"psrlw $8,%%xmm4\n\t" \
"psrlw $8,%%xmm5\n\t" \
"punpcklbw %%xmm6,%%xmm6\n\t" \
"punpcklbw %%xmm7,%%xmm7\n\t" \
"psrlw $8,%%xmm6\n\t" \
"psrlw $8,%%xmm7\n\t" \
/*Performs the first two stages of an 8-point 1-D Hadamard transform in place.
Outputs 1, 3, 4, and 5 from the second stage are negated (which allows us to
perform this stage in place with no temporary registers).*/
#define OC_HADAMARD_AB_8x8 \
"#OC_HADAMARD_AB_8x8\n\t" \
/*Stage A:*/ \
"paddw %%xmm5,%%xmm1\n\t" \
"paddw %%xmm6,%%xmm2\n\t" \
"paddw %%xmm5,%%xmm5\n\t" \
"paddw %%xmm6,%%xmm6\n\t" \
"psubw %%xmm1,%%xmm5\n\t" \
"psubw %%xmm2,%%xmm6\n\t" \
"paddw %%xmm7,%%xmm3\n\t" \
"paddw %%xmm4,%%xmm0\n\t" \
"paddw %%xmm7,%%xmm7\n\t" \
"paddw %%xmm4,%%xmm4\n\t" \
"psubw %%xmm3,%%xmm7\n\t" \
"psubw %%xmm0,%%xmm4\n\t" \
/*Stage B:*/ \
"paddw %%xmm2,%%xmm0\n\t" \
"paddw %%xmm3,%%xmm1\n\t" \
"paddw %%xmm6,%%xmm4\n\t" \
"paddw %%xmm7,%%xmm5\n\t" \
"paddw %%xmm2,%%xmm2\n\t" \
"paddw %%xmm3,%%xmm3\n\t" \
"paddw %%xmm6,%%xmm6\n\t" \
"paddw %%xmm7,%%xmm7\n\t" \
"psubw %%xmm0,%%xmm2\n\t" \
"psubw %%xmm1,%%xmm3\n\t" \
"psubw %%xmm4,%%xmm6\n\t" \
"psubw %%xmm5,%%xmm7\n\t" \
/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
place with no temporary registers).*/
#define OC_HADAMARD_C_8x8 \
"#OC_HADAMARD_C_8x8\n\t" \
/*Stage C:*/ \
"paddw %%xmm1,%%xmm0\n\t" \
"paddw %%xmm3,%%xmm2\n\t" \
"paddw %%xmm5,%%xmm4\n\t" \
"paddw %%xmm7,%%xmm6\n\t" \
"paddw %%xmm1,%%xmm1\n\t" \
"paddw %%xmm3,%%xmm3\n\t" \
"paddw %%xmm5,%%xmm5\n\t" \
"paddw %%xmm7,%%xmm7\n\t" \
"psubw %%xmm0,%%xmm1\n\t" \
"psubw %%xmm2,%%xmm3\n\t" \
"psubw %%xmm4,%%xmm5\n\t" \
"psubw %%xmm6,%%xmm7\n\t" \
/*Performs an 8-point 1-D Hadamard transform in place.
Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform
in place with no temporary registers).*/
#define OC_HADAMARD_8x8 \
OC_HADAMARD_AB_8x8 \
OC_HADAMARD_C_8x8 \
/*Performs the first part of the final stage of the Hadamard transform and
summing of absolute values.
At the end of this part, %%xmm1 will contain the DC coefficient of the
transform.*/
#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
/*We use the fact that \
(abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
to merge the final butterfly with the abs and the first stage of \
accumulation. \
Thus we can avoid using pabsw, which is not available until SSSE3. \
Emulating pabsw takes 3 instructions, so the straightforward SSE2 \
implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
registers). \
Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
This implementation is only 26 (+4 for spilling registers).*/ \
"#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \
"movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \
"movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \
/*xmm7={0x7FFF}x8 \
xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \
"pcmpeqb %%xmm7,%%xmm7\n\t" \
"movdqa %%xmm4,%%xmm6\n\t" \
"psrlw $1,%%xmm7\n\t" \
"paddw %%xmm5,%%xmm6\n\t" \
"pmaxsw %%xmm5,%%xmm4\n\t" \
"paddsw %%xmm7,%%xmm6\n\t" \
"psubw %%xmm6,%%xmm4\n\t" \
/*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \
xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF*/ \
"movdqa %%xmm2,%%xmm6\n\t" \
"movdqa %%xmm0,%%xmm5\n\t" \
"pmaxsw %%xmm3,%%xmm2\n\t" \
"pmaxsw %%xmm1,%%xmm0\n\t" \
"paddw %%xmm3,%%xmm6\n\t" \
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \
"paddw %%xmm5,%%xmm1\n\t" \
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t" \
/*Performs the second part of the final stage of the Hadamard transform and
summing of absolute values.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
"#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \
"paddsw %%xmm7,%%xmm6\n\t" \
"paddsw %%xmm7,%%xmm1\n\t" \
"psubw %%xmm6,%%xmm2\n\t" \
"psubw %%xmm1,%%xmm0\n\t" \
/*xmm7={1}x8 (needed for the horizontal add that follows) \
xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \
"movdqa %%xmm3,%%xmm6\n\t" \
"pmaxsw %%xmm5,%%xmm3\n\t" \
"paddw %%xmm2,%%xmm0\n\t" \
"paddw %%xmm5,%%xmm6\n\t" \
"paddw %%xmm4,%%xmm0\n\t" \
"paddsw %%xmm7,%%xmm6\n\t" \
"paddw %%xmm3,%%xmm0\n\t" \
"psrlw $14,%%xmm7\n\t" \
"psubw %%xmm6,%%xmm0\n\t" \
/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
absolute value of each component, and accumulates everything into xmm0.*/
#define OC_HADAMARD_C_ABS_ACCUM_8x8 \
OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
component, and accumulates everything into xmm0.
Note that xmm0 will have an extra 4 added to each column, and that after
removing this value, the remainder will be half the conventional value.*/
#define OC_HADAMARD_ABS_ACCUM_8x8 \
OC_HADAMARD_AB_8x8 \
OC_HADAMARD_C_ABS_ACCUM_8x8
static unsigned oc_int_frag_satd_sse2(int *_dc,
const unsigned char *_src,int _src_ystride,
const unsigned char *_ref,int _ref_ystride){
OC_ALIGN16(ogg_int16_t buf[16]);
unsigned ret;
unsigned ret2;
int dc;
__asm__ __volatile__(
OC_LOAD_SUB_8x8
OC_HADAMARD_8x8
OC_TRANSPOSE_8x8
/*We split out the stages here so we can save the DC coefficient in the
middle.*/
OC_HADAMARD_AB_8x8
OC_HADAMARD_C_ABS_ACCUM_A_8x8
"movd %%xmm1,%[dc]\n\t"
OC_HADAMARD_C_ABS_ACCUM_B_8x8
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
for the factor of two we dropped + 3 for the vertical accumulation).
Now we finally have to promote things to dwords.
We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
latency of pmaddwd by starting to compute abs(dc) here.*/
"pmaddwd %%xmm7,%%xmm0\n\t"
"movsx %w[dc],%[dc]\n\t"
"cdq\n\t"
"movdqa %%xmm0,%%xmm1\n\t"
"punpckhqdq %%xmm0,%%xmm0\n\t"
"paddd %%xmm1,%%xmm0\n\t"
"pshuflw $0xE,%%xmm0,%%xmm1\n\t"
"paddd %%xmm1,%%xmm0\n\t"
"movd %%xmm0,%[ret]\n\t"
/*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
added to them, a factor of two removed, and the DC value included;
correct the final sum here.*/
"lea -64(%[ret2],%[ret],2),%[ret]\n\t"
"xor %[dc],%[ret2]\n\t"
"sub %[ret2],%[ret]\n\t"
/*Although it looks like we're using 7 registers here, gcc can alias %[ret]
and %[dc] with some of the inputs, since for once we don't write to
them until after we're done using everything but %[buf].*/
/*Note that _src_ystride and _ref_ystride must be given non-overlapping
constraints, otherwise if gcc can prove they're equal it will allocate
them to the same register (which is bad); _src and _ref face a similar
problem.
All four are destructively modified, but if we list them as output
constraints, gcc can't alias them with other outputs.*/
:[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),
[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
:[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
[ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
/*We have to use neg, so we actually clobber the condition codes for once
(not to mention sub, and add).*/
:"cc"
);
*_dc=dc;
return ret;
}
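/*Editorial note: a small sketch of the identity that
  OC_HADAMARD_C_ABS_ACCUM_A_8x8 relies on to merge the final butterfly with
  the absolute-value accumulation: (abs(a+b)+abs(a-b))/2==max(abs(a),abs(b)).
  Illustrative only; not part of the encoder's call graph.*/
static int oc_satd_abs_identity_sketch(int _a,int _b){
  int aa;
  int ab;
  int s;
  int d;
  aa=_a<0?-_a:_a;
  ab=_b<0?-_b:_b;
  s=_a+_b<0?-(_a+_b):_a+_b;
  d=_a-_b<0?-(_a-_b):_a-_b;
  /*Returns non-zero; included only to state the identity in checkable form.*/
  return (s+d)/2==(aa>ab?aa:ab);
}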
unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
const unsigned char *_ref,int _ystride){
return oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
}
unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
OC_ALIGN8(unsigned char ref[64]);
oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
}
unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
const unsigned char *_src,int _ystride){
OC_ALIGN16(ogg_int16_t buf[16]);
unsigned ret;
int dc;
__asm__ __volatile__(
OC_LOAD_8x8
OC_HADAMARD_8x8
OC_TRANSPOSE_8x8
/*We split out the stages here so we can save the DC coefficient in the
middle.*/
OC_HADAMARD_AB_8x8
OC_HADAMARD_C_ABS_ACCUM_A_8x8
"movd %%xmm1,%[dc]\n\t"
OC_HADAMARD_C_ABS_ACCUM_B_8x8
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
for the factor of two we dropped + 3 for the vertical accumulation).
Now we finally have to promote things to dwords.*/
"pmaddwd %%xmm7,%%xmm0\n\t"
/*We assume that the DC coefficient is always positive (which is true,
because the input to the INTRA transform was not a difference).*/
"movzx %w[dc],%[dc]\n\t"
"movdqa %%xmm0,%%xmm1\n\t"
"punpckhqdq %%xmm0,%%xmm0\n\t"
"paddd %%xmm1,%%xmm0\n\t"
"pshuflw $0xE,%%xmm0,%%xmm1\n\t"
"paddd %%xmm1,%%xmm0\n\t"
"movd %%xmm0,%[ret]\n\t"
"lea -64(%[ret],%[ret]),%[ret]\n\t"
"sub %[dc],%[ret]\n\t"
/*Although it looks like we're using 7 registers here, gcc can alias %[ret]
and %[dc] with some of the inputs, since for once we don't write to
them until after we're done using everything but %[buf].*/
:[ret]"=a"(ret),[dc]"=r"(dc),
[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
:[src]"r"(_src),[src4]"r"(_src+4*_ystride),
[ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
/*We have to use sub, so we actually clobber the condition codes for once.*/
:"cc"
);
*_dc=dc;
return ret;
}
#endif

thirdparty/libtheora/x86/sse2fdct.c vendored Normal file

@@ -0,0 +1,452 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006 *
* by the Xiph.Org Foundation https://www.xiph.org/ *
* *
********************************************************************/
/*SSE2 fDCT implementation for x86_64.*/
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
#include <stddef.h>
#include "x86enc.h"
#include "x86zigzag.h"
#include "sse2trans.h"
#if defined(OC_X86_64_ASM)
# define OC_FDCT_8x8 \
/*Note: xmm15={0}x8 and xmm14={-1}x8.*/ \
"#OC_FDCT_8x8\n\t" \
/*Stage 1:*/ \
"movdqa %%xmm0,%%xmm11\n\t" \
"movdqa %%xmm1,%%xmm10\n\t" \
"movdqa %%xmm2,%%xmm9\n\t" \
"movdqa %%xmm3,%%xmm8\n\t" \
/*xmm11=t7'=t0-t7*/ \
"psubw %%xmm7,%%xmm11\n\t" \
/*xmm10=t6'=t1-t6*/ \
"psubw %%xmm6,%%xmm10\n\t" \
/*xmm9=t5'=t2-t5*/ \
"psubw %%xmm5,%%xmm9\n\t" \
/*xmm8=t4'=t3-t4*/ \
"psubw %%xmm4,%%xmm8\n\t" \
/*xmm0=t0'=t0+t7*/ \
"paddw %%xmm7,%%xmm0\n\t" \
/*xmm1=t1'=t1+t6*/ \
"paddw %%xmm6,%%xmm1\n\t" \
/*xmm5=t2'=t2+t5*/ \
"paddw %%xmm2,%%xmm5\n\t" \
/*xmm4=t3'=t3+t4*/ \
"paddw %%xmm3,%%xmm4\n\t" \
/*xmm2,3,6,7 are now free.*/ \
/*Stage 2:*/ \
"movdqa %%xmm0,%%xmm3\n\t" \
"mov $0x5A806A0A,%[a]\n\t" \
"movdqa %%xmm1,%%xmm2\n\t" \
"movd %[a],%%xmm13\n\t" \
"movdqa %%xmm10,%%xmm6\n\t" \
"pshufd $00,%%xmm13,%%xmm13\n\t" \
/*xmm2=t2''=t1'-t2'*/ \
"psubw %%xmm5,%%xmm2\n\t" \
"pxor %%xmm12,%%xmm12\n\t" \
/*xmm3=t3''=t0'-t3'*/ \
"psubw %%xmm4,%%xmm3\n\t" \
"psubw %%xmm14,%%xmm12\n\t" \
/*xmm10=t5''=t6'-t5'*/ \
"psubw %%xmm9,%%xmm10\n\t" \
"paddw %%xmm12,%%xmm12\n\t" \
/*xmm4=t0''=t0'+t3'*/ \
"paddw %%xmm0,%%xmm4\n\t" \
/*xmm1=t1''=t1'+t2'*/ \
"paddw %%xmm5,%%xmm1\n\t" \
/*xmm6=t6''=t6'+t5'*/ \
"paddw %%xmm9,%%xmm6\n\t" \
/*xmm0,xmm5,xmm9 are now free.*/ \
/*Stage 3:*/ \
/*xmm10:xmm5=t5''*27146+0xB500 \
xmm0=t5''*/ \
"movdqa %%xmm10,%%xmm5\n\t" \
"movdqa %%xmm10,%%xmm0\n\t" \
"punpckhwd %%xmm12,%%xmm10\n\t" \
"pmaddwd %%xmm13,%%xmm10\n\t" \
"punpcklwd %%xmm12,%%xmm5\n\t" \
"pmaddwd %%xmm13,%%xmm5\n\t" \
/*xmm5=(t5''*27146+0xB500>>16)+t5''*/ \
"psrad $16,%%xmm10\n\t" \
"psrad $16,%%xmm5\n\t" \
"packssdw %%xmm10,%%xmm5\n\t" \
"paddw %%xmm0,%%xmm5\n\t" \
/*xmm0=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
"pcmpeqw %%xmm15,%%xmm0\n\t" \
"psubw %%xmm14,%%xmm0\n\t" \
"paddw %%xmm5,%%xmm0\n\t" \
"movdqa %%xmm8,%%xmm5\n\t" \
"psraw $1,%%xmm0\n\t" \
/*xmm5=t5'''=t4'-s*/ \
"psubw %%xmm0,%%xmm5\n\t" \
/*xmm8=t4''=t4'+s*/ \
"paddw %%xmm0,%%xmm8\n\t" \
/*xmm0,xmm7,xmm9,xmm10 are free.*/ \
/*xmm7:xmm9=t6''*27146+0xB500*/ \
"movdqa %%xmm6,%%xmm7\n\t" \
"movdqa %%xmm6,%%xmm9\n\t" \
"punpckhwd %%xmm12,%%xmm7\n\t" \
"pmaddwd %%xmm13,%%xmm7\n\t" \
"punpcklwd %%xmm12,%%xmm9\n\t" \
"pmaddwd %%xmm13,%%xmm9\n\t" \
/*xmm9=(t6''*27146+0xB500>>16)+t6''*/ \
"psrad $16,%%xmm7\n\t" \
"psrad $16,%%xmm9\n\t" \
"packssdw %%xmm7,%%xmm9\n\t" \
"paddw %%xmm6,%%xmm9\n\t" \
/*xmm9=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
"pcmpeqw %%xmm15,%%xmm6\n\t" \
"psubw %%xmm14,%%xmm6\n\t" \
"paddw %%xmm6,%%xmm9\n\t" \
"movdqa %%xmm11,%%xmm7\n\t" \
"psraw $1,%%xmm9\n\t" \
/*xmm7=t6'''=t7'-s*/ \
"psubw %%xmm9,%%xmm7\n\t" \
/*xmm9=t7''=t7'+s*/ \
"paddw %%xmm11,%%xmm9\n\t" \
/*xmm0,xmm6,xmm10,xmm11 are free.*/ \
/*Stage 4:*/ \
/*xmm10:xmm0=t1''*27146+0xB500*/ \
"movdqa %%xmm1,%%xmm0\n\t" \
"movdqa %%xmm1,%%xmm10\n\t" \
"punpcklwd %%xmm12,%%xmm0\n\t" \
"pmaddwd %%xmm13,%%xmm0\n\t" \
"punpckhwd %%xmm12,%%xmm10\n\t" \
"pmaddwd %%xmm13,%%xmm10\n\t" \
/*xmm0=(t1''*27146+0xB500>>16)+t1''*/ \
"psrad $16,%%xmm0\n\t" \
"psrad $16,%%xmm10\n\t" \
"mov $0x20006A0A,%[a]\n\t" \
"packssdw %%xmm10,%%xmm0\n\t" \
"movd %[a],%%xmm13\n\t" \
"paddw %%xmm1,%%xmm0\n\t" \
/*xmm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
"pcmpeqw %%xmm15,%%xmm1\n\t" \
"pshufd $00,%%xmm13,%%xmm13\n\t" \
"psubw %%xmm14,%%xmm1\n\t" \
"paddw %%xmm1,%%xmm0\n\t" \
/*xmm10:xmm4=t0''*27146+0x4000*/ \
"movdqa %%xmm4,%%xmm1\n\t" \
"movdqa %%xmm4,%%xmm10\n\t" \
"punpcklwd %%xmm12,%%xmm4\n\t" \
"pmaddwd %%xmm13,%%xmm4\n\t" \
"punpckhwd %%xmm12,%%xmm10\n\t" \
"pmaddwd %%xmm13,%%xmm10\n\t" \
/*xmm4=(t0''*27146+0x4000>>16)+t0''*/ \
"psrad $16,%%xmm4\n\t" \
"psrad $16,%%xmm10\n\t" \
"mov $0x6CB7,%[a]\n\t" \
"packssdw %%xmm10,%%xmm4\n\t" \
"movd %[a],%%xmm12\n\t" \
"paddw %%xmm1,%%xmm4\n\t" \
/*xmm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
"pcmpeqw %%xmm15,%%xmm1\n\t" \
"pshufd $00,%%xmm12,%%xmm12\n\t" \
"psubw %%xmm14,%%xmm1\n\t" \
"mov $0x7FFF6C84,%[a]\n\t" \
"paddw %%xmm1,%%xmm4\n\t" \
/*xmm0=_y[0]=u=r+s>>1 \
The naive implementation could cause overflow, so we use \
u=(r&s)+((r^s)>>1).*/ \
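/*This is the usual overflow-free average: r+s=(r^s)+2*(r&s), so \
  (r&s)+((r^s)>>1) is exactly r+s>>1 without ever forming the full sum.*/ \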
"movdqa %%xmm0,%%xmm6\n\t" \
"pxor %%xmm4,%%xmm0\n\t" \
"pand %%xmm4,%%xmm6\n\t" \
"psraw $1,%%xmm0\n\t" \
"movd %[a],%%xmm13\n\t" \
"paddw %%xmm6,%%xmm0\n\t" \
/*xmm4=_y[4]=v=r-u*/ \
"pshufd $00,%%xmm13,%%xmm13\n\t" \
"psubw %%xmm0,%%xmm4\n\t" \
/*xmm1,xmm6,xmm10,xmm11 are free.*/ \
/*xmm6:xmm10=60547*t3''+0x6CB7*/ \
"movdqa %%xmm3,%%xmm10\n\t" \
"movdqa %%xmm3,%%xmm6\n\t" \
"punpcklwd %%xmm3,%%xmm10\n\t" \
"pmaddwd %%xmm13,%%xmm10\n\t" \
"mov $0x61F861F8,%[a]\n\t" \
"punpckhwd %%xmm3,%%xmm6\n\t" \
"pmaddwd %%xmm13,%%xmm6\n\t" \
"movd %[a],%%xmm13\n\t" \
"paddd %%xmm12,%%xmm10\n\t" \
"pshufd $00,%%xmm13,%%xmm13\n\t" \
"paddd %%xmm12,%%xmm6\n\t" \
/*xmm1:xmm2=25080*t2'' \
xmm12=t2''*/ \
"movdqa %%xmm2,%%xmm11\n\t" \
"movdqa %%xmm2,%%xmm12\n\t" \
"pmullw %%xmm13,%%xmm2\n\t" \
"pmulhw %%xmm13,%%xmm11\n\t" \
"movdqa %%xmm2,%%xmm1\n\t" \
"punpcklwd %%xmm11,%%xmm2\n\t" \
"punpckhwd %%xmm11,%%xmm1\n\t" \
/*xmm10=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
"paddd %%xmm2,%%xmm10\n\t" \
"paddd %%xmm1,%%xmm6\n\t" \
"psrad $16,%%xmm10\n\t" \
"pcmpeqw %%xmm15,%%xmm3\n\t" \
"psrad $16,%%xmm6\n\t" \
"psubw %%xmm14,%%xmm3\n\t" \
"packssdw %%xmm6,%%xmm10\n\t" \
"paddw %%xmm3,%%xmm10\n\t" \
/*xmm2=_y[2]=u \
xmm10=s=(25080*u>>16)-t2''*/ \
"movdqa %%xmm10,%%xmm2\n\t" \
"pmulhw %%xmm13,%%xmm10\n\t" \
"psubw %%xmm12,%%xmm10\n\t" \
/*xmm1:xmm6=s*21600+0x2800*/ \
"pxor %%xmm12,%%xmm12\n\t" \
"psubw %%xmm14,%%xmm12\n\t" \
"mov $0x28005460,%[a]\n\t" \
"movd %[a],%%xmm13\n\t" \
"pshufd $00,%%xmm13,%%xmm13\n\t" \
"movdqa %%xmm10,%%xmm6\n\t" \
"movdqa %%xmm10,%%xmm1\n\t" \
"punpcklwd %%xmm12,%%xmm6\n\t" \
"pmaddwd %%xmm13,%%xmm6\n\t" \
"mov $0x0E3D,%[a]\n\t" \
"punpckhwd %%xmm12,%%xmm1\n\t" \
"pmaddwd %%xmm13,%%xmm1\n\t" \
/*xmm6=(s*21600+0x2800>>18)+s*/ \
"psrad $18,%%xmm6\n\t" \
"psrad $18,%%xmm1\n\t" \
"movd %[a],%%xmm12\n\t" \
"packssdw %%xmm1,%%xmm6\n\t" \
"pshufd $00,%%xmm12,%%xmm12\n\t" \
"paddw %%xmm10,%%xmm6\n\t" \
/*xmm6=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
"mov $0x7FFF54DC,%[a]\n\t" \
"pcmpeqw %%xmm15,%%xmm10\n\t" \
"movd %[a],%%xmm13\n\t" \
"psubw %%xmm14,%%xmm10\n\t" \
"pshufd $00,%%xmm13,%%xmm13\n\t" \
"paddw %%xmm10,%%xmm6\n\t " \
/*xmm1,xmm3,xmm10,xmm11 are free.*/ \
/*xmm11:xmm10=54491*t5'''+0x0E3D*/ \
"movdqa %%xmm5,%%xmm10\n\t" \
"movdqa %%xmm5,%%xmm11\n\t" \
"punpcklwd %%xmm5,%%xmm10\n\t" \
"pmaddwd %%xmm13,%%xmm10\n\t" \
"mov $0x8E3A8E3A,%[a]\n\t" \
"punpckhwd %%xmm5,%%xmm11\n\t" \
"pmaddwd %%xmm13,%%xmm11\n\t" \
"movd %[a],%%xmm13\n\t" \
"paddd %%xmm12,%%xmm10\n\t" \
"pshufd $00,%%xmm13,%%xmm13\n\t" \
"paddd %%xmm12,%%xmm11\n\t" \
/*xmm7:xmm12=36410*t6''' \
xmm1=t6'''*/ \
"movdqa %%xmm7,%%xmm3\n\t" \
"movdqa %%xmm7,%%xmm1\n\t" \
"pmulhw %%xmm13,%%xmm3\n\t" \
"pmullw %%xmm13,%%xmm7\n\t" \
"paddw %%xmm1,%%xmm3\n\t" \
"movdqa %%xmm7,%%xmm12\n\t" \
"punpckhwd %%xmm3,%%xmm7\n\t" \
"punpcklwd %%xmm3,%%xmm12\n\t" \
/*xmm10=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
"paddd %%xmm12,%%xmm10\n\t" \
"paddd %%xmm7,%%xmm11\n\t" \
"psrad $16,%%xmm10\n\t" \
"pcmpeqw %%xmm15,%%xmm5\n\t" \
"psrad $16,%%xmm11\n\t" \
"psubw %%xmm14,%%xmm5\n\t" \
"packssdw %%xmm11,%%xmm10\n\t" \
"pxor %%xmm12,%%xmm12\n\t" \
"paddw %%xmm5,%%xmm10\n\t" \
/*xmm5=_y[5]=u \
xmm1=s=t6'''-(36410*u>>16)*/ \
"psubw %%xmm14,%%xmm12\n\t" \
"movdqa %%xmm10,%%xmm5\n\t" \
"mov $0x340067C8,%[a]\n\t" \
"pmulhw %%xmm13,%%xmm10\n\t" \
"movd %[a],%%xmm13\n\t" \
"paddw %%xmm5,%%xmm10\n\t" \
"pshufd $00,%%xmm13,%%xmm13\n\t" \
"psubw %%xmm10,%%xmm1\n\t" \
/*xmm11:xmm3=s*26568+0x3400*/ \
"movdqa %%xmm1,%%xmm3\n\t" \
"movdqa %%xmm1,%%xmm11\n\t" \
"punpcklwd %%xmm12,%%xmm3\n\t" \
"pmaddwd %%xmm13,%%xmm3\n\t" \
"mov $0x7B1B,%[a]\n\t" \
"punpckhwd %%xmm12,%%xmm11\n\t" \
"pmaddwd %%xmm13,%%xmm11\n\t" \
/*xmm3=(s*26568+0x3400>>17)+s*/ \
"psrad $17,%%xmm3\n\t" \
"psrad $17,%%xmm11\n\t" \
"movd %[a],%%xmm12\n\t" \
"packssdw %%xmm11,%%xmm3\n\t" \
"pshufd $00,%%xmm12,%%xmm12\n\t" \
"paddw %%xmm1,%%xmm3\n\t" \
/*xmm3=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
"mov $0x7FFF7B16,%[a]\n\t" \
"pcmpeqw %%xmm15,%%xmm1\n\t" \
"movd %[a],%%xmm13\n\t" \
"psubw %%xmm14,%%xmm1\n\t" \
"pshufd $00,%%xmm13,%%xmm13\n\t" \
"paddw %%xmm1,%%xmm3\n\t " \
/*xmm1,xmm7,xmm10,xmm11 are free.*/ \
/*xmm11:xmm10=64277*t7''+0x7B1B*/ \
"movdqa %%xmm9,%%xmm10\n\t" \
"movdqa %%xmm9,%%xmm11\n\t" \
"punpcklwd %%xmm9,%%xmm10\n\t" \
"pmaddwd %%xmm13,%%xmm10\n\t" \
"mov $0x31F131F1,%[a]\n\t" \
"punpckhwd %%xmm9,%%xmm11\n\t" \
"pmaddwd %%xmm13,%%xmm11\n\t" \
"movd %[a],%%xmm13\n\t" \
"paddd %%xmm12,%%xmm10\n\t" \
"pshufd $00,%%xmm13,%%xmm13\n\t" \
"paddd %%xmm12,%%xmm11\n\t" \
/*xmm12:xmm7=12785*t4''*/ \
"movdqa %%xmm8,%%xmm7\n\t" \
"movdqa %%xmm8,%%xmm1\n\t" \
"pmullw %%xmm13,%%xmm7\n\t" \
"pmulhw %%xmm13,%%xmm1\n\t" \
"movdqa %%xmm7,%%xmm12\n\t" \
"punpcklwd %%xmm1,%%xmm7\n\t" \
"punpckhwd %%xmm1,%%xmm12\n\t" \
/*xmm10=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
"paddd %%xmm7,%%xmm10\n\t" \
"paddd %%xmm12,%%xmm11\n\t" \
"psrad $16,%%xmm10\n\t" \
"pcmpeqw %%xmm15,%%xmm9\n\t" \
"psrad $16,%%xmm11\n\t" \
"psubw %%xmm14,%%xmm9\n\t" \
"packssdw %%xmm11,%%xmm10\n\t" \
"pxor %%xmm12,%%xmm12\n\t" \
"paddw %%xmm9,%%xmm10\n\t" \
/*xmm1=_y[1]=u \
xmm10=s=(12785*u>>16)-t4''*/ \
"psubw %%xmm14,%%xmm12\n\t" \
"movdqa %%xmm10,%%xmm1\n\t" \
"mov $0x3000503B,%[a]\n\t" \
"pmulhw %%xmm13,%%xmm10\n\t" \
"movd %[a],%%xmm13\n\t" \
"psubw %%xmm8,%%xmm10\n\t" \
"pshufd $00,%%xmm13,%%xmm13\n\t" \
/*xmm8:xmm7=s*20539+0x3000*/ \
"movdqa %%xmm10,%%xmm7\n\t" \
"movdqa %%xmm10,%%xmm8\n\t" \
"punpcklwd %%xmm12,%%xmm7\n\t" \
"pmaddwd %%xmm13,%%xmm7\n\t" \
"punpckhwd %%xmm12,%%xmm8\n\t" \
"pmaddwd %%xmm13,%%xmm8\n\t" \
/*xmm7=(s*20539+0x3000>>20)+s*/ \
"psrad $20,%%xmm7\n\t" \
"psrad $20,%%xmm8\n\t" \
"packssdw %%xmm8,%%xmm7\n\t" \
"paddw %%xmm10,%%xmm7\n\t" \
/*xmm7=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
"pcmpeqw %%xmm15,%%xmm10\n\t" \
"psubw %%xmm14,%%xmm10\n\t" \
"paddw %%xmm10,%%xmm7\n\t " \
/*SSE2 implementation of the fDCT for x86-64 only.
Because of the 8 extra XMM registers on x86-64, this version can operate
without any temporary stack access at all.*/
void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
ptrdiff_t a;
__asm__ __volatile__(
/*Load the input.*/
"movdqa 0x00(%[x]),%%xmm0\n\t"
"movdqa 0x10(%[x]),%%xmm1\n\t"
"movdqa 0x20(%[x]),%%xmm2\n\t"
"movdqa 0x30(%[x]),%%xmm3\n\t"
"movdqa 0x40(%[x]),%%xmm4\n\t"
"movdqa 0x50(%[x]),%%xmm5\n\t"
"movdqa 0x60(%[x]),%%xmm6\n\t"
"movdqa 0x70(%[x]),%%xmm7\n\t"
/*Add two extra bits of working precision to improve accuracy; any more and
we could overflow.*/
/*We also add a few biases to correct for some systematic error that
remains in the full fDCT->iDCT round trip.*/
/*xmm15={0}x8*/
"pxor %%xmm15,%%xmm15\n\t"
/*xmm14={-1}x8*/
"pcmpeqb %%xmm14,%%xmm14\n\t"
"psllw $2,%%xmm0\n\t"
/*xmm8=xmm0*/
"movdqa %%xmm0,%%xmm8\n\t"
"psllw $2,%%xmm1\n\t"
/*xmm8={_x[7...0]==0}*/
"pcmpeqw %%xmm15,%%xmm8\n\t"
"psllw $2,%%xmm2\n\t"
/*xmm8={_x[7...0]!=0}*/
"psubw %%xmm14,%%xmm8\n\t"
"psllw $2,%%xmm3\n\t"
/*%[a]=1*/
"mov $1,%[a]\n\t"
/*xmm8={_x[6]!=0,0,_x[4]!=0,0,_x[2]!=0,0,_x[0]!=0,0}*/
"pslld $16,%%xmm8\n\t"
"psllw $2,%%xmm4\n\t"
/*xmm9={0,0,0,0,0,0,0,1}*/
"movd %[a],%%xmm9\n\t"
/*xmm8={0,0,_x[2]!=0,0,_x[0]!=0,0}*/
"pshufhw $0x00,%%xmm8,%%xmm8\n\t"
"psllw $2,%%xmm5\n\t"
/*%[a]={1}x2*/
"mov $0x10001,%[a]\n\t"
/*xmm8={0,0,0,0,0,0,0,_x[0]!=0}*/
"pshuflw $0x01,%%xmm8,%%xmm8\n\t"
"psllw $2,%%xmm6\n\t"
/*xmm10={0,0,0,0,0,0,1,1}*/
"movd %[a],%%xmm10\n\t"
/*xmm0=_x[7...0]+{0,0,0,0,0,0,0,_x[0]!=0}*/
"paddw %%xmm8,%%xmm0\n\t"
"psllw $2,%%xmm7\n\t"
/*xmm0=_x[7...0]+{0,0,0,0,0,0,1,(_x[0]!=0)+1}*/
"paddw %%xmm10,%%xmm0\n\t"
/*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/
"psubw %%xmm9,%%xmm1\n\t"
/*Transform columns.*/
OC_FDCT_8x8
/*Transform rows.*/
OC_TRANSPOSE_8x8
OC_FDCT_8x8
/*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/
"paddw %%xmm14,%%xmm14\n\t"
"psubw %%xmm14,%%xmm0\n\t"
"psubw %%xmm14,%%xmm1\n\t"
"psraw $2,%%xmm0\n\t"
"psubw %%xmm14,%%xmm2\n\t"
"psraw $2,%%xmm1\n\t"
"psubw %%xmm14,%%xmm3\n\t"
"psraw $2,%%xmm2\n\t"
"psubw %%xmm14,%%xmm4\n\t"
"psraw $2,%%xmm3\n\t"
"psubw %%xmm14,%%xmm5\n\t"
"psraw $2,%%xmm4\n\t"
"psubw %%xmm14,%%xmm6\n\t"
"psraw $2,%%xmm5\n\t"
"psubw %%xmm14,%%xmm7\n\t"
"psraw $2,%%xmm6\n\t"
"psraw $2,%%xmm7\n\t"
/*Transpose, zig-zag, and store the result.*/
/*We could probably do better using SSSE3's palignr, but re-using the MMXEXT
version will do for now.*/
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
"movdq2q %%xmm"#_row","_reg"\n\t" \
#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
"punpckhqdq %%xmm"#_row",%%xmm"#_row"\n\t" \
"movdq2q %%xmm"#_row","_reg"\n\t" \
OC_TRANSPOSE_ZIG_ZAG_MMXEXT
#undef OC_ZZ_LOAD_ROW_LO
#undef OC_ZZ_LOAD_ROW_HI
:[a]"=&r"(a)
:[y]"r"(_y),[x]"r"(_x)
:"memory"
);
}
#endif

thirdparty/libtheora/x86/sse2idct.c vendored Normal file

@@ -0,0 +1,456 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009,2025 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
/*SSE2 acceleration of Theora's iDCT.*/
#include "x86int.h"
#include "sse2trans.h"
#include "../dct.h"
#if defined(OC_X86_ASM)
/*A table of constants used by the MMX and SSE2 routines.*/
const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={
8, 8, 8, 8, 8, 8, 8, 8,
OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,
OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,
OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,
OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,
OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,
OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,
OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1
};
/*Performs the first three stages of the iDCT.
xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input
(accessed in that order).
The remaining rows must be in _x at their corresponding locations.
On output, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
contain rows 4 through 7.*/
#define OC_IDCT_8x8_ABC(_x) \
"#OC_IDCT_8x8_ABC\n\t" \
/*Stage 1:*/ \
/*2-3 rotation by 6pi/16. \
xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
"movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
"movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
"movdqa %%xmm1,%%xmm0\n\t" \
"pmulhw %%xmm2,%%xmm1\n\t" \
"movdqa %%xmm4,%%xmm7\n\t" \
"pmulhw %%xmm6,%%xmm0\n\t" \
"pmulhw %%xmm2,%%xmm7\n\t" \
"pmulhw %%xmm6,%%xmm4\n\t" \
"paddw %%xmm6,%%xmm0\n\t" \
"movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
"paddw %%xmm1,%%xmm2\n\t" \
"psubw %%xmm0,%%xmm7\n\t" \
"movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
"paddw %%xmm4,%%xmm2\n\t" \
"movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
/*5-6 rotation by 3pi/16. \
xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
"movdqa %%xmm4,%%xmm2\n\t" \
"movdqa %%xmm6,%%xmm1\n\t" \
"pmulhw %%xmm3,%%xmm4\n\t" \
"pmulhw %%xmm5,%%xmm1\n\t" \
"pmulhw %%xmm3,%%xmm6\n\t" \
"pmulhw %%xmm5,%%xmm2\n\t" \
"paddw %%xmm3,%%xmm4\n\t" \
"paddw %%xmm5,%%xmm3\n\t" \
"paddw %%xmm6,%%xmm3\n\t" \
"movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
"paddw %%xmm5,%%xmm1\n\t" \
"movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
"paddw %%xmm3,%%xmm2\n\t" \
"movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
"psubw %%xmm4,%%xmm1\n\t" \
"movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
/*4-7 rotation by 7pi/16. \
xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
"movdqa %%xmm3,%%xmm0\n\t" \
"movdqa %%xmm4,%%xmm7\n\t" \
"pmulhw %%xmm5,%%xmm3\n\t" \
"pmulhw %%xmm5,%%xmm7\n\t" \
"pmulhw %%xmm6,%%xmm4\n\t" \
"pmulhw %%xmm6,%%xmm0\n\t" \
"paddw %%xmm6,%%xmm4\n\t" \
"movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
"paddw %%xmm5,%%xmm7\n\t" \
"psubw %%xmm4,%%xmm3\n\t" \
"movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
"paddw %%xmm7,%%xmm0\n\t" \
"movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
/*0-1 butterfly. \
xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
"paddw %%xmm7,%%xmm6\n\t" \
"movdqa %%xmm4,%%xmm5\n\t" \
"pmulhw %%xmm6,%%xmm4\n\t" \
"paddw %%xmm7,%%xmm7\n\t" \
"psubw %%xmm6,%%xmm7\n\t" \
"paddw %%xmm6,%%xmm4\n\t" \
/*Stage 2:*/ \
/*4-5 butterfly: xmm3=t[4], xmm1=t[5] \
7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \
"movdqa %%xmm3,%%xmm6\n\t" \
"paddw %%xmm1,%%xmm3\n\t" \
"psubw %%xmm1,%%xmm6\n\t" \
"movdqa %%xmm5,%%xmm1\n\t" \
"pmulhw %%xmm7,%%xmm5\n\t" \
"paddw %%xmm7,%%xmm5\n\t" \
"movdqa %%xmm0,%%xmm7\n\t" \
"paddw %%xmm2,%%xmm0\n\t" \
"psubw %%xmm2,%%xmm7\n\t" \
"movdqa %%xmm1,%%xmm2\n\t" \
"pmulhw %%xmm6,%%xmm1\n\t" \
"pmulhw %%xmm7,%%xmm2\n\t" \
"paddw %%xmm6,%%xmm1\n\t" \
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
"paddw %%xmm7,%%xmm2\n\t" \
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
/*Stage 3: \
6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
"paddw %%xmm2,%%xmm1\n\t" \
"paddw %%xmm5,%%xmm6\n\t" \
"paddw %%xmm4,%%xmm7\n\t" \
"paddw %%xmm2,%%xmm2\n\t" \
"paddw %%xmm4,%%xmm4\n\t" \
"paddw %%xmm5,%%xmm5\n\t" \
"psubw %%xmm1,%%xmm2\n\t" \
"psubw %%xmm7,%%xmm4\n\t" \
"psubw %%xmm6,%%xmm5\n\t" \
/*Performs the last stage of the iDCT.
On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
contain rows 4 through 7.
On output, xmm0 through xmm7 contain the corresponding rows.*/
#define OC_IDCT_8x8_D \
"#OC_IDCT_8x8_D\n\t" \
/*Stage 4: \
0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
"psubw %%xmm0,%%xmm7\n\t" \
"psubw %%xmm1,%%xmm6\n\t" \
"psubw %%xmm2,%%xmm5\n\t" \
"psubw %%xmm3,%%xmm4\n\t" \
"paddw %%xmm0,%%xmm0\n\t" \
"paddw %%xmm1,%%xmm1\n\t" \
"paddw %%xmm2,%%xmm2\n\t" \
"paddw %%xmm3,%%xmm3\n\t" \
"paddw %%xmm7,%%xmm0\n\t" \
"paddw %%xmm6,%%xmm1\n\t" \
"paddw %%xmm5,%%xmm2\n\t" \
"paddw %%xmm4,%%xmm3\n\t" \
/*Performs the last stage of the iDCT.
On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
contain rows 4 through 7.
On output, xmm0 through xmm7 contain the corresponding rows.*/
#define OC_IDCT_8x8_D_STORE \
"#OC_IDCT_8x8_D_STORE\n\t" \
/*Stage 4: \
0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
"psubw %%xmm3,%%xmm4\n\t" \
"movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
"movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
"psubw %%xmm0,%%xmm7\n\t" \
"psubw %%xmm1,%%xmm6\n\t" \
"psubw %%xmm2,%%xmm5\n\t" \
"paddw %%xmm4,%%xmm7\n\t" \
"paddw %%xmm4,%%xmm6\n\t" \
"paddw %%xmm4,%%xmm5\n\t" \
"paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
"paddw %%xmm0,%%xmm0\n\t" \
"paddw %%xmm1,%%xmm1\n\t" \
"paddw %%xmm2,%%xmm2\n\t" \
"paddw %%xmm3,%%xmm3\n\t" \
"paddw %%xmm7,%%xmm0\n\t" \
"paddw %%xmm6,%%xmm1\n\t" \
"psraw $4,%%xmm0\n\t" \
"paddw %%xmm5,%%xmm2\n\t" \
"movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
"psraw $4,%%xmm1\n\t" \
"paddw %%xmm4,%%xmm3\n\t" \
"movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
"psraw $4,%%xmm2\n\t" \
"movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
"psraw $4,%%xmm3\n\t" \
"movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
"psraw $4,%%xmm4\n\t" \
"movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
"psraw $4,%%xmm5\n\t" \
"movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
"psraw $4,%%xmm6\n\t" \
"movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
"psraw $4,%%xmm7\n\t" \
"movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
OC_ALIGN16(ogg_int16_t buf[16]);
int i;
/*This routine accepts an 8x8 matrix pre-transposed.*/
__asm__ __volatile__(
/*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
"movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t"
"movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t"
"movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t"
"movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t"
OC_IDCT_8x8_ABC(x)
OC_IDCT_8x8_D
OC_TRANSPOSE_8x8
/*Clear out rows 0, 1, 4, and 7 for the first stage of the iDCT.*/
"movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t"
"movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t"
"movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t"
"movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t"
OC_IDCT_8x8_ABC(y)
OC_IDCT_8x8_D_STORE
:[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
[y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
:[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
[c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
);
__asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::);
/*Clear input data for next block (decoder only).*/
for(i=0;i<2;i++){
__asm__ __volatile__(
"movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
"movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
"movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
"movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
:[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
);
}
}
/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
need to work with four columns at a time.
Doing this in MMX is faster on processors with a 64-bit data path.*/
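/*(With _last_zzi<=10 only the first ten coefficients in zig-zag order can be
non-zero, and those all lie in the upper-left 4x4 quarter of the block, so a
four-column first pass is enough.)*/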
#define OC_IDCT_8x8_10_MMX \
"#OC_IDCT_8x8_10_MMX\n\t" \
/*Stage 1:*/ \
/*2-3 rotation by 6pi/16. \
mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
"movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
"movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
"pmulhw %%mm2,%%mm6\n\t" \
"pmulhw %%mm2,%%mm7\n\t" \
"movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
"paddw %%mm6,%%mm2\n\t" \
"movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
"movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
"movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
/*5-6 rotation by 3pi/16. \
mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
"pmulhw %%mm3,%%mm5\n\t" \
"pmulhw %%mm3,%%mm2\n\t" \
"movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
"paddw %%mm3,%%mm5\n\t" \
"paddw %%mm3,%%mm2\n\t" \
"movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
/*4-7 rotation by 7pi/16. \
mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
"pmulhw %%mm1,%%mm3\n\t" \
"pmulhw %%mm1,%%mm7\n\t" \
"movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
"movq %%mm3,%%mm6\n\t" \
"paddw %%mm1,%%mm7\n\t" \
/*0-1 butterfly. \
mm4=C4, mm0=X0, X4=0.*/ \
/*Stage 2:*/ \
/*4-5 butterfly: mm3=t[4], mm5=t[5] \
7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
"psubw %%mm5,%%mm3\n\t" \
"paddw %%mm5,%%mm6\n\t" \
"movq %%mm4,%%mm1\n\t" \
"pmulhw %%mm0,%%mm4\n\t" \
"paddw %%mm0,%%mm4\n\t" \
"movq %%mm7,%%mm0\n\t" \
"movq %%mm4,%%mm5\n\t" \
"paddw %%mm2,%%mm0\n\t" \
"psubw %%mm2,%%mm7\n\t" \
"movq %%mm1,%%mm2\n\t" \
"pmulhw %%mm6,%%mm1\n\t" \
"pmulhw %%mm7,%%mm2\n\t" \
"paddw %%mm6,%%mm1\n\t" \
"movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \
"paddw %%mm7,%%mm2\n\t" \
"movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \
/*Stage 3: \
6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \
0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \
1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \
"paddw %%mm2,%%mm1\n\t" \
"paddw %%mm5,%%mm6\n\t" \
"paddw %%mm4,%%mm7\n\t" \
"paddw %%mm2,%%mm2\n\t" \
"paddw %%mm4,%%mm4\n\t" \
"paddw %%mm5,%%mm5\n\t" \
"psubw %%mm1,%%mm2\n\t" \
"psubw %%mm7,%%mm4\n\t" \
"psubw %%mm6,%%mm5\n\t" \
/*Stage 4: \
0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \
1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \
2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \
3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \
"psubw %%mm0,%%mm7\n\t" \
"psubw %%mm1,%%mm6\n\t" \
"psubw %%mm2,%%mm5\n\t" \
"psubw %%mm3,%%mm4\n\t" \
"paddw %%mm0,%%mm0\n\t" \
"paddw %%mm1,%%mm1\n\t" \
"paddw %%mm2,%%mm2\n\t" \
"paddw %%mm3,%%mm3\n\t" \
"paddw %%mm7,%%mm0\n\t" \
"paddw %%mm6,%%mm1\n\t" \
"paddw %%mm5,%%mm2\n\t" \
"paddw %%mm4,%%mm3\n\t" \
#define OC_IDCT_8x8_10_ABC \
"#OC_IDCT_8x8_10_ABC\n\t" \
/*Stage 1:*/ \
/*2-3 rotation by 6pi/16. \
xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
"movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
"movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
"pmulhw %%xmm2,%%xmm6\n\t" \
"pmulhw %%xmm2,%%xmm7\n\t" \
"movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
"paddw %%xmm6,%%xmm2\n\t" \
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
"movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
"movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
/*5-6 rotation by 3pi/16. \
xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
"pmulhw %%xmm3,%%xmm5\n\t" \
"pmulhw %%xmm3,%%xmm2\n\t" \
"movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
"paddw %%xmm3,%%xmm5\n\t" \
"paddw %%xmm3,%%xmm2\n\t" \
"movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
/*4-7 rotation by 7pi/16. \
xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
"pmulhw %%xmm1,%%xmm3\n\t" \
"pmulhw %%xmm1,%%xmm7\n\t" \
"movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
"movdqa %%xmm3,%%xmm6\n\t" \
"paddw %%xmm1,%%xmm7\n\t" \
/*0-1 butterfly. \
xmm4=C4, xmm0=X0, X4=0.*/ \
/*Stage 2:*/ \
/*4-5 butterfly: xmm3=t[4], xmm5=t[5] \
7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
"psubw %%xmm5,%%xmm3\n\t" \
"paddw %%xmm5,%%xmm6\n\t" \
"movdqa %%xmm4,%%xmm1\n\t" \
"pmulhw %%xmm0,%%xmm4\n\t" \
"paddw %%xmm0,%%xmm4\n\t" \
"movdqa %%xmm7,%%xmm0\n\t" \
"movdqa %%xmm4,%%xmm5\n\t" \
"paddw %%xmm2,%%xmm0\n\t" \
"psubw %%xmm2,%%xmm7\n\t" \
"movdqa %%xmm1,%%xmm2\n\t" \
"pmulhw %%xmm6,%%xmm1\n\t" \
"pmulhw %%xmm7,%%xmm2\n\t" \
"paddw %%xmm6,%%xmm1\n\t" \
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
"paddw %%xmm7,%%xmm2\n\t" \
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
/*Stage 3: \
6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
"paddw %%xmm2,%%xmm1\n\t" \
"paddw %%xmm5,%%xmm6\n\t" \
"paddw %%xmm4,%%xmm7\n\t" \
"paddw %%xmm2,%%xmm2\n\t" \
"paddw %%xmm4,%%xmm4\n\t" \
"paddw %%xmm5,%%xmm5\n\t" \
"psubw %%xmm1,%%xmm2\n\t" \
"psubw %%xmm7,%%xmm4\n\t" \
"psubw %%xmm6,%%xmm5\n\t" \
static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
OC_ALIGN16(ogg_int16_t buf[16]);
/*This routine accepts an 8x8 matrix pre-transposed.*/
__asm__ __volatile__(
"movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t"
"movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t"
"movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t"
"movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t"
OC_IDCT_8x8_10_MMX
OC_TRANSPOSE_8x4_MMX2SSE
OC_IDCT_8x8_10_ABC
OC_IDCT_8x8_D_STORE
:[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16)),
[y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
:[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
[c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
);
/*Clear input data for next block (decoder only).*/
__asm__ __volatile__(
"pxor %%mm0,%%mm0\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
:[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
);
}
/*Performs an inverse 8x8 Type-II DCT transform.
The input is assumed to be scaled by a factor of 4 relative to orthonormal
version of the transform.*/
void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
/*_last_zzi is subtly different from an actual count of the number of
coefficients we decoded for this block.
It contains the value of zzi BEFORE the final token in the block was
decoded.
In most cases this is an EOB token (the continuation of an EOB run from a
previous block counts), and so this is the same as the coefficient count.
However, in the case that the last token was NOT an EOB token, but filled
the block up with exactly 64 coefficients, _last_zzi will be less than 64.
Provided the last token was not a pure zero run, the minimum value it can
be is 46, and so that doesn't affect any of the cases in this routine.
However, if the last token WAS a pure zero run of length 63, then _last_zzi
will be 1 while the number of coefficients decoded is 64.
Thus, we will trigger the following special case, where the real
coefficient count would not.
Note also that a zero run of length 64 will give _last_zzi a value of 0,
but we still process the DC coefficient, which might have a non-zero value
due to DC prediction.
Although convoluted, this is arguably the correct behavior: it allows us to
use a smaller transform when the block ends with a long zero run instead
of a normal EOB token.
It could be smarter... multiple separate zero runs at the end of a block
will fool it, but an encoder that generates these really deserves what it
gets.
Needless to say we inherited this approach from VP3.*/
/*Then perform the iDCT.*/
if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x);
else oc_idct8x8_slow_sse2(_y,_x);
}
#endif

thirdparty/libtheora/x86/sse2trans.h vendored Normal file

@@ -0,0 +1,242 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
#if !defined(_x86_sse2trans_H)
# define _x86_sse2trans_H (1)
# include "x86int.h"
# if defined(OC_X86_64_ASM)
/*On x86-64 we can transpose in-place without spilling registers.
By clever choices of the order to apply the butterflies and the order of
their outputs, we can take the rows in order and output the columns in order
without any extra operations and using just one temporary register.*/
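/*In the comments of the transpose macros below, letters name the source row a
value came from (a=xmm0 ... h=xmm7 on entry) and digits give its index within
that row; each string lists words from most to least significant, so
"h0 g0 f0 e0 d0 c0 b0 a0" is the transposed column 0.*/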
# define OC_TRANSPOSE_8x8 \
"#OC_TRANSPOSE_8x8\n\t" \
"movdqa %%xmm4,%%xmm8\n\t" \
/*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
"punpcklwd %%xmm5,%%xmm4\n\t" \
/*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
"punpckhwd %%xmm5,%%xmm8\n\t" \
/*xmm5 is free.*/ \
"movdqa %%xmm0,%%xmm5\n\t" \
/*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
"punpcklwd %%xmm1,%%xmm0\n\t" \
/*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
"punpckhwd %%xmm1,%%xmm5\n\t" \
/*xmm1 is free.*/ \
"movdqa %%xmm6,%%xmm1\n\t" \
/*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
"punpcklwd %%xmm7,%%xmm6\n\t" \
/*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
"punpckhwd %%xmm7,%%xmm1\n\t" \
/*xmm7 is free.*/ \
"movdqa %%xmm2,%%xmm7\n\t" \
/*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
"punpckhwd %%xmm3,%%xmm2\n\t" \
/*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
"punpcklwd %%xmm3,%%xmm7\n\t" \
/*xmm3 is free.*/ \
"movdqa %%xmm0,%%xmm3\n\t" \
/*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
"punpckldq %%xmm7,%%xmm0\n\t" \
/*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
"punpckhdq %%xmm7,%%xmm3\n\t" \
/*xmm7 is free.*/ \
"movdqa %%xmm5,%%xmm7\n\t" \
/*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
"punpckldq %%xmm2,%%xmm5\n\t" \
/*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
"punpckhdq %%xmm2,%%xmm7\n\t" \
/*xmm2 is free.*/ \
"movdqa %%xmm4,%%xmm2\n\t" \
/*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
"punpckhdq %%xmm6,%%xmm4\n\t" \
/*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
"punpckldq %%xmm6,%%xmm2\n\t" \
/*xmm6 is free.*/ \
"movdqa %%xmm8,%%xmm6\n\t" \
/*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
"punpckldq %%xmm1,%%xmm6\n\t" \
/*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
"punpckhdq %%xmm1,%%xmm8\n\t" \
/*xmm1 is free.*/ \
"movdqa %%xmm0,%%xmm1\n\t" \
/*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
"punpcklqdq %%xmm2,%%xmm0\n\t" \
/*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
"punpckhqdq %%xmm2,%%xmm1\n\t" \
/*xmm2 is free.*/ \
"movdqa %%xmm3,%%xmm2\n\t" \
/*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
"punpckhqdq %%xmm4,%%xmm3\n\t" \
/*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
"punpcklqdq %%xmm4,%%xmm2\n\t" \
/*xmm4 is free.*/ \
"movdqa %%xmm5,%%xmm4\n\t" \
/*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
"punpckhqdq %%xmm6,%%xmm5\n\t" \
/*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
"punpcklqdq %%xmm6,%%xmm4\n\t" \
/*xmm6 is free.*/ \
"movdqa %%xmm7,%%xmm6\n\t" \
/*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
"punpckhqdq %%xmm8,%%xmm7\n\t" \
/*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
"punpcklqdq %%xmm8,%%xmm6\n\t" \
/*xmm8 is free.*/ \

# else
/*Otherwise, we need to spill some values to %[buf] temporarily.
Again, the butterflies are carefully arranged to get the columns to come out
in order, minimizing register spills and maximizing the delay between a load
and when the value loaded is actually used.*/
# define OC_TRANSPOSE_8x8 \
"#OC_TRANSPOSE_8x8\n\t" \
/*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \
"movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
/*xmm0 is free.*/ \
"movdqa %%xmm2,%%xmm0\n\t" \
/*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
"punpckhwd %%xmm3,%%xmm2\n\t" \
/*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
"punpcklwd %%xmm3,%%xmm0\n\t" \
/*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \
/*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
/*xmm2 is free.*/ \
"movdqa %%xmm6,%%xmm2\n\t" \
/*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
"punpcklwd %%xmm7,%%xmm6\n\t" \
/*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
"punpckhwd %%xmm7,%%xmm2\n\t" \
/*xmm7 is free.*/ \
"movdqa %%xmm4,%%xmm7\n\t" \
/*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
"punpcklwd %%xmm5,%%xmm4\n\t" \
/*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
"punpckhwd %%xmm5,%%xmm7\n\t" \
/*xmm5 is free.*/ \
"movdqa %%xmm3,%%xmm5\n\t" \
/*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
"punpcklwd %%xmm1,%%xmm3\n\t" \
/*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
"punpckhwd %%xmm1,%%xmm5\n\t" \
/*xmm1 is free.*/ \
"movdqa %%xmm7,%%xmm1\n\t" \
/*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
"punpckldq %%xmm2,%%xmm7\n\t" \
/*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
"punpckhdq %%xmm2,%%xmm1\n\t" \
/*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \
/*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \
"movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \
/*xmm1 is free.*/ \
"movdqa %%xmm3,%%xmm1\n\t" \
/*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
"punpckhdq %%xmm0,%%xmm3\n\t" \
/*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
"punpckldq %%xmm0,%%xmm1\n\t" \
/*xmm0 is free.*/ \
"movdqa %%xmm4,%%xmm0\n\t" \
/*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
"punpckhdq %%xmm6,%%xmm4\n\t" \
/*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
"punpckldq %%xmm6,%%xmm0\n\t" \
/*xmm6 is free.*/ \
"movdqa %%xmm5,%%xmm6\n\t" \
/*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
"punpckldq %%xmm2,%%xmm5\n\t" \
/*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
"punpckhdq %%xmm2,%%xmm6\n\t" \
/*xmm2 is free.*/ \
"movdqa %%xmm1,%%xmm2\n\t" \
/*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
"punpckhqdq %%xmm0,%%xmm1\n\t" \
/*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
"punpcklqdq %%xmm0,%%xmm2\n\t" \
/*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
/*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
/*xmm2 is free.*/ \
"movdqa %%xmm3,%%xmm2\n\t" \
/*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
"punpckhqdq %%xmm4,%%xmm3\n\t" \
/*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
"punpcklqdq %%xmm4,%%xmm2\n\t" \
/*xmm4 is free.*/ \
"movdqa %%xmm5,%%xmm4\n\t" \
/*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
"punpckhqdq %%xmm7,%%xmm5\n\t" \
/*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
"punpcklqdq %%xmm7,%%xmm4\n\t" \
/*xmm7 is free.*/ \
"movdqa %%xmm6,%%xmm7\n\t" \
/*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
"punpcklqdq %%xmm0,%%xmm6\n\t" \
/*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
"punpckhqdq %%xmm0,%%xmm7\n\t" \
/*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \
# endif
/*Transpose 4 values in each of 8 MMX registers into 8 values in the first
four SSE registers.
No need to be clever here; we have plenty of room.*/
# define OC_TRANSPOSE_8x4_MMX2SSE \
"#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \
"movq2dq %%mm0,%%xmm0\n\t" \
"movq2dq %%mm1,%%xmm1\n\t" \
/*xmmA = b3 a3 b2 a2 b1 a1 b0 a0*/ \
"punpcklwd %%xmm1,%%xmm0\n\t" \
"movq2dq %%mm2,%%xmm3\n\t" \
"movq2dq %%mm3,%%xmm2\n\t" \
/*xmmC = d3 c3 d2 c2 d1 c1 d0 c0*/ \
"punpcklwd %%xmm2,%%xmm3\n\t" \
"movq2dq %%mm4,%%xmm4\n\t" \
"movq2dq %%mm5,%%xmm5\n\t" \
/*xmmE = f3 e3 f2 e2 f1 e1 f0 e0*/ \
"punpcklwd %%xmm5,%%xmm4\n\t" \
"movq2dq %%mm6,%%xmm7\n\t" \
"movq2dq %%mm7,%%xmm6\n\t" \
/*xmmG = h3 g3 h2 g2 h1 g1 h0 g0*/ \
"punpcklwd %%xmm6,%%xmm7\n\t" \
"movdqa %%xmm0,%%xmm2\n\t" \
/*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
"punpckldq %%xmm3,%%xmm0\n\t" \
/*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
"punpckhdq %%xmm3,%%xmm2\n\t" \
"movdqa %%xmm4,%%xmm5\n\t" \
/*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
"punpckldq %%xmm7,%%xmm4\n\t" \
/*xmm5 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
"punpckhdq %%xmm7,%%xmm5\n\t" \
"movdqa %%xmm0,%%xmm1\n\t" \
/*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
"punpcklqdq %%xmm4,%%xmm0\n\t" \
/*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
"punpckhqdq %%xmm4,%%xmm1\n\t" \
"movdqa %%xmm2,%%xmm3\n\t" \
/*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
"punpcklqdq %%xmm5,%%xmm2\n\t" \
/*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
"punpckhqdq %%xmm5,%%xmm3\n\t" \
#endif

thirdparty/libtheora/x86/x86cpu.c vendored Normal file

@@ -0,0 +1,182 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
CPU capability detection for x86 processors.
Originally written by Rudolf Marek.
function:
********************************************************************/
#include "x86cpu.h"
#if !defined(OC_X86_ASM)
ogg_uint32_t oc_cpu_flags_get(void){
return 0;
}
#else
# if defined(__amd64__)||defined(__x86_64__)
/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
compiling with -fPIC.*/
# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
__asm__ __volatile__( \
"cpuid\n\t" \
:[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
:"a"(_op) \
:"cc" \
)
# else
/*On x86-32, not so much.*/
# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
__asm__ __volatile__( \
"xchgl %%ebx,%[ebx]\n\t" \
"cpuid\n\t" \
"xchgl %%ebx,%[ebx]\n\t" \
:[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
:"a"(_op) \
:"cc" \
)
# endif
static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
ogg_uint32_t flags;
/*If there isn't even MMX, give up.*/
if(!(_edx&0x00800000))return 0;
flags=OC_CPU_X86_MMX;
if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3;
if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
return flags;
}
static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
ogg_uint32_t flags;
/*If there isn't even MMX, give up.*/
if(!(_edx&0x00800000))return 0;
flags=OC_CPU_X86_MMX;
if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A;
if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5;
return flags;
}
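/*The vendor checks below compare EBX, EDX, and ECX against the CPUID vendor
string, packed four ASCII bytes per register, little-endian.
E.g., for "GenuineIntel":
EBX='G'|'e'<<8|'n'<<16|'u'<<24=0x756E6547,
EDX='i'|'n'<<8|'e'<<16|'I'<<24=0x49656E69,
ECX='n'|'t'<<8|'e'<<16|'l'<<24=0x6C65746E.*/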
ogg_uint32_t oc_cpu_flags_get(void){
ogg_uint32_t flags;
ogg_uint32_t eax;
ogg_uint32_t ebx;
ogg_uint32_t ecx;
ogg_uint32_t edx;
# if !defined(__amd64__)&&!defined(__x86_64__)
/*Not all x86-32 chips support cpuid, so we have to check.*/
__asm__ __volatile__(
"pushfl\n\t"
"pushfl\n\t"
"popl %[a]\n\t"
"movl %[a],%[b]\n\t"
"xorl $0x200000,%[a]\n\t"
"pushl %[a]\n\t"
"popfl\n\t"
"pushfl\n\t"
"popl %[a]\n\t"
"popfl\n\t"
:[a]"=r"(eax),[b]"=r"(ebx)
:
:"cc"
);
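/*Bit 21 of EFLAGS (0x200000) is the ID flag: if it can be toggled, the chip
supports the cpuid instruction, and the two values read back above differ.*/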
/*No cpuid.*/
if(eax==ebx)return 0;
# endif
cpuid(0,eax,ebx,ecx,edx);
/* l e t n I e n i u n e G*/
if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
/* 6 8 x M T e n i u n e G*/
ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
int family;
int model;
/*Intel, Transmeta (tested with Crusoe TM5800):*/
cpuid(1,eax,ebx,ecx,edx);
flags=oc_parse_intel_flags(edx,ecx);
family=(eax>>8)&0xF;
model=(eax>>4)&0xF;
/*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
unit, so don't use it.*/
if(family==6&&(model==9||model==13||model==14)){
flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
}
}
/* D M A c i t n e h t u A*/
else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
/* C S N y b e d o e G*/
ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
/*AMD, Geode:*/
cpuid(0x80000000,eax,ebx,ecx,edx);
if(eax<0x80000001)flags=0;
else{
cpuid(0x80000001,eax,ebx,ecx,edx);
flags=oc_parse_amd_flags(edx,ecx);
}
/*Also check for SSE.*/
cpuid(1,eax,ebx,ecx,edx);
flags|=oc_parse_intel_flags(edx,ecx);
}
/*Technically some VIA chips can be configured in the BIOS to return any
string here the user wants.
There is a special detection method that can be used to identify such
processors, but in my opinion, if the user really wants to change it, they
deserve what they get.*/
/* s l u a H r u a t n e C*/
else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
/*VIA:*/
/*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
chips (thanks to the engineers from Centaur Technology who provided it).
These chips support Intel-like cpuid info.
The C3-2 (Nehemiah) cores appear to, as well.*/
cpuid(1,eax,ebx,ecx,edx);
flags=oc_parse_intel_flags(edx,ecx);
if(eax>=0x80000001){
/*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
We need to check this even if the Intel test succeeds to pick up 3DNow!
support on these processors.
Unlike actual AMD processors, we cannot _rely_ on this info, since
some cores (e.g., the 693 stepping of the Nehemiah) claim to support
this function, yet return edx=0, despite the Intel test indicating
MMX support.
Therefore the features detected here are strictly added to those
detected by the Intel test.*/
/*TODO: How about earlier chips?*/
cpuid(0x80000001,eax,ebx,ecx,edx);
/*Note: As of the C7, this function returns Intel-style extended feature
flags, not AMD-style.
Currently, this only defines bits 11, 20, and 29 (0x20100800), which
do not conflict with any of the AMD flags we inspect.
For the remaining bits, Intel tells us, "Do not count on their value",
but VIA assures us that they will all be zero (at least on the C7 and
Isaiah chips).
In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
(0xC0C00000) for something else, we will have to add code to detect
the model to decide when it is appropriate to inspect them.*/
flags|=oc_parse_amd_flags(edx,ecx);
}
}
else{
/*Implement me.*/
flags=0;
}
return flags;
}
#endif

thirdparty/libtheora/x86/x86cpu.h vendored Normal file

@@ -0,0 +1,36 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
#if !defined(_x86_x86cpu_H)
# define _x86_x86cpu_H (1)
#include "../internal.h"
#define OC_CPU_X86_MMX (1<<0)
#define OC_CPU_X86_3DNOW (1<<1)
#define OC_CPU_X86_3DNOWEXT (1<<2)
#define OC_CPU_X86_MMXEXT (1<<3)
#define OC_CPU_X86_SSE (1<<4)
#define OC_CPU_X86_SSE2 (1<<5)
#define OC_CPU_X86_PNI (1<<6)
#define OC_CPU_X86_SSSE3 (1<<7)
#define OC_CPU_X86_SSE4_1 (1<<8)
#define OC_CPU_X86_SSE4_2 (1<<9)
#define OC_CPU_X86_SSE4A (1<<10)
#define OC_CPU_X86_SSE5 (1<<11)
ogg_uint32_t oc_cpu_flags_get(void);
#endif

thirdparty/libtheora/x86/x86enc.c vendored Normal file

@@ -0,0 +1,63 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
#include "x86enc.h"
#if defined(OC_X86_ASM)
void oc_enc_accel_init_x86(oc_enc_ctx *_enc){
ogg_uint32_t cpu_flags;
cpu_flags=_enc->state.cpu_flags;
oc_enc_accel_init_c(_enc);
# if defined(OC_ENC_USE_VTABLE)
if(cpu_flags&OC_CPU_X86_MMX){
_enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
_enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
_enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
_enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
}
if(cpu_flags&OC_CPU_X86_MMXEXT){
_enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
_enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
_enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
_enc->opt_vtable.frag_satd=oc_enc_frag_satd_mmxext;
_enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_mmxext;
_enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
_enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmxext;
}
if(cpu_flags&OC_CPU_X86_SSE2){
# if defined(OC_X86_64_ASM)
_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;
# endif
_enc->opt_vtable.frag_ssd=oc_enc_frag_ssd_sse2;
_enc->opt_vtable.frag_border_ssd=oc_enc_frag_border_ssd_sse2;
_enc->opt_vtable.frag_satd=oc_enc_frag_satd_sse2;
_enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_sse2;
_enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_sse2;
_enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_x86;
_enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_x86;
_enc->opt_vtable.quantize=oc_enc_quantize_sse2;
# else
(void) cpu_flags;
# endif
_enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
_enc->opt_data.enquant_table_alignment=16;
# if defined(OC_ENC_USE_VTABLE)
}
# endif
}
#endif

thirdparty/libtheora/x86/x86enc.h vendored Normal file

@@ -0,0 +1,114 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
#if !defined(_x86_x86enc_H)
# define _x86_x86enc_H (1)
# include "x86int.h"
# if defined(OC_X86_ASM)
# define oc_enc_accel_init oc_enc_accel_init_x86
# if defined(OC_X86_64_ASM)
/*x86-64 guarantees SIMD support up through at least SSE2.
If the best routine we have available only needs SSE2 (which at the moment
covers all of them), then we can avoid runtime detection and the indirect
call.*/
# define oc_enc_frag_sub(_enc,_diff,_x,_y,_stride) \
oc_enc_frag_sub_mmx(_diff,_x,_y,_stride)
# define oc_enc_frag_sub_128(_enc,_diff,_x,_stride) \
oc_enc_frag_sub_128_mmx(_diff,_x,_stride)
# define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \
oc_enc_frag_sad_mmxext(_src,_ref,_ystride)
# define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \
oc_enc_frag_sad_thresh_mmxext(_src,_ref,_ystride,_thresh)
# define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
oc_enc_frag_sad2_thresh_mmxext(_src,_ref1,_ref2,_ystride,_thresh)
# define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
oc_enc_frag_satd_sse2(_dc,_src,_ref,_ystride)
# define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \
oc_enc_frag_satd2_sse2(_dc,_src,_ref1,_ref2,_ystride)
# define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \
oc_enc_frag_intra_satd_sse2(_dc,_src,_ystride)
# define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \
oc_enc_frag_ssd_sse2(_src,_ref,_ystride)
# define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \
oc_enc_frag_border_ssd_sse2(_src,_ref,_ystride,_mask)
# define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \
oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride)
# define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \
oc_enc_enquant_table_init_x86(_enquant,_dequant)
# define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \
oc_enc_enquant_table_fixup_x86(_enquant,_nqis)
# define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \
oc_enc_quantize_sse2(_qdct,_dct,_dequant,_enquant)
# define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \
oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
# define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \
oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
# define oc_enc_fdct8x8(_enc,_y,_x) \
oc_enc_fdct8x8_x86_64sse2(_y,_x)
# else
# define OC_ENC_USE_VTABLE (1)
# endif
# endif
# include "../encint.h"
void oc_enc_accel_init_x86(oc_enc_ctx *_enc);
void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
const unsigned char *_x,const unsigned char *_y,int _stride);
void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
const unsigned char *_x,int _stride);
unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
const unsigned char *_ref,int _ystride);
unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
const unsigned char *_ref,int _ystride,unsigned _thresh);
unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
unsigned _thresh);
unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
const unsigned char *_ref,int _ystride);
unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
const unsigned char *_ref,int _ystride);
unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
const unsigned char *_src,int _ystride);
unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
const unsigned char *_src,int _ystride);
unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
const unsigned char *_ref,int _ystride);
unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
const unsigned char *_src1,const unsigned char *_src2,int _src_ystride);
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
void oc_enc_enquant_table_init_x86(void *_enquant,
const ogg_uint16_t _dequant[64]);
void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis);
int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
const ogg_uint16_t _dequant[64],const void *_enquant);
void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
# if defined(OC_X86_64_ASM)
void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
# endif
#endif

thirdparty/libtheora/x86/x86enquant.c vendored Normal file

@@ -0,0 +1,149 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
#include "x86enc.h"
#if defined(OC_X86_ASM)
/*The default enquant table is not quite suitable for SIMD purposes.
First, the m and l parameters need to be separated so that an entire row full
of m's or l's can be loaded at a time.
Second, x86 SIMD has no element-wise arithmetic right-shift, so we have to
emulate one with a multiply.
Therefore we translate the shift count into a scale factor.*/
void oc_enc_enquant_table_init_x86(void *_enquant,
const ogg_uint16_t _dequant[64]){
ogg_int16_t *m;
ogg_int16_t *l;
int zzi;
m=(ogg_int16_t *)_enquant;
l=m+64;
for(zzi=0;zzi<64;zzi++){
oc_iquant q;
oc_iquant_init(&q,_dequant[zzi]);
m[zzi]=q.m;
/*q.l must be at least 2 for this to work; fortunately, once all the scale
factors are baked in, the minimum quantizer is much larger than that.*/
l[zzi]=1<<16-q.l;
}
}
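/*For illustration, a hypothetical plain-C model (not part of the library) of
the scale-factor trick above: for 0<=x<0x8000 and 2<=l<=15, keeping the high
16 bits of x*(1<<16-l) is exactly x>>l, which is what pmulhw computes against
the l[] row in each lane.*/
#if 0
static ogg_int16_t oc_shr_via_pmulhw(ogg_int16_t _x,int _l){
/*E.g., _x=1000, _l=4: 1000*4096==4096000, and 4096000>>16==62==1000>>4.*/
return (ogg_int16_t)(_x*(1<<16-_l)>>16);
}
#endif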
void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis){
int pli;
int qii;
int qti;
for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){
((ogg_int16_t *)_enquant[pli][qii][qti])[0]=
((ogg_int16_t *)_enquant[pli][0][qti])[0];
((ogg_int16_t *)_enquant[pli][qii][qti])[64]=
((ogg_int16_t *)_enquant[pli][0][qti])[64];
}
}
int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
const ogg_uint16_t _dequant[64],const void *_enquant){
ptrdiff_t r;
__asm__ __volatile__(
"xor %[r],%[r]\n\t"
/*Loop through two rows at a time.*/
".p2align 4\n\t"
"0:\n\t"
/*Load the first two rows of the data and the quant matrices.*/
"movdqa 0x00(%[dct],%[r]),%%xmm0\n\t"
"movdqa 0x10(%[dct],%[r]),%%xmm1\n\t"
"movdqa 0x00(%[dq],%[r]),%%xmm2\n\t"
"movdqa 0x10(%[dq],%[r]),%%xmm3\n\t"
"movdqa 0x00(%[q],%[r]),%%xmm4\n\t"
"movdqa 0x10(%[q],%[r]),%%xmm5\n\t"
/*Double the input and propagate its sign to the rounding factor.
Using SSSE3's psignw would help here, but we need the mask later anyway.*/
"movdqa %%xmm0,%%xmm6\n\t"
"psraw $15,%%xmm0\n\t"
"movdqa %%xmm1,%%xmm7\n\t"
"paddw %%xmm6,%%xmm6\n\t"
"psraw $15,%%xmm1\n\t"
"paddw %%xmm7,%%xmm7\n\t"
"paddw %%xmm0,%%xmm2\n\t"
"paddw %%xmm1,%%xmm3\n\t"
"pxor %%xmm0,%%xmm2\n\t"
"pxor %%xmm1,%%xmm3\n\t"
/*Add the rounding factor and perform the first multiply.*/
"paddw %%xmm2,%%xmm6\n\t"
"paddw %%xmm3,%%xmm7\n\t"
"pmulhw %%xmm6,%%xmm4\n\t"
"pmulhw %%xmm7,%%xmm5\n\t"
"movdqa 0x80(%[q],%[r]),%%xmm2\n\t"
"movdqa 0x90(%[q],%[r]),%%xmm3\n\t"
"paddw %%xmm4,%%xmm6\n\t"
"paddw %%xmm5,%%xmm7\n\t"
/*Emulate an element-wise right-shift via a second multiply.*/
"pmulhw %%xmm2,%%xmm6\n\t"
"pmulhw %%xmm3,%%xmm7\n\t"
"add $32,%[r]\n\t"
"cmp $96,%[r]\n\t"
/*Correct for the sign.*/
"psubw %%xmm0,%%xmm6\n\t"
"psubw %%xmm1,%%xmm7\n\t"
/*Save the result.*/
"movdqa %%xmm6,-0x20(%[qdct],%[r])\n\t"
"movdqa %%xmm7,-0x10(%[qdct],%[r])\n\t"
"jle 0b\n\t"
/*Now find the location of the last non-zero value.*/
"movdqa 0x50(%[qdct]),%%xmm5\n\t"
"movdqa 0x40(%[qdct]),%%xmm4\n\t"
"packsswb %%xmm7,%%xmm6\n\t"
"packsswb %%xmm5,%%xmm4\n\t"
"pxor %%xmm0,%%xmm0\n\t"
"mov $-1,%k[dq]\n\t"
"pcmpeqb %%xmm0,%%xmm6\n\t"
"pcmpeqb %%xmm0,%%xmm4\n\t"
"pmovmskb %%xmm6,%k[q]\n\t"
"pmovmskb %%xmm4,%k[r]\n\t"
"shl $16,%k[q]\n\t"
"or %k[r],%k[q]\n\t"
"mov $32,%[r]\n\t"
/*We have to use xor here instead of not in order to set the flags.*/
"xor %k[dq],%k[q]\n\t"
"jnz 1f\n\t"
"movdqa 0x30(%[qdct]),%%xmm7\n\t"
"movdqa 0x20(%[qdct]),%%xmm6\n\t"
"movdqa 0x10(%[qdct]),%%xmm5\n\t"
"movdqa 0x00(%[qdct]),%%xmm4\n\t"
"packsswb %%xmm7,%%xmm6\n\t"
"packsswb %%xmm5,%%xmm4\n\t"
"pcmpeqb %%xmm0,%%xmm6\n\t"
"pcmpeqb %%xmm0,%%xmm4\n\t"
"pmovmskb %%xmm6,%k[q]\n\t"
"pmovmskb %%xmm4,%k[r]\n\t"
"shl $16,%k[q]\n\t"
"or %k[r],%k[q]\n\t"
"xor %[r],%[r]\n\t"
"not %k[q]\n\t"
"or $1,%k[q]\n\t"
"1:\n\t"
"bsr %k[q],%k[q]\n\t"
"add %k[q],%k[r]\n\t"
:[r]"=&a"(r),[q]"+r"(_enquant),[dq]"+r"(_dequant)
:[dct]"r"(_dct),[qdct]"r"(_qdct)
:"cc","memory"
);
return (int)r;
}
#endif

thirdparty/libtheora/x86/x86int.h vendored Normal file

@@ -0,0 +1,122 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
#if !defined(_x86_x86int_H)
# define _x86_x86int_H (1)
# include "../internal.h"
# if defined(OC_X86_ASM)
# define oc_state_accel_init oc_state_accel_init_x86
# if defined(OC_X86_64_ASM)
/*x86-64 guarantees SIMD support up through at least SSE2.
If the best routine we have available only needs SSE2 (which at the moment
covers all of them), then we can avoid runtime detection and the indirect
call.*/
# define oc_frag_copy(_state,_dst,_src,_ystride) \
oc_frag_copy_mmx(_dst,_src,_ystride)
# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
_fragis,_nfragis,_frag_buf_offs) \
oc_frag_copy_list_mmx(_dst_frame,_src_frame,_ystride, \
_fragis,_nfragis,_frag_buf_offs)
# define oc_frag_recon_intra(_state,_dst,_ystride,_residue) \
oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
oc_frag_recon_inter2_mmx(_dst,_src1,_src2,_ystride,_residue)
# define oc_idct8x8(_state,_y,_x,_last_zzi) \
oc_idct8x8_sse2(_y,_x,_last_zzi)
# define oc_state_frag_recon oc_state_frag_recon_mmx
# define oc_loop_filter_init(_state,_bv,_flimit) \
oc_loop_filter_init_mmxext(_bv,_flimit)
# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_mmxext
# define oc_restore_fpu(_state) \
oc_restore_fpu_mmx()
# else
# define OC_STATE_USE_VTABLE (1)
# endif
# endif
# include "../state.h"
# include "x86cpu.h"
/*Converts the expression in the argument to a string.*/
#define OC_M2STR(_s) #_s
/*Memory operands do not always include an offset.
To avoid warnings, we force an offset with %H (which adds 8).*/
# if __GNUC_PREREQ(4,0)
# define OC_MEM_OFFS(_offs,_name) \
OC_M2STR(_offs-8+%H[_name])
# endif
/*If your gcc version doesn't support %H, then you get to suffer the warnings.
Note that Apple's gas breaks on things like _offs+(%esp): it throws away the
whole offset, instead of substituting in 0 for the missing operand to +.*/
# if !defined(OC_MEM_OFFS)
# define OC_MEM_OFFS(_offs,_name) \
OC_M2STR(_offs+%[_name])
# endif
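/*For example, OC_MEM_OFFS(0x10,buf) expands to the string "0x10-8+%H[buf]"
with the %H form above; %H prints the operand with an extra offset of 8, so
the assembler still sees the intended address buf+0x10.*/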
/*Declare an array operand with an exact size.
This tells gcc we're going to clobber this memory region, without having to
clobber all of "memory" and lets us access local buffers directly using the
stack pointer, without allocating a separate register to point to them.*/
#define OC_ARRAY_OPERAND(_type,_ptr,_size) \
(*({ \
struct{_type array_value__[(_size)];} *array_addr__=(void *)(_ptr); \
array_addr__; \
}))
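/*E.g., a constraint written as [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
tells gcc that the asm writes exactly those 16 elements of buf and nothing
else.*/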
/*Declare a constant array operand with an exact size.
This tells gcc exactly which memory region we're going to read, without having
to clobber all of "memory", and lets us access local buffers directly using
the stack pointer, without allocating a separate register to point to them.*/
#define OC_CONST_ARRAY_OPERAND(_type,_ptr,_size) \
(*({ \
const struct{_type array_value__[(_size)];} *array_addr__= \
(const void *)(_ptr); \
array_addr__; \
}))
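/*Editorial sketch (disabled; not taken from the library sources): how the
   operand macros above are meant to appear in an extended asm statement.
  The function and the [dst]/[src] operand names are hypothetical; the point
   is the constraint shapes: an output "=m" for a buffer the asm writes and a
   plain "m" input for one it only reads.*/
#if 0
static void oc_sketch_copy16(ogg_int16_t _dst[16],const ogg_int16_t _src[16]){
  __asm__ __volatile__(
    "movq "OC_MEM_OFFS(0x00,src)",%%mm0\n\t"
    "movq "OC_MEM_OFFS(0x08,src)",%%mm1\n\t"
    "movq "OC_MEM_OFFS(0x10,src)",%%mm2\n\t"
    "movq "OC_MEM_OFFS(0x18,src)",%%mm3\n\t"
    "movq %%mm0,"OC_MEM_OFFS(0x00,dst)"\n\t"
    "movq %%mm1,"OC_MEM_OFFS(0x08,dst)"\n\t"
    "movq %%mm2,"OC_MEM_OFFS(0x10,dst)"\n\t"
    "movq %%mm3,"OC_MEM_OFFS(0x18,dst)"\n\t"
    :[dst]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_dst,16))
    :[src]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_src,16))
  );
  /*Real code must also leave the MMX state clean afterwards; see
     oc_restore_fpu_mmx() below.*/
}
#endif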
extern const unsigned short __attribute__((aligned(16))) OC_IDCT_CONSTS[64];
void oc_state_accel_init_x86(oc_theora_state *_state);
void oc_frag_copy_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride);
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
const unsigned char *_src_frame,int _ystride,
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
const ogg_int16_t *_residue);
void oc_frag_recon_inter_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit);
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
void oc_restore_fpu_mmx(void);
#endif

97
thirdparty/libtheora/x86/x86state.c vendored Normal file
View File

@@ -0,0 +1,97 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
#include "x86int.h"
#if defined(OC_X86_ASM)
#if defined(OC_STATE_USE_VTABLE)
/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
each quadrant of the destination.*/
static const unsigned char OC_FZIG_ZAG_MMX[128]={
0, 8, 1, 2, 9,16,24,17,
10, 3,32,11,18,25, 4,12,
5,26,19,40,33,34,41,48,
27, 6,13,20,28,21,14, 7,
56,49,42,35,43,50,57,36,
15,22,29,30,23,44,37,58,
51,59,38,45,52,31,60,53,
46,39,47,54,61,62,55,63,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64
};
#endif
/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
the destination.*/
static const unsigned char OC_FZIG_ZAG_SSE2[128]={
0, 8, 1, 2, 9,16,24,17,
10, 3, 4,11,18,25,32,40,
33,26,19,12, 5, 6,13,20,
27,34,41,48,56,49,42,35,
28,21,14, 7,15,22,29,36,
43,50,57,58,51,44,37,30,
23,31,38,45,52,59,60,53,
46,39,47,54,61,62,55,63,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64
};
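/*Editorial sketch (disabled; not part of the library build): how the two
   tables above relate to the plain forward zig-zag order OC_FZIG_ZAG (the
   order shown in the OC_ZIG_ZAG_MMXEXT comment in x86zigzag.h).
  OC_FZIG_ZAG_SSE2 applies a full 8x8 transpose to each destination index,
   while OC_FZIG_ZAG_MMX transposes only within the 4x4 quadrant the index
   falls in; entries 64..127 are padding and stay 64.
  The helper names and the _fzig_zag parameter below are hypothetical.*/
#if 0
static unsigned char oc_zz_transpose8x8(unsigned char _ci){
  /*Natural index 8*row+col becomes 8*col+row.*/
  return (unsigned char)(((_ci&7)<<3)|(_ci>>3));
}
static unsigned char oc_zz_transpose4x4(unsigned char _ci){
  /*Keep the quadrant (row&4, col&4); transpose the position inside it.*/
  int row;
  int col;
  row=_ci>>3;
  col=_ci&7;
  return (unsigned char)((((row&4)|(col&3))<<3)|((col&4)|(row&3)));
}
static void oc_build_zig_zag_tables(const unsigned char _fzig_zag[64],
 unsigned char _mmx[128],unsigned char _sse2[128]){
  int zzi;
  for(zzi=0;zzi<64;zzi++){
    _mmx[zzi]=oc_zz_transpose4x4(_fzig_zag[zzi]);
    _sse2[zzi]=oc_zz_transpose8x8(_fzig_zag[zzi]);
  }
  for(;zzi<128;zzi++)_mmx[zzi]=_sse2[zzi]=64;
}
#endif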
void oc_state_accel_init_x86(oc_theora_state *_state){
oc_state_accel_init_c(_state);
_state->cpu_flags=oc_cpu_flags_get();
# if defined(OC_STATE_USE_VTABLE)
if(_state->cpu_flags&OC_CPU_X86_MMX){
_state->opt_vtable.frag_copy=oc_frag_copy_mmx;
_state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
_state->opt_vtable.idct8x8=oc_idct8x8_mmx;
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
_state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
_state->opt_vtable.state_loop_filter_frag_rows=
oc_state_loop_filter_frag_rows_mmx;
_state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
}
if(_state->cpu_flags&OC_CPU_X86_MMXEXT){
_state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmxext;
_state->opt_vtable.state_loop_filter_frag_rows=
oc_state_loop_filter_frag_rows_mmxext;
}
if(_state->cpu_flags&OC_CPU_X86_SSE2){
_state->opt_vtable.idct8x8=oc_idct8x8_sse2;
# endif
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_SSE2;
# if defined(OC_STATE_USE_VTABLE)
}
# endif
}
#endif

244
thirdparty/libtheora/x86/x86zigzag.h vendored Normal file
View File

@@ -0,0 +1,244 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
#if !defined(_x86_x86zigzag_H)
# define _x86_x86zigzag_H (1)
# include "x86enc.h"
/*Converts DCT coefficients from transposed order into zig-zag scan order and
stores them in %[y].
This relies on two macros to load the contents of each row:
OC_ZZ_LOAD_ROW_LO(row,"reg") and OC_ZZ_LOAD_ROW_HI(row,"reg"), which load
the first four and second four entries of each row into the specified
register, respectively.
OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row
(because when the rows are already in SSE2 registers, loading the high half
destructively modifies the register).
The index of each output element in the original 64-element array should wind
up in the following 8x8 matrix (the letters indicate the order we compute
each 4-tuple below):
A 0 8 1 2 9 16 24 17 B
C 10 3 4 11 18 25 32 40 E
F 33 26 19 12 5 6 13 20 D
G 27 34 41 48 56 49 42 35 I
L 28 21 14 7 15 22 29 36 M
H 43 50 57 58 51 44 37 30 O
N 23 31 38 45 52 59 60 53 J
P 46 39 47 54 61 62 55 63 K
The order of the coefficients within each tuple is reversed in the comments
below to reflect the usual MSB to LSB notation.*/
#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \
OC_ZZ_LOAD_ROW_LO(0,"%%mm0") /*mm0=03 02 01 00*/ \
OC_ZZ_LOAD_ROW_LO(1,"%%mm1") /*mm1=11 10 09 08*/ \
OC_ZZ_LOAD_ROW_LO(2,"%%mm2") /*mm2=19 18 17 16*/ \
OC_ZZ_LOAD_ROW_LO(3,"%%mm3") /*mm3=27 26 25 24*/ \
OC_ZZ_LOAD_ROW_HI(0,"%%mm4") /*mm4=07 06 05 04*/ \
OC_ZZ_LOAD_ROW_HI(1,"%%mm5") /*mm5=15 14 13 12*/ \
OC_ZZ_LOAD_ROW_HI(2,"%%mm6") /*mm6=23 22 21 20*/ \
"movq %%mm0,%%mm7\n\t" /*mm7=03 02 01 00*/ \
"punpckhdq %%mm1,%%mm0\n\t" /*mm0=11 10 03 02*/ \
"pshufw $0x39,%%mm4,%%mm4\n\t" /*mm4=04 07 06 05*/ \
"punpcklwd %%mm0,%%mm1\n\t" /*mm1=03 09 02 08*/ \
"pshufw $0x39,%%mm5,%%mm5\n\t" /*mm5=12 15 14 13*/ \
"punpcklwd %%mm1,%%mm7\n\t" /*mm7=02 01 08 00 *A*/ \
"movq %%mm7,0x00(%[y])\n\t" \
"punpckhwd %%mm4,%%mm1\n\t" /*mm1=04 03 07 09*/ \
"movq %%mm2,%%mm7\n\t" /*mm7=19 18 17 16*/ \
"punpckhdq %%mm1,%%mm0\n\t" /*mm0=04 03 11 10*/ \
"punpckhwd %%mm5,%%mm7\n\t" /*mm7=12 19 15 18*/ \
"punpcklwd %%mm3,%%mm1\n\t" /*mm1=25 07 24 09*/ \
"punpcklwd %%mm6,%%mm5\n\t" /*mm5=21 14 20 13*/ \
"punpcklwd %%mm2,%%mm1\n\t" /*mm1=17 24 16 09 *B*/ \
OC_ZZ_LOAD_ROW_LO(4,"%%mm2") /*mm2=35 34 33 32*/ \
"movq %%mm1,0x08(%[y])\n\t" \
OC_ZZ_LOAD_ROW_LO(5,"%%mm1") /*mm1=43 42 41 40*/ \
"pshufw $0x78,%%mm0,%%mm0\n\t" /*mm0=11 04 03 10 *C*/ \
"movq %%mm0,0x10(%[y])\n\t" \
"punpckhdq %%mm4,%%mm6\n\t" /*mm6=?? 07 23 22*/ \
"punpckldq %%mm5,%%mm4\n\t" /*mm4=20 13 06 05 *D*/ \
"movq %%mm4,0x28(%[y])\n\t" \
"psrlq $16,%%mm3\n\t" /*mm3=.. 27 26 25*/ \
"pshufw $0x0E,%%mm2,%%mm0\n\t" /*mm0=?? ?? 35 34*/ \
"movq %%mm7,%%mm4\n\t" /*mm4=12 19 15 18*/ \
"punpcklwd %%mm3,%%mm2\n\t" /*mm2=26 33 25 32*/ \
"punpcklwd %%mm1,%%mm4\n\t" /*mm4=41 15 40 18*/ \
"punpckhwd %%mm1,%%mm3\n\t" /*mm3=43 .. 42 27*/ \
"punpckldq %%mm2,%%mm4\n\t" /*mm4=25 32 40 18*/ \
"punpcklwd %%mm0,%%mm3\n\t" /*mm3=35 42 34 27*/ \
OC_ZZ_LOAD_ROW_LO(6,"%%mm0") /*mm0=51 50 49 48*/ \
"pshufw $0x6C,%%mm4,%%mm4\n\t" /*mm4=40 32 25 18 *E*/ \
"movq %%mm4,0x18(%[y])\n\t" \
OC_ZZ_LOAD_ROW_LO(7,"%%mm4") /*mm4=59 58 57 56*/ \
"punpckhdq %%mm7,%%mm2\n\t" /*mm2=12 19 26 33 *F*/ \
"movq %%mm2,0x20(%[y])\n\t" \
"pshufw $0xD0,%%mm1,%%mm1\n\t" /*mm1=43 41 ?? ??*/ \
"pshufw $0x87,%%mm0,%%mm0\n\t" /*mm0=50 48 49 51*/ \
"movq %%mm3,%%mm2\n\t" /*mm2=35 42 34 27*/ \
"punpckhwd %%mm0,%%mm1\n\t" /*mm1=50 43 48 41*/ \
"pshufw $0x93,%%mm4,%%mm4\n\t" /*mm4=58 57 56 59*/ \
"punpckldq %%mm1,%%mm3\n\t" /*mm3=48 41 34 27 *G*/ \
"movq %%mm3,0x30(%[y])\n\t" \
"punpckhdq %%mm4,%%mm1\n\t" /*mm1=58 57 50 43 *H*/ \
"movq %%mm1,0x50(%[y])\n\t" \
OC_ZZ_LOAD_ROW_HI(7,"%%mm1") /*mm1=63 62 61 60*/ \
"punpcklwd %%mm0,%%mm4\n\t" /*mm4=49 56 51 59*/ \
OC_ZZ_LOAD_ROW_HI(6,"%%mm0") /*mm0=55 54 53 52*/ \
"psllq $16,%%mm6\n\t" /*mm6=07 23 22 ..*/ \
"movq %%mm4,%%mm3\n\t" /*mm3=49 56 51 59*/ \
"punpckhdq %%mm2,%%mm4\n\t" /*mm4=35 42 49 56 *I*/ \
OC_ZZ_LOAD_ROW_HI(3,"%%mm2") /*mm2=31 30 29 28*/ \
"movq %%mm4,0x38(%[y])\n\t" \
"punpcklwd %%mm1,%%mm3\n\t" /*mm3=61 51 60 59*/ \
"punpcklwd %%mm6,%%mm7\n\t" /*mm7=22 15 .. ??*/ \
"movq %%mm3,%%mm4\n\t" /*mm4=61 51 60 59*/ \
"punpcklwd %%mm0,%%mm3\n\t" /*mm3=53 60 52 59*/ \
"punpckhwd %%mm0,%%mm4\n\t" /*mm4=55 61 54 51*/ \
OC_ZZ_LOAD_ROW_HI(4,"%%mm0") /*mm0=39 38 37 36*/ \
"pshufw $0xE1,%%mm3,%%mm3\n\t" /*mm3=53 60 59 52 *J*/ \
"movq %%mm3,0x68(%[y])\n\t" \
"movq %%mm4,%%mm3\n\t" /*mm3=?? ?? 54 51*/ \
"pshufw $0x39,%%mm2,%%mm2\n\t" /*mm2=28 31 30 29*/ \
"punpckhwd %%mm1,%%mm4\n\t" /*mm4=63 55 62 61 *K*/ \
OC_ZZ_LOAD_ROW_HI(5,"%%mm1") /*mm1=47 46 45 44*/ \
"movq %%mm4,0x78(%[y])\n\t" \
"punpckhwd %%mm2,%%mm6\n\t" /*mm6=28 07 31 23*/ \
"punpcklwd %%mm0,%%mm2\n\t" /*mm2=37 30 36 29*/ \
"punpckhdq %%mm6,%%mm5\n\t" /*mm5=28 07 21 14*/ \
"pshufw $0x4B,%%mm2,%%mm2\n\t" /*mm2=36 29 30 37*/ \
"pshufw $0x87,%%mm5,%%mm5\n\t" /*mm5=07 14 21 28 *L*/ \
"movq %%mm5,0x40(%[y])\n\t" \
"punpckhdq %%mm2,%%mm7\n\t" /*mm7=36 29 22 15 *M*/ \
"movq %%mm7,0x48(%[y])\n\t" \
"pshufw $0x9C,%%mm1,%%mm1\n\t" /*mm1=46 45 47 44*/ \
"punpckhwd %%mm1,%%mm0\n\t" /*mm0=46 39 45 38*/ \
"punpcklwd %%mm1,%%mm3\n\t" /*mm3=47 54 44 51*/ \
"punpckldq %%mm0,%%mm6\n\t" /*mm6=45 38 31 23 *N*/ \
"movq %%mm6,0x60(%[y])\n\t" \
"punpckhdq %%mm3,%%mm0\n\t" /*mm0=47 54 46 39*/ \
"punpckldq %%mm2,%%mm3\n\t" /*mm3=30 37 44 51 *O*/ \
"movq %%mm3,0x58(%[y])\n\t" \
"pshufw $0xB1,%%mm0,%%mm0\n\t" /*mm0=54 47 39 46 *P*/ \
"movq %%mm0,0x70(%[y])\n\t" \
/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan
order and stores them in %[qdct].
The index of each output element in the original 64-element array should wind
up in the following 8x8 matrix (the letters indicate the order we compute
each 4-tuple below):
A 0 1 8 16 9 2 3 10 B
C 17 24 32 25 18 11 4 5 D
E 12 19 26 33 40 48 41 34 I
H 27 20 13 6 7 14 21 28 G
K 35 42 49 56 57 50 43 36 J
F 29 22 15 23 30 37 44 51 M
P 58 59 52 45 38 31 39 46 L
N 53 60 61 54 47 55 62 63 O
The order of the coefficients within each tuple is reversed in the comments
below to reflect the usual MSB to LSB notation.*/
#define OC_ZIG_ZAG_MMXEXT \
"movq 0x00(%[dct]),%%mm0\n\t" /*mm0=03 02 01 00*/ \
"movq 0x08(%[dct]),%%mm1\n\t" /*mm1=07 06 05 04*/ \
"movq 0x10(%[dct]),%%mm2\n\t" /*mm2=11 10 09 08*/ \
"movq 0x20(%[dct]),%%mm3\n\t" /*mm3=19 18 17 16*/ \
"movq 0x30(%[dct]),%%mm4\n\t" /*mm4=27 26 25 24*/ \
"movq 0x40(%[dct]),%%mm5\n\t" /*mm5=35 34 33 32*/ \
"movq %%mm2,%%mm7\n\t" /*mm7=11 10 09 08*/ \
"punpcklwd %%mm3,%%mm2\n\t" /*mm2=17 09 16 08*/ \
"movq %%mm0,%%mm6\n\t" /*mm6=03 02 01 00*/ \
"punpckldq %%mm2,%%mm0\n\t" /*mm0=16 08 01 00 *A*/ \
"movq %%mm0,0x00(%[qdct])\n\t" \
"movq 0x18(%[dct]),%%mm0\n\t" /*mm0=15 14 13 12*/ \
"punpckhdq %%mm6,%%mm6\n\t" /*mm6=03 02 03 02*/ \
"psrlq $16,%%mm7\n\t" /*mm7=.. 11 10 09*/ \
"punpckldq %%mm7,%%mm6\n\t" /*mm6=10 09 03 02*/ \
"punpckhwd %%mm7,%%mm3\n\t" /*mm3=.. 19 11 18*/ \
"pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
"movq %%mm6,0x08(%[qdct])\n\t" \
"psrlq $48,%%mm2\n\t" /*mm2=.. .. .. 17*/ \
"movq %%mm1,%%mm6\n\t" /*mm6=07 06 05 04*/ \
"punpcklwd %%mm5,%%mm2\n\t" /*mm2=33 .. 32 17*/ \
"movq %%mm3,%%mm7\n\t" /*mm7=.. 19 11 18*/ \
"punpckldq %%mm1,%%mm3\n\t" /*mm3=05 04 11 18 *C*/ \
"por %%mm2,%%mm7\n\t" /*mm7=33 19 ?? ??*/ \
"punpcklwd %%mm4,%%mm2\n\t" /*mm2=25 32 24 17 *D**/ \
"movq %%mm2,0x10(%[qdct])\n\t" \
"movq %%mm3,0x18(%[qdct])\n\t" \
"movq 0x28(%[dct]),%%mm2\n\t" /*mm2=23 22 21 20*/ \
"movq 0x38(%[dct]),%%mm1\n\t" /*mm1=31 30 29 28*/ \
"pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
"punpckhdq %%mm7,%%mm7\n\t" /*mm7=33 19 33 19*/ \
"punpckhwd %%mm3,%%mm6\n\t" /*mm6=14 07 13 06*/ \
"punpckldq %%mm0,%%mm0\n\t" /*mm0=13 12 13 12*/ \
"punpcklwd %%mm1,%%mm3\n\t" /*mm3=29 15 28 12*/ \
"punpckhwd %%mm4,%%mm0\n\t" /*mm0=27 13 26 12*/ \
"pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
"psrlq $48,%%mm4\n\t" /*mm4=.. .. .. 27*/ \
"punpcklwd %%mm7,%%mm0\n\t" /*mm0=33 26 19 12 *E*/ \
"punpcklwd %%mm1,%%mm4\n\t" /*mm4=29 .. 28 27*/ \
"punpckhwd %%mm2,%%mm3\n\t" /*mm3=23 15 22 29 *F*/ \
"movq %%mm0,0x20(%[qdct])\n\t" \
"movq %%mm3,0x50(%[qdct])\n\t" \
"movq 0x60(%[dct]),%%mm3\n\t" /*mm3=51 50 49 48*/ \
"movq 0x70(%[dct]),%%mm7\n\t" /*mm7=59 58 57 56*/ \
"movq 0x50(%[dct]),%%mm0\n\t" /*mm0=43 42 41 40*/ \
"punpcklwd %%mm4,%%mm2\n\t" /*mm2=28 21 27 20*/ \
"psrlq $32,%%mm5\n\t" /*mm5=.. .. 35 34*/ \
"movq %%mm2,%%mm4\n\t" /*mm4=28 21 27 20*/ \
"punpckldq %%mm6,%%mm2\n\t" /*mm2=13 06 27 20*/ \
"punpckhdq %%mm4,%%mm6\n\t" /*mm6=28 21 14 07 *G*/ \
"movq %%mm3,%%mm4\n\t" /*mm4=51 50 49 48*/ \
"pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
"movq %%mm2,0x30(%[qdct])\n\t" \
"movq %%mm6,0x38(%[qdct])\n\t" \
"movq 0x48(%[dct]),%%mm2\n\t" /*mm2=39 38 37 36*/ \
"punpcklwd %%mm5,%%mm4\n\t" /*mm4=35 49 34 48*/ \
"movq 0x58(%[dct]),%%mm5\n\t" /*mm5=47 46 45 44*/ \
"punpckldq %%mm7,%%mm6\n\t" /*mm6=57 56 14 07*/ \
"psrlq $32,%%mm3\n\t" /*mm3=.. .. 51 50*/ \
"punpckhwd %%mm0,%%mm6\n\t" /*mm6=43 57 42 56*/ \
"punpcklwd %%mm4,%%mm0\n\t" /*mm0=34 41 48 40 *I*/ \
"pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
"movq %%mm0,0x28(%[qdct])\n\t" \
"punpcklwd %%mm2,%%mm3\n\t" /*mm3=37 51 36 50*/ \
"punpckhwd %%mm6,%%mm4\n\t" /*mm4=42 35 56 49*/ \
"punpcklwd %%mm3,%%mm6\n\t" /*mm6=36 43 50 57 *J*/ \
"pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
"movq %%mm4,0x40(%[qdct])\n\t" \
"movq %%mm6,0x48(%[qdct])\n\t" \
"movq 0x68(%[dct]),%%mm6\n\t" /*mm6=55 54 53 52*/ \
"movq 0x78(%[dct]),%%mm0\n\t" /*mm0=63 62 61 60*/ \
"psrlq $32,%%mm1\n\t" /*mm1=.. .. 31 30*/ \
"pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
"pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
"punpcklwd %%mm5,%%mm1\n\t" /*mm1=46 31 44 30*/ \
"pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
"punpckhwd %%mm1,%%mm2\n\t" /*mm2=46 39 31 38 *L*/ \
"punpcklwd %%mm3,%%mm1\n\t" /*mm1=51 44 37 30 *M*/ \
"movq %%mm2,0x68(%[qdct])\n\t" \
"movq %%mm1,0x58(%[qdct])\n\t" \
"punpckhwd %%mm6,%%mm5\n\t" /*mm5=55 47 52 45*/ \
"punpckldq %%mm0,%%mm6\n\t" /*mm6=61 60 54 53*/ \
"pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
"pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=53 60 61 54 *N*/ \
"punpckhdq %%mm0,%%mm5\n\t" /*mm5=63 62 55 47 *O*/ \
"punpckhdq %%mm4,%%mm7\n\t" /*mm7=45 52 59 58 *P*/ \
"movq %%mm6,0x70(%[qdct])\n\t" \
"movq %%mm5,0x78(%[qdct])\n\t" \
"movq %%mm7,0x60(%[qdct])\n\t" \
#endif