initial commit, 4.5 stable
thirdparty/libtheora/x86/mmxencfrag.c (new file, vendored, 903 lines)
@@ -0,0 +1,903 @@
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation https://www.xiph.org/                 *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/
#include <stddef.h>
#include "x86enc.h"

#if defined(OC_X86_ASM)

unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  ptrdiff_t ystride3;
  ptrdiff_t ret;
  __asm__ __volatile__(
    /*Load the first 4 rows of each block.*/
    "movq (%[src]),%%mm0\n\t"
    "movq (%[ref]),%%mm1\n\t"
    "movq (%[src],%[ystride]),%%mm2\n\t"
    "movq (%[ref],%[ystride]),%%mm3\n\t"
    "lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
    "movq (%[src],%[ystride],2),%%mm4\n\t"
    "movq (%[ref],%[ystride],2),%%mm5\n\t"
    "movq (%[src],%[ystride3]),%%mm6\n\t"
    "movq (%[ref],%[ystride3]),%%mm7\n\t"
    /*Compute their SADs and add them in %%mm0*/
    "psadbw %%mm1,%%mm0\n\t"
    "psadbw %%mm3,%%mm2\n\t"
    "lea (%[src],%[ystride],4),%[src]\n\t"
    "paddw %%mm2,%%mm0\n\t"
    "lea (%[ref],%[ystride],4),%[ref]\n\t"
    /*Load the next 3 rows as registers become available.*/
    "movq (%[src]),%%mm2\n\t"
    "movq (%[ref]),%%mm3\n\t"
    "psadbw %%mm5,%%mm4\n\t"
    "psadbw %%mm7,%%mm6\n\t"
    "paddw %%mm4,%%mm0\n\t"
    "movq (%[ref],%[ystride]),%%mm5\n\t"
    "movq (%[src],%[ystride]),%%mm4\n\t"
    "paddw %%mm6,%%mm0\n\t"
    "movq (%[ref],%[ystride],2),%%mm7\n\t"
    "movq (%[src],%[ystride],2),%%mm6\n\t"
    /*Start adding their SADs to %%mm0*/
    "psadbw %%mm3,%%mm2\n\t"
    "psadbw %%mm5,%%mm4\n\t"
    "paddw %%mm2,%%mm0\n\t"
    "psadbw %%mm7,%%mm6\n\t"
    /*Load last row as registers become available.*/
    "movq (%[src],%[ystride3]),%%mm2\n\t"
    "movq (%[ref],%[ystride3]),%%mm3\n\t"
    /*And finish adding up their SADs.*/
    "paddw %%mm4,%%mm0\n\t"
    "psadbw %%mm3,%%mm2\n\t"
    "paddw %%mm6,%%mm0\n\t"
    "paddw %%mm2,%%mm0\n\t"
    "movd %%mm0,%[ret]\n\t"
    :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3)
    :[ystride]"r"((ptrdiff_t)_ystride)
  );
  return (unsigned)ret;
}

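For reference, the routine above computes a plain 8x8 sum of absolute differences; psadbw handles eight bytes per row. A minimal scalar model (illustrative only, not part of the original file):

/*Illustrative plain-C model of the 8x8 SAD computed above.*/
static unsigned oc_frag_sad_c(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned sad;
  int      i;
  int      j;
  sad=0;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int d;
      d=_src[j]-_ref[j];
      sad+=d<0?-d:d;
    }
    _src+=_ystride;
    _ref+=_ystride;
  }
  return sad;
}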
unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _thresh){
  /*Early termination is for suckers.*/
  return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
}

/*Assumes the first two rows of %[ref1] and %[ref2] are in %%mm0...%%mm3, the
   first two rows of %[src] are in %%mm4,%%mm5, and {1}x8 is in %%mm7.
  We pre-load the next two rows of data as registers become available.*/
#define OC_SAD2_LOOP \
 "#OC_SAD2_LOOP\n\t" \
 /*We want to compute (%%mm0+%%mm1>>1) on unsigned bytes without overflow, but \
    pavgb computes (%%mm0+%%mm1+1>>1). \
   The latter is exactly 1 too large when the low bit of two corresponding \
    bytes is only set in one of them. \
   Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
    correct the output of pavgb. \
   TODO: This should be rewritten to compute ~pavgb(~a,~b) instead, which \
    schedules better; currently, however, this function is unused.*/ \
 "movq %%mm0,%%mm6\n\t" \
 "lea (%[ref1],%[ystride],2),%[ref1]\n\t" \
 "pxor %%mm1,%%mm0\n\t" \
 "pavgb %%mm1,%%mm6\n\t" \
 "lea (%[ref2],%[ystride],2),%[ref2]\n\t" \
 "movq %%mm2,%%mm1\n\t" \
 "pand %%mm7,%%mm0\n\t" \
 "pavgb %%mm3,%%mm2\n\t" \
 "pxor %%mm3,%%mm1\n\t" \
 "movq (%[ref2],%[ystride]),%%mm3\n\t" \
 "psubb %%mm0,%%mm6\n\t" \
 "movq (%[ref1]),%%mm0\n\t" \
 "pand %%mm7,%%mm1\n\t" \
 "psadbw %%mm6,%%mm4\n\t" \
 "movd %[ret],%%mm6\n\t" \
 "psubb %%mm1,%%mm2\n\t" \
 "movq (%[ref2]),%%mm1\n\t" \
 "lea (%[src],%[ystride],2),%[src]\n\t" \
 "psadbw %%mm2,%%mm5\n\t" \
 "movq (%[ref1],%[ystride]),%%mm2\n\t" \
 "paddw %%mm4,%%mm5\n\t" \
 "movq (%[src]),%%mm4\n\t" \
 "paddw %%mm5,%%mm6\n\t" \
 "movq (%[src],%[ystride]),%%mm5\n\t" \
 "movd %%mm6,%[ret]\n\t" \

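The correction the comment above describes can be written in scalar form: for unsigned bytes, the floor average (a+b)>>1 equals pavgb's rounded (a+b+1)>>1 minus the low bit (a^b)&1. A one-lane sketch (illustrative, not part of the original file):

/*Illustrative: models one byte lane of the pxor/pand/psubb fixup.*/
static unsigned char oc_avg_floor(unsigned char _a,unsigned char _b){
  unsigned char avg_round;
  avg_round=(unsigned char)((_a+_b+1)>>1);       /*What pavgb computes.*/
  return (unsigned char)(avg_round-((_a^_b)&1)); /*Round-to-floor average.*/
}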
/*Same as above, but does not pre-load the next two rows.*/
#define OC_SAD2_TAIL \
 "#OC_SAD2_TAIL\n\t" \
 "movq %%mm0,%%mm6\n\t" \
 "pavgb %%mm1,%%mm0\n\t" \
 "pxor %%mm1,%%mm6\n\t" \
 "movq %%mm2,%%mm1\n\t" \
 "pand %%mm7,%%mm6\n\t" \
 "pavgb %%mm3,%%mm2\n\t" \
 "pxor %%mm3,%%mm1\n\t" \
 "psubb %%mm6,%%mm0\n\t" \
 "pand %%mm7,%%mm1\n\t" \
 "psadbw %%mm0,%%mm4\n\t" \
 "psubb %%mm1,%%mm2\n\t" \
 "movd %[ret],%%mm6\n\t" \
 "psadbw %%mm2,%%mm5\n\t" \
 "paddw %%mm4,%%mm5\n\t" \
 "paddw %%mm5,%%mm6\n\t" \
 "movd %%mm6,%[ret]\n\t" \

unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
 unsigned _thresh){
  ptrdiff_t ret;
  __asm__ __volatile__(
    "movq (%[ref1]),%%mm0\n\t"
    "movq (%[ref2]),%%mm1\n\t"
    "movq (%[ref1],%[ystride]),%%mm2\n\t"
    "movq (%[ref2],%[ystride]),%%mm3\n\t"
    "xor %[ret],%[ret]\n\t"
    "movq (%[src]),%%mm4\n\t"
    "pxor %%mm7,%%mm7\n\t"
    "pcmpeqb %%mm6,%%mm6\n\t"
    "movq (%[src],%[ystride]),%%mm5\n\t"
    "psubb %%mm6,%%mm7\n\t"
    OC_SAD2_LOOP
    OC_SAD2_LOOP
    OC_SAD2_LOOP
    OC_SAD2_TAIL
    :[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+r"(_ref1),[ref2]"+r"(_ref2)
    :[ystride]"r"((ptrdiff_t)_ystride)
  );
  return (unsigned)ret;
}

/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their
   16-bit difference in %%mm0...%%mm7.*/
#define OC_LOAD_SUB_8x4(_off) \
 "#OC_LOAD_SUB_8x4\n\t" \
 "movd "#_off"(%[src]),%%mm0\n\t" \
 "movd "#_off"(%[ref]),%%mm4\n\t" \
 "movd "#_off"(%[src],%[src_ystride]),%%mm1\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "movd "#_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "movd "#_off"(%[src]),%%mm2\n\t" \
 "movd "#_off"(%[ref]),%%mm7\n\t" \
 "movd "#_off"(%[src],%[src_ystride]),%%mm3\n\t" \
 "movd "#_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \
 "punpcklbw %%mm4,%%mm0\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "punpcklbw %%mm4,%%mm4\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "psubw %%mm4,%%mm0\n\t" \
 "movd "#_off"(%[src]),%%mm4\n\t" \
 "movq %%mm0,"OC_MEM_OFFS(_off*2,buf)"\n\t" \
 "movd "#_off"(%[ref]),%%mm0\n\t" \
 "punpcklbw %%mm5,%%mm1\n\t" \
 "punpcklbw %%mm5,%%mm5\n\t" \
 "psubw %%mm5,%%mm1\n\t" \
 "movd "#_off"(%[src],%[src_ystride]),%%mm5\n\t" \
 "punpcklbw %%mm7,%%mm2\n\t" \
 "punpcklbw %%mm7,%%mm7\n\t" \
 "psubw %%mm7,%%mm2\n\t" \
 "movd "#_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \
 "punpcklbw %%mm6,%%mm3\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "punpcklbw %%mm6,%%mm6\n\t" \
 "psubw %%mm6,%%mm3\n\t" \
 "movd "#_off"(%[src]),%%mm6\n\t" \
 "punpcklbw %%mm0,%%mm4\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "punpcklbw %%mm0,%%mm0\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "psubw %%mm0,%%mm4\n\t" \
 "movd "#_off"(%[ref]),%%mm0\n\t" \
 "punpcklbw %%mm7,%%mm5\n\t" \
 "neg %[src_ystride]\n\t" \
 "punpcklbw %%mm7,%%mm7\n\t" \
 "psubw %%mm7,%%mm5\n\t" \
 "movd "#_off"(%[src],%[src_ystride]),%%mm7\n\t" \
 "punpcklbw %%mm0,%%mm6\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "punpcklbw %%mm0,%%mm0\n\t" \
 "neg %[ref_ystride]\n\t" \
 "psubw %%mm0,%%mm6\n\t" \
 "movd "#_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \
 "lea (%[src],%[src_ystride],8),%[src]\n\t" \
 "punpcklbw %%mm0,%%mm7\n\t" \
 "neg %[src_ystride]\n\t" \
 "punpcklbw %%mm0,%%mm0\n\t" \
 "lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \
 "psubw %%mm0,%%mm7\n\t" \
 "neg %[ref_ystride]\n\t" \
 "movq "OC_MEM_OFFS(_off*2,buf)",%%mm0\n\t" \

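The load/subtract idiom above avoids a zero register: interleaving src under ref gives (ref<<8)+src in each word, interleaving ref with itself gives (ref<<8)+ref, and a 16-bit subtract leaves exactly the signed difference src-ref, borrow included. A one-lane sketch (illustrative names, not part of the original file):

/*Illustrative: how punpcklbw+psubw yield a signed 16-bit difference.*/
static short oc_diff_lane(unsigned char _s,unsigned char _r){
  unsigned short interleaved_sr;
  unsigned short interleaved_rr;
  interleaved_sr=(unsigned short)(_r<<8|_s);      /*punpcklbw ref,src*/
  interleaved_rr=(unsigned short)(_r<<8|_r);      /*punpcklbw ref,ref*/
  return (short)(interleaved_sr-interleaved_rr);  /*== _s-_r exactly.*/
}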
/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
#define OC_LOAD_8x4(_off) \
 "#OC_LOAD_8x4\n\t" \
 "movd "#_off"(%[src]),%%mm0\n\t" \
 "movd "#_off"(%[src],%[ystride]),%%mm1\n\t" \
 "movd "#_off"(%[src],%[ystride],2),%%mm2\n\t" \
 "pxor %%mm7,%%mm7\n\t" \
 "movd "#_off"(%[src],%[ystride3]),%%mm3\n\t" \
 "punpcklbw %%mm7,%%mm0\n\t" \
 "movd "#_off"(%[src4]),%%mm4\n\t" \
 "punpcklbw %%mm7,%%mm1\n\t" \
 "movd "#_off"(%[src4],%[ystride]),%%mm5\n\t" \
 "punpcklbw %%mm7,%%mm2\n\t" \
 "movd "#_off"(%[src4],%[ystride],2),%%mm6\n\t" \
 "punpcklbw %%mm7,%%mm3\n\t" \
 "movd "#_off"(%[src4],%[ystride3]),%%mm7\n\t" \
 "punpcklbw %%mm4,%%mm4\n\t" \
 "punpcklbw %%mm5,%%mm5\n\t" \
 "psrlw $8,%%mm4\n\t" \
 "psrlw $8,%%mm5\n\t" \
 "punpcklbw %%mm6,%%mm6\n\t" \
 "punpcklbw %%mm7,%%mm7\n\t" \
 "psrlw $8,%%mm6\n\t" \
 "psrlw $8,%%mm7\n\t" \

/*Performs the first two stages of an 8-point 1-D Hadamard transform.
  The transform is performed in place, except that outputs 0-3 are swapped with
   outputs 4-7.
  Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
   perform this stage in place with no temporary registers).*/
#define OC_HADAMARD_AB_8x4 \
 "#OC_HADAMARD_AB_8x4\n\t" \
 /*Stage A: \
    Outputs 0-3 are swapped with 4-7 here.*/ \
 "paddw %%mm1,%%mm5\n\t" \
 "paddw %%mm2,%%mm6\n\t" \
 "paddw %%mm1,%%mm1\n\t" \
 "paddw %%mm2,%%mm2\n\t" \
 "psubw %%mm5,%%mm1\n\t" \
 "psubw %%mm6,%%mm2\n\t" \
 "paddw %%mm3,%%mm7\n\t" \
 "paddw %%mm0,%%mm4\n\t" \
 "paddw %%mm3,%%mm3\n\t" \
 "paddw %%mm0,%%mm0\n\t" \
 "psubw %%mm7,%%mm3\n\t" \
 "psubw %%mm4,%%mm0\n\t" \
 /*Stage B:*/ \
 "paddw %%mm2,%%mm0\n\t" \
 "paddw %%mm3,%%mm1\n\t" \
 "paddw %%mm6,%%mm4\n\t" \
 "paddw %%mm7,%%mm5\n\t" \
 "paddw %%mm2,%%mm2\n\t" \
 "paddw %%mm3,%%mm3\n\t" \
 "paddw %%mm6,%%mm6\n\t" \
 "paddw %%mm7,%%mm7\n\t" \
 "psubw %%mm0,%%mm2\n\t" \
 "psubw %%mm1,%%mm3\n\t" \
 "psubw %%mm4,%%mm6\n\t" \
 "psubw %%mm5,%%mm7\n\t" \

/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
   place with no temporary registers).*/
#define OC_HADAMARD_C_8x4 \
 "#OC_HADAMARD_C_8x4\n\t" \
 /*Stage C:*/ \
 "paddw %%mm1,%%mm0\n\t" \
 "paddw %%mm3,%%mm2\n\t" \
 "paddw %%mm5,%%mm4\n\t" \
 "paddw %%mm7,%%mm6\n\t" \
 "paddw %%mm1,%%mm1\n\t" \
 "paddw %%mm3,%%mm3\n\t" \
 "paddw %%mm5,%%mm5\n\t" \
 "paddw %%mm7,%%mm7\n\t" \
 "psubw %%mm0,%%mm1\n\t" \
 "psubw %%mm2,%%mm3\n\t" \
 "psubw %%mm4,%%mm5\n\t" \
 "psubw %%mm6,%%mm7\n\t" \

/*Performs an 8-point 1-D Hadamard transform.
  The transform is performed in place, except that outputs 0-3 are swapped with
   outputs 4-7.
  Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
   in place with no temporary registers).*/
#define OC_HADAMARD_8x4 \
 OC_HADAMARD_AB_8x4 \
 OC_HADAMARD_C_8x4 \

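Every butterfly in these macros uses the same two-register idiom: add, double, subtract, leaving the sum in one register and the (negated) difference in the other with no temporary. A scalar sketch of the idiom (illustrative, not part of the original file):

/*Illustrative in-place butterfly: afterwards _a holds a+b and _b holds b-a,
   the difference negated relative to the usual a-b.*/
static void oc_butterfly(short *_a,short *_b){
  *_a=(short)(*_a+*_b); /*paddw b,a -> a+b*/
  *_b=(short)(*_b+*_b); /*paddw b,b -> 2b*/
  *_b=(short)(*_b-*_a); /*psubw a,b -> 2b-(a+b)=b-a*/
}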
/*Performs the first part of the final stage of the Hadamard transform and
   summing of absolute values.
  At the end of this part, %%mm1 will contain the DC coefficient of the
   transform.*/
#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
 /*We use the fact that \
    (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
   to merge the final butterfly with the abs and the first stage of \
    accumulation. \
   Thus we can avoid using pabsw, which is not available until SSSE3. \
   Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
    implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
    registers). \
   Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
   This implementation is only 26 (+4 for spilling registers).*/ \
 "#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \
 "movq %%mm7,"OC_MEM_OFFS(_r7,buf)"\n\t" \
 "movq %%mm6,"OC_MEM_OFFS(_r6,buf)"\n\t" \
 /*mm7={0x7FFF}x4 \
   mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
 "pcmpeqb %%mm7,%%mm7\n\t" \
 "movq %%mm0,%%mm6\n\t" \
 "psrlw $1,%%mm7\n\t" \
 "paddw %%mm1,%%mm6\n\t" \
 "pmaxsw %%mm1,%%mm0\n\t" \
 "paddsw %%mm7,%%mm6\n\t" \
 "psubw %%mm6,%%mm0\n\t" \
 /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
   mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
 "movq %%mm2,%%mm6\n\t" \
 "movq %%mm4,%%mm1\n\t" \
 "pmaxsw %%mm3,%%mm2\n\t" \
 "pmaxsw %%mm5,%%mm4\n\t" \
 "paddw %%mm3,%%mm6\n\t" \
 "paddw %%mm5,%%mm1\n\t" \
 "movq "OC_MEM_OFFS(_r7,buf)",%%mm3\n\t" \

/*Performs the second part of the final stage of the Hadamard transform and
   summing of absolute values.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
 "#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \
 "paddsw %%mm7,%%mm6\n\t" \
 "movq "OC_MEM_OFFS(_r6,buf)",%%mm5\n\t" \
 "paddsw %%mm7,%%mm1\n\t" \
 "psubw %%mm6,%%mm2\n\t" \
 "psubw %%mm1,%%mm4\n\t" \
 /*mm7={1}x4 (needed for the horizontal add that follows) \
   mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
 "movq %%mm3,%%mm6\n\t" \
 "pmaxsw %%mm5,%%mm3\n\t" \
 "paddw %%mm2,%%mm0\n\t" \
 "paddw %%mm5,%%mm6\n\t" \
 "paddw %%mm4,%%mm0\n\t" \
 "paddsw %%mm7,%%mm6\n\t" \
 "paddw %%mm3,%%mm0\n\t" \
 "psrlw $14,%%mm7\n\t" \
 "psubw %%mm6,%%mm0\n\t" \

/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
   absolute value of each component, and accumulates everything into mm0.
  This is the only portion of SATD which requires MMXEXT (we could use plain
   MMX, but it takes 4 instructions and an extra register to work around the
   lack of a pmaxsw, which is a pretty serious penalty).*/
#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
 OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
 OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \

/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
   component, and accumulates everything into mm0.
  Note that mm0 will have an extra 4 added to each column, and that after
   removing this value, the remainder will be half the conventional value.*/
#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) \
 OC_HADAMARD_AB_8x4 \
 OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7)

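The identity the comment in OC_HADAMARD_C_ABS_ACCUM_A_8x4 relies on, (abs(a+b)+abs(a-b))/2==max(abs(a),abs(b)), is easy to check in scalar form (a sketch for inputs well inside the int range; the assembly additionally offsets by 0x7FFF to keep the running totals in range):

/*Illustrative check of (abs(a+b)+abs(a-b))/2==max(abs(a),abs(b)).*/
static int oc_max_abs(int _a,int _b){
  int sum;
  int diff;
  sum=_a+_b;
  diff=_a-_b;
  if(sum<0)sum=-sum;
  if(diff<0)diff=-diff;
  return (sum+diff)>>1; /*Equals the larger of abs(_a) and abs(_b).*/
}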
/*Performs two 4x4 transposes (mostly) in place.
  On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
   contains rows {a,b,c,d}.
  On output, {0x40,0x50,0x60,0x70}+_off(%[buf]) contains {e,f,g,h}^T, and
   {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
#define OC_TRANSPOSE_4x4x2(_off) \
 "#OC_TRANSPOSE_4x4x2\n\t" \
 /*First 4x4 transpose:*/ \
 "movq %%mm5,"OC_MEM_OFFS(0x10+(_off),buf)"\n\t" \
 /*mm0 = e3 e2 e1 e0 \
   mm1 = f3 f2 f1 f0 \
   mm2 = g3 g2 g1 g0 \
   mm3 = h3 h2 h1 h0*/ \
 "movq %%mm2,%%mm5\n\t" \
 "punpcklwd %%mm3,%%mm2\n\t" \
 "punpckhwd %%mm3,%%mm5\n\t" \
 "movq %%mm0,%%mm3\n\t" \
 "punpcklwd %%mm1,%%mm0\n\t" \
 "punpckhwd %%mm1,%%mm3\n\t" \
 /*mm0 = f1 e1 f0 e0 \
   mm3 = f3 e3 f2 e2 \
   mm2 = h1 g1 h0 g0 \
   mm5 = h3 g3 h2 g2*/ \
 "movq %%mm0,%%mm1\n\t" \
 "punpckldq %%mm2,%%mm0\n\t" \
 "punpckhdq %%mm2,%%mm1\n\t" \
 "movq %%mm3,%%mm2\n\t" \
 "punpckhdq %%mm5,%%mm3\n\t" \
 "movq %%mm0,"OC_MEM_OFFS(0x40+(_off),buf)"\n\t" \
 "punpckldq %%mm5,%%mm2\n\t" \
 /*mm0 = h0 g0 f0 e0 \
   mm1 = h1 g1 f1 e1 \
   mm2 = h2 g2 f2 e2 \
   mm3 = h3 g3 f3 e3*/ \
 "movq "OC_MEM_OFFS(0x10+(_off),buf)",%%mm5\n\t" \
 /*Second 4x4 transpose:*/ \
 /*mm4 = a3 a2 a1 a0 \
   mm5 = b3 b2 b1 b0 \
   mm6 = c3 c2 c1 c0 \
   mm7 = d3 d2 d1 d0*/ \
 "movq %%mm6,%%mm0\n\t" \
 "punpcklwd %%mm7,%%mm6\n\t" \
 "movq %%mm1,"OC_MEM_OFFS(0x50+(_off),buf)"\n\t" \
 "punpckhwd %%mm7,%%mm0\n\t" \
 "movq %%mm4,%%mm7\n\t" \
 "punpcklwd %%mm5,%%mm4\n\t" \
 "movq %%mm2,"OC_MEM_OFFS(0x60+(_off),buf)"\n\t" \
 "punpckhwd %%mm5,%%mm7\n\t" \
 /*mm4 = b1 a1 b0 a0 \
   mm7 = b3 a3 b2 a2 \
   mm6 = d1 c1 d0 c0 \
   mm0 = d3 c3 d2 c2*/ \
 "movq %%mm4,%%mm5\n\t" \
 "punpckldq %%mm6,%%mm4\n\t" \
 "movq %%mm3,"OC_MEM_OFFS(0x70+(_off),buf)"\n\t" \
 "punpckhdq %%mm6,%%mm5\n\t" \
 "movq %%mm7,%%mm6\n\t" \
 "punpckhdq %%mm0,%%mm7\n\t" \
 "punpckldq %%mm0,%%mm6\n\t" \
 /*mm4 = d0 c0 b0 a0 \
   mm5 = d1 c1 b1 a1 \
   mm6 = d2 c2 b2 a2 \
   mm7 = d3 c3 b3 a3*/ \

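The interleave pattern above is the standard two-pass 4x4 transpose: punpck{l,h}wd merges word pairs from two rows, and punpck{l,h}dq then merges dword pairs. A plain-C model of one 4x4 transpose using the same two passes (illustrative, not part of the original file):

/*Illustrative: the 4x4 word transpose performed by the interleaves above.
   a, b, cc, and d model the four intermediate registers.*/
static void oc_transpose4x4(const short _r[4][4],short _c[4][4]){
  short a[4];
  short b[4];
  short cc[4];
  short d[4];
  int   j;
  /*punpcklwd/punpckhwd: interleave the words of two rows.*/
  for(j=0;j<2;j++){
    a[2*j]=_r[0][j];
    a[2*j+1]=_r[1][j];
    b[2*j]=_r[0][j+2];
    b[2*j+1]=_r[1][j+2];
    cc[2*j]=_r[2][j];
    cc[2*j+1]=_r[3][j];
    d[2*j]=_r[2][j+2];
    d[2*j+1]=_r[3][j+2];
  }
  /*punpckldq/punpckhdq: interleave the dwords (word pairs) of the results.*/
  _c[0][0]=a[0];_c[0][1]=a[1];_c[0][2]=cc[0];_c[0][3]=cc[1];
  _c[1][0]=a[2];_c[1][1]=a[3];_c[1][2]=cc[2];_c[1][3]=cc[3];
  _c[2][0]=b[0];_c[2][1]=b[1];_c[2][2]=d[0];_c[2][3]=d[1];
  _c[3][0]=b[2];_c[3][1]=b[3];_c[3][2]=d[2];_c[3][3]=d[3];
}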
static unsigned oc_int_frag_satd_mmxext(int *_dc,
 const unsigned char *_src,int _src_ystride,
 const unsigned char *_ref,int _ref_ystride){
  OC_ALIGN8(ogg_int16_t buf[64]);
  unsigned ret;
  unsigned ret2;
  int      dc;
  __asm__ __volatile__(
    OC_LOAD_SUB_8x4(0x00)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x00)
    /*Finish swapping out this 8x4 block to make room for the next one.
      mm0...mm3 have been swapped out already.*/
    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
    "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
    "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
    OC_LOAD_SUB_8x4(0x04)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x08)
    /*Here the first 4x4 block of output from the last transpose is the second
       4x4 block of input for the next transform.
      We have cleverly arranged that it already be in the appropriate place, so
       we only have to do half the loads.*/
    "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
    "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
    "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
    "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x4
    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
    "movd %%mm1,%[dc]\n\t"
    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
       latency of pmaddwd by starting the next series of loads now.*/
    "pmaddwd %%mm7,%%mm0\n\t"
    "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
    "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
    "movq %%mm0,%%mm4\n\t"
    "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
    "punpckhdq %%mm0,%%mm0\n\t"
    "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
    "paddd %%mm0,%%mm4\n\t"
    "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
    "movd %%mm4,%[ret2]\n\t"
    "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
    "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
    "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
    "pmaddwd %%mm7,%%mm0\n\t"
    /*Subtract abs(dc) from 2*ret2.*/
    "movsx %w[dc],%[dc]\n\t"
    "cdq\n\t"
    "lea (%[ret],%[ret2],2),%[ret2]\n\t"
    "movq %%mm0,%%mm4\n\t"
    "punpckhdq %%mm0,%%mm0\n\t"
    "xor %[dc],%[ret]\n\t"
    "paddd %%mm0,%%mm4\n\t"
    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
       added to them, a factor of two removed, and the DC value included;
       correct the final sum here.*/
    "sub %[ret],%[ret2]\n\t"
    "movd %%mm4,%[ret]\n\t"
    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
    /*Although it looks like we're using 8 registers here, gcc can alias %[ret]
       and %[ret2] with some of the inputs, since for once we don't write to
       them until after we're done using everything but %[buf].*/
    /*Note that _src_ystride and _ref_ystride must be given non-overlapping
       constraints, otherwise if gcc can prove they're equal it will allocate
       them to the same register (which is bad); _src and _ref face a similar
       problem, though those are never actually the same.*/
    :[ret]"=d"(ret),[ret2]"=r"(ret2),[dc]"=a"(dc),
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
    :[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
     [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
    /*We have to use neg, so we actually clobber the condition codes for once
       (not to mention cmp, sub, and add).*/
    :"cc"
  );
  *_dc=dc;
  return ret;
}

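The cdq/xor/lea/sub sequence near the end of the function above is the classic branchless absolute value: with m=dc>>31, abs(dc)==(dc^m)-m, folded into the final 2*ret2-abs(dc) correction. A scalar sketch (illustrative):

/*Illustrative: branchless abs(dc) as used in the final correction above.*/
static int oc_abs_dc(int _dc){
  int m;
  m=_dc>>31;        /*cdq: all ones if _dc<0, else zero.*/
  return (_dc^m)-m; /*xor+sub: negates _dc exactly when m==-1.*/
}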
unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
}

/*Our internal implementation of frag_copy2 takes an extra stride parameter so
   we can share code with oc_enc_frag_satd2_mmxext().*/
void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
  __asm__ __volatile__(
    /*Load the first 3 rows.*/
    "movq (%[src1]),%%mm0\n\t"
    "movq (%[src2]),%%mm1\n\t"
    "movq (%[src1],%[src_ystride]),%%mm2\n\t"
    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
    "movq (%[src2],%[src_ystride]),%%mm3\n\t"
    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
    "pxor %%mm7,%%mm7\n\t"
    "movq (%[src1]),%%mm4\n\t"
    "pcmpeqb %%mm6,%%mm6\n\t"
    "movq (%[src2]),%%mm5\n\t"
    /*mm7={1}x8.*/
    "psubb %%mm6,%%mm7\n\t"
    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
    "movq %%mm0,%%mm6\n\t"
    "pxor %%mm1,%%mm0\n\t"
    "pavgb %%mm1,%%mm6\n\t"
    /*%%mm1 is free, start averaging %%mm3 into %%mm2 using %%mm1.*/
    "movq %%mm2,%%mm1\n\t"
    "pand %%mm7,%%mm0\n\t"
    "pavgb %%mm3,%%mm2\n\t"
    "pxor %%mm3,%%mm1\n\t"
    /*%%mm3 is free.*/
    "psubb %%mm0,%%mm6\n\t"
    /*%%mm0 is free, start loading the next row.*/
    "movq (%[src1],%[src_ystride]),%%mm0\n\t"
    /*Start averaging %%mm5 and %%mm4 using %%mm3.*/
    "movq %%mm4,%%mm3\n\t"
    /*%%mm6 (row 0) is done; write it out.*/
    "movq %%mm6,(%[dst])\n\t"
    "pand %%mm7,%%mm1\n\t"
    "pavgb %%mm5,%%mm4\n\t"
    "psubb %%mm1,%%mm2\n\t"
    /*%%mm1 is free, continue loading the next row.*/
    "movq (%[src2],%[src_ystride]),%%mm1\n\t"
    "pxor %%mm5,%%mm3\n\t"
    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
    /*%%mm2 (row 1) is done; write it out.*/
    "movq %%mm2,(%[dst],%[dst_ystride])\n\t"
    "pand %%mm7,%%mm3\n\t"
    /*Start loading the next row.*/
    "movq (%[src1]),%%mm2\n\t"
    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
    "psubb %%mm3,%%mm4\n\t"
    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
    /*%%mm4 (row 2) is done; write it out.*/
    "movq %%mm4,(%[dst])\n\t"
    /*Continue loading the next row.*/
    "movq (%[src2]),%%mm3\n\t"
    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
    "movq %%mm0,%%mm6\n\t"
    "pxor %%mm1,%%mm0\n\t"
    /*Start loading the next row.*/
    "movq (%[src1],%[src_ystride]),%%mm4\n\t"
    "pavgb %%mm1,%%mm6\n\t"
    /*%%mm1 is free; start averaging %%mm3 into %%mm2 using %%mm1.*/
    "movq %%mm2,%%mm1\n\t"
    "pand %%mm7,%%mm0\n\t"
    /*Continue loading the next row.*/
    "movq (%[src2],%[src_ystride]),%%mm5\n\t"
    "pavgb %%mm3,%%mm2\n\t"
    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
    "pxor %%mm3,%%mm1\n\t"
    /*%%mm3 is free.*/
    "psubb %%mm0,%%mm6\n\t"
    /*%%mm0 is free, start loading the next row.*/
    "movq (%[src1]),%%mm0\n\t"
    /*Start averaging %%mm5 into %%mm4 using %%mm3.*/
    "movq %%mm4,%%mm3\n\t"
    /*%%mm6 (row 3) is done; write it out.*/
    "movq %%mm6,(%[dst],%[dst_ystride])\n\t"
    "pand %%mm7,%%mm1\n\t"
    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
    "pavgb %%mm5,%%mm4\n\t"
    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
    "psubb %%mm1,%%mm2\n\t"
    /*%%mm1 is free; continue loading the next row.*/
    "movq (%[src2]),%%mm1\n\t"
    "pxor %%mm5,%%mm3\n\t"
    /*%%mm2 (row 4) is done; write it out.*/
    "movq %%mm2,(%[dst])\n\t"
    "pand %%mm7,%%mm3\n\t"
    /*Start loading the next row.*/
    "movq (%[src1],%[src_ystride]),%%mm2\n\t"
    "psubb %%mm3,%%mm4\n\t"
    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
    "movq %%mm0,%%mm6\n\t"
    /*Continue loading the next row.*/
    "movq (%[src2],%[src_ystride]),%%mm3\n\t"
    /*%%mm4 (row 5) is done; write it out.*/
    "movq %%mm4,(%[dst],%[dst_ystride])\n\t"
    "pxor %%mm1,%%mm0\n\t"
    "pavgb %%mm1,%%mm6\n\t"
    /*%%mm4 is free; start averaging %%mm3 into %%mm2 using %%mm4.*/
    "movq %%mm2,%%mm4\n\t"
    "pand %%mm7,%%mm0\n\t"
    "pavgb %%mm3,%%mm2\n\t"
    "pxor %%mm3,%%mm4\n\t"
    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
    "psubb %%mm0,%%mm6\n\t"
    "pand %%mm7,%%mm4\n\t"
    /*%%mm6 (row 6) is done, write it out.*/
    "movq %%mm6,(%[dst])\n\t"
    "psubb %%mm4,%%mm2\n\t"
    /*%%mm2 (row 7) is done, write it out.*/
    "movq %%mm2,(%[dst],%[dst_ystride])\n\t"
    :[dst]"+r"(_dst),[src1]"+r"(_src1),[src2]"+r"(_src2)
    :[dst_ystride]"r"((ptrdiff_t)_dst_ystride),
     [src_ystride]"r"((ptrdiff_t)_src_ystride)
    :"memory"
  );
}

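Row by row, the interleaved schedule above computes nothing more than the truncating average of two 8x8 predictors. A plain-C reference (a sketch, not the library's own fallback):

/*Illustrative plain-C model of oc_int_frag_copy2_mmxext.*/
static void oc_frag_copy2_c(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_dst[j]=(unsigned char)(_src1[j]+_src2[j]>>1);
    _dst+=_dst_ystride;
    _src1+=_src_ystride;
    _src2+=_src_ystride;
  }
}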
unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
  OC_ALIGN8(unsigned char ref[64]);
  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
}

unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
 const unsigned char *_src,int _ystride){
  OC_ALIGN8(ogg_int16_t buf[64]);
  unsigned ret;
  unsigned ret2;
  int      dc;
  __asm__ __volatile__(
    OC_LOAD_8x4(0x00)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x00)
    /*Finish swapping out this 8x4 block to make room for the next one.
      mm0...mm3 have been swapped out already.*/
    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
    "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
    "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
    OC_LOAD_8x4(0x04)
    OC_HADAMARD_8x4
    OC_TRANSPOSE_4x4x2(0x08)
    /*Here the first 4x4 block of output from the last transpose is the second
       4x4 block of input for the next transform.
      We have cleverly arranged that it already be in the appropriate place, so
       we only have to do half the loads.*/
    "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
    "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
    "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
    "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x4
    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
    "movd %%mm1,%[dc]\n\t"
    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
       latency of pmaddwd by starting the next series of loads now.*/
    "pmaddwd %%mm7,%%mm0\n\t"
    "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
    "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
    "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
    "movq %%mm0,%%mm4\n\t"
    "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
    "punpckhdq %%mm0,%%mm0\n\t"
    "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
    "paddd %%mm0,%%mm4\n\t"
    "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
    "movd %%mm4,%[ret]\n\t"
    "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
    "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
    "pmaddwd %%mm7,%%mm0\n\t"
    /*We assume that the DC coefficient is always positive (which is true,
       because the input to the INTRA transform was not a difference).*/
    "movzx %w[dc],%[dc]\n\t"
    "add %[ret],%[ret]\n\t"
    "sub %[dc],%[ret]\n\t"
    "movq %%mm0,%%mm4\n\t"
    "punpckhdq %%mm0,%%mm0\n\t"
    "paddd %%mm0,%%mm4\n\t"
    "movd %%mm4,%[ret2]\n\t"
    "lea -64(%[ret],%[ret2],2),%[ret]\n\t"
    /*Although it looks like we're using 8 registers here, gcc can alias %[ret]
       and %[ret2] with some of the inputs, since for once we don't write to
       them until after we're done using everything but %[buf] (which is also
       listed as an output to ensure gcc _doesn't_ alias them against it).*/
    :[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=r"(dc),
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
    :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
     [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
    /*We have to use sub, so we actually clobber the condition codes for once
       (not to mention add).*/
    :"cc"
  );
  *_dc=dc;
  return ret;
}

void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
  int i;
  __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
  for(i=4;i-->0;){
    __asm__ __volatile__(
      /*mm0=[src]*/
      "movq (%[src]),%%mm0\n\t"
      /*mm1=[ref]*/
      "movq (%[ref]),%%mm1\n\t"
      /*mm4=[src+ystride]*/
      "movq (%[src],%[ystride]),%%mm4\n\t"
      /*mm5=[ref+ystride]*/
      "movq (%[ref],%[ystride]),%%mm5\n\t"
      /*Compute [src]-[ref].*/
      "movq %%mm0,%%mm2\n\t"
      "punpcklbw %%mm7,%%mm0\n\t"
      "movq %%mm1,%%mm3\n\t"
      "punpckhbw %%mm7,%%mm2\n\t"
      "punpcklbw %%mm7,%%mm1\n\t"
      "punpckhbw %%mm7,%%mm3\n\t"
      "psubw %%mm1,%%mm0\n\t"
      "psubw %%mm3,%%mm2\n\t"
      /*Compute [src+ystride]-[ref+ystride].*/
      "movq %%mm4,%%mm1\n\t"
      "punpcklbw %%mm7,%%mm4\n\t"
      "movq %%mm5,%%mm3\n\t"
      "punpckhbw %%mm7,%%mm1\n\t"
      "lea (%[src],%[ystride],2),%[src]\n\t"
      "punpcklbw %%mm7,%%mm5\n\t"
      "lea (%[ref],%[ystride],2),%[ref]\n\t"
      "punpckhbw %%mm7,%%mm3\n\t"
      "psubw %%mm5,%%mm4\n\t"
      "psubw %%mm3,%%mm1\n\t"
      /*Write the answer out.*/
      "movq %%mm0,0x00(%[residue])\n\t"
      "movq %%mm2,0x08(%[residue])\n\t"
      "movq %%mm4,0x10(%[residue])\n\t"
      "movq %%mm1,0x18(%[residue])\n\t"
      "lea 0x20(%[residue]),%[residue]\n\t"
      :[residue]"+r"(_residue),[src]"+r"(_src),[ref]"+r"(_ref)
      :[ystride]"r"((ptrdiff_t)_ystride)
      :"memory"
    );
  }
}

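A scalar model of the routine above: the 8x8 byte difference widened to 16-bit residuals (illustrative, not part of the original file):

/*Illustrative plain-C model of oc_enc_frag_sub_mmx.*/
static void oc_frag_sub_c(ogg_int16_t _residue[64],
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_residue[8*i+j]=(ogg_int16_t)(_src[j]-_ref[j]);
    _src+=_ystride;
    _ref+=_ystride;
  }
}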
void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
 const unsigned char *_src,int _ystride){
  ptrdiff_t ystride3;
  __asm__ __volatile__(
    /*mm0=[src]*/
    "movq (%[src]),%%mm0\n\t"
    /*mm1=[src+ystride]*/
    "movq (%[src],%[ystride]),%%mm1\n\t"
    /*mm6={-1}x4*/
    "pcmpeqw %%mm6,%%mm6\n\t"
    /*mm2=[src+2*ystride]*/
    "movq (%[src],%[ystride],2),%%mm2\n\t"
    /*[ystride3]=3*[ystride]*/
    "lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
    /*mm6={1}x4*/
    "psllw $15,%%mm6\n\t"
    /*mm3=[src+3*ystride]*/
    "movq (%[src],%[ystride3]),%%mm3\n\t"
    /*mm6={128}x4*/
    "psrlw $8,%%mm6\n\t"
    /*mm7=0*/
    "pxor %%mm7,%%mm7\n\t"
    /*[src]=[src]+4*[ystride]*/
    "lea (%[src],%[ystride],4),%[src]\n\t"
    /*Compute [src]-128 and [src+ystride]-128*/
    "movq %%mm0,%%mm4\n\t"
    "punpcklbw %%mm7,%%mm0\n\t"
    "movq %%mm1,%%mm5\n\t"
    "punpckhbw %%mm7,%%mm4\n\t"
    "psubw %%mm6,%%mm0\n\t"
    "punpcklbw %%mm7,%%mm1\n\t"
    "psubw %%mm6,%%mm4\n\t"
    "punpckhbw %%mm7,%%mm5\n\t"
    "psubw %%mm6,%%mm1\n\t"
    "psubw %%mm6,%%mm5\n\t"
    /*Write the answer out.*/
    "movq %%mm0,0x00(%[residue])\n\t"
    "movq %%mm4,0x08(%[residue])\n\t"
    "movq %%mm1,0x10(%[residue])\n\t"
    "movq %%mm5,0x18(%[residue])\n\t"
    /*mm0=[src+4*ystride]*/
    "movq (%[src]),%%mm0\n\t"
    /*mm1=[src+5*ystride]*/
    "movq (%[src],%[ystride]),%%mm1\n\t"
    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
    "movq %%mm2,%%mm4\n\t"
    "punpcklbw %%mm7,%%mm2\n\t"
    "movq %%mm3,%%mm5\n\t"
    "punpckhbw %%mm7,%%mm4\n\t"
    "psubw %%mm6,%%mm2\n\t"
    "punpcklbw %%mm7,%%mm3\n\t"
    "psubw %%mm6,%%mm4\n\t"
    "punpckhbw %%mm7,%%mm5\n\t"
    "psubw %%mm6,%%mm3\n\t"
    "psubw %%mm6,%%mm5\n\t"
    /*Write the answer out.*/
    "movq %%mm2,0x20(%[residue])\n\t"
    "movq %%mm4,0x28(%[residue])\n\t"
    "movq %%mm3,0x30(%[residue])\n\t"
    "movq %%mm5,0x38(%[residue])\n\t"
    /*mm2=[src+6*ystride]*/
    "movq (%[src],%[ystride],2),%%mm2\n\t"
    /*mm3=[src+7*ystride]*/
    "movq (%[src],%[ystride3]),%%mm3\n\t"
    /*Compute [src+4*ystride]-128 and [src+5*ystride]-128*/
    "movq %%mm0,%%mm4\n\t"
    "punpcklbw %%mm7,%%mm0\n\t"
    "movq %%mm1,%%mm5\n\t"
    "punpckhbw %%mm7,%%mm4\n\t"
    "psubw %%mm6,%%mm0\n\t"
    "punpcklbw %%mm7,%%mm1\n\t"
    "psubw %%mm6,%%mm4\n\t"
    "punpckhbw %%mm7,%%mm5\n\t"
    "psubw %%mm6,%%mm1\n\t"
    "psubw %%mm6,%%mm5\n\t"
    /*Write the answer out.*/
    "movq %%mm0,0x40(%[residue])\n\t"
    "movq %%mm4,0x48(%[residue])\n\t"
    "movq %%mm1,0x50(%[residue])\n\t"
    "movq %%mm5,0x58(%[residue])\n\t"
    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
    "movq %%mm2,%%mm4\n\t"
    "punpcklbw %%mm7,%%mm2\n\t"
    "movq %%mm3,%%mm5\n\t"
    "punpckhbw %%mm7,%%mm4\n\t"
    "psubw %%mm6,%%mm2\n\t"
    "punpcklbw %%mm7,%%mm3\n\t"
    "psubw %%mm6,%%mm4\n\t"
    "punpckhbw %%mm7,%%mm5\n\t"
    "psubw %%mm6,%%mm3\n\t"
    "psubw %%mm6,%%mm5\n\t"
    /*Write the answer out.*/
    "movq %%mm2,0x60(%[residue])\n\t"
    "movq %%mm4,0x68(%[residue])\n\t"
    "movq %%mm3,0x70(%[residue])\n\t"
    "movq %%mm5,0x78(%[residue])\n\t"
    :[src]"+r"(_src),[ystride3]"=&r"(ystride3)
    :[residue]"r"(_residue),[ystride]"r"((ptrdiff_t)_ystride)
    :"memory"
  );
}

void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride){
  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
}

#endif
thirdparty/libtheora/x86/mmxfdct.c (new file, vendored, 678 lines)
@@ -0,0 +1,678 @@
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
 * by the Xiph.Org Foundation https://www.xiph.org/                 *
 *                                                                  *
 ********************************************************************/
/*MMX fDCT implementation for x86_32*/
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
#include "x86enc.h"
#include "x86zigzag.h"

#if defined(OC_X86_ASM)

# define OC_FDCT_STAGE1_8x4 \
 "#OC_FDCT_STAGE1_8x4\n\t" \
 /*Stage 1:*/ \
 /*mm0=t7'=t0-t7*/ \
 "psubw %%mm7,%%mm0\n\t" \
 "paddw %%mm7,%%mm7\n\t" \
 /*mm1=t6'=t1-t6*/ \
 "psubw %%mm6,%%mm1\n\t" \
 "paddw %%mm6,%%mm6\n\t" \
 /*mm2=t5'=t2-t5*/ \
 "psubw %%mm5,%%mm2\n\t" \
 "paddw %%mm5,%%mm5\n\t" \
 /*mm3=t4'=t3-t4*/ \
 "psubw %%mm4,%%mm3\n\t" \
 "paddw %%mm4,%%mm4\n\t" \
 /*mm7=t0'=t0+t7*/ \
 "paddw %%mm0,%%mm7\n\t" \
 /*mm6=t1'=t1+t6*/ \
 "paddw %%mm1,%%mm6\n\t" \
 /*mm5=t2'=t2+t5*/ \
 "paddw %%mm2,%%mm5\n\t" \
 /*mm4=t3'=t3+t4*/ \
 "paddw %%mm3,%%mm4\n\t" \

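Stage 1 pairs t0...t7 into sums and differences; the asm assigns the sums to t0'...t3' and the differences to t4'...t7', permuting registers so everything stays in place. A scalar sketch (illustrative):

/*Illustrative: stage 1 of the 8-point fDCT on one column.*/
static void oc_fdct_stage1(short _t[8]){
  int i;
  for(i=0;i<4;i++){
    short d;
    d=(short)(_t[i]-_t[7-i]);     /*psubw: difference.*/
    _t[i]=(short)(_t[i]+_t[7-i]); /*t_i' = t_i+t_{7-i}*/
    _t[7-i]=d;                    /*t_{7-i}' = t_i-t_{7-i}*/
  }
}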
# define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \
 "#OC_FDCT8x4\n\t" \
 /*Stage 2:*/ \
 /*mm7=t3''=t0'-t3'*/ \
 "psubw %%mm4,%%mm7\n\t" \
 "paddw %%mm4,%%mm4\n\t" \
 /*mm6=t2''=t1'-t2'*/ \
 "psubw %%mm5,%%mm6\n\t" \
 "movq %%mm7,"_r6"(%[y])\n\t" \
 "paddw %%mm5,%%mm5\n\t" \
 /*mm1=t5''=t6'-t5'*/ \
 "psubw %%mm2,%%mm1\n\t" \
 "movq %%mm6,"_r2"(%[y])\n\t" \
 /*mm4=t0''=t0'+t3'*/ \
 "paddw %%mm7,%%mm4\n\t" \
 "paddw %%mm2,%%mm2\n\t" \
 /*mm5=t1''=t1'+t2'*/ \
 "movq %%mm4,"_r0"(%[y])\n\t" \
 "paddw %%mm6,%%mm5\n\t" \
 /*mm2=t6''=t6'+t5'*/ \
 "paddw %%mm1,%%mm2\n\t" \
 "movq %%mm5,"_r4"(%[y])\n\t" \
 /*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \
 /*mm4, mm5, mm6, mm7 are free.*/ \
 /*Stage 3:*/ \
 /*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \
 "mov $0x5A806A0A,%[a]\n\t" \
 "pcmpeqb %%mm6,%%mm6\n\t" \
 "movd %[a],%%mm7\n\t" \
 "psrlw $15,%%mm6\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 "paddw %%mm6,%%mm6\n\t" \
 /*mm0=0, mm2={-1}x4 \
   mm5:mm4=t5''*27146+0xB500*/ \
 "movq %%mm1,%%mm4\n\t" \
 "movq %%mm1,%%mm5\n\t" \
 "punpcklwd %%mm6,%%mm4\n\t" \
 "movq %%mm2,"_r3"(%[y])\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "movq %%mm0,"_r7"(%[y])\n\t" \
 "punpckhwd %%mm6,%%mm5\n\t" \
 "pxor %%mm0,%%mm0\n\t" \
 "pmaddwd %%mm7,%%mm5\n\t" \
 "pcmpeqb %%mm2,%%mm2\n\t" \
 /*mm2=t6'', mm1=t5''+(t5''!=0) \
   mm4=(t5''*27146+0xB500>>16)*/ \
 "pcmpeqw %%mm1,%%mm0\n\t" \
 "psrad $16,%%mm4\n\t" \
 "psubw %%mm2,%%mm0\n\t" \
 "movq "_r3"(%[y]),%%mm2\n\t" \
 "psrad $16,%%mm5\n\t" \
 "paddw %%mm0,%%mm1\n\t" \
 "packssdw %%mm5,%%mm4\n\t" \
 /*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
 "paddw %%mm1,%%mm4\n\t" \
 "movq "_r7"(%[y]),%%mm0\n\t" \
 "psraw $1,%%mm4\n\t" \
 "movq %%mm3,%%mm1\n\t" \
 /*mm3=t4''=t4'+s*/ \
 "paddw %%mm4,%%mm3\n\t" \
 /*mm1=t5'''=t4'-s*/ \
 "psubw %%mm4,%%mm1\n\t" \
 /*mm1=0, mm3={-1}x4 \
   mm5:mm4=t6''*27146+0xB500*/ \
 "movq %%mm2,%%mm4\n\t" \
 "movq %%mm2,%%mm5\n\t" \
 "punpcklwd %%mm6,%%mm4\n\t" \
 "movq %%mm1,"_r5"(%[y])\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "movq %%mm3,"_r1"(%[y])\n\t" \
 "punpckhwd %%mm6,%%mm5\n\t" \
 "pxor %%mm1,%%mm1\n\t" \
 "pmaddwd %%mm7,%%mm5\n\t" \
 "pcmpeqb %%mm3,%%mm3\n\t" \
 /*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \
 "psrad $16,%%mm4\n\t" \
 "pcmpeqw %%mm2,%%mm1\n\t" \
 "psrad $16,%%mm5\n\t" \
 "psubw %%mm3,%%mm1\n\t" \
 "packssdw %%mm5,%%mm4\n\t" \
 "paddw %%mm1,%%mm2\n\t" \
 /*mm1=t1'' \
   mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
 "paddw %%mm2,%%mm4\n\t" \
 "movq "_r4"(%[y]),%%mm1\n\t" \
 "psraw $1,%%mm4\n\t" \
 "movq %%mm0,%%mm2\n\t" \
 /*mm7={54491-0x7FFF,0x7FFF}x2 \
   mm0=t7''=t7'+s*/ \
 "paddw %%mm4,%%mm0\n\t" \
 /*mm2=t6'''=t7'-s*/ \
 "psubw %%mm4,%%mm2\n\t" \
 /*Stage 4:*/ \
 /*mm0=0, mm2=t0'' \
   mm5:mm4=t1''*27146+0xB500*/ \
 "movq %%mm1,%%mm4\n\t" \
 "movq %%mm1,%%mm5\n\t" \
 "punpcklwd %%mm6,%%mm4\n\t" \
 "movq %%mm2,"_r3"(%[y])\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "movq "_r0"(%[y]),%%mm2\n\t" \
 "punpckhwd %%mm6,%%mm5\n\t" \
 "movq %%mm0,"_r7"(%[y])\n\t" \
 "pmaddwd %%mm7,%%mm5\n\t" \
 "pxor %%mm0,%%mm0\n\t" \
 /*mm7={27146,0x4000>>1}x2 \
   mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
 "psrad $16,%%mm4\n\t" \
 "mov $0x20006A0A,%[a]\n\t" \
 "pcmpeqw %%mm1,%%mm0\n\t" \
 "movd %[a],%%mm7\n\t" \
 "psrad $16,%%mm5\n\t" \
 "psubw %%mm3,%%mm0\n\t" \
 "packssdw %%mm5,%%mm4\n\t" \
 "paddw %%mm1,%%mm0\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 "paddw %%mm4,%%mm0\n\t" \
 /*mm6={0x00000E3D}x2 \
   mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \
 "movq %%mm2,%%mm4\n\t" \
 "movq %%mm2,%%mm5\n\t" \
 "punpcklwd %%mm6,%%mm4\n\t" \
 "mov $0x0E3D,%[a]\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "punpckhwd %%mm6,%%mm5\n\t" \
 "movd %[a],%%mm6\n\t" \
 "pmaddwd %%mm7,%%mm5\n\t" \
 "pxor %%mm1,%%mm1\n\t" \
 "punpckldq %%mm6,%%mm6\n\t" \
 "pcmpeqw %%mm2,%%mm1\n\t" \
 /*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
 "psrad $16,%%mm4\n\t" \
 "psubw %%mm3,%%mm1\n\t" \
 "psrad $16,%%mm5\n\t" \
 "paddw %%mm1,%%mm2\n\t" \
 "packssdw %%mm5,%%mm4\n\t" \
 "movq "_r5"(%[y]),%%mm1\n\t" \
 "paddw %%mm2,%%mm4\n\t" \
 /*mm2=t6'', mm0=_y[0]=u=r+s>>1 \
   The naive implementation could cause overflow, so we use \
    u=(r&s)+((r^s)>>1).*/ \
 "movq "_r3"(%[y]),%%mm2\n\t" \
 "movq %%mm0,%%mm7\n\t" \
 "pxor %%mm4,%%mm0\n\t" \
 "pand %%mm4,%%mm7\n\t" \
 "psraw $1,%%mm0\n\t" \
 "mov $0x7FFF54DC,%[a]\n\t" \
 "paddw %%mm7,%%mm0\n\t" \
 "movd %[a],%%mm7\n\t" \
 /*mm7={54491-0x7FFF,0x7FFF}x2 \
   mm4=_y[4]=v=r-u*/ \
 "psubw %%mm0,%%mm4\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 "movq %%mm4,"_r4"(%[y])\n\t" \
 /*mm0=0, mm7={36410}x4 \
   mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \
 "movq %%mm1,%%mm4\n\t" \
 "movq %%mm1,%%mm5\n\t" \
 "punpcklwd %%mm1,%%mm4\n\t" \
 "mov $0x8E3A8E3A,%[a]\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "movq %%mm0,"_r0"(%[y])\n\t" \
 "punpckhwd %%mm1,%%mm5\n\t" \
 "pxor %%mm0,%%mm0\n\t" \
 "pmaddwd %%mm7,%%mm5\n\t" \
 "pcmpeqw %%mm0,%%mm1\n\t" \
 "movd %[a],%%mm7\n\t" \
 "psubw %%mm3,%%mm1\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 "paddd %%mm6,%%mm4\n\t" \
 "paddd %%mm6,%%mm5\n\t" \
 /*mm0=0 \
   mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \
 "movq %%mm2,%%mm6\n\t" \
 "movq %%mm2,%%mm3\n\t" \
 "pmulhw %%mm7,%%mm6\n\t" \
 "paddw %%mm2,%%mm1\n\t" \
 "pmullw %%mm7,%%mm3\n\t" \
 "pxor %%mm0,%%mm0\n\t" \
 "paddw %%mm1,%%mm6\n\t" \
 "movq %%mm3,%%mm1\n\t" \
 "punpckhwd %%mm6,%%mm3\n\t" \
 "punpcklwd %%mm6,%%mm1\n\t" \
 /*mm3={-1}x4, mm6={1}x4 \
   mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
 "paddd %%mm3,%%mm5\n\t" \
 "paddd %%mm1,%%mm4\n\t" \
 "psrad $16,%%mm5\n\t" \
 "pxor %%mm6,%%mm6\n\t" \
 "psrad $16,%%mm4\n\t" \
 "pcmpeqb %%mm3,%%mm3\n\t" \
 "packssdw %%mm5,%%mm4\n\t" \
 "psubw %%mm3,%%mm6\n\t" \
 /*mm1=t7'', mm7={26568,0x3400}x2 \
   mm2=s=t6'''-(36410*u>>16)*/ \
 "movq %%mm4,%%mm1\n\t" \
 "mov $0x340067C8,%[a]\n\t" \
 "pmulhw %%mm7,%%mm4\n\t" \
 "movd %[a],%%mm7\n\t" \
 "movq %%mm1,"_r5"(%[y])\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 "paddw %%mm1,%%mm4\n\t" \
 "movq "_r7"(%[y]),%%mm1\n\t" \
 "psubw %%mm4,%%mm2\n\t" \
 /*mm6={0x00007B1B}x2 \
   mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \
 "movq %%mm2,%%mm4\n\t" \
 "movq %%mm2,%%mm5\n\t" \
 "punpcklwd %%mm6,%%mm4\n\t" \
 "pcmpeqw %%mm2,%%mm0\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "mov $0x7B1B,%[a]\n\t" \
 "punpckhwd %%mm6,%%mm5\n\t" \
 "movd %[a],%%mm6\n\t" \
 "pmaddwd %%mm7,%%mm5\n\t" \
 "psubw %%mm3,%%mm0\n\t" \
 "punpckldq %%mm6,%%mm6\n\t" \
 /*mm7={64277-0x7FFF,0x7FFF}x2 \
   mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
 "psrad $17,%%mm4\n\t" \
 "paddw %%mm0,%%mm2\n\t" \
 "psrad $17,%%mm5\n\t" \
 "mov $0x7FFF7B16,%[a]\n\t" \
 "packssdw %%mm5,%%mm4\n\t" \
 "movd %[a],%%mm7\n\t" \
 "paddw %%mm4,%%mm2\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 /*mm0=0, mm7={12785}x4 \
   mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \
 "movq %%mm1,%%mm4\n\t" \
 "movq %%mm1,%%mm5\n\t" \
 "movq %%mm2,"_r3"(%[y])\n\t" \
 "punpcklwd %%mm1,%%mm4\n\t" \
 "movq "_r1"(%[y]),%%mm2\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "mov $0x31F131F1,%[a]\n\t" \
 "punpckhwd %%mm1,%%mm5\n\t" \
 "pxor %%mm0,%%mm0\n\t" \
 "pmaddwd %%mm7,%%mm5\n\t" \
 "pcmpeqw %%mm0,%%mm1\n\t" \
 "movd %[a],%%mm7\n\t" \
 "psubw %%mm3,%%mm1\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 "paddd %%mm6,%%mm4\n\t" \
 "paddd %%mm6,%%mm5\n\t" \
 /*mm3:mm1=12785*t4'''+((t7''!=0)<<16)*/ \
 "movq %%mm2,%%mm6\n\t" \
 "movq %%mm2,%%mm3\n\t" \
 "pmulhw %%mm7,%%mm6\n\t" \
 "pmullw %%mm7,%%mm3\n\t" \
 "paddw %%mm1,%%mm6\n\t" \
 "movq %%mm3,%%mm1\n\t" \
 "punpckhwd %%mm6,%%mm3\n\t" \
 "punpcklwd %%mm6,%%mm1\n\t" \
 /*mm3={-1}x4, mm6={1}x4 \
   mm4=_y[1]=u=(12785*t4'''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
 "paddd %%mm3,%%mm5\n\t" \
 "paddd %%mm1,%%mm4\n\t" \
 "psrad $16,%%mm5\n\t" \
 "pxor %%mm6,%%mm6\n\t" \
 "psrad $16,%%mm4\n\t" \
 "pcmpeqb %%mm3,%%mm3\n\t" \
 "packssdw %%mm5,%%mm4\n\t" \
 "psubw %%mm3,%%mm6\n\t" \
 /*mm1=t3'', mm7={20539,0x3000}x2 \
   mm4=s=(12785*u>>16)-t4''*/ \
 "movq %%mm4,"_r1"(%[y])\n\t" \
 "pmulhw %%mm7,%%mm4\n\t" \
 "mov $0x3000503B,%[a]\n\t" \
 "movq "_r6"(%[y]),%%mm1\n\t" \
 "movd %[a],%%mm7\n\t" \
 "psubw %%mm2,%%mm4\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 /*mm6={0x00006CB7}x2 \
   mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \
 "movq %%mm4,%%mm5\n\t" \
 "movq %%mm4,%%mm2\n\t" \
 "punpcklwd %%mm6,%%mm4\n\t" \
 "pcmpeqw %%mm2,%%mm0\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "mov $0x6CB7,%[a]\n\t" \
 "punpckhwd %%mm6,%%mm5\n\t" \
 "movd %[a],%%mm6\n\t" \
 "pmaddwd %%mm7,%%mm5\n\t" \
 "psubw %%mm3,%%mm0\n\t" \
 "punpckldq %%mm6,%%mm6\n\t" \
 /*mm7={60547-0x7FFF,0x7FFF}x2 \
   mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
 "psrad $20,%%mm4\n\t" \
 "paddw %%mm0,%%mm2\n\t" \
 "psrad $20,%%mm5\n\t" \
 "mov $0x7FFF6C84,%[a]\n\t" \
 "packssdw %%mm5,%%mm4\n\t" \
 "movd %[a],%%mm7\n\t" \
 "paddw %%mm4,%%mm2\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 /*mm0=0, mm7={25080}x4 \
   mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \
 "movq %%mm1,%%mm4\n\t" \
 "movq %%mm1,%%mm5\n\t" \
 "movq %%mm2,"_r7"(%[y])\n\t" \
 "punpcklwd %%mm1,%%mm4\n\t" \
 "movq "_r2"(%[y]),%%mm2\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "mov $0x61F861F8,%[a]\n\t" \
 "punpckhwd %%mm1,%%mm5\n\t" \
 "pxor %%mm0,%%mm0\n\t" \
 "pmaddwd %%mm7,%%mm5\n\t" \
 "movd %[a],%%mm7\n\t" \
 "pcmpeqw %%mm0,%%mm1\n\t" \
 "psubw %%mm3,%%mm1\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 "paddd %%mm6,%%mm4\n\t" \
 "paddd %%mm6,%%mm5\n\t" \
 /*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \
 "movq %%mm2,%%mm6\n\t" \
 "movq %%mm2,%%mm3\n\t" \
 "pmulhw %%mm7,%%mm6\n\t" \
 "pmullw %%mm7,%%mm3\n\t" \
 "paddw %%mm1,%%mm6\n\t" \
 "movq %%mm3,%%mm1\n\t" \
 "punpckhwd %%mm6,%%mm3\n\t" \
 "punpcklwd %%mm6,%%mm1\n\t" \
 /*mm1={-1}x4 \
   mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
 "paddd %%mm3,%%mm5\n\t" \
 "paddd %%mm1,%%mm4\n\t" \
 "psrad $16,%%mm5\n\t" \
 "mov $0x28005460,%[a]\n\t" \
 "psrad $16,%%mm4\n\t" \
 "pcmpeqb %%mm1,%%mm1\n\t" \
 "packssdw %%mm5,%%mm4\n\t" \
 /*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \
   mm4=s=(25080*u>>16)-t2''*/ \
 "movq %%mm4,%%mm6\n\t" \
 "pmulhw %%mm7,%%mm4\n\t" \
 "pxor %%mm5,%%mm5\n\t" \
 "movd %[a],%%mm7\n\t" \
 "psubw %%mm1,%%mm5\n\t" \
 "punpckldq %%mm7,%%mm7\n\t" \
 "psubw %%mm2,%%mm4\n\t" \
 /*mm2=s+(s!=0) \
   mm4:mm3=s*21600+0x2800*/ \
 "movq %%mm4,%%mm3\n\t" \
 "movq %%mm4,%%mm2\n\t" \
 "punpckhwd %%mm5,%%mm4\n\t" \
 "pcmpeqw %%mm2,%%mm0\n\t" \
 "pmaddwd %%mm7,%%mm4\n\t" \
 "psubw %%mm1,%%mm0\n\t" \
 "punpcklwd %%mm5,%%mm3\n\t" \
 "paddw %%mm0,%%mm2\n\t" \
 "pmaddwd %%mm7,%%mm3\n\t" \
 /*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \
   mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
 "movq "_r4"(%[y]),%%mm0\n\t" \
 "psrad $18,%%mm4\n\t" \
 "movq "_r5"(%[y]),%%mm5\n\t" \
 "psrad $18,%%mm3\n\t" \
 "movq "_r7"(%[y]),%%mm1\n\t" \
 "packssdw %%mm4,%%mm3\n\t" \
 "movq "_r0"(%[y]),%%mm4\n\t" \
 "paddw %%mm2,%%mm3\n\t" \

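The average in the middle of OC_FDCT8x4 is computed as u=(r&s)+((r^s)>>1) rather than (r+s)>>1, because r+s==2*(r&s)+(r^s) and the rewritten form cannot overflow 16 bits. A scalar sketch (illustrative):

/*Illustrative: overflow-free floor((r+s)/2) for 16-bit operands.*/
static short oc_avg_no_overflow(short _r,short _s){
  return (short)((_r&_s)+((_r^_s)>>1)); /*pand + pxor/psraw + paddw.*/
}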
/*On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7].
  On output, {_y[4],mm1,mm2,mm3} contains the transpose of _y[4...7] and
   {mm4,mm5,mm6,mm7} contains the transpose of _y[0...3].*/
# define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \
 "#OC_TRANSPOSE8x4\n\t" \
 /*First 4x4 transpose:*/ \
 /*mm0 = e3 e2 e1 e0 \
   mm5 = f3 f2 f1 f0 \
   mm3 = g3 g2 g1 g0 \
   mm1 = h3 h2 h1 h0*/ \
 "movq %%mm0,%%mm2\n\t" \
 "punpcklwd %%mm5,%%mm0\n\t" \
 "punpckhwd %%mm5,%%mm2\n\t" \
 "movq %%mm3,%%mm5\n\t" \
 "punpcklwd %%mm1,%%mm3\n\t" \
 "punpckhwd %%mm1,%%mm5\n\t" \
 /*mm0 = f1 e1 f0 e0 \
   mm2 = f3 e3 f2 e2 \
   mm3 = h1 g1 h0 g0 \
   mm5 = h3 g3 h2 g2*/ \
 "movq %%mm0,%%mm1\n\t" \
 "punpckldq %%mm3,%%mm0\n\t" \
 "movq %%mm0,"_r4"(%[y])\n\t" \
 "punpckhdq %%mm3,%%mm1\n\t" \
 "movq "_r1"(%[y]),%%mm0\n\t" \
 "movq %%mm2,%%mm3\n\t" \
 "punpckldq %%mm5,%%mm2\n\t" \
 "punpckhdq %%mm5,%%mm3\n\t" \
 "movq "_r3"(%[y]),%%mm5\n\t" \
 /*_y[4] = h0 g0 f0 e0 \
   mm1 = h1 g1 f1 e1 \
   mm2 = h2 g2 f2 e2 \
   mm3 = h3 g3 f3 e3*/ \
 /*Second 4x4 transpose:*/ \
 /*mm4 = a3 a2 a1 a0 \
   mm0 = b3 b2 b1 b0 \
   mm6 = c3 c2 c1 c0 \
   mm5 = d3 d2 d1 d0*/ \
 "movq %%mm4,%%mm7\n\t" \
 "punpcklwd %%mm0,%%mm4\n\t" \
 "punpckhwd %%mm0,%%mm7\n\t" \
 "movq %%mm6,%%mm0\n\t" \
 "punpcklwd %%mm5,%%mm6\n\t" \
 "punpckhwd %%mm5,%%mm0\n\t" \
 /*mm4 = b1 a1 b0 a0 \
   mm7 = b3 a3 b2 a2 \
   mm6 = d1 c1 d0 c0 \
   mm0 = d3 c3 d2 c2*/ \
 "movq %%mm4,%%mm5\n\t" \
 "punpckldq %%mm6,%%mm4\n\t" \
 "punpckhdq %%mm6,%%mm5\n\t" \
 "movq %%mm7,%%mm6\n\t" \
 "punpckhdq %%mm0,%%mm7\n\t" \
 "punpckldq %%mm0,%%mm6\n\t" \
 /*mm4 = d0 c0 b0 a0 \
   mm5 = d1 c1 b1 a1 \
   mm6 = d2 c2 b2 a2 \
   mm7 = d3 c3 b3 a3*/ \

/*MMX implementation of the fDCT.*/
|
||||
void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
||||
OC_ALIGN8(ogg_int16_t buf[64]);
|
||||
ptrdiff_t a;
|
||||
__asm__ __volatile__(
|
||||
/*Add two extra bits of working precision to improve accuracy; any more and
|
||||
we could overflow.*/
|
||||
/*We also add biases to correct for some systematic error that remains in
|
||||
the full fDCT->iDCT round trip.*/
|
||||
"movq 0x00(%[x]),%%mm0\n\t"
|
||||
"movq 0x10(%[x]),%%mm1\n\t"
|
||||
"movq 0x20(%[x]),%%mm2\n\t"
|
||||
"movq 0x30(%[x]),%%mm3\n\t"
|
||||
"pcmpeqb %%mm4,%%mm4\n\t"
|
||||
"pxor %%mm7,%%mm7\n\t"
|
||||
"movq %%mm0,%%mm5\n\t"
|
||||
"psllw $2,%%mm0\n\t"
|
||||
"pcmpeqw %%mm7,%%mm5\n\t"
|
||||
"movq 0x70(%[x]),%%mm7\n\t"
|
||||
"psllw $2,%%mm1\n\t"
|
||||
"psubw %%mm4,%%mm5\n\t"
|
||||
"psllw $2,%%mm2\n\t"
|
||||
"mov $1,%[a]\n\t"
|
||||
"pslld $16,%%mm5\n\t"
|
||||
"movd %[a],%%mm6\n\t"
|
||||
"psllq $16,%%mm5\n\t"
|
||||
"mov $0x10001,%[a]\n\t"
|
||||
"psllw $2,%%mm3\n\t"
|
||||
"movd %[a],%%mm4\n\t"
|
||||
"punpckhwd %%mm6,%%mm5\n\t"
|
||||
"psubw %%mm6,%%mm1\n\t"
|
||||
"movq 0x60(%[x]),%%mm6\n\t"
|
||||
"paddw %%mm5,%%mm0\n\t"
|
||||
"movq 0x50(%[x]),%%mm5\n\t"
|
||||
"paddw %%mm4,%%mm0\n\t"
|
||||
"movq 0x40(%[x]),%%mm4\n\t"
|
||||
/*We inline stage1 of the transform here so we can get better instruction
|
||||
scheduling with the shifts.*/
|
||||
/*mm0=t7'=t0-t7*/
|
||||
"psllw $2,%%mm7\n\t"
|
||||
"psubw %%mm7,%%mm0\n\t"
|
||||
"psllw $2,%%mm6\n\t"
|
||||
"paddw %%mm7,%%mm7\n\t"
|
||||
/*mm1=t6'=t1-t6*/
|
||||
"psllw $2,%%mm5\n\t"
|
||||
"psubw %%mm6,%%mm1\n\t"
|
||||
"psllw $2,%%mm4\n\t"
|
||||
"paddw %%mm6,%%mm6\n\t"
|
||||
/*mm2=t5'=t2-t5*/
|
||||
"psubw %%mm5,%%mm2\n\t"
|
||||
"paddw %%mm5,%%mm5\n\t"
|
||||
/*mm3=t4'=t3-t4*/
|
||||
"psubw %%mm4,%%mm3\n\t"
|
||||
"paddw %%mm4,%%mm4\n\t"
|
||||
/*mm7=t0'=t0+t7*/
|
||||
"paddw %%mm0,%%mm7\n\t"
|
||||
/*mm6=t1'=t1+t6*/
|
||||
"paddw %%mm1,%%mm6\n\t"
|
||||
/*mm5=t2'=t2+t5*/
|
||||
"paddw %%mm2,%%mm5\n\t"
|
||||
/*mm4=t3'=t3+t4*/
|
||||
"paddw %%mm3,%%mm4\n\t"
|
||||
OC_FDCT8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")
|
||||
OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")
|
||||
/*Swap out this 8x4 block for the next one.*/
|
||||
"movq 0x08(%[x]),%%mm0\n\t"
|
||||
"movq %%mm7,0x30(%[y])\n\t"
|
||||
"movq 0x78(%[x]),%%mm7\n\t"
|
||||
"movq %%mm1,0x50(%[y])\n\t"
|
||||
"movq 0x18(%[x]),%%mm1\n\t"
|
||||
"movq %%mm6,0x20(%[y])\n\t"
|
||||
"movq 0x68(%[x]),%%mm6\n\t"
|
||||
"movq %%mm2,0x60(%[y])\n\t"
|
||||
"movq 0x28(%[x]),%%mm2\n\t"
|
||||
"movq %%mm5,0x10(%[y])\n\t"
|
||||
"movq 0x58(%[x]),%%mm5\n\t"
|
||||
"movq %%mm3,0x70(%[y])\n\t"
|
||||
"movq 0x38(%[x]),%%mm3\n\t"
|
||||
/*And increase its working precision, too.*/
|
||||
"psllw $2,%%mm0\n\t"
|
||||
"movq %%mm4,0x00(%[y])\n\t"
|
||||
"psllw $2,%%mm7\n\t"
|
||||
"movq 0x48(%[x]),%%mm4\n\t"
|
||||
/*We inline stage1 of the transform here so we can get better instruction
|
||||
scheduling with the shifts.*/
|
||||
/*mm0=t7'=t0-t7*/
|
||||
"psubw %%mm7,%%mm0\n\t"
|
||||
"psllw $2,%%mm1\n\t"
|
||||
"paddw %%mm7,%%mm7\n\t"
|
||||
"psllw $2,%%mm6\n\t"
|
||||
/*mm1=t6'=t1-t6*/
|
||||
"psubw %%mm6,%%mm1\n\t"
|
||||
"psllw $2,%%mm2\n\t"
|
||||
"paddw %%mm6,%%mm6\n\t"
|
||||
"psllw $2,%%mm5\n\t"
|
||||
/*mm2=t5'=t2-t5*/
|
||||
"psubw %%mm5,%%mm2\n\t"
|
||||
"psllw $2,%%mm3\n\t"
|
||||
"paddw %%mm5,%%mm5\n\t"
|
||||
"psllw $2,%%mm4\n\t"
|
||||
/*mm3=t4'=t3-t4*/
|
||||
"psubw %%mm4,%%mm3\n\t"
|
||||
"paddw %%mm4,%%mm4\n\t"
|
||||
/*mm7=t0'=t0+t7*/
|
||||
"paddw %%mm0,%%mm7\n\t"
|
||||
/*mm6=t1'=t1+t6*/
|
||||
"paddw %%mm1,%%mm6\n\t"
|
||||
/*mm5=t2'=t2+t5*/
|
||||
"paddw %%mm2,%%mm5\n\t"
|
||||
/*mm4=t3'=t3+t4*/
|
||||
"paddw %%mm3,%%mm4\n\t"
|
||||
OC_FDCT8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")
|
||||
OC_TRANSPOSE8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")
|
||||
/*Here the first 4x4 block of output from the last transpose is the second
|
||||
4x4 block of input for the next transform.
|
||||
We have cleverly arranged that it already be in the appropriate place,
|
||||
so we only have to do half the stores and loads.*/
|
||||
"movq 0x00(%[y]),%%mm0\n\t"
|
||||
"movq %%mm1,0x58(%[y])\n\t"
|
||||
"movq 0x10(%[y]),%%mm1\n\t"
|
||||
"movq %%mm2,0x68(%[y])\n\t"
|
||||
"movq 0x20(%[y]),%%mm2\n\t"
|
||||
"movq %%mm3,0x78(%[y])\n\t"
|
||||
"movq 0x30(%[y]),%%mm3\n\t"
|
||||
OC_FDCT_STAGE1_8x4
|
||||
OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
|
||||
/*mm2={-2}x4*/
|
||||
"pcmpeqw %%mm2,%%mm2\n\t"
|
||||
"paddw %%mm2,%%mm2\n\t"
|
||||
/*Round and store the results (no transpose).*/
|
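/*Added note (not an upstream comment): subtracting the {-2}x4 constant in
  mm2 adds 2 before each arithmetic shift, so the psubw/psraw pairs below
  compute (v+2)>>2, a rounded removal of the two extra bits of working
  precision added at the start of the transform.*/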
"movq 0x10(%[y]),%%mm7\n\t"
"psubw %%mm2,%%mm4\n\t"
"psubw %%mm2,%%mm6\n\t"
"psraw $2,%%mm4\n\t"
"psubw %%mm2,%%mm0\n\t"
"movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
"movq 0x30(%[y]),%%mm4\n\t"
"psraw $2,%%mm6\n\t"
"psubw %%mm2,%%mm5\n\t"
"movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
"psraw $2,%%mm0\n\t"
"psubw %%mm2,%%mm3\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x40,buf)"\n\t"
"psraw $2,%%mm5\n\t"
"psubw %%mm2,%%mm1\n\t"
"movq %%mm5,"OC_MEM_OFFS(0x50,buf)"\n\t"
"psraw $2,%%mm3\n\t"
"psubw %%mm2,%%mm7\n\t"
"movq %%mm3,"OC_MEM_OFFS(0x60,buf)"\n\t"
"psraw $2,%%mm1\n\t"
"psubw %%mm2,%%mm4\n\t"
"movq %%mm1,"OC_MEM_OFFS(0x70,buf)"\n\t"
"psraw $2,%%mm7\n\t"
"movq %%mm7,"OC_MEM_OFFS(0x10,buf)"\n\t"
"psraw $2,%%mm4\n\t"
"movq %%mm4,"OC_MEM_OFFS(0x30,buf)"\n\t"
/*Load the next block.*/
"movq 0x40(%[y]),%%mm0\n\t"
"movq 0x78(%[y]),%%mm7\n\t"
"movq 0x50(%[y]),%%mm1\n\t"
"movq 0x68(%[y]),%%mm6\n\t"
"movq 0x60(%[y]),%%mm2\n\t"
"movq 0x58(%[y]),%%mm5\n\t"
"movq 0x70(%[y]),%%mm3\n\t"
"movq 0x48(%[y]),%%mm4\n\t"
OC_FDCT_STAGE1_8x4
OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
/*mm2={-2}x4*/
"pcmpeqw %%mm2,%%mm2\n\t"
"paddw %%mm2,%%mm2\n\t"
/*Round and store the results (no transpose).*/
"movq 0x50(%[y]),%%mm7\n\t"
"psubw %%mm2,%%mm4\n\t"
"psubw %%mm2,%%mm6\n\t"
"psraw $2,%%mm4\n\t"
"psubw %%mm2,%%mm0\n\t"
"movq %%mm4,"OC_MEM_OFFS(0x08,buf)"\n\t"
"movq 0x70(%[y]),%%mm4\n\t"
"psraw $2,%%mm6\n\t"
"psubw %%mm2,%%mm5\n\t"
"movq %%mm6,"OC_MEM_OFFS(0x28,buf)"\n\t"
"psraw $2,%%mm0\n\t"
"psubw %%mm2,%%mm3\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x48,buf)"\n\t"
"psraw $2,%%mm5\n\t"
"psubw %%mm2,%%mm1\n\t"
"movq %%mm5,"OC_MEM_OFFS(0x58,buf)"\n\t"
"psraw $2,%%mm3\n\t"
"psubw %%mm2,%%mm7\n\t"
"movq %%mm3,"OC_MEM_OFFS(0x68,buf)"\n\t"
"psraw $2,%%mm1\n\t"
"psubw %%mm2,%%mm4\n\t"
"movq %%mm1,"OC_MEM_OFFS(0x78,buf)"\n\t"
"psraw $2,%%mm7\n\t"
"movq %%mm7,"OC_MEM_OFFS(0x18,buf)"\n\t"
"psraw $2,%%mm4\n\t"
"movq %%mm4,"OC_MEM_OFFS(0x38,buf)"\n\t"
/*Final transpose and zig-zag.*/
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
"movq "OC_MEM_OFFS(16*_row,buf)","_reg"\n\t" \

#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
"movq "OC_MEM_OFFS(16*_row+8,buf)","_reg"\n\t" \

OC_TRANSPOSE_ZIG_ZAG_MMXEXT
#undef OC_ZZ_LOAD_ROW_LO
#undef OC_ZZ_LOAD_ROW_HI
:[a]"=&r"(a),[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
:[y]"r"(_y),[x]"r"(_x)
:"memory"
);
}

#endif
368
thirdparty/libtheora/x86/mmxfrag.c
vendored
Normal file
@@ -0,0 +1,368 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009,2025 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************

function:

********************************************************************/

/*MMX acceleration of fragment reconstruction for motion compensation.
Originally written by Rudolf Marek.
Additional optimization by Nils Pipenbrinck.
Note: Loops are unrolled for best performance.
The iteration each instruction belongs to is marked in the comments as #i.*/
#include <stddef.h>
#include "x86int.h"

#if defined(OC_X86_ASM)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
between rows.*/
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
do{ \
const unsigned char *src; \
unsigned char *dst; \
ptrdiff_t ystride3; \
src=(_src); \
dst=(_dst); \
__asm__ __volatile__( \
/*src+0*ystride*/ \
"movq (%[src]),%%mm0\n\t" \
/*src+1*ystride*/ \
"movq (%[src],%[ystride]),%%mm1\n\t" \
/*ystride3=ystride*3*/ \
"lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
/*src+2*ystride*/ \
"movq (%[src],%[ystride],2),%%mm2\n\t" \
/*src+3*ystride*/ \
"movq (%[src],%[ystride3]),%%mm3\n\t" \
/*dst+0*ystride*/ \
"movq %%mm0,(%[dst])\n\t" \
/*dst+1*ystride*/ \
"movq %%mm1,(%[dst],%[ystride])\n\t" \
/*Pointer to next 4.*/ \
"lea (%[src],%[ystride],4),%[src]\n\t" \
/*dst+2*ystride*/ \
"movq %%mm2,(%[dst],%[ystride],2)\n\t" \
/*dst+3*ystride*/ \
"movq %%mm3,(%[dst],%[ystride3])\n\t" \
/*Pointer to next 4.*/ \
"lea (%[dst],%[ystride],4),%[dst]\n\t" \
/*src+0*ystride*/ \
"movq (%[src]),%%mm0\n\t" \
/*src+1*ystride*/ \
"movq (%[src],%[ystride]),%%mm1\n\t" \
/*src+2*ystride*/ \
"movq (%[src],%[ystride],2),%%mm2\n\t" \
/*src+3*ystride*/ \
"movq (%[src],%[ystride3]),%%mm3\n\t" \
/*dst+0*ystride*/ \
"movq %%mm0,(%[dst])\n\t" \
/*dst+1*ystride*/ \
"movq %%mm1,(%[dst],%[ystride])\n\t" \
/*dst+2*ystride*/ \
"movq %%mm2,(%[dst],%[ystride],2)\n\t" \
/*dst+3*ystride*/ \
"movq %%mm3,(%[dst],%[ystride3])\n\t" \
:[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
:[ystride]"r"((ptrdiff_t)(_ystride)) \
:"memory" \
); \
} \
while(0)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
between rows.*/
void oc_frag_copy_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride){
OC_FRAG_COPY_MMX(_dst,_src,_ystride);
}

/*Copies the fragments specified by the lists of fragment indices from one
frame to another.
_dst_frame: The reference frame to copy to.
_src_frame: The reference frame to copy from.
_ystride: The row stride of the reference frames.
_fragis: A pointer to a list of fragment indices.
_nfragis: The number of fragment indices to copy.
_frag_buf_offs: The offsets of fragments in the reference frames.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
const unsigned char *_src_frame,int _ystride,
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
ptrdiff_t fragii;
for(fragii=0;fragii<_nfragis;fragii++){
ptrdiff_t frag_buf_off;
frag_buf_off=_frag_buf_offs[_fragis[fragii]];
OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
_src_frame+frag_buf_off,_ystride);
}
}

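/*Reconstructs an intra-coded fragment: the +128 bias constructed in mm0
  below recenters the signed residue on the unsigned pixel range before
  packuswb clamps each result to [0,255] (summary added here; not an
  upstream comment).*/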
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
const ogg_int16_t *_residue){
__asm__ __volatile__(
/*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
"pcmpeqw %%mm0,%%mm0\n\t"
/*#0 Load low residue.*/
"movq 0*8(%[residue]),%%mm1\n\t"
/*#0 Load high residue.*/
"movq 1*8(%[residue]),%%mm2\n\t"
/*Set mm0 to 0x8000800080008000.*/
"psllw $15,%%mm0\n\t"
/*#1 Load low residue.*/
"movq 2*8(%[residue]),%%mm3\n\t"
/*#1 Load high residue.*/
"movq 3*8(%[residue]),%%mm4\n\t"
/*Set mm0 to 0x0080008000800080.*/
"psrlw $8,%%mm0\n\t"
/*#2 Load low residue.*/
"movq 4*8(%[residue]),%%mm5\n\t"
/*#2 Load high residue.*/
"movq 5*8(%[residue]),%%mm6\n\t"
/*#0 Bias low residue.*/
"paddsw %%mm0,%%mm1\n\t"
/*#0 Bias high residue.*/
"paddsw %%mm0,%%mm2\n\t"
/*#0 Pack to byte.*/
"packuswb %%mm2,%%mm1\n\t"
/*#1 Bias low residue.*/
"paddsw %%mm0,%%mm3\n\t"
/*#1 Bias high residue.*/
"paddsw %%mm0,%%mm4\n\t"
/*#1 Pack to byte.*/
"packuswb %%mm4,%%mm3\n\t"
/*#2 Bias low residue.*/
"paddsw %%mm0,%%mm5\n\t"
/*#2 Bias high residue.*/
"paddsw %%mm0,%%mm6\n\t"
/*#2 Pack to byte.*/
"packuswb %%mm6,%%mm5\n\t"
/*#0 Write row.*/
"movq %%mm1,(%[dst])\n\t"
/*#1 Write row.*/
"movq %%mm3,(%[dst],%[ystride])\n\t"
/*#2 Write row.*/
"movq %%mm5,(%[dst],%[ystride],2)\n\t"
/*#3 Load low residue.*/
"movq 6*8(%[residue]),%%mm1\n\t"
/*#3 Load high residue.*/
"movq 7*8(%[residue]),%%mm2\n\t"
/*#4 Load low residue.*/
"movq 8*8(%[residue]),%%mm3\n\t"
/*#4 Load high residue.*/
"movq 9*8(%[residue]),%%mm4\n\t"
/*#5 Load low residue.*/
"movq 10*8(%[residue]),%%mm5\n\t"
/*#5 Load high residue.*/
"movq 11*8(%[residue]),%%mm6\n\t"
/*#3 Bias low residue.*/
"paddsw %%mm0,%%mm1\n\t"
/*#3 Bias high residue.*/
"paddsw %%mm0,%%mm2\n\t"
/*#3 Pack to byte.*/
"packuswb %%mm2,%%mm1\n\t"
/*#4 Bias low residue.*/
"paddsw %%mm0,%%mm3\n\t"
/*#4 Bias high residue.*/
"paddsw %%mm0,%%mm4\n\t"
/*#4 Pack to byte.*/
"packuswb %%mm4,%%mm3\n\t"
/*#5 Bias low residue.*/
"paddsw %%mm0,%%mm5\n\t"
/*#5 Bias high residue.*/
"paddsw %%mm0,%%mm6\n\t"
/*#5 Pack to byte.*/
"packuswb %%mm6,%%mm5\n\t"
/*#3 Write row.*/
"movq %%mm1,(%[dst],%[ystride3])\n\t"
/*#4 Write row.*/
"movq %%mm3,(%[dst4])\n\t"
/*#5 Write row.*/
"movq %%mm5,(%[dst4],%[ystride])\n\t"
/*#6 Load low residue.*/
"movq 12*8(%[residue]),%%mm1\n\t"
/*#6 Load high residue.*/
"movq 13*8(%[residue]),%%mm2\n\t"
/*#7 Load low residue.*/
"movq 14*8(%[residue]),%%mm3\n\t"
/*#7 Load high residue.*/
"movq 15*8(%[residue]),%%mm4\n\t"
/*#6 Bias low residue.*/
"paddsw %%mm0,%%mm1\n\t"
/*#6 Bias high residue.*/
"paddsw %%mm0,%%mm2\n\t"
/*#6 Pack to byte.*/
"packuswb %%mm2,%%mm1\n\t"
/*#7 Bias low residue.*/
"paddsw %%mm0,%%mm3\n\t"
/*#7 Bias high residue.*/
"paddsw %%mm0,%%mm4\n\t"
/*#7 Pack to byte.*/
"packuswb %%mm4,%%mm3\n\t"
/*#6 Write row.*/
"movq %%mm1,(%[dst4],%[ystride],2)\n\t"
/*#7 Write row.*/
"movq %%mm3,(%[dst4],%[ystride3])\n\t"
:
:[residue]"r"(_residue),
[dst]"r"(_dst),
[dst4]"r"(_dst+(_ystride*4)),
[ystride]"r"((ptrdiff_t)_ystride),
[ystride3]"r"((ptrdiff_t)_ystride*3)
:"memory"
);
}

void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
int _ystride,const ogg_int16_t *_residue){
int i;
/*Zero mm0.*/
__asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
for(i=4;i-->0;){
__asm__ __volatile__(
/*#0 Load source.*/
"movq (%[src]),%%mm3\n\t"
/*#1 Load source.*/
"movq (%[src],%[ystride]),%%mm7\n\t"
/*#0 Get copy of src.*/
"movq %%mm3,%%mm4\n\t"
/*#0 Expand high source.*/
"punpckhbw %%mm0,%%mm4\n\t"
/*#0 Expand low source.*/
"punpcklbw %%mm0,%%mm3\n\t"
/*#0 Add residue high.*/
"paddsw 8(%[residue]),%%mm4\n\t"
/*#1 Get copy of src.*/
"movq %%mm7,%%mm2\n\t"
/*#0 Add residue low.*/
"paddsw (%[residue]), %%mm3\n\t"
/*#1 Expand high source.*/
"punpckhbw %%mm0,%%mm2\n\t"
/*#0 Pack final row pixels.*/
"packuswb %%mm4,%%mm3\n\t"
/*#1 Expand low source.*/
"punpcklbw %%mm0,%%mm7\n\t"
/*#1 Add residue low.*/
"paddsw 16(%[residue]),%%mm7\n\t"
/*#1 Add residue high.*/
"paddsw 24(%[residue]),%%mm2\n\t"
/*Advance residue.*/
"lea 32(%[residue]),%[residue]\n\t"
/*#1 Pack final row pixels.*/
"packuswb %%mm2,%%mm7\n\t"
/*Advance src.*/
"lea (%[src],%[ystride],2),%[src]\n\t"
/*#0 Write row.*/
"movq %%mm3,(%[dst])\n\t"
/*#1 Write row.*/
"movq %%mm7,(%[dst],%[ystride])\n\t"
/*Advance dst.*/
"lea (%[dst],%[ystride],2),%[dst]\n\t"
:[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src)
:[ystride]"r"((ptrdiff_t)_ystride)
:"memory"
);
}
}

void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
int i;
/*Zero mm7.*/
__asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
for(i=4;i-->0;){
__asm__ __volatile__(
/*#0 Load src1.*/
"movq (%[src1]),%%mm0\n\t"
/*#0 Load src2.*/
"movq (%[src2]),%%mm2\n\t"
/*#0 Copy src1.*/
"movq %%mm0,%%mm1\n\t"
/*#0 Copy src2.*/
"movq %%mm2,%%mm3\n\t"
/*#1 Load src1.*/
"movq (%[src1],%[ystride]),%%mm4\n\t"
/*#0 Unpack lower src1.*/
"punpcklbw %%mm7,%%mm0\n\t"
/*#1 Load src2.*/
"movq (%[src2],%[ystride]),%%mm5\n\t"
/*#0 Unpack higher src1.*/
"punpckhbw %%mm7,%%mm1\n\t"
/*#0 Unpack lower src2.*/
"punpcklbw %%mm7,%%mm2\n\t"
/*#0 Unpack higher src2.*/
"punpckhbw %%mm7,%%mm3\n\t"
/*Advance src1 ptr.*/
"lea (%[src1],%[ystride],2),%[src1]\n\t"
/*Advance src2 ptr.*/
"lea (%[src2],%[ystride],2),%[src2]\n\t"
/*#0 Lower src1+src2.*/
"paddsw %%mm2,%%mm0\n\t"
/*#0 Higher src1+src2.*/
"paddsw %%mm3,%%mm1\n\t"
/*#1 Copy src1.*/
"movq %%mm4,%%mm2\n\t"
/*#0 Build lo average.*/
"psraw $1,%%mm0\n\t"
/*#1 Copy src2.*/
"movq %%mm5,%%mm3\n\t"
/*#1 Unpack lower src1.*/
"punpcklbw %%mm7,%%mm4\n\t"
/*#0 Build hi average.*/
"psraw $1,%%mm1\n\t"
/*#1 Unpack higher src1.*/
"punpckhbw %%mm7,%%mm2\n\t"
/*#0 low+=residue.*/
"paddsw (%[residue]),%%mm0\n\t"
/*#1 Unpack lower src2.*/
"punpcklbw %%mm7,%%mm5\n\t"
/*#0 high+=residue.*/
"paddsw 8(%[residue]),%%mm1\n\t"
/*#1 Unpack higher src2.*/
"punpckhbw %%mm7,%%mm3\n\t"
/*#1 Lower src1+src2.*/
"paddsw %%mm4,%%mm5\n\t"
/*#0 Pack and saturate.*/
"packuswb %%mm1,%%mm0\n\t"
/*#1 Higher src1+src2.*/
"paddsw %%mm2,%%mm3\n\t"
/*#0 Write row.*/
"movq %%mm0,(%[dst])\n\t"
/*#1 Build lo average.*/
"psraw $1,%%mm5\n\t"
/*#1 Build hi average.*/
"psraw $1,%%mm3\n\t"
/*#1 low+=residue.*/
"paddsw 16(%[residue]),%%mm5\n\t"
/*#1 high+=residue.*/
"paddsw 24(%[residue]),%%mm3\n\t"
/*#1 Pack and saturate.*/
"packuswb %%mm3,%%mm5\n\t"
/*#1 Write row.*/
"movq %%mm5,(%[dst],%[ystride])\n\t"
/*Advance residue ptr.*/
"add $32,%[residue]\n\t"
/*Advance dest ptr.*/
"lea (%[dst],%[ystride],2),%[dst]\n\t"
:[dst]"+r"(_dst),[residue]"+r"(_residue),
[src1]"+r"(_src1),[src2]"+r"(_src2)
:[ystride]"r"((ptrdiff_t)_ystride)
:"memory"
);
}
}

void oc_restore_fpu_mmx(void){
__asm__ __volatile__("emms\n\t");
}
#endif
558
thirdparty/libtheora/x86/mmxidct.c
vendored
Normal file
@@ -0,0 +1,558 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009,2025 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************

function:

********************************************************************/

/*MMX acceleration of Theora's iDCT.
Originally written by Rudolf Marek, based on code from On2's VP3.*/
#include "x86int.h"
#include "../dct.h"

#if defined(OC_X86_ASM)

/*These are offsets into the table of constants below.*/
/*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
#define OC_COSINE_OFFSET (0)
/*A row of 8's.*/
#define OC_EIGHT_OFFSET (56)


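/*A note on the multiply idiom used below (our reading of the code, not an
  upstream comment): pmulhw computes x*c>>16 for a signed 16-bit constant c,
  but the larger cosine constants exceed 32767, so their 16-bit bit patterns
  are interpreted as c-65536; the original operand is added back after the
  multiply, since x*c>>16==(x*(c-65536)>>16)+x.*/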
/*38 cycles*/
#define OC_IDCT_BEGIN(_y,_x) \
"#OC_IDCT_BEGIN\n\t" \
"movq "OC_I(3,_x)",%%mm2\n\t" \
"movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
"movq %%mm2,%%mm4\n\t" \
"movq "OC_J(5,_x)",%%mm7\n\t" \
"pmulhw %%mm6,%%mm4\n\t" \
"movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
"pmulhw %%mm7,%%mm6\n\t" \
"movq %%mm1,%%mm5\n\t" \
"pmulhw %%mm2,%%mm1\n\t" \
"movq "OC_I(1,_x)",%%mm3\n\t" \
"pmulhw %%mm7,%%mm5\n\t" \
"movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
"paddw %%mm2,%%mm4\n\t" \
"paddw %%mm7,%%mm6\n\t" \
"paddw %%mm1,%%mm2\n\t" \
"movq "OC_J(7,_x)",%%mm1\n\t" \
"paddw %%mm5,%%mm7\n\t" \
"movq %%mm0,%%mm5\n\t" \
"pmulhw %%mm3,%%mm0\n\t" \
"paddw %%mm7,%%mm4\n\t" \
"pmulhw %%mm1,%%mm5\n\t" \
"movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \
"psubw %%mm2,%%mm6\n\t" \
"paddw %%mm3,%%mm0\n\t" \
"pmulhw %%mm7,%%mm3\n\t" \
"movq "OC_I(2,_x)",%%mm2\n\t" \
"pmulhw %%mm1,%%mm7\n\t" \
"paddw %%mm1,%%mm5\n\t" \
"movq %%mm2,%%mm1\n\t" \
"pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \
"psubw %%mm5,%%mm3\n\t" \
"movq "OC_J(6,_x)",%%mm5\n\t" \
"paddw %%mm7,%%mm0\n\t" \
"movq %%mm5,%%mm7\n\t" \
"psubw %%mm4,%%mm0\n\t" \
"pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
"paddw %%mm1,%%mm2\n\t" \
"pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
"paddw %%mm4,%%mm4\n\t" \
"paddw %%mm0,%%mm4\n\t" \
"psubw %%mm6,%%mm3\n\t" \
"paddw %%mm7,%%mm5\n\t" \
"paddw %%mm6,%%mm6\n\t" \
"pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
"paddw %%mm3,%%mm6\n\t" \
"movq %%mm4,"OC_I(1,_y)"\n\t" \
"psubw %%mm5,%%mm1\n\t" \
"movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
"movq %%mm3,%%mm5\n\t" \
"pmulhw %%mm4,%%mm3\n\t" \
"paddw %%mm2,%%mm7\n\t" \
"movq %%mm6,"OC_I(2,_y)"\n\t" \
"movq %%mm0,%%mm2\n\t" \
"movq "OC_I(0,_x)",%%mm6\n\t" \
"pmulhw %%mm4,%%mm0\n\t" \
"paddw %%mm3,%%mm5\n\t" \
"movq "OC_J(4,_x)",%%mm3\n\t" \
"psubw %%mm1,%%mm5\n\t" \
"paddw %%mm0,%%mm2\n\t" \
"psubw %%mm3,%%mm6\n\t" \
"movq %%mm6,%%mm0\n\t" \
"pmulhw %%mm4,%%mm6\n\t" \
"paddw %%mm3,%%mm3\n\t" \
"paddw %%mm1,%%mm1\n\t" \
"paddw %%mm0,%%mm3\n\t" \
"paddw %%mm5,%%mm1\n\t" \
"pmulhw %%mm3,%%mm4\n\t" \
"paddw %%mm0,%%mm6\n\t" \
"psubw %%mm2,%%mm6\n\t" \
"paddw %%mm2,%%mm2\n\t" \
"movq "OC_I(1,_y)",%%mm0\n\t" \
"paddw %%mm6,%%mm2\n\t" \
"paddw %%mm3,%%mm4\n\t" \
"psubw %%mm1,%%mm2\n\t" \
"#end OC_IDCT_BEGIN\n\t" \

/*38+8=46 cycles.*/
#define OC_ROW_IDCT(_y,_x) \
"#OC_ROW_IDCT\n" \
OC_IDCT_BEGIN(_y,_x) \
/*r3=D'*/ \
"movq "OC_I(2,_y)",%%mm3\n\t" \
/*r4=E'=E-G*/ \
"psubw %%mm7,%%mm4\n\t" \
/*r1=H'+H'*/ \
"paddw %%mm1,%%mm1\n\t" \
/*r7=G+G*/ \
"paddw %%mm7,%%mm7\n\t" \
/*r1=R1=A''+H'*/ \
"paddw %%mm2,%%mm1\n\t" \
/*r7=G'=E+G*/ \
"paddw %%mm4,%%mm7\n\t" \
/*r4=R4=E'-D'*/ \
"psubw %%mm3,%%mm4\n\t" \
"paddw %%mm3,%%mm3\n\t" \
/*r6=R6=F'-B''*/ \
"psubw %%mm5,%%mm6\n\t" \
"paddw %%mm5,%%mm5\n\t" \
/*r3=R3=E'+D'*/ \
"paddw %%mm4,%%mm3\n\t" \
/*r5=R5=F'+B''*/ \
"paddw %%mm6,%%mm5\n\t" \
/*r7=R7=G'-C'*/ \
"psubw %%mm0,%%mm7\n\t" \
"paddw %%mm0,%%mm0\n\t" \
/*Save R1.*/ \
"movq %%mm1,"OC_I(1,_y)"\n\t" \
/*r0=R0=G'+C'*/ \
"paddw %%mm7,%%mm0\n\t" \
"#end OC_ROW_IDCT\n\t" \

/*The following macro does two 4x4 transposes in place.
At entry, we assume:
r0 = a3 a2 a1 a0
I(1) = b3 b2 b1 b0
r2 = c3 c2 c1 c0
r3 = d3 d2 d1 d0

r4 = e3 e2 e1 e0
r5 = f3 f2 f1 f0
r6 = g3 g2 g1 g0
r7 = h3 h2 h1 h0

At exit, we have:
I(0) = d0 c0 b0 a0
I(1) = d1 c1 b1 a1
I(2) = d2 c2 b2 a2
I(3) = d3 c3 b3 a3

J(4) = h0 g0 f0 e0
J(5) = h1 g1 f1 e1
J(6) = h2 g2 f2 e2
J(7) = h3 g3 f3 e3

I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.

Since r1 is free at entry, we calculate the Js first.*/
/*19 cycles.*/
#define OC_TRANSPOSE(_y) \
"#OC_TRANSPOSE\n\t" \
"movq %%mm4,%%mm1\n\t" \
"punpcklwd %%mm5,%%mm4\n\t" \
"movq %%mm0,"OC_I(0,_y)"\n\t" \
"punpckhwd %%mm5,%%mm1\n\t" \
"movq %%mm6,%%mm0\n\t" \
"punpcklwd %%mm7,%%mm6\n\t" \
"movq %%mm4,%%mm5\n\t" \
"punpckldq %%mm6,%%mm4\n\t" \
"punpckhdq %%mm6,%%mm5\n\t" \
"movq %%mm1,%%mm6\n\t" \
"movq %%mm4,"OC_J(4,_y)"\n\t" \
"punpckhwd %%mm7,%%mm0\n\t" \
"movq %%mm5,"OC_J(5,_y)"\n\t" \
"punpckhdq %%mm0,%%mm6\n\t" \
"movq "OC_I(0,_y)",%%mm4\n\t" \
"punpckldq %%mm0,%%mm1\n\t" \
"movq "OC_I(1,_y)",%%mm5\n\t" \
"movq %%mm4,%%mm0\n\t" \
"movq %%mm6,"OC_J(7,_y)"\n\t" \
"punpcklwd %%mm5,%%mm0\n\t" \
"movq %%mm1,"OC_J(6,_y)"\n\t" \
"punpckhwd %%mm5,%%mm4\n\t" \
"movq %%mm2,%%mm5\n\t" \
"punpcklwd %%mm3,%%mm2\n\t" \
"movq %%mm0,%%mm1\n\t" \
"punpckldq %%mm2,%%mm0\n\t" \
"punpckhdq %%mm2,%%mm1\n\t" \
"movq %%mm4,%%mm2\n\t" \
"movq %%mm0,"OC_I(0,_y)"\n\t" \
"punpckhwd %%mm3,%%mm5\n\t" \
"movq %%mm1,"OC_I(1,_y)"\n\t" \
"punpckhdq %%mm5,%%mm4\n\t" \
"punpckldq %%mm5,%%mm2\n\t" \
"movq %%mm4,"OC_I(3,_y)"\n\t" \
"movq %%mm2,"OC_I(2,_y)"\n\t" \
"#end OC_TRANSPOSE\n\t" \

/*38+19=57 cycles.*/
#define OC_COLUMN_IDCT(_y) \
"#OC_COLUMN_IDCT\n" \
OC_IDCT_BEGIN(_y,_y) \
"paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
/*r1=H'+H'*/ \
"paddw %%mm1,%%mm1\n\t" \
/*r1=R1=A''+H'*/ \
"paddw %%mm2,%%mm1\n\t" \
/*r2=NR2*/ \
"psraw $4,%%mm2\n\t" \
/*r4=E'=E-G*/ \
"psubw %%mm7,%%mm4\n\t" \
/*r1=NR1*/ \
"psraw $4,%%mm1\n\t" \
/*r3=D'*/ \
"movq "OC_I(2,_y)",%%mm3\n\t" \
/*r7=G+G*/ \
"paddw %%mm7,%%mm7\n\t" \
/*Store NR2 at I(2).*/ \
"movq %%mm2,"OC_I(2,_y)"\n\t" \
/*r7=G'=E+G*/ \
"paddw %%mm4,%%mm7\n\t" \
/*Store NR1 at I(1).*/ \
"movq %%mm1,"OC_I(1,_y)"\n\t" \
/*r4=R4=E'-D'*/ \
"psubw %%mm3,%%mm4\n\t" \
"paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
/*r3=D'+D'*/ \
"paddw %%mm3,%%mm3\n\t" \
/*r3=R3=E'+D'*/ \
"paddw %%mm4,%%mm3\n\t" \
/*r4=NR4*/ \
"psraw $4,%%mm4\n\t" \
/*r6=R6=F'-B''*/ \
"psubw %%mm5,%%mm6\n\t" \
/*r3=NR3*/ \
"psraw $4,%%mm3\n\t" \
"paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
/*r5=B''+B''*/ \
"paddw %%mm5,%%mm5\n\t" \
/*r5=R5=F'+B''*/ \
"paddw %%mm6,%%mm5\n\t" \
/*r6=NR6*/ \
"psraw $4,%%mm6\n\t" \
/*Store NR4 at J(4).*/ \
"movq %%mm4,"OC_J(4,_y)"\n\t" \
/*r5=NR5*/ \
"psraw $4,%%mm5\n\t" \
/*Store NR3 at I(3).*/ \
"movq %%mm3,"OC_I(3,_y)"\n\t" \
/*r7=R7=G'-C'*/ \
"psubw %%mm0,%%mm7\n\t" \
"paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
/*r0=C'+C'*/ \
"paddw %%mm0,%%mm0\n\t" \
/*r0=R0=G'+C'*/ \
"paddw %%mm7,%%mm0\n\t" \
/*r7=NR7*/ \
"psraw $4,%%mm7\n\t" \
/*Store NR6 at J(6).*/ \
"movq %%mm6,"OC_J(6,_y)"\n\t" \
/*r0=NR0*/ \
"psraw $4,%%mm0\n\t" \
/*Store NR5 at J(5).*/ \
"movq %%mm5,"OC_J(5,_y)"\n\t" \
/*Store NR7 at J(7).*/ \
"movq %%mm7,"OC_J(7,_y)"\n\t" \
/*Store NR0 at I(0).*/ \
"movq %%mm0,"OC_I(0,_y)"\n\t" \
"#end OC_COLUMN_IDCT\n\t" \

static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
int i;
/*This routine accepts an 8x8 matrix, but in partially transposed form.
Every 4x4 block is transposed.*/
__asm__ __volatile__(
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
OC_ROW_IDCT(y,x)
OC_TRANSPOSE(y)
#undef OC_I
#undef OC_J
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+64,_y)
#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+72,_y)
OC_ROW_IDCT(y,x)
OC_TRANSPOSE(y)
#undef OC_I
#undef OC_J
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
#define OC_J(_k,_y) OC_I(_k,_y)
OC_COLUMN_IDCT(y)
#undef OC_I
#undef OC_J
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
#define OC_J(_k,_y) OC_I(_k,_y)
OC_COLUMN_IDCT(y)
#undef OC_I
#undef OC_J
:[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
:[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
[c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64)
);
__asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
for(i=0;i<4;i++){
__asm__ __volatile__(
"movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
:[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
);
}
}

/*25 cycles.*/
#define OC_IDCT_BEGIN_10(_y,_x) \
"#OC_IDCT_BEGIN_10\n\t" \
"movq "OC_I(3,_x)",%%mm2\n\t" \
"nop\n\t" \
"movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
"movq %%mm2,%%mm4\n\t" \
"movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
"pmulhw %%mm6,%%mm4\n\t" \
"movq "OC_I(1,_x)",%%mm3\n\t" \
"pmulhw %%mm2,%%mm1\n\t" \
"movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
"paddw %%mm2,%%mm4\n\t" \
"pxor %%mm6,%%mm6\n\t" \
"paddw %%mm1,%%mm2\n\t" \
"movq "OC_I(2,_x)",%%mm5\n\t" \
"pmulhw %%mm3,%%mm0\n\t" \
"movq %%mm5,%%mm1\n\t" \
"paddw %%mm3,%%mm0\n\t" \
"pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
"psubw %%mm2,%%mm6\n\t" \
"pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
"psubw %%mm4,%%mm0\n\t" \
"movq "OC_I(2,_x)",%%mm7\n\t" \
"paddw %%mm4,%%mm4\n\t" \
"paddw %%mm5,%%mm7\n\t" \
"paddw %%mm0,%%mm4\n\t" \
"pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
"psubw %%mm6,%%mm3\n\t" \
"movq %%mm4,"OC_I(1,_y)"\n\t" \
"paddw %%mm6,%%mm6\n\t" \
"movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
"paddw %%mm3,%%mm6\n\t" \
"movq %%mm3,%%mm5\n\t" \
"pmulhw %%mm4,%%mm3\n\t" \
"movq %%mm6,"OC_I(2,_y)"\n\t" \
"movq %%mm0,%%mm2\n\t" \
"movq "OC_I(0,_x)",%%mm6\n\t" \
"pmulhw %%mm4,%%mm0\n\t" \
"paddw %%mm3,%%mm5\n\t" \
"paddw %%mm0,%%mm2\n\t" \
"psubw %%mm1,%%mm5\n\t" \
"pmulhw %%mm4,%%mm6\n\t" \
"paddw "OC_I(0,_x)",%%mm6\n\t" \
"paddw %%mm1,%%mm1\n\t" \
"movq %%mm6,%%mm4\n\t" \
"paddw %%mm5,%%mm1\n\t" \
"psubw %%mm2,%%mm6\n\t" \
"paddw %%mm2,%%mm2\n\t" \
"movq "OC_I(1,_y)",%%mm0\n\t" \
"paddw %%mm6,%%mm2\n\t" \
"psubw %%mm1,%%mm2\n\t" \
"nop\n\t" \
"#end OC_IDCT_BEGIN_10\n\t" \

/*25+8=33 cycles.*/
#define OC_ROW_IDCT_10(_y,_x) \
"#OC_ROW_IDCT_10\n\t" \
OC_IDCT_BEGIN_10(_y,_x) \
/*r3=D'*/ \
"movq "OC_I(2,_y)",%%mm3\n\t" \
/*r4=E'=E-G*/ \
"psubw %%mm7,%%mm4\n\t" \
/*r1=H'+H'*/ \
"paddw %%mm1,%%mm1\n\t" \
/*r7=G+G*/ \
"paddw %%mm7,%%mm7\n\t" \
/*r1=R1=A''+H'*/ \
"paddw %%mm2,%%mm1\n\t" \
/*r7=G'=E+G*/ \
"paddw %%mm4,%%mm7\n\t" \
/*r4=R4=E'-D'*/ \
"psubw %%mm3,%%mm4\n\t" \
"paddw %%mm3,%%mm3\n\t" \
/*r6=R6=F'-B''*/ \
"psubw %%mm5,%%mm6\n\t" \
"paddw %%mm5,%%mm5\n\t" \
/*r3=R3=E'+D'*/ \
"paddw %%mm4,%%mm3\n\t" \
/*r5=R5=F'+B''*/ \
"paddw %%mm6,%%mm5\n\t" \
/*r7=R7=G'-C'*/ \
"psubw %%mm0,%%mm7\n\t" \
"paddw %%mm0,%%mm0\n\t" \
/*Save R1.*/ \
"movq %%mm1,"OC_I(1,_y)"\n\t" \
/*r0=R0=G'+C'*/ \
"paddw %%mm7,%%mm0\n\t" \
"#end OC_ROW_IDCT_10\n\t" \

/*25+19=44 cycles.*/
#define OC_COLUMN_IDCT_10(_y) \
"#OC_COLUMN_IDCT_10\n\t" \
OC_IDCT_BEGIN_10(_y,_y) \
"paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
/*r1=H'+H'*/ \
"paddw %%mm1,%%mm1\n\t" \
/*r1=R1=A''+H'*/ \
"paddw %%mm2,%%mm1\n\t" \
/*r2=NR2*/ \
"psraw $4,%%mm2\n\t" \
/*r4=E'=E-G*/ \
"psubw %%mm7,%%mm4\n\t" \
/*r1=NR1*/ \
"psraw $4,%%mm1\n\t" \
/*r3=D'*/ \
"movq "OC_I(2,_y)",%%mm3\n\t" \
/*r7=G+G*/ \
"paddw %%mm7,%%mm7\n\t" \
/*Store NR2 at I(2).*/ \
"movq %%mm2,"OC_I(2,_y)"\n\t" \
/*r7=G'=E+G*/ \
"paddw %%mm4,%%mm7\n\t" \
/*Store NR1 at I(1).*/ \
"movq %%mm1,"OC_I(1,_y)"\n\t" \
/*r4=R4=E'-D'*/ \
"psubw %%mm3,%%mm4\n\t" \
"paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
/*r3=D'+D'*/ \
"paddw %%mm3,%%mm3\n\t" \
/*r3=R3=E'+D'*/ \
"paddw %%mm4,%%mm3\n\t" \
/*r4=NR4*/ \
"psraw $4,%%mm4\n\t" \
/*r6=R6=F'-B''*/ \
"psubw %%mm5,%%mm6\n\t" \
/*r3=NR3*/ \
"psraw $4,%%mm3\n\t" \
"paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
/*r5=B''+B''*/ \
"paddw %%mm5,%%mm5\n\t" \
/*r5=R5=F'+B''*/ \
"paddw %%mm6,%%mm5\n\t" \
/*r6=NR6*/ \
"psraw $4,%%mm6\n\t" \
/*Store NR4 at J(4).*/ \
"movq %%mm4,"OC_J(4,_y)"\n\t" \
/*r5=NR5*/ \
"psraw $4,%%mm5\n\t" \
/*Store NR3 at I(3).*/ \
"movq %%mm3,"OC_I(3,_y)"\n\t" \
/*r7=R7=G'-C'*/ \
"psubw %%mm0,%%mm7\n\t" \
"paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
/*r0=C'+C'*/ \
"paddw %%mm0,%%mm0\n\t" \
/*r0=R0=G'+C'*/ \
"paddw %%mm7,%%mm0\n\t" \
/*r7=NR7*/ \
"psraw $4,%%mm7\n\t" \
/*Store NR6 at J(6).*/ \
"movq %%mm6,"OC_J(6,_y)"\n\t" \
/*r0=NR0*/ \
"psraw $4,%%mm0\n\t" \
/*Store NR5 at J(5).*/ \
"movq %%mm5,"OC_J(5,_y)"\n\t" \
/*Store NR7 at J(7).*/ \
"movq %%mm7,"OC_J(7,_y)"\n\t" \
/*Store NR0 at I(0).*/ \
"movq %%mm0,"OC_I(0,_y)"\n\t" \
"#end OC_COLUMN_IDCT_10\n\t" \

static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
__asm__ __volatile__(
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
/*Done with dequant, descramble, and partial transpose.
Now do the iDCT itself.*/
OC_ROW_IDCT_10(y,x)
OC_TRANSPOSE(y)
#undef OC_I
#undef OC_J
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
#define OC_J(_k,_y) OC_I(_k,_y)
OC_COLUMN_IDCT_10(y)
#undef OC_I
#undef OC_J
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
#define OC_J(_k,_y) OC_I(_k,_y)
OC_COLUMN_IDCT_10(y)
#undef OC_I
#undef OC_J
:[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
:[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
[c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64)
);
__asm__ __volatile__(
"pxor %%mm0,%%mm0\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
:[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
);
}

/*Performs an inverse 8x8 Type-II DCT transform.
The input is assumed to be scaled by a factor of 4 relative to the
orthonormal version of the transform.*/
void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
/*_last_zzi is subtly different from an actual count of the number of
coefficients we decoded for this block.
It contains the value of zzi BEFORE the final token in the block was
decoded.
In most cases this is an EOB token (the continuation of an EOB run from a
previous block counts), and so this is the same as the coefficient count.
However, in the case that the last token was NOT an EOB token, but filled
the block up with exactly 64 coefficients, _last_zzi will be less than 64.
Provided the last token was not a pure zero run, the minimum value it can
be is 46, and so that doesn't affect any of the cases in this routine.
However, if the last token WAS a pure zero run of length 63, then _last_zzi
will be 1 while the number of coefficients decoded is 64.
Thus, we will trigger the following special case, where the real
coefficient count would not.
Note also that a zero run of length 64 will give _last_zzi a value of 0,
but we still process the DC coefficient, which might have a non-zero value
due to DC prediction.
Although convoluted, this is arguably the correct behavior: it allows us to
use a smaller transform when the block ends with a long zero run instead
of a normal EOB token.
It could be smarter... multiple separate zero runs at the end of a block
will fool it, but an encoder that generates these really deserves what it
gets.
Needless to say we inherited this approach from VP3.*/
/*Then perform the iDCT.*/
if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x);
else oc_idct8x8_slow_mmx(_y,_x);
}

#endif
318
thirdparty/libtheora/x86/mmxloop.h
vendored
Normal file
@@ -0,0 +1,318 @@
#if !defined(_x86_mmxloop_H)
# define _x86_mmxloop_H (1)
# include <stddef.h>
# include "x86int.h"

#if defined(OC_X86_ASM)

/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/
#define OC_LOOP_FILTER8_MMX \
"#OC_LOOP_FILTER8_MMX\n\t" \
/*mm7=0*/ \
"pxor %%mm7,%%mm7\n\t" \
/*mm6:mm0={a0,...,a7}*/ \
"movq %%mm0,%%mm6\n\t" \
"punpcklbw %%mm7,%%mm0\n\t" \
"punpckhbw %%mm7,%%mm6\n\t" \
/*mm3:mm5={d0,...,d7}*/ \
"movq %%mm3,%%mm5\n\t" \
"punpcklbw %%mm7,%%mm3\n\t" \
"punpckhbw %%mm7,%%mm5\n\t" \
/*mm6:mm0={a0-d0,...,a7-d7}*/ \
"psubw %%mm3,%%mm0\n\t" \
"psubw %%mm5,%%mm6\n\t" \
/*mm3:mm1={b0,...,b7}*/ \
"movq %%mm1,%%mm3\n\t" \
"punpcklbw %%mm7,%%mm1\n\t" \
"movq %%mm2,%%mm4\n\t" \
"punpckhbw %%mm7,%%mm3\n\t" \
/*mm5:mm4={c0,...,c7}*/ \
"movq %%mm2,%%mm5\n\t" \
"punpcklbw %%mm7,%%mm4\n\t" \
"punpckhbw %%mm7,%%mm5\n\t" \
/*mm7={3}x4 \
mm5:mm4={c0-b0,...,c7-b7}*/ \
"pcmpeqw %%mm7,%%mm7\n\t" \
"psubw %%mm1,%%mm4\n\t" \
"psrlw $14,%%mm7\n\t" \
"psubw %%mm3,%%mm5\n\t" \
/*Scale by 3.*/ \
"pmullw %%mm7,%%mm4\n\t" \
"pmullw %%mm7,%%mm5\n\t" \
/*mm7={4}x4 \
mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
"psrlw $1,%%mm7\n\t" \
"paddw %%mm0,%%mm4\n\t" \
"psllw $2,%%mm7\n\t" \
"movq (%[ll]),%%mm0\n\t" \
"paddw %%mm6,%%mm5\n\t" \
/*R_i has the range [-127,128], so we compute -R_i instead. \
mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
"psubw %%mm7,%%mm4\n\t" \
"psubw %%mm7,%%mm5\n\t" \
"psraw $3,%%mm4\n\t" \
"psraw $3,%%mm5\n\t" \
"pcmpeqb %%mm7,%%mm7\n\t" \
"packsswb %%mm5,%%mm4\n\t" \
"pxor %%mm6,%%mm6\n\t" \
"pxor %%mm7,%%mm4\n\t" \
"packuswb %%mm3,%%mm1\n\t" \
/*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \
/*There's no unsigned byte+signed byte with unsigned saturation op code, so \
we have to split things by sign (the other option is to work in 16 bits, \
but working in 8 bits gives much better parallelism). \
We compute abs(R_i), but save a mask of which terms were negative in mm6. \
Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
Finally, we split mm4 into positive and negative pieces using the mask in \
mm6, and add and subtract them as appropriate.*/ \
/*mm4=abs(-R_i)*/ \
/*mm7=255-2*L*/ \
"pcmpgtb %%mm4,%%mm6\n\t" \
"psubb %%mm0,%%mm7\n\t" \
"pxor %%mm6,%%mm4\n\t" \
"psubb %%mm0,%%mm7\n\t" \
"psubb %%mm6,%%mm4\n\t" \
/*mm7=255-max(2*L-abs(R_i),0)*/ \
"paddusb %%mm4,%%mm7\n\t" \
/*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
"paddusb %%mm7,%%mm4\n\t" \
"psubusb %%mm7,%%mm4\n\t" \
/*Now split mm4 by the original sign of -R_i.*/ \
"movq %%mm4,%%mm5\n\t" \
"pand %%mm6,%%mm4\n\t" \
"pandn %%mm5,%%mm6\n\t" \
/*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
/*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
"paddusb %%mm4,%%mm1\n\t" \
"psubusb %%mm4,%%mm2\n\t" \
"psubusb %%mm6,%%mm1\n\t" \
"paddusb %%mm6,%%mm2\n\t" \
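
/*For reference, a scalar sketch of what the macro above computes
  (illustrative only; this helper is not part of the libtheora sources):
    R=a-d+3*(c-b)+4>>3;
    b+=lflim(R,L);
    c-=lflim(R,L);
  with abs(lflim(R,L))=min(abs(R),max(2*L-abs(R),0)).*/
#if 0
static int oc_lflim_ref(int _r,int _l){
  int ar;
  int lim;
  /*abs(R)*/
  ar=_r<0?-_r:_r;
  /*max(2*L-abs(R),0)*/
  lim=2*_l-ar;
  if(lim<0)lim=0;
  /*min(abs(R),lim), with the sign of R restored.*/
  if(ar>lim)ar=lim;
  return _r<0?-ar:ar;
}
#endif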

/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}.
All other MMX registers are clobbered.*/
#define OC_LOOP_FILTER8_MMXEXT \
"#OC_LOOP_FILTER8_MMXEXT\n\t" \
/*R_i=(a_i-3*b_i+3*c_i-d_i+4>>3) has the range [-127,128], so we compute \
-R_i=(-a_i+3*b_i-3*c_i+d_i+3>>3) instead.*/ \
/*This first part is based on the transformation \
f = -(3*(c-b)+a-d+4>>3) \
= -(3*(c+255-b)+(a+255-d)+4-1020>>3) \
= -(3*(c+~b)+(a+~d)-1016>>3) \
= 127-(3*(c+~b)+(a+~d)>>3) \
= 128+~(3*(c+~b)+(a+~d)>>3) (mod 256). \
Although pavgb(a,b) = (a+b+1>>1) (biased up), we rely heavily on the \
fact that ~pavgb(~a,~b) = (a+b>>1) (biased down). \
Using this, the last expression above can be computed in 8 bits of working \
precision via: \
u = ~pavgb(~b,c); \
v = pavgb(b,~c); \
This mask is 0 or 0xFF, and controls whether t is biased up or down: \
m = u-v; \
t = m^pavgb(m^~a,m^d); \
f = 128+pavgb(pavgb(t,u),v); \
This required some careful analysis to ensure that carries are propagated \
correctly in all cases, but has been checked exhaustively.*/ \
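/*Added check of the complemented-average identity (not an upstream \
comment): for bytes ~x=255-x, so \
~pavgb(~a,~b)=255-(255-a+255-b+1>>1)=255-(511-a-b>>1)=a+b>>1, \
i.e., complementing the inputs and the output turns the rounded-up \
average into a rounded-down one.*/ \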
/*input (a, b, c, d, ., ., ., .)*/ \
/*ff=0xFF; \
u=b; \
v=c; \
ll=255-2*L;*/ \
"pcmpeqb %%mm7,%%mm7\n\t" \
"movq %%mm1,%%mm4\n\t" \
"movq %%mm2,%%mm5\n\t" \
"movq (%[ll]),%%mm6\n\t" \
/*allocated u, v, ll, ff: (a, b, c, d, u, v, ll, ff)*/ \
/*u^=ff; \
v^=ff;*/ \
"pxor %%mm7,%%mm4\n\t" \
"pxor %%mm7,%%mm5\n\t" \
/*allocated ll: (a, b, c, d, u, v, ll, ff)*/ \
/*u=pavgb(u,c); \
v=pavgb(v,b);*/ \
"pavgb %%mm2,%%mm4\n\t" \
"pavgb %%mm1,%%mm5\n\t" \
/*u^=ff; \
a^=ff;*/ \
"pxor %%mm7,%%mm4\n\t" \
"pxor %%mm7,%%mm0\n\t" \
/*m=u-v;*/ \
"psubb %%mm5,%%mm4\n\t" \
/*freed u, allocated m: (a, b, c, d, m, v, ll, ff)*/ \
/*a^=m; \
d^=m;*/ \
"pxor %%mm4,%%mm0\n\t" \
"pxor %%mm4,%%mm3\n\t" \
/*t=pavgb(a,d);*/ \
"pavgb %%mm3,%%mm0\n\t" \
"psllw $7,%%mm7\n\t" \
/*freed a, d, ff, allocated t, of: (t, b, c, ., m, v, ll, of)*/ \
/*t^=m; \
u=m+v;*/ \
"pxor %%mm4,%%mm0\n\t" \
"paddb %%mm5,%%mm4\n\t" \
/*freed t, m, allocated f, u: (f, b, c, ., u, v, ll, of)*/ \
/*f=pavgb(f,u); \
of=128;*/ \
"pavgb %%mm4,%%mm0\n\t" \
"packsswb %%mm7,%%mm7\n\t" \
/*freed u, ff, allocated ll: (f, b, c, ., ll, v, ll, of)*/ \
/*f=pavgb(f,v);*/ \
"pavgb %%mm5,%%mm0\n\t" \
"movq %%mm7,%%mm3\n\t" \
"movq %%mm6,%%mm4\n\t" \
/*freed v, allocated of: (f, b, c, of, ll, ., ll, of)*/ \
/*Now compute lflim of R_i=-(128+mm0) cf. Section 7.10 of the spec.*/ \
/*There's no unsigned byte+signed byte with unsigned saturation op code, so \
we have to split things by sign (the other option is to work in 16 bits, \
but staying in 8 bits gives much better parallelism).*/ \
/*Instead of adding the offset of 128 in mm3, we use it to split mm0. \
This is the same number of instructions as computing a mask and splitting \
after the lflim computation, but has shorter dependency chains.*/ \
/*mm0=R_i<0?-R_i:0 (denoted abs(R_i<0)) \
mm3=R_i>0?R_i:0 (denoted abs(R_i>0))*/ \
"psubusb %%mm0,%%mm3\n\t" \
"psubusb %%mm7,%%mm0\n\t" \
/*mm6=255-max(2*L-abs(R_i<0),0) \
mm4=255-max(2*L-abs(R_i>0),0)*/ \
"paddusb %%mm3,%%mm4\n\t" \
"paddusb %%mm0,%%mm6\n\t" \
/*mm0=min(abs(R_i<0),max(2*L-abs(R_i<0),0)) \
mm3=min(abs(R_i>0),max(2*L-abs(R_i>0),0))*/ \
"paddusb %%mm4,%%mm3\n\t" \
"paddusb %%mm6,%%mm0\n\t" \
"psubusb %%mm4,%%mm3\n\t" \
"psubusb %%mm6,%%mm0\n\t" \
/*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
/*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
"paddusb %%mm3,%%mm1\n\t" \
"psubusb %%mm3,%%mm2\n\t" \
"psubusb %%mm0,%%mm1\n\t" \
"paddusb %%mm0,%%mm2\n\t" \

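/*Applies _filter vertically across the horizontal block edge at _pix: the
  four rows centered on the edge are loaded and the middle two are written
  back (summary added here; not an upstream comment).*/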
#define OC_LOOP_FILTER_V(_filter,_pix,_ystride,_ll) \
do{ \
ptrdiff_t ystride3__; \
__asm__ __volatile__( \
/*mm0={a0,...,a7}*/ \
"movq (%[pix]),%%mm0\n\t" \
/*ystride3=_ystride*3*/ \
"lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
/*mm3={d0,...,d7}*/ \
"movq (%[pix],%[ystride3]),%%mm3\n\t" \
/*mm1={b0,...,b7}*/ \
"movq (%[pix],%[ystride]),%%mm1\n\t" \
/*mm2={c0,...,c7}*/ \
"movq (%[pix],%[ystride],2),%%mm2\n\t" \
_filter \
/*Write it back out.*/ \
"movq %%mm1,(%[pix],%[ystride])\n\t" \
"movq %%mm2,(%[pix],%[ystride],2)\n\t" \
:[ystride3]"=&r"(ystride3__) \
:[pix]"r"(_pix-_ystride*2),[ystride]"r"((ptrdiff_t)(_ystride)), \
[ll]"r"(_ll) \
:"memory" \
); \
} \
while(0)

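/*Applies _filter horizontally across the vertical block edge at _pix: four
  columns of eight rows are gathered and transposed so the same filter core
  can be used, then the middle two columns are written back (summary added
  here; not an upstream comment).*/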
#define OC_LOOP_FILTER_H(_filter,_pix,_ystride,_ll) \
do{ \
unsigned char *pix__; \
ptrdiff_t ystride3__; \
ptrdiff_t d__; \
pix__=(_pix)-2; \
__asm__ __volatile__( \
/*x x x x d0 c0 b0 a0*/ \
"movd (%[pix]),%%mm0\n\t" \
/*x x x x d1 c1 b1 a1*/ \
"movd (%[pix],%[ystride]),%%mm1\n\t" \
/*ystride3=_ystride*3*/ \
"lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
/*x x x x d2 c2 b2 a2*/ \
"movd (%[pix],%[ystride],2),%%mm2\n\t" \
/*x x x x d3 c3 b3 a3*/ \
"lea (%[pix],%[ystride],4),%[d]\n\t" \
"movd (%[pix],%[ystride3]),%%mm3\n\t" \
/*x x x x d4 c4 b4 a4*/ \
"movd (%[d]),%%mm4\n\t" \
/*x x x x d5 c5 b5 a5*/ \
"movd (%[d],%[ystride]),%%mm5\n\t" \
/*x x x x d6 c6 b6 a6*/ \
"movd (%[d],%[ystride],2),%%mm6\n\t" \
/*x x x x d7 c7 b7 a7*/ \
"movd (%[d],%[ystride3]),%%mm7\n\t" \
/*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \
"punpcklbw %%mm1,%%mm0\n\t" \
/*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \
"punpcklbw %%mm3,%%mm2\n\t" \
/*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \
"movq %%mm0,%%mm3\n\t" \
/*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \
"punpcklwd %%mm2,%%mm0\n\t" \
/*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \
"punpckhwd %%mm2,%%mm3\n\t" \
/*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \
"movq %%mm0,%%mm1\n\t" \
/*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \
"punpcklbw %%mm5,%%mm4\n\t" \
/*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \
"punpcklbw %%mm7,%%mm6\n\t" \
/*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \
"movq %%mm4,%%mm5\n\t" \
/*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \
"punpcklwd %%mm6,%%mm4\n\t" \
/*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \
"punpckhwd %%mm6,%%mm5\n\t" \
/*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \
"movq %%mm3,%%mm2\n\t" \
/*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \
"punpckldq %%mm4,%%mm0\n\t" \
/*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \
"punpckhdq %%mm4,%%mm1\n\t" \
/*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \
"punpckldq %%mm5,%%mm2\n\t" \
/*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
"punpckhdq %%mm5,%%mm3\n\t" \
_filter \
/*mm2={b0+R_0'',...,b7+R_7''}*/ \
"movq %%mm1,%%mm0\n\t" \
/*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
"punpcklbw %%mm2,%%mm1\n\t" \
/*mm2={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \
"punpckhbw %%mm2,%%mm0\n\t" \
/*[d]=c1 b1 c0 b0*/ \
"movd %%mm1,%[d]\n\t" \
"movw %w[d],1(%[pix])\n\t" \
"psrlq $32,%%mm1\n\t" \
"shr $16,%[d]\n\t" \
"movw %w[d],1(%[pix],%[ystride])\n\t" \
/*[d]=c3 b3 c2 b2*/ \
"movd %%mm1,%[d]\n\t" \
"movw %w[d],1(%[pix],%[ystride],2)\n\t" \
"shr $16,%[d]\n\t" \
"movw %w[d],1(%[pix],%[ystride3])\n\t" \
"lea (%[pix],%[ystride],4),%[pix]\n\t" \
/*[d]=c5 b5 c4 b4*/ \
"movd %%mm0,%[d]\n\t" \
"movw %w[d],1(%[pix])\n\t" \
"psrlq $32,%%mm0\n\t" \
"shr $16,%[d]\n\t" \
"movw %w[d],1(%[pix],%[ystride])\n\t" \
/*[d]=c7 b7 c6 b6*/ \
"movd %%mm0,%[d]\n\t" \
"movw %w[d],1(%[pix],%[ystride],2)\n\t" \
"shr $16,%[d]\n\t" \
"movw %w[d],1(%[pix],%[ystride3])\n\t" \
:[pix]"+r"(pix__),[ystride3]"=&r"(ystride3__),[d]"=&r"(d__) \
:[ystride]"r"((ptrdiff_t)(_ystride)),[ll]"r"(_ll) \
:"memory" \
); \
} \
while(0)

# endif
#endif
226
thirdparty/libtheora/x86/mmxstate.c
vendored
Normal file
@@ -0,0 +1,226 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************

function:

********************************************************************/

/*MMX acceleration of complete fragment reconstruction algorithm.
Originally written by Rudolf Marek.*/
#include <string.h>
#include "x86int.h"
#include "mmxloop.h"

#if defined(OC_X86_ASM)

void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
unsigned char *dst;
ptrdiff_t frag_buf_off;
int ystride;
int refi;
/*Apply the inverse transform.*/
/*Special case only having a DC component.*/
if(_last_zzi<2){
/*Note that this value must be unsigned, to keep the __asm__ block from
sign-extending it when it puts it in a register.*/
ogg_uint16_t p;
int i;
/*We round this dequant product (and not any of the others) because there's
no iDCT rounding.*/
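/*Added note (not an upstream comment): the +15>>5 below is a
  round-to-nearest division by 32, folding the net DC gain of the scaled
  iDCT that this special case skips into a single multiply and shift.*/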
p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
/*Fill _dct_coeffs with p.*/
__asm__ __volatile__(
/*mm0=0000 0000 0000 AAAA*/
"movd %[p],%%mm0\n\t"
/*mm0=0000 0000 AAAA AAAA*/
"punpcklwd %%mm0,%%mm0\n\t"
/*mm0=AAAA AAAA AAAA AAAA*/
"punpckldq %%mm0,%%mm0\n\t"
:
:[p]"r"((unsigned)p)
);
for(i=0;i<4;i++){
__asm__ __volatile__(
"movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"
:[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16)
);
}
}
else{
/*Dequantize the DC coefficient.*/
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
}
/*Fill in the target buffer.*/
frag_buf_off=_state->frag_buf_offs[_fragi];
refi=_state->frags[_fragi].refi;
ystride=_state->ref_ystride[_pli];
dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
else{
const unsigned char *ref;
int mvoffsets[2];
ref=_state->ref_frame_data[refi]+frag_buf_off;
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
_state->frag_mvs[_fragi])>1){
oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
_dct_coeffs+64);
}
else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
}
}

/*We copy these entire functions to inline the actual MMX routines so that we
use only a single indirect call.*/

void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
memset(_bv,_flimit,8);
}

/*Apply the loop filter to a given set of fragment rows in the given plane.
The filter may be run on the bottom edge, affecting pixels in the next row of
fragments, so this row also needs to be available.
_bv: The bounding values array.
_refi: The index of the frame buffer to filter.
_pli: The color plane to filter.
_fragy0: The Y coordinate of the first fragment row to filter.
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
OC_ALIGN8(unsigned char ll[8]);
const oc_fragment_plane *fplane;
const oc_fragment *frags;
const ptrdiff_t *frag_buf_offs;
unsigned char *ref_frame_data;
ptrdiff_t fragi_top;
ptrdiff_t fragi_bot;
ptrdiff_t fragi0;
ptrdiff_t fragi0_end;
int ystride;
int nhfrags;
memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
fplane=_state->fplanes+_pli;
nhfrags=fplane->nhfrags;
fragi_top=fplane->froffset;
fragi_bot=fragi_top+fplane->nfrags;
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
ystride=_state->ref_ystride[_pli];
frags=_state->frags;
frag_buf_offs=_state->frag_buf_offs;
ref_frame_data=_state->ref_frame_data[_refi];
/*The following loops are constructed somewhat non-intuitively on purpose.
The main idea is: if a block boundary has at least one coded fragment on
it, the filter is applied to it.
However, the order that the filters are applied in matters, and VP3 chose
the somewhat strange ordering used below.*/
while(fragi0<fragi0_end){
ptrdiff_t fragi;
ptrdiff_t fragi_end;
fragi=fragi0;
fragi_end=fragi+nhfrags;
while(fragi<fragi_end){
if(frags[fragi].coded){
unsigned char *ref;
ref=ref_frame_data+frag_buf_offs[fragi];
if(fragi>fragi0){
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
}
if(fragi0>fragi_top){
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
}
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll);
}
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride*8),ystride,ll);
}
}
fragi++;
}
fragi0+=nhfrags;
}
}

void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
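/*For bytes, ~(_flimit<<1)==255-2*_flimit: this is the precomputed bound
  that OC_LOOP_FILTER8_MMXEXT reads through [ll] (added note, not an
  upstream comment).*/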
memset(_bv,~(_flimit<<1),8);
}

/*Apply the loop filter to a given set of fragment rows in the given plane.
The filter may be run on the bottom edge, affecting pixels in the next row of
fragments, so this row also needs to be available.
_bv: The bounding values array.
_refi: The index of the frame buffer to filter.
_pli: The color plane to filter.
_fragy0: The Y coordinate of the first fragment row to filter.
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
const oc_fragment_plane *fplane;
const oc_fragment *frags;
const ptrdiff_t *frag_buf_offs;
unsigned char *ref_frame_data;
ptrdiff_t fragi_top;
ptrdiff_t fragi_bot;
ptrdiff_t fragi0;
ptrdiff_t fragi0_end;
int ystride;
int nhfrags;
fplane=_state->fplanes+_pli;
nhfrags=fplane->nhfrags;
fragi_top=fplane->froffset;
fragi_bot=fragi_top+fplane->nfrags;
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
ystride=_state->ref_ystride[_pli];
frags=_state->frags;
frag_buf_offs=_state->frag_buf_offs;
ref_frame_data=_state->ref_frame_data[_refi];
/*The following loops are constructed somewhat non-intuitively on purpose.
The main idea is: if a block boundary has at least one coded fragment on
it, the filter is applied to it.
However, the order that the filters are applied in matters, and VP3 chose
the somewhat strange ordering used below.*/
while(fragi0<fragi0_end){
ptrdiff_t fragi;
ptrdiff_t fragi_end;
fragi=fragi0;
fragi_end=fragi+nhfrags;
while(fragi<fragi_end){
if(frags[fragi].coded){
unsigned char *ref;
ref=ref_frame_data+frag_buf_offs[fragi];
if(fragi>fragi0){
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
}
if(fragi0>fragi_top){
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
}
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);
}
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride*8),ystride,_bv);
}
}
fragi++;
}
fragi0+=nhfrags;
}
}

#endif
500
thirdparty/libtheora/x86/sse2encfrag.c
vendored
Normal file
@@ -0,0 +1,500 @@
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation https://www.xiph.org/                 *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/
#include <stddef.h>
#include "x86enc.h"
#include "sse2trans.h"

#if defined(OC_X86_ASM)

/*Load a 4x8 array of pixel values from %[src] and %[ref] and compute their
   16-bit differences.
  On output, these are stored in _m0, xmm1, xmm2, and xmm3.
  xmm4 and xmm5 are clobbered.*/
#define OC_LOAD_SUB_4x8(_m0) \
 "#OC_LOAD_SUB_4x8\n\t" \
 /*Load the first three rows.*/ \
 "movq (%[src]),"_m0"\n\t" \
 "movq (%[ref]),%%xmm4\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "movq (%[ref],%[ystride]),%%xmm3\n\t" \
 "movq (%[src],%[ystride],2),%%xmm2\n\t" \
 "movq (%[ref],%[ystride],2),%%xmm5\n\t" \
 /*Unpack and subtract.*/ \
 "punpcklbw %%xmm4,"_m0"\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "punpcklbw %%xmm3,%%xmm1\n\t" \
 "punpcklbw %%xmm3,%%xmm3\n\t" \
 "psubw %%xmm4,"_m0"\n\t" \
 "psubw %%xmm3,%%xmm1\n\t" \
 /*Load the last row.*/ \
 "movq (%[src],%[ystride3]),%%xmm3\n\t" \
 "movq (%[ref],%[ystride3]),%%xmm4\n\t" \
 /*Unpack, subtract, and advance the pointers.*/ \
 "punpcklbw %%xmm5,%%xmm2\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "lea (%[src],%[ystride],4),%[src]\n\t" \
 "psubw %%xmm5,%%xmm2\n\t" \
 "punpcklbw %%xmm4,%%xmm3\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "lea (%[ref],%[ystride],4),%[ref]\n\t" \
 "psubw %%xmm4,%%xmm3\n\t" \

/*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3.
  On output, _m0 contains the sum of two of the rows, and the other two are
   added to xmm7.*/
#define OC_SSD_4x8(_m0) \
 "pmaddwd "_m0","_m0"\n\t" \
 "pmaddwd %%xmm1,%%xmm1\n\t" \
 "pmaddwd %%xmm2,%%xmm2\n\t" \
 "pmaddwd %%xmm3,%%xmm3\n\t" \
 "paddd %%xmm1,"_m0"\n\t" \
 "paddd %%xmm3,%%xmm2\n\t" \
 "paddd %%xmm2,%%xmm7\n\t" \

unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned ret;
  __asm__ __volatile__(
    OC_LOAD_SUB_4x8("%%xmm7")
    OC_SSD_4x8("%%xmm7")
    OC_LOAD_SUB_4x8("%%xmm0")
    OC_SSD_4x8("%%xmm0")
    "paddd %%xmm0,%%xmm7\n\t"
    "movdqa %%xmm7,%%xmm6\n\t"
    "punpckhqdq %%xmm7,%%xmm7\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "pshufd $1,%%xmm7,%%xmm6\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "movd %%xmm7,%[ret]\n\t"
    :[ret]"=a"(ret)
    :[src]"r"(_src),[ref]"r"(_ref),[ystride]"r"((ptrdiff_t)_ystride),
     [ystride3]"r"((ptrdiff_t)_ystride*3)
  );
  return ret;
}
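
/*A plain-C model (illustrative, not part of the library) of what the routine
   above computes: the sum of squared differences over an 8x8 block.
  The function name is hypothetical.*/
static unsigned oc_frag_ssd_sketch(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned ssd;
  int      i;
  int      j;
  ssd=0;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int d;
      d=_src[j]-_ref[j];
      ssd+=d*d;
    }
    _src+=_ystride;
    _ref+=_ystride;
  }
  return ssd;
}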

static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
  0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
};

/*Load a 2x8 array of pixel values from %[src] and %[ref] and compute their
   horizontal sums as well as their 16-bit differences subject to a mask.
  %%xmm6 must contain OC_MASK_CONSTS[0...7].*/
#define OC_LOAD_SUB_MASK_2x8 \
 "#OC_LOAD_SUB_MASK_2x8\n\t" \
 /*Start the loads and expand the next 8 bits of the mask.*/ \
 "shl $8,%[m]\n\t" \
 "movq (%[src]),%%xmm0\n\t" \
 "mov %h[m],%b[m]\n\t" \
 "movq (%[ref]),%%xmm2\n\t" \
 "movd %[m],%%xmm4\n\t" \
 "shr $8,%[m]\n\t" \
 "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
 "mov %h[m],%b[m]\n\t" \
 "pand %%xmm6,%%xmm4\n\t" \
 "pcmpeqb %%xmm6,%%xmm4\n\t" \
 /*Perform the masking.*/ \
 "pand %%xmm4,%%xmm0\n\t" \
 "pand %%xmm4,%%xmm2\n\t" \
 /*Finish the loads while unpacking the first set of rows, and expand the next
    8 bits of the mask.*/ \
 "movd %[m],%%xmm4\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
 "movq (%[ref],%[ystride]),%%xmm3\n\t" \
 "pand %%xmm6,%%xmm4\n\t" \
 "punpcklbw %%xmm2,%%xmm0\n\t" \
 "pcmpeqb %%xmm6,%%xmm4\n\t" \
 "punpcklbw %%xmm2,%%xmm2\n\t" \
 /*Mask and unpack the second set of rows.*/ \
 "pand %%xmm4,%%xmm1\n\t" \
 "pand %%xmm4,%%xmm3\n\t" \
 "punpcklbw %%xmm3,%%xmm1\n\t" \
 "punpcklbw %%xmm3,%%xmm3\n\t" \
 "psubw %%xmm2,%%xmm0\n\t" \
 "psubw %%xmm3,%%xmm1\n\t" \

unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
  ptrdiff_t ystride;
  unsigned  ret;
  int       i;
  ystride=_ystride;
  __asm__ __volatile__(
    "pxor %%xmm7,%%xmm7\n\t"
    "movq %[c],%%xmm6\n\t"
    :
    :[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
  );
  for(i=0;i<4;i++){
    unsigned m;
    m=_mask&0xFFFF;
    _mask>>=16;
    if(m){
      __asm__ __volatile__(
        OC_LOAD_SUB_MASK_2x8
        "pmaddwd %%xmm0,%%xmm0\n\t"
        "pmaddwd %%xmm1,%%xmm1\n\t"
        "paddd %%xmm0,%%xmm7\n\t"
        "paddd %%xmm1,%%xmm7\n\t"
        :[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m)
      );
    }
    _src+=2*ystride;
    _ref+=2*ystride;
  }
  __asm__ __volatile__(
    "movdqa %%xmm7,%%xmm6\n\t"
    "punpckhqdq %%xmm7,%%xmm7\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "pshufd $1,%%xmm7,%%xmm6\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "movd %%xmm7,%[ret]\n\t"
    :[ret]"=a"(ret)
  );
  return ret;
}
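
/*A scalar model (illustrative only) of the pand/pcmpeqb trick in
   OC_LOAD_SUB_MASK_2x8 above: the low 8 bits of _m are expanded into a byte
   mask that is 0xFF where the corresponding bit is set.
  The helper name is hypothetical.*/
static void oc_expand_mask_sketch(unsigned char _bytes[8],unsigned _m){
  int i;
  for(i=0;i<8;i++)_bytes[i]=_m&OC_MASK_CONSTS[i]?0xFF:0x00;
}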


/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their
   16-bit difference in %%xmm0...%%xmm7.*/
#define OC_LOAD_SUB_8x8 \
 "#OC_LOAD_SUB_8x8\n\t" \
 "movq (%[src]),%%xmm0\n\t" \
 "movq (%[ref]),%%xmm4\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm1\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "movq (%[src]),%%xmm2\n\t" \
 "movq (%[ref]),%%xmm7\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm3\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \
 "punpcklbw %%xmm4,%%xmm0\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "psubw %%xmm4,%%xmm0\n\t" \
 "movq (%[src]),%%xmm4\n\t" \
 "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
 "movq (%[ref]),%%xmm0\n\t" \
 "punpcklbw %%xmm5,%%xmm1\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "psubw %%xmm5,%%xmm1\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm5\n\t" \
 "punpcklbw %%xmm7,%%xmm2\n\t" \
 "punpcklbw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm7,%%xmm2\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \
 "punpcklbw %%xmm6,%%xmm3\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "punpcklbw %%xmm6,%%xmm6\n\t" \
 "psubw %%xmm6,%%xmm3\n\t" \
 "movq (%[src]),%%xmm6\n\t" \
 "punpcklbw %%xmm0,%%xmm4\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "punpcklbw %%xmm0,%%xmm0\n\t" \
 "lea (%[src],%[src_ystride],2),%[src]\n\t" \
 "psubw %%xmm0,%%xmm4\n\t" \
 "movq (%[ref]),%%xmm0\n\t" \
 "punpcklbw %%xmm7,%%xmm5\n\t" \
 "neg %[src_ystride]\n\t" \
 "punpcklbw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm7,%%xmm5\n\t" \
 "movq (%[src],%[src_ystride]),%%xmm7\n\t" \
 "punpcklbw %%xmm0,%%xmm6\n\t" \
 "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
 "punpcklbw %%xmm0,%%xmm0\n\t" \
 "neg %[ref_ystride]\n\t" \
 "psubw %%xmm0,%%xmm6\n\t" \
 "movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \
 "punpcklbw %%xmm0,%%xmm7\n\t" \
 "punpcklbw %%xmm0,%%xmm0\n\t" \
 "psubw %%xmm0,%%xmm7\n\t" \
 "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \

/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7.*/
#define OC_LOAD_8x8 \
 "#OC_LOAD_8x8\n\t" \
 "movq (%[src]),%%xmm0\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "movq (%[src],%[ystride],2),%%xmm2\n\t" \
 "pxor %%xmm7,%%xmm7\n\t" \
 "movq (%[src],%[ystride3]),%%xmm3\n\t" \
 "punpcklbw %%xmm7,%%xmm0\n\t" \
 "movq (%[src4]),%%xmm4\n\t" \
 "punpcklbw %%xmm7,%%xmm1\n\t" \
 "movq (%[src4],%[ystride]),%%xmm5\n\t" \
 "punpcklbw %%xmm7,%%xmm2\n\t" \
 "movq (%[src4],%[ystride],2),%%xmm6\n\t" \
 "punpcklbw %%xmm7,%%xmm3\n\t" \
 "movq (%[src4],%[ystride3]),%%xmm7\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "psrlw $8,%%xmm4\n\t" \
 "psrlw $8,%%xmm5\n\t" \
 "punpcklbw %%xmm6,%%xmm6\n\t" \
 "punpcklbw %%xmm7,%%xmm7\n\t" \
 "psrlw $8,%%xmm6\n\t" \
 "psrlw $8,%%xmm7\n\t" \

/*Performs the first two stages of an 8-point 1-D Hadamard transform in place.
  Outputs 1, 3, 4, and 5 from the second stage are negated (which allows us to
   perform this stage in place with no temporary registers).*/
#define OC_HADAMARD_AB_8x8 \
 "#OC_HADAMARD_AB_8x8\n\t" \
 /*Stage A:*/ \
 "paddw %%xmm5,%%xmm1\n\t" \
 "paddw %%xmm6,%%xmm2\n\t" \
 "paddw %%xmm5,%%xmm5\n\t" \
 "paddw %%xmm6,%%xmm6\n\t" \
 "psubw %%xmm1,%%xmm5\n\t" \
 "psubw %%xmm2,%%xmm6\n\t" \
 "paddw %%xmm7,%%xmm3\n\t" \
 "paddw %%xmm4,%%xmm0\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "paddw %%xmm4,%%xmm4\n\t" \
 "psubw %%xmm3,%%xmm7\n\t" \
 "psubw %%xmm0,%%xmm4\n\t" \
 /*Stage B:*/ \
 "paddw %%xmm2,%%xmm0\n\t" \
 "paddw %%xmm3,%%xmm1\n\t" \
 "paddw %%xmm6,%%xmm4\n\t" \
 "paddw %%xmm7,%%xmm5\n\t" \
 "paddw %%xmm2,%%xmm2\n\t" \
 "paddw %%xmm3,%%xmm3\n\t" \
 "paddw %%xmm6,%%xmm6\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm0,%%xmm2\n\t" \
 "psubw %%xmm1,%%xmm3\n\t" \
 "psubw %%xmm4,%%xmm6\n\t" \
 "psubw %%xmm5,%%xmm7\n\t" \

/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
   place with no temporary registers).*/
#define OC_HADAMARD_C_8x8 \
 "#OC_HADAMARD_C_8x8\n\t" \
 /*Stage C:*/ \
 "paddw %%xmm1,%%xmm0\n\t" \
 "paddw %%xmm3,%%xmm2\n\t" \
 "paddw %%xmm5,%%xmm4\n\t" \
 "paddw %%xmm7,%%xmm6\n\t" \
 "paddw %%xmm1,%%xmm1\n\t" \
 "paddw %%xmm3,%%xmm3\n\t" \
 "paddw %%xmm5,%%xmm5\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm0,%%xmm1\n\t" \
 "psubw %%xmm2,%%xmm3\n\t" \
 "psubw %%xmm4,%%xmm5\n\t" \
 "psubw %%xmm6,%%xmm7\n\t" \

/*Performs an 8-point 1-D Hadamard transform in place.
  Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform
   in place with no temporary registers).*/
#define OC_HADAMARD_8x8 \
 OC_HADAMARD_AB_8x8 \
 OC_HADAMARD_C_8x8 \

/*Performs the first part of the final stage of the Hadamard transform and
   summing of absolute values.
  At the end of this part, %%xmm1 will contain the DC coefficient of the
   transform.*/
#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
 /*We use the fact that \
     (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
    to merge the final butterfly with the abs and the first stage of \
    accumulation. \
   Thus we can avoid using pabsw, which is not available until SSSE3. \
   Emulating pabsw takes 3 instructions, so the straightforward SSE2 \
    implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
    registers). \
   Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
   This implementation is only 26 (+4 for spilling registers).*/ \
 "#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \
 "movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \
 "movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \
 /*xmm7={0x7FFF}x4 \
    xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \
 "pcmpeqb %%xmm7,%%xmm7\n\t" \
 "movdqa %%xmm4,%%xmm6\n\t" \
 "psrlw $1,%%xmm7\n\t" \
 "paddw %%xmm5,%%xmm6\n\t" \
 "pmaxsw %%xmm5,%%xmm4\n\t" \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "psubw %%xmm6,%%xmm4\n\t" \
 /*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \
    xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF*/ \
 "movdqa %%xmm2,%%xmm6\n\t" \
 "movdqa %%xmm0,%%xmm5\n\t" \
 "pmaxsw %%xmm3,%%xmm2\n\t" \
 "pmaxsw %%xmm1,%%xmm0\n\t" \
 "paddw %%xmm3,%%xmm6\n\t" \
 "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \
 "paddw %%xmm5,%%xmm1\n\t" \
 "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t" \

/*Performs the second part of the final stage of the Hadamard transform and
   summing of absolute values.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
 "#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "paddsw %%xmm7,%%xmm1\n\t" \
 "psubw %%xmm6,%%xmm2\n\t" \
 "psubw %%xmm1,%%xmm0\n\t" \
 /*xmm7={1}x4 (needed for the horizontal add that follows) \
    xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \
 "movdqa %%xmm3,%%xmm6\n\t" \
 "pmaxsw %%xmm5,%%xmm3\n\t" \
 "paddw %%xmm2,%%xmm0\n\t" \
 "paddw %%xmm5,%%xmm6\n\t" \
 "paddw %%xmm4,%%xmm0\n\t" \
 "paddsw %%xmm7,%%xmm6\n\t" \
 "paddw %%xmm3,%%xmm0\n\t" \
 "psrlw $14,%%xmm7\n\t" \
 "psubw %%xmm6,%%xmm0\n\t" \

/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
   absolute value of each component, and accumulates everything into xmm0.*/
#define OC_HADAMARD_C_ABS_ACCUM_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_B_8x8 \

/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
   component, and accumulates everything into xmm0.
  Note that xmm0 will have an extra 4 added to each column, and that after
   removing this value, the remainder will be half the conventional value.*/
#define OC_HADAMARD_ABS_ACCUM_8x8 \
 OC_HADAMARD_AB_8x8 \
 OC_HADAMARD_C_ABS_ACCUM_8x8
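
/*A quick scalar check (illustrative) of the identity the accumulation above
   relies on: (abs(a+b)+abs(a-b))/2==max(abs(a),abs(b)), provided a+b and a-b
   do not overflow.
  The function name is hypothetical.*/
static int oc_absmax_sketch(int _a,int _b){
  int sum;
  int diff;
  sum=_a+_b;
  diff=_a-_b;
  if(sum<0)sum=-sum;
  if(diff<0)diff=-diff;
  /*Equals the larger of abs(_a) and abs(_b).*/
  return sum+diff>>1;
}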

static unsigned oc_int_frag_satd_sse2(int *_dc,
 const unsigned char *_src,int _src_ystride,
 const unsigned char *_ref,int _ref_ystride){
  OC_ALIGN16(ogg_int16_t buf[16]);
  unsigned ret;
  unsigned ret2;
  int      dc;
  __asm__ __volatile__(
    OC_LOAD_SUB_8x8
    OC_HADAMARD_8x8
    OC_TRANSPOSE_8x8
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x8
    OC_HADAMARD_C_ABS_ACCUM_A_8x8
    "movd %%xmm1,%[dc]\n\t"
    OC_HADAMARD_C_ABS_ACCUM_B_8x8
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.
      We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
       latency of pmaddwd by starting to compute abs(dc) here.*/
    "pmaddwd %%xmm7,%%xmm0\n\t"
    "movsx %w[dc],%[dc]\n\t"
    "cdq\n\t"
    "movdqa %%xmm0,%%xmm1\n\t"
    "punpckhqdq %%xmm0,%%xmm0\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "movd %%xmm0,%[ret]\n\t"
    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
       added to them, a factor of two removed, and the DC value included;
       correct the final sum here.*/
    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
    "xor %[dc],%[ret2]\n\t"
    "sub %[ret2],%[ret]\n\t"
    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
       and %[dc] with some of the inputs, since for once we don't write to
       them until after we're done using everything but %[buf].*/
    /*Note that _src_ystride and _ref_ystride must be given non-overlapping
       constraints, otherwise if gcc can prove they're equal it will allocate
       them to the same register (which is bad); _src and _ref face a similar
       problem.
      All four are destructively modified, but if we list them as output
       constraints, gcc can't alias them with other outputs.*/
    :[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
    :[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
     [ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
    /*We have to use xor and sub, so we actually clobber the condition codes
       for once (not to mention the adds hidden in lea).*/
    :"cc"
  );
  *_dc=dc;
  return ret;
}
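
/*Scalar arithmetic (illustrative) behind the lea/xor/sub correction above:
   with s=dc>>31, abs(dc)==(dc^s)-s, and if sum is the raw accumulator (half
   the true total, plus 4 per column, with the DC term still included), the
   returned value is 2*sum-64-abs(dc).
  The function name is hypothetical.*/
static unsigned oc_satd_correct_sketch(unsigned _sum,int _dc){
  int s;
  s=_dc>>31;
  return 2*_sum-64-(unsigned)((_dc^s)-s);
}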

unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  return oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
}

unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
  OC_ALIGN8(unsigned char ref[64]);
  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
  return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
}

unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
 const unsigned char *_src,int _ystride){
  OC_ALIGN16(ogg_int16_t buf[16]);
  unsigned ret;
  int      dc;
  __asm__ __volatile__(
    OC_LOAD_8x8
    OC_HADAMARD_8x8
    OC_TRANSPOSE_8x8
    /*We split out the stages here so we can save the DC coefficient in the
       middle.*/
    OC_HADAMARD_AB_8x8
    OC_HADAMARD_C_ABS_ACCUM_A_8x8
    "movd %%xmm1,%[dc]\n\t"
    OC_HADAMARD_C_ABS_ACCUM_B_8x8
    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
       for the factor of two we dropped + 3 for the vertical accumulation).
      Now we finally have to promote things to dwords.*/
    "pmaddwd %%xmm7,%%xmm0\n\t"
    /*We assume that the DC coefficient is always positive (which is true,
       because the input to the INTRA transform was not a difference).*/
    "movzx %w[dc],%[dc]\n\t"
    "movdqa %%xmm0,%%xmm1\n\t"
    "punpckhqdq %%xmm0,%%xmm0\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
    "paddd %%xmm1,%%xmm0\n\t"
    "movd %%xmm0,%[ret]\n\t"
    "lea -64(%[ret],%[ret]),%[ret]\n\t"
    "sub %[dc],%[ret]\n\t"
    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
       and %[dc] with some of the inputs, since for once we don't write to
       them until after we're done using everything but %[buf].*/
    :[ret]"=a"(ret),[dc]"=r"(dc),
     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
    :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
     [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
    /*We have to use sub, so we actually clobber the condition codes for
       once.*/
    :"cc"
  );
  *_dc=dc;
  return ret;
}

#endif
452
thirdparty/libtheora/x86/sse2fdct.c
vendored
Normal file
@@ -0,0 +1,452 @@
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
 * by the Xiph.Org Foundation https://www.xiph.org/                 *
 *                                                                  *
 ********************************************************************/
/*SSE2 fDCT implementation for x86_64.*/
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
#include <stddef.h>
#include "x86enc.h"
#include "x86zigzag.h"
#include "sse2trans.h"

#if defined(OC_X86_64_ASM)

# define OC_FDCT_8x8 \
 /*Note: xmm15={0}x8 and xmm14={-1}x8.*/ \
 "#OC_FDCT_8x8\n\t" \
 /*Stage 1:*/ \
 "movdqa %%xmm0,%%xmm11\n\t" \
 "movdqa %%xmm1,%%xmm10\n\t" \
 "movdqa %%xmm2,%%xmm9\n\t" \
 "movdqa %%xmm3,%%xmm8\n\t" \
 /*xmm11=t7'=t0-t7*/ \
 "psubw %%xmm7,%%xmm11\n\t" \
 /*xmm10=t6'=t1-t6*/ \
 "psubw %%xmm6,%%xmm10\n\t" \
 /*xmm9=t5'=t2-t5*/ \
 "psubw %%xmm5,%%xmm9\n\t" \
 /*xmm8=t4'=t3-t4*/ \
 "psubw %%xmm4,%%xmm8\n\t" \
 /*xmm0=t0'=t0+t7*/ \
 "paddw %%xmm7,%%xmm0\n\t" \
 /*xmm1=t1'=t1+t6*/ \
 "paddw %%xmm6,%%xmm1\n\t" \
 /*xmm5=t2'=t2+t5*/ \
 "paddw %%xmm2,%%xmm5\n\t" \
 /*xmm4=t3'=t3+t4*/ \
 "paddw %%xmm3,%%xmm4\n\t" \
 /*xmm2,3,6,7 are now free.*/ \
 /*Stage 2:*/ \
 "movdqa %%xmm0,%%xmm3\n\t" \
 "mov $0x5A806A0A,%[a]\n\t" \
 "movdqa %%xmm1,%%xmm2\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "movdqa %%xmm10,%%xmm6\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 /*xmm2=t2''=t1'-t2'*/ \
 "psubw %%xmm5,%%xmm2\n\t" \
 "pxor %%xmm12,%%xmm12\n\t" \
 /*xmm3=t3''=t0'-t3'*/ \
 "psubw %%xmm4,%%xmm3\n\t" \
 "psubw %%xmm14,%%xmm12\n\t" \
 /*xmm10=t5''=t6'-t5'*/ \
 "psubw %%xmm9,%%xmm10\n\t" \
 "paddw %%xmm12,%%xmm12\n\t" \
 /*xmm4=t0''=t0'+t3'*/ \
 "paddw %%xmm0,%%xmm4\n\t" \
 /*xmm1=t1''=t1'+t2'*/ \
 "paddw %%xmm5,%%xmm1\n\t" \
 /*xmm6=t6''=t6'+t5'*/ \
 "paddw %%xmm9,%%xmm6\n\t" \
 /*xmm0,xmm5,xmm9 are now free.*/ \
 /*Stage 3:*/ \
 /*xmm10:xmm5=t5''*27146+0xB500 \
    xmm0=t5''*/ \
 "movdqa %%xmm10,%%xmm5\n\t" \
 "movdqa %%xmm10,%%xmm0\n\t" \
 "punpckhwd %%xmm12,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 "punpcklwd %%xmm12,%%xmm5\n\t" \
 "pmaddwd %%xmm13,%%xmm5\n\t" \
 /*xmm5=(t5''*27146+0xB500>>16)+t5''*/ \
 "psrad $16,%%xmm10\n\t" \
 "psrad $16,%%xmm5\n\t" \
 "packssdw %%xmm10,%%xmm5\n\t" \
 "paddw %%xmm0,%%xmm5\n\t" \
 /*xmm0=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
 "pcmpeqw %%xmm15,%%xmm0\n\t" \
 "psubw %%xmm14,%%xmm0\n\t" \
 "paddw %%xmm5,%%xmm0\n\t" \
 "movdqa %%xmm8,%%xmm5\n\t" \
 "psraw $1,%%xmm0\n\t" \
 /*xmm5=t5'''=t4'-s*/ \
 "psubw %%xmm0,%%xmm5\n\t" \
 /*xmm8=t4''=t4'+s*/ \
 "paddw %%xmm0,%%xmm8\n\t" \
 /*xmm0,xmm7,xmm9,xmm10 are free.*/ \
 /*xmm7:xmm9=t6''*27146+0xB500*/ \
 "movdqa %%xmm6,%%xmm7\n\t" \
 "movdqa %%xmm6,%%xmm9\n\t" \
 "punpckhwd %%xmm12,%%xmm7\n\t" \
 "pmaddwd %%xmm13,%%xmm7\n\t" \
 "punpcklwd %%xmm12,%%xmm9\n\t" \
 "pmaddwd %%xmm13,%%xmm9\n\t" \
 /*xmm9=(t6''*27146+0xB500>>16)+t6''*/ \
 "psrad $16,%%xmm7\n\t" \
 "psrad $16,%%xmm9\n\t" \
 "packssdw %%xmm7,%%xmm9\n\t" \
 "paddw %%xmm6,%%xmm9\n\t" \
 /*xmm9=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
 "pcmpeqw %%xmm15,%%xmm6\n\t" \
 "psubw %%xmm14,%%xmm6\n\t" \
 "paddw %%xmm6,%%xmm9\n\t" \
 "movdqa %%xmm11,%%xmm7\n\t" \
 "psraw $1,%%xmm9\n\t" \
 /*xmm7=t6'''=t7'-s*/ \
 "psubw %%xmm9,%%xmm7\n\t" \
 /*xmm9=t7''=t7'+s*/ \
 "paddw %%xmm11,%%xmm9\n\t" \
 /*xmm0,xmm6,xmm10,xmm11 are free.*/ \
 /*Stage 4:*/ \
 /*xmm10:xmm0=t1''*27146+0xB500*/ \
 "movdqa %%xmm1,%%xmm0\n\t" \
 "movdqa %%xmm1,%%xmm10\n\t" \
 "punpcklwd %%xmm12,%%xmm0\n\t" \
 "pmaddwd %%xmm13,%%xmm0\n\t" \
 "punpckhwd %%xmm12,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 /*xmm0=(t1''*27146+0xB500>>16)+t1''*/ \
 "psrad $16,%%xmm0\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "mov $0x20006A0A,%[a]\n\t" \
 "packssdw %%xmm10,%%xmm0\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddw %%xmm1,%%xmm0\n\t" \
 /*xmm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
 "pcmpeqw %%xmm15,%%xmm1\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "psubw %%xmm14,%%xmm1\n\t" \
 "paddw %%xmm1,%%xmm0\n\t" \
 /*xmm10:xmm4=t0''*27146+0x4000*/ \
 "movdqa %%xmm4,%%xmm1\n\t" \
 "movdqa %%xmm4,%%xmm10\n\t" \
 "punpcklwd %%xmm12,%%xmm4\n\t" \
 "pmaddwd %%xmm13,%%xmm4\n\t" \
 "punpckhwd %%xmm12,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 /*xmm4=(t0''*27146+0x4000>>16)+t0''*/ \
 "psrad $16,%%xmm4\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "mov $0x6CB7,%[a]\n\t" \
 "packssdw %%xmm10,%%xmm4\n\t" \
 "movd %[a],%%xmm12\n\t" \
 "paddw %%xmm1,%%xmm4\n\t" \
 /*xmm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
 "pcmpeqw %%xmm15,%%xmm1\n\t" \
 "pshufd $00,%%xmm12,%%xmm12\n\t" \
 "psubw %%xmm14,%%xmm1\n\t" \
 "mov $0x7FFF6C84,%[a]\n\t" \
 "paddw %%xmm1,%%xmm4\n\t" \
 /*xmm0=_y[0]=u=r+s>>1 \
    The naive implementation could cause overflow, so we use \
     u=(r&s)+((r^s)>>1).*/ \
 "movdqa %%xmm0,%%xmm6\n\t" \
 "pxor %%xmm4,%%xmm0\n\t" \
 "pand %%xmm4,%%xmm6\n\t" \
 "psraw $1,%%xmm0\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddw %%xmm6,%%xmm0\n\t" \
 /*xmm4=_y[4]=v=r-u*/ \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "psubw %%xmm0,%%xmm4\n\t" \
 /*xmm1,xmm6,xmm10,xmm11 are free.*/ \
 /*xmm6:xmm10=60547*t3''+0x6CB7*/ \
 "movdqa %%xmm3,%%xmm10\n\t" \
 "movdqa %%xmm3,%%xmm6\n\t" \
 "punpcklwd %%xmm3,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 "mov $0x61F861F8,%[a]\n\t" \
 "punpckhwd %%xmm3,%%xmm6\n\t" \
 "pmaddwd %%xmm13,%%xmm6\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm6\n\t" \
 /*xmm1:xmm2=25080*t2'' \
    xmm12=t2''*/ \
 "movdqa %%xmm2,%%xmm11\n\t" \
 "movdqa %%xmm2,%%xmm12\n\t" \
 "pmullw %%xmm13,%%xmm2\n\t" \
 "pmulhw %%xmm13,%%xmm11\n\t" \
 "movdqa %%xmm2,%%xmm1\n\t" \
 "punpcklwd %%xmm11,%%xmm2\n\t" \
 "punpckhwd %%xmm11,%%xmm1\n\t" \
 /*xmm10=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
 "paddd %%xmm2,%%xmm10\n\t" \
 "paddd %%xmm1,%%xmm6\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "pcmpeqw %%xmm15,%%xmm3\n\t" \
 "psrad $16,%%xmm6\n\t" \
 "psubw %%xmm14,%%xmm3\n\t" \
 "packssdw %%xmm6,%%xmm10\n\t" \
 "paddw %%xmm3,%%xmm10\n\t" \
 /*xmm2=_y[2]=u \
    xmm10=s=(25080*u>>16)-t2''*/ \
 "movdqa %%xmm10,%%xmm2\n\t" \
 "pmulhw %%xmm13,%%xmm10\n\t" \
 "psubw %%xmm12,%%xmm10\n\t" \
 /*xmm1:xmm6=s*21600+0x2800*/ \
 "pxor %%xmm12,%%xmm12\n\t" \
 "psubw %%xmm14,%%xmm12\n\t" \
 "mov $0x28005460,%[a]\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "movdqa %%xmm10,%%xmm6\n\t" \
 "movdqa %%xmm10,%%xmm1\n\t" \
 "punpcklwd %%xmm12,%%xmm6\n\t" \
 "pmaddwd %%xmm13,%%xmm6\n\t" \
 "mov $0x0E3D,%[a]\n\t" \
 "punpckhwd %%xmm12,%%xmm1\n\t" \
 "pmaddwd %%xmm13,%%xmm1\n\t" \
 /*xmm6=(s*21600+0x2800>>18)+s*/ \
 "psrad $18,%%xmm6\n\t" \
 "psrad $18,%%xmm1\n\t" \
 "movd %[a],%%xmm12\n\t" \
 "packssdw %%xmm1,%%xmm6\n\t" \
 "pshufd $00,%%xmm12,%%xmm12\n\t" \
 "paddw %%xmm10,%%xmm6\n\t" \
 /*xmm6=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
 "mov $0x7FFF54DC,%[a]\n\t" \
 "pcmpeqw %%xmm15,%%xmm10\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "psubw %%xmm14,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddw %%xmm10,%%xmm6\n\t " \
 /*xmm1,xmm3,xmm10,xmm11 are free.*/ \
 /*xmm11:xmm10=54491*t5'''+0x0E3D*/ \
 "movdqa %%xmm5,%%xmm10\n\t" \
 "movdqa %%xmm5,%%xmm11\n\t" \
 "punpcklwd %%xmm5,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 "mov $0x8E3A8E3A,%[a]\n\t" \
 "punpckhwd %%xmm5,%%xmm11\n\t" \
 "pmaddwd %%xmm13,%%xmm11\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm11\n\t" \
 /*xmm7:xmm12=36410*t6''' \
    xmm1=t6'''*/ \
 "movdqa %%xmm7,%%xmm3\n\t" \
 "movdqa %%xmm7,%%xmm1\n\t" \
 "pmulhw %%xmm13,%%xmm3\n\t" \
 "pmullw %%xmm13,%%xmm7\n\t" \
 "paddw %%xmm1,%%xmm3\n\t" \
 "movdqa %%xmm7,%%xmm12\n\t" \
 "punpckhwd %%xmm3,%%xmm7\n\t" \
 "punpcklwd %%xmm3,%%xmm12\n\t" \
 /*xmm10=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
 "paddd %%xmm12,%%xmm10\n\t" \
 "paddd %%xmm7,%%xmm11\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "pcmpeqw %%xmm15,%%xmm5\n\t" \
 "psrad $16,%%xmm11\n\t" \
 "psubw %%xmm14,%%xmm5\n\t" \
 "packssdw %%xmm11,%%xmm10\n\t" \
 "pxor %%xmm12,%%xmm12\n\t" \
 "paddw %%xmm5,%%xmm10\n\t" \
 /*xmm5=_y[5]=u \
    xmm1=s=t6'''-(36410*u>>16)*/ \
 "psubw %%xmm14,%%xmm12\n\t" \
 "movdqa %%xmm10,%%xmm5\n\t" \
 "mov $0x340067C8,%[a]\n\t" \
 "pmulhw %%xmm13,%%xmm10\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddw %%xmm5,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "psubw %%xmm10,%%xmm1\n\t" \
 /*xmm11:xmm3=s*26568+0x3400*/ \
 "movdqa %%xmm1,%%xmm3\n\t" \
 "movdqa %%xmm1,%%xmm11\n\t" \
 "punpcklwd %%xmm12,%%xmm3\n\t" \
 "pmaddwd %%xmm13,%%xmm3\n\t" \
 "mov $0x7B1B,%[a]\n\t" \
 "punpckhwd %%xmm12,%%xmm11\n\t" \
 "pmaddwd %%xmm13,%%xmm11\n\t" \
 /*xmm3=(s*26568+0x3400>>17)+s*/ \
 "psrad $17,%%xmm3\n\t" \
 "psrad $17,%%xmm11\n\t" \
 "movd %[a],%%xmm12\n\t" \
 "packssdw %%xmm11,%%xmm3\n\t" \
 "pshufd $00,%%xmm12,%%xmm12\n\t" \
 "paddw %%xmm1,%%xmm3\n\t" \
 /*xmm3=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
 "mov $0x7FFF7B16,%[a]\n\t" \
 "pcmpeqw %%xmm15,%%xmm1\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "psubw %%xmm14,%%xmm1\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddw %%xmm1,%%xmm3\n\t " \
 /*xmm1,xmm7,xmm10,xmm11 are free.*/ \
 /*xmm11:xmm10=64277*t7''+0x7B1B*/ \
 "movdqa %%xmm9,%%xmm10\n\t" \
 "movdqa %%xmm9,%%xmm11\n\t" \
 "punpcklwd %%xmm9,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 "mov $0x31F131F1,%[a]\n\t" \
 "punpckhwd %%xmm9,%%xmm11\n\t" \
 "pmaddwd %%xmm13,%%xmm11\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm11\n\t" \
 /*xmm12:xmm7=12785*t4''*/ \
 "movdqa %%xmm8,%%xmm7\n\t" \
 "movdqa %%xmm8,%%xmm1\n\t" \
 "pmullw %%xmm13,%%xmm7\n\t" \
 "pmulhw %%xmm13,%%xmm1\n\t" \
 "movdqa %%xmm7,%%xmm12\n\t" \
 "punpcklwd %%xmm1,%%xmm7\n\t" \
 "punpckhwd %%xmm1,%%xmm12\n\t" \
 /*xmm10=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
 "paddd %%xmm7,%%xmm10\n\t" \
 "paddd %%xmm12,%%xmm11\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "pcmpeqw %%xmm15,%%xmm9\n\t" \
 "psrad $16,%%xmm11\n\t" \
 "psubw %%xmm14,%%xmm9\n\t" \
 "packssdw %%xmm11,%%xmm10\n\t" \
 "pxor %%xmm12,%%xmm12\n\t" \
 "paddw %%xmm9,%%xmm10\n\t" \
 /*xmm1=_y[1]=u \
    xmm10=s=(12785*u>>16)-t4''*/ \
 "psubw %%xmm14,%%xmm12\n\t" \
 "movdqa %%xmm10,%%xmm1\n\t" \
 "mov $0x3000503B,%[a]\n\t" \
 "pmulhw %%xmm13,%%xmm10\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "psubw %%xmm8,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 /*xmm8:xmm7=s*20539+0x3000*/ \
 "movdqa %%xmm10,%%xmm7\n\t" \
 "movdqa %%xmm10,%%xmm8\n\t" \
 "punpcklwd %%xmm12,%%xmm7\n\t" \
 "pmaddwd %%xmm13,%%xmm7\n\t" \
 "punpckhwd %%xmm12,%%xmm8\n\t" \
 "pmaddwd %%xmm13,%%xmm8\n\t" \
 /*xmm7=(s*20539+0x3000>>20)+s*/ \
 "psrad $20,%%xmm7\n\t" \
 "psrad $20,%%xmm8\n\t" \
 "packssdw %%xmm8,%%xmm7\n\t" \
 "paddw %%xmm10,%%xmm7\n\t" \
 /*xmm7=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
 "pcmpeqw %%xmm15,%%xmm10\n\t" \
 "psubw %%xmm14,%%xmm10\n\t" \
 "paddw %%xmm10,%%xmm7\n\t " \

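/*Scalar form (illustrative) of the overflow-safe average used for _y[0] in
   the macro above: (r+s)>>1 computed as (r&s)+((r^s)>>1), so the intermediate
   sum cannot overflow 16 bits.
  The function name is hypothetical.*/
static ogg_int16_t oc_avg_no_overflow_sketch(ogg_int16_t _r,ogg_int16_t _s){
  return (ogg_int16_t)((_r&_s)+((_r^_s)>>1));
}
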
/*SSE2 implementation of the fDCT for x86-64 only.
  Because of the 8 extra XMM registers on x86-64, this version can operate
   without any temporary stack access at all.*/
void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
  ptrdiff_t a;
  __asm__ __volatile__(
    /*Load the input.*/
    "movdqa 0x00(%[x]),%%xmm0\n\t"
    "movdqa 0x10(%[x]),%%xmm1\n\t"
    "movdqa 0x20(%[x]),%%xmm2\n\t"
    "movdqa 0x30(%[x]),%%xmm3\n\t"
    "movdqa 0x40(%[x]),%%xmm4\n\t"
    "movdqa 0x50(%[x]),%%xmm5\n\t"
    "movdqa 0x60(%[x]),%%xmm6\n\t"
    "movdqa 0x70(%[x]),%%xmm7\n\t"
    /*Add two extra bits of working precision to improve accuracy; any more and
       we could overflow.*/
    /*We also add a few biases to correct for some systematic error that
       remains in the full fDCT->iDCT round trip.*/
    /*xmm15={0}x8*/
    "pxor %%xmm15,%%xmm15\n\t"
    /*xmm14={-1}x8*/
    "pcmpeqb %%xmm14,%%xmm14\n\t"
    "psllw $2,%%xmm0\n\t"
    /*xmm8=xmm0*/
    "movdqa %%xmm0,%%xmm8\n\t"
    "psllw $2,%%xmm1\n\t"
    /*xmm8={_x[7...0]==0}*/
    "pcmpeqw %%xmm15,%%xmm8\n\t"
    "psllw $2,%%xmm2\n\t"
    /*xmm8={_x[7...0]!=0}*/
    "psubw %%xmm14,%%xmm8\n\t"
    "psllw $2,%%xmm3\n\t"
    /*%[a]=1*/
    "mov $1,%[a]\n\t"
    /*xmm8={_x[6]!=0,0,_x[4]!=0,0,_x[2]!=0,0,_x[0]!=0,0}*/
    "pslld $16,%%xmm8\n\t"
    "psllw $2,%%xmm4\n\t"
    /*xmm9={0,0,0,0,0,0,0,1}*/
    "movd %[a],%%xmm9\n\t"
    /*xmm8={0,0,_x[2]!=0,0,_x[0]!=0,0}*/
    "pshufhw $0x00,%%xmm8,%%xmm8\n\t"
    "psllw $2,%%xmm5\n\t"
    /*%[a]={1}x2*/
    "mov $0x10001,%[a]\n\t"
    /*xmm8={0,0,0,0,0,0,0,_x[0]!=0}*/
    "pshuflw $0x01,%%xmm8,%%xmm8\n\t"
    "psllw $2,%%xmm6\n\t"
    /*xmm10={0,0,0,0,0,0,1,1}*/
    "movd %[a],%%xmm10\n\t"
    /*xmm0=_x[7...0]+{0,0,0,0,0,0,0,_x[0]!=0}*/
    "paddw %%xmm8,%%xmm0\n\t"
    "psllw $2,%%xmm7\n\t"
    /*xmm0=_x[7...0]+{0,0,0,0,0,0,1,(_x[0]!=0)+1}*/
    "paddw %%xmm10,%%xmm0\n\t"
    /*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/
    "psubw %%xmm9,%%xmm1\n\t"
    /*Transform columns.*/
    OC_FDCT_8x8
    /*Transform rows.*/
    OC_TRANSPOSE_8x8
    OC_FDCT_8x8
    /*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/
    "paddw %%xmm14,%%xmm14\n\t"
    "psubw %%xmm14,%%xmm0\n\t"
    "psubw %%xmm14,%%xmm1\n\t"
    "psraw $2,%%xmm0\n\t"
    "psubw %%xmm14,%%xmm2\n\t"
    "psraw $2,%%xmm1\n\t"
    "psubw %%xmm14,%%xmm3\n\t"
    "psraw $2,%%xmm2\n\t"
    "psubw %%xmm14,%%xmm4\n\t"
    "psraw $2,%%xmm3\n\t"
    "psubw %%xmm14,%%xmm5\n\t"
    "psraw $2,%%xmm4\n\t"
    "psubw %%xmm14,%%xmm6\n\t"
    "psraw $2,%%xmm5\n\t"
    "psubw %%xmm14,%%xmm7\n\t"
    "psraw $2,%%xmm6\n\t"
    "psraw $2,%%xmm7\n\t"
    /*Transpose, zig-zag, and store the result.*/
    /*We could probably do better using SSSE3's palignr, but re-using the
       MMXEXT version will do for now.*/
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
    "movdq2q %%xmm"#_row","_reg"\n\t" \

#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
    "punpckhqdq %%xmm"#_row",%%xmm"#_row"\n\t" \
    "movdq2q %%xmm"#_row","_reg"\n\t" \

    OC_TRANSPOSE_ZIG_ZAG_MMXEXT
#undef OC_ZZ_LOAD_ROW_LO
#undef OC_ZZ_LOAD_ROW_HI
    :[a]"=&r"(a)
    :[y]"r"(_y),[x]"r"(_x)
    :"memory"
  );
}
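
/*Scalar model (illustrative) of the final rounding above: the transform ran
   with two extra bits of precision, xmm14 holds -2 so the psubw adds 2, and
   each coefficient is reduced with rounding as (v+2)>>2.
  The function name is hypothetical.*/
static ogg_int16_t oc_fdct_round_sketch(ogg_int16_t _v){
  return (ogg_int16_t)(_v+2>>2);
}
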
#endif
456
thirdparty/libtheora/x86/sse2idct.c
vendored
Normal file
@@ -0,0 +1,456 @@
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009,2025           *
 * by the Xiph.Org Foundation and contributors                      *
 * https://www.xiph.org/                                            *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/

/*SSE2 acceleration of Theora's iDCT.*/
#include "x86int.h"
#include "sse2trans.h"
#include "../dct.h"

#if defined(OC_X86_ASM)

/*A table of constants used by the MMX routines.*/
const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={
        8,      8,      8,      8,      8,      8,      8,      8,
  OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,
  OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,
  OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,
  OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,
  OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,
  OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,
  OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1
};
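
/*Illustrative only: each row of the table above broadcasts one of the scaled
   cosine constants from ../dct.h, which match
   (unsigned short)floor(65536*cos(i*M_PI/16)+0.5) for i=1...7 (the first row
   is simply 8).
  The sketch below assumes a platform whose <math.h> defines M_PI; the
   function name is hypothetical.*/
#include <math.h>
static unsigned short oc_idct_const_sketch(int _i){
  return (unsigned short)floor(65536*cos(_i*M_PI/16)+0.5);
}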


/*Performs the first three stages of the iDCT.
  xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input
   (accessed in that order).
  The remaining rows must be in _x at their corresponding locations.
  On output, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
   contain rows 4 through 7.*/
#define OC_IDCT_8x8_ABC(_x) \
 "#OC_IDCT_8x8_ABC\n\t" \
 /*Stage 1:*/ \
 /*2-3 rotation by 6pi/16. \
    xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
 "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
 "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
 "movdqa %%xmm1,%%xmm0\n\t" \
 "pmulhw %%xmm2,%%xmm1\n\t" \
 "movdqa %%xmm4,%%xmm7\n\t" \
 "pmulhw %%xmm6,%%xmm0\n\t" \
 "pmulhw %%xmm2,%%xmm7\n\t" \
 "pmulhw %%xmm6,%%xmm4\n\t" \
 "paddw %%xmm6,%%xmm0\n\t" \
 "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
 "paddw %%xmm1,%%xmm2\n\t" \
 "psubw %%xmm0,%%xmm7\n\t" \
 "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
 "paddw %%xmm4,%%xmm2\n\t" \
 "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
 "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
 /*5-6 rotation by 3pi/16. \
    xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
 "movdqa %%xmm4,%%xmm2\n\t" \
 "movdqa %%xmm6,%%xmm1\n\t" \
 "pmulhw %%xmm3,%%xmm4\n\t" \
 "pmulhw %%xmm5,%%xmm1\n\t" \
 "pmulhw %%xmm3,%%xmm6\n\t" \
 "pmulhw %%xmm5,%%xmm2\n\t" \
 "paddw %%xmm3,%%xmm4\n\t" \
 "paddw %%xmm5,%%xmm3\n\t" \
 "paddw %%xmm6,%%xmm3\n\t" \
 "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
 "paddw %%xmm5,%%xmm1\n\t" \
 "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
 "paddw %%xmm3,%%xmm2\n\t" \
 "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
 "psubw %%xmm4,%%xmm1\n\t" \
 "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
 /*4-7 rotation by 7pi/16. \
    xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
 "movdqa %%xmm3,%%xmm0\n\t" \
 "movdqa %%xmm4,%%xmm7\n\t" \
 "pmulhw %%xmm5,%%xmm3\n\t" \
 "pmulhw %%xmm5,%%xmm7\n\t" \
 "pmulhw %%xmm6,%%xmm4\n\t" \
 "pmulhw %%xmm6,%%xmm0\n\t" \
 "paddw %%xmm6,%%xmm4\n\t" \
 "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
 "paddw %%xmm5,%%xmm7\n\t" \
 "psubw %%xmm4,%%xmm3\n\t" \
 "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
 "paddw %%xmm7,%%xmm0\n\t" \
 "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
 /*0-1 butterfly. \
    xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
 "paddw %%xmm7,%%xmm6\n\t" \
 "movdqa %%xmm4,%%xmm5\n\t" \
 "pmulhw %%xmm6,%%xmm4\n\t" \
 "paddw %%xmm7,%%xmm7\n\t" \
 "psubw %%xmm6,%%xmm7\n\t" \
 "paddw %%xmm6,%%xmm4\n\t" \
 /*Stage 2:*/ \
 /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \
   7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \
 "movdqa %%xmm3,%%xmm6\n\t" \
 "paddw %%xmm1,%%xmm3\n\t" \
 "psubw %%xmm1,%%xmm6\n\t" \
 "movdqa %%xmm5,%%xmm1\n\t" \
 "pmulhw %%xmm7,%%xmm5\n\t" \
 "paddw %%xmm7,%%xmm5\n\t" \
 "movdqa %%xmm0,%%xmm7\n\t" \
 "paddw %%xmm2,%%xmm0\n\t" \
 "psubw %%xmm2,%%xmm7\n\t" \
 "movdqa %%xmm1,%%xmm2\n\t" \
 "pmulhw %%xmm6,%%xmm1\n\t" \
 "pmulhw %%xmm7,%%xmm2\n\t" \
 "paddw %%xmm6,%%xmm1\n\t" \
 "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
 "paddw %%xmm7,%%xmm2\n\t" \
 "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
 /*Stage 3: \
   6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
   0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
   1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
 "paddw %%xmm2,%%xmm1\n\t" \
 "paddw %%xmm5,%%xmm6\n\t" \
 "paddw %%xmm4,%%xmm7\n\t" \
 "paddw %%xmm2,%%xmm2\n\t" \
 "paddw %%xmm4,%%xmm4\n\t" \
 "paddw %%xmm5,%%xmm5\n\t" \
 "psubw %%xmm1,%%xmm2\n\t" \
 "psubw %%xmm7,%%xmm4\n\t" \
 "psubw %%xmm6,%%xmm5\n\t" \

/*Performs the last stage of the iDCT.
  On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
   contain rows 4 through 7.
  On output, xmm0 through xmm7 contain the corresponding rows.*/
#define OC_IDCT_8x8_D \
 "#OC_IDCT_8x8_D\n\t" \
 /*Stage 4: \
   0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
   1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
   2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
   3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
 "psubw %%xmm0,%%xmm7\n\t" \
 "psubw %%xmm1,%%xmm6\n\t" \
 "psubw %%xmm2,%%xmm5\n\t" \
 "psubw %%xmm3,%%xmm4\n\t" \
 "paddw %%xmm0,%%xmm0\n\t" \
 "paddw %%xmm1,%%xmm1\n\t" \
 "paddw %%xmm2,%%xmm2\n\t" \
 "paddw %%xmm3,%%xmm3\n\t" \
 "paddw %%xmm7,%%xmm0\n\t" \
 "paddw %%xmm6,%%xmm1\n\t" \
 "paddw %%xmm5,%%xmm2\n\t" \
 "paddw %%xmm4,%%xmm3\n\t" \

/*Performs the last stage of the iDCT.
  On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
   contain rows 4 through 7.
  On output, xmm0 through xmm7 contain the corresponding rows.*/
#define OC_IDCT_8x8_D_STORE \
 "#OC_IDCT_8x8_D_STORE\n\t" \
 /*Stage 4: \
   0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
   1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
   2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
   3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
 "psubw %%xmm3,%%xmm4\n\t" \
 "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
 "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
 "psubw %%xmm0,%%xmm7\n\t" \
 "psubw %%xmm1,%%xmm6\n\t" \
 "psubw %%xmm2,%%xmm5\n\t" \
 "paddw %%xmm4,%%xmm7\n\t" \
 "paddw %%xmm4,%%xmm6\n\t" \
 "paddw %%xmm4,%%xmm5\n\t" \
 "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
 "paddw %%xmm0,%%xmm0\n\t" \
 "paddw %%xmm1,%%xmm1\n\t" \
 "paddw %%xmm2,%%xmm2\n\t" \
 "paddw %%xmm3,%%xmm3\n\t" \
 "paddw %%xmm7,%%xmm0\n\t" \
 "paddw %%xmm6,%%xmm1\n\t" \
 "psraw $4,%%xmm0\n\t" \
 "paddw %%xmm5,%%xmm2\n\t" \
 "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
 "psraw $4,%%xmm1\n\t" \
 "paddw %%xmm4,%%xmm3\n\t" \
 "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
 "psraw $4,%%xmm2\n\t" \
 "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
 "psraw $4,%%xmm3\n\t" \
 "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
 "psraw $4,%%xmm4\n\t" \
 "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
 "psraw $4,%%xmm5\n\t" \
 "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
 "psraw $4,%%xmm6\n\t" \
 "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
 "psraw $4,%%xmm7\n\t" \
 "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \

static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
  OC_ALIGN16(ogg_int16_t buf[16]);
  int i;
  /*This routine accepts an 8x8 matrix pre-transposed.*/
  __asm__ __volatile__(
    /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
    "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t"
    "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t"
    "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t"
    "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t"
    OC_IDCT_8x8_ABC(x)
    OC_IDCT_8x8_D
    OC_TRANSPOSE_8x8
    /*Clear out rows 0, 1, 4, and 7 for the first stage of the iDCT.*/
    "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t"
    "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t"
    "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t"
    "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t"
    OC_IDCT_8x8_ABC(y)
    OC_IDCT_8x8_D_STORE
    :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
     [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
    :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
     [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
  );
  __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::);
  /*Clear input data for next block (decoder only).*/
  for(i=0;i<2;i++){
    __asm__ __volatile__(
      "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
      "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
      "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
      "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
      :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
    );
  }
}

/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
   need to work with four columns at a time.
  Doing this in MMX is faster on processors with a 64-bit data path.*/
#define OC_IDCT_8x8_10_MMX \
 "#OC_IDCT_8x8_10_MMX\n\t" \
 /*Stage 1:*/ \
 /*2-3 rotation by 6pi/16. \
    mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
 "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
 "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
 "pmulhw %%mm2,%%mm6\n\t" \
 "pmulhw %%mm2,%%mm7\n\t" \
 "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
 "paddw %%mm6,%%mm2\n\t" \
 "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
 "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
 "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
 /*5-6 rotation by 3pi/16. \
    mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
 "pmulhw %%mm3,%%mm5\n\t" \
 "pmulhw %%mm3,%%mm2\n\t" \
 "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
 "paddw %%mm3,%%mm5\n\t" \
 "paddw %%mm3,%%mm2\n\t" \
 "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
 /*4-7 rotation by 7pi/16. \
    mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
 "pmulhw %%mm1,%%mm3\n\t" \
 "pmulhw %%mm1,%%mm7\n\t" \
 "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
 "movq %%mm3,%%mm6\n\t" \
 "paddw %%mm1,%%mm7\n\t" \
 /*0-1 butterfly. \
    mm4=C4, mm0=X0, X4=0.*/ \
 /*Stage 2:*/ \
 /*4-5 butterfly: mm3=t[4], mm5=t[5] \
   7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
 "psubw %%mm5,%%mm3\n\t" \
 "paddw %%mm5,%%mm6\n\t" \
 "movq %%mm4,%%mm1\n\t" \
 "pmulhw %%mm0,%%mm4\n\t" \
 "paddw %%mm0,%%mm4\n\t" \
 "movq %%mm7,%%mm0\n\t" \
 "movq %%mm4,%%mm5\n\t" \
 "paddw %%mm2,%%mm0\n\t" \
 "psubw %%mm2,%%mm7\n\t" \
 "movq %%mm1,%%mm2\n\t" \
 "pmulhw %%mm6,%%mm1\n\t" \
 "pmulhw %%mm7,%%mm2\n\t" \
 "paddw %%mm6,%%mm1\n\t" \
 "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \
 "paddw %%mm7,%%mm2\n\t" \
 "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \
 /*Stage 3: \
   6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \
   0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \
   1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \
 "paddw %%mm2,%%mm1\n\t" \
 "paddw %%mm5,%%mm6\n\t" \
 "paddw %%mm4,%%mm7\n\t" \
 "paddw %%mm2,%%mm2\n\t" \
 "paddw %%mm4,%%mm4\n\t" \
 "paddw %%mm5,%%mm5\n\t" \
 "psubw %%mm1,%%mm2\n\t" \
 "psubw %%mm7,%%mm4\n\t" \
 "psubw %%mm6,%%mm5\n\t" \
 /*Stage 4: \
   0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \
   1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \
   2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \
   3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \
 "psubw %%mm0,%%mm7\n\t" \
 "psubw %%mm1,%%mm6\n\t" \
 "psubw %%mm2,%%mm5\n\t" \
 "psubw %%mm3,%%mm4\n\t" \
 "paddw %%mm0,%%mm0\n\t" \
 "paddw %%mm1,%%mm1\n\t" \
 "paddw %%mm2,%%mm2\n\t" \
 "paddw %%mm3,%%mm3\n\t" \
 "paddw %%mm7,%%mm0\n\t" \
 "paddw %%mm6,%%mm1\n\t" \
 "paddw %%mm5,%%mm2\n\t" \
 "paddw %%mm4,%%mm3\n\t" \

#define OC_IDCT_8x8_10_ABC \
 "#OC_IDCT_8x8_10_ABC\n\t" \
 /*Stage 1:*/ \
 /*2-3 rotation by 6pi/16. \
    xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
 "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
 "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
 "pmulhw %%xmm2,%%xmm6\n\t" \
 "pmulhw %%xmm2,%%xmm7\n\t" \
 "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
 "paddw %%xmm6,%%xmm2\n\t" \
 "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
 "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
 "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
 /*5-6 rotation by 3pi/16. \
    xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
 "pmulhw %%xmm3,%%xmm5\n\t" \
 "pmulhw %%xmm3,%%xmm2\n\t" \
 "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
 "paddw %%xmm3,%%xmm5\n\t" \
 "paddw %%xmm3,%%xmm2\n\t" \
 "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
 /*4-7 rotation by 7pi/16. \
    xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
 "pmulhw %%xmm1,%%xmm3\n\t" \
 "pmulhw %%xmm1,%%xmm7\n\t" \
 "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
 "movdqa %%xmm3,%%xmm6\n\t" \
 "paddw %%xmm1,%%xmm7\n\t" \
 /*0-1 butterfly. \
    xmm4=C4, xmm0=X0, X4=0.*/ \
 /*Stage 2:*/ \
 /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \
   7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
 "psubw %%xmm5,%%xmm3\n\t" \
 "paddw %%xmm5,%%xmm6\n\t" \
 "movdqa %%xmm4,%%xmm1\n\t" \
 "pmulhw %%xmm0,%%xmm4\n\t" \
 "paddw %%xmm0,%%xmm4\n\t" \
 "movdqa %%xmm7,%%xmm0\n\t" \
 "movdqa %%xmm4,%%xmm5\n\t" \
 "paddw %%xmm2,%%xmm0\n\t" \
 "psubw %%xmm2,%%xmm7\n\t" \
 "movdqa %%xmm1,%%xmm2\n\t" \
 "pmulhw %%xmm6,%%xmm1\n\t" \
 "pmulhw %%xmm7,%%xmm2\n\t" \
 "paddw %%xmm6,%%xmm1\n\t" \
 "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
 "paddw %%xmm7,%%xmm2\n\t" \
 "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
 /*Stage 3: \
   6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
   0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
   1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
 "paddw %%xmm2,%%xmm1\n\t" \
 "paddw %%xmm5,%%xmm6\n\t" \
 "paddw %%xmm4,%%xmm7\n\t" \
 "paddw %%xmm2,%%xmm2\n\t" \
 "paddw %%xmm4,%%xmm4\n\t" \
 "paddw %%xmm5,%%xmm5\n\t" \
 "psubw %%xmm1,%%xmm2\n\t" \
 "psubw %%xmm7,%%xmm4\n\t" \
 "psubw %%xmm6,%%xmm5\n\t" \

static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
  OC_ALIGN16(ogg_int16_t buf[16]);
  /*This routine accepts an 8x8 matrix pre-transposed.*/
  __asm__ __volatile__(
    "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t"
    "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t"
    "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t"
    "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t"
    OC_IDCT_8x8_10_MMX
    OC_TRANSPOSE_8x4_MMX2SSE
    OC_IDCT_8x8_10_ABC
    OC_IDCT_8x8_D_STORE
    :[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16)),
     [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
:[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
|
     [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
  );
  /*Clear input data for next block (decoder only).*/
  __asm__ __volatile__(
    "pxor %%mm0,%%mm0\n\t"
    "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
    "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
    "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
    "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
    :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
  );
}

/*Performs an inverse 8x8 Type-II DCT transform.
  The input is assumed to be scaled by a factor of 4 relative to orthonormal
  version of the transform.*/
void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
  /*_last_zzi is subtly different from an actual count of the number of
    coefficients we decoded for this block.
    It contains the value of zzi BEFORE the final token in the block was
    decoded.
    In most cases this is an EOB token (the continuation of an EOB run from a
    previous block counts), and so this is the same as the coefficient count.
    However, in the case that the last token was NOT an EOB token, but filled
    the block up with exactly 64 coefficients, _last_zzi will be less than 64.
    Provided the last token was not a pure zero run, the minimum value it can
    be is 46, and so that doesn't affect any of the cases in this routine.
    However, if the last token WAS a pure zero run of length 63, then _last_zzi
    will be 1 while the number of coefficients decoded is 64.
    Thus, we will trigger the following special case, where the real
    coefficient count would not.
    Note also that a zero run of length 64 will give _last_zzi a value of 0,
    but we still process the DC coefficient, which might have a non-zero value
    due to DC prediction.
    Although convoluted, this is arguably the correct behavior: it allows us to
    use a smaller transform when the block ends with a long zero run instead
    of a normal EOB token.
    It could be smarter... multiple separate zero runs at the end of a block
    will fool it, but an encoder that generates these really deserves what it
    gets.
    Needless to say we inherited this approach from VP3.*/
  /*Then perform the iDCT.*/
  if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x);
  else oc_idct8x8_slow_sse2(_y,_x);
}

#endif
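The pmulhw/paddw pairs in the butterfly stages above implement a Q16 fixed-point multiply: for a DCT constant 32768<c<65536, the table stores the signed 16-bit value c-65536, so the signed high product plus the original input recovers (x*c)>>16. A minimal scalar model of the idiom, not libtheora code; mul_q16 is a hypothetical name, and 46341 is the value of OC_C4S4 (cos(pi/4) in Q16):

#include <stdio.h>

/*Scalar model of the "pmulhw constant; paddw input" idiom above.
  Assumes arithmetic right shift, as on x86.*/
static short mul_q16(short _x,long _c){
  short c_stored;
  long  hi;
  c_stored=(short)(_c-65536);    /*e.g. OC_C4S4=46341 -> -19195*/
  hi=((long)_x*c_stored)>>16;    /*what pmulhw computes*/
  return (short)(hi+_x);         /*the paddw: adds back (x*65536)>>16*/
}

int main(void){
  /*Direct computation: 1000*46341>>16 == 707; the model must agree.*/
  printf("%d\n",mul_q16(1000,46341));
  return 0;
}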
242
thirdparty/libtheora/x86/sse2trans.h
vendored
Normal file
@@ -0,0 +1,242 @@
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors                      *
 * https://www.xiph.org/                                            *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/

#if !defined(_x86_sse2trans_H)
# define _x86_sse2trans_H (1)
# include "x86int.h"

# if defined(OC_X86_64_ASM)
/*On x86-64 we can transpose in-place without spilling registers.
  By clever choices of the order to apply the butterflies and the order of
  their outputs, we can take the rows in order and output the columns in order
  without any extra operations and using just one temporary register.*/
# define OC_TRANSPOSE_8x8 \
  "#OC_TRANSPOSE_8x8\n\t" \
  "movdqa %%xmm4,%%xmm8\n\t" \
  /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
  "punpcklwd %%xmm5,%%xmm4\n\t" \
  /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
  "punpckhwd %%xmm5,%%xmm8\n\t" \
  /*xmm5 is free.*/ \
  "movdqa %%xmm0,%%xmm5\n\t" \
  /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
  "punpcklwd %%xmm1,%%xmm0\n\t" \
  /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
  "punpckhwd %%xmm1,%%xmm5\n\t" \
  /*xmm1 is free.*/ \
  "movdqa %%xmm6,%%xmm1\n\t" \
  /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
  "punpcklwd %%xmm7,%%xmm6\n\t" \
  /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
  "punpckhwd %%xmm7,%%xmm1\n\t" \
  /*xmm7 is free.*/ \
  "movdqa %%xmm2,%%xmm7\n\t" \
  /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
  "punpckhwd %%xmm3,%%xmm2\n\t" \
  /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
  "punpcklwd %%xmm3,%%xmm7\n\t" \
  /*xmm3 is free.*/ \
  "movdqa %%xmm0,%%xmm3\n\t" \
  /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
  "punpckldq %%xmm7,%%xmm0\n\t" \
  /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
  "punpckhdq %%xmm7,%%xmm3\n\t" \
  /*xmm7 is free.*/ \
  "movdqa %%xmm5,%%xmm7\n\t" \
  /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
  "punpckldq %%xmm2,%%xmm5\n\t" \
  /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
  "punpckhdq %%xmm2,%%xmm7\n\t" \
  /*xmm2 is free.*/ \
  "movdqa %%xmm4,%%xmm2\n\t" \
  /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
  "punpckhdq %%xmm6,%%xmm4\n\t" \
  /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
  "punpckldq %%xmm6,%%xmm2\n\t" \
  /*xmm6 is free.*/ \
  "movdqa %%xmm8,%%xmm6\n\t" \
  /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
  "punpckldq %%xmm1,%%xmm6\n\t" \
  /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
  "punpckhdq %%xmm1,%%xmm8\n\t" \
  /*xmm1 is free.*/ \
  "movdqa %%xmm0,%%xmm1\n\t" \
  /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
  "punpcklqdq %%xmm2,%%xmm0\n\t" \
  /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
  "punpckhqdq %%xmm2,%%xmm1\n\t" \
  /*xmm2 is free.*/ \
  "movdqa %%xmm3,%%xmm2\n\t" \
  /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
  "punpckhqdq %%xmm4,%%xmm3\n\t" \
  /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
  "punpcklqdq %%xmm4,%%xmm2\n\t" \
  /*xmm4 is free.*/ \
  "movdqa %%xmm5,%%xmm4\n\t" \
  /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
  "punpckhqdq %%xmm6,%%xmm5\n\t" \
  /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
  "punpcklqdq %%xmm6,%%xmm4\n\t" \
  /*xmm6 is free.*/ \
  "movdqa %%xmm7,%%xmm6\n\t" \
  /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
  "punpckhqdq %%xmm8,%%xmm7\n\t" \
  /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
  "punpcklqdq %%xmm8,%%xmm6\n\t" \
  /*xmm8 is free.*/ \

# else
/*Otherwise, we need to spill some values to %[buf] temporarily.
  Again, the butterflies are carefully arranged to get the columns to come out
  in order, minimizing register spills and maximizing the delay between a load
  and when the value loaded is actually used.*/
# define OC_TRANSPOSE_8x8 \
  "#OC_TRANSPOSE_8x8\n\t" \
  /*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \
  "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
  /*xmm0 is free.*/ \
  "movdqa %%xmm2,%%xmm0\n\t" \
  /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
  "punpckhwd %%xmm3,%%xmm2\n\t" \
  /*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
  "punpcklwd %%xmm3,%%xmm0\n\t" \
  /*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \
  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \
  /*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \
  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
  /*xmm2 is free.*/ \
  "movdqa %%xmm6,%%xmm2\n\t" \
  /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
  "punpcklwd %%xmm7,%%xmm6\n\t" \
  /*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
  "punpckhwd %%xmm7,%%xmm2\n\t" \
  /*xmm7 is free.*/ \
  "movdqa %%xmm4,%%xmm7\n\t" \
  /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
  "punpcklwd %%xmm5,%%xmm4\n\t" \
  /*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
  "punpckhwd %%xmm5,%%xmm7\n\t" \
  /*xmm5 is free.*/ \
  "movdqa %%xmm3,%%xmm5\n\t" \
  /*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
  "punpcklwd %%xmm1,%%xmm3\n\t" \
  /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
  "punpckhwd %%xmm1,%%xmm5\n\t" \
  /*xmm1 is free.*/ \
  "movdqa %%xmm7,%%xmm1\n\t" \
  /*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
  "punpckldq %%xmm2,%%xmm7\n\t" \
  /*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
  "punpckhdq %%xmm2,%%xmm1\n\t" \
  /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \
  /*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \
  "movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \
  /*xmm1 is free.*/ \
  "movdqa %%xmm3,%%xmm1\n\t" \
  /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
  "punpckhdq %%xmm0,%%xmm3\n\t" \
  /*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
  "punpckldq %%xmm0,%%xmm1\n\t" \
  /*xmm0 is free.*/ \
  "movdqa %%xmm4,%%xmm0\n\t" \
  /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
  "punpckhdq %%xmm6,%%xmm4\n\t" \
  /*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
  "punpckldq %%xmm6,%%xmm0\n\t" \
  /*xmm6 is free.*/ \
  "movdqa %%xmm5,%%xmm6\n\t" \
  /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
  "punpckldq %%xmm2,%%xmm5\n\t" \
  /*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
  "punpckhdq %%xmm2,%%xmm6\n\t" \
  /*xmm2 is free.*/ \
  "movdqa %%xmm1,%%xmm2\n\t" \
  /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
  "punpckhqdq %%xmm0,%%xmm1\n\t" \
  /*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
  "punpcklqdq %%xmm0,%%xmm2\n\t" \
  /*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
  /*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \
  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
  /*xmm2 is free.*/ \
  "movdqa %%xmm3,%%xmm2\n\t" \
  /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
  "punpckhqdq %%xmm4,%%xmm3\n\t" \
  /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
  "punpcklqdq %%xmm4,%%xmm2\n\t" \
  /*xmm4 is free.*/ \
  "movdqa %%xmm5,%%xmm4\n\t" \
  /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
  "punpckhqdq %%xmm7,%%xmm5\n\t" \
  /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
  "punpcklqdq %%xmm7,%%xmm4\n\t" \
  /*xmm7 is free.*/ \
  "movdqa %%xmm6,%%xmm7\n\t" \
  /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
  "punpcklqdq %%xmm0,%%xmm6\n\t" \
  /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
  "punpckhqdq %%xmm0,%%xmm7\n\t" \
  /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \

# endif

/*Transpose 4 values in each of 8 MMX registers into 8 values in the first
  four SSE registers.
  No need to be clever here; we have plenty of room.*/
# define OC_TRANSPOSE_8x4_MMX2SSE \
  "#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \
  "movq2dq %%mm0,%%xmm0\n\t" \
  "movq2dq %%mm1,%%xmm1\n\t" \
  /*xmmA = b3 a3 b2 a2 b1 a1 b0 a0*/ \
  "punpcklwd %%xmm1,%%xmm0\n\t" \
  "movq2dq %%mm2,%%xmm3\n\t" \
  "movq2dq %%mm3,%%xmm2\n\t" \
  /*xmmC = d3 c3 d2 c2 d1 c1 d0 c0*/ \
  "punpcklwd %%xmm2,%%xmm3\n\t" \
  "movq2dq %%mm4,%%xmm4\n\t" \
  "movq2dq %%mm5,%%xmm5\n\t" \
  /*xmmE = f3 e3 f2 e2 f1 e1 f0 e0*/ \
  "punpcklwd %%xmm5,%%xmm4\n\t" \
  "movq2dq %%mm6,%%xmm7\n\t" \
  "movq2dq %%mm7,%%xmm6\n\t" \
  /*xmmG = h3 g3 h2 g2 h1 g1 h0 g0*/ \
  "punpcklwd %%xmm6,%%xmm7\n\t" \
  "movdqa %%xmm0,%%xmm2\n\t" \
  /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
  "punpckldq %%xmm3,%%xmm0\n\t" \
  /*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
  "punpckhdq %%xmm3,%%xmm2\n\t" \
  "movdqa %%xmm4,%%xmm5\n\t" \
  /*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
  "punpckldq %%xmm7,%%xmm4\n\t" \
  /*xmm5 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
  "punpckhdq %%xmm7,%%xmm5\n\t" \
  "movdqa %%xmm0,%%xmm1\n\t" \
  /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
  "punpcklqdq %%xmm4,%%xmm0\n\t" \
  /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
  "punpckhqdq %%xmm4,%%xmm1\n\t" \
  "movdqa %%xmm2,%%xmm3\n\t" \
  /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
  "punpcklqdq %%xmm5,%%xmm2\n\t" \
  /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
  "punpckhqdq %%xmm5,%%xmm3\n\t" \

#endif
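For reference, the same three-level unpack cascade used by OC_TRANSPOSE_8x8 above (16-bit, then 32-bit, then 64-bit interleaves) can be written with SSE2 intrinsics. This is an illustrative sketch, not part of libtheora; transpose8x8_epi16 is a hypothetical name:

#include <emmintrin.h>

/*Transpose 8 rows of 8 int16 values; _r[i] holds row i on entry and
  column i on exit.*/
static void transpose8x8_epi16(__m128i _r[8]){
  __m128i a0,a1,a2,a3,a4,a5,a6,a7;
  __m128i b0,b1,b2,b3,b4,b5,b6,b7;
  /*Pass 1: interleave adjacent row pairs 16 bits at a time.*/
  a0=_mm_unpacklo_epi16(_r[0],_r[1]);
  a1=_mm_unpackhi_epi16(_r[0],_r[1]);
  a2=_mm_unpacklo_epi16(_r[2],_r[3]);
  a3=_mm_unpackhi_epi16(_r[2],_r[3]);
  a4=_mm_unpacklo_epi16(_r[4],_r[5]);
  a5=_mm_unpackhi_epi16(_r[4],_r[5]);
  a6=_mm_unpacklo_epi16(_r[6],_r[7]);
  a7=_mm_unpackhi_epi16(_r[6],_r[7]);
  /*Pass 2: interleave 32-bit pairs to gather 4-row column pairs.*/
  b0=_mm_unpacklo_epi32(a0,a2);
  b1=_mm_unpackhi_epi32(a0,a2);
  b2=_mm_unpacklo_epi32(a1,a3);
  b3=_mm_unpackhi_epi32(a1,a3);
  b4=_mm_unpacklo_epi32(a4,a6);
  b5=_mm_unpackhi_epi32(a4,a6);
  b6=_mm_unpacklo_epi32(a5,a7);
  b7=_mm_unpackhi_epi32(a5,a7);
  /*Pass 3: join 64-bit halves into whole columns.*/
  _r[0]=_mm_unpacklo_epi64(b0,b4);
  _r[1]=_mm_unpackhi_epi64(b0,b4);
  _r[2]=_mm_unpacklo_epi64(b1,b5);
  _r[3]=_mm_unpackhi_epi64(b1,b5);
  _r[4]=_mm_unpacklo_epi64(b2,b6);
  _r[5]=_mm_unpackhi_epi64(b2,b6);
  _r[6]=_mm_unpacklo_epi64(b3,b7);
  _r[7]=_mm_unpackhi_epi64(b3,b7);
}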
182
thirdparty/libtheora/x86/x86cpu.c
vendored
Normal file
@@ -0,0 +1,182 @@
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors                      *
 * https://www.xiph.org/                                            *
 *                                                                  *
 ********************************************************************

  CPU capability detection for x86 processors.
  Originally written by Rudolf Marek.

  function:

 ********************************************************************/

#include "x86cpu.h"

#if !defined(OC_X86_ASM)
ogg_uint32_t oc_cpu_flags_get(void){
  return 0;
}
#else
# if defined(__amd64__)||defined(__x86_64__)
/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
  compiling with -fPIC.*/
# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
  __asm__ __volatile__( \
    "cpuid\n\t" \
    :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
    :"a"(_op) \
    :"cc" \
  )
# else
/*On x86-32, not so much.*/
# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
  __asm__ __volatile__( \
    "xchgl %%ebx,%[ebx]\n\t" \
    "cpuid\n\t" \
    "xchgl %%ebx,%[ebx]\n\t" \
    :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
    :"a"(_op) \
    :"cc" \
  )
# endif

static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
  ogg_uint32_t flags;
  /*If there isn't even MMX, give up.*/
  if(!(_edx&0x00800000))return 0;
  flags=OC_CPU_X86_MMX;
  if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
  if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
  if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
  if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3;
  if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
  if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
  return flags;
}

static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
  ogg_uint32_t flags;
  /*If there isn't even MMX, give up.*/
  if(!(_edx&0x00800000))return 0;
  flags=OC_CPU_X86_MMX;
  if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
  if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
  if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
  if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A;
  if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5;
  return flags;
}

ogg_uint32_t oc_cpu_flags_get(void){
  ogg_uint32_t flags;
  ogg_uint32_t eax;
  ogg_uint32_t ebx;
  ogg_uint32_t ecx;
  ogg_uint32_t edx;
# if !defined(__amd64__)&&!defined(__x86_64__)
  /*Not all x86-32 chips support cpuid, so we have to check.*/
  __asm__ __volatile__(
    "pushfl\n\t"
    "pushfl\n\t"
    "popl %[a]\n\t"
    "movl %[a],%[b]\n\t"
    "xorl $0x200000,%[a]\n\t"
    "pushl %[a]\n\t"
    "popfl\n\t"
    "pushfl\n\t"
    "popl %[a]\n\t"
    "popfl\n\t"
    :[a]"=r"(eax),[b]"=r"(ebx)
    :
    :"cc"
  );
  /*No cpuid.*/
  if(eax==ebx)return 0;
# endif
  cpuid(0,eax,ebx,ecx,edx);
  /*         l   e   t   n          I   e   n   i          u   n   e   G*/
  if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
  /*       6   8   x   M          T   e   n   i          u   n   e   G*/
   ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
    int family;
    int model;
    /*Intel, Transmeta (tested with Crusoe TM5800):*/
    cpuid(1,eax,ebx,ecx,edx);
    flags=oc_parse_intel_flags(edx,ecx);
    family=(eax>>8)&0xF;
    model=(eax>>4)&0xF;
    /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
      unit, so don't use it.*/
    if(family==6&&(model==9||model==13||model==14)){
      flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
    }
  }
  /*           D   M   A   c          i   t   n   e          h   t   u   A*/
  else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
  /*       C   S   N              y   b   e          d   o   e   G*/
   ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
    /*AMD, Geode:*/
    cpuid(0x80000000,eax,ebx,ecx,edx);
    if(eax<0x80000001)flags=0;
    else{
      cpuid(0x80000001,eax,ebx,ecx,edx);
      flags=oc_parse_amd_flags(edx,ecx);
    }
    /*Also check for SSE.*/
    cpuid(1,eax,ebx,ecx,edx);
    flags|=oc_parse_intel_flags(edx,ecx);
  }
  /*Technically some VIA chips can be configured in the BIOS to return any
    string here the user wants.
    There is a special detection method that can be used to identify such
    processors, but in my opinion, if the user really wants to change it, they
    deserve what they get.*/
  /*           s   l   u   a          H   r   u   a          t   n   e   C*/
  else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
    /*VIA:*/
    /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
      chips (thanks to the engineers from Centaur Technology who provided it).
      These chips support Intel-like cpuid info.
      The C3-2 (Nehemiah) cores appear to, as well.*/
    cpuid(1,eax,ebx,ecx,edx);
    flags=oc_parse_intel_flags(edx,ecx);
    if(eax>=0x80000001){
      /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
        We need to check this even if the Intel test succeeds to pick up 3DNow!
        support on these processors.
        Unlike actual AMD processors, we cannot _rely_ on this info, since
        some cores (e.g., the 693 stepping of the Nehemiah) claim to support
        this function, yet return edx=0, despite the Intel test indicating
        MMX support.
        Therefore the features detected here are strictly added to those
        detected by the Intel test.*/
      /*TODO: How about earlier chips?*/
      cpuid(0x80000001,eax,ebx,ecx,edx);
      /*Note: As of the C7, this function returns Intel-style extended feature
        flags, not AMD-style.
        Currently, this only defines bits 11, 20, and 29 (0x20100800), which
        do not conflict with any of the AMD flags we inspect.
        For the remaining bits, Intel tells us, "Do not count on their value",
        but VIA assures us that they will all be zero (at least on the C7 and
        Isaiah chips).
        In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
        (0xC0C00000) for something else, we will have to add code to detect
        the model to decide when it is appropriate to inspect them.*/
      flags|=oc_parse_amd_flags(edx,ecx);
    }
  }
  else{
    /*Implement me.*/
    flags=0;
  }
  return flags;
}
#endif
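The magic numbers in the vendor checks above are just the 12-byte CPUID vendor string split across EBX, EDX and ECX (e.g. 0x756E6547 is "Genu" little-endian). A hedged standalone sketch using GCC's cpuid.h wrapper instead of the raw asm above:

#include <stdio.h>
#include <string.h>
#include <cpuid.h>

int main(void){
  unsigned eax,ebx,ecx,edx;
  char vendor[13];
  /*Leaf 0 returns the vendor string; __get_cpuid fails if cpuid is
    unsupported.*/
  if(!__get_cpuid(0,&eax,&ebx,&ecx,&edx))return 1;
  memcpy(vendor,&ebx,4);
  memcpy(vendor+4,&edx,4);
  memcpy(vendor+8,&ecx,4);
  vendor[12]='\0';
  printf("%s\n",vendor);  /*"GenuineIntel", "AuthenticAMD", ...*/
  return 0;
}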
36
thirdparty/libtheora/x86/x86cpu.h
vendored
Normal file
@@ -0,0 +1,36 @@
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors                      *
 * https://www.xiph.org/                                            *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/

#if !defined(_x86_x86cpu_H)
# define _x86_x86cpu_H (1)
#include "../internal.h"

#define OC_CPU_X86_MMX      (1<<0)
#define OC_CPU_X86_3DNOW    (1<<1)
#define OC_CPU_X86_3DNOWEXT (1<<2)
#define OC_CPU_X86_MMXEXT   (1<<3)
#define OC_CPU_X86_SSE      (1<<4)
#define OC_CPU_X86_SSE2     (1<<5)
#define OC_CPU_X86_PNI      (1<<6)
#define OC_CPU_X86_SSSE3    (1<<7)
#define OC_CPU_X86_SSE4_1   (1<<8)
#define OC_CPU_X86_SSE4_2   (1<<9)
#define OC_CPU_X86_SSE4A    (1<<10)
#define OC_CPU_X86_SSE5     (1<<11)

ogg_uint32_t oc_cpu_flags_get(void);

#endif
63
thirdparty/libtheora/x86/x86enc.c
vendored
Normal file
@@ -0,0 +1,63 @@
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors                      *
 * https://www.xiph.org/                                            *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/
#include "x86enc.h"

#if defined(OC_X86_ASM)

void oc_enc_accel_init_x86(oc_enc_ctx *_enc){
  ogg_uint32_t cpu_flags;
  cpu_flags=_enc->state.cpu_flags;
  oc_enc_accel_init_c(_enc);
# if defined(OC_ENC_USE_VTABLE)
  if(cpu_flags&OC_CPU_X86_MMX){
    _enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
    _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
    _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
    _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
  }
  if(cpu_flags&OC_CPU_X86_MMXEXT){
    _enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
    _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
    _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
    _enc->opt_vtable.frag_satd=oc_enc_frag_satd_mmxext;
    _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_mmxext;
    _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
    _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmxext;
  }
  if(cpu_flags&OC_CPU_X86_SSE2){
# if defined(OC_X86_64_ASM)
    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;
# endif
    _enc->opt_vtable.frag_ssd=oc_enc_frag_ssd_sse2;
    _enc->opt_vtable.frag_border_ssd=oc_enc_frag_border_ssd_sse2;
    _enc->opt_vtable.frag_satd=oc_enc_frag_satd_sse2;
    _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_sse2;
    _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_sse2;
    _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_x86;
    _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_x86;
    _enc->opt_vtable.quantize=oc_enc_quantize_sse2;
# else
  (void) cpu_flags;
# endif
  _enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
  _enc->opt_data.enquant_table_alignment=16;
# if defined(OC_ENC_USE_VTABLE)
  }
# endif
}
#endif
114
thirdparty/libtheora/x86/x86enc.h
vendored
Normal file
@@ -0,0 +1,114 @@
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors                      *
 * https://www.xiph.org/                                            *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/

#if !defined(_x86_x86enc_H)
# define _x86_x86enc_H (1)
# include "x86int.h"

# if defined(OC_X86_ASM)
# define oc_enc_accel_init oc_enc_accel_init_x86
# if defined(OC_X86_64_ASM)
/*x86-64 guarantees SIMD support up through at least SSE2.
  If the best routine we have available only needs SSE2 (which at the moment
  covers all of them), then we can avoid runtime detection and the indirect
  call.*/
# define oc_enc_frag_sub(_enc,_diff,_x,_y,_stride) \
 oc_enc_frag_sub_mmx(_diff,_x,_y,_stride)
# define oc_enc_frag_sub_128(_enc,_diff,_x,_stride) \
 oc_enc_frag_sub_128_mmx(_diff,_x,_stride)
# define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \
 oc_enc_frag_sad_mmxext(_src,_ref,_ystride)
# define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \
 oc_enc_frag_sad_thresh_mmxext(_src,_ref,_ystride,_thresh)
# define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
 oc_enc_frag_sad2_thresh_mmxext(_src,_ref1,_ref2,_ystride,_thresh)
# define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
 oc_enc_frag_satd_sse2(_dc,_src,_ref,_ystride)
# define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \
 oc_enc_frag_satd2_sse2(_dc,_src,_ref1,_ref2,_ystride)
# define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \
 oc_enc_frag_intra_satd_sse2(_dc,_src,_ystride)
# define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \
 oc_enc_frag_ssd_sse2(_src,_ref,_ystride)
# define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \
 oc_enc_frag_border_ssd_sse2(_src,_ref,_ystride,_mask)
# define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \
 oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride)
# define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \
 oc_enc_enquant_table_init_x86(_enquant,_dequant)
# define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \
 oc_enc_enquant_table_fixup_x86(_enquant,_nqis)
# define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \
 oc_enc_quantize_sse2(_qdct,_dct,_dequant,_enquant)
# define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \
 oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
# define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \
 oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
# define oc_enc_fdct8x8(_enc,_y,_x) \
 oc_enc_fdct8x8_x86_64sse2(_y,_x)
# else
# define OC_ENC_USE_VTABLE (1)
# endif
# endif

# include "../encint.h"

void oc_enc_accel_init_x86(oc_enc_ctx *_enc);

void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
 const unsigned char *_x,const unsigned char *_y,int _stride);
void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
 const unsigned char *_x,int _stride);
unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride);
unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _thresh);
unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
 unsigned _thresh);
unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride);
unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride);
unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
 const unsigned char *_src,int _ystride);
unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
 const unsigned char *_src,int _ystride);
unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
 const unsigned char *_ref,int _ystride);
unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src1,const unsigned char *_src2,int _src_ystride);
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride);
void oc_enc_enquant_table_init_x86(void *_enquant,
 const ogg_uint16_t _dequant[64]);
void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis);
int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
 const ogg_uint16_t _dequant[64],const void *_enquant);
void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]);

# if defined(OC_X86_64_ASM)
void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
# endif

#endif
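x86enc.h above picks between two dispatch strategies: on x86-64 each accelerated entry point macro-expands to a direct call, while elsewhere OC_ENC_USE_VTABLE routes calls through the function pointers filled in by oc_enc_accel_init_x86. A minimal sketch of the pattern reduced to one entry point (every name here is a hypothetical stand-in, not the library's):

/*Compile-time vs. run-time dispatch for one accelerated routine.*/
typedef unsigned (*frag_sad_fn)(const unsigned char *_src,
 const unsigned char *_ref,int _ystride);

typedef struct my_enc_ctx{
  struct{
    frag_sad_fn frag_sad;
  }opt_vtable;
}my_enc_ctx;

unsigned my_frag_sad_best(const unsigned char *_src,
 const unsigned char *_ref,int _ystride);

#if defined(MY_STATIC_DISPATCH)
/*Baseline SIMD is guaranteed (as with SSE2 on x86-64): bind directly, so
  the call can be inlined and there is no pointer load.*/
# define my_frag_sad(_enc,_src,_ref,_ystride) \
 my_frag_sad_best(_src,_ref,_ystride)
#else
/*Baseline unknown: one indirect call through a vtable filled in at init.*/
# define my_frag_sad(_enc,_src,_ref,_ystride) \
 ((*(_enc)->opt_vtable.frag_sad)(_src,_ref,_ystride))
#endif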
149
thirdparty/libtheora/x86/x86enquant.c
vendored
Normal file
@@ -0,0 +1,149 @@
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors                      *
 * https://www.xiph.org/                                            *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/

#include "x86enc.h"

#if defined(OC_X86_ASM)


/*The default enquant table is not quite suitable for SIMD purposes.
  First, the m and l parameters need to be separated so that an entire row full
  of m's or l's can be loaded at a time.
  Second, x86 SIMD has no element-wise arithmetic right-shift, so we have to
  emulate one with a multiply.
  Therefore we translate the shift count into a scale factor.*/
void oc_enc_enquant_table_init_x86(void *_enquant,
 const ogg_uint16_t _dequant[64]){
  ogg_int16_t *m;
  ogg_int16_t *l;
  int zzi;
  m=(ogg_int16_t *)_enquant;
  l=m+64;
  for(zzi=0;zzi<64;zzi++){
    oc_iquant q;
    oc_iquant_init(&q,_dequant[zzi]);
    m[zzi]=q.m;
    /*q.l must be at least 2 for this to work; fortunately, once all the scale
      factors are baked in, the minimum quantizer is much larger than that.*/
    l[zzi]=1<<16-q.l;
  }
}

void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis){
  int pli;
  int qii;
  int qti;
  for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){
    ((ogg_int16_t *)_enquant[pli][qii][qti])[0]=
     ((ogg_int16_t *)_enquant[pli][0][qti])[0];
    ((ogg_int16_t *)_enquant[pli][qii][qti])[64]=
     ((ogg_int16_t *)_enquant[pli][0][qti])[64];
  }
}

int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
 const ogg_uint16_t _dequant[64],const void *_enquant){
  ptrdiff_t r;
  __asm__ __volatile__(
    "xor %[r],%[r]\n\t"
    /*Loop through two rows at a time.*/
    ".p2align 4\n\t"
    "0:\n\t"
    /*Load the first two rows of the data and the quant matrices.*/
    "movdqa 0x00(%[dct],%[r]),%%xmm0\n\t"
    "movdqa 0x10(%[dct],%[r]),%%xmm1\n\t"
    "movdqa 0x00(%[dq],%[r]),%%xmm2\n\t"
    "movdqa 0x10(%[dq],%[r]),%%xmm3\n\t"
    "movdqa 0x00(%[q],%[r]),%%xmm4\n\t"
    "movdqa 0x10(%[q],%[r]),%%xmm5\n\t"
    /*Double the input and propagate its sign to the rounding factor.
      Using SSSE3's psignw would help here, but we need the mask later anyway.*/
    "movdqa %%xmm0,%%xmm6\n\t"
    "psraw $15,%%xmm0\n\t"
    "movdqa %%xmm1,%%xmm7\n\t"
    "paddw %%xmm6,%%xmm6\n\t"
    "psraw $15,%%xmm1\n\t"
    "paddw %%xmm7,%%xmm7\n\t"
    "paddw %%xmm0,%%xmm2\n\t"
    "paddw %%xmm1,%%xmm3\n\t"
    "pxor %%xmm0,%%xmm2\n\t"
    "pxor %%xmm1,%%xmm3\n\t"
    /*Add the rounding factor and perform the first multiply.*/
    "paddw %%xmm2,%%xmm6\n\t"
    "paddw %%xmm3,%%xmm7\n\t"
    "pmulhw %%xmm6,%%xmm4\n\t"
    "pmulhw %%xmm7,%%xmm5\n\t"
    "movdqa 0x80(%[q],%[r]),%%xmm2\n\t"
    "movdqa 0x90(%[q],%[r]),%%xmm3\n\t"
    "paddw %%xmm4,%%xmm6\n\t"
    "paddw %%xmm5,%%xmm7\n\t"
    /*Emulate an element-wise right-shift via a second multiply.*/
    "pmulhw %%xmm2,%%xmm6\n\t"
    "pmulhw %%xmm3,%%xmm7\n\t"
    "add $32,%[r]\n\t"
    "cmp $96,%[r]\n\t"
    /*Correct for the sign.*/
    "psubw %%xmm0,%%xmm6\n\t"
    "psubw %%xmm1,%%xmm7\n\t"
    /*Save the result.*/
    "movdqa %%xmm6,-0x20(%[qdct],%[r])\n\t"
    "movdqa %%xmm7,-0x10(%[qdct],%[r])\n\t"
    "jle 0b\n\t"
    /*Now find the location of the last non-zero value.*/
    "movdqa 0x50(%[qdct]),%%xmm5\n\t"
    "movdqa 0x40(%[qdct]),%%xmm4\n\t"
    "packsswb %%xmm7,%%xmm6\n\t"
    "packsswb %%xmm5,%%xmm4\n\t"
    "pxor %%xmm0,%%xmm0\n\t"
    "mov $-1,%k[dq]\n\t"
    "pcmpeqb %%xmm0,%%xmm6\n\t"
    "pcmpeqb %%xmm0,%%xmm4\n\t"
    "pmovmskb %%xmm6,%k[q]\n\t"
    "pmovmskb %%xmm4,%k[r]\n\t"
    "shl $16,%k[q]\n\t"
    "or %k[r],%k[q]\n\t"
    "mov $32,%[r]\n\t"
    /*We have to use xor here instead of not in order to set the flags.*/
    "xor %k[dq],%k[q]\n\t"
    "jnz 1f\n\t"
    "movdqa 0x30(%[qdct]),%%xmm7\n\t"
    "movdqa 0x20(%[qdct]),%%xmm6\n\t"
    "movdqa 0x10(%[qdct]),%%xmm5\n\t"
    "movdqa 0x00(%[qdct]),%%xmm4\n\t"
    "packsswb %%xmm7,%%xmm6\n\t"
    "packsswb %%xmm5,%%xmm4\n\t"
    "pcmpeqb %%xmm0,%%xmm6\n\t"
    "pcmpeqb %%xmm0,%%xmm4\n\t"
    "pmovmskb %%xmm6,%k[q]\n\t"
    "pmovmskb %%xmm4,%k[r]\n\t"
    "shl $16,%k[q]\n\t"
    "or %k[r],%k[q]\n\t"
    "xor %[r],%[r]\n\t"
    "not %k[q]\n\t"
    "or $1,%k[q]\n\t"
    "1:\n\t"
    "bsr %k[q],%k[q]\n\t"
    "add %k[q],%k[r]\n\t"
    :[r]"=&a"(r),[q]"+r"(_enquant),[dq]"+r"(_dequant)
    :[dct]"r"(_dct),[qdct]"r"(_qdct)
    :"cc","memory"
  );
  return (int)r;
}

#endif
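Putting the two comments above together, the SSE2 loop divides each coefficient by its dequantizer using only adds and two pmulhw multiplies. A hypothetical scalar model of one lane (the 16-bit wraparound of the real SIMD code is ignored; _m is the oc_iquant reciprocal and _scale=1<<16-l is the second table half built by oc_enc_enquant_table_init_x86):

/*Scalar model of one coefficient of the loop above; illustration only.*/
static short quantize_model(short _x,unsigned short _dequant,
 short _m,unsigned short _scale){
  int s;
  int biased;
  int t;
  s=_x>>15;                          /*psraw $15: 0 or -1*/
  biased=2*_x+(((int)_dequant+s)^s); /*paddw; pxor: add +/- dequant*/
  t=biased+((biased*_m)>>16);        /*pmulhw; paddw: reciprocal multiply*/
  t=(t*_scale)>>16;                  /*pmulhw by 1<<16-l: emulates t>>l*/
  return (short)(t-s);               /*psubw: round negatives back up*/
}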
122
thirdparty/libtheora/x86/x86int.h
vendored
Normal file
@@ -0,0 +1,122 @@
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors                      *
 * https://www.xiph.org/                                            *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/

#if !defined(_x86_x86int_H)
# define _x86_x86int_H (1)
# include "../internal.h"

# if defined(OC_X86_ASM)
# define oc_state_accel_init oc_state_accel_init_x86
# if defined(OC_X86_64_ASM)
/*x86-64 guarantees SIMD support up through at least SSE2.
  If the best routine we have available only needs SSE2 (which at the moment
  covers all of them), then we can avoid runtime detection and the indirect
  call.*/
# define oc_frag_copy(_state,_dst,_src,_ystride) \
 oc_frag_copy_mmx(_dst,_src,_ystride)
# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
 _fragis,_nfragis,_frag_buf_offs) \
 oc_frag_copy_list_mmx(_dst_frame,_src_frame,_ystride, \
  _fragis,_nfragis,_frag_buf_offs)
# define oc_frag_recon_intra(_state,_dst,_ystride,_residue) \
 oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
 oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
 oc_frag_recon_inter2_mmx(_dst,_src1,_src2,_ystride,_residue)
# define oc_idct8x8(_state,_y,_x,_last_zzi) \
 oc_idct8x8_sse2(_y,_x,_last_zzi)
# define oc_state_frag_recon oc_state_frag_recon_mmx
# define oc_loop_filter_init(_state,_bv,_flimit) \
 oc_loop_filter_init_mmxext(_bv,_flimit)
# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_mmxext
# define oc_restore_fpu(_state) \
 oc_restore_fpu_mmx()
# else
# define OC_STATE_USE_VTABLE (1)
# endif
# endif

# include "../state.h"
# include "x86cpu.h"

/*Converts the expression in the argument to a string.*/
#define OC_M2STR(_s) #_s

/*Memory operands do not always include an offset.
  To avoid warnings, we force an offset with %H (which adds 8).*/
# if __GNUC_PREREQ(4,0)
# define OC_MEM_OFFS(_offs,_name) \
 OC_M2STR(_offs-8+%H[_name])
# endif
/*If your gcc version doesn't support %H, then you get to suffer the warnings.
  Note that Apple's gas breaks on things like _offs+(%esp): it throws away the
  whole offset, instead of substituting in 0 for the missing operand to +.*/
# if !defined(OC_MEM_OFFS)
# define OC_MEM_OFFS(_offs,_name) \
 OC_M2STR(_offs+%[_name])
# endif

/*Declare an array operand with an exact size.
  This tells gcc we're going to clobber this memory region, without having to
  clobber all of "memory" and lets us access local buffers directly using the
  stack pointer, without allocating a separate register to point to them.*/
#define OC_ARRAY_OPERAND(_type,_ptr,_size) \
  (*({ \
    struct{_type array_value__[(_size)];} *array_addr__=(void *)(_ptr); \
    array_addr__; \
  }))

/*Declare an array operand with an exact size.
  This tells gcc we're going to clobber this memory region, without having to
  clobber all of "memory" and lets us access local buffers directly using the
  stack pointer, without allocating a separate register to point to them.*/
#define OC_CONST_ARRAY_OPERAND(_type,_ptr,_size) \
  (*({ \
    const struct{_type array_value__[(_size)];} *array_addr__= \
     (const void *)(_ptr); \
    array_addr__; \
  }))

extern const unsigned short __attribute__((aligned(16))) OC_IDCT_CONSTS[64];

void oc_state_accel_init_x86(oc_theora_state *_state);

void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride);
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue);
void oc_frag_recon_inter_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
 int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit);
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
 signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
 signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
void oc_restore_fpu_mmx(void);

#endif
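OC_ARRAY_OPERAND above relies on a GCC statement expression to hand the asm a memory operand whose struct type covers exactly _size elements. A standalone illustration of the same trick (GCC/Clang only; ARRAY_OPERAND is a local stand-in, not the libtheora macro):

#include <stdio.h>

/*The struct type tells the compiler exactly which bytes the asm may
  touch, so no "memory" clobber is needed.*/
#define ARRAY_OPERAND(_type,_ptr,_size) \
  (*({ \
    struct{_type array_value__[(_size)];} *array_addr__=(void *)(_ptr); \
    array_addr__; \
  }))

int main(void){
  short buf[8]={1,2,3,4,5,6,7,8};
  /*The empty template stands in for real code; "+m" marks all eight
    elements as read and written.*/
  __asm__ __volatile__("":[b]"+m"(ARRAY_OPERAND(short,buf,8)));
  printf("%d\n",buf[0]);
  return 0;
}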
97
thirdparty/libtheora/x86/x86state.c
vendored
Normal file
@@ -0,0 +1,97 @@
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors                      *
 * https://www.xiph.org/                                            *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/

#include "x86int.h"

#if defined(OC_X86_ASM)

#if defined(OC_STATE_USE_VTABLE)
/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
  each quadrant of the destination.*/
static const unsigned char OC_FZIG_ZAG_MMX[128]={
   0, 8, 1, 2, 9,16,24,17,
  10, 3,32,11,18,25, 4,12,
   5,26,19,40,33,34,41,48,
  27, 6,13,20,28,21,14, 7,
  56,49,42,35,43,50,57,36,
  15,22,29,30,23,44,37,58,
  51,59,38,45,52,31,60,53,
  46,39,47,54,61,62,55,63,
  64,64,64,64,64,64,64,64,
  64,64,64,64,64,64,64,64,
  64,64,64,64,64,64,64,64,
  64,64,64,64,64,64,64,64,
  64,64,64,64,64,64,64,64,
  64,64,64,64,64,64,64,64,
  64,64,64,64,64,64,64,64,
  64,64,64,64,64,64,64,64
};
#endif

/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
  the destination.*/
static const unsigned char OC_FZIG_ZAG_SSE2[128]={
   0, 8, 1, 2, 9,16,24,17,
  10, 3, 4,11,18,25,32,40,
  33,26,19,12, 5, 6,13,20,
  27,34,41,48,56,49,42,35,
  28,21,14, 7,15,22,29,36,
  43,50,57,58,51,44,37,30,
  23,31,38,45,52,59,60,53,
  46,39,47,54,61,62,55,63,
  64,64,64,64,64,64,64,64,
  64,64,64,64,64,64,64,64,
  64,64,64,64,64,64,64,64,
  64,64,64,64,64,64,64,64,
  64,64,64,64,64,64,64,64,
  64,64,64,64,64,64,64,64,
  64,64,64,64,64,64,64,64,
  64,64,64,64,64,64,64,64
};

void oc_state_accel_init_x86(oc_theora_state *_state){
  oc_state_accel_init_c(_state);
  _state->cpu_flags=oc_cpu_flags_get();
# if defined(OC_STATE_USE_VTABLE)
  if(_state->cpu_flags&OC_CPU_X86_MMX){
    _state->opt_vtable.frag_copy=oc_frag_copy_mmx;
    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
    _state->opt_vtable.idct8x8=oc_idct8x8_mmx;
    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
    _state->opt_vtable.state_loop_filter_frag_rows=
     oc_state_loop_filter_frag_rows_mmx;
    _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
  }
  if(_state->cpu_flags&OC_CPU_X86_MMXEXT){
    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmxext;
    _state->opt_vtable.state_loop_filter_frag_rows=
     oc_state_loop_filter_frag_rows_mmxext;
  }
  if(_state->cpu_flags&OC_CPU_X86_SSE2){
    _state->opt_vtable.idct8x8=oc_idct8x8_sse2;
# endif
    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_SSE2;
# if defined(OC_STATE_USE_VTABLE)
  }
# endif
}
#endif
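The "baked transpose" in the tables above is mechanical: for OC_FZIG_ZAG_SSE2, each destination index of a plain zig-zag table has its 3-bit row and column fields swapped, while OC_FZIG_ZAG_MMX applies the same swap within each 4x4 quadrant. A hedged sketch of the full-transpose derivation (_plain is a hypothetical plain-scan table, and bake_transpose is not a libtheora function):

/*Derive an SSE2-style table: transpose every destination index; the
  padding value 64 is a dump slot past the real coefficients and stays.*/
static void bake_transpose(unsigned char _dst[128],
 const unsigned char _plain[128]){
  int i;
  for(i=0;i<128;i++){
    unsigned char j;
    j=_plain[i];
    /*Swap the row (j>>3) and column (j&7) fields of real indices.*/
    _dst[i]=(unsigned char)(j<64?(j&7)<<3|j>>3:64);
  }
}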
244
thirdparty/libtheora/x86/x86zigzag.h
vendored
Normal file
@@ -0,0 +1,244 @@
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors                      *
 * https://www.xiph.org/                                            *
 *                                                                  *
 ********************************************************************

  function:

 ********************************************************************/

#if !defined(_x86_x86zigzag_H)
# define _x86_x86zigzag_H (1)
# include "x86enc.h"


/*Converts DCT coefficients from transposed order into zig-zag scan order and
  stores them in %[y].
  This relies on two macros to load the contents of each row:
  OC_ZZ_LOAD_ROW_LO(row,"reg") and OC_ZZ_LOAD_ROW_HI(row,"reg"), which load
  the first four and second four entries of each row into the specified
  register, respectively.
  OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row
  (because when the rows are already in SSE2 registers, loading the high half
  destructively modifies the register).
  The index of each output element in the original 64-element array should wind
  up in the following 8x8 matrix (the letters indicate the order we compute
  each 4-tuple below):
    A  0  8  1  2  9 16 24 17 B
    C 10  3  4 11 18 25 32 40 E
    F 33 26 19 12  5  6 13 20 D
    G 27 34 41 48 56 49 42 35 I
    L 28 21 14  7 15 22 29 36 M
    H 43 50 57 58 51 44 37 30 O
    N 23 31 38 45 52 59 60 53 J
    P 46 39 47 54 61 62 55 63 K
  The order of the coefficients within each tuple is reversed in the comments
  below to reflect the usual MSB to LSB notation.*/
#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \
  OC_ZZ_LOAD_ROW_LO(0,"%%mm0") /*mm0=03 02 01 00*/ \
  OC_ZZ_LOAD_ROW_LO(1,"%%mm1") /*mm1=11 10 09 08*/ \
  OC_ZZ_LOAD_ROW_LO(2,"%%mm2") /*mm2=19 18 17 16*/ \
  OC_ZZ_LOAD_ROW_LO(3,"%%mm3") /*mm3=27 26 25 24*/ \
  OC_ZZ_LOAD_ROW_HI(0,"%%mm4") /*mm4=07 06 05 04*/ \
  OC_ZZ_LOAD_ROW_HI(1,"%%mm5") /*mm5=15 14 13 12*/ \
  OC_ZZ_LOAD_ROW_HI(2,"%%mm6") /*mm6=23 22 21 20*/ \
  "movq %%mm0,%%mm7\n\t" /*mm7=03 02 01 00*/ \
  "punpckhdq %%mm1,%%mm0\n\t" /*mm0=11 10 03 02*/ \
  "pshufw $0x39,%%mm4,%%mm4\n\t" /*mm4=04 07 06 05*/ \
  "punpcklwd %%mm0,%%mm1\n\t" /*mm1=03 09 02 08*/ \
  "pshufw $0x39,%%mm5,%%mm5\n\t" /*mm5=12 15 14 13*/ \
  "punpcklwd %%mm1,%%mm7\n\t" /*mm7=02 01 08 00 *A*/ \
  "movq %%mm7,0x00(%[y])\n\t" \
  "punpckhwd %%mm4,%%mm1\n\t" /*mm1=04 03 07 09*/ \
  "movq %%mm2,%%mm7\n\t" /*mm7=19 18 17 16*/ \
  "punpckhdq %%mm1,%%mm0\n\t" /*mm0=04 03 11 10*/ \
  "punpckhwd %%mm5,%%mm7\n\t" /*mm7=12 19 15 18*/ \
  "punpcklwd %%mm3,%%mm1\n\t" /*mm1=25 07 24 09*/ \
  "punpcklwd %%mm6,%%mm5\n\t" /*mm5=21 14 20 13*/ \
  "punpcklwd %%mm2,%%mm1\n\t" /*mm1=17 24 16 09 *B*/ \
  OC_ZZ_LOAD_ROW_LO(4,"%%mm2") /*mm2=35 34 33 32*/ \
  "movq %%mm1,0x08(%[y])\n\t" \
  OC_ZZ_LOAD_ROW_LO(5,"%%mm1") /*mm1=43 42 41 40*/ \
  "pshufw $0x78,%%mm0,%%mm0\n\t" /*mm0=11 04 03 10 *C*/ \
  "movq %%mm0,0x10(%[y])\n\t" \
  "punpckhdq %%mm4,%%mm6\n\t" /*mm6=?? 07 23 22*/ \
  "punpckldq %%mm5,%%mm4\n\t" /*mm4=20 13 06 05 *D*/ \
  "movq %%mm4,0x28(%[y])\n\t" \
  "psrlq $16,%%mm3\n\t" /*mm3=.. 27 26 25*/ \
  "pshufw $0x0E,%%mm2,%%mm0\n\t" /*mm0=?? ?? 35 34*/ \
  "movq %%mm7,%%mm4\n\t" /*mm4=12 19 15 18*/ \
  "punpcklwd %%mm3,%%mm2\n\t" /*mm2=26 33 25 32*/ \
  "punpcklwd %%mm1,%%mm4\n\t" /*mm4=41 15 40 18*/ \
  "punpckhwd %%mm1,%%mm3\n\t" /*mm3=43 .. 42 27*/ \
  "punpckldq %%mm2,%%mm4\n\t" /*mm4=25 32 40 18*/ \
  "punpcklwd %%mm0,%%mm3\n\t" /*mm3=35 42 34 27*/ \
  OC_ZZ_LOAD_ROW_LO(6,"%%mm0") /*mm0=51 50 49 48*/ \
  "pshufw $0x6C,%%mm4,%%mm4\n\t" /*mm4=40 32 25 18 *E*/ \
  "movq %%mm4,0x18(%[y])\n\t" \
  OC_ZZ_LOAD_ROW_LO(7,"%%mm4") /*mm4=59 58 57 56*/ \
  "punpckhdq %%mm7,%%mm2\n\t" /*mm2=12 19 26 33 *F*/ \
  "movq %%mm2,0x20(%[y])\n\t" \
  "pshufw $0xD0,%%mm1,%%mm1\n\t" /*mm1=43 41 ?? ??*/ \
  "pshufw $0x87,%%mm0,%%mm0\n\t" /*mm0=50 48 49 51*/ \
  "movq %%mm3,%%mm2\n\t" /*mm2=35 42 34 27*/ \
  "punpckhwd %%mm0,%%mm1\n\t" /*mm1=50 43 48 41*/ \
  "pshufw $0x93,%%mm4,%%mm4\n\t" /*mm4=58 57 56 59*/ \
  "punpckldq %%mm1,%%mm3\n\t" /*mm3=48 41 34 27 *G*/ \
  "movq %%mm3,0x30(%[y])\n\t" \
  "punpckhdq %%mm4,%%mm1\n\t" /*mm1=58 57 50 43 *H*/ \
  "movq %%mm1,0x50(%[y])\n\t" \
  OC_ZZ_LOAD_ROW_HI(7,"%%mm1") /*mm1=63 62 61 60*/ \
  "punpcklwd %%mm0,%%mm4\n\t" /*mm4=49 56 51 59*/ \
  OC_ZZ_LOAD_ROW_HI(6,"%%mm0") /*mm0=55 54 53 52*/ \
  "psllq $16,%%mm6\n\t" /*mm6=07 23 22 ..*/ \
  "movq %%mm4,%%mm3\n\t" /*mm3=49 56 51 59*/ \
  "punpckhdq %%mm2,%%mm4\n\t" /*mm4=35 42 49 56 *I*/ \
  OC_ZZ_LOAD_ROW_HI(3,"%%mm2") /*mm2=31 30 29 28*/ \
  "movq %%mm4,0x38(%[y])\n\t" \
  "punpcklwd %%mm1,%%mm3\n\t" /*mm3=61 51 60 59*/ \
  "punpcklwd %%mm6,%%mm7\n\t" /*mm7=22 15 .. ??*/ \
  "movq %%mm3,%%mm4\n\t" /*mm4=61 51 60 59*/ \
  "punpcklwd %%mm0,%%mm3\n\t" /*mm3=53 60 52 59*/ \
  "punpckhwd %%mm0,%%mm4\n\t" /*mm4=55 61 54 51*/ \
  OC_ZZ_LOAD_ROW_HI(4,"%%mm0") /*mm0=39 38 37 36*/ \
  "pshufw $0xE1,%%mm3,%%mm3\n\t" /*mm3=53 60 59 52 *J*/ \
  "movq %%mm3,0x68(%[y])\n\t" \
  "movq %%mm4,%%mm3\n\t" /*mm3=?? ?? 54 51*/ \
  "pshufw $0x39,%%mm2,%%mm2\n\t" /*mm2=28 31 30 29*/ \
  "punpckhwd %%mm1,%%mm4\n\t" /*mm4=63 55 62 61 *K*/ \
  OC_ZZ_LOAD_ROW_HI(5,"%%mm1") /*mm1=47 46 45 44*/ \
  "movq %%mm4,0x78(%[y])\n\t" \
  "punpckhwd %%mm2,%%mm6\n\t" /*mm6=28 07 31 23*/ \
  "punpcklwd %%mm0,%%mm2\n\t" /*mm2=37 30 36 29*/ \
  "punpckhdq %%mm6,%%mm5\n\t" /*mm5=28 07 21 14*/ \
  "pshufw $0x4B,%%mm2,%%mm2\n\t" /*mm2=36 29 30 37*/ \
  "pshufw $0x87,%%mm5,%%mm5\n\t" /*mm5=07 14 21 28 *L*/ \
  "movq %%mm5,0x40(%[y])\n\t" \
  "punpckhdq %%mm2,%%mm7\n\t" /*mm7=36 29 22 15 *M*/ \
  "movq %%mm7,0x48(%[y])\n\t" \
  "pshufw $0x9C,%%mm1,%%mm1\n\t" /*mm1=46 45 47 44*/ \
  "punpckhwd %%mm1,%%mm0\n\t" /*mm0=46 39 45 38*/ \
  "punpcklwd %%mm1,%%mm3\n\t" /*mm3=47 54 44 51*/ \
  "punpckldq %%mm0,%%mm6\n\t" /*mm6=45 38 31 23 *N*/ \
  "movq %%mm6,0x60(%[y])\n\t" \
  "punpckhdq %%mm3,%%mm0\n\t" /*mm0=47 54 46 39*/ \
  "punpckldq %%mm2,%%mm3\n\t" /*mm3=30 37 44 51 *O*/ \
  "movq %%mm3,0x58(%[y])\n\t" \
  "pshufw $0xB1,%%mm0,%%mm0\n\t" /*mm0=54 47 39 46 *P*/ \
  "movq %%mm0,0x70(%[y])\n\t" \

/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan
  order and stores them in %[qdct].
  The index of each output element in the original 64-element array should wind
  up in the following 8x8 matrix (the letters indicate the order we compute
  each 4-tuple below):
    A  0  1  8 16  9  2  3 10 B
    C 17 24 32 25 18 11  4  5 D
    E 12 19 26 33 40 48 41 34 I
    H 27 20 13  6  7 14 21 28 G
    K 35 42 49 56 57 50 43 36 J
    F 29 22 15 23 30 37 44 51 M
    P 58 59 52 45 38 31 39 46 L
    N 53 60 61 54 47 55 62 63 O
  The order of the coefficients within each tuple is reversed in the comments
  below to reflect the usual MSB to LSB notation.*/
#define OC_ZIG_ZAG_MMXEXT \
  "movq 0x00(%[dct]),%%mm0\n\t" /*mm0=03 02 01 00*/ \
  "movq 0x08(%[dct]),%%mm1\n\t" /*mm1=07 06 05 04*/ \
  "movq 0x10(%[dct]),%%mm2\n\t" /*mm2=11 10 09 08*/ \
  "movq 0x20(%[dct]),%%mm3\n\t" /*mm3=19 18 17 16*/ \
  "movq 0x30(%[dct]),%%mm4\n\t" /*mm4=27 26 25 24*/ \
  "movq 0x40(%[dct]),%%mm5\n\t" /*mm5=35 34 33 32*/ \
  "movq %%mm2,%%mm7\n\t" /*mm7=11 10 09 08*/ \
  "punpcklwd %%mm3,%%mm2\n\t" /*mm2=17 09 16 08*/ \
  "movq %%mm0,%%mm6\n\t" /*mm6=03 02 01 00*/ \
  "punpckldq %%mm2,%%mm0\n\t" /*mm0=16 08 01 00 *A*/ \
  "movq %%mm0,0x00(%[qdct])\n\t" \
  "movq 0x18(%[dct]),%%mm0\n\t" /*mm0=15 14 13 12*/ \
  "punpckhdq %%mm6,%%mm6\n\t" /*mm6=03 02 03 02*/ \
  "psrlq $16,%%mm7\n\t" /*mm7=.. 11 10 09*/ \
  "punpckldq %%mm7,%%mm6\n\t" /*mm6=10 09 03 02*/ \
  "punpckhwd %%mm7,%%mm3\n\t" /*mm3=.. 19 11 18*/ \
  "pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
  "movq %%mm6,0x08(%[qdct])\n\t" \
  "psrlq $48,%%mm2\n\t" /*mm2=.. .. .. 17*/ \
  "movq %%mm1,%%mm6\n\t" /*mm6=07 06 05 04*/ \
  "punpcklwd %%mm5,%%mm2\n\t" /*mm2=33 .. 32 17*/ \
  "movq %%mm3,%%mm7\n\t" /*mm7=.. 19 11 18*/ \
  "punpckldq %%mm1,%%mm3\n\t" /*mm3=05 04 11 18 *C*/ \
  "por %%mm2,%%mm7\n\t" /*mm7=33 19 ?? ??*/ \
  "punpcklwd %%mm4,%%mm2\n\t" /*mm2=25 32 24 17 *D**/ \
  "movq %%mm2,0x10(%[qdct])\n\t" \
  "movq %%mm3,0x18(%[qdct])\n\t" \
  "movq 0x28(%[dct]),%%mm2\n\t" /*mm2=23 22 21 20*/ \
  "movq 0x38(%[dct]),%%mm1\n\t" /*mm1=31 30 29 28*/ \
  "pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
  "punpckhdq %%mm7,%%mm7\n\t" /*mm7=33 19 33 19*/ \
  "punpckhwd %%mm3,%%mm6\n\t" /*mm6=14 07 13 06*/ \
  "punpckldq %%mm0,%%mm0\n\t" /*mm0=13 12 13 12*/ \
  "punpcklwd %%mm1,%%mm3\n\t" /*mm3=29 15 28 12*/ \
  "punpckhwd %%mm4,%%mm0\n\t" /*mm0=27 13 26 12*/ \
  "pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
  "psrlq $48,%%mm4\n\t" /*mm4=.. .. .. 27*/ \
  "punpcklwd %%mm7,%%mm0\n\t" /*mm0=33 26 19 12 *E*/ \
  "punpcklwd %%mm1,%%mm4\n\t" /*mm4=29 .. 28 27*/ \
  "punpckhwd %%mm2,%%mm3\n\t" /*mm3=23 15 22 29 *F*/ \
  "movq %%mm0,0x20(%[qdct])\n\t" \
  "movq %%mm3,0x50(%[qdct])\n\t" \
  "movq 0x60(%[dct]),%%mm3\n\t" /*mm3=51 50 49 48*/ \
  "movq 0x70(%[dct]),%%mm7\n\t" /*mm7=59 58 57 56*/ \
  "movq 0x50(%[dct]),%%mm0\n\t" /*mm0=43 42 41 40*/ \
  "punpcklwd %%mm4,%%mm2\n\t" /*mm2=28 21 27 20*/ \
  "psrlq $32,%%mm5\n\t" /*mm5=.. .. 35 34*/ \
  "movq %%mm2,%%mm4\n\t" /*mm4=28 21 27 20*/ \
  "punpckldq %%mm6,%%mm2\n\t" /*mm2=13 06 27 20*/ \
  "punpckhdq %%mm4,%%mm6\n\t" /*mm6=28 21 14 07 *G*/ \
  "movq %%mm3,%%mm4\n\t" /*mm4=51 50 49 48*/ \
  "pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
  "movq %%mm2,0x30(%[qdct])\n\t" \
  "movq %%mm6,0x38(%[qdct])\n\t" \
  "movq 0x48(%[dct]),%%mm2\n\t" /*mm2=39 38 37 36*/ \
  "punpcklwd %%mm5,%%mm4\n\t" /*mm4=35 49 34 48*/ \
  "movq 0x58(%[dct]),%%mm5\n\t" /*mm5=47 46 45 44*/ \
  "punpckldq %%mm7,%%mm6\n\t" /*mm6=57 56 14 07*/ \
  "psrlq $32,%%mm3\n\t" /*mm3=.. .. 51 50*/ \
  "punpckhwd %%mm0,%%mm6\n\t" /*mm6=43 57 42 56*/ \
  "punpcklwd %%mm4,%%mm0\n\t" /*mm0=34 41 48 40 *I*/ \
  "pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
  "movq %%mm0,0x28(%[qdct])\n\t" \
  "punpcklwd %%mm2,%%mm3\n\t" /*mm3=37 51 36 50*/ \
  "punpckhwd %%mm6,%%mm4\n\t" /*mm4=42 35 56 49*/ \
  "punpcklwd %%mm3,%%mm6\n\t" /*mm6=36 43 50 57 *J*/ \
  "pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
  "movq %%mm4,0x40(%[qdct])\n\t" \
  "movq %%mm6,0x48(%[qdct])\n\t" \
  "movq 0x68(%[dct]),%%mm6\n\t" /*mm6=55 54 53 52*/ \
  "movq 0x78(%[dct]),%%mm0\n\t" /*mm0=63 62 61 60*/ \
  "psrlq $32,%%mm1\n\t" /*mm1=.. .. 31 30*/ \
  "pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
  "pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
  "punpcklwd %%mm5,%%mm1\n\t" /*mm1=46 31 44 30*/ \
  "pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
  "punpckhwd %%mm1,%%mm2\n\t" /*mm2=46 39 31 38 *L*/ \
  "punpcklwd %%mm3,%%mm1\n\t" /*mm1=51 44 37 30 *M*/ \
  "movq %%mm2,0x68(%[qdct])\n\t" \
  "movq %%mm1,0x58(%[qdct])\n\t" \
  "punpckhwd %%mm6,%%mm5\n\t" /*mm5=55 47 52 45*/ \
  "punpckldq %%mm0,%%mm6\n\t" /*mm6=61 60 54 53*/ \
  "pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
  "pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=53 60 61 54 *N*/ \
  "punpckhdq %%mm0,%%mm5\n\t" /*mm5=63 62 55 47 *O*/ \
  "punpckhdq %%mm4,%%mm7\n\t" /*mm7=45 52 59 58 *P*/ \
  "movq %%mm6,0x70(%[qdct])\n\t" \
  "movq %%mm5,0x78(%[qdct])\n\t" \
  "movq %%mm7,0x60(%[qdct])\n\t" \

#endif
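A scalar equivalent of OC_ZIG_ZAG_MMXEXT above, using the 8x8 matrix from its comment flattened row by row (hypothetical illustration; ZZ_SCAN and zig_zag_c are not libtheora names, and the ogg_int16_t typedef is assumed to come from the library headers):

/*Gather natural-order coefficients into zig-zag scan order.*/
static const unsigned char ZZ_SCAN[64]={
   0, 1, 8,16, 9, 2, 3,10,
  17,24,32,25,18,11, 4, 5,
  12,19,26,33,40,48,41,34,
  27,20,13, 6, 7,14,21,28,
  35,42,49,56,57,50,43,36,
  29,22,15,23,30,37,44,51,
  58,59,52,45,38,31,39,46,
  53,60,61,54,47,55,62,63
};

static void zig_zag_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64]){
  int zzi;
  for(zzi=0;zzi<64;zzi++)_qdct[zzi]=_dct[ZZ_SCAN[zzi]];
}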