initial commit, 4.5 stable

commit 9d30169a8d, 2025-09-16 20:46:46 -04:00
13378 changed files with 7050105 additions and 0 deletions

thirdparty/libtheora/x86_vc/mmxencfrag.c (vendored, new file)

@@ -0,0 +1,982 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
#include <stddef.h>
#include "x86enc.h"
#if defined(OC_X86_ASM)
unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
const unsigned char *_ref,int _ystride){
ptrdiff_t ret;
__asm{
#define SRC esi
#define REF edx
#define YSTRIDE ecx
#define YSTRIDE3 edi
mov YSTRIDE,_ystride
mov SRC,_src
mov REF,_ref
/*Load the first 4 rows of each block.*/
movq mm0,[SRC]
movq mm1,[REF]
movq mm2,[SRC][YSTRIDE]
movq mm3,[REF][YSTRIDE]
lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
movq mm4,[SRC+YSTRIDE*2]
movq mm5,[REF+YSTRIDE*2]
movq mm6,[SRC+YSTRIDE3]
movq mm7,[REF+YSTRIDE3]
/*Compute their SADs and add them in mm0*/
psadbw mm0,mm1
psadbw mm2,mm3
lea SRC,[SRC+YSTRIDE*4]
paddw mm0,mm2
lea REF,[REF+YSTRIDE*4]
/*Load the next 3 rows as registers become available.*/
movq mm2,[SRC]
movq mm3,[REF]
psadbw mm4,mm5
psadbw mm6,mm7
paddw mm0,mm4
movq mm5,[REF+YSTRIDE]
movq mm4,[SRC+YSTRIDE]
paddw mm0,mm6
movq mm7,[REF+YSTRIDE*2]
movq mm6,[SRC+YSTRIDE*2]
/*Start adding their SADs to mm0*/
psadbw mm2,mm3
psadbw mm4,mm5
paddw mm0,mm2
psadbw mm6,mm7
/*Load last row as registers become available.*/
movq mm2,[SRC+YSTRIDE3]
movq mm3,[REF+YSTRIDE3]
/*And finish adding up their SADs.*/
paddw mm0,mm4
psadbw mm2,mm3
paddw mm0,mm6
paddw mm0,mm2
movd [ret],mm0
#undef SRC
#undef REF
#undef YSTRIDE
#undef YSTRIDE3
}
return (unsigned)ret;
}
unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
const unsigned char *_ref,int _ystride,unsigned _thresh){
/*Early termination is for suckers.*/
return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
}
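/*Illustrative sketch (not part of libtheora): a plain-C reference for the 8x8
   SAD that oc_enc_frag_sad_mmxext() computes above.
  Each psadbw sums the absolute differences of eight byte pairs; the assembly
   accumulates eight such row sums into mm0.*/
#if 0
static unsigned oc_enc_frag_sad_c_sketch(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned sad;
  int      i;
  int      j;
  sad=0;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int d;
      d=_src[j]-_ref[j];
      sad+=d<0?-d:d;
    }
    _src+=_ystride;
    _ref+=_ystride;
  }
  return sad;
}
#endif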
#define OC_SAD2_LOOP __asm{ \
/*We want to compute (mm0+mm1>>1) on unsigned bytes without overflow, but \
pavgb computes (mm0+mm1+1>>1). \
The latter is exactly 1 too large when the low bit of two corresponding \
bytes is only set in one of them. \
Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
correct the output of pavgb.*/ \
__asm movq mm6,mm0 \
__asm lea REF1,[REF1+YSTRIDE*2] \
__asm pxor mm0,mm1 \
__asm pavgb mm6,mm1 \
__asm lea REF2,[REF2+YSTRIDE*2] \
__asm movq mm1,mm2 \
__asm pand mm0,mm7 \
__asm pavgb mm2,mm3 \
__asm pxor mm1,mm3 \
__asm movq mm3,[REF2+YSTRIDE] \
__asm psubb mm6,mm0 \
__asm movq mm0,[REF1] \
__asm pand mm1,mm7 \
__asm psadbw mm4,mm6 \
__asm movd mm6,RET \
__asm psubb mm2,mm1 \
__asm movq mm1,[REF2] \
__asm lea SRC,[SRC+YSTRIDE*2] \
__asm psadbw mm5,mm2 \
__asm movq mm2,[REF1+YSTRIDE] \
__asm paddw mm5,mm4 \
__asm movq mm4,[SRC] \
__asm paddw mm6,mm5 \
__asm movq mm5,[SRC+YSTRIDE] \
__asm movd RET,mm6 \
}
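/*Illustrative sketch (not part of libtheora): the truncating byte average that
   OC_SAD2_LOOP builds, in scalar form.
  pavgb computes (_a+_b+1)>>1; subtracting ((_a^_b)&1) removes the rounding
   bias, which is exactly what the pxor/pand/psubb sequence above does for
   eight bytes at a time using the {1}x8 mask in mm7.*/
#if 0
static unsigned char oc_avg_trunc_sketch(unsigned char _a,unsigned char _b){
  unsigned char avg_round;
  /*Rounded-up average, as pavgb computes it.*/
  avg_round=(unsigned char)((_a+_b+1)>>1);
  /*The result is 1 too large exactly when the low bits of _a and _b differ.*/
  return (unsigned char)(avg_round-((_a^_b)&1));
}
#endif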
/*Same as above, but does not pre-load the next two rows.*/
#define OC_SAD2_TAIL __asm{ \
__asm movq mm6,mm0 \
__asm pavgb mm0,mm1 \
__asm pxor mm6,mm1 \
__asm movq mm1,mm2 \
__asm pand mm6,mm7 \
__asm pavgb mm2,mm3 \
__asm pxor mm1,mm3 \
__asm psubb mm0,mm6 \
__asm pand mm1,mm7 \
__asm psadbw mm4,mm0 \
__asm psubb mm2,mm1 \
__asm movd mm6,RET \
__asm psadbw mm5,mm2 \
__asm paddw mm5,mm4 \
__asm paddw mm6,mm5 \
__asm movd RET,mm6 \
}
unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
unsigned _thresh){
ptrdiff_t ret;
__asm{
#define REF1 ecx
#define REF2 edi
#define YSTRIDE esi
#define SRC edx
#define RET eax
mov YSTRIDE,_ystride
mov SRC,_src
mov REF1,_ref1
mov REF2,_ref2
movq mm0,[REF1]
movq mm1,[REF2]
movq mm2,[REF1+YSTRIDE]
movq mm3,[REF2+YSTRIDE]
xor RET,RET
movq mm4,[SRC]
pxor mm7,mm7
pcmpeqb mm6,mm6
movq mm5,[SRC+YSTRIDE]
psubb mm7,mm6
OC_SAD2_LOOP
OC_SAD2_LOOP
OC_SAD2_LOOP
OC_SAD2_TAIL
mov [ret],RET
#undef REF1
#undef REF2
#undef YSTRIDE
#undef SRC
#undef RET
}
return (unsigned)ret;
}
/*Load an 8x4 array of pixel values from SRC and REF and compute their
16-bit difference in mm0...mm7.*/
#define OC_LOAD_SUB_8x4(_off) __asm{ \
__asm movd mm0,[_off+SRC] \
__asm movd mm4,[_off+REF] \
__asm movd mm1,[_off+SRC+SRC_YSTRIDE] \
__asm lea SRC,[SRC+SRC_YSTRIDE*2] \
__asm movd mm5,[_off+REF+REF_YSTRIDE] \
__asm lea REF,[REF+REF_YSTRIDE*2] \
__asm movd mm2,[_off+SRC] \
__asm movd mm7,[_off+REF] \
__asm movd mm3,[_off+SRC+SRC_YSTRIDE] \
__asm movd mm6,[_off+REF+REF_YSTRIDE] \
__asm punpcklbw mm0,mm4 \
__asm lea SRC,[SRC+SRC_YSTRIDE*2] \
__asm punpcklbw mm4,mm4 \
__asm lea REF,[REF+REF_YSTRIDE*2] \
__asm psubw mm0,mm4 \
__asm movd mm4,[_off+SRC] \
__asm movq [_off*2+BUF],mm0 \
__asm movd mm0,[_off+REF] \
__asm punpcklbw mm1,mm5 \
__asm punpcklbw mm5,mm5 \
__asm psubw mm1,mm5 \
__asm movd mm5,[_off+SRC+SRC_YSTRIDE] \
__asm punpcklbw mm2,mm7 \
__asm punpcklbw mm7,mm7 \
__asm psubw mm2,mm7 \
__asm movd mm7,[_off+REF+REF_YSTRIDE] \
__asm punpcklbw mm3,mm6 \
__asm lea SRC,[SRC+SRC_YSTRIDE*2] \
__asm punpcklbw mm6,mm6 \
__asm psubw mm3,mm6 \
__asm movd mm6,[_off+SRC] \
__asm punpcklbw mm4,mm0 \
__asm lea REF,[REF+REF_YSTRIDE*2] \
__asm punpcklbw mm0,mm0 \
__asm lea SRC,[SRC+SRC_YSTRIDE*2] \
__asm psubw mm4,mm0 \
__asm movd mm0,[_off+REF] \
__asm punpcklbw mm5,mm7 \
__asm neg SRC_YSTRIDE \
__asm punpcklbw mm7,mm7 \
__asm psubw mm5,mm7 \
__asm movd mm7,[_off+SRC+SRC_YSTRIDE] \
__asm punpcklbw mm6,mm0 \
__asm lea REF,[REF+REF_YSTRIDE*2] \
__asm punpcklbw mm0,mm0 \
__asm neg REF_YSTRIDE \
__asm psubw mm6,mm0 \
__asm movd mm0,[_off+REF+REF_YSTRIDE] \
__asm lea SRC,[SRC+SRC_YSTRIDE*8] \
__asm punpcklbw mm7,mm0 \
__asm neg SRC_YSTRIDE \
__asm punpcklbw mm0,mm0 \
__asm lea REF,[REF+REF_YSTRIDE*8] \
__asm psubw mm7,mm0 \
__asm neg REF_YSTRIDE \
__asm movq mm0,[_off*2+BUF] \
}
/*Load an 8x4 array of pixel values from SRC into mm0...mm7.*/
#define OC_LOAD_8x4(_off) __asm{ \
__asm movd mm0,[_off+SRC] \
__asm movd mm1,[_off+SRC+YSTRIDE] \
__asm movd mm2,[_off+SRC+YSTRIDE*2] \
__asm pxor mm7,mm7 \
__asm movd mm3,[_off+SRC+YSTRIDE3] \
__asm punpcklbw mm0,mm7 \
__asm movd mm4,[_off+SRC4] \
__asm punpcklbw mm1,mm7 \
__asm movd mm5,[_off+SRC4+YSTRIDE] \
__asm punpcklbw mm2,mm7 \
__asm movd mm6,[_off+SRC4+YSTRIDE*2] \
__asm punpcklbw mm3,mm7 \
__asm movd mm7,[_off+SRC4+YSTRIDE3] \
__asm punpcklbw mm4,mm4 \
__asm punpcklbw mm5,mm5 \
__asm psrlw mm4,8 \
__asm psrlw mm5,8 \
__asm punpcklbw mm6,mm6 \
__asm punpcklbw mm7,mm7 \
__asm psrlw mm6,8 \
__asm psrlw mm7,8 \
}
/*Performs the first two stages of an 8-point 1-D Hadamard transform.
The transform is performed in place, except that outputs 0-3 are swapped with
outputs 4-7.
Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
perform this stage in place with no temporary registers).*/
#define OC_HADAMARD_AB_8x4 __asm{ \
/*Stage A: \
Outputs 0-3 are swapped with 4-7 here.*/ \
__asm paddw mm5,mm1 \
__asm paddw mm6,mm2 \
__asm paddw mm1,mm1 \
__asm paddw mm2,mm2 \
__asm psubw mm1,mm5 \
__asm psubw mm2,mm6 \
__asm paddw mm7,mm3 \
__asm paddw mm4,mm0 \
__asm paddw mm3,mm3 \
__asm paddw mm0,mm0 \
__asm psubw mm3,mm7 \
__asm psubw mm0,mm4 \
/*Stage B:*/ \
__asm paddw mm0,mm2 \
__asm paddw mm1,mm3 \
__asm paddw mm4,mm6 \
__asm paddw mm5,mm7 \
__asm paddw mm2,mm2 \
__asm paddw mm3,mm3 \
__asm paddw mm6,mm6 \
__asm paddw mm7,mm7 \
__asm psubw mm2,mm0 \
__asm psubw mm3,mm1 \
__asm psubw mm6,mm4 \
__asm psubw mm7,mm5 \
}
/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
place with no temporary registers).*/
#define OC_HADAMARD_C_8x4 __asm{ \
/*Stage C:*/ \
__asm paddw mm0,mm1 \
__asm paddw mm2,mm3 \
__asm paddw mm4,mm5 \
__asm paddw mm6,mm7 \
__asm paddw mm1,mm1 \
__asm paddw mm3,mm3 \
__asm paddw mm5,mm5 \
__asm paddw mm7,mm7 \
__asm psubw mm1,mm0 \
__asm psubw mm3,mm2 \
__asm psubw mm5,mm4 \
__asm psubw mm7,mm6 \
}
/*Performs an 8-point 1-D Hadamard transform.
The transform is performed in place, except that outputs 0-3 are swapped with
outputs 4-7.
Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
in place with no temporary registers).*/
#define OC_HADAMARD_8x4 __asm{ \
OC_HADAMARD_AB_8x4 \
OC_HADAMARD_C_8x4 \
}
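/*Illustrative sketch (not part of libtheora): an unnormalized 8-point Hadamard
   butterfly in plain C, for reference.
  The MMX macros above compute the same three stages on four 16-bit columns at
   once, but swap and negate some outputs so everything stays in eight
   registers; this sketch ignores those in-place tricks and assumes the inputs
   are small enough that the 16-bit sums cannot overflow.*/
#if 0
static void oc_hadamard8_sketch(ogg_int16_t _t[8]){
  ogg_int16_t s[8];
  int         i;
  /*Stage A: butterflies with stride 4.*/
  for(i=0;i<4;i++){
    s[i]=(ogg_int16_t)(_t[i]+_t[i+4]);
    s[i+4]=(ogg_int16_t)(_t[i]-_t[i+4]);
  }
  /*Stage B: butterflies with stride 2 inside each half.*/
  for(i=0;i<2;i++){
    _t[i]=(ogg_int16_t)(s[i]+s[i+2]);
    _t[i+2]=(ogg_int16_t)(s[i]-s[i+2]);
    _t[i+4]=(ogg_int16_t)(s[i+4]+s[i+6]);
    _t[i+6]=(ogg_int16_t)(s[i+4]-s[i+6]);
  }
  /*Stage C: butterflies with stride 1.*/
  for(i=0;i<8;i+=2){
    s[i]=(ogg_int16_t)(_t[i]+_t[i+1]);
    s[i+1]=(ogg_int16_t)(_t[i]-_t[i+1]);
  }
  for(i=0;i<8;i++)_t[i]=s[i];
}
#endif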
/*Performs the first part of the final stage of the Hadamard transform and
summing of absolute values.
At the end of this part, mm1 will contain the DC coefficient of the
transform.*/
#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm{ \
/*We use the fact that \
(abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
to merge the final butterfly with the abs and the first stage of \
accumulation. \
Thus we can avoid using pabsw, which is not available until SSSE3. \
Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
registers). \
Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
This implementation is only 26 (+4 for spilling registers).*/ \
__asm movq [_r7+BUF],mm7 \
__asm movq [_r6+BUF],mm6 \
/*mm7={0x7FFF}x4 \
mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
__asm pcmpeqb mm7,mm7 \
__asm movq mm6,mm0 \
__asm psrlw mm7,1 \
__asm paddw mm6,mm1 \
__asm pmaxsw mm0,mm1 \
__asm paddsw mm6,mm7 \
__asm psubw mm0,mm6 \
/*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
__asm movq mm6,mm2 \
__asm movq mm1,mm4 \
__asm pmaxsw mm2,mm3 \
__asm pmaxsw mm4,mm5 \
__asm paddw mm6,mm3 \
__asm paddw mm1,mm5 \
__asm movq mm3,[_r7+BUF] \
}
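/*Illustrative sketch (not part of libtheora): the identity used above,
   (abs(a+b)+abs(a-b))/2==max(abs(a),abs(b)), in scalar form.
  It lets the macro fold the last butterfly stage into the absolute value and
   the first accumulation step with pmaxsw, instead of emulating pabsw.*/
#if 0
static int oc_maxabs_sketch(int _a,int _b){
  int apb;
  int amb;
  apb=_a+_b;
  amb=_a-_b;
  if(apb<0)apb=-apb;
  if(amb<0)amb=-amb;
  /*apb+amb is always even, so the shift is exact.*/
  return (apb+amb)>>1;
}
#endif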
/*Performs the second part of the final stage of the Hadamard transform and
summing of absolute values.*/
#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm{ \
__asm paddsw mm6,mm7 \
__asm movq mm5,[_r6+BUF] \
__asm paddsw mm1,mm7 \
__asm psubw mm2,mm6 \
__asm psubw mm4,mm1 \
/*mm7={1}x4 (needed for the horizontal add that follows) \
mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
__asm movq mm6,mm3 \
__asm pmaxsw mm3,mm5 \
__asm paddw mm0,mm2 \
__asm paddw mm6,mm5 \
__asm paddw mm0,mm4 \
__asm paddsw mm6,mm7 \
__asm paddw mm0,mm3 \
__asm psrlw mm7,14 \
__asm psubw mm0,mm6 \
}
/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
absolute value of each component, and accumulates everything into mm0.
This is the only portion of SATD which requires MMXEXT (we could use plain
MMX, but it takes 4 instructions and an extra register to work around the
lack of a pmaxsw, which is a pretty serious penalty).*/
#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
}
/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
component, and accumulates everything into mm0.
Note that mm0 will have an extra 4 added to each column, and that after
removing this value, the remainder will be half the conventional value.*/
#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
OC_HADAMARD_AB_8x4 \
OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
}
/*Performs two 4x4 transposes (mostly) in place.
On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
contains rows {a,b,c,d}.
On output, {0x40,0x50,0x60,0x70}+_off+BUF contains {e,f,g,h}^T, and
{mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
#define OC_TRANSPOSE_4x4x2(_off) __asm{ \
/*First 4x4 transpose:*/ \
__asm movq [0x10+_off+BUF],mm5 \
/*mm0 = e3 e2 e1 e0 \
mm1 = f3 f2 f1 f0 \
mm2 = g3 g2 g1 g0 \
mm3 = h3 h2 h1 h0*/ \
__asm movq mm5,mm2 \
__asm punpcklwd mm2,mm3 \
__asm punpckhwd mm5,mm3 \
__asm movq mm3,mm0 \
__asm punpcklwd mm0,mm1 \
__asm punpckhwd mm3,mm1 \
/*mm0 = f1 e1 f0 e0 \
mm3 = f3 e3 f2 e2 \
mm2 = h1 g1 h0 g0 \
mm5 = h3 g3 h2 g2*/ \
__asm movq mm1,mm0 \
__asm punpckldq mm0,mm2 \
__asm punpckhdq mm1,mm2 \
__asm movq mm2,mm3 \
__asm punpckhdq mm3,mm5 \
__asm movq [0x40+_off+BUF],mm0 \
__asm punpckldq mm2,mm5 \
/*mm0 = h0 g0 f0 e0 \
mm1 = h1 g1 f1 e1 \
mm2 = h2 g2 f2 e2 \
mm3 = h3 g3 f3 e3*/ \
__asm movq mm5,[0x10+_off+BUF] \
/*Second 4x4 transpose:*/ \
/*mm4 = a3 a2 a1 a0 \
mm5 = b3 b2 b1 b0 \
mm6 = c3 c2 c1 c0 \
mm7 = d3 d2 d1 d0*/ \
__asm movq mm0,mm6 \
__asm punpcklwd mm6,mm7 \
__asm movq [0x50+_off+BUF],mm1 \
__asm punpckhwd mm0,mm7 \
__asm movq mm7,mm4 \
__asm punpcklwd mm4,mm5 \
__asm movq [0x60+_off+BUF],mm2 \
__asm punpckhwd mm7,mm5 \
/*mm4 = b1 a1 b0 a0 \
mm7 = b3 a3 b2 a2 \
mm6 = d1 c1 d0 c0 \
mm0 = d3 c3 d2 c2*/ \
__asm movq mm5,mm4 \
__asm punpckldq mm4,mm6 \
__asm movq [0x70+_off+BUF],mm3 \
__asm punpckhdq mm5,mm6 \
__asm movq mm6,mm7 \
__asm punpckhdq mm7,mm0 \
__asm punpckldq mm6,mm0 \
/*mm4 = d0 c0 b0 a0 \
mm5 = d1 c1 b1 a1 \
mm6 = d2 c2 b2 a2 \
mm7 = d3 c3 b3 a3*/ \
}
static unsigned oc_int_frag_satd_mmxext(int *_dc,
const unsigned char *_src,int _src_ystride,
const unsigned char *_ref,int _ref_ystride){
OC_ALIGN8(ogg_int16_t buf[64]);
ogg_int16_t *bufp;
unsigned ret;
unsigned ret2;
int dc;
bufp=buf;
__asm{
#define SRC esi
#define REF eax
#define SRC_YSTRIDE ecx
#define REF_YSTRIDE edx
#define BUF edi
#define RET edx
#define RET2 ecx
#define DC eax
#define DC_WORD ax
mov SRC,_src
mov SRC_YSTRIDE,_src_ystride
mov REF,_ref
mov REF_YSTRIDE,_ref_ystride
mov BUF,bufp
OC_LOAD_SUB_8x4(0x00)
OC_HADAMARD_8x4
OC_TRANSPOSE_4x4x2(0x00)
/*Finish swapping out this 8x4 block to make room for the next one.
mm0...mm3 have been swapped out already.*/
movq [0x00+BUF],mm4
movq [0x10+BUF],mm5
movq [0x20+BUF],mm6
movq [0x30+BUF],mm7
OC_LOAD_SUB_8x4(0x04)
OC_HADAMARD_8x4
OC_TRANSPOSE_4x4x2(0x08)
/*Here the first 4x4 block of output from the last transpose is the second
4x4 block of input for the next transform.
We have cleverly arranged that it already be in the appropriate place, so
we only have to do half the loads.*/
movq mm1,[0x10+BUF]
movq mm2,[0x20+BUF]
movq mm3,[0x30+BUF]
movq mm0,[0x00+BUF]
/*We split out the stages here so we can save the DC coefficient in the
middle.*/
OC_HADAMARD_AB_8x4
OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
movd DC,mm1
OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
for the factor of two we dropped + 3 for the vertical accumulation).
Now we finally have to promote things to dwords.
We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
latency of pmaddwd by starting the next series of loads now.*/
pmaddwd mm0,mm7
movq mm1,[0x50+BUF]
movq mm5,[0x58+BUF]
movq mm4,mm0
movq mm2,[0x60+BUF]
punpckhdq mm0,mm0
movq mm6,[0x68+BUF]
paddd mm4,mm0
movq mm3,[0x70+BUF]
movd RET2,mm4
movq mm7,[0x78+BUF]
movq mm0,[0x40+BUF]
movq mm4,[0x48+BUF]
OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
pmaddwd mm0,mm7
/*Subtract abs(dc) from 2*ret2.*/
movsx DC,DC_WORD
cdq
lea RET2,[RET+RET2*2]
movq mm4,mm0
punpckhdq mm0,mm0
xor RET,DC
paddd mm4,mm0
/*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
added to them, a factor of two removed, and the DC value included;
correct the final sum here.*/
sub RET2,RET
movd RET,mm4
lea RET,[RET2+RET*2-64]
mov ret,RET
mov dc,DC
#undef SRC
#undef REF
#undef SRC_YSTRIDE
#undef REF_YSTRIDE
#undef BUF
#undef RET
#undef RET2
#undef DC
#undef DC_WORD
}
*_dc=dc;
return ret;
}
unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
const unsigned char *_ref,int _ystride){
return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
}
/*Our internal implementation of frag_copy2 takes an extra stride parameter so
we can share code with oc_enc_frag_satd2_mmxext().*/
static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
__asm{
/*Load the first 3 rows.*/
#define DST_YSTRIDE edi
#define SRC_YSTRIDE esi
#define DST eax
#define SRC1 edx
#define SRC2 ecx
mov DST_YSTRIDE,_dst_ystride
mov SRC_YSTRIDE,_src_ystride
mov DST,_dst
mov SRC1,_src1
mov SRC2,_src2
movq mm0,[SRC1]
movq mm1,[SRC2]
movq mm2,[SRC1+SRC_YSTRIDE]
lea SRC1,[SRC1+SRC_YSTRIDE*2]
movq mm3,[SRC2+SRC_YSTRIDE]
lea SRC2,[SRC2+SRC_YSTRIDE*2]
pxor mm7,mm7
movq mm4,[SRC1]
pcmpeqb mm6,mm6
movq mm5,[SRC2]
/*mm7={1}x8.*/
psubb mm7,mm6
/*Start averaging mm0 and mm1 into mm6.*/
movq mm6,mm0
pxor mm0,mm1
pavgb mm6,mm1
/*mm1 is free, start averaging mm3 into mm2 using mm1.*/
movq mm1,mm2
pand mm0,mm7
pavgb mm2,mm3
pxor mm1,mm3
/*mm3 is free.*/
psubb mm6,mm0
/*mm0 is free, start loading the next row.*/
movq mm0,[SRC1+SRC_YSTRIDE]
/*Start averaging mm5 and mm4 using mm3.*/
movq mm3,mm4
/*mm6 [row 0] is done; write it out.*/
movq [DST],mm6
pand mm1,mm7
pavgb mm4,mm5
psubb mm2,mm1
/*mm1 is free, continue loading the next row.*/
movq mm1,[SRC2+SRC_YSTRIDE]
pxor mm3,mm5
lea SRC1,[SRC1+SRC_YSTRIDE*2]
/*mm2 [row 1] is done; write it out.*/
movq [DST+DST_YSTRIDE],mm2
pand mm3,mm7
/*Start loading the next row.*/
movq mm2,[SRC1]
lea DST,[DST+DST_YSTRIDE*2]
psubb mm4,mm3
lea SRC2,[SRC2+SRC_YSTRIDE*2]
/*mm4 [row 2] is done; write it out.*/
movq [DST],mm4
/*Continue loading the next row.*/
movq mm3,[SRC2]
/*Start averaging mm0 and mm1 into mm6.*/
movq mm6,mm0
pxor mm0,mm1
/*Start loading the next row.*/
movq mm4,[SRC1+SRC_YSTRIDE]
pavgb mm6,mm1
/*mm1 is free; start averaging mm3 into mm2 using mm1.*/
movq mm1,mm2
pand mm0,mm7
/*Continue loading the next row.*/
movq mm5,[SRC2+SRC_YSTRIDE]
pavgb mm2,mm3
lea SRC1,[SRC1+SRC_YSTRIDE*2]
pxor mm1,mm3
/*mm3 is free.*/
psubb mm6,mm0
/*mm0 is free, start loading the next row.*/
movq mm0,[SRC1]
/*Start averaging mm5 into mm4 using mm3.*/
movq mm3,mm4
/*mm6 [row 3] is done; write it out.*/
movq [DST+DST_YSTRIDE],mm6
pand mm1,mm7
lea SRC2,[SRC2+SRC_YSTRIDE*2]
pavgb mm4,mm5
lea DST,[DST+DST_YSTRIDE*2]
psubb mm2,mm1
/*mm1 is free; continue loading the next row.*/
movq mm1,[SRC2]
pxor mm3,mm5
/*mm2 [row 4] is done; write it out.*/
movq [DST],mm2
pand mm3,mm7
/*Start loading the next row.*/
movq mm2,[SRC1+SRC_YSTRIDE]
psubb mm4,mm3
/*Start averaging mm0 and mm1 into mm6.*/
movq mm6,mm0
/*Continue loading the next row.*/
movq mm3,[SRC2+SRC_YSTRIDE]
/*mm4 [row 5] is done; write it out.*/
movq [DST+DST_YSTRIDE],mm4
pxor mm0,mm1
pavgb mm6,mm1
/*mm4 is free; start averaging mm3 into mm2 using mm4.*/
movq mm4,mm2
pand mm0,mm7
pavgb mm2,mm3
pxor mm4,mm3
lea DST,[DST+DST_YSTRIDE*2]
psubb mm6,mm0
pand mm4,mm7
/*mm6 [row 6] is done, write it out.*/
movq [DST],mm6
psubb mm2,mm4
/*mm2 [row 7] is done, write it out.*/
movq [DST+DST_YSTRIDE],mm2
#undef SRC1
#undef SRC2
#undef SRC_YSTRIDE
#undef DST_YSTRIDE
#undef DST
}
}
unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
OC_ALIGN8(unsigned char ref[64]);
oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
}
unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,const unsigned char *_src,
int _ystride){
OC_ALIGN8(ogg_int16_t buf[64]);
ogg_int16_t *bufp;
unsigned ret1;
unsigned ret2;
int dc;
bufp=buf;
__asm{
#define SRC eax
#define SRC4 esi
#define BUF edi
#define YSTRIDE edx
#define YSTRIDE3 ecx
#define RET eax
#define RET2 ecx
#define DC edx
#define DC_WORD dx
mov SRC,_src
mov BUF,bufp
mov YSTRIDE,_ystride
/* src4 = src+4*ystride */
lea SRC4,[SRC+YSTRIDE*4]
/* ystride3 = 3*ystride */
lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
OC_LOAD_8x4(0x00)
OC_HADAMARD_8x4
OC_TRANSPOSE_4x4x2(0x00)
/*Finish swapping out this 8x4 block to make room for the next one.
mm0...mm3 have been swapped out already.*/
movq [0x00+BUF],mm4
movq [0x10+BUF],mm5
movq [0x20+BUF],mm6
movq [0x30+BUF],mm7
OC_LOAD_8x4(0x04)
OC_HADAMARD_8x4
OC_TRANSPOSE_4x4x2(0x08)
/*Here the first 4x4 block of output from the last transpose is the second
4x4 block of input for the next transform.
We have cleverly arranged that it already be in the appropriate place, so
we only have to do half the loads.*/
movq mm1,[0x10+BUF]
movq mm2,[0x20+BUF]
movq mm3,[0x30+BUF]
movq mm0,[0x00+BUF]
/*We split out the stages here so we can save the DC coefficient in the
middle.*/
OC_HADAMARD_AB_8x4
OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
movd DC,mm1
OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
for the factor of two we dropped + 3 for the vertical accumulation).
Now we finally have to promote things to dwords.
We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
latency of pmaddwd by starting the next series of loads now.*/
pmaddwd mm0,mm7
movq mm1,[0x50+BUF]
movq mm5,[0x58+BUF]
movq mm2,[0x60+BUF]
movq mm4,mm0
movq mm6,[0x68+BUF]
punpckhdq mm0,mm0
movq mm3,[0x70+BUF]
paddd mm4,mm0
movq mm7,[0x78+BUF]
movd RET,mm4
movq mm0,[0x40+BUF]
movq mm4,[0x48+BUF]
OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
pmaddwd mm0,mm7
/*We assume that the DC coefficient is always positive (which is true,
because the input to the INTRA transform was not a difference).*/
movzx DC,DC_WORD
add RET,RET
sub RET,DC
movq mm4,mm0
punpckhdq mm0,mm0
paddd mm4,mm0
movd RET2,mm4
lea RET,[-64+RET+RET2*2]
mov [dc],DC
mov [ret1],RET
#undef SRC
#undef SRC4
#undef BUF
#undef YSTRIDE
#undef YSTRIDE3
#undef RET
#undef RET2
#undef DC
#undef DC_WORD
}
*_dc=dc;
return ret1;
}
void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
const unsigned char *_src, const unsigned char *_ref,int _ystride){
int i;
__asm pxor mm7,mm7
for(i=4;i-->0;){
__asm{
#define SRC edx
#define YSTRIDE esi
#define RESIDUE eax
#define REF ecx
mov YSTRIDE,_ystride
mov RESIDUE,_residue
mov SRC,_src
mov REF,_ref
/*mm0=[src]*/
movq mm0,[SRC]
/*mm1=[ref]*/
movq mm1,[REF]
/*mm4=[src+ystride]*/
movq mm4,[SRC+YSTRIDE]
/*mm5=[ref+ystride]*/
movq mm5,[REF+YSTRIDE]
/*Compute [src]-[ref].*/
movq mm2,mm0
punpcklbw mm0,mm7
movq mm3,mm1
punpckhbw mm2,mm7
punpcklbw mm1,mm7
punpckhbw mm3,mm7
psubw mm0,mm1
psubw mm2,mm3
/*Compute [src+ystride]-[ref+ystride].*/
movq mm1,mm4
punpcklbw mm4,mm7
movq mm3,mm5
punpckhbw mm1,mm7
lea SRC,[SRC+YSTRIDE*2]
punpcklbw mm5,mm7
lea REF,[REF+YSTRIDE*2]
punpckhbw mm3,mm7
psubw mm4,mm5
psubw mm1,mm3
/*Write the answer out.*/
movq [RESIDUE+0x00],mm0
movq [RESIDUE+0x08],mm2
movq [RESIDUE+0x10],mm4
movq [RESIDUE+0x18],mm1
lea RESIDUE,[RESIDUE+0x20]
mov _residue,RESIDUE
mov _src,SRC
mov _ref,REF
#undef SRC
#undef YSTRIDE
#undef RESIDUE
#undef REF
}
}
}
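/*Illustrative sketch (not part of libtheora): what oc_enc_frag_sub_mmx()
   computes, in plain C.
  Each pass of the MMX loop above handles two 8-pixel rows, widening the bytes
   to 16 bits with punpcklbw/punpckhbw against the zero register mm7 before
   subtracting.*/
#if 0
static void oc_enc_frag_sub_c_sketch(ogg_int16_t _residue[64],
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_residue[i*8+j]=(ogg_int16_t)(_src[j]-_ref[j]);
    _src+=_ystride;
    _ref+=_ystride;
  }
}
#endif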
void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
const unsigned char *_src,int _ystride){
__asm{
#define YSTRIDE edx
#define YSTRIDE3 edi
#define RESIDUE ecx
#define SRC eax
mov YSTRIDE,_ystride
mov RESIDUE,_residue
mov SRC,_src
/*mm0=[src]*/
movq mm0,[SRC]
/*mm1=[src+ystride]*/
movq mm1,[SRC+YSTRIDE]
/*mm6={-1}x4*/
pcmpeqw mm6,mm6
/*mm2=[src+2*ystride]*/
movq mm2,[SRC+YSTRIDE*2]
/*[ystride3]=3*[ystride]*/
lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
/*mm6={1}x4*/
psllw mm6,15
/*mm3=[src+3*ystride]*/
movq mm3,[SRC+YSTRIDE3]
/*mm6={128}x4*/
psrlw mm6,8
/*mm7=0*/
pxor mm7,mm7
/*[src]=[src]+4*[ystride]*/
lea SRC,[SRC+YSTRIDE*4]
/*Compute [src]-128 and [src+ystride]-128*/
movq mm4,mm0
punpcklbw mm0,mm7
movq mm5,mm1
punpckhbw mm4,mm7
psubw mm0,mm6
punpcklbw mm1,mm7
psubw mm4,mm6
punpckhbw mm5,mm7
psubw mm1,mm6
psubw mm5,mm6
/*Write the answer out.*/
movq [RESIDUE+0x00],mm0
movq [RESIDUE+0x08],mm4
movq [RESIDUE+0x10],mm1
movq [RESIDUE+0x18],mm5
/*mm0=[src+4*ystride]*/
movq mm0,[SRC]
/*mm1=[src+5*ystride]*/
movq mm1,[SRC+YSTRIDE]
/*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
movq mm4,mm2
punpcklbw mm2,mm7
movq mm5,mm3
punpckhbw mm4,mm7
psubw mm2,mm6
punpcklbw mm3,mm7
psubw mm4,mm6
punpckhbw mm5,mm7
psubw mm3,mm6
psubw mm5,mm6
/*Write the answer out.*/
movq [RESIDUE+0x20],mm2
movq [RESIDUE+0x28],mm4
movq [RESIDUE+0x30],mm3
movq [RESIDUE+0x38],mm5
/*mm2=[src+6*ystride]*/
movq mm2,[SRC+YSTRIDE*2]
/*mm3=[src+7*ystride]*/
movq mm3,[SRC+YSTRIDE3]
/*Compute [src+4*ystride]-128 and [src+5*ystride]-128*/
movq mm4,mm0
punpcklbw mm0,mm7
movq mm5,mm1
punpckhbw mm4,mm7
psubw mm0,mm6
punpcklbw mm1,mm7
psubw mm4,mm6
punpckhbw mm5,mm7
psubw mm1,mm6
psubw mm5,mm6
/*Write the answer out.*/
movq [RESIDUE+0x40],mm0
movq [RESIDUE+0x48],mm4
movq [RESIDUE+0x50],mm1
movq [RESIDUE+0x58],mm5
/*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
movq mm4,mm2
punpcklbw mm2,mm7
movq mm5,mm3
punpckhbw mm4,mm7
psubw mm2,mm6
punpcklbw mm3,mm7
psubw mm4,mm6
punpckhbw mm5,mm7
psubw mm3,mm6
psubw mm5,mm6
/*Write the answer out.*/
movq [RESIDUE+0x60],mm2
movq [RESIDUE+0x68],mm4
movq [RESIDUE+0x70],mm3
movq [RESIDUE+0x78],mm5
#undef YSTRIDE
#undef YSTRIDE3
#undef RESIDUE
#undef SRC
}
}
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
const unsigned char *_src1,const unsigned char *_src2,int _ystride){
oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
}
#endif

thirdparty/libtheora/x86_vc/mmxfdct.c (vendored, new file)

@@ -0,0 +1,686 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006 *
* by the Xiph.Org Foundation https://www.xiph.org/ *
* *
********************************************************************/
/*MMX fDCT implementation for x86_32*/
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
#include "x86enc.h"
#include "x86zigzag.h"
#if defined(OC_X86_ASM)
#define OC_FDCT_STAGE1_8x4 __asm{ \
/*Stage 1:*/ \
/*mm0=t7'=t0-t7*/ \
__asm psubw mm0,mm7 \
__asm paddw mm7,mm7 \
/*mm1=t6'=t1-t6*/ \
__asm psubw mm1,mm6 \
__asm paddw mm6,mm6 \
/*mm2=t5'=t2-t5*/ \
__asm psubw mm2,mm5 \
__asm paddw mm5,mm5 \
/*mm3=t4'=t3-t4*/ \
__asm psubw mm3,mm4 \
__asm paddw mm4,mm4 \
/*mm7=t0'=t0+t7*/ \
__asm paddw mm7,mm0 \
/*mm6=t1'=t1+t6*/ \
__asm paddw mm6,mm1 \
/*mm5=t2'=t2+t5*/ \
__asm paddw mm5,mm2 \
/*mm4=t3'=t3+t4*/ \
__asm paddw mm4,mm3 \
}
#define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \
/*Stage 2:*/ \
/*mm7=t3''=t0'-t3'*/ \
__asm psubw mm7,mm4 \
__asm paddw mm4,mm4 \
/*mm6=t2''=t1'-t2'*/ \
__asm psubw mm6,mm5 \
__asm movq [Y+_r6],mm7 \
__asm paddw mm5,mm5 \
/*mm1=t5''=t6'-t5'*/ \
__asm psubw mm1,mm2 \
__asm movq [Y+_r2],mm6 \
/*mm4=t0''=t0'+t3'*/ \
__asm paddw mm4,mm7 \
__asm paddw mm2,mm2 \
/*mm5=t1''=t1'+t2'*/ \
__asm movq [Y+_r0],mm4 \
__asm paddw mm5,mm6 \
/*mm2=t6''=t6'+t5'*/ \
__asm paddw mm2,mm1 \
__asm movq [Y+_r4],mm5 \
/*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \
/*mm4, mm5, mm6, mm7 are free.*/ \
/*Stage 3:*/ \
/*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \
__asm mov A,0x5A806A0A \
__asm pcmpeqb mm6,mm6 \
__asm movd mm7,A \
__asm psrlw mm6,15 \
__asm punpckldq mm7,mm7 \
__asm paddw mm6,mm6 \
/*mm0=0, m2={-1}x4 \
mm5:mm4=t5''*27146+0xB500*/ \
__asm movq mm4,mm1 \
__asm movq mm5,mm1 \
__asm punpcklwd mm4,mm6 \
__asm movq [Y+_r3],mm2 \
__asm pmaddwd mm4,mm7 \
__asm movq [Y+_r7],mm0 \
__asm punpckhwd mm5,mm6 \
__asm pxor mm0,mm0 \
__asm pmaddwd mm5,mm7 \
__asm pcmpeqb mm2,mm2 \
/*mm2=t6'', mm1=t5''+(t5''!=0) \
mm4=(t5''*27146+0xB500>>16)*/ \
__asm pcmpeqw mm0,mm1 \
__asm psrad mm4,16 \
__asm psubw mm0,mm2 \
__asm movq mm2, [Y+_r3] \
__asm psrad mm5,16 \
__asm paddw mm1,mm0 \
__asm packssdw mm4,mm5 \
/*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
__asm paddw mm4,mm1 \
__asm movq mm0, [Y+_r7] \
__asm psraw mm4,1 \
__asm movq mm1,mm3 \
/*mm3=t4''=t4'+s*/ \
__asm paddw mm3,mm4 \
/*mm1=t5'''=t4'-s*/ \
__asm psubw mm1,mm4 \
/*mm1=0, mm3={-1}x4 \
mm5:mm4=t6''*27146+0xB500*/ \
__asm movq mm4,mm2 \
__asm movq mm5,mm2 \
__asm punpcklwd mm4,mm6 \
__asm movq [Y+_r5],mm1 \
__asm pmaddwd mm4,mm7 \
__asm movq [Y+_r1],mm3 \
__asm punpckhwd mm5,mm6 \
__asm pxor mm1,mm1 \
__asm pmaddwd mm5,mm7 \
__asm pcmpeqb mm3,mm3 \
/*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \
__asm psrad mm4,16 \
__asm pcmpeqw mm1,mm2 \
__asm psrad mm5,16 \
__asm psubw mm1,mm3 \
__asm packssdw mm4,mm5 \
__asm paddw mm2,mm1 \
/*mm1=t1'' \
mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
__asm paddw mm4,mm2 \
__asm movq mm1,[Y+_r4] \
__asm psraw mm4,1 \
__asm movq mm2,mm0 \
/*mm7={54491-0x7FFF,0x7FFF}x2 \
mm0=t7''=t7'+s*/ \
__asm paddw mm0,mm4 \
/*mm2=t6'''=t7'-s*/ \
__asm psubw mm2,mm4 \
/*Stage 4:*/ \
/*mm0=0, mm2=t0'' \
mm5:mm4=t1''*27146+0xB500*/ \
__asm movq mm4,mm1 \
__asm movq mm5,mm1 \
__asm punpcklwd mm4,mm6 \
__asm movq [Y+_r3],mm2 \
__asm pmaddwd mm4,mm7 \
__asm movq mm2,[Y+_r0] \
__asm punpckhwd mm5,mm6 \
__asm movq [Y+_r7],mm0 \
__asm pmaddwd mm5,mm7 \
__asm pxor mm0,mm0 \
/*mm7={27146,0x4000>>1}x2 \
mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
__asm psrad mm4,16 \
__asm mov A,0x20006A0A \
__asm pcmpeqw mm0,mm1 \
__asm movd mm7,A \
__asm psrad mm5,16 \
__asm psubw mm0,mm3 \
__asm packssdw mm4,mm5 \
__asm paddw mm0,mm1 \
__asm punpckldq mm7,mm7 \
__asm paddw mm0,mm4 \
/*mm6={0x00000E3D}x2 \
mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \
__asm movq mm4,mm2 \
__asm movq mm5,mm2 \
__asm punpcklwd mm4,mm6 \
__asm mov A,0x0E3D \
__asm pmaddwd mm4,mm7 \
__asm punpckhwd mm5,mm6 \
__asm movd mm6,A \
__asm pmaddwd mm5,mm7 \
__asm pxor mm1,mm1 \
__asm punpckldq mm6,mm6 \
__asm pcmpeqw mm1,mm2 \
/*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
__asm psrad mm4,16 \
__asm psubw mm1,mm3 \
__asm psrad mm5,16 \
__asm paddw mm2,mm1 \
__asm packssdw mm4,mm5 \
__asm movq mm1,[Y+_r5] \
__asm paddw mm4,mm2 \
/*mm2=t6'', mm0=_y[0]=u=r+s>>1 \
The naive implementation could cause overflow, so we use \
u=(r&s)+((r^s)>>1).*/ \
__asm movq mm2,[Y+_r3] \
__asm movq mm7,mm0 \
__asm pxor mm0,mm4 \
__asm pand mm7,mm4 \
__asm psraw mm0,1 \
__asm mov A,0x7FFF54DC \
__asm paddw mm0,mm7 \
__asm movd mm7,A \
/*mm7={54491-0x7FFF,0x7FFF}x2 \
mm4=_y[4]=v=r-u*/ \
__asm psubw mm4,mm0 \
__asm punpckldq mm7,mm7 \
__asm movq [Y+_r4],mm4 \
/*mm0=0, mm7={36410}x4 \
mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \
__asm movq mm4,mm1 \
__asm movq mm5,mm1 \
__asm punpcklwd mm4,mm1 \
__asm mov A,0x8E3A8E3A \
__asm pmaddwd mm4,mm7 \
__asm movq [Y+_r0],mm0 \
__asm punpckhwd mm5,mm1 \
__asm pxor mm0,mm0 \
__asm pmaddwd mm5,mm7 \
__asm pcmpeqw mm1,mm0 \
__asm movd mm7,A \
__asm psubw mm1,mm3 \
__asm punpckldq mm7,mm7 \
__asm paddd mm4,mm6 \
__asm paddd mm5,mm6 \
/*mm0=0 \
mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \
__asm movq mm6,mm2 \
__asm movq mm3,mm2 \
__asm pmulhw mm6,mm7 \
__asm paddw mm1,mm2 \
__asm pmullw mm3,mm7 \
__asm pxor mm0,mm0 \
__asm paddw mm6,mm1 \
__asm movq mm1,mm3 \
__asm punpckhwd mm3,mm6 \
__asm punpcklwd mm1,mm6 \
/*mm3={-1}x4, mm6={1}x4 \
mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
__asm paddd mm5,mm3 \
__asm paddd mm4,mm1 \
__asm psrad mm5,16 \
__asm pxor mm6,mm6 \
__asm psrad mm4,16 \
__asm pcmpeqb mm3,mm3 \
__asm packssdw mm4,mm5 \
__asm psubw mm6,mm3 \
/*mm1=t7'', mm7={26568,0x3400}x2 \
mm2=s=t6'''-(36410*u>>16)*/ \
__asm movq mm1,mm4 \
__asm mov A,0x340067C8 \
__asm pmulhw mm4,mm7 \
__asm movd mm7,A \
__asm movq [Y+_r5],mm1 \
__asm punpckldq mm7,mm7 \
__asm paddw mm4,mm1 \
__asm movq mm1,[Y+_r7] \
__asm psubw mm2,mm4 \
/*mm6={0x00007B1B}x2 \
mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \
__asm movq mm4,mm2 \
__asm movq mm5,mm2 \
__asm punpcklwd mm4,mm6 \
__asm pcmpeqw mm0,mm2 \
__asm pmaddwd mm4,mm7 \
__asm mov A,0x7B1B \
__asm punpckhwd mm5,mm6 \
__asm movd mm6,A \
__asm pmaddwd mm5,mm7 \
__asm psubw mm0,mm3 \
__asm punpckldq mm6,mm6 \
/*mm7={64277-0x7FFF,0x7FFF}x2 \
mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
__asm psrad mm4,17 \
__asm paddw mm2,mm0 \
__asm psrad mm5,17 \
__asm mov A,0x7FFF7B16 \
__asm packssdw mm4,mm5 \
__asm movd mm7,A \
__asm paddw mm2,mm4 \
__asm punpckldq mm7,mm7 \
/*mm0=0, mm7={12785}x4 \
mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \
__asm movq mm4,mm1 \
__asm movq mm5,mm1 \
__asm movq [Y+_r3],mm2 \
__asm punpcklwd mm4,mm1 \
__asm movq mm2,[Y+_r1] \
__asm pmaddwd mm4,mm7 \
__asm mov A,0x31F131F1 \
__asm punpckhwd mm5,mm1 \
__asm pxor mm0,mm0 \
__asm pmaddwd mm5,mm7 \
__asm pcmpeqw mm1,mm0 \
__asm movd mm7,A \
__asm psubw mm1,mm3 \
__asm punpckldq mm7,mm7 \
__asm paddd mm4,mm6 \
__asm paddd mm5,mm6 \
/*mm3:mm1=12785*t4'''+((t7''!=0)<<16)*/ \
__asm movq mm6,mm2 \
__asm movq mm3,mm2 \
__asm pmulhw mm6,mm7 \
__asm pmullw mm3,mm7 \
__asm paddw mm6,mm1 \
__asm movq mm1,mm3 \
__asm punpckhwd mm3,mm6 \
__asm punpcklwd mm1,mm6 \
/*mm3={-1}x4, mm6={1}x4 \
mm4=_y[1]=u=(12785*t4'''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
__asm paddd mm5,mm3 \
__asm paddd mm4,mm1 \
__asm psrad mm5,16 \
__asm pxor mm6,mm6 \
__asm psrad mm4,16 \
__asm pcmpeqb mm3,mm3 \
__asm packssdw mm4,mm5 \
__asm psubw mm6,mm3 \
/*mm1=t3'', mm7={20539,0x3000}x2 \
mm4=s=(12785*u>>16)-t4''*/ \
__asm movq [Y+_r1],mm4 \
__asm pmulhw mm4,mm7 \
__asm mov A,0x3000503B \
__asm movq mm1,[Y+_r6] \
__asm movd mm7,A \
__asm psubw mm4,mm2 \
__asm punpckldq mm7,mm7 \
/*mm6={0x00006CB7}x2 \
mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \
__asm movq mm5,mm4 \
__asm movq mm2,mm4 \
__asm punpcklwd mm4,mm6 \
__asm pcmpeqw mm0,mm2 \
__asm pmaddwd mm4,mm7 \
__asm mov A,0x6CB7 \
__asm punpckhwd mm5,mm6 \
__asm movd mm6,A \
__asm pmaddwd mm5,mm7 \
__asm psubw mm0,mm3 \
__asm punpckldq mm6,mm6 \
/*mm7={60547-0x7FFF,0x7FFF}x2 \
mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
__asm psrad mm4,20 \
__asm paddw mm2,mm0 \
__asm psrad mm5,20 \
__asm mov A,0x7FFF6C84 \
__asm packssdw mm4,mm5 \
__asm movd mm7,A \
__asm paddw mm2,mm4 \
__asm punpckldq mm7,mm7 \
/*mm0=0, mm7={25080}x4 \
mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \
__asm movq mm4,mm1 \
__asm movq mm5,mm1 \
__asm movq [Y+_r7],mm2 \
__asm punpcklwd mm4,mm1 \
__asm movq mm2,[Y+_r2] \
__asm pmaddwd mm4,mm7 \
__asm mov A,0x61F861F8 \
__asm punpckhwd mm5,mm1 \
__asm pxor mm0,mm0 \
__asm pmaddwd mm5,mm7 \
__asm movd mm7,A \
__asm pcmpeqw mm1,mm0 \
__asm psubw mm1,mm3 \
__asm punpckldq mm7,mm7 \
__asm paddd mm4,mm6 \
__asm paddd mm5,mm6 \
/*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \
__asm movq mm6,mm2 \
__asm movq mm3,mm2 \
__asm pmulhw mm6,mm7 \
__asm pmullw mm3,mm7 \
__asm paddw mm6,mm1 \
__asm movq mm1,mm3 \
__asm punpckhwd mm3,mm6 \
__asm punpcklwd mm1,mm6 \
/*mm1={-1}x4 \
mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
__asm paddd mm5,mm3 \
__asm paddd mm4,mm1 \
__asm psrad mm5,16 \
__asm mov A,0x28005460 \
__asm psrad mm4,16 \
__asm pcmpeqb mm1,mm1 \
__asm packssdw mm4,mm5 \
/*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \
mm4=s=(25080*u>>16)-t2''*/ \
__asm movq mm6,mm4 \
__asm pmulhw mm4,mm7 \
__asm pxor mm5,mm5 \
__asm movd mm7,A \
__asm psubw mm5,mm1 \
__asm punpckldq mm7,mm7 \
__asm psubw mm4,mm2 \
/*mm2=s+(s!=0) \
mm4:mm3=s*21600+0x2800*/ \
__asm movq mm3,mm4 \
__asm movq mm2,mm4 \
__asm punpckhwd mm4,mm5 \
__asm pcmpeqw mm0,mm2 \
__asm pmaddwd mm4,mm7 \
__asm psubw mm0,mm1 \
__asm punpcklwd mm3,mm5 \
__asm paddw mm2,mm0 \
__asm pmaddwd mm3,mm7 \
/*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \
mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
__asm movq mm0,[Y+_r4] \
__asm psrad mm4,18 \
__asm movq mm5,[Y+_r5] \
__asm psrad mm3,18 \
__asm movq mm1,[Y+_r7] \
__asm packssdw mm3,mm4 \
__asm movq mm4,[Y+_r0] \
__asm paddw mm3,mm2 \
}
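/*Illustrative sketch (not part of libtheora): the overflow-safe average used
   inside OC_FDCT8x4 when forming _y[0], in scalar form.
  r+s could exceed the 16-bit range, so instead of (r+s)>>1 the macro computes
   (r&s)+((r^s)>>1), which is identical for two's-complement values but never
   needs the intermediate sum.*/
#if 0
static ogg_int16_t oc_avg_no_overflow_sketch(ogg_int16_t _r,ogg_int16_t _s){
  /*Matches the movq/pxor/pand/psraw/paddw sequence in the macro above.*/
  return (ogg_int16_t)((_r&_s)+((_r^_s)>>1));
}
#endif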
/*On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7].
On output, {_y[4],mm1,mm2,mm3} contains the transpose of _y[4...7] and
{mm4,mm5,mm6,mm7} contains the transpose of _y[0...3].*/
#define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \
/*First 4x4 transpose:*/ \
/*mm0 = e3 e2 e1 e0 \
mm5 = f3 f2 f1 f0 \
mm3 = g3 g2 g1 g0 \
mm1 = h3 h2 h1 h0*/ \
__asm movq mm2,mm0 \
__asm punpcklwd mm0,mm5 \
__asm punpckhwd mm2,mm5 \
__asm movq mm5,mm3 \
__asm punpcklwd mm3,mm1 \
__asm punpckhwd mm5,mm1 \
/*mm0 = f1 e1 f0 e0 \
mm2 = f3 e3 f2 e2 \
mm3 = h1 g1 h0 g0 \
mm5 = h3 g3 h2 g2*/ \
__asm movq mm1,mm0 \
__asm punpckldq mm0,mm3 \
__asm movq [Y+_r4],mm0 \
__asm punpckhdq mm1,mm3 \
__asm movq mm0,[Y+_r1] \
__asm movq mm3,mm2 \
__asm punpckldq mm2,mm5 \
__asm punpckhdq mm3,mm5 \
__asm movq mm5,[Y+_r3] \
/*_y[4] = h0 g0 f0 e0 \
mm1 = h1 g1 f1 e1 \
mm2 = h2 g2 f2 e2 \
mm3 = h3 g3 f3 e3*/ \
/*Second 4x4 transpose:*/ \
/*mm4 = a3 a2 a1 a0 \
mm0 = b3 b2 b1 b0 \
mm6 = c3 c2 c1 c0 \
mm5 = d3 d2 d1 d0*/ \
__asm movq mm7,mm4 \
__asm punpcklwd mm4,mm0 \
__asm punpckhwd mm7,mm0 \
__asm movq mm0,mm6 \
__asm punpcklwd mm6,mm5 \
__asm punpckhwd mm0,mm5 \
/*mm4 = b1 a1 b0 a0 \
mm7 = b3 a3 b2 a2 \
mm6 = d1 c1 d0 c0 \
mm0 = d3 c3 d2 c2*/ \
__asm movq mm5,mm4 \
__asm punpckldq mm4,mm6 \
__asm punpckhdq mm5,mm6 \
__asm movq mm6,mm7 \
__asm punpckhdq mm7,mm0 \
__asm punpckldq mm6,mm0 \
/*mm4 = d0 c0 b0 a0 \
mm5 = d1 c1 b1 a1 \
mm6 = d2 c2 b2 a2 \
mm7 = d3 c3 b3 a3*/ \
}
/*MMX implementation of the fDCT.*/
void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
OC_ALIGN8(ogg_int16_t buf[64]);
ogg_int16_t *bufp;
bufp=buf;
__asm{
#define X edx
#define Y eax
#define A ecx
#define BUF esi
/*Add two extra bits of working precision to improve accuracy; any more and
we could overflow.*/
/*We also add biases to correct for some systematic error that remains in
the full fDCT->iDCT round trip.*/
mov X, _x
mov Y, _y
mov BUF, bufp
movq mm0,[0x00+X]
movq mm1,[0x10+X]
movq mm2,[0x20+X]
movq mm3,[0x30+X]
pcmpeqb mm4,mm4
pxor mm7,mm7
movq mm5,mm0
psllw mm0,2
pcmpeqw mm5,mm7
movq mm7,[0x70+X]
psllw mm1,2
psubw mm5,mm4
psllw mm2,2
mov A,1
pslld mm5,16
movd mm6,A
psllq mm5,16
mov A,0x10001
psllw mm3,2
movd mm4,A
punpckhwd mm5,mm6
psubw mm1,mm6
movq mm6,[0x60+X]
paddw mm0,mm5
movq mm5,[0x50+X]
paddw mm0,mm4
movq mm4,[0x40+X]
/*We inline stage1 of the transform here so we can get better instruction
scheduling with the shifts.*/
/*mm0=t7'=t0-t7*/
psllw mm7,2
psubw mm0,mm7
psllw mm6,2
paddw mm7,mm7
/*mm1=t6'=t1-t6*/
psllw mm5,2
psubw mm1,mm6
psllw mm4,2
paddw mm6,mm6
/*mm2=t5'=t2-t5*/
psubw mm2,mm5
paddw mm5,mm5
/*mm3=t4'=t3-t4*/
psubw mm3,mm4
paddw mm4,mm4
/*mm7=t0'=t0+t7*/
paddw mm7,mm0
/*mm6=t1'=t1+t6*/
paddw mm6,mm1
/*mm5=t2'=t2+t5*/
paddw mm5,mm2
/*mm4=t3'=t3+t4*/
paddw mm4,mm3
OC_FDCT8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70)
OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70)
/*Swap out this 8x4 block for the next one.*/
movq mm0,[0x08+X]
movq [0x30+Y],mm7
movq mm7,[0x78+X]
movq [0x50+Y],mm1
movq mm1,[0x18+X]
movq [0x20+Y],mm6
movq mm6,[0x68+X]
movq [0x60+Y],mm2
movq mm2,[0x28+X]
movq [0x10+Y],mm5
movq mm5,[0x58+X]
movq [0x70+Y],mm3
movq mm3,[0x38+X]
/*And increase its working precision, too.*/
psllw mm0,2
movq [0x00+Y],mm4
psllw mm7,2
movq mm4,[0x48+X]
/*We inline stage1 of the transform here so we can get better instruction
scheduling with the shifts.*/
/*mm0=t7'=t0-t7*/
psubw mm0,mm7
psllw mm1,2
paddw mm7,mm7
psllw mm6,2
/*mm1=t6'=t1-t6*/
psubw mm1,mm6
psllw mm2,2
paddw mm6,mm6
psllw mm5,2
/*mm2=t5'=t2-t5*/
psubw mm2,mm5
psllw mm3,2
paddw mm5,mm5
psllw mm4,2
/*mm3=t4'=t3-t4*/
psubw mm3,mm4
paddw mm4,mm4
/*mm7=t0'=t0+t7*/
paddw mm7,mm0
/*mm6=t1'=t1+t6*/
paddw mm6,mm1
/*mm5=t2'=t2+t5*/
paddw mm5,mm2
/*mm4=t3'=t3+t4*/
paddw mm4,mm3
OC_FDCT8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78)
OC_TRANSPOSE8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78)
/*Here the first 4x4 block of output from the last transpose is the second
4x4 block of input for the next transform.
We have cleverly arranged that it already be in the appropriate place,
so we only have to do half the stores and loads.*/
movq mm0,[0x00+Y]
movq [0x58+Y],mm1
movq mm1,[0x10+Y]
movq [0x68+Y],mm2
movq mm2,[0x20+Y]
movq [0x78+Y],mm3
movq mm3,[0x30+Y]
OC_FDCT_STAGE1_8x4
OC_FDCT8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
/*mm0={-2}x4*/
pcmpeqw mm2,mm2
paddw mm2,mm2
/*Round and store the results (no transpose).*/
movq mm7,[Y+0x10]
psubw mm4,mm2
psubw mm6,mm2
psraw mm4,2
psubw mm0,mm2
movq [BUF+0x00],mm4
movq mm4,[Y+0x30]
psraw mm6,2
psubw mm5,mm2
movq [BUF+0x20],mm6
psraw mm0,2
psubw mm3,mm2
movq [BUF+0x40],mm0
psraw mm5,2
psubw mm1,mm2
movq [BUF+0x50],mm5
psraw mm3,2
psubw mm7,mm2
movq [BUF+0x60],mm3
psraw mm1,2
psubw mm4,mm2
movq [BUF+0x70],mm1
psraw mm7,2
movq [BUF+0x10],mm7
psraw mm4,2
movq [BUF+0x30],mm4
/*Load the next block.*/
movq mm0,[0x40+Y]
movq mm7,[0x78+Y]
movq mm1,[0x50+Y]
movq mm6,[0x68+Y]
movq mm2,[0x60+Y]
movq mm5,[0x58+Y]
movq mm3,[0x70+Y]
movq mm4,[0x48+Y]
OC_FDCT_STAGE1_8x4
OC_FDCT8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
/*mm0={-2}x4*/
pcmpeqw mm2,mm2
paddw mm2,mm2
/*Round and store the results (no transpose).*/
movq mm7,[Y+0x50]
psubw mm4,mm2
psubw mm6,mm2
psraw mm4,2
psubw mm0,mm2
movq [BUF+0x08],mm4
movq mm4,[Y+0x70]
psraw mm6,2
psubw mm5,mm2
movq [BUF+0x28],mm6
psraw mm0,2
psubw mm3,mm2
movq [BUF+0x48],mm0
psraw mm5,2
psubw mm1,mm2
movq [BUF+0x58],mm5
psraw mm3,2
psubw mm7,mm2
movq [BUF+0x68],mm3
psraw mm1,2
psubw mm4,mm2
movq [BUF+0x78],mm1
psraw mm7,2
movq [BUF+0x18],mm7
psraw mm4,2
movq [BUF+0x38],mm4
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
__asm movq _reg,[BUF+16*(_row)] \
#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
__asm movq _reg,[BUF+16*(_row)+8] \
OC_TRANSPOSE_ZIG_ZAG_MMXEXT
#undef OC_ZZ_LOAD_ROW_LO
#undef OC_ZZ_LOAD_ROW_HI
#undef X
#undef Y
#undef A
#undef BUF
}
}
#endif

thirdparty/libtheora/x86_vc/mmxfrag.c (vendored, new file)

@@ -0,0 +1,416 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
/*MMX acceleration of fragment reconstruction for motion compensation.
Originally written by Rudolf Marek.
Additional optimization by Nils Pipenbrinck.
Note: Loops are unrolled for best performance.
The iteration each instruction belongs to is marked in the comments as #i.*/
#include <stddef.h>
#include "x86int.h"
#if defined(OC_X86_ASM)
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
between rows.*/
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
do{ \
const unsigned char *src; \
unsigned char *dst; \
src=(_src); \
dst=(_dst); \
__asm mov SRC,src \
__asm mov DST,dst \
__asm mov YSTRIDE,_ystride \
/*src+0*ystride*/ \
__asm movq mm0,[SRC] \
/*src+1*ystride*/ \
__asm movq mm1,[SRC+YSTRIDE] \
/*ystride3=ystride*3*/ \
__asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
/*src+2*ystride*/ \
__asm movq mm2,[SRC+YSTRIDE*2] \
/*src+3*ystride*/ \
__asm movq mm3,[SRC+YSTRIDE3] \
/*dst+0*ystride*/ \
__asm movq [DST],mm0 \
/*dst+1*ystride*/ \
__asm movq [DST+YSTRIDE],mm1 \
/*Pointer to next 4.*/ \
__asm lea SRC,[SRC+YSTRIDE*4] \
/*dst+2*ystride*/ \
__asm movq [DST+YSTRIDE*2],mm2 \
/*dst+3*ystride*/ \
__asm movq [DST+YSTRIDE3],mm3 \
/*Pointer to next 4.*/ \
__asm lea DST,[DST+YSTRIDE*4] \
/*src+0*ystride*/ \
__asm movq mm0,[SRC] \
/*src+1*ystride*/ \
__asm movq mm1,[SRC+YSTRIDE] \
/*src+2*ystride*/ \
__asm movq mm2,[SRC+YSTRIDE*2] \
/*src+3*ystride*/ \
__asm movq mm3,[SRC+YSTRIDE3] \
/*dst+0*ystride*/ \
__asm movq [DST],mm0 \
/*dst+1*ystride*/ \
__asm movq [DST+YSTRIDE],mm1 \
/*dst+2*ystride*/ \
__asm movq [DST+YSTRIDE*2],mm2 \
/*dst+3*ystride*/ \
__asm movq [DST+YSTRIDE3],mm3 \
} \
while(0)
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
between rows.*/
void oc_frag_copy_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride){
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 esi
OC_FRAG_COPY_MMX(_dst,_src,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
}
/*Copies the fragments specified by the lists of fragment indices from one
frame to another.
_dst_frame: The reference frame to copy to.
_src_frame: The reference frame to copy from.
_ystride: The row stride of the reference frames.
_fragis: A pointer to a list of fragment indices.
_nfragis: The number of fragment indices to copy.
_frag_buf_offs: The offsets of fragments in the reference frames.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
const unsigned char *_src_frame,int _ystride,
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
ptrdiff_t fragii;
for(fragii=0;fragii<_nfragis;fragii++){
ptrdiff_t frag_buf_off;
frag_buf_off=_frag_buf_offs[_fragis[fragii]];
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 edi
OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
_src_frame+frag_buf_off,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
}
}
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
const ogg_int16_t *_residue){
__asm{
#define DST edx
#define DST4 esi
#define YSTRIDE eax
#define YSTRIDE3 edi
#define RESIDUE ecx
mov DST,_dst
mov YSTRIDE,_ystride
mov RESIDUE,_residue
lea DST4,[DST+YSTRIDE*4]
lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
/*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
pcmpeqw mm0,mm0
/*#0 Load low residue.*/
movq mm1,[0*8+RESIDUE]
/*#0 Load high residue.*/
movq mm2,[1*8+RESIDUE]
/*Set mm0 to 0x8000800080008000.*/
psllw mm0,15
/*#1 Load low residue.*/
movq mm3,[2*8+RESIDUE]
/*#1 Load high residue.*/
movq mm4,[3*8+RESIDUE]
/*Set mm0 to 0x0080008000800080.*/
psrlw mm0,8
/*#2 Load low residue.*/
movq mm5,[4*8+RESIDUE]
/*#2 Load high residue.*/
movq mm6,[5*8+RESIDUE]
/*#0 Bias low residue.*/
paddsw mm1,mm0
/*#0 Bias high residue.*/
paddsw mm2,mm0
/*#0 Pack to byte.*/
packuswb mm1,mm2
/*#1 Bias low residue.*/
paddsw mm3,mm0
/*#1 Bias high residue.*/
paddsw mm4,mm0
/*#1 Pack to byte.*/
packuswb mm3,mm4
/*#2 Bias low residue.*/
paddsw mm5,mm0
/*#2 Bias high residue.*/
paddsw mm6,mm0
/*#2 Pack to byte.*/
packuswb mm5,mm6
/*#0 Write row.*/
movq [DST],mm1
/*#1 Write row.*/
movq [DST+YSTRIDE],mm3
/*#2 Write row.*/
movq [DST+YSTRIDE*2],mm5
/*#3 Load low residue.*/
movq mm1,[6*8+RESIDUE]
/*#3 Load high residue.*/
movq mm2,[7*8+RESIDUE]
/*#4 Load low residue.*/
movq mm3,[8*8+RESIDUE]
/*#4 Load high residue.*/
movq mm4,[9*8+RESIDUE]
/*#5 Load low residue.*/
movq mm5,[10*8+RESIDUE]
/*#5 Load high residue.*/
movq mm6,[11*8+RESIDUE]
/*#3 Bias low residue.*/
paddsw mm1,mm0
/*#3 Bias high residue.*/
paddsw mm2,mm0
/*#3 Pack to byte.*/
packuswb mm1,mm2
/*#4 Bias low residue.*/
paddsw mm3,mm0
/*#4 Bias high residue.*/
paddsw mm4,mm0
/*#4 Pack to byte.*/
packuswb mm3,mm4
/*#5 Bias low residue.*/
paddsw mm5,mm0
/*#5 Bias high residue.*/
paddsw mm6,mm0
/*#5 Pack to byte.*/
packuswb mm5,mm6
/*#3 Write row.*/
movq [DST+YSTRIDE3],mm1
/*#4 Write row.*/
movq [DST4],mm3
/*#5 Write row.*/
movq [DST4+YSTRIDE],mm5
/*#6 Load low residue.*/
movq mm1,[12*8+RESIDUE]
/*#6 Load high residue.*/
movq mm2,[13*8+RESIDUE]
/*#7 Load low residue.*/
movq mm3,[14*8+RESIDUE]
/*#7 Load high residue.*/
movq mm4,[15*8+RESIDUE]
/*#6 Bias low residue.*/
paddsw mm1,mm0
/*#6 Bias high residue.*/
paddsw mm2,mm0
/*#6 Pack to byte.*/
packuswb mm1,mm2
/*#7 Bias low residue.*/
paddsw mm3,mm0
/*#7 Bias high residue.*/
paddsw mm4,mm0
/*#7 Pack to byte.*/
packuswb mm3,mm4
/*#6 Write row.*/
movq [DST4+YSTRIDE*2],mm1
/*#7 Write row.*/
movq [DST4+YSTRIDE3],mm3
#undef DST
#undef DST4
#undef YSTRIDE
#undef YSTRIDE3
#undef RESIDUE
}
}
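/*Illustrative sketch (not part of libtheora): what oc_frag_recon_intra_mmx()
   computes per pixel, in plain C.
  The MMX version adds the 128 bias with a saturating 16-bit add (paddsw) and
   then lets packuswb clamp the result to [0,255].*/
#if 0
static void oc_frag_recon_intra_c_sketch(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      p=_residue[i*8+j]+128;
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
  }
}
#endif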
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
int _ystride,const ogg_int16_t *_residue){
int i;
/*Zero mm0.*/
__asm pxor mm0,mm0;
for(i=4;i-->0;){
__asm{
#define DST edx
#define SRC ecx
#define YSTRIDE edi
#define RESIDUE eax
mov DST,_dst
mov SRC,_src
mov YSTRIDE,_ystride
mov RESIDUE,_residue
/*#0 Load source.*/
movq mm3,[SRC]
/*#1 Load source.*/
movq mm7,[SRC+YSTRIDE]
/*#0 Get copy of src.*/
movq mm4,mm3
/*#0 Expand high source.*/
punpckhbw mm4,mm0
/*#0 Expand low source.*/
punpcklbw mm3,mm0
/*#0 Add residue high.*/
paddsw mm4,[8+RESIDUE]
/*#1 Get copy of src.*/
movq mm2,mm7
/*#0 Add residue low.*/
paddsw mm3,[RESIDUE]
/*#1 Expand high source.*/
punpckhbw mm2,mm0
/*#0 Pack final row pixels.*/
packuswb mm3,mm4
/*#1 Expand low source.*/
punpcklbw mm7,mm0
/*#1 Add residue low.*/
paddsw mm7,[16+RESIDUE]
/*#1 Add residue high.*/
paddsw mm2,[24+RESIDUE]
/*Advance residue.*/
lea RESIDUE,[32+RESIDUE]
/*#1 Pack final row pixels.*/
packuswb mm7,mm2
/*Advance src.*/
lea SRC,[SRC+YSTRIDE*2]
/*#0 Write row.*/
movq [DST],mm3
/*#1 Write row.*/
movq [DST+YSTRIDE],mm7
/*Advance dst.*/
lea DST,[DST+YSTRIDE*2]
mov _residue,RESIDUE
mov _dst,DST
mov _src,SRC
#undef DST
#undef SRC
#undef YSTRIDE
#undef RESIDUE
}
}
}
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
int i;
/*Zero mm7.*/
__asm pxor mm7,mm7;
for(i=4;i-->0;){
__asm{
#define SRC1 ecx
#define SRC2 edi
#define YSTRIDE esi
#define RESIDUE edx
#define DST eax
mov YSTRIDE,_ystride
mov DST,_dst
mov RESIDUE,_residue
mov SRC1,_src1
mov SRC2,_src2
/*#0 Load src1.*/
movq mm0,[SRC1]
/*#0 Load src2.*/
movq mm2,[SRC2]
/*#0 Copy src1.*/
movq mm1,mm0
/*#0 Copy src2.*/
movq mm3,mm2
/*#1 Load src1.*/
movq mm4,[SRC1+YSTRIDE]
/*#0 Unpack lower src1.*/
punpcklbw mm0,mm7
/*#1 Load src2.*/
movq mm5,[SRC2+YSTRIDE]
/*#0 Unpack higher src1.*/
punpckhbw mm1,mm7
/*#0 Unpack lower src2.*/
punpcklbw mm2,mm7
/*#0 Unpack higher src2.*/
punpckhbw mm3,mm7
/*Advance src1 ptr.*/
lea SRC1,[SRC1+YSTRIDE*2]
/*Advance src2 ptr.*/
lea SRC2,[SRC2+YSTRIDE*2]
/*#0 Lower src1+src2.*/
paddsw mm0,mm2
/*#0 Higher src1+src2.*/
paddsw mm1,mm3
/*#1 Copy src1.*/
movq mm2,mm4
/*#0 Build lo average.*/
psraw mm0,1
/*#1 Copy src2.*/
movq mm3,mm5
/*#1 Unpack lower src1.*/
punpcklbw mm4,mm7
/*#0 Build hi average.*/
psraw mm1,1
/*#1 Unpack higher src1.*/
punpckhbw mm2,mm7
/*#0 low+=residue.*/
paddsw mm0,[RESIDUE]
/*#1 Unpack lower src2.*/
punpcklbw mm5,mm7
/*#0 high+=residue.*/
paddsw mm1,[8+RESIDUE]
/*#1 Unpack higher src2.*/
punpckhbw mm3,mm7
/*#1 Lower src1+src2.*/
paddsw mm5,mm4
/*#0 Pack and saturate.*/
packuswb mm0,mm1
/*#1 Higher src1+src2.*/
paddsw mm3,mm2
/*#0 Write row.*/
movq [DST],mm0
/*#1 Build lo average.*/
psraw mm5,1
/*#1 Build hi average.*/
psraw mm3,1
/*#1 low+=residue.*/
paddsw mm5,[16+RESIDUE]
/*#1 high+=residue.*/
paddsw mm3,[24+RESIDUE]
/*#1 Pack and saturate.*/
packuswb mm5,mm3
/*#1 Write row ptr.*/
movq [DST+YSTRIDE],mm5
/*Advance residue ptr.*/
add RESIDUE,32
/*Advance dest ptr.*/
lea DST,[DST+YSTRIDE*2]
mov _dst,DST
mov _residue,RESIDUE
mov _src1,SRC1
mov _src2,SRC2
#undef SRC1
#undef SRC2
#undef YSTRIDE
#undef RESIDUE
#undef DST
}
}
}
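/*Illustrative sketch (not part of libtheora): what oc_frag_recon_inter2_mmx()
   computes per pixel, in plain C.
  The MMX code widens both sources to 16 bits, adds them and shifts right by
   one, so the predictor is the truncating average of the two references; the
   residue is then added and packuswb clamps to [0,255].*/
#if 0
static void oc_frag_recon_inter2_c_sketch(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      p=((_src1[j]+_src2[j])>>1)+_residue[i*8+j];
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _src1+=_ystride;
    _src2+=_ystride;
    _dst+=_ystride;
  }
}
#endif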
void oc_restore_fpu_mmx(void){
__asm emms;
}
#endif

thirdparty/libtheora/x86_vc/mmxidct.c (vendored, new file)

@@ -0,0 +1,592 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
/*MMX acceleration of Theora's iDCT.
Originally written by Rudolf Marek, based on code from On2's VP3.*/
#include "x86int.h"
#include "../dct.h"
#if defined(OC_X86_ASM)
/*These are offsets into the table of constants below.*/
/*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
#define OC_COSINE_OFFSET (8)
/*A row of 8's.*/
#define OC_EIGHT_OFFSET (0)
/*A table of constants used by the MMX routines.*/
static const OC_ALIGN16(ogg_uint16_t) OC_IDCT_CONSTS[(1+7)*4]={
8, 8, 8, 8,
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
(ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
(ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
(ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
(ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
(ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
(ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
(ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
(ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
(ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
(ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1
};
/*38 cycles*/
#define OC_IDCT_BEGIN(_y,_x) __asm{ \
__asm movq mm2,OC_I(3,_x) \
__asm movq mm6,OC_C(3) \
__asm movq mm4,mm2 \
__asm movq mm7,OC_J(5,_x) \
__asm pmulhw mm4,mm6 \
__asm movq mm1,OC_C(5) \
__asm pmulhw mm6,mm7 \
__asm movq mm5,mm1 \
__asm pmulhw mm1,mm2 \
__asm movq mm3,OC_I(1,_x) \
__asm pmulhw mm5,mm7 \
__asm movq mm0,OC_C(1) \
__asm paddw mm4,mm2 \
__asm paddw mm6,mm7 \
__asm paddw mm2,mm1 \
__asm movq mm1,OC_J(7,_x) \
__asm paddw mm7,mm5 \
__asm movq mm5,mm0 \
__asm pmulhw mm0,mm3 \
__asm paddw mm4,mm7 \
__asm pmulhw mm5,mm1 \
__asm movq mm7,OC_C(7) \
__asm psubw mm6,mm2 \
__asm paddw mm0,mm3 \
__asm pmulhw mm3,mm7 \
__asm movq mm2,OC_I(2,_x) \
__asm pmulhw mm7,mm1 \
__asm paddw mm5,mm1 \
__asm movq mm1,mm2 \
__asm pmulhw mm2,OC_C(2) \
__asm psubw mm3,mm5 \
__asm movq mm5,OC_J(6,_x) \
__asm paddw mm0,mm7 \
__asm movq mm7,mm5 \
__asm psubw mm0,mm4 \
__asm pmulhw mm5,OC_C(2) \
__asm paddw mm2,mm1 \
__asm pmulhw mm1,OC_C(6) \
__asm paddw mm4,mm4 \
__asm paddw mm4,mm0 \
__asm psubw mm3,mm6 \
__asm paddw mm5,mm7 \
__asm paddw mm6,mm6 \
__asm pmulhw mm7,OC_C(6) \
__asm paddw mm6,mm3 \
__asm movq OC_I(1,_y),mm4 \
__asm psubw mm1,mm5 \
__asm movq mm4,OC_C(4) \
__asm movq mm5,mm3 \
__asm pmulhw mm3,mm4 \
__asm paddw mm7,mm2 \
__asm movq OC_I(2,_y),mm6 \
__asm movq mm2,mm0 \
__asm movq mm6,OC_I(0,_x) \
__asm pmulhw mm0,mm4 \
__asm paddw mm5,mm3 \
__asm movq mm3,OC_J(4,_x) \
__asm psubw mm5,mm1 \
__asm paddw mm2,mm0 \
__asm psubw mm6,mm3 \
__asm movq mm0,mm6 \
__asm pmulhw mm6,mm4 \
__asm paddw mm3,mm3 \
__asm paddw mm1,mm1 \
__asm paddw mm3,mm0 \
__asm paddw mm1,mm5 \
__asm pmulhw mm4,mm3 \
__asm paddw mm6,mm0 \
__asm psubw mm6,mm2 \
__asm paddw mm2,mm2 \
__asm movq mm0,OC_I(1,_y) \
__asm paddw mm2,mm6 \
__asm paddw mm4,mm3 \
__asm psubw mm2,mm1 \
}
/*38+8=46 cycles.*/
#define OC_ROW_IDCT(_y,_x) __asm{ \
OC_IDCT_BEGIN(_y,_x) \
/*r3=D'*/ \
__asm movq mm3,OC_I(2,_y) \
/*r4=E'=E-G*/ \
__asm psubw mm4,mm7 \
/*r1=H'+H'*/ \
__asm paddw mm1,mm1 \
/*r7=G+G*/ \
__asm paddw mm7,mm7 \
/*r1=R1=A''+H'*/ \
__asm paddw mm1,mm2 \
/*r7=G'=E+G*/ \
__asm paddw mm7,mm4 \
/*r4=R4=E'-D'*/ \
__asm psubw mm4,mm3 \
__asm paddw mm3,mm3 \
/*r6=R6=F'-B''*/ \
__asm psubw mm6,mm5 \
__asm paddw mm5,mm5 \
/*r3=R3=E'+D'*/ \
__asm paddw mm3,mm4 \
/*r5=R5=F'+B''*/ \
__asm paddw mm5,mm6 \
/*r7=R7=G'-C'*/ \
__asm psubw mm7,mm0 \
__asm paddw mm0,mm0 \
/*Save R1.*/ \
__asm movq OC_I(1,_y),mm1 \
/*r0=R0=G.+C.*/ \
__asm paddw mm0,mm7 \
}
/*The following macro does two 4x4 transposes in place.
At entry, we assume:
r0 = a3 a2 a1 a0
I(1) = b3 b2 b1 b0
r2 = c3 c2 c1 c0
r3 = d3 d2 d1 d0
r4 = e3 e2 e1 e0
r5 = f3 f2 f1 f0
r6 = g3 g2 g1 g0
r7 = h3 h2 h1 h0
At exit, we have:
I(0) = d0 c0 b0 a0
I(1) = d1 c1 b1 a1
I(2) = d2 c2 b2 a2
I(3) = d3 c3 b3 a3
J(4) = h0 g0 f0 e0
J(5) = h1 g1 f1 e1
J(6) = h2 g2 f2 e2
J(7) = h3 g3 f3 e3
I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
Since r1 is free at entry, we calculate the Js first.*/
/*19 cycles.*/
#define OC_TRANSPOSE(_y) __asm{ \
__asm movq mm1,mm4 \
__asm punpcklwd mm4,mm5 \
__asm movq OC_I(0,_y),mm0 \
__asm punpckhwd mm1,mm5 \
__asm movq mm0,mm6 \
__asm punpcklwd mm6,mm7 \
__asm movq mm5,mm4 \
__asm punpckldq mm4,mm6 \
__asm punpckhdq mm5,mm6 \
__asm movq mm6,mm1 \
__asm movq OC_J(4,_y),mm4 \
__asm punpckhwd mm0,mm7 \
__asm movq OC_J(5,_y),mm5 \
__asm punpckhdq mm6,mm0 \
__asm movq mm4,OC_I(0,_y) \
__asm punpckldq mm1,mm0 \
__asm movq mm5,OC_I(1,_y) \
__asm movq mm0,mm4 \
__asm movq OC_J(7,_y),mm6 \
__asm punpcklwd mm0,mm5 \
__asm movq OC_J(6,_y),mm1 \
__asm punpckhwd mm4,mm5 \
__asm movq mm5,mm2 \
__asm punpcklwd mm2,mm3 \
__asm movq mm1,mm0 \
__asm punpckldq mm0,mm2 \
__asm punpckhdq mm1,mm2 \
__asm movq mm2,mm4 \
__asm movq OC_I(0,_y),mm0 \
__asm punpckhwd mm5,mm3 \
__asm movq OC_I(1,_y),mm1 \
__asm punpckhdq mm4,mm5 \
__asm punpckldq mm2,mm5 \
__asm movq OC_I(3,_y),mm4 \
__asm movq OC_I(2,_y),mm2 \
}
/*38+19=57 cycles.*/
#define OC_COLUMN_IDCT(_y) __asm{ \
OC_IDCT_BEGIN(_y,_y) \
__asm paddw mm2,OC_8 \
/*r1=H'+H'*/ \
__asm paddw mm1,mm1 \
/*r1=R1=A''+H'*/ \
__asm paddw mm1,mm2 \
/*r2=NR2*/ \
__asm psraw mm2,4 \
/*r4=E'=E-G*/ \
__asm psubw mm4,mm7 \
/*r1=NR1*/ \
__asm psraw mm1,4 \
/*r3=D'*/ \
__asm movq mm3,OC_I(2,_y) \
/*r7=G+G*/ \
__asm paddw mm7,mm7 \
/*Store NR2 at I(2).*/ \
__asm movq OC_I(2,_y),mm2 \
/*r7=G'=E+G*/ \
__asm paddw mm7,mm4 \
/*Store NR1 at I(1).*/ \
__asm movq OC_I(1,_y),mm1 \
/*r4=R4=E'-D'*/ \
__asm psubw mm4,mm3 \
__asm paddw mm4,OC_8 \
/*r3=D'+D'*/ \
__asm paddw mm3,mm3 \
/*r3=R3=E'+D'*/ \
__asm paddw mm3,mm4 \
/*r4=NR4*/ \
__asm psraw mm4,4 \
/*r6=R6=F'-B''*/ \
__asm psubw mm6,mm5 \
/*r3=NR3*/ \
__asm psraw mm3,4 \
__asm paddw mm6,OC_8 \
/*r5=B''+B''*/ \
__asm paddw mm5,mm5 \
/*r5=R5=F'+B''*/ \
__asm paddw mm5,mm6 \
/*r6=NR6*/ \
__asm psraw mm6,4 \
/*Store NR4 at J(4).*/ \
__asm movq OC_J(4,_y),mm4 \
/*r5=NR5*/ \
__asm psraw mm5,4 \
/*Store NR3 at I(3).*/ \
__asm movq OC_I(3,_y),mm3 \
/*r7=R7=G'-C'*/ \
__asm psubw mm7,mm0 \
__asm paddw mm7,OC_8 \
/*r0=C'+C'*/ \
__asm paddw mm0,mm0 \
/*r0=R0=G'+C'*/ \
__asm paddw mm0,mm7 \
/*r7=NR7*/ \
__asm psraw mm7,4 \
/*Store NR6 at J(6).*/ \
__asm movq OC_J(6,_y),mm6 \
/*r0=NR0*/ \
__asm psraw mm0,4 \
/*Store NR5 at J(5).*/ \
__asm movq OC_J(5,_y),mm5 \
/*Store NR7 at J(7).*/ \
__asm movq OC_J(7,_y),mm7 \
/*Store NR0 at I(0).*/ \
__asm movq OC_I(0,_y),mm0 \
}
#define OC_MID(_m,_i) [CONSTS+_m+(_i)*8]
#define OC_C(_i) OC_MID(OC_COSINE_OFFSET,_i-1)
#define OC_8 OC_MID(OC_EIGHT_OFFSET,0)
static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
int i;
/*This routine accepts an 8x8 matrix, but in partially transposed form.
Every 4x4 block is transposed.*/
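/*For example (illustrative): within each 4x4 quadrant the coefficient that
  would normally live at (row,col) is stored at (col,row), so natural
  indices 1 and 8 trade places, as do 2 and 16, and so on.
  This is the layout the OC_FZIG_ZAG_MMX table in x86state.c produces when
  the decoder descrambles coefficients.*/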
__asm{
#define CONSTS eax
#define Y edx
#define X ecx
mov CONSTS,offset OC_IDCT_CONSTS
mov Y,_y
mov X,_x
#define OC_I(_k,_y) [(_y)+(_k)*16]
#define OC_J(_k,_y) [(_y)+((_k)-4)*16+8]
OC_ROW_IDCT(Y,X)
OC_TRANSPOSE(Y)
#undef OC_I
#undef OC_J
#define OC_I(_k,_y) [(_y)+(_k)*16+64]
#define OC_J(_k,_y) [(_y)+((_k)-4)*16+72]
OC_ROW_IDCT(Y,X)
OC_TRANSPOSE(Y)
#undef OC_I
#undef OC_J
#define OC_I(_k,_y) [(_y)+(_k)*16]
#define OC_J(_k,_y) OC_I(_k,_y)
OC_COLUMN_IDCT(Y)
#undef OC_I
#undef OC_J
#define OC_I(_k,_y) [(_y)+(_k)*16+8]
#define OC_J(_k,_y) OC_I(_k,_y)
OC_COLUMN_IDCT(Y)
#undef OC_I
#undef OC_J
#undef CONSTS
#undef Y
#undef X
}
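/*Zero the consumed input coefficients, presumably so the caller gets back an
  already-cleared block and can skip a separate memset before accumulating
  the next fragment's coefficients.*/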
__asm pxor mm0,mm0;
for(i=0;i<4;i++){
ogg_int16_t *x;
x=_x+16*i;
#define X ecx
__asm{
mov X,x
movq [X+0x00],mm0
movq [X+0x08],mm0
movq [X+0x10],mm0
movq [X+0x18],mm0
}
#undef X
}
}
/*25 cycles.*/
#define OC_IDCT_BEGIN_10(_y,_x) __asm{ \
__asm movq mm2,OC_I(3,_x) \
__asm nop \
__asm movq mm6,OC_C(3) \
__asm movq mm4,mm2 \
__asm movq mm1,OC_C(5) \
__asm pmulhw mm4,mm6 \
__asm movq mm3,OC_I(1,_x) \
__asm pmulhw mm1,mm2 \
__asm movq mm0,OC_C(1) \
__asm paddw mm4,mm2 \
__asm pxor mm6,mm6 \
__asm paddw mm2,mm1 \
__asm movq mm5,OC_I(2,_x) \
__asm pmulhw mm0,mm3 \
__asm movq mm1,mm5 \
__asm paddw mm0,mm3 \
__asm pmulhw mm3,OC_C(7) \
__asm psubw mm6,mm2 \
__asm pmulhw mm5,OC_C(2) \
__asm psubw mm0,mm4 \
__asm movq mm7,OC_I(2,_x) \
__asm paddw mm4,mm4 \
__asm paddw mm7,mm5 \
__asm paddw mm4,mm0 \
__asm pmulhw mm1,OC_C(6) \
__asm psubw mm3,mm6 \
__asm movq OC_I(1,_y),mm4 \
__asm paddw mm6,mm6 \
__asm movq mm4,OC_C(4) \
__asm paddw mm6,mm3 \
__asm movq mm5,mm3 \
__asm pmulhw mm3,mm4 \
__asm movq OC_I(2,_y),mm6 \
__asm movq mm2,mm0 \
__asm movq mm6,OC_I(0,_x) \
__asm pmulhw mm0,mm4 \
__asm paddw mm5,mm3 \
__asm paddw mm2,mm0 \
__asm psubw mm5,mm1 \
__asm pmulhw mm6,mm4 \
__asm paddw mm6,OC_I(0,_x) \
__asm paddw mm1,mm1 \
__asm movq mm4,mm6 \
__asm paddw mm1,mm5 \
__asm psubw mm6,mm2 \
__asm paddw mm2,mm2 \
__asm movq mm0,OC_I(1,_y) \
__asm paddw mm2,mm6 \
__asm psubw mm2,mm1 \
__asm nop \
}
/*25+8=33 cycles.*/
#define OC_ROW_IDCT_10(_y,_x) __asm{ \
OC_IDCT_BEGIN_10(_y,_x) \
/*r3=D'*/ \
__asm movq mm3,OC_I(2,_y) \
/*r4=E'=E-G*/ \
__asm psubw mm4,mm7 \
/*r1=H'+H'*/ \
__asm paddw mm1,mm1 \
/*r7=G+G*/ \
__asm paddw mm7,mm7 \
/*r1=R1=A''+H'*/ \
__asm paddw mm1,mm2 \
/*r7=G'=E+G*/ \
__asm paddw mm7,mm4 \
/*r4=R4=E'-D'*/ \
__asm psubw mm4,mm3 \
__asm paddw mm3,mm3 \
/*r6=R6=F'-B''*/ \
__asm psubw mm6,mm5 \
__asm paddw mm5,mm5 \
/*r3=R3=E'+D'*/ \
__asm paddw mm3,mm4 \
/*r5=R5=F'+B''*/ \
__asm paddw mm5,mm6 \
/*r7=R7=G'-C'*/ \
__asm psubw mm7,mm0 \
__asm paddw mm0,mm0 \
/*Save R1.*/ \
__asm movq OC_I(1,_y),mm1 \
/*r0=R0=G'+C'*/ \
__asm paddw mm0,mm7 \
}
/*25+19=44 cycles.*/
#define OC_COLUMN_IDCT_10(_y) __asm{ \
OC_IDCT_BEGIN_10(_y,_y) \
__asm paddw mm2,OC_8 \
/*r1=H'+H'*/ \
__asm paddw mm1,mm1 \
/*r1=R1=A''+H'*/ \
__asm paddw mm1,mm2 \
/*r2=NR2*/ \
__asm psraw mm2,4 \
/*r4=E'=E-G*/ \
__asm psubw mm4,mm7 \
/*r1=NR1*/ \
__asm psraw mm1,4 \
/*r3=D'*/ \
__asm movq mm3,OC_I(2,_y) \
/*r7=G+G*/ \
__asm paddw mm7,mm7 \
/*Store NR2 at I(2).*/ \
__asm movq OC_I(2,_y),mm2 \
/*r7=G'=E+G*/ \
__asm paddw mm7,mm4 \
/*Store NR1 at I(1).*/ \
__asm movq OC_I(1,_y),mm1 \
/*r4=R4=E'-D'*/ \
__asm psubw mm4,mm3 \
__asm paddw mm4,OC_8 \
/*r3=D'+D'*/ \
__asm paddw mm3,mm3 \
/*r3=R3=E'+D'*/ \
__asm paddw mm3,mm4 \
/*r4=NR4*/ \
__asm psraw mm4,4 \
/*r6=R6=F'-B''*/ \
__asm psubw mm6,mm5 \
/*r3=NR3*/ \
__asm psraw mm3,4 \
__asm paddw mm6,OC_8 \
/*r5=B''+B''*/ \
__asm paddw mm5,mm5 \
/*r5=R5=F'+B''*/ \
__asm paddw mm5,mm6 \
/*r6=NR6*/ \
__asm psraw mm6,4 \
/*Store NR4 at J(4).*/ \
__asm movq OC_J(4,_y),mm4 \
/*r5=NR5*/ \
__asm psraw mm5,4 \
/*Store NR3 at I(3).*/ \
__asm movq OC_I(3,_y),mm3 \
/*r7=R7=G'-C'*/ \
__asm psubw mm7,mm0 \
__asm paddw mm7,OC_8 \
/*r0=C'+C'*/ \
__asm paddw mm0,mm0 \
/*r0=R0=G'+C'*/ \
__asm paddw mm0,mm7 \
/*r7=NR7*/ \
__asm psraw mm7,4 \
/*Store NR6 at J(6).*/ \
__asm movq OC_J(6,_y),mm6 \
/*r0=NR0*/ \
__asm psraw mm0,4 \
/*Store NR5 at J(5).*/ \
__asm movq OC_J(5,_y),mm5 \
/*Store NR7 at J(7).*/ \
__asm movq OC_J(7,_y),mm7 \
/*Store NR0 at I(0).*/ \
__asm movq OC_I(0,_y),mm0 \
}
static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
__asm{
#define CONSTS eax
#define Y edx
#define X ecx
mov CONSTS,offset OC_IDCT_CONSTS
mov Y,_y
mov X,_x
#define OC_I(_k,_y) [(_y)+(_k)*16]
#define OC_J(_k,_y) [(_y)+((_k)-4)*16+8]
/*Done with dequant, descramble, and partial transpose.
Now do the iDCT itself.*/
OC_ROW_IDCT_10(Y,X)
OC_TRANSPOSE(Y)
#undef OC_I
#undef OC_J
#define OC_I(_k,_y) [(_y)+(_k)*16]
#define OC_J(_k,_y) OC_I(_k,_y)
OC_COLUMN_IDCT_10(Y)
#undef OC_I
#undef OC_J
#define OC_I(_k,_y) [(_y)+(_k)*16+8]
#define OC_J(_k,_y) OC_I(_k,_y)
OC_COLUMN_IDCT_10(Y)
#undef OC_I
#undef OC_J
#undef CONSTS
#undef Y
#undef X
}
#define X ecx
__asm{
pxor mm0,mm0;
mov X,_x
movq [X+0x00],mm0
movq [X+0x10],mm0
movq [X+0x20],mm0
movq [X+0x30],mm0
}
#undef X
}
/*Performs an inverse 8x8 Type-II DCT transform.
The input is assumed to be scaled by a factor of 4 relative to the orthonormal
version of the transform.*/
void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
/*_last_zzi is subtly different from an actual count of the number of
coefficients we decoded for this block.
It contains the value of zzi BEFORE the final token in the block was
decoded.
In most cases this is an EOB token (the continuation of an EOB run from a
previous block counts), and so this is the same as the coefficient count.
However, in the case that the last token was NOT an EOB token, but filled
the block up with exactly 64 coefficients, _last_zzi will be less than 64.
Provided the last token was not a pure zero run, the minimum value it can
be is 46, and so that doesn't affect any of the cases in this routine.
However, if the last token WAS a pure zero run of length 63, then _last_zzi
will be 1 while the number of coefficients decoded is 64.
Thus, we will trigger the following special case, where the real
coefficient count would not.
Note also that a zero run of length 64 will give _last_zzi a value of 0,
but we still process the DC coefficient, which might have a non-zero value
due to DC prediction.
Although convoluted, this is arguably the correct behavior: it allows us to
use a smaller transform when the block ends with a long zero run instead
of a normal EOB token.
It could be smarter... multiple separate zero runs at the end of a block
will fool it, but an encoder that generates these really deserves what it
gets.
Needless to say we inherited this approach from VP3.*/
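/*For instance, a block coded as a single DC value followed by an EOB reaches
  this point with _last_zzi==1 and takes the 10-coefficient path, while a
  block with coefficients up through zig-zag index 30 arrives with
  _last_zzi==31 and uses the full transform.*/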
/*Perform the iDCT.*/
if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
else oc_idct8x8_slow(_y,_x);
}
#endif

219
thirdparty/libtheora/x86_vc/mmxloop.h vendored Normal file
View File

@@ -0,0 +1,219 @@
#if !defined(_x86_vc_mmxloop_H)
# define _x86_vc_mmxloop_H (1)
# include <stddef.h>
# include "x86int.h"
#if defined(OC_X86_ASM)
/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/
#define OC_LOOP_FILTER8_MMX __asm{ \
/*mm7=0*/ \
__asm pxor mm7,mm7 \
/*mm6:mm0={a0,...,a7}*/ \
__asm movq mm6,mm0 \
__asm punpcklbw mm0,mm7 \
__asm punpckhbw mm6,mm7 \
/*mm3:mm5={d0,...,d7}*/ \
__asm movq mm5,mm3 \
__asm punpcklbw mm3,mm7 \
__asm punpckhbw mm5,mm7 \
/*mm6:mm0={a0-d0,...,a7-d7}*/ \
__asm psubw mm0,mm3 \
__asm psubw mm6,mm5 \
/*mm3:mm1={b0,...,b7}*/ \
__asm movq mm3,mm1 \
__asm punpcklbw mm1,mm7 \
__asm movq mm4,mm2 \
__asm punpckhbw mm3,mm7 \
/*mm5:mm4={c0,...,c7}*/ \
__asm movq mm5,mm2 \
__asm punpcklbw mm4,mm7 \
__asm punpckhbw mm5,mm7 \
/*mm7={3}x4 \
mm5:mm4={c0-b0,...,c7-b7}*/ \
__asm pcmpeqw mm7,mm7 \
__asm psubw mm4,mm1 \
__asm psrlw mm7,14 \
__asm psubw mm5,mm3 \
/*Scale by 3.*/ \
__asm pmullw mm4,mm7 \
__asm pmullw mm5,mm7 \
/*mm7={4}x4 \
mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
__asm psrlw mm7,1 \
__asm paddw mm4,mm0 \
__asm psllw mm7,2 \
__asm movq mm0,[LL] \
__asm paddw mm5,mm6 \
/*R_i has the range [-127,128], so we compute -R_i instead. \
mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
__asm psubw mm4,mm7 \
__asm psubw mm5,mm7 \
__asm psraw mm4,3 \
__asm psraw mm5,3 \
__asm pcmpeqb mm7,mm7 \
__asm packsswb mm4,mm5 \
__asm pxor mm6,mm6 \
__asm pxor mm4,mm7 \
__asm packuswb mm1,mm3 \
/*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \
/*There's no unsigned byte+signed byte with unsigned saturation op code, so \
we have to split things by sign (the other option is to work in 16 bits, \
but working in 8 bits gives much better parallelism). \
We compute abs(R_i), but save a mask of which terms were negative in mm6. \
Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
Finally, we split mm4 into positive and negative pieces using the mask in \
mm6, and add and subtract them as appropriate.*/ \
/*mm4=abs(-R_i)*/ \
/*mm7=255-2*L*/ \
__asm pcmpgtb mm6,mm4 \
__asm psubb mm7,mm0 \
__asm pxor mm4,mm6 \
__asm psubb mm7,mm0 \
__asm psubb mm4,mm6 \
/*mm7=255-max(2*L-abs(R_i),0)*/ \
__asm paddusb mm7,mm4 \
/*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
__asm paddusb mm4,mm7 \
__asm psubusb mm4,mm7 \
/*Now split mm4 by the original sign of -R_i.*/ \
__asm movq mm5,mm4 \
__asm pand mm4,mm6 \
__asm pandn mm6,mm5 \
/*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
/*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
__asm paddusb mm1,mm4 \
__asm psubusb mm2,mm4 \
__asm psubusb mm1,mm6 \
__asm paddusb mm2,mm6 \
}
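/*For reference, a scalar sketch of the filter implemented by the macro above
  (illustrative only; the names below are not part of the library).
  For each of the eight columns, with pixels a,b,c,d across the edge and
  filter limit L:
    static int lflim(int _r,int _l){
      int t;
      t=_r<0?-_r:_r;
      t=t<=_l?t:(2*_l-t>0?2*_l-t:0);
      return _r<0?-t:t;
    }
    ...
    r=(a-d+3*(c-b)+4)>>3;
    f=lflim(r,L);
    b+=f;if(b<0)b=0;else if(b>255)b=255;
    c-=f;if(c<0)c=0;else if(c>255)c=255;
  The MMX version computes -r instead (r lies in [-127,128], so -r fits in a
  signed byte), splits lflim into a magnitude and a sign mask, and relies on
  the saturating byte adds/subtracts for the final clamping.*/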
#define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \
do{ \
/*Used local variable pix__ in order to fix compilation errors like: \
"error C2425: 'SHL' : non-constant expression in 'second operand'".*/ \
unsigned char *pix__; \
unsigned char *ll__; \
ll__=(_ll); \
pix__=(_pix); \
__asm mov YSTRIDE,_ystride \
__asm mov LL,ll__ \
__asm mov PIX,pix__ \
__asm sub PIX,YSTRIDE \
__asm sub PIX,YSTRIDE \
/*mm0={a0,...,a7}*/ \
__asm movq mm0,[PIX] \
/*ystride3=_ystride*3*/ \
__asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
/*mm3={d0,...,d7}*/ \
__asm movq mm3,[PIX+YSTRIDE3] \
/*mm1={b0,...,b7}*/ \
__asm movq mm1,[PIX+YSTRIDE] \
/*mm2={c0,...,c7}*/ \
__asm movq mm2,[PIX+YSTRIDE*2] \
OC_LOOP_FILTER8_MMX \
/*Write it back out.*/ \
__asm movq [PIX+YSTRIDE],mm1 \
__asm movq [PIX+YSTRIDE*2],mm2 \
} \
while(0)
#define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \
do{ \
/*Used local variable ll__ in order to fix compilation errors like: \
"error C2443: operand size conflict".*/ \
unsigned char *ll__; \
unsigned char *pix__; \
ll__=(_ll); \
pix__=(_pix)-2; \
__asm mov PIX,pix__ \
__asm mov YSTRIDE,_ystride \
__asm mov LL,ll__ \
/*x x x x d0 c0 b0 a0*/ \
__asm movd mm0,[PIX] \
/*x x x x d1 c1 b1 a1*/ \
__asm movd mm1,[PIX+YSTRIDE] \
/*ystride3=_ystride*3*/ \
__asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
/*x x x x d2 c2 b2 a2*/ \
__asm movd mm2,[PIX+YSTRIDE*2] \
/*x x x x d3 c3 b3 a3*/ \
__asm lea D,[PIX+YSTRIDE*4] \
__asm movd mm3,[PIX+YSTRIDE3] \
/*x x x x d4 c4 b4 a4*/ \
__asm movd mm4,[D] \
/*x x x x d5 c5 b5 a5*/ \
__asm movd mm5,[D+YSTRIDE] \
/*x x x x d6 c6 b6 a6*/ \
__asm movd mm6,[D+YSTRIDE*2] \
/*x x x x d7 c7 b7 a7*/ \
__asm movd mm7,[D+YSTRIDE3] \
/*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \
__asm punpcklbw mm0,mm1 \
/*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \
__asm punpcklbw mm2,mm3 \
/*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \
__asm movq mm3,mm0 \
/*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \
__asm punpcklwd mm0,mm2 \
/*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \
__asm punpckhwd mm3,mm2 \
/*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \
__asm movq mm1,mm0 \
/*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \
__asm punpcklbw mm4,mm5 \
/*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \
__asm punpcklbw mm6,mm7 \
/*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \
__asm movq mm5,mm4 \
/*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \
__asm punpcklwd mm4,mm6 \
/*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \
__asm punpckhwd mm5,mm6 \
/*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \
__asm movq mm2,mm3 \
/*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \
__asm punpckldq mm0,mm4 \
/*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \
__asm punpckhdq mm1,mm4 \
/*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \
__asm punpckldq mm2,mm5 \
/*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
__asm punpckhdq mm3,mm5 \
OC_LOOP_FILTER8_MMX \
/*mm2={b0+R_0'',...,b7+R_7''}*/ \
__asm movq mm0,mm1 \
/*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
__asm punpcklbw mm1,mm2 \
/*mm2={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \
__asm punpckhbw mm0,mm2 \
/*[d]=c1 b1 c0 b0*/ \
__asm movd D,mm1 \
__asm mov [PIX+1],D_WORD \
__asm psrlq mm1,32 \
__asm shr D,16 \
__asm mov [PIX+YSTRIDE+1],D_WORD \
/*[d]=c3 b3 c2 b2*/ \
__asm movd D,mm1 \
__asm mov [PIX+YSTRIDE*2+1],D_WORD \
__asm shr D,16 \
__asm mov [PIX+YSTRIDE3+1],D_WORD \
__asm lea PIX,[PIX+YSTRIDE*4] \
/*[d]=c5 b5 c4 b4*/ \
__asm movd D,mm0 \
__asm mov [PIX+1],D_WORD \
__asm psrlq mm0,32 \
__asm shr D,16 \
__asm mov [PIX+YSTRIDE+1],D_WORD \
/*[d]=c7 b7 c6 b6*/ \
__asm movd D,mm0 \
__asm mov [PIX+YSTRIDE*2+1],D_WORD \
__asm shr D,16 \
__asm mov [PIX+YSTRIDE3+1],D_WORD \
} \
while(0)
# endif
#endif

176
thirdparty/libtheora/x86_vc/mmxstate.c vendored Normal file
View File

@@ -0,0 +1,176 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
/*MMX acceleration of complete fragment reconstruction algorithm.
Originally written by Rudolf Marek.*/
#include <string.h>
#include "x86int.h"
#include "mmxloop.h"
#if defined(OC_X86_ASM)
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
unsigned char *dst;
ptrdiff_t frag_buf_off;
int ystride;
int refi;
/*Apply the inverse transform.*/
/*Special case only having a DC component.*/
if(_last_zzi<2){
/*Note that this value must be unsigned, to keep the __asm__ block from
sign-extending it when it puts it in a register.*/
ogg_uint16_t p;
/*We round this dequant product (and not any of the others) because there's
no iDCT rounding.*/
p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
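/*(The +15>>5 is round-to-nearest of the product divided by 32: the 2D iDCT's
   DC gain of 1/8 combined with the factor-of-4 input scaling leaves every
   residual sample of a DC-only block equal to the dequantized DC over 32.)*/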
/*Fill _dct_coeffs with p.*/
__asm{
#define Y eax
#define P ecx
mov Y,_dct_coeffs
movzx P,p
lea Y,[Y+128]
/*mm0=0000 0000 0000 AAAA*/
movd mm0,P
/*mm0=0000 0000 AAAA AAAA*/
punpcklwd mm0,mm0
/*mm0=AAAA AAAA AAAA AAAA*/
punpckldq mm0,mm0
movq [Y],mm0
movq [8+Y],mm0
movq [16+Y],mm0
movq [24+Y],mm0
movq [32+Y],mm0
movq [40+Y],mm0
movq [48+Y],mm0
movq [56+Y],mm0
movq [64+Y],mm0
movq [72+Y],mm0
movq [80+Y],mm0
movq [88+Y],mm0
movq [96+Y],mm0
movq [104+Y],mm0
movq [112+Y],mm0
movq [120+Y],mm0
#undef Y
#undef P
}
}
else{
/*Dequantize the DC coefficient.*/
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
oc_idct8x8_mmx(_dct_coeffs+64,_dct_coeffs,_last_zzi);
}
/*Fill in the target buffer.*/
frag_buf_off=_state->frag_buf_offs[_fragi];
refi=_state->frags[_fragi].refi;
ystride=_state->ref_ystride[_pli];
dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
else{
const unsigned char *ref;
int mvoffsets[2];
ref=_state->ref_frame_data[refi]+frag_buf_off;
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
_state->frag_mvs[_fragi])>1){
oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
_dct_coeffs+64);
}
else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
}
}
/*We copy these entire functions to inline the actual MMX routines so that we
use only a single indirect call.*/
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
memset(_bv,_flimit,8);
}
/*Apply the loop filter to a given set of fragment rows in the given plane.
The filter may be run on the bottom edge, affecting pixels in the next row of
fragments, so this row also needs to be available.
_bv: The bounding values array.
_refi: The index of the frame buffer to filter.
_pli: The color plane to filter.
_fragy0: The Y coordinate of the first fragment row to filter.
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
const oc_fragment_plane *fplane;
const oc_fragment *frags;
const ptrdiff_t *frag_buf_offs;
unsigned char *ref_frame_data;
ptrdiff_t fragi_top;
ptrdiff_t fragi_bot;
ptrdiff_t fragi0;
ptrdiff_t fragi0_end;
int ystride;
int nhfrags;
fplane=_state->fplanes+_pli;
nhfrags=fplane->nhfrags;
fragi_top=fplane->froffset;
fragi_bot=fragi_top+fplane->nfrags;
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
ystride=_state->ref_ystride[_pli];
frags=_state->frags;
frag_buf_offs=_state->frag_buf_offs;
ref_frame_data=_state->ref_frame_data[_refi];
/*The following loops are constructed somewhat non-intuitively on purpose.
The main idea is: if a block boundary has at least one coded fragment on
it, the filter is applied to it.
However, the order that the filters are applied in matters, and VP3 chose
the somewhat strange ordering used below.*/
while(fragi0<fragi0_end){
ptrdiff_t fragi;
ptrdiff_t fragi_end;
fragi=fragi0;
fragi_end=fragi+nhfrags;
while(fragi<fragi_end){
if(frags[fragi].coded){
unsigned char *ref;
ref=ref_frame_data+frag_buf_offs[fragi];
#define PIX eax
#define YSTRIDE3 edi
#define YSTRIDE ecx
#define LL edx
#define D esi
#define D_WORD si
if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,_bv);
if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,_bv);
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
OC_LOOP_FILTER_H_MMX(ref+8,ystride,_bv);
}
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
OC_LOOP_FILTER_V_MMX(ref+(ystride*8),ystride,_bv);
}
#undef PIX
#undef YSTRIDE3
#undef YSTRIDE
#undef LL
#undef D
#undef D_WORD
}
fragi++;
}
fragi0+=nhfrags;
}
}
#endif

192
thirdparty/libtheora/x86_vc/x86cpu.c vendored Normal file
View File

@@ -0,0 +1,192 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
CPU capability detection for x86 processors.
Originally written by Rudolf Marek.
function:
********************************************************************/
#include "x86cpu.h"
#if !defined(OC_X86_ASM)
ogg_uint32_t oc_cpu_flags_get(void){
return 0;
}
#else
/*Why does MSVC need this complicated rigamarole?
At this point I honestly do not care.*/
/*Visual C cpuid helper function.
For VS2005 we could just as well use the __cpuid intrinsic, but that wouldn't work
for VS2003 users, so we do it in inline assembler.*/
static void oc_cpuid_helper(ogg_uint32_t _cpu_info[4],ogg_uint32_t _op){
_asm{
mov eax,[_op]
mov esi,_cpu_info
cpuid
mov [esi+0],eax
mov [esi+4],ebx
mov [esi+8],ecx
mov [esi+12],edx
}
}
# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
do{ \
ogg_uint32_t cpu_info[4]; \
oc_cpuid_helper(cpu_info,_op); \
(_eax)=cpu_info[0]; \
(_ebx)=cpu_info[1]; \
(_ecx)=cpu_info[2]; \
(_edx)=cpu_info[3]; \
}while(0)
static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){
_asm{
pushfd
pushfd
pop eax
mov ebx,eax
xor eax,200000h
push eax
popfd
pushfd
pop eax
popfd
mov ecx,_eax
mov [ecx],eax
mov ecx,_ebx
mov [ecx],ebx
}
}
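/*This is the standard EFLAGS test for cpuid support: bit 21 (the ID flag) is
  toggled and EFLAGS is read back.
  On processors that implement cpuid the change sticks and *_eax!=*_ebx; on
  older chips the bit cannot be modified and the two values compare equal,
  which is exactly what the caller below checks for.*/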
static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
ogg_uint32_t flags;
/*If there isn't even MMX, give up.*/
if(!(_edx&0x00800000))return 0;
flags=OC_CPU_X86_MMX;
if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3;
if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
return flags;
}
static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
ogg_uint32_t flags;
/*If there isn't even MMX, give up.*/
if(!(_edx&0x00800000))return 0;
flags=OC_CPU_X86_MMX;
if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A;
if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5;
return flags;
}
ogg_uint32_t oc_cpu_flags_get(void){
ogg_uint32_t flags;
ogg_uint32_t eax;
ogg_uint32_t ebx;
ogg_uint32_t ecx;
ogg_uint32_t edx;
# if !defined(__amd64__)&&!defined(__x86_64__)
/*Not all x86-32 chips support cpuid, so we have to check.*/
oc_detect_cpuid_helper(&eax,&ebx);
/*No cpuid.*/
if(eax==ebx)return 0;
# endif
cpuid(0,eax,ebx,ecx,edx);
/* l e t n I e n i u n e G*/
if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
/* 6 8 x M T e n i u n e G*/
ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
int family;
int model;
/*Intel, Transmeta (tested with Crusoe TM5800):*/
cpuid(1,eax,ebx,ecx,edx);
flags=oc_parse_intel_flags(edx,ecx);
family=(eax>>8)&0xF;
model=(eax>>4)&0xF;
/*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
unit, so don't use it.*/
if(family==6&&(model==9||model==13||model==14)){
flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
}
}
/* D M A c i t n e h t u A*/
else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
/* C S N y b e d o e G*/
ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
/*AMD, Geode:*/
cpuid(0x80000000,eax,ebx,ecx,edx);
if(eax<0x80000001)flags=0;
else{
cpuid(0x80000001,eax,ebx,ecx,edx);
flags=oc_parse_amd_flags(edx,ecx);
}
/*Also check for SSE.*/
cpuid(1,eax,ebx,ecx,edx);
flags|=oc_parse_intel_flags(edx,ecx);
}
/*Technically some VIA chips can be configured in the BIOS to return any
string here the user wants.
There is a special detection method that can be used to identify such
processors, but in my opinion, if the user really wants to change it, they
deserve what they get.*/
/* s l u a H r u a t n e C*/
else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
/*VIA:*/
/*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
chips (thanks to the engineers from Centaur Technology who provided it).
These chips support Intel-like cpuid info.
The C3-2 (Nehemiah) cores appear to, as well.*/
cpuid(1,eax,ebx,ecx,edx);
flags=oc_parse_intel_flags(edx,ecx);
if(eax>=0x80000001){
/*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
We need to check this even if the Intel test succeeds to pick up 3DNow!
support on these processors.
Unlike actual AMD processors, we cannot _rely_ on this info, since
some cores (e.g., the 693 stepping of the Nehemiah) claim to support
this function, yet return edx=0, despite the Intel test indicating
MMX support.
Therefore the features detected here are strictly added to those
detected by the Intel test.*/
/*TODO: How about earlier chips?*/
cpuid(0x80000001,eax,ebx,ecx,edx);
/*Note: As of the C7, this function returns Intel-style extended feature
flags, not AMD-style.
Currently, this only defines bits 11, 20, and 29 (0x20100800), which
do not conflict with any of the AMD flags we inspect.
For the remaining bits, Intel tells us, "Do not count on their value",
but VIA assures us that they will all be zero (at least on the C7 and
Isaiah chips).
In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
(0xC0C00000) for something else, we will have to add code to detect
the model to decide when it is appropriate to inspect them.*/
flags|=oc_parse_amd_flags(edx,ecx);
}
}
else{
/*Implement me.*/
flags=0;
}
return flags;
}
#endif

36
thirdparty/libtheora/x86_vc/x86cpu.h vendored Normal file
View File

@@ -0,0 +1,36 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
#if !defined(_x86_vc_x86cpu_H)
# define _x86_vc_x86cpu_H (1)
#include "../internal.h"
#define OC_CPU_X86_MMX (1<<0)
#define OC_CPU_X86_3DNOW (1<<1)
#define OC_CPU_X86_3DNOWEXT (1<<2)
#define OC_CPU_X86_MMXEXT (1<<3)
#define OC_CPU_X86_SSE (1<<4)
#define OC_CPU_X86_SSE2 (1<<5)
#define OC_CPU_X86_PNI (1<<6)
#define OC_CPU_X86_SSSE3 (1<<7)
#define OC_CPU_X86_SSE4_1 (1<<8)
#define OC_CPU_X86_SSE4_2 (1<<9)
#define OC_CPU_X86_SSE4A (1<<10)
#define OC_CPU_X86_SSE5 (1<<11)
ogg_uint32_t oc_cpu_flags_get(void);
#endif

47
thirdparty/libtheora/x86_vc/x86enc.c vendored Normal file
View File

@@ -0,0 +1,47 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
#include "x86enc.h"
#if defined(OC_X86_ASM)
void oc_enc_accel_init_x86(oc_enc_ctx *_enc){
ogg_uint32_t cpu_flags;
cpu_flags=_enc->state.cpu_flags;
oc_enc_accel_init_c(_enc);
if(cpu_flags&OC_CPU_X86_MMX){
_enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
_enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
_enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
_enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
}
if(cpu_flags&OC_CPU_X86_MMXEXT){
_enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
_enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
_enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
_enc->opt_vtable.frag_satd=oc_enc_frag_satd_mmxext;
_enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_mmxext;
_enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
_enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmxext;
}
if(cpu_flags&OC_CPU_X86_SSE2){
# if defined(OC_X86_64_ASM)
_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;
# endif
}
}
#endif

51
thirdparty/libtheora/x86_vc/x86enc.h vendored Normal file
View File

@@ -0,0 +1,51 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
#if !defined(_x86_vc_x86enc_H)
# define _x86_vc_x86enc_H (1)
# include "x86int.h"
# if defined(OC_X86_ASM)
# define oc_enc_accel_init oc_enc_accel_init_x86
# define OC_ENC_USE_VTABLE (1)
# endif
# include "../encint.h"
void oc_enc_accel_init_x86(oc_enc_ctx *_enc);
unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
const unsigned char *_ref,int _ystride);
unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
const unsigned char *_ref,int _ystride,unsigned _thresh);
unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
unsigned _thresh);
unsigned oc_enc_frag_satd_mmxext(unsigned *_dc,const unsigned char *_src,
const unsigned char *_ref,int _ystride);
unsigned oc_enc_frag_satd2_mmxext(unsigned *_dc,const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
unsigned oc_enc_frag_intra_satd_mmxext(unsigned *_dc,
const unsigned char *_src,int _ystride);
void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
const unsigned char *_x,const unsigned char *_y,int _stride);
void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
const unsigned char *_x,int _stride);
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
#endif

49
thirdparty/libtheora/x86_vc/x86int.h vendored Normal file
View File

@@ -0,0 +1,49 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
#if !defined(_x86_vc_x86int_H)
# define _x86_vc_x86int_H (1)
# include "../internal.h"
# if defined(OC_X86_ASM)
# define oc_state_accel_init oc_state_accel_init_x86
# define OC_STATE_USE_VTABLE (1)
# endif
# include "../state.h"
# include "x86cpu.h"
void oc_state_accel_init_x86(oc_theora_state *_state);
void oc_frag_copy_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride);
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
const unsigned char *_src_frame,int _ystride,
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
const ogg_int16_t *_residue);
void oc_frag_recon_inter_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
void oc_restore_fpu_mmx(void);
#endif

61
thirdparty/libtheora/x86_vc/x86state.c vendored Normal file
View File

@@ -0,0 +1,61 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
#include "x86int.h"
#if defined(OC_X86_ASM)
/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
each quadrant of the destination.*/
static const unsigned char OC_FZIG_ZAG_MMX[128]={
0, 8, 1, 2, 9,16,24,17,
10, 3,32,11,18,25, 4,12,
5,26,19,40,33,34,41,48,
27, 6,13,20,28,21,14, 7,
56,49,42,35,43,50,57,36,
15,22,29,30,23,44,37,58,
51,59,38,45,52,31,60,53,
46,39,47,54,61,62,55,63,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,
};
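/*Illustrative note: the decoder indexes its coefficient writes through this
  table (roughly dct_coeffs[dct_fzig_zag[zzi]]=val for zig-zag index zzi), so
  substituting it delivers each block to oc_idct8x8_mmx already in the
  quadrant-transposed layout that routine expects; the all-64 second half
  simply gives any overrun of the zig-zag index a harmless place to land.*/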
void oc_state_accel_init_x86(oc_theora_state *_state){
_state->cpu_flags=oc_cpu_flags_get();
if(_state->cpu_flags&OC_CPU_X86_MMX){
_state->opt_vtable.frag_copy=oc_frag_copy_mmx;
_state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
_state->opt_vtable.idct8x8=oc_idct8x8_mmx;
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
_state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
_state->opt_vtable.state_loop_filter_frag_rows=
oc_state_loop_filter_frag_rows_mmx;
_state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
}
else oc_state_accel_init_c(_state);
}
#endif

244
thirdparty/libtheora/x86_vc/x86zigzag.h vendored Normal file
View File

@@ -0,0 +1,244 @@
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors *
* https://www.xiph.org/ *
* *
********************************************************************
function:
********************************************************************/
#if !defined(_x86_vc_x86zigzag_H)
# define _x86_vc_x86zigzag_H (1)
# include "x86enc.h"
/*Converts DCT coefficients from transposed order into zig-zag scan order and
stores them in Y.
This relies on two macros to load the contents of each row:
OC_ZZ_LOAD_ROW_LO(row,reg) and OC_ZZ_LOAD_ROW_HI(row,reg), which load the
first four and second four entries of each row into the specified register,
respectively.
OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row
(because when the rows are already in SSE2 registers, loading the high half
destructively modifies the register).
The index of each output element in the original 64-element array should wind
up in the following 8x8 matrix (the letters indicate the order we compute
each 4-tuple below):
A 0 8 1 2 9 16 24 17 B
C 10 3 4 11 18 25 32 40 E
F 33 26 19 12 5 6 13 20 D
G 27 34 41 48 56 49 42 35 I
L 28 21 14 7 15 22 29 36 M
H 43 50 57 58 51 44 37 30 O
N 23 31 38 45 52 59 60 53 J
P 46 39 47 54 61 62 55 63 K
The order of the coefficients within each tuple is reversed in the comments
below to reflect the usual MSB to LSB notation.*/
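/*A scalar sketch of what the macro computes (illustrative only): with ZZ[64]
  holding the matrix above in raster order,
    for(zzi=0;zzi<64;zzi++)y[zzi]=x[ZZ[zzi]];
  i.e. each zig-zag output slot pulls its value from the listed index of the
  (quadrant-transposed) input.*/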
#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \
OC_ZZ_LOAD_ROW_LO(0,mm0) /*mm0=03 02 01 00*/ \
OC_ZZ_LOAD_ROW_LO(1,mm1) /*mm1=11 10 09 08*/ \
OC_ZZ_LOAD_ROW_LO(2,mm2) /*mm2=19 18 17 16*/ \
OC_ZZ_LOAD_ROW_LO(3,mm3) /*mm3=27 26 25 24*/ \
OC_ZZ_LOAD_ROW_HI(0,mm4) /*mm4=07 06 05 04*/ \
OC_ZZ_LOAD_ROW_HI(1,mm5) /*mm5=15 14 13 12*/ \
OC_ZZ_LOAD_ROW_HI(2,mm6) /*mm6=23 22 21 20*/ \
__asm movq mm7,mm0 /*mm7=03 02 01 00*/ \
__asm punpckhdq mm0,mm1 /*mm0=11 10 03 02*/ \
__asm pshufw mm4,mm4,0x39 /*mm4=04 07 06 05*/ \
__asm punpcklwd mm1,mm0 /*mm1=03 09 02 08*/ \
__asm pshufw mm5,mm5,0x39 /*mm5=12 15 14 13*/ \
__asm punpcklwd mm7,mm1 /*mm7=02 01 08 00 *A*/ \
__asm movq [Y+0x00],mm7 \
__asm punpckhwd mm1,mm4 /*mm1=04 03 07 09*/ \
__asm movq mm7,mm2 /*mm7=19 18 17 16*/ \
__asm punpckhdq mm0,mm1 /*mm0=04 03 11 10*/ \
__asm punpckhwd mm7,mm5 /*mm7=12 19 15 18*/ \
__asm punpcklwd mm1,mm3 /*mm1=25 07 24 09*/ \
__asm punpcklwd mm5,mm6 /*mm5=21 14 20 13*/ \
__asm punpcklwd mm1,mm2 /*mm1=17 24 16 09 *B*/ \
OC_ZZ_LOAD_ROW_LO(4,mm2) /*mm2=35 34 33 32*/ \
__asm movq [Y+0x08],mm1 \
OC_ZZ_LOAD_ROW_LO(5,mm1) /*mm1=43 42 41 40*/ \
__asm pshufw mm0,mm0,0x78 /*mm0=11 04 03 10 *C*/ \
__asm movq [Y+0x10],mm0 \
__asm punpckhdq mm6,mm4 /*mm6=?? 07 23 22*/ \
__asm punpckldq mm4,mm5 /*mm4=20 13 06 05 *D*/ \
__asm movq [Y+0x28],mm4 \
__asm psrlq mm3,16 /*mm3=.. 27 26 25*/ \
__asm pshufw mm0,mm2,0x0E /*mm0=?? ?? 35 34*/ \
__asm movq mm4,mm7 /*mm4=12 19 15 18*/ \
__asm punpcklwd mm2,mm3 /*mm2=26 33 25 32*/ \
__asm punpcklwd mm4,mm1 /*mm4=41 15 40 18*/ \
__asm punpckhwd mm3,mm1 /*mm3=43 .. 42 27*/ \
__asm punpckldq mm4,mm2 /*mm4=25 32 40 18*/ \
__asm punpcklwd mm3,mm0 /*mm3=35 42 34 27*/ \
OC_ZZ_LOAD_ROW_LO(6,mm0) /*mm0=51 50 49 48*/ \
__asm pshufw mm4,mm4,0x6C /*mm4=40 32 25 18 *E*/ \
__asm movq [Y+0x18],mm4 \
OC_ZZ_LOAD_ROW_LO(7,mm4) /*mm4=59 58 57 56*/ \
__asm punpckhdq mm2,mm7 /*mm2=12 19 26 33 *F*/ \
__asm movq [Y+0x20],mm2 \
__asm pshufw mm1,mm1,0xD0 /*mm1=43 41 ?? ??*/ \
__asm pshufw mm0,mm0,0x87 /*mm0=50 48 49 51*/ \
__asm movq mm2,mm3 /*mm2=35 42 34 27*/ \
__asm punpckhwd mm1,mm0 /*mm1=50 43 48 41*/ \
__asm pshufw mm4,mm4,0x93 /*mm4=58 57 56 59*/ \
__asm punpckldq mm3,mm1 /*mm3=48 41 34 27 *G*/ \
__asm movq [Y+0x30],mm3 \
__asm punpckhdq mm1,mm4 /*mm1=58 57 50 43 *H*/ \
__asm movq [Y+0x50],mm1 \
OC_ZZ_LOAD_ROW_HI(7,mm1) /*mm1=63 62 61 60*/ \
__asm punpcklwd mm4,mm0 /*mm4=49 56 51 59*/ \
OC_ZZ_LOAD_ROW_HI(6,mm0) /*mm0=55 54 53 52*/ \
__asm psllq mm6,16 /*mm6=07 23 22 ..*/ \
__asm movq mm3,mm4 /*mm3=49 56 51 59*/ \
__asm punpckhdq mm4,mm2 /*mm4=35 42 49 56 *I*/ \
OC_ZZ_LOAD_ROW_HI(3,mm2) /*mm2=31 30 29 28*/ \
__asm movq [Y+0x38],mm4 \
__asm punpcklwd mm3,mm1 /*mm3=61 51 60 59*/ \
__asm punpcklwd mm7,mm6 /*mm7=22 15 .. ??*/ \
__asm movq mm4,mm3 /*mm4=61 51 60 59*/ \
__asm punpcklwd mm3,mm0 /*mm3=53 60 52 59*/ \
__asm punpckhwd mm4,mm0 /*mm4=55 61 54 51*/ \
OC_ZZ_LOAD_ROW_HI(4,mm0) /*mm0=39 38 37 36*/ \
__asm pshufw mm3,mm3,0xE1 /*mm3=53 60 59 52 *J*/ \
__asm movq [Y+0x68],mm3 \
__asm movq mm3,mm4 /*mm3=?? ?? 54 51*/ \
__asm pshufw mm2,mm2,0x39 /*mm2=28 31 30 29*/ \
__asm punpckhwd mm4,mm1 /*mm4=63 55 62 61 *K*/ \
OC_ZZ_LOAD_ROW_HI(5,mm1) /*mm1=47 46 45 44*/ \
__asm movq [Y+0x78],mm4 \
__asm punpckhwd mm6,mm2 /*mm6=28 07 31 23*/ \
__asm punpcklwd mm2,mm0 /*mm2=37 30 36 29*/ \
__asm punpckhdq mm5,mm6 /*mm5=28 07 21 14*/ \
__asm pshufw mm2,mm2,0x4B /*mm2=36 29 30 37*/ \
__asm pshufw mm5,mm5,0x87 /*mm5=07 14 21 28 *L*/ \
__asm movq [Y+0x40],mm5 \
__asm punpckhdq mm7,mm2 /*mm7=36 29 22 15 *M*/ \
__asm movq [Y+0x48],mm7 \
__asm pshufw mm1,mm1,0x9C /*mm1=46 45 47 44*/ \
__asm punpckhwd mm0,mm1 /*mm0=46 39 45 38*/ \
__asm punpcklwd mm3,mm1 /*mm3=47 54 44 51*/ \
__asm punpckldq mm6,mm0 /*mm6=45 38 31 23 *N*/ \
__asm movq [Y+0x60],mm6 \
__asm punpckhdq mm0,mm3 /*mm0=47 54 46 39*/ \
__asm punpckldq mm3,mm2 /*mm3=30 37 44 51 *O*/ \
__asm movq [Y+0x58],mm3 \
__asm pshufw mm0,mm0,0xB1 /*mm0=54 47 39 46 *P*/ \
__asm movq [Y+0x70],mm0 \
/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan
order and stores them in %[qdct].
The index of each output element in the original 64-element array should wind
up in the following 8x8 matrix (the letters indicate the order we compute
each 4-tuple below):
A 0 1 8 16 9 2 3 10 B
C 17 24 32 25 18 11 4 5 D
E 12 19 26 33 40 48 41 34 I
H 27 20 13 6 7 14 21 28 G
K 35 42 49 56 57 50 43 36 J
F 29 22 15 23 30 37 44 51 M
P 58 59 52 45 38 31 39 46 L
N 53 60 61 54 47 55 62 63 O
The order of the coefficients within each tuple is reversed in the comments
below to reflect the usual MSB to LSB notation.*/
#define OC_ZIG_ZAG_MMXEXT \
"movq 0x00(%[dct]),%%mm0\n\t" /*mm0=03 02 01 00*/ \
"movq 0x08(%[dct]),%%mm1\n\t" /*mm1=07 06 05 04*/ \
"movq 0x10(%[dct]),%%mm2\n\t" /*mm2=11 10 09 08*/ \
"movq 0x20(%[dct]),%%mm3\n\t" /*mm3=19 18 17 16*/ \
"movq 0x30(%[dct]),%%mm4\n\t" /*mm4=27 26 25 24*/ \
"movq 0x40(%[dct]),%%mm5\n\t" /*mm5=35 34 33 32*/ \
"movq %%mm2,%%mm7\n\t" /*mm7=11 10 09 08*/ \
"punpcklwd %%mm3,%%mm2\n\t" /*mm2=17 09 16 08*/ \
"movq %%mm0,%%mm6\n\t" /*mm6=03 02 01 00*/ \
"punpckldq %%mm2,%%mm0\n\t" /*mm0=16 08 01 00 *A*/ \
"movq %%mm0,0x00(%[qdct])\n\t" \
"movq 0x18(%[dct]),%%mm0\n\t" /*mm0=15 14 13 12*/ \
"punpckhdq %%mm6,%%mm6\n\t" /*mm6=03 02 03 02*/ \
"psrlq $16,%%mm7\n\t" /*mm7=.. 11 10 09*/ \
"punpckldq %%mm7,%%mm6\n\t" /*mm6=10 09 03 02*/ \
"punpckhwd %%mm7,%%mm3\n\t" /*mm3=.. 19 11 18*/ \
"pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
"movq %%mm6,0x08(%[qdct])\n\t" \
"psrlq $48,%%mm2\n\t" /*mm2=.. .. .. 17*/ \
"movq %%mm1,%%mm6\n\t" /*mm6=07 06 05 04*/ \
"punpcklwd %%mm5,%%mm2\n\t" /*mm2=33 .. 32 17*/ \
"movq %%mm3,%%mm7\n\t" /*mm7=.. 19 11 18*/ \
"punpckldq %%mm1,%%mm3\n\t" /*mm3=05 04 11 18 *C*/ \
"por %%mm2,%%mm7\n\t" /*mm7=33 19 ?? ??*/ \
"punpcklwd %%mm4,%%mm2\n\t" /*mm2=25 32 24 17 *D**/ \
"movq %%mm2,0x10(%[qdct])\n\t" \
"movq %%mm3,0x18(%[qdct])\n\t" \
"movq 0x28(%[dct]),%%mm2\n\t" /*mm2=23 22 21 20*/ \
"movq 0x38(%[dct]),%%mm1\n\t" /*mm1=31 30 29 28*/ \
"pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
"punpckhdq %%mm7,%%mm7\n\t" /*mm7=33 19 33 19*/ \
"punpckhwd %%mm3,%%mm6\n\t" /*mm6=14 07 13 06*/ \
"punpckldq %%mm0,%%mm0\n\t" /*mm0=13 12 13 12*/ \
"punpcklwd %%mm1,%%mm3\n\t" /*mm3=29 15 28 12*/ \
"punpckhwd %%mm4,%%mm0\n\t" /*mm0=27 13 26 12*/ \
"pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
"psrlq $48,%%mm4\n\t" /*mm4=.. .. .. 27*/ \
"punpcklwd %%mm7,%%mm0\n\t" /*mm0=33 26 19 12 *E*/ \
"punpcklwd %%mm1,%%mm4\n\t" /*mm4=29 .. 28 27*/ \
"punpckhwd %%mm2,%%mm3\n\t" /*mm3=23 15 22 29 *F*/ \
"movq %%mm0,0x20(%[qdct])\n\t" \
"movq %%mm3,0x50(%[qdct])\n\t" \
"movq 0x60(%[dct]),%%mm3\n\t" /*mm3=51 50 49 48*/ \
"movq 0x70(%[dct]),%%mm7\n\t" /*mm7=59 58 57 56*/ \
"movq 0x50(%[dct]),%%mm0\n\t" /*mm0=43 42 41 40*/ \
"punpcklwd %%mm4,%%mm2\n\t" /*mm2=28 21 27 20*/ \
"psrlq $32,%%mm5\n\t" /*mm5=.. .. 35 34*/ \
"movq %%mm2,%%mm4\n\t" /*mm4=28 21 27 20*/ \
"punpckldq %%mm6,%%mm2\n\t" /*mm2=13 06 27 20*/ \
"punpckhdq %%mm4,%%mm6\n\t" /*mm6=28 21 14 07 *G*/ \
"movq %%mm3,%%mm4\n\t" /*mm4=51 50 49 48*/ \
"pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
"movq %%mm2,0x30(%[qdct])\n\t" \
"movq %%mm6,0x38(%[qdct])\n\t" \
"movq 0x48(%[dct]),%%mm2\n\t" /*mm2=39 38 37 36*/ \
"punpcklwd %%mm5,%%mm4\n\t" /*mm4=35 49 34 48*/ \
"movq 0x58(%[dct]),%%mm5\n\t" /*mm5=47 46 45 44*/ \
"punpckldq %%mm7,%%mm6\n\t" /*mm6=57 56 14 07*/ \
"psrlq $32,%%mm3\n\t" /*mm3=.. .. 51 50*/ \
"punpckhwd %%mm0,%%mm6\n\t" /*mm6=43 57 42 56*/ \
"punpcklwd %%mm4,%%mm0\n\t" /*mm0=34 41 48 40 *I*/ \
"pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
"movq %%mm0,0x28(%[qdct])\n\t" \
"punpcklwd %%mm2,%%mm3\n\t" /*mm3=37 51 36 50*/ \
"punpckhwd %%mm6,%%mm4\n\t" /*mm4=42 35 56 49*/ \
"punpcklwd %%mm3,%%mm6\n\t" /*mm6=36 43 50 57 *J*/ \
"pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
"movq %%mm4,0x40(%[qdct])\n\t" \
"movq %%mm6,0x48(%[qdct])\n\t" \
"movq 0x68(%[dct]),%%mm6\n\t" /*mm6=55 54 53 52*/ \
"movq 0x78(%[dct]),%%mm0\n\t" /*mm0=63 62 61 60*/ \
"psrlq $32,%%mm1\n\t" /*mm1=.. .. 31 30*/ \
"pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
"pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
"punpcklwd %%mm5,%%mm1\n\t" /*mm1=46 31 44 30*/ \
"pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
"punpckhwd %%mm1,%%mm2\n\t" /*mm2=46 39 31 38 *L*/ \
"punpcklwd %%mm3,%%mm1\n\t" /*mm1=51 44 37 30 *M*/ \
"movq %%mm2,0x68(%[qdct])\n\t" \
"movq %%mm1,0x58(%[qdct])\n\t" \
"punpckhwd %%mm6,%%mm5\n\t" /*mm5=55 47 52 45*/ \
"punpckldq %%mm0,%%mm6\n\t" /*mm6=61 60 54 53*/ \
"pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
"pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=53 60 61 54 *N*/ \
"punpckhdq %%mm0,%%mm5\n\t" /*mm5=63 62 55 47 *O*/ \
"punpckhdq %%mm4,%%mm7\n\t" /*mm7=45 52 59 58 *P*/ \
"movq %%mm6,0x70(%[qdct])\n\t" \
"movq %%mm5,0x78(%[qdct])\n\t" \
"movq %%mm7,0x60(%[qdct])\n\t" \
#endif