[xiph-commits] r17736 - in trunk/theora/lib: x86 x86_vc
tterribe at svn.xiph.org
Tue Dec 7 06:13:55 PST 2010
Author: tterribe
Date: 2010-12-07 06:13:55 -0800 (Tue, 07 Dec 2010)
New Revision: 17736
Modified:
trunk/theora/lib/x86/mmxfdct.c
trunk/theora/lib/x86/sse2fdct.c
trunk/theora/lib/x86/x86zigzag.h
trunk/theora/lib/x86_vc/mmxfdct.c
trunk/theora/lib/x86_vc/x86enc.c
trunk/theora/lib/x86_vc/x86enc.h
Log:
Minor fix-ups to r17728.
Convert references to the stack buffer in the MMX fDCT to use esp-relative
offsets, saving a register.
Move the MSVC MMX fDCT into the MMXEXT section (as was done for the gcc one),
since it now requires pshufw for the zig-zagging.
Modified: trunk/theora/lib/x86/mmxfdct.c
===================================================================
--- trunk/theora/lib/x86/mmxfdct.c 2010-12-07 11:41:08 UTC (rev 17735)
+++ trunk/theora/lib/x86/mmxfdct.c 2010-12-07 14:13:55 UTC (rev 17736)
@@ -464,7 +464,7 @@
/*MMX implementation of the fDCT.*/
void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
- ogg_int16_t buf[64] __attribute__((aligned(8)));
+ OC_ALIGN8(ogg_int16_t buf[64]);
ptrdiff_t a;
__asm__ __volatile__(
/*Add two extra bits of working precision to improve accuracy; any more and
@@ -597,27 +597,27 @@
"psubw %%mm2,%%mm6\n\t"
"psraw $2,%%mm4\n\t"
"psubw %%mm2,%%mm0\n\t"
- "movq %%mm4,0x00(%[buf])\n\t"
+ "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
"movq 0x30(%[y]),%%mm4\n\t"
"psraw $2,%%mm6\n\t"
"psubw %%mm2,%%mm5\n\t"
- "movq %%mm6,0x20(%[buf])\n\t"
+ "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
"psraw $2,%%mm0\n\t"
"psubw %%mm2,%%mm3\n\t"
- "movq %%mm0,0x40(%[buf])\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x40,buf)"\n\t"
"psraw $2,%%mm5\n\t"
"psubw %%mm2,%%mm1\n\t"
- "movq %%mm5,0x50(%[buf])\n\t"
+ "movq %%mm5,"OC_MEM_OFFS(0x50,buf)"\n\t"
"psraw $2,%%mm3\n\t"
"psubw %%mm2,%%mm7\n\t"
- "movq %%mm3,0x60(%[buf])\n\t"
+ "movq %%mm3,"OC_MEM_OFFS(0x60,buf)"\n\t"
"psraw $2,%%mm1\n\t"
"psubw %%mm2,%%mm4\n\t"
- "movq %%mm1,0x70(%[buf])\n\t"
+ "movq %%mm1,"OC_MEM_OFFS(0x70,buf)"\n\t"
"psraw $2,%%mm7\n\t"
- "movq %%mm7,0x10(%[buf])\n\t"
+ "movq %%mm7,"OC_MEM_OFFS(0x10,buf)"\n\t"
"psraw $2,%%mm4\n\t"
- "movq %%mm4,0x30(%[buf])\n\t"
+ "movq %%mm4,"OC_MEM_OFFS(0x30,buf)"\n\t"
/*Load the next block.*/
"movq 0x40(%[y]),%%mm0\n\t"
"movq 0x78(%[y]),%%mm7\n\t"
@@ -638,39 +638,39 @@
"psubw %%mm2,%%mm6\n\t"
"psraw $2,%%mm4\n\t"
"psubw %%mm2,%%mm0\n\t"
- "movq %%mm4,0x08(%[buf])\n\t"
+ "movq %%mm4,"OC_MEM_OFFS(0x08,buf)"\n\t"
"movq 0x70(%[y]),%%mm4\n\t"
"psraw $2,%%mm6\n\t"
"psubw %%mm2,%%mm5\n\t"
- "movq %%mm6,0x28(%[buf])\n\t"
+ "movq %%mm6,"OC_MEM_OFFS(0x28,buf)"\n\t"
"psraw $2,%%mm0\n\t"
"psubw %%mm2,%%mm3\n\t"
- "movq %%mm0,0x48(%[buf])\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x48,buf)"\n\t"
"psraw $2,%%mm5\n\t"
"psubw %%mm2,%%mm1\n\t"
- "movq %%mm5,0x58(%[buf])\n\t"
+ "movq %%mm5,"OC_MEM_OFFS(0x58,buf)"\n\t"
"psraw $2,%%mm3\n\t"
"psubw %%mm2,%%mm7\n\t"
- "movq %%mm3,0x68(%[buf])\n\t"
+ "movq %%mm3,"OC_MEM_OFFS(0x68,buf)"\n\t"
"psraw $2,%%mm1\n\t"
"psubw %%mm2,%%mm4\n\t"
- "movq %%mm1,0x78(%[buf])\n\t"
+ "movq %%mm1,"OC_MEM_OFFS(0x78,buf)"\n\t"
"psraw $2,%%mm7\n\t"
- "movq %%mm7,0x18(%[buf])\n\t"
+ "movq %%mm7,"OC_MEM_OFFS(0x18,buf)"\n\t"
"psraw $2,%%mm4\n\t"
- "movq %%mm4,0x38(%[buf])\n\t"
+ "movq %%mm4,"OC_MEM_OFFS(0x38,buf)"\n\t"
/*Final transpose and zig-zag.*/
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
- "movq 0x"_row"0(%[buf]),"_reg"\n\t" \
+ "movq "OC_MEM_OFFS(16*_row,buf)","_reg"\n\t" \
#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
- "movq 0x"_row"8(%[buf]),"_reg"\n\t" \
+ "movq "OC_MEM_OFFS(16*_row+8,buf)","_reg"\n\t" \
OC_TRANSPOSE_ZIG_ZAG_MMXEXT
#undef OC_ZZ_LOAD_ROW_LO
#undef OC_ZZ_LOAD_ROW_HI
- :[a]"=&r"(a)
- :[y]"r"(_y),[x]"r"(_x),[buf]"r"(buf)
+ :[a]"=&r"(a),[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
+ :[y]"r"(_y),[x]"r"(_x)
:"memory"
);
}
Modified: trunk/theora/lib/x86/sse2fdct.c
===================================================================
--- trunk/theora/lib/x86/sse2fdct.c 2010-12-07 11:41:08 UTC (rev 17735)
+++ trunk/theora/lib/x86/sse2fdct.c 2010-12-07 14:13:55 UTC (rev 17736)
@@ -435,11 +435,11 @@
/*We could probably do better using SSSE3's palignr, but re-using MMXEXT
version will do for now.*/
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
- "movdq2q %%xmm"_row","_reg"\n\t" \
+ "movdq2q %%xmm"#_row","_reg"\n\t" \
#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
- "punpckhqdq %%xmm"_row",%%xmm"_row"\n\t" \
- "movdq2q %%xmm"_row","_reg"\n\t" \
+ "punpckhqdq %%xmm"#_row",%%xmm"#_row"\n\t" \
+ "movdq2q %%xmm"#_row","_reg"\n\t" \
OC_TRANSPOSE_ZIG_ZAG_MMXEXT
#undef OC_ZZ_LOAD_ROW_LO
Modified: trunk/theora/lib/x86/x86zigzag.h
===================================================================
--- trunk/theora/lib/x86/x86zigzag.h 2010-12-07 11:41:08 UTC (rev 17735)
+++ trunk/theora/lib/x86/x86zigzag.h 2010-12-07 14:13:55 UTC (rev 17736)
@@ -23,9 +23,9 @@
/*Converts DCT coefficients from transposed order into zig-zag scan order and
stores them in %[y].
This relies on two macros to load the contents of each row:
- OC_ZZ_LOAD_ROW_LO(row,reg) and OC_ZZ_LOAD_ROW_HI(row,reg), which load the
- first four and second four entries of each row into the specified register,
- respectively.
+ OC_ZZ_LOAD_ROW_LO(row,"reg") and OC_ZZ_LOAD_ROW_HI(row,"reg"), which load
+ the first four and second four entries of each row into the specified
+ register, respectively.
OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row
(because when the rows are already in SSE2 registers, loading the high half
destructively modifies the register).
@@ -43,13 +43,13 @@
The order of the coefficients within each tuple is reversed in the comments
below to reflect the usual MSB to LSB notation.*/
#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \
- OC_ZZ_LOAD_ROW_LO("0","%%mm0") /*mm0=03 02 01 00*/ \
- OC_ZZ_LOAD_ROW_LO("1","%%mm1") /*mm1=11 10 09 08*/ \
- OC_ZZ_LOAD_ROW_LO("2","%%mm2") /*mm2=19 18 17 16*/ \
- OC_ZZ_LOAD_ROW_LO("3","%%mm3") /*mm3=27 26 25 24*/ \
- OC_ZZ_LOAD_ROW_HI("0","%%mm4") /*mm4=07 06 05 04*/ \
- OC_ZZ_LOAD_ROW_HI("1","%%mm5") /*mm5=15 14 13 12*/ \
- OC_ZZ_LOAD_ROW_HI("2","%%mm6") /*mm6=23 22 21 20*/ \
+ OC_ZZ_LOAD_ROW_LO(0,"%%mm0") /*mm0=03 02 01 00*/ \
+ OC_ZZ_LOAD_ROW_LO(1,"%%mm1") /*mm1=11 10 09 08*/ \
+ OC_ZZ_LOAD_ROW_LO(2,"%%mm2") /*mm2=19 18 17 16*/ \
+ OC_ZZ_LOAD_ROW_LO(3,"%%mm3") /*mm3=27 26 25 24*/ \
+ OC_ZZ_LOAD_ROW_HI(0,"%%mm4") /*mm4=07 06 05 04*/ \
+ OC_ZZ_LOAD_ROW_HI(1,"%%mm5") /*mm5=15 14 13 12*/ \
+ OC_ZZ_LOAD_ROW_HI(2,"%%mm6") /*mm6=23 22 21 20*/ \
"movq %%mm0,%%mm7\n\t" /*mm7=03 02 01 00*/ \
"punpckhdq %%mm1,%%mm0\n\t" /*mm0=11 10 03 02*/ \
"pshufw $0x39,%%mm4,%%mm4\n\t" /*mm4=04 07 06 05*/ \
@@ -64,9 +64,9 @@
"punpcklwd %%mm3,%%mm1\n\t" /*mm1=25 07 24 09*/ \
"punpcklwd %%mm6,%%mm5\n\t" /*mm5=21 14 20 13*/ \
"punpcklwd %%mm2,%%mm1\n\t" /*mm1=17 24 16 09 *B*/ \
- OC_ZZ_LOAD_ROW_LO("4","%%mm2") /*mm2=35 34 33 32*/ \
+ OC_ZZ_LOAD_ROW_LO(4,"%%mm2") /*mm2=35 34 33 32*/ \
"movq %%mm1,0x08(%[y])\n\t" \
- OC_ZZ_LOAD_ROW_LO("5","%%mm1") /*mm1=43 42 41 40*/ \
+ OC_ZZ_LOAD_ROW_LO(5,"%%mm1") /*mm1=43 42 41 40*/ \
"pshufw $0x78,%%mm0,%%mm0\n\t" /*mm0=11 04 03 10 *C*/ \
"movq %%mm0,0x10(%[y])\n\t" \
"punpckhdq %%mm4,%%mm6\n\t" /*mm6=?? 07 23 22*/ \
@@ -80,10 +80,10 @@
"punpckhwd %%mm1,%%mm3\n\t" /*mm3=43 .. 42 27*/ \
"punpckldq %%mm2,%%mm4\n\t" /*mm4=25 32 40 18*/ \
"punpcklwd %%mm0,%%mm3\n\t" /*mm3=35 42 34 27*/ \
- OC_ZZ_LOAD_ROW_LO("6","%%mm0") /*mm0=51 50 49 48*/ \
+ OC_ZZ_LOAD_ROW_LO(6,"%%mm0") /*mm0=51 50 49 48*/ \
"pshufw $0x6C,%%mm4,%%mm4\n\t" /*mm4=40 32 25 18 *E*/ \
"movq %%mm4,0x18(%[y])\n\t" \
- OC_ZZ_LOAD_ROW_LO("7","%%mm4") /*mm4=59 58 57 56*/ \
+ OC_ZZ_LOAD_ROW_LO(7,"%%mm4") /*mm4=59 58 57 56*/ \
"punpckhdq %%mm7,%%mm2\n\t" /*mm2=12 19 26 33 *F*/ \
"movq %%mm2,0x20(%[y])\n\t" \
"pshufw $0xD0,%%mm1,%%mm1\n\t" /*mm1=43 41 ?? ??*/ \
@@ -95,26 +95,26 @@
"movq %%mm3,0x30(%[y])\n\t" \
"punpckhdq %%mm4,%%mm1\n\t" /*mm1=58 57 50 43 *H*/ \
"movq %%mm1,0x50(%[y])\n\t" \
- OC_ZZ_LOAD_ROW_HI("7","%%mm1") /*mm1=63 62 61 60*/ \
+ OC_ZZ_LOAD_ROW_HI(7,"%%mm1") /*mm1=63 62 61 60*/ \
"punpcklwd %%mm0,%%mm4\n\t" /*mm4=49 56 51 59*/ \
- OC_ZZ_LOAD_ROW_HI("6","%%mm0") /*mm0=55 54 53 52*/ \
+ OC_ZZ_LOAD_ROW_HI(6,"%%mm0") /*mm0=55 54 53 52*/ \
"psllq $16,%%mm6\n\t" /*mm6=07 23 22 ..*/ \
"movq %%mm4,%%mm3\n\t" /*mm3=49 56 51 59*/ \
"punpckhdq %%mm2,%%mm4\n\t" /*mm4=35 42 49 56 *I*/ \
- OC_ZZ_LOAD_ROW_HI("3","%%mm2") /*mm2=31 30 29 28*/ \
+ OC_ZZ_LOAD_ROW_HI(3,"%%mm2") /*mm2=31 30 29 28*/ \
"movq %%mm4,0x38(%[y])\n\t" \
"punpcklwd %%mm1,%%mm3\n\t" /*mm3=61 51 60 59*/ \
"punpcklwd %%mm6,%%mm7\n\t" /*mm7=22 15 .. ??*/ \
"movq %%mm3,%%mm4\n\t" /*mm4=61 51 60 59*/ \
"punpcklwd %%mm0,%%mm3\n\t" /*mm3=53 60 52 59*/ \
"punpckhwd %%mm0,%%mm4\n\t" /*mm4=55 61 54 51*/ \
- OC_ZZ_LOAD_ROW_HI("4","%%mm0") /*mm0=39 38 37 36*/ \
+ OC_ZZ_LOAD_ROW_HI(4,"%%mm0") /*mm0=39 38 37 36*/ \
"pshufw $0xE1,%%mm3,%%mm3\n\t" /*mm3=53 60 59 52 *J*/ \
"movq %%mm3,0x68(%[y])\n\t" \
"movq %%mm4,%%mm3\n\t" /*mm3=?? ?? 54 51*/ \
"pshufw $0x39,%%mm2,%%mm2\n\t" /*mm2=28 31 30 29*/ \
"punpckhwd %%mm1,%%mm4\n\t" /*mm4=63 55 62 61 *K*/ \
- OC_ZZ_LOAD_ROW_HI("5","%%mm1") /*mm1=47 46 45 44*/ \
+ OC_ZZ_LOAD_ROW_HI(5,"%%mm1") /*mm1=47 46 45 44*/ \
"movq %%mm4,0x78(%[y])\n\t" \
"punpckhwd %%mm2,%%mm6\n\t" /*mm6=28 07 31 23*/ \
"punpcklwd %%mm0,%%mm2\n\t" /*mm2=37 30 36 29*/ \
Modified: trunk/theora/lib/x86_vc/mmxfdct.c
===================================================================
--- trunk/theora/lib/x86_vc/mmxfdct.c 2010-12-07 11:41:08 UTC (rev 17735)
+++ trunk/theora/lib/x86_vc/mmxfdct.c 2010-12-07 14:13:55 UTC (rev 17736)
@@ -464,7 +464,7 @@
/*MMX implementation of the fDCT.*/
void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
- __declspec (align(8)) ogg_int16_t buf[64];
+ OC_ALIGN8(ogg_int16_t buf[64]);
ptrdiff_t a;
__asm{
#define X edx
Modified: trunk/theora/lib/x86_vc/x86enc.c
===================================================================
--- trunk/theora/lib/x86_vc/x86enc.c 2010-12-07 11:41:08 UTC (rev 17735)
+++ trunk/theora/lib/x86_vc/x86enc.c 2010-12-07 14:13:55 UTC (rev 17736)
@@ -27,7 +27,6 @@
_enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
_enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
_enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
- _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx;
}
if(cpu_flags&OC_CPU_X86_MMXEXT){
_enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
@@ -37,6 +36,7 @@
_enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_mmxext;
_enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
_enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
+ _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmxext;
}
if(cpu_flags&OC_CPU_X86_SSE2){
# if defined(OC_X86_64_ASM)
Modified: trunk/theora/lib/x86_vc/x86enc.h
===================================================================
--- trunk/theora/lib/x86_vc/x86enc.h 2010-12-07 11:41:08 UTC (rev 17735)
+++ trunk/theora/lib/x86_vc/x86enc.h 2010-12-07 14:13:55 UTC (rev 17736)
@@ -45,7 +45,7 @@
const unsigned char *_x,int _stride);
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
-void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
#endif
More information about the commits mailing list