[xiph-commits] r17250 - experimental/derf/theora-ptalarbvorm/lib/x86

tterribe at svn.xiph.org tterribe at svn.xiph.org
Fri May 28 15:44:24 PDT 2010


Author: tterribe
Date: 2010-05-28 15:44:24 -0700 (Fri, 28 May 2010)
New Revision: 17250

Modified:
   experimental/derf/theora-ptalarbvorm/lib/x86/mmxidct.c
   experimental/derf/theora-ptalarbvorm/lib/x86/sse2idct.c
   experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h
Log:
Actually use a constant table for the SSE2 iDCT.
This requires an extra thunk call to set up the PIC register on x86-32, but
 most x86-32 shuffle units are slow enough that it should be worth it over the
 course of 15 loads.
x86-64 doesn't require the extra thunk call, so this gives roughly a 1%
 speed-up overall on i7.


Modified: experimental/derf/theora-ptalarbvorm/lib/x86/mmxidct.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/mmxidct.c	2010-05-28 07:09:00 UTC (rev 17249)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/mmxidct.c	2010-05-28 22:44:24 UTC (rev 17250)
@@ -30,41 +30,21 @@
 
 
 
-/*A table of constants used by the MMX routines.*/
-static const ogg_uint16_t __attribute__((aligned(8),used))
- OC_IDCT_CONSTS[(7+1)*4]={
-  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
-  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
-  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
-  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
-  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
-  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
-  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
-  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
-  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
-  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
-  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
-  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
-  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
-  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
-      8,    8,    8,    8
-};
-
 /*38 cycles*/
 #define OC_IDCT_BEGIN \
   "#OC_IDCT_BEGIN\n\t" \
   "movq "OC_I(3)",%%mm2\n\t" \
-  "movq "OC_C(3)",%%mm6\n\t" \
+  "movq 0x30(%[c]),%%mm6\n\t" \
   "movq %%mm2,%%mm4\n\t" \
   "movq "OC_J(5)",%%mm7\n\t" \
   "pmulhw %%mm6,%%mm4\n\t" \
-  "movq "OC_C(5)",%%mm1\n\t" \
+  "movq 0x50(%[c]),%%mm1\n\t" \
   "pmulhw %%mm7,%%mm6\n\t" \
   "movq %%mm1,%%mm5\n\t" \
   "pmulhw %%mm2,%%mm1\n\t" \
   "movq "OC_I(1)",%%mm3\n\t" \
   "pmulhw %%mm7,%%mm5\n\t" \
-  "movq "OC_C(1)",%%mm0\n\t" \
+  "movq 0x10(%[c]),%%mm0\n\t" \
   "paddw %%mm2,%%mm4\n\t" \
   "paddw %%mm7,%%mm6\n\t" \
   "paddw %%mm1,%%mm2\n\t" \
@@ -74,7 +54,7 @@
   "pmulhw %%mm3,%%mm0\n\t" \
   "paddw %%mm7,%%mm4\n\t" \
   "pmulhw %%mm1,%%mm5\n\t" \
-  "movq "OC_C(7)",%%mm7\n\t" \
+  "movq 0x70(%[c]),%%mm7\n\t" \
   "psubw %%mm2,%%mm6\n\t" \
   "paddw %%mm3,%%mm0\n\t" \
   "pmulhw %%mm7,%%mm3\n\t" \
@@ -82,25 +62,25 @@
   "pmulhw %%mm1,%%mm7\n\t" \
   "paddw %%mm1,%%mm5\n\t" \
   "movq %%mm2,%%mm1\n\t" \
-  "pmulhw "OC_C(2)",%%mm2\n\t" \
+  "pmulhw 0x20(%[c]),%%mm2\n\t" \
   "psubw %%mm5,%%mm3\n\t" \
   "movq "OC_J(6)",%%mm5\n\t" \
   "paddw %%mm7,%%mm0\n\t" \
   "movq %%mm5,%%mm7\n\t" \
   "psubw %%mm4,%%mm0\n\t" \
-  "pmulhw "OC_C(2)",%%mm5\n\t" \
+  "pmulhw 0x20(%[c]),%%mm5\n\t" \
   "paddw %%mm1,%%mm2\n\t" \
-  "pmulhw "OC_C(6)",%%mm1\n\t" \
+  "pmulhw 0x60(%[c]),%%mm1\n\t" \
   "paddw %%mm4,%%mm4\n\t" \
   "paddw %%mm0,%%mm4\n\t" \
   "psubw %%mm6,%%mm3\n\t" \
   "paddw %%mm7,%%mm5\n\t" \
   "paddw %%mm6,%%mm6\n\t" \
-  "pmulhw "OC_C(6)",%%mm7\n\t" \
+  "pmulhw 0x60(%[c]),%%mm7\n\t" \
   "paddw %%mm3,%%mm6\n\t" \
   "movq %%mm4,"OC_I(1)"\n\t" \
   "psubw %%mm5,%%mm1\n\t" \
-  "movq "OC_C(4)",%%mm4\n\t" \
+  "movq 0x40(%[c]),%%mm4\n\t" \
   "movq %%mm3,%%mm5\n\t" \
   "pmulhw %%mm4,%%mm3\n\t" \
   "paddw %%mm2,%%mm7\n\t" \
@@ -235,7 +215,7 @@
 #define OC_COLUMN_IDCT \
   "#OC_COLUMN_IDCT\n" \
   OC_IDCT_BEGIN \
-  "paddw "OC_8",%%mm2\n\t" \
+  "paddw 0x00(%[c]),%%mm2\n\t" \
   /*r1=H'+H'*/ \
   "paddw %%mm1,%%mm1\n\t" \
   /*r1=R1=A''+H'*/ \
@@ -258,7 +238,7 @@
   "movq %%mm1,"OC_I(1)"\n\t" \
   /*r4=R4=E'-D'*/ \
   "psubw %%mm3,%%mm4\n\t" \
-  "paddw "OC_8",%%mm4\n\t" \
+  "paddw 0x00(%[c]),%%mm4\n\t" \
   /*r3=D'+D'*/ \
   "paddw %%mm3,%%mm3\n\t" \
   /*r3=R3=E'+D'*/ \
@@ -269,7 +249,7 @@
   "psubw %%mm5,%%mm6\n\t" \
   /*r3=NR3*/ \
   "psraw $4,%%mm3\n\t" \
-  "paddw "OC_8",%%mm6\n\t" \
+  "paddw 0x00(%[c]),%%mm6\n\t" \
   /*r5=B''+B''*/ \
   "paddw %%mm5,%%mm5\n\t" \
   /*r5=R5=F'+B''*/ \
@@ -284,7 +264,7 @@
   "movq %%mm3,"OC_I(3)"\n\t" \
   /*r7=R7=G'-C'*/ \
   "psubw %%mm0,%%mm7\n\t" \
-  "paddw "OC_8",%%mm7\n\t" \
+  "paddw 0x00(%[c]),%%mm7\n\t" \
   /*r0=C'+C'*/ \
   "paddw %%mm0,%%mm0\n\t" \
   /*r0=R0=G'+C'*/ \
@@ -343,13 +323,13 @@
  "#OC_IDCT_BEGIN_10\n\t" \
  "movq "OC_I(3)",%%mm2\n\t" \
  "nop\n\t" \
- "movq "OC_C(3)",%%mm6\n\t" \
+ "movq 0x30(%[c]),%%mm6\n\t" \
  "movq %%mm2,%%mm4\n\t" \
- "movq "OC_C(5)",%%mm1\n\t" \
+ "movq 0x50(%[c]),%%mm1\n\t" \
  "pmulhw %%mm6,%%mm4\n\t" \
  "movq "OC_I(1)",%%mm3\n\t" \
  "pmulhw %%mm2,%%mm1\n\t" \
- "movq "OC_C(1)",%%mm0\n\t" \
+ "movq 0x10(%[c]),%%mm0\n\t" \
  "paddw %%mm2,%%mm4\n\t" \
  "pxor %%mm6,%%mm6\n\t" \
  "paddw %%mm1,%%mm2\n\t" \
@@ -357,19 +337,19 @@
  "pmulhw %%mm3,%%mm0\n\t" \
  "movq %%mm5,%%mm1\n\t" \
  "paddw %%mm3,%%mm0\n\t" \
- "pmulhw "OC_C(7)",%%mm3\n\t" \
+ "pmulhw 0x70(%[c]),%%mm3\n\t" \
  "psubw %%mm2,%%mm6\n\t" \
- "pmulhw "OC_C(2)",%%mm5\n\t" \
+ "pmulhw 0x20(%[c]),%%mm5\n\t" \
  "psubw %%mm4,%%mm0\n\t" \
  "movq "OC_I(2)",%%mm7\n\t" \
  "paddw %%mm4,%%mm4\n\t" \
  "paddw %%mm5,%%mm7\n\t" \
  "paddw %%mm0,%%mm4\n\t" \
- "pmulhw "OC_C(6)",%%mm1\n\t" \
+ "pmulhw 0x60(%[c]),%%mm1\n\t" \
  "psubw %%mm6,%%mm3\n\t" \
  "movq %%mm4,"OC_I(1)"\n\t" \
  "paddw %%mm6,%%mm6\n\t" \
- "movq "OC_C(4)",%%mm4\n\t" \
+ "movq 0x40(%[c]),%%mm4\n\t" \
  "paddw %%mm3,%%mm6\n\t" \
  "movq %%mm3,%%mm5\n\t" \
  "pmulhw %%mm4,%%mm3\n\t" \
@@ -432,7 +412,7 @@
 #define OC_COLUMN_IDCT_10 \
  "#OC_COLUMN_IDCT_10\n\t" \
  OC_IDCT_BEGIN_10 \
- "paddw "OC_8",%%mm2\n\t" \
+ "paddw 0x00(%[c]),%%mm2\n\t" \
  /*r1=H'+H'*/ \
  "paddw %%mm1,%%mm1\n\t" \
  /*r1=R1=A''+H'*/ \
@@ -455,7 +435,7 @@
  "movq %%mm1,"OC_I(1)"\n\t" \
  /*r4=R4=E'-D'*/ \
  "psubw %%mm3,%%mm4\n\t" \
- "paddw "OC_8",%%mm4\n\t" \
+ "paddw 0x00(%[c]),%%mm4\n\t" \
  /*r3=D'+D'*/ \
  "paddw %%mm3,%%mm3\n\t" \
  /*r3=R3=E'+D'*/ \
@@ -466,7 +446,7 @@
  "psubw %%mm5,%%mm6\n\t" \
  /*r3=NR3*/ \
  "psraw $4,%%mm3\n\t" \
- "paddw "OC_8",%%mm6\n\t" \
+ "paddw 0x00(%[c]),%%mm6\n\t" \
  /*r5=B''+B''*/ \
  "paddw %%mm5,%%mm5\n\t" \
  /*r5=R5=F'+B''*/ \
@@ -481,7 +461,7 @@
  "movq %%mm3,"OC_I(3)"\n\t" \
  /*r7=R7=G'-C'*/ \
  "psubw %%mm0,%%mm7\n\t" \
- "paddw "OC_8",%%mm7\n\t" \
+ "paddw 0x00(%[c]),%%mm7\n\t" \
  /*r0=C'+C'*/ \
  "paddw %%mm0,%%mm0\n\t" \
  /*r0=R0=G'+C'*/ \

Modified: experimental/derf/theora-ptalarbvorm/lib/x86/sse2idct.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/sse2idct.c	2010-05-28 07:09:00 UTC (rev 17249)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/sse2idct.c	2010-05-28 22:44:24 UTC (rev 17250)
@@ -22,6 +22,19 @@
 
 #if defined(OC_X86_ASM)
 
+/*A table of constants used by the MMX and SSE2 iDCT routines.*/
+const short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={
+        8,      8,      8,      8,      8,      8,      8,      8,
+  OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,
+  OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,
+  OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,
+  OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,
+  OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,
+  OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,
+  OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1
+};
+
+
 /*Performs the first three stages of the iDCT.
   xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input
    (accessed in that order).
@@ -33,38 +46,28 @@
   /*Stage 1:*/ \
   /*2-3 rotation by 6pi/16. \
     xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
-  "mov $0xEC83EC83,%[b]\n\t" \
-  "mov $0x61F861F8,%[a]\n\t" \
-  "movd %[b],%%xmm1\n\t" \
-  "movd %[a],%%xmm4\n\t" \
-  "pshufd $00,%%xmm1,%%xmm1\n\t" \
-  "pshufd $00,%%xmm4,%%xmm4\n\t" \
+  "movdqa 0x20(%[c]),%%xmm1\n\t" \
+  "movdqa 0x60(%[c]),%%xmm4\n\t" \
   "movdqa %%xmm1,%%xmm0\n\t" \
   "pmulhw %%xmm2,%%xmm1\n\t" \
   "movdqa %%xmm4,%%xmm7\n\t" \
   "pmulhw %%xmm6,%%xmm0\n\t" \
-  "mov $0x8E3A8E3A,%[a]\n\t" \
   "pmulhw %%xmm2,%%xmm7\n\t" \
-  "mov $0xD4DBD4DB,%[b]\n\t" \
   "pmulhw %%xmm6,%%xmm4\n\t" \
   "paddw %%xmm6,%%xmm0\n\t" \
-  "movd %[b],%%xmm6\n\t" \
+  "movdqa 0x30(%[c]),%%xmm6\n\t" \
   "paddw %%xmm1,%%xmm2\n\t" \
   "psubw %%xmm0,%%xmm7\n\t" \
   "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
   "paddw %%xmm4,%%xmm2\n\t" \
-  "movd %[a],%%xmm4\n\t" \
+  "movdqa 0x50(%[c]),%%xmm4\n\t" \
   "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
   /*5-6 rotation by 3pi/16. \
     xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
-  "pshufd $00,%%xmm4,%%xmm4\n\t" \
-  "pshufd $00,%%xmm6,%%xmm6\n\t" \
   "movdqa %%xmm4,%%xmm2\n\t" \
   "movdqa %%xmm6,%%xmm1\n\t" \
   "pmulhw %%xmm3,%%xmm4\n\t" \
-  "mov $0x31F131F1,%[a]\n\t" \
   "pmulhw %%xmm5,%%xmm1\n\t" \
-  "mov $0xFB15FB15,%[b]\n\t" \
   "pmulhw %%xmm3,%%xmm6\n\t" \
   "pmulhw %%xmm5,%%xmm2\n\t" \
   "paddw %%xmm3,%%xmm4\n\t" \
@@ -74,17 +77,14 @@
   "paddw %%xmm5,%%xmm1\n\t" \
   "movdqa 0x10(%[y]),%%xmm5\n\t" \
   "paddw %%xmm3,%%xmm2\n\t" \
-  "movd %[a],%%xmm3\n\t" \
+  "movdqa 0x70(%[c]),%%xmm3\n\t" \
   "psubw %%xmm4,%%xmm1\n\t" \
-  "movd %[b],%%xmm4\n\t" \
+  "movdqa 0x10(%[c]),%%xmm4\n\t" \
   /*4-7 rotation by 7pi/16. \
     xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
-  "pshufd $00,%%xmm3,%%xmm3\n\t" \
-  "pshufd $00,%%xmm4,%%xmm4\n\t" \
   "movdqa %%xmm3,%%xmm0\n\t" \
   "movdqa %%xmm4,%%xmm7\n\t" \
   "pmulhw %%xmm5,%%xmm3\n\t" \
-  "mov $0xB505B505,%[a]\n\t" \
   "pmulhw %%xmm5,%%xmm7\n\t" \
   "pmulhw %%xmm6,%%xmm4\n\t" \
   "pmulhw %%xmm6,%%xmm0\n\t" \
@@ -92,10 +92,9 @@
   "movdqa 0x40(%[y]),%%xmm6\n\t" \
   "paddw %%xmm5,%%xmm7\n\t" \
   "psubw %%xmm4,%%xmm3\n\t" \
-  "movd %[a],%%xmm4\n\t" \
+  "movdqa 0x40(%[c]),%%xmm4\n\t" \
   "paddw %%xmm7,%%xmm0\n\t" \
   "movdqa 0x00(%[y]),%%xmm7\n\t" \
-  "pshufd $00,%%xmm4,%%xmm4\n\t" \
   /*0-1 butterfly. \
     xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
   "paddw %%xmm7,%%xmm6\n\t" \
@@ -172,13 +171,11 @@
     1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
     2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
     3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
-  "mov $0x00080008,%[a]\n\t" \
   "psubw %%xmm3,%%xmm4\n\t" \
   "movdqa %%xmm4,0x40(%[y])\n\t" \
-  "movd %[a],%%xmm4\n\t" \
+  "movdqa 0x00(%[c]),%%xmm4\n\t" \
   "psubw %%xmm0,%%xmm7\n\t" \
   "psubw %%xmm1,%%xmm6\n\t" \
-  "pshufd $0x00,%%xmm4,%%xmm4\n\t" \
   "psubw %%xmm2,%%xmm5\n\t" \
   "paddw %%xmm4,%%xmm7\n\t" \
   "paddw %%xmm4,%%xmm6\n\t" \
@@ -211,8 +208,6 @@
 
 static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64]){
   OC_ALIGN16(ogg_int16_t buf[16]);
-  int a;
-  int b;
   /*This routine accepts an 8x8 matrix pre-transposed.*/
   __asm__ __volatile__(
     /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
@@ -230,8 +225,8 @@
     "movdqa %%xmm0,0x00(%[y])\n\t"
     OC_IDCT_8x8_ABC
     OC_IDCT_8x8_D_STORE
-    :[a]"=&r"(a),[b]"=&r"(b),[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16))
-    :[y]"r"(_y)
+    :[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16))
+    :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
   );
 }
 
@@ -243,40 +238,28 @@
   /*Stage 1:*/ \
   /*2-3 rotation by 6pi/16. \
     mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
-  "mov $0x61F861F8,%[a]\n\t" \
-  "mov $0xEC83EC83,%[b]\n\t" \
-  "movd %[a],%%mm7\n\t" \
-  "movd %[b],%%mm6\n\t" \
-  "punpckldq %%mm7,%%mm7\n\t" \
-  "punpckldq %%mm6,%%mm6\n\t" \
+  "movq 0x60(%[c]),%%mm7\n\t" \
+  "movq 0x20(%[c]),%%mm6\n\t" \
   "pmulhw %%mm2,%%mm6\n\t" \
   "pmulhw %%mm2,%%mm7\n\t" \
-  "mov $0x8E3A8E3A,%[a]\n\t" \
-  "mov $0xD4DBD4DB,%[b]\n\t" \
-  "movd %[a],%%mm5\n\t" \
+  "movq 0x50(%[c]),%%mm5\n\t" \
   "paddw %%mm6,%%mm2\n\t" \
   "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
-  "movd %[b],%%mm2\n\t" \
+  "movq 0x30(%[c]),%%mm2\n\t" \
   "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
   /*5-6 rotation by 3pi/16. \
     mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
-  "punpckldq %%mm2,%%mm2\n\t" \
   "pmulhw %%mm3,%%mm5\n\t" \
-  "mov $0xFB15FB15,%[b]\n\t" \
   "pmulhw %%mm3,%%mm2\n\t" \
-  "mov $0x31F131F1,%[a]\n\t" \
-  "movd %[b],%%mm7\n\t" \
+  "movq 0x10(%[c]),%%mm7\n\t" \
   "paddw %%mm3,%%mm5\n\t" \
   "paddw %%mm3,%%mm2\n\t" \
+  "movq 0x70(%[c]),%%mm3\n\t" \
   /*4-7 rotation by 7pi/16. \
     mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
-  "movd %[a],%%mm3\n\t" \
-  "punpckldq %%mm7,%%mm7\n\t" \
-  "punpckldq %%mm3,%%mm3\n\t" \
   "pmulhw %%mm1,%%mm3\n\t" \
-  "mov $0xB505B505,%[a]\n\t" \
   "pmulhw %%mm1,%%mm7\n\t" \
-  "movd %[a],%%mm4\n\t" \
+  "movq 0x40(%[c]),%%mm4\n\t" \
   "movq %%mm3,%%mm6\n\t" \
   "paddw %%mm1,%%mm7\n\t" \
   /*0-1 butterfly. \
@@ -284,7 +267,6 @@
   /*Stage 2:*/ \
   /*4-5 butterfly: mm3=t[4], mm5=t[5] \
     7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
-  "punpckldq %%mm4,%%mm4\n\t" \
   "psubw %%mm5,%%mm3\n\t" \
   "paddw %%mm5,%%mm6\n\t" \
   "movq %%mm4,%%mm1\n\t" \
@@ -337,40 +319,28 @@
   /*Stage 1:*/ \
   /*2-3 rotation by 6pi/16. \
     xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
-  "mov $0x61F861F8,%[a]\n\t" \
-  "mov $0xEC83EC83,%[b]\n\t" \
-  "movd %[a],%%xmm7\n\t" \
-  "movd %[b],%%xmm6\n\t" \
-  "pshufd $00,%%xmm7,%%xmm7\n\t" \
-  "pshufd $00,%%xmm6,%%xmm6\n\t" \
+  "movdqa 0x60(%[c]),%%xmm7\n\t" \
+  "movdqa 0x20(%[c]),%%xmm6\n\t" \
   "pmulhw %%xmm2,%%xmm6\n\t" \
   "pmulhw %%xmm2,%%xmm7\n\t" \
-  "mov $0x8E3A8E3A,%[a]\n\t" \
-  "mov $0xD4DBD4DB,%[b]\n\t" \
-  "movd %[a],%%xmm5\n\t" \
+  "movdqa 0x50(%[c]),%%xmm5\n\t" \
   "paddw %%xmm6,%%xmm2\n\t" \
   "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
-  "movd %[b],%%xmm2\n\t" \
+  "movdqa 0x30(%[c]),%%xmm2\n\t" \
   "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
   /*5-6 rotation by 3pi/16. \
     xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
-  "pshufd $00,%%xmm2,%%xmm2\n\t" \
   "pmulhw %%xmm3,%%xmm5\n\t" \
-  "mov $0xFB15FB15,%[b]\n\t" \
   "pmulhw %%xmm3,%%xmm2\n\t" \
-  "mov $0x31F131F1,%[a]\n\t" \
-  "movd %[b],%%xmm7\n\t" \
+  "movdqa 0x10(%[c]),%%xmm7\n\t" \
   "paddw %%xmm3,%%xmm5\n\t" \
   "paddw %%xmm3,%%xmm2\n\t" \
+  "movdqa 0x70(%[c]),%%xmm3\n\t" \
   /*4-7 rotation by 7pi/16. \
     xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
-  "movd %[a],%%xmm3\n\t" \
-  "pshufd $00,%%xmm7,%%xmm7\n\t" \
-  "pshufd $00,%%xmm3,%%xmm3\n\t" \
   "pmulhw %%xmm1,%%xmm3\n\t" \
-  "mov $0xB505B505,%[a]\n\t" \
   "pmulhw %%xmm1,%%xmm7\n\t" \
-  "movd %[a],%%xmm4\n\t" \
+  "movdqa 0x40(%[c]),%%xmm4\n\t" \
   "movdqa %%xmm3,%%xmm6\n\t" \
   "paddw %%xmm1,%%xmm7\n\t" \
   /*0-1 butterfly. \
@@ -378,7 +348,6 @@
   /*Stage 2:*/ \
   /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \
     7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
-  "pshufd $00,%%xmm4,%%xmm4\n\t" \
   "psubw %%xmm5,%%xmm3\n\t" \
   "paddw %%xmm5,%%xmm6\n\t" \
   "movdqa %%xmm4,%%xmm1\n\t" \
@@ -411,8 +380,6 @@
 
 static void oc_idct8x8_10_sse2(ogg_int16_t _y[64]){
   OC_ALIGN16(ogg_int16_t buf[16]);
-  int a;
-  int b;
   /*This routine accepts an 8x8 matrix pre-transposed.*/
   __asm__ __volatile__(
     "movq 0x20(%[y]),%%mm2\n\t"
@@ -423,8 +390,8 @@
     OC_TRANSPOSE_8x4_MMX2SSE
     OC_IDCT_8x8_10_ABC
     OC_IDCT_8x8_D_STORE
-    :[a]"=&r"(a),[b]"=&r"(b),[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16))
-    :[y]"r"(_y)
+    :[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16))
+    :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
   );
 }
 

Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h	2010-05-28 07:09:00 UTC (rev 17249)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h	2010-05-28 22:44:24 UTC (rev 17250)
@@ -30,7 +30,9 @@
   OC_M2STR(_offs-8+%H[_name])
 #  endif
 # endif
-/*If your gcc version does't support %H, then you get to suffer the warnings.*/
+/*If your gcc version doesn't support %H, then you get to suffer the warnings.
+  Note that Apple's gas breaks on things like _offs+(%esp): it throws away the
+   whole offset, instead of substituting in 0 for the missing operand to +.*/
 # if !defined(OC_MEM_OFFS)
 #  define OC_MEM_OFFS(_offs,_name) \
   OC_M2STR(_offs+%[_name])
@@ -44,6 +46,8 @@
   (*({struct{_type array_value__[_size];} *array_addr__=(void *)_ptr; \
    array_addr__;}))
 
+extern const short __attribute__((aligned(16))) OC_IDCT_CONSTS[64];
+
 void oc_state_vtable_init_x86(oc_theora_state *_state);
 
 void oc_frag_copy_mmx(unsigned char *_dst,



More information about the commits mailing list