[xiph-commits] r17247 - in experimental/derf/theora-ptalarbvorm/lib: . x86

tterribe at svn.xiph.org
Thu May 27 22:35:32 PDT 2010


Author: tterribe
Date: 2010-05-27 22:35:32 -0700 (Thu, 27 May 2010)
New Revision: 17247

Added:
   experimental/derf/theora-ptalarbvorm/lib/x86/sse2idct.c
Modified:
   experimental/derf/theora-ptalarbvorm/lib/Makefile.am
   experimental/derf/theora-ptalarbvorm/lib/cpu.c
   experimental/derf/theora-ptalarbvorm/lib/x86/mmxencfrag.c
   experimental/derf/theora-ptalarbvorm/lib/x86/mmxidct.c
   experimental/derf/theora-ptalarbvorm/lib/x86/mmxloop.h
   experimental/derf/theora-ptalarbvorm/lib/x86/mmxstate.c
   experimental/derf/theora-ptalarbvorm/lib/x86/sse2encfrag.c
   experimental/derf/theora-ptalarbvorm/lib/x86/sse2trans.h
   experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c
   experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h
   experimental/derf/theora-ptalarbvorm/lib/x86/x86state.c
Log:
Adds an SSE2 version of the iDCT and an MMXEXT version of the loop filter.
The latter uses entirely bytewise operations, based on the same
 transformations I developed for the C64x port.
Overall, this speeds up the decoder by about 5% on x86-64.
Also adds macros for declaring array operands and referencing them with offsets,
 so that local buffers on the stack can be referred to via the stack pointer
 instead of chewing up their own register.
Finally, explicitly detect Pentium M and Core Solo/Core Duo chips and disable
 SSE2 on them, since the MMX functions are faster.
This allows SSE2 to remain enabled on other 32-bit chips, where SSE2 actually
 has an advantage, and on 64-bit chips running a 32-bit OS.
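
The array-operand macros work roughly as follows (a sketch only; the
 committed definitions in x86/x86int.h have to dodge additional compiler and
 assembler corner cases, so treat this as the idea rather than the code):

  /*Hand gcc the whole array as a single "m" operand, so it can address the
     buffer off the stack pointer instead of pinning a register to it.*/
  #define OC_ARRAY_OPERAND(_type,_ptr,_size) \
   (*(struct{_type array_value__[(_size)];} *)(_ptr))
  /*Stringize an offset expression onto the operand reference; the assembler
     folds the resulting "16+(%esp)"-style expression.*/
  #define OC_M2STR(_s) #_s
  #define OC_MEM_OFFS(_offs,_name) \
   OC_M2STR(_offs+%[_name])

 A local buffer is then listed as
  [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
 and accessed inside the template as "movq %%mm0,"OC_MEM_OFFS(0x10,buf)"\n\t",
 with no general-purpose register consumed by a pointer to buf.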


Modified: experimental/derf/theora-ptalarbvorm/lib/Makefile.am
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/Makefile.am	2010-05-24 22:02:27 UTC (rev 17246)
+++ experimental/derf/theora-ptalarbvorm/lib/Makefile.am	2010-05-28 05:35:32 UTC (rev 17247)
@@ -8,6 +8,7 @@
 	x86/mmxfdct.c \
 	x86/sse2encfrag.c \
 	x86/sse2fdct.c \
+	x86/sse2trans.h \
 	x86/x86enc.c \
 	x86/x86enc.h \
 	x86/mmxfrag.c \
@@ -15,6 +16,7 @@
 	x86/mmxidct.c \
 	x86/mmxloop.h \
 	x86/mmxstate.c \
+	x86/sse2idct.c \
 	x86/x86int.h \
 	x86/x86state.c \
 	x86_vc
@@ -41,6 +43,7 @@
 	x86/mmxfrag.c \
 	x86/mmxidct.c \
 	x86/mmxstate.c \
+	x86/sse2idct.c \
 	x86/x86state.c
 
 encoder_shared_x86_64_sources =
@@ -93,6 +96,7 @@
 	x86/mmxidct.c \
 	x86/mmxfrag.c \
 	x86/mmxstate.c \
+	x86/sse2idct.c \
 	x86/x86state.c
 if CPU_x86_64
 decoder_arch_sources = $(decoder_x86_sources)
@@ -137,11 +141,11 @@
 	huffman.h \
 	ocintrin.h \
 	quant.h \
-	x86/x86enc.h \
 	x86/mmxfrag.h \
 	x86/mmxloop.h \
-	x86/x86int.h \
-	x86/sse2trans.h
+	x86/sse2trans.h \
+	x86/x86enc.h \
+	x86/x86int.h
 
 libtheoradec_la_SOURCES = \
 	$(decoder_sources) \

Modified: experimental/derf/theora-ptalarbvorm/lib/cpu.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/cpu.c	2010-05-24 22:02:27 UTC (rev 17246)
+++ experimental/derf/theora-ptalarbvorm/lib/cpu.c	2010-05-28 05:35:32 UTC (rev 17247)
@@ -159,9 +159,18 @@
   if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
    /*      6 8 x M          T e n i          u n e G*/
    ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
+    int family;
+    int model;
     /*Intel, Transmeta (tested with Crusoe TM5800):*/
     cpuid(1,eax,ebx,ecx,edx);
     flags=oc_parse_intel_flags(edx,ecx);
+    family=(eax>>8)&0xF;
+    model=(eax>>4)&0xF;
+    /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
+       unit, so don't use it.*/
+    if(family==6&&(model==9||model==13||model==14)){
+      flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
+    }
   }
   /*              D M A c          i t n e          h t u A*/
   else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
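
The check uses only the low nibbles of cpuid leaf 1 (family in bits 11:8 of
 eax, model in bits 7:4), which is enough to pick out the family 6 parts in
 question: model 9 (Pentium M "Banias"), 13 ("Dothan"), and 14 (Core
 Solo/Duo, "Yonah").  For reference, the architectural "display model" also
 folds in the extended model field, roughly:

  ext_model=(eax>>16)&0xF;
  if(family==0x6||family==0xF)model|=ext_model<<4;

 so a later family 6 part whose display model has a low nibble of 9, 13, or
 14 (e.g., model 0x1D) would also match this test.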

Modified: experimental/derf/theora-ptalarbvorm/lib/x86/mmxencfrag.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/mmxencfrag.c	2010-05-24 22:02:27 UTC (rev 17246)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/mmxencfrag.c	2010-05-28 05:35:32 UTC (rev 17247)
@@ -165,54 +165,54 @@
    16-bit difference in %%mm0...%%mm7.*/
 #define OC_LOAD_SUB_8x4(_off) \
  "#OC_LOAD_SUB_8x4\n\t" \
- "movd "_off"(%[src]),%%mm0\n\t" \
- "movd "_off"(%[ref]),%%mm4\n\t" \
- "movd "_off"(%[src],%[src_ystride]),%%mm1\n\t" \
+ "movd "#_off"(%[src]),%%mm0\n\t" \
+ "movd "#_off"(%[ref]),%%mm4\n\t" \
+ "movd "#_off"(%[src],%[src_ystride]),%%mm1\n\t" \
  "lea (%[src],%[src_ystride],2),%[src]\n\t" \
- "movd "_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \
+ "movd "#_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \
  "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
- "movd "_off"(%[src]),%%mm2\n\t" \
- "movd "_off"(%[ref]),%%mm7\n\t" \
- "movd "_off"(%[src],%[src_ystride]),%%mm3\n\t" \
- "movd "_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \
+ "movd "#_off"(%[src]),%%mm2\n\t" \
+ "movd "#_off"(%[ref]),%%mm7\n\t" \
+ "movd "#_off"(%[src],%[src_ystride]),%%mm3\n\t" \
+ "movd "#_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \
  "punpcklbw %%mm4,%%mm0\n\t" \
  "lea (%[src],%[src_ystride],2),%[src]\n\t" \
  "punpcklbw %%mm4,%%mm4\n\t" \
  "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
  "psubw %%mm4,%%mm0\n\t" \
- "movd "_off"(%[src]),%%mm4\n\t" \
- "movq %%mm0,"_off"*2(%[buf])\n\t" \
- "movd "_off"(%[ref]),%%mm0\n\t" \
+ "movd "#_off"(%[src]),%%mm4\n\t" \
+ "movq %%mm0,"OC_MEM_OFFS(_off*2,buf)"\n\t" \
+ "movd "#_off"(%[ref]),%%mm0\n\t" \
  "punpcklbw %%mm5,%%mm1\n\t" \
  "punpcklbw %%mm5,%%mm5\n\t" \
  "psubw %%mm5,%%mm1\n\t" \
- "movd "_off"(%[src],%[src_ystride]),%%mm5\n\t" \
+ "movd "#_off"(%[src],%[src_ystride]),%%mm5\n\t" \
  "punpcklbw %%mm7,%%mm2\n\t" \
  "punpcklbw %%mm7,%%mm7\n\t" \
  "psubw %%mm7,%%mm2\n\t" \
- "movd "_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \
+ "movd "#_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \
  "punpcklbw %%mm6,%%mm3\n\t" \
  "lea (%[src],%[src_ystride],2),%[src]\n\t" \
  "punpcklbw %%mm6,%%mm6\n\t" \
  "psubw %%mm6,%%mm3\n\t" \
- "movd "_off"(%[src]),%%mm6\n\t" \
+ "movd "#_off"(%[src]),%%mm6\n\t" \
  "punpcklbw %%mm0,%%mm4\n\t" \
  "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
  "punpcklbw %%mm0,%%mm0\n\t" \
  "lea (%[src],%[src_ystride],2),%[src]\n\t" \
  "psubw %%mm0,%%mm4\n\t" \
- "movd "_off"(%[ref]),%%mm0\n\t" \
+ "movd "#_off"(%[ref]),%%mm0\n\t" \
  "punpcklbw %%mm7,%%mm5\n\t" \
  "neg %[src_ystride]\n\t" \
  "punpcklbw %%mm7,%%mm7\n\t" \
  "psubw %%mm7,%%mm5\n\t" \
- "movd "_off"(%[src],%[src_ystride]),%%mm7\n\t" \
+ "movd "#_off"(%[src],%[src_ystride]),%%mm7\n\t" \
  "punpcklbw %%mm0,%%mm6\n\t" \
  "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
  "punpcklbw %%mm0,%%mm0\n\t" \
  "neg %[ref_ystride]\n\t" \
  "psubw %%mm0,%%mm6\n\t" \
- "movd "_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \
+ "movd "#_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \
  "lea (%[src],%[src_ystride],8),%[src]\n\t" \
  "punpcklbw %%mm0,%%mm7\n\t" \
  "neg %[src_ystride]\n\t" \
@@ -220,24 +220,24 @@
  "lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \
  "psubw %%mm0,%%mm7\n\t" \
  "neg %[ref_ystride]\n\t" \
- "movq "_off"*2(%[buf]),%%mm0\n\t" \
+ "movq "OC_MEM_OFFS(_off*2,buf)",%%mm0\n\t" \
 
 /*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
 #define OC_LOAD_8x4(_off) \
  "#OC_LOAD_8x4\n\t" \
- "movd "_off"(%[src]),%%mm0\n\t" \
- "movd "_off"(%[src],%[ystride]),%%mm1\n\t" \
- "movd "_off"(%[src],%[ystride],2),%%mm2\n\t" \
+ "movd "#_off"(%[src]),%%mm0\n\t" \
+ "movd "#_off"(%[src],%[ystride]),%%mm1\n\t" \
+ "movd "#_off"(%[src],%[ystride],2),%%mm2\n\t" \
  "pxor %%mm7,%%mm7\n\t" \
- "movd "_off"(%[src],%[ystride3]),%%mm3\n\t" \
+ "movd "#_off"(%[src],%[ystride3]),%%mm3\n\t" \
  "punpcklbw %%mm7,%%mm0\n\t" \
- "movd "_off"(%[src4]),%%mm4\n\t" \
+ "movd "#_off"(%[src4]),%%mm4\n\t" \
  "punpcklbw %%mm7,%%mm1\n\t" \
- "movd "_off"(%[src4],%[ystride]),%%mm5\n\t" \
+ "movd "#_off"(%[src4],%[ystride]),%%mm5\n\t" \
  "punpcklbw %%mm7,%%mm2\n\t" \
- "movd "_off"(%[src4],%[ystride],2),%%mm6\n\t" \
+ "movd "#_off"(%[src4],%[ystride],2),%%mm6\n\t" \
  "punpcklbw %%mm7,%%mm3\n\t" \
- "movd "_off"(%[src4],%[ystride3]),%%mm7\n\t" \
+ "movd "#_off"(%[src4],%[ystride3]),%%mm7\n\t" \
  "punpcklbw %%mm4,%%mm4\n\t" \
  "punpcklbw %%mm5,%%mm5\n\t" \
  "psrlw $8,%%mm4\n\t" \
@@ -326,8 +326,8 @@
    Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
    This implementation is only 26 (+4 for spilling registers).*/ \
  "#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \
- "movq %%mm7,"_r7"(%[buf])\n\t" \
- "movq %%mm6,"_r6"(%[buf])\n\t" \
+ "movq %%mm7,"OC_MEM_OFFS(_r7,buf)"\n\t" \
+ "movq %%mm6,"OC_MEM_OFFS(_r6,buf)"\n\t" \
  /*mm7={0x7FFF}x4 \
    mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
  "pcmpeqb %%mm7,%%mm7\n\t" \
@@ -345,14 +345,14 @@
  "pmaxsw %%mm5,%%mm4\n\t" \
  "paddw %%mm3,%%mm6\n\t" \
  "paddw %%mm5,%%mm1\n\t" \
- "movq "_r7"(%[buf]),%%mm3\n\t" \
+ "movq "OC_MEM_OFFS(_r7,buf)",%%mm3\n\t" \
 
 /*Performs the second part of the final stage of the Hadamard transform and
    summing of absolute values.*/
 #define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
  "#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \
  "paddsw %%mm7,%%mm6\n\t" \
- "movq "_r6"(%[buf]),%%mm5\n\t" \
+ "movq "OC_MEM_OFFS(_r6,buf)",%%mm5\n\t" \
  "paddsw %%mm7,%%mm1\n\t" \
  "psubw %%mm6,%%mm2\n\t" \
  "psubw %%mm1,%%mm4\n\t" \
@@ -393,7 +393,7 @@
 #define OC_TRANSPOSE_4x4x2(_off) \
  "#OC_TRANSPOSE_4x4x2\n\t" \
  /*First 4x4 transpose:*/ \
- "movq %%mm5,0x10+"_off"(%[buf])\n\t" \
+ "movq %%mm5,"OC_MEM_OFFS(0x10+(_off),buf)"\n\t" \
  /*mm0 = e3 e2 e1 e0 \
    mm1 = f3 f2 f1 f0 \
    mm2 = g3 g2 g1 g0 \
@@ -413,13 +413,13 @@
  "punpckhdq %%mm2,%%mm1\n\t" \
  "movq %%mm3,%%mm2\n\t" \
  "punpckhdq %%mm5,%%mm3\n\t" \
- "movq %%mm0,0x40+"_off"(%[buf])\n\t" \
+ "movq %%mm0,"OC_MEM_OFFS(0x40+(_off),buf)"\n\t" \
  "punpckldq %%mm5,%%mm2\n\t" \
  /*mm0 = h0 g0 f0 e0 \
    mm1 = h1 g1 f1 e1 \
    mm2 = h2 g2 f2 e2 \
    mm3 = h3 g3 f3 e3*/ \
- "movq 0x10+"_off"(%[buf]),%%mm5\n\t" \
+ "movq "OC_MEM_OFFS(0x10+(_off),buf)",%%mm5\n\t" \
  /*Second 4x4 transpose:*/ \
  /*mm4 = a3 a2 a1 a0 \
    mm5 = b3 b2 b1 b0 \
@@ -427,11 +427,11 @@
    mm7 = d3 d2 d1 d0*/ \
  "movq %%mm6,%%mm0\n\t" \
  "punpcklwd %%mm7,%%mm6\n\t" \
- "movq %%mm1,0x50+"_off"(%[buf])\n\t" \
+ "movq %%mm1,"OC_MEM_OFFS(0x50+(_off),buf)"\n\t" \
  "punpckhwd %%mm7,%%mm0\n\t" \
  "movq %%mm4,%%mm7\n\t" \
  "punpcklwd %%mm5,%%mm4\n\t" \
- "movq %%mm2,0x60+"_off"(%[buf])\n\t" \
+ "movq %%mm2,"OC_MEM_OFFS(0x60+(_off),buf)"\n\t" \
  "punpckhwd %%mm5,%%mm7\n\t" \
  /*mm4 = b1 a1 b0 a0 \
    mm7 = b3 a3 b2 a2 \
@@ -439,7 +439,7 @@
    mm0 = d3 c3 d2 c2*/ \
  "movq %%mm4,%%mm5\n\t" \
  "punpckldq %%mm6,%%mm4\n\t" \
- "movq %%mm3,0x70+"_off"(%[buf])\n\t" \
+ "movq %%mm3,"OC_MEM_OFFS(0x70+(_off),buf)"\n\t" \
  "punpckhdq %%mm6,%%mm5\n\t" \
  "movq %%mm7,%%mm6\n\t" \
  "punpckhdq %%mm0,%%mm7\n\t" \
@@ -453,38 +453,36 @@
  const unsigned char *_src,int _src_ystride,
  const unsigned char *_ref,int _ref_ystride){
   OC_ALIGN8(ogg_int16_t buf[64]);
-  ogg_int16_t *bufp;
-  unsigned     ret;
-  unsigned     ret2;
-  unsigned     dc;
-  bufp=buf;
+  unsigned ret;
+  unsigned ret2;
+  unsigned dc;
   __asm__ __volatile__(
-    OC_LOAD_SUB_8x4("0x00")
+    OC_LOAD_SUB_8x4(0x00)
     OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2("0x00")
+    OC_TRANSPOSE_4x4x2(0x00)
     /*Finish swapping out this 8x4 block to make room for the next one.
       mm0...mm3 have been swapped out already.*/
-    "movq %%mm4,0x00(%[buf])\n\t"
-    "movq %%mm5,0x10(%[buf])\n\t"
-    "movq %%mm6,0x20(%[buf])\n\t"
-    "movq %%mm7,0x30(%[buf])\n\t"
-    OC_LOAD_SUB_8x4("0x04")
+    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
+    "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
+    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
+    "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
+    OC_LOAD_SUB_8x4(0x04)
     OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2("0x08")
+    OC_TRANSPOSE_4x4x2(0x08)
     /*Here the first 4x4 block of output from the last transpose is the second
        4x4 block of input for the next transform.
       We have cleverly arranged that it already be in the appropriate place, so
        we only have to do half the loads.*/
-    "movq 0x10(%[buf]),%%mm1\n\t"
-    "movq 0x20(%[buf]),%%mm2\n\t"
-    "movq 0x30(%[buf]),%%mm3\n\t"
-    "movq 0x00(%[buf]),%%mm0\n\t"
+    "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
+    "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
+    "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
     /*We split out the stages here so we can save the DC coefficient in the
        middle.*/
     OC_HADAMARD_AB_8x4
-    OC_HADAMARD_C_ABS_ACCUM_A_8x4("0x28","0x38")
+    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
     "movd %%mm1,%[dc]\n\t"
-    OC_HADAMARD_C_ABS_ACCUM_B_8x4("0x28","0x38")
+    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
     /*Up to this point, everything fit in 16 bits (8 input + 1 for the
        difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
        for the factor of two we dropped + 3 for the vertical accumulation).
@@ -492,21 +490,21 @@
       We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
        latency of pmaddwd by starting the next series of loads now.*/
     "pmaddwd %%mm7,%%mm0\n\t"
-    "movq 0x50(%[buf]),%%mm1\n\t"
-    "movq 0x58(%[buf]),%%mm5\n\t"
+    "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
     "movq %%mm0,%%mm4\n\t"
-    "movq 0x60(%[buf]),%%mm2\n\t"
+    "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
     "punpckhdq %%mm0,%%mm0\n\t"
-    "movq 0x68(%[buf]),%%mm6\n\t"
+    "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
     "paddd %%mm0,%%mm4\n\t"
-    "movq 0x70(%[buf]),%%mm3\n\t"
+    "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
     "movd %%mm4,%[ret2]\n\t"
-    "movq 0x78(%[buf]),%%mm7\n\t"
+    "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
     /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
        added to them, and a factor of two removed; correct the final sum here.*/
-    "movq 0x40(%[buf]),%%mm0\n\t"
-    "movq 0x48(%[buf]),%%mm4\n\t"
-    OC_HADAMARD_ABS_ACCUM_8x4("0x68","0x78")
+    "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
+    "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
+    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
     "pmaddwd %%mm7,%%mm0\n\t"
     /*Compute abs(dc).*/
     "movsx %w[dc],%[ret]\n\t"
@@ -522,13 +520,13 @@
     "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
     /*Although it looks like we're using 8 registers here, gcc can alias %[ret]
        and %[ret2] with some of the inputs, since for once we don't write to
-       them until after we're done using everything but %[buf] (which is also
-       listed as an output to ensure gcc _doesn't_ alias them against it).*/
+       them until after we're done using everything but %[buf].*/
     /*Note that _src_ystride and _ref_ystride must be given non-overlapping
       constraints, otherwise if gcc can prove they're equal it will allocate
        them to the same register (which is bad); _src and _ref face a similar
        problem, though those are never actually the same.*/
-    :[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=d"(dc),[buf]"+r"(bufp)
+    :[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=d"(dc),
+     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
     :[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
      [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
     /*We have to use neg, so we actually clobber the condition codes for once
@@ -672,38 +670,36 @@
 unsigned oc_enc_frag_intra_satd_mmxext(unsigned *_dc,
  const unsigned char *_src,int _ystride){
   OC_ALIGN8(ogg_int16_t buf[64]);
-  ogg_int16_t *bufp;
-  unsigned     ret;
-  unsigned     ret2;
-  unsigned     dc;
-  bufp=buf;
+  unsigned ret;
+  unsigned ret2;
+  unsigned dc;
   __asm__ __volatile__(
-    OC_LOAD_8x4("0x00")
+    OC_LOAD_8x4(0x00)
     OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2("0x00")
+    OC_TRANSPOSE_4x4x2(0x00)
     /*Finish swapping out this 8x4 block to make room for the next one.
       mm0...mm3 have been swapped out already.*/
-    "movq %%mm4,0x00(%[buf])\n\t"
-    "movq %%mm5,0x10(%[buf])\n\t"
-    "movq %%mm6,0x20(%[buf])\n\t"
-    "movq %%mm7,0x30(%[buf])\n\t"
-    OC_LOAD_8x4("0x04")
+    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
+    "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
+    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
+    "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
+    OC_LOAD_8x4(0x04)
     OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2("0x08")
+    OC_TRANSPOSE_4x4x2(0x08)
     /*Here the first 4x4 block of output from the last transpose is the second
        4x4 block of input for the next transform.
       We have cleverly arranged that it already be in the appropriate place, so
        we only have to do half the loads.*/
-    "movq 0x10(%[buf]),%%mm1\n\t"
-    "movq 0x20(%[buf]),%%mm2\n\t"
-    "movq 0x30(%[buf]),%%mm3\n\t"
-    "movq 0x00(%[buf]),%%mm0\n\t"
+    "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
+    "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
+    "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
     /*We split out the stages here so we can save the DC coefficient in the
        middle.*/
     OC_HADAMARD_AB_8x4
-    OC_HADAMARD_C_ABS_ACCUM_A_8x4("0x28","0x38")
+    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
     "movd %%mm1,%[dc]\n\t"
-    OC_HADAMARD_C_ABS_ACCUM_B_8x4("0x28","0x38")
+    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
     /*Up to this point, everything fit in 16 bits (8 input + 1 for the
        difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
        for the factor of two we dropped + 3 for the vertical accumulation).
@@ -711,19 +707,19 @@
       We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
        latency of pmaddwd by starting the next series of loads now.*/
     "pmaddwd %%mm7,%%mm0\n\t"
-    "movq 0x50(%[buf]),%%mm1\n\t"
-    "movq 0x58(%[buf]),%%mm5\n\t"
-    "movq 0x60(%[buf]),%%mm2\n\t"
+    "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
+    "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
     "movq %%mm0,%%mm4\n\t"
-    "movq 0x68(%[buf]),%%mm6\n\t"
+    "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
     "punpckhdq %%mm0,%%mm0\n\t"
-    "movq 0x70(%[buf]),%%mm3\n\t"
+    "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
     "paddd %%mm0,%%mm4\n\t"
-    "movq 0x78(%[buf]),%%mm7\n\t"
+    "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
     "movd %%mm4,%[ret]\n\t"
-    "movq 0x40(%[buf]),%%mm0\n\t"
-    "movq 0x48(%[buf]),%%mm4\n\t"
-    OC_HADAMARD_ABS_ACCUM_8x4("0x68","0x78")
+    "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
+    "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
+    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
     "pmaddwd %%mm7,%%mm0\n\t"
     /*We assume that the DC coefficient is always positive (which is true,
        because the input to the INTRA transform was not a difference).*/
@@ -739,7 +735,8 @@
        and %[ret2] with some of the inputs, since for once we don't write to
        them until after we're done using everything but %[buf] (which is also
        listed as an output to ensure gcc _doesn't_ alias them against it).*/
-    :[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=r"(dc),[buf]"+r"(bufp)
+    :[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=r"(dc),
+     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
     :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
      [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
     /*We have to use sub, so we actually clobber the condition codes for once
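
Since the offsets are now passed as plain numbers instead of strings, the #
 operator stringizes them directly into the template; e.g., with
 OC_LOAD_SUB_8x4(0x04), "movd "#_off"(%[src]),%%mm0\n\t" becomes
 "movd 0x04(%[src]),%%mm0\n\t".  Accesses to buf instead go through
 OC_MEM_OFFS, which needs the offset as an expression so that something like
 _off*2 is stringized whole (e.g., "0x04*2") and folded by the assembler.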

Modified: experimental/derf/theora-ptalarbvorm/lib/x86/mmxidct.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/mmxidct.c	2010-05-24 22:02:27 UTC (rev 17246)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/mmxidct.c	2010-05-28 05:35:32 UTC (rev 17247)
@@ -50,9 +50,6 @@
       8,    8,    8,    8
 };
 
-/*Converts the expression in the argument to a string.*/
-#define OC_M2STR(_s) #_s
-
 /*38 cycles*/
 #define OC_IDCT_BEGIN \
   "#OC_IDCT_BEGIN\n\t" \
@@ -310,7 +307,7 @@
 #define OC_C(_i)      OC_MID(OC_COSINE_OFFSET,_i-1)
 #define OC_8          OC_MID(OC_EIGHT_OFFSET,0)
 
-static void oc_idct8x8_slow(ogg_int16_t _y[64]){
+static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64]){
   /*This routine accepts an 8x8 matrix, but in partially transposed form.
     Every 4x4 block is transposed.*/
   __asm__ __volatile__(
@@ -503,7 +500,7 @@
  "movq %%mm0,"OC_I(0)"\n\t" \
  "#end OC_COLUMN_IDCT_10\n\t" \
 
-static void oc_idct8x8_10(ogg_int16_t _y[64]){
+static void oc_idct8x8_10_mmx(ogg_int16_t _y[64]){
   __asm__ __volatile__(
 #define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
 #define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])"
@@ -557,8 +554,8 @@
      gets.
     Needless to say we inherited this approach from VP3.*/
   /*Then perform the iDCT.*/
-  if(_last_zzi<10)oc_idct8x8_10(_y);
-  else oc_idct8x8_slow(_y);
+  if(_last_zzi<10)oc_idct8x8_10_mmx(_y);
+  else oc_idct8x8_slow_mmx(_y);
 }
 
 #endif
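
(The direct calls to oc_idct8x8_mmx() are gone from the decode path; as the
 mmxstate.c hunk below shows, reconstruction now goes through
 oc_idct8x8(_state,...), which presumably selects the MMX or SSE2 version via
 the state's dispatch table set up in x86state.c, also modified in this
 commit.)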

Modified: experimental/derf/theora-ptalarbvorm/lib/x86/mmxloop.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/mmxloop.h	2010-05-24 22:02:27 UTC (rev 17246)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/mmxloop.h	2010-05-28 05:35:32 UTC (rev 17247)
@@ -9,88 +9,191 @@
   On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
    mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/
 #define OC_LOOP_FILTER8_MMX \
- "#OC_LOOP_FILTER8_MMX\n\t" \
- /*mm7=0*/ \
- "pxor %%mm7,%%mm7\n\t" \
- /*mm6:mm0={a0,...,a7}*/ \
- "movq %%mm0,%%mm6\n\t" \
- "punpcklbw %%mm7,%%mm0\n\t" \
- "punpckhbw %%mm7,%%mm6\n\t" \
- /*mm3:mm5={d0,...,d7}*/ \
- "movq %%mm3,%%mm5\n\t" \
- "punpcklbw %%mm7,%%mm3\n\t" \
- "punpckhbw %%mm7,%%mm5\n\t" \
- /*mm6:mm0={a0-d0,...,a7-d7}*/ \
- "psubw %%mm3,%%mm0\n\t" \
- "psubw %%mm5,%%mm6\n\t" \
- /*mm3:mm1={b0,...,b7}*/ \
- "movq %%mm1,%%mm3\n\t" \
- "punpcklbw %%mm7,%%mm1\n\t" \
- "movq %%mm2,%%mm4\n\t" \
- "punpckhbw %%mm7,%%mm3\n\t" \
- /*mm5:mm4={c0,...,c7}*/ \
- "movq %%mm2,%%mm5\n\t" \
- "punpcklbw %%mm7,%%mm4\n\t" \
- "punpckhbw %%mm7,%%mm5\n\t" \
- /*mm7={3}x4 \
-   mm5:mm4={c0-b0,...,c7-b7}*/ \
- "pcmpeqw %%mm7,%%mm7\n\t" \
- "psubw %%mm1,%%mm4\n\t" \
- "psrlw $14,%%mm7\n\t" \
- "psubw %%mm3,%%mm5\n\t" \
- /*Scale by 3.*/ \
- "pmullw %%mm7,%%mm4\n\t" \
- "pmullw %%mm7,%%mm5\n\t" \
- /*mm7={4}x4 \
-   mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
- "psrlw $1,%%mm7\n\t" \
- "paddw %%mm0,%%mm4\n\t" \
- "psllw $2,%%mm7\n\t" \
- "movq (%[ll]),%%mm0\n\t" \
- "paddw %%mm6,%%mm5\n\t" \
- /*R_i has the range [-127,128], so we compute -R_i instead. \
-   mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
- "psubw %%mm7,%%mm4\n\t" \
- "psubw %%mm7,%%mm5\n\t" \
- "psraw $3,%%mm4\n\t" \
- "psraw $3,%%mm5\n\t" \
- "pcmpeqb %%mm7,%%mm7\n\t" \
- "packsswb %%mm5,%%mm4\n\t" \
- "pxor %%mm6,%%mm6\n\t" \
- "pxor %%mm7,%%mm4\n\t" \
- "packuswb %%mm3,%%mm1\n\t" \
- /*Now compute lflim of -mm4 cf. Section 7.10 of the sepc.*/ \
- /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
-    we have to split things by sign (the other option is to work in 16 bits, \
-    but working in 8 bits gives much better parallelism). \
-   We compute abs(R_i), but save a mask of which terms were negative in mm6. \
-   Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
-   Finally, we split mm4 into positive and negative pieces using the mask in \
-    mm6, and add and subtract them as appropriate.*/ \
- /*mm4=abs(-R_i)*/ \
- /*mm7=255-2*L*/ \
- "pcmpgtb %%mm4,%%mm6\n\t" \
- "psubb %%mm0,%%mm7\n\t" \
- "pxor %%mm6,%%mm4\n\t" \
- "psubb %%mm0,%%mm7\n\t" \
- "psubb %%mm6,%%mm4\n\t" \
- /*mm7=255-max(2*L-abs(R_i),0)*/ \
- "paddusb %%mm4,%%mm7\n\t" \
- /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
- "paddusb %%mm7,%%mm4\n\t" \
- "psubusb %%mm7,%%mm4\n\t" \
- /*Now split mm4 by the original sign of -R_i.*/ \
- "movq %%mm4,%%mm5\n\t" \
- "pand %%mm6,%%mm4\n\t" \
- "pandn %%mm5,%%mm6\n\t" \
- /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
- /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
- "paddusb %%mm4,%%mm1\n\t" \
- "psubusb %%mm4,%%mm2\n\t" \
- "psubusb %%mm6,%%mm1\n\t" \
- "paddusb %%mm6,%%mm2\n\t" \
+  "#OC_LOOP_FILTER8_MMX\n\t" \
+  /*mm7=0*/ \
+  "pxor %%mm7,%%mm7\n\t" \
+  /*mm6:mm0={a0,...,a7}*/ \
+  "movq %%mm0,%%mm6\n\t" \
+  "punpcklbw %%mm7,%%mm0\n\t" \
+  "punpckhbw %%mm7,%%mm6\n\t" \
+  /*mm3:mm5={d0,...,d7}*/ \
+  "movq %%mm3,%%mm5\n\t" \
+  "punpcklbw %%mm7,%%mm3\n\t" \
+  "punpckhbw %%mm7,%%mm5\n\t" \
+  /*mm6:mm0={a0-d0,...,a7-d7}*/ \
+  "psubw %%mm3,%%mm0\n\t" \
+  "psubw %%mm5,%%mm6\n\t" \
+  /*mm3:mm1={b0,...,b7}*/ \
+  "movq %%mm1,%%mm3\n\t" \
+  "punpcklbw %%mm7,%%mm1\n\t" \
+  "movq %%mm2,%%mm4\n\t" \
+  "punpckhbw %%mm7,%%mm3\n\t" \
+  /*mm5:mm4={c0,...,c7}*/ \
+  "movq %%mm2,%%mm5\n\t" \
+  "punpcklbw %%mm7,%%mm4\n\t" \
+  "punpckhbw %%mm7,%%mm5\n\t" \
+  /*mm7={3}x4 \
+    mm5:mm4={c0-b0,...,c7-b7}*/ \
+  "pcmpeqw %%mm7,%%mm7\n\t" \
+  "psubw %%mm1,%%mm4\n\t" \
+  "psrlw $14,%%mm7\n\t" \
+  "psubw %%mm3,%%mm5\n\t" \
+  /*Scale by 3.*/ \
+  "pmullw %%mm7,%%mm4\n\t" \
+  "pmullw %%mm7,%%mm5\n\t" \
+  /*mm7={4}x4 \
+    mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
+  "psrlw $1,%%mm7\n\t" \
+  "paddw %%mm0,%%mm4\n\t" \
+  "psllw $2,%%mm7\n\t" \
+  "movq (%[ll]),%%mm0\n\t" \
+  "paddw %%mm6,%%mm5\n\t" \
+  /*R_i has the range [-127,128], so we compute -R_i instead. \
+    mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
+  "psubw %%mm7,%%mm4\n\t" \
+  "psubw %%mm7,%%mm5\n\t" \
+  "psraw $3,%%mm4\n\t" \
+  "psraw $3,%%mm5\n\t" \
+  "pcmpeqb %%mm7,%%mm7\n\t" \
+  "packsswb %%mm5,%%mm4\n\t" \
+  "pxor %%mm6,%%mm6\n\t" \
+  "pxor %%mm7,%%mm4\n\t" \
+  "packuswb %%mm3,%%mm1\n\t" \
+  /*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \
+  /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
+     we have to split things by sign (the other option is to work in 16 bits, \
+     but working in 8 bits gives much better parallelism). \
+    We compute abs(R_i), but save a mask of which terms were negative in mm6. \
+    Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
+    Finally, we split mm4 into positive and negative pieces using the mask in \
+     mm6, and add and subtract them as appropriate.*/ \
+  /*mm4=abs(-R_i)*/ \
+  /*mm7=255-2*L*/ \
+  "pcmpgtb %%mm4,%%mm6\n\t" \
+  "psubb %%mm0,%%mm7\n\t" \
+  "pxor %%mm6,%%mm4\n\t" \
+  "psubb %%mm0,%%mm7\n\t" \
+  "psubb %%mm6,%%mm4\n\t" \
+  /*mm7=255-max(2*L-abs(R_i),0)*/ \
+  "paddusb %%mm4,%%mm7\n\t" \
+  /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
+  "paddusb %%mm7,%%mm4\n\t" \
+  "psubusb %%mm7,%%mm4\n\t" \
+  /*Now split mm4 by the original sign of -R_i.*/ \
+  "movq %%mm4,%%mm5\n\t" \
+  "pand %%mm6,%%mm4\n\t" \
+  "pandn %%mm5,%%mm6\n\t" \
+  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
+  /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
+  "paddusb %%mm4,%%mm1\n\t" \
+  "psubusb %%mm4,%%mm2\n\t" \
+  "psubusb %%mm6,%%mm1\n\t" \
+  "paddusb %%mm6,%%mm2\n\t" \
 
-#define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \
+/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...,d7}.
+  On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
+   mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}.
+  All other MMX registers are clobbered.*/
+#define OC_LOOP_FILTER8_MMXEXT \
+  "#OC_LOOP_FILTER8_MMXEXT\n\t" \
+  /*R_i=(a_i-3*b_i+3*c_i-d_i+4>>3) has the range [-127,128], so we compute \
+     -R_i=(-a_i+3*b_i-3*c_i+d_i+3>>3) instead.*/ \
+  /*This first part is based on the transformation \
+      f = -(3*(c-b)+a-d+4>>3) \
+        = -(3*(c+255-b)+(a+255-d)+4-1020>>3) \
+        = -(3*(c+~b)+(a+~d)-1016>>3) \
+        = 127-(3*(c+~b)+(a+~d)>>3) \
+        = 128+~(3*(c+~b)+(a+~d)>>3) (mod 256). \
+    Although pavgb(a,b) = (a+b+1>>1) (biased up), we rely heavily on the \
+     fact that ~pavgb(~a,~b) = (a+b>>1) (biased down). \
+    Using this, the last expression above can be computed in 8 bits of working \
+     precision via: \
+      u = ~pavgb(~b,c); \
+      v = pavgb(b,~c); \
+      m = u-v; \
+      (m is 0 or 0xFF, and controls whether t is biased up or down.) \
+      t = m^pavgb(m^~a,m^d); \
+      f = 128+pavgb(pavgb(t,u),v); \
+    This required some careful analysis to ensure that carries are propagated \
+     correctly in all cases, but has been checked exhaustively.*/ \
+  /*input (a, b, c, d, ., ., ., .)*/ \
+  /*ff=0xFF; \
+    u=b; \
+    v=c; \
+    ll=255-2*L;*/ \
+  "pcmpeqb %%mm7,%%mm7\n\t" \
+  "movq %%mm1,%%mm4\n\t" \
+  "movq %%mm2,%%mm5\n\t" \
+  "movq (%[ll]),%%mm6\n\t" \
+  /*allocated u, v, ll, ff: (a, b, c, d, u, v, ll, ff)*/ \
+  /*u^=ff; \
+    v^=ff;*/ \
+  "pxor %%mm7,%%mm4\n\t" \
+  "pxor %%mm7,%%mm5\n\t" \
+  /*allocated ll: (a, b, c, d, u, v, ll, ff)*/ \
+  /*u=pavgb(u,c); \
+    v=pavgb(v,b);*/ \
+  "pavgb %%mm2,%%mm4\n\t" \
+  "pavgb %%mm1,%%mm5\n\t" \
+  /*u^=ff; \
+    a^=ff;*/ \
+  "pxor %%mm7,%%mm4\n\t" \
+  "pxor %%mm7,%%mm0\n\t" \
+  /*m=u-v;*/ \
+  "psubb %%mm5,%%mm4\n\t" \
+  /*freed u, allocated m: (a, b, c, d, m, v, ll, ff)*/ \
+  /*a^=m; \
+    d^=m;*/ \
+  "pxor %%mm4,%%mm0\n\t" \
+  "pxor %%mm4,%%mm3\n\t" \
+  /*t=pavgb(a,d);*/ \
+  "pavgb %%mm3,%%mm0\n\t" \
+  "psllw $7,%%mm7\n\t" \
+  /*freed a, d, ff, allocated t, of: (t, b, c, ., m, v, ll, of)*/ \
+  /*t^=m; \
+    u=m+v;*/ \
+  "pxor %%mm4,%%mm0\n\t" \
+  "paddb %%mm5,%%mm4\n\t" \
+  /*freed t, m, allocated f, u: (f, b, c, ., u, v, ll, of)*/ \
+  /*f=pavgb(f,u); \
+    of=128;*/ \
+  "pavgb %%mm4,%%mm0\n\t" \
+  "packsswb %%mm7,%%mm7\n\t" \
+  /*freed u, ff, allocated ll: (f, b, c, ., ll, v, ll, of)*/ \
+  /*f=pavgb(f,v);*/ \
+  "pavgb %%mm5,%%mm0\n\t" \
+  "movq %%mm7,%%mm3\n\t" \
+  "movq %%mm6,%%mm4\n\t" \
+  /*freed v, allocated of: (f, b, c, of, ll, ., ll, of)*/ \
+  /*Now compute lflim of R_i=-(128+mm0) cf. Section 7.10 of the spec.*/ \
+  /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
+     we have to split things by sign (the other option is to work in 16 bits, \
+     but staying in 8 bits gives much better parallelism).*/ \
+  /*Instead of adding the offset of 128 in mm3, we use it to split mm0. \
+    This is the same number of instructions as computing a mask and splitting \
+     after the lflim computation, but has shorter dependency chains.*/ \
+  /*mm0=R_i<0?-R_i:0 (denoted abs(R_i<0)) \
+    mm3=R_i>0?R_i:0 (denoted abs(R_i>0))*/ \
+  "psubusb %%mm0,%%mm3\n\t" \
+  "psubusb %%mm7,%%mm0\n\t" \
+  /*mm6=255-max(2*L-abs(R_i<0),0) \
+    mm4=255-max(2*L-abs(R_i>0),0)*/ \
+  "paddusb %%mm3,%%mm4\n\t" \
+  "paddusb %%mm0,%%mm6\n\t" \
+  /*mm0=min(abs(R_i<0),max(2*L-abs(R_i<0),0)) \
+    mm3=min(abs(R_i>0),max(2*L-abs(R_i>0),0))*/ \
+  "paddusb %%mm4,%%mm3\n\t" \
+  "paddusb %%mm6,%%mm0\n\t" \
+  "psubusb %%mm4,%%mm3\n\t" \
+  "psubusb %%mm6,%%mm0\n\t" \
+  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
+  /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
+  "paddusb %%mm3,%%mm1\n\t" \
+  "psubusb %%mm3,%%mm2\n\t" \
+  "psubusb %%mm0,%%mm1\n\t" \
+  "paddusb %%mm0,%%mm2\n\t" \
+
+#define OC_LOOP_FILTER_V(_filter,_pix,_ystride,_ll) \
   do{ \
     ptrdiff_t ystride3__; \
     __asm__ __volatile__( \
@@ -104,7 +207,7 @@
       "movq (%[pix],%[ystride]),%%mm1\n\t" \
       /*mm2={c0,...,c7}*/ \
       "movq (%[pix],%[ystride],2),%%mm2\n\t" \
-      OC_LOOP_FILTER8_MMX \
+      _filter \
       /*Write it back out.*/ \
       "movq %%mm1,(%[pix],%[ystride])\n\t" \
       "movq %%mm2,(%[pix],%[ystride],2)\n\t" \
@@ -116,7 +219,7 @@
   } \
   while(0)
 
-#define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \
+#define OC_LOOP_FILTER_H(_filter,_pix,_ystride,_ll) \
   do{ \
     unsigned char *pix__; \
     ptrdiff_t      ystride3__; \
@@ -174,7 +277,7 @@
       "punpckldq %%mm5,%%mm2\n\t" \
       /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
       "punpckhdq %%mm5,%%mm3\n\t" \
-      OC_LOOP_FILTER8_MMX \
+      _filter \
       /*mm2={b0+R_0'',...,b7+R_7''}*/ \
       "movq %%mm1,%%mm0\n\t" \
       /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
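
The pavgb identity used above can be spot-checked with a scalar model; a
 minimal harness (illustrative only; pavgb() here just models the x86
 byte-average instruction on one 8-bit lane):

  #include <assert.h>
  #include <stdlib.h>

  /*Models x86 pavgb on a single 8-bit lane: (_a+_b+1)>>1, rounding up.*/
  static unsigned pavgb(unsigned _a,unsigned _b){
    return _a+_b+1>>1;
  }

  int main(void){
    int i;
    srand(42);
    for(i=0;i<1000000;i++){
      unsigned a;
      unsigned b;
      unsigned c;
      unsigned d;
      unsigned u;
      unsigned v;
      unsigned m;
      unsigned t;
      unsigned f;
      unsigned ref;
      a=rand()&0xFF;
      b=rand()&0xFF;
      c=rand()&0xFF;
      d=rand()&0xFF;
      /*Direct form: f=127-(3*(c+~b)+(a+~d)>>3) (mod 256).*/
      ref=127-(3*(c+(0xFF^b))+(a+(0xFF^d))>>3)&0xFF;
      /*Bytewise form, step for step as in OC_LOOP_FILTER8_MMXEXT:*/
      u=~pavgb(0xFF^b,c)&0xFF;
      v=pavgb(b,0xFF^c);
      m=u-v&0xFF;
      t=m^pavgb(m^(0xFF^a),m^d);
      f=128+pavgb(pavgb(t,u),v)&0xFF;
      assert(f==ref);
    }
    return 0;
  }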

Modified: experimental/derf/theora-ptalarbvorm/lib/x86/mmxstate.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/mmxstate.c	2010-05-24 22:02:27 UTC (rev 17246)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/mmxstate.c	2010-05-28 05:35:32 UTC (rev 17247)
@@ -71,7 +71,7 @@
   else{
     /*Dequantize the DC coefficient.*/
     _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
-    oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
+    oc_idct8x8(_state,_dct_coeffs,_last_zzi);
   }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
@@ -170,13 +170,17 @@
       if(frags[fragi].coded){
         unsigned char *ref;
         ref=ref_frame_data+frag_buf_offs[fragi];
-        if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll);
-        if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll);
+        if(fragi>fragi0){
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
+        }
+        if(fragi0>fragi_top){
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
+        }
         if(fragi+1<fragi_end&&!frags[fragi+1].coded){
-          OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll);
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll);
         }
         if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
-          OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride<<3),ystride,ll);
         }
       }
       fragi++;
@@ -185,4 +189,69 @@
   }
 }
 
+/*Apply the loop filter to a given set of fragment rows in the given plane.
+  The filter may be run on the bottom edge, affecting pixels in the next row of
+   fragments, so this row also needs to be available.
+  _bv:        The bounding values array.
+  _refi:      The index of the frame buffer to filter.
+  _pli:       The color plane to filter.
+  _fragy0:    The Y coordinate of the first fragment row to filter.
+  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
+ int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
+  OC_ALIGN8(unsigned char   ll[8]);
+  const oc_fragment_plane *fplane;
+  const oc_fragment       *frags;
+  const ptrdiff_t         *frag_buf_offs;
+  unsigned char           *ref_frame_data;
+  ptrdiff_t                fragi_top;
+  ptrdiff_t                fragi_bot;
+  ptrdiff_t                fragi0;
+  ptrdiff_t                fragi0_end;
+  int                      ystride;
+  int                      nhfrags;
+  memset(ll,~(_state->loop_filter_limits[_state->qis[0]]<<1),sizeof(ll));
+  fplane=_state->fplanes+_pli;
+  nhfrags=fplane->nhfrags;
+  fragi_top=fplane->froffset;
+  fragi_bot=fragi_top+fplane->nfrags;
+  fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
+  fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+  ystride=_state->ref_ystride[_pli];
+  frags=_state->frags;
+  frag_buf_offs=_state->frag_buf_offs;
+  ref_frame_data=_state->ref_frame_data[_refi];
+  /*The following loops are constructed somewhat non-intuitively on purpose.
+    The main idea is: if a block boundary has at least one coded fragment on
+     it, the filter is applied to it.
+    However, the order that the filters are applied in matters, and VP3 chose
+     the somewhat strange ordering used below.*/
+  while(fragi0<fragi0_end){
+    ptrdiff_t fragi;
+    ptrdiff_t fragi_end;
+    fragi=fragi0;
+    fragi_end=fragi+nhfrags;
+    while(fragi<fragi_end){
+      if(frags[fragi].coded){
+        unsigned char *ref;
+        ref=ref_frame_data+frag_buf_offs[fragi];
+        if(fragi>fragi0){
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,ll);
+        }
+        if(fragi0>fragi_top){
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,ll);
+        }
+        if(fragi+1<fragi_end&&!frags[fragi+1].coded){
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,ll);
+        }
+        if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,ll);
+        }
+      }
+      fragi++;
+    }
+    fragi0+=nhfrags;
+  }
+}
+
 #endif
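
Note the bounding-value setup in the new MMXEXT version:
 memset(ll,~(L<<1),sizeof(ll)) broadcasts 255-2*L into every byte (~x equals
 255-x mod 256), which is the constant the bytewise filter consumes directly
 from (%[ll]).  The MMX version instead keeps L itself in ll and derives
 255-2*L in registers, via the two psubb instructions from 0xFF.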

Modified: experimental/derf/theora-ptalarbvorm/lib/x86/sse2encfrag.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/sse2encfrag.c	2010-05-24 22:02:27 UTC (rev 17246)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/sse2encfrag.c	2010-05-28 05:35:32 UTC (rev 17247)
@@ -40,7 +40,7 @@
  "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
  "psubw %%xmm4,%%xmm0\n\t" \
  "movq (%[src]),%%xmm4\n\t" \
- "movdqa %%xmm0,(%[buf])\n\t" \
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
  "movq (%[ref]),%%xmm0\n\t" \
  "punpcklbw %%xmm5,%%xmm1\n\t" \
  "punpcklbw %%xmm5,%%xmm5\n\t" \
@@ -75,7 +75,7 @@
  "punpcklbw %%xmm0,%%xmm7\n\t" \
  "punpcklbw %%xmm0,%%xmm0\n\t" \
  "psubw %%xmm0,%%xmm7\n\t" \
- "movdqa (%[buf]),%%xmm0\n\t" \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
 
 /*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7.*/
 #define OC_LOAD_8x8 \
@@ -176,8 +176,8 @@
    Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
    This implementation is only 26 (+4 for spilling registers).*/ \
  "#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \
- "movdqa %%xmm7,0x10(%[buf])\n\t" \
- "movdqa %%xmm6,(%[buf])\n\t" \
+ "movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+ "movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \
  /*xmm7={0x7FFF}x4 \
    xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \
  "pcmpeqb %%xmm7,%%xmm7\n\t" \
@@ -194,9 +194,9 @@
  "pmaxsw %%xmm3,%%xmm2\n\t" \
  "pmaxsw %%xmm1,%%xmm0\n\t" \
  "paddw %%xmm3,%%xmm6\n\t" \
- "movdqa 0x10(%[buf]),%%xmm3\n\t" \
+ "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \
  "paddw %%xmm5,%%xmm1\n\t" \
- "movdqa (%[buf]),%%xmm5\n\t" \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t" \
 
 /*Performs the second part of the final stage of the Hadamard transform and
    summing of absolute values.*/
@@ -236,10 +236,8 @@
  const unsigned char *_src,int _src_ystride,
  const unsigned char *_ref,int _ref_ystride){
   OC_ALIGN16(ogg_int16_t buf[16]);
-  ogg_int16_t *bufp;
-  unsigned     ret;
-  unsigned     dc;
-  bufp=buf;
+  unsigned ret;
+  unsigned dc;
   __asm__ __volatile__(
     OC_LOAD_SUB_8x8
     OC_HADAMARD_8x8
@@ -273,15 +271,14 @@
     "sub %[dc],%[ret]\n\t"
     /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
        and %[dc] with some of the inputs, since for once we don't write to
-       them until after we're done using everything but %[buf] (which is also
-       listed as an output to ensure gcc _doesn't_ alias them against it).*/
+       them until after we're done using everything but %[buf].*/
     /*Note that _src_ystride and _ref_ystride must be given non-overlapping
       constraints, otherwise if gcc can prove they're equal it will allocate
        them to the same register (which is bad); _src and _ref face a similar
        problem.
       All four are destructively modified, but if we list them as output
        constraints, gcc can't alias them with other outputs.*/
-    :[ret]"=a"(ret),[dc]"=d"(dc),[buf]"+r"(bufp)
+    :[ret]"=a"(ret),[dc]"=d"(dc),[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16))
     :[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
      [ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
     /*We have to use neg, so we actually clobber the condition codes for once
@@ -307,10 +304,8 @@
 unsigned oc_enc_frag_intra_satd_sse2(unsigned *_dc,
  const unsigned char *_src,int _ystride){
   OC_ALIGN16(ogg_int16_t buf[16]);
-  ogg_int16_t *bufp;
-  unsigned     ret;
-  unsigned     dc;
-  bufp=buf;
+  unsigned ret;
+  unsigned dc;
   __asm__ __volatile__(
     OC_LOAD_8x8
     OC_HADAMARD_8x8
@@ -339,9 +334,8 @@
     "sub %[dc],%[ret]\n\t"
     /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
        and %[dc] with some of the inputs, since for once we don't write to
-       them until after we're done using everything but %[buf] (which is also
-       listed as an output to ensure gcc _doesn't_ alias them against it).*/
-    :[ret]"=a"(ret),[dc]"=r"(dc),[buf]"+r"(bufp)
+       them until after we're done using everything but %[buf].*/
+    :[ret]"=a"(ret),[dc]"=r"(dc),[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16))
     :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
      [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
     /*We have to use sub, so we actually clobber the condition codes for once.*/

Added: experimental/derf/theora-ptalarbvorm/lib/x86/sse2idct.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/sse2idct.c	                        (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/sse2idct.c	2010-05-28 05:35:32 UTC (rev 17247)
@@ -0,0 +1,464 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+/*SSE2 acceleration of Theora's iDCT.*/
+#include "x86int.h"
+#include "sse2trans.h"
+#include "../dct.h"
+
+#if defined(OC_X86_ASM)
+
+/*Performs the first three stages of the iDCT.
+  xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input
+   (accessed in that order).
+  The remaining rows must be in %[y] at their corresponding locations.
+  On output, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
+   contain rows 4 through 7.*/
+#define OC_IDCT_8x8_ABC \
+  "#OC_IDCT_8x8_ABC\n\t" \
+  /*Stage 1:*/ \
+  /*2-3 rotation by 6pi/16. \
+    xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
+  "mov $0xEC83EC83,%[b]\n\t" \
+  "mov $0x61F861F8,%[a]\n\t" \
+  "movd %[b],%%xmm1\n\t" \
+  "movd %[a],%%xmm4\n\t" \
+  "pshufd $00,%%xmm1,%%xmm1\n\t" \
+  "pshufd $00,%%xmm4,%%xmm4\n\t" \
+  "movdqa %%xmm1,%%xmm0\n\t" \
+  "pmulhw %%xmm2,%%xmm1\n\t" \
+  "movdqa %%xmm4,%%xmm7\n\t" \
+  "pmulhw %%xmm6,%%xmm0\n\t" \
+  "mov $0x8E3A8E3A,%[a]\n\t" \
+  "pmulhw %%xmm2,%%xmm7\n\t" \
+  "mov $0xD4DBD4DB,%[b]\n\t" \
+  "pmulhw %%xmm6,%%xmm4\n\t" \
+  "paddw %%xmm6,%%xmm0\n\t" \
+  "movd %[b],%%xmm6\n\t" \
+  "paddw %%xmm1,%%xmm2\n\t" \
+  "psubw %%xmm0,%%xmm7\n\t" \
+  "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+  "paddw %%xmm4,%%xmm2\n\t" \
+  "movd %[a],%%xmm4\n\t" \
+  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+  /*5-6 rotation by 3pi/16. \
+    xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
+  "pshufd $00,%%xmm4,%%xmm4\n\t" \
+  "pshufd $00,%%xmm6,%%xmm6\n\t" \
+  "movdqa %%xmm4,%%xmm2\n\t" \
+  "movdqa %%xmm6,%%xmm1\n\t" \
+  "pmulhw %%xmm3,%%xmm4\n\t" \
+  "mov $0x31F131F1,%[a]\n\t" \
+  "pmulhw %%xmm5,%%xmm1\n\t" \
+  "mov $0xFB15FB15,%[b]\n\t" \
+  "pmulhw %%xmm3,%%xmm6\n\t" \
+  "pmulhw %%xmm5,%%xmm2\n\t" \
+  "paddw %%xmm3,%%xmm4\n\t" \
+  "paddw %%xmm5,%%xmm3\n\t" \
+  "paddw %%xmm6,%%xmm3\n\t" \
+  "movdqa 0x70(%[y]),%%xmm6\n\t" \
+  "paddw %%xmm5,%%xmm1\n\t" \
+  "movdqa 0x10(%[y]),%%xmm5\n\t" \
+  "paddw %%xmm3,%%xmm2\n\t" \
+  "movd %[a],%%xmm3\n\t" \
+  "psubw %%xmm4,%%xmm1\n\t" \
+  "movd %[b],%%xmm4\n\t" \
+  /*4-7 rotation by 7pi/16. \
+    xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
+  "pshufd $00,%%xmm3,%%xmm3\n\t" \
+  "pshufd $00,%%xmm4,%%xmm4\n\t" \
+  "movdqa %%xmm3,%%xmm0\n\t" \
+  "movdqa %%xmm4,%%xmm7\n\t" \
+  "pmulhw %%xmm5,%%xmm3\n\t" \
+  "mov $0xB505B505,%[a]\n\t" \
+  "pmulhw %%xmm5,%%xmm7\n\t" \
+  "pmulhw %%xmm6,%%xmm4\n\t" \
+  "pmulhw %%xmm6,%%xmm0\n\t" \
+  "paddw %%xmm6,%%xmm4\n\t" \
+  "movdqa 0x40(%[y]),%%xmm6\n\t" \
+  "paddw %%xmm5,%%xmm7\n\t" \
+  "psubw %%xmm4,%%xmm3\n\t" \
+  "movd %[a],%%xmm4\n\t" \
+  "paddw %%xmm7,%%xmm0\n\t" \
+  "movdqa 0x00(%[y]),%%xmm7\n\t" \
+  "pshufd $00,%%xmm4,%%xmm4\n\t" \
+  /*0-1 butterfly. \
+    xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
+  "paddw %%xmm7,%%xmm6\n\t" \
+  "movdqa %%xmm4,%%xmm5\n\t" \
+  "pmulhw %%xmm6,%%xmm4\n\t" \
+  "paddw %%xmm7,%%xmm7\n\t" \
+  "psubw %%xmm6,%%xmm7\n\t" \
+  "paddw %%xmm6,%%xmm4\n\t" \
+  /*Stage 2:*/ \
+  /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \
+    7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \
+  "movdqa %%xmm3,%%xmm6\n\t" \
+  "paddw %%xmm1,%%xmm3\n\t" \
+  "psubw %%xmm1,%%xmm6\n\t" \
+  "movdqa %%xmm5,%%xmm1\n\t" \
+  "pmulhw %%xmm7,%%xmm5\n\t" \
+  "paddw %%xmm7,%%xmm5\n\t" \
+  "movdqa %%xmm0,%%xmm7\n\t" \
+  "paddw %%xmm2,%%xmm0\n\t" \
+  "psubw %%xmm2,%%xmm7\n\t" \
+  "movdqa %%xmm1,%%xmm2\n\t" \
+  "pmulhw %%xmm6,%%xmm1\n\t" \
+  "pmulhw %%xmm7,%%xmm2\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
+  "paddw %%xmm7,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
+  /*Stage 3: \
+    6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
+    0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
+    1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
+  "paddw %%xmm2,%%xmm1\n\t" \
+  "paddw %%xmm5,%%xmm6\n\t" \
+  "paddw %%xmm4,%%xmm7\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm4,%%xmm4\n\t" \
+  "paddw %%xmm5,%%xmm5\n\t" \
+  "psubw %%xmm1,%%xmm2\n\t" \
+  "psubw %%xmm7,%%xmm4\n\t" \
+  "psubw %%xmm6,%%xmm5\n\t" \
+
+/*Performs the last stage of the iDCT.
+  On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
+   contain rows 4 through 7.
+  On output, xmm0 through xmm7 contain the corresponding rows.*/
+#define OC_IDCT_8x8_D \
+  "#OC_IDCT_8x8_D\n\t" \
+  /*Stage 4: \
+    0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
+    1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
+    2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
+    3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
+  "psubw %%xmm0,%%xmm7\n\t" \
+  "psubw %%xmm1,%%xmm6\n\t" \
+  "psubw %%xmm2,%%xmm5\n\t" \
+  "psubw %%xmm3,%%xmm4\n\t" \
+  "paddw %%xmm0,%%xmm0\n\t" \
+  "paddw %%xmm1,%%xmm1\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm3,%%xmm3\n\t" \
+  "paddw %%xmm7,%%xmm0\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "paddw %%xmm5,%%xmm2\n\t" \
+  "paddw %%xmm4,%%xmm3\n\t" \
+
+/*Performs the last stage of the iDCT, then rounds and scales the result
+   down by a factor of 16 and stores it.
+  On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
+   contain rows 4 through 7.
+  On output, the scaled rows have been written back to %[y].*/
+#define OC_IDCT_8x8_D_STORE \
+  "#OC_IDCT_8x8_D_STORE\n\t" \
+  /*Stage 4: \
+    0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
+    1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
+    2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
+    3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
+  "mov $0x00080008,%[a]\n\t" \
+  "psubw %%xmm3,%%xmm4\n\t" \
+  "movdqa %%xmm4,0x40(%[y])\n\t" \
+  "movd %[a],%%xmm4\n\t" \
+  "psubw %%xmm0,%%xmm7\n\t" \
+  "psubw %%xmm1,%%xmm6\n\t" \
+  "pshufd $0x00,%%xmm4,%%xmm4\n\t" \
+  "psubw %%xmm2,%%xmm5\n\t" \
+  "paddw %%xmm4,%%xmm7\n\t" \
+  "paddw %%xmm4,%%xmm6\n\t" \
+  "paddw %%xmm4,%%xmm5\n\t" \
+  "paddw 0x40(%[y]),%%xmm4\n\t" \
+  "paddw %%xmm0,%%xmm0\n\t" \
+  "paddw %%xmm1,%%xmm1\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm3,%%xmm3\n\t" \
+  "paddw %%xmm7,%%xmm0\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "psraw $4,%%xmm0\n\t" \
+  "paddw %%xmm5,%%xmm2\n\t" \
+  "movdqa %%xmm0,0x00(%[y])\n\t" \
+  "psraw $4,%%xmm1\n\t" \
+  "paddw %%xmm4,%%xmm3\n\t" \
+  "movdqa %%xmm1,0x10(%[y])\n\t" \
+  "psraw $4,%%xmm2\n\t" \
+  "movdqa %%xmm2,0x20(%[y])\n\t" \
+  "psraw $4,%%xmm3\n\t" \
+  "movdqa %%xmm3,0x30(%[y])\n\t" \
+  "psraw $4,%%xmm4\n\t" \
+  "movdqa %%xmm4,0x40(%[y])\n\t" \
+  "psraw $4,%%xmm5\n\t" \
+  "movdqa %%xmm5,0x50(%[y])\n\t" \
+  "psraw $4,%%xmm6\n\t" \
+  "movdqa %%xmm6,0x60(%[y])\n\t" \
+  "psraw $4,%%xmm7\n\t" \
+  "movdqa %%xmm7,0x70(%[y])\n\t" \
+
+static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64]){
+  OC_ALIGN16(ogg_int16_t buf[16]);
+  int a;
+  int b;
+  /*This routine accepts an 8x8 matrix pre-transposed.*/
+  __asm__ __volatile__(
+    /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
+    "movdqa 0x20(%[y]),%%xmm2\n\t"
+    "movdqa 0x60(%[y]),%%xmm6\n\t"
+    "movdqa 0x30(%[y]),%%xmm3\n\t"
+    "movdqa 0x50(%[y]),%%xmm5\n\t"
+    OC_IDCT_8x8_ABC
+    OC_IDCT_8x8_D
+    OC_TRANSPOSE_8x8
+    /*Clear out rows 0, 1, 4, and 7 for the first stage of the iDCT.*/
+    "movdqa %%xmm7,0x70(%[y])\n\t"
+    "movdqa %%xmm4,0x40(%[y])\n\t"
+    "movdqa %%xmm1,0x10(%[y])\n\t"
+    "movdqa %%xmm0,0x00(%[y])\n\t"
+    OC_IDCT_8x8_ABC
+    OC_IDCT_8x8_D_STORE
+    :[a]"=&r"(a),[b]"=&r"(b),[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16))
+    :[y]"r"(_y)
+  );
+}
+
+/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
+   need to work with four columns at a time.
+  Doing this in MMX is faster on processors with a 64-bit data path.*/
+#define OC_IDCT_8x8_10_MMX \
+  "#OC_IDCT_8x8_10_MMX\n\t" \
+  /*Stage 1:*/ \
+  /*2-3 rotation by 6pi/16. \
+    mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
+  "mov $0x61F861F8,%[a]\n\t" \
+  "mov $0xEC83EC83,%[b]\n\t" \
+  "movd %[a],%%mm7\n\t" \
+  "movd %[b],%%mm6\n\t" \
+  "punpckldq %%mm7,%%mm7\n\t" \
+  "punpckldq %%mm6,%%mm6\n\t" \
+  "pmulhw %%mm2,%%mm6\n\t" \
+  "pmulhw %%mm2,%%mm7\n\t" \
+  "mov $0x8E3A8E3A,%[a]\n\t" \
+  "mov $0xD4DBD4DB,%[b]\n\t" \
+  "movd %[a],%%mm5\n\t" \
+  "paddw %%mm6,%%mm2\n\t" \
+  "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+  "movd %[b],%%mm2\n\t" \
+  "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+  /*5-6 rotation by 3pi/16. \
+    mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
+  "punpckldq %%mm2,%%mm2\n\t" \
+  "pmulhw %%mm3,%%mm5\n\t" \
+  "mov $0xFB15FB15,%[b]\n\t" \
+  "pmulhw %%mm3,%%mm2\n\t" \
+  "mov $0x31F131F1,%[a]\n\t" \
+  "movd %[b],%%mm7\n\t" \
+  "paddw %%mm3,%%mm5\n\t" \
+  "paddw %%mm3,%%mm2\n\t" \
+  /*4-7 rotation by 7pi/16. \
+    mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
+  "movd %[a],%%mm3\n\t" \
+  "punpckldq %%mm7,%%mm7\n\t" \
+  "punpckldq %%mm3,%%mm3\n\t" \
+  "pmulhw %%mm1,%%mm3\n\t" \
+  "mov $0xB505B505,%[a]\n\t" \
+  "pmulhw %%mm1,%%mm7\n\t" \
+  "movd %[a],%%mm4\n\t" \
+  "movq %%mm3,%%mm6\n\t" \
+  "paddw %%mm1,%%mm7\n\t" \
+  /*0-1 butterfly. \
+    mm4=C4, mm0=X0, X4=0.*/ \
+  /*Stage 2:*/ \
+  /*4-5 butterfly: mm3=t[4], mm5=t[5] \
+    7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
+  "punpckldq %%mm4,%%mm4\n\t" \
+  "psubw %%mm5,%%mm3\n\t" \
+  "paddw %%mm5,%%mm6\n\t" \
+  "movq %%mm4,%%mm1\n\t" \
+  "pmulhw %%mm0,%%mm4\n\t" \
+  "paddw %%mm0,%%mm4\n\t" \
+  "movq %%mm7,%%mm0\n\t" \
+  "movq %%mm4,%%mm5\n\t" \
+  "paddw %%mm2,%%mm0\n\t" \
+  "psubw %%mm2,%%mm7\n\t" \
+  "movq %%mm1,%%mm2\n\t" \
+  "pmulhw %%mm6,%%mm1\n\t" \
+  "pmulhw %%mm7,%%mm2\n\t" \
+  "paddw %%mm6,%%mm1\n\t" \
+  "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \
+  "paddw %%mm7,%%mm2\n\t" \
+  "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \
+  /*Stage 3: \
+    6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \
+    0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \
+    1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \
+  "paddw %%mm2,%%mm1\n\t" \
+  "paddw %%mm5,%%mm6\n\t" \
+  "paddw %%mm4,%%mm7\n\t" \
+  "paddw %%mm2,%%mm2\n\t" \
+  "paddw %%mm4,%%mm4\n\t" \
+  "paddw %%mm5,%%mm5\n\t" \
+  "psubw %%mm1,%%mm2\n\t" \
+  "psubw %%mm7,%%mm4\n\t" \
+  "psubw %%mm6,%%mm5\n\t" \
+  /*Stage 4: \
+    0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \
+    1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \
+    2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \
+    3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \
+  "psubw %%mm0,%%mm7\n\t" \
+  "psubw %%mm1,%%mm6\n\t" \
+  "psubw %%mm2,%%mm5\n\t" \
+  "psubw %%mm3,%%mm4\n\t" \
+  "paddw %%mm0,%%mm0\n\t" \
+  "paddw %%mm1,%%mm1\n\t" \
+  "paddw %%mm2,%%mm2\n\t" \
+  "paddw %%mm3,%%mm3\n\t" \
+  "paddw %%mm7,%%mm0\n\t" \
+  "paddw %%mm6,%%mm1\n\t" \
+  "paddw %%mm5,%%mm2\n\t" \
+  "paddw %%mm4,%%mm3\n\t" \
+
+#define OC_IDCT_8x8_10_ABC \
+  "#OC_IDCT_8x8_10_ABC\n\t" \
+  /*Stage 1:*/ \
+  /*2-3 rotation by 6pi/16. \
+    xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
+  "mov $0x61F861F8,%[a]\n\t" \
+  "mov $0xEC83EC83,%[b]\n\t" \
+  "movd %[a],%%xmm7\n\t" \
+  "movd %[b],%%xmm6\n\t" \
+  "pshufd $00,%%xmm7,%%xmm7\n\t" \
+  "pshufd $00,%%xmm6,%%xmm6\n\t" \
+  "pmulhw %%xmm2,%%xmm6\n\t" \
+  "pmulhw %%xmm2,%%xmm7\n\t" \
+  "mov $0x8E3A8E3A,%[a]\n\t" \
+  "mov $0xD4DBD4DB,%[b]\n\t" \
+  "movd %[a],%%xmm5\n\t" \
+  "paddw %%xmm6,%%xmm2\n\t" \
+  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+  "movd %[b],%%xmm2\n\t" \
+  "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+  /*5-6 rotation by 3pi/16. \
+    xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
+  "pshufd $00,%%xmm2,%%xmm2\n\t" \
+  "pmulhw %%xmm3,%%xmm5\n\t" \
+  "mov $0xFB15FB15,%[b]\n\t" \
+  "pmulhw %%xmm3,%%xmm2\n\t" \
+  "mov $0x31F131F1,%[a]\n\t" \
+  "movd %[b],%%xmm7\n\t" \
+  "paddw %%xmm3,%%xmm5\n\t" \
+  "paddw %%xmm3,%%xmm2\n\t" \
+  /*4-7 rotation by 7pi/16. \
+    xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
+  "movd %[a],%%xmm3\n\t" \
+  "pshufd $00,%%xmm7,%%xmm7\n\t" \
+  "pshufd $00,%%xmm3,%%xmm3\n\t" \
+  "pmulhw %%xmm1,%%xmm3\n\t" \
+  "mov $0xB505B505,%[a]\n\t" \
+  "pmulhw %%xmm1,%%xmm7\n\t" \
+  "movd %[a],%%xmm4\n\t" \
+  "movdqa %%xmm3,%%xmm6\n\t" \
+  "paddw %%xmm1,%%xmm7\n\t" \
+  /*0-1 butterfly. \
+    xmm4=C4, xmm0=X0, X4=0.*/ \
+  /*Stage 2:*/ \
+  /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \
+    7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
+  "pshufd $00,%%xmm4,%%xmm4\n\t" \
+  "psubw %%xmm5,%%xmm3\n\t" \
+  "paddw %%xmm5,%%xmm6\n\t" \
+  "movdqa %%xmm4,%%xmm1\n\t" \
+  "pmulhw %%xmm0,%%xmm4\n\t" \
+  "paddw %%xmm0,%%xmm4\n\t" \
+  "movdqa %%xmm7,%%xmm0\n\t" \
+  "movdqa %%xmm4,%%xmm5\n\t" \
+  "paddw %%xmm2,%%xmm0\n\t" \
+  "psubw %%xmm2,%%xmm7\n\t" \
+  "movdqa %%xmm1,%%xmm2\n\t" \
+  "pmulhw %%xmm6,%%xmm1\n\t" \
+  "pmulhw %%xmm7,%%xmm2\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
+  "paddw %%xmm7,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
+  /*Stage 3: \
+    6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
+    0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
+    1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
+  "paddw %%xmm2,%%xmm1\n\t" \
+  "paddw %%xmm5,%%xmm6\n\t" \
+  "paddw %%xmm4,%%xmm7\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm4,%%xmm4\n\t" \
+  "paddw %%xmm5,%%xmm5\n\t" \
+  "psubw %%xmm1,%%xmm2\n\t" \
+  "psubw %%xmm7,%%xmm4\n\t" \
+  "psubw %%xmm6,%%xmm5\n\t" \
+
+static void oc_idct8x8_10_sse2(ogg_int16_t _y[64]){
+  OC_ALIGN16(ogg_int16_t buf[16]);
+  int a;
+  int b;
+  /*This routine accepts an 8x8 matrix pre-transposed.*/
+  __asm__ __volatile__(
+    "movq 0x20(%[y]),%%mm2\n\t"
+    "movq 0x30(%[y]),%%mm3\n\t"
+    "movq 0x10(%[y]),%%mm1\n\t"
+    "movq 0x00(%[y]),%%mm0\n\t"
+    OC_IDCT_8x8_10_MMX
+    OC_TRANSPOSE_8x4_MMX2SSE
+    OC_IDCT_8x8_10_ABC
+    OC_IDCT_8x8_D_STORE
+    :[a]"=&r"(a),[b]"=&r"(b),[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16))
+    :[y]"r"(_y)
+  );
+}
+
+/*Performs an inverse 8x8 Type-II DCT transform.
+  The input is assumed to be scaled by a factor of 4 relative to the
+   orthonormal version of the transform.*/
+void oc_idct8x8_sse2(ogg_int16_t _y[64],int _last_zzi){
+  /*_last_zzi is subtly different from an actual count of the number of
+     coefficients we decoded for this block.
+    It contains the value of zzi BEFORE the final token in the block was
+     decoded.
+    In most cases this is an EOB token (the continuation of an EOB run from a
+     previous block counts), and so this is the same as the coefficient count.
+    However, in the case that the last token was NOT an EOB token, but filled
+     the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+    Provided the last token was not a pure zero run, the minimum value it can
+     be is 46, and so that doesn't affect any of the cases in this routine.
+    However, if the last token WAS a pure zero run of length 63, then _last_zzi
+     will be 1 while the number of coefficients decoded is 64.
+    Thus, we will trigger the following special case, where the real
+     coefficient count would not.
+    Note also that a zero run of length 64 will give _last_zzi a value of 0,
+     but we still process the DC coefficient, which might have a non-zero value
+     due to DC prediction.
+    Although convoluted, this is arguably the correct behavior: it allows us to
+     use a smaller transform when the block ends with a long zero run instead
+     of a normal EOB token.
+    It could be smarter... multiple separate zero runs at the end of a block
+     will fool it, but an encoder that generates these really deserves what it
+     gets.
+    Needless to say we inherited this approach from VP3.*/
+  /*Perform the iDCT.*/
+  if(_last_zzi<10)oc_idct8x8_10_sse2(_y);
+  else oc_idct8x8_slow_sse2(_y);
+}
+
+#endif
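The pmulhw/paddw pairs in OC_IDCT_8x8_10_ABC above implement the usual Theora
fixed-point multiply: each cosine constant cos(k*pi/16) is stored as
round(2^16*cos(k*pi/16)), so 0xFB15=C1, 0xD4DB=C3, 0x8E3A=C5, 0x31F1=C7 and
0xB505=C4.  Constants of 0.5 or more exceed 0x7FFF, so the signed pmulhw sees
c-1.0, and the paddw that follows adds x back in.  A scalar sketch of the
idiom (illustrative only; oc_mulhw_c is a hypothetical name, not part of the
patch):

  static ogg_int16_t oc_mulhw_c(ogg_int16_t _x,ogg_uint16_t _c){
    /*pmulhw: the high word of the signed 32-bit product.
      For _c>=0x8000 this evaluates to x*(c/65536-1).*/
    ogg_int16_t hi=(ogg_int16_t)((ogg_int32_t)_x*(ogg_int16_t)_c>>16);
    /*The trailing paddw adds x back in, yielding x*c/65536.*/
    return (ogg_int16_t)(hi+_x);
  }

Constants below 0x8000 (like C7) need no correction, which is why xmm3 gets
no paddw after its pmulhw in the 4-7 rotation, while xmm7 (scaled by C1)
does.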

Modified: experimental/derf/theora-ptalarbvorm/lib/x86/sse2trans.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/sse2trans.h	2010-05-24 22:02:27 UTC (rev 17246)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/sse2trans.h	2010-05-28 05:35:32 UTC (rev 17247)
@@ -108,7 +108,7 @@
 #  define OC_TRANSPOSE_8x8 \
  "#OC_TRANSPOSE_8x8\n\t" \
  /*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \
- "movdqa %%xmm0,(%[buf])\n\t" \
+ "movdqa %%xmm0,"OC_ARRAY_OFFS(0x00,buf)"\n\t" \
  /*xmm0 is free.*/ \
  "movdqa %%xmm2,%%xmm0\n\t" \
  /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
@@ -116,9 +116,9 @@
  /*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
  "punpcklwd %%xmm3,%%xmm0\n\t" \
  /*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \
- "movdqa (%[buf]),%%xmm3\n\t" \
+ "movdqa "OC_ARRAY_OFFS(0x00,buf)",%%xmm3\n\t" \
  /*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \
- "movdqa %%xmm2,0x10(%[buf])\n\t" \
+ "movdqa %%xmm2,"OC_ARRAY_OFFS(0x10,buf)"\n\t" \
  /*xmm2 is free.*/ \
  "movdqa %%xmm6,%%xmm2\n\t" \
  /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
@@ -144,9 +144,9 @@
  /*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
  "punpckhdq %%xmm2,%%xmm1\n\t" \
  /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
- "movdqa 0x10(%[buf]),%%xmm2\n\t" \
+ "movdqa "OC_ARRAY_OFFS(0x10,buf)",%%xmm2\n\t" \
  /*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \
- "movdqa %%xmm1,(%[buf])\n\t" \
+ "movdqa %%xmm1,"OC_ARRAY_OFFS(0x00,buf)"\n\t" \
  /*xmm1 is free.*/ \
  "movdqa %%xmm3,%%xmm1\n\t" \
  /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
@@ -172,9 +172,9 @@
  /*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
  "punpcklqdq %%xmm0,%%xmm2\n\t" \
  /*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
- "movdqa (%[buf]),%%xmm0\n\t" \
+ "movdqa "OC_ARRAY_OFFS(0x00,buf)",%%xmm0\n\t" \
  /*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \
- "movdqa %%xmm2,0x10(%[buf])\n\t" \
+ "movdqa %%xmm2,"OC_ARRAY_OFFS(0x10,buf)"\n\t" \
  /*xmm2 is free.*/ \
  "movdqa %%xmm3,%%xmm2\n\t" \
  /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
@@ -194,8 +194,50 @@
  /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
  "punpckhqdq %%xmm0,%%xmm7\n\t" \
  /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
- "movdqa 0x10(%[buf]),%%xmm0\n\t" \
+ "movdqa "OC_ARRAY_OFFS(0x10,buf)",%%xmm0\n\t" \
 
 # endif
 
+/*Transpose 4 values in each of 8 MMX registers into 8 values in each of the
+   first four SSE registers.
+  No need to be clever here; we have plenty of room.*/
+#  define OC_TRANSPOSE_8x4_MMX2SSE \
+ "#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \
+ "movq2dq %%mm0,%%xmm0\n\t" \
+ "movq2dq %%mm1,%%xmm1\n\t" \
+ /*xmmA = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm0\n\t" \
+ "movq2dq %%mm2,%%xmm3\n\t" \
+ "movq2dq %%mm3,%%xmm2\n\t" \
+ /*xmmC = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm2,%%xmm3\n\t" \
+ "movq2dq %%mm4,%%xmm4\n\t" \
+ "movq2dq %%mm5,%%xmm5\n\t" \
+ /*xmmE = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ "movq2dq %%mm6,%%xmm7\n\t" \
+ "movq2dq %%mm7,%%xmm6\n\t" \
+ /*xmmG = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm6,%%xmm7\n\t" \
+ "movdqa %%xmm0,%%xmm2\n\t" \
+ /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm3,%%xmm0\n\t" \
+ /*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm3,%%xmm2\n\t" \
+ "movdqa %%xmm4,%%xmm5\n\t" \
+ /*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm7,%%xmm4\n\t" \
+ /*xmm5 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm7,%%xmm5\n\t" \
+ "movdqa %%xmm0,%%xmm1\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm4,%%xmm0\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm4,%%xmm1\n\t" \
+ "movdqa %%xmm2,%%xmm3\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm5,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm5,%%xmm3\n\t" \
+
 #endif
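The net effect of OC_TRANSPOSE_8x4_MMX2SSE is the plain 8x4 word transpose
modeled below: eight rows of four coefficients become four xmm registers of
eight values, one register per column.  (A scalar model for illustration
only; oc_transpose_8x4_c is a hypothetical name, not part of the patch.)

  static void oc_transpose_8x4_c(ogg_int16_t _out[4][8],
   const ogg_int16_t _in[8][4]){
    int i;
    int j;
    /*_out[i] models xmm<i>: column i gathered across the 8 input rows.*/
    for(i=0;i<4;i++)for(j=0;j<8;j++)_out[i][j]=_in[j][i];
  }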

Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c	2010-05-24 22:02:27 UTC (rev 17246)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c	2010-05-28 05:35:32 UTC (rev 17247)
@@ -43,13 +43,10 @@
   if(cpu_flags&OC_CPU_X86_SSE2){
 # if defined(OC_X86_64_ASM)
     _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;
-    /*These routines work on x86-32, but are actually slower than the MMX ones
-       on my Core Duo, which is probably the most advanced SSE2 engine any
-       32-bit Intel chip had.*/
+# endif
     _enc->opt_vtable.frag_satd=oc_enc_frag_satd_sse2;
     _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_sse2;
     _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_sse2;
-# endif
   }
 }
 #endif

Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h	2010-05-24 22:02:27 UTC (rev 17246)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h	2010-05-28 05:35:32 UTC (rev 17247)
@@ -19,6 +19,22 @@
 # define _x86_x86int_H (1)
 # include "../internal.h"
 
+/*Converts the expression in the argument to a string.*/
+#define OC_M2STR(_s) #_s
+
+/*Memory operands do not always include an offset.
+  To avoid warnings, we force an offset with %H (which adds 8).*/
+#define OC_MEM_OFFS(_offs,_name) \
+  OC_M2STR(_offs-8+%H[_name])
+
+/*Declare an array operand with an exact size.
+  This tells gcc we're going to clobber this memory region, without having to
+   clobber all of "memory", and lets us access local buffers directly through
+   the stack pointer instead of tying up a separate register to point to them.*/
+#define OC_ARRAY_OPERAND(_type,_ptr,_size) \
+  (*({struct{_type array_value__[(_size)];} *array_addr__=(void *)(_ptr); \
+   array_addr__;}))
+
 void oc_state_vtable_init_x86(oc_theora_state *_state);
 
 void oc_frag_copy_mmx(unsigned char *_dst,
@@ -30,6 +46,7 @@
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
 void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
+void oc_idct8x8_sse2(ogg_int16_t _y[64],int _last_zzi);
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
 void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
@@ -37,6 +54,8 @@
  int _dst_frame,int _src_frame,int _pli);
 void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
  int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
+ int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
 void oc_restore_fpu_mmx(void);
 
 #endif
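Together, OC_ARRAY_OPERAND and OC_MEM_OFFS let an asm block address a local
buffer through whatever base gcc picked (normally the stack pointer) while
declaring exactly which bytes it clobbers.  A minimal sketch patterned on
oc_idct8x8_10_sse2 in sse2idct.c (illustrative only, not part of the patch):

  OC_ALIGN16(ogg_int16_t buf[16]);
  __asm__ __volatile__(
    /*Spill xmm0 into buf[0...7]; OC_MEM_OFFS(0x00,buf) expands to an offset
       expression of the form "0x00-8+%H[buf]", which always carries an
       explicit displacement, so gas never sees a bare "(%esp)" operand.*/
    "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t"
    :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
  );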

Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86state.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86state.c	2010-05-24 22:02:27 UTC (rev 17246)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86state.c	2010-05-28 05:35:32 UTC (rev 17247)
@@ -39,7 +39,28 @@
   64,64,64,64,64,64,64,64,
   64,64,64,64,64,64,64,64,
   64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
+};
+
+/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
+   the destination.*/
+static const unsigned char OC_FZIG_ZAG_SSE2[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3, 4,11,18,25,32,40,
+  33,26,19,12, 5, 6,13,20,
+  27,34,41,48,56,49,42,35,
+  28,21,14, 7,15,22,29,36,
+  43,50,57,58,51,44,37,30,
+  23,31,38,45,52,59,60,53,
+  46,39,47,54,61,62,55,63,
   64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
 };
 
 void oc_state_vtable_init_x86(oc_theora_state *_state){
@@ -58,5 +79,13 @@
     _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
   }
   else oc_state_vtable_init_c(_state);
+  if(_state->cpu_flags&OC_CPU_X86_MMXEXT){
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     oc_state_loop_filter_frag_rows_mmxext;
+  }
+  if(_state->cpu_flags&OC_CPU_X86_SSE2){
+    _state->opt_vtable.idct8x8=oc_idct8x8_sse2;
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_SSE2;
+  }
 }
 #endif
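The new table need not be maintained by hand: it is just OC_FZIG_ZAG with
each in-range destination index transposed (the row and column of the 8x8
position swapped), and the 64 trailing entries left at 64 so that zig-zag
indices past the end of the block still land in a harmless scratch slot.  A
generator sketch (illustrative only; oc_transpose_fzig_zag is a hypothetical
helper, not part of the patch):

  static void oc_transpose_fzig_zag(unsigned char _dst[128],
   const unsigned char _src[128]){
    int i;
    for(i=0;i<128;i++){
      /*Swap the 3-bit row and column of each in-range destination.*/
      _dst[i]=_src[i]<64?(_src[i]&7)<<3|_src[i]>>3:64;
    }
  }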


