[xiph-commits] r16220 - in branches/theora-thusnelda/lib: dec/x86 dec/x86_vc enc/x86 enc/x86_vc

tterribe at svn.xiph.org tterribe at svn.xiph.org
Tue Jul 7 19:06:38 PDT 2009


Author: tterribe
Date: 2009-07-07 19:06:38 -0700 (Tue, 07 Jul 2009)
New Revision: 16220

Modified:
   branches/theora-thusnelda/lib/dec/x86/mmxloop.h
   branches/theora-thusnelda/lib/dec/x86_vc/mmxfrag.c
   branches/theora-thusnelda/lib/dec/x86_vc/mmxfrag.h
   branches/theora-thusnelda/lib/dec/x86_vc/mmxidct.c
   branches/theora-thusnelda/lib/dec/x86_vc/mmxloop.h
   branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c
   branches/theora-thusnelda/lib/enc/x86/mmxencfrag.c
   branches/theora-thusnelda/lib/enc/x86/x86enc.c
   branches/theora-thusnelda/lib/enc/x86_vc/mmxencfrag.c
   branches/theora-thusnelda/lib/enc/x86_vc/mmxfdct.c
Log:
Minor MSVC asm updates/cleanups.


Modified: branches/theora-thusnelda/lib/dec/x86/mmxloop.h
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/mmxloop.h	2009-07-07 23:26:21 UTC (rev 16219)
+++ branches/theora-thusnelda/lib/dec/x86/mmxloop.h	2009-07-08 02:06:38 UTC (rev 16220)
@@ -92,7 +92,7 @@
 
 #define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \
   do{ \
-    ptrdiff_t ystride3; \
+    ptrdiff_t ystride3__; \
     __asm__ __volatile__( \
       /*mm0={a0,...,a7}*/ \
       "movq (%[pix]),%%mm0\n\t" \
@@ -108,7 +108,7 @@
       /*Write it back out.*/ \
       "movq %%mm1,(%[pix],%[ystride])\n\t" \
       "movq %%mm2,(%[pix],%[ystride],2)\n\t" \
-      :[ystride3]"=&r"(ystride3) \
+      :[ystride3]"=&r"(ystride3__) \
       :[pix]"r"(_pix-_ystride*2),[ystride]"r"((ptrdiff_t)(_ystride)), \
        [ll]"r"(_ll) \
       :"memory" \
@@ -118,10 +118,10 @@
 
 #define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \
   do{ \
-    unsigned char *pix; \
-    ptrdiff_t      ystride3; \
-    ptrdiff_t      d; \
-    pix=(_pix)-2; \
+    unsigned char *pix__; \
+    ptrdiff_t      ystride3__; \
+    ptrdiff_t      d__; \
+    pix__=(_pix)-2; \
     __asm__ __volatile__( \
       /*x x x x d0 c0 b0 a0*/ \
       "movd (%[pix]),%%mm0\n\t" \
@@ -204,7 +204,7 @@
       "movw %w[d],1(%[pix],%[ystride],2)\n\t" \
       "shr $16,%[d]\n\t" \
       "movw %w[d],1(%[pix],%[ystride3])\n\t" \
-      :[pix]"+r"(pix),[ystride3]"=&r"(ystride3),[d]"=&r"(d) \
+      :[pix]"+r"(pix__),[ystride3]"=&r"(ystride3__),[d]"=&r"(d__) \
       :[ystride]"r"((ptrdiff_t)(_ystride)),[ll]"r"(_ll) \
       :"memory" \
     ); \

Modified: branches/theora-thusnelda/lib/dec/x86_vc/mmxfrag.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86_vc/mmxfrag.c	2009-07-07 23:26:21 UTC (rev 16219)
+++ branches/theora-thusnelda/lib/dec/x86_vc/mmxfrag.c	2009-07-08 02:06:38 UTC (rev 16220)
@@ -30,137 +30,137 @@
    between rows.*/
 void oc_frag_copy_mmx(unsigned char *_dst,
  const unsigned char *_src,int _ystride){
-  #define SRC edx
-  #define DST eax
-  #define YSTRIDE ecx
-  #define YSTRIDE3 ebx
+#define SRC edx
+#define DST eax
+#define YSTRIDE ecx
+#define YSTRIDE3 ebx
   OC_FRAG_COPY_MMX(_dst,_src,_ystride);
-  #undef SRC
-  #undef DST
-  #undef YSTRIDE
-  #undef YSTRIDE3
+#undef SRC
+#undef DST
+#undef YSTRIDE
+#undef YSTRIDE3
 }
 
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
  const ogg_int16_t *_residue){
-  __asm {
-      #define DST edx
-      #define DST4 esi
-      #define YSTRIDE eax
-      #define YSTRIDE3 ebx
-      #define RESIDUE ecx
-      mov DST, _dst
-      mov YSTRIDE, _ystride
-      mov RESIDUE, _residue
-      lea DST4, [DST+YSTRIDE*4]
-      lea YSTRIDE3, [YSTRIDE+YSTRIDE*2]
-      /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
-      pcmpeqw mm0,mm0
-      /*#0 Load low residue.*/
-      movq mm1,[0*8+RESIDUE]
-      /*#0 Load high residue.*/
-      movq mm2,[1*8+RESIDUE]
-      /*Set mm0 to 0x8000800080008000.*/
-      psllw mm0,15
-      /*#1 Load low residue.*/
-      movq mm3,[2*8+RESIDUE]
-      /*#1 Load high residue.*/
-      movq mm4,[3*8+RESIDUE]
-      /*Set mm0 to 0x0080008000800080.*/
-      psrlw mm0,8
-      /*#2 Load low residue.*/
-      movq mm5,[4*8+RESIDUE]
-      /*#2 Load high residue.*/
-      movq mm6,[5*8+RESIDUE]
-      /*#0 Bias low  residue.*/
-      paddsw mm1,mm0
-      /*#0 Bias high residue.*/
-      paddsw mm2,mm0
-      /*#0 Pack to byte.*/
-      packuswb mm1,mm2
-      /*#1 Bias low  residue.*/
-      paddsw mm3,mm0
-      /*#1 Bias high residue.*/
-      paddsw mm4,mm0
-      /*#1 Pack to byte.*/
-      packuswb mm3,mm4
-      /*#2 Bias low  residue.*/
-      paddsw mm5,mm0
-      /*#2 Bias high residue.*/
-      paddsw mm6,mm0
-      /*#2 Pack to byte.*/
-      packuswb mm5,mm6
-      /*#0 Write row.*/
-      movq [DST],mm1
-      /*#1 Write row.*/
-      movq [DST+YSTRIDE],mm3
-      /*#2 Write row.*/
-      movq [DST+YSTRIDE*2],mm5
-      /*#3 Load low residue.*/
-      movq mm1,[6*8+RESIDUE]
-      /*#3 Load high residue.*/
-      movq mm2,[7*8+RESIDUE]
-      /*#4 Load high residue.*/
-      movq mm3,[8*8+RESIDUE]
-      /*#4 Load high residue.*/
-      movq mm4,[9*8+RESIDUE]
-      /*#5 Load high residue.*/
-      movq mm5,[10*8+RESIDUE]
-      /*#5 Load high residue.*/
-      movq mm6,[11*8+RESIDUE]
-      /*#3 Bias low  residue.*/
-      paddsw mm1,mm0
-      /*#3 Bias high residue.*/
-      paddsw mm2,mm0
-      /*#3 Pack to byte.*/
-      packuswb mm1,mm2
-      /*#4 Bias low  residue.*/
-      paddsw mm3,mm0
-      /*#4 Bias high residue.*/
-      paddsw mm4,mm0
-      /*#4 Pack to byte.*/
-      packuswb mm3,mm4
-      /*#5 Bias low  residue.*/
-      paddsw mm5,mm0
-      /*#5 Bias high residue.*/
-      paddsw mm6,mm0
-      /*#5 Pack to byte.*/
-      packuswb mm5,mm6
-      /*#3 Write row.*/
-      movq [DST+YSTRIDE3],mm1
-      /*#4 Write row.*/
-      movq [DST4],mm3
-      /*#5 Write row.*/
-      movq [DST4+YSTRIDE],mm5
-      /*#6 Load low residue.*/
-      movq mm1,[12*8+RESIDUE]
-      /*#6 Load high residue.*/
-      movq mm2,[13*8+RESIDUE]
-      /*#7 Load low residue.*/
-      movq mm3,[14*8+RESIDUE]
-      /*#7 Load high residue.*/
-      movq mm4,[15*8+RESIDUE]
-      /*#6 Bias low  residue.*/
-      paddsw mm1,mm0
-      /*#6 Bias high residue.*/
-      paddsw mm2,mm0
-      /*#6 Pack to byte.*/
-      packuswb mm1,mm2
-      /*#7 Bias low  residue.*/
-      paddsw mm3,mm0
-      /*#7 Bias high residue.*/
-      paddsw mm4,mm0
-      /*#7 Pack to byte.*/
-      packuswb mm3,mm4
-      /*#6 Write row.*/
-      movq [DST4+YSTRIDE*2],mm1
-      /*#7 Write row.*/
-      movq [DST4+YSTRIDE3],mm3
-      #undef DST
-      #undef DST4
-      #undef YSTRIDE
-      #undef YSTRIDE3
-      #undef RESIDUE         
+  __asm{
+#define DST edx
+#define DST4 esi
+#define YSTRIDE eax
+#define YSTRIDE3 ebx
+#define RESIDUE ecx
+    mov DST,_dst
+    mov YSTRIDE,_ystride
+    mov RESIDUE,_residue
+    lea DST4,[DST+YSTRIDE*4]
+    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
+    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
+    pcmpeqw mm0,mm0
+    /*#0 Load low residue.*/
+    movq mm1,[0*8+RESIDUE]
+    /*#0 Load high residue.*/
+    movq mm2,[1*8+RESIDUE]
+    /*Set mm0 to 0x8000800080008000.*/
+    psllw mm0,15
+    /*#1 Load low residue.*/
+    movq mm3,[2*8+RESIDUE]
+    /*#1 Load high residue.*/
+    movq mm4,[3*8+RESIDUE]
+    /*Set mm0 to 0x0080008000800080.*/
+    psrlw mm0,8
+    /*#2 Load low residue.*/
+    movq mm5,[4*8+RESIDUE]
+    /*#2 Load high residue.*/
+    movq mm6,[5*8+RESIDUE]
+    /*#0 Bias low  residue.*/
+    paddsw mm1,mm0
+    /*#0 Bias high residue.*/
+    paddsw mm2,mm0
+    /*#0 Pack to byte.*/
+    packuswb mm1,mm2
+    /*#1 Bias low  residue.*/
+    paddsw mm3,mm0
+    /*#1 Bias high residue.*/
+    paddsw mm4,mm0
+    /*#1 Pack to byte.*/
+    packuswb mm3,mm4
+    /*#2 Bias low  residue.*/
+    paddsw mm5,mm0
+    /*#2 Bias high residue.*/
+    paddsw mm6,mm0
+    /*#2 Pack to byte.*/
+    packuswb mm5,mm6
+    /*#0 Write row.*/
+    movq [DST],mm1
+    /*#1 Write row.*/
+    movq [DST+YSTRIDE],mm3
+    /*#2 Write row.*/
+    movq [DST+YSTRIDE*2],mm5
+    /*#3 Load low residue.*/
+    movq mm1,[6*8+RESIDUE]
+    /*#3 Load high residue.*/
+    movq mm2,[7*8+RESIDUE]
+    /*#4 Load low residue.*/
+    movq mm3,[8*8+RESIDUE]
+    /*#4 Load high residue.*/
+    movq mm4,[9*8+RESIDUE]
+    /*#5 Load low residue.*/
+    movq mm5,[10*8+RESIDUE]
+    /*#5 Load high residue.*/
+    movq mm6,[11*8+RESIDUE]
+    /*#3 Bias low  residue.*/
+    paddsw mm1,mm0
+    /*#3 Bias high residue.*/
+    paddsw mm2,mm0
+    /*#3 Pack to byte.*/
+    packuswb mm1,mm2
+    /*#4 Bias low  residue.*/
+    paddsw mm3,mm0
+    /*#4 Bias high residue.*/
+    paddsw mm4,mm0
+    /*#4 Pack to byte.*/
+    packuswb mm3,mm4
+    /*#5 Bias low  residue.*/
+    paddsw mm5,mm0
+    /*#5 Bias high residue.*/
+    paddsw mm6,mm0
+    /*#5 Pack to byte.*/
+    packuswb mm5,mm6
+    /*#3 Write row.*/
+    movq [DST+YSTRIDE3],mm1
+    /*#4 Write row.*/
+    movq [DST4],mm3
+    /*#5 Write row.*/
+    movq [DST4+YSTRIDE],mm5
+    /*#6 Load low residue.*/
+    movq mm1,[12*8+RESIDUE]
+    /*#6 Load high residue.*/
+    movq mm2,[13*8+RESIDUE]
+    /*#7 Load low residue.*/
+    movq mm3,[14*8+RESIDUE]
+    /*#7 Load high residue.*/
+    movq mm4,[15*8+RESIDUE]
+    /*#6 Bias low  residue.*/
+    paddsw mm1,mm0
+    /*#6 Bias high residue.*/
+    paddsw mm2,mm0
+    /*#6 Pack to byte.*/
+    packuswb mm1,mm2
+    /*#7 Bias low  residue.*/
+    paddsw mm3,mm0
+    /*#7 Bias high residue.*/
+    paddsw mm4,mm0
+    /*#7 Pack to byte.*/
+    packuswb mm3,mm4
+    /*#6 Write row.*/
+    movq [DST4+YSTRIDE*2],mm1
+    /*#7 Write row.*/
+    movq [DST4+YSTRIDE3],mm3
+#undef DST
+#undef DST4
+#undef YSTRIDE
+#undef YSTRIDE3
+#undef RESIDUE
   }
 }
 
@@ -170,15 +170,15 @@
   /*Zero mm0.*/
   __asm pxor mm0,mm0;
   for(i=4;i-->0;){
-    __asm {
-      #define DST edx
-      #define SRC ecx
-      #define YSTRIDE ebx
-      #define RESIDUE eax
-      mov DST, _dst
-      mov SRC, _src
-      mov YSTRIDE, _ystride
-      mov RESIDUE, _residue
+    __asm{
+#define DST edx
+#define SRC ecx
+#define YSTRIDE ebx
+#define RESIDUE eax
+      mov DST,_dst
+      mov SRC,_src
+      mov YSTRIDE,_ystride
+      mov RESIDUE,_residue
       /*#0 Load source.*/
       movq mm3,[SRC]
       /*#1 Load source.*/
@@ -217,13 +217,13 @@
       movq [DST+YSTRIDE],mm7
       /*Advance dst.*/
       lea DST,[DST+YSTRIDE*2]
-      mov _residue, RESIDUE
-      mov _dst, DST
-      mov _src, SRC
-      #undef DST
-      #undef SRC
-      #undef YSTRIDE
-      #undef RESIDUE
+      mov _residue,RESIDUE
+      mov _dst,DST
+      mov _src,SRC
+#undef DST
+#undef SRC
+#undef YSTRIDE
+#undef RESIDUE
     }
   }
 }
@@ -234,17 +234,17 @@
   /*Zero mm7.*/
   __asm pxor mm7,mm7;
   for(i=4;i-->0;){
-    __asm {
-      #define SRC1 ecx
-      #define SRC2 ebx
-      #define YSTRIDE esi
-      #define RESIDUE edx
-      #define DST eax
-      mov YSTRIDE, _ystride
-      mov DST, _dst
-      mov RESIDUE, _residue
-      mov SRC1, _src1
-      mov SRC2, _src2
+    __asm{
+#define SRC1 ecx
+#define SRC2 ebx
+#define YSTRIDE esi
+#define RESIDUE edx
+#define DST eax
+      mov YSTRIDE,_ystride
+      mov DST,_dst
+      mov RESIDUE,_residue
+      mov SRC1,_src1
+      mov SRC2,_src2
       /*#0 Load src1.*/
       movq mm0,[SRC1]
       /*#0 Load src2.*/
@@ -317,15 +317,15 @@
       add RESIDUE,32
       /*Advance dest ptr.*/
       lea DST,[DST+YSTRIDE*2]
-      mov _dst, DST
-      mov _residue, RESIDUE
-      mov _src1, SRC1
-      mov _src2, SRC2
-      #undef SRC1
-      #undef SRC2
-      #undef YSTRIDE
-      #undef RESIDUE
-      #undef DST
+      mov _dst,DST
+      mov _residue,RESIDUE
+      mov _src1,SRC1
+      mov _src2,SRC2
+#undef SRC1
+#undef SRC2
+#undef YSTRIDE
+#undef RESIDUE
+#undef DST
     }
   }
 }

Modified: branches/theora-thusnelda/lib/dec/x86_vc/mmxfrag.h
===================================================================
--- branches/theora-thusnelda/lib/dec/x86_vc/mmxfrag.h	2009-07-07 23:26:21 UTC (rev 16219)
+++ branches/theora-thusnelda/lib/dec/x86_vc/mmxfrag.h	2009-07-08 02:06:38 UTC (rev 16220)
@@ -1,5 +1,5 @@
-#if !defined(_x86_mmxfrag_H)
-# define _x86_mmxfrag_H (1)
+#if !defined(_x86_vc_mmxfrag_H)
+# define _x86_vc_mmxfrag_H (1)
 # include <stddef.h>
 # include "x86int.h"
 
@@ -11,13 +11,11 @@
   do{ \
     const unsigned char *src; \
     unsigned char       *dst; \
-    ptrdiff_t            ystride3; \
     src=(_src); \
     dst=(_dst); \
-    __asm  mov SRC, src \
-    __asm  mov DST, dst \
-    __asm  mov YSTRIDE, _ystride \
-    __asm  mov YSTRIDE3, ystride3 \
+    __asm  mov SRC,src \
+    __asm  mov DST,dst \
+    __asm  mov YSTRIDE,_ystride \
     /*src+0*ystride*/ \
     __asm  movq mm0,[SRC] \
     /*src+1*ystride*/ \

Modified: branches/theora-thusnelda/lib/dec/x86_vc/mmxidct.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86_vc/mmxidct.c	2009-07-07 23:26:21 UTC (rev 16219)
+++ branches/theora-thusnelda/lib/dec/x86_vc/mmxidct.c	2009-07-08 02:06:38 UTC (rev 16220)
@@ -51,7 +51,7 @@
 };
 
 /*38 cycles*/
-#define OC_IDCT_BEGIN __asm { \
+#define OC_IDCT_BEGIN __asm{ \
   __asm movq mm2,OC_I(3) \
   __asm movq mm6,OC_C(3) \
   __asm movq mm4,mm2 \
@@ -129,7 +129,7 @@
 }
 
 /*38+8=46 cycles.*/
-#define OC_ROW_IDCT __asm {\
+#define OC_ROW_IDCT __asm{ \
   OC_IDCT_BEGIN \
   /*r3=D'*/ \
   __asm  movq mm3,OC_I(2) \
@@ -190,7 +190,7 @@
 
   Since r1 is free at entry, we calculate the Js first.*/
 /*19 cycles.*/
-#define OC_TRANSPOSE __asm { \
+#define OC_TRANSPOSE __asm{ \
   __asm movq mm1,mm4 \
   __asm punpcklwd mm4,mm5 \
   __asm movq OC_I(0),mm0 \
@@ -229,7 +229,7 @@
 }
 
 /*38+19=57 cycles.*/
-#define OC_COLUMN_IDCT __asm {\
+#define OC_COLUMN_IDCT __asm{ \
   OC_IDCT_BEGIN \
   __asm paddw mm2,OC_8 \
   /*r1=H'+H'*/ \
@@ -306,11 +306,11 @@
 static void oc_idct8x8_slow(ogg_int16_t _y[64]){
   /*This routine accepts an 8x8 matrix, but in partially transposed form.
     Every 4x4 block is transposed.*/
-  __asm {
+  __asm{
 #define CONSTS eax
 #define Y edx
-    mov CONSTS, offset OC_IDCT_CONSTS
-    mov Y, _y
+    mov CONSTS,offset OC_IDCT_CONSTS
+    mov Y,_y
 #define OC_I(_k)      [Y+_k*16]
 #define OC_J(_k)      [Y+(_k-4)*16+8]
     OC_ROW_IDCT
@@ -339,7 +339,7 @@
 }
 
 /*25 cycles.*/
-#define OC_IDCT_BEGIN_10 __asm {\
+#define OC_IDCT_BEGIN_10 __asm{ \
   __asm movq mm2,OC_I(3) \
   __asm nop \
   __asm movq mm6,OC_C(3) \
@@ -390,119 +390,119 @@
   __asm paddw mm2,mm6 \
   __asm psubw mm2,mm1 \
   __asm nop \
- }
+}
 
 /*25+8=33 cycles.*/
-#define OC_ROW_IDCT_10 __asm {\
- OC_IDCT_BEGIN_10 \
- /*r3=D'*/ \
-  __asm movq mm3,OC_I(2) \
- /*r4=E'=E-G*/ \
-  __asm psubw mm4,mm7 \
- /*r1=H'+H'*/ \
-  __asm paddw mm1,mm1 \
- /*r7=G+G*/ \
-  __asm paddw mm7,mm7 \
- /*r1=R1=A''+H'*/ \
-  __asm paddw mm1,mm2 \
- /*r7=G'=E+G*/ \
-  __asm paddw mm7,mm4 \
- /*r4=R4=E'-D'*/ \
-  __asm psubw mm4,mm3 \
-  __asm paddw mm3,mm3 \
- /*r6=R6=F'-B''*/ \
-  __asm psubw mm6,mm5 \
-  __asm paddw mm5,mm5 \
- /*r3=R3=E'+D'*/ \
-  __asm paddw mm3,mm4 \
- /*r5=R5=F'+B''*/ \
-  __asm paddw mm5,mm6 \
- /*r7=R7=G'-C'*/ \
-  __asm psubw mm7,mm0 \
-  __asm paddw mm0,mm0 \
- /*Save R1.*/ \
-  __asm movq OC_I(1),mm1 \
- /*r0=R0=G'+C'*/ \
-  __asm paddw mm0,mm7 \
+#define OC_ROW_IDCT_10 __asm{ \
+  OC_IDCT_BEGIN_10 \
+  /*r3=D'*/ \
+   __asm movq mm3,OC_I(2) \
+  /*r4=E'=E-G*/ \
+   __asm psubw mm4,mm7 \
+  /*r1=H'+H'*/ \
+   __asm paddw mm1,mm1 \
+  /*r7=G+G*/ \
+   __asm paddw mm7,mm7 \
+  /*r1=R1=A''+H'*/ \
+   __asm paddw mm1,mm2 \
+  /*r7=G'=E+G*/ \
+   __asm paddw mm7,mm4 \
+  /*r4=R4=E'-D'*/ \
+   __asm psubw mm4,mm3 \
+   __asm paddw mm3,mm3 \
+  /*r6=R6=F'-B''*/ \
+   __asm psubw mm6,mm5 \
+   __asm paddw mm5,mm5 \
+  /*r3=R3=E'+D'*/ \
+   __asm paddw mm3,mm4 \
+  /*r5=R5=F'+B''*/ \
+   __asm paddw mm5,mm6 \
+  /*r7=R7=G'-C'*/ \
+   __asm psubw mm7,mm0 \
+   __asm paddw mm0,mm0 \
+  /*Save R1.*/ \
+   __asm movq OC_I(1),mm1 \
+  /*r0=R0=G'+C'*/ \
+   __asm paddw mm0,mm7 \
 }
 
 /*25+19=44 cycles'*/
-#define OC_COLUMN_IDCT_10 __asm {\
- OC_IDCT_BEGIN_10 \
+#define OC_COLUMN_IDCT_10 __asm{ \
+  OC_IDCT_BEGIN_10 \
   __asm paddw mm2,OC_8 \
- /*r1=H'+H'*/ \
+  /*r1=H'+H'*/ \
   __asm paddw mm1,mm1 \
- /*r1=R1=A''+H'*/ \
+  /*r1=R1=A''+H'*/ \
   __asm paddw mm1,mm2 \
- /*r2=NR2*/ \
+  /*r2=NR2*/ \
   __asm psraw mm2,4 \
- /*r4=E'=E-G*/ \
+  /*r4=E'=E-G*/ \
   __asm psubw mm4,mm7 \
- /*r1=NR1*/ \
+  /*r1=NR1*/ \
   __asm psraw mm1,4 \
- /*r3=D'*/ \
+  /*r3=D'*/ \
   __asm movq mm3,OC_I(2) \
- /*r7=G+G*/ \
+  /*r7=G+G*/ \
   __asm paddw mm7,mm7 \
- /*Store NR2 at I(2).*/ \
+  /*Store NR2 at I(2).*/ \
   __asm movq OC_I(2),mm2 \
- /*r7=G'=E+G*/ \
+  /*r7=G'=E+G*/ \
   __asm paddw mm7,mm4 \
- /*Store NR1 at I(1).*/ \
+  /*Store NR1 at I(1).*/ \
   __asm movq OC_I(1),mm1 \
- /*r4=R4=E'-D'*/ \
+  /*r4=R4=E'-D'*/ \
   __asm psubw mm4,mm3 \
   __asm paddw mm4,OC_8 \
- /*r3=D'+D'*/ \
+  /*r3=D'+D'*/ \
   __asm paddw mm3,mm3 \
- /*r3=R3=E'+D'*/ \
+  /*r3=R3=E'+D'*/ \
   __asm paddw mm3,mm4 \
- /*r4=NR4*/ \
+  /*r4=NR4*/ \
   __asm psraw mm4,4 \
- /*r6=R6=F'-B''*/ \
+  /*r6=R6=F'-B''*/ \
   __asm psubw mm6,mm5 \
- /*r3=NR3*/ \
+  /*r3=NR3*/ \
   __asm psraw mm3,4 \
   __asm paddw mm6,OC_8 \
- /*r5=B''+B''*/ \
+  /*r5=B''+B''*/ \
   __asm paddw mm5,mm5 \
- /*r5=R5=F'+B''*/ \
+  /*r5=R5=F'+B''*/ \
   __asm paddw mm5,mm6 \
- /*r6=NR6*/ \
+  /*r6=NR6*/ \
   __asm psraw mm6,4 \
- /*Store NR4 at J(4).*/ \
+  /*Store NR4 at J(4).*/ \
   __asm movq OC_J(4),mm4 \
- /*r5=NR5*/ \
+  /*r5=NR5*/ \
   __asm psraw mm5,4 \
- /*Store NR3 at I(3).*/ \
+  /*Store NR3 at I(3).*/ \
   __asm movq OC_I(3),mm3 \
- /*r7=R7=G'-C'*/ \
+  /*r7=R7=G'-C'*/ \
   __asm psubw mm7,mm0 \
   __asm paddw mm7,OC_8 \
- /*r0=C'+C'*/ \
+  /*r0=C'+C'*/ \
   __asm paddw mm0,mm0 \
- /*r0=R0=G'+C'*/ \
+  /*r0=R0=G'+C'*/ \
   __asm paddw mm0,mm7 \
- /*r7=NR7*/ \
+  /*r7=NR7*/ \
   __asm psraw mm7,4 \
- /*Store NR6 at J(6).*/ \
+  /*Store NR6 at J(6).*/ \
   __asm movq OC_J(6),mm6 \
- /*r0=NR0*/ \
+  /*r0=NR0*/ \
   __asm psraw mm0,4 \
- /*Store NR5 at J(5).*/ \
+  /*Store NR5 at J(5).*/ \
   __asm movq OC_J(5),mm5 \
- /*Store NR7 at J(7).*/ \
+  /*Store NR7 at J(7).*/ \
   __asm movq OC_J(7),mm7 \
- /*Store NR0 at I(0).*/ \
+  /*Store NR0 at I(0).*/ \
   __asm movq OC_I(0),mm0 \
-} 
+}
 
 static void oc_idct8x8_10(ogg_int16_t _y[64]){
-  __asm {
+  __asm{
 #define CONSTS eax
 #define Y edx
-    mov CONSTS, offset OC_IDCT_CONSTS
-    mov Y, _y
+    mov CONSTS,offset OC_IDCT_CONSTS
+    mov Y,_y
 #define OC_I(_k) [Y+_k*16]
 #define OC_J(_k) [Y+(_k-4)*16+8]
     /*Done with dequant, descramble, and partial transpose.
@@ -581,11 +581,11 @@
        no iDCT rounding.*/
     p=(ogg_int16_t)(_x[0]*(ogg_int32_t)_dc_quant+15>>5);
     /*Fill _y with p.*/
-    __asm {
-      #define Y eax
-      #define P ecx
-      mov Y, _y
-      movd P, p
+    __asm{
+#define Y eax
+#define P ecx
+      mov Y,_y
+      movd P,p
       /*mm0=0000 0000 0000 AAAA*/
       movd mm0,P
       /*mm0=0000 0000 AAAA AAAA*/
@@ -608,17 +608,17 @@
       movq [104+Y],mm0
       movq [112+Y],mm0
       movq [120+Y],mm0
-      #undef Y
-      #undef P
+#undef Y
+#undef P
     }
   }
   else{
     int zzi;
     /*First zero the buffer.*/
     /*On K7, etc., this could be replaced with movntq and sfence.*/
-    __asm {
-      #define Y eax
-      mov Y, _y
+    __asm{
+#define Y eax
+      mov Y,_y
       pxor mm0,mm0
       movq [Y],mm0
       movq [8+Y],mm0
@@ -636,7 +636,7 @@
       movq [104+Y],mm0
       movq [112+Y],mm0
       movq [120+Y],mm0
-      #undef Y
+#undef Y
     }
     /*Dequantize the coefficients.*/
     _y[0]=(ogg_int16_t)(_x[0]*(int)_dc_quant);
@@ -649,5 +649,4 @@
   }
 }
 
-
 #endif

Modified: branches/theora-thusnelda/lib/dec/x86_vc/mmxloop.h
===================================================================
--- branches/theora-thusnelda/lib/dec/x86_vc/mmxloop.h	2009-07-07 23:26:21 UTC (rev 16219)
+++ branches/theora-thusnelda/lib/dec/x86_vc/mmxloop.h	2009-07-08 02:06:38 UTC (rev 16220)
@@ -1,5 +1,5 @@
-#if !defined(_x86_mmxloop_H)
-# define _x86_mmxloop_H (1)
+#if !defined(_x86_vc_mmxloop_H)
+# define _x86_vc_mmxloop_H (1)
 # include <stddef.h>
 # include "x86int.h"
 
@@ -8,100 +8,99 @@
 /*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
   On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
    mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/
-#define OC_LOOP_FILTER8_MMX __asm { \
- /*mm7=0*/ \
-__asm pxor mm7,mm7 \
- /*mm6:mm0={a0,...,a7}*/ \
-__asm movq mm6,mm0 \
-__asm punpcklbw mm0,mm7 \
-__asm punpckhbw mm6,mm7 \
- /*mm3:mm5={d0,...,d7}*/ \
-__asm movq mm5,mm3 \
-__asm punpcklbw mm3,mm7 \
-__asm punpckhbw mm5,mm7 \
- /*mm6:mm0={a0-d0,...,a7-d7}*/ \
-__asm psubw mm0,mm3 \
-__asm psubw mm6,mm5 \
- /*mm3:mm1={b0,...,b7}*/ \
-__asm movq mm3,mm1 \
-__asm punpcklbw mm1,mm7 \
-__asm movq mm4,mm2 \
-__asm punpckhbw mm3,mm7 \
- /*mm5:mm4={c0,...,c7}*/ \
-__asm movq mm5,mm2 \
-__asm punpcklbw mm4,mm7 \
-__asm punpckhbw mm5,mm7 \
- /*mm7={3}x4 \
-   mm5:mm4={c0-b0,...,c7-b7}*/ \
-__asm pcmpeqw mm7,mm7 \
-__asm psubw mm4,mm1 \
-__asm psrlw mm7,14 \
-__asm psubw mm5,mm3 \
- /*Scale by 3.*/ \
-__asm pmullw mm4,mm7 \
-__asm pmullw mm5,mm7 \
- /*mm7={4}x4 \
-   mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
-__asm psrlw mm7,1 \
-__asm paddw mm4,mm0 \
-__asm psllw mm7,2 \
-__asm movq mm0,[LL] \
-__asm paddw mm5,mm6 \
- /*R_i has the range [-127,128], so we compute -R_i instead. \
-   mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
-__asm psubw mm4,mm7 \
-__asm psubw mm5,mm7 \
-__asm psraw mm4,3 \
-__asm psraw mm5,3 \
-__asm pcmpeqb mm7,mm7 \
-__asm packsswb mm4,mm5 \
-__asm pxor mm6,mm6 \
-__asm pxor mm4,mm7 \
-__asm packuswb mm1,mm3 \
- /*Now compute lflim of -mm4 cf. Section 7.10 of the sepc.*/ \
- /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
-    we have to split things by sign (the other option is to work in 16 bits, \
-    but working in 8 bits gives much better parallelism). \
-   We compute abs(R_i), but save a mask of which terms were negative in mm6. \
-   Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
-   Finally, we split mm4 into positive and negative pieces using the mask in \
-    mm6, and add and subtract them as appropriate.*/ \
- /*mm4=abs(-R_i)*/ \
- /*mm7=255-2*L*/ \
-__asm pcmpgtb mm6,mm4 \
-__asm psubb mm7,mm0 \
-__asm pxor mm4,mm6 \
-__asm psubb mm7,mm0 \
-__asm psubb mm4,mm6 \
- /*mm7=255-max(2*L-abs(R_i),0)*/ \
-__asm paddusb mm7,mm4 \
- /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
-__asm paddusb mm4,mm7 \
-__asm psubusb mm4,mm7 \
- /*Now split mm4 by the original sign of -R_i.*/ \
-__asm movq mm5,mm4 \
-__asm pand mm4,mm6 \
-__asm pandn mm6,mm5 \
- /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
- /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
-__asm paddusb mm1,mm4 \
-__asm psubusb mm2,mm4 \
-__asm psubusb mm1,mm6 \
-__asm paddusb mm2,mm6 \
+#define OC_LOOP_FILTER8_MMX __asm{ \
+  /*mm7=0*/ \
+  __asm pxor mm7,mm7 \
+  /*mm6:mm0={a0,...,a7}*/ \
+  __asm movq mm6,mm0 \
+  __asm punpcklbw mm0,mm7 \
+  __asm punpckhbw mm6,mm7 \
+  /*mm3:mm5={d0,...,d7}*/ \
+  __asm movq mm5,mm3 \
+  __asm punpcklbw mm3,mm7 \
+  __asm punpckhbw mm5,mm7 \
+  /*mm6:mm0={a0-d0,...,a7-d7}*/ \
+  __asm psubw mm0,mm3 \
+  __asm psubw mm6,mm5 \
+  /*mm3:mm1={b0,...,b7}*/ \
+  __asm movq mm3,mm1 \
+  __asm punpcklbw mm1,mm7 \
+  __asm movq mm4,mm2 \
+  __asm punpckhbw mm3,mm7 \
+  /*mm5:mm4={c0,...,c7}*/ \
+  __asm movq mm5,mm2 \
+  __asm punpcklbw mm4,mm7 \
+  __asm punpckhbw mm5,mm7 \
+  /*mm7={3}x4 \
+    mm5:mm4={c0-b0,...,c7-b7}*/ \
+  __asm pcmpeqw mm7,mm7 \
+  __asm psubw mm4,mm1 \
+  __asm psrlw mm7,14 \
+  __asm psubw mm5,mm3 \
+  /*Scale by 3.*/ \
+  __asm pmullw mm4,mm7 \
+  __asm pmullw mm5,mm7 \
+  /*mm7={4}x4 \
+    mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
+  __asm psrlw mm7,1 \
+  __asm paddw mm4,mm0 \
+  __asm psllw mm7,2 \
+  __asm movq mm0,[LL] \
+  __asm paddw mm5,mm6 \
+  /*R_i has the range [-127,128], so we compute -R_i instead. \
+    mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
+  __asm psubw mm4,mm7 \
+  __asm psubw mm5,mm7 \
+  __asm psraw mm4,3 \
+  __asm psraw mm5,3 \
+  __asm pcmpeqb mm7,mm7 \
+  __asm packsswb mm4,mm5 \
+  __asm pxor mm6,mm6 \
+  __asm pxor mm4,mm7 \
+  __asm packuswb mm1,mm3 \
+  /*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \
+  /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
+     we have to split things by sign (the other option is to work in 16 bits, \
+     but working in 8 bits gives much better parallelism). \
+    We compute abs(R_i), but save a mask of which terms were negative in mm6. \
+    Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
+    Finally, we split mm4 into positive and negative pieces using the mask in \
+     mm6, and add and subtract them as appropriate.*/ \
+  /*mm4=abs(-R_i)*/ \
+  /*mm7=255-2*L*/ \
+  __asm pcmpgtb mm6,mm4 \
+  __asm psubb mm7,mm0 \
+  __asm pxor mm4,mm6 \
+  __asm psubb mm7,mm0 \
+  __asm psubb mm4,mm6 \
+  /*mm7=255-max(2*L-abs(R_i),0)*/ \
+  __asm paddusb mm7,mm4 \
+  /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
+  __asm paddusb mm4,mm7 \
+  __asm psubusb mm4,mm7 \
+  /*Now split mm4 by the original sign of -R_i.*/ \
+  __asm movq mm5,mm4 \
+  __asm pand mm4,mm6 \
+  __asm pandn mm6,mm5 \
+  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
+  /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
+  __asm paddusb mm1,mm4 \
+  __asm psubusb mm2,mm4 \
+  __asm psubusb mm1,mm6 \
+  __asm paddusb mm2,mm6 \
 }
 
 #define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \
   do{ \
-    ptrdiff_t ystride3; \
-    unsigned char* pix_ = _pix; \
-    OC_ALIGN8(unsigned char* ll_); \
-    ll_ = _ll; \
-    __asm mov YSTRIDE, _ystride \
-    __asm mov YSTRIDE3, ystride3 \
-    __asm mov LL, ll_ \
-    __asm mov PIX, pix_ \
-    __asm sub PIX, YSTRIDE \
-    __asm sub PIX, YSTRIDE \
+    unsigned char *pix__; \
+    unsigned char *ll__; \
+    ll__=(_ll); \
+    pix__=(_pix); \
+    __asm mov YSTRIDE,_ystride \
+    __asm mov LL,ll__ \
+    __asm mov PIX,pix__ \
+    __asm sub PIX,YSTRIDE \
+    __asm sub PIX,YSTRIDE \
     /*mm0={a0,...,a7}*/ \
     __asm movq mm0,[PIX] \
     /*ystride3=_ystride*3*/ \
@@ -121,17 +120,14 @@
 
 #define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \
   do{ \
-    unsigned char *pix; \
-    ptrdiff_t      ystride3; \
-    ptrdiff_t      d; \
-    OC_ALIGN8(unsigned char* ll_); \
-    ll_ = _ll; \
+    unsigned char *ll__; \
+    unsigned char *pix__; \
+    ll__=(_ll); \
     pix=(_pix)-2; \
-    __asm mov PIX, pix \
-    __asm mov YSTRIDE, _ystride \
-    __asm mov YSTRIDE3, ystride3 \
-    __asm mov LL, ll_ \
-    __asm mov D, d \
+    __asm mov PIX,pix__ \
+    __asm mov YSTRIDE,_ystride \
+    __asm mov YSTRIDE3,ystride3__ \
+    __asm mov LL,ll__ \
     /*x x x x d0 c0 b0 a0*/ \
     __asm movd mm0,[PIX] \
     /*x x x x d1 c1 b1 a1*/ \
@@ -216,6 +212,5 @@
   } \
   while(0)
 
-  
 # endif
 #endif

Modified: branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c	2009-07-07 23:26:21 UTC (rev 16219)
+++ branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c	2009-07-08 02:06:38 UTC (rev 16220)
@@ -81,16 +81,16 @@
   for(fragii=0;fragii<_nfragis;fragii++){
     ptrdiff_t frag_buf_off;
     frag_buf_off=frag_buf_offs[_fragis[fragii]];
-    #define SRC edx
-    #define DST eax
-    #define YSTRIDE ecx
-    #define YSTRIDE3 ebx
+#define SRC edx
+#define DST eax
+#define YSTRIDE ecx
+#define YSTRIDE3 ebx
     OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
      src_frame_data+frag_buf_off,ystride);
-    #undef SRC
-    #undef DST
-    #undef YSTRIDE
-    #undef YSTRIDE3
+#undef SRC
+#undef DST
+#undef YSTRIDE
+#undef YSTRIDE3
   }
 }
 
@@ -140,14 +140,12 @@
       if(frags[fragi].coded){
         unsigned char *ref;
         ref=ref_frame_data+frag_buf_offs[fragi];
-
-        #define PIX eax
-        #define YSTRIDE3 edi
-        #define YSTRIDE ecx
-        #define LL edx
-        #define D esi
-        #define D_WORD si
-
+#define PIX eax
+#define YSTRIDE3 edi
+#define YSTRIDE ecx
+#define LL edx
+#define D esi
+#define D_WORD si
         if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll);
         if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll);
         if(fragi+1<fragi_end&&!frags[fragi+1].coded){
@@ -156,13 +154,12 @@
         if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
           OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
         }
-
-        #undef PIX
-        #undef YSTRIDE3
-        #undef YSTRIDE
-        #undef LL
-        #undef D
-        #undef D_WORD
+#undef PIX
+#undef YSTRIDE3
+#undef YSTRIDE
+#undef LL
+#undef D
+#undef D_WORD
       }
       fragi++;
     }

Modified: branches/theora-thusnelda/lib/enc/x86/mmxencfrag.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/mmxencfrag.c	2009-07-07 23:26:21 UTC (rev 16219)
+++ branches/theora-thusnelda/lib/enc/x86/mmxencfrag.c	2009-07-08 02:06:38 UTC (rev 16220)
@@ -448,8 +448,7 @@
    mm7 = d3 c3 b3 a3*/ \
 
 static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
- int _src_ystride,const unsigned char *_ref,int _ref_ystride,
- unsigned _thresh){
+ int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){
   OC_ALIGN8(ogg_int16_t  buf[64]);
   ogg_int16_t *bufp;
   unsigned     ret;

Modified: branches/theora-thusnelda/lib/enc/x86/x86enc.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/x86enc.c	2009-07-07 23:26:21 UTC (rev 16219)
+++ branches/theora-thusnelda/lib/enc/x86/x86enc.c	2009-07-08 02:06:38 UTC (rev 16220)
@@ -42,7 +42,7 @@
   }
   if(cpu_flags&OC_CPU_X86_SSE2){
 # if defined(OC_X86_64_ASM)
-    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;
+    /*_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;*/
 # endif
   }
 }

Modified: branches/theora-thusnelda/lib/enc/x86_vc/mmxencfrag.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86_vc/mmxencfrag.c	2009-07-07 23:26:21 UTC (rev 16219)
+++ branches/theora-thusnelda/lib/enc/x86_vc/mmxencfrag.c	2009-07-08 02:06:38 UTC (rev 16220)
@@ -21,12 +21,12 @@
 
 unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
  const unsigned char *_ref,int _ystride){
-  ptrdiff_t retv;
-  __asm {
-    #define SRC esi
-    #define REF edx
-    #define YSTRIDE ecx
-    #define YSTRIDE3 ebx
+  ptrdiff_t ret;
+  __asm{
+#define SRC esi
+#define REF edx
+#define YSTRIDE ecx
+#define YSTRIDE3 ebx
     mov YSTRIDE,_ystride
     mov SRC,_src
     mov REF,_ref
@@ -70,89 +70,85 @@
     psadbw mm2,mm3
     paddw mm0,mm6
     paddw mm0,mm2
-    movd [retv],mm0
-    #undef SRC
-    #undef REF
-    #undef YSTRIDE
-    #undef YSTRIDE3
+    movd [ret],mm0
+#undef SRC
+#undef REF
+#undef YSTRIDE
+#undef YSTRIDE3
   }
-  return (unsigned)retv;
+  return (unsigned)ret;
 }
 
 unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
  const unsigned char *_ref,int _ystride,unsigned _thresh){
   /*Early termination is for suckers.*/
   return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
-} 
+}
 
-#define OC_SAD2_LOOP __asm \
-/*
-  We want to compute (mm0+mm1 >> 1) on unsigned bytes without overflow, but
-  pavgb computes (mm0+mm1+1 >> 1).
-  The latter is exactly 1 too large when the low bit of two corresponding
-  bytes is only set in one of them.
-  Therefore we pxor the operands, pand to mask out the low bits, and psubb to
-  correct the output of pavgb.
-*/ \
-{ \
-__asm  movq mm6,mm0 \
-__asm  lea REF1,[REF1+YSTRIDE*2] \
-__asm  pxor mm0,mm1 \
-__asm  pavgb mm6,mm1 \
-__asm  lea REF2,[REF2+YSTRIDE*2] \
-__asm  movq mm1,mm2 \
-__asm  pand mm0,mm7 \
-__asm  pavgb mm2,mm3 \
-__asm  pxor mm1,mm3 \
-__asm  movq mm3,[REF2+YSTRIDE] \
-__asm  psubb mm6,mm0 \
-__asm  movq mm0,[REF1] \
-__asm  pand mm1,mm7 \
-__asm  psadbw mm4,mm6 \
-__asm  movd mm6,RET \
-__asm  psubb mm2,mm1 \
-__asm  movq mm1,[REF2] \
-__asm  lea SRC,[SRC+YSTRIDE*2] \
-__asm  psadbw mm5,mm2 \
-__asm  movq mm2,[REF1+YSTRIDE] \
-__asm  paddw mm5,mm4 \
-__asm  movq mm4,[SRC] \
-__asm  paddw mm6,mm5 \
-__asm  movq mm5,[SRC+YSTRIDE] \
-__asm  movd RET,mm6 \
+#define OC_SAD2_LOOP __asm{ \
+  /*We want to compute (mm0+mm1>>1) on unsigned bytes without overflow, but \
+     pavgb computes (mm0+mm1+1>>1). \
+   The latter is exactly 1 too large when the low bit of two corresponding \
+    bytes is only set in one of them. \
+   Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
+    correct the output of pavgb.*/ \
+  __asm  movq mm6,mm0 \
+  __asm  lea REF1,[REF1+YSTRIDE*2] \
+  __asm  pxor mm0,mm1 \
+  __asm  pavgb mm6,mm1 \
+  __asm  lea REF2,[REF2+YSTRIDE*2] \
+  __asm  movq mm1,mm2 \
+  __asm  pand mm0,mm7 \
+  __asm  pavgb mm2,mm3 \
+  __asm  pxor mm1,mm3 \
+  __asm  movq mm3,[REF2+YSTRIDE] \
+  __asm  psubb mm6,mm0 \
+  __asm  movq mm0,[REF1] \
+  __asm  pand mm1,mm7 \
+  __asm  psadbw mm4,mm6 \
+  __asm  movd mm6,RET \
+  __asm  psubb mm2,mm1 \
+  __asm  movq mm1,[REF2] \
+  __asm  lea SRC,[SRC+YSTRIDE*2] \
+  __asm  psadbw mm5,mm2 \
+  __asm  movq mm2,[REF1+YSTRIDE] \
+  __asm  paddw mm5,mm4 \
+  __asm  movq mm4,[SRC] \
+  __asm  paddw mm6,mm5 \
+  __asm  movq mm5,[SRC+YSTRIDE] \
+  __asm  movd RET,mm6 \
 }
 
 /*Same as above, but does not pre-load the next two rows.*/
-#define OC_SAD2_TAIL __asm \
-{ \
-__asm  movq mm6,mm0 \
-__asm  pavgb mm0,mm1 \
-__asm  pxor mm6,mm1 \
-__asm  movq mm1,mm2 \
-__asm  pand mm6,mm7 \
-__asm  pavgb mm2,mm3 \
-__asm  pxor mm1,mm3 \
-__asm  psubb mm0,mm6 \
-__asm  pand mm1,mm7 \
-__asm  psadbw mm4,mm0 \
-__asm  psubb mm2,mm1 \
-__asm  movd mm6,RET \
-__asm  psadbw mm5,mm2 \
-__asm  paddw mm5,mm4 \
-__asm  paddw mm6,mm5 \
-__asm  movd RET,mm6 \
+#define OC_SAD2_TAIL __asm{ \
+  __asm  movq mm6,mm0 \
+  __asm  pavgb mm0,mm1 \
+  __asm  pxor mm6,mm1 \
+  __asm  movq mm1,mm2 \
+  __asm  pand mm6,mm7 \
+  __asm  pavgb mm2,mm3 \
+  __asm  pxor mm1,mm3 \
+  __asm  psubb mm0,mm6 \
+  __asm  pand mm1,mm7 \
+  __asm  psadbw mm4,mm0 \
+  __asm  psubb mm2,mm1 \
+  __asm  movd mm6,RET \
+  __asm  psadbw mm5,mm2 \
+  __asm  paddw mm5,mm4 \
+  __asm  paddw mm6,mm5 \
+  __asm  movd RET,mm6 \
 }
 
 unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
  unsigned _thresh){
-  ptrdiff_t retv;
-  __asm {
-    #define REF1 ecx
-    #define REF2 edi
-    #define YSTRIDE esi
-    #define SRC edx
-    #define RET eax
+  ptrdiff_t ret;
+  __asm{
+#define REF1 ecx
+#define REF2 edi
+#define YSTRIDE esi
+#define SRC edx
+#define RET eax
     mov YSTRIDE,_ystride
     mov SRC,_src
     mov REF1,_ref1
@@ -171,100 +167,100 @@
     OC_SAD2_LOOP
     OC_SAD2_LOOP
     OC_SAD2_TAIL
-    mov [retv], RET
-    #undef REF1
-    #undef REF2
-    #undef YSTRIDE
-    #undef SRC
-    #undef RET
+    mov [ret],RET
+#undef REF1
+#undef REF2
+#undef YSTRIDE
+#undef SRC
+#undef RET
   }
-  return (unsigned)retv;
+  return (unsigned)ret;
 }
 
 /*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their
   16-bit difference in mm0...mm7.*/
-#define OC_LOAD_SUB_8x4(_off) __asm { \
-__asm  movd mm0,[_off+SRC] \
-__asm  movd mm4,[_off+REF] \
-__asm  movd mm1,[_off+SRC+SRC_YSTRIDE] \
-__asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
-__asm  movd mm5,[_off+REF+REF_YSTRIDE] \
-__asm  lea REF,[REF+REF_YSTRIDE*2] \
-__asm  movd mm2,[_off+SRC] \
-__asm  movd mm7,[_off+REF] \
-__asm  movd mm3,[_off+SRC+SRC_YSTRIDE] \
-__asm  movd mm6,[_off+REF+REF_YSTRIDE] \
-__asm  punpcklbw mm0,mm4 \
-__asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
-__asm  punpcklbw mm4,mm4 \
-__asm  lea REF,[REF+REF_YSTRIDE*2] \
-__asm  psubw mm0,mm4 \
-__asm  movd mm4,[_off+SRC] \
-__asm  movq [_off*2+BUF],mm0 \
-__asm  movd mm0,[_off+REF] \
-__asm  punpcklbw mm1,mm5 \
-__asm  punpcklbw mm5,mm5 \
-__asm  psubw mm1,mm5 \
-__asm  movd mm5,[_off+SRC+SRC_YSTRIDE] \
-__asm  punpcklbw mm2,mm7 \
-__asm  punpcklbw mm7,mm7 \
-__asm  psubw mm2,mm7 \
-__asm  movd mm7,[_off+REF+REF_YSTRIDE] \
-__asm  punpcklbw mm3,mm6 \
-__asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
-__asm  punpcklbw mm6,mm6 \
-__asm  psubw mm3,mm6 \
-__asm  movd mm6,[_off+SRC] \
-__asm  punpcklbw mm4,mm0 \
-__asm  lea REF,[REF+REF_YSTRIDE*2] \
-__asm  punpcklbw mm0,mm0 \
-__asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
-__asm  psubw mm4,mm0 \
-__asm  movd mm0,[_off+REF] \
-__asm  punpcklbw mm5,mm7 \
-__asm  neg SRC_YSTRIDE \
-__asm  punpcklbw mm7,mm7 \
-__asm  psubw mm5,mm7 \
-__asm  movd mm7,[_off+SRC+SRC_YSTRIDE] \
-__asm  punpcklbw mm6,mm0 \
-__asm  lea REF,[REF+REF_YSTRIDE*2] \
-__asm  punpcklbw mm0,mm0 \
-__asm  neg REF_YSTRIDE \
-__asm  psubw mm6,mm0 \
-__asm  movd mm0,[_off+REF+REF_YSTRIDE] \
-__asm  lea SRC,[SRC+SRC_YSTRIDE*8] \
-__asm  punpcklbw mm7,mm0 \
-__asm  neg SRC_YSTRIDE \
-__asm  punpcklbw mm0,mm0 \
-__asm  lea REF,[REF+REF_YSTRIDE*8] \
-__asm  psubw mm7,mm0 \
-__asm  neg REF_YSTRIDE \
-__asm  movq mm0,[_off*2+BUF] \
+#define OC_LOAD_SUB_8x4(_off) __asm{ \
+  __asm  movd mm0,[_off+SRC] \
+  __asm  movd mm4,[_off+REF] \
+  __asm  movd mm1,[_off+SRC+SRC_YSTRIDE] \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  movd mm5,[_off+REF+REF_YSTRIDE] \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  movd mm2,[_off+SRC] \
+  __asm  movd mm7,[_off+REF] \
+  __asm  movd mm3,[_off+SRC+SRC_YSTRIDE] \
+  __asm  movd mm6,[_off+REF+REF_YSTRIDE] \
+  __asm  punpcklbw mm0,mm4 \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  punpcklbw mm4,mm4 \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  psubw mm0,mm4 \
+  __asm  movd mm4,[_off+SRC] \
+  __asm  movq [_off*2+BUF],mm0 \
+  __asm  movd mm0,[_off+REF] \
+  __asm  punpcklbw mm1,mm5 \
+  __asm  punpcklbw mm5,mm5 \
+  __asm  psubw mm1,mm5 \
+  __asm  movd mm5,[_off+SRC+SRC_YSTRIDE] \
+  __asm  punpcklbw mm2,mm7 \
+  __asm  punpcklbw mm7,mm7 \
+  __asm  psubw mm2,mm7 \
+  __asm  movd mm7,[_off+REF+REF_YSTRIDE] \
+  __asm  punpcklbw mm3,mm6 \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  punpcklbw mm6,mm6 \
+  __asm  psubw mm3,mm6 \
+  __asm  movd mm6,[_off+SRC] \
+  __asm  punpcklbw mm4,mm0 \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  punpcklbw mm0,mm0 \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  psubw mm4,mm0 \
+  __asm  movd mm0,[_off+REF] \
+  __asm  punpcklbw mm5,mm7 \
+  __asm  neg SRC_YSTRIDE \
+  __asm  punpcklbw mm7,mm7 \
+  __asm  psubw mm5,mm7 \
+  __asm  movd mm7,[_off+SRC+SRC_YSTRIDE] \
+  __asm  punpcklbw mm6,mm0 \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  punpcklbw mm0,mm0 \
+  __asm  neg REF_YSTRIDE \
+  __asm  psubw mm6,mm0 \
+  __asm  movd mm0,[_off+REF+REF_YSTRIDE] \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*8] \
+  __asm  punpcklbw mm7,mm0 \
+  __asm  neg SRC_YSTRIDE \
+  __asm  punpcklbw mm0,mm0 \
+  __asm  lea REF,[REF+REF_YSTRIDE*8] \
+  __asm  psubw mm7,mm0 \
+  __asm  neg REF_YSTRIDE \
+  __asm  movq mm0,[_off*2+BUF] \
 }
 
 /*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
-#define OC_LOAD_8x4(_off) __asm { \
-__asm  movd mm0,[_off+SRC] \
-__asm  movd mm1,[_off+SRC+YSTRIDE] \
-__asm  movd mm2,[_off+SRC+YSTRIDE*2] \
-__asm  pxor mm7,mm7 \
-__asm  movd mm3,[_off+SRC+YSTRIDE3] \
-__asm  punpcklbw mm0,mm7 \
-__asm  movd mm4,[_off+SRC4] \
-__asm  punpcklbw mm1,mm7 \
-__asm  movd mm5,[_off+SRC4+YSTRIDE] \
-__asm  punpcklbw mm2,mm7 \
-__asm  movd mm6,[_off+SRC4+YSTRIDE*2] \
-__asm  punpcklbw mm3,mm7 \
-__asm  movd mm7,[_off+SRC4+YSTRIDE3] \
-__asm  punpcklbw mm4,mm4 \
-__asm  punpcklbw mm5,mm5 \
-__asm  psrlw mm4,8 \
-__asm  psrlw mm5,8 \
-__asm  punpcklbw mm6,mm6 \
-__asm  punpcklbw mm7,mm7 \
-__asm  psrlw mm6,8 \
-__asm  psrlw mm7,8 \
+#define OC_LOAD_8x4(_off) __asm{ \
+  __asm  movd mm0,[_off+SRC] \
+  __asm  movd mm1,[_off+SRC+YSTRIDE] \
+  __asm  movd mm2,[_off+SRC+YSTRIDE*2] \
+  __asm  pxor mm7,mm7 \
+  __asm  movd mm3,[_off+SRC+YSTRIDE3] \
+  __asm  punpcklbw mm0,mm7 \
+  __asm  movd mm4,[_off+SRC4] \
+  __asm  punpcklbw mm1,mm7 \
+  __asm  movd mm5,[_off+SRC4+YSTRIDE] \
+  __asm  punpcklbw mm2,mm7 \
+  __asm  movd mm6,[_off+SRC4+YSTRIDE*2] \
+  __asm  punpcklbw mm3,mm7 \
+  __asm  movd mm7,[_off+SRC4+YSTRIDE3] \
+  __asm  punpcklbw mm4,mm4 \
+  __asm  punpcklbw mm5,mm5 \
+  __asm  psrlw mm4,8 \
+  __asm  psrlw mm5,8 \
+  __asm  punpcklbw mm6,mm6 \
+  __asm  punpcklbw mm7,mm7 \
+  __asm  psrlw mm6,8 \
+  __asm  psrlw mm7,8 \
 }
 
 /*Performs the first two stages of an 8-point 1-D Hadamard transform.
@@ -272,53 +268,53 @@
    outputs 4-7.
   Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
    perform this stage in place with no temporary registers).*/
-#define OC_HADAMARD_AB_8x4 __asm { \
-/*Stage A: \
-  Outputs 0-3 are swapped with 4-7 here.*/ \
-__asm  paddw mm5,mm1 \
-__asm  paddw mm6,mm2 \
-__asm  paddw mm1,mm1 \
-__asm  paddw mm2,mm2 \
-__asm  psubw mm1,mm5 \
-__asm  psubw mm2,mm6 \
-__asm  paddw mm7,mm3 \
-__asm  paddw mm4,mm0 \
-__asm  paddw mm3,mm3 \
-__asm  paddw mm0,mm0 \
-__asm  psubw mm3,mm7 \
-__asm  psubw mm0,mm4 \
- /*Stage B:*/ \
-__asm  paddw mm0,mm2 \
-__asm  paddw mm1,mm3 \
-__asm  paddw mm4,mm6 \
-__asm  paddw mm5,mm7 \
-__asm  paddw mm2,mm2 \
-__asm  paddw mm3,mm3 \
-__asm  paddw mm6,mm6 \
-__asm  paddw mm7,mm7 \
-__asm  psubw mm2,mm0 \
-__asm  psubw mm3,mm1 \
-__asm  psubw mm6,mm4 \
-__asm  psubw mm7,mm5 \
+#define OC_HADAMARD_AB_8x4 __asm{ \
+  /*Stage A: \
+    Outputs 0-3 are swapped with 4-7 here.*/ \
+  __asm  paddw mm5,mm1 \
+  __asm  paddw mm6,mm2 \
+  __asm  paddw mm1,mm1 \
+  __asm  paddw mm2,mm2 \
+  __asm  psubw mm1,mm5 \
+  __asm  psubw mm2,mm6 \
+  __asm  paddw mm7,mm3 \
+  __asm  paddw mm4,mm0 \
+  __asm  paddw mm3,mm3 \
+  __asm  paddw mm0,mm0 \
+  __asm  psubw mm3,mm7 \
+  __asm  psubw mm0,mm4 \
+   /*Stage B:*/ \
+  __asm  paddw mm0,mm2 \
+  __asm  paddw mm1,mm3 \
+  __asm  paddw mm4,mm6 \
+  __asm  paddw mm5,mm7 \
+  __asm  paddw mm2,mm2 \
+  __asm  paddw mm3,mm3 \
+  __asm  paddw mm6,mm6 \
+  __asm  paddw mm7,mm7 \
+  __asm  psubw mm2,mm0 \
+  __asm  psubw mm3,mm1 \
+  __asm  psubw mm6,mm4 \
+  __asm  psubw mm7,mm5 \
 }
 
 /*Performs the last stage of an 8-point 1-D Hadamard transform in place.
   Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
    place with no temporary registers).*/
-#define OC_HADAMARD_C_8x4 __asm { \
- /*Stage C:*/ \
-__asm  paddw mm0,mm1 \
-__asm  paddw mm2,mm3 \
-__asm  paddw mm4,mm5 \
-__asm  paddw mm6,mm7 \
-__asm  paddw mm1,mm1 \
-__asm  paddw mm3,mm3 \
-__asm  paddw mm5,mm5 \
-__asm  paddw mm7,mm7 \
-__asm  psubw mm1,mm0 \
-__asm  psubw mm3,mm2 \
-__asm  psubw mm5,mm4 \
-__asm  psubw mm7,mm6 \
+#define OC_HADAMARD_C_8x4 __asm{ \
+  /*Stage C:*/ \
+  __asm  paddw mm0,mm1 \
+  __asm  paddw mm2,mm3 \
+  __asm  paddw mm4,mm5 \
+  __asm  paddw mm6,mm7 \
+  __asm  paddw mm1,mm1 \
+  __asm  paddw mm3,mm3 \
+  __asm  paddw mm5,mm5 \
+  __asm  paddw mm7,mm7 \
+  __asm  psubw mm1,mm0 \
+  __asm  psubw mm3,mm2 \
+  __asm  psubw mm5,mm4 \
+  __asm  psubw mm7,mm6 \
 }
 
 /*Performs an 8-point 1-D Hadamard transform.
@@ -326,67 +322,67 @@
    outputs 4-7.
   Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
    in place with no temporary registers).*/
-#define OC_HADAMARD_8x4 __asm {\
- OC_HADAMARD_AB_8x4 \
- OC_HADAMARD_C_8x4 \
+#define OC_HADAMARD_8x4 __asm{ \
+  OC_HADAMARD_AB_8x4 \
+  OC_HADAMARD_C_8x4 \
 }
 
 /*Performs the first part of the final stage of the Hadamard transform and
    summing of absolute values.
   At the end of this part, mm1 will contain the DC coefficient of the
    transform.*/
-#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm {\
- /*We use the fact that \
-   (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
-  to merge the final butterfly with the abs and the first stage of \
-  accumulation. \
-   Thus we can avoid using pabsw, which is not available until SSSE3. \
-   Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
-  implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
-  registers). \
-   Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
-   This implementation is only 26 (+4 for spilling registers).*/ \
-__asm  movq [_r7+BUF],mm7 \
-__asm  movq [_r6+BUF],mm6 \
- /*mm7={0x7FFF}x4 \
-  mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
-__asm  pcmpeqb mm7,mm7 \
-__asm  movq mm6,mm0 \
-__asm  psrlw mm7,1 \
-__asm  paddw mm6,mm1 \
-__asm  pmaxsw mm0,mm1 \
-__asm  paddsw mm6,mm7 \
-__asm  psubw mm0,mm6 \
- /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
-  mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
-__asm  movq mm6,mm2 \
-__asm  movq mm1,mm4 \
-__asm  pmaxsw mm2,mm3 \
-__asm  pmaxsw mm4,mm5 \
-__asm  paddw mm6,mm3 \
-__asm  paddw mm1,mm5 \
-__asm  movq mm3,[_r7+BUF] \
+#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm{ \
+  /*We use the fact that \
+      (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
+     to merge the final butterfly with the abs and the first stage of \
+     accumulation. \
+    Thus we can avoid using pabsw, which is not available until SSSE3. \
+    Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
+     implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
+     registers). \
+    Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
+    This implementation is only 26 (+4 for spilling registers).*/ \
+  __asm  movq [_r7+BUF],mm7 \
+  __asm  movq [_r6+BUF],mm6 \
+  /*mm7={0x7FFF}x4 \
+    mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
+  __asm  pcmpeqb mm7,mm7 \
+  __asm  movq mm6,mm0 \
+  __asm  psrlw mm7,1 \
+  __asm  paddw mm6,mm1 \
+  __asm  pmaxsw mm0,mm1 \
+  __asm  paddsw mm6,mm7 \
+  __asm  psubw mm0,mm6 \
+  /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
+    mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
+  __asm  movq mm6,mm2 \
+  __asm  movq mm1,mm4 \
+  __asm  pmaxsw mm2,mm3 \
+  __asm  pmaxsw mm4,mm5 \
+  __asm  paddw mm6,mm3 \
+  __asm  paddw mm1,mm5 \
+  __asm  movq mm3,[_r7+BUF] \
 }
 
 /*Performs the second part of the final stage of the Hadamard transform and
    summing of absolute values.*/
-#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm { \
-__asm  paddsw mm6,mm7 \
-__asm  movq mm5,[_r6+BUF] \
-__asm  paddsw mm1,mm7 \
-__asm  psubw mm2,mm6 \
-__asm  psubw mm4,mm1 \
- /*mm7={1}x4 (needed for the horizontal add that follows) \
-   mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
-__asm  movq mm6,mm3 \
-__asm  pmaxsw mm3,mm5 \
-__asm  paddw mm0,mm2 \
-__asm  paddw mm6,mm5 \
-__asm  paddw mm0,mm4 \
-__asm  paddsw mm6,mm7 \
-__asm  paddw mm0,mm3 \
-__asm  psrlw mm7,14 \
-__asm  psubw mm0,mm6 \
+#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm{ \
+  __asm  paddsw mm6,mm7 \
+  __asm  movq mm5,[_r6+BUF] \
+  __asm  paddsw mm1,mm7 \
+  __asm  psubw mm2,mm6 \
+  __asm  psubw mm4,mm1 \
+  /*mm7={1}x4 (needed for the horizontal add that follows) \
+    mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
+  __asm  movq mm6,mm3 \
+  __asm  pmaxsw mm3,mm5 \
+  __asm  paddw mm0,mm2 \
+  __asm  paddw mm6,mm5 \
+  __asm  paddw mm0,mm4 \
+  __asm  paddsw mm6,mm7 \
+  __asm  paddw mm0,mm3 \
+  __asm  psrlw mm7,14 \
+  __asm  psubw mm0,mm6 \
 }
 
 /*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
@@ -394,18 +390,18 @@
   This is the only portion of SATD which requires MMXEXT (we could use plain
    MMX, but it takes 4 instructions and an extra register to work around the
    lack of a pmaxsw, which is a pretty serious penalty).*/
-#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm {\
- OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
- OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
+#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
+  OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
+  OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
 }
 
 /*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
    component, and accumulates everything into mm0.
   Note that mm0 will have an extra 4 added to each column, and that after
    removing this value, the remainder will be half the conventional value.*/
-#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm {\
- OC_HADAMARD_AB_8x4 \
- OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
+#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
+  OC_HADAMARD_AB_8x4 \
+  OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
 }
 
 /*Performs two 4x4 transposes (mostly) in place.
@@ -413,86 +409,85 @@
    contains rows {a,b,c,d}.
   On output, {0x40,0x50,0x60,0x70}+_off+BUF contains {e,f,g,h}^T, and
    {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
-#define OC_TRANSPOSE_4x4x2(_off) __asm {\
- /*First 4x4 transpose:*/ \
-__asm  movq [0x10+_off+BUF],mm5 \
- /*mm0 = e3 e2 e1 e0 \
-   mm1 = f3 f2 f1 f0 \
-   mm2 = g3 g2 g1 g0 \
-   mm3 = h3 h2 h1 h0*/ \
-__asm  movq mm5,mm2 \
-__asm  punpcklwd mm2,mm3 \
-__asm  punpckhwd mm5,mm3 \
-__asm  movq mm3,mm0 \
-__asm  punpcklwd mm0,mm1 \
-__asm  punpckhwd mm3,mm1 \
- /*mm0 = f1 e1 f0 e0 \
-   mm3 = f3 e3 f2 e2 \
-   mm2 = h1 g1 h0 g0 \
-   mm5 = h3 g3 h2 g2*/ \
-__asm  movq mm1,mm0 \
-__asm  punpckldq mm0,mm2 \
-__asm  punpckhdq mm1,mm2 \
-__asm  movq mm2,mm3 \
-__asm  punpckhdq mm3,mm5 \
-__asm  movq [0x40+_off+BUF],mm0 \
-__asm  punpckldq mm2,mm5 \
- /*mm0 = h0 g0 f0 e0 \
-   mm1 = h1 g1 f1 e1 \
-   mm2 = h2 g2 f2 e2 \
-   mm3 = h3 g3 f3 e3*/ \
-__asm  movq mm5,[0x10+_off+BUF] \
- /*Second 4x4 transpose:*/ \
- /*mm4 = a3 a2 a1 a0 \
-   mm5 = b3 b2 b1 b0 \
-   mm6 = c3 c2 c1 c0 \
-   mm7 = d3 d2 d1 d0*/ \
-__asm  movq mm0,mm6 \
-__asm  punpcklwd mm6,mm7 \
-__asm  movq [0x50+_off+BUF],mm1 \
-__asm  punpckhwd mm0,mm7 \
-__asm  movq mm7,mm4 \
-__asm  punpcklwd mm4,mm5 \
-__asm  movq [0x60+_off+BUF],mm2 \
-__asm  punpckhwd mm7,mm5 \
- /*mm4 = b1 a1 b0 a0 \
-   mm7 = b3 a3 b2 a2 \
-   mm6 = d1 c1 d0 c0 \
-   mm0 = d3 c3 d2 c2*/ \
-__asm  movq mm5,mm4 \
-__asm  punpckldq mm4,mm6 \
-__asm  movq [0x70+_off+BUF],mm3 \
-__asm  punpckhdq mm5,mm6 \
-__asm  movq mm6,mm7 \
-__asm  punpckhdq mm7,mm0 \
-__asm  punpckldq mm6,mm0 \
- /*mm4 = d0 c0 b0 a0 \
-   mm5 = d1 c1 b1 a1 \
-   mm6 = d2 c2 b2 a2 \
-   mm7 = d3 c3 b3 a3*/ \
+#define OC_TRANSPOSE_4x4x2(_off) __asm{ \
+  /*First 4x4 transpose:*/ \
+  __asm  movq [0x10+_off+BUF],mm5 \
+  /*mm0 = e3 e2 e1 e0 \
+    mm1 = f3 f2 f1 f0 \
+    mm2 = g3 g2 g1 g0 \
+    mm3 = h3 h2 h1 h0*/ \
+  __asm  movq mm5,mm2 \
+  __asm  punpcklwd mm2,mm3 \
+  __asm  punpckhwd mm5,mm3 \
+  __asm  movq mm3,mm0 \
+  __asm  punpcklwd mm0,mm1 \
+  __asm  punpckhwd mm3,mm1 \
+  /*mm0 = f1 e1 f0 e0 \
+    mm3 = f3 e3 f2 e2 \
+    mm2 = h1 g1 h0 g0 \
+    mm5 = h3 g3 h2 g2*/ \
+  __asm  movq mm1,mm0 \
+  __asm  punpckldq mm0,mm2 \
+  __asm  punpckhdq mm1,mm2 \
+  __asm  movq mm2,mm3 \
+  __asm  punpckhdq mm3,mm5 \
+  __asm  movq [0x40+_off+BUF],mm0 \
+  __asm  punpckldq mm2,mm5 \
+  /*mm0 = h0 g0 f0 e0 \
+    mm1 = h1 g1 f1 e1 \
+    mm2 = h2 g2 f2 e2 \
+    mm3 = h3 g3 f3 e3*/ \
+  __asm  movq mm5,[0x10+_off+BUF] \
+  /*Second 4x4 transpose:*/ \
+  /*mm4 = a3 a2 a1 a0 \
+    mm5 = b3 b2 b1 b0 \
+    mm6 = c3 c2 c1 c0 \
+    mm7 = d3 d2 d1 d0*/ \
+  __asm  movq mm0,mm6 \
+  __asm  punpcklwd mm6,mm7 \
+  __asm  movq [0x50+_off+BUF],mm1 \
+  __asm  punpckhwd mm0,mm7 \
+  __asm  movq mm7,mm4 \
+  __asm  punpcklwd mm4,mm5 \
+  __asm  movq [0x60+_off+BUF],mm2 \
+  __asm  punpckhwd mm7,mm5 \
+  /*mm4 = b1 a1 b0 a0 \
+    mm7 = b3 a3 b2 a2 \
+    mm6 = d1 c1 d0 c0 \
+    mm0 = d3 c3 d2 c2*/ \
+  __asm  movq mm5,mm4 \
+  __asm  punpckldq mm4,mm6 \
+  __asm  movq [0x70+_off+BUF],mm3 \
+  __asm  punpckhdq mm5,mm6 \
+  __asm  movq mm6,mm7 \
+  __asm  punpckhdq mm7,mm0 \
+  __asm  punpckldq mm6,mm0 \
+  /*mm4 = d0 c0 b0 a0 \
+    mm5 = d1 c1 b1 a1 \
+    mm6 = d2 c2 b2 a2 \
+    mm7 = d3 c3 b3 a3*/ \
 }
 
 static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
- int _src_ystride,const unsigned char *_ref,int _ref_ystride,
- unsigned _thresh){
+ int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){
   OC_ALIGN8(ogg_int16_t  buf[64]);
-  ogg_int16_t *bufp;
-  unsigned   ret1;
-  unsigned   ret2;
+  ogg_int16_t           *bufp;
+  unsigned               ret1;
+  unsigned               ret2;
   bufp=buf;
-  __asm {
-    #define SRC esi
-    #define REF eax
-    #define SRC_YSTRIDE ecx
-    #define REF_YSTRIDE edx
-    #define BUF edi
-    #define RET eax
-    #define RET2 edx
-    mov SRC, _src
-    mov SRC_YSTRIDE, _src_ystride
-    mov REF, _ref
-    mov REF_YSTRIDE, _ref_ystride
-    mov BUF, bufp
+  __asm{
+#define SRC esi
+#define REF eax
+#define SRC_YSTRIDE ecx
+#define REF_YSTRIDE edx
+#define BUF edi
+#define RET eax
+#define RET2 edx
+    mov SRC,_src
+    mov SRC_YSTRIDE,_src_ystride
+    mov REF,_ref
+    mov REF_YSTRIDE,_ref_ystride
+    mov BUF,bufp
     OC_LOAD_SUB_8x4(0x00)
     OC_HADAMARD_8x4
     OC_TRANSPOSE_4x4x2(0x00)
@@ -551,15 +546,15 @@
     movd RET2,mm4
     lea RET,[RET+RET2*2]
     align 16
-    at_end:
-    mov ret1, RET
-    #undef SRC
-    #undef REF
-    #undef SRC_YSTRIDE
-    #undef REF_YSTRIDE
-    #undef BUF
-    #undef RET
-    #undef RET2
+at_end:
+    mov ret1,RET
+#undef SRC
+#undef REF
+#undef SRC_YSTRIDE
+#undef REF_YSTRIDE
+#undef BUF
+#undef RET
+#undef RET2
   }
   return ret1;
 }
@@ -568,24 +563,24 @@
  const unsigned char *_ref,int _ystride,unsigned _thresh){
   return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh);
 }
- 
 
+
 /*Our internal implementation of frag_copy2 takes an extra stride parameter so
    we can share code with oc_enc_frag_satd2_thresh_mmxext().*/
 static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
-  __asm {
+  __asm{
     /*Load the first 3 rows.*/
-    #define DST_YSTRIDE ebx
-    #define SRC_YSTRIDE esi
-    #define DST eax
-    #define SRC1 edx
-    #define SRC2 ecx
-    mov DST_YSTRIDE, _dst_ystride
-    mov SRC_YSTRIDE, _src_ystride
-    mov DST, _dst
-    mov SRC1, _src1
-    mov SRC2, _src2
+#define DST_YSTRIDE ebx
+#define SRC_YSTRIDE esi
+#define DST eax
+#define SRC1 edx
+#define SRC2 ecx
+    mov DST_YSTRIDE,_dst_ystride
+    mov SRC_YSTRIDE,_src_ystride
+    mov DST,_dst
+    mov SRC1,_src1
+    mov SRC2,_src2
     movq mm0,[SRC1]
     movq mm1,[SRC2]
     movq mm2,[SRC1+SRC_YSTRIDE]
@@ -691,16 +686,13 @@
     psubb mm2,mm4
     /*mm2 [row 7] is done, write it out.*/
     movq [DST+DST_YSTRIDE],mm2
-    mov _dst, DST
-    mov _src1, SRC1
-    mov _src2, SRC2
-    #undef SRC1
-    #undef SRC2
-    #undef SRC_YSTRIDE
-    #undef DST_YSTRIDE
-    #undef DST
+#undef SRC1
+#undef SRC2
+#undef SRC_YSTRIDE
+#undef DST_YSTRIDE
+#undef DST
   }
-} 
+}
 
 unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
@@ -708,29 +700,29 @@
   OC_ALIGN8(unsigned char ref[64]);
   oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
   return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh);
-} 
+}
 
 unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
  int _ystride){
   OC_ALIGN8(ogg_int16_t  buf[64]);
-  ogg_int16_t *bufp;
-  unsigned     ret1;
-  unsigned     ret2;
+  ogg_int16_t           *bufp;
+  unsigned               ret1;
+  unsigned               ret2;
   bufp=buf;
-  __asm {
-    #define SRC eax
-    #define SRC4 esi      
-    #define BUF edi
-    #define RET eax
-    #define RET_WORD ax
-    #define RET2 ecx
-    #define YSTRIDE edx
-    #define YSTRIDE3 ecx
-    mov SRC, _src
-    mov BUF, bufp
-    mov YSTRIDE, _ystride
+  __asm{
+#define SRC eax
+#define SRC4 esi
+#define BUF edi
+#define RET eax
+#define RET_WORD ax
+#define RET2 ecx
+#define YSTRIDE edx
+#define YSTRIDE3 ecx
+    mov SRC,_src
+    mov BUF,bufp
+    mov YSTRIDE,_ystride
     /* src4 = src+4*ystride */
-    lea SRC4, [SRC+YSTRIDE*4]
+    lea SRC4,[SRC+YSTRIDE*4]
     /* ystride3 = 3*ystride */
     lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
     OC_LOAD_8x4(0x00)
@@ -790,42 +782,42 @@
     paddd mm4,mm0
     movd RET,mm4
     lea RET,[-64+RET2+RET*2]
-    mov [ret1], RET
-    #undef SRC
-    #undef SRC4
-    #undef BUF
-    #undef RET
-    #undef RET_WORD
-    #undef RET2
-    #undef YSTRIDE
-    #undef YSTRIDE3
+    mov [ret1],RET
+#undef SRC
+#undef SRC4
+#undef BUF
+#undef RET
+#undef RET_WORD
+#undef RET2
+#undef YSTRIDE
+#undef YSTRIDE3
   }
   return ret1;
 }
 
 void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
- const unsigned char *_src, const unsigned char *_ref, int _ystride){
-  int i; 
+ const unsigned char *_src, const unsigned char *_ref,int _ystride){
+  int i;
   __asm  pxor mm7,mm7
-  for (i = 4; i-- > 0; ) {  
-    __asm {
-      #define SRC edx
-      #define YSTRIDE ebx
-      #define RESIDUE eax
-      #define REF ecx
+  for(i=4;i-->0;){
+    __asm{
+#define SRC edx
+#define YSTRIDE ebx
+#define RESIDUE eax
+#define REF ecx
       mov YSTRIDE,_ystride
       mov RESIDUE,_residue
       mov SRC,_src
       mov REF,_ref
-      /*mm0=[src]*/ 
+      /*mm0=[src]*/
       movq mm0,[SRC]
-      /*mm1=[ref]*/       
+      /*mm1=[ref]*/
       movq mm1,[REF]
-      /*mm4=[src+ystride]*/ 
+      /*mm4=[src+ystride]*/
       movq mm4,[SRC+YSTRIDE]
-      /*mm5=[ref+ystride]*/ 
+      /*mm5=[ref+ystride]*/
       movq mm5,[REF+YSTRIDE]
-      /*Compute [src]-[ref].*/ 
+      /*Compute [src]-[ref].*/
       movq mm2,mm0
       punpcklbw mm0,mm7
       movq mm3,mm1
@@ -834,7 +826,7 @@
       punpckhbw mm3,mm7
       psubw mm0,mm1
       psubw mm2,mm3
-      /*Compute [src+ystride]-[ref+ystride].*/ 
+      /*Compute [src+ystride]-[ref+ystride].*/
       movq mm1,mm4
       punpcklbw mm4,mm7
       movq mm3,mm5
@@ -845,7 +837,7 @@
       punpckhbw mm3,mm7
       psubw mm4,mm5
       psubw mm1,mm3
-      /*Write the answer out.*/ 
+      /*Write the answer out.*/
       movq [RESIDUE+0x00],mm0
       movq [RESIDUE+0x08],mm2
       movq [RESIDUE+0x10],mm4
@@ -854,21 +846,21 @@
       mov _residue,RESIDUE
       mov _src,SRC
       mov _ref,REF
-      #undef SRC
-      #undef YSTRIDE
-      #undef RESIDUE
-      #undef REF
+#undef SRC
+#undef YSTRIDE
+#undef RESIDUE
+#undef REF
     }
   }
 }
 
 void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
  const unsigned char *_src,int _ystride){
-  ptrdiff_t ystride3; 
-   __asm {
-    #define YSTRIDE edx
-    #define RESIDUE ecx
-    #define SRC eax
+   __asm{
+#define YSTRIDE edx
+#define YSTRIDE3 ebx
+#define RESIDUE ecx
+#define SRC eax
     mov YSTRIDE,_ystride
     mov RESIDUE,_residue
     mov SRC,_src
@@ -881,11 +873,11 @@
     /*mm2=[src+2*ystride]*/
     movq mm2,[SRC+YSTRIDE*2]
     /*[ystride3]=3*[ystride]*/
-    lea ebx,[YSTRIDE+YSTRIDE*2]
+    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
     /*mm6={0x8000}x4*/
     psllw mm6,15
     /*mm3=[src+3*ystride]*/
-    movq mm3,[SRC+ebx]
+    movq mm3,[SRC+YSTRIDE3]
     /*mm6={128}x4*/
     psrlw mm6,8
     /*mm7=0*/ 
@@ -930,7 +922,7 @@
     movq [RESIDUE+0x38],mm5
     /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
     movq mm2,[SRC+YSTRIDE*2]
-    movq mm3,[SRC+ebx]
+    movq mm3,[SRC+YSTRIDE3]
     movq mm4,mm0
     punpcklbw mm0,mm7
     movq mm5,mm1
@@ -962,9 +954,10 @@
     movq [RESIDUE+0x68],mm4
     movq [RESIDUE+0x70],mm3
     movq [RESIDUE+0x78],mm5
-    #undef YSTRIDE
-    #undef RESIDUE
-    #undef SRC
+#undef YSTRIDE
+#undef YSTRIDE3
+#undef RESIDUE
+#undef SRC
   }
 }
 
@@ -973,4 +966,4 @@
   oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
 }
 
-#endif 
+#endif

Modified: branches/theora-thusnelda/lib/enc/x86_vc/mmxfdct.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86_vc/mmxfdct.c	2009-07-07 23:26:21 UTC (rev 16219)
+++ branches/theora-thusnelda/lib/enc/x86_vc/mmxfdct.c	2009-07-08 02:06:38 UTC (rev 16220)
@@ -15,459 +15,456 @@
 
 #if defined(OC_X86_ASM)
 
-# define OC_FDCT_STAGE1_8x4  __asm\
-{ \
-/*Stage 1:*/ \
-/*mm0=t7'=t0-t7*/ \
-__asm  psubw mm0,mm7 \
-__asm  paddw mm7,mm7 \
-/*mm1=t6'=t1-t6*/ \
-__asm  psubw mm1, mm6 \
-__asm  paddw mm6,mm6 \
-/*mm2=t5'=t2-t5*/ \
-__asm  psubw mm2,mm5 \
-__asm  paddw mm5,mm5 \
-/*mm3=t4'=t3-t4*/ \
-__asm  psubw mm3,mm4 \
-__asm  paddw mm4,mm4 \
-/*mm7=t0'=t0+t7*/ \
-__asm  paddw mm7,mm0 \
-/*mm6=t1'=t1+t6*/  \
-__asm  paddw mm6,mm1 \
-/*mm5=t2'=t2+t5*/ \
-__asm  paddw mm5,mm2 \
-/*mm4=t3'=t3+t4*/ \
-__asm  paddw mm4,mm3\
+#define OC_FDCT_STAGE1_8x4  __asm{ \
+  /*Stage 1:*/ \
+  /*mm0=t7'=t0-t7*/ \
+  __asm  psubw mm0,mm7 \
+  __asm  paddw mm7,mm7 \
+  /*mm1=t6'=t1-t6*/ \
+  __asm  psubw mm1, mm6 \
+  __asm  paddw mm6,mm6 \
+  /*mm2=t5'=t2-t5*/ \
+  __asm  psubw mm2,mm5 \
+  __asm  paddw mm5,mm5 \
+  /*mm3=t4'=t3-t4*/ \
+  __asm  psubw mm3,mm4 \
+  __asm  paddw mm4,mm4 \
+  /*mm7=t0'=t0+t7*/ \
+  __asm  paddw mm7,mm0 \
+  /*mm6=t1'=t1+t6*/  \
+  __asm  paddw mm6,mm1 \
+  /*mm5=t2'=t2+t5*/ \
+  __asm  paddw mm5,mm2 \
+  /*mm4=t3'=t3+t4*/ \
+  __asm  paddw mm4,mm3\
 }
 
-# define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm\
-{ \
- /*Stage 2:*/ \
- /*mm7=t3''=t0'-t3'*/ \
-__asm  psubw mm7,mm4 \
-__asm  paddw mm4,mm4 \
- /*mm6=t2''=t1'-t2'*/ \
-__asm  psubw mm6,mm5 \
-__asm  movq [Y+_r6],mm7 \
-__asm  paddw mm5,mm5 \
- /*mm1=t5''=t6'-t5'*/ \
-__asm  psubw mm1,mm2 \
-__asm  movq [Y+_r2],mm6 \
- /*mm4=t0''=t0'+t3'*/ \
-__asm  paddw mm4,mm7 \
-__asm  paddw mm2,mm2 \
- /*mm5=t1''=t1'+t2'*/ \
-__asm  movq [Y+_r0],mm4 \
-__asm  paddw mm5,mm6 \
- /*mm2=t6''=t6'+t5'*/ \
-__asm  paddw mm2,mm1 \
-__asm  movq [Y+_r4],mm5 \
- /*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \
- /*mm4, mm5, mm6, mm7 are free.*/ \
- /*Stage 3:*/ \
- /*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \
-__asm  mov A,0x5A806A0A \
-__asm  pcmpeqb mm6,mm6 \
-__asm  movd mm7,A \
-__asm  psrlw mm6,15 \
-__asm  punpckldq mm7,mm7 \
-__asm  paddw mm6,mm6 \
- /*mm0=0, m2={-1}x4 \
-   mm5:mm4=t5''*27146+0xB500*/ \
-__asm  movq mm4,mm1 \
-__asm  movq mm5,mm1 \
-__asm  punpcklwd mm4,mm6 \
-__asm  movq [Y+_r3],mm2 \
-__asm  pmaddwd mm4,mm7 \
-__asm  movq [Y+_r7],mm0 \
-__asm  punpckhwd mm5,mm6 \
-__asm  pxor mm0,mm0 \
-__asm  pmaddwd mm5,mm7 \
-__asm  pcmpeqb mm2,mm2 \
- /*mm2=t6'', mm1=t5''+(t5''!=0) \
-   mm4=(t5''*27146+0xB500>>16)*/ \
-__asm  pcmpeqw mm0,mm1 \
-__asm  psrad mm4,16 \
-__asm  psubw mm0,mm2 \
-__asm  movq mm2, [Y+_r3] \
-__asm  psrad mm5,16 \
-__asm  paddw mm1,mm0 \
-__asm  packssdw mm4,mm5 \
- /*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
-__asm  paddw mm4,mm1 \
-__asm  movq mm0, [Y+_r7] \
-__asm  psraw mm4,1 \
-__asm  movq mm1,mm3 \
- /*mm3=t4''=t4'+s*/ \
-__asm  paddw mm3,mm4 \
- /*mm1=t5'''=t4'-s*/ \
-__asm  psubw mm1,mm4 \
- /*mm1=0, mm3={-1}x4 \
-   mm5:mm4=t6''*27146+0xB500*/ \
-__asm  movq mm4,mm2 \
-__asm  movq mm5,mm2 \
-__asm  punpcklwd mm4,mm6 \
-__asm  movq [Y+_r5],mm1 \
-__asm  pmaddwd mm4,mm7 \
-__asm  movq [Y+_r1],mm3 \
-__asm  punpckhwd mm5,mm6 \
-__asm  pxor mm1,mm1 \
-__asm  pmaddwd mm5,mm7 \
-__asm  pcmpeqb mm3,mm3 \
- /*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \
-__asm  psrad mm4,16 \
-__asm  pcmpeqw mm1,mm2 \
-__asm  psrad mm5,16 \
-__asm  psubw mm1,mm3 \
-__asm  packssdw mm4,mm5 \
-__asm  paddw mm2,mm1 \
- /*mm1=t1'' \
-   mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
-__asm  paddw mm4,mm2 \
-__asm  movq mm1,[Y+_r4] \
-__asm  psraw mm4,1 \
-__asm  movq mm2,mm0 \
- /*mm7={54491-0x7FFF,0x7FFF}x2 \
-   mm0=t7''=t7'+s*/ \
-__asm  paddw mm0,mm4 \
- /*mm2=t6'''=t7'-s*/ \
-__asm  psubw mm2,mm4 \
- /*Stage 4:*/ \
- /*mm0=0, mm2=t0'' \
-   mm5:mm4=t1''*27146+0xB500*/ \
-__asm  movq mm4,mm1 \
-__asm  movq mm5,mm1 \
-__asm  punpcklwd mm4,mm6 \
-__asm  movq [Y+_r3],mm2 \
-__asm  pmaddwd mm4,mm7 \
-__asm  movq mm2,[Y+_r0] \
-__asm  punpckhwd mm5,mm6 \
-__asm  movq [Y+_r7],mm0 \
-__asm  pmaddwd mm5,mm7 \
-__asm  pxor mm0,mm0 \
- /*mm7={27146,0x4000>>1}x2 \
-   mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
-__asm  psrad mm4,16 \
-__asm  mov A,0x20006A0A \
-__asm  pcmpeqw mm0,mm1 \
-__asm  movd mm7,A \
-__asm  psrad mm5,16 \
-__asm  psubw mm0,mm3 \
-__asm  packssdw mm4,mm5 \
-__asm  paddw mm0,mm1 \
-__asm  punpckldq mm7,mm7 \
-__asm  paddw mm0,mm4 \
- /*mm6={0x00000E3D}x2 \
-   mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \
-__asm  movq mm4,mm2 \
-__asm  movq mm5,mm2 \
-__asm  punpcklwd mm4,mm6 \
-__asm  mov A,0x0E3D \
-__asm  pmaddwd mm4,mm7 \
-__asm  punpckhwd mm5,mm6 \
-__asm  movd mm6,A \
-__asm  pmaddwd mm5,mm7 \
-__asm  pxor mm1,mm1 \
-__asm  punpckldq mm6,mm6 \
-__asm  pcmpeqw mm1,mm2 \
- /*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
-__asm  psrad mm4,16 \
-__asm  psubw mm1,mm3 \
-__asm  psrad mm5,16 \
-__asm  paddw mm2,mm1 \
-__asm  packssdw mm4,mm5 \
-__asm  movq mm1,[Y+_r5] \
-__asm  paddw mm4,mm2 \
- /*mm2=t6'', mm0=_y[0]=u=r+s>>1 \
-   The naive implementation could cause overflow, so we use u=s+(r-s>>1).*/ \
-__asm  mov A,0x7FFF54DC \
-__asm  psubw mm0,mm4 \
-__asm  movq mm2,[Y+_r3] \
-__asm  psraw mm0,1 \
-__asm  movd mm7,A \
-__asm  paddw mm0,mm4 \
- /*mm7={54491-0x7FFF,0x7FFF}x2 \
-   mm4=_y[4]=v=r-u*/ \
-__asm  psubw mm4,mm0 \
-__asm  punpckldq mm7,mm7 \
-__asm  movq [Y+_r4],mm4 \
- /*mm0=0, mm7={36410}x4 \
-   mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \
-__asm  movq mm4,mm1 \
-__asm  movq mm5,mm1 \
-__asm  punpcklwd mm4,mm1 \
-__asm  mov A,0x8E3A8E3A \
-__asm  pmaddwd mm4,mm7 \
-__asm  movq [Y+_r0],mm0 \
-__asm  punpckhwd mm5,mm1 \
-__asm  pxor mm0,mm0 \
-__asm  pmaddwd mm5,mm7 \
-__asm  pcmpeqw mm1,mm0 \
-__asm  movd mm7,A \
-__asm  psubw mm1,mm3 \
-__asm  punpckldq mm7,mm7 \
-__asm  paddd mm4,mm6 \
-__asm  paddd mm5,mm6 \
- /*mm0=0 \
-   mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \
-__asm  movq mm6,mm2 \
-__asm  movq mm3,mm2 \
-__asm  pmulhw mm6,mm7 \
-__asm  paddw mm1,mm2 \
-__asm  pmullw mm3,mm7 \
-__asm  pxor mm0,mm0 \
-__asm  paddw mm6,mm1 \
-__asm  movq mm1,mm3 \
-__asm  punpckhwd mm3,mm6 \
-__asm  punpcklwd mm1,mm6 \
- /*mm3={-1}x4, mm6={1}x4 \
-   mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
-__asm  paddd mm5,mm3 \
-__asm  paddd mm4,mm1 \
-__asm  psrad mm5,16 \
-__asm  pxor mm6,mm6 \
-__asm  psrad mm4,16 \
-__asm  pcmpeqb mm3,mm3 \
-__asm  packssdw mm4,mm5 \
-__asm  psubw mm6,mm3 \
- /*mm1=t7'', mm7={26568,0x3400}x2 \
-   mm2=s=t6'''-(36410*u>>16)*/ \
-__asm  movq mm1,mm4 \
-__asm  mov A,0x340067C8 \
-__asm  pmulhw mm4,mm7 \
-__asm  movd mm7,A \
-__asm  movq [Y+_r5],mm1 \
-__asm  punpckldq mm7,mm7 \
-__asm  paddw mm4,mm1 \
-__asm  movq mm1,[Y+_r7] \
-__asm  psubw mm2,mm4 \
- /*mm6={0x00007B1B}x2 \
-   mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \
-__asm  movq mm4,mm2 \
-__asm  movq mm5,mm2 \
-__asm  punpcklwd mm4,mm6 \
-__asm  pcmpeqw mm0,mm2 \
-__asm  pmaddwd mm4,mm7 \
-__asm  mov A,0x7B1B \
-__asm  punpckhwd mm5,mm6 \
-__asm  movd mm6,A \
-__asm  pmaddwd mm5,mm7 \
-__asm  psubw mm0,mm3 \
-__asm  punpckldq mm6,mm6 \
- /*mm7={64277-0x7FFF,0x7FFF}x2 \
-   mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
-__asm  psrad mm4,17 \
-__asm  paddw mm2,mm0 \
-__asm  psrad mm5,17 \
-__asm  mov A,0x7FFF7B16 \
-__asm  packssdw mm4,mm5 \
-__asm  movd mm7,A \
-__asm  paddw mm2,mm4 \
-__asm  punpckldq mm7,mm7 \
- /*mm0=0, mm7={12785}x4 \
-   mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \
-__asm  movq mm4,mm1 \
-__asm  movq mm5,mm1 \
-__asm  movq [Y+_r3],mm2 \
-__asm  punpcklwd mm4,mm1 \
-__asm  movq mm2,[Y+_r1] \
-__asm  pmaddwd mm4,mm7 \
-__asm  mov A,0x31F131F1 \
-__asm  punpckhwd mm5,mm1 \
-__asm  pxor mm0,mm0 \
-__asm  pmaddwd mm5,mm7 \
-__asm  pcmpeqw mm1,mm0 \
-__asm  movd mm7,A \
-__asm  psubw mm1,mm3 \
-__asm  punpckldq mm7,mm7 \
-__asm  paddd mm4,mm6 \
-__asm  paddd mm5,mm6 \
- /*mm3:mm1=12785*t4'''+((t7''!=0)<<16)*/ \
-__asm  movq mm6,mm2 \
-__asm  movq mm3,mm2 \
-__asm  pmulhw mm6,mm7 \
-__asm  pmullw mm3,mm7 \
-__asm  paddw mm6,mm1 \
-__asm  movq mm1,mm3 \
-__asm  punpckhwd mm3,mm6 \
-__asm  punpcklwd mm1,mm6 \
- /*mm3={-1}x4, mm6={1}x4 \
-   mm4=_y[1]=u=(12785*t4'''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
-__asm  paddd mm5,mm3 \
-__asm  paddd mm4,mm1 \
-__asm  psrad mm5,16 \
-__asm  pxor mm6,mm6 \
-__asm  psrad mm4,16 \
-__asm  pcmpeqb mm3,mm3 \
-__asm  packssdw mm4,mm5 \
-__asm  psubw mm6,mm3 \
- /*mm1=t3'', mm7={20539,0x3000}x2 \
-   mm4=s=(12785*u>>16)-t4''*/ \
-__asm  movq [Y+_r1],mm4 \
-__asm  pmulhw mm4,mm7 \
-__asm  mov A,0x3000503B \
-__asm  movq mm1,[Y+_r6] \
-__asm  movd mm7,A \
-__asm  psubw mm4,mm2 \
-__asm  punpckldq mm7,mm7 \
- /*mm6={0x00006CB7}x2 \
-   mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \
-__asm  movq mm5,mm4 \
-__asm  movq mm2,mm4 \
-__asm  punpcklwd mm4,mm6 \
-__asm  pcmpeqw mm0,mm2 \
-__asm  pmaddwd mm4,mm7 \
-__asm  mov A,0x6CB7 \
-__asm  punpckhwd mm5,mm6 \
-__asm  movd mm6,A \
-__asm  pmaddwd mm5,mm7 \
-__asm  psubw mm0,mm3 \
-__asm  punpckldq mm6,mm6 \
- /*mm7={60547-0x7FFF,0x7FFF}x2 \
-   mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
-__asm  psrad mm4,20 \
-__asm  paddw mm2,mm0 \
-__asm  psrad mm5,20 \
-__asm  mov A,0x7FFF6C84 \
-__asm  packssdw mm4,mm5 \
-__asm  movd mm7,A \
-__asm  paddw mm2,mm4 \
-__asm  punpckldq mm7,mm7 \
- /*mm0=0, mm7={25080}x4 \
-   mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \
-__asm  movq mm4,mm1 \
-__asm  movq mm5,mm1 \
-__asm  movq [Y+_r7],mm2 \
-__asm  punpcklwd mm4,mm1 \
-__asm  movq mm2,[Y+_r2] \
-__asm  pmaddwd mm4,mm7 \
-__asm  mov A,0x61F861F8 \
-__asm  punpckhwd mm5,mm1 \
-__asm  pxor mm0,mm0 \
-__asm  pmaddwd mm5,mm7 \
-__asm  movd mm7,A \
-__asm  pcmpeqw mm1,mm0 \
-__asm  psubw mm1,mm3 \
-__asm  punpckldq mm7,mm7 \
-__asm  paddd mm4,mm6 \
-__asm  paddd mm5,mm6 \
- /*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \
-__asm  movq mm6,mm2 \
-__asm  movq mm3,mm2 \
-__asm  pmulhw mm6,mm7 \
-__asm  pmullw mm3,mm7 \
-__asm  paddw mm6,mm1 \
-__asm  movq mm1,mm3 \
-__asm  punpckhwd mm3,mm6 \
-__asm  punpcklwd mm1,mm6 \
- /*mm1={-1}x4 \
-   mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
-__asm  paddd mm5,mm3 \
-__asm  paddd mm4,mm1 \
-__asm  psrad mm5,16 \
-__asm  mov A,0x28005460 \
-__asm  psrad mm4,16 \
-__asm  pcmpeqb mm1,mm1 \
-__asm  packssdw mm4,mm5 \
- /*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \
-   mm4=s=(25080*u>>16)-t2''*/ \
-__asm  movq mm6,mm4 \
-__asm  pmulhw mm4,mm7 \
-__asm  pxor mm5,mm5 \
-__asm  movd mm7,A \
-__asm  psubw mm5,mm1 \
-__asm  punpckldq mm7,mm7 \
-__asm  psubw mm4,mm2 \
- /*mm2=s+(s!=0) \
-   mm4:mm3=s*21600+0x2800*/ \
-__asm  movq mm3,mm4 \
-__asm  movq mm2,mm4 \
-__asm  punpckhwd mm4,mm5 \
-__asm  pcmpeqw mm0,mm2 \
-__asm  pmaddwd mm4,mm7 \
-__asm  psubw mm0,mm1 \
-__asm  punpcklwd mm3,mm5 \
-__asm  paddw mm2,mm0 \
-__asm  pmaddwd mm3,mm7 \
- /*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \
-   mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
-__asm  movq mm0,[Y+_r4] \
-__asm  psrad mm4,18 \
-__asm  movq mm5,[Y+_r5] \
-__asm  psrad mm3,18 \
-__asm  movq mm1,[Y+_r7] \
-__asm  packssdw mm3,mm4 \
-__asm  movq mm4,[Y+_r0] \
-__asm  paddw mm3,mm2 \
+#define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \
+  /*Stage 2:*/ \
+  /*mm7=t3''=t0'-t3'*/ \
+  __asm  psubw mm7,mm4 \
+  __asm  paddw mm4,mm4 \
+  /*mm6=t2''=t1'-t2'*/ \
+  __asm  psubw mm6,mm5 \
+  __asm  movq [Y+_r6],mm7 \
+  __asm  paddw mm5,mm5 \
+  /*mm1=t5''=t6'-t5'*/ \
+  __asm  psubw mm1,mm2 \
+  __asm  movq [Y+_r2],mm6 \
+  /*mm4=t0''=t0'+t3'*/ \
+  __asm  paddw mm4,mm7 \
+  __asm  paddw mm2,mm2 \
+  /*mm5=t1''=t1'+t2'*/ \
+  __asm  movq [Y+_r0],mm4 \
+  __asm  paddw mm5,mm6 \
+  /*mm2=t6''=t6'+t5'*/ \
+  __asm  paddw mm2,mm1 \
+  __asm  movq [Y+_r4],mm5 \
+  /*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \
+  /*mm4, mm5, mm6, mm7 are free.*/ \
+  /*Stage 3:*/ \
+  /*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \
+  __asm  mov A,0x5A806A0A \
+  __asm  pcmpeqb mm6,mm6 \
+  __asm  movd mm7,A \
+  __asm  psrlw mm6,15 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddw mm6,mm6 \
+  /*mm0=0, mm2={-1}x4 \
+    mm5:mm4=t5''*27146+0xB500*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  movq [Y+_r3],mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  movq [Y+_r7],mm0 \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  pxor mm0,mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pcmpeqb mm2,mm2 \
+  /*mm2=t6'', mm1=t5''+(t5''!=0) \
+    mm4=(t5''*27146+0xB500>>16)*/ \
+  __asm  pcmpeqw mm0,mm1 \
+  __asm  psrad mm4,16 \
+  __asm  psubw mm0,mm2 \
+  __asm  movq mm2, [Y+_r3] \
+  __asm  psrad mm5,16 \
+  __asm  paddw mm1,mm0 \
+  __asm  packssdw mm4,mm5 \
+  /*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
+  __asm  paddw mm4,mm1 \
+  __asm  movq mm0, [Y+_r7] \
+  __asm  psraw mm4,1 \
+  __asm  movq mm1,mm3 \
+  /*mm3=t4''=t4'+s*/ \
+  __asm  paddw mm3,mm4 \
+  /*mm1=t5'''=t4'-s*/ \
+  __asm  psubw mm1,mm4 \
+  /*mm1=0, mm3={-1}x4 \
+    mm5:mm4=t6''*27146+0xB500*/ \
+  __asm  movq mm4,mm2 \
+  __asm  movq mm5,mm2 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  movq [Y+_r5],mm1 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  movq [Y+_r1],mm3 \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  pxor mm1,mm1 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pcmpeqb mm3,mm3 \
+  /*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \
+  __asm  psrad mm4,16 \
+  __asm  pcmpeqw mm1,mm2 \
+  __asm  psrad mm5,16 \
+  __asm  psubw mm1,mm3 \
+  __asm  packssdw mm4,mm5 \
+  __asm  paddw mm2,mm1 \
+  /*mm1=t1'' \
+    mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
+  __asm  paddw mm4,mm2 \
+  __asm  movq mm1,[Y+_r4] \
+  __asm  psraw mm4,1 \
+  __asm  movq mm2,mm0 \
+  /*mm7={54491-0x7FFF,0x7FFF}x2 \
+    mm0=t7''=t7'+s*/ \
+  __asm  paddw mm0,mm4 \
+  /*mm2=t6'''=t7'-s*/ \
+  __asm  psubw mm2,mm4 \
+  /*Stage 4:*/ \
+  /*mm0=0, mm2=t0'' \
+    mm5:mm4=t1''*27146+0xB500*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  movq [Y+_r3],mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  movq mm2,[Y+_r0] \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  movq [Y+_r7],mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pxor mm0,mm0 \
+  /*mm7={27146,0x4000>>1}x2 \
+    mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
+  __asm  psrad mm4,16 \
+  __asm  mov A,0x20006A0A \
+  __asm  pcmpeqw mm0,mm1 \
+  __asm  movd mm7,A \
+  __asm  psrad mm5,16 \
+  __asm  psubw mm0,mm3 \
+  __asm  packssdw mm4,mm5 \
+  __asm  paddw mm0,mm1 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddw mm0,mm4 \
+  /*mm6={0x00000E3D}x2 \
+    mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \
+  __asm  movq mm4,mm2 \
+  __asm  movq mm5,mm2 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  mov A,0x0E3D \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  movd mm6,A \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pxor mm1,mm1 \
+  __asm  punpckldq mm6,mm6 \
+  __asm  pcmpeqw mm1,mm2 \
+  /*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
+  __asm  psrad mm4,16 \
+  __asm  psubw mm1,mm3 \
+  __asm  psrad mm5,16 \
+  __asm  paddw mm2,mm1 \
+  __asm  packssdw mm4,mm5 \
+  __asm  movq mm1,[Y+_r5] \
+  __asm  paddw mm4,mm2 \
+  /*mm2=t6'', mm0=_y[0]=u=r+s>>1 \
+    The naive implementation could cause overflow, so we use u=s+(r-s>>1).*/ \
+  __asm  mov A,0x7FFF54DC \
+  __asm  psubw mm0,mm4 \
+  __asm  movq mm2,[Y+_r3] \
+  __asm  psraw mm0,1 \
+  __asm  movd mm7,A \
+  __asm  paddw mm0,mm4 \
+  /*mm7={54491-0x7FFF,0x7FFF}x2 \
+    mm4=_y[4]=v=r-u*/ \
+  __asm  psubw mm4,mm0 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  movq [Y+_r4],mm4 \
+  /*mm0=0, mm7={36410}x4 \
+    mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  punpcklwd mm4,mm1 \
+  __asm  mov A,0x8E3A8E3A \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  movq [Y+_r0],mm0 \
+  __asm  punpckhwd mm5,mm1 \
+  __asm  pxor mm0,mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pcmpeqw mm1,mm0 \
+  __asm  movd mm7,A \
+  __asm  psubw mm1,mm3 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddd mm4,mm6 \
+  __asm  paddd mm5,mm6 \
+  /*mm0=0 \
+    mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \
+  __asm  movq mm6,mm2 \
+  __asm  movq mm3,mm2 \
+  __asm  pmulhw mm6,mm7 \
+  __asm  paddw mm1,mm2 \
+  __asm  pmullw mm3,mm7 \
+  __asm  pxor mm0,mm0 \
+  __asm  paddw mm6,mm1 \
+  __asm  movq mm1,mm3 \
+  __asm  punpckhwd mm3,mm6 \
+  __asm  punpcklwd mm1,mm6 \
+  /*mm3={-1}x4, mm6={1}x4 \
+    mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
+  __asm  paddd mm5,mm3 \
+  __asm  paddd mm4,mm1 \
+  __asm  psrad mm5,16 \
+  __asm  pxor mm6,mm6 \
+  __asm  psrad mm4,16 \
+  __asm  pcmpeqb mm3,mm3 \
+  __asm  packssdw mm4,mm5 \
+  __asm  psubw mm6,mm3 \
+  /*mm1=t7'', mm7={26568,0x3400}x2 \
+    mm2=s=t6'''-(36410*u>>16)*/ \
+  __asm  movq mm1,mm4 \
+  __asm  mov A,0x340067C8 \
+  __asm  pmulhw mm4,mm7 \
+  __asm  movd mm7,A \
+  __asm  movq [Y+_r5],mm1 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddw mm4,mm1 \
+  __asm  movq mm1,[Y+_r7] \
+  __asm  psubw mm2,mm4 \
+  /*mm6={0x00007B1B}x2 \
+    mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \
+  __asm  movq mm4,mm2 \
+  __asm  movq mm5,mm2 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  pcmpeqw mm0,mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  mov A,0x7B1B \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  movd mm6,A \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  psubw mm0,mm3 \
+  __asm  punpckldq mm6,mm6 \
+  /*mm7={64277-0x7FFF,0x7FFF}x2 \
+    mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
+  __asm  psrad mm4,17 \
+  __asm  paddw mm2,mm0 \
+  __asm  psrad mm5,17 \
+  __asm  mov A,0x7FFF7B16 \
+  __asm  packssdw mm4,mm5 \
+  __asm  movd mm7,A \
+  __asm  paddw mm2,mm4 \
+  __asm  punpckldq mm7,mm7 \
+  /*mm0=0, mm7={12785}x4 \
+    mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  movq [Y+_r3],mm2 \
+  __asm  punpcklwd mm4,mm1 \
+  __asm  movq mm2,[Y+_r1] \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  mov A,0x31F131F1 \
+  __asm  punpckhwd mm5,mm1 \
+  __asm  pxor mm0,mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pcmpeqw mm1,mm0 \
+  __asm  movd mm7,A \
+  __asm  psubw mm1,mm3 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddd mm4,mm6 \
+  __asm  paddd mm5,mm6 \
+  /*mm3:mm1=12785*t4'''+((t7''!=0)<<16)*/ \
+  __asm  movq mm6,mm2 \
+  __asm  movq mm3,mm2 \
+  __asm  pmulhw mm6,mm7 \
+  __asm  pmullw mm3,mm7 \
+  __asm  paddw mm6,mm1 \
+  __asm  movq mm1,mm3 \
+  __asm  punpckhwd mm3,mm6 \
+  __asm  punpcklwd mm1,mm6 \
+  /*mm3={-1}x4, mm6={1}x4 \
+    mm4=_y[1]=u=(12785*t4'''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
+  __asm  paddd mm5,mm3 \
+  __asm  paddd mm4,mm1 \
+  __asm  psrad mm5,16 \
+  __asm  pxor mm6,mm6 \
+  __asm  psrad mm4,16 \
+  __asm  pcmpeqb mm3,mm3 \
+  __asm  packssdw mm4,mm5 \
+  __asm  psubw mm6,mm3 \
+  /*mm1=t3'', mm7={20539,0x3000}x2 \
+    mm4=s=(12785*u>>16)-t4''*/ \
+  __asm  movq [Y+_r1],mm4 \
+  __asm  pmulhw mm4,mm7 \
+  __asm  mov A,0x3000503B \
+  __asm  movq mm1,[Y+_r6] \
+  __asm  movd mm7,A \
+  __asm  psubw mm4,mm2 \
+  __asm  punpckldq mm7,mm7 \
+  /*mm6={0x00006CB7}x2 \
+    mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \
+  __asm  movq mm5,mm4 \
+  __asm  movq mm2,mm4 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  pcmpeqw mm0,mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  mov A,0x6CB7 \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  movd mm6,A \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  psubw mm0,mm3 \
+  __asm  punpckldq mm6,mm6 \
+  /*mm7={60547-0x7FFF,0x7FFF}x2 \
+    mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
+  __asm  psrad mm4,20 \
+  __asm  paddw mm2,mm0 \
+  __asm  psrad mm5,20 \
+  __asm  mov A,0x7FFF6C84 \
+  __asm  packssdw mm4,mm5 \
+  __asm  movd mm7,A \
+  __asm  paddw mm2,mm4 \
+  __asm  punpckldq mm7,mm7 \
+  /*mm0=0, mm7={25080}x4 \
+    mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  movq [Y+_r7],mm2 \
+  __asm  punpcklwd mm4,mm1 \
+  __asm  movq mm2,[Y+_r2] \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  mov A,0x61F861F8 \
+  __asm  punpckhwd mm5,mm1 \
+  __asm  pxor mm0,mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  movd mm7,A \
+  __asm  pcmpeqw mm1,mm0 \
+  __asm  psubw mm1,mm3 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddd mm4,mm6 \
+  __asm  paddd mm5,mm6 \
+  /*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \
+  __asm  movq mm6,mm2 \
+  __asm  movq mm3,mm2 \
+  __asm  pmulhw mm6,mm7 \
+  __asm  pmullw mm3,mm7 \
+  __asm  paddw mm6,mm1 \
+  __asm  movq mm1,mm3 \
+  __asm  punpckhwd mm3,mm6 \
+  __asm  punpcklwd mm1,mm6 \
+  /*mm1={-1}x4 \
+    mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
+  __asm  paddd mm5,mm3 \
+  __asm  paddd mm4,mm1 \
+  __asm  psrad mm5,16 \
+  __asm  mov A,0x28005460 \
+  __asm  psrad mm4,16 \
+  __asm  pcmpeqb mm1,mm1 \
+  __asm  packssdw mm4,mm5 \
+  /*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \
+    mm4=s=(25080*u>>16)-t2''*/ \
+  __asm  movq mm6,mm4 \
+  __asm  pmulhw mm4,mm7 \
+  __asm  pxor mm5,mm5 \
+  __asm  movd mm7,A \
+  __asm  psubw mm5,mm1 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  psubw mm4,mm2 \
+  /*mm2=s+(s!=0) \
+    mm4:mm3=s*21600+0x2800*/ \
+  __asm  movq mm3,mm4 \
+  __asm  movq mm2,mm4 \
+  __asm  punpckhwd mm4,mm5 \
+  __asm  pcmpeqw mm0,mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  psubw mm0,mm1 \
+  __asm  punpcklwd mm3,mm5 \
+  __asm  paddw mm2,mm0 \
+  __asm  pmaddwd mm3,mm7 \
+  /*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \
+    mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
+  __asm  movq mm0,[Y+_r4] \
+  __asm  psrad mm4,18 \
+  __asm  movq mm5,[Y+_r5] \
+  __asm  psrad mm3,18 \
+  __asm  movq mm1,[Y+_r7] \
+  __asm  packssdw mm3,mm4 \
+  __asm  movq mm4,[Y+_r0] \
+  __asm  paddw mm3,mm2 \
 }
 
 /*On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7].
   On output, {_y[4],mm1,mm2,mm3} contains the transpose of _y[4...7] and
    {mm4,mm5,mm6,mm7} contains the transpose of _y[0...3].*/
-# define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm \
-{ \
- /*First 4x4 transpose:*/ \
- /*mm0 = e3 e2 e1 e0 \
-   mm5 = f3 f2 f1 f0 \
-   mm3 = g3 g2 g1 g0 \
-   mm1 = h3 h2 h1 h0*/ \
-__asm  movq mm2,mm0 \
-__asm  punpcklwd mm0,mm5 \
-__asm  punpckhwd mm2,mm5 \
-__asm  movq mm5,mm3 \
-__asm  punpcklwd mm3,mm1 \
-__asm  punpckhwd mm5,mm1 \
- /*mm0 = f1 e1 f0 e0 \
-   mm2 = f3 e3 f2 e2 \
-   mm3 = h1 g1 h0 g0 \
-   mm5 = h3 g3 h2 g2*/ \
-__asm  movq mm1,mm0 \
-__asm  punpckldq mm0,mm3 \
-__asm  movq [Y+_r4],mm0 \
-__asm  punpckhdq mm1,mm3 \
-__asm  movq mm0,[Y+_r1] \
-__asm  movq mm3,mm2 \
-__asm  punpckldq mm2,mm5 \
-__asm  punpckhdq mm3,mm5 \
-__asm  movq mm5,[Y+_r3] \
- /*_y[4] = h0 g0 f0 e0 \
-  mm1  = h1 g1 f1 e1 \
-  mm2  = h2 g2 f2 e2 \
-  mm3  = h3 g3 f3 e3*/ \
- /*Second 4x4 transpose:*/ \
- /*mm4 = a3 a2 a1 a0 \
-   mm0 = b3 b2 b1 b0 \
-   mm6 = c3 c2 c1 c0 \
-   mm5 = d3 d2 d1 d0*/ \
-__asm  movq mm7,mm4 \
-__asm  punpcklwd mm4,mm0 \
-__asm  punpckhwd mm7,mm0 \
-__asm  movq mm0,mm6 \
-__asm  punpcklwd mm6,mm5 \
-__asm  punpckhwd mm0,mm5 \
- /*mm4 = b1 a1 b0 a0 \
-   mm7 = b3 a3 b2 a2 \
-   mm6 = d1 c1 d0 c0 \
-   mm0 = d3 c3 d2 c2*/ \
-__asm  movq mm5,mm4 \
-__asm  punpckldq mm4,mm6 \
-__asm  punpckhdq mm5,mm6 \
-__asm  movq mm6,mm7 \
-__asm  punpckhdq mm7,mm0 \
-__asm  punpckldq mm6,mm0 \
- /*mm4 = d0 c0 b0 a0 \
-   mm5 = d1 c1 b1 a1 \
-   mm6 = d2 c2 b2 a2 \
-   mm7 = d3 c3 b3 a3*/ \
+#define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \
+  /*First 4x4 transpose:*/ \
+  /*mm0 = e3 e2 e1 e0 \
+    mm5 = f3 f2 f1 f0 \
+    mm3 = g3 g2 g1 g0 \
+    mm1 = h3 h2 h1 h0*/ \
+  __asm  movq mm2,mm0 \
+  __asm  punpcklwd mm0,mm5 \
+  __asm  punpckhwd mm2,mm5 \
+  __asm  movq mm5,mm3 \
+  __asm  punpcklwd mm3,mm1 \
+  __asm  punpckhwd mm5,mm1 \
+  /*mm0 = f1 e1 f0 e0 \
+    mm2 = f3 e3 f2 e2 \
+    mm3 = h1 g1 h0 g0 \
+    mm5 = h3 g3 h2 g2*/ \
+  __asm  movq mm1,mm0 \
+  __asm  punpckldq mm0,mm3 \
+  __asm  movq [Y+_r4],mm0 \
+  __asm  punpckhdq mm1,mm3 \
+  __asm  movq mm0,[Y+_r1] \
+  __asm  movq mm3,mm2 \
+  __asm  punpckldq mm2,mm5 \
+  __asm  punpckhdq mm3,mm5 \
+  __asm  movq mm5,[Y+_r3] \
+  /*_y[4] = h0 g0 f0 e0 \
+   mm1  = h1 g1 f1 e1 \
+   mm2  = h2 g2 f2 e2 \
+   mm3  = h3 g3 f3 e3*/ \
+  /*Second 4x4 transpose:*/ \
+  /*mm4 = a3 a2 a1 a0 \
+    mm0 = b3 b2 b1 b0 \
+    mm6 = c3 c2 c1 c0 \
+    mm5 = d3 d2 d1 d0*/ \
+  __asm  movq mm7,mm4 \
+  __asm  punpcklwd mm4,mm0 \
+  __asm  punpckhwd mm7,mm0 \
+  __asm  movq mm0,mm6 \
+  __asm  punpcklwd mm6,mm5 \
+  __asm  punpckhwd mm0,mm5 \
+  /*mm4 = b1 a1 b0 a0 \
+    mm7 = b3 a3 b2 a2 \
+    mm6 = d1 c1 d0 c0 \
+    mm0 = d3 c3 d2 c2*/ \
+  __asm  movq mm5,mm4 \
+  __asm  punpckldq mm4,mm6 \
+  __asm  punpckhdq mm5,mm6 \
+  __asm  movq mm6,mm7 \
+  __asm  punpckhdq mm7,mm0 \
+  __asm  punpckldq mm6,mm0 \
+  /*mm4 = d0 c0 b0 a0 \
+    mm5 = d1 c1 b1 a1 \
+    mm6 = d2 c2 b2 a2 \
+    mm7 = d3 c3 b3 a3*/ \
 }
 
 /*MMX implementation of the fDCT.*/
 void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
   ptrdiff_t a;
-  __asm {
-    #define Y eax
-    #define A ecx
-    #define X edx
+  __asm{
+#define Y eax
+#define A ecx
+#define X edx
     /*Add two extra bits of working precision to improve accuracy; any more and
        we could overflow.*/
     /*We also add biases to correct for some systematic error that remains in
@@ -661,6 +658,9 @@
     psraw mm1,2
     movq [0x70+Y],mm7
     movq [0x48+Y],mm1
+#undef Y
+#undef A
+#undef X
   }
 }
 



More information about the commits mailing list