[xiph-commits] r12873 - in trunk/theora/lib: . dec dec/x86

Sun Apr 15 18:32:17 PDT 2007

Author: tterribe
Date: 2007-04-15 18:32:17 -0700 (Sun, 15 Apr 2007)
New Revision: 12873

Modified:
   trunk/theora/lib/Makefile.am
   trunk/theora/lib/dec/state.c
   trunk/theora/lib/dec/x86/mmxfrag.c
   trunk/theora/lib/dec/x86/mmxidct.c
   trunk/theora/lib/dec/x86/mmxstate.c
   trunk/theora/lib/dec/x86/x86state.c
   trunk/theora/lib/internal.h
Log:
Major overhaul of the x86 assembly for the decoder.
Reduced the divergence between x86-32 and x86-64 (there is now basically 
 one function that is different between them, and that due to the extra 
 registers x86-64 gives us).
Used named parameters for readability (requires gcc 3.1 or later).
All global symbol references should now be handled by gcc, so there 
 should be no problems with/without -fPIC, or various underscore 
 conventions for different toolchains (e.g., OS X).
Currently satisfies register allocations even WITH -fPIC and WITHOUT 
 -fomit-frame-pointer (the worst case scenario).

The same cannot be said for the encoder's asm, on basically any of these 
 points.


Modified: trunk/theora/lib/Makefile.am
===================================================================

--- trunk/theora/lib/Makefile.am	2007-04-16 00:05:14 UTC (rev 12872)
+++ trunk/theora/lib/Makefile.am	2007-04-16 01:32:17 UTC (rev 12873)
@@ -49,22 +49,22 @@
 if CPU_x86_64
 arch_dir = enc/x86_64
 encoder_arch_sources= \
-	$(arch_dir)/dct_decode_mmx.c \
-	$(arch_dir)/dsp_mmx.c \
-	$(arch_dir)/dsp_mmxext.c \
-	$(arch_dir)/recon_mmx.c \
-	$(arch_dir)/idct_mmx.c \
-	$(arch_dir)/fdct_mmx.c
+	$(enc_arch_dir)/dct_decode_mmx.c \
+	$(enc_arch_dir)/dsp_mmx.c \
+	$(enc_arch_dir)/dsp_mmxext.c \
+	$(enc_arch_dir)/recon_mmx.c \
+	$(enc_arch_dir)/idct_mmx.c \
+	$(enc_arch_dir)/fdct_mmx.c
 else
 if CPU_x86_32
 arch_dir = enc/x86_32
 encoder_arch_sources= \
-	$(arch_dir)/dct_decode_mmx.c \
-	$(arch_dir)/dsp_mmx.c \
-	$(arch_dir)/dsp_mmxext.c \
-	$(arch_dir)/recon_mmx.c \
-	$(arch_dir)/idct_mmx.c \
-	$(arch_dir)/fdct_mmx.c
+	$(enc_arch_dir)/dct_decode_mmx.c \
+	$(enc_arch_dir)/dsp_mmx.c \
+	$(enc_arch_dir)/dsp_mmxext.c \
+	$(enc_arch_dir)/recon_mmx.c \
+	$(enc_arch_dir)/idct_mmx.c \
+	$(enc_arch_dir)/fdct_mmx.c
 endif
 endif
 

Modified: trunk/theora/lib/dec/state.c
===================================================================
--- trunk/theora/lib/dec/state.c	2007-04-16 00:05:14 UTC (rev 12872)
+++ trunk/theora/lib/dec/state.c	2007-04-16 01:32:17 UTC (rev 12873)
@@ -528,9 +528,9 @@
   _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c;
   _state->opt_vtable.state_frag_copy=oc_state_frag_copy_c;
   _state->opt_vtable.state_frag_recon=oc_state_frag_recon_c;
+  _state->opt_vtable.state_loop_filter_frag_rows=
+   oc_state_loop_filter_frag_rows_c;
   _state->opt_vtable.restore_fpu=oc_restore_fpu_c;
-  _state->opt_vtable.oc_state_loop_filter_frag_rows=
-                       oc_state_loop_filter_frag_rows_c;
 }
 
 /*Initialize the accelerated function pointers.*/
@@ -939,7 +939,7 @@
   }
 }
 
-void loop_filter_h_c(unsigned char *_pix,int _ystride,int *_bv){
+static void loop_filter_h(unsigned char *_pix,int _ystride,int *_bv){
   int y;
   _pix-=2;
   for(y=0;y<8;y++){
@@ -952,7 +952,7 @@
   }
 }
 
-void loop_filter_v_c(unsigned char *_pix,int _ystride,int *_bv){
+static void loop_filter_v(unsigned char *_pix,int _ystride,int *_bv){
   int y;
   _pix-=_ystride*2;
   for(y=0;y<8;y++){
@@ -967,15 +967,12 @@
 
 /*Initialize the bounding values array used by the loop filter.
   _bv: Storage for the array.
-       The total array size should be 512, but this pointer should point to the
-         256th entry, as that is more convenient for the filter functions.
   Return: 0 on success, or a non-zero value if no filtering need be applied.*/
 int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv){
   int flimit;
   int i;
   flimit=_state->loop_filter_limits[_state->qis[0]];
   if(flimit==0)return 1;
-  _bv-=256;
   memset(_bv,0,sizeof(_bv[0])*512);
   for(i=0;i<flimit;i++){
     _bv[256-i-flimit]=i-flimit;
@@ -996,13 +993,13 @@
   _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
 void oc_state_loop_filter_frag_rows(oc_theora_state *_state,int *_bv,
  int _refi,int _pli,int _fragy0,int _fragy_end){
-   _state->opt_vtable.oc_state_loop_filter_frag_rows(_state,_bv,_refi,
-                _pli,_fragy0,_fragy_end);
+  _state->opt_vtable.state_loop_filter_frag_rows(_state,_bv,_refi,_pli,
+   _fragy0,_fragy_end);
 }
 
 void oc_state_loop_filter_frag_rows_c(oc_theora_state *_state,int *_bv,
  int _refi,int _pli,int _fragy0,int _fragy_end){  
-  th_img_plane  *iplane;
+  th_img_plane      *iplane;
   oc_fragment_plane *fplane;
   oc_fragment       *frag_top;
   oc_fragment       *frag0;
@@ -1028,16 +1025,16 @@
     while(frag<frag_end){
       if(frag->coded){
         if(frag>frag0){
-          loop_filter_h_c(frag->buffer[_refi],iplane->ystride,_bv);
+          loop_filter_h(frag->buffer[_refi],iplane->ystride,_bv);
         }
         if(frag0>frag_top){
-          loop_filter_v_c(frag->buffer[_refi],iplane->ystride,_bv);
+          loop_filter_v(frag->buffer[_refi],iplane->ystride,_bv);
         }
         if(frag+1<frag_end&&!(frag+1)->coded){
-          loop_filter_h_c(frag->buffer[_refi]+8,iplane->ystride,_bv);
+          loop_filter_h(frag->buffer[_refi]+8,iplane->ystride,_bv);
         }
         if(frag+fplane->nhfrags<frag_bot&&!(frag+fplane->nhfrags)->coded){
-          loop_filter_v_c((frag+fplane->nhfrags)->buffer[_refi],
+          loop_filter_v((frag+fplane->nhfrags)->buffer[_refi],
            iplane->ystride,_bv);
         }
       }
@@ -1053,7 +1050,7 @@
   int framei;
   int pli;
   framei=_state->ref_frame_idx[_frame];
-  if(oc_state_loop_filter_init(_state,bounding_values+256))return;
+  if(oc_state_loop_filter_init(_state,bounding_values))return;
   for(pli=0;pli<3;pli++){
     oc_state_loop_filter_frag_rows(_state,bounding_values,
      framei,pli,0,_state->fplanes[pli].nvfrags);

Modified: trunk/theora/lib/dec/x86/mmxfrag.c
===================================================================
--- trunk/theora/lib/dec/x86/mmxfrag.c	2007-04-16 00:05:14 UTC (rev 12872)
+++ trunk/theora/lib/dec/x86/mmxfrag.c	2007-04-16 01:32:17 UTC (rev 12873)
@@ -21,158 +21,220 @@
 
 #if defined(USE_ASM)
 
-static const __attribute__((aligned(8),used)) ogg_int64_t V128=
+static const __attribute__((aligned(8),used)) ogg_int64_t OC_V128=
  0x0080008000800080LL;
 
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
  const ogg_int16_t *_residue){
-  __asm__ __volatile__(
-   "  mov          $0x7, %%ecx  \n\t" /* 8x loop */
-   "  .p2align 4                \n\t"
-   "1:movq           %3, %%mm0  \n\t" /* Set mm0 to 0x0080008000800080 */
-   "  movq         (%1), %%mm2  \n\t" /* First four input values */
-   "  movq        %%mm0, %%mm1  \n\t" /* Set mm1 == mm0 */
-   "  movq        8(%1), %%mm3  \n\t" /* Next four input values */
-   "  decl      %%ecx           \n\t" /* dec counter */
-   "  paddsw      %%mm3, %%mm1  \n\t" /* add+128 and saturate to 16bit */
-   "  lea      0x10(%1), %1     \n\t" /*_residuo+16 */
-   "  paddsw      %%mm2, %%mm0  \n\t" /* add+128 and saturate to 16bit   */
-   "  packuswb    %%mm1, %%mm0  \n\t" /* pack saturate with next(high) four values */
-   "  movq      %%mm0, (%0)     \n\t" /* writeback */
-   "  lea         (%0,%2), %0   \n\t" /*_dst+_dst_ystride */
-   "  jns 1b                    \n\t" /* loop */
-   :"+r" (_dst)
-   :"r" (_residue),
-    "r" ((long)_dst_ystride),
-    "m" (V128)
-   :"memory", "ecx", "cc"
-  );
+  int i;
+  for(i=8;i-->0;){
+    __asm__ __volatile__(
+      /*Set mm0 to 0x0080008000800080.*/
+      "movq %[OC_V128],%%mm0\n\t"
+      /*First four input values*/
+      "movq (%[residue]),%%mm2\n\t"
+      /*Set mm1=mm0.*/
+      "movq %%mm0,%%mm1\n\t"
+      /*Next four input values.*/
+      "movq 8(%[residue]),%%mm3\n\t"
+      /*Add 128 and saturate to 16 bits.*/
+      "paddsw %%mm3,%%mm1\n\t"
+      /*_residue+=16*/
+      "lea 0x10(%[residue]),%[residue]\n\t"
+      /*Add 128 and saturate to 16 bits.*/
+      "paddsw %%mm2,%%mm0\n\t"
+      /*Pack saturate with next(high) four values.*/
+      "packuswb %%mm1,%%mm0\n\t"
+      /*Writeback.*/
+      "movq %%mm0,(%[dst])\n\t"
+      /*_dst+=_dst_ystride*/
+      "lea  (%[dst],%[dst_ystride]),%[dst]\n\t"
+      :[dst]"+r"(_dst),[residue]"+r"(_residue)
+      :[dst_ystride]"r"((long)_dst_ystride),[OC_V128]"m"(OC_V128)
+      :"memory"
+    );
+  }
 }
 
 void oc_frag_recon_inter_mmx(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue){
   int i;
-  __asm__ __volatile__(
-   "  movl         $0x7,   %%eax   \n\t" /* 8x loop */
-   "  pxor         %%mm0,  %%mm0   \n\t" /* zero mm0  */
-   "  .p2align 4                   \n\t"
-   "1: movq        (%4),   %%mm2   \n\t" /* load mm2 with _src */
-   "  movq         %%mm2,  %%mm3   \n\t" /* copy mm2 to mm3 */
-   "  punpckhbw    %%mm0,  %%mm2   \n\t" /* expand high part of _src to 16 bits */
-   "  punpcklbw    %%mm0,  %%mm3   \n\t" /* expand low part of _src to 16 bits */
-   "  paddsw       (%1),   %%mm3   \n\t" /* add low part with low part of residue */
-   "  paddsw       8(%1),  %%mm2   \n\t" /* high with high */
-   "  packuswb     %%mm2,  %%mm3   \n\t" /* pack and saturate to mm3 */
-   "  lea         (%4,%3), %4      \n\t" /* _src+_src_ystride */
-   "  lea         0x10(%1), %1     \n\t" /* _residuo+16 */
-   "  movq        %%mm3,   (%0)    \n\t" /* put mm3 to dest */
-   "  lea         (%0,%2),%0       \n\t" /* _dst+_dst_ystride */
-   "  decl        %%eax            \n\t" /* dec counter */
-   "  jns         1b               \n\t" /* loop */
-   :"+r" (_dst)
-   :"r" (_residue), 
-    "r" ((long)_dst_ystride),
-    "r" ((long)_src_ystride),
-    "r" (_src)
-   :"memory", "eax", "cc"
-  );
+  /*Zero mm0.*/
+  __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
+  for(i=8;i-->0;){
+    __asm__ __volatile__(
+      /*Load mm2 with _src*/
+      "movq (%[src]),%%mm2\n\t"
+      /*Copy mm2 to mm3.*/
+      "movq %%mm2,%%mm3\n\t"
+      /*Expand high part of _src to 16 bits.*/
+      "punpckhbw %%mm0,%%mm2\n\t"
+      /*Expand low part of _src to 16 bits.*/
+      "punpcklbw %%mm0,%%mm3\n\t"
+      /*Add low part with low part of residue.*/
+      "paddsw (%[residue]),%%mm3\n\t"
+      /*High with high.*/
+      "paddsw 8(%[residue]),%%mm2\n\t"
+      /*Pack and saturate to mm3.*/
+      "packuswb %%mm2,%%mm3\n\t"
+      /*_src+=_src_ystride*/
+      "lea (%[src],%[src_ystride]),%[src]\n\t"
+      /*_residue+=16*/
+      "lea 0x10(%[residue]),%[residue]\n\t"
+      /*Put mm3 to dest.*/
+      "movq %%mm3,(%[dst])\n\t"
+      /*_dst+=_dst_ystride*/
+      "lea (%[dst],%[dst_ystride]),%[dst]\n\t"
+      :[dst]"+r"(_dst),[src]"+r"(_src),[residue]"+r"(_residue)
+      :[dst_ystride]"r"((long)_dst_ystride),
+       [src_ystride]"r"((long)_src_ystride)
+      :"memory"
+    );
+  }
 }
 
-#if (defined(__amd64__) ||  defined(__x86_64__))
+#if defined(__amd64__)||defined(__x86_64__)
 
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
  int _src2_ystride,const ogg_int16_t *_residue){
-
+  int i;
   __asm__ __volatile__(
-   "  movl         $0x7,   %%eax   \n\t" /* 8x loop */
-   "  pxor         %%mm0,  %%mm0   \n\t" /* zero mm0 */
-   "  movq         (%4),   %%mm2   \n\t" /* load mm2 with _src1 */
-   "  .p2align 4                   \n\t"
-   "1:movq         (%6),   %%mm4   \n\t" /* packed SRC2 */ 
-   "  movq         %%mm2,  %%mm3   \n\t" /* copy to mm3 */
-   "  movq         %%mm4,  %%mm5   \n\t" /* copy packed src2 to mm5 */
-   "  punpcklbw    %%mm0,  %%mm2   \n\t" /* expand low part of src1 to mm2 */
-   "  punpcklbw    %%mm0,  %%mm4   \n\t" /* low part expand of src2 to mm4 */
-   "  lea          (%4,%3), %4     \n\t" /*  _src1+_src1_ystride */
-   "  punpckhbw    %%mm0,  %%mm3   \n\t" /* expand high part of src1 to mm3 */
-   "  punpckhbw    %%mm0,  %%mm5   \n\t" /* high part expand of src2 to mm5 */
-   "  paddsw       %%mm2,  %%mm4   \n\t" /* add low parts of src1 and src2 */
-   "  paddsw       %%mm3,  %%mm5   \n\t" /* add high parts of src1 and src2 */
-   "  lea          (%6,%5), %6     \n\t" /* _src2+_src2_ystride */  
-   "  movq         (%4), %%mm2     \n\t" /* load mm2 with _src1 */
-   "  psrlw        $1,     %%mm4   \n\t" /* shift logical 1 to right o 2 dolu */
-   "  psrlw        $1,     %%mm5   \n\t" /* shift logical 1 to right */
-   "  paddsw       (%1),   %%mm4   \n\t" /* add low parts wwith low parts */
-   "  paddsw       8(%1),  %%mm5   \n\t" /* add highparts with high */
-   "  packuswb     %%mm5,  %%mm4   \n\t" /* pack saturate high to low */
-   "  lea          0x10(%1), %1    \n\t" /* _residuo+16 */
-   "  movq         %%mm4, (%0)     \n\t" /* write to src */
-   "  decl         %%eax           \n\t"
-   "  lea          (%0,%2), %0     \n\t" /* _dst+_dst_ystride */
-   "  jns          1b\n\t"
-   :"+r" (_dst) /* 0 */
-   :"r" (_residue), /* 1 */
-    "r" ((long)_dst_ystride), /* 2 */
-    "r" ((long)_src1_ystride), /* 3 */
-    "r" (_src1), /* 4 */
-    "r" ((long)_src2_ystride), /* 5 */
-    "r" (_src2) /* 6 */
-   : "memory", "cc", "eax"
+    /*Zero mm0.*/
+    "pxor %%mm0,%%mm0\n\t"
+    /*Load mm2 with _src1.*/
+    "movq (%[src1]),%%mm2\n\t"
+    :[src1]"+r"(_src1)
+    :
   );
+  for(i=8;i-->0;){
+    __asm__ __volatile__(
+     /*Packed _src2.*/ 
+     "movq (%[src2]),%%mm4\n\t"
+     /*Copy packed src1 to mm3.*/
+     "movq %%mm2,%%mm3\n\t"
+     /*Copy packed src2 to mm5.*/
+     "movq %%mm4,%%mm5\n\t"
+     /*Expand low part of src1 to mm2.*/
+     "punpcklbw %%mm0,%%mm2\n\t"
+     /*Expand Low part of src2 to mm4.*/
+     "punpcklbw %%mm0,%%mm4\n\t"
+     /*_src1+=_src1_ystride*/
+     "lea (%[src1],%[src1_ystride]),%[src1]\n\t"
+     /*Expand high part of src1 to mm3.*/
+     "punpckhbw %%mm0,%%mm3\n\t"
+     /*Expand high part of src2 to mm5.*/
+     "punpckhbw %%mm0,%%mm5\n\t"
+     /*Add low parts of src1 and src2.*/
+     "paddsw %%mm2,%%mm4\n\t"
+     /*Add high parts of src1 and src2.*/
+     "paddsw %%mm3,%%mm5\n\t"
+     /*_src2+=_src2_ystride.*/
+     "lea (%[src2],%[src2_ystride]),%[src2]\n\t"
+     /*Load mm2 with _src1.*/
+     "movq (%[src1]),%%mm2\n\t"
+     /*Logical shift right by 1 (halve the low sum).*/
+     "psrlw $1,%%mm4\n\t"
+     /*Shift logical 1 to right.*/
+     "psrlw $1,%%mm5\n\t"
+     /*Add low parts with low parts of residue.*/
+     "paddsw (%[residue]),%%mm4\n\t"
+     /*Add high parts with high parts of residue.*/
+     "paddsw 8(%[residue]),%%mm5\n\t"
+     /*Pack saturate high to low.*/
+     "packuswb %%mm5,%%mm4\n\t"
+     /*_residue+=16.*/
+     "lea 0x10(%[residue]),%[residue]\n\t"
+     /*Write to dst.*/
+     "movq %%mm4,(%[dst])\n\t"
+     /*_dst+=_dst_ystride*/
+     "lea (%[dst],%[dst_ystride]),%[dst]\n\t"
+     :[dst]"+r"(_dst),[residue]"+r"(_residue),
+      [src1]"+r"(_src1),[src2]"+r"(_src2)
+     :[dst_ystride]"r"((long)_dst_ystride),
+      [src1_ystride]"r"((long)_src1_ystride),
+      [src2_ystride]"r"((long)_src2_ystride)
+     :"memory"
+    );
+  }
 }
+
 #else
 
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
  int _src2_ystride,const ogg_int16_t *_residue){
-  int i;
+  long a;
+  int  i;
   __asm__ __volatile__(
-   "  movl         $0x7,   %7      \n\t" /* 8x loop */
-   "  pxor         %%mm0,  %%mm0   \n\t" /* zero mm0 */
-   "  movq         (%4),   %%mm2   \n\t" /* load mm2 with _src1 */
-   "  .p2align 4                   \n\t"
-   "1: movq        (%6),   %%mm4   \n\t" /* packed SRC2 */ 
-   "  movq         %%mm2,  %%mm3   \n\t" /* copy to mm3 */
-   "  movq         %%mm4,  %%mm5   \n\t" /* copy packed src2 to mm5 */
-   "  mov          %3,     %%eax   \n\t"
-   "  punpcklbw    %%mm0,  %%mm2   \n\t" /* expand low part of src1 to mm2 */
-   "  punpcklbw    %%mm0,  %%mm4   \n\t" /* low part expand of src2 to mm4 */
-   "  lea          (%4,%%eax), %4  \n\t" /*  _src1+_src1_ystride */
-   "  punpckhbw    %%mm0,  %%mm3   \n\t" /* expand high part of src1 to mm3 */
-   "  punpckhbw    %%mm0,  %%mm5   \n\t" /* high part expand of src2 to mm5 */
-   "  mov          %5,     %%eax   \n\t"
-   "  paddsw       %%mm2,  %%mm4   \n\t" /* add low parts of src1 and src2 */
-   "  paddsw       %%mm3,  %%mm5   \n\t" /* add high parts of src1 and src2 */
-   "  lea          (%6,%%eax), %6  \n\t" /* _src2+_src2_ystride */  
-   "  movq         (%4), %%mm2     \n\t" /* load mm2 with _src1 */
-   "  psrlw        $1,     %%mm4   \n\t" /* shift logical 1 to right o 2 dolu */
-   "  psrlw        $1,     %%mm5   \n\t" /* shift logical 1 to right */
-   "  paddsw       (%1),   %%mm4   \n\t" /* add low parts wwith low parts */
-   "  paddsw       8(%1),  %%mm5   \n\t" /* add highparts with high */
-   "  packuswb     %%mm5,  %%mm4   \n\t" /* pack saturate high to low */
-   "  lea          0x10(%1), %1    \n\t" /* _residuo+16 */
-   "  movq         %%mm4, (%0)     \n\t" /* write to src */
-   "  decl         %7              \n\t"
-   "  lea          (%0,%2), %0     \n\t" /* _dst+_dst_ystride */
-   "  jns          1b\n\t"
-   :"+r" (_dst) /* 0 */
-   :"r" (_residue), /* 1 */
-    "r" (_dst_ystride), /* 2 */
-    "m" (_src1_ystride), /* 3 */
-    "r" (_src1), /* 4 */
-    "m" (_src2_ystride), /* 5 */
-    "r" (_src2), /* 6 */
-    "m" (i)
-   :"memory", "eax", "cc"
+    /*Zero mm0.*/
+    "pxor %%mm0,%%mm0\n\t"
+    /*Load mm2 with _src1.*/
+    "movq (%[src1]),%%mm2\n\t"
+    :[src1]"+r"(_src1)
+    :
   );
+  for(i=8;i-->0;){
+    __asm__ __volatile__(
+     /*Packed _src2.*/ 
+     "movq (%[src2]),%%mm4\n\t"
+     /*Copy packed src1 to mm3.*/
+     "movq %%mm2,%%mm3\n\t"
+     /*Copy packed src2 to mm5.*/
+     "movq %%mm4,%%mm5\n\t"
+     /*eax=_src1_ystride*/
+     "mov %[src1_ystride],%[a]\n\t"
+     /*Expand low part of src1 to mm2.*/
+     "punpcklbw %%mm0,%%mm2\n\t"
+     /*Expand Low part of src2 to mm4.*/
+     "punpcklbw %%mm0,%%mm4\n\t"
+     /*_src1+=_src1_ystride*/
+     "lea (%[src1],%[a]),%[src1]\n\t"
+     /*Expand high part of src1 to mm3.*/
+     "punpckhbw %%mm0,%%mm3\n\t"
+     /*Expand high part of src2 to mm5.*/
+     "punpckhbw %%mm0,%%mm5\n\t"
+     /*eax=_src2_ystride*/
+     "mov %[src2_ystride],%[a]\n\t"
+     /*Add low parts of src1 and src2.*/
+     "paddsw %%mm2,%%mm4\n\t"
+     /*Add high parts of src1 and src2.*/
+     "paddsw %%mm3,%%mm5\n\t"
+     /*_src2+=_src2_ystride.*/
+     "lea (%[src2],%[a]),%[src2]\n\t"
+     /*Load mm2 with _src1.*/
+     "movq (%[src1]),%%mm2\n\t"
+     /*Logical shift right by 1 (halve the low sum).*/
+     "psrlw $1,%%mm4\n\t"
+     /*Shift logical 1 to right.*/
+     "psrlw $1,%%mm5\n\t"
+     /*Add low parts with low parts of residue.*/
+     "paddsw (%[residue]),%%mm4\n\t"
+     /*Add high parts with high parts of residue.*/
+     "paddsw 8(%[residue]),%%mm5\n\t"
+     /*eax=_dst_ystride.*/
+     "mov %[dst_ystride],%[a]\n\t"
+     /*Pack saturate high to low.*/
+     "packuswb %%mm5,%%mm4\n\t"
+     /*_residue+=16.*/
+     "lea 0x10(%[residue]),%[residue]\n\t"
+     /*Write to dst.*/
+     "movq %%mm4,(%[dst])\n\t"
+     /*_dst+=_dst_ystride*/
+     "lea (%[dst],%[a]),%[dst]\n\t"
+     :[a]"=&a"(a),[dst]"+r"(_dst),[residue]"+r"(_residue),
+      [src1]"+r"(_src1),[src2]"+r"(_src2)
+     :[dst_ystride]"m"((long)_dst_ystride),
+      [src1_ystride]"m"((long)_src1_ystride),
+      [src2_ystride]"m"((long)_src2_ystride)
+     :"memory"
+    );
+  }
 }
 
 #endif
 
 void oc_restore_fpu_mmx(void){
-  __asm__ __volatile__(
-   "  emms    \n\t" /* pack with next(high) four values */
-  );
+  __asm__ __volatile__("emms\n\t");
 }
 #endif

Modified: trunk/theora/lib/dec/x86/mmxidct.c
===================================================================
--- trunk/theora/lib/dec/x86/mmxidct.c	2007-04-16 00:05:14 UTC (rev 12872)
+++ trunk/theora/lib/dec/x86/mmxidct.c	2007-04-16 01:32:17 UTC (rev 12873)
@@ -36,7 +36,8 @@
 
 
 /*A table of constants used by the MMX routines.*/
-ogg_uint16_t __attribute__((aligned(8),used)) OC_IDCT_CONSTS[(4+7+1)*4]={
+static const ogg_uint16_t __attribute__((aligned(8),used))
+ OC_IDCT_CONSTS[(4+7+1)*4]={
   65535,    0,    0,    0,
       0,65535,    0,    0,
       0,    0,65535,    0,
@@ -58,416 +59,484 @@
       8,    8,    8,    8
 };
 
-/*Converts the expression in the argument to a sting.*/
+/*Converts the expression in the argument to a string.*/
 #define OC_M2STR(_s) #_s
 
 /*38 cycles*/
 #define OC_IDCT_BEGIN \
- "  #OC_IDCT_BEGIN\n\t" \
- "  movq   "OC_I(3)",     %mm2\n\t" \
- "  movq   "OC_C(3)",     %mm6\n\t" \
- "  movq        %mm2,     %mm4\n\t" \
- "  movq   "OC_J(5)",     %mm7\n\t" \
- "  pmulhw      %mm6,     %mm4\n\t" \
- "  movq   "OC_C(5)",     %mm1\n\t" \
- "  pmulhw      %mm7,     %mm6\n\t" \
- "  movq        %mm1,     %mm5\n\t" \
- "  pmulhw      %mm2,     %mm1\n\t" \
- "  movq   "OC_I(1)",     %mm3\n\t" \
- "  pmulhw      %mm7,     %mm5\n\t" \
- "  movq   "OC_C(1)",     %mm0\n\t" \
- "  paddw       %mm2,     %mm4\n\t" \
- "  paddw       %mm7,     %mm6\n\t" \
- "  paddw       %mm1,     %mm2\n\t" \
- "  movq   "OC_J(7)",     %mm1\n\t" \
- "  paddw       %mm5,     %mm7\n\t" \
- "  movq        %mm0,     %mm5\n\t" \
- "  pmulhw      %mm3,     %mm0\n\t" \
- "  paddw       %mm7,     %mm4\n\t" \
- "  pmulhw      %mm1,     %mm5\n\t" \
- "  movq   "OC_C(7)",     %mm7\n\t" \
- "  psubw       %mm2,     %mm6\n\t" \
- "  paddw       %mm3,     %mm0\n\t" \
- "  pmulhw      %mm7,     %mm3\n\t" \
- "  movq   "OC_I(2)",     %mm2\n\t" \
- "  pmulhw      %mm1,     %mm7\n\t" \
- "  paddw       %mm1,     %mm5\n\t" \
- "  movq        %mm2,     %mm1\n\t" \
- "  pmulhw "OC_C(2)",     %mm2\n\t" \
- "  psubw       %mm5,     %mm3\n\t" \
- "  movq   "OC_J(6)",     %mm5\n\t" \
- "  paddw       %mm7,     %mm0\n\t" \
- "  movq        %mm5,     %mm7\n\t" \
- "  psubw       %mm4,     %mm0\n\t" \
- "  pmulhw "OC_C(2)",     %mm5\n\t" \
- "  paddw       %mm1,     %mm2\n\t" \
- "  pmulhw "OC_C(6)",     %mm1\n\t" \
- "  paddw       %mm4,     %mm4\n\t" \
- "  paddw       %mm0,     %mm4\n\t" \
- "  psubw       %mm6,     %mm3\n\t" \
- "  paddw       %mm7,     %mm5\n\t" \
- "  paddw       %mm6,     %mm6\n\t" \
- "  pmulhw "OC_C(6)",     %mm7\n\t" \
- "  paddw       %mm3,     %mm6\n\t" \
- "  movq        %mm4,"OC_I(1)"\n\t" \
- "  psubw       %mm5,     %mm1\n\t" \
- "  movq   "OC_C(4)",     %mm4\n\t" \
- "  movq        %mm3,     %mm5\n\t" \
- "  pmulhw      %mm4,     %mm3\n\t" \
- "  paddw       %mm2,     %mm7\n\t" \
- "  movq        %mm6,"OC_I(2)"\n\t" \
- "  movq        %mm0,     %mm2\n\t" \
- "  movq   "OC_I(0)",     %mm6\n\t" \
- "  pmulhw      %mm4,     %mm0\n\t" \
- "  paddw       %mm3,     %mm5\n\t" \
- "  movq   "OC_J(4)",     %mm3\n\t" \
- "  psubw       %mm1,     %mm5\n\t" \
- "  paddw       %mm0,     %mm2\n\t" \
- "  psubw       %mm3,     %mm6\n\t" \
- "  movq        %mm6,     %mm0\n\t" \
- "  pmulhw      %mm4,     %mm6\n\t" \
- "  paddw       %mm3,     %mm3\n\t" \
- "  paddw       %mm1,     %mm1\n\t" \
- "  paddw       %mm0,     %mm3\n\t" \
- "  paddw       %mm5,     %mm1\n\t" \
- "  pmulhw      %mm3,     %mm4\n\t" \
- "  paddw       %mm0,     %mm6\n\t" \
- "  psubw       %mm2,     %mm6\n\t" \
- "  paddw       %mm2,     %mm2\n\t" \
- "  movq   "OC_I(1)",     %mm0\n\t" \
- "  paddw       %mm6,     %mm2\n\t" \
- "  paddw       %mm3,     %mm4\n\t" \
- "  psubw       %mm1,     %mm2\n\t" \
- "#end OC_IDCT_BEGIN\n\t"
+  "#OC_IDCT_BEGIN\n\t" \
+  "movq "OC_I(3)",%%mm2\n\t" \
+  "movq "OC_C(3)",%%mm6\n\t" \
+  "movq %%mm2,%%mm4\n\t" \
+  "movq "OC_J(5)",%%mm7\n\t" \
+  "pmulhw %%mm6,%%mm4\n\t" \
+  "movq "OC_C(5)",%%mm1\n\t" \
+  "pmulhw %%mm7,%%mm6\n\t" \
+  "movq %%mm1,%%mm5\n\t" \
+  "pmulhw %%mm2,%%mm1\n\t" \
+  "movq "OC_I(1)",%%mm3\n\t" \
+  "pmulhw %%mm7,%%mm5\n\t" \
+  "movq "OC_C(1)",%%mm0\n\t" \
+  "paddw %%mm2,%%mm4\n\t" \
+  "paddw %%mm7,%%mm6\n\t" \
+  "paddw %%mm1,%%mm2\n\t" \
+  "movq "OC_J(7)",%%mm1\n\t" \
+  "paddw %%mm5,%%mm7\n\t" \
+  "movq %%mm0,%%mm5\n\t" \
+  "pmulhw %%mm3,%%mm0\n\t" \
+  "paddw %%mm7,%%mm4\n\t" \
+  "pmulhw %%mm1,%%mm5\n\t" \
+  "movq "OC_C(7)",%%mm7\n\t" \
+  "psubw %%mm2,%%mm6\n\t" \
+  "paddw %%mm3,%%mm0\n\t" \
+  "pmulhw %%mm7,%%mm3\n\t" \
+  "movq "OC_I(2)",%%mm2\n\t" \
+  "pmulhw %%mm1,%%mm7\n\t" \
+  "paddw %%mm1,%%mm5\n\t" \
+  "movq %%mm2,%%mm1\n\t" \
+  "pmulhw "OC_C(2)",%%mm2\n\t" \
+  "psubw %%mm5,%%mm3\n\t" \
+  "movq "OC_J(6)",%%mm5\n\t" \
+  "paddw %%mm7,%%mm0\n\t" \
+  "movq %%mm5,%%mm7\n\t" \
+  "psubw %%mm4,%%mm0\n\t" \
+  "pmulhw "OC_C(2)",%%mm5\n\t" \
+  "paddw %%mm1,%%mm2\n\t" \
+  "pmulhw "OC_C(6)",%%mm1\n\t" \
+  "paddw %%mm4,%%mm4\n\t" \
+  "paddw %%mm0,%%mm4\n\t" \
+  "psubw %%mm6,%%mm3\n\t" \
+  "paddw %%mm7,%%mm5\n\t" \
+  "paddw %%mm6,%%mm6\n\t" \
+  "pmulhw "OC_C(6)",%%mm7\n\t" \
+  "paddw %%mm3,%%mm6\n\t" \
+  "movq %%mm4,"OC_I(1)"\n\t" \
+  "psubw %%mm5,%%mm1\n\t" \
+  "movq "OC_C(4)",%%mm4\n\t" \
+  "movq %%mm3,%%mm5\n\t" \
+  "pmulhw %%mm4,%%mm3\n\t" \
+  "paddw %%mm2,%%mm7\n\t" \
+  "movq %%mm6,"OC_I(2)"\n\t" \
+  "movq %%mm0,%%mm2\n\t" \
+  "movq "OC_I(0)",%%mm6\n\t" \
+  "pmulhw %%mm4,%%mm0\n\t" \
+  "paddw %%mm3,%%mm5\n\t" \
+  "movq "OC_J(4)",%%mm3\n\t" \
+  "psubw %%mm1,%%mm5\n\t" \
+  "paddw %%mm0,%%mm2\n\t" \
+  "psubw %%mm3,%%mm6\n\t" \
+  "movq %%mm6,%%mm0\n\t" \
+  "pmulhw %%mm4,%%mm6\n\t" \
+  "paddw %%mm3,%%mm3\n\t" \
+  "paddw %%mm1,%%mm1\n\t" \
+  "paddw %%mm0,%%mm3\n\t" \
+  "paddw %%mm5,%%mm1\n\t" \
+  "pmulhw %%mm3,%%mm4\n\t" \
+  "paddw %%mm0,%%mm6\n\t" \
+  "psubw %%mm2,%%mm6\n\t" \
+  "paddw %%mm2,%%mm2\n\t" \
+  "movq "OC_I(1)",%%mm0\n\t" \
+  "paddw %%mm6,%%mm2\n\t" \
+  "paddw %%mm3,%%mm4\n\t" \
+  "psubw %%mm1,%%mm2\n\t" \
+  "#end OC_IDCT_BEGIN\n\t" \
 
 /*38+8=46 cycles.*/
-#define OC_ROW_IDCT __asm__ __volatile__( \
- "  #OC_ROW_IDCT\n" \
- OC_IDCT_BEGIN \
- "  movq   "OC_I(2)",     %mm3\n\t"  /* r3 = D. */ \
- "  psubw       %mm7,     %mm4\n\t"  /* r4 = E. = E - G */ \
- "  paddw       %mm1,     %mm1\n\t"  /* r1 = H. + H. */ \
- "  paddw       %mm7,     %mm7\n\t"  /* r7 = G + G */ \
- "  paddw       %mm2,     %mm1\n\t"  /* r1 = R1 = A.. + H. */ \
- "  paddw       %mm4,     %mm7\n\t"  /* r7 = G. = E + G */ \
- "  psubw       %mm3,     %mm4\n\t"  /* r4 = R4 = E. - D. */ \
- "  paddw       %mm3,     %mm3\n\t" \
- "  psubw       %mm5,     %mm6\n\t"  /* r6 = R6 = F. - B.. */ \
- "  paddw       %mm5,     %mm5\n\t" \
- "  paddw       %mm4,     %mm3\n\t"  /* r3 = R3 = E. + D. */ \
- "  paddw       %mm6,     %mm5\n\t"  /* r5 = R5 = F. + B.. */ \
- "  psubw       %mm0,     %mm7\n\t"  /* r7 = R7 = G. - C. */ \
- "  paddw       %mm0,     %mm0\n\t" \
- "  movq        %mm1,"OC_I(1)"\n\t"  /* save R1 */ \
- "  paddw       %mm7,     %mm0\n\t" /* r0 = R0 = G. + C. */ \
- "#end OC_ROW_IDCT\n\t" \
-)
+#define OC_ROW_IDCT \
+  "#OC\n" \
+  OC_IDCT_BEGIN \
+  /*r3=D'*/ \
+  "movq "OC_I(2)",%%mm3\n\t" \
+  /*r4=E'=E-G*/ \
+  "psubw %%mm7,%%mm4\n\t" \
+  /*r1=H'+H'*/ \
+  "paddw %%mm1,%%mm1\n\t" \
+  /*r7=G+G*/ \
+  "paddw %%mm7,%%mm7\n\t" \
+  /*r1=R1=A''+H'*/ \
+  "paddw %%mm2,%%mm1\n\t" \
+  /*r7=G'=E+G*/ \
+  "paddw %%mm4,%%mm7\n\t" \
+  /*r4=R4=E'-D'*/ \
+  "psubw %%mm3,%%mm4\n\t" \
+  "paddw %%mm3,%%mm3\n\t" \
+  /*r6=R6=F'-B''*/ \
+  "psubw %%mm5,%%mm6\n\t" \
+  "paddw %%mm5,%%mm5\n\t" \
+  /*r3=R3=E'+D'*/ \
+  "paddw %%mm4,%%mm3\n\t" \
+  /*r5=R5=F'+B''*/ \
+  "paddw %%mm6,%%mm5\n\t" \
+  /*r7=R7=G'-C'*/ \
+  "psubw %%mm0,%%mm7\n\t" \
+  "paddw %%mm0,%%mm0\n\t" \
+  /*Save R1.*/ \
+  "movq %%mm1,"OC_I(1)"\n\t" \
+  /*r0=R0=G'+C'*/ \
+  "paddw %%mm7,%%mm0\n\t" \
+  "#end OC_ROW_IDCT\n\t" \
 
-/* The following macro does two 4x4 transposes in place.
-   At entry, we assume:
-     r0 = a3 a2 a1 a0
-   I(1) = b3 b2 b1 b0
-     r2 = c3 c2 c1 c0
-     r3 = d3 d2 d1 d0
+/*The following macro does two 4x4 transposes in place.
+  At entry, we assume:
+    r0 = a3 a2 a1 a0
+  I(1) = b3 b2 b1 b0
+    r2 = c3 c2 c1 c0
+    r3 = d3 d2 d1 d0
 
-     r4 = e3 e2 e1 e0
-     r5 = f3 f2 f1 f0
-     r6 = g3 g2 g1 g0
-     r7 = h3 h2 h1 h0
+    r4 = e3 e2 e1 e0
+    r5 = f3 f2 f1 f0
+    r6 = g3 g2 g1 g0
+    r7 = h3 h2 h1 h0
 
-   At exit, we have:
-   I(0) = d0 c0 b0 a0
-   I(1) = d1 c1 b1 a1
-   I(2) = d2 c2 b2 a2
-   I(3) = d3 c3 b3 a3
+  At exit, we have:
+  I(0) = d0 c0 b0 a0
+  I(1) = d1 c1 b1 a1
+  I(2) = d2 c2 b2 a2
+  I(3) = d3 c3 b3 a3
 
-   J(4) = h0 g0 f0 e0
-   J(5) = h1 g1 f1 e1
-   J(6) = h2 g2 f2 e2
-   J(7) = h3 g3 f3 e3
+  J(4) = h0 g0 f0 e0
+  J(5) = h1 g1 f1 e1
+  J(6) = h2 g2 f2 e2
+  J(7) = h3 g3 f3 e3
 
-   I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
-   J(4) J(5) J(6) J(7) is the transpose of r4   r5 r6 r7.
+  I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
+  J(4) J(5) J(6) J(7) is the transpose of r4   r5 r6 r7.
 
-   Since r1 is free at entry, we calculate the Js first.*/
+  Since r1 is free at entry, we calculate the Js first.*/
 /*19 cycles.*/
-#define OC_TRANSPOSE __asm__ __volatile__( \
- "  #OC_TRANSPOSE\n\t" \
- "  movq           %mm4,     %mm1\n\t" \
- "  punpcklwd      %mm5,     %mm4\n\t" \
- "  movq           %mm0,"OC_I(0)"\n\t" \
- "  punpckhwd      %mm5,     %mm1\n\t" \
- "  movq           %mm6,     %mm0\n\t" \
- "  punpcklwd      %mm7,     %mm6\n\t" \
- "  movq           %mm4,     %mm5\n\t" \
- "  punpckldq      %mm6,     %mm4\n\t" \
- "  punpckhdq      %mm6,     %mm5\n\t" \
- "  movq           %mm1,     %mm6\n\t" \
- "  movq           %mm4,"OC_J(4)"\n\t" \
- "  punpckhwd      %mm7,     %mm0\n\t" \
- "  movq           %mm5,"OC_J(5)"\n\t" \
- "  punpckhdq      %mm0,     %mm6\n\t" \
- "  movq      "OC_I(0)",     %mm4\n\t" \
- "  punpckldq      %mm0,     %mm1\n\t" \
- "  movq      "OC_I(1)",     %mm5\n\t" \
- "  movq           %mm4,     %mm0\n\t" \
- "  movq           %mm6,"OC_J(7)"\n\t" \
- "  punpcklwd      %mm5,     %mm0\n\t" \
- "  movq           %mm1,"OC_J(6)"\n\t" \
- "  punpckhwd      %mm5,     %mm4\n\t" \
- "  movq           %mm2,     %mm5\n\t" \
- "  punpcklwd      %mm3,     %mm2\n\t" \
- "  movq           %mm0,     %mm1\n\t" \
- "  punpckldq      %mm2,     %mm0\n\t" \
- "  punpckhdq      %mm2,     %mm1\n\t" \
- "  movq           %mm4,     %mm2\n\t" \
- "  movq           %mm0,"OC_I(0)"\n\t" \
- "  punpckhwd      %mm3,     %mm5\n\t" \
- "  movq           %mm1,"OC_I(1)"\n\t" \
- "  punpckhdq      %mm5,     %mm4\n\t" \
- "  punpckldq      %mm5,     %mm2\n\t" \
- "  movq           %mm4,"OC_I(3)"\n\t" \
- "  movq           %mm2,"OC_I(2)"\n\t" \
- "#end OC_TRANSPOSE\n\t" \
-)
+#define OC_TRANSPOSE \
+  "#OC_TRANSPOSE\n\t" \
+  "movq %%mm4,%%mm1\n\t" \
+  "punpcklwd %%mm5,%%mm4\n\t" \
+  "movq %%mm0,"OC_I(0)"\n\t" \
+  "punpckhwd %%mm5,%%mm1\n\t" \
+  "movq %%mm6,%%mm0\n\t" \
+  "punpcklwd %%mm7,%%mm6\n\t" \
+  "movq %%mm4,%%mm5\n\t" \
+  "punpckldq %%mm6,%%mm4\n\t" \
+  "punpckhdq %%mm6,%%mm5\n\t" \
+  "movq %%mm1,%%mm6\n\t" \
+  "movq %%mm4,"OC_J(4)"\n\t" \
+  "punpckhwd %%mm7,%%mm0\n\t" \
+  "movq %%mm5,"OC_J(5)"\n\t" \
+  "punpckhdq %%mm0,%%mm6\n\t" \
+  "movq "OC_I(0)",%%mm4\n\t" \
+  "punpckldq %%mm0,%%mm1\n\t" \
+  "movq "OC_I(1)",%%mm5\n\t" \
+  "movq %%mm4,%%mm0\n\t" \
+  "movq %%mm6,"OC_J(7)"\n\t" \
+  "punpcklwd %%mm5,%%mm0\n\t" \
+  "movq %%mm1,"OC_J(6)"\n\t" \
+  "punpckhwd %%mm5,%%mm4\n\t" \
+  "movq %%mm2,%%mm5\n\t" \
+  "punpcklwd %%mm3,%%mm2\n\t" \
+  "movq %%mm0,%%mm1\n\t" \
+  "punpckldq %%mm2,%%mm0\n\t" \
+  "punpckhdq %%mm2,%%mm1\n\t" \
+  "movq %%mm4,%%mm2\n\t" \
+  "movq %%mm0,"OC_I(0)"\n\t" \
+  "punpckhwd %%mm3,%%mm5\n\t" \
+  "movq %%mm1,"OC_I(1)"\n\t" \
+  "punpckhdq %%mm5,%%mm4\n\t" \
+  "punpckldq %%mm5,%%mm2\n\t" \
+  "movq %%mm4,"OC_I(3)"\n\t" \
+  "movq %%mm2,"OC_I(2)"\n\t" \
+  "#end OC_TRANSPOSE\n\t" \
 
 /*38+19=57 cycles.*/
-#define OC_COLUMN_IDCT __asm__ __volatile__( \
- "  #OC_COLUMN_IDCT\n" \
- OC_IDCT_BEGIN \
- "  paddw     "OC_8",     %mm2\n\t" \
- "  paddw       %mm1,     %mm1\n\t"  /* r1 = H. + H. */ \
- "  paddw       %mm2,     %mm1\n\t"  /* r1 = R1 = A.. + H. */ \
- "  psraw         $4,     %mm2\n\t"  /* r2 = NR2 */ \
- "  psubw       %mm7,     %mm4\n\t"  /* r4 = E. = E - G */ \
- "  psraw         $4,     %mm1\n\t"  /* r1 = NR1 */ \
- "  movq   "OC_I(2)",     %mm3\n\t"  /* r3 = D. */ \
- "  paddw       %mm7,     %mm7\n\t"  /* r7 = G + G */ \
- "  movq        %mm2,"OC_I(2)"\n\t"  /* store NR2 at I2 */ \
- "  paddw       %mm4,     %mm7\n\t"  /* r7 = G. = E + G */ \
- "  movq        %mm1,"OC_I(1)"\n\t"  /* store NR1 at I1 */ \
- "  psubw       %mm3,     %mm4\n\t"  /* r4 = R4 = E. - D. */ \
- "  paddw     "OC_8",     %mm4\n\t" \
- "  paddw       %mm3,     %mm3\n\t"  /* r3 = D. + D. */ \
- "  paddw       %mm4,     %mm3\n\t"  /* r3 = R3 = E. + D. */ \
- "  psraw         $4,     %mm4\n\t"  /* r4 = NR4 */ \
- "  psubw       %mm5,     %mm6\n\t"  /* r6 = R6 = F. - B.. */ \
- "  psraw         $4,     %mm3\n\t"  /* r3 = NR3 */ \
- "  paddw     "OC_8",     %mm6\n\t" \
- "  paddw       %mm5,     %mm5\n\t"  /* r5 = B.. + B.. */ \
- "  paddw       %mm6,     %mm5\n\t"  /* r5 = R5 = F. + B.. */ \
- "  psraw         $4,     %mm6\n\t"  /* r6 = NR6 */ \
- "  movq        %mm4,"OC_J(4)"\n\t"  /* store NR4 at J4 */ \
- "  psraw         $4,     %mm5\n\t"  /* r5 = NR5 */ \
- "  movq        %mm3,"OC_I(3)"\n\t"  /* store NR3 at I3 */ \
- "  psubw       %mm0,     %mm7\n\t"  /* r7 = R7 = G. - C. */ \
- "  paddw     "OC_8",     %mm7\n\t" \
- "  paddw       %mm0,     %mm0\n\t"  /* r0 = C. + C. */ \
- "  paddw       %mm7,     %mm0\n\t"  /* r0 = R0 = G. + C. */ \
- "  psraw         $4,     %mm7\n\t"  /* r7 = NR7 */ \
- "  movq        %mm6,"OC_J(6)"\n\t"  /* store NR6 at J6 */ \
- "  psraw         $4,     %mm0\n\t"  /* r0 = NR0 */ \
- "  movq        %mm5,"OC_J(5)"\n\t"  /* store NR5 at J5 */ \
- "  movq        %mm7,"OC_J(7)"\n\t"  /* store NR7 at J7 */ \
- "  movq        %mm0,"OC_I(0)"\n\t"  /* store NR0 at I0 */ \
- "  #end OC_COLUMN_IDCT\n\t" \
-)
+#define OC_COLUMN_IDCT \
+  "#OC_COLUMN_IDCT\n" \
+  OC_IDCT_BEGIN \
+  "paddw "OC_8",%%mm2\n\t" \
+  /*r1=H'+H'*/ \
+  "paddw %%mm1,%%mm1\n\t" \
+  /*r1=R1=A''+H'*/ \
+  "paddw %%mm2,%%mm1\n\t" \
+  /*r2=NR2*/ \
+  "psraw $4,%%mm2\n\t" \
+  /*r4=E'=E-G*/ \
+  "psubw %%mm7,%%mm4\n\t" \
+  /*r1=NR1*/ \
+  "psraw $4,%%mm1\n\t" \
+  /*r3=D'*/ \
+  "movq "OC_I(2)",%%mm3\n\t" \
+  /*r7=G+G*/ \
+  "paddw %%mm7,%%mm7\n\t" \
+  /*Store NR2 at I(2).*/ \
+  "movq %%mm2,"OC_I(2)"\n\t" \
+  /*r7=G'=E+G*/ \
+  "paddw %%mm4,%%mm7\n\t" \
+  /*Store NR1 at I(1).*/ \
+  "movq %%mm1,"OC_I(1)"\n\t" \
+  /*r4=R4=E'-D'*/ \
+  "psubw %%mm3,%%mm4\n\t" \
+  "paddw "OC_8",%%mm4\n\t" \
+  /*r3=D'+D'*/ \
+  "paddw %%mm3,%%mm3\n\t" \
+  /*r3=R3=E'+D'*/ \
+  "paddw %%mm4,%%mm3\n\t" \
+  /*r4=NR4*/ \
+  "psraw $4,%%mm4\n\t" \
+  /*r6=R6=F'-B''*/ \
+  "psubw %%mm5,%%mm6\n\t" \
+  /*r3=NR3*/ \
+  "psraw $4,%%mm3\n\t" \
+  "paddw "OC_8",%%mm6\n\t" \
+  /*r5=B''+B''*/ \
+  "paddw %%mm5,%%mm5\n\t" \
+  /*r5=R5=F'+B''*/ \
+  "paddw %%mm6,%%mm5\n\t" \
+  /*r6=NR6*/ \
+  "psraw $4,%%mm6\n\t" \
+  /*Store NR4 at J(4).*/ \
+  "movq %%mm4,"OC_J(4)"\n\t" \
+  /*r5=NR5*/ \
+  "psraw $4,%%mm5\n\t" \
+  /*Store NR3 at I(3).*/ \
+  "movq %%mm3,"OC_I(3)"\n\t" \
+  /*r7=R7=G'-C'*/ \
+  "psubw %%mm0,%%mm7\n\t" \
+  "paddw "OC_8",%%mm7\n\t" \
+  /*r0=C'+C'*/ \
+  "paddw %%mm0,%%mm0\n\t" \
+  /*r0=R0=G'+C'*/ \
+  "paddw %%mm7,%%mm0\n\t" \
+  /*r7=NR7*/ \
+  "psraw $4,%%mm7\n\t" \
+  /*Store NR6 at J(6).*/ \
+  "movq %%mm6,"OC_J(6)"\n\t" \
+  /*r0=NR0*/ \
+  "psraw $4,%%mm0\n\t" \
+  /*Store NR5 at J(5).*/ \
+  "movq %%mm5,"OC_J(5)"\n\t" \
+  /*Store NR7 at J(7).*/ \
+  "movq %%mm7,"OC_J(7)"\n\t" \
+  /*Store NR0 at I(0).*/ \
+  "movq %%mm0,"OC_I(0)"\n\t" \
+  "#end OC_COLUMN_IDCT\n\t" \
 
-#if (defined(__amd64__) || defined(__x86_64__))
-# define OC_MID_REG "%rcx"
-# define OC_Y_REG   "%rdx"
-#else
-# define OC_MID_REG "%ecx"
-# define OC_Y_REG   "%edx"
-#endif
-#define OC_MID(_m,_i) OC_M2STR(_m+(_i)*8)"("OC_MID_REG")"
+#define OC_MID(_m,_i) OC_M2STR(_m+(_i)*8)"(%[c])"
 #define OC_M(_i)      OC_MID(OC_MASK_OFFSET,_i)
 #define OC_C(_i)      OC_MID(OC_COSINE_OFFSET,_i-1)
 #define OC_8          OC_MID(OC_EIGHT_OFFSET,0)
 
 void oc_idct8x8_mmx(ogg_int16_t _y[64]){
-/*This routine accepts an 8x8 matrix, but in transposed form.
-  Every 4x4 submatrix is transposed.*/
+  /*This routine accepts an 8x8 matrix, but in transposed form.
+    Every 4x4 submatrix is transposed.*/
   __asm__ __volatile__(
-   ""
-   :
-   :"d" (_y),
-    "c" (OC_IDCT_CONSTS)
-  );
-#define OC_I(_k)      OC_M2STR((_k*16))"("OC_Y_REG")"
-#define OC_J(_k)      OC_M2STR(((_k-4)*16)+8)"("OC_Y_REG")"
-  OC_ROW_IDCT;
-  OC_TRANSPOSE;
+#define OC_I(_k)      OC_M2STR((_k*16))"(%[y])"
+#define OC_J(_k)      OC_M2STR(((_k-4)*16)+8)"(%[y])"
+    OC_ROW_IDCT
+    OC_TRANSPOSE
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      OC_M2STR((_k*16)+64)"("OC_Y_REG")"
-#define OC_J(_k)      OC_M2STR(((_k-4)*16)+72)"("OC_Y_REG")"
-  OC_ROW_IDCT;
-  OC_TRANSPOSE;
+#define OC_I(_k)      OC_M2STR((_k*16)+64)"(%[y])"
+#define OC_J(_k)      OC_M2STR(((_k-4)*16)+72)"(%[y])"
+    OC_ROW_IDCT
+    OC_TRANSPOSE
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      OC_M2STR((_k*16))"("OC_Y_REG")"
+#define OC_I(_k)      OC_M2STR((_k*16))"(%[y])"
 #define OC_J(_k)      OC_I(_k)
-  OC_COLUMN_IDCT;
+    OC_COLUMN_IDCT
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      OC_M2STR((_k*16)+8)"("OC_Y_REG")"
+#define OC_I(_k)      OC_M2STR((_k*16)+8)"(%[y])"
 #define OC_J(_k)      OC_I(_k)
-  OC_COLUMN_IDCT;
+    OC_COLUMN_IDCT
 #undef  OC_I
 #undef  OC_J
-  __asm__ __volatile__(
-   " emms\n\t"
+    "emms\n\t"
+    :
+    :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
   );
 }
 
 /*25 cycles.*/
 #define OC_IDCT_BEGIN_10 \
- "  #OC_IDCT_BEGIN_10\n\t" \
- "  movq   "OC_I(3)",     %mm2\n\t" \
- "  nop\n\t" \
- "  movq   "OC_C(3)",     %mm6\n\t" \
- "  movq        %mm2,     %mm4\n\t" \
- "  movq   "OC_C(5)",     %mm1\n\t" \
- "  pmulhw      %mm6,     %mm4\n\t" \
- "  movq   "OC_I(1)",     %mm3\n\t" \
- "  pmulhw      %mm2,     %mm1\n\t" \
- "  movq   "OC_C(1)",     %mm0\n\t" \
- "  paddw       %mm2,     %mm4\n\t" \
- "  pxor        %mm6,     %mm6\n\t" \
- "  paddw       %mm1,     %mm2\n\t" \
- "  movq   "OC_I(2)",     %mm5\n\t" \
- "  pmulhw      %mm3,     %mm0\n\t" \
- "  movq        %mm5,     %mm1\n\t" \
- "  paddw       %mm3,     %mm0\n\t" \
- "  pmulhw "OC_C(7)",     %mm3\n\t" \
- "  psubw       %mm2,     %mm6\n\t" \
- "  pmulhw "OC_C(2)",     %mm5\n\t" \
- "  psubw       %mm4,     %mm0\n\t" \
- "  movq   "OC_I(2)",     %mm7\n\t" \
- "  paddw       %mm4,     %mm4\n\t" \
- "  paddw       %mm5,     %mm7\n\t" \
- "  paddw       %mm0,     %mm4\n\t" \
- "  pmulhw "OC_C(6)",     %mm1\n\t" \
- "  psubw       %mm6,     %mm3\n\t" \
- "  movq        %mm4,"OC_I(1)"\n\t" \
- "  paddw       %mm6,     %mm6\n\t" \
- "  movq   "OC_C(4)",     %mm4\n\t" \
- "  paddw       %mm3,     %mm6\n\t" \
- "  movq        %mm3,     %mm5\n\t" \
- "  pmulhw      %mm4,     %mm3\n\t" \
- "  movq        %mm6,"OC_I(2)"\n\t" \
- "  movq        %mm0,     %mm2\n\t" \
- "  movq   "OC_I(0)",     %mm6\n\t" \
- "  pmulhw      %mm4,     %mm0\n\t" \
- "  paddw       %mm3,     %mm5\n\t" \
- "  paddw       %mm0,     %mm2\n\t" \
- "  psubw       %mm1,     %mm5\n\t" \
- "  pmulhw      %mm4,     %mm6\n\t" \
- "  paddw  "OC_I(0)",     %mm6\n\t" \
- "  paddw       %mm1,     %mm1\n\t" \
- "  movq        %mm6,     %mm4\n\t" \
- "  paddw       %mm5,     %mm1\n\t" \
- "  psubw       %mm2,     %mm6\n\t" \
- "  paddw       %mm2,     %mm2\n\t" \
- "  movq   "OC_I(1)",     %mm0\n\t" \
- "  paddw       %mm6,     %mm2\n\t" \
- "  psubw       %mm1,     %mm2\n\t" \
- "  nop\n\t" \
- "  #end OC_IDCT_BEGIN_10\n\t"
+ "#OC_IDCT_BEGIN_10\n\t" \
+ "movq "OC_I(3)",%%mm2\n\t" \
+ "nop\n\t" \
+ "movq "OC_C(3)",%%mm6\n\t" \
+ "movq %%mm2,%%mm4\n\t" \
+ "movq "OC_C(5)",%%mm1\n\t" \
+ "pmulhw %%mm6,%%mm4\n\t" \
+ "movq "OC_I(1)",%%mm3\n\t" \
+ "pmulhw %%mm2,%%mm1\n\t" \
+ "movq "OC_C(1)",%%mm0\n\t" \
+ "paddw %%mm2,%%mm4\n\t" \
+ "pxor %%mm6,%%mm6\n\t" \
+ "paddw %%mm1,%%mm2\n\t" \
+ "movq "OC_I(2)",%%mm5\n\t" \
+ "pmulhw %%mm3,%%mm0\n\t" \
+ "movq %%mm5,%%mm1\n\t" \
+ "paddw %%mm3,%%mm0\n\t" \
+ "pmulhw "OC_C(7)",%%mm3\n\t" \
+ "psubw %%mm2,%%mm6\n\t" \
+ "pmulhw "OC_C(2)",%%mm5\n\t" \
+ "psubw %%mm4,%%mm0\n\t" \
+ "movq "OC_I(2)",%%mm7\n\t" \
+ "paddw %%mm4,%%mm4\n\t" \
+ "paddw %%mm5,%%mm7\n\t" \
+ "paddw %%mm0,%%mm4\n\t" \
+ "pmulhw "OC_C(6)",%%mm1\n\t" \
+ "psubw %%mm6,%%mm3\n\t" \
+ "movq %%mm4,"OC_I(1)"\n\t" \
+ "paddw %%mm6,%%mm6\n\t" \
+ "movq "OC_C(4)",%%mm4\n\t" \
+ "paddw %%mm3,%%mm6\n\t" \
+ "movq %%mm3,%%mm5\n\t" \
+ "pmulhw %%mm4,%%mm3\n\t" \
+ "movq %%mm6,"OC_I(2)"\n\t" \
+ "movq %%mm0,%%mm2\n\t" \
+ "movq "OC_I(0)",%%mm6\n\t" \
+ "pmulhw %%mm4,%%mm0\n\t" \
+ "paddw %%mm3,%%mm5\n\t" \
+ "paddw %%mm0,%%mm2\n\t" \
+ "psubw %%mm1,%%mm5\n\t" \
+ "pmulhw %%mm4,%%mm6\n\t" \
+ "paddw "OC_I(0)",%%mm6\n\t" \
+ "paddw %%mm1,%%mm1\n\t" \
+ "movq %%mm6,%%mm4\n\t" \
+ "paddw %%mm5,%%mm1\n\t" \
+ "psubw %%mm2,%%mm6\n\t" \
+ "paddw %%mm2,%%mm2\n\t" \
+ "movq "OC_I(1)",%%mm0\n\t" \
+ "paddw %%mm6,%%mm2\n\t" \
+ "psubw %%mm1,%%mm2\n\t" \
+ "nop\n\t" \
+ "#end OC_IDCT_BEGIN_10\n\t" \
 
 /*25+8=33 cycles.*/
-#define OC_ROW_IDCT_10 __asm__ __volatile__( \
- "  #OC_ROW_IDCT_10\n\t" \
+#define OC_ROW_IDCT_10 \
+ "#OC_ROW_IDCT_10\n\t" \
  OC_IDCT_BEGIN_10 \
- "  movq    "OC_I(2)",     %mm3\n\t" /* r3 = D. */ \
- "  psubw        %mm7,     %mm4\n\t" /* r4 = E. = E - G */ \
- "  paddw        %mm1,     %mm1\n\t" /* r1 = H. + H. */ \
- "  paddw        %mm7,     %mm7\n\t" /* r7 = G + G */ \
- "  paddw        %mm2,     %mm1\n\t" /* r1 = R1 = A.. + H. */ \
- "  paddw        %mm4,     %mm7\n\t" /* r7 = G. = E + G */ \
- "  psubw        %mm3,     %mm4\n\t" /* r4 = R4 = E. - D. */ \
- "  paddw        %mm3,     %mm3\n\t" \
- "  psubw        %mm5,     %mm6\n\t" /* r6 = R6 = F. - B.. */ \
- "  paddw        %mm5,     %mm5\n\t" \
- "  paddw        %mm4,     %mm3\n\t" /* r3 = R3 = E. + D. */ \
- "  paddw        %mm6,     %mm5\n\t" /* r5 = R5 = F. + B.. */ \
- "  psubw        %mm0,     %mm7\n\t" /* r7 = R7 = G. - C. */ \
- "  paddw        %mm0,     %mm0\n\t" \
- "  movq         %mm1,"OC_I(1)"\n\t" /* save R1 */ \
- "  paddw        %mm7,     %mm0\n\t" /* r0 = R0 = G. + C. */ \
+ /*r3=D'*/ \
+ "movq "OC_I(2)",%%mm3\n\t" \
+ /*r4=E'=E-G*/ \
+ "psubw %%mm7,%%mm4\n\t" \
+ /*r1=H'+H'*/ \
+ "paddw %%mm1,%%mm1\n\t" \
+ /*r7=G+G*/ \
+ "paddw %%mm7,%%mm7\n\t" \
+ /*r1=R1=A''+H'*/ \
+ "paddw %%mm2,%%mm1\n\t" \
+ /*r7=G'=E+G*/ \
+ "paddw %%mm4,%%mm7\n\t" \
+ /*r4=R4=E'-D'*/ \
+ "psubw %%mm3,%%mm4\n\t" \
+ "paddw %%mm3,%%mm3\n\t" \
+ /*r6=R6=F'-B''*/ \
+ "psubw %%mm5,%%mm6\n\t" \
+ "paddw %%mm5,%%mm5\n\t" \
+ /*r3=R3=E'+D'*/ \
+ "paddw %%mm4,%%mm3\n\t" \
+ /*r5=R5=F'+B''*/ \
+ "paddw %%mm6,%%mm5\n\t" \
+ /*r7=R7=G'-C'*/ \
+ "psubw %%mm0,%%mm7\n\t" \
+ "paddw %%mm0,%%mm0\n\t" \
+ /*Save R1.*/ \
+ "movq %%mm1,"OC_I(1)"\n\t" \
+ /*r0=R0=G'+C'*/ \
+ "paddw %%mm7,%%mm0\n\t" \
  "#end OC_ROW_IDCT_10\n\t" \
-)
 
-/*25+19=44 cycles.*/
-#define OC_COLUMN_IDCT_10 __asm__ __volatile__( \
- "  #OC_COLUMN_IDCT_10\n\t" \
+/*25+19=44 cycles.*/
+#define OC_COLUMN_IDCT_10 \
+ "#OC_COLUMN_IDCT_10\n\t" \
  OC_IDCT_BEGIN_10 \
- "  paddw     "OC_8",     %mm2\n\t" \
- "  paddw       %mm1,     %mm1\n\t" /* r1 = H. + H. */ \
- "  paddw       %mm2,     %mm1\n\t" /* r1 = R1 = A.. + H. */ \
- "  psraw         $4,     %mm2\n\t" /* r2 = NR2 */ \
- "  psubw       %mm7,     %mm4\n\t" /* r4 = E. = E - G */ \
- "  psraw         $4,     %mm1\n\t" /* r1 = NR1 */ \
- "  movq   "OC_I(2)",     %mm3\n\t" /* r3 = D. */ \
- "  paddw       %mm7,     %mm7\n\t" /* r7 = G + G */ \
- "  movq        %mm2,"OC_I(2)"\n\t" /* store NR2 at I2 */ \
- "  paddw       %mm4,     %mm7\n\t" /* r7 = G. = E + G */ \
- "  movq        %mm1,"OC_I(1)"\n\t" /* store NR1 at I1 */ \
- "  psubw       %mm3,     %mm4\n\t" /* r4 = R4 = E. - D. */ \
- "  paddw     "OC_8",     %mm4\n\t" \
- "  paddw       %mm3,     %mm3\n\t" /* r3 = D. + D. */ \
- "  paddw       %mm4,     %mm3\n\t" /* r3 = R3 = E. + D. */ \
- "  psraw         $4,     %mm4\n\t" /* r4 = NR4 */ \
- "  psubw       %mm5,     %mm6\n\t" /* r6 = R6 = F. - B.. */ \
- "  psraw         $4,     %mm3\n\t" /* r3 = NR3 */ \
- "  paddw     "OC_8",     %mm6\n\t" \
- "  paddw       %mm5,     %mm5\n\t" /* r5 = B.. + B.. */ \
- "  paddw       %mm6,     %mm5\n\t" /* r5 = R5 = F. + B.. */ \
- "  psraw         $4,     %mm6\n\t" /* r6 = NR6 */ \
- "  movq        %mm4,"OC_J(4)"\n\t" /* store NR4 at J4 */ \
- "  psraw         $4,     %mm5\n\t" /* r5 = NR5 */ \
- "  movq        %mm3,"OC_I(3)"\n\t" /* store NR3 at I3 */ \
- "  psubw       %mm0,     %mm7\n\t" /* r7 = R7 = G. - C. */ \
- "  paddw     "OC_8",     %mm7\n\t" \
- "  paddw       %mm0,     %mm0\n\t" /* r0 = C. + C. */ \
- "  paddw       %mm7,     %mm0\n\t" /* r0 = R0 = G. + C. */ \
- "  psraw         $4,     %mm7\n\t" /* r7 = NR7 */ \
- "  movq        %mm6,"OC_J(6)"\n\t" /* store NR6 at J6 */ \
- "  psraw         $4,     %mm0\n\t" /* r0 = NR0 */ \
- "  movq        %mm5,"OC_J(5)"\n\t" /* store NR5 at J5 */ \
- "  movq        %mm7,"OC_J(7)"\n\t" /* store NR7 at J7 */ \
- "  movq        %mm0,"OC_I(0)"\n\t" /* store NR0 at I0 */ \
- "  #end OC_COLUMN_IDCT_10\n\t" \
-)
+ "paddw "OC_8",%%mm2\n\t" \
+ /*r1=H'+H'*/ \
+ "paddw %%mm1,%%mm1\n\t" \
+ /*r1=R1=A''+H'*/ \
+ "paddw %%mm2,%%mm1\n\t" \
+ /*r2=NR2*/ \
+ "psraw $4,%%mm2\n\t" \
+ /*r4=E'=E-G*/ \
+ "psubw %%mm7,%%mm4\n\t" \
+ /*r1=NR1*/ \
+ "psraw $4,%%mm1\n\t" \
+ /*r3=D'*/ \
+ "movq "OC_I(2)",%%mm3\n\t" \
+ /*r7=G+G*/ \
+ "paddw %%mm7,%%mm7\n\t" \
+ /*Store NR2 at I(2).*/ \
+ "movq %%mm2,"OC_I(2)"\n\t" \
+ /*r7=G'=E+G*/ \
+ "paddw %%mm4,%%mm7\n\t" \
+ /*Store NR1 at I(1).*/ \
+ "movq %%mm1,"OC_I(1)"\n\t" \
+ /*r4=R4=E'-D'*/ \
+ "psubw %%mm3,%%mm4\n\t" \
+ "paddw "OC_8",%%mm4\n\t" \
+ /*r3=D'+D'*/ \
+ "paddw %%mm3,%%mm3\n\t" \
+ /*r3=R3=E'+D'*/ \
+ "paddw %%mm4,%%mm3\n\t" \
+ /*r4=NR4*/ \
+ "psraw $4,%%mm4\n\t" \
+ /*r6=R6=F'-B''*/ \
+ "psubw %%mm5,%%mm6\n\t" \
+ /*r3=NR3*/ \
+ "psraw $4,%%mm3\n\t" \
+ "paddw "OC_8",%%mm6\n\t" \
+ /*r5=B''+B''*/ \
+ "paddw %%mm5,%%mm5\n\t" \
+ /*r5=R5=F'+B''*/ \
+ "paddw %%mm6,%%mm5\n\t" \
+ /*r6=NR6*/ \
+ "psraw $4,%%mm6\n\t" \
+ /*Store NR4 at J(4).*/ \
+ "movq %%mm4,"OC_J(4)"\n\t" \
+ /*r5=NR5*/ \
+ "psraw $4,%%mm5\n\t" \
+ /*Store NR3 at I(3).*/ \
+ "movq %%mm3,"OC_I(3)"\n\t" \
+ /*r7=R7=G'-C'*/ \
+ "psubw %%mm0,%%mm7\n\t" \
+ "paddw "OC_8",%%mm7\n\t" \
+ /*r0=C'+C'*/ \
+ "paddw %%mm0,%%mm0\n\t" \
+ /*r0=R0=G'+C'*/ \
+ "paddw %%mm7,%%mm0\n\t" \
+ /*r7=NR7*/ \
+ "psraw $4,%%mm7\n\t" \
+ /*Store NR6 at J(6).*/ \
+ "movq %%mm6,"OC_J(6)"\n\t" \
+ /*r0=NR0*/ \
+ "psraw $4,%%mm0\n\t" \
+ /*Store NR5 at J(5).*/ \
+ "movq %%mm5,"OC_J(5)"\n\t" \
+ /*Store NR7 at J(7).*/ \
+ "movq %%mm7,"OC_J(7)"\n\t" \
+ /*Store NR0 at I(0).*/ \
+ "movq %%mm0,"OC_I(0)"\n\t" \
+ "#end OC_COLUMN_IDCT_10\n\t" \
 
 void oc_idct8x8_10_mmx(ogg_int16_t _y[64]){
   __asm__ __volatile__(
-   ""
-   :
-   :"d" (_y),
-   "c" (OC_IDCT_CONSTS)
-  );
-#define OC_I(_k) OC_M2STR((_k*16))"("OC_Y_REG")"
-#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"("OC_Y_REG")"
-  /*Done with dequant, descramble, and partial transpose.
-    Now do the iDCT itself.*/
-  OC_ROW_IDCT_10;
-  OC_TRANSPOSE;
+#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
+#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])"
+    /*Done with dequant, descramble, and partial transpose.
+      Now do the iDCT itself.*/
+    OC_ROW_IDCT_10
+    OC_TRANSPOSE
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k) OC_M2STR((_k*16))"("OC_Y_REG")"
+#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
 #define OC_J(_k) OC_I(_k)
-  OC_COLUMN_IDCT_10;
+    OC_COLUMN_IDCT_10
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k) OC_M2STR((_k*16)+8)"("OC_Y_REG")"
+#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])"
 #define OC_J(_k) OC_I(_k)
-  OC_COLUMN_IDCT_10;
+    OC_COLUMN_IDCT_10
 #undef  OC_I
 #undef  OC_J
-  __asm__ __volatile__(
-   " emms\n\t"
+    "emms\n\t"
+    :
+    :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
   );
 }
 #endif

Modified: trunk/theora/lib/dec/x86/mmxstate.c
===================================================================
--- trunk/theora/lib/dec/x86/mmxstate.c	2007-04-16 00:05:14 UTC (rev 12872)
+++ trunk/theora/lib/dec/x86/mmxstate.c	2007-04-16 01:32:17 UTC (rev 12873)
@@ -22,18 +22,10 @@
 
 #if defined(USE_ASM)
 
-static const __attribute__((aligned(8),used)) ogg_int64_t V3=
+static const __attribute__((aligned(8),used)) ogg_int64_t OC_V3=
  0x0003000300030003LL; 
-static const __attribute__((aligned(8),used)) ogg_int64_t V4=
+static const __attribute__((aligned(8),used)) ogg_int64_t OC_V4=
  0x0004000400040004LL; 
-static const __attribute__((aligned(8),used)) ogg_int64_t V100=
- 0x0100010001000100LL;
-  
-#if defined(__APPLE__)
-#define MANGLE(x) "_"#x
-#else
-#define MANGLE(x) #x
-#endif
 
 static const __attribute__((aligned(8),used)) int OC_FZIG_ZAGMMX[64]={
    0, 8, 1, 2, 9,16,24,17,
@@ -55,7 +47,6 @@
   int dst_framei;
   int dst_ystride;
   int zzi;
-  int ci;
   /*_last_zzi is subtly different from an actual count of the number of
      coefficients we decoded for this block.
     It contains the value of zzi BEFORE the final token in the block was
@@ -82,41 +73,41 @@
     Needless to say we inherited this approach from VP3.*/
   /*Special case only having a DC component.*/
   if(_last_zzi<2){
-    ogg_int16_t p;
+    ogg_uint16_t p;
     /*Why is the iquant product rounded in this case and no others?
       Who knows.*/
     p=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
-    /*for(ci=0;ci<64;ci++)res_buf[ci]=p;*/
-    /*This could also be done with MMX 2.*/
+    /*Fill res_buf with p.*/
     __asm__ __volatile__(
-     "  movzwl    %1,   %%eax\n\t"
-     "  movd   %%eax,   %%mm0\n\t" /* XXXX XXXX 0000 AAAA */
-     "  movq   %%mm0,   %%mm1\n\t" /* XXXX XXXX 0000 AAAA */
-     "  pslld    $16,   %%mm1\n\t" /* XXXX XXXX AAAA 0000 */
-     "  por    %%mm0,   %%mm1\n\t" /* XXXX XXXX AAAA AAAA */
-     "  movq   %%mm1,   %%mm0\n\t" /* XXXX XXXX AAAA AAAA */
-     "  psllq    $32,   %%mm1\n\t" /* AAAA AAAA 0000 0000 */
-     "  por    %%mm1,   %%mm0\n\t" /* AAAA AAAA AAAA AAAA */
-     "  movq   %%mm0,    (%0)\n\t"
-     "  movq   %%mm0,   8(%0)\n\t"
-     "  movq   %%mm0,  16(%0)\n\t"
-     "  movq   %%mm0,  24(%0)\n\t"
-     "  movq   %%mm0,  32(%0)\n\t"
-     "  movq   %%mm0,  40(%0)\n\t"
-     "  movq   %%mm0,  48(%0)\n\t"
-     "  movq   %%mm0,  56(%0)\n\t"
-     "  movq   %%mm0,  64(%0)\n\t"
-     "  movq   %%mm0,  72(%0)\n\t"
-     "  movq   %%mm0,  80(%0)\n\t"
-     "  movq   %%mm0,  88(%0)\n\t"
-     "  movq   %%mm0,  96(%0)\n\t"
-     "  movq   %%mm0, 104(%0)\n\t"
-     "  movq   %%mm0, 112(%0)\n\t"
-     "  movq   %%mm0, 120(%0)\n\t"
-     :
-     :"r" (res_buf),
-      "r" (p)
-     :"memory"
+      /*mm0=0000 0000 0000 AAAA*/
+      "movd %[p],%%mm0\n\t"
+      /*mm1=0000 0000 0000 AAAA*/
+      "movd %[p],%%mm1\n\t"
+      /*mm0=0000 0000 AAAA 0000*/
+      "pslld $16,%%mm0\n\t"
+      /*mm0=0000 0000 AAAA AAAA*/
+      "por %%mm1,%%mm0\n\t"
+      /*mm0=AAAA AAAA AAAA AAAA*/
+      "punpcklwd %%mm0,%%mm0\n\t"
+      "movq %%mm0,(%[res_buf])\n\t"
+      "movq %%mm0,8(%[res_buf])\n\t"
+      "movq %%mm0,16(%[res_buf])\n\t"
+      "movq %%mm0,24(%[res_buf])\n\t"
+      "movq %%mm0,32(%[res_buf])\n\t"
+      "movq %%mm0,40(%[res_buf])\n\t"
+      "movq %%mm0,48(%[res_buf])\n\t"
+      "movq %%mm0,56(%[res_buf])\n\t"
+      "movq %%mm0,64(%[res_buf])\n\t"
+      "movq %%mm0,72(%[res_buf])\n\t"
+      "movq %%mm0,80(%[res_buf])\n\t"
+      "movq %%mm0,88(%[res_buf])\n\t"
+      "movq %%mm0,96(%[res_buf])\n\t"
+      "movq %%mm0,104(%[res_buf])\n\t"
+      "movq %%mm0,112(%[res_buf])\n\t"
+      "movq %%mm0,120(%[res_buf])\n\t"
+      :
+      :[res_buf]"r"(res_buf),[p]"r"((unsigned)p)
+      :"memory"
     );
   }
   else{
@@ -125,26 +116,26 @@
     /*First zero the buffer.*/
     /*On K7, etc., this could be replaced with movntq and sfence.*/
     __asm__ __volatile__(
-     "  pxor %%mm0,   %%mm0\n\t"
-     "  movq %%mm0,    (%0)\n\t"
-     "  movq %%mm0,   8(%0)\n\t"
-     "  movq %%mm0,  16(%0)\n\t"
-     "  movq %%mm0,  24(%0)\n\t"
-     "  movq %%mm0,  32(%0)\n\t"
-     "  movq %%mm0,  40(%0)\n\t"
-     "  movq %%mm0,  48(%0)\n\t"
-     "  movq %%mm0,  56(%0)\n\t"
-     "  movq %%mm0,  64(%0)\n\t"
-     "  movq %%mm0,  72(%0)\n\t"
-     "  movq %%mm0,  80(%0)\n\t"
-     "  movq %%mm0,  88(%0)\n\t"
-     "  movq %%mm0,  96(%0)\n\t"
-     "  movq %%mm0, 104(%0)\n\t"
-     "  movq %%mm0, 112(%0)\n\t"
-     "  movq %%mm0, 120(%0)\n\t"
-     :
-     :"r" (res_buf)
-     :"memory"
+      "pxor %%mm0,%%mm0\n\t"
+      "movq %%mm0,(%[res_buf])\n\t"
+      "movq %%mm0,8(%[res_buf])\n\t"
+      "movq %%mm0,16(%[res_buf])\n\t"
+      "movq %%mm0,24(%[res_buf])\n\t"
+      "movq %%mm0,32(%[res_buf])\n\t"
+      "movq %%mm0,40(%[res_buf])\n\t"
+      "movq %%mm0,48(%[res_buf])\n\t"
+      "movq %%mm0,56(%[res_buf])\n\t"
+      "movq %%mm0,64(%[res_buf])\n\t"
+      "movq %%mm0,72(%[res_buf])\n\t"
+      "movq %%mm0,80(%[res_buf])\n\t"
+      "movq %%mm0,88(%[res_buf])\n\t"
+      "movq %%mm0,96(%[res_buf])\n\t"
+      "movq %%mm0,104(%[res_buf])\n\t"
+      "movq %%mm0,112(%[res_buf])\n\t"
+      "movq %%mm0,120(%[res_buf])\n\t"
+      :
+      :[res_buf]"r"(res_buf)
+      :"memory"
     );
     res_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
     /*This is planned to be rewritten in MMX.*/
@@ -154,12 +145,8 @@
       res_buf[OC_FZIG_ZAGMMX[zzi]]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]*
        _ac_iquant[ci]);
     }
-    if(_last_zzi<10){
-      oc_idct8x8_10_mmx(res_buf);
-    }
-    else{
-      oc_idct8x8_mmx(res_buf);
-    }
+    if(_last_zzi<10)oc_idct8x8_10_mmx(res_buf);
+    else oc_idct8x8_mmx(res_buf);
   }
   /*Fill in the target buffer.*/
   dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
@@ -201,9 +188,9 @@
   const int *fragi;
   const int *fragi_end;
   int        dst_framei;
-  int        dst_ystride;
+  long       dst_ystride;
   int        src_framei;
-  int        src_ystride;
+  long       src_ystride;
   dst_framei=_state->ref_frame_idx[_dst_frame];
   src_framei=_state->ref_frame_idx[_src_frame];
   dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
@@ -213,228 +200,346 @@
     oc_fragment   *frag;
     unsigned char *dst;
     unsigned char *src;
+    long           esi;
     frag=_state->frags+*fragi;
     dst=frag->buffer[dst_framei];
     src=frag->buffer[src_framei];
-#if (defined(__amd64__) || defined(__x86_64__))
     __asm__ __volatile__(
-     "  lea         (%3, %3, 2), %%rsi   \n\t"  /* esi=src_stride*3 */
-     "  movq        (%1),        %%mm0   \n\t"  /* src */
-     "  lea         (%2, %2, 2), %%rdi   \n\t"  /* edi=dst_stride*3 */
-     "  movq        (%1, %3),    %%mm1   \n\t"  /* src+1x stride */
-     "  movq        (%1, %3, 2), %%mm2   \n\t"  /* src+2x stride */
-     "  movq        (%1, %%rsi), %%mm3   \n\t"  /* src+3x stride */
-     "  movq        %%mm0,       (%0)    \n\t"  /* dst */
-     "  movq        %%mm1,       (%0, %2)\n\t"  /* dst+dst_stride */
-     "  lea         (%1,%3,4),   %1      \n\t"  /* pointer to next 4 */
-     "  movq        %%mm2,       (%0, %2, 2)      \n\t"  /*dst+2x dst_stride */
-     "  movq        %%mm3,       (%0, %%rdi)      \n\t"  /* 3x */
-     "  lea         (%0,%2,4),   %0      \n\t"  /* pointer to next 4 */
-     "  movq        (%1),        %%mm0   \n\t"  /* src */
-     "  movq        (%1, %3),    %%mm1   \n\t"  /* src+1x stride */
-     "  movq        (%1, %3, 2), %%mm2   \n\t"  /* src+2x stride */
-     "  movq        (%1, %%rsi), %%mm3   \n\t"  /* src+3x stride */
-     "  movq        %%mm0,       (%0)    \n\t"  /* dst */
-     "  movq        %%mm1,       (%0, %2)\n\t"  /* dst+dst_stride */
-     "  movq        %%mm2,       (%0, %2, 2)     \n\t"  /* dst+2x dst_stride */
-     "  movq        %%mm3,       (%0, %%rdi)     \n\t"  /* 3x */
-     :"+r" (dst) /* 0 */
-     :"r" (src),  /* 1 */
-      "r" ((long)dst_ystride), /* 2 */
-      "r" ((long)src_ystride) /* 3 */
-     :"memory", "rsi","rdi"
+      /*src+0*src_ystride*/
+      "movq (%[src]),%%mm0\n\t"
+      /*esi=src_ystride*3*/
+      "lea (%[src_ystride],%[src_ystride],2),%[s]\n\t"
+      /*src+1*src_ystride*/
+      "movq (%[src],%[src_ystride]),%%mm1\n\t"
+      /*src+2*src_ystride*/
+      "movq (%[src],%[src_ystride],2),%%mm2\n\t"
+      /*src+3*src_ystride*/
+      "movq (%[src],%[s]),%%mm3\n\t"
+      /*dst+0*dst_ystride*/
+      "movq %%mm0,(%[dst])\n\t"
+      /*esi=dst_ystride*3*/
+      "lea (%[dst_ystride],%[dst_ystride],2),%[s]\n\t"
+      /*dst+1*dst_ystride*/
+      "movq %%mm1,(%[dst],%[dst_ystride])\n\t"
+      /*Pointer to next 4.*/
+      "lea (%[src],%[src_ystride],4),%[src]\n\t"
+      /*dst+2*dst_ystride*/
+      "movq %%mm2,(%[dst],%[dst_ystride],2)\n\t"
+      /*dst+3*dst_ystride*/
+      "movq %%mm3,(%[dst],%[s])\n\t"
+      /*Pointer to next 4.*/
+      "lea (%[dst],%[dst_ystride],4),%[dst]\n\t"
+      /*src+0*src_ystride*/
+      "movq (%[src]),%%mm0\n\t"
+      /*esi=src_ystride*3*/
+      "lea (%[src_ystride],%[src_ystride],2),%[s]\n\t"
+      /*src+1*src_ystride*/
+      "movq (%[src],%[src_ystride]),%%mm1\n\t"
+      /*src+2*src_ystride*/
+      "movq (%[src],%[src_ystride],2),%%mm2\n\t"
+      /*src+3*src_ystride*/
+      "movq (%[src],%[s]),%%mm3\n\t"
+      /*dst+0*dst_ystride*/
+      "movq %%mm0,(%[dst])\n\t"
+      /*esi=dst_ystride*3*/
+      "lea (%[dst_ystride],%[dst_ystride],2),%[s]\n\t"
+      /*dst+1*dst_ystride*/
+      "movq %%mm1,(%[dst],%[dst_ystride])\n\t"
+      /*dst+2*dst_ystride*/
+      "movq %%mm2,(%[dst],%[dst_ystride],2)\n\t"
+      /*dst+3*dst_ystride*/
+      "movq %%mm3,(%[dst],%[s])\n\t"
+      :[s]"=&S"(esi)
+      :[dst]"r"(dst),[src]"r"(src),[dst_ystride]"r"(dst_ystride),
+       [src_ystride]"r"(src_ystride)
+      :"memory"
     );
   }
-#else
-    __asm__ __volatile__(
-     "  lea         (%3, %3, 2), %%esi   \n\t"  /* esi=src_stride*3 */
-     "  movq        (%1),        %%mm0   \n\t"  /* src */
-     "  lea         (%2, %2, 2), %%edi   \n\t"  /* edi=dst_stride*3 */
-     "  movq        (%1, %3),    %%mm1   \n\t"  /* src+1x stride */
-     "  movq        (%1, %3, 2), %%mm2   \n\t"  /* src+2x stride */
-     "  movq        (%1, %%esi), %%mm3   \n\t"  /* src+3x stride */
-     "  movq        %%mm0,       (%0)    \n\t"  /* dst */
-     "  movq        %%mm1,       (%0, %2)\n\t"  /* dst+dst_stride */
-     "  lea         (%1,%3,4),   %1      \n\t"  /* pointer to next 4 */
-     "  movq        %%mm2,       (%0, %2, 2)      \n\t"  /*dst+2x dst_stride */
-     "  movq        %%mm3,       (%0, %%edi)      \n\t"  /* 3x */
-     "  lea         (%0,%2,4),   %0      \n\t"  /* pointer to next 4 */
-     "  movq        (%1),        %%mm0   \n\t"  /* src */
-     "  movq        (%1, %3),    %%mm1   \n\t"  /* src+1x stride */
-     "  movq        (%1, %3, 2), %%mm2   \n\t"  /* src+2x stride */
-     "  movq        (%1, %%esi), %%mm3   \n\t"  /* src+3x stride */
-     "  movq        %%mm0,       (%0)    \n\t"  /* dst */
-     "  movq        %%mm1,       (%0, %2)\n\t"  /* dst+dst_stride */
-     "  movq        %%mm2,       (%0, %2, 2)     \n\t"  /* dst+2x dst_stride */
-     "  movq        %%mm3,       (%0, %%edi)     \n\t"  /* 3x */
-     :"+r" (dst) /* 0 */
-     :"r" (src),  /* 1 */
-      "r" (dst_ystride), /* 2 */
-      "r" (src_ystride) /* 3 */
-     :"memory", "esi","edi"
-    );
-  }
-#endif
   /*This needs to be removed when decode specific functions are implemented:*/
   __asm__ __volatile__("emms\n\t");
 }
 
-static void loop_filter_v_mmx(unsigned char *_pix,int _ystride,int *_bv){
+static void loop_filter_v(unsigned char *_pix,int _ystride,int *_bv){
+  long esi;
+  long edi;
   _pix-=_ystride*2;
-  
   __asm__ __volatile__(
-    "pxor %%mm0,%%mm0\n"  	/* mm0 = 0 */
-    "movq (%0),%%mm7\n"	/* mm7 = _pix[0..8] */
-    "lea (%1,%1,2),%%esi\n"	/* esi = _ystride*3 */
-    "movq (%0,%%esi),%%mm4\n" /* mm4 = _pix[0..8]+_ystride*3] */
-    "movq %%mm7,%%mm6\n"	/* mm6 = _pix[0..8] */
-    "punpcklbw %%mm0,%%mm6\n" /* expand unsigned _pix[0..3] to 16 bits */
-    "movq %%mm4,%%mm5\n"
-    "punpckhbw %%mm0,%%mm7\n" /* expand unsigned _pix[4..8] to 16 bits */
-    "punpcklbw %%mm0,%%mm4\n" /* expand other arrays too */
-    "punpckhbw %%mm0,%%mm5\n"
-    "psubw %%mm4,%%mm6\n" /* mm6 = mm6 - mm4 */
-    "psubw %%mm5,%%mm7\n" /* mm7 = mm7 - mm5 */
-    			/* mm7:mm6 = _p[0]-_p[_ystride*3] */
-    "movq (%0,%1),%%mm4\n"   /* mm4 = _pix[0..8+_ystride] */
-    "movq %%mm4,%%mm5\n"
-    "movq (%0,%1,2),%%mm2\n" /* mm2 = _pix[0..8]+_ystride*2] */
-    "movq %%mm2,%%mm3\n"
-    "movq %%mm2,%%mm1\n" //ystride*2
-    "punpckhbw %%mm0,%%mm5\n"
-    "punpcklbw %%mm0,%%mm4\n" 
-    "punpckhbw %%mm0,%%mm3\n"
-    "punpcklbw %%mm0,%%mm2\n"
-    "psubw %%mm5,%%mm3\n" 
-    "psubw %%mm4,%%mm2\n" 
-    			/* mm3:mm2 = (_pix[_ystride*2]-_pix[_ystride]); */
-    "PMULLW "MANGLE(V3)",%%mm3\n" 		/* *3 */
-    "PMULLW "MANGLE(V3)",%%mm2\n" 		/* *3 */
-    "paddw %%mm7,%%mm3\n"   /* highpart */
-    "paddw %%mm6,%%mm2\n"/* lowpart of _pix[0]-_pix[_ystride*3]+3*(_pix[_ystride*2]-_pix[_ystride]);  */
-    "paddw "MANGLE(V4)",%%mm3\n"  /* add 4 */
-    "paddw "MANGLE(V4)",%%mm2\n"  /* add 4 */
-    "psraw $3,%%mm3\n"  /* >>3 f coefs high */
-    "psraw $3,%%mm2\n"  /* >>3 f coefs low */
-    "paddw "MANGLE(V100)",%%mm3\n"  /* add 256 */
-    "paddw "MANGLE(V100)",%%mm2\n"  /* add 256 */
+    /*mm0=0*/
+    "pxor %%mm0,%%mm0\n\t"
+    /*mm7=_pix[0...8]*/
+    "movq (%[pix]),%%mm7\n\t"
+    /*esi=_ystride*3*/
+    "lea (%[ystride],%[ystride],2),%[s]\n\t"
+    /*mm4=_pix[0...8+_ystride*3]*/
+    "movq (%[pix],%[s]),%%mm4\n\t"
+    /*mm6=_pix[0...8]*/
+    "movq %%mm7,%%mm6\n\t"
+    /*Expand unsigned _pix[0...3] to 16 bits.*/
+    "punpcklbw %%mm0,%%mm6\n\t"
+    "movq %%mm4,%%mm5\n\t"
+    /*Expand unsigned _pix[4...8] to 16 bits.*/
+    "punpckhbw %%mm0,%%mm7\n\t"
+    /*Expand other arrays too.*/
+    "punpcklbw %%mm0,%%mm4\n\t"
+    "punpckhbw %%mm0,%%mm5\n\t"
+    /*mm7:mm6=_pix[0...8]-_pix[0...8+_ystride*3]:*/
+    "psubw %%mm4,%%mm6\n\t"
+    "psubw %%mm5,%%mm7\n\t"
+    /*mm5=mm4=_pix[0...8+_ystride]*/
+    "movq (%[pix],%[ystride]),%%mm4\n\t"
+    /*mm1=mm3=mm2=_pix[0...8+_ystride*2]*/
+    "movq (%[pix],%[ystride],2),%%mm2\n\t"
+    "movq %%mm4,%%mm5\n\t"
+    "movq %%mm2,%%mm3\n\t"
+    "movq %%mm2,%%mm1\n\t"
+    /*Expand these arrays.*/
+    "punpckhbw %%mm0,%%mm5\n\t"
+    "punpcklbw %%mm0,%%mm4\n\t" 
+    "punpckhbw %%mm0,%%mm3\n\t"
+    "punpcklbw %%mm0,%%mm2\n\t"
+    /*Preload...*/
+    "movq %[OC_V3],%%mm0\n\t"
+    /*mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
+    "psubw %%mm5,%%mm3\n\t" 
+    "psubw %%mm4,%%mm2\n\t" 
+    /*Scale by 3.*/
+    "pmullw %%mm0,%%mm3\n\t"
+    "pmullw %%mm0,%%mm2\n\t"
+    /*Preload...*/
+    "movq %[OC_V4],%%mm0\n\t"
+    /*f=mm3:mm2=_pix[0...8]-_pix[0...8+_ystride*3]+
+       3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
+    "paddw %%mm7,%%mm3\n\t"
+    "paddw %%mm6,%%mm2\n\t"
+    /*Add 4.*/
+    "paddw %%mm0,%%mm3\n\t"
+    "paddw %%mm0,%%mm2\n\t"
+    /*"Divide" by 8.*/
+    "psraw $3,%%mm3\n\t"
+    "psraw $3,%%mm2\n\t"
+    /*Now perform mm7:mm6=_bv[(f+4>>3)]*/
+    /*First the low part:*/
+    /*pextrw requires MMX+/SSE.
+    "pextrw $0,%%mm2,%%esi\n\t"
+    "pextrw $1,%%mm2,%%edi\n\t"*/
+    /*We duplicate the value and pull out of two registers in parallel;
+       perhaps we should not bother with just MMX, since any processor with
+       multiple MMX units will also have SSE, and should be using that
+       instead.*/
+    "movq %%mm2,%%mm0\n\t"
+    "psrlq $16,%%mm2\n\t"
+    "movd %%mm0,%%esi\n\t"
+    "movd %%mm2,%%edi\n\t"
+    "psrlq $32,%%mm0\n\t"
+    "movsx %%si,%[s]\n\t"
+    "psrlq $32,%%mm2\n\t"
+    "movsx %%di,%[d]\n\t"
+    /*pinsrw requires MMX+/SSE.
+    "pinsrw $0,(%[bv],%[s],4),%%mm6\n\t"
+    "pinsrw $1,(%[bv],%[d],4),%%mm6\n\t"
+    "pextrw $2,%%mm2,%%esi\n\t"
+    "pextrw $3,%%mm2,%%edi\n\t"*/
+    "movd (%[bv],%[s],4),%%mm6\n\t"
+    "movd %%mm0,%%esi\n\t"
+    "movd (%[bv],%[d],4),%%mm0\n\t"
+    "movd %%mm2,%%edi\n\t"
+    "movsx %%si,%[s]\n\t"
+    "movsx %%di,%[d]\n\t"
+    /*"pinsrw $2,(%[bv],%%esi,4),%%mm6\n\t"
+    "pinsrw $3,(%[bv],%%edi,4),%%mm6\n\t"*/
+    "movd (%[bv],%[s],4),%%mm2\n\t"
+    "pslld $16,%%mm2\n\t"
+    "por %%mm2,%%mm6\n\t"
+    "movd (%[bv],%[d],4),%%mm2\n\t"
+    "pslld $16,%%mm2\n\t"
+    "por %%mm2,%%mm0\n\t"
+    "punpcklwd %%mm0,%%mm6\n\t"
+    /*Do it again for the high part:*/
+    /*"pextrw $0,%%mm3,%%esi\n\t" 
+    "pextrw $1,%%mm3,%%edi\n\t"*/
+    "movq %%mm3,%%mm0\n\t"
+    "psrlq $16,%%mm3\n\t"
+    "movd %%mm0,%%esi\n\t"
+    "movd %%mm3,%%edi\n\t"
+    "psrlq $32,%%mm0\n\t"
+    "movsx %%si,%[s]\n\t"
+    "psrlq $32,%%mm3\n\t"
+    "movsx %%di,%[d]\n\t"
+    /*"pinsrw $0,(%[bv],%%esi,4),%%mm7\n\t"
+    "pinsrw $1,(%[bv],%%edi,4),%%mm7\n\t"
+    "pextrw $2,%%mm3,%%esi\n\t"
+    "pextrw $3,%%mm3,%%edi\n\t"*/
+    "movd (%[bv],%[s],4),%%mm7\n\t"
+    "movd %%mm0,%%esi\n\t"
+    "movd (%[bv],%[d],4),%%mm0\n\t"
+    "movd %%mm3,%%edi\n\t"
+    "movsx %%si,%[s]\n\t"
+    "movsx %%di,%[d]\n\t"
+    /*"pinsrw $2,(%[bv],%%esi,4),%%mm7\n\t"
+    "pinsrw $3, (%[bv],%%edi,4),%%mm7\n\t"*/
+    "movd (%[bv],%[s],4),%%mm2\n\t"
+    "movd (%[bv],%[d],4),%%mm3\n\t"
+    "pslld $16,%%mm2\n\t"
+    "pslld $16,%%mm3\n\t"
+    "por %%mm2,%%mm7\n\t"
+    "por %%mm3,%%mm0\n\t"
+    "punpcklwd %%mm0,%%mm7\n\t"
+    /*mm7:mm6 now contain the final values of f.*/
+    /*_pix[0...8+_ystride]+=f*/
+    "paddw %%mm6,%%mm4\n\t"
+    "paddw %%mm7,%%mm5\n\t"
+    /*Re-expand _pix[0...8+_ystride*2], since we didn't have enough registers
+       to keep the whole thing around.*/
+    "pxor %%mm0,%%mm0\n\t"
+    "movq %%mm1,%%mm2\n\t"
+    "punpcklbw %%mm0,%%mm1\n\t"
+    "punpckhbw %%mm0,%%mm2\n\t"
+    /*_pix[0...8+_ystride*2]-=f*/
+    "psubw %%mm6,%%mm1\n\t"
+    "psubw %%mm7,%%mm2\n\t"
+    /*Pack it back into 8 bits and write it back out.*/
+    "packuswb %%mm2,%%mm1\n\t"
+    "packuswb %%mm5,%%mm4\n\t"
+    "movq %%mm1,(%[pix],%[ystride],2)\n\t"
+    "movq %%mm4,(%[pix],%[ystride])\n\t"
+    :[s]"=&S"(esi),[d]"=&D"(edi)
+    :[pix]"r"(_pix),[ystride]"r"((long)_ystride),[bv]"r"(_bv),
+     [OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
+    :"memory"
+  );
+}
 
-    " pextrw $0,%%mm2,%%esi\n"  /* In MM4:MM0 we have f coefs (16bits) */
-    " pextrw $1,%%mm2,%%edi\n"  /* now perform MM7:MM6 = *(_bv+ f) */
-    " pinsrw $0,(%2,%%esi,4),%%mm6\n"
-    " pinsrw $1,(%2,%%edi,4),%%mm6\n"
-
-    " pextrw $2,%%mm2,%%esi\n"
-    " pextrw $3,%%mm2,%%edi\n"
-    " pinsrw $2,(%2,%%esi,4),%%mm6\n"
-    " pinsrw $3,(%2,%%edi,4),%%mm6\n"
-
-    " pextrw $0,%%mm3,%%esi\n" 
-    " pextrw $1,%%mm3,%%edi\n"
-    " pinsrw $0,(%2,%%esi,4),%%mm7\n"
-    " pinsrw $1,(%2,%%edi,4),%%mm7\n"
-
-    " pextrw $2,%%mm3,%%esi\n"
-    " pextrw $3,%%mm3,%%edi\n"
-    " pinsrw $2,(%2,%%esi,4),%%mm7\n"
-    " pinsrw $3, (%2,%%edi,4),%%mm7\n"   //MM7 MM6   f=*(_bv+(f+4>>3));
-
-    "paddw %%mm6,%%mm4\n"  /* (_pix[_ystride]+f); */
-    "paddw %%mm7,%%mm5\n"  /* (_pix[_ystride]+f); */
-    "movq %%mm1,%%mm2\n"
-    "punpcklbw %%mm0,%%mm1\n"
-    "punpckhbw %%mm0,%%mm2\n" //[ystride*2]
-    "psubw %%mm6,%%mm1\n" /* (_pix[_ystride*2]-f); */
-    "psubw %%mm7,%%mm2\n" /* (_pix[_ystride*2]-f); */
-    "packuswb %%mm2,%%mm1\n"
-    "packuswb %%mm5,%%mm4\n"
-    "movq %%mm1,(%0,%1,2)\n" /* _pix[_ystride*2]= */
-    "movq %%mm4,(%0,%1)\n" /* _pix[_ystride]= */
-    "emms\n"
-    : 
-    : "r" (_pix), "r" (_ystride), "r" (_bv)
-    : "esi", "edi" , "memory"
+/*This code implements the bulk of loop_filter_h(), which calls it twice: it
+   filters _pix[1] and _pix[2] in 4 rows, _ystride apart, via the table _bv.
+  Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
+   four p0's to one register we must transpose the values in four mmx regs.
+  TODO: some instruction stalls can be avoided.*/
+static void loop_filter_h4(unsigned char *_pix,long _ystride,const int *_bv){
+  long esi;
+  long edi;
+  __asm__ __volatile__(
+    /*esi=_ystride*3*/
+    "lea (%[ystride],%[ystride],2),%[s]\n\t"
+    /*x x x x 3 2 1 0*/
+    "movd (%[pix]),%%mm0\n\t"
+    /*x x x x 7 6 5 4*/
+    "movd (%[pix],%[ystride]),%%mm1\n\t"
+    /*x x x x B A 9 8*/
+    "movd (%[pix],%[ystride],2),%%mm2\n\t"
+    /*x x x x F E D C*/
+    "movd (%[pix],%[s]),%%mm3\n\t"
+    /*mm0=7 3 6 2 5 1 4 0*/
+    "punpcklbw %%mm1,%%mm0\n\t"
+    /*mm2=F B E A D 9 C 8*/
+    "punpcklbw %%mm3,%%mm2\n\t"
+    /*mm1=7 3 6 2 5 1 4 0*/
+    "movq %%mm0,%%mm1\n\t"
+    /*mm1=D 9 5 1 C 8 4 0*/
+    "punpcklwd %%mm2,%%mm1\n\t"
+    /*mm0=F B 7 3 E A 6 2*/
+    "punpckhwd %%mm2,%%mm0\n\t"
+    "pxor %%mm7,%%mm7\n\t"
+    /*mm5=D 9 5 1 C 8 4 0*/
+    "movq %%mm1,%%mm5\n\t"
+    /*mm5=x D x 9 x 5 x 1==pix[1]*/
+    "punpckhbw %%mm7,%%mm5\n\t"
+    /*mm1=x C x 8 x 4 x 0==pix[0]*/
+    "punpcklbw %%mm7,%%mm1\n\t"
+    /*mm3=F B 7 3 E A 6 2*/
+    "movq %%mm0,%%mm3\n\t"
+    /*mm3=x F x B x 7 x 3==pix[3]*/
+    "punpckhbw %%mm7,%%mm3\n\t"
+    /*mm0=x E x A x 6 x 2==pix[2]*/
+    "punpcklbw %%mm7,%%mm0\n\t"
+    /*mm1=mm1-mm3==pix[0]-pix[3]*/
+    "psubw %%mm3,%%mm1\n\t"
+    /*Save a copy of pix[2] for later.*/
+    "movq %%mm0,%%mm4\n\t"
+    /*mm0=mm0-mm5==pix[2]-pix[1]*/
+    "psubw %%mm5,%%mm0\n\t"
+    /*Scale by 3.*/
+    "pmullw %[OC_V3],%%mm0\n\t"
+    /*f=mm1==_pix[0]-_pix[3]+ 3*(_pix[2]-_pix[1])*/
+    "paddw %%mm0,%%mm1\n\t"
+    /*Add 4.*/
+    "paddw %[OC_V4],%%mm1\n\t"
+    /*"Divide" by 8.*/
+    "psraw $3,%%mm1\n\t"
+    /*Now perform mm0=_bv[(f+4>>3)]*/
+    /*pextrw requires MMX+/SSE.
+    "pextrw $0,%%mm1,%%esi\n\t"
+    "pextrw $1,%%mm1,%%edi\n\t"*/
+    "movd %%mm1,%%esi\n\t"
+    "psrlq $16,%%mm1\n\t"
+    "movd %%mm1,%%edi\n\t"
+    "movsx %%si,%[s]\n\t"
+    "psrlq $16,%%mm1\n\t"
+    "movsx %%di,%[d]\n\t"
+    /*pinsrw requires MMX+/SSE.
+    "pinsrw $0,(%[bv],%%esi,4),%%mm0\n\t"
+    "pextrw $2,%%mm1,%%esi\n\t"
+    "pinsrw $1,(%[bv],%%edi,4),%%mm0\n\t"
+    "pextrw $3,%%mm1,%%edi\n\t"*/
+    "movd (%[bv],%[s],4),%%mm0\n\t"
+    "movd %%mm1,%%esi\n\t"
+    "movd (%[bv],%[d],4),%%mm2\n\t"
+    "psrlq $16,%%mm1\n\t"
+    "movsx %%si,%[s]\n\t"
+    "movd %%mm1,%%edi\n\t"
+    /*"pinsrw $2,(%[bv],%%esi,4),%%mm0\n\t"
+    "pinsrw $3,(%[bv],%%edi,4),%%mm0\n\t"*/
+    "movd (%[bv],%[s],4),%%mm3\n\t"
+    "movsx %%di,%[d]\n\t"
+    "movd (%[bv],%[d],4),%%mm6\n\t"
+    /*Keep only the low word of each 32-bit entry: OR-ing shifted dwords would
+       let the sign bits of a negative entry spill into its neighbor.*/
+    "punpcklwd %%mm2,%%mm0\n\t"
+    "punpcklwd %%mm6,%%mm3\n\t"
+    "punpckldq %%mm3,%%mm0\n\t"
+    /*_pix[1]+=f;*/
+    "paddw %%mm0,%%mm5\n\t"
+    /*_pix[2]-=f;*/
+    "psubw %%mm0,%%mm4\n\t"
+    /*mm5=x x x x D 9 5 1*/
+    "packuswb %%mm7,%%mm5\n\t"
+    /*mm4=x x x x E A 6 2*/
+    "packuswb %%mm7,%%mm4\n\t"
+    /*mm5=E D A 9 6 5 2 1*/
+    "punpcklbw %%mm4,%%mm5\n\t"
+    /*esi=6 5 2 1*/
+    "movd %%mm5,%%esi\n\t"
+    "movw %%si,1(%[pix])\n\t"
+    /*Why is there such a big stall here?*/
+    "psrlq $32,%%mm5\n\t"
+    "shrl $16,%%esi\n\t"
+    "movw %%si,1(%[pix],%[ystride])\n\t"
+    /*esi=E D A 9: A 9 goes to row 2, E D to row 3.*/
+    "movd %%mm5,%%esi\n\t"
+    "lea (%[ystride],%[ystride],2),%[d]\n\t"
+    "movw %%si,1(%[pix],%[ystride],2)\n\t"
+    "shrl $16,%%esi\n\t"
+    "movw %%si,1(%[pix],%[d])\n\t"
+    :[s]"=&S"(esi),[d]"=&D"(edi),
+     [pix]"+r"(_pix),[ystride]"+r"(_ystride),[bv]"+r"(_bv)
+    :[OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
+    :"memory"
   );
-
 }
 
-
-
-#define OC_LOOP_H_4x4 \
-__asm__ __volatile__( \
-"lea (%1,%1,2),%%esi\n"	 /* esi = _ystride*3 */  \
-"movd (%0), %%mm0\n"		/* 0 0 0 0 3 2 1 0 */ \
-"movd (%0,%1),%%mm1\n"    	/* 0 0 0 0 7 6 5 4 */ \
-"movd (%0,%1,2),%%mm2\n"  	/* 0 0 0 0 b a 9 8 */ \
-"movd (%0,%%esi),%%mm3\n" 	/* 0 0 0 0 f e d c */ \
-"punpcklbw %%mm1,%%mm0\n" 	/* mm0 = 7 3 6 2 5 1 4 0 */ \
-"punpcklbw %%mm3,%%mm2\n" 	/* mm2 = f b e a d 9 c 8 */ \
-"movq %%mm0,%%mm1\n"	 	/* mm1 = 7 3 6 2 5 1 4 0 */ \
-"punpcklwd %%mm2,%%mm1\n"	/* mm1 = d 9 5 1 c 8 4 0 */ \
-"punpckhwd %%mm2,%%mm0\n"	/* mm0 = f b 7 3 e a 6 2 */ \
-"pxor %%mm7,%%mm7\n" \
-"movq %%mm1,%%mm5\n" 		/* mm5 = d 9 5 1 c 8 4 0 */ \
-"punpckhbw %%mm7,%%mm5\n"	/* mm5 = 0 d 0 9 0 5 0 1 = pix[1]*/ \
-"punpcklbw %%mm7,%%mm1\n"	/* mm1 = 0 c 0 8 0 4 0 0 = pix[0]*/ \
-"movq %%mm0,%%mm3\n" 		/* mm3 = f b 7 3 e a 6 2 */ \
-"punpckhbw %%mm7,%%mm3\n"	/* mm3 = 0 f 0 b 0 7 0 3 = pix[3]*/ \
-"punpcklbw %%mm7,%%mm0\n"    	/* mm0 = 0 e 0 a 0 6 0 2 = pix[2]*/ \
- \
-"psubw %%mm3,%%mm1\n"		/* mm1 = pix[0]-pix[3] mm1 - mm3 */ \
-"movq %%mm0,%%mm7\n"		/* mm7 = pix[2]*/ \
-"psubw %%mm5,%%mm0\n" 		/* mm0 = pix[2]-pix[1] mm0 - mm5*/ \
-"PMULLW "MANGLE(V3)",%%mm0\n" 		/* *3 */ \
-"paddw %%mm0,%%mm1\n" 		/* mm1 has f[0] ... f[4]*/ \
-"paddw "MANGLE(V4)",%%mm1\n"  /* add 4 */ \
-"psraw $3,%%mm1\n"  	/* >>3 */ \
-"paddw "MANGLE(V100)",%%mm1\n"  /* add 256 */ \
-" pextrw $0,%%mm1,%%esi\n"  /* In MM1 we have 4 f coefs (16bits) */ \
-" pextrw $1,%%mm1,%%edi\n"  /* now perform MM4 = *(_bv+ f) */ \
-" pinsrw $0,(%2,%%esi,4),%%mm4\n" \
-" pextrw $2,%%mm1,%%esi\n" \
-" pinsrw $1,(%2,%%edi,4),%%mm4\n" \
-" pextrw $3,%%mm1,%%edi\n" \
-" pinsrw $2,(%2,%%esi,4),%%mm4\n" \
-" pinsrw $3,(%2,%%edi,4),%%mm4\n" /* new f vals loaded */ \
-"pxor %%mm0,%%mm0\n" \
-" paddw %%mm4,%%mm5\n"	/*(_pix[1]+f);*/ \
-" psubw %%mm4,%%mm7\n" /* (_pix[2]-f); */ \
-" packuswb %%mm0,%%mm5\n" /* mm5 = x x x x newpix1 */ \
-" packuswb %%mm0,%%mm7\n" /* mm7 = x x x x newpix2 */  \
-" punpcklbw %%mm7,%%mm5\n" /* 2 1 2 1 2 1 2 1 */ \
-" movd %%mm5,%%eax\n" /* eax = newpix21 */ \
-" movw %%ax,1(%0)\n" \
-" psrlq $32,%%mm5\n" /* why is so big stall here ? */ \
-" shrl $16,%%eax\n" \
-" lea 1(%0,%1,2),%%edi\n" \
-" movw %%ax,1(%0,%1,1)\n" \
-" movd %%mm5,%%eax\n"  /* eax = newpix21 high part */ \
-" lea (%1,%1,2),%%esi\n" \
-" movw %%ax,(%%edi)\n" \
-" shrl $16,%%eax\n" \
-" movw %%ax,1(%0,%%esi)\n" \
-" emms\n" \
-: \
-: "r" (_pix), "r" (_ystride), "r" (_bv) \
-: "esi", "edi" , "memory", "eax" \
-); \
-
-/* this code implements loop_filter_h
-   data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ...
-   in order to load all (four) p0's to one register we must transpose
-   the values in four mmx regs. When halfs is done we repeat for rest.
-  
-TODO: some instruction stalls can be avoided
-
-*/
-
-static void loop_filter_h_mmx(unsigned char *_pix,int _ystride,int *_bv){
+static void loop_filter_h(unsigned char *_pix,int _ystride,int *_bv){
   _pix-=2;
-  OC_LOOP_H_4x4
+  loop_filter_h4(_pix,_ystride,_bv);/*Rows 0...3.*/
   _pix+=_ystride*4;
-  OC_LOOP_H_4x4
+  loop_filter_h4(_pix,_ystride,_bv);/*Rows 4...7.*/
 }
 
+/*We copy the whole function because the MMX routines will be inlined 4 times,
+   and we do just a single emms call at the end.
+  Originally the _bv pointer was also left un-offset by 256 to get rid of a
+   sign extension instruction, but it turns out the sign extension is still
+   needed on x86-64 to avoid a partial register stall, and is needed even on
+   x86-32 once we eliminate the MMX+/SSE-specific pextrw/pinsrw instructions.*/
+
 /*Apply the loop filter to a given set of fragment rows in the given plane.
   The filter may be run on the bottom edge, affecting pixels in the next row of
    fragments, so this row also needs to be available.
@@ -443,12 +548,6 @@
   _pli:       The color plane to filter.
   _fragy0:    The Y coordinate of the first fragment row to filter.
   _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
-  
-/*  we copy whole function because mmx routines will be inlined 4 times 
-    also _bv pointer should not be added with 256 because then we can use
-    non negative index in MMX code and we get rid of sign extension instructions
-*/
-
 void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,
  int _refi,int _pli,int _fragy0,int _fragy_end){
   th_img_plane  *iplane;
@@ -459,6 +558,7 @@
   oc_fragment       *frag_end;
   oc_fragment       *frag0_end;
   oc_fragment       *frag_bot;
+  _bv+=256;
   iplane=_state->ref_frame_bufs[_refi]+_pli;
   fplane=_state->fplanes+_pli;
   /*The following loops are constructed somewhat non-intuitively on purpose.
@@ -476,16 +576,16 @@
     while(frag<frag_end){
       if(frag->coded){
         if(frag>frag0){
-          loop_filter_h_mmx(frag->buffer[_refi],iplane->ystride,_bv);
+          loop_filter_h(frag->buffer[_refi],iplane->ystride,_bv);
         }
         if(frag0>frag_top){
-          loop_filter_v_mmx(frag->buffer[_refi],iplane->ystride,_bv);
+          loop_filter_v(frag->buffer[_refi],iplane->ystride,_bv);
         }
         if(frag+1<frag_end&&!(frag+1)->coded){
-          loop_filter_h_mmx(frag->buffer[_refi]+8,iplane->ystride,_bv);
+          loop_filter_h(frag->buffer[_refi]+8,iplane->ystride,_bv);
         }
         if(frag+fplane->nhfrags<frag_bot&&!(frag+fplane->nhfrags)->coded){
-          loop_filter_v_mmx((frag+fplane->nhfrags)->buffer[_refi],
+          loop_filter_v((frag+fplane->nhfrags)->buffer[_refi],
            iplane->ystride,_bv);
         }
       }
@@ -493,6 +593,8 @@
     }
     frag0+=fplane->nhfrags;
   }
+  /*This needs to be removed when decode specific functions are implemented:*/
+  __asm__ __volatile__("emms\n\t");
 }
 
 #endif

Modified: trunk/theora/lib/dec/x86/x86state.c
===================================================================
--- trunk/theora/lib/dec/x86/x86state.c	2007-04-16 00:05:14 UTC (rev 12872)
+++ trunk/theora/lib/dec/x86/x86state.c	2007-04-16 01:32:17 UTC (rev 12873)
@@ -29,14 +29,9 @@
     _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
     _state->opt_vtable.state_frag_copy=oc_state_frag_copy_mmx;
     _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     oc_state_loop_filter_frag_rows_mmx;
     _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
-    
-    /* loop filter code uses some MMXext instructions which are not present on 
-       early processors (Pentium II and AMD K6) */
-    if(_state->cpu_flags&OC_CPU_X86_MMXEXT){
-      _state->opt_vtable.oc_state_loop_filter_frag_rows=
-                     oc_state_loop_filter_frag_rows_mmx;
-    }  
   }
   else oc_state_vtable_init_c(_state);
 }

Modified: trunk/theora/lib/internal.h
===================================================================
--- trunk/theora/lib/internal.h	2007-04-16 00:05:14 UTC (rev 12872)
+++ trunk/theora/lib/internal.h	2007-04-16 01:32:17 UTC (rev 12873)
@@ -260,8 +260,8 @@
    int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
    ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
   void (*restore_fpu)(void);
-  void (*oc_state_loop_filter_frag_rows)(oc_theora_state *_state,int *_bv,
-    int _refi,int _pli,int _fragy0,int _fragy_end);  
+  void (*state_loop_filter_frag_rows)(oc_theora_state *_state,int *_bv,
+   int _refi,int _pli,int _fragy0,int _fragy_end);  
 }oc_base_opt_vtable;
 
 
@@ -429,6 +429,8 @@
 void oc_state_frag_recon(oc_theora_state *_state,const oc_fragment *_frag,
  int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
+void oc_state_loop_filter_frag_rows(oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end);
 void oc_restore_fpu(const oc_theora_state *_state);
 
 /*Default pure-C implementations.*/