[xiph-commits] r15056 - in branches/theora-thusnelda/lib/enc: . x86_32 x86_64

Sun Jun 22 12:50:33 PDT 2008

Author: xiphmont
Date: 2008-06-22 12:50:32 -0700 (Sun, 22 Jun 2008)
New Revision: 15056

Modified:
   branches/theora-thusnelda/lib/enc/dct_decode.c
   branches/theora-thusnelda/lib/enc/dsp.h
   branches/theora-thusnelda/lib/enc/x86_32/dct_decode_mmx.c
   branches/theora-thusnelda/lib/enc/x86_64/dct_decode_mmx.c
Log:
Staging a fix for mainline Theora: refactor code slightly to use the MMX 
loop filter routines from decode. This eliminates a problem with the 
MMX loop filter code using SSE instructions. 



Modified: branches/theora-thusnelda/lib/enc/dct_decode.c
===================================================================

--- branches/theora-thusnelda/lib/enc/dct_decode.c	2008-06-22 13:24:13 UTC (rev 15055)
+++ branches/theora-thusnelda/lib/enc/dct_decode.c	2008-06-22 19:50:32 UTC (rev 15056)
@@ -132,11 +132,12 @@
   UpdateUMV_HBorders( cpi, DestReconPtr, 2);
 }
 
-static void FilterHoriz__c(unsigned char * PixelPtr,
-			   ogg_int32_t LineLength,
-			   ogg_int16_t *BoundingValuePtr){
+static void loop_filter_h(unsigned char * PixelPtr,
+			  ogg_int32_t LineLength,
+			  ogg_int16_t *BoundingValuePtr){
   ogg_int32_t j;
   ogg_int32_t FiltVal;
+  PixelPtr-=2;
 
   for ( j = 0; j < 8; j++ ){
     FiltVal =
@@ -154,15 +155,12 @@
   }
 }
 
-static void FilterVert__c(unsigned char * PixelPtr,
-                ogg_int32_t LineLength,
-                ogg_int16_t *BoundingValuePtr){
+static void loop_filter_v(unsigned char * PixelPtr,
+			  ogg_int32_t LineLength,
+			  ogg_int16_t *BoundingValuePtr){
   ogg_int32_t j;
   ogg_int32_t FiltVal;
   PixelPtr -= 2*LineLength;
-  /* the math was correct, but negative array indicies are forbidden
-     by ANSI/C99 and will break optimization on several modern
-     compilers */
 
   for ( j = 0; j < 8; j++ ) {
     FiltVal = ( (ogg_int32_t)PixelPtr[0] ) -
@@ -179,215 +177,41 @@
   }
 }
 
-static void LoopFilter(CP_INSTANCE *cpi){
+static void LoopFilter__c(CP_INSTANCE *cpi, int FLimit){
 
+  int j;
   ogg_int16_t BoundingValues[256];
-  ogg_int16_t *BoundingValuePtr = BoundingValues+127;
-  ogg_int32_t FLimit = cpi->quant_info.loop_filter_limits[cpi->BaseQ]; // temp
-  int j,m,n;
-  unsigned char *cp;
-  ogg_uint32_t *bp;
-  int offset = 0;
+  ogg_int16_t *bvp = BoundingValues+127;
+  unsigned char *cp = cpi->frag_coded;
+  ogg_uint32_t *bp = cpi->frag_buffer_index;
 
   if ( FLimit == 0 ) return;
   SetupBoundingValueArray_Generic(BoundingValues, FLimit);
 
   for ( j = 0; j < 3 ; j++){
-    ogg_int32_t LineFragments = cpi->frag_h[j];
-    ogg_int32_t LineLength = cpi->stride[j];
-    cp = cpi->frag_coded+offset;
-    bp = cpi->frag_buffer_index+offset;
-    offset += cpi->frag_n[j];
+    ogg_uint32_t *bp_begin = bp;
+    ogg_uint32_t *bp_end = bp + cpi->frag_n[j];
+    int stride = cpi->stride[j];
+    int h = cpi->frag_h[j];
 
-    /**************************************************************
-     First Row
-    **************************************************************/
-    /* first column conditions */
-    /* only do 2 prediction if fragment coded and on non intra or if
-       all fragments are intra */
-    if(cp[0]){
-      /* Filter right hand border only if the block to the right is
-         not coded */
-      if (!cp[1])
-        dsp_FilterHoriz(cpi->dsp,cpi->lastrecon + bp[0] + 6,
-			LineLength,BoundingValuePtr);
-
-      /* Bottom done if next row set */
-      if(!cp[LineFragments])
-        dsp_FilterVert(cpi->dsp,cpi->lastrecon + bp[LineFragments],
-		       LineLength, BoundingValuePtr);
-    }
-    bp++;
-    cp++;
-
-    /***************************************************************/
-    /* middle columns  */
-    for ( n = 1 ; n < cpi->frag_h[j] - 1 ; n++, bp++, cp++) {
-      if(cp[0]){
-	
-        /* Filter Left edge always */
-        dsp_FilterHoriz(cpi->dsp,cpi->lastrecon + bp[0] - 2,
-			LineLength, BoundingValuePtr);
-	
-        /* Filter right hand border only if the block to the right is
-           not coded */
-        if (!cp[1])
-          dsp_FilterHoriz(cpi->dsp,cpi->lastrecon + bp[0] + 6,
-			  LineLength, BoundingValuePtr);
-	
-        /* Bottom done if next row set */
-        if(!cp[LineFragments])
-          dsp_FilterVert(cpi->dsp,cpi->lastrecon + bp[LineFragments],
-			 LineLength, BoundingValuePtr);
-	
+    while(bp<bp_end){ /* one pass per fragment row of plane j */
+      ogg_uint32_t *bp_left = bp; /* first fragment of this row */
+      ogg_uint32_t *bp_right = bp + h; /* one past the last fragment of this row */
+      while(bp<bp_right){
+	if(cp[0]){ /* only coded fragments drive filtering */
+	  if(bp>bp_left) /* left edge, unless in the first column */
+	    loop_filter_h(&cpi->lastrecon[bp[0]],stride,bvp);
+	  if(bp_left>bp_begin) /* top edge, unless in the first row */
+	    loop_filter_v(&cpi->lastrecon[bp[0]],stride,bvp);
+	  if(bp+1<bp_right && !cp[1]) /* right edge, only if the right neighbour is uncoded */
+	    loop_filter_h(&cpi->lastrecon[bp[0]]+8,stride,bvp);
+	  if(bp+h<bp_end && !cp[h]) /* bottom edge: the fragment below is h entries on (h = frags per row), not stride */
+	    loop_filter_v(&cpi->lastrecon[bp[h]],stride,bvp); /* bp[h] is the lower fragment's own offset; no +8 */
+	}
+	bp++;
+	cp++;
       }
     }
-    
-    /***************************************************************/
-    /* Last Column */
-    if(cp[0]){
-      /* Filter Left edge always */
-      dsp_FilterHoriz(cpi->dsp,cpi->lastrecon + bp[0] - 2,
-		      LineLength, BoundingValuePtr);
-      
-      /* Bottom done if next row set */
-      if(!cp[LineFragments])
-        dsp_FilterVert(cpi->dsp,cpi->lastrecon + bp[LineFragments],
-		       LineLength, BoundingValuePtr);
-      
-    }
-    bp++;
-    cp++;
-
-    /***************************************************************/
-    /* Middle Rows */
-    /***************************************************************/
-    for ( m = 1 ; m < cpi->frag_v[j]-1 ; m++) {
-      
-      /*****************************************************************/
-      /* first column conditions */
-      /* only do 2 prediction if fragment coded and on non intra or if
-         all fragments are intra */
-      if(cp[0]){
-        /* TopRow is always done */
-        dsp_FilterVert(cpi->dsp,cpi->lastrecon + bp[0],
-		       LineLength, BoundingValuePtr);
-	
-        /* Filter right hand border only if the block to the right is
-           not coded */
-        if (!cp[1])
-          dsp_FilterHoriz(cpi->dsp,cpi->lastrecon + bp[0] + 6,
-			  LineLength, BoundingValuePtr);
-	
-        /* Bottom done if next row set */
-        if(!cp[LineFragments])
-          dsp_FilterVert(cpi->dsp,cpi->lastrecon + bp[LineFragments],
-			 LineLength, BoundingValuePtr);
-        
-      }
-      bp++;
-      cp++;
-
-      /*****************************************************************/
-      /* middle columns  */
-      for ( n = 1 ; n < cpi->frag_h[j] - 1 ; n++, bp++, cp++){
-        if(cp[0]){
-          /* Filter Left edge always */
-          dsp_FilterHoriz(cpi->dsp,cpi->lastrecon + bp[0] - 2,
-			  LineLength, BoundingValuePtr);
-	  
-          /* TopRow is always done */
-          dsp_FilterVert(cpi->dsp,cpi->lastrecon + bp[0],
-			 LineLength, BoundingValuePtr);
-	  
-          /* Filter right hand border only if the block to the right
-             is not coded */
-          if (!cp[1])
-            dsp_FilterHoriz(cpi->dsp,cpi->lastrecon + bp[0] + 6,
-			    LineLength, BoundingValuePtr);
-
-          /* Bottom done if next row set */
-          if(!cp[LineFragments])
-            dsp_FilterVert(cpi->dsp,cpi->lastrecon + bp[LineFragments],
-			   LineLength, BoundingValuePtr);
-        }
-      }
-
-      /******************************************************************/
-      /* Last Column */
-      if(cp[0]){
-        /* Filter Left edge always*/
-        dsp_FilterHoriz(cpi->dsp,cpi->lastrecon + bp[0] - 2,
-			LineLength, BoundingValuePtr);
-	
-        /* TopRow is always done */
-        dsp_FilterVert(cpi->dsp,cpi->lastrecon + bp[0],		       
-		       LineLength, BoundingValuePtr);
-	
-        /* Bottom done if next row set */
-        if(!cp[LineFragments])
-          dsp_FilterVert(cpi->dsp,cpi->lastrecon + bp[LineFragments],
-			 LineLength, BoundingValuePtr);
-      }
-      bp++;
-      cp++;
-    }
-
-    /*******************************************************************/
-    /* Last Row  */
-
-    /* first column conditions */
-    /* only do 2 prediction if fragment coded and on non intra or if
-       all fragments are intra */
-    if(cp[0]){
-
-      /* TopRow is always done */
-      dsp_FilterVert(cpi->dsp,cpi->lastrecon + bp[0],
-		     LineLength, BoundingValuePtr);
-      
-      /* Filter right hand border only if the block to the right is
-         not coded */
-      if (!cp[1])
-        dsp_FilterHoriz(cpi->dsp,cpi->lastrecon + bp[0] + 6,
-			LineLength, BoundingValuePtr);
-    }
-    bp++;
-    cp++;
-
-    /******************************************************************/
-    /* middle columns  */
-    for ( n = 1 ; n < cpi->frag_h[j] - 1 ; n++, bp++, cp++){
-      if(cp[0]){
-        /* Filter Left edge always */
-        dsp_FilterHoriz(cpi->dsp,cpi->lastrecon + bp[0] - 2,
-			LineLength, BoundingValuePtr);
-	
-        /* TopRow is always done */
-        dsp_FilterVert(cpi->dsp,cpi->lastrecon + bp[0],
-		       LineLength, BoundingValuePtr);
-
-        /* Filter right hand border only if the block to the right is
-           not coded */
-        if (!cp[1])
-          dsp_FilterHoriz(cpi->dsp,cpi->lastrecon + bp[0] + 6,
-			  LineLength, BoundingValuePtr);
-      }
-    }
-
-    /******************************************************************/
-    /* Last Column */
-    if(cp[0]){
-      /* Filter Left edge always */
-      dsp_FilterHoriz(cpi->dsp,cpi->lastrecon + bp[0] - 2,
-		      LineLength, BoundingValuePtr);
-      
-      /* TopRow is always done */
-      dsp_FilterVert(cpi->dsp,cpi->lastrecon + bp[0],
-		     LineLength, BoundingValuePtr);
-      
-    }
-    bp++;
-    cp++;
   }
 }
 
@@ -399,7 +223,7 @@
   cpi->recon=temp;
 
   /* Apply a loop filter to edge pixels of updated blocks */
-  LoopFilter(cpi);
+  dsp_LoopFilter(cpi->dsp, cpi, cpi->quant_info.loop_filter_limits[cpi->BaseQ] /* temp */);
 
   /* We may need to update the UMV border */
   UpdateUMVBorder(cpi, cpi->lastrecon);
@@ -411,8 +235,7 @@
 
 void dsp_dct_decode_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
 {
-  funcs->FilterVert = FilterVert__c;
-  funcs->FilterHoriz = FilterHoriz__c;
+  funcs->LoopFilter = LoopFilter__c;
 #if defined(USE_ASM)
   if (cpu_flags & OC_CPU_X86_MMX) {
     dsp_mmx_dct_decode_init(funcs);

Modified: branches/theora-thusnelda/lib/enc/dsp.h
===================================================================
--- branches/theora-thusnelda/lib/enc/dsp.h	2008-06-22 13:24:13 UTC (rev 15055)
+++ branches/theora-thusnelda/lib/enc/dsp.h	2008-06-22 19:50:32 UTC (rev 15056)
@@ -73,8 +73,7 @@
   ogg_uint32_t (*inter8x8_err_xy2)(unsigned char *SrcData, unsigned char *RefDataPtr1,
 				   unsigned char *RefDataPtr2, ogg_uint32_t stride);
                
-  void (*FilterHoriz)             (unsigned char * PixelPtr,
-				   ogg_int32_t LineLength, ogg_int16_t *BoundingValuePtr);
+  void (*LoopFilter)              (void *cpi, int FLimit);
 
   void (*FilterVert)              (unsigned char * PixelPtr,
 				   ogg_int32_t LineLength, ogg_int16_t *BoundingValuePtr);
@@ -144,12 +143,9 @@
 #define dsp_inter8x8_err_xy2(funcs,ptr1,ptr2,ptr3,str) \
   (funcs.inter8x8_err_xy2 (ptr1,ptr2,ptr3,str))
 
-#define dsp_FilterHoriz(funcs, ptr1, ptr2, ptr3) \
-  (funcs.FilterHoriz(ptr1, ptr2, ptr3))
+#define dsp_LoopFilter(funcs, ptr1, i) \
+  (funcs.LoopFilter(ptr1, i))
 
-#define dsp_FilterVert(funcs, ptr1, ptr2, ptr3) \
-  (funcs.FilterVert(ptr1, ptr2, ptr3))
-
 #define dsp_IDctSlow(funcs, ptr1, ptr2, ptr3) \
     (funcs.IDctSlow(ptr1, ptr2, ptr3))
 

Modified: branches/theora-thusnelda/lib/enc/x86_32/dct_decode_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86_32/dct_decode_mmx.c	2008-06-22 13:24:13 UTC (rev 15055)
+++ branches/theora-thusnelda/lib/enc/x86_32/dct_decode_mmx.c	2008-06-22 19:50:32 UTC (rev 15056)
@@ -5,7 +5,7 @@
  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
  * by the Xiph.Org Foundation http://www.xiph.org/                  *
  *                                                                  *
  ********************************************************************
@@ -17,166 +17,381 @@
 
 #include <stdlib.h>
 
-#include "../codec_internal.h"
+#include "codec_internal.h"
 
 #if defined(USE_ASM)
 
-static const __attribute__((aligned(8),used)) ogg_int64_t V3= 0x0003000300030003LL;
-static const __attribute__((aligned(8),used)) ogg_int64_t V804= 0x0804080408040804LL;
+static const __attribute__((aligned(8),used)) ogg_int64_t OC_V3=
+ 0x0003000300030003LL;
+static const __attribute__((aligned(8),used)) ogg_int64_t OC_V4=
+ 0x0004000400040004LL;
 
-#if defined(__APPLE__) || defined(__CYGWIN__) || defined (__WIN32__)
-#define MANGLE(x) "_"#x
-#else
-#define MANGLE(x) #x
-#endif
+static void loop_filter_v(unsigned char *_pix,int _ystride,
+			  const ogg_int16_t *_ll){
+  long esi;
+  _pix-=_ystride*2;
+  __asm__ __volatile__(
+    /*mm0=0*/
+    "pxor %%mm0,%%mm0\n\t"
+    /*esi=_ystride*3*/
+    "lea (%[ystride],%[ystride],2),%[s]\n\t"
+    /*mm7=_pix[0...8]*/
+    "movq (%[pix]),%%mm7\n\t"
+    /*mm4=_pix[0...8+_ystride*3]*/
+    "movq (%[pix],%[s]),%%mm4\n\t"
+    /*mm6=_pix[0...8]*/
+    "movq %%mm7,%%mm6\n\t"
+    /*Expand unsigned _pix[0...3] to 16 bits.*/
+    "punpcklbw %%mm0,%%mm6\n\t"
+    "movq %%mm4,%%mm5\n\t"
+    /*Expand unsigned _pix[4...8] to 16 bits.*/
+    "punpckhbw %%mm0,%%mm7\n\t"
+    /*Expand other arrays too.*/
+    "punpcklbw %%mm0,%%mm4\n\t"
+    "punpckhbw %%mm0,%%mm5\n\t"
+    /*mm7:mm6=_p[0...8]-_p[0...8+_ystride*3]:*/
+    "psubw %%mm4,%%mm6\n\t"
+    "psubw %%mm5,%%mm7\n\t"
+    /*mm5=mm4=_pix[0...8+_ystride]*/
+    "movq (%[pix],%[ystride]),%%mm4\n\t"
+    /*mm1=mm3=mm2=_pix[0...8+_ystride*2]*/
+    "movq (%[pix],%[ystride],2),%%mm2\n\t"
+    "movq %%mm4,%%mm5\n\t"
+    "movq %%mm2,%%mm3\n\t"
+    "movq %%mm2,%%mm1\n\t"
+    /*Expand these arrays.*/
+    "punpckhbw %%mm0,%%mm5\n\t"
+    "punpcklbw %%mm0,%%mm4\n\t"
+    "punpckhbw %%mm0,%%mm3\n\t"
+    "punpcklbw %%mm0,%%mm2\n\t"
+    /*Preload...*/
+    "movq %[OC_V3],%%mm0\n\t"
+    /*mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
+    "psubw %%mm5,%%mm3\n\t"
+    "psubw %%mm4,%%mm2\n\t"
+    /*Scale by 3.*/
+    "pmullw %%mm0,%%mm3\n\t"
+    "pmullw %%mm0,%%mm2\n\t"
+    /*Preload...*/
+    "movq %[OC_V4],%%mm0\n\t"
+    /*f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
+       3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
+    "paddw %%mm7,%%mm3\n\t"
+    "paddw %%mm6,%%mm2\n\t"
+    /*Add 4.*/
+    "paddw %%mm0,%%mm3\n\t"
+    "paddw %%mm0,%%mm2\n\t"
+    /*"Divide" by 8.*/
+    "psraw $3,%%mm3\n\t"
+    "psraw $3,%%mm2\n\t"
+    /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the spec.*/
+    /*Free up mm5.*/
+    "packuswb %%mm5,%%mm4\n\t"
+    /*mm0=L L L L*/
+    "movq (%[ll]),%%mm0\n\t"
+    /*if(R_i<-2L||R_i>2L)R_i=0:*/
+    "movq %%mm2,%%mm5\n\t"
+    "pxor %%mm6,%%mm6\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    "psubw %%mm0,%%mm6\n\t"
+    "psllw $1,%%mm7\n\t"
+    "psllw $1,%%mm6\n\t"
+    /*mm2==R_3 R_2 R_1 R_0*/
+    /*mm5==R_3 R_2 R_1 R_0*/
+    /*mm6==-2L -2L -2L -2L*/
+    /*mm7==2L 2L 2L 2L*/
+    "pcmpgtw %%mm2,%%mm7\n\t"
+    "pcmpgtw %%mm6,%%mm5\n\t"
+    "pand %%mm7,%%mm2\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    "pand %%mm5,%%mm2\n\t"
+    "psllw $1,%%mm7\n\t"
+    "movq %%mm3,%%mm5\n\t"
+    /*mm3==R_7 R_6 R_5 R_4*/
+    /*mm5==R_7 R_6 R_5 R_4*/
+    /*mm6==-2L -2L -2L -2L*/
+    /*mm7==2L 2L 2L 2L*/
+    "pcmpgtw %%mm3,%%mm7\n\t"
+    "pcmpgtw %%mm6,%%mm5\n\t"
+    "pand %%mm7,%%mm3\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    "pand %%mm5,%%mm3\n\t"
+    /*if(R_i<-L)R_i'=R_i+2L;
+      if(R_i>L)R_i'=R_i-2L;
+      if(R_i<-L||R_i>L)R_i=-R_i':*/
+    "psraw $1,%%mm6\n\t"
+    "movq %%mm2,%%mm5\n\t"
+    "psllw $1,%%mm7\n\t"
+    /*mm2==R_3 R_2 R_1 R_0*/
+    /*mm5==R_3 R_2 R_1 R_0*/
+    /*mm6==-L -L -L -L*/
+    /*mm0==L L L L*/
+    /*mm5=R_i>L?FF:00*/
+    "pcmpgtw %%mm0,%%mm5\n\t"
+    /*mm6=-L>R_i?FF:00*/
+    "pcmpgtw %%mm2,%%mm6\n\t"
+    /*mm7=R_i>L?2L:0*/
+    "pand %%mm5,%%mm7\n\t"
+    /*mm2=R_i>L?R_i-2L:R_i*/
+    "psubw %%mm7,%%mm2\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    /*mm5=-L>R_i||R_i>L*/
+    "por %%mm6,%%mm5\n\t"
+    "psllw $1,%%mm7\n\t"
+    /*mm7=-L>R_i?2L:0*/
+    "pand %%mm6,%%mm7\n\t"
+    "pxor %%mm6,%%mm6\n\t"
+    /*mm2=-L>R_i?R_i+2L:R_i*/
+    "paddw %%mm7,%%mm2\n\t"
+    "psubw %%mm0,%%mm6\n\t"
+    /*mm5=-L>R_i||R_i>L?R_i':0*/
+    "pand %%mm2,%%mm5\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    /*mm2=-L>R_i||R_i>L?0:R_i*/
+    "psubw %%mm5,%%mm2\n\t"
+    "psllw $1,%%mm7\n\t"
+    /*mm2=-L>R_i||R_i>L?-R_i':R_i*/
+    "psubw %%mm5,%%mm2\n\t"
+    "movq %%mm3,%%mm5\n\t"
+    /*mm3==R_7 R_6 R_5 R_4*/
+    /*mm5==R_7 R_6 R_5 R_4*/
+    /*mm6==-L -L -L -L*/
+    /*mm0==L L L L*/
+    /*mm6=-L>R_i?FF:00*/
+    "pcmpgtw %%mm3,%%mm6\n\t"
+    /*mm5=R_i>L?FF:00*/
+    "pcmpgtw %%mm0,%%mm5\n\t"
+    /*mm7=R_i>L?2L:0*/
+    "pand %%mm5,%%mm7\n\t"
+    /*mm3=R_i>L?R_i-2L:R_i*/
+    "psubw %%mm7,%%mm3\n\t"
+    "psllw $1,%%mm0\n\t"
+    /*mm5=-L>R_i||R_i>L*/
+    "por %%mm6,%%mm5\n\t"
+    /*mm0=-L>R_i?2L:0*/
+    "pand %%mm6,%%mm0\n\t"
+    /*mm3=-L>R_i?R_i+2L:R_i*/
+    "paddw %%mm0,%%mm3\n\t"
+    /*mm5=-L>R_i||R_i>L?R_i':0*/
+    "pand %%mm3,%%mm5\n\t"
+    /*mm3=-L>R_i||R_i>L?0:R_i*/
+    "psubw %%mm5,%%mm3\n\t"
+    /*mm3=-L>R_i||R_i>L?-R_i':R_i*/
+    "psubw %%mm5,%%mm3\n\t"
+    /*Unfortunately, there's no unsigned byte+signed byte with unsigned
+       saturation op code, so we have to promote things back 16 bits.*/
+    "pxor %%mm0,%%mm0\n\t"
+    "movq %%mm4,%%mm5\n\t"
+    "punpcklbw %%mm0,%%mm4\n\t"
+    "punpckhbw %%mm0,%%mm5\n\t"
+    "movq %%mm1,%%mm6\n\t"
+    "punpcklbw %%mm0,%%mm1\n\t"
+    "punpckhbw %%mm0,%%mm6\n\t"
+    /*_pix[0...8+_ystride]+=R_i*/
+    "paddw %%mm2,%%mm4\n\t"
+    "paddw %%mm3,%%mm5\n\t"
+    /*_pix[0...8+_ystride*2]-=R_i*/
+    "psubw %%mm2,%%mm1\n\t"
+    "psubw %%mm3,%%mm6\n\t"
+    "packuswb %%mm5,%%mm4\n\t"
+    "packuswb %%mm6,%%mm1\n\t"
+    /*Write it back out.*/
+    "movq %%mm4,(%[pix],%[ystride])\n\t"
+    "movq %%mm1,(%[pix],%[ystride],2)\n\t"
+    :[s]"=&S"(esi)
+    :[pix]"r"(_pix),[ystride]"r"((long)_ystride),[ll]"r"(_ll),
+     [OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
+    :"memory"
+  );
+}
 
-static void FilterHoriz__mmx(unsigned char * PixelPtr,
-                        ogg_int32_t LineLength,
-                        ogg_int16_t *BoundingValuePtr){
+/*This code implements the bulk of loop_filter_h().
+  Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
+   four p0's to one register we must transpose the values in four mmx regs.
+  When half is done we repeat this for the rest.*/
+static void loop_filter_h4(unsigned char *_pix,long _ystride,
+			   const ogg_int16_t *_ll){
+  long esi;
+  long edi;
+  __asm__ __volatile__(
+    /*x x x x 3 2 1 0*/
+    "movd (%[pix]),%%mm0\n\t"
+    /*esi=_ystride*3*/
+    "lea (%[ystride],%[ystride],2),%[s]\n\t"
+    /*x x x x 7 6 5 4*/
+    "movd (%[pix],%[ystride]),%%mm1\n\t"
+    /*x x x x B A 9 8*/
+    "movd (%[pix],%[ystride],2),%%mm2\n\t"
+    /*x x x x F E D C*/
+    "movd (%[pix],%[s]),%%mm3\n\t"
+    /*mm0=7 3 6 2 5 1 4 0*/
+    "punpcklbw %%mm1,%%mm0\n\t"
+    /*mm2=F B E A D 9 C 8*/
+    "punpcklbw %%mm3,%%mm2\n\t"
+    /*mm1=7 3 6 2 5 1 4 0*/
+    "movq %%mm0,%%mm1\n\t"
+    /*mm0=F B 7 3 E A 6 2*/
+    "punpckhwd %%mm2,%%mm0\n\t"
+    /*mm1=D 9 5 1 C 8 4 0*/
+    "punpcklwd %%mm2,%%mm1\n\t"
+    "pxor %%mm7,%%mm7\n\t"
+    /*mm5=D 9 5 1 C 8 4 0*/
+    "movq %%mm1,%%mm5\n\t"
+    /*mm1=x C x 8 x 4 x 0==pix[0]*/
+    "punpcklbw %%mm7,%%mm1\n\t"
+    /*mm5=x D x 9 x 5 x 1==pix[1]*/
+    "punpckhbw %%mm7,%%mm5\n\t"
+    /*mm3=F B 7 3 E A 6 2*/
+    "movq %%mm0,%%mm3\n\t"
+    /*mm0=x E x A x 6 x 2==pix[2]*/
+    "punpcklbw %%mm7,%%mm0\n\t"
+    /*mm3=x F x B x 7 x 3==pix[3]*/
+    "punpckhbw %%mm7,%%mm3\n\t"
+    /*mm1=mm1-mm3==pix[0]-pix[3]*/
+    "psubw %%mm3,%%mm1\n\t"
+    /*Save a copy of pix[2] for later.*/
+    "movq %%mm0,%%mm4\n\t"
+    /*mm0=mm0-mm5==pix[2]-pix[1]*/
+    "psubw %%mm5,%%mm0\n\t"
+    /*Scale by 3.*/
+    "pmullw %[OC_V3],%%mm0\n\t"
+    /*f=mm0==_pix[0]-_pix[3]+3*(_pix[2]-_pix[1])*/
+    "paddw %%mm1,%%mm0\n\t"
+    /*Add 4.*/
+    "paddw %[OC_V4],%%mm0\n\t"
+    /*"Divide" by 8, producing the residuals R_i.*/
+    "psraw $3,%%mm0\n\t"
+    /*Now compute lflim of mm0 cf. Section 7.10 of the spec.*/
+    /*mm6=L L L L*/
+    "movq (%[ll]),%%mm6\n\t"
+    /*if(R_i<-2L||R_i>2L)R_i=0:*/
+    "movq %%mm0,%%mm1\n\t"
+    "pxor %%mm2,%%mm2\n\t"
+    "movq %%mm6,%%mm3\n\t"
+    "psubw %%mm6,%%mm2\n\t"
+    "psllw $1,%%mm3\n\t"
+    "psllw $1,%%mm2\n\t"
+    /*mm0==R_3 R_2 R_1 R_0*/
+    /*mm1==R_3 R_2 R_1 R_0*/
+    /*mm2==-2L -2L -2L -2L*/
+    /*mm3==2L 2L 2L 2L*/
+    "pcmpgtw %%mm0,%%mm3\n\t"
+    "pcmpgtw %%mm2,%%mm1\n\t"
+    "pand %%mm3,%%mm0\n\t"
+    "pand %%mm1,%%mm0\n\t"
+    /*if(R_i<-L)R_i'=R_i+2L;
+      if(R_i>L)R_i'=R_i-2L;
+      if(R_i<-L||R_i>L)R_i=-R_i':*/
+    "psraw $1,%%mm2\n\t"
+    "movq %%mm0,%%mm1\n\t"
+    "movq %%mm6,%%mm3\n\t"
+    /*mm0==R_3 R_2 R_1 R_0*/
+    /*mm1==R_3 R_2 R_1 R_0*/
+    /*mm2==-L -L -L -L*/
+    /*mm6==L L L L*/
+    /*mm2=-L>R_i?FF:00*/
+    "pcmpgtw %%mm0,%%mm2\n\t"
+    /*mm1=R_i>L?FF:00*/
+    "pcmpgtw %%mm6,%%mm1\n\t"
+    /*mm3=2L 2L 2L 2L*/
+    "psllw $1,%%mm3\n\t"
+    /*mm6=2L 2L 2L 2L*/
+    "psllw $1,%%mm6\n\t"
+    /*mm3=R_i>L?2L:0*/
+    "pand %%mm1,%%mm3\n\t"
+    /*mm6=-L>R_i?2L:0*/
+    "pand %%mm2,%%mm6\n\t"
+    /*mm0=R_i>L?R_i-2L:R_i*/
+    "psubw %%mm3,%%mm0\n\t"
+    /*mm1=-L>R_i||R_i>L*/
+    "por %%mm2,%%mm1\n\t"
+    /*mm0=-L>R_i?R_i+2L:R_i*/
+    "paddw %%mm6,%%mm0\n\t"
+    /*mm1=-L>R_i||R_i>L?R_i':0*/
+    "pand %%mm0,%%mm1\n\t"
+    /*mm0=-L>R_i||R_i>L?0:R_i*/
+    "psubw %%mm1,%%mm0\n\t"
+    /*mm0=-L>R_i||R_i>L?-R_i':R_i*/
+    "psubw %%mm1,%%mm0\n\t"
+    /*_pix[1]+=R_i;*/
+    "paddw %%mm0,%%mm5\n\t"
+    /*_pix[2]-=R_i;*/
+    "psubw %%mm0,%%mm4\n\t"
+    /*mm5=x x x x D 9 5 1*/
+    "packuswb %%mm7,%%mm5\n\t"
+    /*mm4=x x x x E A 6 2*/
+    "packuswb %%mm7,%%mm4\n\t"
+    /*mm5=E D A 9 6 5 2 1*/
+    "punpcklbw %%mm4,%%mm5\n\t"
+    /*edi=6 5 2 1*/
+    "movd %%mm5,%%edi\n\t"
+    "movw %%di,1(%[pix])\n\t"
+    /*Why is there such a big stall here?*/
+    "psrlq $32,%%mm5\n\t"
+    "shrl $16,%%edi\n\t"
+    "movw %%di,1(%[pix],%[ystride])\n\t"
+    /*edi=E D A 9*/
+    "movd %%mm5,%%edi\n\t"
+    "movw %%di,1(%[pix],%[ystride],2)\n\t"
+    "shrl $16,%%edi\n\t"
+    "movw %%di,1(%[pix],%[s])\n\t"
+    :[s]"=&S"(esi),[d]"=&D"(edi),
+     [pix]"+r"(_pix),[ystride]"+r"(_ystride),[ll]"+r"(_ll)
+    :[OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
+    :"memory"
+  );
+}
 
-#define OC_LOOP_H_4x4                                                   \
-    __asm__ __volatile__(                                               \
-    "lea (%1,%1,2),%%esi\n"     /* esi = ystride*3 */                   \
-    "movd (%0), %%mm0\n"        /* 0 0 0 0 3 2 1 0 */                   \
-    "movd (%0,%1),%%mm1\n"      /* 0 0 0 0 7 6 5 4 */                   \
-    "movd (%0,%1,2),%%mm2\n"    /* 0 0 0 0 b a 9 8 */                   \
-    "movd (%0,%%esi),%%mm3\n"   /* 0 0 0 0 f e d c */                   \
-    "punpcklbw %%mm1,%%mm0\n"   /* mm0 = 7 3 6 2 5 1 4 0 */             \
-    "punpcklbw %%mm3,%%mm2\n"   /* mm2 = f b e a d 9 c 8 */             \
-    "movq %%mm0,%%mm1\n"        /* mm1 = 7 3 6 2 5 1 4 0 */             \
-    "punpcklwd %%mm2,%%mm1\n"   /* mm1 = d 9 5 1 c 8 4 0 */             \
-    "punpckhwd %%mm2,%%mm0\n"   /* mm0 = f b 7 3 e a 6 2 */             \
-    "pxor %%mm7,%%mm7\n"                                                \
-    "movq %%mm1,%%mm5\n"        /* mm5 = d 9 5 1 c 8 4 0 */             \
-    "punpckhbw %%mm7,%%mm5\n"   /* mm5 = 0 d 0 9 0 5 0 1 = pix[1]*/     \
-    "punpcklbw %%mm7,%%mm1\n"   /* mm1 = 0 c 0 8 0 4 0 0 = pix[0]*/     \
-    "movq %%mm0,%%mm3\n"        /* mm3 = f b 7 3 e a 6 2 */             \
-    "punpckhbw %%mm7,%%mm3\n"   /* mm3 = 0 f 0 b 0 7 0 3 = pix[3]*/     \
-    "punpcklbw %%mm7,%%mm0\n"       /* mm0 = 0 e 0 a 0 6 0 2 = pix[2]*/ \
-                                                                        \
-    "psubw %%mm3,%%mm1\n"       /* mm1 = pix[0]-pix[3] mm1 - mm3 */     \
-    "movq %%mm0,%%mm7\n"        /* mm7 = pix[2]*/                       \
-    "psubw %%mm5,%%mm0\n"       /* mm0 = pix[2]-pix[1] mm0 - mm5*/      \
-    "PMULLW "MANGLE(V3)",%%mm0\n" /* *3 */                              \
-    "paddw %%mm0,%%mm1\n"         /* mm1 has f[0] ... f[4]*/            \
-    "paddw "MANGLE(V804)",%%mm1\n"/* add 4 */ /* add 256 after shift */ \
-    "psraw $3,%%mm1\n"          /* >>3 */                               \
-    " pextrw $0,%%mm1,%%esi\n"  /* In MM1 we have 4 f coefs (16bits) */ \
-    " pextrw $1,%%mm1,%%edi\n"  /* now perform MM4 = *(_bv+ f) */       \
-    " pinsrw $0,(%2,%%esi,2),%%mm4\n"                                   \
-    " pextrw $2,%%mm1,%%esi\n"                                          \
-    " pinsrw $1,(%2,%%edi,2),%%mm4\n"                                   \
-    " pextrw $3,%%mm1,%%edi\n"                                          \
-    " pinsrw $2,(%2,%%esi,2),%%mm4\n"                                   \
-    " pinsrw $3,(%2,%%edi,2),%%mm4\n" /* new f vals loaded */           \
-    "pxor %%mm0,%%mm0\n"                                                \
-    " paddw %%mm4,%%mm5\n"      /*(pix[1]+f);*/                         \
-    " psubw %%mm4,%%mm7\n"      /* (pix[2]-f); */                       \
-    " packuswb %%mm0,%%mm5\n"   /* mm5 = x x x x newpix1 */             \
-    " packuswb %%mm0,%%mm7\n"   /* mm7 = x x x x newpix2 */             \
-    " punpcklbw %%mm7,%%mm5\n"  /* 2 1 2 1 2 1 2 1 */                   \
-    " movd %%mm5,%%edi\n"       /* edi = newpix21 */                    \
-    " movw %%di,1(%0)\n"                                                \
-    " psrlq $32,%%mm5\n"        /* why is so big stall here ? */        \
-    " shrl $16,%%edi\n"                                                 \
-    " movw %%di,1(%0,%1,1)\n"                                           \
-    " movd %%mm5,%%edi\n"       /* eax = newpix21 high part */          \
-    " lea (%1,%1,2),%%esi\n"                                            \
-    " movw %%di,1(%0,%1,2)\n"                                           \
-    " shrl $16,%%edi\n"                                                 \
-    " movw %%di,1(%0,%%esi)\n"                                          \
-    :                                                                   \
-    : "r" (PixelPtr), "r" (LineLength), "r" (BoundingValuePtr-256)      \
-    : "esi", "edi" , "memory"                                           \
-    );
-
-    OC_LOOP_H_4x4
-    PixelPtr += LineLength*4;
-    OC_LOOP_H_4x4
-    __asm__ __volatile__("emms\n");
+static void loop_filter_h(unsigned char *_pix,int _ystride,
+			  const ogg_int16_t *_ll){
+  _pix-=2;
+  loop_filter_h4(_pix,_ystride,_ll);
+  loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll);
 }
+ 
+static void loop_filter_mmx(CP_INSTANCE *cpi, int FLimit){
+  int j;
+  ogg_int16_t __attribute__((aligned(8)))  ll[4];
+  unsigned char *cp = cpi->frag_coded;
+  ogg_uint32_t *bp = cpi->frag_buffer_index;
 
-static void FilterVert__mmx(unsigned char * PixelPtr,
-                ogg_int32_t LineLength,
-                ogg_int16_t *BoundingValuePtr){
-    __asm__ __volatile__(
-    "pxor %%mm0,%%mm0\n"        /* mm0 = 0 */
-    "movq (%0),%%mm7\n"         /* mm7 = pix[0..7] */
-    "lea (%1,%1,2),%%esi\n"     /* esi = ystride*3 */
-    "movq (%0,%%esi),%%mm4\n"   /* mm4 = pix[0..7+ystride*3] */
-    "movq %%mm7,%%mm6\n"        /* mm6 = pix[0..7] */
-    "punpcklbw %%mm0,%%mm6\n"   /* expand unsigned pix[0..3] to 16 bits */
-    "movq %%mm4,%%mm5\n"
-    "punpckhbw %%mm0,%%mm7\n"   /* expand unsigned pix[4..7] to 16 bits */
-    "punpcklbw %%mm0,%%mm4\n"   /* expand other arrays too */
-    "punpckhbw %%mm0,%%mm5\n"
-    "psubw %%mm4,%%mm6\n"       /* mm6 = mm6 - mm4 */
-    "psubw %%mm5,%%mm7\n"       /* mm7 = mm7 - mm5 */
-                /* mm7:mm6 = _p[0]-_p[ystride*3] */
-    "movq (%0,%1),%%mm4\n"      /* mm4 = pix[0..7+ystride] */
-    "movq %%mm4,%%mm5\n"
-    "movq (%0,%1,2),%%mm2\n"    /* mm2 = pix[0..7+ystride*2] */
-    "movq %%mm2,%%mm3\n"
-    "movq %%mm2,%%mm1\n"        //ystride*2
-    "punpckhbw %%mm0,%%mm5\n"
-    "punpcklbw %%mm0,%%mm4\n"
-    "punpckhbw %%mm0,%%mm3\n"
-    "punpcklbw %%mm0,%%mm2\n"
-    "psubw %%mm5,%%mm3\n"
-    "psubw %%mm4,%%mm2\n"
-                /* mm3:mm2 = (pix[ystride*2]-pix[ystride]); */
-    "PMULLW "MANGLE(V3)",%%mm3\n"    /* *3 */
-    "PMULLW "MANGLE(V3)",%%mm2\n"    /* *3 */
-    "paddw %%mm7,%%mm3\n"            /* highpart */
-    "paddw %%mm6,%%mm2\n"            /* lowpart of pix[0]-pix[ystride*3]+3*(pix[ystride*2]-pix[ystride]);  */
-    "paddw "MANGLE(V804)",%%mm3\n"   /* add 4 */ /* add 256 after shift */
-    "paddw "MANGLE(V804)",%%mm2\n"   /* add 4 */ /* add 256 after shift */
-    "psraw $3,%%mm3\n"               /* >>3 f coefs high */
-    "psraw $3,%%mm2\n"               /* >>3 f coefs low */
+  if ( FLimit == 0 ) return;
+  ll[0]=ll[1]=ll[2]=ll[3]=FLimit;
 
-    " pextrw $0,%%mm2,%%esi\n"  /* In MM3:MM2 we have f coefs (16bits) */
-    " pextrw $1,%%mm2,%%edi\n"  /* now perform MM7:MM6 = *(_bv+ f) */
-    " pinsrw $0,(%2,%%esi,2),%%mm6\n"
-    " pinsrw $1,(%2,%%edi,2),%%mm6\n"
+  for ( j = 0; j < 3 ; j++){
+    ogg_uint32_t *bp_begin = bp;
+    ogg_uint32_t *bp_end = bp + cpi->frag_n[j];
+    int stride = cpi->stride[j];
+    int h = cpi->frag_h[j];
 
-    " pextrw $2,%%mm2,%%esi\n"
-    " pextrw $3,%%mm2,%%edi\n"
-    " pinsrw $2,(%2,%%esi,2),%%mm6\n"
-    " pinsrw $3,(%2,%%edi,2),%%mm6\n"
+    while(bp<bp_end){ /* one pass per fragment row of plane j */
+      ogg_uint32_t *bp_left = bp; /* first fragment of this row */
+      ogg_uint32_t *bp_right = bp + h; /* one past the last fragment of this row */
+      while(bp<bp_right){
+	if(cp[0]){ /* only coded fragments drive filtering */
+	  if(bp>bp_left) /* left edge, unless in the first column */
+	    loop_filter_h(&cpi->lastrecon[bp[0]],stride,ll);
+	  if(bp_left>bp_begin) /* top edge, unless in the first row */
+	    loop_filter_v(&cpi->lastrecon[bp[0]],stride,ll);
+	  if(bp+1<bp_right && !cp[1]) /* right edge, only if the right neighbour is uncoded */
+	    loop_filter_h(&cpi->lastrecon[bp[0]]+8,stride,ll);
+	  if(bp+h<bp_end && !cp[h]) /* bottom edge: the fragment below is h entries on (h = frags per row), not stride */
+	    loop_filter_v(&cpi->lastrecon[bp[h]],stride,ll); /* bp[h] is the lower fragment's own offset; no +8 */
+	}
+	bp++;
+	cp++;
+      }
+    }
+  }
 
-    " pextrw $0,%%mm3,%%esi\n"
-    " pextrw $1,%%mm3,%%edi\n"
-    " pinsrw $0,(%2,%%esi,2),%%mm7\n"
-    " pinsrw $1,(%2,%%edi,2),%%mm7\n"
-
-    " pextrw $2,%%mm3,%%esi\n"
-    " pextrw $3,%%mm3,%%edi\n"
-    " pinsrw $2,(%2,%%esi,2),%%mm7\n"
-    " pinsrw $3,(%2,%%edi,2),%%mm7\n"   //MM7 MM6   f=*(_bv+(f+4>>3));
-
-    "paddw %%mm6,%%mm4\n"       /* (pix[ystride]+f); */
-    "paddw %%mm7,%%mm5\n"       /* (pix[ystride]+f); */
-    "movq %%mm1,%%mm2\n"
-    "punpcklbw %%mm0,%%mm1\n"
-    "punpckhbw %%mm0,%%mm2\n"   //[ystride*2]
-    "psubw %%mm6,%%mm1\n"       /* (pix[ystride*2]-f); */
-    "psubw %%mm7,%%mm2\n"       /* (pix[ystride*2]-f); */
-    "packuswb %%mm2,%%mm1\n"
-    "packuswb %%mm5,%%mm4\n"
-    "movq %%mm1,(%0,%1,2)\n"    /* pix[ystride*2]= */
-    "movq %%mm4,(%0,%1)\n"      /* pix[ystride]= */
-    "emms\n"
-    :
-    : "r" (PixelPtr-2*LineLength), "r" (LineLength), "r" (BoundingValuePtr-256)
-    : "esi", "edi" , "memory"
-    );
+  /*This needs to be removed when decode specific functions are implemented:*/
+  __asm__ __volatile__("emms\n\t");
 }
 
 /* install our implementation in the function table */
 void dsp_mmx_dct_decode_init(DspFunctions *funcs)
 {
-  funcs->FilterVert = FilterVert__mmx;
-  funcs->FilterHoriz = FilterHoriz__mmx;
+  funcs->LoopFilter = loop_filter_mmx;
 }
 
 #endif /* USE_ASM */

Modified: branches/theora-thusnelda/lib/enc/x86_64/dct_decode_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86_64/dct_decode_mmx.c	2008-06-22 13:24:13 UTC (rev 15055)
+++ branches/theora-thusnelda/lib/enc/x86_64/dct_decode_mmx.c	2008-06-22 19:50:32 UTC (rev 15056)
@@ -21,165 +21,377 @@
 
 #if defined(USE_ASM)
 
-static const __attribute__((aligned(8),used)) ogg_int64_t V3= 0x0003000300030003LL;
-static const __attribute__((aligned(8),used)) ogg_int64_t V804= 0x0804080408040804LL;
+static const __attribute__((aligned(8),used)) ogg_int64_t OC_V3=
+ 0x0003000300030003LL;
+static const __attribute__((aligned(8),used)) ogg_int64_t OC_V4=
+ 0x0004000400040004LL;
 
-#define OC_LOOP_H_4x4                                                   \
-    "lea (%[ll],%[ll],2),%[s]\n"     /* esi = ystride*3 */                   \
-    "movd (%[pp]), %%mm0\n"        /* 0 0 0 0 3 2 1 0 */                   \
-    "movd (%[pp],%[ll]),%%mm1\n"      /* 0 0 0 0 7 6 5 4 */                   \
-    "movd (%[pp],%[ll],2),%%mm2\n"    /* 0 0 0 0 b a 9 8 */                   \
-    "movd (%[pp],%[s]),%%mm3\n"   /* 0 0 0 0 f e d c */                   \
-    "punpcklbw %%mm1,%%mm0\n"   /* mm0 = 7 3 6 2 5 1 4 0 */             \
-    "punpcklbw %%mm3,%%mm2\n"   /* mm2 = f b e a d 9 c 8 */             \
-    "movq %%mm0,%%mm1\n"        /* mm1 = 7 3 6 2 5 1 4 0 */             \
-    "punpcklwd %%mm2,%%mm1\n"   /* mm1 = d 9 5 1 c 8 4 0 */             \
-    "punpckhwd %%mm2,%%mm0\n"   /* mm0 = f b 7 3 e a 6 2 */             \
-    "pxor %%mm7,%%mm7\n"                                                \
-    "movq %%mm1,%%mm5\n"        /* mm5 = d 9 5 1 c 8 4 0 */             \
-    "punpckhbw %%mm7,%%mm5\n"   /* mm5 = 0 d 0 9 0 5 0 1 = pix[1]*/     \
-    "punpcklbw %%mm7,%%mm1\n"   /* mm1 = 0 c 0 8 0 4 0 0 = pix[0]*/     \
-    "movq %%mm0,%%mm3\n"        /* mm3 = f b 7 3 e a 6 2 */             \
-    "punpckhbw %%mm7,%%mm3\n"   /* mm3 = 0 f 0 b 0 7 0 3 = pix[3]*/     \
-    "punpcklbw %%mm7,%%mm0\n"       /* mm0 = 0 e 0 a 0 6 0 2 = pix[2]*/ \
-                                                                        \
-    "psubw %%mm3,%%mm1\n"       /* mm1 = pix[0]-pix[3] mm1 - mm3 */     \
-    "movq %%mm0,%%mm7\n"        /* mm7 = pix[2]*/                       \
-    "psubw %%mm5,%%mm0\n"       /* mm0 = pix[2]-pix[1] mm0 - mm5*/      \
-    "pmullw %[V3],%%mm0\n"      /* *3 */                              \
-    "paddw %%mm0,%%mm1\n"         /* mm1 has f[0] ... f[4]*/            \
-    "paddw %[V804],%%mm1\n"     /* add 4 */ /* add 256 after shift */ \
-    "psraw $3,%%mm1\n"          /* >>3 */                               \
-    " pextrw $0,%%mm1,%[s]\n"  /* In MM1 we have 4 f coefs (16bits) */ \
-    " pextrw $1,%%mm1,%[d]\n"  /* now perform MM4 = *(_bv+ f) */       \
-    " pinsrw $0,(%[bound],%[s],2),%%mm4\n"                                   \
-    " pextrw $2,%%mm1,%[s]\n"                                          \
-    " pinsrw $1,(%[bound],%[d],2),%%mm4\n"                                   \
-    " pextrw $3,%%mm1,%[d]\n"                                          \
-    " pinsrw $2,(%[bound],%[s],2),%%mm4\n"                                   \
-    " pinsrw $3,(%[bound],%[d],2),%%mm4\n" /* new f vals loaded */           \
-    "pxor %%mm0,%%mm0\n"                                                \
-    " paddw %%mm4,%%mm5\n"      /*(pix[1]+f);*/                         \
-    " psubw %%mm4,%%mm7\n"      /* (pix[2]-f); */                       \
-    " packuswb %%mm0,%%mm5\n"   /* mm5 = x x x x newpix1 */             \
-    " packuswb %%mm0,%%mm7\n"   /* mm7 = x x x x newpix2 */             \
-    " punpcklbw %%mm7,%%mm5\n"  /* 2 1 2 1 2 1 2 1 */                   \
-    " movd %%mm5,%[d]\n"       /* edi = newpix21 */                    \
-    " movw %[d],1(%[pp])\n"                                                \
-    " psrlq $32,%%mm5\n"        /* why is so big stall here ? */        \
-    " shr  $16,%[d]\n"                                                 \
-    " movw %[d],1(%[pp],%[ll],1)\n"                                           \
-    " movd %%mm5,%[d]\n"       /* eax = newpix21 high part */          \
-    " lea (%[ll],%[ll],2),%[s]\n"                                            \
-    " movw %[d],1(%[pp],%[ll],2)\n"                                           \
-    " shr $16,%[d]\n"                                                 \
-    " movw %[d],1(%[pp],%[s])\n"                                          
+static void loop_filter_v(unsigned char *_pix,int _ystride,
+			  const ogg_int16_t *_ll){
+  long esi;
+  _pix-=_ystride*2;
+  __asm__ __volatile__(
+    /*mm0=0*/
+    "pxor %%mm0,%%mm0\n\t"
+    /*esi=_ystride*3*/
+    "lea (%[ystride],%[ystride],2),%[s]\n\t"
+    /*mm7=_pix[0...8]*/
+    "movq (%[pix]),%%mm7\n\t"
+    /*mm4=_pix[0...8+_ystride*3]*/
+    "movq (%[pix],%[s]),%%mm4\n\t"
+    /*mm6=_pix[0...8]*/
+    "movq %%mm7,%%mm6\n\t"
+    /*Expand unsigned _pix[0...3] to 16 bits.*/
+    "punpcklbw %%mm0,%%mm6\n\t"
+    "movq %%mm4,%%mm5\n\t"
+    /*Expand unsigned _pix[4...8] to 16 bits.*/
+    "punpckhbw %%mm0,%%mm7\n\t"
+    /*Expand other arrays too.*/
+    "punpcklbw %%mm0,%%mm4\n\t"
+    "punpckhbw %%mm0,%%mm5\n\t"
+    /*mm7:mm6=_p[0...8]-_p[0...8+_ystride*3]:*/
+    "psubw %%mm4,%%mm6\n\t"
+    "psubw %%mm5,%%mm7\n\t"
+    /*mm5=mm4=_pix[0...8+_ystride]*/
+    "movq (%[pix],%[ystride]),%%mm4\n\t"
+    /*mm1=mm3=mm2=_pix[0...8+_ystride*2]*/
+    "movq (%[pix],%[ystride],2),%%mm2\n\t"
+    "movq %%mm4,%%mm5\n\t"
+    "movq %%mm2,%%mm3\n\t"
+    "movq %%mm2,%%mm1\n\t"
+    /*Expand these arrays.*/
+    "punpckhbw %%mm0,%%mm5\n\t"
+    "punpcklbw %%mm0,%%mm4\n\t"
+    "punpckhbw %%mm0,%%mm3\n\t"
+    "punpcklbw %%mm0,%%mm2\n\t"
+    /*Preload...*/
+    "movq %[OC_V3],%%mm0\n\t"
+    /*mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
+    "psubw %%mm5,%%mm3\n\t"
+    "psubw %%mm4,%%mm2\n\t"
+    /*Scale by 3.*/
+    "pmullw %%mm0,%%mm3\n\t"
+    "pmullw %%mm0,%%mm2\n\t"
+    /*Preload...*/
+    "movq %[OC_V4],%%mm0\n\t"
+    /*f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
+       3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
+    "paddw %%mm7,%%mm3\n\t"
+    "paddw %%mm6,%%mm2\n\t"
+    /*Add 4.*/
+    "paddw %%mm0,%%mm3\n\t"
+    "paddw %%mm0,%%mm2\n\t"
+    /*"Divide" by 8.*/
+    "psraw $3,%%mm3\n\t"
+    "psraw $3,%%mm2\n\t"
+    /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the spec.*/
+    /*Free up mm5.*/
+    "packuswb %%mm5,%%mm4\n\t"
+    /*mm0=L L L L*/
+    "movq (%[ll]),%%mm0\n\t"
+    /*if(R_i<-2L||R_i>2L)R_i=0:*/
+    "movq %%mm2,%%mm5\n\t"
+    "pxor %%mm6,%%mm6\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    "psubw %%mm0,%%mm6\n\t"
+    "psllw $1,%%mm7\n\t"
+    "psllw $1,%%mm6\n\t"
+    /*mm2==R_3 R_2 R_1 R_0*/
+    /*mm5==R_3 R_2 R_1 R_0*/
+    /*mm6==-2L -2L -2L -2L*/
+    /*mm7==2L 2L 2L 2L*/
+    "pcmpgtw %%mm2,%%mm7\n\t"
+    "pcmpgtw %%mm6,%%mm5\n\t"
+    "pand %%mm7,%%mm2\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    "pand %%mm5,%%mm2\n\t"
+    "psllw $1,%%mm7\n\t"
+    "movq %%mm3,%%mm5\n\t"
+    /*mm3==R_7 R_6 R_5 R_4*/
+    /*mm5==R_7 R_6 R_5 R_4*/
+    /*mm6==-2L -2L -2L -2L*/
+    /*mm7==2L 2L 2L 2L*/
+    "pcmpgtw %%mm3,%%mm7\n\t"
+    "pcmpgtw %%mm6,%%mm5\n\t"
+    "pand %%mm7,%%mm3\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    "pand %%mm5,%%mm3\n\t"
+    /*if(R_i<-L)R_i'=R_i+2L;
+      if(R_i>L)R_i'=R_i-2L;
+      if(R_i<-L||R_i>L)R_i=-R_i':*/
+    "psraw $1,%%mm6\n\t"
+    "movq %%mm2,%%mm5\n\t"
+    "psllw $1,%%mm7\n\t"
+    /*mm2==R_3 R_2 R_1 R_0*/
+    /*mm5==R_3 R_2 R_1 R_0*/
+    /*mm6==-L -L -L -L*/
+    /*mm0==L L L L*/
+    /*mm5=R_i>L?FF:00*/
+    "pcmpgtw %%mm0,%%mm5\n\t"
+    /*mm6=-L>R_i?FF:00*/
+    "pcmpgtw %%mm2,%%mm6\n\t"
+    /*mm7=R_i>L?2L:0*/
+    "pand %%mm5,%%mm7\n\t"
+    /*mm2=R_i>L?R_i-2L:R_i*/
+    "psubw %%mm7,%%mm2\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    /*mm5=-L>R_i||R_i>L*/
+    "por %%mm6,%%mm5\n\t"
+    "psllw $1,%%mm7\n\t"
+    /*mm7=-L>R_i?2L:0*/
+    "pand %%mm6,%%mm7\n\t"
+    "pxor %%mm6,%%mm6\n\t"
+    /*mm2=-L>R_i?R_i+2L:R_i*/
+    "paddw %%mm7,%%mm2\n\t"
+    "psubw %%mm0,%%mm6\n\t"
+    /*mm5=-L>R_i||R_i>L?R_i':0*/
+    "pand %%mm2,%%mm5\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    /*mm2=-L>R_i||R_i>L?0:R_i*/
+    "psubw %%mm5,%%mm2\n\t"
+    "psllw $1,%%mm7\n\t"
+    /*mm2=-L>R_i||R_i>L?-R_i':R_i*/
+    "psubw %%mm5,%%mm2\n\t"
+    "movq %%mm3,%%mm5\n\t"
+    /*mm3==R_7 R_6 R_5 R_4*/
+    /*mm5==R_7 R_6 R_5 R_4*/
+    /*mm6==-L -L -L -L*/
+    /*mm0==L L L L*/
+    /*mm6=-L>R_i?FF:00*/
+    "pcmpgtw %%mm3,%%mm6\n\t"
+    /*mm5=R_i>L?FF:00*/
+    "pcmpgtw %%mm0,%%mm5\n\t"
+    /*mm7=R_i>L?2L:0*/
+    "pand %%mm5,%%mm7\n\t"
+    /*mm3=R_i>L?R_i-2L:R_i*/
+    "psubw %%mm7,%%mm3\n\t"
+    "psllw $1,%%mm0\n\t"
+    /*mm5=-L>R_i||R_i>L*/
+    "por %%mm6,%%mm5\n\t"
+    /*mm0=-L>R_i?2L:0*/
+    "pand %%mm6,%%mm0\n\t"
+    /*mm3=-L>R_i?R_i+2L:R_i*/
+    "paddw %%mm0,%%mm3\n\t"
+    /*mm5=-L>R_i||R_i>L?R_i':0*/
+    "pand %%mm3,%%mm5\n\t"
+    /*mm3=-L>R_i||R_i>L?0:R_i*/
+    "psubw %%mm5,%%mm3\n\t"
+    /*mm3=-L>R_i||R_i>L?-R_i':R_i*/
+    "psubw %%mm5,%%mm3\n\t"
+    /*Unfortunately, there's no unsigned byte+signed byte with unsigned
+       saturation op code, so we have to promote things back 16 bits.*/
+    "pxor %%mm0,%%mm0\n\t"
+    "movq %%mm4,%%mm5\n\t"
+    "punpcklbw %%mm0,%%mm4\n\t"
+    "punpckhbw %%mm0,%%mm5\n\t"
+    "movq %%mm1,%%mm6\n\t"
+    "punpcklbw %%mm0,%%mm1\n\t"
+    "punpckhbw %%mm0,%%mm6\n\t"
+    /*_pix[0...8+_ystride]+=R_i*/
+    "paddw %%mm2,%%mm4\n\t"
+    "paddw %%mm3,%%mm5\n\t"
+    /*_pix[0...8+_ystride*2]-=R_i*/
+    "psubw %%mm2,%%mm1\n\t"
+    "psubw %%mm3,%%mm6\n\t"
+    "packuswb %%mm5,%%mm4\n\t"
+    "packuswb %%mm6,%%mm1\n\t"
+    /*Write it back out.*/
+    "movq %%mm4,(%[pix],%[ystride])\n\t"
+    "movq %%mm1,(%[pix],%[ystride],2)\n\t"
+    :[s]"=&S"(esi)
+    :[pix]"r"(_pix),[ystride]"r"((long)_ystride),[ll]"r"(_ll),
+     [OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
+    :"memory"
+  );
+}
 
-static void FilterHoriz__mmx(unsigned char * PixelPtr,
-			     ogg_int32_t LineLength,
-			     ogg_int16_t *BoundingValuePtr){
+/*This code implements the bulk of loop_filter_h().
+  Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
+   four p0's to one register we must transpose the values in four mmx regs.
+  When half is done we repeat this for the rest.*/
+static void loop_filter_h4(unsigned char *_pix,long _ystride,
+			   const ogg_int16_t *_ll){
   long esi;
   long edi;
   __asm__ __volatile__(
-    OC_LOOP_H_4x4
-    : [s]"=&r"(esi),[d]"=&r"(edi)                                       
-    : [pp]"r"(PixelPtr), [ll]"r"((long)LineLength), [bound]"r"(BoundingValuePtr-256), [V3]"m"(V3), [V804]"m"(V804) 
-    : "memory"                                           
+    /*x x x x 3 2 1 0*/
+    "movd (%[pix]),%%mm0\n\t"
+    /*esi=_ystride*3*/
+    "lea (%[ystride],%[ystride],2),%[s]\n\t"
+    /*x x x x 7 6 5 4*/
+    "movd (%[pix],%[ystride]),%%mm1\n\t"
+    /*x x x x B A 9 8*/
+    "movd (%[pix],%[ystride],2),%%mm2\n\t"
+    /*x x x x F E D C*/
+    "movd (%[pix],%[s]),%%mm3\n\t"
+    /*mm0=7 3 6 2 5 1 4 0*/
+    "punpcklbw %%mm1,%%mm0\n\t"
+    /*mm2=F B E A D 9 C 8*/
+    "punpcklbw %%mm3,%%mm2\n\t"
+    /*mm1=7 3 6 2 5 1 4 0*/
+    "movq %%mm0,%%mm1\n\t"
+    /*mm0=F B 7 3 E A 6 2*/
+    "punpckhwd %%mm2,%%mm0\n\t"
+    /*mm1=D 9 5 1 C 8 4 0*/
+    "punpcklwd %%mm2,%%mm1\n\t"
+    "pxor %%mm7,%%mm7\n\t"
+    /*mm5=D 9 5 1 C 8 4 0*/
+    "movq %%mm1,%%mm5\n\t"
+    /*mm1=x C x 8 x 4 x 0==pix[0]*/
+    "punpcklbw %%mm7,%%mm1\n\t"
+    /*mm5=x D x 9 x 5 x 1==pix[1]*/
+    "punpckhbw %%mm7,%%mm5\n\t"
+    /*mm3=F B 7 3 E A 6 2*/
+    "movq %%mm0,%%mm3\n\t"
+    /*mm0=x E x A x 6 x 2==pix[2]*/
+    "punpcklbw %%mm7,%%mm0\n\t"
+    /*mm3=x F x B x 7 x 3==pix[3]*/
+    "punpckhbw %%mm7,%%mm3\n\t"
+    /*mm1=mm1-mm3==pix[0]-pix[3]*/
+    "psubw %%mm3,%%mm1\n\t"
+    /*Save a copy of pix[2] for later.*/
+    "movq %%mm0,%%mm4\n\t"
+    /*mm0=mm0-mm5==pix[2]-pix[1]*/
+    "psubw %%mm5,%%mm0\n\t"
+    /*Scale by 3.*/
+    "pmullw %[OC_V3],%%mm0\n\t"
+    /*f=mm0==_pix[0]-_pix[3]+ 3*(_pix[2]-_pix[1])*/
+    "paddw %%mm1,%%mm0\n\t"
+    /*Add 4.*/
+    "paddw %[OC_V4],%%mm0\n\t"
+    /*"Divide" by 8, producing the residuals R_i.*/
+    "psraw $3,%%mm0\n\t"
+    /*Now compute lflim of mm0 cf. Section 7.10 of the spec.*/
+    /*mm6=L L L L*/
+    "movq (%[ll]),%%mm6\n\t"
+    /*if(R_i<-2L||R_i>2L)R_i=0:*/
+    "movq %%mm0,%%mm1\n\t"
+    "pxor %%mm2,%%mm2\n\t"
+    "movq %%mm6,%%mm3\n\t"
+    "psubw %%mm6,%%mm2\n\t"
+    "psllw $1,%%mm3\n\t"
+    "psllw $1,%%mm2\n\t"
+    /*mm0==R_3 R_2 R_1 R_0*/
+    /*mm1==R_3 R_2 R_1 R_0*/
+    /*mm2==-2L -2L -2L -2L*/
+    /*mm3==2L 2L 2L 2L*/
+    "pcmpgtw %%mm0,%%mm3\n\t"
+    "pcmpgtw %%mm2,%%mm1\n\t"
+    "pand %%mm3,%%mm0\n\t"
+    "pand %%mm1,%%mm0\n\t"
+    /*if(R_i<-L)R_i'=R_i+2L;
+      if(R_i>L)R_i'=R_i-2L;
+      if(R_i<-L||R_i>L)R_i=-R_i':*/
+    "psraw $1,%%mm2\n\t"
+    "movq %%mm0,%%mm1\n\t"
+    "movq %%mm6,%%mm3\n\t"
+    /*mm0==R_3 R_2 R_1 R_0*/
+    /*mm1==R_3 R_2 R_1 R_0*/
+    /*mm2==-L -L -L -L*/
+    /*mm6==L L L L*/
+    /*mm2=-L>R_i?FF:00*/
+    "pcmpgtw %%mm0,%%mm2\n\t"
+    /*mm1=R_i>L?FF:00*/
+    "pcmpgtw %%mm6,%%mm1\n\t"
+    /*mm3=2L 2L 2L 2L*/
+    "psllw $1,%%mm3\n\t"
+    /*mm6=2L 2L 2L 2L*/
+    "psllw $1,%%mm6\n\t"
+    /*mm3=R_i>L?2L:0*/
+    "pand %%mm1,%%mm3\n\t"
+    /*mm6=-L>R_i?2L:0*/
+    "pand %%mm2,%%mm6\n\t"
+    /*mm0=R_i>L?R_i-2L:R_i*/
+    "psubw %%mm3,%%mm0\n\t"
+    /*mm1=-L>R_i||R_i>L*/
+    "por %%mm2,%%mm1\n\t"
+    /*mm0=-L>R_i?R_i+2L:R_i*/
+    "paddw %%mm6,%%mm0\n\t"
+    /*mm1=-L>R_i||R_i>L?R_i':0*/
+    "pand %%mm0,%%mm1\n\t"
+    /*mm0=-L>R_i||R_i>L?0:R_i*/
+    "psubw %%mm1,%%mm0\n\t"
+    /*mm0=-L>R_i||R_i>L?-R_i':R_i*/
+    "psubw %%mm1,%%mm0\n\t"
+    /*_pix[1]+=R_i;*/
+    "paddw %%mm0,%%mm5\n\t"
+    /*_pix[2]-=R_i;*/
+    "psubw %%mm0,%%mm4\n\t"
+    /*mm5=x x x x D 9 5 1*/
+    "packuswb %%mm7,%%mm5\n\t"
+    /*mm4=x x x x E A 6 2*/
+    "packuswb %%mm7,%%mm4\n\t"
+    /*mm5=E D A 9 6 5 2 1*/
+    "punpcklbw %%mm4,%%mm5\n\t"
+    /*edi=6 5 2 1*/
+    "movd %%mm5,%%edi\n\t"
+    "movw %%di,1(%[pix])\n\t"
+    /*Why is there such a big stall here?*/
+    "psrlq $32,%%mm5\n\t"
+    "shrl $16,%%edi\n\t"
+    "movw %%di,1(%[pix],%[ystride])\n\t"
+    /*edi=E D A 9*/
+    "movd %%mm5,%%edi\n\t"
+    "movw %%di,1(%[pix],%[ystride],2)\n\t"
+    "shrl $16,%%edi\n\t"
+    "movw %%di,1(%[pix],%[s])\n\t"
+    :[s]"=&S"(esi),[d]"=&D"(edi),
+     [pix]"+r"(_pix),[ystride]"+r"(_ystride),[ll]"+r"(_ll)
+    :[OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
+    :"memory"
   );
+}
 
-    PixelPtr += LineLength*4;
-
-  __asm__ __volatile__(
-    OC_LOOP_H_4x4
-    "emms\n"       
-    : [s]"=&r"(esi),[d]"=&r"(edi)                                       \
-    : [pp]"r"(PixelPtr), [ll]"r"((long)LineLength), [bound]"r"(BoundingValuePtr-256), [V3]"m"(V3), [V804]"m"(V804) \
-    : "memory"                                           \
-    );
+static void loop_filter_h(unsigned char *_pix,int _ystride, /*Filter the 8-row vertical edge at _pix.*/
+			  const ogg_int16_t *_ll){
+  _pix-=2; /*Back up so the edge lies between columns 1 and 2 of each row.*/
+  loop_filter_h4(_pix,_ystride,_ll); /*Rows 0...3.*/
+  loop_filter_h4(_pix+_ystride*4,_ystride,_ll); /*Rows 4...7; multiply, since shifting a negative stride is undefined.*/
 }
+ 
+static void loop_filter_mmx(CP_INSTANCE *cpi, int FLimit){ /*Loop-filter cpi->lastrecon in place.*/
+  int j;
+  ogg_int16_t __attribute__((aligned(8)))  ll[4]; /*FLimit in all four words, read via movq (%[ll]).*/
+  unsigned char *cp = cpi->frag_coded;
+  ogg_uint32_t *bp = cpi->frag_buffer_index;
 
-static void FilterVert__mmx(unsigned char * PixelPtr,
-			    ogg_int32_t LineLength,
-			    ogg_int16_t *BoundingValuePtr){
-  long esi,edi;
-  __asm__ __volatile__(
-    "pxor %%mm0,%%mm0\n"        /* mm0 = 0 */
-    "movq (%[pp]),%%mm7\n"         /* mm7 = pix[0..7] */
-    "lea (%[ll],%[ll],2),%[s]\n"     /* esi = ystride*3 */
-    "movq (%[pp],%[s]),%%mm4\n"   /* mm4 = pix[0..7+ystride*3] */
-    "movq %%mm7,%%mm6\n"        /* mm6 = pix[0..7] */
-    "punpcklbw %%mm0,%%mm6\n"   /* expand unsigned pix[0..3] to 16 bits */
-    "movq %%mm4,%%mm5\n"
-    "punpckhbw %%mm0,%%mm7\n"   /* expand unsigned pix[4..7] to 16 bits */
-    "punpcklbw %%mm0,%%mm4\n"   /* expand other arrays too */
-    "punpckhbw %%mm0,%%mm5\n"
-    "psubw %%mm4,%%mm6\n"       /* mm6 = mm6 - mm4 */
-    "psubw %%mm5,%%mm7\n"       /* mm7 = mm7 - mm5 */
-                /* mm7:mm6 = _p[0]-_p[ystride*3] */
-    "movq (%[pp],%[ll]),%%mm4\n"      /* mm4 = pix[0..7+ystride] */
-    "movq %%mm4,%%mm5\n"
-    "movq (%[pp],%[ll],2),%%mm2\n"    /* mm2 = pix[0..7+ystride*2] */
-    "movq %%mm2,%%mm3\n"
-    "movq %%mm2,%%mm1\n"        //ystride*2
-    "punpckhbw %%mm0,%%mm5\n"
-    "punpcklbw %%mm0,%%mm4\n"
-    "punpckhbw %%mm0,%%mm3\n"
-    "punpcklbw %%mm0,%%mm2\n"
-    "psubw %%mm5,%%mm3\n"
-    "psubw %%mm4,%%mm2\n"
-                /* mm3:mm2 = (pix[ystride*2]-pix[ystride]); */
-    "pmullw %[V3],%%mm3\n"           /* *3 */
-    "pmullw %[V3],%%mm2\n"           /* *3 */
-    "paddw %%mm7,%%mm3\n"            /* highpart */
-    "paddw %%mm6,%%mm2\n"            /* lowpart of pix[0]-pix[ystride*3]+3*(pix[ystride*2]-pix[ystride]);  */
-    "paddw %[V804],%%mm3\n"          /* add 4 */ /* add 256 after shift */
-    "paddw %[V804],%%mm2\n"          /* add 4 */ /* add 256 after shift */
-    "psraw $3,%%mm3\n"               /* >>3 f coefs high */
-    "psraw $3,%%mm2\n"               /* >>3 f coefs low */
+  if ( FLimit == 0 ) return; /*A zero limit skips filtering entirely.*/
+  ll[0]=ll[1]=ll[2]=ll[3]=FLimit;
 
-    " pextrw $0,%%mm2,%[s]\n"  /* In MM3:MM2 we have f coefs (16bits) */
-    " pextrw $1,%%mm2,%[d]\n"  /* now perform MM7:MM6 = *(_bv+ f) */
-    " pinsrw $0,(%[bound],%[s],2),%%mm6\n"
-    " pinsrw $1,(%[bound],%[d],2),%%mm6\n"
+  for ( j = 0; j < 3 ; j++){ /*One pass per entry of frag_n/stride/frag_h.*/
+    ogg_uint32_t *bp_begin = bp;
+    ogg_uint32_t *bp_end = bp + cpi->frag_n[j];
+    int stride = cpi->stride[j]; /*Pixel stride handed to the filters; not a fragment count.*/
+    int h = cpi->frag_h[j]; /*Fragments per row, so bp[h]/cp[h] is the fragment one row on.*/
 
-    " pextrw $2,%%mm2,%[s]\n"
-    " pextrw $3,%%mm2,%[d]\n"
-    " pinsrw $2,(%[bound],%[s],2),%%mm6\n"
-    " pinsrw $3,(%[bound],%[d],2),%%mm6\n"
+    while(bp<bp_end){
+      ogg_uint32_t *bp_left = bp;
+      ogg_uint32_t *bp_right = bp + h;
+      while(bp<bp_right){
+	if(cp[0]){ /*Only coded fragments drive filtering.*/
+	  if(bp>bp_left) /*Left edge, unless first in the row.*/
+	    loop_filter_h(&cpi->lastrecon[bp[0]],stride,ll);
+	  if(bp_left>bp_begin) /*Edge shared with the previous row, unless in the first row.*/
+	    loop_filter_v(&cpi->lastrecon[bp[0]],stride,ll);
+	  if(bp+1<bp_right && !cp[1]) /*Right edge, only when the right neighbour is uncoded.*/
+	    loop_filter_h(&cpi->lastrecon[bp[0]]+8,stride,ll);
+	  if(bp+h<bp_end && !cp[h]) /*Edge shared with the next row, only when that neighbour is uncoded.*/
+	    loop_filter_v(&cpi->lastrecon[bp[h]],stride,ll);
+	}
+	bp++;
+	cp++;
+      }
+    }
+  }
 
-    " pextrw $0,%%mm3,%[s]\n"
-    " pextrw $1,%%mm3,%[d]\n"
-    " pinsrw $0,(%[bound],%[s],2),%%mm7\n"
-    " pinsrw $1,(%[bound],%[d],2),%%mm7\n"
-
-    " pextrw $2,%%mm3,%[s]\n"
-    " pextrw $3,%%mm3,%[d]\n"
-    " pinsrw $2,(%[bound],%[s],2),%%mm7\n"
-    " pinsrw $3,(%[bound],%[d],2),%%mm7\n"   //MM7 MM6   f=*(_bv+(f+4>>3));
-
-    "paddw %%mm6,%%mm4\n"       /* (pix[ystride]+f); */
-    "paddw %%mm7,%%mm5\n"       /* (pix[ystride]+f); */
-    "movq %%mm1,%%mm2\n"
-    "punpcklbw %%mm0,%%mm1\n"
-    "punpckhbw %%mm0,%%mm2\n"   //[ystride*2]
-    "psubw %%mm6,%%mm1\n"       /* (pix[ystride*2]-f); */
-    "psubw %%mm7,%%mm2\n"       /* (pix[ystride*2]-f); */
-    "packuswb %%mm2,%%mm1\n"
-    "packuswb %%mm5,%%mm4\n"
-    "movq %%mm1,(%[pp],%[ll],2)\n"    /* pix[ystride*2]= */
-    "movq %%mm4,(%[pp],%[ll])\n"      /* pix[ystride]= */
-    "emms\n"
-    : [s]"=&r"(esi),[d]"=&r"(edi)                                       
-    : [pp]"r"(PixelPtr-2*LineLength), [ll]"r"((long)LineLength), [bound]"r"(BoundingValuePtr-256), [V3]"m"(V3), [V804]"m"(V804)
-    : "memory"
-    );
+  /*This needs to be removed when decode specific functions are implemented:*/
+  __asm__ __volatile__("emms\n\t");
 }
 
 /* install our implementation in the function table */
 void dsp_mmx_dct_decode_init(DspFunctions *funcs)
 {
-  funcs->FilterVert = FilterVert__mmx;
-  funcs->FilterHoriz = FilterHoriz__mmx;
+  funcs->LoopFilter = loop_filter_mmx;
 }
 
 #endif /* USE_ASM */