[xiph-commits] r15057 - in trunk/theora/lib/enc: . x86_32 x86_64

xiphmont at svn.xiph.org xiphmont at svn.xiph.org
Sun Jun 22 14:07:32 PDT 2008


Author: xiphmont
Date: 2008-06-22 14:07:32 -0700 (Sun, 22 Jun 2008)
New Revision: 15057

Modified:
   trunk/theora/lib/enc/codec_internal.h
   trunk/theora/lib/enc/dct_decode.c
   trunk/theora/lib/enc/dsp.h
   trunk/theora/lib/enc/pp.c
   trunk/theora/lib/enc/x86_32/dct_decode_mmx.c
   trunk/theora/lib/enc/x86_64/dct_decode_mmx.c
Log:
Eliminate use of SSE instruction in loop filter MMX code by replacing 
current loop filter with one from Derf's new decoder.  This required 
some amount of code refactoring as Derf's code expects slightly 
different input.



Modified: trunk/theora/lib/enc/codec_internal.h
===================================================================
--- trunk/theora/lib/enc/codec_internal.h	2008-06-22 19:50:32 UTC (rev 15056)
+++ trunk/theora/lib/enc/codec_internal.h	2008-06-22 21:07:32 UTC (rev 15057)
@@ -22,9 +22,11 @@
 # include "config.h"
 #endif
 
+typedef struct PB_INSTANCE PB_INSTANCE;
+#include "dsp.h"
+
 #include "theora/theora.h"
 #include "encoder_huffman.h"
-#include "dsp.h"
 
 #define theora_read(x,y,z) ( *z = oggpackB_read(x,y) )
 
@@ -279,11 +281,10 @@
 
   HUFF_ENTRY *HuffRoot[NUM_HUFF_TABLES];
 
-  unsigned char LoopFilterLimitValues[Q_TABLE_SIZE];
 } codec_setup_info;
 
 /** Decoder (Playback) instance -- installed in a theora_state */
-typedef struct PB_INSTANCE {
+struct PB_INSTANCE {
   oggpack_buffer *opb;
   theora_info     info;
   
@@ -526,7 +527,7 @@
                                                Array Pointers */
 #endif
 
-} PB_INSTANCE;
+};
 
 /* Encoder (Compressor) instance -- installed in a theora_state */
 typedef struct CP_INSTANCE {

Modified: trunk/theora/lib/enc/dct_decode.c
===================================================================
--- trunk/theora/lib/enc/dct_decode.c	2008-06-22 19:50:32 UTC (rev 15056)
+++ trunk/theora/lib/enc/dct_decode.c	2008-06-22 21:07:32 UTC (rev 15057)
@@ -31,45 +31,21 @@
 
 static const int ModeUsesMC[MAX_MODES] = { 0, 0, 1, 1, 1, 0, 1, 1 };
 
-static void SetupBoundingValueArray_Generic(PB_INSTANCE *pbi,
+static void SetupBoundingValueArray_Generic(ogg_int16_t *BoundingValuePtr,
                                             ogg_int32_t FLimit){
 
-  ogg_int16_t * BoundingValuePtr = pbi->FiltBoundingValue+127;
   ogg_int32_t i;
 
   /* Set up the bounding value array. */
-  memset ( pbi->FiltBoundingValue, 0, (256*sizeof(*pbi->FiltBoundingValue)) );
+  memset ( BoundingValuePtr, 0, (256*sizeof(*BoundingValuePtr)) );
   for ( i = 0; i < FLimit; i++ ){
-    BoundingValuePtr[-i-FLimit] = (-FLimit+i);
-    BoundingValuePtr[-i] = -i;
-    BoundingValuePtr[i] = i;
-    BoundingValuePtr[i+FLimit] = FLimit-i;
+    BoundingValuePtr[127-i-FLimit] = (-FLimit+i);
+    BoundingValuePtr[127-i] = -i;
+    BoundingValuePtr[127+i] = i;
+    BoundingValuePtr[127+i+FLimit] = FLimit-i;
   }
 }
 
-/* handle the in-loop filter limit value table */
-
-int ReadFilterTables(codec_setup_info *ci, oggpack_buffer *opb){
-  int i;
-  int bits, value;
-
-  theora_read(opb, 3, &bits);
-  for(i=0;i<Q_TABLE_SIZE;i++){
-    theora_read(opb,bits,&value);
-    ci->LoopFilterLimitValues[i]=value;
-  }
-  if(bits<0)return OC_BADHEADER;
-
-  return 0;
-}
-
-void SetupLoopFilter(PB_INSTANCE *pbi){
-  ogg_int32_t FLimit;
-
-  FLimit = pbi->quant_info.loop_filter_limits[pbi->FrameQIndex];
-  SetupBoundingValueArray_Generic(pbi, FLimit);
-}
-
 static void ExpandKFBlock ( PB_INSTANCE *pbi, ogg_int32_t FragmentNumber ){
   ogg_uint32_t ReconPixelsPerLine;
   ogg_int32_t     ReconPixelIndex;
@@ -669,11 +645,12 @@
   }
 }
 
-static void FilterHoriz__c(unsigned char * PixelPtr,
-                        ogg_int32_t LineLength,
-                        ogg_int16_t *BoundingValuePtr){
+static void loop_filter_h(unsigned char * PixelPtr,
+			  ogg_int32_t LineLength,
+			  ogg_int16_t *BoundingValuePtr){
   ogg_int32_t j;
   ogg_int32_t FiltVal;
+  PixelPtr-=2;
 
   for ( j = 0; j < 8; j++ ){
     FiltVal =
@@ -691,15 +668,12 @@
   }
 }
 
-static void FilterVert__c(unsigned char * PixelPtr,
-                ogg_int32_t LineLength,
-                ogg_int16_t *BoundingValuePtr){
+static void loop_filter_v(unsigned char * PixelPtr,
+			  ogg_int32_t LineLength,
+			  ogg_int16_t *BoundingValuePtr){
   ogg_int32_t j;
   ogg_int32_t FiltVal;
   PixelPtr -= 2*LineLength;
-  /* the math was correct, but negative array indicies are forbidden
-     by ANSI/C99 and will break optimization on several modern
-     compilers */
 
   for ( j = 0; j < 8; j++ ) {
     FiltVal = ( (ogg_int32_t)PixelPtr[0] ) -
@@ -716,281 +690,54 @@
   }
 }
 
-void LoopFilter(PB_INSTANCE *pbi){
-  ogg_int32_t i;
+static void LoopFilter__c(PB_INSTANCE *pbi, int FLimit){
 
-  ogg_int16_t * BoundingValuePtr=pbi->FiltBoundingValue+127;
-  int FragsAcross=pbi->HFragments;
-  int FromFragment,ToFragment;
-  int FragsDown = pbi->VFragments;
-  ogg_int32_t LineFragments;
-  ogg_int32_t LineLength;
-  ogg_int32_t FLimit;
-  int QIndex;
-  int j,m,n;
+  int j;
+  ogg_int16_t BoundingValues[256];
+  ogg_int16_t *bvp = BoundingValues+127;
+  unsigned char *cp = pbi->display_fragments;
+  ogg_uint32_t *bp = pbi->recon_pixel_index_table;
 
-  /* Set the limit value for the loop filter based upon the current
-     quantizer. */
-  QIndex = Q_TABLE_SIZE - 1;
-  while ( QIndex >= 0 ) {
-    if ( (QIndex == 0) ||
-         ( pbi->quant_info.ac_scale[QIndex] >= pbi->ThisFrameQualityValue) )
-      break;
-    QIndex --;
-  }
-
-  FLimit = pbi->quant_info.loop_filter_limits[QIndex];
   if ( FLimit == 0 ) return;
-  SetupBoundingValueArray_Generic(pbi, FLimit);
+  SetupBoundingValueArray_Generic(BoundingValues, FLimit);
 
   for ( j = 0; j < 3 ; j++){
+    ogg_uint32_t *bp_begin = bp; 
+    ogg_uint32_t *bp_end;
+    int stride;
+    int h;
+
     switch(j) {
     case 0: /* y */
-      FromFragment = 0;
-      ToFragment = pbi->YPlaneFragments;
-      FragsAcross = pbi->HFragments;
-      FragsDown = pbi->VFragments;
-      LineLength = pbi->YStride;
-      LineFragments = pbi->HFragments;
+      bp_end = bp + pbi->YPlaneFragments;
+      h = pbi->HFragments;
+      stride = pbi->YStride;
       break;
-    case 1: /* u */
-      FromFragment = pbi->YPlaneFragments;
-      ToFragment = pbi->YPlaneFragments + pbi->UVPlaneFragments ;
-      FragsAcross = pbi->HFragments >> 1;
-      FragsDown = pbi->VFragments >> 1;
-      LineLength = pbi->UVStride;
-      LineFragments = pbi->HFragments / 2;
+    default: /* u,v, 4:2:0 specific */
+      bp_end = bp + pbi->UVPlaneFragments;
+      h = pbi->HFragments >> 1;
+      stride = pbi->UVStride;
       break;
-    /*case 2:  v */
-    default:
-      FromFragment = pbi->YPlaneFragments + pbi->UVPlaneFragments;
-      ToFragment = pbi->YPlaneFragments + (2 * pbi->UVPlaneFragments) ;
-      FragsAcross = pbi->HFragments >> 1;
-      FragsDown = pbi->VFragments >> 1;
-      LineLength = pbi->UVStride;
-      LineFragments = pbi->HFragments / 2;
-      break;
     }
-
-    i=FromFragment;
-
-    /**************************************************************
-     First Row
-    **************************************************************/
-    /* first column conditions */
-    /* only do 2 prediction if fragment coded and on non intra or if
-       all fragments are intra */
-    if( pbi->display_fragments[i]){
-      /* Filter right hand border only if the block to the right is
-         not coded */
-      if ( !pbi->display_fragments[ i + 1 ] ){
-        dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
-                    pbi->recon_pixel_index_table[i]+6,
-                    LineLength,BoundingValuePtr);
+    
+    while(bp<bp_end){
+      ogg_uint32_t *bp_left = bp;
+      ogg_uint32_t *bp_right = bp + h;
+      while(bp<bp_right){
+	if(cp[0]){
+	  if(bp>bp_left)
+	    loop_filter_h(&pbi->LastFrameRecon[bp[0]],stride,bvp);
+	  if(bp_left>bp_begin)
+	    loop_filter_v(&pbi->LastFrameRecon[bp[0]],stride,bvp);
+	  if(bp+1<bp_right && !cp[1])
+	    loop_filter_h(&pbi->LastFrameRecon[bp[0]]+8,stride,bvp);
+	  if(bp+stride<bp_end && !cp[stride])
+	    loop_filter_v(&pbi->LastFrameRecon[bp[h]]+8,stride,bvp);
+	}
+	bp++;
+	cp++;
       }
-
-      /* Bottom done if next row set */
-      if( !pbi->display_fragments[ i + LineFragments] ){
-        dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
-                   pbi->recon_pixel_index_table[i+LineFragments],
-                   LineLength, BoundingValuePtr);
-      }
     }
-    i++;
-
-    /***************************************************************/
-    /* middle columns  */
-    for ( n = 1 ; n < FragsAcross - 1 ; n++, i++) {
-      if( pbi->display_fragments[i]){
-        /* Filter Left edge always */
-        dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
-                    pbi->recon_pixel_index_table[i]-2,
-                    LineLength, BoundingValuePtr);
-
-        /* Filter right hand border only if the block to the right is
-           not coded */
-        if ( !pbi->display_fragments[ i + 1 ] ){
-          dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
-                      pbi->recon_pixel_index_table[i]+6,
-                      LineLength, BoundingValuePtr);
-        }
-
-        /* Bottom done if next row set */
-        if( !pbi->display_fragments[ i + LineFragments] ){
-          dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
-                     pbi->recon_pixel_index_table[i + LineFragments],
-                     LineLength, BoundingValuePtr);
-        }
-
-      }
-    }
-
-    /***************************************************************/
-    /* Last Column */
-    if( pbi->display_fragments[i]){
-      /* Filter Left edge always */
-      dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
-                  pbi->recon_pixel_index_table[i] - 2 ,
-                  LineLength, BoundingValuePtr);
-
-      /* Bottom done if next row set */
-      if( !pbi->display_fragments[ i + LineFragments] ){
-        dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
-                   pbi->recon_pixel_index_table[i + LineFragments],
-                   LineLength, BoundingValuePtr);
-      }
-    }
-    i++;
-
-    /***************************************************************/
-    /* Middle Rows */
-    /***************************************************************/
-    for ( m = 1 ; m < FragsDown-1 ; m++) {
-
-      /*****************************************************************/
-      /* first column conditions */
-      /* only do 2 prediction if fragment coded and on non intra or if
-         all fragments are intra */
-      if( pbi->display_fragments[i]){
-        /* TopRow is always done */
-        dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
-                   pbi->recon_pixel_index_table[i],
-                   LineLength, BoundingValuePtr);
-
-        /* Filter right hand border only if the block to the right is
-           not coded */
-        if ( !pbi->display_fragments[ i + 1 ] ){
-          dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
-                      pbi->recon_pixel_index_table[i] + 6,
-                      LineLength, BoundingValuePtr);
-        }
-
-        /* Bottom done if next row set */
-        if( !pbi->display_fragments[ i + LineFragments] ){
-          dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
-                     pbi->recon_pixel_index_table[i + LineFragments],
-                     LineLength, BoundingValuePtr);
-        }
-      }
-      i++;
-
-      /*****************************************************************/
-      /* middle columns  */
-      for ( n = 1 ; n < FragsAcross - 1 ; n++, i++){
-        if( pbi->display_fragments[i]){
-          /* Filter Left edge always */
-          dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
-                      pbi->recon_pixel_index_table[i] - 2,
-                      LineLength, BoundingValuePtr);
-
-          /* TopRow is always done */
-          dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
-                     pbi->recon_pixel_index_table[i],
-                     LineLength, BoundingValuePtr);
-
-          /* Filter right hand border only if the block to the right
-             is not coded */
-          if ( !pbi->display_fragments[ i + 1 ] ){
-            dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
-                        pbi->recon_pixel_index_table[i] + 6,
-                        LineLength, BoundingValuePtr);
-          }
-
-          /* Bottom done if next row set */
-          if( !pbi->display_fragments[ i + LineFragments] ){
-            dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
-                       pbi->recon_pixel_index_table[i + LineFragments],
-                       LineLength, BoundingValuePtr);
-          }
-        }
-      }
-
-      /******************************************************************/
-      /* Last Column */
-      if( pbi->display_fragments[i]){
-        /* Filter Left edge always*/
-        dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
-                    pbi->recon_pixel_index_table[i] - 2,
-                    LineLength, BoundingValuePtr);
-
-        /* TopRow is always done */
-        dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
-                   pbi->recon_pixel_index_table[i],
-                   LineLength, BoundingValuePtr);
-
-        /* Bottom done if next row set */
-        if( !pbi->display_fragments[ i + LineFragments] ){
-          dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
-                     pbi->recon_pixel_index_table[i + LineFragments],
-                     LineLength, BoundingValuePtr);
-        }
-      }
-      i++;
-
-    }
-
-    /*******************************************************************/
-    /* Last Row  */
-
-    /* first column conditions */
-    /* only do 2 prediction if fragment coded and on non intra or if
-       all fragments are intra */
-    if( pbi->display_fragments[i]){
-
-      /* TopRow is always done */
-      dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
-                 pbi->recon_pixel_index_table[i],
-                 LineLength, BoundingValuePtr);
-
-      /* Filter right hand border only if the block to the right is
-         not coded */
-      if ( !pbi->display_fragments[ i + 1 ] ){
-        dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
-                    pbi->recon_pixel_index_table[i] + 6,
-                    LineLength, BoundingValuePtr);
-      }
-    }
-    i++;
-
-    /******************************************************************/
-    /* middle columns  */
-    for ( n = 1 ; n < FragsAcross - 1 ; n++, i++){
-      if( pbi->display_fragments[i]){
-        /* Filter Left edge always */
-        dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
-                    pbi->recon_pixel_index_table[i] - 2,
-                    LineLength, BoundingValuePtr);
-
-        /* TopRow is always done */
-        dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
-                   pbi->recon_pixel_index_table[i],
-                   LineLength, BoundingValuePtr);
-
-        /* Filter right hand border only if the block to the right is
-           not coded */
-        if ( !pbi->display_fragments[ i + 1 ] ){
-          dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
-                      pbi->recon_pixel_index_table[i] + 6,
-                      LineLength, BoundingValuePtr);
-        }
-      }
-    }
-
-    /******************************************************************/
-    /* Last Column */
-    if( pbi->display_fragments[i]){
-      /* Filter Left edge always */
-      dsp_FilterHoriz(pbi->dsp,pbi->LastFrameRecon+
-                  pbi->recon_pixel_index_table[i] - 2,
-                  LineLength, BoundingValuePtr);
-
-      /* TopRow is always done */
-      dsp_FilterVert(pbi->dsp,pbi->LastFrameRecon+
-                 pbi->recon_pixel_index_table[i],
-                 LineLength, BoundingValuePtr);
-
-    }
-    i++;
   }
 }
 
@@ -1078,8 +825,6 @@
   else
     ExpandBlockA=ExpandBlock;
 
-  SetupLoopFilter(pbi);
-
   /* for y,u,v */
   for ( j = 0; j < 3 ; j++) {
     /* pick which fragments based on Y, U, V */
@@ -1202,7 +947,7 @@
   }
 
   /* Apply a loop filter to edge pixels of updated blocks */
-  LoopFilter(pbi);
+  dsp_LoopFilter(pbi->dsp, pbi, pbi->quant_info.loop_filter_limits[pbi->FrameQIndex]);
 
 #ifdef _TH_DEBUG_
     {
@@ -1306,8 +1051,7 @@
 
 void dsp_dct_decode_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
 {
-  funcs->FilterVert = FilterVert__c;
-  funcs->FilterHoriz = FilterHoriz__c;
+  funcs->LoopFilter = LoopFilter__c;
 #if defined(USE_ASM)
   // Todo: Port the dct for MSC one day.
 #if !defined (_MSC_VER)  

Modified: trunk/theora/lib/enc/dsp.h
===================================================================
--- trunk/theora/lib/enc/dsp.h	2008-06-22 19:50:32 UTC (rev 15056)
+++ trunk/theora/lib/enc/dsp.h	2008-06-22 21:07:32 UTC (rev 15057)
@@ -79,8 +79,7 @@
                      unsigned char *RefDataPtr1,
                unsigned char *RefDataPtr2, ogg_uint32_t RefStride);
                
-  void (*FilterHoriz) (unsigned char * PixelPtr,
-                ogg_int32_t LineLength, ogg_int16_t *BoundingValuePtr);
+  void (*LoopFilter) (PB_INSTANCE *pbi, int FLimit);
 
   void (*FilterVert) (unsigned char * PixelPtr,
                  ogg_int32_t LineLength, ogg_int16_t *BoundingValuePtr);
@@ -152,12 +151,9 @@
 #define dsp_inter8x8_err_xy2(funcs,ptr1,str1,ptr2,ptr3,str2) \
   (funcs.inter8x8_err_xy2 (ptr1,str1,ptr2,ptr3,str2))
 
-#define dsp_FilterHoriz(funcs, ptr1, ptr2, ptr3) \
-  (funcs.FilterHoriz(ptr1, ptr2, ptr3))
+#define dsp_LoopFilter(funcs, ptr1, i) \
+  (funcs.LoopFilter(ptr1, i))
 
-#define dsp_FilterVert(funcs, ptr1, ptr2, ptr3) \
-  (funcs.FilterVert(ptr1, ptr2, ptr3))
-
 #define dsp_IDctSlow(funcs, ptr1, ptr2, ptr3) \
     (funcs.IDctSlow(ptr1, ptr2, ptr3))
 

Modified: trunk/theora/lib/enc/pp.c
===================================================================
--- trunk/theora/lib/enc/pp.c	2008-06-22 19:50:32 UTC (rev 15056)
+++ trunk/theora/lib/enc/pp.c	2008-06-22 21:07:32 UTC (rev 15057)
@@ -901,9 +901,6 @@
 
   UpdateFragQIndex(pbi);
 
-
-  SetupLoopFilter(pbi);
-
   /* Y */
   DeblockPlane( pbi, SourceBuffer, DestinationBuffer, 0);
 

Modified: trunk/theora/lib/enc/x86_32/dct_decode_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32/dct_decode_mmx.c	2008-06-22 19:50:32 UTC (rev 15056)
+++ trunk/theora/lib/enc/x86_32/dct_decode_mmx.c	2008-06-22 21:07:32 UTC (rev 15057)
@@ -21,165 +21,389 @@
 
 #if defined(USE_ASM)
 
-static const __attribute__((aligned(8),used)) ogg_int64_t V3= 0x0003000300030003LL;
-static const __attribute__((aligned(8),used)) ogg_int64_t V804= 0x0804080408040804LL;
+static const __attribute__((aligned(8),used)) ogg_int64_t OC_V3=
+ 0x0003000300030003LL;
+static const __attribute__((aligned(8),used)) ogg_int64_t OC_V4=
+ 0x0004000400040004LL;
 
-#define OC_LOOP_H_4x4                                                   \
-    "lea (%[ll],%[ll],2),%[s]\n"     /* esi = ystride*3 */                   \
-    "movd (%[pp]), %%mm0\n"        /* 0 0 0 0 3 2 1 0 */                   \
-    "movd (%[pp],%[ll]),%%mm1\n"      /* 0 0 0 0 7 6 5 4 */                   \
-    "movd (%[pp],%[ll],2),%%mm2\n"    /* 0 0 0 0 b a 9 8 */                   \
-    "movd (%[pp],%[s]),%%mm3\n"   /* 0 0 0 0 f e d c */                   \
-    "punpcklbw %%mm1,%%mm0\n"   /* mm0 = 7 3 6 2 5 1 4 0 */             \
-    "punpcklbw %%mm3,%%mm2\n"   /* mm2 = f b e a d 9 c 8 */             \
-    "movq %%mm0,%%mm1\n"        /* mm1 = 7 3 6 2 5 1 4 0 */             \
-    "punpcklwd %%mm2,%%mm1\n"   /* mm1 = d 9 5 1 c 8 4 0 */             \
-    "punpckhwd %%mm2,%%mm0\n"   /* mm0 = f b 7 3 e a 6 2 */             \
-    "pxor %%mm7,%%mm7\n"                                                \
-    "movq %%mm1,%%mm5\n"        /* mm5 = d 9 5 1 c 8 4 0 */             \
-    "punpckhbw %%mm7,%%mm5\n"   /* mm5 = 0 d 0 9 0 5 0 1 = pix[1]*/     \
-    "punpcklbw %%mm7,%%mm1\n"   /* mm1 = 0 c 0 8 0 4 0 0 = pix[0]*/     \
-    "movq %%mm0,%%mm3\n"        /* mm3 = f b 7 3 e a 6 2 */             \
-    "punpckhbw %%mm7,%%mm3\n"   /* mm3 = 0 f 0 b 0 7 0 3 = pix[3]*/     \
-    "punpcklbw %%mm7,%%mm0\n"       /* mm0 = 0 e 0 a 0 6 0 2 = pix[2]*/ \
-                                                                        \
-    "psubw %%mm3,%%mm1\n"       /* mm1 = pix[0]-pix[3] mm1 - mm3 */     \
-    "movq %%mm0,%%mm7\n"        /* mm7 = pix[2]*/                       \
-    "psubw %%mm5,%%mm0\n"       /* mm0 = pix[2]-pix[1] mm0 - mm5*/      \
-    "pmullw %[V3],%%mm0\n"      /* *3 */                              \
-    "paddw %%mm0,%%mm1\n"         /* mm1 has f[0] ... f[4]*/            \
-    "paddw %[V804],%%mm1\n"     /* add 4 */ /* add 256 after shift */ \
-    "psraw $3,%%mm1\n"          /* >>3 */                               \
-    " pextrw $0,%%mm1,%[s]\n"  /* In MM1 we have 4 f coefs (16bits) */ \
-    " pextrw $1,%%mm1,%[d]\n"  /* now perform MM4 = *(_bv+ f) */       \
-    " pinsrw $0,(%[bound],%[s],2),%%mm4\n"                                   \
-    " pextrw $2,%%mm1,%[s]\n"                                          \
-    " pinsrw $1,(%[bound],%[d],2),%%mm4\n"                                   \
-    " pextrw $3,%%mm1,%[d]\n"                                          \
-    " pinsrw $2,(%[bound],%[s],2),%%mm4\n"                                   \
-    " pinsrw $3,(%[bound],%[d],2),%%mm4\n" /* new f vals loaded */           \
-    "pxor %%mm0,%%mm0\n"                                                \
-    " paddw %%mm4,%%mm5\n"      /*(pix[1]+f);*/                         \
-    " psubw %%mm4,%%mm7\n"      /* (pix[2]-f); */                       \
-    " packuswb %%mm0,%%mm5\n"   /* mm5 = x x x x newpix1 */             \
-    " packuswb %%mm0,%%mm7\n"   /* mm7 = x x x x newpix2 */             \
-    " punpcklbw %%mm7,%%mm5\n"  /* 2 1 2 1 2 1 2 1 */                   \
-    " movd %%mm5,%[d]\n"       /* edi = newpix21 */                    \
-    " movw %[d],1(%[pp])\n"                                                \
-    " psrlq $32,%%mm5\n"        /* why is so big stall here ? */        \
-    " shr  $16,%[d]\n"                                                 \
-    " movw %[d],1(%[pp],%[ll],1)\n"                                           \
-    " movd %%mm5,%[d]\n"       /* eax = newpix21 high part */          \
-    " lea (%[ll],%[ll],2),%[s]\n"                                            \
-    " movw %[d],1(%[pp],%[ll],2)\n"                                           \
-    " shr $16,%[d]\n"                                                 \
-    " movw %[d],1(%[pp],%[s])\n"                                          
+static void loop_filter_v(unsigned char *_pix,int _ystride,
+			  const ogg_int16_t *_ll){
+  long esi;
+  _pix-=_ystride*2;
+  __asm__ __volatile__(
+    /*mm0=0*/
+    "pxor %%mm0,%%mm0\n\t"
+    /*esi=_ystride*3*/
+    "lea (%[ystride],%[ystride],2),%[s]\n\t"
+    /*mm7=_pix[0...8]*/
+    "movq (%[pix]),%%mm7\n\t"
+    /*mm4=_pix[0...8+_ystride*3]*/
+    "movq (%[pix],%[s]),%%mm4\n\t"
+    /*mm6=_pix[0...8]*/
+    "movq %%mm7,%%mm6\n\t"
+    /*Expand unsigned _pix[0...3] to 16 bits.*/
+    "punpcklbw %%mm0,%%mm6\n\t"
+    "movq %%mm4,%%mm5\n\t"
+    /*Expand unsigned _pix[4...8] to 16 bits.*/
+    "punpckhbw %%mm0,%%mm7\n\t"
+    /*Expand other arrays too.*/
+    "punpcklbw %%mm0,%%mm4\n\t"
+    "punpckhbw %%mm0,%%mm5\n\t"
+    /*mm7:mm6=_p[0...8]-_p[0...8+_ystride*3]:*/
+    "psubw %%mm4,%%mm6\n\t"
+    "psubw %%mm5,%%mm7\n\t"
+    /*mm5=mm4=_pix[0...8+_ystride]*/
+    "movq (%[pix],%[ystride]),%%mm4\n\t"
+    /*mm1=mm3=mm2=_pix[0...8+_ystride*2]*/
+    "movq (%[pix],%[ystride],2),%%mm2\n\t"
+    "movq %%mm4,%%mm5\n\t"
+    "movq %%mm2,%%mm3\n\t"
+    "movq %%mm2,%%mm1\n\t"
+    /*Expand these arrays.*/
+    "punpckhbw %%mm0,%%mm5\n\t"
+    "punpcklbw %%mm0,%%mm4\n\t"
+    "punpckhbw %%mm0,%%mm3\n\t"
+    "punpcklbw %%mm0,%%mm2\n\t"
+    /*Preload...*/
+    "movq %[OC_V3],%%mm0\n\t"
+    /*mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
+    "psubw %%mm5,%%mm3\n\t"
+    "psubw %%mm4,%%mm2\n\t"
+    /*Scale by 3.*/
+    "pmullw %%mm0,%%mm3\n\t"
+    "pmullw %%mm0,%%mm2\n\t"
+    /*Preload...*/
+    "movq %[OC_V4],%%mm0\n\t"
+    /*f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
+       3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
+    "paddw %%mm7,%%mm3\n\t"
+    "paddw %%mm6,%%mm2\n\t"
+    /*Add 4.*/
+    "paddw %%mm0,%%mm3\n\t"
+    "paddw %%mm0,%%mm2\n\t"
+    /*"Divide" by 8.*/
+    "psraw $3,%%mm3\n\t"
+    "psraw $3,%%mm2\n\t"
+    /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the spec.*/
+    /*Free up mm5.*/
+    "packuswb %%mm5,%%mm4\n\t"
+    /*mm0=L L L L*/
+    "movq (%[ll]),%%mm0\n\t"
+    /*if(R_i<-2L||R_i>2L)R_i=0:*/
+    "movq %%mm2,%%mm5\n\t"
+    "pxor %%mm6,%%mm6\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    "psubw %%mm0,%%mm6\n\t"
+    "psllw $1,%%mm7\n\t"
+    "psllw $1,%%mm6\n\t"
+    /*mm2==R_3 R_2 R_1 R_0*/
+    /*mm5==R_3 R_2 R_1 R_0*/
+    /*mm6==-2L -2L -2L -2L*/
+    /*mm7==2L 2L 2L 2L*/
+    "pcmpgtw %%mm2,%%mm7\n\t"
+    "pcmpgtw %%mm6,%%mm5\n\t"
+    "pand %%mm7,%%mm2\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    "pand %%mm5,%%mm2\n\t"
+    "psllw $1,%%mm7\n\t"
+    "movq %%mm3,%%mm5\n\t"
+    /*mm3==R_7 R_6 R_5 R_4*/
+    /*mm5==R_7 R_6 R_5 R_4*/
+    /*mm6==-2L -2L -2L -2L*/
+    /*mm7==2L 2L 2L 2L*/
+    "pcmpgtw %%mm3,%%mm7\n\t"
+    "pcmpgtw %%mm6,%%mm5\n\t"
+    "pand %%mm7,%%mm3\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    "pand %%mm5,%%mm3\n\t"
+    /*if(R_i<-L)R_i'=R_i+2L;
+      if(R_i>L)R_i'=R_i-2L;
+      if(R_i<-L||R_i>L)R_i=-R_i':*/
+    "psraw $1,%%mm6\n\t"
+    "movq %%mm2,%%mm5\n\t"
+    "psllw $1,%%mm7\n\t"
+    /*mm2==R_3 R_2 R_1 R_0*/
+    /*mm5==R_3 R_2 R_1 R_0*/
+    /*mm6==-L -L -L -L*/
+    /*mm0==L L L L*/
+    /*mm5=R_i>L?FF:00*/
+    "pcmpgtw %%mm0,%%mm5\n\t"
+    /*mm6=-L>R_i?FF:00*/
+    "pcmpgtw %%mm2,%%mm6\n\t"
+    /*mm7=R_i>L?2L:0*/
+    "pand %%mm5,%%mm7\n\t"
+    /*mm2=R_i>L?R_i-2L:R_i*/
+    "psubw %%mm7,%%mm2\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    /*mm5=-L>R_i||R_i>L*/
+    "por %%mm6,%%mm5\n\t"
+    "psllw $1,%%mm7\n\t"
+    /*mm7=-L>R_i?2L:0*/
+    "pand %%mm6,%%mm7\n\t"
+    "pxor %%mm6,%%mm6\n\t"
+    /*mm2=-L>R_i?R_i+2L:R_i*/
+    "paddw %%mm7,%%mm2\n\t"
+    "psubw %%mm0,%%mm6\n\t"
+    /*mm5=-L>R_i||R_i>L?R_i':0*/
+    "pand %%mm2,%%mm5\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    /*mm2=-L>R_i||R_i>L?0:R_i*/
+    "psubw %%mm5,%%mm2\n\t"
+    "psllw $1,%%mm7\n\t"
+    /*mm2=-L>R_i||R_i>L?-R_i':R_i*/
+    "psubw %%mm5,%%mm2\n\t"
+    "movq %%mm3,%%mm5\n\t"
+    /*mm3==R_7 R_6 R_5 R_4*/
+    /*mm5==R_7 R_6 R_5 R_4*/
+    /*mm6==-L -L -L -L*/
+    /*mm0==L L L L*/
+    /*mm6=-L>R_i?FF:00*/
+    "pcmpgtw %%mm3,%%mm6\n\t"
+    /*mm5=R_i>L?FF:00*/
+    "pcmpgtw %%mm0,%%mm5\n\t"
+    /*mm7=R_i>L?2L:0*/
+    "pand %%mm5,%%mm7\n\t"
+    /*mm3=R_i>L?R_i-2L:R_i*/
+    "psubw %%mm7,%%mm3\n\t"
+    "psllw $1,%%mm0\n\t"
+    /*mm5=-L>R_i||R_i>L*/
+    "por %%mm6,%%mm5\n\t"
+    /*mm0=-L>R_i?2L:0*/
+    "pand %%mm6,%%mm0\n\t"
+    /*mm3=-L>R_i?R_i+2L:R_i*/
+    "paddw %%mm0,%%mm3\n\t"
+    /*mm5=-L>R_i||R_i>L?R_i':0*/
+    "pand %%mm3,%%mm5\n\t"
+    /*mm3=-L>R_i||R_i>L?0:R_i*/
+    "psubw %%mm5,%%mm3\n\t"
+    /*mm3=-L>R_i||R_i>L?-R_i':R_i*/
+    "psubw %%mm5,%%mm3\n\t"
+    /*Unfortunately, there's no unsigned byte+signed byte with unsigned
+       saturation op code, so we have to promote things back 16 bits.*/
+    "pxor %%mm0,%%mm0\n\t"
+    "movq %%mm4,%%mm5\n\t"
+    "punpcklbw %%mm0,%%mm4\n\t"
+    "punpckhbw %%mm0,%%mm5\n\t"
+    "movq %%mm1,%%mm6\n\t"
+    "punpcklbw %%mm0,%%mm1\n\t"
+    "punpckhbw %%mm0,%%mm6\n\t"
+    /*_pix[0...8+_ystride]+=R_i*/
+    "paddw %%mm2,%%mm4\n\t"
+    "paddw %%mm3,%%mm5\n\t"
+    /*_pix[0...8+_ystride*2]-=R_i*/
+    "psubw %%mm2,%%mm1\n\t"
+    "psubw %%mm3,%%mm6\n\t"
+    "packuswb %%mm5,%%mm4\n\t"
+    "packuswb %%mm6,%%mm1\n\t"
+    /*Write it back out.*/
+    "movq %%mm4,(%[pix],%[ystride])\n\t"
+    "movq %%mm1,(%[pix],%[ystride],2)\n\t"
+    :[s]"=&S"(esi)
+    :[pix]"r"(_pix),[ystride]"r"((long)_ystride),[ll]"r"(_ll),
+     [OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
+    :"memory"
+  );
+}
 
-static void FilterHoriz__mmx(unsigned char * PixelPtr,
-			     ogg_int32_t LineLength,
-			     ogg_int16_t *BoundingValuePtr){
+/*This code implements the bulk of loop_filter_h().
+  Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
+   four p0's to one register we must transpose the values in four mmx regs.
+  When half is done we repeat this for the rest.*/
+static void loop_filter_h4(unsigned char *_pix,long _ystride,
+			   const ogg_int16_t *_ll){
   long esi;
   long edi;
   __asm__ __volatile__(
-    OC_LOOP_H_4x4
-    : [s]"=&r"(esi),[d]"=&r"(edi)                                       
-    : [pp]"r"(PixelPtr), [ll]"r"((long)LineLength), [bound]"r"(BoundingValuePtr-256), [V3]"m"(V3), [V804]"m"(V804) 
-    : "memory"                                           
+    /*x x x x 3 2 1 0*/
+    "movd (%[pix]),%%mm0\n\t"
+    /*esi=_ystride*3*/
+    "lea (%[ystride],%[ystride],2),%[s]\n\t"
+    /*x x x x 7 6 5 4*/
+    "movd (%[pix],%[ystride]),%%mm1\n\t"
+    /*x x x x B A 9 8*/
+    "movd (%[pix],%[ystride],2),%%mm2\n\t"
+    /*x x x x F E D C*/
+    "movd (%[pix],%[s]),%%mm3\n\t"
+    /*mm0=7 3 6 2 5 1 4 0*/
+    "punpcklbw %%mm1,%%mm0\n\t"
+    /*mm2=F B E A D 9 C 8*/
+    "punpcklbw %%mm3,%%mm2\n\t"
+    /*mm1=7 3 6 2 5 1 4 0*/
+    "movq %%mm0,%%mm1\n\t"
+    /*mm0=F B 7 3 E A 6 2*/
+    "punpckhwd %%mm2,%%mm0\n\t"
+    /*mm1=D 9 5 1 C 8 4 0*/
+    "punpcklwd %%mm2,%%mm1\n\t"
+    "pxor %%mm7,%%mm7\n\t"
+    /*mm5=D 9 5 1 C 8 4 0*/
+    "movq %%mm1,%%mm5\n\t"
+    /*mm1=x C x 8 x 4 x 0==pix[0]*/
+    "punpcklbw %%mm7,%%mm1\n\t"
+    /*mm5=x D x 9 x 5 x 1==pix[1]*/
+    "punpckhbw %%mm7,%%mm5\n\t"
+    /*mm3=F B 7 3 E A 6 2*/
+    "movq %%mm0,%%mm3\n\t"
+    /*mm0=x E x A x 6 x 2==pix[2]*/
+    "punpcklbw %%mm7,%%mm0\n\t"
+    /*mm3=x F x B x 7 x 3==pix[3]*/
+    "punpckhbw %%mm7,%%mm3\n\t"
+    /*mm1=mm1-mm3==pix[0]-pix[3]*/
+    "psubw %%mm3,%%mm1\n\t"
+    /*Save a copy of pix[2] for later.*/
+    "movq %%mm0,%%mm4\n\t"
+    /*mm0=mm0-mm5==pix[2]-pix[1]*/
+    "psubw %%mm5,%%mm0\n\t"
+    /*Scale by 3.*/
+    "pmullw %[OC_V3],%%mm0\n\t"
+    /*f=mm0==_pix[0]-_pix[3]+ 3*(_pix[2]-_pix[1])*/
+    "paddw %%mm1,%%mm0\n\t"
+    /*Add 4.*/
+    "paddw %[OC_V4],%%mm0\n\t"
+    /*"Divide" by 8, producing the residuals R_i.*/
+    "psraw $3,%%mm0\n\t"
+    /*Now compute lflim of mm0 cf. Section 7.10 of the spec.*/
+    /*mm6=L L L L*/
+    "movq (%[ll]),%%mm6\n\t"
+    /*if(R_i<-2L||R_i>2L)R_i=0:*/
+    "movq %%mm0,%%mm1\n\t"
+    "pxor %%mm2,%%mm2\n\t"
+    "movq %%mm6,%%mm3\n\t"
+    "psubw %%mm6,%%mm2\n\t"
+    "psllw $1,%%mm3\n\t"
+    "psllw $1,%%mm2\n\t"
+    /*mm0==R_3 R_2 R_1 R_0*/
+    /*mm1==R_3 R_2 R_1 R_0*/
+    /*mm2==-2L -2L -2L -2L*/
+    /*mm3==2L 2L 2L 2L*/
+    "pcmpgtw %%mm0,%%mm3\n\t"
+    "pcmpgtw %%mm2,%%mm1\n\t"
+    "pand %%mm3,%%mm0\n\t"
+    "pand %%mm1,%%mm0\n\t"
+    /*if(R_i<-L)R_i'=R_i+2L;
+      if(R_i>L)R_i'=R_i-2L;
+      if(R_i<-L||R_i>L)R_i=-R_i':*/
+    "psraw $1,%%mm2\n\t"
+    "movq %%mm0,%%mm1\n\t"
+    "movq %%mm6,%%mm3\n\t"
+    /*mm0==R_3 R_2 R_1 R_0*/
+    /*mm1==R_3 R_2 R_1 R_0*/
+    /*mm2==-L -L -L -L*/
+    /*mm6==L L L L*/
+    /*mm2=-L>R_i?FF:00*/
+    "pcmpgtw %%mm0,%%mm2\n\t"
+    /*mm1=R_i>L?FF:00*/
+    "pcmpgtw %%mm6,%%mm1\n\t"
+    /*mm3=2L 2L 2L 2L*/
+    "psllw $1,%%mm3\n\t"
+    /*mm6=2L 2L 2L 2L*/
+    "psllw $1,%%mm6\n\t"
+    /*mm3=R_i>L?2L:0*/
+    "pand %%mm1,%%mm3\n\t"
+    /*mm6=-L>R_i?2L:0*/
+    "pand %%mm2,%%mm6\n\t"
+    /*mm0=R_i>L?R_i-2L:R_i*/
+    "psubw %%mm3,%%mm0\n\t"
+    /*mm1=-L>R_i||R_i>L*/
+    "por %%mm2,%%mm1\n\t"
+    /*mm0=-L>R_i?R_i+2L:R_i*/
+    "paddw %%mm6,%%mm0\n\t"
+    /*mm1=-L>R_i||R_i>L?R_i':0*/
+    "pand %%mm0,%%mm1\n\t"
+    /*mm0=-L>R_i||R_i>L?0:R_i*/
+    "psubw %%mm1,%%mm0\n\t"
+    /*mm0=-L>R_i||R_i>L?-R_i':R_i*/
+    "psubw %%mm1,%%mm0\n\t"
+    /*_pix[1]+=R_i;*/
+    "paddw %%mm0,%%mm5\n\t"
+    /*_pix[2]-=R_i;*/
+    "psubw %%mm0,%%mm4\n\t"
+    /*mm5=x x x x D 9 5 1*/
+    "packuswb %%mm7,%%mm5\n\t"
+    /*mm4=x x x x E A 6 2*/
+    "packuswb %%mm7,%%mm4\n\t"
+    /*mm5=E D A 9 6 5 2 1*/
+    "punpcklbw %%mm4,%%mm5\n\t"
+    /*edi=6 5 2 1*/
+    "movd %%mm5,%%edi\n\t"
+    "movw %%di,1(%[pix])\n\t"
+    /*Why is there such a big stall here?*/
+    "psrlq $32,%%mm5\n\t"
+    "shrl $16,%%edi\n\t"
+    "movw %%di,1(%[pix],%[ystride])\n\t"
+    /*edi=E D A 9*/
+    "movd %%mm5,%%edi\n\t"
+    "movw %%di,1(%[pix],%[ystride],2)\n\t"
+    "shrl $16,%%edi\n\t"
+    "movw %%di,1(%[pix],%[s])\n\t"
+    :[s]"=&S"(esi),[d]"=&D"(edi),
+     [pix]"+r"(_pix),[ystride]"+r"(_ystride),[ll]"+r"(_ll)
+    :[OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
+    :"memory"
   );
+}
 
-    PixelPtr += LineLength*4;
-
-  __asm__ __volatile__(
-    OC_LOOP_H_4x4
-    "emms\n"       
-    : [s]"=&r"(esi),[d]"=&r"(edi)                                       \
-    : [pp]"r"(PixelPtr), [ll]"r"((long)LineLength), [bound]"r"(BoundingValuePtr-256), [V3]"m"(V3), [V804]"m"(V804) \
-    : "memory"                                           \
-    );
+static void loop_filter_h(unsigned char *_pix,int _ystride,
+			  const ogg_int16_t *_ll){
+  unsigned char *top=_pix-2; /*Start two pixels to the left of _pix.*/
+  loop_filter_h4(top,_ystride,_ll); /*Rows 0...3.*/
+  loop_filter_h4(top+(_ystride<<2),_ystride,_ll); /*Rows 4...7.*/
 }
+ 
+static void loop_filter_mmx(PB_INSTANCE *pbi, int FLimit){ /*Deblocks pbi->LastFrameRecon in place; does nothing when FLimit is 0.*/
+  int j; /*Plane index: 0 is luma, 1 and 2 are the two chroma planes.*/
+  ogg_int16_t __attribute__((aligned(8)))  ll[4]; /*One copy of FLimit per 16-bit lane, handed to the filters.*/
+  unsigned char *cp = pbi->display_fragments; /*Per-fragment flag; only fragments with a nonzero entry are filtered.*/
+  ogg_uint32_t *bp = pbi->recon_pixel_index_table; /*Per-fragment offset of the fragment's first pixel in LastFrameRecon.*/
 
-static void FilterVert__mmx(unsigned char * PixelPtr,
-			    ogg_int32_t LineLength,
-			    ogg_int16_t *BoundingValuePtr){
-  long esi,edi;
-  __asm__ __volatile__(
-    "pxor %%mm0,%%mm0\n"        /* mm0 = 0 */
-    "movq (%[pp]),%%mm7\n"         /* mm7 = pix[0..7] */
-    "lea (%[ll],%[ll],2),%[s]\n"     /* esi = ystride*3 */
-    "movq (%[pp],%[s]),%%mm4\n"   /* mm4 = pix[0..7+ystride*3] */
-    "movq %%mm7,%%mm6\n"        /* mm6 = pix[0..7] */
-    "punpcklbw %%mm0,%%mm6\n"   /* expand unsigned pix[0..3] to 16 bits */
-    "movq %%mm4,%%mm5\n"
-    "punpckhbw %%mm0,%%mm7\n"   /* expand unsigned pix[4..7] to 16 bits */
-    "punpcklbw %%mm0,%%mm4\n"   /* expand other arrays too */
-    "punpckhbw %%mm0,%%mm5\n"
-    "psubw %%mm4,%%mm6\n"       /* mm6 = mm6 - mm4 */
-    "psubw %%mm5,%%mm7\n"       /* mm7 = mm7 - mm5 */
-                /* mm7:mm6 = _p[0]-_p[ystride*3] */
-    "movq (%[pp],%[ll]),%%mm4\n"      /* mm4 = pix[0..7+ystride] */
-    "movq %%mm4,%%mm5\n"
-    "movq (%[pp],%[ll],2),%%mm2\n"    /* mm2 = pix[0..7+ystride*2] */
-    "movq %%mm2,%%mm3\n"
-    "movq %%mm2,%%mm1\n"        //ystride*2
-    "punpckhbw %%mm0,%%mm5\n"
-    "punpcklbw %%mm0,%%mm4\n"
-    "punpckhbw %%mm0,%%mm3\n"
-    "punpcklbw %%mm0,%%mm2\n"
-    "psubw %%mm5,%%mm3\n"
-    "psubw %%mm4,%%mm2\n"
-                /* mm3:mm2 = (pix[ystride*2]-pix[ystride]); */
-    "pmullw %[V3],%%mm3\n"           /* *3 */
-    "pmullw %[V3],%%mm2\n"           /* *3 */
-    "paddw %%mm7,%%mm3\n"            /* highpart */
-    "paddw %%mm6,%%mm2\n"            /* lowpart of pix[0]-pix[ystride*3]+3*(pix[ystride*2]-pix[ystride]);  */
-    "paddw %[V804],%%mm3\n"          /* add 4 */ /* add 256 after shift */
-    "paddw %[V804],%%mm2\n"          /* add 4 */ /* add 256 after shift */
-    "psraw $3,%%mm3\n"               /* >>3 f coefs high */
-    "psraw $3,%%mm2\n"               /* >>3 f coefs low */
+  if ( FLimit == 0 ) return;
+  ll[0]=ll[1]=ll[2]=ll[3]=FLimit;
 
-    " pextrw $0,%%mm2,%[s]\n"  /* In MM3:MM2 we have f coefs (16bits) */
-    " pextrw $1,%%mm2,%[d]\n"  /* now perform MM7:MM6 = *(_bv+ f) */
-    " pinsrw $0,(%[bound],%[s],2),%%mm6\n"
-    " pinsrw $1,(%[bound],%[d],2),%%mm6\n"
+  for ( j = 0; j < 3 ; j++){
+    ogg_uint32_t *bp_begin = bp; /*First fragment of this plane.*/
+    ogg_uint32_t *bp_end; /*One past the last fragment of this plane.*/
+    int stride; /*Row pitch of this plane in bytes.*/
+    int h; /*Fragments per fragment row in this plane.*/
 
-    " pextrw $2,%%mm2,%[s]\n"
-    " pextrw $3,%%mm2,%[d]\n"
-    " pinsrw $2,(%[bound],%[s],2),%%mm6\n"
-    " pinsrw $3,(%[bound],%[d],2),%%mm6\n"
+    switch(j) {
+    case 0: /* y */
+      bp_end = bp + pbi->YPlaneFragments;
+      h = pbi->HFragments;
+      stride = pbi->YStride;
+      break;
+    default: /* u,v, 4:2:0 specific */
+      bp_end = bp + pbi->UVPlaneFragments;
+      h = pbi->HFragments >> 1;
+      stride = pbi->UVStride;
+      break;
+    }
+    /*Walk the plane one fragment row at a time; each row holds h fragments.*/
+    while(bp<bp_end){
+      ogg_uint32_t *bp_left = bp;
+      ogg_uint32_t *bp_right = bp + h;
+      while(bp<bp_right){
+	if(cp[0]){
+	  if(bp>bp_left) /*Left edge, unless this is the first column.*/
+	    loop_filter_h(&pbi->LastFrameRecon[bp[0]],stride,ll);
+	  if(bp_left>bp_begin) /*Top edge, unless this is the first row.*/
+	    loop_filter_v(&pbi->LastFrameRecon[bp[0]],stride,ll);
+	  if(bp+1<bp_right && !cp[1]) /*Right edge, only when the right neighbour will not filter it itself.*/
+	    loop_filter_h(&pbi->LastFrameRecon[bp[0]]+8,stride,ll);
+	  if(bp+h<bp_end && !cp[h]) /*Bottom edge: the fragment below is h entries ahead (not stride).*/
+	    loop_filter_v(&pbi->LastFrameRecon[bp[h]],stride,ll);
+	}
+	bp++;
+	cp++;
+      }
+    }
+  }
 
-    " pextrw $0,%%mm3,%[s]\n"
-    " pextrw $1,%%mm3,%[d]\n"
-    " pinsrw $0,(%[bound],%[s],2),%%mm7\n"
-    " pinsrw $1,(%[bound],%[d],2),%%mm7\n"
-
-    " pextrw $2,%%mm3,%[s]\n"
-    " pextrw $3,%%mm3,%[d]\n"
-    " pinsrw $2,(%[bound],%[s],2),%%mm7\n"
-    " pinsrw $3,(%[bound],%[d],2),%%mm7\n"   //MM7 MM6   f=*(_bv+(f+4>>3));
-
-    "paddw %%mm6,%%mm4\n"       /* (pix[ystride]+f); */
-    "paddw %%mm7,%%mm5\n"       /* (pix[ystride]+f); */
-    "movq %%mm1,%%mm2\n"
-    "punpcklbw %%mm0,%%mm1\n"
-    "punpckhbw %%mm0,%%mm2\n"   //[ystride*2]
-    "psubw %%mm6,%%mm1\n"       /* (pix[ystride*2]-f); */
-    "psubw %%mm7,%%mm2\n"       /* (pix[ystride*2]-f); */
-    "packuswb %%mm2,%%mm1\n"
-    "packuswb %%mm5,%%mm4\n"
-    "movq %%mm1,(%[pp],%[ll],2)\n"    /* pix[ystride*2]= */
-    "movq %%mm4,(%[pp],%[ll])\n"      /* pix[ystride]= */
-    "emms\n"
-    : [s]"=&r"(esi),[d]"=&r"(edi)                                       
-    : [pp]"r"(PixelPtr-2*LineLength), [ll]"r"((long)LineLength), [bound]"r"(BoundingValuePtr-256), [V3]"m"(V3), [V804]"m"(V804)
-    : "memory"
-    );
+  __asm__ __volatile__("emms\n\t"); /*Leave MMX state once, after all filter calls are done.*/
 }
 
 /* install our implementation in the function table */
 void dsp_mmx_dct_decode_init(DspFunctions *funcs)
 {
-  funcs->FilterVert = FilterVert__mmx;
-  funcs->FilterHoriz = FilterHoriz__mmx;
+  funcs->LoopFilter = loop_filter_mmx; /* a single LoopFilter hook replaces the FilterVert/FilterHoriz pair */
 }
 
 #endif /* USE_ASM */

Modified: trunk/theora/lib/enc/x86_64/dct_decode_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_64/dct_decode_mmx.c	2008-06-22 19:50:32 UTC (rev 15056)
+++ trunk/theora/lib/enc/x86_64/dct_decode_mmx.c	2008-06-22 21:07:32 UTC (rev 15057)
@@ -21,165 +21,389 @@
 
 #if defined(USE_ASM)
 
-static const __attribute__((aligned(8),used)) ogg_int64_t V3= 0x0003000300030003LL;
-static const __attribute__((aligned(8),used)) ogg_int64_t V804= 0x0804080408040804LL;
+static const __attribute__((aligned(8),used)) ogg_int64_t OC_V3=
+ 0x0003000300030003LL; /*Four 16-bit lanes of 3: the pmullw factor used by the loop filters.*/
+static const __attribute__((aligned(8),used)) ogg_int64_t OC_V4=
+ 0x0004000400040004LL; /*Four 16-bit lanes of 4: the rounding term added before the >>3.*/
 
-#define OC_LOOP_H_4x4                                                   \
-    "lea (%[ll],%[ll],2),%[s]\n"     /* esi = ystride*3 */                   \
-    "movd (%[pp]), %%mm0\n"        /* 0 0 0 0 3 2 1 0 */                   \
-    "movd (%[pp],%[ll]),%%mm1\n"      /* 0 0 0 0 7 6 5 4 */                   \
-    "movd (%[pp],%[ll],2),%%mm2\n"    /* 0 0 0 0 b a 9 8 */                   \
-    "movd (%[pp],%[s]),%%mm3\n"   /* 0 0 0 0 f e d c */                   \
-    "punpcklbw %%mm1,%%mm0\n"   /* mm0 = 7 3 6 2 5 1 4 0 */             \
-    "punpcklbw %%mm3,%%mm2\n"   /* mm2 = f b e a d 9 c 8 */             \
-    "movq %%mm0,%%mm1\n"        /* mm1 = 7 3 6 2 5 1 4 0 */             \
-    "punpcklwd %%mm2,%%mm1\n"   /* mm1 = d 9 5 1 c 8 4 0 */             \
-    "punpckhwd %%mm2,%%mm0\n"   /* mm0 = f b 7 3 e a 6 2 */             \
-    "pxor %%mm7,%%mm7\n"                                                \
-    "movq %%mm1,%%mm5\n"        /* mm5 = d 9 5 1 c 8 4 0 */             \
-    "punpckhbw %%mm7,%%mm5\n"   /* mm5 = 0 d 0 9 0 5 0 1 = pix[1]*/     \
-    "punpcklbw %%mm7,%%mm1\n"   /* mm1 = 0 c 0 8 0 4 0 0 = pix[0]*/     \
-    "movq %%mm0,%%mm3\n"        /* mm3 = f b 7 3 e a 6 2 */             \
-    "punpckhbw %%mm7,%%mm3\n"   /* mm3 = 0 f 0 b 0 7 0 3 = pix[3]*/     \
-    "punpcklbw %%mm7,%%mm0\n"       /* mm0 = 0 e 0 a 0 6 0 2 = pix[2]*/ \
-                                                                        \
-    "psubw %%mm3,%%mm1\n"       /* mm1 = pix[0]-pix[3] mm1 - mm3 */     \
-    "movq %%mm0,%%mm7\n"        /* mm7 = pix[2]*/                       \
-    "psubw %%mm5,%%mm0\n"       /* mm0 = pix[2]-pix[1] mm0 - mm5*/      \
-    "pmullw %[V3],%%mm0\n"      /* *3 */                              \
-    "paddw %%mm0,%%mm1\n"         /* mm1 has f[0] ... f[4]*/            \
-    "paddw %[V804],%%mm1\n"     /* add 4 */ /* add 256 after shift */ \
-    "psraw $3,%%mm1\n"          /* >>3 */                               \
-    " pextrw $0,%%mm1,%[s]\n"  /* In MM1 we have 4 f coefs (16bits) */ \
-    " pextrw $1,%%mm1,%[d]\n"  /* now perform MM4 = *(_bv+ f) */       \
-    " pinsrw $0,(%[bound],%[s],2),%%mm4\n"                                   \
-    " pextrw $2,%%mm1,%[s]\n"                                          \
-    " pinsrw $1,(%[bound],%[d],2),%%mm4\n"                                   \
-    " pextrw $3,%%mm1,%[d]\n"                                          \
-    " pinsrw $2,(%[bound],%[s],2),%%mm4\n"                                   \
-    " pinsrw $3,(%[bound],%[d],2),%%mm4\n" /* new f vals loaded */           \
-    "pxor %%mm0,%%mm0\n"                                                \
-    " paddw %%mm4,%%mm5\n"      /*(pix[1]+f);*/                         \
-    " psubw %%mm4,%%mm7\n"      /* (pix[2]-f); */                       \
-    " packuswb %%mm0,%%mm5\n"   /* mm5 = x x x x newpix1 */             \
-    " packuswb %%mm0,%%mm7\n"   /* mm7 = x x x x newpix2 */             \
-    " punpcklbw %%mm7,%%mm5\n"  /* 2 1 2 1 2 1 2 1 */                   \
-    " movd %%mm5,%[d]\n"       /* edi = newpix21 */                    \
-    " movw %[d],1(%[pp])\n"                                                \
-    " psrlq $32,%%mm5\n"        /* why is so big stall here ? */        \
-    " shr  $16,%[d]\n"                                                 \
-    " movw %[d],1(%[pp],%[ll],1)\n"                                           \
-    " movd %%mm5,%[d]\n"       /* eax = newpix21 high part */          \
-    " lea (%[ll],%[ll],2),%[s]\n"                                            \
-    " movw %[d],1(%[pp],%[ll],2)\n"                                           \
-    " shr $16,%[d]\n"                                                 \
-    " movw %[d],1(%[pp],%[s])\n"                                          
+static void loop_filter_v(unsigned char *_pix,int _ystride,
+			  const ogg_int16_t *_ll){ /*Deblocks 8 columns across the horizontal edge directly above row _pix.*/
+  long esi; /*Asm scratch output, pinned to the S register; receives _ystride*3.*/
+  _pix-=_ystride*2; /*Back up two rows: the asm reads rows 0...3 and rewrites rows 1 and 2.*/
+  __asm__ __volatile__(
+    /*mm0=0*/
+    "pxor %%mm0,%%mm0\n\t"
+    /*esi=_ystride*3*/
+    "lea (%[ystride],%[ystride],2),%[s]\n\t"
+    /*mm7=_pix[0...7]*/
+    "movq (%[pix]),%%mm7\n\t"
+    /*mm4=_pix[0...7+_ystride*3]*/
+    "movq (%[pix],%[s]),%%mm4\n\t"
+    /*mm6=_pix[0...7]*/
+    "movq %%mm7,%%mm6\n\t"
+    /*Expand unsigned _pix[0...3] to 16 bits.*/
+    "punpcklbw %%mm0,%%mm6\n\t"
+    "movq %%mm4,%%mm5\n\t"
+    /*Expand unsigned _pix[4...7] to 16 bits.*/
+    "punpckhbw %%mm0,%%mm7\n\t"
+    /*Expand other arrays too.*/
+    "punpcklbw %%mm0,%%mm4\n\t"
+    "punpckhbw %%mm0,%%mm5\n\t"
+    /*mm7:mm6=_pix[0...7]-_pix[0...7+_ystride*3]:*/
+    "psubw %%mm4,%%mm6\n\t"
+    "psubw %%mm5,%%mm7\n\t"
+    /*mm5=mm4=_pix[0...7+_ystride]*/
+    "movq (%[pix],%[ystride]),%%mm4\n\t"
+    /*mm1=mm3=mm2=_pix[0...7+_ystride*2]*/
+    "movq (%[pix],%[ystride],2),%%mm2\n\t"
+    "movq %%mm4,%%mm5\n\t"
+    "movq %%mm2,%%mm3\n\t"
+    "movq %%mm2,%%mm1\n\t"
+    /*Expand these arrays.*/
+    "punpckhbw %%mm0,%%mm5\n\t"
+    "punpcklbw %%mm0,%%mm4\n\t"
+    "punpckhbw %%mm0,%%mm3\n\t"
+    "punpcklbw %%mm0,%%mm2\n\t"
+    /*Preload the constant 3 (mm0 is no longer needed as zero).*/
+    "movq %[OC_V3],%%mm0\n\t"
+    /*mm3:mm2=_pix[0...7+_ystride*2]-_pix[0...7+_ystride]*/
+    "psubw %%mm5,%%mm3\n\t"
+    "psubw %%mm4,%%mm2\n\t"
+    /*Scale by 3.*/
+    "pmullw %%mm0,%%mm3\n\t"
+    "pmullw %%mm0,%%mm2\n\t"
+    /*Preload the constant 4.*/
+    "movq %[OC_V4],%%mm0\n\t"
+    /*f=mm3:mm2==_pix[0...7]-_pix[0...7+_ystride*3]+
+       3*(_pix[0...7+_ystride*2]-_pix[0...7+_ystride])*/
+    "paddw %%mm7,%%mm3\n\t"
+    "paddw %%mm6,%%mm2\n\t"
+    /*Add 4.*/
+    "paddw %%mm0,%%mm3\n\t"
+    "paddw %%mm0,%%mm2\n\t"
+    /*"Divide" by 8, producing the residuals R_i.*/
+    "psraw $3,%%mm3\n\t"
+    "psraw $3,%%mm2\n\t"
+    /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the spec.*/
+    /*Free up mm5 by packing row 1 (mm5:mm4) back down to bytes in mm4.*/
+    "packuswb %%mm5,%%mm4\n\t"
+    /*mm0=L L L L*/
+    "movq (%[ll]),%%mm0\n\t"
+    /*if(R_i<-2L||R_i>2L)R_i=0:*/
+    "movq %%mm2,%%mm5\n\t"
+    "pxor %%mm6,%%mm6\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    "psubw %%mm0,%%mm6\n\t"
+    "psllw $1,%%mm7\n\t"
+    "psllw $1,%%mm6\n\t"
+    /*mm2==R_3 R_2 R_1 R_0*/
+    /*mm5==R_3 R_2 R_1 R_0*/
+    /*mm6==-2L -2L -2L -2L*/
+    /*mm7==2L 2L 2L 2L*/
+    "pcmpgtw %%mm2,%%mm7\n\t"
+    "pcmpgtw %%mm6,%%mm5\n\t"
+    "pand %%mm7,%%mm2\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    "pand %%mm5,%%mm2\n\t"
+    "psllw $1,%%mm7\n\t"
+    "movq %%mm3,%%mm5\n\t"
+    /*mm3==R_7 R_6 R_5 R_4*/
+    /*mm5==R_7 R_6 R_5 R_4*/
+    /*mm6==-2L -2L -2L -2L*/
+    /*mm7==2L 2L 2L 2L*/
+    "pcmpgtw %%mm3,%%mm7\n\t"
+    "pcmpgtw %%mm6,%%mm5\n\t"
+    "pand %%mm7,%%mm3\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    "pand %%mm5,%%mm3\n\t"
+    /*if(R_i<-L)R_i'=R_i+2L;
+      if(R_i>L)R_i'=R_i-2L;
+      if(R_i<-L||R_i>L)R_i=-R_i':*/
+    "psraw $1,%%mm6\n\t"
+    "movq %%mm2,%%mm5\n\t"
+    "psllw $1,%%mm7\n\t"
+    /*mm2==R_3 R_2 R_1 R_0*/
+    /*mm5==R_3 R_2 R_1 R_0*/
+    /*mm6==-L -L -L -L*/
+    /*mm0==L L L L*/
+    /*mm5=R_i>L?FF:00*/
+    "pcmpgtw %%mm0,%%mm5\n\t"
+    /*mm6=-L>R_i?FF:00*/
+    "pcmpgtw %%mm2,%%mm6\n\t"
+    /*mm7=R_i>L?2L:0*/
+    "pand %%mm5,%%mm7\n\t"
+    /*mm2=R_i>L?R_i-2L:R_i*/
+    "psubw %%mm7,%%mm2\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    /*mm5=-L>R_i||R_i>L*/
+    "por %%mm6,%%mm5\n\t"
+    "psllw $1,%%mm7\n\t"
+    /*mm7=-L>R_i?2L:0*/
+    "pand %%mm6,%%mm7\n\t"
+    "pxor %%mm6,%%mm6\n\t"
+    /*mm2=-L>R_i?R_i+2L:R_i*/
+    "paddw %%mm7,%%mm2\n\t"
+    "psubw %%mm0,%%mm6\n\t"
+    /*mm5=-L>R_i||R_i>L?R_i':0*/
+    "pand %%mm2,%%mm5\n\t"
+    "movq %%mm0,%%mm7\n\t"
+    /*mm2=-L>R_i||R_i>L?0:R_i*/
+    "psubw %%mm5,%%mm2\n\t"
+    "psllw $1,%%mm7\n\t"
+    /*mm2=-L>R_i||R_i>L?-R_i':R_i*/
+    "psubw %%mm5,%%mm2\n\t"
+    "movq %%mm3,%%mm5\n\t"
+    /*mm3==R_7 R_6 R_5 R_4*/
+    /*mm5==R_7 R_6 R_5 R_4*/
+    /*mm6==-L -L -L -L*/
+    /*mm0==L L L L*/
+    /*mm6=-L>R_i?FF:00*/
+    "pcmpgtw %%mm3,%%mm6\n\t"
+    /*mm5=R_i>L?FF:00*/
+    "pcmpgtw %%mm0,%%mm5\n\t"
+    /*mm7=R_i>L?2L:0*/
+    "pand %%mm5,%%mm7\n\t"
+    /*mm3=R_i>L?R_i-2L:R_i*/
+    "psubw %%mm7,%%mm3\n\t"
+    "psllw $1,%%mm0\n\t"
+    /*mm5=-L>R_i||R_i>L*/
+    "por %%mm6,%%mm5\n\t"
+    /*mm0=-L>R_i?2L:0*/
+    "pand %%mm6,%%mm0\n\t"
+    /*mm3=-L>R_i?R_i+2L:R_i*/
+    "paddw %%mm0,%%mm3\n\t"
+    /*mm5=-L>R_i||R_i>L?R_i':0*/
+    "pand %%mm3,%%mm5\n\t"
+    /*mm3=-L>R_i||R_i>L?0:R_i*/
+    "psubw %%mm5,%%mm3\n\t"
+    /*mm3=-L>R_i||R_i>L?-R_i':R_i*/
+    "psubw %%mm5,%%mm3\n\t"
+    /*Unfortunately, there's no unsigned byte+signed byte with unsigned
+       saturation op code, so we have to promote things back to 16 bits.*/
+    "pxor %%mm0,%%mm0\n\t"
+    "movq %%mm4,%%mm5\n\t"
+    "punpcklbw %%mm0,%%mm4\n\t"
+    "punpckhbw %%mm0,%%mm5\n\t"
+    "movq %%mm1,%%mm6\n\t"
+    "punpcklbw %%mm0,%%mm1\n\t"
+    "punpckhbw %%mm0,%%mm6\n\t"
+    /*_pix[0...7+_ystride]+=R_i*/
+    "paddw %%mm2,%%mm4\n\t"
+    "paddw %%mm3,%%mm5\n\t"
+    /*_pix[0...7+_ystride*2]-=R_i*/
+    "psubw %%mm2,%%mm1\n\t"
+    "psubw %%mm3,%%mm6\n\t"
+    "packuswb %%mm5,%%mm4\n\t"
+    "packuswb %%mm6,%%mm1\n\t"
+    /*Write it back out.*/
+    "movq %%mm4,(%[pix],%[ystride])\n\t"
+    "movq %%mm1,(%[pix],%[ystride],2)\n\t"
+    :[s]"=&S"(esi)
+    :[pix]"r"(_pix),[ystride]"r"((long)_ystride),[ll]"r"(_ll),
+     [OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
+    :"memory" /*The asm stores to the two middle rows through _pix.*/
+  );
+}
 
-static void FilterHoriz__mmx(unsigned char * PixelPtr,
-			     ogg_int32_t LineLength,
-			     ogg_int16_t *BoundingValuePtr){
+/*Filters 4 rows of one vertical edge: reads _pix[0...3] on each row and rewrites
+   _pix[1] and _pix[2]; loop_filter_h() calls this twice to cover all 8 rows.
+  The four pixels of a row sit side by side in memory, so the 4x4 byte block is
+   transposed in MMX registers to gather each column p0...p3 into one register.*/
+static void loop_filter_h4(unsigned char *_pix,long _ystride,
+			   const ogg_int16_t *_ll){
   long esi;
   long edi;
   __asm__ __volatile__(
-    OC_LOOP_H_4x4
-    : [s]"=&r"(esi),[d]"=&r"(edi)                                       
-    : [pp]"r"(PixelPtr), [ll]"r"((long)LineLength), [bound]"r"(BoundingValuePtr-256), [V3]"m"(V3), [V804]"m"(V804) 
-    : "memory"                                           
+    /*x x x x 3 2 1 0*/
+    "movd (%[pix]),%%mm0\n\t"
+    /*esi=_ystride*3*/
+    "lea (%[ystride],%[ystride],2),%[s]\n\t"
+    /*x x x x 7 6 5 4*/
+    "movd (%[pix],%[ystride]),%%mm1\n\t"
+    /*x x x x B A 9 8*/
+    "movd (%[pix],%[ystride],2),%%mm2\n\t"
+    /*x x x x F E D C*/
+    "movd (%[pix],%[s]),%%mm3\n\t"
+    /*mm0=7 3 6 2 5 1 4 0*/
+    "punpcklbw %%mm1,%%mm0\n\t"
+    /*mm2=F B E A D 9 C 8*/
+    "punpcklbw %%mm3,%%mm2\n\t"
+    /*mm1=7 3 6 2 5 1 4 0*/
+    "movq %%mm0,%%mm1\n\t"
+    /*mm0=F B 7 3 E A 6 2*/
+    "punpckhwd %%mm2,%%mm0\n\t"
+    /*mm1=D 9 5 1 C 8 4 0*/
+    "punpcklwd %%mm2,%%mm1\n\t"
+    "pxor %%mm7,%%mm7\n\t"
+    /*mm5=D 9 5 1 C 8 4 0*/
+    "movq %%mm1,%%mm5\n\t"
+    /*mm1=x C x 8 x 4 x 0==pix[0]*/
+    "punpcklbw %%mm7,%%mm1\n\t"
+    /*mm5=x D x 9 x 5 x 1==pix[1]*/
+    "punpckhbw %%mm7,%%mm5\n\t"
+    /*mm3=F B 7 3 E A 6 2*/
+    "movq %%mm0,%%mm3\n\t"
+    /*mm0=x E x A x 6 x 2==pix[2]*/
+    "punpcklbw %%mm7,%%mm0\n\t"
+    /*mm3=x F x B x 7 x 3==pix[3]*/
+    "punpckhbw %%mm7,%%mm3\n\t"
+    /*mm1=mm1-mm3==pix[0]-pix[3]*/
+    "psubw %%mm3,%%mm1\n\t"
+    /*Save a copy of pix[2] for later.*/
+    "movq %%mm0,%%mm4\n\t"
+    /*mm0=mm0-mm5==pix[2]-pix[1]*/
+    "psubw %%mm5,%%mm0\n\t"
+    /*Scale by 3.*/
+    "pmullw %[OC_V3],%%mm0\n\t"
+    /*f=mm0==_pix[0]-_pix[3]+ 3*(_pix[2]-_pix[1])*/
+    "paddw %%mm1,%%mm0\n\t"
+    /*Add 4.*/
+    "paddw %[OC_V4],%%mm0\n\t"
+    /*"Divide" by 8, producing the residuals R_i.*/
+    "psraw $3,%%mm0\n\t"
+    /*Now compute lflim of mm0 cf. Section 7.10 of the spec.*/
+    /*mm6=L L L L*/
+    "movq (%[ll]),%%mm6\n\t"
+    /*if(R_i<-2L||R_i>2L)R_i=0:*/
+    "movq %%mm0,%%mm1\n\t"
+    "pxor %%mm2,%%mm2\n\t"
+    "movq %%mm6,%%mm3\n\t"
+    "psubw %%mm6,%%mm2\n\t"
+    "psllw $1,%%mm3\n\t"
+    "psllw $1,%%mm2\n\t"
+    /*mm0==R_3 R_2 R_1 R_0*/
+    /*mm1==R_3 R_2 R_1 R_0*/
+    /*mm2==-2L -2L -2L -2L*/
+    /*mm3==2L 2L 2L 2L*/
+    "pcmpgtw %%mm0,%%mm3\n\t"
+    "pcmpgtw %%mm2,%%mm1\n\t"
+    "pand %%mm3,%%mm0\n\t"
+    "pand %%mm1,%%mm0\n\t"
+    /*if(R_i<-L)R_i'=R_i+2L;
+      if(R_i>L)R_i'=R_i-2L;
+      if(R_i<-L||R_i>L)R_i=-R_i':*/
+    "psraw $1,%%mm2\n\t"
+    "movq %%mm0,%%mm1\n\t"
+    "movq %%mm6,%%mm3\n\t"
+    /*mm0==R_3 R_2 R_1 R_0*/
+    /*mm1==R_3 R_2 R_1 R_0*/
+    /*mm2==-L -L -L -L*/
+    /*mm6==L L L L*/
+    /*mm2=-L>R_i?FF:00*/
+    "pcmpgtw %%mm0,%%mm2\n\t"
+    /*mm1=R_i>L?FF:00*/
+    "pcmpgtw %%mm6,%%mm1\n\t"
+    /*mm3=2L 2L 2L 2L*/
+    "psllw $1,%%mm3\n\t"
+    /*mm6=2L 2L 2L 2L*/
+    "psllw $1,%%mm6\n\t"
+    /*mm3=R_i>L?2L:0*/
+    "pand %%mm1,%%mm3\n\t"
+    /*mm6=-L>R_i?2L:0*/
+    "pand %%mm2,%%mm6\n\t"
+    /*mm0=R_i>L?R_i-2L:R_i*/
+    "psubw %%mm3,%%mm0\n\t"
+    /*mm1=-L>R_i||R_i>L*/
+    "por %%mm2,%%mm1\n\t"
+    /*mm0=-L>R_i?R_i+2L:R_i*/
+    "paddw %%mm6,%%mm0\n\t"
+    /*mm1=-L>R_i||R_i>L?R_i':0*/
+    "pand %%mm0,%%mm1\n\t"
+    /*mm0=-L>R_i||R_i>L?0:R_i*/
+    "psubw %%mm1,%%mm0\n\t"
+    /*mm0=-L>R_i||R_i>L?-R_i':R_i*/
+    "psubw %%mm1,%%mm0\n\t"
+    /*_pix[1]+=R_i;*/
+    "paddw %%mm0,%%mm5\n\t"
+    /*_pix[2]-=R_i;*/
+    "psubw %%mm0,%%mm4\n\t"
+    /*mm5=x x x x D 9 5 1*/
+    "packuswb %%mm7,%%mm5\n\t"
+    /*mm4=x x x x E A 6 2*/
+    "packuswb %%mm7,%%mm4\n\t"
+    /*mm5=E D A 9 6 5 2 1*/
+    "punpcklbw %%mm4,%%mm5\n\t"
+    /*edi=6 5 2 1*/
+    "movd %%mm5,%%edi\n\t"
+    "movw %%di,1(%[pix])\n\t"
+    /*Why is there such a big stall here?*/
+    "psrlq $32,%%mm5\n\t"
+    "shrl $16,%%edi\n\t"
+    "movw %%di,1(%[pix],%[ystride])\n\t"
+    /*edi=E D A 9*/
+    "movd %%mm5,%%edi\n\t"
+    "movw %%di,1(%[pix],%[ystride],2)\n\t"
+    "shrl $16,%%edi\n\t"
+    "movw %%di,1(%[pix],%[s])\n\t"
+    :[s]"=&S"(esi),[d]"=&D"(edi), /*[d] pins the D register, which the asm names directly as %edi/%di.*/
+     [pix]"+r"(_pix),[ystride]"+r"(_ystride),[ll]"+r"(_ll)
+    :[OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
+    :"memory"
   );
+}
 
-    PixelPtr += LineLength*4;
-
-  __asm__ __volatile__(
-    OC_LOOP_H_4x4
-    "emms\n"       
-    : [s]"=&r"(esi),[d]"=&r"(edi)                                       \
-    : [pp]"r"(PixelPtr), [ll]"r"((long)LineLength), [bound]"r"(BoundingValuePtr-256), [V3]"m"(V3), [V804]"m"(V804) \
-    : "memory"                                           \
-    );
+static void loop_filter_h(unsigned char *_pix,int _ystride,
+			  const ogg_int16_t *_ll){
+  unsigned char *top=_pix-2; /*Start two pixels to the left of _pix.*/
+  loop_filter_h4(top,_ystride,_ll); /*Rows 0...3.*/
+  loop_filter_h4(top+(_ystride<<2),_ystride,_ll); /*Rows 4...7.*/
 }
+ 
+static void loop_filter_mmx(PB_INSTANCE *pbi, int FLimit){ /*Deblocks pbi->LastFrameRecon in place; does nothing when FLimit is 0.*/
+  int j; /*Plane index: 0 is luma, 1 and 2 are the two chroma planes.*/
+  ogg_int16_t __attribute__((aligned(8)))  ll[4]; /*One copy of FLimit per 16-bit lane, handed to the filters.*/
+  unsigned char *cp = pbi->display_fragments; /*Per-fragment flag; only fragments with a nonzero entry are filtered.*/
+  ogg_uint32_t *bp = pbi->recon_pixel_index_table; /*Per-fragment offset of the fragment's first pixel in LastFrameRecon.*/
 
-static void FilterVert__mmx(unsigned char * PixelPtr,
-			    ogg_int32_t LineLength,
-			    ogg_int16_t *BoundingValuePtr){
-  long esi,edi;
-  __asm__ __volatile__(
-    "pxor %%mm0,%%mm0\n"        /* mm0 = 0 */
-    "movq (%[pp]),%%mm7\n"         /* mm7 = pix[0..7] */
-    "lea (%[ll],%[ll],2),%[s]\n"     /* esi = ystride*3 */
-    "movq (%[pp],%[s]),%%mm4\n"   /* mm4 = pix[0..7+ystride*3] */
-    "movq %%mm7,%%mm6\n"        /* mm6 = pix[0..7] */
-    "punpcklbw %%mm0,%%mm6\n"   /* expand unsigned pix[0..3] to 16 bits */
-    "movq %%mm4,%%mm5\n"
-    "punpckhbw %%mm0,%%mm7\n"   /* expand unsigned pix[4..7] to 16 bits */
-    "punpcklbw %%mm0,%%mm4\n"   /* expand other arrays too */
-    "punpckhbw %%mm0,%%mm5\n"
-    "psubw %%mm4,%%mm6\n"       /* mm6 = mm6 - mm4 */
-    "psubw %%mm5,%%mm7\n"       /* mm7 = mm7 - mm5 */
-                /* mm7:mm6 = _p[0]-_p[ystride*3] */
-    "movq (%[pp],%[ll]),%%mm4\n"      /* mm4 = pix[0..7+ystride] */
-    "movq %%mm4,%%mm5\n"
-    "movq (%[pp],%[ll],2),%%mm2\n"    /* mm2 = pix[0..7+ystride*2] */
-    "movq %%mm2,%%mm3\n"
-    "movq %%mm2,%%mm1\n"        //ystride*2
-    "punpckhbw %%mm0,%%mm5\n"
-    "punpcklbw %%mm0,%%mm4\n"
-    "punpckhbw %%mm0,%%mm3\n"
-    "punpcklbw %%mm0,%%mm2\n"
-    "psubw %%mm5,%%mm3\n"
-    "psubw %%mm4,%%mm2\n"
-                /* mm3:mm2 = (pix[ystride*2]-pix[ystride]); */
-    "pmullw %[V3],%%mm3\n"           /* *3 */
-    "pmullw %[V3],%%mm2\n"           /* *3 */
-    "paddw %%mm7,%%mm3\n"            /* highpart */
-    "paddw %%mm6,%%mm2\n"            /* lowpart of pix[0]-pix[ystride*3]+3*(pix[ystride*2]-pix[ystride]);  */
-    "paddw %[V804],%%mm3\n"          /* add 4 */ /* add 256 after shift */
-    "paddw %[V804],%%mm2\n"          /* add 4 */ /* add 256 after shift */
-    "psraw $3,%%mm3\n"               /* >>3 f coefs high */
-    "psraw $3,%%mm2\n"               /* >>3 f coefs low */
+  if ( FLimit == 0 ) return;
+  ll[0]=ll[1]=ll[2]=ll[3]=FLimit;
 
-    " pextrw $0,%%mm2,%[s]\n"  /* In MM3:MM2 we have f coefs (16bits) */
-    " pextrw $1,%%mm2,%[d]\n"  /* now perform MM7:MM6 = *(_bv+ f) */
-    " pinsrw $0,(%[bound],%[s],2),%%mm6\n"
-    " pinsrw $1,(%[bound],%[d],2),%%mm6\n"
+  for ( j = 0; j < 3 ; j++){
+    ogg_uint32_t *bp_begin = bp; /*First fragment of this plane.*/
+    ogg_uint32_t *bp_end; /*One past the last fragment of this plane.*/
+    int stride; /*Row pitch of this plane in bytes.*/
+    int h; /*Fragments per fragment row in this plane.*/
 
-    " pextrw $2,%%mm2,%[s]\n"
-    " pextrw $3,%%mm2,%[d]\n"
-    " pinsrw $2,(%[bound],%[s],2),%%mm6\n"
-    " pinsrw $3,(%[bound],%[d],2),%%mm6\n"
+    switch(j) {
+    case 0: /* y */
+      bp_end = bp + pbi->YPlaneFragments;
+      h = pbi->HFragments;
+      stride = pbi->YStride;
+      break;
+    default: /* u,v, 4:2:0 specific */
+      bp_end = bp + pbi->UVPlaneFragments;
+      h = pbi->HFragments >> 1;
+      stride = pbi->UVStride;
+      break;
+    }
+    /*Walk the plane one fragment row at a time; each row holds h fragments.*/
+    while(bp<bp_end){
+      ogg_uint32_t *bp_left = bp;
+      ogg_uint32_t *bp_right = bp + h;
+      while(bp<bp_right){
+	if(cp[0]){
+	  if(bp>bp_left) /*Left edge, unless this is the first column.*/
+	    loop_filter_h(&pbi->LastFrameRecon[bp[0]],stride,ll);
+	  if(bp_left>bp_begin) /*Top edge, unless this is the first row.*/
+	    loop_filter_v(&pbi->LastFrameRecon[bp[0]],stride,ll);
+	  if(bp+1<bp_right && !cp[1]) /*Right edge, only when the right neighbour will not filter it itself.*/
+	    loop_filter_h(&pbi->LastFrameRecon[bp[0]]+8,stride,ll);
+	  if(bp+h<bp_end && !cp[h]) /*Bottom edge: the fragment below is h entries ahead (not stride).*/
+	    loop_filter_v(&pbi->LastFrameRecon[bp[h]],stride,ll);
+	}
+	bp++;
+	cp++;
+      }
+    }
+  }
 
-    " pextrw $0,%%mm3,%[s]\n"
-    " pextrw $1,%%mm3,%[d]\n"
-    " pinsrw $0,(%[bound],%[s],2),%%mm7\n"
-    " pinsrw $1,(%[bound],%[d],2),%%mm7\n"
-
-    " pextrw $2,%%mm3,%[s]\n"
-    " pextrw $3,%%mm3,%[d]\n"
-    " pinsrw $2,(%[bound],%[s],2),%%mm7\n"
-    " pinsrw $3,(%[bound],%[d],2),%%mm7\n"   //MM7 MM6   f=*(_bv+(f+4>>3));
-
-    "paddw %%mm6,%%mm4\n"       /* (pix[ystride]+f); */
-    "paddw %%mm7,%%mm5\n"       /* (pix[ystride]+f); */
-    "movq %%mm1,%%mm2\n"
-    "punpcklbw %%mm0,%%mm1\n"
-    "punpckhbw %%mm0,%%mm2\n"   //[ystride*2]
-    "psubw %%mm6,%%mm1\n"       /* (pix[ystride*2]-f); */
-    "psubw %%mm7,%%mm2\n"       /* (pix[ystride*2]-f); */
-    "packuswb %%mm2,%%mm1\n"
-    "packuswb %%mm5,%%mm4\n"
-    "movq %%mm1,(%[pp],%[ll],2)\n"    /* pix[ystride*2]= */
-    "movq %%mm4,(%[pp],%[ll])\n"      /* pix[ystride]= */
-    "emms\n"
-    : [s]"=&r"(esi),[d]"=&r"(edi)                                       
-    : [pp]"r"(PixelPtr-2*LineLength), [ll]"r"((long)LineLength), [bound]"r"(BoundingValuePtr-256), [V3]"m"(V3), [V804]"m"(V804)
-    : "memory"
-    );
+  __asm__ __volatile__("emms\n\t"); /*Leave MMX state once, after all filter calls are done.*/
 }
 
 /* install our implementation in the function table */
 void dsp_mmx_dct_decode_init(DspFunctions *funcs)
 {
-  funcs->FilterVert = FilterVert__mmx;
-  funcs->FilterHoriz = FilterHoriz__mmx;
+  funcs->LoopFilter = loop_filter_mmx; /* a single LoopFilter hook replaces the FilterVert/FilterHoriz pair */
 }
 
 #endif /* USE_ASM */



More information about the commits mailing list