[Theora-dev] MMX loop filter for theora-exp

Wed Aug 17 12:23:31 PDT 2005

Hello,

I would like to announce a semi-optimized MMX version of
oc_state_loop_filter_frag_rows. It gives a speedup of about 7%. Unfortunately,
it still has some issues:

1) It won't compile on 64-bit targets (I will hopefully fix that later).
2) It is not yet fully optimized (there are still instruction stalls).

Here are the results.

CPU: Athlon, speed 1466.91 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask of 0x00 (No unit mask) count 400000
Counted DATA_CACHE_MISSES events (Data cache misses) with a unit mask of 0x00 (No unit mask) count 2000
samples  %        samples  %        image name               symbol name
163107   23.5109  120026   22.8927  dump                     theora_decode_packetin
118986   17.1511  160269   30.5682  libc-2.3.2.so            (no symbols)
72543    10.4566  12407     2.3664  libogg.so.0.5.2          (no symbols)
56169     8.0964  46845     8.9348  dump                     oc_state_loop_filter_frag_rows_mmx
42864     6.1786  50895     9.7072  dump                     oc_frag_recon_inter2_mmx
36961     5.3277  11800     2.2506  dump                     oc_state_frag_recon_mmx
33020     4.7596  17433     3.3250  dump                     oc_frag_pred_dc
26958     3.8858  15152     2.8900  dump                     oc_huff_token_decode
15535     2.2393  31301     5.9701  dump                     oc_frag_recon_inter_mmx
14941     2.1537  280       0.0534  dump                     oc_idct8x8_mmx
12614     1.8182  648       0.1236  dump                     oc_idct8x8_10_mmx
11807     1.7019  1371      0.2615  dump                     theora_look

The patch is attached and is NOT intended to be checked into SVN. The purpose is to share an
in-progress version, because I will be away in Germany for one month. It may also help with playing back large videos.

Enjoy! :)

Regards

Rudolf
-------------- next part --------------
diff -Naur a/lib/decode.c b/lib/decode.c

--- a/lib/decode.c	2005-08-17 09:29:57.250868250 +0200
+++ b/lib/decode.c	2005-08-17 09:58:23.601508500 +0200
@@ -2031,7 +2031,7 @@
         if(pipe.loop_filter){
           sdelay+=notstart;
           edelay+=notdone;
-          oc_state_loop_filter_frag_rows(&_dec->state,pipe.bounding_values+256,
+          oc_state_loop_filter_frag_rows(&_dec->state,pipe.bounding_values,
            refi,pli,pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
         }
         /*To fill the borders, we have an additional two pixel delay, since a
diff -Naur a/lib/internal.h b/lib/internal.h
--- a/lib/internal.h	2005-08-17 09:29:57.318872500 +0200
+++ b/lib/internal.h	2005-08-17 10:05:34.076411500 +0200
@@ -242,6 +242,8 @@
    int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
    ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
   void (*restore_fpu)(void);
+  void (*oc_state_loop_filter_frag_rows)(oc_theora_state *_state,int *_bv,
+   int _refi,int _pli,int _fragy0,int _fragy_end);
 }oc_base_opt_vtable;
 
 
@@ -391,8 +393,6 @@
 
 int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv);
 void oc_state_loop_filter(oc_theora_state *_state,int _frame);
-void oc_state_loop_filter_frag_rows(oc_theora_state *_state,int *_bv,
- int _refi,int _pli,int _fragy0,int _fragy_end);
 #if defined(OC_DUMP_IMAGES)
 int oc_state_dump_frame(const oc_theora_state *_state,int _frame,
  const char *_suf);
@@ -412,6 +412,8 @@
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
 void oc_restore_fpu(const oc_theora_state *_state);
+void oc_state_loop_filter_frag_rows(oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end);
 
 /*Default pure-C implementations.*/
 void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride,
@@ -427,5 +429,7 @@
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
 void oc_restore_fpu_c(void);
+void oc_state_loop_filter_frag_rows_c(oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end);
 
 #endif
diff -Naur a/lib/state.c b/lib/state.c
--- a/lib/state.c	2005-08-17 09:29:57.314872250 +0200
+++ b/lib/state.c	2005-08-17 10:29:23.000000000 +0200
@@ -514,6 +514,8 @@
   _state->opt_vtable.state_frag_copy=oc_state_frag_copy_c;
   _state->opt_vtable.state_frag_recon=oc_state_frag_recon_c;
   _state->opt_vtable.restore_fpu=oc_restore_fpu_c;
+  _state->opt_vtable.oc_state_loop_filter_frag_rows=
+  			oc_state_loop_filter_frag_rows_c;
 }
 
 /*Initialize the accelerated function pointers.*/
@@ -922,7 +924,7 @@
   }
 }
 
-static void loop_filter_h(unsigned char *_pix,int _ystride,int *_bv){
+static void loop_filter_h_c(unsigned char *_pix,int _ystride,int *_bv){
   int y;
   _pix-=2;
   for(y=0;y<8;y++){
@@ -935,7 +937,7 @@
   }
 }
 
-static void loop_filter_v(unsigned char *_pix,int _ystride,int *_bv){
+static void loop_filter_v_c(unsigned char *_pix,int _ystride,int *_bv){
   int y;
   _pix-=_ystride*2;
   for(y=0;y<8;y++){
@@ -977,8 +979,15 @@
   _pli:       The color plane to filter.
   _fragy0:    The Y coordinate of the first fragment row to filter.
   _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+
 void oc_state_loop_filter_frag_rows(oc_theora_state *_state,int *_bv,
  int _refi,int _pli,int _fragy0,int _fragy_end){
+   _state->opt_vtable.oc_state_loop_filter_frag_rows(_state,_bv,_refi, /* all arguments forwarded unchanged */
+   		_pli,_fragy0,_fragy_end); /* _bv is the un-offset table base; the implementation applies the 256 bias */
+}
+
+void oc_state_loop_filter_frag_rows_c(oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end){
   theora_img_plane  *iplane;
   oc_fragment_plane *fplane;
   oc_fragment       *frag_top;
@@ -987,6 +996,7 @@
   oc_fragment       *frag_end;
   oc_fragment       *frag0_end;
   oc_fragment       *frag_bot;
+  _bv+=256;
   iplane=_state->ref_frame_bufs[_refi]+_pli;
   fplane=_state->fplanes+_pli;
   /*The following loops are constructed somewhat non-intuitively on purpose.
@@ -1004,16 +1014,16 @@
     while(frag<frag_end){
       if(frag->coded){
         if(frag>frag0){
-          loop_filter_h(frag->buffer[_refi],iplane->ystride,_bv);
+          loop_filter_h_c(frag->buffer[_refi],iplane->ystride,_bv);
         }
         if(frag0>frag_top){
-          loop_filter_v(frag->buffer[_refi],iplane->ystride,_bv);
+          loop_filter_v_c(frag->buffer[_refi],iplane->ystride,_bv);
         }
         if(frag+1<frag_end&&!(frag+1)->coded){
-          loop_filter_h(frag->buffer[_refi]+8,iplane->ystride,_bv);
+          loop_filter_h_c(frag->buffer[_refi]+8,iplane->ystride,_bv);
         }
         if(frag+fplane->nhfrags<frag_bot&&!(frag+fplane->nhfrags)->coded){
-          loop_filter_v((frag+fplane->nhfrags)->buffer[_refi],
+          loop_filter_v_c((frag+fplane->nhfrags)->buffer[_refi],
            iplane->ystride,_bv);
         }
       }
@@ -1031,7 +1041,7 @@
   framei=_state->ref_frame_idx[_frame];
   if(oc_state_loop_filter_init(_state,bounding_values+256))return;
   for(pli=0;pli<3;pli++){
-    oc_state_loop_filter_frag_rows(_state,bounding_values+256,
+    oc_state_loop_filter_frag_rows(_state,bounding_values,
      framei,pli,0,_state->fplanes[pli].nvfrags);
   }
 }
diff -Naur a/lib/x86/mmxstate.c b/lib/x86/mmxstate.c
--- a/lib/x86/mmxstate.c	2005-08-17 09:29:56.618828750 +0200
+++ b/lib/x86/mmxstate.c	2005-08-17 21:03:14.678545000 +0200
@@ -258,3 +258,237 @@
   /*This needs to be removed when decode specific functions are implemented:*/
   __asm__ __volatile__("emms\n\t");
 }
+
+/* The loop filter code uses some MMXEXT instructions (pextrw, pinsrw) which
+   are not present on early MMX processors (Pentium II and AMD K6). */
+
+static const __attribute__((aligned(8),used)) ogg_int64_t V3=
+ 0x0003000300030003LL; /* 3 in each 16-bit lane; referenced by symbol name as (V3) from the inline asm */
+static const __attribute__((aligned(8),used)) ogg_int64_t V4=
+ 0x0004000400040004LL; /* 4 in each 16-bit lane; added before the >>3 in the filter asm */
+static const __attribute__((aligned(8),used)) ogg_int64_t V100=
+ 0x0100010001000100LL; /* 256 in each 16-bit lane; added to f before it is used as a _bv index */
+
+/* This code is a straightforward implementation of loop_filter_v.
+   The pixels of each row are contiguous, so we can load them directly into
+   MMX registers.
+
+   TODO: Some instruction stalls can be avoided.
+*/
+
+static void loop_filter_v_mmx(unsigned char *_pix,int _ystride,int *_bv){
+  int y; /* never referenced in this function */
+  _pix-=_ystride*2; /* back up two rows; the asm reads rows 0..3 from here */
+  /* 8 columns at once: f=_bv[256+((r0-r3+3*(r2-r1)+4)>>3)]; r1+=f; r2-=f (rN = row N) */
+__asm__ __volatile__(
+"pxor %%mm0,%%mm0\n"  	/* mm0 = 0 */
+"movq (%0),%%mm7\n"	/* mm7 = row 0, pixels 0..7 */
+"lea (%1,%1,2),%%esi\n"	/* esi = _ystride*3 */
+"movq (%0,%%esi),%%mm4\n" /* mm4 = row 3 (_pix+_ystride*3), pixels 0..7 */
+"movq %%mm7,%%mm6\n"	/* mm6 = copy of row 0 */
+"punpcklbw %%mm0,%%mm6\n" /* zero-extend row 0 pixels 0..3 to 16 bits */
+"movq %%mm4,%%mm5\n"
+"punpckhbw %%mm0,%%mm7\n" /* zero-extend row 0 pixels 4..7 to 16 bits */
+"punpcklbw %%mm0,%%mm4\n" /* same for row 3: pixels 0..3 in mm4 */
+"punpckhbw %%mm0,%%mm5\n" /* row 3 pixels 4..7 in mm5 */
+"psubw %%mm4,%%mm6\n" /* mm6 = mm6 - mm4 */
+"psubw %%mm5,%%mm7\n" /* mm7 = mm7 - mm5 */
+			/* mm7:mm6 = _pix[0]-_pix[_ystride*3] */
+"movq (%0,%1),%%mm4\n"   /* mm4 = row 1 (_pix+_ystride), pixels 0..7 */
+"movq %%mm4,%%mm5\n"
+"movq (%0,%1,2),%%mm2\n" /* mm2 = row 2 (_pix+_ystride*2), pixels 0..7 */
+"movq %%mm2,%%mm3\n"
+"movq %%mm2,%%mm1\n" /* mm1 = packed copy of row 2, unpacked again after the table lookup */
+"punpckhbw %%mm0,%%mm5\n"
+"punpcklbw %%mm0,%%mm4\n" 
+"punpckhbw %%mm0,%%mm3\n"
+"punpcklbw %%mm0,%%mm2\n"
+"psubw %%mm5,%%mm3\n" 
+"psubw %%mm4,%%mm2\n" 
+			/* mm3:mm2 = (_pix[_ystride*2]-_pix[_ystride]); mm5:mm4 still hold row 1 */
+"PMULLW (V3),%%mm3\n" 		/* *3 */
+"PMULLW (V3),%%mm2\n" 		/* *3 */
+"paddw %%mm7,%%mm3\n"   /* high part (pixels 4..7) */
+"paddw %%mm6,%%mm2\n"/* low part of _pix[0]-_pix[_ystride*3]+3*(_pix[_ystride*2]-_pix[_ystride]);  */
+"paddw (V4),%%mm3\n"  /* add 4 */
+"paddw (V4),%%mm2\n"  /* add 4 */
+"psraw $3,%%mm3\n"  /* >>3 f values, pixels 4..7 */
+"psraw $3,%%mm2\n"  /* >>3 f values, pixels 0..3 */
+"paddw (V100),%%mm3\n"  /* add 256 */
+"paddw (V100),%%mm2\n"  /* add 256 */
+  /* table lookup: each 16-bit f+256 indexes _bv (scale 4); pinsrw loads 16 bits from that entry */
+" pextrw $0,%%mm2,%%esi\n"  /* mm3:mm2 hold the biased f values (16 bits each) */
+" pextrw $1,%%mm2,%%edi\n"  /* now build mm7:mm6 = *(_bv+f) lane by lane */
+" pinsrw $0,(%2,%%esi,4),%%mm6\n"
+" pinsrw $1,(%2,%%edi,4),%%mm6\n"
+
+" pextrw $2,%%mm2,%%esi\n"
+" pextrw $3,%%mm2,%%edi\n"
+" pinsrw $2,(%2,%%esi,4),%%mm6\n"
+" pinsrw $3,(%2,%%edi,4),%%mm6\n"
+
+" pextrw $0,%%mm3,%%esi\n" 
+" pextrw $1,%%mm3,%%edi\n"
+" pinsrw $0,(%2,%%esi,4),%%mm7\n"
+" pinsrw $1,(%2,%%edi,4),%%mm7\n"
+
+" pextrw $2,%%mm3,%%esi\n"
+" pextrw $3,%%mm3,%%edi\n"
+" pinsrw $2,(%2,%%esi,4),%%mm7\n"
+" pinsrw $3, (%2,%%edi,4),%%mm7\n"   /* mm7:mm6 now hold f=*(_bv+(f+4>>3)) for all 8 columns */
+
+"paddw %%mm6,%%mm4\n"  /* (_pix[_ystride]+f), pixels 0..3 */
+"paddw %%mm7,%%mm5\n"  /* (_pix[_ystride]+f), pixels 4..7 */
+"movq %%mm1,%%mm2\n"
+"punpcklbw %%mm0,%%mm1\n"
+"punpckhbw %%mm0,%%mm2\n" /* mm2:mm1 = row 2 (_ystride*2) zero-extended to 16 bits */
+"psubw %%mm6,%%mm1\n" /* (_pix[_ystride*2]-f), pixels 0..3 */
+"psubw %%mm7,%%mm2\n" /* (_pix[_ystride*2]-f), pixels 4..7 */
+"packuswb %%mm2,%%mm1\n" /* pack back to bytes with unsigned saturation */
+"packuswb %%mm5,%%mm4\n"
+"movq %%mm1,(%0,%1,2)\n" /* _pix[_ystride*2]= */
+"movq %%mm4,(%0,%1)\n" /* _pix[_ystride]= */
+"emms\n" /* empty the MMX state at the end of every call */
+: 
+: "r" (_pix), "r" (_ystride), "r" (_bv) /* %0=_pix, %1=_ystride, %2=_bv */
+: "esi", "edi" , "memory"
+);
+  /* NOTE(review): %esi/%edi used as address registers looks 32-bit x86 only - confirm before building for x86-64 */
+}
+
+
+
+#define OC_LOOP_H_4x4 /* filters 4 rows x 4 columns at _pix; uses _pix, _ystride and _bv from the expanding scope */ \
+__asm__ __volatile__( \
+"lea (%1,%1,2),%%esi\n"	 /* esi = _ystride*3 */  \
+"movd (%0), %%mm0\n"		/* 0 0 0 0 3 2 1 0 */ \
+"movd (%0,%1),%%mm1\n"    	/* 0 0 0 0 7 6 5 4 */ \
+"movd (%0,%1,2),%%mm2\n"  	/* 0 0 0 0 b a 9 8 */ \
+"movd (%0,%%esi),%%mm3\n" 	/* 0 0 0 0 f e d c */ \
+"punpcklbw %%mm1,%%mm0\n" 	/* mm0 = 7 3 6 2 5 1 4 0 */ \
+"punpcklbw %%mm3,%%mm2\n" 	/* mm2 = f b e a d 9 c 8 */ \
+"movq %%mm0,%%mm1\n"	 	/* mm1 = 7 3 6 2 5 1 4 0 */ \
+"punpcklwd %%mm2,%%mm1\n"	/* mm1 = d 9 5 1 c 8 4 0 */ \
+"punpckhwd %%mm2,%%mm0\n"	/* mm0 = f b 7 3 e a 6 2 */ \
+"pxor %%mm7,%%mm7\n" \
+"movq %%mm1,%%mm5\n" 		/* mm5 = d 9 5 1 c 8 4 0 */ \
+"punpckhbw %%mm7,%%mm5\n"	/* mm5 = 0 d 0 9 0 5 0 1 = pix[1]*/ \
+"punpcklbw %%mm7,%%mm1\n"	/* mm1 = 0 c 0 8 0 4 0 0 = pix[0]*/ \
+"movq %%mm0,%%mm3\n" 		/* mm3 = f b 7 3 e a 6 2 */ \
+"punpckhbw %%mm7,%%mm3\n"	/* mm3 = 0 f 0 b 0 7 0 3 = pix[3]*/ \
+"punpcklbw %%mm7,%%mm0\n"    	/* mm0 = 0 e 0 a 0 6 0 2 = pix[2]*/ \
+ /* transpose done: each register holds one column, one 16-bit lane per row */ \
+"psubw %%mm3,%%mm1\n"		/* mm1 = pix[0]-pix[3] mm1 - mm3 */ \
+"movq %%mm0,%%mm7\n"		/* mm7 = pix[2]*/ \
+"psubw %%mm5,%%mm0\n" 		/* mm0 = pix[2]-pix[1] mm0 - mm5*/ \
+"PMULLW (V3),%%mm0\n" 		/* *3 */ \
+"paddw %%mm0,%%mm1\n" 		/* mm1 = pix[0]-pix[3]+3*(pix[2]-pix[1]) for the four rows */ \
+"paddw (V4),%%mm1\n"  /* add 4 */ \
+"psraw $3,%%mm1\n"  	/* >>3 */ \
+"paddw (V100),%%mm1\n"  /* add 256 */ \
+" pextrw $0,%%mm1,%%esi\n"  /* mm1 holds the 4 biased f values (16 bits each) */ \
+" pextrw $1,%%mm1,%%edi\n"  /* now build mm4 = *(_bv+f) lane by lane */ \
+" pinsrw $0,(%2,%%esi,4),%%mm4\n" \
+" pextrw $2,%%mm1,%%esi\n" \
+" pinsrw $1,(%2,%%edi,4),%%mm4\n" \
+" pextrw $3,%%mm1,%%edi\n" \
+" pinsrw $2,(%2,%%esi,4),%%mm4\n" \
+" pinsrw $3,(%2,%%edi,4),%%mm4\n" /* new f vals loaded */ \
+"pxor %%mm0,%%mm0\n" \
+" paddw %%mm4,%%mm5\n"	/*(_pix[1]+f);*/ \
+" psubw %%mm4,%%mm7\n" /* (_pix[2]-f); */ \
+" packuswb %%mm0,%%mm5\n" /* mm5 = 0 0 0 0 newpix1 (bytes, unsigned saturation) */ \
+" packuswb %%mm0,%%mm7\n" /* mm7 = 0 0 0 0 newpix2 (bytes, unsigned saturation) */  \
+" punpcklbw %%mm7,%%mm5\n" /* 2 1 2 1 2 1 2 1 */ \
+" movd %%mm5,%%eax\n" /* eax = newpix21 for rows 1 and 0 */ \
+" movw %%ax,1(%0)\n" /* row 0: store columns 1 and 2 */ \
+" psrlq $32,%%mm5\n" /* NOTE: a large stall was observed here; cause not yet understood */ \
+" shrl $16,%%eax\n" \
+" lea 1(%0,%1,2),%%edi\n" /* edi = address of row 2, column 1 */ \
+" movw %%ax,1(%0,%1,1)\n" /* row 1: store columns 1 and 2 */ \
+" movd %%mm5,%%eax\n"  /* eax = newpix21 high part (rows 3 and 2) */ \
+" lea (%1,%1,2),%%esi\n" /* recompute _ystride*3; esi was reused for the table lookups */ \
+" movw %%ax,(%%edi)\n" /* row 2: store columns 1 and 2 */ \
+" shrl $16,%%eax\n" \
+" movw %%ax,1(%0,%%esi)\n" /* row 3: store columns 1 and 2 */ \
+" emms\n" \
+: \
+: "r" (_pix), "r" (_ystride), "r" (_bv) \
+: "esi", "edi" , "memory", "eax" \
+); \
+
+/* This code implements loop_filter_h.
+   The data is interleaved: p0 p1 p2 p3 ... p0 p1 p2 p3 ...
+   To load all (four) p0's into one register we must transpose the values
+   held in four MMX registers. When one half (four rows) is done, we repeat
+   for the other half.
+
+   TODO: Some instruction stalls can be avoided.
+*/
+
+static void loop_filter_h_mmx(unsigned char *_pix,int _ystride,int *_bv){
+  _pix-=2; /* back up two columns; the macro reads 4 pixels per row from here and rewrites the middle two */
+  OC_LOOP_H_4x4 /* rows 0..3; the macro expansion supplies its own terminating ';' */
+_pix+=_ystride*4; /* advance four rows */
+  OC_LOOP_H_4x4 /* rows 4..7 */
+}
+
+/*Apply the loop filter to a given set of fragment rows in the given plane.
+  The filter may be run on the bottom edge, affecting pixels in the next row of
+   fragments, so this row also needs to be available.
+  _bv:        The bounding values array.
+  _refi:      The index of the frame buffer to filter.
+  _pli:       The color plane to filter.
+  _fragy0:    The Y coordinate of the first fragment row to filter.
+  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+  
+/*  We copy the whole function because the MMX routines will be inlined 4 times.
+    Also, _bv must not be pre-offset by 256 here: that way the MMX code can use
+    non-negative indices and avoids sign-extension instructions.
+*/
+
+void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end){
+  theora_img_plane  *iplane;
+  oc_fragment_plane *fplane;
+  oc_fragment       *frag_top;   /* first fragment of the plane */
+  oc_fragment       *frag0;      /* first fragment of the current row */
+  oc_fragment       *frag;       /* fragment currently being examined */
+  oc_fragment       *frag_end;   /* one past the last fragment of the current row */
+  oc_fragment       *frag0_end;  /* start of the row at which filtering stops */
+  oc_fragment       *frag_bot;   /* one past the last fragment of the plane */
+  iplane=_state->ref_frame_bufs[_refi]+_pli;
+  fplane=_state->fplanes+_pli;
+  /*The following loops are constructed somewhat non-intuitively on purpose.
+    The main idea is: if a block boundary has at least one coded fragment on
+     it, the filter is applied to it.
+    However, the order that the filters are applied in matters, and VP3 chose
+     the somewhat strange ordering used below.*/
+  frag_top=_state->frags+fplane->froffset;
+  frag0=frag_top+_fragy0*fplane->nhfrags;
+  frag0_end=frag0+(_fragy_end-_fragy0)*fplane->nhfrags;
+  frag_bot=_state->frags+fplane->froffset+fplane->nfrags;
+  while(frag0<frag0_end){ /* one iteration per fragment row */
+    frag=frag0;
+    frag_end=frag+fplane->nhfrags;
+    while(frag<frag_end){ /* one iteration per fragment in the row */
+      if(frag->coded){ /* only coded fragments trigger filtering */
+        if(frag>frag0){ /* not the first fragment in its row */
+          loop_filter_h_mmx(frag->buffer[_refi],iplane->ystride,_bv);
+        }
+        if(frag0>frag_top){ /* not in the first row of the plane */
+          loop_filter_v_mmx(frag->buffer[_refi],iplane->ystride,_bv);
+        }
+        if(frag+1<frag_end&&!(frag+1)->coded){ /* next fragment in the row exists and is not coded */
+          loop_filter_h_mmx(frag->buffer[_refi]+8,iplane->ystride,_bv);
+        }
+        if(frag+fplane->nhfrags<frag_bot&&!(frag+fplane->nhfrags)->coded){ /* fragment one row down exists and is not coded */
+          loop_filter_v_mmx((frag+fplane->nhfrags)->buffer[_refi],
+           iplane->ystride,_bv);
+        }
+      }
+      frag++;
+    }
+    frag0+=fplane->nhfrags; /* advance to the next fragment row */
+  }
+}
diff -Naur a/lib/x86/x86int.h b/lib/x86/x86int.h
--- a/lib/x86/x86int.h	2005-08-17 09:29:56.622829000 +0200
+++ b/lib/x86/x86int.h	2005-08-17 10:11:36.867084500 +0200
@@ -20,5 +20,6 @@
 void oc_idct8x8_mmx(ogg_int16_t _y[64]);
 void oc_idct8x8_10_mmx(ogg_int16_t _y[64]);
 void oc_fill_idct_constants_mmx(void);
-
+void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,                                                    
+ int _refi,int _pli,int _fragy0,int _fragy_end);
 #endif
diff -Naur a/lib/x86/x86state.c b/lib/x86/x86state.c
--- a/lib/x86/x86state.c	2005-08-17 09:29:56.622829000 +0200
+++ b/lib/x86/x86state.c	2005-08-17 10:22:48.441055250 +0200
@@ -10,6 +10,10 @@
     _state->opt_vtable.state_frag_copy=oc_state_frag_copy_mmx;
     _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
     _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+    if(_state->cpu_flags&OC_CPU_X86_MMXEXT){
+    	_state->opt_vtable.oc_state_loop_filter_frag_rows=
+			oc_state_loop_filter_frag_rows_mmx;
+    }
   }
   else oc_state_vtable_init_c(_state);
 }