[xiph-commits] r16332 - in branches/theora-thusnelda/lib: . dec dec/x86 dec/x86_vc enc

tterribe at svn.xiph.org tterribe at svn.xiph.org
Fri Jul 24 03:24:28 PDT 2009


Author: tterribe
Date: 2009-07-24 03:24:28 -0700 (Fri, 24 Jul 2009)
New Revision: 16332

Modified:
   branches/theora-thusnelda/lib/dec/decode.c
   branches/theora-thusnelda/lib/dec/fragment.c
   branches/theora-thusnelda/lib/dec/internal.c
   branches/theora-thusnelda/lib/dec/state.c
   branches/theora-thusnelda/lib/dec/x86/mmxstate.c
   branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c
   branches/theora-thusnelda/lib/enc/analyze.c
   branches/theora-thusnelda/lib/enc/tokenize.c
   branches/theora-thusnelda/lib/internal.h
Log:
C optimizations for DC prediction.
These are relatively minor on modern platforms (1.6-2.7% in the decoder on
 x86-64, less than 1% on the encoder), but can have a much larger impact on
 older platforms/underpowered CPUs with poor branch prediction.
Modified version of a patch by Simon Hosie.


Modified: branches/theora-thusnelda/lib/dec/decode.c
===================================================================
--- branches/theora-thusnelda/lib/dec/decode.c	2009-07-24 07:26:18 UTC (rev 16331)
+++ branches/theora-thusnelda/lib/dec/decode.c	2009-07-24 10:24:28 UTC (rev 16332)
@@ -1415,12 +1415,88 @@
   ncoded_fragis=0;
   fragi=fplane->froffset+fragy0*(ptrdiff_t)nhfrags;
   for(fragy=fragy0;fragy<fragy_end;fragy++){
-    for(fragx=0;fragx<nhfrags;fragx++,fragi++){
-      if(!frags[fragi].coded)continue;
-      pred_last[OC_FRAME_FOR_MODE[frags[fragi].mb_mode]]=frags[fragi].dc+=
-       oc_frag_pred_dc(frags+fragi,fplane,fragx,fragy,pred_last);
-      ncoded_fragis++;
+    if(fragy==0){
+      /*For the first row, all of the cases reduce to just using the previous
+         predictor for the same reference frame.*/
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        if(frags[fragi].coded){
+          int ref;
+          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+          pred_last[ref]=frags[fragi].dc+=pred_last[ref];
+          ncoded_fragis++;
+        }
+      }
     }
+    else{
+      oc_fragment *u_frags;
+      int          l_ref;
+      int          ul_ref;
+      int          u_ref;
+      u_frags=frags-nhfrags;
+      l_ref=-1;
+      ul_ref=-1;
+      u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        int ur_ref;
+        if(fragx+1>=nhfrags)ur_ref=-1;
+        else{
+          ur_ref=u_frags[fragi+1].coded?
+           OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
+        }
+        if(frags[fragi].coded){
+          int pred;
+          int ref;
+          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+          /*We break out a separate case based on which of our neighbors use
+             the same reference frames.
+            This is somewhat faster than trying to make a generic case which
+             handles all of them, since it reduces lots of poorly predicted
+             jumps to one switch statement, and also lets a number of the
+             multiplications be optimized out by strength reduction.*/
+          switch((l_ref==ref)|(ul_ref==ref)<<1|
+           (u_ref==ref)<<2|(ur_ref==ref)<<3){
+            default:pred=pred_last[ref];break;
+            case  1:
+            case  3:pred=frags[fragi-1].dc;break;
+            case  2:pred=u_frags[fragi-1].dc;break;
+            case  4:
+            case  6:
+            case 12:pred=u_frags[fragi].dc;break;
+            case  5:pred=(frags[fragi-1].dc+u_frags[fragi].dc)/2;break;
+            case  8:pred=u_frags[fragi+1].dc;break;
+            case  9:
+            case 11:
+            case 13:{
+              pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128;
+            }break;
+            case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break;
+            case 14:{
+              pred=(3*(u_frags[fragi-1].dc+u_frags[fragi+1].dc)
+               +10*u_frags[fragi].dc)/16;
+            }break;
+            case  7:
+            case 15:{
+              int p0;
+              int p1;
+              int p2;
+              p0=frags[fragi-1].dc;
+              p1=u_frags[fragi-1].dc;
+              p2=u_frags[fragi].dc;
+              pred=(29*(p0+p2)-26*p1)/32;
+              if(abs(pred-p2)>128)pred=p2;
+              else if(abs(pred-p0)>128)pred=p0;
+              else if(abs(pred-p1)>128)pred=p1;
+            }break;
+          }
+          pred_last[ref]=frags[fragi].dc+=pred;
+          ncoded_fragis++;
+          l_ref=ref;
+        }
+        else l_ref=-1;
+        ul_ref=u_ref;
+        u_ref=ur_ref;
+      }
+    }
   }
   _pipe->ncoded_fragis[_pli]=ncoded_fragis;
   /*Also save the number of uncoded fragments so we know how many to copy.*/

Modified: branches/theora-thusnelda/lib/dec/fragment.c
===================================================================
--- branches/theora-thusnelda/lib/dec/fragment.c	2009-07-24 07:26:18 UTC (rev 16331)
+++ branches/theora-thusnelda/lib/dec/fragment.c	2009-07-24 10:24:28 UTC (rev 16332)
@@ -85,121 +85,3 @@
 }
 
 void oc_restore_fpu_c(void){}
-
-
-/*Computes the predicted DC value for the given fragment.
-  This requires that the fully decoded DC values be available for the left,
-   upper-left, upper, and upper-right fragments (if they exist).
-  _frag:      The fragment to predict the DC value for.
-  _fplane:    The fragment plane the fragment belongs to.
-  _x:         The x-coordinate of the fragment.
-  _y:         The y-coordinate of the fragment.
-  _pred_last: The last fully-decoded DC value for each predictor frame
-               (OC_FRAME_GOLD, OC_FRAME_PREV and OC_FRAME_SELF).
-              This should be initialized to 0's for the first fragment in each
-               color plane.
-  Return: The predicted DC value for this fragment.*/
-int oc_frag_pred_dc(const oc_fragment *_frag,
- const oc_fragment_plane *_fplane,int _x,int _y,int _pred_last[3]){
-  static const signed char   PRED_SCALE[16][4]={
-    /*0*/
-    {0,0,0,0},
-    /*OC_PL*/
-    {1,0,0,0},
-    /*OC_PUL*/
-    {1,0,0,0},
-    /*OC_PL|OC_PUL*/
-    {1,0,0,0},
-    /*OC_PU*/
-    {1,0,0,0},
-    /*OC_PL|OC_PU*/
-    {1,1,0,0},
-    /*OC_PUL|OC_PU*/
-    {0,1,0,0},
-    /*OC_PL|OC_PUL|PC_PU*/
-    {29,-26,29,0},
-    /*OC_PUR*/
-    {1,0,0,0},
-    /*OC_PL|OC_PUR*/
-    {75,53,0,0},
-    /*OC_PUL|OC_PUR*/
-    {1,1,0,0},
-    /*OC_PL|OC_PUL|OC_PUR*/
-    {75,0,53,0},
-    /*OC_PU|OC_PUR*/
-    {1,0,0,0},
-    /*OC_PL|OC_PU|OC_PUR*/
-    {75,0,53,0},
-    /*OC_PUL|OC_PU|OC_PUR*/
-    {3,10,3,0},
-    /*OC_PL|OC_PUL|OC_PU|OC_PUR*/
-    {29,-26,29,0}
-  };
-  static const unsigned char PRED_SHIFT[16]={0,0,0,0,0,1,0,5,0,7,1,7,0,7,4,5};
-  static const unsigned char PRED_RMASK[16]={
-    0,0,0,0,0,1,0,31,0,127,1,127,0,127,15,31
-  };
-  static const unsigned char BC_MASK[8]={
-    /*No boundary condition.*/
-    OC_PL|OC_PUL|OC_PU|OC_PUR,
-    /*Left column.*/
-    OC_PU|OC_PUR,
-    /*Top row.*/
-    OC_PL,
-    /*Top row, left column.*/
-    0,
-    /*Right column.*/
-    OC_PL|OC_PUL|OC_PU,
-    /*Right and left column.*/
-    OC_PU,
-    /*Top row, right column.*/
-    OC_PL,
-    /*Top row, right and left column.*/
-    0
-  };
-  /*Predictor fragments, left, up-left, up, up-right.*/
-  const oc_fragment *predfr[4];
-  /*The frame used for prediction for this fragment.*/
-  int                pred_frame;
-  /*The boundary condition flags.*/
-  int                bc;
-  /*DC predictor values: left, up-left, up, up-right, missing values skipped.*/
-  int                p[4];
-  /*Predictor count.*/
-  int                np;
-  /*Which predictor constants to use.*/
-  int                pflags;
-  /*The predicted DC value.*/
-  int                ret;
-  int                i;
-  pred_frame=OC_FRAME_FOR_MODE[_frag->mb_mode];
-  bc=(_x==0)+((_y==0)<<1)+((_x+1==_fplane->nhfrags)<<2);
-  predfr[0]=_frag-1;
-  predfr[1]=_frag-_fplane->nhfrags-1;
-  predfr[2]=predfr[1]+1;
-  predfr[3]=predfr[2]+1;
-  np=0;
-  pflags=0;
-  for(i=0;i<4;i++){
-    int pflag;
-    pflag=1<<i;
-    if((BC_MASK[bc]&pflag)&&predfr[i]->coded&&
-     OC_FRAME_FOR_MODE[predfr[i]->mb_mode]==pred_frame){
-      p[np++]=predfr[i]->dc;
-      pflags|=pflag;
-    }
-  }
-  if(pflags==0)return _pred_last[pred_frame];
-  else{
-    ret=PRED_SCALE[pflags][0]*p[0];
-    /*LOOP VECTORIZES.*/
-    for(i=1;i<np;i++)ret+=PRED_SCALE[pflags][i]*p[i];
-    ret=OC_DIV_POW2(ret,PRED_SHIFT[pflags],PRED_RMASK[pflags]);
-  }
-  if((pflags&(OC_PL|OC_PUL|OC_PU))==(OC_PL|OC_PUL|OC_PU)){
-    if(abs(ret-p[2])>128)ret=p[2];
-    else if(abs(ret-p[0])>128)ret=p[0];
-    else if(abs(ret-p[1])>128)ret=p[1];
-  }
-  return ret;
-}

Modified: branches/theora-thusnelda/lib/dec/internal.c
===================================================================
--- branches/theora-thusnelda/lib/dec/internal.c	2009-07-24 07:26:18 UTC (rev 16331)
+++ branches/theora-thusnelda/lib/dec/internal.c	2009-07-24 10:24:28 UTC (rev 16332)
@@ -50,26 +50,6 @@
   35,36,48,49,57,58,62,63
 };
 
-/*The predictor frame to use for each macro block mode.*/
-const unsigned char OC_FRAME_FOR_MODE[8]={
-  /*OC_MODE_INTER_NOMV*/
-  OC_FRAME_PREV,
-  /*OC_MODE_INTRA*/
-  OC_FRAME_SELF,
-  /*OC_MODE_INTER_MV*/
-  OC_FRAME_PREV,
-  /*OC_MODE_INTER_MV_LAST*/
-  OC_FRAME_PREV,
-  /*OC_MODE_INTER_MV_LAST2*/
-  OC_FRAME_PREV,
-  /*OC_MODE_GOLDEN*/
-  OC_FRAME_GOLD,
-  /*OC_MODE_GOLDEN_MV*/
-  OC_FRAME_GOLD,
-  /*OC_MODE_INTER_MV_FOUR*/
-  OC_FRAME_PREV,
-};
-
 /*A map from physical macro block ordering to bitstream macro block
    ordering within a super block.*/
 const unsigned char OC_MB_MAP[2][2]={{0,3},{1,2}};

Modified: branches/theora-thusnelda/lib/dec/state.c
===================================================================
--- branches/theora-thusnelda/lib/dec/state.c	2009-07-24 07:26:18 UTC (rev 16331)
+++ branches/theora-thusnelda/lib/dec/state.c	2009-07-24 10:24:28 UTC (rev 16332)
@@ -881,7 +881,7 @@
     const unsigned char *ref;
     int                  mvoffsets[2];
     ref=
-     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE[mb_mode]]]
+     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
      +frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){

Modified: branches/theora-thusnelda/lib/dec/x86/mmxstate.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/mmxstate.c	2009-07-24 07:26:18 UTC (rev 16331)
+++ branches/theora-thusnelda/lib/dec/x86/mmxstate.c	2009-07-24 10:24:28 UTC (rev 16332)
@@ -45,7 +45,7 @@
     const unsigned char *ref;
     int                  mvoffsets[2];
     ref=
-     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE[mb_mode]]]
+     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
      +frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){

Modified: branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c	2009-07-24 07:26:18 UTC (rev 16331)
+++ branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c	2009-07-24 10:24:28 UTC (rev 16332)
@@ -45,7 +45,7 @@
     const unsigned char *ref;
     int                  mvoffsets[2];
     ref=
-     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE[mb_mode]]]
+     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
      +frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){

Modified: branches/theora-thusnelda/lib/enc/analyze.c
===================================================================
--- branches/theora-thusnelda/lib/enc/analyze.c	2009-07-24 07:26:18 UTC (rev 16331)
+++ branches/theora-thusnelda/lib/enc/analyze.c	2009-07-24 10:24:28 UTC (rev 16332)
@@ -686,7 +686,7 @@
   }
   mb_mode=frags[_fragi].mb_mode;
   ref=_enc->state.ref_frame_data[
-   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE[mb_mode]]]+frag_offs;
+   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]+frag_offs;
   dst=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_SELF]]
    +frag_offs;
   /*Motion compensation:*/
@@ -1655,7 +1655,7 @@
   ptrdiff_t              frag_offs;
   src=_enc->state.ref_frame_data[OC_FRAME_IO];
   ref=_enc->state.ref_frame_data[
-   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE[_mb_mode]]];
+   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(_mb_mode)]];
   ystride=_enc->state.ref_ystride[0];
   frag_buf_offs=_enc->state.frag_buf_offs;
   sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];

Modified: branches/theora-thusnelda/lib/enc/tokenize.c
===================================================================
--- branches/theora-thusnelda/lib/enc/tokenize.c	2009-07-24 07:26:18 UTC (rev 16331)
+++ branches/theora-thusnelda/lib/enc/tokenize.c	2009-07-24 10:24:28 UTC (rev 16332)
@@ -680,13 +680,88 @@
   nhfrags=fplane->nhfrags;
   fragi=fplane->froffset+_fragy0*nhfrags;
   for(fragy=_fragy0;fragy<_frag_yend;fragy++){
-    for(fragx=0;fragx<nhfrags;fragx++,fragi++){
-      if(frags[fragi].coded){
-        frag_dc[fragi]=frags[fragi].dc
-         -oc_frag_pred_dc(frags+fragi,fplane,fragx,fragy,pred_last);
-        pred_last[OC_FRAME_FOR_MODE[frags[fragi].mb_mode]]=frags[fragi].dc;
+    if(fragy==0){
+      /*For the first row, all of the cases reduce to just using the previous
+         predictor for the same reference frame.*/
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        if(frags[fragi].coded){
+          int ref;
+          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+          frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred_last[ref]);
+          pred_last[ref]=frags[fragi].dc;
+        }
       }
     }
+    else{
+      const oc_fragment *u_frags;
+      int                l_ref;
+      int                ul_ref;
+      int                u_ref;
+      u_frags=frags-nhfrags;
+      l_ref=-1;
+      ul_ref=-1;
+      u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        int ur_ref;
+        if(fragx+1>=nhfrags)ur_ref=-1;
+        else{
+          ur_ref=u_frags[fragi+1].coded?
+           OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
+        }
+        if(frags[fragi].coded){
+          int pred;
+          int ref;
+          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+          /*We break out a separate case based on which of our neighbors use
+             the same reference frames.
+            This is somewhat faster than trying to make a generic case which
+             handles all of them, since it reduces lots of poorly predicted
+             jumps to one switch statement, and also lets a number of the
+             multiplications be optimized out by strength reduction.*/
+          switch((l_ref==ref)|(ul_ref==ref)<<1|
+           (u_ref==ref)<<2|(ur_ref==ref)<<3){
+            default:pred=pred_last[ref];break;
+            case  1:
+            case  3:pred=frags[fragi-1].dc;break;
+            case  2:pred=u_frags[fragi-1].dc;break;
+            case  4:
+            case  6:
+            case 12:pred=u_frags[fragi].dc;break;
+            case  5:pred=(frags[fragi-1].dc+u_frags[fragi].dc)/2;break;
+            case  8:pred=u_frags[fragi+1].dc;break;
+            case  9:
+            case 11:
+            case 13:{
+              pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128;
+            }break;
+            case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break;
+            case 14:{
+              pred=(3*(u_frags[fragi-1].dc+u_frags[fragi+1].dc)
+               +10*u_frags[fragi].dc)/16;
+            }break;
+            case  7:
+            case 15:{
+              int p0;
+              int p1;
+              int p2;
+              p0=frags[fragi-1].dc;
+              p1=u_frags[fragi-1].dc;
+              p2=u_frags[fragi].dc;
+              pred=(29*(p0+p2)-26*p1)/32;
+              if(abs(pred-p2)>128)pred=p2;
+              else if(abs(pred-p0)>128)pred=p0;
+              else if(abs(pred-p1)>128)pred=p1;
+            }break;
+          }
+          frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred);
+          pred_last[ref]=frags[fragi].dc;
+          l_ref=ref;
+        }
+        else l_ref=-1;
+        ul_ref=u_ref;
+        u_ref=ur_ref;
+      }
+    }
   }
 }
 

Modified: branches/theora-thusnelda/lib/internal.h
===================================================================
--- branches/theora-thusnelda/lib/internal.h	2009-07-24 07:26:18 UTC (rev 16331)
+++ branches/theora-thusnelda/lib/internal.h	2009-07-24 10:24:28 UTC (rev 16332)
@@ -134,19 +134,11 @@
 /*The number of (coded) modes.*/
 #define OC_NMODES              (8)
 
-/*Macro block is not coded.*/
-#define OC_MODE_NOT_CODED      (8)
+/*Determines the reference frame used for a given MB mode.*/
+#define OC_FRAME_FOR_MODE(_x) \
+ OC_UNIBBLE_TABLE32(OC_FRAME_PREV,OC_FRAME_SELF,OC_FRAME_PREV,OC_FRAME_PREV, \
+  OC_FRAME_PREV,OC_FRAME_GOLD,OC_FRAME_GOLD,OC_FRAME_PREV,(_x))
 
-/*Predictor bit flags.*/
-/*Left.*/
-#define OC_PL  (1)
-/*Upper-left.*/
-#define OC_PUL (2)
-/*Up.*/
-#define OC_PU  (4)
-/*Upper-right.*/
-#define OC_PUR (8)
-
 /*Constants for the packet state machine common between encoder and decoder.*/
 
 /*Next packet to emit/read: Codec info header.*/
@@ -403,8 +395,6 @@
 /*A map from the coefficient number in a block to its index in the zig zag
    scan.*/
 extern const unsigned char OC_IZIG_ZAG[64];
-/*The predictor frame to use for each macro block mode.*/
-extern const unsigned char OC_FRAME_FOR_MODE[OC_NMODES];
 /*A map from physical macro block ordering to bitstream macro block
    ordering within a super block.*/
 extern const unsigned char OC_MB_MAP[2][2];
@@ -431,9 +421,6 @@
 
 ptrdiff_t oc_dct_token_skip(int _token,int _extra_bits);
 
-int oc_frag_pred_dc(const oc_fragment *_frag,
- const oc_fragment_plane *_fplane,int _x,int _y,int _pred_last[3]);
-
 int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs);
 void oc_state_clear(oc_theora_state *_state);
 void oc_state_vtable_init_c(oc_theora_state *_state);



More information about the commits mailing list