[xiph-commits] r16332 - in branches/theora-thusnelda/lib: . dec dec/x86 dec/x86_vc enc
tterribe at svn.xiph.org
tterribe at svn.xiph.org
Fri Jul 24 03:24:28 PDT 2009
Author: tterribe
Date: 2009-07-24 03:24:28 -0700 (Fri, 24 Jul 2009)
New Revision: 16332
Modified:
branches/theora-thusnelda/lib/dec/decode.c
branches/theora-thusnelda/lib/dec/fragment.c
branches/theora-thusnelda/lib/dec/internal.c
branches/theora-thusnelda/lib/dec/state.c
branches/theora-thusnelda/lib/dec/x86/mmxstate.c
branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c
branches/theora-thusnelda/lib/enc/analyze.c
branches/theora-thusnelda/lib/enc/tokenize.c
branches/theora-thusnelda/lib/internal.h
Log:
C optimizations for DC prediction.
The gains are relatively minor on modern platforms (1.6-2.7% in the decoder on
x86-64, less than 1% in the encoder), but can be much larger on older
platforms/underpowered CPUs with poor branch prediction.
Modified version of a patch by Simon Hosie.
Modified: branches/theora-thusnelda/lib/dec/decode.c
===================================================================
--- branches/theora-thusnelda/lib/dec/decode.c 2009-07-24 07:26:18 UTC (rev 16331)
+++ branches/theora-thusnelda/lib/dec/decode.c 2009-07-24 10:24:28 UTC (rev 16332)
@@ -1415,12 +1415,88 @@
ncoded_fragis=0;
fragi=fplane->froffset+fragy0*(ptrdiff_t)nhfrags;
for(fragy=fragy0;fragy<fragy_end;fragy++){
- for(fragx=0;fragx<nhfrags;fragx++,fragi++){
- if(!frags[fragi].coded)continue;
- pred_last[OC_FRAME_FOR_MODE[frags[fragi].mb_mode]]=frags[fragi].dc+=
- oc_frag_pred_dc(frags+fragi,fplane,fragx,fragy,pred_last);
- ncoded_fragis++;
+ if(fragy==0){
+ /*For the first row, all of the cases reduce to just using the previous
+ predictor for the same reference frame.*/
+ for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+ if(frags[fragi].coded){
+ int ref;
+ ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+ pred_last[ref]=frags[fragi].dc+=pred_last[ref];
+ ncoded_fragis++;
+ }
+ }
}
+ else{
+ oc_fragment *u_frags;
+ int l_ref;
+ int ul_ref;
+ int u_ref;
+ u_frags=frags-nhfrags;
+ l_ref=-1;
+ ul_ref=-1;
+ u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
+ for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+ int ur_ref;
+ if(fragx+1>=nhfrags)ur_ref=-1;
+ else{
+ ur_ref=u_frags[fragi+1].coded?
+ OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
+ }
+ if(frags[fragi].coded){
+ int pred;
+ int ref;
+ ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+ /*We break out a separate case based on which of our neighbors use
+ the same reference frames.
+ This is somewhat faster than trying to make a generic case which
+ handles all of them, since it reduces lots of poorly predicted
+ jumps to one switch statement, and also lets a number of the
+ multiplications be optimized out by strength reduction.*/
+ switch((l_ref==ref)|(ul_ref==ref)<<1|
+ (u_ref==ref)<<2|(ur_ref==ref)<<3){
+ default:pred=pred_last[ref];break;
+ case 1:
+ case 3:pred=frags[fragi-1].dc;break;
+ case 2:pred=u_frags[fragi-1].dc;break;
+ case 4:
+ case 6:
+ case 12:pred=u_frags[fragi].dc;break;
+ case 5:pred=(frags[fragi-1].dc+u_frags[fragi].dc)/2;break;
+ case 8:pred=u_frags[fragi+1].dc;break;
+ case 9:
+ case 11:
+ case 13:{
+ pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128;
+ }break;
+ case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break;
+ case 14:{
+ pred=(3*(u_frags[fragi-1].dc+u_frags[fragi+1].dc)
+ +10*u_frags[fragi].dc)/16;
+ }break;
+ case 7:
+ case 15:{
+ int p0;
+ int p1;
+ int p2;
+ p0=frags[fragi-1].dc;
+ p1=u_frags[fragi-1].dc;
+ p2=u_frags[fragi].dc;
+ pred=(29*(p0+p2)-26*p1)/32;
+ if(abs(pred-p2)>128)pred=p2;
+ else if(abs(pred-p0)>128)pred=p0;
+ else if(abs(pred-p1)>128)pred=p1;
+ }break;
+ }
+ pred_last[ref]=frags[fragi].dc+=pred;
+ ncoded_fragis++;
+ l_ref=ref;
+ }
+ else l_ref=-1;
+ ul_ref=u_ref;
+ u_ref=ur_ref;
+ }
+ }
}
_pipe->ncoded_fragis[_pli]=ncoded_fragis;
/*Also save the number of uncoded fragments so we know how many to copy.*/
Modified: branches/theora-thusnelda/lib/dec/fragment.c
===================================================================
--- branches/theora-thusnelda/lib/dec/fragment.c 2009-07-24 07:26:18 UTC (rev 16331)
+++ branches/theora-thusnelda/lib/dec/fragment.c 2009-07-24 10:24:28 UTC (rev 16332)
@@ -85,121 +85,3 @@
}
void oc_restore_fpu_c(void){}
-
-
-/*Computes the predicted DC value for the given fragment.
- This requires that the fully decoded DC values be available for the left,
- upper-left, upper, and upper-right fragments (if they exist).
- _frag: The fragment to predict the DC value for.
- _fplane: The fragment plane the fragment belongs to.
- _x: The x-coordinate of the fragment.
- _y: The y-coordinate of the fragment.
- _pred_last: The last fully-decoded DC value for each predictor frame
- (OC_FRAME_GOLD, OC_FRAME_PREV and OC_FRAME_SELF).
- This should be initialized to 0's for the first fragment in each
- color plane.
- Return: The predicted DC value for this fragment.*/
-int oc_frag_pred_dc(const oc_fragment *_frag,
- const oc_fragment_plane *_fplane,int _x,int _y,int _pred_last[3]){
- static const signed char PRED_SCALE[16][4]={
- /*0*/
- {0,0,0,0},
- /*OC_PL*/
- {1,0,0,0},
- /*OC_PUL*/
- {1,0,0,0},
- /*OC_PL|OC_PUL*/
- {1,0,0,0},
- /*OC_PU*/
- {1,0,0,0},
- /*OC_PL|OC_PU*/
- {1,1,0,0},
- /*OC_PUL|OC_PU*/
- {0,1,0,0},
- /*OC_PL|OC_PUL|PC_PU*/
- {29,-26,29,0},
- /*OC_PUR*/
- {1,0,0,0},
- /*OC_PL|OC_PUR*/
- {75,53,0,0},
- /*OC_PUL|OC_PUR*/
- {1,1,0,0},
- /*OC_PL|OC_PUL|OC_PUR*/
- {75,0,53,0},
- /*OC_PU|OC_PUR*/
- {1,0,0,0},
- /*OC_PL|OC_PU|OC_PUR*/
- {75,0,53,0},
- /*OC_PUL|OC_PU|OC_PUR*/
- {3,10,3,0},
- /*OC_PL|OC_PUL|OC_PU|OC_PUR*/
- {29,-26,29,0}
- };
- static const unsigned char PRED_SHIFT[16]={0,0,0,0,0,1,0,5,0,7,1,7,0,7,4,5};
- static const unsigned char PRED_RMASK[16]={
- 0,0,0,0,0,1,0,31,0,127,1,127,0,127,15,31
- };
- static const unsigned char BC_MASK[8]={
- /*No boundary condition.*/
- OC_PL|OC_PUL|OC_PU|OC_PUR,
- /*Left column.*/
- OC_PU|OC_PUR,
- /*Top row.*/
- OC_PL,
- /*Top row, left column.*/
- 0,
- /*Right column.*/
- OC_PL|OC_PUL|OC_PU,
- /*Right and left column.*/
- OC_PU,
- /*Top row, right column.*/
- OC_PL,
- /*Top row, right and left column.*/
- 0
- };
- /*Predictor fragments, left, up-left, up, up-right.*/
- const oc_fragment *predfr[4];
- /*The frame used for prediction for this fragment.*/
- int pred_frame;
- /*The boundary condition flags.*/
- int bc;
- /*DC predictor values: left, up-left, up, up-right, missing values skipped.*/
- int p[4];
- /*Predictor count.*/
- int np;
- /*Which predictor constants to use.*/
- int pflags;
- /*The predicted DC value.*/
- int ret;
- int i;
- pred_frame=OC_FRAME_FOR_MODE[_frag->mb_mode];
- bc=(_x==0)+((_y==0)<<1)+((_x+1==_fplane->nhfrags)<<2);
- predfr[0]=_frag-1;
- predfr[1]=_frag-_fplane->nhfrags-1;
- predfr[2]=predfr[1]+1;
- predfr[3]=predfr[2]+1;
- np=0;
- pflags=0;
- for(i=0;i<4;i++){
- int pflag;
- pflag=1<<i;
- if((BC_MASK[bc]&pflag)&&predfr[i]->coded&&
- OC_FRAME_FOR_MODE[predfr[i]->mb_mode]==pred_frame){
- p[np++]=predfr[i]->dc;
- pflags|=pflag;
- }
- }
- if(pflags==0)return _pred_last[pred_frame];
- else{
- ret=PRED_SCALE[pflags][0]*p[0];
- /*LOOP VECTORIZES.*/
- for(i=1;i<np;i++)ret+=PRED_SCALE[pflags][i]*p[i];
- ret=OC_DIV_POW2(ret,PRED_SHIFT[pflags],PRED_RMASK[pflags]);
- }
- if((pflags&(OC_PL|OC_PUL|OC_PU))==(OC_PL|OC_PUL|OC_PU)){
- if(abs(ret-p[2])>128)ret=p[2];
- else if(abs(ret-p[0])>128)ret=p[0];
- else if(abs(ret-p[1])>128)ret=p[1];
- }
- return ret;
-}
Modified: branches/theora-thusnelda/lib/dec/internal.c
===================================================================
--- branches/theora-thusnelda/lib/dec/internal.c 2009-07-24 07:26:18 UTC (rev 16331)
+++ branches/theora-thusnelda/lib/dec/internal.c 2009-07-24 10:24:28 UTC (rev 16332)
@@ -50,26 +50,6 @@
35,36,48,49,57,58,62,63
};
-/*The predictor frame to use for each macro block mode.*/
-const unsigned char OC_FRAME_FOR_MODE[8]={
- /*OC_MODE_INTER_NOMV*/
- OC_FRAME_PREV,
- /*OC_MODE_INTRA*/
- OC_FRAME_SELF,
- /*OC_MODE_INTER_MV*/
- OC_FRAME_PREV,
- /*OC_MODE_INTER_MV_LAST*/
- OC_FRAME_PREV,
- /*OC_MODE_INTER_MV_LAST2*/
- OC_FRAME_PREV,
- /*OC_MODE_GOLDEN*/
- OC_FRAME_GOLD,
- /*OC_MODE_GOLDEN_MV*/
- OC_FRAME_GOLD,
- /*OC_MODE_INTER_MV_FOUR*/
- OC_FRAME_PREV,
-};
-
/*A map from physical macro block ordering to bitstream macro block
ordering within a super block.*/
const unsigned char OC_MB_MAP[2][2]={{0,3},{1,2}};
Modified: branches/theora-thusnelda/lib/dec/state.c
===================================================================
--- branches/theora-thusnelda/lib/dec/state.c 2009-07-24 07:26:18 UTC (rev 16331)
+++ branches/theora-thusnelda/lib/dec/state.c 2009-07-24 10:24:28 UTC (rev 16332)
@@ -881,7 +881,7 @@
const unsigned char *ref;
int mvoffsets[2];
ref=
- _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE[mb_mode]]]
+ _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
+frag_buf_off;
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
_state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
Modified: branches/theora-thusnelda/lib/dec/x86/mmxstate.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/mmxstate.c 2009-07-24 07:26:18 UTC (rev 16331)
+++ branches/theora-thusnelda/lib/dec/x86/mmxstate.c 2009-07-24 10:24:28 UTC (rev 16332)
@@ -45,7 +45,7 @@
const unsigned char *ref;
int mvoffsets[2];
ref=
- _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE[mb_mode]]]
+ _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
+frag_buf_off;
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
_state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
Modified: branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c 2009-07-24 07:26:18 UTC (rev 16331)
+++ branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c 2009-07-24 10:24:28 UTC (rev 16332)
@@ -45,7 +45,7 @@
const unsigned char *ref;
int mvoffsets[2];
ref=
- _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE[mb_mode]]]
+ _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
+frag_buf_off;
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
_state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
Modified: branches/theora-thusnelda/lib/enc/analyze.c
===================================================================
--- branches/theora-thusnelda/lib/enc/analyze.c 2009-07-24 07:26:18 UTC (rev 16331)
+++ branches/theora-thusnelda/lib/enc/analyze.c 2009-07-24 10:24:28 UTC (rev 16332)
@@ -686,7 +686,7 @@
}
mb_mode=frags[_fragi].mb_mode;
ref=_enc->state.ref_frame_data[
- _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE[mb_mode]]]+frag_offs;
+ _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]+frag_offs;
dst=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_SELF]]
+frag_offs;
/*Motion compensation:*/
@@ -1655,7 +1655,7 @@
ptrdiff_t frag_offs;
src=_enc->state.ref_frame_data[OC_FRAME_IO];
ref=_enc->state.ref_frame_data[
- _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE[_mb_mode]]];
+ _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(_mb_mode)]];
ystride=_enc->state.ref_ystride[0];
frag_buf_offs=_enc->state.frag_buf_offs;
sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
Modified: branches/theora-thusnelda/lib/enc/tokenize.c
===================================================================
--- branches/theora-thusnelda/lib/enc/tokenize.c 2009-07-24 07:26:18 UTC (rev 16331)
+++ branches/theora-thusnelda/lib/enc/tokenize.c 2009-07-24 10:24:28 UTC (rev 16332)
@@ -680,13 +680,88 @@
nhfrags=fplane->nhfrags;
fragi=fplane->froffset+_fragy0*nhfrags;
for(fragy=_fragy0;fragy<_frag_yend;fragy++){
- for(fragx=0;fragx<nhfrags;fragx++,fragi++){
- if(frags[fragi].coded){
- frag_dc[fragi]=frags[fragi].dc
- -oc_frag_pred_dc(frags+fragi,fplane,fragx,fragy,pred_last);
- pred_last[OC_FRAME_FOR_MODE[frags[fragi].mb_mode]]=frags[fragi].dc;
+ if(fragy==0){
+ /*For the first row, all of the cases reduce to just using the previous
+ predictor for the same reference frame.*/
+ for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+ if(frags[fragi].coded){
+ int ref;
+ ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+ frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred_last[ref]);
+ pred_last[ref]=frags[fragi].dc;
+ }
}
}
+ else{
+ const oc_fragment *u_frags;
+ int l_ref;
+ int ul_ref;
+ int u_ref;
+ u_frags=frags-nhfrags;
+ l_ref=-1;
+ ul_ref=-1;
+ u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
+ for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+ int ur_ref;
+ if(fragx+1>=nhfrags)ur_ref=-1;
+ else{
+ ur_ref=u_frags[fragi+1].coded?
+ OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
+ }
+ if(frags[fragi].coded){
+ int pred;
+ int ref;
+ ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+ /*We break out a separate case based on which of our neighbors use
+ the same reference frames.
+ This is somewhat faster than trying to make a generic case which
+ handles all of them, since it reduces lots of poorly predicted
+ jumps to one switch statement, and also lets a number of the
+ multiplications be optimized out by strength reduction.*/
+ switch((l_ref==ref)|(ul_ref==ref)<<1|
+ (u_ref==ref)<<2|(ur_ref==ref)<<3){
+ default:pred=pred_last[ref];break;
+ case 1:
+ case 3:pred=frags[fragi-1].dc;break;
+ case 2:pred=u_frags[fragi-1].dc;break;
+ case 4:
+ case 6:
+ case 12:pred=u_frags[fragi].dc;break;
+ case 5:pred=(frags[fragi-1].dc+u_frags[fragi].dc)/2;break;
+ case 8:pred=u_frags[fragi+1].dc;break;
+ case 9:
+ case 11:
+ case 13:{
+ pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128;
+ }break;
+ case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break;
+ case 14:{
+ pred=(3*(u_frags[fragi-1].dc+u_frags[fragi+1].dc)
+ +10*u_frags[fragi].dc)/16;
+ }break;
+ case 7:
+ case 15:{
+ int p0;
+ int p1;
+ int p2;
+ p0=frags[fragi-1].dc;
+ p1=u_frags[fragi-1].dc;
+ p2=u_frags[fragi].dc;
+ pred=(29*(p0+p2)-26*p1)/32;
+ if(abs(pred-p2)>128)pred=p2;
+ else if(abs(pred-p0)>128)pred=p0;
+ else if(abs(pred-p1)>128)pred=p1;
+ }break;
+ }
+ frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred);
+ pred_last[ref]=frags[fragi].dc;
+ l_ref=ref;
+ }
+ else l_ref=-1;
+ ul_ref=u_ref;
+ u_ref=ur_ref;
+ }
+ }
}
}
Modified: branches/theora-thusnelda/lib/internal.h
===================================================================
--- branches/theora-thusnelda/lib/internal.h 2009-07-24 07:26:18 UTC (rev 16331)
+++ branches/theora-thusnelda/lib/internal.h 2009-07-24 10:24:28 UTC (rev 16332)
@@ -134,19 +134,11 @@
/*The number of (coded) modes.*/
#define OC_NMODES (8)
-/*Macro block is not coded.*/
-#define OC_MODE_NOT_CODED (8)
+/*Determines the reference frame used for a given MB mode.*/
+#define OC_FRAME_FOR_MODE(_x) \
+ OC_UNIBBLE_TABLE32(OC_FRAME_PREV,OC_FRAME_SELF,OC_FRAME_PREV,OC_FRAME_PREV, \
+ OC_FRAME_PREV,OC_FRAME_GOLD,OC_FRAME_GOLD,OC_FRAME_PREV,(_x))
-/*Predictor bit flags.*/
-/*Left.*/
-#define OC_PL (1)
-/*Upper-left.*/
-#define OC_PUL (2)
-/*Up.*/
-#define OC_PU (4)
-/*Upper-right.*/
-#define OC_PUR (8)
-
/*Constants for the packet state machine common between encoder and decoder.*/
/*Next packet to emit/read: Codec info header.*/
@@ -403,8 +395,6 @@
/*A map from the coefficient number in a block to its index in the zig zag
scan.*/
extern const unsigned char OC_IZIG_ZAG[64];
-/*The predictor frame to use for each macro block mode.*/
-extern const unsigned char OC_FRAME_FOR_MODE[OC_NMODES];
/*A map from physical macro block ordering to bitstream macro block
ordering within a super block.*/
extern const unsigned char OC_MB_MAP[2][2];
@@ -431,9 +421,6 @@
ptrdiff_t oc_dct_token_skip(int _token,int _extra_bits);
-int oc_frag_pred_dc(const oc_fragment *_frag,
- const oc_fragment_plane *_fplane,int _x,int _y,int _pred_last[3]);
-
int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs);
void oc_state_clear(oc_theora_state *_state);
void oc_state_vtable_init_c(oc_theora_state *_state);
More information about the commits
mailing list