[xiph-commits] r16325 - in branches/theora-gumboot/lib: . dec dec/x86 dec/x86_vc enc
gumboot at svn.xiph.org
gumboot at svn.xiph.org
Thu Jul 23 05:50:50 PDT 2009
Author: gumboot
Date: 2009-07-23 05:50:50 -0700 (Thu, 23 Jul 2009)
New Revision: 16325
Modified:
branches/theora-gumboot/lib/dec/decode.c
branches/theora-gumboot/lib/dec/state.c
branches/theora-gumboot/lib/dec/x86/mmxidct.c
branches/theora-gumboot/lib/dec/x86/x86state.c
branches/theora-gumboot/lib/dec/x86_vc/mmxidct.c
branches/theora-gumboot/lib/dec/x86_vc/x86state.c
branches/theora-gumboot/lib/enc/
branches/theora-gumboot/lib/enc/analyze.c
branches/theora-gumboot/lib/enc/tokenize.c
branches/theora-gumboot/lib/internal.h
Log:
Drop the zzbuffer encoder optimisation, because there's no visible benefit. Move OC_FZIG_ZAG_MMX to x86state.c and expose it alongside the vtable stuff (I think this slows it down marginally, but it's more presentable). Integrate change 16324.
Modified: branches/theora-gumboot/lib/dec/decode.c
===================================================================
--- branches/theora-gumboot/lib/dec/decode.c 2009-07-23 04:01:29 UTC (rev 16324)
+++ branches/theora-gumboot/lib/dec/decode.c 2009-07-23 12:50:50 UTC (rev 16325)
@@ -1319,29 +1319,6 @@
(fragy_end-fragy0)*(ptrdiff_t)nhfrags-ncoded_fragis;
}
-#if defined(OC_X86_ASM)
-/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
- each quadrant of the destination.*/
-static const unsigned char OC_FZIG_ZAG_MMX[128]={
- 0, 8, 1, 2, 9,16,24,17,
- 10, 3,32,11,18,25, 4,12,
- 5,26,19,40,33,34,41,48,
- 27, 6,13,20,28,21,14, 7,
- 56,49,42,35,43,50,57,36,
- 15,22,29,30,23,44,37,58,
- 51,59,38,45,52,31,60,53,
- 46,39,47,54,61,62,55,63,
- 64,64,64,64,64,64,64,64,
- 64,64,64,64,64,64,64,64,
- 64,64,64,64,64,64,64,64,
- 64,64,64,64,64,64,64,64,
- 64,64,64,64,64,64,64,64,
- 64,64,64,64,64,64,64,64,
- 64,64,64,64,64,64,64,64,
- 64,64,64,64,64,64,64,64,
-};
-#endif
-
/*Reconstructs all coded fragments in a single MCU (one or two super block
rows).
This requires that each coded fragment have a proper macro block mode and
@@ -1354,6 +1331,7 @@
static void oc_dec_frags_recon_mcu_plane(oc_dec_ctx *_dec,
oc_dec_pipeline_state *_pipe,int _pli){
unsigned char *dct_tokens;
+ const unsigned char *dct_fzig_zag;
ogg_uint16_t dc_quant[2];
const oc_fragment *frags;
const ptrdiff_t *coded_fragis;
@@ -1363,6 +1341,7 @@
ptrdiff_t *eob_runs;
int qti;
dct_tokens=_dec->dct_tokens;
+ dct_fzig_zag=_dec->state.opt_data.dct_fzig_zag;
frags=_dec->state.frags;
coded_fragis=_pipe->coded_fragis[_pli];
ncoded_fragis=_pipe->ncoded_fragis[_pli];
@@ -1418,11 +1397,7 @@
eob_runs[zzi]=eob;
ti[zzi]=lti;
zzi+=rlen;
-#if defined(OC_X86_ASM)
- dct_coeffs[OC_FZIG_ZAG_MMX[zzi]]=(ogg_int16_t)(coeff*(int)ac_quant[zzi]);
-#else
- dct_coeffs[OC_FZIG_ZAG[zzi]]=(ogg_int16_t)(coeff*(int)ac_quant[zzi]);
-#endif
+ dct_coeffs[dct_fzig_zag[zzi]]=(ogg_int16_t)(coeff*(int)ac_quant[zzi]);
zzi+=(eob==0);
}
}
Modified: branches/theora-gumboot/lib/dec/state.c
===================================================================
--- branches/theora-gumboot/lib/dec/state.c 2009-07-23 04:01:29 UTC (rev 16324)
+++ branches/theora-gumboot/lib/dec/state.c 2009-07-23 12:50:50 UTC (rev 16325)
@@ -591,6 +591,7 @@
_state->opt_vtable.state_loop_filter_frag_rows=
oc_state_loop_filter_frag_rows_c;
_state->opt_vtable.restore_fpu=oc_restore_fpu_c;
+ _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG;
}
/*Initialize the accelerated function pointers.*/
Modified: branches/theora-gumboot/lib/dec/x86/mmxidct.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86/mmxidct.c 2009-07-23 04:01:29 UTC (rev 16324)
+++ branches/theora-gumboot/lib/dec/x86/mmxidct.c 2009-07-23 12:50:50 UTC (rev 16325)
@@ -528,19 +528,6 @@
);
}
-/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
- each quadrant of the destination.*/
-static const unsigned char OC_FZIG_ZAG_MMX[64]={
- 0, 8, 1, 2, 9,16,24,17,
- 10, 3,32,11,18,25, 4,12,
- 5,26,19,40,33,34,41,48,
- 27, 6,13,20,28,21,14, 7,
- 56,49,42,35,43,50,57,36,
- 15,22,29,30,23,44,37,58,
- 51,59,38,45,52,31,60,53,
- 46,39,47,54,61,62,55,63
-};
-
/*Performs an inverse 8x8 Type-II DCT transform.
The input is assumed to be scaled by a factor of 4 relative to orthonormal
version of the transform.*/
Modified: branches/theora-gumboot/lib/dec/x86/x86state.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86/x86state.c 2009-07-23 04:01:29 UTC (rev 16324)
+++ branches/theora-gumboot/lib/dec/x86/x86state.c 2009-07-23 12:50:50 UTC (rev 16325)
@@ -21,6 +21,27 @@
#include "../../cpu.c"
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+ each quadrant of the destination.*/
+static const unsigned char OC_FZIG_ZAG_MMX[128]={
+ 0, 8, 1, 2, 9,16,24,17,
+ 10, 3,32,11,18,25, 4,12,
+ 5,26,19,40,33,34,41,48,
+ 27, 6,13,20,28,21,14, 7,
+ 56,49,42,35,43,50,57,36,
+ 15,22,29,30,23,44,37,58,
+ 51,59,38,45,52,31,60,53,
+ 46,39,47,54,61,62,55,63,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+};
+
void oc_state_vtable_init_x86(oc_theora_state *_state){
_state->cpu_flags=oc_cpu_flags_get();
if(_state->cpu_flags&OC_CPU_X86_MMX){
@@ -34,6 +55,7 @@
_state->opt_vtable.state_loop_filter_frag_rows=
oc_state_loop_filter_frag_rows_mmx;
_state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+ _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
}
else oc_state_vtable_init_c(_state);
}
Modified: branches/theora-gumboot/lib/dec/x86_vc/mmxidct.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86_vc/mmxidct.c 2009-07-23 04:01:29 UTC (rev 16324)
+++ branches/theora-gumboot/lib/dec/x86_vc/mmxidct.c 2009-07-23 12:50:50 UTC (rev 16325)
@@ -526,19 +526,6 @@
}
}
-/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
- each quadrant of the destination.*/
-static const unsigned char OC_FZIG_ZAG_MMX[64]={
- 0, 8, 1, 2, 9,16,24,17,
- 10, 3,32,11,18,25, 4,12,
- 5,26,19,40,33,34,41,48,
- 27, 6,13,20,28,21,14, 7,
- 56,49,42,35,43,50,57,36,
- 15,22,29,30,23,44,37,58,
- 51,59,38,45,52,31,60,53,
- 46,39,47,54,61,62,55,63
-};
-
/*Performs an inverse 8x8 Type-II DCT transform.
The input is assumed to be scaled by a factor of 4 relative to orthonormal
version of the transform.*/
Modified: branches/theora-gumboot/lib/dec/x86_vc/x86state.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86_vc/x86state.c 2009-07-23 04:01:29 UTC (rev 16324)
+++ branches/theora-gumboot/lib/dec/x86_vc/x86state.c 2009-07-23 12:50:50 UTC (rev 16325)
@@ -21,6 +21,27 @@
#include "../../cpu.c"
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+ each quadrant of the destination.*/
+static const unsigned char OC_FZIG_ZAG_MMX[128]={
+ 0, 8, 1, 2, 9,16,24,17,
+ 10, 3,32,11,18,25, 4,12,
+ 5,26,19,40,33,34,41,48,
+ 27, 6,13,20,28,21,14, 7,
+ 56,49,42,35,43,50,57,36,
+ 15,22,29,30,23,44,37,58,
+ 51,59,38,45,52,31,60,53,
+ 46,39,47,54,61,62,55,63,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+};
+
void oc_state_vtable_init_x86(oc_theora_state *_state){
_state->cpu_flags=oc_cpu_flags_get();
if(_state->cpu_flags&OC_CPU_X86_MMX){
@@ -34,6 +55,7 @@
_state->opt_vtable.state_loop_filter_frag_rows=
oc_state_loop_filter_frag_rows_mmx;
_state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+ _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
}
else oc_state_vtable_init_c(_state);
}
Property changes on: branches/theora-gumboot/lib/enc
___________________________________________________________________
Modified: svn:mergeinfo
- /branches/theora-thusnelda/lib/enc:16321
+ /branches/theora-thusnelda/lib/enc:16321,16323-16324
Modified: branches/theora-gumboot/lib/enc/analyze.c
===================================================================
--- branches/theora-gumboot/lib/enc/analyze.c 2009-07-23 04:01:29 UTC (rev 16324)
+++ branches/theora-gumboot/lib/enc/analyze.c 2009-07-23 12:50:50 UTC (rev 16325)
@@ -636,8 +636,7 @@
static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,int _overhead_bits,
oc_rd_metric *_mo,oc_token_checkpoint **_stack){
- OC_ALIGN16(ogg_int16_t buffer[64]);
- OC_ALIGN16(ogg_int16_t zzbuffer[64]);
+ OC_ALIGN16(ogg_int16_t dct[64]);
OC_ALIGN16(ogg_int16_t data[64]);
ogg_uint16_t dc_dequant;
const ogg_uint16_t *dequant;
@@ -675,15 +674,16 @@
borderi=frags[_fragi].borderi;
qii=frags[_fragi].qii;
if(qii&~3){
-#if 1
- /*Enable early skip detection.*/
- frags[_fragi].coded=0;
- return 0;
-#else
- /*Try and code the fragment anyway.*/
- qii&=3;
- frags[_fragi].qii=qii;
-#endif
+ if(!_pli){
+ /*Enable early skip detection only for luma blocks.*/
+ frags[_fragi].coded=0;
+ return 0;
+ }
+ else{
+ /*Try and code chroma blocks anyway.*/
+ qii&=3;
+ frags[_fragi].qii=qii;
+ }
}
mb_mode=frags[_fragi].mb_mode;
ref=_enc->state.ref_frame_data[
@@ -731,25 +731,25 @@
}
#endif
/*Transform:*/
- oc_enc_fdct8x8(_enc,buffer,data);
+ oc_enc_fdct8x8(_enc,dct,data);
/*Quantize the DC coefficient:*/
qti=mb_mode!=OC_MODE_INTRA;
enquant=_pipe->enquant[_pli][0][qti];
dc_dequant=_pipe->dequant[_pli][0][qti][0];
- v=buffer[0];
+ v=dct[0];
val=v<<1;
s=OC_SIGNMASK(val);
val+=dc_dequant+s^s;
val=((enquant[0].m*(ogg_int32_t)val>>16)+val>>enquant[0].l)-s;
- data[0]=OC_CLAMPI(-580,val,580);
+ dc=OC_CLAMPI(-580,val,580);
+ data[0]=dc;
nonzero=0;
/*Quantize the AC coefficients:*/
dequant=_pipe->dequant[_pli][qii][qti];
enquant=_pipe->enquant[_pli][qii][qti];
for(zzi=1;zzi<64;zzi++){
- v=buffer[OC_FZIG_ZAG[zzi]];
+ v=dct[OC_FZIG_ZAG[zzi]];
d=dequant[zzi];
- zzbuffer[zzi]=v;
val=v<<1;
v=abs(val);
if(v>=d){
@@ -768,9 +768,8 @@
}
/*Tokenize.*/
checkpoint=*_stack;
- ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,zzbuffer,nonzero+1,
+ ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
_stack,qti?0:3);
- dc=data[0];
/*Reconstruct.
TODO: nonzero may need to be adjusted after tokenization.*/
if(nonzero==0){
@@ -778,12 +777,12 @@
int ci;
/*We round this dequant product (and not any of the others) because there's
no iDCT rounding.*/
- p=(ogg_int16_t)(data[0]*(ogg_int32_t)dc_dequant+15>>5);
+ p=(ogg_int16_t)(dc*(ogg_int32_t)dc_dequant+15>>5);
/*LOOP VECTORIZES.*/
for(ci=0;ci<64;ci++)data[ci]=p;
}
else{
- data[0]*=dc_dequant;
+ data[0]=dc*dc_dequant;
oc_idct8x8(&_enc->state,data,nonzero+1,nonzero+1);
}
if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
Modified: branches/theora-gumboot/lib/enc/tokenize.c
===================================================================
--- branches/theora-gumboot/lib/enc/tokenize.c 2009-07-23 04:01:29 UTC (rev 16324)
+++ branches/theora-gumboot/lib/enc/tokenize.c 2009-07-23 12:50:50 UTC (rev 16325)
@@ -208,21 +208,6 @@
int qc;
};
-#if defined(OC_X86_ASM)
-/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
- each quadrant of the destination.*/
-static const unsigned char OC_FZIG_ZAG_MMX[64]={
- 0, 8, 1, 2, 9,16,24,17,
- 10, 3,32,11,18,25, 4,12,
- 5,26,19,40,33,34,41,48,
- 27, 6,13,20,28,21,14, 7,
- 56,49,42,35,43,50,57,36,
- 15,22,29,30,23,44,37,58,
- 51,59,38,45,52,31,60,53,
- 46,39,47,54,61,62,55,63,
-};
-#endif
-
int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
int _zzi,oc_token_checkpoint **_stack,int _acmin){
@@ -233,6 +218,7 @@
ogg_uint32_t d2_accum[64];
oc_quant_token tokens[64][2];
ogg_uint16_t *eob_run;
+ const unsigned char *dct_fzig_zag;
ogg_uint32_t cost;
int bits;
int eob;
@@ -246,6 +232,7 @@
int qc;
huffi=_enc->huff_idxs[_enc->state.frame_type][1][_pli+1>>1];
eob_run=_enc->eob_run[_pli];
+ dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
memset(tokens[0],0,sizeof(tokens[0]));
best_flags=nzflags=0;
zflags=1;
@@ -270,7 +257,7 @@
qc=_qdct[zzi];
s=-(qc<0);
qc=qc+s^s;
- c=_dct[zzi];
+ c=_dct[OC_FZIG_ZAG[zzi]];
if(qc<=1){
ogg_uint32_t sum_d2;
int nzeros;
@@ -344,7 +331,7 @@
token=OC_DCT_RUN_CAT1B+cat;
eb=(-val_s<<cat+2)+nzeros-6-(cat<<2);
}
- e=(_dct[zzj]+val_s^val_s)-_dequant[zzj];
+ e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj];
d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
cost=d2+lambda*bits+tokens[zzk][tk].cost;
@@ -363,7 +350,7 @@
token=OC_DCT_RUN_CAT2A+cat;
bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
val=2+((val+val_s^val_s)>2);
- e=(_dct[zzj]+val_s^val_s)-_dequant[zzj]*val;
+ e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj]*val;
d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
cost=d2+lambda*bits+tokens[zzk][tk].cost;
if(cost<=best_cost){
@@ -670,11 +657,7 @@
next=tokens[zzi][ti].next;
qc=tokens[zzi][ti].qc;
zzj=(next>>1)-1&63;
-#if defined(OC_X86_ASM)
- _qdct[OC_FZIG_ZAG_MMX[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
-#else
- _qdct[OC_FZIG_ZAG[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
-#endif
+ _qdct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
zzi=next>>1;
ti=next&1;
}
Modified: branches/theora-gumboot/lib/internal.h
===================================================================
--- branches/theora-gumboot/lib/internal.h 2009-07-23 04:01:29 UTC (rev 16324)
+++ branches/theora-gumboot/lib/internal.h 2009-07-23 12:50:50 UTC (rev 16325)
@@ -65,6 +65,7 @@
typedef struct oc_fragment oc_fragment;
typedef struct oc_fragment_plane oc_fragment_plane;
typedef struct oc_base_opt_vtable oc_base_opt_vtable;
+typedef struct oc_base_opt_data oc_base_opt_data;
typedef struct oc_state_dispatch_vtable oc_state_dispatch_vtable;
typedef struct oc_theora_state oc_theora_state;
@@ -297,6 +298,11 @@
void (*restore_fpu)(void);
};
+/*The shared (encoder and decoder) tables that vary according to which variants
+ of the above functions are used.*/
+struct oc_base_opt_data{
+ const unsigned char *dct_fzig_zag;
+};
/*State information common to both the encoder and decoder.*/
@@ -305,6 +311,8 @@
th_info info;
/*Table for shared accelerated functions.*/
oc_base_opt_vtable opt_vtable;
+ /*Table for shared data used by accelerated functions.*/
+ oc_base_opt_data opt_data;
/*CPU flags to detect the presence of extended instruction sets.*/
ogg_uint32_t cpu_flags;
/*The fragment plane descriptions.*/
More information about the commits
mailing list