[Theora-dev] [PATCH] remove some FZIGZAG

Rudolf Marek r.marek at sh.cvut.cz
Sat Aug 20 02:54:03 PDT 2005


Hello,

As we discussed with derf some time ago, it seems it is not necessary to enforce the "forward" order of dct_coeffs.
This patch gains 0.99366902855226196000%, so approximately a 1% speedup.

Measurement method:
time nice -n -19 ./dump  /mnt/disc4/theora/unix/gripen.ogg > /dev/null
Ogg logical stream 310b2968 is Theora 720x480 29.97 fps video
Encoded frame content is 720x480 with 0x0 offset
12460 frames

This patch is as I had it working. I just want to submit it before I forget about it :)

Derf, please consider applying this to SVN in some form.

Thanks,
Regards
Rudolf
-------------- next part --------------
diff -Naur ../mergeSTATE/test/lib/decode.c test/lib/decode.c
--- ../mergeSTATE/test/lib/decode.c	2005-08-17 09:58:23.000000000 +0200
+++ test/lib/decode.c	2005-08-20 11:19:04.052143250 +0200
@@ -1083,53 +1083,53 @@
                 to.
                This is updated before the function returns.*/
 typedef void (*oc_token_expand_func)(int _token,int _extra_bits,
- ogg_int16_t _dct_coeffs[64],int *_zzi);
+ ogg_int16_t _dct_coeffs[128],int *_zzi);
 
 /*Expands a zero run token.*/
 void oc_token_expand_zrl(int _token,int _extra_bits,
- ogg_int16_t _dct_coeffs[64],int *_zzi){
+ ogg_int16_t _dct_coeffs[128],int *_zzi){
   int zzi;
   zzi=*_zzi;
-  do _dct_coeffs[OC_FZIG_ZAG[zzi++]]=0;
+  do _dct_coeffs[zzi++]=0;
   while(_extra_bits-->0);
   *_zzi=zzi;
 }
 
 /*Expands a constant, single-value token.*/
 void oc_token_expand_const(int _token,int _extra_bits,
- ogg_int16_t _dct_coeffs[64],int *_zzi){
-  _dct_coeffs[OC_FZIG_ZAG[(*_zzi)++]]=
+ ogg_int16_t _dct_coeffs[128],int *_zzi){
+  _dct_coeffs[(*_zzi)++]=
    (ogg_int16_t)oc_token_dec1val_const(_token);
 }
 
 /*Expands category 2 single-valued tokens.*/
 void oc_token_expand_cat2(int _token,int _extra_bits,
- ogg_int16_t _dct_coeffs[64],int *_zzi){
-  _dct_coeffs[OC_FZIG_ZAG[(*_zzi)++]]=
+ ogg_int16_t _dct_coeffs[128],int *_zzi){
+  _dct_coeffs[(*_zzi)++]=
    (ogg_int16_t)oc_token_dec1val_cat2(_token,_extra_bits);
 }
 
 /*Expands category 3 through 8 single-valued tokens.*/
 void oc_token_expand_cati(int _token,int _extra_bits,
- ogg_int16_t _dct_coeffs[64],int *_zzi){
-  _dct_coeffs[OC_FZIG_ZAG[(*_zzi)++]]=
+ ogg_int16_t _dct_coeffs[128],int *_zzi){
+  _dct_coeffs[(*_zzi)++]=
    (ogg_int16_t)oc_token_dec1val_cati(_token,_extra_bits);
 }
 
 /*Expands a category 1a zero run/value combo token.*/
 void oc_token_expand_run_cat1a(int _token,int _extra_bits,
- ogg_int16_t _dct_coeffs[64],int *_zzi){
+ ogg_int16_t _dct_coeffs[128],int *_zzi){
   int zzi;
   int rl;
   zzi=*_zzi;
-  for(rl=_token-OC_DCT_RUN_CAT1A+1;rl-->0;)_dct_coeffs[OC_FZIG_ZAG[zzi++]]=0;
-  _dct_coeffs[OC_FZIG_ZAG[zzi++]]=(ogg_int16_t)(1-(_extra_bits<<1));
+  for(rl=_token-OC_DCT_RUN_CAT1A+1;rl-->0;)_dct_coeffs[zzi++]=0;
+  _dct_coeffs[zzi++]=(ogg_int16_t)(1-(_extra_bits<<1));
   *_zzi=zzi;
 }
 
 /*Expands all other zero run/value combo tokens.*/
 void oc_token_expand_run(int _token,int _extra_bits,
- ogg_int16_t _dct_coeffs[64],int *_zzi){
+ ogg_int16_t _dct_coeffs[128],int *_zzi){
   static const int NZEROS_ADJUST[OC_NDCT_RUN_MAX-OC_DCT_RUN_CAT1B]={
     6,10,1,2
   };
@@ -1154,11 +1154,11 @@
   _token-=OC_DCT_RUN_CAT1B;
   rl=(_extra_bits&NZEROS_MASK[_token])+NZEROS_ADJUST[_token];
   zzi=*_zzi;
-  while(rl-->0)_dct_coeffs[OC_FZIG_ZAG[zzi++]]=0;
+  while(rl-->0)_dct_coeffs[zzi++]=0;
   valsigned[0]=VALUE_ADJUST[_token]+
    (_extra_bits>>VALUE_SHIFT[_token]&VALUE_MASK[_token]);
   valsigned[1]=-valsigned[0];
-  _dct_coeffs[OC_FZIG_ZAG[zzi++]]=(ogg_int16_t)valsigned[
+  _dct_coeffs[zzi++]=(ogg_int16_t)valsigned[
    _extra_bits>>SIGN_SHIFT[_token]];
   *_zzi=zzi;
 }
diff -Naur ../mergeSTATE/test/lib/internal.h test/lib/internal.h
--- ../mergeSTATE/test/lib/internal.h	2005-08-17 10:05:34.000000000 +0200
+++ test/lib/internal.h	2005-08-20 11:39:38.797310000 +0200
@@ -239,7 +239,7 @@
   void (*state_frag_copy)(const oc_theora_state *_state,
    const int *_fragis,int _nfragis,int _dst_frame,int _src_frame,int _pli);
   void (*state_frag_recon)(oc_theora_state *_state,const oc_fragment *_frag,
-   int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+   int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
    ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
   void (*restore_fpu)(void);
   void (*oc_state_loop_filter_frag_rows)(oc_theora_state *_state,int *_bv,
@@ -409,7 +409,7 @@
 void oc_state_frag_copy(const oc_theora_state *_state,const int *_fragis,
  int _nfragis,int _dst_frame,int _src_frame,int _pli);
 void oc_state_frag_recon(oc_theora_state *_state,const oc_fragment *_frag,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
 void oc_restore_fpu(const oc_theora_state *_state);
 void oc_state_loop_filter_frag_rows(oc_theora_state *_state,int *_bv,
@@ -426,7 +426,7 @@
 void oc_state_frag_copy_c(const oc_theora_state *_state,const int *_fragis,
  int _nfragis,int _dst_frame,int _src_frame,int _pli);
 void oc_state_frag_recon_c(oc_theora_state *_state,const oc_fragment *_frag,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
 void oc_restore_fpu_c(void);
 void oc_state_loop_filter_frag_rows_c(oc_theora_state *_state,int *_bv,
diff -Naur ../mergeSTATE/test/lib/state.c test/lib/state.c
--- ../mergeSTATE/test/lib/state.c	2005-08-17 10:29:23.797763500 +0200
+++ test/lib/state.c	2005-08-20 11:22:10.211777500 +0200
@@ -788,14 +788,14 @@
 }
 
 void oc_state_frag_recon(oc_theora_state *_state,const oc_fragment *_frag,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
   _state->opt_vtable.state_frag_recon(_state,_frag,_pli,_dct_coeffs,
    _last_zzi,_ncoefs,_dc_iquant,_ac_iquant);
 }
 
 void oc_state_frag_recon_c(oc_theora_state *_state,const oc_fragment *_frag,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
   ogg_int16_t dct_buf[64];
   ogg_int16_t res_buf[64];
@@ -841,7 +841,7 @@
     for(zzi=1;zzi<_ncoefs;zzi++){
       int ci;
       ci=OC_FZIG_ZAG[zzi];
-      dct_buf[ci]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[ci]*_ac_iquant[ci]);
+      dct_buf[ci]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]*_ac_iquant[ci]);
     }
     /*Then, fill in the remainder of the coefficients with 0's, and perform
        the iDCT.*/
diff -Naur ../mergeSTATE/test/lib/x86/mmxstate.c test/lib/x86/mmxstate.c
--- ../mergeSTATE/test/lib/x86/mmxstate.c	2005-08-17 21:03:14.000000000 +0200
+++ test/lib/x86/mmxstate.c	2005-08-20 11:39:04.899191500 +0200
@@ -29,7 +29,7 @@
 
 
 void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
   ogg_int16_t  __attribute__((aligned(8))) res_buf[64];
   int dst_framei;
@@ -131,7 +131,7 @@
     for(zzi=1;zzi<_ncoefs;zzi++){
       int ci;
       ci=OC_FZIG_ZAG[zzi];
-      res_buf[OC_FZIG_ZAGMMX[zzi]]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[ci]*
+      res_buf[OC_FZIG_ZAGMMX[zzi]]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]*
        _ac_iquant[ci]);
     }
     if(_last_zzi<10){
diff -Naur ../mergeSTATE/test/lib/x86/x86int.h test/lib/x86/x86int.h
--- ../mergeSTATE/test/lib/x86/x86int.h	2005-08-17 10:11:36.000000000 +0200
+++ test/lib/x86/x86int.h	2005-08-20 11:38:53.890503500 +0200
@@ -14,7 +14,7 @@
 void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
  int _nfragis,int _dst_frame,int _src_frame,int _pli);
 void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,                                               
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,                                                             
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,                                                             
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
 void oc_restore_fpu_mmx(void);
 void oc_idct8x8_mmx(ogg_int16_t _y[64]);


More information about the Theora-dev mailing list