[xiph-commits] r16325 - in branches/theora-gumboot/lib: . dec dec/x86 dec/x86_vc enc

gumboot at svn.xiph.org gumboot at svn.xiph.org
Thu Jul 23 05:50:50 PDT 2009


Author: gumboot
Date: 2009-07-23 05:50:50 -0700 (Thu, 23 Jul 2009)
New Revision: 16325

Modified:
   branches/theora-gumboot/lib/dec/decode.c
   branches/theora-gumboot/lib/dec/state.c
   branches/theora-gumboot/lib/dec/x86/mmxidct.c
   branches/theora-gumboot/lib/dec/x86/x86state.c
   branches/theora-gumboot/lib/dec/x86_vc/mmxidct.c
   branches/theora-gumboot/lib/dec/x86_vc/x86state.c
   branches/theora-gumboot/lib/enc/
   branches/theora-gumboot/lib/enc/analyze.c
   branches/theora-gumboot/lib/enc/tokenize.c
   branches/theora-gumboot/lib/internal.h
Log:
Drop zzbuffer encoder optimisation, because there's no visible benefit.  Move OC_FZIG_ZAG_MMX to x86state.c and expose it alongside the vtable stuff (I think this slows it down marginally, but it's more presentable).  Integrate change 16324.



Modified: branches/theora-gumboot/lib/dec/decode.c
===================================================================
--- branches/theora-gumboot/lib/dec/decode.c	2009-07-23 04:01:29 UTC (rev 16324)
+++ branches/theora-gumboot/lib/dec/decode.c	2009-07-23 12:50:50 UTC (rev 16325)
@@ -1319,29 +1319,6 @@
    (fragy_end-fragy0)*(ptrdiff_t)nhfrags-ncoded_fragis;
 }
 
-#if defined(OC_X86_ASM)
-/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
-   each quadrant of the destination.*/
-static const unsigned char OC_FZIG_ZAG_MMX[128]={
-   0, 8, 1, 2, 9,16,24,17,
-  10, 3,32,11,18,25, 4,12,
-   5,26,19,40,33,34,41,48,
-  27, 6,13,20,28,21,14, 7,
-  56,49,42,35,43,50,57,36,
-  15,22,29,30,23,44,37,58,
-  51,59,38,45,52,31,60,53,
-  46,39,47,54,61,62,55,63,
-  64,64,64,64,64,64,64,64,
-  64,64,64,64,64,64,64,64,
-  64,64,64,64,64,64,64,64,
-  64,64,64,64,64,64,64,64,
-  64,64,64,64,64,64,64,64,
-  64,64,64,64,64,64,64,64,
-  64,64,64,64,64,64,64,64,
-  64,64,64,64,64,64,64,64,
-};
-#endif
-
 /*Reconstructs all coded fragments in a single MCU (one or two super block
    rows).
   This requires that each coded fragment have a proper macro block mode and
@@ -1354,6 +1331,7 @@
 static void oc_dec_frags_recon_mcu_plane(oc_dec_ctx *_dec,
  oc_dec_pipeline_state *_pipe,int _pli){
   unsigned char     *dct_tokens;
+  const unsigned char *dct_fzig_zag;
   ogg_uint16_t       dc_quant[2];
   const oc_fragment *frags;
   const ptrdiff_t   *coded_fragis;
@@ -1363,6 +1341,7 @@
   ptrdiff_t         *eob_runs;
   int                qti;
   dct_tokens=_dec->dct_tokens;
+  dct_fzig_zag=_dec->state.opt_data.dct_fzig_zag;
   frags=_dec->state.frags;
   coded_fragis=_pipe->coded_fragis[_pli];
   ncoded_fragis=_pipe->ncoded_fragis[_pli];
@@ -1418,11 +1397,7 @@
         eob_runs[zzi]=eob;
         ti[zzi]=lti;
         zzi+=rlen;
-#if defined(OC_X86_ASM)
-        dct_coeffs[OC_FZIG_ZAG_MMX[zzi]]=(ogg_int16_t)(coeff*(int)ac_quant[zzi]);
-#else
-        dct_coeffs[OC_FZIG_ZAG[zzi]]=(ogg_int16_t)(coeff*(int)ac_quant[zzi]);
-#endif
+        dct_coeffs[dct_fzig_zag[zzi]]=(ogg_int16_t)(coeff*(int)ac_quant[zzi]);
         zzi+=(eob==0);
       }
     }

Modified: branches/theora-gumboot/lib/dec/state.c
===================================================================
--- branches/theora-gumboot/lib/dec/state.c	2009-07-23 04:01:29 UTC (rev 16324)
+++ branches/theora-gumboot/lib/dec/state.c	2009-07-23 12:50:50 UTC (rev 16325)
@@ -591,6 +591,7 @@
   _state->opt_vtable.state_loop_filter_frag_rows=
    oc_state_loop_filter_frag_rows_c;
   _state->opt_vtable.restore_fpu=oc_restore_fpu_c;
+  _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG;
 }
 
 /*Initialize the accelerated function pointers.*/

Modified: branches/theora-gumboot/lib/dec/x86/mmxidct.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86/mmxidct.c	2009-07-23 04:01:29 UTC (rev 16324)
+++ branches/theora-gumboot/lib/dec/x86/mmxidct.c	2009-07-23 12:50:50 UTC (rev 16325)
@@ -528,19 +528,6 @@
   );
 }
 
-/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
-   each quadrant of the destination.*/
-static const unsigned char OC_FZIG_ZAG_MMX[64]={
-   0, 8, 1, 2, 9,16,24,17,
-  10, 3,32,11,18,25, 4,12,
-   5,26,19,40,33,34,41,48,
-  27, 6,13,20,28,21,14, 7,
-  56,49,42,35,43,50,57,36,
-  15,22,29,30,23,44,37,58,
-  51,59,38,45,52,31,60,53,
-  46,39,47,54,61,62,55,63
-};
-
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
    version of the transform.*/

Modified: branches/theora-gumboot/lib/dec/x86/x86state.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86/x86state.c	2009-07-23 04:01:29 UTC (rev 16324)
+++ branches/theora-gumboot/lib/dec/x86/x86state.c	2009-07-23 12:50:50 UTC (rev 16325)
@@ -21,6 +21,27 @@
 
 #include "../../cpu.c"
 
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+   each quadrant of the destination.*/
+static const unsigned char OC_FZIG_ZAG_MMX[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3,32,11,18,25, 4,12,
+   5,26,19,40,33,34,41,48,
+  27, 6,13,20,28,21,14, 7,
+  56,49,42,35,43,50,57,36,
+  15,22,29,30,23,44,37,58,
+  51,59,38,45,52,31,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+};
+
 void oc_state_vtable_init_x86(oc_theora_state *_state){
   _state->cpu_flags=oc_cpu_flags_get();
   if(_state->cpu_flags&OC_CPU_X86_MMX){
@@ -34,6 +55,7 @@
     _state->opt_vtable.state_loop_filter_frag_rows=
      oc_state_loop_filter_frag_rows_mmx;
     _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
   }
   else oc_state_vtable_init_c(_state);
 }

Modified: branches/theora-gumboot/lib/dec/x86_vc/mmxidct.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86_vc/mmxidct.c	2009-07-23 04:01:29 UTC (rev 16324)
+++ branches/theora-gumboot/lib/dec/x86_vc/mmxidct.c	2009-07-23 12:50:50 UTC (rev 16325)
@@ -526,19 +526,6 @@
   }
 }
 
-/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
-   each quadrant of the destination.*/
-static const unsigned char OC_FZIG_ZAG_MMX[64]={
-   0, 8, 1, 2, 9,16,24,17,
-  10, 3,32,11,18,25, 4,12,
-   5,26,19,40,33,34,41,48,
-  27, 6,13,20,28,21,14, 7,
-  56,49,42,35,43,50,57,36,
-  15,22,29,30,23,44,37,58,
-  51,59,38,45,52,31,60,53,
-  46,39,47,54,61,62,55,63
-};
-
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
    version of the transform.*/

Modified: branches/theora-gumboot/lib/dec/x86_vc/x86state.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86_vc/x86state.c	2009-07-23 04:01:29 UTC (rev 16324)
+++ branches/theora-gumboot/lib/dec/x86_vc/x86state.c	2009-07-23 12:50:50 UTC (rev 16325)
@@ -21,6 +21,27 @@
 
 #include "../../cpu.c"
 
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+   each quadrant of the destination.*/
+static const unsigned char OC_FZIG_ZAG_MMX[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3,32,11,18,25, 4,12,
+   5,26,19,40,33,34,41,48,
+  27, 6,13,20,28,21,14, 7,
+  56,49,42,35,43,50,57,36,
+  15,22,29,30,23,44,37,58,
+  51,59,38,45,52,31,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+};
+
 void oc_state_vtable_init_x86(oc_theora_state *_state){
   _state->cpu_flags=oc_cpu_flags_get();
   if(_state->cpu_flags&OC_CPU_X86_MMX){
@@ -34,6 +55,7 @@
     _state->opt_vtable.state_loop_filter_frag_rows=
      oc_state_loop_filter_frag_rows_mmx;
     _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
   }
   else oc_state_vtable_init_c(_state);
 }


Property changes on: branches/theora-gumboot/lib/enc
___________________________________________________________________
Modified: svn:mergeinfo
   - /branches/theora-thusnelda/lib/enc:16321
   + /branches/theora-thusnelda/lib/enc:16321,16323-16324

Modified: branches/theora-gumboot/lib/enc/analyze.c
===================================================================
--- branches/theora-gumboot/lib/enc/analyze.c	2009-07-23 04:01:29 UTC (rev 16324)
+++ branches/theora-gumboot/lib/enc/analyze.c	2009-07-23 12:50:50 UTC (rev 16325)
@@ -636,8 +636,7 @@
 static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
  oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,int _overhead_bits,
  oc_rd_metric *_mo,oc_token_checkpoint **_stack){
-  OC_ALIGN16(ogg_int16_t  buffer[64]);
-  OC_ALIGN16(ogg_int16_t zzbuffer[64]);
+  OC_ALIGN16(ogg_int16_t  dct[64]);
   OC_ALIGN16(ogg_int16_t  data[64]);
   ogg_uint16_t            dc_dequant;
   const ogg_uint16_t     *dequant;
@@ -675,15 +674,16 @@
   borderi=frags[_fragi].borderi;
   qii=frags[_fragi].qii;
   if(qii&~3){
-#if 1
-    /*Enable early skip detection.*/
-    frags[_fragi].coded=0;
-    return 0;
-#else
-    /*Try and code the fragment anyway.*/
-    qii&=3;
-    frags[_fragi].qii=qii;
-#endif
+    if(!_pli){
+      /*Enable early skip detection only for luma blocks.*/
+      frags[_fragi].coded=0;
+      return 0;
+    }
+    else{
+      /*Try and code chroma blocks anyway.*/
+      qii&=3;
+      frags[_fragi].qii=qii;
+    }
   }
   mb_mode=frags[_fragi].mb_mode;
   ref=_enc->state.ref_frame_data[
@@ -731,25 +731,25 @@
   }
 #endif
   /*Transform:*/
-  oc_enc_fdct8x8(_enc,buffer,data);
+  oc_enc_fdct8x8(_enc,dct,data);
   /*Quantize the DC coefficient:*/
   qti=mb_mode!=OC_MODE_INTRA;
   enquant=_pipe->enquant[_pli][0][qti];
   dc_dequant=_pipe->dequant[_pli][0][qti][0];
-  v=buffer[0];
+  v=dct[0];
   val=v<<1;
   s=OC_SIGNMASK(val);
   val+=dc_dequant+s^s;
   val=((enquant[0].m*(ogg_int32_t)val>>16)+val>>enquant[0].l)-s;
-  data[0]=OC_CLAMPI(-580,val,580);
+  dc=OC_CLAMPI(-580,val,580);
+  data[0]=dc;
   nonzero=0;
   /*Quantize the AC coefficients:*/
   dequant=_pipe->dequant[_pli][qii][qti];
   enquant=_pipe->enquant[_pli][qii][qti];
   for(zzi=1;zzi<64;zzi++){
-    v=buffer[OC_FZIG_ZAG[zzi]];
+    v=dct[OC_FZIG_ZAG[zzi]];
     d=dequant[zzi];
-    zzbuffer[zzi]=v;
     val=v<<1;
     v=abs(val);
     if(v>=d){
@@ -768,9 +768,8 @@
   }
   /*Tokenize.*/
   checkpoint=*_stack;
-  ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,zzbuffer,nonzero+1,
+  ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
    _stack,qti?0:3);
-  dc=data[0];
   /*Reconstruct.
     TODO: nonzero may need to be adjusted after tokenization.*/
   if(nonzero==0){
@@ -778,12 +777,12 @@
     int ci;
     /*We round this dequant product (and not any of the others) because there's
        no iDCT rounding.*/
-    p=(ogg_int16_t)(data[0]*(ogg_int32_t)dc_dequant+15>>5);
+    p=(ogg_int16_t)(dc*(ogg_int32_t)dc_dequant+15>>5);
     /*LOOP VECTORIZES.*/
     for(ci=0;ci<64;ci++)data[ci]=p;
   }
   else{
-    data[0]*=dc_dequant;
+    data[0]=dc*dc_dequant;
     oc_idct8x8(&_enc->state,data,nonzero+1,nonzero+1);
   }
   if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data);

Modified: branches/theora-gumboot/lib/enc/tokenize.c
===================================================================
--- branches/theora-gumboot/lib/enc/tokenize.c	2009-07-23 04:01:29 UTC (rev 16324)
+++ branches/theora-gumboot/lib/enc/tokenize.c	2009-07-23 12:50:50 UTC (rev 16325)
@@ -208,21 +208,6 @@
   int           qc;
 };
 
-#if defined(OC_X86_ASM)
-/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
-   each quadrant of the destination.*/
-static const unsigned char OC_FZIG_ZAG_MMX[64]={
-   0, 8, 1, 2, 9,16,24,17,
-  10, 3,32,11,18,25, 4,12,
-   5,26,19,40,33,34,41,48,
-  27, 6,13,20,28,21,14, 7,
-  56,49,42,35,43,50,57,36,
-  15,22,29,30,23,44,37,58,
-  51,59,38,45,52,31,60,53,
-  46,39,47,54,61,62,55,63,
-};
-#endif
-
 int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
  ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
  int _zzi,oc_token_checkpoint **_stack,int _acmin){
@@ -233,6 +218,7 @@
   ogg_uint32_t         d2_accum[64];
   oc_quant_token       tokens[64][2];
   ogg_uint16_t        *eob_run;
+  const unsigned char *dct_fzig_zag;
   ogg_uint32_t         cost;
   int                  bits;
   int                  eob;
@@ -246,6 +232,7 @@
   int                  qc;
   huffi=_enc->huff_idxs[_enc->state.frame_type][1][_pli+1>>1];
   eob_run=_enc->eob_run[_pli];
+  dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
   memset(tokens[0],0,sizeof(tokens[0]));
   best_flags=nzflags=0;
   zflags=1;
@@ -270,7 +257,7 @@
     qc=_qdct[zzi];
     s=-(qc<0);
     qc=qc+s^s;
-    c=_dct[zzi];
+    c=_dct[OC_FZIG_ZAG[zzi]];
     if(qc<=1){
       ogg_uint32_t sum_d2;
       int          nzeros;
@@ -344,7 +331,7 @@
                 token=OC_DCT_RUN_CAT1B+cat;
                 eb=(-val_s<<cat+2)+nzeros-6-(cat<<2);
               }
-              e=(_dct[zzj]+val_s^val_s)-_dequant[zzj];
+              e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj];
               d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
               bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
               cost=d2+lambda*bits+tokens[zzk][tk].cost;
@@ -363,7 +350,7 @@
               token=OC_DCT_RUN_CAT2A+cat;
               bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
               val=2+((val+val_s^val_s)>2);
-              e=(_dct[zzj]+val_s^val_s)-_dequant[zzj]*val;
+              e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj]*val;
               d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
               cost=d2+lambda*bits+tokens[zzk][tk].cost;
               if(cost<=best_cost){
@@ -670,11 +657,7 @@
     next=tokens[zzi][ti].next;
     qc=tokens[zzi][ti].qc;
     zzj=(next>>1)-1&63;
-#if defined(OC_X86_ASM)
-    _qdct[OC_FZIG_ZAG_MMX[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
-#else
-    _qdct[OC_FZIG_ZAG[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
-#endif
+    _qdct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
     zzi=next>>1;
     ti=next&1;
   }

Modified: branches/theora-gumboot/lib/internal.h
===================================================================
--- branches/theora-gumboot/lib/internal.h	2009-07-23 04:01:29 UTC (rev 16324)
+++ branches/theora-gumboot/lib/internal.h	2009-07-23 12:50:50 UTC (rev 16325)
@@ -65,6 +65,7 @@
 typedef struct oc_fragment              oc_fragment;
 typedef struct oc_fragment_plane        oc_fragment_plane;
 typedef struct oc_base_opt_vtable       oc_base_opt_vtable;
+typedef struct oc_base_opt_data         oc_base_opt_data;
 typedef struct oc_state_dispatch_vtable oc_state_dispatch_vtable;
 typedef struct oc_theora_state          oc_theora_state;
 
@@ -297,6 +298,11 @@
   void (*restore_fpu)(void);
 };
 
+/*The shared (encoder and decoder) tables that vary according to which variants
+   of the above functions are used.*/
+struct oc_base_opt_data{
+  const unsigned char *dct_fzig_zag;
+};
 
 
 /*State information common to both the encoder and decoder.*/
@@ -305,6 +311,8 @@
   th_info             info;
   /*Table for shared accelerated functions.*/
   oc_base_opt_vtable  opt_vtable;
+  /*Table for shared data used by accelerated functions.*/
+  oc_base_opt_data    opt_data;
   /*CPU flags to detect the presence of extended instruction sets.*/
   ogg_uint32_t        cpu_flags;
   /*The fragment plane descriptions.*/



More information about the commits mailing list