[xiph-commits] r16360 - in branches/theora-thusnelda/lib: . dec dec/x86 dec/x86_vc enc

Wed Jul 29 06:44:25 PDT 2009

Author: tterribe
Date: 2009-07-29 06:44:25 -0700 (Wed, 29 Jul 2009)
New Revision: 16360

Modified:
   branches/theora-thusnelda/lib/dec/idct.c
   branches/theora-thusnelda/lib/dec/internal.c
   branches/theora-thusnelda/lib/dec/state.c
   branches/theora-thusnelda/lib/dec/x86/mmxidct.c
   branches/theora-thusnelda/lib/dec/x86/mmxstate.c
   branches/theora-thusnelda/lib/dec/x86/x86int.h
   branches/theora-thusnelda/lib/dec/x86/x86state.c
   branches/theora-thusnelda/lib/dec/x86_vc/mmxidct.c
   branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c
   branches/theora-thusnelda/lib/dec/x86_vc/x86int.h
   branches/theora-thusnelda/lib/dec/x86_vc/x86state.c
   branches/theora-thusnelda/lib/enc/analyze.c
   branches/theora-thusnelda/lib/enc/tokenize.c
   branches/theora-thusnelda/lib/internal.h
Log:
Move dequantization back out of the idct.
This sets up further optimizations, though it does not actually make anything
 faster in and of itself.
Modified version of a patch from Simon Hosie.


Modified: branches/theora-thusnelda/lib/dec/idct.c
===================================================================

--- branches/theora-thusnelda/lib/dec/idct.c	2009-07-29 11:16:55 UTC (rev 16359)
+++ branches/theora-thusnelda/lib/dec/idct.c	2009-07-29 13:44:25 UTC (rev 16360)
@@ -295,23 +295,15 @@
   for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
 }
 
-void oc_dequant_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
- const ogg_int16_t _x[64],int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
- const ogg_uint16_t _ac_quant[64]){
-  (*_state->opt_vtable.dequant_idct8x8)(_y,_x,_last_zzi,_ncoefs,
-   _dc_quant,_ac_quant);
+void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
+ int _last_zzi,int _ncoefs){
+  (*_state->opt_vtable.idct8x8)(_y,_last_zzi,_ncoefs);
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
-   version of the transform.
-  _y: The buffer to store the result in.
-      This must not be the same as _x.
-  _x: The input coefficients.*/
-void oc_dequant_idct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64],
- int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
- const ogg_uint16_t _ac_quant[64]){
-  int ci;
+   version of the transform.*/
+void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi,int _ncoefs){
   /*_last_zzi is subtly different from an actual count of the number of
      coefficients we decoded for this block.
     It contains the value of zzi BEFORE the final token in the block was
@@ -330,41 +322,14 @@
      but we still process the DC coefficient, which might have a non-zero value
      due to DC prediction.
     Although convoluted, this is arguably the correct behavior: it allows us to
-     dequantize fewer coefficients and use a smaller transform when the block
-     ends with a long zero run instead of a normal EOB token.
+     use a smaller transform when the block ends with a long zero run instead
+     of a normal EOB token.
     It could be smarter... multiple separate zero runs at the end of a block
      will fool it, but an encoder that generates these really deserves what it
      gets.
     Needless to say we inherited this approach from VP3.*/
-  /*Special case only having a DC component.*/
-  if(_last_zzi<2){
-    ogg_int16_t p;
-    /*We round this dequant product (and not any of the others) because there's
-       no iDCT rounding.*/
-    p=(ogg_int16_t)(_x[0]*(ogg_int32_t)_dc_quant+15>>5);
-    /*LOOP VECTORIZES.*/
-    for(ci=0;ci<64;ci++)_y[ci]=p;
-  }
-  else{
-    int zzi;
-    /*First, dequantize the coefficients.*/
-    _y[0]=(ogg_int16_t)(_x[0]*(int)_dc_quant);
-    for(zzi=1;zzi<_ncoefs;zzi++){
-      _y[OC_FZIG_ZAG[zzi]]=(ogg_int16_t)(_x[zzi]*(int)_ac_quant[zzi]);
-    }
-    /*Then, fill in the remainder of the coefficients with 0's, and perform
-       the iDCT.*/
-    if(_last_zzi<3){
-      for(;zzi<3;zzi++)_y[OC_FZIG_ZAG[zzi]]=0;
-      oc_idct8x8_3(_y,_y);
-    }
-    else if(_last_zzi<10){
-      for(;zzi<10;zzi++)_y[OC_FZIG_ZAG[zzi]]=0;
-      oc_idct8x8_10(_y,_y);
-    }
-    else{
-      for(;zzi<64;zzi++)_y[OC_FZIG_ZAG[zzi]]=0;
-      oc_idct8x8_slow(_y,_y);
-    }
-  }
+  /*Perform the iDCT.*/
+  if(_last_zzi<3)oc_idct8x8_3(_y,_y);
+  else if(_last_zzi<10)oc_idct8x8_10(_y,_y);
+  else oc_idct8x8_slow(_y,_y);
 }

Modified: branches/theora-thusnelda/lib/dec/internal.c
===================================================================
--- branches/theora-thusnelda/lib/dec/internal.c	2009-07-29 11:16:55 UTC (rev 16359)
+++ branches/theora-thusnelda/lib/dec/internal.c	2009-07-29 13:44:25 UTC (rev 16360)
@@ -26,7 +26,7 @@
    block.
   All zig zag indices beyond 63 are sent to coefficient 64, so that zero runs
    past the end of a block in bogus streams get mapped to a known location.*/
-const unsigned char OC_FZIG_ZAG[64]={
+const unsigned char OC_FZIG_ZAG[128]={
    0, 1, 8,16, 9, 2, 3,10,
   17,24,32,25,18,11, 4, 5,
   12,19,26,33,40,48,41,34,
@@ -34,7 +34,15 @@
   35,42,49,56,57,50,43,36,
   29,22,15,23,30,37,44,51,
   58,59,52,45,38,31,39,46,
-  53,60,61,54,47,55,62,63
+  53,60,61,54,47,55,62,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
 };
 
 /*A map from the coefficient number in a block to its index in the zig zag

Modified: branches/theora-thusnelda/lib/dec/state.c
===================================================================
--- branches/theora-thusnelda/lib/dec/state.c	2009-07-29 11:16:55 UTC (rev 16359)
+++ branches/theora-thusnelda/lib/dec/state.c	2009-07-29 13:44:25 UTC (rev 16360)
@@ -585,12 +585,13 @@
   _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
   _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
   _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c;
-  _state->opt_vtable.dequant_idct8x8=oc_dequant_idct8x8_c;
+  _state->opt_vtable.idct8x8=oc_idct8x8_c;
   _state->opt_vtable.state_frag_recon=oc_state_frag_recon_c;
   _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_c;
   _state->opt_vtable.state_loop_filter_frag_rows=
    oc_state_loop_filter_frag_rows_c;
   _state->opt_vtable.restore_fpu=oc_restore_fpu_c;
+  _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG;
 }
 
 /*Initialize the accelerated function pointers.*/
@@ -869,8 +870,39 @@
   int            ystride;
   int            mb_mode;
   /*Dequantize and apply the inverse transform.*/
-  oc_dequant_idct8x8(_state,res_buf,_dct_coeffs,
-   _last_zzi,_ncoefs,_dc_quant,_ac_quant);
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_int16_t p;
+    int ci;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    /*LOOP VECTORIZES.*/
+    for(ci=0;ci<64;ci++)res_buf[ci]=p;
+  }
+  else{
+    const unsigned char *dct_fzig_zag;
+    int                  zzi;
+    /*First, dequantize the coefficients.*/
+    dct_fzig_zag=_state->opt_data.dct_fzig_zag;
+    res_buf[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    for(zzi=1;zzi<_ncoefs;zzi++){
+      res_buf[dct_fzig_zag[zzi]]=
+       (ogg_int16_t)(_dct_coeffs[zzi]*(int)_ac_quant[zzi]);
+    }
+    /*Then, fill in the remainder of the coefficients with 0's, and perform
+       the iDCT.*/
+    if(_last_zzi<3){
+      for(;zzi<3;zzi++)res_buf[dct_fzig_zag[zzi]]=0;
+    }
+    else if(_last_zzi<10){
+      for(;zzi<10;zzi++)res_buf[dct_fzig_zag[zzi]]=0;
+    }
+    else{
+      for(;zzi<64;zzi++)res_buf[dct_fzig_zag[zzi]]=0;
+    }
+    oc_idct8x8(_state,res_buf,_last_zzi,_ncoefs);
+  }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
   mb_mode=_state->frags[_fragi].mb_mode;

Modified: branches/theora-thusnelda/lib/dec/x86/mmxidct.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/mmxidct.c	2009-07-29 11:16:55 UTC (rev 16359)
+++ branches/theora-thusnelda/lib/dec/x86/mmxidct.c	2009-07-29 13:44:25 UTC (rev 16360)
@@ -528,28 +528,10 @@
   );
 }
 
-/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
-   each quadrant of the destination.*/
-static const unsigned char OC_FZIG_ZAG_MMX[64]={
-   0, 8, 1, 2, 9,16,24,17,
-  10, 3,32,11,18,25, 4,12,
-   5,26,19,40,33,34,41,48,
-  27, 6,13,20,28,21,14, 7,
-  56,49,42,35,43,50,57,36,
-  15,22,29,30,23,44,37,58,
-  51,59,38,45,52,31,60,53,
-  46,39,47,54,61,62,55,63
-};
-
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
-   version of the transform.
-  _y: The buffer to store the result in.
-      This must not be the same as _x.
-  _x: The input coefficients.*/
-void oc_dequant_idct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64],
- int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
- const ogg_uint16_t _ac_quant[64]){
+   version of the transform.*/
+void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi,int _ncoefs){
   /*_last_zzi is subtly different from an actual count of the number of
      coefficients we decoded for this block.
     It contains the value of zzi BEFORE the final token in the block was
@@ -568,84 +550,15 @@
      but we still process the DC coefficient, which might have a non-zero value
      due to DC prediction.
     Although convoluted, this is arguably the correct behavior: it allows us to
-     dequantize fewer coefficients and use a smaller transform when the block
-     ends with a long zero run instead of a normal EOB token.
+     use a smaller transform when the block ends with a long zero run instead
+     of a normal EOB token.
     It could be smarter... multiple separate zero runs at the end of a block
      will fool it, but an encoder that generates these really deserves what it
      gets.
     Needless to say we inherited this approach from VP3.*/
-  /*Special case only having a DC component.*/
-  if(_last_zzi<2){
-    /*Note that this value must be unsigned, to keep the __asm__ block from
-       sign-extending it when it puts it in a register.*/
-    ogg_uint16_t p;
-    /*We round this dequant product (and not any of the others) because there's
-       no iDCT rounding.*/
-    p=(ogg_int16_t)(_x[0]*(ogg_int32_t)_dc_quant+15>>5);
-    /*Fill _y with p.*/
-    __asm__ __volatile__(
-      /*mm0=0000 0000 0000 AAAA*/
-      "movd %[p],%%mm0\n\t"
-      /*mm0=0000 0000 AAAA AAAA*/
-      "punpcklwd %%mm0,%%mm0\n\t"
-      /*mm0=AAAA AAAA AAAA AAAA*/
-      "punpckldq %%mm0,%%mm0\n\t"
-      "movq %%mm0,(%[y])\n\t"
-      "movq %%mm0,8(%[y])\n\t"
-      "movq %%mm0,16(%[y])\n\t"
-      "movq %%mm0,24(%[y])\n\t"
-      "movq %%mm0,32(%[y])\n\t"
-      "movq %%mm0,40(%[y])\n\t"
-      "movq %%mm0,48(%[y])\n\t"
-      "movq %%mm0,56(%[y])\n\t"
-      "movq %%mm0,64(%[y])\n\t"
-      "movq %%mm0,72(%[y])\n\t"
-      "movq %%mm0,80(%[y])\n\t"
-      "movq %%mm0,88(%[y])\n\t"
-      "movq %%mm0,96(%[y])\n\t"
-      "movq %%mm0,104(%[y])\n\t"
-      "movq %%mm0,112(%[y])\n\t"
-      "movq %%mm0,120(%[y])\n\t"
-      :
-      :[y]"r"(_y),[p]"r"((unsigned)p)
-      :"memory"
-    );
-  }
-  else{
-    int zzi;
-    /*First zero the buffer.*/
-    /*On K7, etc., this could be replaced with movntq and sfence.*/
-    __asm__ __volatile__(
-      "pxor %%mm0,%%mm0\n\t"
-      "movq %%mm0,(%[y])\n\t"
-      "movq %%mm0,8(%[y])\n\t"
-      "movq %%mm0,16(%[y])\n\t"
-      "movq %%mm0,24(%[y])\n\t"
-      "movq %%mm0,32(%[y])\n\t"
-      "movq %%mm0,40(%[y])\n\t"
-      "movq %%mm0,48(%[y])\n\t"
-      "movq %%mm0,56(%[y])\n\t"
-      "movq %%mm0,64(%[y])\n\t"
-      "movq %%mm0,72(%[y])\n\t"
-      "movq %%mm0,80(%[y])\n\t"
-      "movq %%mm0,88(%[y])\n\t"
-      "movq %%mm0,96(%[y])\n\t"
-      "movq %%mm0,104(%[y])\n\t"
-      "movq %%mm0,112(%[y])\n\t"
-      "movq %%mm0,120(%[y])\n\t"
-      :
-      :[y]"r"(_y)
-      :"memory"
-    );
-    /*Dequantize the coefficients.*/
-    _y[0]=(ogg_int16_t)(_x[0]*(int)_dc_quant);
-    for(zzi=1;zzi<_ncoefs;zzi++){
-      _y[OC_FZIG_ZAG_MMX[zzi]]=(ogg_int16_t)(_x[zzi]*(int)_ac_quant[zzi]);
-    }
-    /*Then perform the iDCT.*/
-    if(_last_zzi<10)oc_idct8x8_10(_y);
-    else oc_idct8x8_slow(_y);
-  }
+  /*Perform the iDCT.*/
+  if(_last_zzi<10)oc_idct8x8_10(_y);
+  else oc_idct8x8_slow(_y);
 }
 
 #endif

Modified: branches/theora-thusnelda/lib/dec/x86/mmxstate.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/mmxstate.c	2009-07-29 11:16:55 UTC (rev 16359)
+++ branches/theora-thusnelda/lib/dec/x86/mmxstate.c	2009-07-29 13:44:25 UTC (rev 16360)
@@ -24,6 +24,19 @@
 
 #if defined(OC_X86_ASM)
 
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+   each quadrant of the destination.*/
+static const unsigned char OC_FZIG_ZAG_MMX[64]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3,32,11,18,25, 4,12,
+   5,26,19,40,33,34,41,48,
+  27, 6,13,20,28,21,14, 7,
+  56,49,42,35,43,50,57,36,
+  15,22,29,30,23,44,37,58,
+  51,59,38,45,52,31,60,53,
+  46,39,47,54,61,62,55,63
+};
+
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]){
@@ -33,8 +46,76 @@
   int                     ystride;
   int                     mb_mode;
   /*Dequantize and apply the inverse transform.*/
-  oc_dequant_idct8x8_mmx(res_buf,_dct_coeffs,
-   _last_zzi,_ncoefs,_dc_quant,_ac_quant);
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    /*Note that this value must be unsigned, to keep the __asm__ block from
+       sign-extending it when it puts it in a register.*/
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    /*Fill res_buf with p.*/
+    __asm__ __volatile__(
+      /*mm0=0000 0000 0000 AAAA*/
+      "movd %[p],%%mm0\n\t"
+      /*mm0=0000 0000 AAAA AAAA*/
+      "punpcklwd %%mm0,%%mm0\n\t"
+      /*mm0=AAAA AAAA AAAA AAAA*/
+      "punpckldq %%mm0,%%mm0\n\t"
+      "movq %%mm0,(%[y])\n\t"
+      "movq %%mm0,8(%[y])\n\t"
+      "movq %%mm0,16(%[y])\n\t"
+      "movq %%mm0,24(%[y])\n\t"
+      "movq %%mm0,32(%[y])\n\t"
+      "movq %%mm0,40(%[y])\n\t"
+      "movq %%mm0,48(%[y])\n\t"
+      "movq %%mm0,56(%[y])\n\t"
+      "movq %%mm0,64(%[y])\n\t"
+      "movq %%mm0,72(%[y])\n\t"
+      "movq %%mm0,80(%[y])\n\t"
+      "movq %%mm0,88(%[y])\n\t"
+      "movq %%mm0,96(%[y])\n\t"
+      "movq %%mm0,104(%[y])\n\t"
+      "movq %%mm0,112(%[y])\n\t"
+      "movq %%mm0,120(%[y])\n\t"
+      :
+      :[y]"r"(res_buf),[p]"r"((unsigned)p)
+      :"memory"
+    );
+  }
+  else{
+    int zzi;
+    /*First zero the buffer.*/
+    /*On K7, etc., this could be replaced with movntq and sfence.*/
+    __asm__ __volatile__(
+      "pxor %%mm0,%%mm0\n\t"
+      "movq %%mm0,(%[y])\n\t"
+      "movq %%mm0,8(%[y])\n\t"
+      "movq %%mm0,16(%[y])\n\t"
+      "movq %%mm0,24(%[y])\n\t"
+      "movq %%mm0,32(%[y])\n\t"
+      "movq %%mm0,40(%[y])\n\t"
+      "movq %%mm0,48(%[y])\n\t"
+      "movq %%mm0,56(%[y])\n\t"
+      "movq %%mm0,64(%[y])\n\t"
+      "movq %%mm0,72(%[y])\n\t"
+      "movq %%mm0,80(%[y])\n\t"
+      "movq %%mm0,88(%[y])\n\t"
+      "movq %%mm0,96(%[y])\n\t"
+      "movq %%mm0,104(%[y])\n\t"
+      "movq %%mm0,112(%[y])\n\t"
+      "movq %%mm0,120(%[y])\n\t"
+      :
+      :[y]"r"(res_buf)
+      :"memory"
+    );
+    /*Dequantize the coefficients.*/
+    res_buf[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    for(zzi=1;zzi<_ncoefs;zzi++){
+      res_buf[OC_FZIG_ZAG_MMX[zzi]]=(ogg_int16_t)(_dct_coeffs[zzi]*(int)_ac_quant[zzi]);
+    }
+    oc_idct8x8_mmx(res_buf,_last_zzi,_ncoefs);
+  }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
   mb_mode=_state->frags[_fragi].mb_mode;

Modified: branches/theora-thusnelda/lib/dec/x86/x86int.h
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/x86int.h	2009-07-29 11:16:55 UTC (rev 16359)
+++ branches/theora-thusnelda/lib/dec/x86/x86int.h	2009-07-29 13:44:25 UTC (rev 16360)
@@ -29,9 +29,7 @@
  const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
-void oc_dequant_idct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64],
- int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
- const ogg_uint16_t _ac_quant[64]);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi,int _ncoefs);
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);

Modified: branches/theora-thusnelda/lib/dec/x86/x86state.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/x86state.c	2009-07-29 11:16:55 UTC (rev 16359)
+++ branches/theora-thusnelda/lib/dec/x86/x86state.c	2009-07-29 13:44:25 UTC (rev 16360)
@@ -21,6 +21,27 @@
 
 #include "../../cpu.c"
 
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+   each quadrant of the destination.*/
+static const unsigned char OC_FZIG_ZAG_MMX[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3,32,11,18,25, 4,12,
+   5,26,19,40,33,34,41,48,
+  27, 6,13,20,28,21,14, 7,
+  56,49,42,35,43,50,57,36,
+  15,22,29,30,23,44,37,58,
+  51,59,38,45,52,31,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+};
+
 void oc_state_vtable_init_x86(oc_theora_state *_state){
   _state->cpu_flags=oc_cpu_flags_get();
   if(_state->cpu_flags&OC_CPU_X86_MMX){
@@ -28,12 +49,13 @@
     _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
     _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
     _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
-    _state->opt_vtable.dequant_idct8x8=oc_dequant_idct8x8_mmx;
+    _state->opt_vtable.idct8x8=oc_idct8x8_mmx;
     _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
     _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
     _state->opt_vtable.state_loop_filter_frag_rows=
      oc_state_loop_filter_frag_rows_mmx;
     _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
   }
   else oc_state_vtable_init_c(_state);
 }

Modified: branches/theora-thusnelda/lib/dec/x86_vc/mmxidct.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86_vc/mmxidct.c	2009-07-29 11:16:55 UTC (rev 16359)
+++ branches/theora-thusnelda/lib/dec/x86_vc/mmxidct.c	2009-07-29 13:44:25 UTC (rev 16360)
@@ -526,28 +526,10 @@
   }
 }
 
-/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
-   each quadrant of the destination.*/
-static const unsigned char OC_FZIG_ZAG_MMX[64]={
-   0, 8, 1, 2, 9,16,24,17,
-  10, 3,32,11,18,25, 4,12,
-   5,26,19,40,33,34,41,48,
-  27, 6,13,20,28,21,14, 7,
-  56,49,42,35,43,50,57,36,
-  15,22,29,30,23,44,37,58,
-  51,59,38,45,52,31,60,53,
-  46,39,47,54,61,62,55,63
-};
-
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
-   version of the transform.
-  _y: The buffer to store the result in.
-      This must not be the same as _x.
-  _x: The input coefficients.*/
-void oc_dequant_idct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64],
- int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
- const ogg_uint16_t _ac_quant[64]){
+   version of the transform.*/
+void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi,int _ncoefs){
   /*_last_zzi is subtly different from an actual count of the number of
      coefficients we decoded for this block.
     It contains the value of zzi BEFORE the final token in the block was
@@ -566,87 +548,15 @@
      but we still process the DC coefficient, which might have a non-zero value
      due to DC prediction.
     Although convoluted, this is arguably the correct behavior: it allows us to
-     dequantize fewer coefficients and use a smaller transform when the block
-     ends with a long zero run instead of a normal EOB token.
+     use a smaller transform when the block ends with a long zero run instead
+     of a normal EOB token.
     It could be smarter... multiple separate zero runs at the end of a block
      will fool it, but an encoder that generates these really deserves what it
      gets.
     Needless to say we inherited this approach from VP3.*/
-  /*Special case only having a DC component.*/
-  if(_last_zzi<2){
-    /*Note that this value must be unsigned, to keep the __asm__ block from
-       sign-extending it when it puts it in a register.*/
-    ogg_uint16_t p;
-    /*We round this dequant product (and not any of the others) because there's
-       no iDCT rounding.*/
-    p=(ogg_int16_t)(_x[0]*(ogg_int32_t)_dc_quant+15>>5);
-    /*Fill _y with p.*/
-    __asm{
-#define Y eax
-#define P ecx
-      mov Y,_y
-      movd P,p
-      /*mm0=0000 0000 0000 AAAA*/
-      movd mm0,P
-      /*mm0=0000 0000 AAAA AAAA*/
-      punpcklwd mm0,mm0
-      /*mm0=AAAA AAAA AAAA AAAA*/
-      punpckldq mm0,mm0
-      movq [Y],mm0
-      movq [8+Y],mm0
-      movq [16+Y],mm0
-      movq [24+Y],mm0
-      movq [32+Y],mm0
-      movq [40+Y],mm0
-      movq [48+Y],mm0
-      movq [56+Y],mm0
-      movq [64+Y],mm0
-      movq [72+Y],mm0
-      movq [80+Y],mm0
-      movq [88+Y],mm0
-      movq [96+Y],mm0
-      movq [104+Y],mm0
-      movq [112+Y],mm0
-      movq [120+Y],mm0
-#undef Y
-#undef P
-    }
-  }
-  else{
-    int zzi;
-    /*First zero the buffer.*/
-    /*On K7, etc., this could be replaced with movntq and sfence.*/
-    __asm{
-#define Y eax
-      mov Y,_y
-      pxor mm0,mm0
-      movq [Y],mm0
-      movq [8+Y],mm0
-      movq [16+Y],mm0
-      movq [24+Y],mm0
-      movq [32+Y],mm0
-      movq [40+Y],mm0
-      movq [48+Y],mm0
-      movq [56+Y],mm0
-      movq [64+Y],mm0
-      movq [72+Y],mm0
-      movq [80+Y],mm0
-      movq [88+Y],mm0
-      movq [96+Y],mm0
-      movq [104+Y],mm0
-      movq [112+Y],mm0
-      movq [120+Y],mm0
-#undef Y
-    }
-    /*Dequantize the coefficients.*/
-    _y[0]=(ogg_int16_t)(_x[0]*(int)_dc_quant);
-    for(zzi=1;zzi<_ncoefs;zzi++){
-      _y[OC_FZIG_ZAG_MMX[zzi]]=(ogg_int16_t)(_x[zzi]*(int)_ac_quant[zzi]);
-    }
-    /*Then perform the iDCT.*/
-    if(_last_zzi<10)oc_idct8x8_10(_y);
-    else oc_idct8x8_slow(_y);
-  }
+  /*Perform the iDCT.*/
+  if(_last_zzi<10)oc_idct8x8_10(_y);
+  else oc_idct8x8_slow(_y);
 }
 
 #endif

Modified: branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c	2009-07-29 11:16:55 UTC (rev 16359)
+++ branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c	2009-07-29 13:44:25 UTC (rev 16360)
@@ -24,6 +24,19 @@
 
 #if defined(OC_X86_ASM)
 
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+   each quadrant of the destination.*/
+static const unsigned char OC_FZIG_ZAG_MMX[64]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3,32,11,18,25, 4,12,
+   5,26,19,40,33,34,41,48,
+  27, 6,13,20,28,21,14, 7,
+  56,49,42,35,43,50,57,36,
+  15,22,29,30,23,44,37,58,
+  51,59,38,45,52,31,60,53,
+  46,39,47,54,61,62,55,63
+};
+
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]){
@@ -33,8 +46,79 @@
   int                     ystride;
   int                     mb_mode;
   /*Dequantize and apply the inverse transform.*/
-  oc_dequant_idct8x8_mmx(res_buf,_dct_coeffs,
-   _last_zzi,_ncoefs,_dc_quant,_ac_quant);
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    /*Note that this value must be unsigned, to keep the __asm__ block from
+       sign-extending it when it puts it in a register.*/
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    /*Fill res_buf with p.*/
+    __asm{
+#define Y eax
+#define P ecx
+      mov Y,res_buf
+      movd P,p
+      /*mm0=0000 0000 0000 AAAA*/
+      movd mm0,P
+      /*mm0=0000 0000 AAAA AAAA*/
+      punpcklwd mm0,mm0
+      /*mm0=AAAA AAAA AAAA AAAA*/
+      punpckldq mm0,mm0
+      movq [Y],mm0
+      movq [8+Y],mm0
+      movq [16+Y],mm0
+      movq [24+Y],mm0
+      movq [32+Y],mm0
+      movq [40+Y],mm0
+      movq [48+Y],mm0
+      movq [56+Y],mm0
+      movq [64+Y],mm0
+      movq [72+Y],mm0
+      movq [80+Y],mm0
+      movq [88+Y],mm0
+      movq [96+Y],mm0
+      movq [104+Y],mm0
+      movq [112+Y],mm0
+      movq [120+Y],mm0
+#undef Y
+#undef P
+    }
+  }
+  else{
+    int zzi;
+    /*First zero the buffer.*/
+    /*On K7, etc., this could be replaced with movntq and sfence.*/
+    __asm{
+#define Y eax
+      mov Y,res_buf
+      pxor mm0,mm0
+      movq [Y],mm0
+      movq [8+Y],mm0
+      movq [16+Y],mm0
+      movq [24+Y],mm0
+      movq [32+Y],mm0
+      movq [40+Y],mm0
+      movq [48+Y],mm0
+      movq [56+Y],mm0
+      movq [64+Y],mm0
+      movq [72+Y],mm0
+      movq [80+Y],mm0
+      movq [88+Y],mm0
+      movq [96+Y],mm0
+      movq [104+Y],mm0
+      movq [112+Y],mm0
+      movq [120+Y],mm0
+#undef Y
+    }
+    /*Dequantize the coefficients.*/
+    res_buf[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    for(zzi=1;zzi<_ncoefs;zzi++){
+      res_buf[OC_FZIG_ZAG_MMX[zzi]]=(ogg_int16_t)(_dct_coeffs[zzi]*(int)_ac_quant[zzi]);
+    }
+    oc_idct8x8_mmx(res_buf,_last_zzi,_ncoefs);
+  }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
   mb_mode=_state->frags[_fragi].mb_mode;

Modified: branches/theora-thusnelda/lib/dec/x86_vc/x86int.h
===================================================================
--- branches/theora-thusnelda/lib/dec/x86_vc/x86int.h	2009-07-29 11:16:55 UTC (rev 16359)
+++ branches/theora-thusnelda/lib/dec/x86_vc/x86int.h	2009-07-29 13:44:25 UTC (rev 16360)
@@ -29,9 +29,7 @@
  const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
-void oc_dequant_idct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64],
- int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
- const ogg_uint16_t _ac_quant[64]);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi,int _ncoefs);
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);

Modified: branches/theora-thusnelda/lib/dec/x86_vc/x86state.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86_vc/x86state.c	2009-07-29 11:16:55 UTC (rev 16359)
+++ branches/theora-thusnelda/lib/dec/x86_vc/x86state.c	2009-07-29 13:44:25 UTC (rev 16360)
@@ -21,6 +21,27 @@
 
 #include "../../cpu.c"
 
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+   each quadrant of the destination.*/
+static const unsigned char OC_FZIG_ZAG_MMX[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3,32,11,18,25, 4,12,
+   5,26,19,40,33,34,41,48,
+  27, 6,13,20,28,21,14, 7,
+  56,49,42,35,43,50,57,36,
+  15,22,29,30,23,44,37,58,
+  51,59,38,45,52,31,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+};
+
 void oc_state_vtable_init_x86(oc_theora_state *_state){
   _state->cpu_flags=oc_cpu_flags_get();
   if(_state->cpu_flags&OC_CPU_X86_MMX){
@@ -28,12 +49,13 @@
     _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
     _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
     _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
-    _state->opt_vtable.dequant_idct8x8=oc_dequant_idct8x8_mmx;
+    _state->opt_vtable.idct8x8=oc_idct8x8_mmx;
     _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
     _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
     _state->opt_vtable.state_loop_filter_frag_rows=
      oc_state_loop_filter_frag_rows_mmx;
     _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
   }
   else oc_state_vtable_init_c(_state);
 }

Modified: branches/theora-thusnelda/lib/enc/analyze.c
===================================================================
--- branches/theora-thusnelda/lib/enc/analyze.c	2009-07-29 11:16:55 UTC (rev 16359)
+++ branches/theora-thusnelda/lib/enc/analyze.c	2009-07-29 13:44:25 UTC (rev 16360)
@@ -636,7 +636,7 @@
 static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
  oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,int _overhead_bits,
  oc_rd_metric *_mo,oc_token_checkpoint **_stack){
-  OC_ALIGN16(ogg_int16_t  buffer[64]);
+  OC_ALIGN16(ogg_int16_t  dct[64]);
   OC_ALIGN16(ogg_int16_t  data[64]);
   ogg_uint16_t            dc_dequant;
   const ogg_uint16_t     *dequant;
@@ -666,6 +666,7 @@
   int                     val;
   int                     d;
   int                     s;
+  int                     dc;
   frags=_enc->state.frags;
   frag_offs=_enc->state.frag_buf_offs[_fragi];
   ystride=_enc->state.ref_ystride[_pli];
@@ -730,23 +731,23 @@
   }
 #endif
   /*Transform:*/
-  oc_enc_fdct8x8(_enc,buffer,data);
+  oc_enc_fdct8x8(_enc,dct,data);
   /*Quantize the DC coefficient:*/
   qti=mb_mode!=OC_MODE_INTRA;
   enquant=_pipe->enquant[_pli][0][qti];
   dc_dequant=_pipe->dequant[_pli][0][qti][0];
-  v=buffer[0];
+  v=dct[0];
   val=v<<1;
   s=OC_SIGNMASK(val);
   val+=dc_dequant+s^s;
   val=((enquant[0].m*(ogg_int32_t)val>>16)+val>>enquant[0].l)-s;
-  data[0]=OC_CLAMPI(-580,val,580);
+  dc=OC_CLAMPI(-580,val,580);
   nonzero=0;
   /*Quantize the AC coefficients:*/
   dequant=_pipe->dequant[_pli][qii][qti];
   enquant=_pipe->enquant[_pli][qii][qti];
   for(zzi=1;zzi<64;zzi++){
-    v=buffer[OC_FZIG_ZAG[zzi]];
+    v=dct[OC_FZIG_ZAG[zzi]];
     d=dequant[zzi];
     val=v<<1;
     v=abs(val);
@@ -766,16 +767,27 @@
   }
   /*Tokenize.*/
   checkpoint=*_stack;
-  ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,buffer,nonzero+1,
+  ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
    _stack,qti?0:3);
   /*Reconstruct.
     TODO: nonzero may need to be adjusted after tokenization.*/
-  oc_dequant_idct8x8(&_enc->state,buffer,data,
-   nonzero+1,nonzero+1,dc_dequant,(ogg_uint16_t *)dequant);
-  if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,buffer);
+  if(nonzero==0){
+    ogg_int16_t p;
+    int         ci;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_int16_t)(dc*(ogg_int32_t)dc_dequant+15>>5);
+    /*LOOP VECTORIZES.*/
+    for(ci=0;ci<64;ci++)data[ci]=p;
+  }
   else{
+    data[0]=dc*dc_dequant;
+    oc_idct8x8(&_enc->state,data,nonzero+1,nonzero+1);
+  }
+  if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
+  else{
     oc_enc_frag_recon_inter(_enc,dst,
-     nmv_offs==1?ref+mv_offs[0]:dst,ystride,buffer);
+     nmv_offs==1?ref+mv_offs[0]:dst,ystride,data);
   }
   frame_type=_enc->state.frame_type;
 #if !defined(OC_COLLECT_METRICS)
@@ -783,20 +795,20 @@
 #endif
   {
     /*In retrospect, should we have skipped this block?*/
-    oc_enc_frag_sub(_enc,buffer,src,dst,ystride);
+    oc_enc_frag_sub(_enc,data,src,dst,ystride);
     coded_ssd=coded_dc=0;
     if(borderi<0){
       for(pi=0;pi<64;pi++){
-        coded_ssd+=buffer[pi]*buffer[pi];
-        coded_dc+=buffer[pi];
+        coded_ssd+=data[pi]*data[pi];
+        coded_dc+=data[pi];
       }
     }
     else{
       ogg_int64_t mask;
       mask=_enc->state.borders[borderi].mask;
       for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
-        coded_ssd+=buffer[pi]*buffer[pi];
-        coded_dc+=buffer[pi];
+        coded_ssd+=data[pi]*data[pi];
+        coded_dc+=data[pi];
       }
     }
     /*Scale to match DCT domain.*/
@@ -834,7 +846,7 @@
     _mo->ac_bits+=ac_bits;
   }
   oc_qii_state_advance(_pipe->qs+_pli,_pipe->qs+_pli,qii);
-  frags[_fragi].dc=data[0];
+  frags[_fragi].dc=dc;
   frags[_fragi].coded=1;
   return 1;
 }

Modified: branches/theora-thusnelda/lib/enc/tokenize.c
===================================================================
--- branches/theora-thusnelda/lib/enc/tokenize.c	2009-07-29 11:16:55 UTC (rev 16359)
+++ branches/theora-thusnelda/lib/enc/tokenize.c	2009-07-29 13:44:25 UTC (rev 16360)
@@ -208,6 +208,9 @@
   int           qc;
 };
 
+/*Tokenizes the AC coefficients, possibly adjusting the quantization, and then
+   dequantizes and de-zig-zags the result.
+  The DC coefficient is not preserved; it should be restored by the caller.*/
 int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
  ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
  int _zzi,oc_token_checkpoint **_stack,int _acmin){
@@ -218,6 +221,7 @@
   ogg_uint32_t         d2_accum[64];
   oc_quant_token       tokens[64][2];
   ogg_uint16_t        *eob_run;
+  const unsigned char *dct_fzig_zag;
   ogg_uint32_t         cost;
   int                  bits;
   int                  eob;
@@ -626,6 +630,10 @@
   }
   /*Emit the tokens from the best path through the trellis.*/
   stack=*_stack;
+  /*We blow away the first entry here so that things vectorize better.
+    The DC coefficient is not actually stored in the array yet.*/
+  for(zzi=0;zzi<64;zzi++)_qdct[zzi]=0;
+  dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
   zzi=1;
   ti=best_flags>>1&1;
   bits=tokens[zzi][ti].bits;
@@ -641,7 +649,7 @@
       /*We don't include the actual EOB cost for this block in the return value.
         It will be paid for by the fragment that terminates the EOB run.*/
       bits-=tokens[zzi][ti].bits;
-      for(;zzi<_zzi;zzi++)_qdct[zzi]=0;
+      zzi=_zzi;
       break;
     }
     /*Emit pending EOB run if any.*/
@@ -653,8 +661,9 @@
     next=tokens[zzi][ti].next;
     qc=tokens[zzi][ti].qc;
     zzj=(next>>1)-1&63;
-    for(;zzi<zzj;zzi++)_qdct[zzi]=0;
-    _qdct[zzj]=qc;
+    /*TODO: It may be worth saving the dequantized coefficient in the trellis
+       above; we had to compute it to measure the error anyway.*/
+    _qdct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
     zzi=next>>1;
     ti=next&1;
   }

Modified: branches/theora-thusnelda/lib/internal.h
===================================================================
--- branches/theora-thusnelda/lib/internal.h	2009-07-29 11:16:55 UTC (rev 16359)
+++ branches/theora-thusnelda/lib/internal.h	2009-07-29 13:44:25 UTC (rev 16360)
@@ -65,6 +65,7 @@
 typedef struct oc_fragment              oc_fragment;
 typedef struct oc_fragment_plane        oc_fragment_plane;
 typedef struct oc_base_opt_vtable       oc_base_opt_vtable;
+typedef struct oc_base_opt_data         oc_base_opt_data;
 typedef struct oc_state_dispatch_vtable oc_state_dispatch_vtable;
 typedef struct oc_theora_state          oc_theora_state;
 
@@ -277,9 +278,7 @@
    const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
   void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1,
    const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
-  void (*dequant_idct8x8)(ogg_int16_t _y[64],const ogg_int16_t _x[64],
-   int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
-   const ogg_uint16_t _ac_quant[64]);
+  void (*idct8x8)(ogg_int16_t _y[64],int _last_zzi,int _ncoefs);
   void (*state_frag_recon)(const oc_theora_state *_state,ptrdiff_t _fragi,
    int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
    ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
@@ -291,6 +290,11 @@
   void (*restore_fpu)(void);
 };
 
+/*The shared (encoder and decoder) tables that vary according to which variants
+   of the above functions are used.*/
+struct oc_base_opt_data{
+  const unsigned char *dct_fzig_zag;
+};
 
 
 /*State information common to both the encoder and decoder.*/
@@ -299,6 +303,8 @@
   th_info             info;
   /*Table for shared accelerated functions.*/
   oc_base_opt_vtable  opt_vtable;
+  /*Table for shared data used by accelerated functions.*/
+  oc_base_opt_data    opt_data;
   /*CPU flags to detect the presence of extended instruction sets.*/
   ogg_uint32_t        cpu_flags;
   /*The fragment plane descriptions.*/
@@ -391,7 +397,7 @@
 
 /*A map from the index in the zig zag scan to the coefficient number in a
    block.*/
-extern const unsigned char OC_FZIG_ZAG[64];
+extern const unsigned char OC_FZIG_ZAG[128];
 /*A map from the coefficient number in a block to its index in the zig zag
    scan.*/
 extern const unsigned char OC_IZIG_ZAG[64];
@@ -451,9 +457,8 @@
 void oc_frag_recon_inter2(const oc_theora_state *_state,
  unsigned char *_dst,const unsigned char *_src1,const unsigned char *_src2,
  int _ystride,const ogg_int16_t _residue[64]);
-void oc_dequant_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
- const ogg_int16_t _x[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
+void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
+ int _last_zzi,int _ncoefs);
 void oc_state_frag_recon(const oc_theora_state *_state,ptrdiff_t _fragi,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
@@ -473,9 +478,7 @@
  const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
 void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
-void oc_dequant_idct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64],
- int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
- const ogg_uint16_t _ac_quant[64]);
+void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi,int _ncoefs);
 void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);