[xiph-commits] r16262 - in branches/theora-gumboot/lib: . dec dec/x86 dec/x86_vc enc

Sat Jul 11 19:38:51 PDT 2009

Author: gumboot
Date: 2009-07-11 19:38:50 -0700 (Sat, 11 Jul 2009)
New Revision: 16262

Modified:
   branches/theora-gumboot/lib/dec/decode.c
   branches/theora-gumboot/lib/dec/idct.c
   branches/theora-gumboot/lib/dec/state.c
   branches/theora-gumboot/lib/dec/x86/mmxidct.c
   branches/theora-gumboot/lib/dec/x86/mmxstate.c
   branches/theora-gumboot/lib/dec/x86/x86int.h
   branches/theora-gumboot/lib/dec/x86/x86state.c
   branches/theora-gumboot/lib/dec/x86_vc/mmxidct.c
   branches/theora-gumboot/lib/dec/x86_vc/mmxstate.c
   branches/theora-gumboot/lib/dec/x86_vc/x86int.h
   branches/theora-gumboot/lib/dec/x86_vc/x86state.c
   branches/theora-gumboot/lib/enc/analyze.c
   branches/theora-gumboot/lib/enc/tokenize.c
   branches/theora-gumboot/lib/internal.h
Log:
Clean out a lot of preprocessor cruft surrounding speculative changes.  Try (and fail) to move the DC dequantisation out to the same place as the AC dequantisation -- causing lots of argument list changes and related annoyances.  Allow the IDCTs to operate in-place because they all do anyway (having lost the dequantisation copy), and it's slightly inconvenient to make them stop doing that.



Modified: branches/theora-gumboot/lib/dec/decode.c
===================================================================

--- branches/theora-gumboot/lib/dec/decode.c	2009-07-11 20:21:36 UTC (rev 16261)
+++ branches/theora-gumboot/lib/dec/decode.c	2009-07-12 02:38:50 UTC (rev 16262)
@@ -1307,7 +1307,7 @@
    (fragy_end-fragy0)*(ptrdiff_t)nhfrags-ncoded_fragis;
 }
 
-#if 1
+#if defined(OC_X86_ASM)
 /*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
    each quadrant of the destination.*/
 static const unsigned char OC_FZIG_ZAG_MMX[128]={
@@ -1364,9 +1364,9 @@
     ptrdiff_t   fragi;
     int         last_zzi;
     int         zzi;
-    fragi=coded_fragis[fragii];
-#if 1
     ogg_uint16_t const*ac_quant;
+    fragi=coded_fragis[fragii];
+#if defined(OC_X86_ASM)
     /*First zero the buffer.*/
     /*On K7, etc., this could be replaced with movntq and sfence.*/
     __asm__ __volatile__(
@@ -1391,9 +1391,11 @@
       :[y]"r"(dct_coeffs)
       :"memory"
     );
+#else
+    memset(dct_coeffs,0,64*sizeof(*dct_coeffs));
+#endif
     qti=frags[fragi].mb_mode!=OC_MODE_INTRA;
     ac_quant=_pipe->dequant[_pli][frags[fragi].qii][qti];
-#endif
     /*Decode the AC coefficients.*/
     for(zzi=0;zzi<64;){
       int token;
@@ -1428,12 +1430,11 @@
         coeff=(cw>>OC_DCT_CW_MAG_SHIFT);
         eob_runs[zzi]=eob;
         ti[zzi]=lti;
-#if 0
-        while(--rlen>=0)dct_coeffs[zzi++]=0;
-        dct_coeffs[zzi]=coeff;
-#else
         zzi+=rlen;
+#if defined(OC_X86_ASM)
         dct_coeffs[OC_FZIG_ZAG_MMX[zzi]]=(ogg_int16_t)(coeff*(int)ac_quant[zzi]);
+#else
+        dct_coeffs[OC_FZIG_ZAG[zzi]]=(ogg_int16_t)(coeff*(int)ac_quant[zzi]);
 #endif
         zzi+=(eob==0);
       }
@@ -1442,11 +1443,9 @@
       If it's not, we should report some kind of warning.*/
     zzi=OC_MINI(zzi,64);
     dct_coeffs[0]=(ogg_int16_t)frags[fragi].dc;
-    qti=frags[fragi].mb_mode!=OC_MODE_INTRA;
     /*last_zzi is always initialized.
       If your compiler thinks otherwise, it is dumb.*/
-    oc_state_frag_recon(&_dec->state,fragi,_pli,dct_coeffs,last_zzi,zzi,
-     dc_quant[qti],_pipe->dequant[_pli][frags[fragi].qii][qti]);
+    oc_state_frag_recon(&_dec->state,fragi,_pli,dct_coeffs,last_zzi,zzi,dc_quant[qti]);
   }
   _pipe->coded_fragis[_pli]+=ncoded_fragis;
   /*Right now the reconstructed MCU has only the coded blocks in it.*/

Modified: branches/theora-gumboot/lib/dec/idct.c
===================================================================
--- branches/theora-gumboot/lib/dec/idct.c	2009-07-11 20:21:36 UTC (rev 16261)
+++ branches/theora-gumboot/lib/dec/idct.c	2009-07-12 02:38:50 UTC (rev 16262)
@@ -295,23 +295,15 @@
   for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
 }
 
-void oc_dequant_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
- const ogg_int16_t _x[64],int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
- const ogg_uint16_t _ac_quant[64]){
-  (*_state->opt_vtable.dequant_idct8x8)(_y,_x,_last_zzi,_ncoefs,
-   _dc_quant,_ac_quant);
+void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
+ int _last_zzi,int _ncoefs){
+  (*_state->opt_vtable.idct8x8)(_y,_last_zzi,_ncoefs);
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
-   version of the transform.
-  _y: The buffer to store the result in.
-      This must not be the same as _x.
-  _x: The input coefficients.*/
-void oc_dequant_idct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64],
- int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
- const ogg_uint16_t _ac_quant[64]){
-  int ci;
+   version of the transform.*/
+void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi,int _ncoefs){
   /*_last_zzi is subtly different from an actual count of the number of
      coefficients we decoded for this block.
     It contains the value of zzi BEFORE the final token in the block was
@@ -330,41 +322,17 @@
      but we still process the DC coefficient, which might have a non-zero value
      due to DC prediction.
     Although convoluted, this is arguably the correct behavior: it allows us to
-     dequantize fewer coefficients and use a smaller transform when the block
-     ends with a long zero run instead of a normal EOB token.
+     use a smaller transform when the block ends with a long zero run instead
+     of a normal EOB token.
     It could be smarter... multiple separate zero runs at the end of a block
      will fool it, but an encoder that generates these really deserves what it
      gets.
     Needless to say we inherited this approach from VP3.*/
-  /*Special case only having a DC component.*/
-  if(_last_zzi<2){
-    ogg_int16_t p;
-    /*We round this dequant product (and not any of the others) because there's
-       no iDCT rounding.*/
-    p=(ogg_int16_t)(_x[0]*(ogg_int32_t)_dc_quant+15>>5);
-    /*LOOP VECTORIZES.*/
-    for(ci=0;ci<64;ci++)_y[ci]=p;
-  }
-  else{
-    int zzi;
-    /*First, dequantize the coefficients.*/
-    _y[0]=(ogg_int16_t)(_x[0]*(int)_dc_quant);
-    for(zzi=1;zzi<_ncoefs;zzi++){
-      _y[OC_FZIG_ZAG[zzi]]=(ogg_int16_t)(_x[zzi]*(int)_ac_quant[zzi]);
-    }
-    /*Then, fill in the remainder of the coefficients with 0's, and perform
-       the iDCT.*/
-    if(_last_zzi<3){
-      for(;zzi<3;zzi++)_y[OC_FZIG_ZAG[zzi]]=0;
-      oc_idct8x8_3(_y,_y);
-    }
-    else if(_last_zzi<10){
-      for(;zzi<10;zzi++)_y[OC_FZIG_ZAG[zzi]]=0;
-      oc_idct8x8_10(_y,_y);
-    }
-    else{
-      for(;zzi<64;zzi++)_y[OC_FZIG_ZAG[zzi]]=0;
-      oc_idct8x8_slow(_y,_y);
-    }
-  }
+  /*Then perform the iDCT.*/
+  if(_last_zzi<3)
+    oc_idct8x8_3(_y,_y);
+  else if(_last_zzi<10)
+    oc_idct8x8_10(_y,_y);
+  else
+    oc_idct8x8_slow(_y,_y);
 }

Modified: branches/theora-gumboot/lib/dec/state.c
===================================================================
--- branches/theora-gumboot/lib/dec/state.c	2009-07-11 20:21:36 UTC (rev 16261)
+++ branches/theora-gumboot/lib/dec/state.c	2009-07-12 02:38:50 UTC (rev 16262)
@@ -585,7 +585,7 @@
   _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
   _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
   _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c;
-  _state->opt_vtable.dequant_idct8x8=oc_dequant_idct8x8_c;
+  _state->opt_vtable.idct8x8=oc_idct8x8_c;
   _state->opt_vtable.state_frag_recon=oc_state_frag_recon_c;
   _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_c;
   _state->opt_vtable.state_loop_filter_frag_rows=
@@ -855,28 +855,39 @@
 
 void oc_state_frag_recon(const oc_theora_state *_state,ptrdiff_t _fragi,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]){
+ ogg_uint16_t _dc_quant){
   _state->opt_vtable.state_frag_recon(_state,_fragi,_pli,_dct_coeffs,
-   _last_zzi,_ncoefs,_dc_quant,_ac_quant);
+   _last_zzi,_ncoefs,_dc_quant);
 }
 
 void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant, const ogg_uint16_t _ac_quant[64]){
-  ogg_int16_t    res_buf[64];
+ ogg_uint16_t _dc_quant){
   unsigned char *dst;
   ptrdiff_t      frag_buf_off;
   int            ystride;
   int            mb_mode;
   /*Dequantize and apply the inverse transform.*/
-  oc_dequant_idct8x8(_state,res_buf,_dct_coeffs,
-   _last_zzi,_ncoefs,_dc_quant,_ac_quant);
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_int16_t p;
+    int ci;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    /*LOOP VECTORIZES.*/
+    for(ci=0;ci<64;ci++)_dct_coeffs[ci]=p;
+  }
+  else{
+    _dct_coeffs[0]*=_dc_quant;
+    oc_idct8x8(_state,_dct_coeffs,_last_zzi,_ncoefs);
+  }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
   mb_mode=_state->frags[_fragi].mb_mode;
   ystride=_state->ref_ystride[_pli];
   dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
-  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra(_state,dst,ystride,res_buf);
+  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs);
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
@@ -886,9 +897,9 @@
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
       oc_frag_recon_inter2(_state,
-       dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,res_buf);
+       dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,_dct_coeffs);
     }
-    else oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,res_buf);
+    else oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,_dct_coeffs);
   }
 }
 

Modified: branches/theora-gumboot/lib/dec/x86/mmxidct.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86/mmxidct.c	2009-07-11 20:21:36 UTC (rev 16261)
+++ branches/theora-gumboot/lib/dec/x86/mmxidct.c	2009-07-12 02:38:50 UTC (rev 16262)
@@ -543,13 +543,8 @@
 
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
-   version of the transform.
-  _y: The buffer to store the result in.
-      This must not be the same as _x.
-  _x: The input coefficients.*/
-void oc_dequant_idct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64],
- int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
- const ogg_uint16_t _ac_quant[64]){
+   version of the transform.*/
+void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi,int _ncoefs){
   /*_last_zzi is subtly different from an actual count of the number of
      coefficients we decoded for this block.
     It contains the value of zzi BEFORE the final token in the block was
@@ -568,94 +563,15 @@
      but we still process the DC coefficient, which might have a non-zero value
      due to DC prediction.
     Although convoluted, this is arguably the correct behavior: it allows us to
-     dequantize fewer coefficients and use a smaller transform when the block
-     ends with a long zero run instead of a normal EOB token.
+     use a smaller transform when the block ends with a long zero run instead
+     of a normal EOB token.
     It could be smarter... multiple separate zero runs at the end of a block
      will fool it, but an encoder that generates these really deserves what it
      gets.
     Needless to say we inherited this approach from VP3.*/
-  /*Special case only having a DC component.*/
-  if(_last_zzi<2){
-    /*Note that this value must be unsigned, to keep the __asm__ block from
-       sign-extending it when it puts it in a register.*/
-    ogg_uint16_t p;
-    /*We round this dequant product (and not any of the others) because there's
-       no iDCT rounding.*/
-#if 0
-    p=(ogg_int16_t)(_x[0]*(ogg_int32_t)_dc_quant+15>>5);
-#else
-    p=(ogg_int16_t)(_y[0]*(ogg_int32_t)_dc_quant+15>>5);
-#endif
-    /*Fill _y with p.*/
-    __asm__ __volatile__(
-      /*mm0=0000 0000 0000 AAAA*/
-      "movd %[p],%%mm0\n\t"
-      /*mm0=0000 0000 AAAA AAAA*/
-      "punpcklwd %%mm0,%%mm0\n\t"
-      /*mm0=AAAA AAAA AAAA AAAA*/
-      "punpckldq %%mm0,%%mm0\n\t"
-      "movq %%mm0,(%[y])\n\t"
-      "movq %%mm0,8(%[y])\n\t"
-      "movq %%mm0,16(%[y])\n\t"
-      "movq %%mm0,24(%[y])\n\t"
-      "movq %%mm0,32(%[y])\n\t"
-      "movq %%mm0,40(%[y])\n\t"
-      "movq %%mm0,48(%[y])\n\t"
-      "movq %%mm0,56(%[y])\n\t"
-      "movq %%mm0,64(%[y])\n\t"
-      "movq %%mm0,72(%[y])\n\t"
-      "movq %%mm0,80(%[y])\n\t"
-      "movq %%mm0,88(%[y])\n\t"
-      "movq %%mm0,96(%[y])\n\t"
-      "movq %%mm0,104(%[y])\n\t"
-      "movq %%mm0,112(%[y])\n\t"
-      "movq %%mm0,120(%[y])\n\t"
-      :
-      :[y]"r"(_y),[p]"r"((unsigned)p)
-      :"memory"
-    );
-  }
-  else{
-    int zzi;
-#if 0
-    /*First zero the buffer.*/
-    /*On K7, etc., this could be replaced with movntq and sfence.*/
-    __asm__ __volatile__(
-      "pxor %%mm0,%%mm0\n\t"
-      "movq %%mm0,(%[y])\n\t"
-      "movq %%mm0,8(%[y])\n\t"
-      "movq %%mm0,16(%[y])\n\t"
-      "movq %%mm0,24(%[y])\n\t"
-      "movq %%mm0,32(%[y])\n\t"
-      "movq %%mm0,40(%[y])\n\t"
-      "movq %%mm0,48(%[y])\n\t"
-      "movq %%mm0,56(%[y])\n\t"
-      "movq %%mm0,64(%[y])\n\t"
-      "movq %%mm0,72(%[y])\n\t"
-      "movq %%mm0,80(%[y])\n\t"
-      "movq %%mm0,88(%[y])\n\t"
-      "movq %%mm0,96(%[y])\n\t"
-      "movq %%mm0,104(%[y])\n\t"
-      "movq %%mm0,112(%[y])\n\t"
-      "movq %%mm0,120(%[y])\n\t"
-      :
-      :[y]"r"(_y)
-      :"memory"
-    );
-#endif
-    /*Dequantize the coefficients.*/
-#if 0
-    _y[0]=(ogg_int16_t)(_x[0]*(int)_dc_quant);
-    for(zzi=1;zzi<_ncoefs;zzi++){
-      _y[OC_FZIG_ZAG_MMX[zzi]]=(ogg_int16_t)(_x[zzi]*(int)_ac_quant[zzi]);
-    }
-#else
-    _y[0]=(ogg_int16_t)(_y[0]*(int)_dc_quant);
-#endif
-    /*Then perform the iDCT.*/
-    if(_last_zzi<10)oc_idct8x8_10(_y);
-    else oc_idct8x8_slow(_y);
-  }
+  /*Then perform the iDCT.*/
+  if(_last_zzi<10)oc_idct8x8_10(_y);
+  else oc_idct8x8_slow(_y);
 }
 
 #endif

Modified: branches/theora-gumboot/lib/dec/x86/mmxstate.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86/mmxstate.c	2009-07-11 20:21:36 UTC (rev 16261)
+++ branches/theora-gumboot/lib/dec/x86/mmxstate.c	2009-07-12 02:38:50 UTC (rev 16262)
@@ -26,25 +26,59 @@
 
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]){
-#if 0
-  OC_ALIGN8(ogg_int16_t    res_buf[64]);
-#else
-  ogg_int16_t *res_buf = _dct_coeffs;
-#endif
+ ogg_uint16_t _dc_quant){
   unsigned char          *dst;
   ptrdiff_t               frag_buf_off;
   int                     ystride;
   int                     mb_mode;
   /*Dequantize and apply the inverse transform.*/
-  oc_dequant_idct8x8_mmx(res_buf,_dct_coeffs,
-   _last_zzi,_ncoefs,_dc_quant,_ac_quant);
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    /*Note that this value must be unsigned, to keep the __asm__ block from
+       sign-extending it when it puts it in a register.*/
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    /*Fill _dct_coeffs with p.*/
+    __asm__ __volatile__(
+      /*mm0=0000 0000 0000 AAAA*/
+      "movd %[p],%%mm0\n\t"
+      /*mm0=0000 0000 AAAA AAAA*/
+      "punpcklwd %%mm0,%%mm0\n\t"
+      /*mm0=AAAA AAAA AAAA AAAA*/
+      "punpckldq %%mm0,%%mm0\n\t"
+      "movq %%mm0,(%[y])\n\t"
+      "movq %%mm0,8(%[y])\n\t"
+      "movq %%mm0,16(%[y])\n\t"
+      "movq %%mm0,24(%[y])\n\t"
+      "movq %%mm0,32(%[y])\n\t"
+      "movq %%mm0,40(%[y])\n\t"
+      "movq %%mm0,48(%[y])\n\t"
+      "movq %%mm0,56(%[y])\n\t"
+      "movq %%mm0,64(%[y])\n\t"
+      "movq %%mm0,72(%[y])\n\t"
+      "movq %%mm0,80(%[y])\n\t"
+      "movq %%mm0,88(%[y])\n\t"
+      "movq %%mm0,96(%[y])\n\t"
+      "movq %%mm0,104(%[y])\n\t"
+      "movq %%mm0,112(%[y])\n\t"
+      "movq %%mm0,120(%[y])\n\t"
+      :
+      :[y]"r"(_dct_coeffs),[p]"r"((unsigned)p)
+      :"memory"
+    );
+  }
+  else{
+    _dct_coeffs[0]*=_dc_quant;
+    oc_idct8x8_mmx(_dct_coeffs,_last_zzi,_ncoefs);
+  }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
   mb_mode=_state->frags[_fragi].mb_mode;
   ystride=_state->ref_ystride[_pli];
   dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
-  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,res_buf);
+  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
@@ -54,9 +88,9 @@
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
       oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
-       res_buf);
+       _dct_coeffs);
     }
-    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,res_buf);
+    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
   }
 }
 

Modified: branches/theora-gumboot/lib/dec/x86/x86int.h
===================================================================
--- branches/theora-gumboot/lib/dec/x86/x86int.h	2009-07-11 20:21:36 UTC (rev 16261)
+++ branches/theora-gumboot/lib/dec/x86/x86int.h	2009-07-12 02:38:50 UTC (rev 16262)
@@ -29,12 +29,10 @@
  const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
-void oc_dequant_idct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64],
- int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
- const ogg_uint16_t _ac_quant[64]);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi,int _ncoefs);
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
+ ogg_uint16_t _dc_quant);
 void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
  const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
  int _dst_frame,int _src_frame,int _pli);

Modified: branches/theora-gumboot/lib/dec/x86/x86state.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86/x86state.c	2009-07-11 20:21:36 UTC (rev 16261)
+++ branches/theora-gumboot/lib/dec/x86/x86state.c	2009-07-12 02:38:50 UTC (rev 16262)
@@ -28,7 +28,7 @@
     _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
     _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
     _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
-    _state->opt_vtable.dequant_idct8x8=oc_dequant_idct8x8_mmx;
+    _state->opt_vtable.idct8x8=oc_idct8x8_mmx;
     _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
     _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
     _state->opt_vtable.state_loop_filter_frag_rows=

Modified: branches/theora-gumboot/lib/dec/x86_vc/mmxidct.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86_vc/mmxidct.c	2009-07-11 20:21:36 UTC (rev 16261)
+++ branches/theora-gumboot/lib/dec/x86_vc/mmxidct.c	2009-07-12 02:38:50 UTC (rev 16262)
@@ -541,13 +541,8 @@
 
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
-   version of the transform.
-  _y: The buffer to store the result in.
-      This must not be the same as _x.
-  _x: The input coefficients.*/
-void oc_dequant_idct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64],
- int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
- const ogg_uint16_t _ac_quant[64]){
+   version of the transform.*/
+void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi,int _ncoefs){
   /*_last_zzi is subtly different from an actual count of the number of
      coefficients we decoded for this block.
     It contains the value of zzi BEFORE the final token in the block was
@@ -566,87 +561,15 @@
      but we still process the DC coefficient, which might have a non-zero value
      due to DC prediction.
     Although convoluted, this is arguably the correct behavior: it allows us to
-     dequantize fewer coefficients and use a smaller transform when the block
-     ends with a long zero run instead of a normal EOB token.
+     use a smaller transform when the block ends with a long zero run instead
+     of a normal EOB token.
     It could be smarter... multiple separate zero runs at the end of a block
      will fool it, but an encoder that generates these really deserves what it
      gets.
     Needless to say we inherited this approach from VP3.*/
-  /*Special case only having a DC component.*/
-  if(_last_zzi<2){
-    /*Note that this value must be unsigned, to keep the __asm__ block from
-       sign-extending it when it puts it in a register.*/
-    ogg_uint16_t p;
-    /*We round this dequant product (and not any of the others) because there's
-       no iDCT rounding.*/
-    p=(ogg_int16_t)(_x[0]*(ogg_int32_t)_dc_quant+15>>5);
-    /*Fill _y with p.*/
-    __asm{
-#define Y eax
-#define P ecx
-      mov Y,_y
-      movd P,p
-      /*mm0=0000 0000 0000 AAAA*/
-      movd mm0,P
-      /*mm0=0000 0000 AAAA AAAA*/
-      punpcklwd mm0,mm0
-      /*mm0=AAAA AAAA AAAA AAAA*/
-      punpckldq mm0,mm0
-      movq [Y],mm0
-      movq [8+Y],mm0
-      movq [16+Y],mm0
-      movq [24+Y],mm0
-      movq [32+Y],mm0
-      movq [40+Y],mm0
-      movq [48+Y],mm0
-      movq [56+Y],mm0
-      movq [64+Y],mm0
-      movq [72+Y],mm0
-      movq [80+Y],mm0
-      movq [88+Y],mm0
-      movq [96+Y],mm0
-      movq [104+Y],mm0
-      movq [112+Y],mm0
-      movq [120+Y],mm0
-#undef Y
-#undef P
-    }
-  }
-  else{
-    int zzi;
-    /*First zero the buffer.*/
-    /*On K7, etc., this could be replaced with movntq and sfence.*/
-    __asm{
-#define Y eax
-      mov Y,_y
-      pxor mm0,mm0
-      movq [Y],mm0
-      movq [8+Y],mm0
-      movq [16+Y],mm0
-      movq [24+Y],mm0
-      movq [32+Y],mm0
-      movq [40+Y],mm0
-      movq [48+Y],mm0
-      movq [56+Y],mm0
-      movq [64+Y],mm0
-      movq [72+Y],mm0
-      movq [80+Y],mm0
-      movq [88+Y],mm0
-      movq [96+Y],mm0
-      movq [104+Y],mm0
-      movq [112+Y],mm0
-      movq [120+Y],mm0
-#undef Y
-    }
-    /*Dequantize the coefficients.*/
-    _y[0]=(ogg_int16_t)(_x[0]*(int)_dc_quant);
-    for(zzi=1;zzi<_ncoefs;zzi++){
-      _y[OC_FZIG_ZAG_MMX[zzi]]=(ogg_int16_t)(_x[zzi]*(int)_ac_quant[zzi]);
-    }
-    /*Then perform the iDCT.*/
-    if(_last_zzi<10)oc_idct8x8_10(_y);
-    else oc_idct8x8_slow(_y);
-  }
+  /*Perform the iDCT.*/
+  if(_last_zzi<10)oc_idct8x8_10(_y);
+  else oc_idct8x8_slow(_y);
 }
 
 #endif

Modified: branches/theora-gumboot/lib/dec/x86_vc/mmxstate.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86_vc/mmxstate.c	2009-07-11 20:21:36 UTC (rev 16261)
+++ branches/theora-gumboot/lib/dec/x86_vc/mmxstate.c	2009-07-12 02:38:50 UTC (rev 16262)
@@ -26,21 +26,62 @@
 
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]){
-  OC_ALIGN8(ogg_int16_t   res_buf[64]);
+ ogg_uint16_t _dc_quant){
   unsigned char          *dst;
   ptrdiff_t               frag_buf_off;
   int                     ystride;
   int                     mb_mode;
   /*Dequantize and apply the inverse transform.*/
-  oc_dequant_idct8x8_mmx(res_buf,_dct_coeffs,
-   _last_zzi,_ncoefs,_dc_quant,_ac_quant);
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    /*Note that this value must be unsigned, to keep the __asm__ block from
+       sign-extending it when it puts it in a register.*/
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    /*Fill _dct_coeffs with p.*/
+    __asm{
+#define Y eax
+#define P ecx
+      mov Y,_dct_coeffs
+      movd P,p
+      /*mm0=0000 0000 0000 AAAA*/
+      movd mm0,P
+      /*mm0=0000 0000 AAAA AAAA*/
+      punpcklwd mm0,mm0
+      /*mm0=AAAA AAAA AAAA AAAA*/
+      punpckldq mm0,mm0
+      movq [Y],mm0
+      movq [8+Y],mm0
+      movq [16+Y],mm0
+      movq [24+Y],mm0
+      movq [32+Y],mm0
+      movq [40+Y],mm0
+      movq [48+Y],mm0
+      movq [56+Y],mm0
+      movq [64+Y],mm0
+      movq [72+Y],mm0
+      movq [80+Y],mm0
+      movq [88+Y],mm0
+      movq [96+Y],mm0
+      movq [104+Y],mm0
+      movq [112+Y],mm0
+      movq [120+Y],mm0
+#undef Y
+#undef P
+    }
+  }
+  else{
+    _dct_coeffs[0]*=_dc_quant;
+    oc_idct8x8_mmx(_dct_coeffs,_last_zzi,_ncoefs);
+  }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
   mb_mode=_state->frags[_fragi].mb_mode;
   ystride=_state->ref_ystride[_pli];
   dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
-  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,res_buf);
+  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
@@ -50,9 +91,9 @@
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
       oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
-       res_buf);
+       _dct_coeffs);
     }
-    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,res_buf);
+    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
   }
 }
 

Modified: branches/theora-gumboot/lib/dec/x86_vc/x86int.h
===================================================================
--- branches/theora-gumboot/lib/dec/x86_vc/x86int.h	2009-07-11 20:21:36 UTC (rev 16261)
+++ branches/theora-gumboot/lib/dec/x86_vc/x86int.h	2009-07-12 02:38:50 UTC (rev 16262)
@@ -29,12 +29,10 @@
  const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
-void oc_dequant_idct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64],
- int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
- const ogg_uint16_t _ac_quant[64]);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi,int _ncoefs);
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
+ ogg_uint16_t _dc_quant);
 void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
  const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
  int _dst_frame,int _src_frame,int _pli);

Modified: branches/theora-gumboot/lib/dec/x86_vc/x86state.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86_vc/x86state.c	2009-07-11 20:21:36 UTC (rev 16261)
+++ branches/theora-gumboot/lib/dec/x86_vc/x86state.c	2009-07-12 02:38:50 UTC (rev 16262)
@@ -28,7 +28,7 @@
     _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
     _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
     _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
-    _state->opt_vtable.dequant_idct8x8=oc_dequant_idct8x8_mmx;
+    _state->opt_vtable.idct8x8=oc_idct8x8_mmx;
     _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
     _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
     _state->opt_vtable.state_loop_filter_frag_rows=

Modified: branches/theora-gumboot/lib/enc/analyze.c
===================================================================
--- branches/theora-gumboot/lib/enc/analyze.c	2009-07-11 20:21:36 UTC (rev 16261)
+++ branches/theora-gumboot/lib/enc/analyze.c	2009-07-12 02:38:50 UTC (rev 16262)
@@ -541,6 +541,7 @@
   int                  borderi;
   int                  pi;
   int                  zzi;
+  int                  dc;
   frags=_enc->state.frags;
   frag_offs=_enc->state.frag_buf_offs[_fragi];
   ystride=_enc->state.ref_ystride[_pli];
@@ -674,17 +675,22 @@
   checkpoint=*_stack;
   ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,zzbuffer,nonzero+1,
    _stack,mb_mode==OC_MODE_INTRA?3:0);
+  dc=data[0];
   /*Reconstruct.
     TODO: nonzero may need to be adjusted after tokenization.*/
-#if 0
-  oc_dequant_idct8x8(&_enc->state,buffer,data,
-   nonzero+1,nonzero+1,dequant[0],(ogg_uint16_t *)dequant);
-#else
-//  memcpy(buffer, data, sizeof(buffer));
-  int dc=data[0];
-  oc_dequant_idct8x8(&_enc->state,data,data,
-   nonzero+1,nonzero+1,dequant[0],(ogg_uint16_t *)dequant);
-#endif
+  if(nonzero==0){
+    ogg_int16_t p;
+    int ci;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_int16_t)(data[0]*(ogg_int32_t)dequant[0]+15>>5);
+    /*LOOP VECTORIZES.*/
+    for(ci=0;ci<64;ci++)data[ci]=p;
+  }
+  else{
+    data[0]*=dequant[0];
+    oc_idct8x8(&_enc->state,data,nonzero+1,nonzero+1);
+  }
   if(mb_mode==OC_MODE_INTRA)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
   else{
     oc_enc_frag_recon_inter(_enc,dst,
@@ -745,11 +751,7 @@
       _mo->ac_bits+=ac_bits;
     }
   }
-#if 0
-  frags[_fragi].dc=data[0];
-#else
   frags[_fragi].dc=dc;
-#endif
   frags[_fragi].coded=1;
   return 1;
 }

Modified: branches/theora-gumboot/lib/enc/tokenize.c
===================================================================
--- branches/theora-gumboot/lib/enc/tokenize.c	2009-07-11 20:21:36 UTC (rev 16261)
+++ branches/theora-gumboot/lib/enc/tokenize.c	2009-07-12 02:38:50 UTC (rev 16262)
@@ -208,10 +208,10 @@
   int           qc;
 };
 
-#if 1
+#if defined(OC_X86_ASM)
 /*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
    each quadrant of the destination.*/
-static const unsigned char OC_FZIG_ZAG_MMX[128]={
+static const unsigned char OC_FZIG_ZAG_MMX[64]={
    0, 8, 1, 2, 9,16,24,17,
   10, 3,32,11,18,25, 4,12,
    5,26,19,40,33,34,41,48,
@@ -220,14 +220,6 @@
   15,22,29,30,23,44,37,58,
   51,59,38,45,52,31,60,53,
   46,39,47,54,61,62,55,63,
-  64,64,64,64,64,64,64,64,
-  64,64,64,64,64,64,64,64,
-  64,64,64,64,64,64,64,64,
-  64,64,64,64,64,64,64,64,
-  64,64,64,64,64,64,64,64,
-  64,64,64,64,64,64,64,64,
-  64,64,64,64,64,64,64,64,
-  64,64,64,64,64,64,64,64,
 };
 #endif
 
@@ -649,7 +641,7 @@
   }
   /*Emit the tokens from the best path through the trellis.*/
   stack=*_stack;
-#if 1
+#if defined(OC_X86_ASM)
   int dc=_qdct[0];
   __asm__ __volatile__(
     "pxor %%mm0,%%mm0\n\t"
@@ -674,6 +666,8 @@
     :"memory"
   );
   _qdct[0]=dc;
+#else
+  memset(_qdct+1,0,63*sizeof(*_qdct));
 #endif
   zzi=1;
   ti=best_flags>>1&1;
@@ -691,11 +685,7 @@
       /*We don't include the actual EOB cost for this block in the return value.
         It will be paid for by the fragment that terminates the EOB run.*/
       bits-=tokens[zzi][ti].bits;
-#if 0
-      for(;zzi<_zzi;zzi++)_qdct[zzi]=0;
-#else
       zzi=_zzi;
-#endif
       break;
     }
     /*Emit pending EOB run if any.*/
@@ -707,11 +697,10 @@
     next=tokens[zzi][ti].next;
     qc=tokens[zzi][ti].qc;
     zzj=(next>>1)-1&63;
-#if 0
-    for(;zzi<zzj;zzi++)_qdct[zzi]=0;
-    _qdct[zzj]=qc;
-#else
+#if defined(OC_X86_ASM)
     _qdct[OC_FZIG_ZAG_MMX[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
+#else
+    _qdct[OC_FZIG_ZAG[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
 #endif
     zzi=next>>1;
     ti=next&1;

Modified: branches/theora-gumboot/lib/internal.h
===================================================================
--- branches/theora-gumboot/lib/internal.h	2009-07-11 20:21:36 UTC (rev 16261)
+++ branches/theora-gumboot/lib/internal.h	2009-07-12 02:38:50 UTC (rev 16262)
@@ -285,12 +285,10 @@
    const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
   void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1,
    const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
-  void (*dequant_idct8x8)(ogg_int16_t _y[64],const ogg_int16_t _x[64],
-   int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
-   const ogg_uint16_t _ac_quant[64]);
+  void (*idct8x8)(ogg_int16_t _y[64],int _last_zzi,int _ncoefs);
   void (*state_frag_recon)(const oc_theora_state *_state,ptrdiff_t _fragi,
    int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
-   ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
+   ogg_uint16_t _dc_quant);
   void (*state_frag_copy_list)(const oc_theora_state *_state,
    const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
    int _dst_frame,int _src_frame,int _pli);
@@ -464,12 +462,11 @@
 void oc_frag_recon_inter2(const oc_theora_state *_state,
  unsigned char *_dst,const unsigned char *_src1,const unsigned char *_src2,
  int _ystride,const ogg_int16_t _residue[64]);
-void oc_dequant_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
- const ogg_int16_t _x[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
+void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
+ int _last_zzi,int _ncoefs);
 void oc_state_frag_recon(const oc_theora_state *_state,ptrdiff_t _fragi,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
+ ogg_uint16_t _dc_quant);
 void oc_state_frag_copy_list(const oc_theora_state *_state,
  const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
  int _dst_frame,int _src_frame,int _pli);
@@ -486,12 +483,10 @@
  const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
 void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
-void oc_dequant_idct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64],
- int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
- const ogg_uint16_t _ac_quant[64]);
+void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi,int _ncoefs);
 void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
+ ogg_uint16_t _dc_quant);
 void oc_state_frag_copy_list_c(const oc_theora_state *_state,
  const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
  int _dst_frame,int _src_frame,int _pli);