[xiph-commits] r17307 - in experimental/derf/theora-ptalarbvorm/lib: . x86

Sat Jun 26 23:02:15 PDT 2010

Author: tterribe
Date: 2010-06-26 23:02:15 -0700 (Sat, 26 Jun 2010)
New Revision: 17307

Modified:
   experimental/derf/theora-ptalarbvorm/lib/analyze.c
   experimental/derf/theora-ptalarbvorm/lib/encint.h
   experimental/derf/theora-ptalarbvorm/lib/encode.c
   experimental/derf/theora-ptalarbvorm/lib/enquant.c
   experimental/derf/theora-ptalarbvorm/lib/quant.c
   experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c
   experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.h
   experimental/derf/theora-ptalarbvorm/lib/x86/x86enquant.c
Log:
More cleanup/refactoring of the quantization code.
The post-quantization clamps have been entirely removed; they should be
 unnecessary given the range of the input and the minimum quantizers allowed.
This knocks 10-11 cycles off oc_enc_quantize_sse2.
The quantizer tables also now have the proper DC coefficient injected once per
 frame, instead of for every block.
This adds a bunch of extra initialization code to deal with the per-arch
 quantization matrix format, but simplifies the code that executes for every
 coded block (oc_enc_quantize_sse2 alone drops by almost 100 bytes).
Together with some additional code-size reductions this knocks another 4-5
 cycles off oc_enc_quantize_sse2.
The total encode time, despite saving at least 14-15 cycles/block, remains
 almost entirely unchanged.


Modified: experimental/derf/theora-ptalarbvorm/lib/analyze.c
===================================================================

--- experimental/derf/theora-ptalarbvorm/lib/analyze.c	2010-06-26 18:21:33 UTC (rev 17306)
+++ experimental/derf/theora-ptalarbvorm/lib/analyze.c	2010-06-27 06:02:15 UTC (rev 17307)
@@ -560,10 +560,6 @@
   int                 bounding_values[256];
   oc_fr_state         fr[3];
   oc_qii_state        qs[3];
-  /*Condensed dequantization tables.*/
-  const ogg_uint16_t *dequant[3][3][2];
-  /*Condensed quantization tables.*/
-  const oc_iquant    *enquant[3][3][2];
   /*Skip SSD storage for the current MCU in each plane.*/
   unsigned           *skip_ssd[3];
   /*Coded/uncoded fragment lists for each plane for the current MCU.*/
@@ -597,7 +593,9 @@
   int        hdec;
   int        vdec;
   int        pli;
+  int        nqis;
   int        qii;
+  int        qi0;
   int        qti;
   /*Initialize the per-plane coded block flag trackers.
     These are used for bit-estimation purposes only; the real flag bits span
@@ -626,16 +624,25 @@
   memset(_pipe->ncoded_fragis,0,sizeof(_pipe->ncoded_fragis));
   memset(_pipe->nuncoded_fragis,0,sizeof(_pipe->nuncoded_fragis));
   /*Set up condensed quantizer tables.*/
+  qi0=_enc->state.qis[0];
+  nqis=_enc->state.nqis;
   for(pli=0;pli<3;pli++){
-    for(qii=0;qii<_enc->state.nqis;qii++){
+    for(qii=0;qii<nqis;qii++){
       int qi;
       qi=_enc->state.qis[qii];
       for(qti=0;qti<2;qti++){
-        _pipe->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
-        _pipe->enquant[pli][qii][qti]=_enc->enquant_tables[qi][pli][qti];
+        /*Set the DC coefficient in the dequantization table.*/
+        _enc->state.dequant_tables[qi][pli][qti][0]=
+         _enc->dequant_dc[qi0][pli][qti];
+        _enc->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
+        /*Copy over the quantization table.*/
+        memcpy(_enc->enquant[pli][qii][qti],_enc->enquant_tables[qi][pli][qti],
+         _enc->opt_data.enquant_table_size);
       }
     }
   }
+  /*Fix up the DC coefficients in the quantization tables.*/
+  oc_enc_enquant_table_fixup(_enc,_enc->enquant,nqis);
   /*Initialize the tokenization state.*/
   for(pli=0;pli<3;pli++){
     _pipe->ndct_tokens1[pli]=0;
@@ -737,7 +744,8 @@
   OC_ALIGN16(ogg_int16_t  dct[64]);
   OC_ALIGN16(ogg_int16_t  data[64]);
   oc_qii_state            qs;
-  ogg_uint16_t            dc_dequant;
+  const ogg_uint16_t     *dequant;
+  ogg_uint16_t            dequant_dc;
   ptrdiff_t               frag_offs;
   int                     ystride;
   const unsigned char    *src;
@@ -828,18 +836,16 @@
   oc_enc_fdct8x8(_enc,dct,data);
   /*Quantize:*/
   qti=mb_mode!=OC_MODE_INTRA;
-  dc_dequant=_pipe->dequant[_pli][0][qti][0];
-  nonzero=oc_enc_quantize(_enc,data,dct,
-   dc_dequant,_pipe->dequant[_pli][qii][qti],
-   _pipe->enquant[_pli][0][qti],_pipe->enquant[_pli][qii][qti]);
+  dequant=_enc->dequant[_pli][qii][qti];
+  nonzero=oc_enc_quantize(_enc,data,dct,dequant,_enc->enquant[_pli][qii][qti]);
   dc=data[0];
   /*Tokenize.*/
   checkpoint=*_stack;
-  ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,
-   _pipe->dequant[_pli][qii][qti],dct,nonzero+1,_stack,
-   OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
+  ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
+   _stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
   /*Reconstruct.
     TODO: nonzero may need to be adjusted after tokenization.*/
+  dequant_dc=dequant[0];
   if(nonzero==0){
     ogg_int16_t p;
     int         ci;
@@ -847,7 +853,7 @@
     int         qi12;
     /*We round this dequant product (and not any of the others) because there's
        no iDCT rounding.*/
-    p=(ogg_int16_t)(dc*(ogg_int32_t)dc_dequant+15>>5);
+    p=(ogg_int16_t)(dc*(ogg_int32_t)dequant_dc+15>>5);
     /*LOOP VECTORIZES.*/
     for(ci=0;ci<64;ci++)data[ci]=p;
     /*We didn't code any AC coefficients, so don't change the quantizer.*/
@@ -857,7 +863,7 @@
     else if(qi01>=0)qii=0;
   }
   else{
-    data[0]=dc*dc_dequant;
+    data[0]=dc*dequant_dc;
     oc_idct8x8(&_enc->state,data,nonzero+1);
   }
   oc_qii_state_advance(&qs,_pipe->qs+_pli,qii);

Modified: experimental/derf/theora-ptalarbvorm/lib/encint.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/encint.h	2010-06-26 18:21:33 UTC (rev 17306)
+++ experimental/derf/theora-ptalarbvorm/lib/encint.h	2010-06-27 06:02:15 UTC (rev 17307)
@@ -146,9 +146,9 @@
    const unsigned char *_src1,const unsigned char *_src2,int _ystride);
   void     (*enquant_table_init)(void *_enquant,
    const ogg_uint16_t _dequant[64]);
+  void     (*enquant_table_fixup)(void *_enquant[3][3][2],int _nqis);
   int      (*quantize)(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
-   ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
-   const void *_dc_enquant,const void *_ac_enquant);
+   const ogg_uint16_t _dequant[64],const void *_enquant);
   void     (*frag_recon_intra)(unsigned char *_dst,int _ystride,
    const ogg_int16_t _residue[64]);
   void     (*frag_recon_inter)(unsigned char *_dst,
@@ -450,7 +450,15 @@
   th_huff_code             huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
   /*The quantization parameters in use.*/
   th_quant_info            qinfo;
-  oc_iquant               *enquant_tables[64][3][2];
+  /*The original DC coefficients saved off from the dequantization tables.*/
+  ogg_uint16_t             dequant_dc[64][3][2];
+  /*Condensed dequantization tables.*/
+  const ogg_uint16_t      *dequant[3][3][2];
+  /*Condensed quantization tables.*/
+  void                    *enquant[3][3][2];
+  /*The full set of quantization tables.*/
+  void                    *enquant_tables[64][3][2];
+  /*Storage for the quantization tables.*/
   unsigned char           *enquant_table_data;
   /*An "average" quantizer for each frame type (INTRA or INTER) and qi value.
     This is used to paramterize the rate control decisions.
@@ -560,12 +568,13 @@
  ogg_int64_t _mask);
 void oc_enc_frag_copy2(const oc_enc_ctx *_enc,unsigned char *_dst,
  const unsigned char *_src1,const unsigned char *_src2,int _ystride);
-void oc_enc_enquant_table_init(const oc_enc_ctx *_enc,void *_enquant,
- const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_init(const oc_enc_ctx *_enc,
+ void *_enquant,const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_fixup(const oc_enc_ctx *_enc,
+ void *_enquant[3][3][2],int _nqis);
 int oc_enc_quantize(const oc_enc_ctx *_enc,
  ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
- ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
- const void *_dc_enquant,const void *_ac_enquant);
+ const ogg_uint16_t _dequant[64],const void *_enquant);
 void oc_enc_frag_recon_intra(const oc_enc_ctx *_enc,
  unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]);
 void oc_enc_frag_recon_inter(const oc_enc_ctx *_enc,unsigned char *_dst,
@@ -601,9 +610,9 @@
  const unsigned char *_src1,const unsigned char *_src2,int _ystride);
 void oc_enc_enquant_table_init_c(void *_enquant,
  const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_fixup_c(void *_enquant[3][3][2],int _nqis);
 int oc_enc_quantize_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
- ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
- const void *_dc_enquant,const void *_ac_enquant);
+ const ogg_uint16_t _dequant[64],const void *_enquant);
 void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
 
 #endif

Modified: experimental/derf/theora-ptalarbvorm/lib/encode.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/encode.c	2010-06-26 18:21:33 UTC (rev 17306)
+++ experimental/derf/theora-ptalarbvorm/lib/encode.c	2010-06-27 06:02:15 UTC (rev 17307)
@@ -950,6 +950,7 @@
   _enc->opt_data.enquant_table_size=64*sizeof(oc_iquant);
   _enc->opt_data.enquant_table_alignment=16;
   _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_c;
+  _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_c;
   _enc->opt_vtable.quantize=oc_enc_quantize_c;
   _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
   _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
@@ -1054,61 +1055,41 @@
   return 0;
 }
 
-void oc_enc_enquant_table_init(const oc_enc_ctx *_enc,void *_enquant,
- const ogg_uint16_t _dequant[64]){
-  (*_enc->opt_vtable.enquant_table_init)(_enquant,_dequant);
-}
-
 static void oc_enc_enquant_tables_init(oc_enc_ctx *_enc,
  const th_quant_info *_qinfo){
   unsigned char *etd;
   size_t         ets;
   int            align;
+  int            qii;
   int            qi;
   int            pli;
   int            qti;
+  for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+    _enc->state.dequant_tables[qi][pli][qti]=
+     _enc->state.dequant_table_data[qi][pli][qti];
+  }
+  /*Initialize the dequantization tables.*/
+  oc_dequant_tables_init(_enc->state.dequant_tables,NULL,_qinfo);
+  /*And save off the DC values.*/
+  for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+    _enc->dequant_dc[qi][pli][qti]=_enc->state.dequant_tables[qi][pli][qti][0];
+  }
+  /*Set up storage for the quantization tables.*/
   etd=_enc->enquant_table_data;
   ets=_enc->opt_data.enquant_table_size;
   align=-(etd-(unsigned char *)0)&_enc->opt_data.enquant_table_alignment-1;
   etd+=align;
+  /*Set up the main tables.*/
   for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
-    _enc->state.dequant_tables[qi][pli][qti]=
-     _enc->state.dequant_table_data[qi][pli][qti];
-    _enc->enquant_tables[qi][pli][qti]=etd+((qi*3+pli)*2+qti)*ets;
+    _enc->enquant_tables[qi][pli][qti]=etd;
+    oc_enc_enquant_table_init(_enc,etd,
+     _enc->state.dequant_tables[qi][pli][qti]);
+    etd+=ets;
   }
-  /*Initialize the dequantization tables first.*/
-  oc_dequant_tables_init(_enc->state.dequant_tables,NULL,_qinfo);
-  /*Derive the quantization tables directly from the dequantization tables.*/
-  for(qi=0;qi<64;qi++)for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
-    int plj;
-    int qtj;
-    int dupe;
-    dupe=0;
-    for(qtj=0;qtj<=qti;qtj++){
-      for(plj=0;plj<(qtj<qti?3:pli);plj++){
-        if(_enc->state.dequant_tables[qi][pli][qti]==
-         _enc->state.dequant_tables[qi][plj][qtj]){
-          dupe=1;
-          break;
-        }
-      }
-      if(dupe)break;
-    }
-    if(dupe){
-      _enc->enquant_tables[qi][pli][qti]=_enc->enquant_tables[qi][plj][qtj];
-    }
-    /*In the original VP3.2 code, the rounding offset and the size of the
-       dead zone around 0 were controlled by a "sharpness" parameter.
-      We now R-D optimize the tokens for each block after quantization,
-       so the rounding offset should always be 1/2, and an explicit dead
-       zone is unnecessary.
-      Hence, all of that VP3.2 code is gone from here, and the remaining
-       floating point code has been implemented as equivalent integer
-       code with exact precision.*/
-    else{
-      oc_enc_enquant_table_init(_enc,_enc->enquant_tables[qi][pli][qti],
-       _enc->state.dequant_tables[qi][pli][qti]);
-    }
+  /*Set up storage for the local copies we modify for each frame.*/
+  for(pli=0;pli<3;pli++)for(qii=0;qii<3;qii++)for(qti=0;qti<2;qti++){
+    _enc->enquant[pli][qii][qti]=etd;
+    etd+=ets;
   }
 }
 
@@ -1190,7 +1171,7 @@
   oc_enc_vtable_init_c(_enc);
 #endif
   _enc->enquant_table_data=(unsigned char *)_ogg_malloc(
-   64*3*2*_enc->opt_data.enquant_table_size
+   (64+3)*3*2*_enc->opt_data.enquant_table_size
    +_enc->opt_data.enquant_table_alignment-1);
   _enc->keyframe_frequency_force=1<<_enc->state.info.keyframe_granule_shift;
   _enc->state.qis[0]=_enc->state.info.quality;

Modified: experimental/derf/theora-ptalarbvorm/lib/enquant.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/enquant.c	2010-06-26 18:21:33 UTC (rev 17306)
+++ experimental/derf/theora-ptalarbvorm/lib/enquant.c	2010-06-27 06:02:15 UTC (rev 17307)
@@ -129,49 +129,62 @@
   _this->l=l;
 }
 
+void oc_enc_enquant_table_init(const oc_enc_ctx *_enc,void *_enquant,
+ const ogg_uint16_t _dequant[64]){
+  (*_enc->opt_vtable.enquant_table_init)(_enquant,_dequant);
+}
+
 void oc_enc_enquant_table_init_c(void *_enquant,
  const ogg_uint16_t _dequant[64]){
   oc_iquant *enquant;
   int        zzi;
+  /*In the original VP3.2 code, the rounding offset and the size of the
+     dead zone around 0 were controlled by a "sharpness" parameter.
+    We now R-D optimize the tokens for each block after quantization,
+     so the rounding offset should always be 1/2, and an explicit dead
+     zone is unnecessary.
+    Hence, all of that VP3.2 code is gone from here, and the remaining
+     floating point code has been implemented as equivalent integer
+     code with exact precision.*/
   enquant=(oc_iquant *)_enquant;
   for(zzi=0;zzi<64;zzi++)oc_iquant_init(enquant+zzi,_dequant[zzi]);
 }
 
+void oc_enc_enquant_table_fixup(const oc_enc_ctx *_enc,
+ void *_enquant[3][3][2],int _nqis){
+  (*_enc->opt_vtable.enquant_table_fixup)(_enquant,_nqis);
+}
+
+void oc_enc_enquant_table_fixup_c(void *_enquant[3][3][2],int _nqis){
+  int pli;
+  int qii;
+  int qti;
+  for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){
+    *((oc_iquant *)_enquant[pli][qii][qti])=
+     *((oc_iquant *)_enquant[pli][0][qti]);
+  }
+}
+
 int oc_enc_quantize(const oc_enc_ctx *_enc,
  ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
- ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
- const void *_dc_enquant,const void *_ac_enquant){
-  return (*_enc->opt_vtable.quantize)(_qdct,_dct,
-   _dc_dequant,_ac_dequant,_dc_enquant,_ac_enquant);
+ const ogg_uint16_t _dequant[64],const void *_enquant){
+  return (*_enc->opt_vtable.quantize)(_qdct,_dct,_dequant,_enquant);
 }
 
 int oc_enc_quantize_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
- ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
- const void *_dc_enquant,const void *_ac_enquant){
+ const ogg_uint16_t _dequant[64],const void *_enquant){
   const oc_iquant *enquant;
   int              nonzero;
   int              zzi;
-  int              v;
   int              val;
   int              d;
   int              s;
-  /*Quantize the DC coefficient:*/
-  enquant=(const oc_iquant *)_dc_enquant;
-  v=_dct[0];
-  val=v<<1;
-  s=OC_SIGNMASK(val);
-  val+=_dc_dequant+s^s;
-  val=((enquant[0].m*(ogg_int32_t)val>>16)+val>>enquant[0].l)-s;
-  _qdct[0]=(ogg_int16_t)OC_CLAMPI(-580,val,580);
-  nonzero=0;
-  /*Quantize the AC coefficients:*/
-  enquant=(const oc_iquant *)_ac_enquant;
-  for(zzi=1;zzi<64;zzi++){
-    v=_dct[OC_FZIG_ZAG[zzi]];
-    d=_ac_dequant[zzi];
-    val=v<<1;
-    v=abs(val);
-    if(v>=d){
+  enquant=(const oc_iquant *)_enquant;
+  for(zzi=0;zzi<64;zzi++){
+    val=_dct[OC_FZIG_ZAG[zzi]];
+    d=_dequant[zzi];
+    val=val<<1;
+    if(abs(val)>=d){
       s=OC_SIGNMASK(val);
       /*The bias added here rounds ties away from zero, since token
          optimization can only decrease the magnitude of the quantized
@@ -180,7 +193,7 @@
       /*Note the arithmetic right shift is not guaranteed by ANSI C.
         Hopefully no one still uses ones-complement architectures.*/
       val=((enquant[zzi].m*(ogg_int32_t)val>>16)+val>>enquant[zzi].l)-s;
-      _qdct[zzi]=(ogg_int16_t)OC_CLAMPI(-580,val,580);
+      _qdct[zzi]=(ogg_int16_t)val;
       nonzero=zzi;
     }
     else _qdct[zzi]=0;

Modified: experimental/derf/theora-ptalarbvorm/lib/quant.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/quant.c	2010-06-26 18:21:33 UTC (rev 17306)
+++ experimental/derf/theora-ptalarbvorm/lib/quant.c	2010-06-27 06:02:15 UTC (rev 17307)
@@ -21,6 +21,14 @@
 #include "quant.h"
 #include "decint.h"
 
+/*The maximum output of the DCT with +/- 255 inputs is +/- 8157.
+  These minimum quantizers ensure the result after quantization (and after
+   prediction for DC) will be no more than +/- 510.
+  The tokenization system can handle values up to +/- 580, so there is no need
+   to do any coefficient clamping.
+  I would rather have allowed smaller quantizers and had to clamp, but these
+   minimums were required when constructing the original VP3 matrices and have
+   been formalized in the spec.*/
 static const unsigned OC_DC_QUANT_MIN[2]={4<<2,8<<2};
 static const unsigned OC_AC_QUANT_MIN[2]={2<<2,4<<2};
 

Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c	2010-06-26 18:21:33 UTC (rev 17306)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c	2010-06-27 06:02:15 UTC (rev 17307)
@@ -52,6 +52,7 @@
     _enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
     _enc->opt_data.enquant_table_alignment=16;
     _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_x86;
+    _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_x86;
     _enc->opt_vtable.quantize=oc_enc_quantize_sse2;
   }
 }

Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.h	2010-06-26 18:21:33 UTC (rev 17306)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.h	2010-06-27 06:02:15 UTC (rev 17307)
@@ -51,9 +51,9 @@
  const unsigned char *_src,int _ystride);
 void oc_enc_enquant_table_init_x86(void *_enquant,
  const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis);
 int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
- ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
- const void *_dc_enquant,const void *_ac_enquant);
+ const ogg_uint16_t _dequant[64],const void *_enquant);
 void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src1,const unsigned char *_src2,int _src_ystride);
 void oc_enc_frag_copy2_mmxext(unsigned char *_dst,

Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86enquant.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86enquant.c	2010-06-26 18:21:33 UTC (rev 17306)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86enquant.c	2010-06-27 06:02:15 UTC (rev 17307)
@@ -21,13 +21,6 @@
 
 
 
-/*The maximum quantized coefficient value.*/
-static const ogg_uint16_t __attribute__((aligned(16))) OC_COEFF_MAX_SSE2[8]={
-  580,580,580,580,580,580,580,580
-};
-
-
-
 /*The default enquant table is not quite suitable for SIMD purposes.
   First, the m and l parameters need to be separated so that an entire row full
    of m's or l's can be loaded at a time.
@@ -46,16 +39,28 @@
     oc_iquant_init(&q,_dequant[zzi]);
     m[zzi]=q.m;
     /*q.l must be at least 2 for this to work; fortunately, once all the scale
-       factors are baked in, the minimum quantizer is much larger.*/
+       factors are baked in, the minimum quantizer is much larger than that.*/
     l[zzi]=1<<16-q.l;
   }
 }
 
+void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis){
+  int pli;
+  int qii;
+  int qti;
+  for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){
+    ((ogg_int16_t *)_enquant[pli][qii][qti])[0]=
+     ((ogg_int16_t *)_enquant[pli][0][qti])[0];
+    ((ogg_int16_t *)_enquant[pli][qii][qti])[64]=
+     ((ogg_int16_t *)_enquant[pli][0][qti])[64];
+  }
+}
+
 /*Convert DCT coefficients in %[dct] from natural order into zig-zag scan order
    and store them in %[qdct].
-  The index of each output element in the original 64-element array should be
-   the following 8x8 array (the letters indicate the order we compute each
-   4-tuple below):
+  The index of each output element in the original 64-element array should wind
+   up in the following 8x8 matrix (the letters indicate the order we compute
+   each 4-tuple below):
     A  0  1  8 16   9  2  3 10 B
     C 17 24 32 25  18 11  4  5 D
     E 12 19 26 33  40 48 41 34 I
@@ -65,7 +70,7 @@
     P 58 59 52 45  38 31 39 46 L
     N 53 60 61 54  47 55 62 63 O
   The order of the coefficients within each tuple is reversed in the comments
-   below to reflect the usual MSB to LSB ordering.*/
+   below to reflect the usual MSB to LSB notation.*/
 #define OC_ZIG_ZAG_MMXEXT \
   "movq 0x00(%[dct]),%%mm0\n\t"  /*mm0=03 02 01 00*/ \
   "movq 0x08(%[dct]),%%mm1\n\t"  /*mm1=07 06 05 04*/ \
@@ -158,33 +163,22 @@
   "movq %%mm7,0x60(%[qdct])\n\t" \
 
 int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
- ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
- const void *_dc_enquant,const void *_ac_enquant){
+ const ogg_uint16_t _dequant[64],const void *_enquant){
   ptrdiff_t r;
-  /*Load the first rows of the quantizer data and inject the DC terms.
-    We do this early to reduce general-purpose register pressure and because
-     pinsrw has a very long latency.*/
   __asm__ __volatile__(
-    "movdqa 0x00(%[dq]),%%xmm2\n\t"
-    "movdqa 0x00(%[q]),%%xmm4\n\t"
-    "movdqa 0x80(%[q]),%%xmm5\n\t"
-    "pinsrw $0,%k[dc_dq],%%xmm2\n\t"
-    "pinsrw $0,0x00(%[dc_q]),%%xmm4\n\t"
-    "pinsrw $0,0x80(%[dc_q]),%%xmm5\n\t"
-    :[dq]"+r"(_ac_dequant),[dc_q]"+r"(_dc_enquant),[q]"+r"(_ac_enquant)
-    :[dc_dq]"r"(_dc_dequant)
-  );
-  __asm__ __volatile__(
     /*Put the input in zig-zag order.*/
     OC_ZIG_ZAG_MMXEXT
-    /*Loading the first two rows of data and the second dequant row.*/
-    "movdqa 0x00(%[qdct]),%%xmm0\n\t"
-    "movdqa 0x10(%[qdct]),%%xmm1\n\t"
-    "movdqa 0x10(%[dq]),%%xmm3\n\t"
-    "mov $-0x60,%[r]\n\t"
+    "xor %[r],%[r]\n\t"
     /*Loop through two rows at a time.*/
     ".p2align 4\n\t"
     "0:\n\t"
+    /*Load the next two rows of the data and the quant matrices.*/
+    "movdqa 0x00(%[qdct],%[r]),%%xmm0\n\t"
+    "movdqa 0x10(%[qdct],%[r]),%%xmm1\n\t"
+    "movdqa 0x00(%[dq],%[r]),%%xmm2\n\t"
+    "movdqa 0x10(%[dq],%[r]),%%xmm3\n\t"
+    "movdqa 0x00(%[q],%[r]),%%xmm4\n\t"
+    "movdqa 0x10(%[q],%[r]),%%xmm5\n\t"
     /*Double the input and propagate its sign to the rounding factor.
       Using SSSE3's psignw would help here, but we need the mask later anyway.*/
     "movdqa %%xmm0,%%xmm6\n\t"
@@ -199,64 +193,42 @@
     "pxor %%xmm1,%%xmm3\n\t"
     /*Add the rounding factor and perform the first multiply.*/
     "paddw %%xmm2,%%xmm6\n\t"
-    "movdqa 0x70(%[q],%[r]),%%xmm2\n\t"
     "paddw %%xmm3,%%xmm7\n\t"
-    "movdqa 0xF0(%[q],%[r]),%%xmm3\n\t"
     "pmulhw %%xmm6,%%xmm4\n\t"
-    "pmulhw %%xmm7,%%xmm2\n\t"
+    "pmulhw %%xmm7,%%xmm5\n\t"
+    "movdqa 0x80(%[q],%[r]),%%xmm2\n\t"
+    "movdqa 0x90(%[q],%[r]),%%xmm3\n\t"
     "paddw %%xmm4,%%xmm6\n\t"
-    "paddw %%xmm2,%%xmm7\n\t"
+    "paddw %%xmm5,%%xmm7\n\t"
     /*Emulate an element-wise right-shift via a second multiply.*/
-    "pmulhw %%xmm5,%%xmm6\n\t"
+    "pmulhw %%xmm2,%%xmm6\n\t"
     "pmulhw %%xmm3,%%xmm7\n\t"
-    /*Load the bounds for the clamp operation.
-      It would be nice to keep these around across iterations, but there aren't
-       enough registers, and it's not like we're doing anything else while
-       waiting for the multiplies to finish.*/
-    "movdqa %[c],%%xmm2\n\t"
-    "pxor %%xmm3,%%xmm3\n\t"
     "add $32,%[r]\n\t"
-    "psubw %%xmm2,%%xmm3\n\t"
+    "cmp $96,%[r]\n\t"
     /*Correct for the sign.*/
     "psubw %%xmm0,%%xmm6\n\t"
     "psubw %%xmm1,%%xmm7\n\t"
-    /*Clamp into the valid range.*/
-    "pminsw %%xmm2,%%xmm6\n\t"
-    "pminsw %%xmm2,%%xmm7\n\t"
-    "pmaxsw %%xmm3,%%xmm6\n\t"
-    "pmaxsw %%xmm3,%%xmm7\n\t"
     /*Save the result.*/
-    "movdqa %%xmm6,0x40(%[qdct],%[r])\n\t"
-    "movdqa %%xmm7,0x50(%[qdct],%[r])\n\t"
-    "jg 1f\n\t"
-    /*Start loading the data for the next iteration.*/
-    "movdqa 0x60(%[qdct],%[r]),%%xmm0\n\t"
-    "movdqa 0x70(%[qdct],%[r]),%%xmm1\n\t"
-    "movdqa 0x60(%[dq],%[r]),%%xmm2\n\t"
-    "movdqa 0x70(%[dq],%[r]),%%xmm3\n\t"
-    "movdqa 0x60(%[q],%[r]),%%xmm4\n\t"
-    "movdqa 0xE0(%[q],%[r]),%%xmm5\n\t"
-    "jmp 0b\n\t"
-    ".p2align 4\n\t"
-    "1:\n\t"
+    "movdqa %%xmm6,-0x20(%[qdct],%[r])\n\t"
+    "movdqa %%xmm7,-0x10(%[qdct],%[r])\n\t"
+    "jle 0b\n\t"
     /*Now find the location of the last non-zero value.*/
     "movdqa 0x50(%[qdct]),%%xmm5\n\t"
     "movdqa 0x40(%[qdct]),%%xmm4\n\t"
     "packsswb %%xmm7,%%xmm6\n\t"
-    "pxor %%xmm0,%%xmm0\n\t"
-    "mov $0xFFFFFFFF,%[dq]\n\t"
     "packsswb %%xmm5,%%xmm4\n\t"
+    "pxor %%xmm0,%%xmm0\n\t"
+    "mov $-1,%k[dq]\n\t"
     "pcmpeqb %%xmm0,%%xmm6\n\t"
     "pcmpeqb %%xmm0,%%xmm4\n\t"
-    "pmovmskb %%xmm6,%[q]\n\t"
-    "pmovmskb %%xmm4,%[r]\n\t"
-    "shl $16,%[q]\n\t"
-    "or %[r],%[q]\n\t"
+    "pmovmskb %%xmm6,%k[q]\n\t"
+    "pmovmskb %%xmm4,%k[r]\n\t"
+    "shl $16,%k[q]\n\t"
+    "or %k[r],%k[q]\n\t"
     "mov $32,%[r]\n\t"
-    /*We have to use xor here instead of not in order to set the flags.
-      This also makes it easy to flip just the lower 32 bits on x86-64.*/
-    "xor %[dq],%[q]\n\t"
-    "jnz 2f\n\t"
+    /*We have to use xor here instead of not in order to set the flags.*/
+    "xor %k[dq],%k[q]\n\t"
+    "jnz 1f\n\t"
     "movdqa 0x30(%[qdct]),%%xmm7\n\t"
     "movdqa 0x20(%[qdct]),%%xmm6\n\t"
     "movdqa 0x10(%[qdct]),%%xmm5\n\t"
@@ -265,19 +237,18 @@
     "packsswb %%xmm5,%%xmm4\n\t"
     "pcmpeqb %%xmm0,%%xmm6\n\t"
     "pcmpeqb %%xmm0,%%xmm4\n\t"
-    "pmovmskb %%xmm6,%[q]\n\t"
-    "pmovmskb %%xmm4,%[r]\n\t"
-    "shl $16,%[q]\n\t"
-    "or %[r],%[q]\n\t"
+    "pmovmskb %%xmm6,%k[q]\n\t"
+    "pmovmskb %%xmm4,%k[r]\n\t"
+    "shl $16,%k[q]\n\t"
+    "or %k[r],%k[q]\n\t"
     "xor %[r],%[r]\n\t"
-    "xor %[dq],%[q]\n\t"
-    "or $1,%[q]\n\t"
-    "2:\n\t"
-    "bsr %[q],%[q]\n\t"
-    "add %[q],%[r]\n\t"
-    :[r]"=&a"(r),[q]"+r"(_ac_enquant)
-    :[dct]"r"(_dct),[qdct]"r"(_qdct),[dq]"r"(_ac_dequant),
-     [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_uint16_t,OC_COEFF_MAX_SSE2,8))
+    "not %k[q]\n\t"
+    "or $1,%k[q]\n\t"
+    "1:\n\t"
+    "bsr %k[q],%k[q]\n\t"
+    "add %k[q],%k[r]\n\t"
+    :[r]"=&a"(r),[q]"+r"(_enquant),[dq]"+r"(_dequant)
+    :[dct]"r"(_dct),[qdct]"r"(_qdct)
     :"cc","memory"
   );
   return (int)r;