[xiph-commits] r17306 - in experimental/derf/theora-ptalarbvorm/lib: . x86

Sat Jun 26 11:21:33 PDT 2010

Author: tterribe
Date: 2010-06-26 11:21:33 -0700 (Sat, 26 Jun 2010)
New Revision: 17306

Added:
   experimental/derf/theora-ptalarbvorm/lib/x86/x86enquant.c
Modified:
   experimental/derf/theora-ptalarbvorm/lib/Makefile.am
   experimental/derf/theora-ptalarbvorm/lib/analyze.c
   experimental/derf/theora-ptalarbvorm/lib/encfrag.c
   experimental/derf/theora-ptalarbvorm/lib/encint.h
   experimental/derf/theora-ptalarbvorm/lib/encode.c
   experimental/derf/theora-ptalarbvorm/lib/enquant.c
   experimental/derf/theora-ptalarbvorm/lib/enquant.h
   experimental/derf/theora-ptalarbvorm/lib/huffdec.c
   experimental/derf/theora-ptalarbvorm/lib/tokenize.c
   experimental/derf/theora-ptalarbvorm/lib/x86/sse2encfrag.c
   experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c
   experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.h
   experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h
Log:
First-cut implementation of x86 SIMD for SSD, forward zig-zag, and quantization
 in the encoder.
Only SSE2 versions for now; MMX later.
This speeds up encoding 4 Mbps 1080p by about 8% on an i7.


Modified: experimental/derf/theora-ptalarbvorm/lib/Makefile.am
===================================================================

--- experimental/derf/theora-ptalarbvorm/lib/Makefile.am	2010-06-26 14:39:59 UTC (rev 17305)
+++ experimental/derf/theora-ptalarbvorm/lib/Makefile.am	2010-06-26 18:21:33 UTC (rev 17306)
@@ -11,6 +11,7 @@
 	x86/sse2trans.h \
 	x86/x86enc.c \
 	x86/x86enc.h \
+	x86/x86enquant.c \
 	x86/mmxfrag.c \
 	x86/mmxfrag.h \
 	x86/mmxidct.c \
@@ -33,8 +34,9 @@
 encoder_uniq_x86_sources = \
 	x86/mmxencfrag.c \
 	x86/mmxfdct.c \
-	x86/x86enc.c \
-	x86/sse2encfrag.c
+	x86/sse2encfrag.c \
+	x86/x86enquant.c \
+	x86/x86enc.c
 
 encoder_uniq_x86_64_sources = \
 	x86/sse2fdct.c

Modified: experimental/derf/theora-ptalarbvorm/lib/analyze.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/analyze.c	2010-06-26 14:39:59 UTC (rev 17305)
+++ experimental/derf/theora-ptalarbvorm/lib/analyze.c	2010-06-26 18:21:33 UTC (rev 17306)
@@ -738,8 +738,6 @@
   OC_ALIGN16(ogg_int16_t  data[64]);
   oc_qii_state            qs;
   ogg_uint16_t            dc_dequant;
-  const ogg_uint16_t     *dequant;
-  const oc_iquant        *enquant;
   ptrdiff_t               frag_offs;
   int                     ystride;
   const unsigned char    *src;
@@ -757,12 +755,6 @@
   int                     borderi;
   int                     qti;
   int                     qii;
-  int                     pi;
-  int                     zzi;
-  int                     v;
-  int                     val;
-  int                     d;
-  int                     s;
   int                     dc;
   frags=_enc->state.frags;
   frag_offs=_enc->state.frag_buf_offs[_fragi];
@@ -834,43 +826,18 @@
 #endif
   /*Transform:*/
   oc_enc_fdct8x8(_enc,dct,data);
-  /*Quantize the DC coefficient:*/
+  /*Quantize:*/
   qti=mb_mode!=OC_MODE_INTRA;
-  enquant=_pipe->enquant[_pli][0][qti];
   dc_dequant=_pipe->dequant[_pli][0][qti][0];
-  v=dct[0];
-  val=v<<1;
-  s=OC_SIGNMASK(val);
-  val+=dc_dequant+s^s;
-  val=((enquant[0].m*(ogg_int32_t)val>>16)+val>>enquant[0].l)-s;
-  dc=OC_CLAMPI(-580,val,580);
-  nonzero=0;
-  /*Quantize the AC coefficients:*/
-  dequant=_pipe->dequant[_pli][qii][qti];
-  enquant=_pipe->enquant[_pli][qii][qti];
-  for(zzi=1;zzi<64;zzi++){
-    v=dct[OC_FZIG_ZAG[zzi]];
-    d=dequant[zzi];
-    val=v<<1;
-    v=abs(val);
-    if(v>=d){
-      s=OC_SIGNMASK(val);
-      /*The bias added here rounds ties away from zero, since token
-         optimization can only decrease the magnitude of the quantized
-         value.*/
-      val+=d+s^s;
-      /*Note the arithmetic right shift is not guaranteed by ANSI C.
-        Hopefully no one still uses ones-complement architectures.*/
-      val=((enquant[zzi].m*(ogg_int32_t)val>>16)+val>>enquant[zzi].l)-s;
-      data[zzi]=OC_CLAMPI(-580,val,580);
-      nonzero=zzi;
-    }
-    else data[zzi]=0;
-  }
+  nonzero=oc_enc_quantize(_enc,data,dct,
+   dc_dequant,_pipe->dequant[_pli][qii][qti],
+   _pipe->enquant[_pli][0][qti],_pipe->enquant[_pli][qii][qti]);
+  dc=data[0];
   /*Tokenize.*/
   checkpoint=*_stack;
-  ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
-   _stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
+  ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,
+   _pipe->dequant[_pli][qii][qti],dct,nonzero+1,_stack,
+   OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
   /*Reconstruct.
     TODO: nonzero may need to be adjusted after tokenization.*/
   if(nonzero==0){
@@ -906,19 +873,12 @@
 #endif
   {
     /*In retrospect, should we have skipped this block?*/
-    oc_enc_frag_sub(_enc,data,src,dst,ystride);
-    coded_ssd=0;
     if(borderi<0){
-      for(pi=0;pi<64;pi++){
-        coded_ssd+=data[pi]*data[pi];
-      }
+      coded_ssd=oc_enc_frag_ssd(_enc,src,dst,ystride);
     }
     else{
-      ogg_int64_t mask;
-      mask=_enc->state.borders[borderi].mask;
-      for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
-        coded_ssd+=data[pi]*data[pi];
-      }
+      coded_ssd=oc_enc_frag_border_ssd(_enc,src,dst,ystride,
+       _enc->state.borders[borderi].mask);
     }
     /*Scale to match DCT domain.*/
     coded_ssd<<=4;
@@ -1946,27 +1906,24 @@
 
 static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe,
  unsigned _mbi,const unsigned _rd_scale[4],unsigned _ssd[12]){
-  OC_ALIGN16(ogg_int16_t  buffer[64]);
-  const unsigned char    *src;
-  const unsigned char    *ref;
-  int                     ystride;
-  const oc_fragment      *frags;
-  const ptrdiff_t        *frag_buf_offs;
-  const ptrdiff_t        *sb_map;
-  const oc_mb_map_plane  *mb_map;
-  const unsigned char    *map_idxs;
-  oc_mv                  *mvs;
-  int                     map_nidxs;
-  ogg_int64_t             mask;
-  unsigned                uncoded_ssd;
-  int                     mapii;
-  int                     mapi;
-  int                     pli;
-  int                     bi;
-  ptrdiff_t               fragi;
-  ptrdiff_t               frag_offs;
-  int                     borderi;
-  int                     pi;
+  const unsigned char   *src;
+  const unsigned char   *ref;
+  int                    ystride;
+  const oc_fragment     *frags;
+  const ptrdiff_t       *frag_buf_offs;
+  const ptrdiff_t       *sb_map;
+  const oc_mb_map_plane *mb_map;
+  const unsigned char   *map_idxs;
+  oc_mv                 *mvs;
+  int                    map_nidxs;
+  unsigned               uncoded_ssd;
+  int                    mapii;
+  int                    mapi;
+  int                    pli;
+  int                    bi;
+  ptrdiff_t              fragi;
+  ptrdiff_t              frag_offs;
+  int                    borderi;
   src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
   ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
   ystride=_enc->state.ref_ystride[0];
@@ -1976,21 +1933,14 @@
   mvs=_enc->mb_info[_mbi].block_mv;
   for(bi=0;bi<4;bi++){
     fragi=sb_map[bi];
-    frag_offs=frag_buf_offs[fragi];
-    oc_enc_frag_sub(_enc,buffer,src+frag_offs,ref+frag_offs,ystride);
     borderi=frags[fragi].borderi;
-    uncoded_ssd=0;
+    frag_offs=frag_buf_offs[fragi];
     if(borderi<0){
-      for(pi=0;pi<64;pi++){
-        uncoded_ssd+=buffer[pi]*buffer[pi];
-      }
+      uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride);
     }
     else{
-      ogg_int64_t mask;
-      mask=_enc->state.borders[borderi].mask;
-      for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
-        uncoded_ssd+=buffer[pi]*buffer[pi];
-      }
+      uncoded_ssd=oc_enc_frag_border_ssd(_enc,
+       src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask);
     }
     /*Scale to match DCT domain and RD.*/
     uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[bi]);
@@ -2013,20 +1963,14 @@
       mapi=map_idxs[mapii];
       bi=mapi&3;
       fragi=mb_map[pli][bi];
-      frag_offs=frag_buf_offs[fragi];
-      oc_enc_frag_sub(_enc,buffer,src+frag_offs,ref+frag_offs,ystride);
       borderi=frags[fragi].borderi;
-      uncoded_ssd=0;
+      frag_offs=frag_buf_offs[fragi];
       if(borderi<0){
-        for(pi=0;pi<64;pi++){
-          uncoded_ssd+=buffer[pi]*buffer[pi];
-        }
+        uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride);
       }
       else{
-        mask=_enc->state.borders[borderi].mask;
-        for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
-          uncoded_ssd+=buffer[pi]*buffer[pi];
-        }
+        uncoded_ssd=oc_enc_frag_border_ssd(_enc,
+         src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask);
       }
       /*Scale to match DCT domain and RD.*/
       uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[4]);

Modified: experimental/derf/theora-ptalarbvorm/lib/encfrag.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/encfrag.c	2010-06-26 14:39:59 UTC (rev 17305)
+++ experimental/derf/theora-ptalarbvorm/lib/encfrag.c	2010-06-26 18:21:33 UTC (rev 17306)
@@ -359,6 +359,47 @@
   return oc_hadamard_sad(_dc,buf);
 }
 
+unsigned oc_enc_frag_ssd(const oc_enc_ctx *_enc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  return (*_enc->opt_vtable.frag_ssd)(_src,_ref,_ystride);
+}
+
+unsigned oc_enc_frag_ssd_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  unsigned ret;
+  int      y;
+  int      x;
+  ret=0;
+  for(y=0;y<8;y++){
+    for(x=0;x<8;x++)ret+=(_src[x]-_ref[x])*(_src[x]-_ref[x]);
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+  return ret;
+}
+
+unsigned oc_enc_frag_border_ssd(const oc_enc_ctx *_enc,
+ const unsigned char *_src,const unsigned char *_ref,int _ystride,
+ ogg_int64_t _mask){
+  return (*_enc->opt_vtable.frag_border_ssd)(_src,_ref,_ystride,_mask);
+}
+
+unsigned oc_enc_frag_border_ssd_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
+  unsigned ret;
+  int      y;
+  int      x;
+  ret=0;
+  for(y=0;y<8;y++){
+    for(x=0;x<8;x++,_mask>>=1){
+      if(_mask&1)ret+=(_src[x]-_ref[x])*(_src[x]-_ref[x]);
+    }
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+  return ret;
+}
+
 void oc_enc_frag_copy2(const oc_enc_ctx *_enc,unsigned char *_dst,
  const unsigned char *_src1,const unsigned char *_src2,int _ystride){
   (*_enc->opt_vtable.frag_copy2)(_dst,_src1,_src2,_ystride);

Modified: experimental/derf/theora-ptalarbvorm/lib/encint.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/encint.h	2010-06-26 14:39:59 UTC (rev 17305)
+++ experimental/derf/theora-ptalarbvorm/lib/encint.h	2010-06-26 18:21:33 UTC (rev 17306)
@@ -29,6 +29,7 @@
 typedef oc_mv                         oc_mv2[2];
 
 typedef struct oc_enc_opt_vtable      oc_enc_opt_vtable;
+typedef struct oc_enc_opt_data        oc_enc_opt_data;
 typedef struct oc_mb_enc_info         oc_mb_enc_info;
 typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
 typedef struct oc_mode_rd             oc_mode_rd;
@@ -120,6 +121,10 @@
 
 /*Encoder specific functions with accelerated variants.*/
 struct oc_enc_opt_vtable{
+  void     (*frag_sub)(ogg_int16_t _diff[64],const unsigned char *_src,
+   const unsigned char *_ref,int _ystride);
+  void     (*frag_sub_128)(ogg_int16_t _diff[64],
+   const unsigned char *_src,int _ystride);
   unsigned (*frag_sad)(const unsigned char *_src,
    const unsigned char *_ref,int _ystride);
   unsigned (*frag_sad_thresh)(const unsigned char *_src,
@@ -133,12 +138,17 @@
    const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
   unsigned (*frag_intra_satd)(unsigned *_dc,const unsigned char *_src,
    int _ystride);
-  void     (*frag_sub)(ogg_int16_t _diff[64],const unsigned char *_src,
+  unsigned (*frag_ssd)(const unsigned char *_src,
    const unsigned char *_ref,int _ystride);
-  void     (*frag_sub_128)(ogg_int16_t _diff[64],
-   const unsigned char *_src,int _ystride);
+  unsigned (*frag_border_ssd)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
   void     (*frag_copy2)(unsigned char *_dst,
    const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+  void     (*enquant_table_init)(void *_enquant,
+   const ogg_uint16_t _dequant[64]);
+  int      (*quantize)(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+   ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
+   const void *_dc_enquant,const void *_ac_enquant);
   void     (*frag_recon_intra)(unsigned char *_dst,int _ystride,
    const ogg_int16_t _residue[64]);
   void     (*frag_recon_inter)(unsigned char *_dst,
@@ -147,6 +157,18 @@
 };
 
 
+/*Encoder specific data that varies according to which variants of the above
+   functions are used.*/
+struct oc_enc_opt_data{
+  /*The size of a single quantizer table.
+    This must be a multiple of enquant_table_alignment.*/
+  size_t               enquant_table_size;
+  /*The alignment required for the quantizer tables.
+    This must be a positive power of two.*/
+  int                  enquant_table_alignment;
+};
+
+
 void oc_enc_vtable_init(oc_enc_ctx *_enc);
 
 
@@ -429,7 +451,7 @@
   /*The quantization parameters in use.*/
   th_quant_info            qinfo;
   oc_iquant               *enquant_tables[64][3][2];
-  oc_iquant_table          enquant_table_data[64][3][2];
+  unsigned char           *enquant_table_data;
   /*An "average" quantizer for each frame type (INTRA or INTER) and qi value.
     This is used to paramterize the rate control decisions.
     They are kept in the log domain to simplify later processing.
@@ -453,6 +475,8 @@
   oc_rc_state              rc;
   /*Table for encoder acceleration functions.*/
   oc_enc_opt_vtable        opt_vtable;
+  /*Table for encoder data used by accelerated functions.*/
+  oc_enc_opt_data          opt_data;
 };
 
 
@@ -529,8 +553,19 @@
  const unsigned char *_ref2,int _ystride);
 unsigned oc_enc_frag_intra_satd(const oc_enc_ctx *_enc,unsigned *_dc,
  const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_ssd(const oc_enc_ctx *_enc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_border_ssd(const oc_enc_ctx *_enc,
+ const unsigned char *_src,const unsigned char *_ref,int _ystride,
+ ogg_int64_t _mask);
 void oc_enc_frag_copy2(const oc_enc_ctx *_enc,unsigned char *_dst,
  const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+void oc_enc_enquant_table_init(const oc_enc_ctx *_enc,void *_enquant,
+ const ogg_uint16_t _dequant[64]);
+int oc_enc_quantize(const oc_enc_ctx *_enc,
+ ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
+ const void *_dc_enquant,const void *_ac_enquant);
 void oc_enc_frag_recon_intra(const oc_enc_ctx *_enc,
  unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]);
 void oc_enc_frag_recon_inter(const oc_enc_ctx *_enc,unsigned char *_dst,
@@ -545,8 +580,6 @@
  const unsigned char *_src,const unsigned char *_ref,int _ystride);
 void oc_enc_frag_sub_128_c(ogg_int16_t _diff[64],
  const unsigned char *_src,int _ystride);
-void oc_enc_frag_copy2_c(unsigned char *_dst,
- const unsigned char *_src1,const unsigned char *_src2,int _ystride);
 unsigned oc_enc_frag_sad_c(const unsigned char *_src,
  const unsigned char *_ref,int _ystride);
 unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
@@ -560,6 +593,17 @@
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
 unsigned oc_enc_frag_intra_satd_c(unsigned *_dc,const unsigned char *_src,
  int _ystride);
+unsigned oc_enc_frag_ssd_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_border_ssd_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
+void oc_enc_frag_copy2_c(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+void oc_enc_enquant_table_init_c(void *_enquant,
+ const ogg_uint16_t _dequant[64]);
+int oc_enc_quantize_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
+ const void *_dc_enquant,const void *_ac_enquant);
 void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
 
 #endif

Modified: experimental/derf/theora-ptalarbvorm/lib/encode.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/encode.c	2010-06-26 14:39:59 UTC (rev 17305)
+++ experimental/derf/theora-ptalarbvorm/lib/encode.c	2010-06-26 18:21:33 UTC (rev 17306)
@@ -936,15 +936,21 @@
 void oc_enc_vtable_init_c(oc_enc_ctx *_enc){
   /*The implementations prefixed with oc_enc_ are encoder-specific.
     The rest we re-use from the decoder.*/
+  _enc->opt_vtable.frag_sub=oc_enc_frag_sub_c;
+  _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_c;
   _enc->opt_vtable.frag_sad=oc_enc_frag_sad_c;
   _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_c;
   _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_c;
   _enc->opt_vtable.frag_satd=oc_enc_frag_satd_c;
   _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_c;
   _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_c;
-  _enc->opt_vtable.frag_sub=oc_enc_frag_sub_c;
-  _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_c;
+  _enc->opt_vtable.frag_ssd=oc_enc_frag_ssd_c;
+  _enc->opt_vtable.frag_border_ssd=oc_enc_frag_border_ssd_c;
   _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_c;
+  _enc->opt_data.enquant_table_size=64*sizeof(oc_iquant);
+  _enc->opt_data.enquant_table_alignment=16;
+  _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_c;
+  _enc->opt_vtable.quantize=oc_enc_quantize_c;
   _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
   _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
   _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_c;
@@ -1048,6 +1054,64 @@
   return 0;
 }
 
+void oc_enc_enquant_table_init(const oc_enc_ctx *_enc,void *_enquant,
+ const ogg_uint16_t _dequant[64]){
+  (*_enc->opt_vtable.enquant_table_init)(_enquant,_dequant);
+}
+
+static void oc_enc_enquant_tables_init(oc_enc_ctx *_enc,
+ const th_quant_info *_qinfo){
+  unsigned char *etd;
+  size_t         ets;
+  int            align;
+  int            qi;
+  int            pli;
+  int            qti;
+  etd=_enc->enquant_table_data;
+  ets=_enc->opt_data.enquant_table_size;
+  align=-(etd-(unsigned char *)0)&_enc->opt_data.enquant_table_alignment-1;
+  etd+=align;
+  for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+    _enc->state.dequant_tables[qi][pli][qti]=
+     _enc->state.dequant_table_data[qi][pli][qti];
+    _enc->enquant_tables[qi][pli][qti]=etd+((qi*3+pli)*2+qti)*ets;
+  }
+  /*Initialize the dequantization tables first.*/
+  oc_dequant_tables_init(_enc->state.dequant_tables,NULL,_qinfo);
+  /*Derive the quantization tables directly from the dequantization tables.*/
+  for(qi=0;qi<64;qi++)for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
+    int plj;
+    int qtj;
+    int dupe;
+    dupe=0;
+    for(qtj=0;qtj<=qti;qtj++){
+      for(plj=0;plj<(qtj<qti?3:pli);plj++){
+        if(_enc->state.dequant_tables[qi][pli][qti]==
+         _enc->state.dequant_tables[qi][plj][qtj]){
+          dupe=1;
+          break;
+        }
+      }
+      if(dupe)break;
+    }
+    if(dupe){
+      _enc->enquant_tables[qi][pli][qti]=_enc->enquant_tables[qi][plj][qtj];
+    }
+    /*In the original VP3.2 code, the rounding offset and the size of the
+       dead zone around 0 were controlled by a "sharpness" parameter.
+      We now R-D optimize the tokens for each block after quantization,
+       so the rounding offset should always be 1/2, and an explicit dead
+       zone is unnecessary.
+      Hence, all of that VP3.2 code is gone from here, and the remaining
+       floating point code has been implemented as equivalent integer
+       code with exact precision.*/
+    else{
+      oc_enc_enquant_table_init(_enc,_enc->enquant_tables[qi][pli][qti],
+       _enc->state.dequant_tables[qi][pli][qti]);
+    }
+  }
+}
+
 /*Sets the quantization parameters to use.
   This may only be called before the setup header is written.
   If it is called multiple times, only the last call has any effect.
@@ -1057,21 +1121,12 @@
            will be used.*/
 static int oc_enc_set_quant_params(oc_enc_ctx *_enc,
  const th_quant_info *_qinfo){
-  int qi;
-  int pli;
-  int qti;
   if(_enc==NULL)return TH_EFAULT;
   if(_enc->packet_state>OC_PACKET_SETUP_HDR)return TH_EINVAL;
   if(_qinfo==NULL)_qinfo=&TH_DEF_QUANT_INFO;
   /*TODO: Analyze for packing purposes instead of just doing a shallow copy.*/
   memcpy(&_enc->qinfo,_qinfo,sizeof(_enc->qinfo));
-  for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
-    _enc->state.dequant_tables[qi][pli][qti]=
-     _enc->state.dequant_table_data[qi][pli][qti];
-    _enc->enquant_tables[qi][pli][qti]=_enc->enquant_table_data[qi][pli][qti];
-  }
-  oc_enquant_tables_init(_enc->state.dequant_tables,
-   _enc->enquant_tables,_qinfo);
+  oc_enc_enquant_tables_init(_enc,_qinfo);
   memcpy(_enc->state.loop_filter_limits,_qinfo->loop_filter_limits,
    sizeof(_enc->state.loop_filter_limits));
   oc_enquant_qavg_init(_enc->log_qavg,_enc->log_plq,_enc->chroma_rd_scale,
@@ -1134,6 +1189,9 @@
 #else
   oc_enc_vtable_init_c(_enc);
 #endif
+  _enc->enquant_table_data=(unsigned char *)_ogg_malloc(
+   64*3*2*_enc->opt_data.enquant_table_size
+   +_enc->opt_data.enquant_table_alignment-1);
   _enc->keyframe_frequency_force=1<<_enc->state.info.keyframe_granule_shift;
   _enc->state.qis[0]=_enc->state.info.quality;
   _enc->state.nqis=1;
@@ -1176,6 +1234,7 @@
   int pli;
   oc_rc_state_clear(&_enc->rc);
   oggpackB_writeclear(&_enc->opb);
+  _ogg_free(_enc->enquant_table_data);
 #if defined(OC_COLLECT_METRICS)
   /*Save the collected metrics from this run.
     Use tools/process_modedec_stats to actually generate modedec.h from the

Modified: experimental/derf/theora-ptalarbvorm/lib/enquant.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/enquant.c	2010-06-26 14:39:59 UTC (rev 17305)
+++ experimental/derf/theora-ptalarbvorm/lib/enquant.c	2010-06-26 18:21:33 UTC (rev 17306)
@@ -119,7 +119,7 @@
   }
 }
 
-static void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d){
+void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d){
   ogg_uint32_t t;
   int          l;
   _d<<=1;
@@ -129,48 +129,63 @@
   _this->l=l;
 }
 
-/*See comments at oc_dequant_tables_init() for how the quantization tables'
-   storage should be initialized.*/
-void oc_enquant_tables_init(ogg_uint16_t *_dequant[64][3][2],
- oc_iquant *_enquant[64][3][2],const th_quant_info *_qinfo){
-  int qi;
-  int pli;
-  int qti;
-  /*Initialize the dequantization tables first.*/
-  oc_dequant_tables_init(_dequant,NULL,_qinfo);
-  /*Derive the quantization tables directly from the dequantization tables.*/
-  for(qi=0;qi<64;qi++)for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
-    int zzi;
-    int plj;
-    int qtj;
-    int dupe;
-    dupe=0;
-    for(qtj=0;qtj<=qti;qtj++){
-      for(plj=0;plj<(qtj<qti?3:pli);plj++){
-        if(_dequant[qi][pli][qti]==_dequant[qi][plj][qtj]){
-          dupe=1;
-          break;
-        }
-      }
-      if(dupe)break;
+void oc_enc_enquant_table_init_c(void *_enquant,
+ const ogg_uint16_t _dequant[64]){
+  oc_iquant *enquant;
+  int        zzi;
+  enquant=(oc_iquant *)_enquant;
+  for(zzi=0;zzi<64;zzi++)oc_iquant_init(enquant+zzi,_dequant[zzi]);
+}
+
+int oc_enc_quantize(const oc_enc_ctx *_enc,
+ ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
+ const void *_dc_enquant,const void *_ac_enquant){
+  return (*_enc->opt_vtable.quantize)(_qdct,_dct,
+   _dc_dequant,_ac_dequant,_dc_enquant,_ac_enquant);
+}
+
+int oc_enc_quantize_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
+ const void *_dc_enquant,const void *_ac_enquant){
+  const oc_iquant *enquant;
+  int              nonzero;
+  int              zzi;
+  int              v;
+  int              val;
+  int              d;
+  int              s;
+  /*Quantize the DC coefficient:*/
+  enquant=(const oc_iquant *)_dc_enquant;
+  v=_dct[0];
+  val=v<<1;
+  s=OC_SIGNMASK(val);
+  val+=_dc_dequant+s^s;
+  val=((enquant[0].m*(ogg_int32_t)val>>16)+val>>enquant[0].l)-s;
+  _qdct[0]=(ogg_int16_t)OC_CLAMPI(-580,val,580);
+  nonzero=0;
+  /*Quantize the AC coefficients:*/
+  enquant=(const oc_iquant *)_ac_enquant;
+  for(zzi=1;zzi<64;zzi++){
+    v=_dct[OC_FZIG_ZAG[zzi]];
+    d=_ac_dequant[zzi];
+    val=v<<1;
+    v=abs(val);
+    if(v>=d){
+      s=OC_SIGNMASK(val);
+      /*The bias added here rounds ties away from zero, since token
+         optimization can only decrease the magnitude of the quantized
+         value.*/
+      val+=d+s^s;
+      /*Note the arithmetic right shift is not guaranteed by ANSI C.
+        Hopefully no one still uses ones-complement architectures.*/
+      val=((enquant[zzi].m*(ogg_int32_t)val>>16)+val>>enquant[zzi].l)-s;
+      _qdct[zzi]=(ogg_int16_t)OC_CLAMPI(-580,val,580);
+      nonzero=zzi;
     }
-    if(dupe){
-      _enquant[qi][pli][qti]=_enquant[qi][plj][qtj];
-      continue;
-    }
-    /*In the original VP3.2 code, the rounding offset and the size of the
-       dead zone around 0 were controlled by a "sharpness" parameter.
-      We now R-D optimize the tokens for each block after quantization,
-       so the rounding offset should always be 1/2, and an explicit dead
-       zone is unnecessary.
-      Hence, all of that VP3.2 code is gone from here, and the remaining
-       floating point code has been implemented as equivalent integer
-       code with exact precision.*/
-    for(zzi=0;zzi<64;zzi++){
-      oc_iquant_init(_enquant[qi][pli][qti]+zzi,
-       _dequant[qi][pli][qti][zzi]);
-    }
+    else _qdct[zzi]=0;
   }
+  return nonzero;
 }
 
 
@@ -226,7 +241,7 @@
    relative to the total, scaled by 2**16, for each pixel format.
   These values were measured after motion-compensated prediction, before
    quantization, over a large set of test video encoded at all possible rates.
-  TODO: These values are only from INTER frames; it should be re-measured for
+  TODO: These values are only from INTER frames; they should be re-measured for
    INTRA frames.*/
 static const ogg_uint16_t OC_PCD[4][3]={
   {59926, 3038, 2572},

Modified: experimental/derf/theora-ptalarbvorm/lib/enquant.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/enquant.h	2010-06-26 14:39:59 UTC (rev 17305)
+++ experimental/derf/theora-ptalarbvorm/lib/enquant.h	2010-06-26 18:21:33 UTC (rev 17306)
@@ -14,13 +14,10 @@
   ogg_int16_t l;
 };
 
-typedef oc_iquant        oc_iquant_table[64];
 
 
-
 void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo);
-void oc_enquant_tables_init(ogg_uint16_t *_dequant[64][3][2],
- oc_iquant *_enquant[64][3][2],const th_quant_info *_qinfo);
+void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d);
 void oc_enquant_qavg_init(ogg_int64_t _log_qavg[2][64],
  ogg_int16_t _log_plq[64][3][2],ogg_uint16_t _pl_rd_scale[2][64][2],
  ogg_uint16_t *_dequant[64][3][2],int _pixel_fmt);

Modified: experimental/derf/theora-ptalarbvorm/lib/huffdec.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/huffdec.c	2010-06-26 14:39:59 UTC (rev 17305)
+++ experimental/derf/theora-ptalarbvorm/lib/huffdec.c	2010-06-26 18:21:33 UTC (rev 17306)
@@ -491,17 +491,20 @@
   for(;;){
     n=_tree[node];
     if(n>available){
-      for(;;){
+      int shift;
+      shift=OC_PB_WINDOW_SIZE-8-available;
+      do{
         /*We don't bother setting eof because we won't check for it after we've
            started decoding DCT tokens.*/
         if(ptr>=stop){
           available=OC_LOTS_OF_BITS;
           break;
         }
-        if(available>OC_PB_WINDOW_SIZE-8)break;
         available+=8;
-        window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
+        window|=(oc_pb_window)*ptr++<<shift;
+        shift-=8;
       }
+      while(shift>=0);
       /*Note: We never request more than 24 bits, so there's no need to fill in
          the last partial byte here.*/
     }

Modified: experimental/derf/theora-ptalarbvorm/lib/tokenize.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/tokenize.c	2010-06-26 14:39:59 UTC (rev 17305)
+++ experimental/derf/theora-ptalarbvorm/lib/tokenize.c	2010-06-26 18:21:33 UTC (rev 17306)
@@ -49,7 +49,7 @@
   return (0x20820C41U>>_token*5&0x1F)+_eb;
 }
 
-/*TODO: This is now only used during DCT tokenization, and never for runs; it
+/*TODO: This is now only used during DC tokenization, and never for runs; it
    should be simplified.*/
 static int oc_make_dct_token_full(int _zzi,int _zzj,int _val,int *_eb){
   int neg;

Modified: experimental/derf/theora-ptalarbvorm/lib/x86/sse2encfrag.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/sse2encfrag.c	2010-06-26 14:39:59 UTC (rev 17305)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/sse2encfrag.c	2010-06-26 18:21:33 UTC (rev 17306)
@@ -20,6 +20,156 @@
 
 #if defined(OC_X86_ASM)
 
+/*Load a 4x8 array of pixel values from %[src] and %[ref] and compute their
+   16-bit differences.
+  On output, these are stored in _m0, xmm1, xmm2, and xmm3.
+  xmm4 and xmm5 are clobbered.*/
+#define OC_LOAD_SUB_4x8(_m0) \
+ "#OC_LOAD_SUB_4x8\n\t" \
+ /*Load the first three rows.*/ \
+ "movq (%[src]),"_m0"\n\t" \
+ "movq (%[ref]),%%xmm4\n\t" \
+ "movq (%[src],%[ystride]),%%xmm1\n\t" \
+ "movq (%[ref],%[ystride]),%%xmm3\n\t" \
+ "movq (%[src],%[ystride],2),%%xmm2\n\t" \
+ "movq (%[ref],%[ystride],2),%%xmm5\n\t" \
+ /*Unpack and subtract.*/ \
+ "punpcklbw %%xmm4,"_m0"\n\t" \
+ "punpcklbw %%xmm4,%%xmm4\n\t" \
+ "punpcklbw %%xmm3,%%xmm1\n\t" \
+ "punpcklbw %%xmm3,%%xmm3\n\t" \
+ "psubw %%xmm4,"_m0"\n\t" \
+ "psubw %%xmm3,%%xmm1\n\t" \
+ /*Load the last row.*/ \
+ "movq (%[src],%[ystride3]),%%xmm3\n\t" \
+ "movq (%[ref],%[ystride3]),%%xmm4\n\t" \
+ /*Unpack, subtract, and advance the pointers.*/ \
+ "punpcklbw %%xmm5,%%xmm2\n\t" \
+ "punpcklbw %%xmm5,%%xmm5\n\t" \
+ "lea (%[src],%[ystride],4),%[src]\n\t" \
+ "psubw %%xmm5,%%xmm2\n\t" \
+ "punpcklbw %%xmm4,%%xmm3\n\t" \
+ "punpcklbw %%xmm4,%%xmm4\n\t" \
+ "lea (%[ref],%[ystride],4),%[ref]\n\t" \
+ "psubw %%xmm4,%%xmm3\n\t" \
+
+/*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3.
+  On output, _m0 contains the sum of two of the rows, and the other two are
+   added to xmm7.*/
+#define OC_SSD_4x8(_m0) \
+ "pmaddwd "_m0","_m0"\n\t" \
+ "pmaddwd %%xmm1,%%xmm1\n\t" \
+ "pmaddwd %%xmm2,%%xmm2\n\t" \
+ "pmaddwd %%xmm3,%%xmm3\n\t" \
+ "paddd %%xmm1,"_m0"\n\t" \
+ "paddd %%xmm3,%%xmm2\n\t" \
+ "paddd %%xmm2,%%xmm7\n\t" \
+
+unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  unsigned ret; /*Sum of squared differences over the 8x8 block.*/
+  __asm__ __volatile__(
+    OC_LOAD_SUB_4x8("%%xmm7") /*Rows 0...3; advances %[src] and %[ref].*/
+    OC_SSD_4x8("%%xmm7")
+    OC_LOAD_SUB_4x8("%%xmm0") /*Rows 4...7; advances %[src] and %[ref].*/
+    OC_SSD_4x8("%%xmm0")
+    "paddd %%xmm0,%%xmm7\n\t"
+    "movdqa %%xmm7,%%xmm6\n\t" /*Fold the four 32-bit partial sums into one.*/
+    "punpckhqdq %%xmm7,%%xmm7\n\t"
+    "paddd %%xmm6,%%xmm7\n\t"
+    "pshufd $1,%%xmm7,%%xmm6\n\t"
+    "paddd %%xmm6,%%xmm7\n\t"
+    "movd %%xmm7,%[ret]\n\t"
+    :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref) /*lea writes src and ref.*/
+    :[ystride]"r"((ptrdiff_t)_ystride),
+     [ystride3]"r"((ptrdiff_t)_ystride*3)
+  );
+  return ret;
+}
+
+static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
+  0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
+};
+
+/*Load a 2x8 array of pixel values from %[src] and %[ref] and compute their
+   16-bit differences subject to the mask in the low 16 bits of %[m].
+  %%xmm6 must contain OC_MASK_CONSTS[0...7]; %%xmm4 and %[m] are clobbered.*/
+#define OC_LOAD_SUB_MASK_2x8 \
+ "#OC_LOAD_SUB_MASK_2x8\n\t" \
+ /*Start the loads and expand the next 8 bits of the mask.*/ \
+ "shl $8,%[m]\n\t" \
+ "movq (%[src]),%%xmm0\n\t" \
+ "mov %h[m],%b[m]\n\t" \
+ "movq (%[ref]),%%xmm2\n\t" \
+ "movd %[m],%%xmm4\n\t" \
+ "shr $8,%[m]\n\t" \
+ "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
+ "mov %h[m],%b[m]\n\t" \
+ "pand %%xmm6,%%xmm4\n\t" \
+ "pcmpeqb %%xmm6,%%xmm4\n\t" \
+ /*Perform the masking.*/ \
+ "pand %%xmm4,%%xmm0\n\t" \
+ "pand %%xmm4,%%xmm2\n\t" \
+ /*Finish the loads while unpacking the first set of rows, and expand the next
+    8 bits of the mask.*/ \
+ "movd %[m],%%xmm4\n\t" \
+ "movq (%[src],%[ystride]),%%xmm1\n\t" \
+ "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
+ "movq (%[ref],%[ystride]),%%xmm3\n\t" \
+ "pand %%xmm6,%%xmm4\n\t" \
+ "punpcklbw %%xmm2,%%xmm0\n\t" \
+ "pcmpeqb %%xmm6,%%xmm4\n\t" \
+ "punpcklbw %%xmm2,%%xmm2\n\t" \
+ /*Mask and unpack the second set of rows.*/ \
+ "pand %%xmm4,%%xmm1\n\t" \
+ "pand %%xmm4,%%xmm3\n\t" \
+ "punpcklbw %%xmm3,%%xmm1\n\t" \
+ "punpcklbw %%xmm3,%%xmm3\n\t" \
+ "psubw %%xmm2,%%xmm0\n\t" \
+ "psubw %%xmm3,%%xmm1\n\t" \
+
+unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
+  ptrdiff_t ystride;
+  unsigned  ret;
+  int       i;
+  ystride=_ystride;
+  __asm__ __volatile__(
+    "pxor %%xmm7,%%xmm7\n\t"
+    "movq %[c],%%xmm6\n\t"
+    :
+    :[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
+  );
+  for(i=0;i<4;i++){
+    unsigned m;
+    m=_mask&0xFFFF;
+    _mask>>=16;
+    if(m){
+      __asm__ __volatile__(
+        OC_LOAD_SUB_MASK_2x8
+        "pmaddwd %%xmm0,%%xmm0\n\t"
+        "pmaddwd %%xmm1,%%xmm1\n\t"
+        "paddd %%xmm0,%%xmm7\n\t"
+        "paddd %%xmm1,%%xmm7\n\t"
+        :[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m)
+      );
+    }
+    _src+=2*ystride;
+    _ref+=2*ystride;
+  }
+  __asm__ __volatile__(
+    "movdqa %%xmm7,%%xmm6\n\t"
+    "punpckhqdq %%xmm7,%%xmm7\n\t"
+    "paddd %%xmm6,%%xmm7\n\t"
+    "pshufd $1,%%xmm7,%%xmm6\n\t"
+    "paddd %%xmm6,%%xmm7\n\t"
+    "movd %%xmm7,%[ret]\n\t"
+    :[ret]"=a"(ret)
+  );
+  return ret;
+}
+
+
 /*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their
    16-bit difference in %%xmm0...%%xmm7.*/
 #define OC_LOAD_SUB_8x8 \

Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c	2010-06-26 14:39:59 UTC (rev 17305)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c	2010-06-26 18:21:33 UTC (rev 17306)
@@ -44,9 +44,15 @@
 # if defined(OC_X86_64_ASM)
     _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;
 # endif
+    _enc->opt_vtable.frag_ssd=oc_enc_frag_ssd_sse2;
+    _enc->opt_vtable.frag_border_ssd=oc_enc_frag_border_ssd_sse2;
     _enc->opt_vtable.frag_satd=oc_enc_frag_satd_sse2;
     _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_sse2;
     _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_sse2;
+    _enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
+    _enc->opt_data.enquant_table_alignment=16;
+    _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_x86;
+    _enc->opt_vtable.quantize=oc_enc_quantize_sse2;
   }
 }
 #endif

Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.h	2010-06-26 14:39:59 UTC (rev 17305)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.h	2010-06-26 18:21:33 UTC (rev 17306)
@@ -22,6 +22,14 @@
 
 void oc_enc_vtable_init_x86(oc_enc_ctx *_enc);
 
+void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
+ const unsigned char *_x,const unsigned char *_y,int _stride);
+void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
+ const unsigned char *_x,int _stride);
+unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
 unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
  const unsigned char *_ref,int _ystride);
 unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
@@ -31,25 +39,27 @@
  unsigned _thresh);
 unsigned oc_enc_frag_satd_mmxext(unsigned *_dc,const unsigned char *_src,
  const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_satd_sse2(unsigned *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
 unsigned oc_enc_frag_satd2_mmxext(unsigned *_dc,const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+unsigned oc_enc_frag_satd2_sse2(unsigned *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+unsigned oc_enc_frag_intra_satd_sse2(unsigned *_dc,
+ const unsigned char *_src,int _ystride);
 unsigned oc_enc_frag_intra_satd_mmxext(unsigned *_dc,
  const unsigned char *_src,int _ystride);
-void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
- const unsigned char *_x,const unsigned char *_y,int _stride);
-void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
- const unsigned char *_x,int _stride);
+void oc_enc_enquant_table_init_x86(void *_enquant,
+ const ogg_uint16_t _dequant[64]);
+int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
+ const void *_dc_enquant,const void *_ac_enquant);
 void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src1,const unsigned char *_src2,int _src_ystride);
 void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
  const unsigned char *_src1,const unsigned char *_src2,int _ystride);
 void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
-unsigned oc_enc_frag_satd_sse2(unsigned *_dc,const unsigned char *_src,
- const unsigned char *_ref,int _ystride);
-unsigned oc_enc_frag_satd2_sse2(unsigned *_dc,const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
-unsigned oc_enc_frag_intra_satd_sse2(unsigned *_dc,
- const unsigned char *_src,int _ystride);
+
 # if defined(OC_X86_64_ASM)
 void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
 # endif

Added: experimental/derf/theora-ptalarbvorm/lib/x86/x86enquant.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86enquant.c	                        (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86enquant.c	2010-06-26 18:21:33 UTC (rev 17306)
@@ -0,0 +1,286 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxstate.c 17247 2010-05-28 05:35:32Z tterribe $
+
+ ********************************************************************/
+
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+
+
+/*The maximum quantized coefficient value.*/
+static const ogg_uint16_t __attribute__((aligned(16))) OC_COEFF_MAX_SSE2[8]={
+  580,580,580,580,580,580,580,580
+};
+
+
+
+/*The default enquant table is not quite suitable for SIMD purposes.
+  First, the m and l parameters need to be separated so that an entire row full
+   of m's or l's can be loaded at a time.
+  Second, x86 SIMD has no element-wise arithmetic right-shift, so we have to
+   emulate one with a multiply.
+  Therefore we translate the shift count into a scale factor.*/
+void oc_enc_enquant_table_init_x86(void *_enquant,
+ const ogg_uint16_t _dequant[64]){
+  ogg_int16_t *m;
+  ogg_int16_t *l;
+  int          zzi;
+  m=(ogg_int16_t *)_enquant;
+  l=m+64;
+  for(zzi=0;zzi<64;zzi++){
+    oc_iquant q;
+    oc_iquant_init(&q,_dequant[zzi]);
+    m[zzi]=q.m;
+    /*q.l must be at least 2 for this to work; fortunately, once all the scale
+       factors are baked in, the minimum quantizer is much larger.*/
+    l[zzi]=1<<16-q.l;
+  }
+}
+
+/*Convert DCT coefficients in %[dct] from natural order into zig-zag scan order
+   and store them in %[qdct].
+  The index of each output element in the original 64-element array should be
+   the following 8x8 array (the letters indicate the order we compute each
+   4-tuple below):
+    A  0  1  8 16   9  2  3 10 B
+    D 17 24 32 25  18 11  4  5 C
+    E 12 19 26 33  40 48 41 34 I
+    H 27 20 13  6   7 14 21 28 G
+    K 35 42 49 56  57 50 43 36 J
+    F 29 22 15 23  30 37 44 51 M
+    P 58 59 52 45  38 31 39 46 L
+    N 53 60 61 54  47 55 62 63 O
+  The order of the coefficients within each tuple is reversed in the comments
+   below to reflect the usual MSB to LSB ordering.*/
+#define OC_ZIG_ZAG_MMXEXT \
+  "movq 0x00(%[dct]),%%mm0\n\t"  /*mm0=03 02 01 00*/ \
+  "movq 0x08(%[dct]),%%mm1\n\t"  /*mm1=07 06 05 04*/ \
+  "movq 0x10(%[dct]),%%mm2\n\t"  /*mm2=11 10 09 08*/ \
+  "movq 0x20(%[dct]),%%mm3\n\t"  /*mm3=19 18 17 16*/ \
+  "movq 0x30(%[dct]),%%mm4\n\t"  /*mm4=27 26 25 24*/ \
+  "movq 0x40(%[dct]),%%mm5\n\t"  /*mm5=35 34 33 32*/ \
+  "movq %%mm2,%%mm7\n\t"         /*mm7=11 10 09 08*/ \
+  "punpcklwd %%mm3,%%mm2\n\t"    /*mm2=17 09 16 08*/ \
+  "movq %%mm0,%%mm6\n\t"         /*mm6=03 02 01 00*/ \
+  "punpckldq %%mm2,%%mm0\n\t"    /*mm0=16 08 01 00 *A*/ \
+  "movq %%mm0,0x00(%[qdct])\n\t" \
+  "movq 0x18(%[dct]),%%mm0\n\t"  /*mm0=15 14 13 12*/ \
+  "punpckhdq %%mm6,%%mm6\n\t"    /*mm6=03 02 03 02*/ \
+  "psrlq $16,%%mm7\n\t"          /*mm7=.. 11 10 09*/ \
+  "punpckldq %%mm7,%%mm6\n\t"    /*mm6=10 09 03 02*/ \
+  "punpckhwd %%mm7,%%mm3\n\t"    /*mm3=.. 19 11 18*/ \
+  "pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
+  "movq %%mm6,0x08(%[qdct])\n\t" \
+  "psrlq $48,%%mm2\n\t"          /*mm2=.. .. .. 17*/ \
+  "movq %%mm1,%%mm6\n\t"         /*mm6=07 06 05 04*/ \
+  "punpcklwd %%mm5,%%mm2\n\t"    /*mm2=33 .. 32 17*/ \
+  "movq %%mm3,%%mm7\n\t"         /*mm7=.. 19 11 18*/ \
+  "punpckldq %%mm1,%%mm3\n\t"    /*mm3=05 04 11 18 *C*/ \
+  "por %%mm2,%%mm7\n\t"          /*mm7=33 19 ?? ??*/ \
+  "punpcklwd %%mm4,%%mm2\n\t"    /*mm2=25 32 24 17 *D**/ \
+  "movq %%mm2,0x10(%[qdct])\n\t" \
+  "movq %%mm3,0x18(%[qdct])\n\t" \
+  "movq 0x28(%[dct]),%%mm2\n\t"  /*mm2=23 22 21 20*/ \
+  "movq 0x38(%[dct]),%%mm1\n\t"  /*mm1=31 30 29 28*/ \
+  "pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
+  "punpckhdq %%mm7,%%mm7\n\t"    /*mm7=33 19 33 19*/ \
+  "punpckhwd %%mm3,%%mm6\n\t"    /*mm6=14 07 13 06*/ \
+  "punpckldq %%mm0,%%mm0\n\t"    /*mm0=13 12 13 12*/ \
+  "punpcklwd %%mm1,%%mm3\n\t"    /*mm3=29 15 28 12*/ \
+  "punpckhwd %%mm4,%%mm0\n\t"    /*mm0=27 13 26 12*/ \
+  "pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
+  "psrlq $48,%%mm4\n\t"          /*mm4=.. .. .. 27*/ \
+  "punpcklwd %%mm7,%%mm0\n\t"    /*mm0=33 26 19 12 *E*/ \
+  "punpcklwd %%mm1,%%mm4\n\t"    /*mm4=29 .. 28 27*/ \
+  "punpckhwd %%mm2,%%mm3\n\t"    /*mm3=23 15 22 29 *F*/ \
+  "movq %%mm0,0x20(%[qdct])\n\t" \
+  "movq %%mm3,0x50(%[qdct])\n\t" \
+  "movq 0x60(%[dct]),%%mm3\n\t"  /*mm3=51 50 49 48*/ \
+  "movq 0x70(%[dct]),%%mm7\n\t"  /*mm7=59 58 57 56*/ \
+  "movq 0x50(%[dct]),%%mm0\n\t"  /*mm0=43 42 41 40*/ \
+  "punpcklwd %%mm4,%%mm2\n\t"    /*mm2=28 21 27 20*/ \
+  "psrlq $32,%%mm5\n\t"          /*mm5=.. .. 35 34*/ \
+  "movq %%mm2,%%mm4\n\t"         /*mm4=28 21 27 20*/ \
+  "punpckldq %%mm6,%%mm2\n\t"    /*mm2=13 06 27 20*/ \
+  "punpckhdq %%mm4,%%mm6\n\t"    /*mm6=28 21 14 07 *G*/ \
+  "movq %%mm3,%%mm4\n\t"         /*mm4=51 50 49 48*/ \
+  "pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
+  "movq %%mm2,0x30(%[qdct])\n\t" \
+  "movq %%mm6,0x38(%[qdct])\n\t" \
+  "movq 0x48(%[dct]),%%mm2\n\t"  /*mm2=39 38 37 36*/ \
+  "punpcklwd %%mm5,%%mm4\n\t"    /*mm4=35 49 34 48*/ \
+  "movq 0x58(%[dct]),%%mm5\n\t"  /*mm5=47 46 45 44*/ \
+  "punpckldq %%mm7,%%mm6\n\t"    /*mm6=57 56 14 07*/ \
+  "psrlq $32,%%mm3\n\t"          /*mm3=.. .. 51 50*/ \
+  "punpckhwd %%mm0,%%mm6\n\t"    /*mm6=43 57 42 56*/ \
+  "punpcklwd %%mm4,%%mm0\n\t"    /*mm0=34 41 48 40 *I*/ \
+  "pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
+  "movq %%mm0,0x28(%[qdct])\n\t" \
+  "punpcklwd %%mm2,%%mm3\n\t"    /*mm3=37 51 36 50*/ \
+  "punpckhwd %%mm6,%%mm4\n\t"    /*mm4=42 35 56 49*/ \
+  "punpcklwd %%mm3,%%mm6\n\t"    /*mm6=36 43 50 57 *J*/ \
+  "pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
+  "movq %%mm4,0x40(%[qdct])\n\t" \
+  "movq %%mm6,0x48(%[qdct])\n\t" \
+  "movq 0x68(%[dct]),%%mm6\n\t"  /*mm6=55 54 53 52*/ \
+  "movq 0x78(%[dct]),%%mm0\n\t"  /*mm0=63 62 61 60*/ \
+  "psrlq $32,%%mm1\n\t"          /*mm1=.. .. 31 30*/ \
+  "pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
+  "pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
+  "punpcklwd %%mm5,%%mm1\n\t"    /*mm1=46 31 44 30*/ \
+  "pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
+  "punpckhwd %%mm1,%%mm2\n\t"    /*mm2=46 39 31 38 *L*/ \
+  "punpcklwd %%mm3,%%mm1\n\t"    /*mm1=51 44 37 30 *M*/ \
+  "movq %%mm2,0x68(%[qdct])\n\t" \
+  "movq %%mm1,0x58(%[qdct])\n\t" \
+  "punpckhwd %%mm6,%%mm5\n\t"    /*mm5=55 47 52 45*/ \
+  "punpckldq %%mm0,%%mm6\n\t"    /*mm6=61 60 54 53*/ \
+  "pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
+  "pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=53 60 61 54 *N*/ \
+  "punpckhdq %%mm0,%%mm5\n\t"    /*mm5=63 62 55 47 *O*/ \
+  "punpckhdq %%mm4,%%mm7\n\t"    /*mm7=45 52 59 58 *P*/ \
+  "movq %%mm6,0x70(%[qdct])\n\t" \
+  "movq %%mm5,0x78(%[qdct])\n\t" \
+  "movq %%mm7,0x60(%[qdct])\n\t" \
+
+int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
+ const void *_dc_enquant,const void *_ac_enquant){
+  ptrdiff_t r;
+  /*Load the first rows of the quantizer data and inject the DC terms.
+    We do this early to reduce general-purpose register pressure and because
+     pinsrw has a very long latency.*/
+  __asm__ __volatile__(
+    "movdqa 0x00(%[dq]),%%xmm2\n\t"
+    "movdqa 0x00(%[q]),%%xmm4\n\t"
+    "movdqa 0x80(%[q]),%%xmm5\n\t"
+    "pinsrw $0,%k[dc_dq],%%xmm2\n\t"
+    "pinsrw $0,0x00(%[dc_q]),%%xmm4\n\t"
+    "pinsrw $0,0x80(%[dc_q]),%%xmm5\n\t"
+    :[dq]"+r"(_ac_dequant),[dc_q]"+r"(_dc_enquant),[q]"+r"(_ac_enquant)
+    :[dc_dq]"r"(_dc_dequant)
+  );
+  __asm__ __volatile__(
+    /*Put the input in zig-zag order.*/
+    OC_ZIG_ZAG_MMXEXT
+    /*Loading the first two rows of data and the second dequant row.*/
+    "movdqa 0x00(%[qdct]),%%xmm0\n\t"
+    "movdqa 0x10(%[qdct]),%%xmm1\n\t"
+    "movdqa 0x10(%[dq]),%%xmm3\n\t"
+    "mov $-0x60,%[r]\n\t"
+    /*Loop through two rows at a time.*/
+    ".p2align 4\n\t"
+    "0:\n\t"
+    /*Double the input and propagate its sign to the rounding factor.
+      Using SSSE3's psignw would help here, but we need the mask later anyway.*/
+    "movdqa %%xmm0,%%xmm6\n\t"
+    "psraw $15,%%xmm0\n\t"
+    "movdqa %%xmm1,%%xmm7\n\t"
+    "paddw %%xmm6,%%xmm6\n\t"
+    "psraw $15,%%xmm1\n\t"
+    "paddw %%xmm7,%%xmm7\n\t"
+    "paddw %%xmm0,%%xmm2\n\t"
+    "paddw %%xmm1,%%xmm3\n\t"
+    "pxor %%xmm0,%%xmm2\n\t"
+    "pxor %%xmm1,%%xmm3\n\t"
+    /*Add the rounding factor and perform the first multiply.*/
+    "paddw %%xmm2,%%xmm6\n\t"
+    "movdqa 0x70(%[q],%[r]),%%xmm2\n\t"
+    "paddw %%xmm3,%%xmm7\n\t"
+    "movdqa 0xF0(%[q],%[r]),%%xmm3\n\t"
+    "pmulhw %%xmm6,%%xmm4\n\t"
+    "pmulhw %%xmm7,%%xmm2\n\t"
+    "paddw %%xmm4,%%xmm6\n\t"
+    "paddw %%xmm2,%%xmm7\n\t"
+    /*Emulate an element-wise right-shift via a second multiply.*/
+    "pmulhw %%xmm5,%%xmm6\n\t"
+    "pmulhw %%xmm3,%%xmm7\n\t"
+    /*Load the bounds for the clamp operation.
+      It would be nice to keep these around across iterations, but there aren't
+       enough registers, and it's not like we're doing anything else while
+       waiting for the multiplies to finish.*/
+    "movdqa %[c],%%xmm2\n\t"
+    "pxor %%xmm3,%%xmm3\n\t"
+    "add $32,%[r]\n\t"
+    "psubw %%xmm2,%%xmm3\n\t"
+    /*Correct for the sign.*/
+    "psubw %%xmm0,%%xmm6\n\t"
+    "psubw %%xmm1,%%xmm7\n\t"
+    /*Clamp into the valid range.*/
+    "pminsw %%xmm2,%%xmm6\n\t"
+    "pminsw %%xmm2,%%xmm7\n\t"
+    "pmaxsw %%xmm3,%%xmm6\n\t"
+    "pmaxsw %%xmm3,%%xmm7\n\t"
+    /*Save the result.*/
+    "movdqa %%xmm6,0x40(%[qdct],%[r])\n\t"
+    "movdqa %%xmm7,0x50(%[qdct],%[r])\n\t"
+    "jg 1f\n\t" /*Tests the flags set by add; the SIMD ops preserve them.*/
+    /*Start loading the data for the next iteration.*/
+    "movdqa 0x60(%[qdct],%[r]),%%xmm0\n\t"
+    "movdqa 0x70(%[qdct],%[r]),%%xmm1\n\t"
+    "movdqa 0x60(%[dq],%[r]),%%xmm2\n\t"
+    "movdqa 0x70(%[dq],%[r]),%%xmm3\n\t"
+    "movdqa 0x60(%[q],%[r]),%%xmm4\n\t"
+    "movdqa 0xE0(%[q],%[r]),%%xmm5\n\t"
+    "jmp 0b\n\t"
+    ".p2align 4\n\t"
+    "1:\n\t"
+    /*Now find the location of the last non-zero value.*/
+    "movdqa 0x50(%[qdct]),%%xmm5\n\t"
+    "movdqa 0x40(%[qdct]),%%xmm4\n\t"
+    "packsswb %%xmm7,%%xmm6\n\t"
+    "pxor %%xmm0,%%xmm0\n\t"
+    "mov $0xFFFFFFFF,%[dq]\n\t"
+    "packsswb %%xmm5,%%xmm4\n\t"
+    "pcmpeqb %%xmm0,%%xmm6\n\t"
+    "pcmpeqb %%xmm0,%%xmm4\n\t"
+    "pmovmskb %%xmm6,%[q]\n\t"
+    "pmovmskb %%xmm4,%[r]\n\t"
+    "shl $16,%[q]\n\t"
+    "or %[r],%[q]\n\t"
+    "mov $32,%[r]\n\t"
+    /*We have to use xor here instead of not in order to set the flags.
+      This also makes it easy to flip just the lower 32 bits on x86-64.*/
+    "xor %[dq],%[q]\n\t"
+    "jnz 2f\n\t"
+    "movdqa 0x30(%[qdct]),%%xmm7\n\t"
+    "movdqa 0x20(%[qdct]),%%xmm6\n\t"
+    "movdqa 0x10(%[qdct]),%%xmm5\n\t"
+    "movdqa 0x00(%[qdct]),%%xmm4\n\t"
+    "packsswb %%xmm7,%%xmm6\n\t"
+    "packsswb %%xmm5,%%xmm4\n\t"
+    "pcmpeqb %%xmm0,%%xmm6\n\t"
+    "pcmpeqb %%xmm0,%%xmm4\n\t"
+    "pmovmskb %%xmm6,%[q]\n\t"
+    "pmovmskb %%xmm4,%[r]\n\t"
+    "shl $16,%[q]\n\t"
+    "or %[r],%[q]\n\t"
+    "xor %[r],%[r]\n\t"
+    "xor %[dq],%[q]\n\t"
+    "or $1,%[q]\n\t"
+    "2:\n\t"
+    "bsr %[q],%[q]\n\t"
+    "add %[q],%[r]\n\t"
+    :[r]"=&a"(r),[q]"+r"(_ac_enquant),[dq]"+r"(_ac_dequant)
+    :[dct]"r"(_dct),[qdct]"r"(_qdct), /*[dq] is overwritten as scratch above.*/
+     [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_uint16_t,OC_COEFF_MAX_SSE2,8))
+    :"cc","memory"
+  );
+  return (int)r; /*Zig-zag index of the last non-zero value (0 if none).*/
+}
+
+#endif

Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h	2010-06-26 14:39:59 UTC (rev 17305)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h	2010-06-26 18:21:33 UTC (rev 17306)
@@ -43,9 +43,22 @@
    clobber all of "memory" and lets us access local buffers directly using the
    stack pointer, without allocating a separate register to point to them.*/
 #define OC_ARRAY_OPERAND(_type,_ptr,_size) \
-  (*({struct{_type array_value__[_size];} *array_addr__=(void *)_ptr; \
-   array_addr__;}))
+  (*({ \
+    struct{_type array_value__[_size];} *array_addr__=(void *)_ptr; \
+    array_addr__; \
+  }))
 
+/*Declare a read-only array operand with an exact size.
+  This tells gcc we're going to read this memory region, without having to
+   clobber all of "memory" and lets us access constant buffers directly as a
+   memory operand, without allocating a separate register to point to them.*/
+#define OC_CONST_ARRAY_OPERAND(_type,_ptr,_size) \
+  (*({ \
+    const struct{_type array_value__[_size];} *array_addr__= \
+     (const void *)_ptr; \
+    array_addr__; \
+  }))
+
 extern const short __attribute__((aligned(16))) OC_IDCT_CONSTS[64];
 
 void oc_state_vtable_init_x86(oc_theora_state *_state);