[xiph-commits] r15802 - in branches/theora-thusnelda: examples lib lib/dec lib/enc tests

Thu Mar 19 20:32:25 PDT 2009

Author: tterribe
Date: 2009-03-19 20:32:25 -0700 (Thu, 19 Mar 2009)
New Revision: 15802

Added:
   branches/theora-thusnelda/lib/enc/enquant.h
Modified:
   branches/theora-thusnelda/examples/Makefile.am
   branches/theora-thusnelda/examples/encoder_example.c
   branches/theora-thusnelda/examples/player_example.c
   branches/theora-thusnelda/lib/Makefile.am
   branches/theora-thusnelda/lib/dec/quant.c
   branches/theora-thusnelda/lib/enc/codec_internal.h
   branches/theora-thusnelda/lib/enc/dct_decode.c
   branches/theora-thusnelda/lib/enc/encode.c
   branches/theora-thusnelda/lib/enc/encoder_quant.c
   branches/theora-thusnelda/lib/enc/encoder_toplevel.c
   branches/theora-thusnelda/lib/enc/frinit.c
   branches/theora-thusnelda/lib/enc/mcenc.c
   branches/theora-thusnelda/lib/enc/mode.c
   branches/theora-thusnelda/tests/Makefile.am
Log:
Initial encoder clean-up and rate control implementation.
There is still a substantial amount of reorganization and optimization to do,
 but this appears to work well enough to trounce mainline in quality.


Modified: branches/theora-thusnelda/examples/Makefile.am
===================================================================

--- branches/theora-thusnelda/examples/Makefile.am	2009-03-19 17:10:23 UTC (rev 15801)
+++ branches/theora-thusnelda/examples/Makefile.am	2009-03-20 03:32:25 UTC (rev 15802)
@@ -10,7 +10,7 @@
 AM_CFLAGS = $(OGG_CFLAGS)
 LDADD = ../lib/libtheora.la $(OGG_LIBS)
 LDADDDEC = ../lib/libtheoradec.la $(OGG_LIBS)
-LDADDENC = ../lib/libtheoraenc.la ../lib/libtheoradec.la $(OGG_LIBS)
+LDADDENC = ../lib/libtheoraenc.la ../lib/libtheoradec.la $(OGG_LIBS) -lm
 
 dump_video_SOURCES = dump_video.c
 EXTRA_dump_video_SOURCES = getopt.c getopt1.c getopt.h

Modified: branches/theora-thusnelda/examples/encoder_example.c
===================================================================
--- branches/theora-thusnelda/examples/encoder_example.c	2009-03-19 17:10:23 UTC (rev 15801)
+++ branches/theora-thusnelda/examples/encoder_example.c	2009-03-20 03:32:25 UTC (rev 15802)
@@ -116,7 +116,7 @@
 /*The amount to read into the auxilliary buffer.*/
 size_t y4m_aux_buf_read_sz;
 
-/*The function used perform chroma conversion.*/
+/*The function used to perform chroma conversion.*/
 typedef void (*y4m_convert_func)(unsigned char *_dst,unsigned char *_aux);
 
 y4m_convert_func y4m_convert=NULL;

Modified: branches/theora-thusnelda/examples/player_example.c
===================================================================
--- branches/theora-thusnelda/examples/player_example.c	2009-03-19 17:10:23 UTC (rev 15801)
+++ branches/theora-thusnelda/examples/player_example.c	2009-03-20 03:32:25 UTC (rev 15802)
@@ -599,7 +599,7 @@
       int arg = 0xffff;
       theora_control(&td,TH_DECCTL_SET_TELEMETRY_MBMODE,&arg,sizeof(arg));
       theora_control(&td,TH_DECCTL_SET_TELEMETRY_MV,&arg,sizeof(arg));
-      }*/
+    }*/
   }else{
     /* tear down the partial theora setup */
     theora_info_clear(&ti);

Modified: branches/theora-thusnelda/lib/Makefile.am
===================================================================
--- branches/theora-thusnelda/lib/Makefile.am	2009-03-19 17:10:23 UTC (rev 15801)
+++ branches/theora-thusnelda/lib/Makefile.am	2009-03-20 03:32:25 UTC (rev 15802)
@@ -126,7 +126,7 @@
   Version_script-enc
 libtheoraenc_la_LDFLAGS = \
   -version-info @THENC_LIB_CURRENT@:@THENC_LIB_REVISION@:@THENC_LIB_AGE@ \
-  @THEORAENC_LDFLAGS@ $(OGG_LIBS)
+  @THEORAENC_LDFLAGS@ $(OGG_LIBS) -lm
 
 libtheora_la_SOURCES = \
 	$(decoder_arch_sources) \
@@ -136,7 +136,7 @@
   Version_script
 libtheora_la_LDFLAGS = \
   -version-info @TH_LIB_CURRENT@:@TH_LIB_REVISION@:@TH_LIB_AGE@ \
-  @THEORA_LDFLAGS@ @CAIRO_LIBS@ $(OGG_LIBS)
+  @THEORA_LDFLAGS@ @CAIRO_LIBS@ $(OGG_LIBS) -lm
 
 debug:
 	$(MAKE) all CFLAGS="@DEBUG@" 

Modified: branches/theora-thusnelda/lib/dec/quant.c
===================================================================
--- branches/theora-thusnelda/lib/dec/quant.c	2009-03-19 17:10:23 UTC (rev 15801)
+++ branches/theora-thusnelda/lib/dec/quant.c	2009-03-20 03:32:25 UTC (rev 15802)
@@ -39,85 +39,83 @@
    qi values change between frames (this is what VP3 did).*/
 void oc_dequant_tables_init(oc_quant_table *_dequant[2][3],
  int _pp_dc_scale[64],const th_quant_info *_qinfo){
-  /*coding mode: intra or inter.*/
+  /*Coding mode: intra or inter.*/
   int          qti;
   /*Y', C_b, C_r*/
   int          pli;
-  for(qti=0;qti<2;qti++){
-    for(pli=0;pli<3;pli++){
-      /*Quality index.*/
-      int qi;
-      /*Range iterator.*/
-      int qri;
-      for(qi=0,qri=0; qri<=_qinfo->qi_ranges[qti][pli].nranges;qri++){
-        oc_quant_table *qtables;
-        th_quant_base   base;
-        ogg_uint32_t    q;
-        int             qi_start;
-        int             qi_end;
-        int             ci;
-        qtables=_dequant[qti][pli];
-        memcpy(base,_qinfo->qi_ranges[qti][pli].base_matrices[qri],
-         sizeof(base));
-        qi_start=qi;
-        if(qri==_qinfo->qi_ranges[qti][pli].nranges)qi_end=qi+1;
-        else qi_end=qi+_qinfo->qi_ranges[qti][pli].sizes[qri];
-        /*Iterate over quality indicies in this range.*/
-        for(;;){
-          ogg_uint32_t  qfac;
-          /*In the original VP3.2 code, the rounding offset and the size of the
-             dead zone around 0 were controlled by a "sharpness" parameter.
-            The size of our dead zone is now controlled by the per-coefficient
-             quality thresholds returned by our HVS module.
-            We round down from a more accurate value when the quality of the
-             reconstruction does not fall below our threshold and it saves bits.
-            Hence, all of that VP3.2 code is gone from here, and the remaining
-             floating point code has been implemented as equivalent integer code
-             with exact precision.*/
-          qfac=(ogg_uint32_t)_qinfo->dc_scale[qi]*base[0];
-          /*For postprocessing, not dequantization.*/
-          if(_pp_dc_scale!=NULL)_pp_dc_scale[qi]=(int)(qfac/160);
-          /*Scale DC the coefficient from the proper table.*/
-          q=(qfac/100)<<2;
-          q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
-          qtables[qi][0]=(ogg_uint16_t)q;
-          /*Now scale AC coefficients from the proper table.*/
-          for(ci=1;ci<64;ci++){
-            q=((ogg_uint32_t)_qinfo->ac_scale[qi]*base[ci]/100)<<2;
-            q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
-            qtables[qi][ci]=(ogg_uint16_t)q;
-          }
-          if(++qi>=qi_end)break;
-          /*Interpolate the next base matrix.*/
-          for(ci=0;ci<64;ci++){
-            base[ci]=(unsigned char)(
-             (2*((qi_end-qi)*_qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+
-             (qi-qi_start)*_qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci])
-             +_qinfo->qi_ranges[qti][pli].sizes[qri])/
-             (2*_qinfo->qi_ranges[qti][pli].sizes[qri]));
-          }
+  for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
+    /*Quality index.*/
+    int qi;
+    /*Range iterator.*/
+    int qri;
+    for(qi=0,qri=0; qri<=_qinfo->qi_ranges[qti][pli].nranges;qri++){
+      oc_quant_table *qtables;
+      th_quant_base   base;
+      ogg_uint32_t    q;
+      int             qi_start;
+      int             qi_end;
+      int             ci;
+      qtables=_dequant[qti][pli];
+      memcpy(base,_qinfo->qi_ranges[qti][pli].base_matrices[qri],
+       sizeof(base));
+      qi_start=qi;
+      if(qri==_qinfo->qi_ranges[qti][pli].nranges)qi_end=qi+1;
+      else qi_end=qi+_qinfo->qi_ranges[qti][pli].sizes[qri];
+      /*Iterate over quality indices in this range.*/
+      for(;;){
+        ogg_uint32_t  qfac;
+        /*In the original VP3.2 code, the rounding offset and the size of the
+           dead zone around 0 were controlled by a "sharpness" parameter.
+          The size of our dead zone is now controlled by the per-coefficient
+           quality thresholds returned by our HVS module.
+          We round down from a more accurate value when the quality of the
+           reconstruction does not fall below our threshold and it saves bits.
+          Hence, all of that VP3.2 code is gone from here, and the remaining
+           floating point code has been implemented as equivalent integer code
+           with exact precision.*/
+        qfac=(ogg_uint32_t)_qinfo->dc_scale[qi]*base[0];
+        /*For postprocessing, not dequantization.*/
+        if(_pp_dc_scale!=NULL)_pp_dc_scale[qi]=(int)(qfac/160);
+        /*Scale the DC coefficient from the proper table.*/
+        q=(qfac/100)<<2;
+        q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
+        qtables[qi][0]=(ogg_uint16_t)q;
+        /*Now scale AC coefficients from the proper table.*/
+        for(ci=1;ci<64;ci++){
+          q=((ogg_uint32_t)_qinfo->ac_scale[qi]*base[ci]/100)<<2;
+          q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
+          qtables[qi][ci]=(ogg_uint16_t)q;
         }
+        if(++qi>=qi_end)break;
+        /*Interpolate the next base matrix.*/
+        for(ci=0;ci<64;ci++){
+          base[ci]=(unsigned char)(
+           (2*((qi_end-qi)*_qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+
+           (qi-qi_start)*_qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci])
+           +_qinfo->qi_ranges[qti][pli].sizes[qri])/
+           (2*_qinfo->qi_ranges[qti][pli].sizes[qri]));
+        }
       }
-      /*Staging matricies complete; commit to memory only if this isn't a
-         duplicate of a preceeding plane.
-        This simple check helps us improve cache coherency later.*/
-      {
-        int dupe;
-        int qtj;
-        int plj;
-        dupe=0;
-        for(qtj=0;qtj<=qti;qtj++){
-          for(plj=0;plj<(qtj<qti?3:pli);plj++){
-            if(!memcmp(_dequant[qti][pli],_dequant[qtj][plj],
-             sizeof(oc_quant_tables))){
-              dupe=1;
-              break;
-            }
+    }
+    /*Staging matrices complete; commit to memory only if this isn't a
+       duplicate of a preceding set of matrices.
+      This simple check helps us improve cache coherency later.*/
+    {
+      int dupe;
+      int qtj;
+      int plj;
+      dupe=0;
+      for(qtj=0;qtj<=qti;qtj++){
+        for(plj=0;plj<(qtj<qti?3:pli);plj++){
+          if(!memcmp(_dequant[qti][pli],_dequant[qtj][plj],
+           sizeof(oc_quant_tables))){
+            dupe=1;
+            break;
           }
-          if(dupe)break;
         }
-        if(dupe)_dequant[qti][pli]=_dequant[qtj][plj];
+        if(dupe)break;
       }
+      if(dupe)_dequant[qti][pli]=_dequant[qtj][plj];
     }
   }
 }

Modified: branches/theora-thusnelda/lib/enc/codec_internal.h
===================================================================
--- branches/theora-thusnelda/lib/enc/codec_internal.h	2009-03-19 17:10:23 UTC (rev 15801)
+++ branches/theora-thusnelda/lib/enc/codec_internal.h	2009-03-20 03:32:25 UTC (rev 15802)
@@ -26,6 +26,7 @@
 
 #include "theora/theora.h"
 #include "encoder_huffman.h"
+#include "../dec/ocintrin.h"
 typedef struct CP_INSTANCE CP_INSTANCE;
 #include "dsp.h"
 
@@ -100,18 +101,15 @@
   ogg_uint32_t       Frequency;
 } HUFF_ENTRY;
 
-typedef struct{
-  ogg_int32_t   x;
-  ogg_int32_t   y;
-} mv_t;
+typedef struct mc_state mc_state;
 
-typedef struct {
-  mv_t               candidates[12];
+struct mc_state{
+  int                candidates[12][2];
   int                setb0;
   int                ncandidates;
   ogg_int32_t        mvapw1[2];
   ogg_int32_t        mvapw2[2];
-} mc_state;
+};
 
 typedef struct macroblock {
   /* the blocks comprising this macroblock */
@@ -120,17 +118,21 @@
   int ysb;
   int usb;
   int vsb;
-    
-  int cneighbors[4];      
+
+  int cneighbors[4];
   int ncneighbors;
   int pneighbors[4];
   int npneighbors; 
 
   coding_mode_t mode;
 
+  oc_mv block_mv[4];
+  oc_mv ref_mv[4];
   /* per-block final motion vectors */
   /* raster order */
-  mv_t mv[4];
+  oc_mv mv[4];
+  /*Per-block final chroma motion vectors.*/
+  oc_mv cbmvs[4];
 
   /* Motion vectors for a macro block for the current frame and the
      previous two frames.
@@ -140,12 +142,14 @@
      and constant acceleration.
 
      Uninitialized MVs are (0,0).*/
-  mv_t   analysis_mv[3][2]; /* [cur,prev,prev2][frame,golden] */
+  oc_mv analysis_mv[3][2]; /* [cur,prev,prev2][frame,golden] */
+  oc_mv unref_mv[2];
   /*Minimum motion estimation error from the analysis stage.*/
   int    aerror;
   int    gerror;
 
   char coded;
+  char refined;
 } macroblock_t;
 
 #define SB_MB_BLFRAG(sb,mbnum) ((sb).f[ ((mbnum)<2? ((mbnum)==0?0:4) : ((mbnum)==2?8:14)) ])
@@ -157,11 +161,11 @@
 typedef ogg_int16_t    quant_table[64]; 
 typedef quant_table    quant_tables[64]; /* [zigzag][qi] */
 
-typedef ogg_int32_t    iquant_table[64];  
-typedef iquant_table   iquant_tables[64]; /* [qi][coeff] */
+#include "enquant.h"
 
-typedef struct {
-  const unsigned char *mode_bits[8];
+typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
+
+struct oc_mode_scheme_chooser{
   /*Pointers to the a list containing the index of each mode in the mode
     alphabet used by each scheme.
     The first entry points to the dynamic scheme0_ranks, while the remaining
@@ -179,8 +183,22 @@
   unsigned char        scheme_list[8];
   /*The number of bits used by each mode coding scheme.*/
   int                  scheme_bits[8];
-} oc_mode_scheme_chooser;
+};
 
+void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser);
+
+typedef struct oc_rc_state oc_rc_state;
+
+struct oc_rc_state{
+  ogg_int64_t bits_per_frame;
+  ogg_int64_t fullness;
+  ogg_int64_t target;
+  ogg_int64_t max;
+  unsigned    exp[2];
+  unsigned    scale[2];
+  int         buf_delay;
+};
+
 /* Encoder (Compressor) instance -- installed in a theora_state */
 struct CP_INSTANCE {
   /*This structure must be first.
@@ -241,10 +259,8 @@
   ogg_uint32_t     dc_bits[2][DC_HUFF_CHOICES];
   ogg_uint32_t     ac1_bits[2][AC_HUFF_CHOICES];
   ogg_uint32_t     acN_bits[2][AC_HUFF_CHOICES];
-
   ogg_uint32_t     MVBits_0; /* count of bits used by MV coding mode 0 */
   ogg_uint32_t     MVBits_1; /* count of bits used by MV coding mode 1 */
-
   oc_mode_scheme_chooser chooser;
 
   /*********************************************************************/
@@ -295,6 +311,7 @@
   int              keyframe_granule_shift;
   int              lambda;
   int              BaseQ;
+  int              MinQ;
   int              GoldenFrameEnabled;
   int              InterPrediction;
   int              MotionCompensation;
@@ -308,9 +325,23 @@
 
   th_quant_info    quant_info;
   quant_tables     quant_tables[2][3];
-  iquant_tables    iquant_tables[2][3];
-
-
+  oc_iquant_tables iquant_tables[2][3];
+  /*An "average" quantizer for each quantizer type (INTRA or INTER) and QI
+     value.
+    This is used to parameterize the rate control decisions.
+    It is scaled by a factor of 8, which is necessary to gain sufficient
+     resolution to distinguish the original VP3 quantizers at the low end (even
+     then some INTRA quantizers are indistinguishable, but they really _are_
+     essentially the same, which is an unfortunate effect of VP3 a) using the
+     same DC scale for many QI values and b) lopping off the two fractional
+     bits of quantizer precision for essentially no reason and then spacing its
+     AC scale factors very closely).
+    Keep in mind these are in the DCT domain, and so are scaled by an
+     additional factor of 4 from the pixel domain, for a total scale factor of
+     32.*/
+  ogg_uint16_t     qavg[2][64];
+  /*The buffer state used to drive rate control.*/
+  oc_rc_state      rc;
   DspFunctions     dsp;  /* Selected functions for this platform */
 
 };
@@ -353,7 +384,6 @@
 extern void dct_tokenize_finish (CP_INSTANCE *cpi);
 extern void dct_tokenize_mark_ac_chroma (CP_INSTANCE *cpi);
 
-extern void WriteQTables(CP_INSTANCE *cpi,oggpack_buffer *opb);
 extern void InitQTables( CP_INSTANCE *cpi );
 extern void InitHuffmanSet( CP_INSTANCE *cpi );
 extern void ClearHuffmanSet( CP_INSTANCE *cpi );
@@ -371,7 +401,7 @@
 			    mc_state *_mcenc,
 			    int _mbi,
 			    int _goldenp,
-			    mv_t *_bmvs,
+			    oc_mv _bmvs[4],
 			    int *best_err,
 			    int best_block_err[4]);
 

Modified: branches/theora-thusnelda/lib/enc/dct_decode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/dct_decode.c	2009-03-19 17:10:23 UTC (rev 15801)
+++ branches/theora-thusnelda/lib/enc/dct_decode.c	2009-03-20 03:32:25 UTC (rev 15802)
@@ -216,21 +216,19 @@
 }
 
 void ReconRefFrames (CP_INSTANCE *cpi){
-  unsigned char *temp = cpi->lastrecon;
-
-  /* swap */
+  unsigned char *temp;
+  /*Swap.*/
+  temp=cpi->lastrecon;
   cpi->lastrecon=cpi->recon;
   cpi->recon=temp;
-
   /* Apply a loop filter to edge pixels of updated blocks */
   dsp_LoopFilter(cpi->dsp, cpi, cpi->quant_info.loop_filter_limits[cpi->BaseQ] /* temp */);
-
   /* We may need to update the UMV border */
   UpdateUMVBorder(cpi, cpi->lastrecon);
-  
-  if ( cpi->FrameType == KEY_FRAME )
-    memcpy(cpi->golden,cpi->lastrecon,sizeof(*cpi->lastrecon)*cpi->frame_size);
-
+  /*Swap back.*/
+  temp=cpi->lastrecon;
+  cpi->lastrecon=cpi->recon;
+  cpi->recon=temp;
 }
 
 void dsp_dct_decode_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)

Modified: branches/theora-thusnelda/lib/enc/encode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encode.c	2009-03-19 17:10:23 UTC (rev 15801)
+++ branches/theora-thusnelda/lib/enc/encode.c	2009-03-20 03:32:25 UTC (rev 15802)
@@ -21,18 +21,18 @@
 #include "encoder_lookup.h"
 
 static int predict_frag(int wpc,
-			ogg_int16_t *dc,
-			ogg_int16_t *down,
-			int *last){
-  
+                        ogg_int16_t *dc,
+                        ogg_int16_t *down,
+                        int *last){
+
   if(wpc){
     ogg_int16_t DC = 0;
-    
+
     if(wpc&0x1) DC += pc[wpc][0]* *(dc-1);
     if(wpc&0x2) DC += pc[wpc][1]* *(down-1);
     if(wpc&0x4) DC += pc[wpc][2]* *(down);
     if(wpc&0x8) DC += pc[wpc][3]* *(down+1);
-    
+
     /* if we need to do a shift */
     if(pc[wpc][4]) {
       /* If negative add in the negative correction factor */
@@ -40,18 +40,18 @@
       /* Shift in lieu of a divide */
       DC >>= pc[wpc][4];
     }
-    
+
     /* check for outranging on the two predictors that can outrange */
     if((wpc&(PU|PUL|PL)) == (PU|PUL|PL)){
       if( abs(DC - *down) > 128) {
-	DC = *down;
+        DC = *down;
       } else if( abs(DC - *(dc-1)) > 128) {
-	DC = *(dc-1);
+        DC = *(dc-1);
       } else if( abs(DC - *(down-1)) > 128) {
-	DC = *(down-1);
+        DC = *(down-1);
       }
     }
-    
+
     *last = *dc;
     return *dc - DC;
   }else{
@@ -86,21 +86,20 @@
       memcpy(dc,cpi->frag_dc+fi,sizeof(dc));
 
       for (x=0; x<h; x++, fi++) {
-	if(cp[fi]) {
-	  int wpc=0;
-	  int wf = Mode2Frame[mb_row[x>>subh].mode];
-	  
-	  if(x>0){ 
-	    if(cp[fi-1] && Mode2Frame[mb_row[(x-1)>>subh].mode] == wf) wpc|=1; /* left */
-	    if(y>0 && cp[fi-h-1] && Mode2Frame[mb_down[(x-1)>>subh].mode] == wf) wpc|=2; /* down left */
-	  }
-	  if(y>0){
-	    if(cp[fi-h] && Mode2Frame[mb_down[x>>subh].mode] == wf) wpc|=4; /* down */
-	    if(x+1<h && cp[fi-h+1] && Mode2Frame[mb_down[(x+1)>>subh].mode] == wf) wpc|=8; /* down right */
-	  }
+        if(cp[fi]) {
+          int wpc=0;
+          int wf = Mode2Frame[mb_row[x>>subh].mode];
 
-	  cpi->frag_dc[fi]=predict_frag(wpc,dc+x,down+x,last+wf);
-	}
+          if(x>0){
+            if(cp[fi-1] && Mode2Frame[mb_row[(x-1)>>subh].mode] == wf) wpc|=1; /* left */
+            if(y>0 && cp[fi-h-1] && Mode2Frame[mb_down[(x-1)>>subh].mode] == wf) wpc|=2; /* down left */
+          }
+          if(y>0){
+            if(cp[fi-h] && Mode2Frame[mb_down[x>>subh].mode] == wf) wpc|=4; /* down */
+            if(x+1<h && cp[fi-h+1] && Mode2Frame[mb_down[(x+1)>>subh].mode] == wf) wpc|=8; /* down right */
+          }
+          cpi->frag_dc[fi]=predict_frag(wpc,dc+x,down+x,last+wf);
+        }
       }
     }
   }
@@ -110,7 +109,7 @@
   int interp = (cpi->FrameType!=KEY_FRAME);
   int i,plane;
   int best;
-  
+
   for(plane = 0; plane<2; plane++){
 
     /* Work out which table options are best for DC */
@@ -118,49 +117,49 @@
     cpi->huffchoice[interp][0][plane] = DC_HUFF_OFFSET;
     for ( i = 1; i < DC_HUFF_CHOICES; i++ ) {
       if ( cpi->dc_bits[plane][i] < best ) {
-	best = cpi->dc_bits[plane][i];
-	cpi->huffchoice[interp][0][plane] = i + DC_HUFF_OFFSET;
+        best = cpi->dc_bits[plane][i];
+        cpi->huffchoice[interp][0][plane] = i + DC_HUFF_OFFSET;
       }
     }
-  
+
     /* Work out which table options are best for AC */
     best = cpi->ac1_bits[plane][0]+cpi->acN_bits[plane][0];
     cpi->huffchoice[interp][1][plane] = AC_HUFF_OFFSET;
     for ( i = 1; i < AC_HUFF_CHOICES; i++ ) {
       int test = cpi->ac1_bits[plane][i] + cpi->acN_bits[plane][i];
       if ( test < best ){
-	best = test;
-	cpi->huffchoice[interp][1][plane] = i + AC_HUFF_OFFSET;
+        best = test;
+        cpi->huffchoice[interp][1][plane] = i + AC_HUFF_OFFSET;
       }
     }
   }
 }
 
-static void EncodeTokenGroup(CP_INSTANCE *cpi, 
-			     int group, 
-			     int huffY,
-			     int huffC){
+static void EncodeTokenGroup(CP_INSTANCE *cpi,
+                             int group,
+                             int huffY,
+                             int huffC){
 
   int i;
   oggpack_buffer *opb=cpi->oggbuffer;
   unsigned char *token = cpi->dct_token[group];
   ogg_uint16_t *eb = cpi->dct_token_eb[group];
- 
+
   for(i=0; i<cpi->dct_token_ycount[group]; i++){
     if(token[i] < DCT_NOOP){
       oggpackB_write( opb, cpi->HuffCodeArray_VP3x[huffY][token[i]],
-		      cpi->HuffCodeLengthArray_VP3x[huffY][token[i]] );
-      if (cpi->ExtraBitLengths_VP3x[token[i]] > 0) 
-	oggpackB_write( opb, eb[i], cpi->ExtraBitLengths_VP3x[token[i]] );
+                      cpi->HuffCodeLengthArray_VP3x[huffY][token[i]] );
+      if (cpi->ExtraBitLengths_VP3x[token[i]] > 0)
+        oggpackB_write( opb, eb[i], cpi->ExtraBitLengths_VP3x[token[i]] );
     }
   }
 
   for(; i<cpi->dct_token_count[group]; i++){
     if(token[i] < DCT_NOOP){
       oggpackB_write( opb, cpi->HuffCodeArray_VP3x[huffC][token[i]],
-		      cpi->HuffCodeLengthArray_VP3x[huffC][token[i]] );
-      if (cpi->ExtraBitLengths_VP3x[token[i]] > 0) 
-	oggpackB_write( opb, eb[i], cpi->ExtraBitLengths_VP3x[token[i]] );
+                      cpi->HuffCodeLengthArray_VP3x[huffC][token[i]] );
+      if (cpi->ExtraBitLengths_VP3x[token[i]] > 0)
+        oggpackB_write( opb, eb[i], cpi->ExtraBitLengths_VP3x[token[i]] );
     }
   }
 }
@@ -185,22 +184,22 @@
 
   bits1 = oggpackB_bits(opb);
   for(i=1;i<=AC_TABLE_2_THRESH;i++)
-    EncodeTokenGroup(cpi, i,  cpi->huffchoice[interp][1][0], 
-		     cpi->huffchoice[interp][1][1]);
+    EncodeTokenGroup(cpi, i,  cpi->huffchoice[interp][1][0],
+                     cpi->huffchoice[interp][1][1]);
 
   for(;i<=AC_TABLE_3_THRESH;i++)
-    EncodeTokenGroup(cpi, i,  cpi->huffchoice[interp][1][0]+AC_HUFF_CHOICES, 
-		     cpi->huffchoice[interp][1][1]+AC_HUFF_CHOICES);
+    EncodeTokenGroup(cpi, i,  cpi->huffchoice[interp][1][0]+AC_HUFF_CHOICES,
+                     cpi->huffchoice[interp][1][1]+AC_HUFF_CHOICES);
 
   for(;i<=AC_TABLE_4_THRESH;i++)
-    EncodeTokenGroup(cpi, i,  cpi->huffchoice[interp][1][0]+AC_HUFF_CHOICES*2, 
-		     cpi->huffchoice[interp][1][1]+AC_HUFF_CHOICES*2);
+    EncodeTokenGroup(cpi, i,  cpi->huffchoice[interp][1][0]+AC_HUFF_CHOICES*2,
+                     cpi->huffchoice[interp][1][1]+AC_HUFF_CHOICES*2);
 
   for(;i<BLOCK_SIZE;i++)
-    EncodeTokenGroup(cpi, i,  cpi->huffchoice[interp][1][0]+AC_HUFF_CHOICES*3, 
-		     cpi->huffchoice[interp][1][1]+AC_HUFF_CHOICES*3);
+    EncodeTokenGroup(cpi, i,  cpi->huffchoice[interp][1][0]+AC_HUFF_CHOICES*3,
+                     cpi->huffchoice[interp][1][1]+AC_HUFF_CHOICES*3);
   bits1 = oggpackB_bits(opb)-bits1;
-  
+
   return bits1;
 }
 
@@ -249,10 +248,10 @@
     for ( MB=0; MB<4; MB++ ) {
       macroblock_t *mbp = &cpi->macro[sp->m[MB]];
       if(mbp->coded){
-	/* Add the appropriate mode entropy token. */
-	int index = ModeScheme[mbp->mode];
-	oggpackB_write( opb, ModeWords[index],
-			(ogg_uint32_t)ModeBits[index] );
+        /* Add the appropriate mode entropy token. */
+        int index = ModeScheme[mbp->mode];
+        oggpackB_write( opb, ModeWords[index],
+                        (ogg_uint32_t)ModeBits[index] );
       }
     }
   }
@@ -286,23 +285,23 @@
       if(!mbp->coded) continue;
 
       if(mbp->mode==CODE_INTER_PLUS_MV || mbp->mode==CODE_GOLDEN_MV){
-	/* One MV for the macroblock */
-	for(B=0; B<4; B++ ){
-	  if(mbp->coded & (1<<B)){
-	    oggpackB_write( opb, MvPatternPtr[mbp->mv[B].x], MvBitsPtr[mbp->mv[B].x] );
-	    oggpackB_write( opb, MvPatternPtr[mbp->mv[B].y], MvBitsPtr[mbp->mv[B].y] );
-	    break;
-	  }
-	}
+        /* One MV for the macroblock */
+        for(B=0; B<4; B++ ){
+          if(mbp->coded & (1<<B)){
+            oggpackB_write( opb, MvPatternPtr[mbp->mv[B][0]], MvBitsPtr[mbp->mv[B][0]] );
+            oggpackB_write( opb, MvPatternPtr[mbp->mv[B][1]], MvBitsPtr[mbp->mv[B][1]] );
+            break;
+          }
+        }
 
       }else if (mbp->mode == CODE_INTER_FOURMV){
-	/* MV for each codedblock */
-	for(B=0; B<4; B++ ){
-	  if(mbp->coded & (1<<B)){
-	    oggpackB_write( opb, MvPatternPtr[mbp->mv[B].x], MvBitsPtr[mbp->mv[B].x] );
-	    oggpackB_write( opb, MvPatternPtr[mbp->mv[B].y], MvBitsPtr[mbp->mv[B].y] );
-	  }
-	}
+        /* MV for each codedblock */
+        for(B=0; B<4; B++ ){
+          if(mbp->coded & (1<<B)){
+            oggpackB_write( opb, MvPatternPtr[mbp->mv[B][0]], MvBitsPtr[mbp->mv[B][0]] );
+            oggpackB_write( opb, MvPatternPtr[mbp->mv[B][1]], MvBitsPtr[mbp->mv[B][1]] );
+          }
+        }
       }
     }
   }
@@ -327,14 +326,12 @@
     PackMotionVectors (cpi);
     mvbits = oggpackB_bits(cpi->oggbuffer)-prebits;
   }
-
   ChooseTokenTables(cpi);
   {
     int prebits = oggpackB_bits(cpi->oggbuffer);
     EncodeTokenList(cpi);
     dctbits = oggpackB_bits(cpi->oggbuffer)-prebits;
   }
-  
   bits = oggpackB_bits(cpi->oggbuffer);
   ReconRefFrames(cpi);
 
@@ -355,14 +352,14 @@
       int stride = cpi->stride[pi];
       int h = cpi->frag_h[pi]*8;
       int v = cpi->frag_v[pi]*8;
-      
+
       for(y=0;y<v;y++){
-	int lssd=0;
-	for(x=0;x<h;x++)
-	  lssd += (frame[x]-recon[x])*(frame[x]-recon[x]);
-	ssd+=lssd;
-	frame+=stride;
-	recon+=stride;
+        int lssd=0;
+        for(x=0;x<h;x++)
+          lssd += (frame[x]-recon[x])*(frame[x]-recon[x]);
+        ssd+=lssd;
+        frame+=stride;
+        recon+=stride;
       }
       fi+=cpi->frag_n[pi];
     }
@@ -370,38 +367,38 @@
     minimize = ssd + (float)bits*cpi->token_lambda*16;
 
     fprintf(stdout,"%d %d %d %d %f %f %f %ld %ld %ld %ld %f %f  %.0f %.0f %.0f %.0f %.0f %.0f %.0f %.0f  %.0f %.0f %.0f %.0f %.0f %.0f %.0f %.0f  \n",
-	    (int)cpi->CurrentFrame, // 0
-	    cpi->BaseQ,             // 1
-	    cpi->token_lambda,      // 2
-	    cpi->skip_lambda,       // 3
-	    (double)cpi->rho_count[cpi->BaseQ]/total,           // 4
-	    (double)cpi->rho_postop/total,                      // 5
-	    (double)cpi->rho_postop/cpi->rho_count[cpi->BaseQ], // 6
- 	    modebits,               // 7
-	    mvbits,                 // 8
-	    dctbits,                // 9
-	    oggpackB_bits(cpi->oggbuffer), // 10
-	    (double)ssd,              // 11
-	    (double)0,
-	    (double)cpi->dist_dist[0][0],//13
-	    (double)cpi->dist_dist[0][1],
-	    (double)cpi->dist_dist[0][2],
-	    (double)cpi->dist_dist[0][3],
-	    (double)cpi->dist_dist[0][4],
-	    (double)cpi->dist_dist[0][5],
-	    (double)cpi->dist_dist[0][6],
-	    (double)cpi->dist_dist[0][7],
-	    (double)(cpi->dist_bits[0][0]>>7),//21
-	    (double)(cpi->dist_bits[0][1]>>7),
-	    (double)(cpi->dist_bits[0][2]>>7),
-	    (double)(cpi->dist_bits[0][3]>>7),
-	    (double)(cpi->dist_bits[0][4]>>7),
-	    (double)(cpi->dist_bits[0][5]>>7),
-	    (double)(cpi->dist_bits[0][6]>>7),
-	    (double)(cpi->dist_bits[0][7]>>7)
-	    
+            (int)cpi->CurrentFrame, // 0
+            cpi->BaseQ,             // 1
+            cpi->token_lambda,      // 2
+            cpi->skip_lambda,       // 3
+            (double)cpi->rho_count[cpi->BaseQ]/total,           // 4
+            (double)cpi->rho_postop/total,                      // 5
+            (double)cpi->rho_postop/cpi->rho_count[cpi->BaseQ], // 6
+            modebits,               // 7
+            mvbits,                 // 8
+            dctbits,                // 9
+            oggpackB_bits(cpi->oggbuffer), // 10
+            (double)ssd,              // 11
+            (double)0,
+            (double)cpi->dist_dist[0][0],//13
+            (double)cpi->dist_dist[0][1],
+            (double)cpi->dist_dist[0][2],
+            (double)cpi->dist_dist[0][3],
+            (double)cpi->dist_dist[0][4],
+            (double)cpi->dist_dist[0][5],
+            (double)cpi->dist_dist[0][6],
+            (double)cpi->dist_dist[0][7],
+            (double)(cpi->dist_bits[0][0]>>7),//21
+            (double)(cpi->dist_bits[0][1]>>7),
+            (double)(cpi->dist_bits[0][2]>>7),
+            (double)(cpi->dist_bits[0][3]>>7),
+            (double)(cpi->dist_bits[0][4]>>7),
+            (double)(cpi->dist_bits[0][5]>>7),
+            (double)(cpi->dist_bits[0][6]>>7),
+            (double)(cpi->dist_bits[0][7]>>7)
 
-	    );               
+
+            );
   }
 #endif
 #endif
@@ -413,7 +410,7 @@
 
   /* Output the frame type (base/key frame or inter frame) */
   oggpackB_write( opb, cpi->FrameType, 1 );
-  
+
   /* Write out details of the current value of Q... variable resolution. */
   oggpackB_write( opb, cpi->BaseQ, 6 ); // temporary
 

Modified: branches/theora-thusnelda/lib/enc/encoder_quant.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encoder_quant.c	2009-03-19 17:10:23 UTC (rev 15801)
+++ branches/theora-thusnelda/lib/enc/encoder_quant.c	2009-03-20 03:32:25 UTC (rev 15802)
@@ -23,9 +23,6 @@
 #define OC_QUANT_MAX        (1024<<2)
 static const unsigned OC_DC_QUANT_MIN[2]={4<<2,8<<2};
 static const unsigned OC_AC_QUANT_MIN[2]={2<<2,4<<2};
-#define OC_MAXI(_a,_b)      ((_a)<(_b)?(_b):(_a))
-#define OC_MINI(_a,_b)      ((_a)>(_b)?(_b):(_a))
-#define OC_CLAMPI(_a,_b,_c) (OC_MAXI(_a,OC_MINI(_b,_c)))
 
 static int ilog(unsigned _v){
   int ret;
@@ -33,10 +30,25 @@
   return ret;
 }
 
-void WriteQTables(CP_INSTANCE *cpi,oggpack_buffer* _opb) {
+/*Reciprocal square root.
+  Adapted from libcelt, which is (C) 2002-2008 Jean-Marc Valin.
+  Return: A 4th-order polynomial approximation of 2**15/sqrt(_x).*/
+static ogg_uint16_t oc_rsqrt(ogg_uint32_t _x){
+  ogg_int32_t n;
+  int         k;
+  k=ilog(_x)-1>>1;
+  if(k>7)_x>>=k-7<<1;
+  else _x<<=7-k<<1;
+  n=_x-32768;
+  /*These can be implemented as 16x16->high 16 bit muls, but this is currently
+     not performance critical code.
+    Note the reliance on the arithmetic right shift, which is not guaranteed by
+     ANSI.*/
+  return (ogg_uint16_t)(
+   (n*((n*((n*((n*4100>>15)-9097)>>15)+9812)>>15)-11496)>>15)+23126>>k);
+}
 
-  th_quant_info *_qinfo = &cpi->quant_info;
-
+void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo){
   const th_quant_ranges *qranges;
   const th_quant_base   *base_mats[2*3*64];
   int                    indices[2][3][64];
@@ -51,7 +63,6 @@
   int                    plj;
   int                    bmi;
   int                    i;
-
   /*Unlike the scale tables, we can't assume the maximum value will be in
      index 0, so search for it here.*/
   i=_qinfo->loop_filter_limits[0];
@@ -61,11 +72,11 @@
   for(qi=0;qi<64;qi++){
     oggpackB_write(_opb,_qinfo->loop_filter_limits[qi],nbits);
   }
-  /* 580 bits for VP3.*/
+  /*580 bits for VP3.*/
   nbits=OC_MAXI(ilog(_qinfo->ac_scale[0]),1);
   oggpackB_write(_opb,nbits-1,4);
   for(qi=0;qi<64;qi++)oggpackB_write(_opb,_qinfo->ac_scale[qi],nbits);
-  /* 516 bits for VP3.*/
+  /*516 bits for VP3.*/
   nbits=OC_MAXI(ilog(_qinfo->dc_scale[0]),1);
   oggpackB_write(_opb,nbits-1,4);
   for(qi=0;qi<64;qi++)oggpackB_write(_opb,_qinfo->dc_scale[qi],nbits);
@@ -96,7 +107,7 @@
   }
   /*Now store quant ranges and their associated indices into the base matrix
      list.
-     46 bits for VP3 matrices.*/
+    46 bits for VP3 matrices.*/
   nbits=ilog(nbase_mats-1);
   for(i=0;i<6;i++){
     qti=i/3;
@@ -134,63 +145,171 @@
   }
 }
 
+static void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d){
+  ogg_uint32_t t;
+  int          l;
+  _d<<=1;
+  l=ilog(_d)-1;
+  t=1+((ogg_uint32_t)1<<16+l)/_d;
+  _this->m=(ogg_int16_t)(t-0x10000);
+  _this->l=l;
+}
+
+/*This table gives the square root of the fraction of the squared magnitude of
+   each DCT coefficient relative to the total, scaled by 2**16, for both INTRA
+   and INTER modes.
+  These values were measured after motion-compensated prediction, before
+   quantization, over a large set of test video (from QCIF to 1080p) encoded at
+   all possible rates.
+  The DC coefficient takes into account the DPCM prediction (using the
+   quantized values from neighboring blocks, as the encoder does, but still
+   before quantization of the coefficient in the current block).
+  The results differ significantly from the expected variance (e.g., using an
+   AR(1) model of the signal with rho=0.95, as is frequently done to compute
+   the coding gain of the DCT).
+  We use them to estimate an "average" quantizer for a given quantizer matrix,
+   as this is used to parameterize a number of the rate control decisions.
+  These values are themselves probably quantizer-matrix dependent, since the
+   shape of the matrix affects the noise distribution in the reference frames,
+   but they should at least give us _some_ amount of adaptivity to different
+   matrices, as opposed to hard-coding a table of average Q values for the
+   current set.
+  The main features they capture are that a) only a few of the quantizers in
+   the upper-left corner contribute anything significant at all (though INTER
+   mode is significantly flatter) and b) the DPCM prediction of the DC
+   coefficient gives a very minor improvement in the INTRA case and a quite
+   significant one in the INTER case (over the expected variance).*/
+static ogg_uint16_t OC_RPSD[2][64]={
+  {
+    52725,17370,10399, 6867, 5115, 3798, 2942, 2076,
+    17370, 9900, 6948, 4994, 3836, 2869, 2229, 1619,
+    10399, 6948, 5516, 4202, 3376, 2573, 2015, 1461,
+     6867, 4994, 4202, 3377, 2800, 2164, 1718, 1243,
+     5115, 3836, 3376, 2800, 2391, 1884, 1530, 1091,
+     3798, 2869, 2573, 2164, 1884, 1495, 1212,  873,
+     2942, 2229, 2015, 1718, 1530, 1212, 1001,  704,
+     2076, 1619, 1461, 1243, 1091,  873,  704,  474
+  },
+  {
+    23411,15604,13529,11601,10683, 8958, 7840, 6142,
+    15604,11901,10718, 9108, 8290, 6961, 6023, 4487,
+    13529,10718, 9961, 8527, 7945, 6689, 5742, 4333,
+    11601, 9108, 8527, 7414, 7084, 5923, 5175, 3743,
+    10683, 8290, 7945, 7084, 6771, 5754, 4793, 3504,
+     8958, 6961, 6689, 5923, 5754, 4679, 3936, 2989,
+     7840, 6023, 5742, 5175, 4793, 3936, 3522, 2558,
+     6142, 4487, 4333, 3743, 3504, 2989, 2558, 1829
+  }
+};
+
+/*The fraction of the squared magnitude of the residuals in each color channel
+   relative to the total, scaled by 2**11, for each pixel format.
+  These values were measured after motion-compensated prediction, before
+   quantization, over a large set of test video encoded at all possible rates.
+  TODO: These values are only from INTER frames; they should be re-measured
+   for INTRA frames.*/
+static ogg_uint16_t OC_PCD[4][3]={
+ {1873,  95,  80},
+ {1725, 175, 148},
+ {1725, 175, 148},
+ {1490, 302, 256}
+};
+
+
+
 /* a copied/reconciled version of derf's theora-exp code; redundancy
    should be eliminated at some point */
 void InitQTables( CP_INSTANCE *cpi ){
-  int            qti; /* coding mode: intra or inter */
-  int            pli; /* Y U V */
+  /*Coding mode: intra or inter.*/
+  int qti;
+  /*Y', Cb, Cr.*/
+  int pli;
   th_quant_info *qinfo = &cpi->quant_info;
-
   for(qti=0;qti<2;qti++){
+    /*Quality index.*/
+    int qi;
+    int ci;
     for(pli=0;pli<3;pli++){
-      int qi;  /* quality index */
-      int qri; /* range iterator */
-
-      for(qi=0,qri=0; qri<=qinfo->qi_ranges[qti][pli].nranges; qri++){
+      /*Range iterator.*/
+      int qri;
+      for(qi=0,qri=0;qri<=qinfo->qi_ranges[qti][pli].nranges;qri++){
         th_quant_base base;
-
-        ogg_uint32_t      q;
-        int               qi_start;
-        int               qi_end;
-        int               ci;
+        ogg_uint32_t  q;
+        int           qi_start;
+        int           qi_end;
         memcpy(base,qinfo->qi_ranges[qti][pli].base_matrices[qri],
-               sizeof(base));
-
+         sizeof(base));
         qi_start=qi;
-        if(qri==qinfo->qi_ranges[qti][pli].nranges)
-          qi_end=qi+1;
-        else
-          qi_end=qi+qinfo->qi_ranges[qti][pli].sizes[qri];
-
-        /* Iterate over quality indicies in this range */
+        if(qri==qinfo->qi_ranges[qti][pli].nranges)qi_end=qi+1;
+        else qi_end=qi+qinfo->qi_ranges[qti][pli].sizes[qri];
+        /*Iterate over quality indices in this range.*/
         for(;;){
-
+          /*In the original VP3.2 code, the rounding offset and the size of the
+             dead zone around 0 were controlled by a "sharpness" parameter.
+            We now R-D optimize the tokens for each block after quantization,
+             so the rounding offset should always be 1/2, and an explicit dead
+             zone is unnecessary.
+            Hence, all of that VP3.2 code is gone from here, and the remaining
+             floating point code has been implemented as equivalent integer
+             code with exact precision.*/
           /*Scale DC the coefficient from the proper table.*/
           q=((ogg_uint32_t)qinfo->dc_scale[qi]*base[0]/100)<<2;
           q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
           cpi->quant_tables[qti][pli][0][qi]=(ogg_uint16_t)q;
-          cpi->iquant_tables[qti][pli][qi][0]=(ogg_int32_t)(((1<<31))/q+1);
-
+          oc_iquant_init(cpi->iquant_tables[qti][pli][qi]+0,(ogg_uint16_t)q);
           /*Now scale AC coefficients from the proper table.*/
           for(ci=1;ci<64;ci++){
+            int zzi;
             q=((ogg_uint32_t)qinfo->ac_scale[qi]*base[ci]/100)<<2;
             q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
-            cpi->quant_tables[qti][pli][zigzag_index[ci]][qi]=(ogg_uint16_t)q;
-            cpi->iquant_tables[qti][pli][qi][zigzag_index[ci]]=(ogg_int32_t)(((1<<31))/q+1);
+            zzi=zigzag_index[ci];
+            cpi->quant_tables[qti][pli][zzi][qi]=(ogg_uint16_t)q;
+            oc_iquant_init(cpi->iquant_tables[qti][pli][qi]+zzi,
+             (ogg_uint16_t)q);
           }
-
           if(++qi>=qi_end)break;
-
           /*Interpolate the next base matrix.*/
           for(ci=0;ci<64;ci++){
-            base[ci]=(unsigned char)
-              ((2*((qi_end-qi)*qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+
-                   (qi-qi_start)*qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci])
-                +qinfo->qi_ranges[qti][pli].sizes[qri])/
-               (2*qinfo->qi_ranges[qti][pli].sizes[qri]));
+            unsigned a;
+            unsigned b;
+            unsigned r;
+            unsigned s;
+            a=qinfo->qi_ranges[qti][pli].base_matrices[qri][ci];
+            b=qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci];
+            r=qi-qi_start;
+            s=qi_end-qi_start;
+            base[ci]=(unsigned char)((2*((s-r)*a+r*b)+s)/(2*s));
           }
         }
       }
     }
+    /*Now compute an "average" quantizer for each qi level.
+      We do one for INTER and one for INTRA, since their behavior is very
+       different, but average across color channels.
+      The basic approach is to compute a geometric average of the squared
+       quantizer, weighted by the expected squared magnitude of the DCT
+       coefficients.
+      Under the (not quite true) assumption that DCT coefficients are
+       Laplacian-distributed, this preserves the product Q*lambda, where
+       lambda=sqrt(2/sigma**2) is the Laplacian distribution parameter.
+      The value Q*lambda completely determines the entropy of the
+       coefficients.*/
+    for(qi=0;qi<64;qi++){
+      ogg_uint32_t q;
+      q=0;
+      for(pli=0;pli<3;pli++){
+        ogg_uint32_t qp;
+        qp=0;
+        for(ci=0;ci<64;ci++){
+          unsigned rq;
+          unsigned qd;
+          qd=cpi->quant_tables[qti][pli][zigzag_index[ci]][qi];
+          rq=(OC_RPSD[qti][ci]+(qd>>1))/qd;
+          qp+=rq*(ogg_uint32_t)rq;
+        }
+        q+=OC_PCD[cpi->info.pixelformat][pli]*(qp+128>>8);
+      }
+      cpi->qavg[qti][qi]=oc_rsqrt(q+1024>>11);
+    }
   }
 }

Modified: branches/theora-thusnelda/lib/enc/encoder_toplevel.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encoder_toplevel.c	2009-03-19 17:10:23 UTC (rev 15801)
+++ branches/theora-thusnelda/lib/enc/encoder_toplevel.c	2009-03-20 03:32:25 UTC (rev 15802)
@@ -26,9 +26,221 @@
 #include "dsp.h"
 #include "codec_internal.h"
 
+static int _ilog(unsigned int v){
+  int ret=0;
+  while(v){
+    ret++;
+    v>>=1;
+  }
+  return(ret);
+}
+
+static int oc_log2frac(ogg_uint32_t _val,int _frac_bits){
+  int l;
+  l=_ilog(_val);
+  if(l>16)_val>>=l-16;
+  else _val<<=16-l;
+  l--;
+  while(_frac_bits-->0){
+    int b;
+    _val=_val*_val>>15;
+    b=(int)(_val>>16);
+    l=l<<1|b;
+    _val>>=b;
+  }
+  return l;
+}
+
+static ogg_uint16_t oc_exp2(int _log){
+  int          ipart;
+  ogg_uint32_t fpart;
+  ipart=_log>>12;
+  if(ipart>15)return 0xFFFF;
+  else if(ipart<0)return 0;
+  fpart=_log-(ipart<<12)<<3;
+  /*3rd order polynomial approximation in Q15:
+     (((3*log(2)-2)*f+3-4*log(2))*f+log(2))*f+1*/
+  fpart=(fpart*((fpart*((fpart*2603>>15)+7452)>>15)+22713)>>15)+32768;
+  if(ipart<15)fpart+=1<<14-ipart;
+  return fpart>>15-ipart;
+}
+
+static void oc_enc_calc_lambda(CP_INSTANCE *cpi){
+  int l;
+  /*For now, lambda is fixed depending on the qi value and frame type:
+      lambda=1.125*(qavg[qti][qi]**1.5)
+    A more adaptive scheme might perform better, but Theora's behavior does not
+     seem to conform to existing models in the literature.*/
+  l=oc_log2frac(cpi->qavg[cpi->FrameType!=KEY_FRAME][cpi->BaseQ],12)-(3<<12);
+  l=oc_exp2(l+(l>>1));
+  cpi->lambda=l+(l>>3);
+}
+
+
+
+static void oc_rc_state_init(oc_rc_state *_rc,const theora_info *_info){
+  unsigned long npixels;
+  unsigned long ibpp;
+  /*TODO: These parameters should be exposed in a th_enc_ctl() API.*/
+  _rc->bits_per_frame=(_info->target_bitrate*
+   (ogg_int64_t)_info->fps_denominator+(_info->fps_numerator>>1))/
+   _info->fps_numerator;
+  /*Insane framerates or frame sizes mean insane bitrates.
+    Let's not get carried away.*/
+  if(_rc->bits_per_frame>0x40000000000000LL){
+    _rc->bits_per_frame=(ogg_int64_t)0x40000000000000LL;
+  }
+  else if(_rc->bits_per_frame<32)_rc->bits_per_frame=32;
+  /*The buffer size is set equal to the keyframe interval, clamped to the range
+     [12,256] frames.
+    The 12 frame minimum gives us some chance to distribute bit estimation
+     errors.
+    The 256 frame maximum means we'll require 8-10 seconds of pre-buffering at
+     24-30 fps, which is not unreasonable.*/
+  _rc->buf_delay=_info->keyframe_frequency_force>256?
+   256:_info->keyframe_frequency_force;
+  _rc->buf_delay=OC_MAXI(_rc->buf_delay,12);
+  _rc->max=_rc->bits_per_frame*_rc->buf_delay;
+  /*Start with a buffer fullness of 75%.
+    We can require fully half the buffer for a keyframe, and so this initial
+     level gives us maximum flexibility for over/under-shooting in subsequent
+     frames.*/
+  _rc->target=_rc->fullness=(_rc->max+1>>1)+(_rc->max+2>>2);
+  /*Pick exponents and initial scales for quantizer selection.
+    TODO: These still need to be tuned.*/
+  npixels=_info->width*(unsigned long)_info->height;
+  ibpp=(npixels+(_rc->bits_per_frame>>1))/_rc->bits_per_frame;
+  if(ibpp<10){
+    _rc->exp[0]=48;
+    _rc->scale[0]=2199;
+    _rc->exp[1]=77;
+    _rc->scale[1]=2500;
+  }
+  else if(ibpp<20){
+    _rc->exp[0]=51;
+    _rc->scale[0]=1781;
+    _rc->exp[1]=90;
+    _rc->scale[1]=1700;
+  }
+  else{
+    _rc->exp[0]=54;
+    _rc->scale[0]=870;
+    _rc->exp[1]=102;
+    _rc->scale[1]=1300;
+  }
+}
+
+static unsigned OC_RATE_SMOOTHING[2]={0x80,0x80};
+
+/*TODO: Convert the following entirely to fixed point.*/
+
+static void oc_enc_update_rc_state(CP_INSTANCE *cpi,
+ long _bits,int _qti,int _qi,int _trial){
+  unsigned      scale;
+  unsigned long npixels;
+  /*Compute the estimated scale factor for this frame type.*/
+  npixels=cpi->info.width*(unsigned long)cpi->info.height;
+  scale=(int)(256.0*_bits/(npixels*
+   pow(cpi->qavg[_qti][_qi]/32.0,cpi->rc.exp[_qti]/-64.0))+0.5);
+  /*Use it to set that factor directly if this was a trial.*/
+  if(_trial)cpi->rc.scale[_qti]=scale;
+  /*Otherwise update an exponential moving average.*/
+  else{
+    cpi->rc.scale[_qti]=(scale<<16)
+     +(cpi->rc.scale[_qti]-scale)*OC_RATE_SMOOTHING[_qti]>>16;
+  }
+  /*Update the buffer fullness level.*/
+  if(!_trial)cpi->rc.fullness+=cpi->rc.bits_per_frame-_bits;
+}
+
+static int oc_enc_select_qi(CP_INSTANCE *cpi,int _qti,int _trial){
+  ogg_int64_t  rate_total;
+  ogg_uint32_t next_key_frame;
+  int          nframes[2];
+  int          buf_delay;
+  /*Figure out how to re-distribute bits so that we hit our fullness target
+     before the last keyframe in our current buffer window (after the current
+     frame), or the end of the buffer window, whichever comes first.*/
+  next_key_frame=_qti?cpi->info.keyframe_frequency_force-cpi->LastKeyFrame:0;
+  nframes[0]=(cpi->rc.buf_delay-OC_MINI(next_key_frame,cpi->rc.buf_delay)
+   +cpi->info.keyframe_frequency_force-1)/cpi->info.keyframe_frequency_force;
+  if(nframes[0]+_qti>1){
+    buf_delay=next_key_frame+(nframes[0]-1)*cpi->info.keyframe_frequency_force;
+    nframes[0]--;
+  }
+  else buf_delay=cpi->rc.buf_delay;
+  nframes[1]=buf_delay-nframes[0];
+  rate_total=cpi->rc.fullness-cpi->rc.target
+   +buf_delay*cpi->rc.bits_per_frame;
+  /*If there aren't enough bits to achieve our desired fullness level, use the
+     minimum quality permitted.*/
+  if(rate_total<=0)return cpi->info.quality;
+  else{
+    static const double KEY_RATIO[2]={0.53125,1.0};
+    unsigned long npixels;
+    double        prevr;
+    double        curr;
+    int           qtarget;
+    int           best_qi;
+    int           best_qdiff;
+    int           qi;
+    int           i;
+    npixels=cpi->info.width*(unsigned long)cpi->info.height;
+    curr=rate_total/(double)buf_delay;
+    for(i=0;i<10;i++){
+      double rdiff;
+      double rderiv;
+      double exp;
+      double rpow;
+      prevr=curr;
+      exp=cpi->rc.exp[1-_qti]/(double)cpi->rc.exp[_qti];
+      rpow=pow(prevr*256.0/(npixels*(double)cpi->rc.scale[_qti]),exp);
+      rdiff=(nframes[_qti]*KEY_RATIO[_qti])*prevr
+       +nframes[1-_qti]*KEY_RATIO[1-_qti]*cpi->rc.scale[1-_qti]/256.0*npixels*
+       rpow-rate_total;
+      rderiv=nframes[_qti]*KEY_RATIO[_qti]+
+       (nframes[1-_qti]*KEY_RATIO[1-_qti]*cpi->rc.scale[1-_qti]/256.0*npixels*
+       rpow)*(exp/prevr);
+      curr=prevr-rdiff/rderiv;
+      if(curr<=0||KEY_RATIO[_qti]*curr>rate_total||fabs(prevr-curr)<1)break;
+    }
+    qtarget=(int)(32*pow(KEY_RATIO[_qti]*curr*256/
+     (npixels*(double)cpi->rc.scale[_qti]),-64.0/cpi->rc.exp[_qti])+0.5);
+    /*If this was not one of the initial frames, limit a change in quality.*/
+    if(!_trial){
+      int qmin;
+      int qmax;
+      qmin=cpi->qavg[_qti][cpi->BaseQ]*13>>4;
+      qmax=cpi->qavg[_qti][cpi->BaseQ]*5>>2;
+      qtarget=OC_CLAMPI(qmin,qtarget,qmax);
+    }
+    /*Search for the quantizer that matches the target most closely.
+      We don't assume a linear ordering, but when there are ties we do pick the
+       quantizer closest to the current one.*/
+    best_qi=cpi->info.quality;
+    best_qdiff=abs(cpi->qavg[_qti][best_qi]-qtarget);
+    for(qi=cpi->info.quality+1;qi<64;qi++){
+      int qdiff;
+      qdiff=abs(cpi->qavg[_qti][qi]-qtarget);
+      if(qdiff<best_qdiff||
+       qdiff==best_qdiff&&abs(qi-cpi->BaseQ)<abs(best_qi-cpi->BaseQ)){
+        best_qi=qi;
+        best_qdiff=qdiff;
+      }
+    }
+    return best_qi;
+  }
+}
+
+
+
 static void CompressKeyFrame(CP_INSTANCE *cpi, int recode){
   oggpackB_reset(cpi->oggbuffer);
   cpi->FrameType = KEY_FRAME;
+  if(cpi->info.target_bitrate>0){
+    cpi->BaseQ=oc_enc_select_qi(cpi,0,cpi->CurrentFrame==1);
+  }
+  oc_enc_calc_lambda(cpi);
   cpi->LastKeyFrame = 0;
 
   /* mark as video frame */
@@ -44,16 +256,22 @@
 static int CompressFrame( CP_INSTANCE *cpi, int recode ) {
   oggpackB_reset(cpi->oggbuffer);
   cpi->FrameType = DELTA_FRAME;
+  if(cpi->info.target_bitrate>0){
+    cpi->BaseQ=oc_enc_select_qi(cpi,1,0);
+  }
+  oc_enc_calc_lambda(cpi);
 
   /* mark as video frame */
   oggpackB_write(cpi->oggbuffer,0,1);
 
   WriteFrameHeader(cpi);
-  if(PickModes( cpi,recode )){
+  if(PickModes(cpi,recode)){
     /* mode analysis thinks this should have been a keyframe; start over and code as a keyframe instead */
 
     oggpackB_reset(cpi->oggbuffer);
     cpi->FrameType = KEY_FRAME;
+    if(cpi->info.target_bitrate>0)cpi->BaseQ=oc_enc_select_qi(cpi,0,0);
+    oc_enc_calc_lambda(cpi);
     cpi->LastKeyFrame = 0;
 
     /* mark as video frame */
@@ -71,6 +289,9 @@
 
   if(cpi->first_inter_frame == 0){
     cpi->first_inter_frame = 1;
+    EncodeData(cpi);
+    oc_enc_update_rc_state(cpi,oggpackB_bytes(cpi->oggbuffer)<<3,
+     1,cpi->BaseQ,1);
     CompressFrame(cpi,1);
     return 0;
   }
@@ -83,15 +304,6 @@
 
 /********************** The toplevel: encode ***********************/
 
-static int _ilog(unsigned int v){
-  int ret=0;
-  while(v){
-    ret++;
-    v>>=1;
-  }
-  return(ret);
-}
-
 static void theora_encode_dispatch_init(CP_INSTANCE *cpi);
 
 int theora_encode_init(theora_state *th, theora_info *c){
@@ -102,6 +314,7 @@
   if(c->pixelformat!=OC_PF_420)return OC_IMPL;
   th->internal_encode=cpi=_ogg_calloc(1,sizeof(*cpi));
   theora_encode_dispatch_init(cpi);
+  oc_mode_scheme_chooser_init(&cpi->chooser);
 
   dsp_static_init (&cpi->dsp);
 
@@ -114,9 +327,6 @@
   if(c->target_bitrate<0)c->target_bitrate=0;
   cpi->BaseQ = c->quality;
 
-  /* temporary while the RD code is only partially complete */
-  cpi->lambda=200;
-
   /* Set encoder flags. */
   /* if not AutoKeyframing cpi->ForceKeyFrameEvery = is frequency */
   if(!c->keyframe_auto_p)
@@ -160,6 +370,7 @@
   /* This makes sure encoder version specific tables are initialised */
   memcpy(&cpi->quant_info, &TH_VP31_QUANT_INFO, sizeof(th_quant_info));
   InitQTables(cpi);
+  if(cpi->info.target_bitrate>0)oc_rc_state_init(&cpi->rc,&cpi->info);
 
   /* Indicate that the next frame to be compressed is the first in the
      current clip. */
@@ -224,19 +435,36 @@
      cpi->info.keyframe_frequency_force){
 
     CompressKeyFrame(cpi,0);
+    oc_enc_update_rc_state(cpi,oggpackB_bytes(cpi->oggbuffer)<<3,
+     0,cpi->BaseQ,1);
 
     /* On first frame, the previous was a initial dry-run to prime
        feed-forward statistics */
     if(cpi->CurrentFrame==1)CompressKeyFrame(cpi,1);
 
-  } else  {
-    /* Compress the frame. */
-    dropped = CompressFrame(cpi,0);
   }
+  else{
+    /*Compress the frame.*/
+    dropped=CompressFrame(cpi,0);
+  }
 
   /* Update stats variables. */
+  {
+    /* swap */
+    unsigned char *temp;
+    temp=cpi->lastrecon;
+    cpi->lastrecon=cpi->recon;
+    cpi->recon=temp;
+  }
+  if(cpi->FrameType==KEY_FRAME){
+    memcpy(cpi->golden,cpi->lastrecon,sizeof(*cpi->lastrecon)*cpi->frame_size);
+  }
   cpi->CurrentFrame++;
   cpi->packetflag=1;
+  if(cpi->info.target_bitrate>0){
+    oc_enc_update_rc_state(cpi,oggpackB_bytes(cpi->oggbuffer)<<3,
+     cpi->FrameType!=KEY_FRAME,cpi->BaseQ,0);
+  }
 
   t->granulepos=
     ((cpi->CurrentFrame - cpi->LastKeyFrame)<<cpi->keyframe_granule_shift)+
@@ -395,7 +623,7 @@
   oggpackB_write(cpi->oggbuffer,0x82,8);
   _tp_writebuffer(cpi->oggbuffer,"theora",6);
 
-  WriteQTables(cpi,cpi->oggbuffer);
+  oc_quant_params_pack(cpi->oggbuffer,&cpi->quant_info);
   WriteHuffmanTrees(cpi->HuffRoot_VP3x,cpi->oggbuffer);
 
   op->packet=oggpackB_get_buffer(cpi->oggbuffer);

Added: branches/theora-thusnelda/lib/enc/enquant.h
===================================================================
--- branches/theora-thusnelda/lib/enc/enquant.h	                        (rev 0)
+++ branches/theora-thusnelda/lib/enc/enquant.h	2009-03-20 03:32:25 UTC (rev 15802)
@@ -0,0 +1,24 @@
+#if !defined(_enquant_H)
+# define _enquant_H (1)
+# include "../dec/quant.h"
+
+typedef struct oc_iquant oc_iquant;
+
+/*Used to compute x/d via ((x*m>>16)+x>>l)+(x<0)
+   (i.e., one 16x16->16 mul, 2 shifts, and 2 adds).
+  This is not an approximation; for 16-bit x and d, it is exact.*/
+struct oc_iquant{
+  ogg_int16_t m;
+  ogg_int16_t l;
+};
+
+typedef oc_iquant        oc_iquant_table[64];
+typedef oc_iquant_table  oc_iquant_tables[64];
+
+
+
+void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo);
+void oc_enquant_tables_init(oc_quant_table *_dequant[2][3],
+ oc_quant_table *_enquant[2][3],const th_quant_info *_qinfo);
+
+#endif

Modified: branches/theora-thusnelda/lib/enc/frinit.c
===================================================================
--- branches/theora-thusnelda/lib/enc/frinit.c	2009-03-19 17:10:23 UTC (rev 15801)
+++ branches/theora-thusnelda/lib/enc/frinit.c	2009-03-20 03:32:25 UTC (rev 15802)
@@ -51,20 +51,20 @@
 
 /* A note to people reading and wondering why malloc returns aren't
    checked:
-   
+
    lines like the following that implement a general strategy of
    'check the return of malloc; a zero pointer means we're out of
    memory!'...:
-   
+
    if(!cpi->extra_fragments) { EDeleteFragmentInfo(cpi); return FALSE; }
-   
+
    ...are not useful.  It's true that many platforms follow this
    malloc behavior, but many do not.  The more modern malloc
    strategy is only to allocate virtual pages, which are not mapped
    until the memory on that page is touched.  At *that* point, if
    the machine is out of heap, the page fails to be mapped and a
    SEGV is generated.
-   
+
    That means that if we want to deal with out of memory conditions,
    we *must* be prepared to process a SEGV.  If we implement the
    SEGV handler, there's no reason to to check malloc return; it is
@@ -113,7 +113,7 @@
   cpi->super_total = cpi->super_n[0] + cpi->super_n[1] + cpi->super_n[2];
 
   /* +1; the last entry is the 'invalid' frag, which is always set to not coded as it doesn't really exist */
-  cpi->frag_coded = calloc(cpi->frag_total+1, sizeof(*cpi->frag_coded)); 
+  cpi->frag_coded = calloc(cpi->frag_total+1, sizeof(*cpi->frag_coded));
   cpi->frag_buffer_index = calloc(cpi->frag_total, sizeof(*cpi->frag_buffer_index));
   cpi->frag_dc = calloc(cpi->frag_total, sizeof(*cpi->frag_dc));
 
@@ -144,7 +144,7 @@
    cpi->dct_eob_fi_storage = _ogg_malloc(cpi->frag_total*BLOCK_SIZE*sizeof(*cpi->dct_eob_fi_storage));
  }
 #endif
-  
+
   /* fill in superblock fragment pointers; hilbert order */
   /* fill in macroblock superblock backpointers */
   {
@@ -159,74 +159,74 @@
     for(plane=0;plane<3;plane++){
 
       for(row=0;row<cpi->super_v[plane];row++){
-	for(col=0;col<cpi->super_h[plane];col++){
-	  int superindex = row*cpi->super_h[plane] + col;
-	  for(frag=0;frag<16;frag++){
-	    /* translate to fragment index */
-	    int frow = row*4 + fhilberty[frag];
-	    int fcol = col*4 + fhilbertx[frag];
-	    if(frow<cpi->frag_v[plane] && fcol<cpi->frag_h[plane]){
-	      int fragindex = frow*cpi->frag_h[plane] + fcol + offset;
-	      cpi->super[plane][superindex].f[frag] = fragindex;
-	    }else
-	      cpi->super[plane][superindex].f[frag] = cpi->frag_total; /* 'invalid' */
-	  }
-	}
+        for(col=0;col<cpi->super_h[plane];col++){
+          int superindex = row*cpi->super_h[plane] + col;
+          for(frag=0;frag<16;frag++){
+            /* translate to fragment index */
+            int frow = row*4 + fhilberty[frag];
+            int fcol = col*4 + fhilbertx[frag];
+            if(frow<cpi->frag_v[plane] && fcol<cpi->frag_h[plane]){
+              int fragindex = frow*cpi->frag_h[plane] + fcol + offset;
+              cpi->super[plane][superindex].f[frag] = fragindex;
+            }else
+              cpi->super[plane][superindex].f[frag] = cpi->frag_total; /* 'invalid' */
+          }
+        }
       }
       offset+=cpi->frag_n[plane];
     }
-    
+
     /* Y */
     for(row=0;row<cpi->super_v[0];row++){
       for(col=0;col<cpi->super_h[0];col++){
-	int superindex = row*cpi->super_h[0] + col;
-	for(mb=0;mb<4;mb++){
-	  /* translate to macroblock index */
-	  int mrow = row*2 + mhilberty[mb];
-	  int mcol = col*2 + mhilbertx[mb];
-	  if(mrow<cpi->macro_v && mcol<cpi->macro_h){
-	    int macroindex = mrow*cpi->macro_h + mcol;
-	    cpi->super[0][superindex].m[mb] = macroindex;
-	    cpi->macro[macroindex].ysb = superindex;
-	  }else
-	    cpi->super[0][superindex].m[mb] = cpi->macro_total;
-	}
+        int superindex = row*cpi->super_h[0] + col;
+        for(mb=0;mb<4;mb++){
+          /* translate to macroblock index */
+          int mrow = row*2 + mhilberty[mb];
+          int mcol = col*2 + mhilbertx[mb];
+          if(mrow<cpi->macro_v && mcol<cpi->macro_h){
+            int macroindex = mrow*cpi->macro_h + mcol;
+            cpi->super[0][superindex].m[mb] = macroindex;
+            cpi->macro[macroindex].ysb = superindex;
+          }else
+            cpi->super[0][superindex].m[mb] = cpi->macro_total;
+        }
       }
     }
 
     /* U (assuming 4:2:0 for now) */
     for(row=0;row<cpi->super_v[1];row++){
       for(col=0;col<cpi->super_h[1];col++){
-	int superindex = row*cpi->super_h[1] + col;
-	for(mb=0;mb<16;mb++){
-	  /* translate to macroblock index */
-	  int mrow = row*4 + fhilberty[mb];
-	  int mcol = col*4 + fhilbertx[mb];
-	  if(mrow<cpi->macro_v && mcol<cpi->macro_h){
-	    int macroindex = mrow*cpi->macro_h + mcol;
-	    cpi->super[1][superindex].m[mb] = macroindex;
-	    cpi->macro[macroindex].usb = superindex + cpi->super_n[0];
-	  }else
-	    cpi->super[1][superindex].m[mb] = cpi->macro_total;
-	}
+        int superindex = row*cpi->super_h[1] + col;
+        for(mb=0;mb<16;mb++){
+          /* translate to macroblock index */
+          int mrow = row*4 + fhilberty[mb];
+          int mcol = col*4 + fhilbertx[mb];
+          if(mrow<cpi->macro_v && mcol<cpi->macro_h){
+            int macroindex = mrow*cpi->macro_h + mcol;
+            cpi->super[1][superindex].m[mb] = macroindex;
+            cpi->macro[macroindex].usb = superindex + cpi->super_n[0];
+          }else
+            cpi->super[1][superindex].m[mb] = cpi->macro_total;
+        }
       }
     }
 
     /* V (assuming 4:2:0 for now) */
     for(row=0;row<cpi->super_v[2];row++){
       for(col=0;col<cpi->super_h[2];col++){
-	int superindex = row*cpi->super_h[2] + col;
-	for(mb=0;mb<16;mb++){
-	  /* translate to macroblock index */
-	  int mrow = row*4 + fhilberty[mb];
-	  int mcol = col*4 + fhilbertx[mb];
-	  if(mrow<cpi->macro_v && mcol<cpi->macro_h){
-	    int macroindex = mrow*cpi->macro_h + mcol;
-	    cpi->super[2][superindex].m[mb] = macroindex;
-	    cpi->macro[macroindex].vsb = superindex + cpi->super_n[0] + cpi->super_n[1];
-	  }else
-	    cpi->super[2][superindex].m[mb] = cpi->macro_total;
-	}
+        int superindex = row*cpi->super_h[2] + col;
+        for(mb=0;mb<16;mb++){
+          /* translate to macroblock index */
+          int mrow = row*4 + fhilberty[mb];
+          int mcol = col*4 + fhilbertx[mb];
+          if(mrow<cpi->macro_v && mcol<cpi->macro_h){
+            int macroindex = mrow*cpi->macro_h + mcol;
+            cpi->super[2][superindex].m[mb] = macroindex;
+            cpi->macro[macroindex].vsb = superindex + cpi->super_n[0] + cpi->super_n[1];
+          }else
+            cpi->super[2][superindex].m[mb] = cpi->macro_total;
+        }
       }
     }
 
@@ -242,62 +242,62 @@
     for(row=0;row<cpi->macro_v;row++){
       int baserow = row*2;
       for(col=0;col<cpi->macro_h;col++){
-	int basecol = col*2;
-	int macroindex = row*cpi->macro_h + col;
-	int hpos = (col&1) + (row&1)*2;
+        int basecol = col*2;
+        int macroindex = row*cpi->macro_h + col;
+        int hpos = (col&1) + (row&1)*2;
 
-	/* Y */
-	for(frag=0;frag<4;frag++){
-	  int Hrow = baserow + Hscany[hpos][frag];
-	  int Hcol = basecol + Hscanx[hpos][frag];
-	  int Rrow = baserow + ((frag>>1)&1);
-	  int Rcol = basecol + (frag&1);
+        /* Y */
+        for(frag=0;frag<4;frag++){
+          int Hrow = baserow + Hscany[hpos][frag];
+          int Hcol = basecol + Hscanx[hpos][frag];
+          int Rrow = baserow + ((frag>>1)&1);
+          int Rcol = basecol + (frag&1);
 
-	  cpi->macro[macroindex].Hyuv[0][frag] = cpi->frag_total; // default
-	  cpi->macro[macroindex].Ryuv[0][frag] = cpi->frag_total; //default
-	  if(Hrow<cpi->frag_v[0] && Hcol<cpi->frag_h[0]){
-	    cpi->macro[macroindex].Hyuv[0][frag] = Hrow*cpi->frag_h[0] + Hcol;	    
+          cpi->macro[macroindex].Hyuv[0][frag] = cpi->frag_total; // default
+          cpi->macro[macroindex].Ryuv[0][frag] = cpi->frag_total; //default
+          if(Hrow<cpi->frag_v[0] && Hcol<cpi->frag_h[0]){
+            cpi->macro[macroindex].Hyuv[0][frag] = Hrow*cpi->frag_h[0] + Hcol;
 #ifdef COLLECT_METRICS
-	    cpi->frag_mbi[Hrow*cpi->frag_h[0] + Hcol] = macroindex;
+            cpi->frag_mbi[Hrow*cpi->frag_h[0] + Hcol] = macroindex;
 #endif
-	  }
-	  if(Rrow<cpi->frag_v[0] && Rcol<cpi->frag_h[0])
-	    cpi->macro[macroindex].Ryuv[0][frag] = Rrow*cpi->frag_h[0] + Rcol;	    
-	}
+          }
+          if(Rrow<cpi->frag_v[0] && Rcol<cpi->frag_h[0])
+            cpi->macro[macroindex].Ryuv[0][frag] = Rrow*cpi->frag_h[0] + Rcol;
+        }
 
-	/* U */
-	cpi->macro[macroindex].Ryuv[1][0] = cpi->frag_total;
-	cpi->macro[macroindex].Ryuv[1][1] = cpi->frag_total;
-	cpi->macro[macroindex].Ryuv[1][2] = cpi->frag_total;
-	cpi->macro[macroindex].Ryuv[1][3] = cpi->frag_total;
-	cpi->macro[macroindex].Hyuv[1][0] = cpi->frag_total;
-	cpi->macro[macroindex].Hyuv[1][1] = cpi->frag_total;
-	cpi->macro[macroindex].Hyuv[1][2] = cpi->frag_total;
-	cpi->macro[macroindex].Hyuv[1][3] = cpi->frag_total;
-	if(row<cpi->frag_v[1] && col<cpi->frag_h[1]){
-	  cpi->macro[macroindex].Hyuv[1][0] = cpi->frag_n[0] + macroindex;
-	  cpi->macro[macroindex].Ryuv[1][0] = cpi->frag_n[0] + macroindex;
+        /* U */
+        cpi->macro[macroindex].Ryuv[1][0] = cpi->frag_total;
+        cpi->macro[macroindex].Ryuv[1][1] = cpi->frag_total;
+        cpi->macro[macroindex].Ryuv[1][2] = cpi->frag_total;
+        cpi->macro[macroindex].Ryuv[1][3] = cpi->frag_total;
+        cpi->macro[macroindex].Hyuv[1][0] = cpi->frag_total;
+        cpi->macro[macroindex].Hyuv[1][1] = cpi->frag_total;
+        cpi->macro[macroindex].Hyuv[1][2] = cpi->frag_total;
+        cpi->macro[macroindex].Hyuv[1][3] = cpi->frag_total;
+        if(row<cpi->frag_v[1] && col<cpi->frag_h[1]){
+          cpi->macro[macroindex].Hyuv[1][0] = cpi->frag_n[0] + macroindex;
+          cpi->macro[macroindex].Ryuv[1][0] = cpi->frag_n[0] + macroindex;
 #ifdef COLLECT_METRICS
-	  cpi->frag_mbi[cpi->frag_n[0] + macroindex] = macroindex;
+          cpi->frag_mbi[cpi->frag_n[0] + macroindex] = macroindex;
 #endif
-	}
-	
-	/* V */
-	cpi->macro[macroindex].Ryuv[2][0] = cpi->frag_total;
-	cpi->macro[macroindex].Ryuv[2][1] = cpi->frag_total;
-	cpi->macro[macroindex].Ryuv[2][2] = cpi->frag_total;
-	cpi->macro[macroindex].Ryuv[2][3] = cpi->frag_total;
-	cpi->macro[macroindex].Hyuv[2][0] = cpi->frag_total;
-	cpi->macro[macroindex].Hyuv[2][1] = cpi->frag_total;
-	cpi->macro[macroindex].Hyuv[2][2] = cpi->frag_total;
-	cpi->macro[macroindex].Hyuv[2][3] = cpi->frag_total;
-	if(row<cpi->frag_v[2] && col<cpi->frag_h[2]){
-	  cpi->macro[macroindex].Hyuv[2][0] = cpi->frag_n[0] + cpi->frag_n[1] + macroindex;
-	  cpi->macro[macroindex].Ryuv[2][0] = cpi->frag_n[0] + cpi->frag_n[1] + macroindex;
+        }
+
+        /* V */
+        cpi->macro[macroindex].Ryuv[2][0] = cpi->frag_total;
+        cpi->macro[macroindex].Ryuv[2][1] = cpi->frag_total;
+        cpi->macro[macroindex].Ryuv[2][2] = cpi->frag_total;
+        cpi->macro[macroindex].Ryuv[2][3] = cpi->frag_total;
+        cpi->macro[macroindex].Hyuv[2][0] = cpi->frag_total;
+        cpi->macro[macroindex].Hyuv[2][1] = cpi->frag_total;
+        cpi->macro[macroindex].Hyuv[2][2] = cpi->frag_total;
+        cpi->macro[macroindex].Hyuv[2][3] = cpi->frag_total;
+        if(row<cpi->frag_v[2] && col<cpi->frag_h[2]){
+          cpi->macro[macroindex].Hyuv[2][0] = cpi->frag_n[0] + cpi->frag_n[1] + macroindex;
+          cpi->macro[macroindex].Ryuv[2][0] = cpi->frag_n[0] + cpi->frag_n[1] + macroindex;
 #ifdef COLLECT_METRICS
-	  cpi->frag_mbi[cpi->frag_n[0] + cpi->frag_n[1] + macroindex] = macroindex;
+          cpi->frag_mbi[cpi->frag_n[0] + cpi->frag_n[1] + macroindex] = macroindex;
 #endif
-	}	
+        }
       }
     }
   }
@@ -308,68 +308,68 @@
 
     for(row=0;row<cpi->macro_v;row++){
       for(col=0;col<cpi->macro_h;col++){
-	int macroindex = row*cpi->macro_h + col;
-	int count=0;
+        int macroindex = row*cpi->macro_h + col;
+        int count=0;
 
-	/* cneighbors are of four possible already-filled-in neighbors
-	   from the eight-neighbor square for doing ME. The
-	   macroblocks are scanned in Hilbert order and the corner
-	   cases here are annoying, so we precompute. */
-	if(row&1){
-	  if(col&1){
-	    /* 2 */
-	    cpi->macro[macroindex].cneighbors[count++]=macroindex-1;
-	    cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h-1;
-	  }else{
-	    /* 1 */
-	    if(col){
-	      cpi->macro[macroindex].cneighbors[count++]=macroindex-1;
-	      cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h-1;
-	    }
-	    cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h;
-	  }
-	}else{
-	  if(col&1){
-	    /* 3; Could have up to six, fill in at most 4 */
-	    if(row && col+1<cpi->macro_h)
-	      cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h+1;
-	    if(row)
-	      cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h;
-	    if(col && row)
-	      cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h-1;
-	    if(col)
-	      cpi->macro[macroindex].cneighbors[count++]=macroindex-1;
-	    if(col && row+1<cpi->macro_v && count<4)
-	      cpi->macro[macroindex].cneighbors[count++]=macroindex+cpi->macro_h-1;
-	    if(row+1<cpi->macro_v && count<4)
-	      cpi->macro[macroindex].cneighbors[count++]=macroindex+cpi->macro_h;
-	  }else{
-	    /* 0; Could have up to five, fill in at most 4 */
-	    if(row && col+1<cpi->macro_h)
-	      cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h+1;
-	    if(row)
-	      cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h;
-	    if(col && row)
-	      cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h-1;
-	    if(col)
-	      cpi->macro[macroindex].cneighbors[count++]=macroindex-1;
-	    if(col && row+1<cpi->macro_v && count<4)
-	      cpi->macro[macroindex].cneighbors[count++]=macroindex+cpi->macro_h-1;
-	  }
-	}
-	cpi->macro[macroindex].ncneighbors=count;
+        /* cneighbors are of four possible already-filled-in neighbors
+           from the eight-neighbor square for doing ME. The
+           macroblocks are scanned in Hilbert order and the corner
+           cases here are annoying, so we precompute. */
+        if(row&1){
+          if(col&1){
+            /* 2 */
+            cpi->macro[macroindex].cneighbors[count++]=macroindex-1;
+            cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h-1;
+          }else{
+            /* 1 */
+            if(col){
+              cpi->macro[macroindex].cneighbors[count++]=macroindex-1;
+              cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h-1;
+            }
+            cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h;
+          }
+        }else{
+          if(col&1){
+            /* 3; Could have up to six, fill in at most 4 */
+            if(row && col+1<cpi->macro_h)
+              cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h+1;
+            if(row)
+              cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h;
+            if(col && row)
+              cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h-1;
+            if(col)
+              cpi->macro[macroindex].cneighbors[count++]=macroindex-1;
+            if(col && row+1<cpi->macro_v && count<4)
+              cpi->macro[macroindex].cneighbors[count++]=macroindex+cpi->macro_h-1;
+            if(row+1<cpi->macro_v && count<4)
+              cpi->macro[macroindex].cneighbors[count++]=macroindex+cpi->macro_h;
+          }else{
+            /* 0; Could have up to five, fill in at most 4 */
+            if(row && col+1<cpi->macro_h)
+              cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h+1;
+            if(row)
+              cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h;
+            if(col && row)
+              cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h-1;
+            if(col)
+              cpi->macro[macroindex].cneighbors[count++]=macroindex-1;
+            if(col && row+1<cpi->macro_v && count<4)
+              cpi->macro[macroindex].cneighbors[count++]=macroindex+cpi->macro_h-1;
+          }
+        }
+        cpi->macro[macroindex].ncneighbors=count;
 
-	/* pneighbors are of the four possible direct neighbors (plus pattern), not the same as cneighbors */
-	count=0;
-	if(col)
-	  cpi->macro[macroindex].pneighbors[count++]=macroindex-1;
-	if(row)
-	  cpi->macro[macroindex].pneighbors[count++]=macroindex-cpi->macro_h;
-	if(col+1<cpi->macro_h)
-	  cpi->macro[macroindex].pneighbors[count++]=macroindex+1;
-	if(row+1<cpi->macro_v)
-	  cpi->macro[macroindex].pneighbors[count++]=macroindex+cpi->macro_h;
-	cpi->macro[macroindex].npneighbors=count;
+        /* pneighbors are of the four possible direct neighbors (plus pattern), not the same as cneighbors */
+        count=0;
+        if(col)
+          cpi->macro[macroindex].pneighbors[count++]=macroindex-1;
+        if(row)
+          cpi->macro[macroindex].pneighbors[count++]=macroindex-cpi->macro_h;
+        if(col+1<cpi->macro_h)
+          cpi->macro[macroindex].pneighbors[count++]=macroindex+1;
+        if(row+1<cpi->macro_v)
+          cpi->macro[macroindex].pneighbors[count++]=macroindex+cpi->macro_h;
+        cpi->macro[macroindex].npneighbors=count;
       }
     }
   }
@@ -379,8 +379,8 @@
     int p,f;
     for(p=0;p<3;p++)
       for(f=0;f<4;f++){
-	cpi->macro[cpi->macro_total].Ryuv[p][f] = cpi->frag_total;
-	cpi->macro[cpi->macro_total].Hyuv[p][f] = cpi->frag_total;
+        cpi->macro[cpi->macro_total].Ryuv[p][f] = cpi->frag_total;
+        cpi->macro[cpi->macro_total].Hyuv[p][f] = cpi->frag_total;
       }
     cpi->macro[cpi->macro_total].ncneighbors=0;
     cpi->macro[cpi->macro_total].npneighbors=0;
@@ -399,14 +399,14 @@
   {
     ogg_uint32_t plane,row,col;
     ogg_uint32_t *bp = cpi->frag_buffer_index;
-    
+
     for(plane=0;plane<3;plane++){
       ogg_uint32_t offset = cpi->offset[plane];
       for(row=0;row<cpi->frag_v[plane];row++){
-	for(col=0;col<cpi->frag_h[plane];col++,bp++){
-	  *bp = offset+col*8;
-	}
-	offset += cpi->stride[plane]*8;
+        for(col=0;col<cpi->frag_h[plane];col++,bp++){
+          *bp = offset+col*8;
+        }
+        offset += cpi->stride[plane]*8;
       }
     }
   }

Modified: branches/theora-thusnelda/lib/enc/mcenc.c
===================================================================
--- branches/theora-thusnelda/lib/enc/mcenc.c	2009-03-19 17:10:23 UTC (rev 15801)
+++ branches/theora-thusnelda/lib/enc/mcenc.c	2009-03-20 03:32:25 UTC (rev 15802)
@@ -70,199 +70,151 @@
   {0,1,3}
 };
 
-/*Swaps two integers _a and _b if _a>_b.*/
-#define OC_SORT2I(_a,_b)\
-  if((_a)>(_b)){\
-    int t__;\
-    t__=(_a);\
-    (_a)=(_b);\
-    (_b)=t__;\
-  }
 
-
-#define OC_MAXI(_a,_b)      ((_a)<(_b)?(_b):(_a))
-#define OC_MINI(_a,_b)      ((_a)>(_b)?(_b):(_a))
-/*Clamps an integer into the given range.
-  If _a>_c, then the lower bound _a is respected over the upper bound _c (this
-   behavior is required to meet our documented API behavior).
-  _a: The lower bound.
-  _b: The value to clamp.
-  _c: The upper boud.*/
-#define OC_CLAMPI(_a,_b,_c) (OC_MAXI(_a,OC_MINI(_b,_c)))
-
-/*Divides an integer by a power of two, truncating towards 0.
-  _dividend: The integer to divide.
-  _shift:    The non-negative power of two to divide by.
-  _rmask:    (1<<_shift)-1*/
-#define OC_DIV_POW2(_dividend,_shift,_rmask)\
-  ((_dividend)+(((_dividend)>>sizeof(_dividend)*8-1)&(_rmask))>>(_shift))
-/*Divides _x by 65536, truncating towards 0.*/
-#define OC_DIV2_16(_x) OC_DIV_POW2(_x,16,0xFFFF)
-/*Divides _x by 2, truncating towards 0.*/
-#define OC_DIV2(_x) OC_DIV_POW2(_x,1,0x1)
-
-/*Right shifts _dividend by _shift, adding _rval, and subtracting one for
-   negative dividends first..
-  When _rval is (1<<_shift-1), this is equivalent to division with rounding
-   ties towards positive infinity.*/
-#define OC_DIV_ROUND_POW2(_dividend,_shift,_rval)\
-  ((_dividend)+((_dividend)>>sizeof(_dividend)*8-1)+(_rval)>>(_shift))
-
-static void oc_mcenc_find_candidates(CP_INSTANCE *cpi, 
-				     mc_state *_mcenc,
-				     int _mbi,
-				     int _goldenp){
+static void oc_mcenc_find_candidates(CP_INSTANCE *cpi,mc_state *_mcenc,
+ int _mbi,int _goldenp){
   macroblock_t *nemb;
   macroblock_t *emb;
   ogg_int32_t   mvapw1;
   ogg_int32_t   mvapw2;
-  mv_t          a[3];
+  int           a[3][2];
   int           ncandidates;
   int           i;
-  emb = &cpi->macro[_mbi];
+  emb=cpi->macro+_mbi;
   if(emb->ncneighbors>0){
-
     /*Fill in the first part of set A: the last motion vectors used and the
        vectors from adjacent blocks.*/
     /*Skip a position to store the median predictor in.*/
-
     ncandidates=1;
     for(i=0;i<emb->ncneighbors;i++){
-      nemb = &cpi->macro[emb->cneighbors[i]];
-      _mcenc->candidates[ncandidates++] = nemb->analysis_mv[0][_goldenp];
+      nemb=cpi->macro+emb->cneighbors[i];
+      _mcenc->candidates[ncandidates][0]=nemb->analysis_mv[0][_goldenp][0];
+      _mcenc->candidates[ncandidates][1]=nemb->analysis_mv[0][_goldenp][1];
+      ncandidates++;
     }
-
-    /* Add a few additional vectors to set A: the vector used in the
+    /*Add a few additional vectors to set A: the vector used in the
        previous frame and the (0,0) vector.*/
-    _mcenc->candidates[ncandidates++] = emb->analysis_mv[1][_goldenp];
-    _mcenc->candidates[ncandidates++] = (mv_t){0,0};
-
+    _mcenc->candidates[ncandidates][0]=emb->analysis_mv[1][_goldenp][0];
+    _mcenc->candidates[ncandidates][1]=emb->analysis_mv[1][_goldenp][1];
+    ncandidates++;
+    _mcenc->candidates[ncandidates][0]=0;
+    _mcenc->candidates[ncandidates][1]=0;
+    ncandidates++;
     /*Use the first three vectors of set A to find our best predictor: their
        median.*/
     memcpy(a,_mcenc->candidates+1,sizeof(a));
-    OC_SORT2I(a[0].x,a[1].x);
-    OC_SORT2I(a[0].y,a[1].y);
-    OC_SORT2I(a[1].x,a[2].x);
-    OC_SORT2I(a[1].y,a[2].y);
-    OC_SORT2I(a[0].x,a[1].x);
-    OC_SORT2I(a[0].y,a[1].y);
-    _mcenc->candidates[0] = a[1];
-
-  } else {
-
+    OC_SORT2I(a[0][0],a[1][0]);
+    OC_SORT2I(a[0][1],a[1][1]);
+    OC_SORT2I(a[1][0],a[2][0]);
+    OC_SORT2I(a[1][1],a[2][1]);
+    OC_SORT2I(a[0][0],a[1][0]);
+    OC_SORT2I(a[0][1],a[1][1]);
+    _mcenc->candidates[0][0]=a[1][0];
+    _mcenc->candidates[0][1]=a[1][1];
+  }
+  else{
     /*The upper-left most macro block has no neighbors at all
       We just use 0,0 as the median predictor and its previous motion vector
       for set A.*/
-
-    _mcenc->candidates[0] = (mv_t){0,0};
-    _mcenc->candidates[1] = emb->analysis_mv[1][_goldenp];
+    _mcenc->candidates[0][0]=0;
+    _mcenc->candidates[0][1]=0;
+    _mcenc->candidates[1][0]=emb->analysis_mv[1][_goldenp][0];
+    _mcenc->candidates[1][1]=emb->analysis_mv[1][_goldenp][1];
     ncandidates=2;
   }
-
   /*Fill in set B: accelerated predictors for this and adjacent macro
      blocks.*/
-
-  _mcenc->setb0 = ncandidates;
+  _mcenc->setb0=ncandidates;
   mvapw1=_mcenc->mvapw1[_goldenp];
   mvapw2=_mcenc->mvapw2[_goldenp];
-
   /*The first time through the loop use the current macro block.*/
   nemb=emb;
   for(i=0;;i++){
-    _mcenc->candidates[ncandidates].x =
-      OC_DIV_ROUND_POW2(nemb->analysis_mv[1][_goldenp].x*mvapw1-
-			nemb->analysis_mv[2][_goldenp].x*mvapw2,16,0x8000);
-    _mcenc->candidates[ncandidates].y =
-      OC_DIV_ROUND_POW2(nemb->analysis_mv[1][_goldenp].y*mvapw1-
-			nemb->analysis_mv[2][_goldenp].y*mvapw2,16,0x8000);
-    _mcenc->candidates[ncandidates].x = OC_CLAMPI(-31,_mcenc->candidates[ncandidates].x,31);
-    _mcenc->candidates[ncandidates].y = OC_CLAMPI(-31,_mcenc->candidates[ncandidates].y,31);
+    _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,
+     OC_DIV_POW2_RE(nemb->analysis_mv[1][_goldenp][0]*mvapw1
+     -nemb->analysis_mv[2][_goldenp][0]*mvapw2,16),31);
+    _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,
+     OC_DIV_POW2_RE(nemb->analysis_mv[1][_goldenp][1]*mvapw1
+     -nemb->analysis_mv[2][_goldenp][1]*mvapw2,16),31);
     ncandidates++;
-    if(i >= emb->npneighbors) break;
-    nemb=&cpi->macro[emb->pneighbors[i]];
+    if(i>=emb->npneighbors)break;
+    nemb=cpi->macro+emb->pneighbors[i];
   }
   /*Truncate to full-pel positions.*/
   for(i=0;i<ncandidates;i++){
-    _mcenc->candidates[i].x=OC_DIV2(_mcenc->candidates[i].x);
-    _mcenc->candidates[i].y=OC_DIV2(_mcenc->candidates[i].y);
+    _mcenc->candidates[i][0]=OC_DIV2(_mcenc->candidates[i][0]);
+    _mcenc->candidates[i][1]=OC_DIV2(_mcenc->candidates[i][1]);
   }
   _mcenc->ncandidates=ncandidates;
 }
 
-static int oc_sad16_halfpel(CP_INSTANCE *cpi, 
-			    int mbi,
-			    int _mvoffset0,
-			    int _mvoffset1,
-			    int _goldenp,
-			    int _best_err){
-
-  macroblock_t *mb = &cpi->macro[mbi];
-  int err;
-  int i;
+static int oc_sad16_halfpel(CP_INSTANCE *cpi,int mbi,
+ int _mvoffset0,int _mvoffset1,int _goldenp,int _best_err){
+  macroblock_t *mb;
+  int           err;
+  int           i;
+  mb=cpi->macro+mbi;
   err=0;
   for(i=0;i<4;i++){
     int fi = mb->Ryuv[0][i];
     ogg_uint32_t base_offset = cpi->frag_buffer_index[fi];
     const unsigned char *cur = cpi->frame + base_offset;
     const unsigned char *ref = (_goldenp ? cpi->golden : cpi->lastrecon) + base_offset;
-    
-    err+=  dsp_sad8x8_xy2_thres (cpi->dsp, cur, ref+_mvoffset0, ref+_mvoffset1, cpi->stride[0], _best_err-err);
-
+    fi=mb->Ryuv[0][i];
+    base_offset=cpi->frag_buffer_index[fi];
+    cur=cpi->frame+base_offset;
+    ref=(_goldenp?cpi->golden:cpi->lastrecon)+base_offset;
+    err+=dsp_sad8x8_xy2_thres(cpi->dsp,cur,
+     ref+_mvoffset0,ref+_mvoffset1,cpi->stride[0],_best_err-err);
   }
-  
   return err;
 }
 
 static int oc_mcenc_ysad_check_mbcandidate_fullpel(CP_INSTANCE *cpi, 
-						   mc_state *_mcenc,
-						   int _mbi,
-						   mv_t _delta,
-						   int _goldenp,
-						   int _block_err[4]){
-  int                      stride;
-  int                      mvoffset;
-  int                      err;
-  int                      bi;
-  macroblock_t            *mb = &cpi->macro[_mbi];
+ mc_state *_mcenc,int _mbi,int _dx,int _dy,int _goldenp,int _block_err[4]){
+  int           stride;
+  int           mvoffset;
+  int           err;
+  int           bi;
+  macroblock_t *mb;
+  mb=cpi->macro+_mbi;
   /*TODO: customize error function for speed/(quality+size) tradeoff.*/
   stride=cpi->stride[0];
-  mvoffset=_delta.x+_delta.y*stride;
+  mvoffset=_dx+_dy*stride;
   err=0;
   for(bi=0;bi<4;bi++){
-    int fi = mb->Ryuv[0][bi];
-    if(fi < cpi->frag_total){ /* last fragment is the 'invalid fragment' */
-      ogg_uint32_t base_offset = cpi->frag_buffer_index[fi];
-      const unsigned char *cur = cpi->frame + base_offset;
-      const unsigned char *ref = (_goldenp ? cpi->golden : cpi->lastrecon) + base_offset;
-      
-      _block_err[bi] = dsp_sad8x8_thres (cpi->dsp, cur, ref+mvoffset,stride,9999999); 
-
-      err += _block_err[bi];
+    int fi;
+    fi=mb->Ryuv[0][bi];
+    /*Only check valid fragments.*/
+    if(fi<cpi->frag_total){
+      ogg_uint32_t         base_offset;
+      const unsigned char *cur;
+      const unsigned char *ref;
+      base_offset=cpi->frag_buffer_index[fi];
+      cur=cpi->frame+base_offset;
+      ref=(_goldenp?cpi->golden:cpi->lastrecon)+base_offset;
+      _block_err[bi]=dsp_sad8x8_thres(cpi->dsp,cur,ref+mvoffset,stride,16384);
+      err+=_block_err[bi];
     }
   }
   return err;
 }
 
-static int oc_mcenc_ysad_halfpel_mbrefine(CP_INSTANCE *cpi, 
-					  int _mbi,
-					  mv_t *_vec,
-					  int _best_err,
-					  int _goldenp){
-  int                      offset_y[9];
-  int                      stride;
-  int                      mvoffset_base;
-  int                      best_site;
-  int                      sitei;
-  int                      err;
-
+static int oc_mcenc_ysad_halfpel_mbrefine(CP_INSTANCE *cpi,int _mbi,
+ int _vec[2],int _best_err,int _goldenp){
+  int offset_y[9];
+  int stride;
+  int mvoffset_base;
+  int best_site;
+  int sitei;
+  int err;
   stride=cpi->stride[0];
-  mvoffset_base=_vec->x+_vec->y*stride;
+  mvoffset_base=_vec[0]+_vec[1]*stride;
   offset_y[0]=offset_y[1]=offset_y[2]=-stride;
   offset_y[3]=offset_y[5]=0;
   offset_y[6]=offset_y[7]=offset_y[8]=stride;
   err=_best_err;
   best_site=4;
-
   for(sitei=0;sitei<8;sitei++){
     int site;
     int xmask;
@@ -271,139 +223,147 @@
     int dy;
     int mvoffset0;
     int mvoffset1;
-
     site=OC_SQUARE_SITES[0][sitei];
     dx=OC_SQUARE_DX[site];
     dy=OC_SQUARE_DY[site];
-
-    xmask=-((((_vec->x<<1)+dx)^dx)<0);
-    ymask=-((((_vec->y<<1)+dy)^dy)<0);
+    /*The following code SHOULD be equivalent to
+        oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1,
+         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0);
+      However, it should also be much faster, as it involves no multiplies and
+       doesn't have to handle chroma vectors.*/
+    xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
+    ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
     mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
     mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
-
     err=oc_sad16_halfpel(cpi,_mbi,mvoffset0,mvoffset1,_goldenp,_best_err);
     if(err<_best_err){
       _best_err=err;
       best_site=site;
     }
   }
-
-  _vec->x=(_vec->x<<1)+OC_SQUARE_DX[best_site];
-  _vec->y=(_vec->y<<1)+OC_SQUARE_DY[best_site];
+  _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
+  _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
   return _best_err;
 }
 
-static int oc_mcenc_ysad_halfpel_brefine(CP_INSTANCE *cpi, 
-					 int _mbi,
-					 int _bi,
-					 mv_t *_vec,
-					 int _best_err,
-					 int _goldenp){
-  macroblock_t *mb = &cpi->macro[_mbi];
-  int offset_y[9];
-  int stride = cpi->stride[0];
-  int mvoffset_base;
-  int best_site;
-  int sitei;
-  int err;
-  int fi = mb->Ryuv[0][_bi];
-
-  if(fi >= cpi->frag_total) return _best_err;
-
-  mvoffset_base=_vec->x+_vec->y*stride;
+static int oc_mcenc_ysad_halfpel_brefine(CP_INSTANCE *cpi,int _mbi,
+ int _bi,int _vec[2],int _best_err,int _goldenp){
+  macroblock_t *mb;
+  int           offset_y[9];
+  int           stride;
+  int           mvoffset_base;
+  int           best_site;
+  int           sitei;
+  int           err;
+  int           fi;
+  mb=cpi->macro+_mbi;
+  stride=cpi->stride[0];
+  fi=mb->Ryuv[0][_bi];
+  if(fi>=cpi->frag_total)return _best_err;
+  mvoffset_base=_vec[0]+_vec[1]*stride;
   offset_y[0]=offset_y[1]=offset_y[2]=-stride;
   offset_y[3]=offset_y[5]=0;
   offset_y[6]=offset_y[7]=offset_y[8]=stride;
   err=_best_err;
   best_site=4;
-
   for(sitei=0;sitei<8;sitei++){
-    int site;
-    int xmask;
-    int ymask;
-    int dx;
-    int dy;
-    int mvoffset0;
-    int mvoffset1;
-
-    ogg_uint32_t base_offset = cpi->frag_buffer_index[fi];
-    const unsigned char *cur = cpi->frame + base_offset;
-    const unsigned char *ref = (_goldenp ? cpi->golden : cpi->lastrecon) + base_offset;
-
+    ogg_uint32_t         base_offset;
+    const unsigned char *cur;
+    const unsigned char *ref;
+    int                  site;
+    int                  xmask;
+    int                  ymask;
+    int                  dx;
+    int                  dy;
+    int                  mvoffset0;
+    int                  mvoffset1;
+    base_offset=cpi->frag_buffer_index[fi];
+    cur=cpi->frame+base_offset;
+    ref=(_goldenp?cpi->golden:cpi->lastrecon)+base_offset;
     site=OC_SQUARE_SITES[0][sitei];
     dx=OC_SQUARE_DX[site];
     dy=OC_SQUARE_DY[site];
-
-    xmask=-((((_vec->x<<1)+dx)^dx)<0);
-    ymask=-((((_vec->y<<1)+dy)^dy)<0);
+    /*The following code SHOULD be equivalent to
+        oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1,
+         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0);
+      However, it should also be much faster, as it involves no multiplies and
+       doesn't have to handle chroma vectors.*/
+    xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
+    ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
     mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
     mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
-
-    err=dsp_sad8x8_xy2_thres (cpi->dsp, cur, ref+mvoffset0, ref+mvoffset1, stride, _best_err);
-
+    err=dsp_sad8x8_xy2_thres(cpi->dsp,cur,
+     ref+mvoffset0,ref+mvoffset1,stride,_best_err);
     if(err<_best_err){
       _best_err=err;
       best_site=site;
     }
   }
-  _vec->x=(_vec->x<<1)+OC_SQUARE_DX[best_site];
-  _vec->y=(_vec->y<<1)+OC_SQUARE_DY[best_site];
+  _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
+  _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
   return _best_err;
 }
 
-/* Perform a motion vector search for this macro block against a single
+/*Perform a motion vector search for this macro block against a single
    reference frame.
-  
-   As a bonus, individual block motion vectors are computed as well, as much of
+  As a bonus, individual block motion vectors are computed as well, as much of
    the work can be shared.
-  
-   The actual motion vector is stored in the appropriate place in the
+  The actual motion vector is stored in the appropriate place in the
    oc_mb_enc_info structure.
-  
-   _mcenc:    The motion compensation context.
-   _mbi:      The macro block index.
-   _frame:    The frame to search, either OC_FRAME_PREV or OC_FRAME_GOLD.
-   _bmvs:     Returns the individual block motion vectors. */
-
-void oc_mcenc_search(CP_INSTANCE *cpi, 
-		    mc_state *_mcenc,
-		    int _mbi,
-		    int _goldenp,
-		    mv_t *_bmvs,
-		    int *best_err,
-		    int best_block_err[4]){
-  
-  /*TODO: customize error function for speed/(quality+size) tradeoff.*/
-
-  ogg_int32_t     hit_cache[31];
-  ogg_int32_t     hitbit;
-  int             block_err[4];
-  mv_t            best_vec;
-  mv_t            best_block_vec[4];
-  mv_t            cand;
-  int             bi;
-  macroblock_t   *mb = &cpi->macro[_mbi];
-
+  _mcenc:    The motion compensation context.
+  _mbi:      The macro block index.
+  _goldenp:  Nonzero to search the golden frame, zero for the previous frame.
+  _bmvs:     Returns the individual block motion vectors.*/
+void oc_mcenc_search(CP_INSTANCE *cpi,mc_state *_mcenc,int _mbi,
+ int _goldenp,oc_mv _bmvs[4],int *_best_err,int _best_block_err[4]){
+  /*Note: Traditionally this search is done using a rate-distortion objective
+     function of the form D+lambda*R.
+    However, xiphmont tested this and found it produced a small degradation,
+     while requiring extra computation.
+    This is most likely due to Theora's peculiar MV encoding scheme: MVs are
+     not coded relative to a predictor, and the only truly cheap way to use a
+     MV is in the LAST or LAST2 MB modes, which are not being considered here.
+    Therefore if we use the MV found here, it's only because both LAST and
+     LAST2 performed poorly, and therefore the MB is not likely to be uniform
+     or suffer from the aperture problem.
+    Furthermore we would like to re-use the MV found here for as many MBs as
+     possible, so picking a slightly sub-optimal vector to save a bit or two
+     may cause increased degradation in many blocks to come.
+    We could artificially reduce lambda to compensate, but it's faster to just
+     disable it entirely, and use D (the distortion) as the sole criterion.*/
+  macroblock_t *mb;
+  ogg_int32_t   hit_cache[31];
+  ogg_int32_t   hitbit;
+  int           block_err[4];
+  int           best_vec[2];
+  int           best_err;
+  int           best_block_vec[4][2];
+  int           candx;
+  int           candy;
+  int           bi;
+  mb=cpi->macro+_mbi;
   /*Find some candidate motion vectors.*/
   oc_mcenc_find_candidates(cpi,_mcenc,_mbi,_goldenp);
-
   /*Clear the cache of locations we've examined.*/
   memset(hit_cache,0,sizeof(hit_cache));
-
   /*Start with the median predictor.*/
-  cand=_mcenc->candidates[0];
-  hit_cache[cand.y+15]|=(ogg_int32_t)1<<cand.x+15;
-  *best_err = oc_mcenc_ysad_check_mbcandidate_fullpel(cpi,_mcenc,_mbi,cand,
-						     _goldenp,block_err);
-  best_vec=cand;
-  if(_bmvs)
+  candx=_mcenc->candidates[0][0];
+  candy=_mcenc->candidates[0][1];
+  hit_cache[candy+15]|=(ogg_int32_t)1<<candx+15;
+  /*TODO: customize error function for speed/(quality+size) tradeoff.*/
+  best_err=oc_mcenc_ysad_check_mbcandidate_fullpel(cpi,_mcenc,_mbi,
+   candx,candy,_goldenp,block_err);
+  best_vec[0]=candx;
+  best_vec[1]=candy;
+  if(_bmvs){
     for(bi=0;bi<4;bi++){
-      best_block_err[bi]=block_err[bi];
-      best_block_vec[bi]=cand;
+      _best_block_err[bi]=block_err[bi];
+      best_block_vec[bi][0]=candx;
+      best_block_vec[bi][1]=candy;
     }
-  
+  }
   /*If this predictor fails, move on to set A.*/
-  if(*best_err>OC_YSAD_THRESH1){
+  if(best_err>OC_YSAD_THRESH1){
     int err;
     int ci;
     int ncs;
@@ -411,54 +371,57 @@
     /*Compute the early termination threshold for set A.*/
     t2=mb->aerror;
     ncs=OC_MINI(3,mb->ncneighbors);
-    for(ci=0;ci<ncs;ci++)
-      t2=OC_MAXI(t2,cpi->macro[mb->cneighbors[ci]].aerror);
+    for(ci=0;ci<ncs;ci++)t2=OC_MAXI(t2,cpi->macro[mb->cneighbors[ci]].aerror);
     t2+=(t2>>OC_YSAD_THRESH2_SCALE_BITS)+OC_YSAD_THRESH2_OFFSET;
-
     /*Examine the candidates in set A.*/
     for(ci=1;ci<_mcenc->setb0;ci++){
-      cand=_mcenc->candidates[ci];
-
+      candx=_mcenc->candidates[ci][0];
+      candy=_mcenc->candidates[ci][1];
       /*If we've already examined this vector, then we would be using it if it
-	was better than what we are using.*/
-      hitbit=(ogg_int32_t)1<<cand.x+15;
-      if(hit_cache[cand.y+15]&hitbit)continue;
-      hit_cache[cand.y+15]|=hitbit;
-      err=oc_mcenc_ysad_check_mbcandidate_fullpel(cpi,_mcenc,_mbi,cand,_goldenp,block_err);
-      if(err<*best_err){
-        *best_err=err;
-        best_vec=cand;
+         was better than what we are using.*/
+      hitbit=(ogg_int32_t)1<<candx+15;
+      if(hit_cache[candy+15]&hitbit)continue;
+      hit_cache[candy+15]|=hitbit;
+      err=oc_mcenc_ysad_check_mbcandidate_fullpel(cpi,_mcenc,_mbi,
+       candx,candy,_goldenp,block_err);
+      if(err<best_err){
+        best_err=err;
+        best_vec[0]=candx;
+        best_vec[1]=candy;
       }
-      if(_bmvs)
-	for(bi=0;bi<4;bi++)
-	  if(block_err[bi]<best_block_err[bi]){
-	    best_block_err[bi]=block_err[bi];
-	    best_block_vec[bi]=cand;
-	  }
+      if(_bmvs){
+        for(bi=0;bi<4;bi++)if(block_err[bi]<_best_block_err[bi]){
+          _best_block_err[bi]=block_err[bi];
+          best_block_vec[bi][0]=candx;
+          best_block_vec[bi][1]=candy;
+        }
+      }
     }
-
-    if(*best_err>t2){
+    if(best_err>t2){
       /*Examine the candidates in set B.*/
       for(;ci<_mcenc->ncandidates;ci++){
-        cand=_mcenc->candidates[ci];
-        hitbit=(ogg_int32_t)1<<cand.x+15;
-        if(hit_cache[cand.y+15]&hitbit)continue;
-        hit_cache[cand.y+15]|=hitbit;
-        err=oc_mcenc_ysad_check_mbcandidate_fullpel(cpi,_mcenc,_mbi,cand,_goldenp,block_err);
-        if(err<*best_err){
-          *best_err=err;
-          best_vec=cand;
+        candx=_mcenc->candidates[ci][0];
+        candy=_mcenc->candidates[ci][1];
+        hitbit=(ogg_int32_t)1<<candx+15;
+        if(hit_cache[candy+15]&hitbit)continue;
+        hit_cache[candy+15]|=hitbit;
+        err=oc_mcenc_ysad_check_mbcandidate_fullpel(cpi,_mcenc,_mbi,
+         candx,candy,_goldenp,block_err);
+        if(err<best_err){
+          best_err=err;
+          best_vec[0]=candx;
+          best_vec[1]=candy;
         }
-	if(_bmvs)
-	  for(bi=0;bi<4;bi++)
-	    if(block_err[bi]<best_block_err[bi]){
-	      best_block_err[bi]=block_err[bi];
-	      best_block_vec[bi]=cand;
-	    }
+        if(_bmvs){
+          for(bi=0;bi<4;bi++)if(block_err[bi]<_best_block_err[bi]){
+            _best_block_err[bi]=block_err[bi];
+            best_block_vec[bi][0]=candx;
+            best_block_vec[bi][1]=candy;
+          }
+        }
       }
-
       /*Use the same threshold for set B as in set A.*/
-      if(*best_err>t2){
+      if(best_err>t2){
         int best_site;
         int nsites;
         int sitei;
@@ -468,161 +431,153 @@
         for(;;){
           best_site=4;
           /*Compose the bit flags for boundary conditions.*/
-          b=OC_DIV16(-best_vec.x+1)|OC_DIV16(best_vec.x+1)<<1|
-	    OC_DIV16(-best_vec.y+1)<<2|OC_DIV16(best_vec.y+1)<<3;
+          b=OC_DIV16(-best_vec[0]+1)|OC_DIV16(best_vec[0]+1)<<1|
+           OC_DIV16(-best_vec[1]+1)<<2|OC_DIV16(best_vec[1]+1)<<3;
           nsites=OC_SQUARE_NSITES[b];
           for(sitei=0;sitei<nsites;sitei++){
             site=OC_SQUARE_SITES[b][sitei];
-            cand.x=best_vec.x+OC_SQUARE_DX[site];
-            cand.y=best_vec.y+OC_SQUARE_DY[site];
-            hitbit=(ogg_int32_t)1<<cand.x+15;
-            if(hit_cache[cand.y+15]&hitbit)continue;
-            hit_cache[cand.y+15]|=hitbit;
-            err=oc_mcenc_ysad_check_mbcandidate_fullpel(cpi,_mcenc,_mbi,cand,_goldenp,block_err);
-            if(err<*best_err){
-              *best_err=err;
+            candx=best_vec[0]+OC_SQUARE_DX[site];
+            candy=best_vec[1]+OC_SQUARE_DY[site];
+            hitbit=(ogg_int32_t)1<<candx+15;
+            if(hit_cache[candy+15]&hitbit)continue;
+            hit_cache[candy+15]|=hitbit;
+            err=oc_mcenc_ysad_check_mbcandidate_fullpel(cpi,_mcenc,_mbi,
+             candx,candy,_goldenp,block_err);
+            if(err<best_err){
+              best_err=err;
               best_site=site;
             }
-	    if(_bmvs)
-	      for(bi=0;bi<4;bi++)
-		if(block_err[bi]<best_block_err[bi]){
-		  best_block_err[bi]=block_err[bi];
-		  best_block_vec[bi]=cand;
-		}
+            if(_bmvs){
+              for(bi=0;bi<4;bi++)if(block_err[bi]<_best_block_err[bi]){
+                _best_block_err[bi]=block_err[bi];
+                best_block_vec[bi][0]=candx;
+                best_block_vec[bi][1]=candy;
+              }
+            }
           }
           if(best_site==4)break;
-          best_vec.x+=OC_SQUARE_DX[best_site];
-          best_vec.y+=OC_SQUARE_DY[best_site];
+          best_vec[0]+=OC_SQUARE_DX[best_site];
+          best_vec[1]+=OC_SQUARE_DY[best_site];
         }
-
         /*Final 4-MV search.*/
         /*Simply use 1/4 of the macro block set A and B threshold as the
            individual block threshold.*/
-	if(_bmvs){
-	  t2>>=2;
-	  for(bi=0;bi<4;bi++)
-	    if(best_block_err[bi]>t2){
-	      /*Square pattern search. We do this in a slightly interesting manner.
-		We continue to check the SAD of all four blocks in the macroblock.
-		This gives us two things:
-		
- 	        1) We can continue to use the hit_cache to avoid
-		   duplicate checks.  Otherwise we could continue to
-		   read it, but not write to it without saving and
-		   restoring it for each block.  Note that we could
-		   still eliminate a large number of duplicate checks
-		   by taking into account the site we came from when
-		   choosing the site list.  We can still do that to
-		   avoid extra hit_cache queries, and it might even be
-		   a speed win.
-
-		2) It gives us a slightly better chance of escaping local minima.
-		   We would not be here if we weren't doing a fairly bad job in
-		   finding a good vector, and checking these vectors can save us
-		   from 100 to several thousand points off our SAD 1 in 15
-		   times.
-
-		TODO: Is this a good idea?
-		Who knows. It needs more testing.*/
-
-	      for(;;){
-		mv_t best;
-		int bj;
-		best=best_block_vec[bi];
-		/*Compose the bit flags for boundary conditions.*/
-		b=OC_DIV16(-best.x+1)|OC_DIV16(best.x+1)<<1|
-		  OC_DIV16(-best.y+1)<<2|OC_DIV16(best.y+1)<<3;
-		nsites=OC_SQUARE_NSITES[b];
-		for(sitei=0;sitei<nsites;sitei++){
-		  site=OC_SQUARE_SITES[b][sitei];
-		  cand.x=best.x+OC_SQUARE_DX[site];
-		  cand.y=best.y+OC_SQUARE_DY[site];
-		  hitbit=(ogg_int32_t)1<<cand.x+15;
-		  if(hit_cache[cand.y+15]&hitbit)continue;
-		  hit_cache[cand.y+15]|=hitbit;
-		  err=oc_mcenc_ysad_check_mbcandidate_fullpel(cpi,_mcenc,_mbi,cand,_goldenp,block_err);
-		  if(err<*best_err){
-		    *best_err=err;
-		    best_vec=cand;
-		  }
-		  for(bj=0;bj<4;bj++)
-		    if(block_err[bj]<best_block_err[bj]){
-		      best_block_err[bj]=block_err[bj];
-		      best_block_vec[bj]=cand;
-		    }
-		  
-		}
-		if(best_block_vec[bi].x==best.x && best_block_vec[bi].y==best.y) break;
-	      }
-	    }
-	}
+        if(_bmvs){
+          t2>>=2;
+          for(bi=0;bi<4;bi++){
+            if(_best_block_err[bi]>t2){
+              /*Square pattern search.
+                We do this in a slightly interesting manner.
+                We continue to check the SAD of all four blocks in the
+                 macro block.
+                This gives us two things:
+                 1) We can continue to use the hit_cache to avoid duplicate
+                     checks.
+                    Otherwise we could continue to read it, but not write to it
+                     without saving and restoring it for each block.
+                    Note that we could still eliminate a large number of
+                     duplicate checks by taking into account the site we came
+                     from when choosing the site list.
+                    We can still do that to avoid extra hit_cache queries, and
+                     it might even be a speed win.
+                 2) It gives us a slightly better chance of escaping local
+                     minima.
+                    We would not be here if we weren't doing a fairly bad job
+                     of finding a good vector, and checking these vectors can
+                     take anywhere from 100 to several thousand points off our
+                     SAD 1 time in 15.
+                TODO: Is this a good idea?
+                Who knows.
+                It needs more testing.*/
+              for(;;){
+                int bestx;
+                int besty;
+                int bj;
+                bestx=best_block_vec[bi][0];
+                besty=best_block_vec[bi][1];
+                /*Compose the bit flags for boundary conditions.*/
+                b=OC_DIV16(-bestx+1)|OC_DIV16(bestx+1)<<1|
+                 OC_DIV16(-besty+1)<<2|OC_DIV16(besty+1)<<3;
+                nsites=OC_SQUARE_NSITES[b];
+                for(sitei=0;sitei<nsites;sitei++){
+                  site=OC_SQUARE_SITES[b][sitei];
+                  candx=bestx+OC_SQUARE_DX[site];
+                  candy=besty+OC_SQUARE_DY[site];
+                  hitbit=(ogg_int32_t)1<<candx+15;
+                  if(hit_cache[candy+15]&hitbit)continue;
+                  hit_cache[candy+15]|=hitbit;
+                  err=oc_mcenc_ysad_check_mbcandidate_fullpel(cpi,_mcenc,_mbi,
+                   candx,candy,_goldenp,block_err);
+                  if(err<best_err){
+                    best_err=err;
+                    best_vec[0]=candx;
+                    best_vec[1]=candy;
+                  }
+                  for(bj=0;bj<4;bj++)if(block_err[bj]<_best_block_err[bj]){
+                    _best_block_err[bj]=block_err[bj];
+                    best_block_vec[bj][0]=candx;
+                    best_block_vec[bj][1]=candy;
+                  }
+                }
+                if(best_block_vec[bi][0]==bestx&&best_block_vec[bi][1]==besty){
+                  break;
+                }
+              }
+            }
+          }
+        }
       }
     }
   }
-
-  if(!_goldenp) 
-    mb->aerror = *best_err;
-  else
-    mb->gerror = *best_err;
-
-  mb->analysis_mv[0][_goldenp].x=best_vec.x<<1;;
-  mb->analysis_mv[0][_goldenp].y=best_vec.y<<1;;
-
+  if(!_goldenp)mb->aerror=best_err;
+  else mb->gerror=best_err;
+  mb->analysis_mv[0][_goldenp][0]=(signed char)(best_vec[0]<<1);
+  mb->analysis_mv[0][_goldenp][1]=(signed char)(best_vec[1]<<1);
   if(_bmvs){
     for(bi=0;bi<4;bi++){
-      _bmvs[bi].x=best_block_vec[bi].x<<1;
-      _bmvs[bi].y=best_block_vec[bi].y<<1;
+      _bmvs[bi][0]=(signed char)(best_block_vec[bi][0]<<1);
+      _bmvs[bi][1]=(signed char)(best_block_vec[bi][1]<<1);
     }
   }
+  *_best_err=best_err;
 }
 
-
-void oc_mcenc_refine1mv(CP_INSTANCE *cpi, 
-		       int _mbi,
-		       int _goldenp,
-		       int err){
-
-  macroblock_t *mb = &cpi->macro[_mbi];
-  mv_t mv;
-  mv.x = mb->analysis_mv[0][_goldenp].x/2;
-  mv.y = mb->analysis_mv[0][_goldenp].y/2;
-  
-  err=oc_mcenc_ysad_halfpel_mbrefine(cpi,_mbi,&mv,err,_goldenp);
-  mb->analysis_mv[0][_goldenp]=mv;
-  if(!_goldenp)
-    mb->aerror = err;
-  else
-    mb->gerror = err;
-
+void oc_mcenc_refine1mv(CP_INSTANCE *cpi,int _mbi,int _goldenp,int _err){
+  macroblock_t *mb;
+  int           vec[2];
+  mb=cpi->macro+_mbi;
+  vec[0]=OC_DIV2(mb->analysis_mv[0][_goldenp][0]);
+  vec[1]=OC_DIV2(mb->analysis_mv[0][_goldenp][1]);
+  _err=oc_mcenc_ysad_halfpel_mbrefine(cpi,_mbi,vec,_err,_goldenp);
+  mb->analysis_mv[0][_goldenp][0]=(signed char)vec[0];
+  mb->analysis_mv[0][_goldenp][1]=(signed char)vec[1];
+  if(!_goldenp)mb->aerror=_err;
+  else mb->gerror=_err;
 }
 
-void oc_mcenc_refine4mv(CP_INSTANCE *cpi, 
-		       int _mbi,
-		       int err[4]){
-  macroblock_t *mb = &cpi->macro[_mbi];
-  int bi;
+void oc_mcenc_refine4mv(CP_INSTANCE *cpi,int _mbi,int _err[4]){
+  macroblock_t *mb;
+  int           bi;
+  mb=cpi->macro+_mbi;
   for(bi=0;bi<4;bi++){
-    mv_t mv;
-    mv.x = mb->mv[bi].x/2;
-    mv.y = mb->mv[bi].y/2;
-    oc_mcenc_ysad_halfpel_brefine(cpi,_mbi,bi,&mv,err[bi],0);
-    mb->mv[bi]=mv;
+    int vec[2];
+    vec[0]=OC_DIV2(mb->block_mv[bi][0]);
+    vec[1]=OC_DIV2(mb->block_mv[bi][1]);
+    oc_mcenc_ysad_halfpel_brefine(cpi,_mbi,bi,vec,_err[bi],0);
+    mb->ref_mv[bi][0]=(signed char)vec[0];
+    mb->ref_mv[bi][1]=(signed char)vec[1];
   }
 }
 
-void oc_mcenc_start(CP_INSTANCE *cpi,
-                    mc_state *mcenc){
-
-  ogg_int64_t  nframes;
-
+void oc_mcenc_start(CP_INSTANCE *cpi,mc_state *_mcenc){
+  ogg_int64_t nframes;
   /*Set up the accelerated MV weights for previous frame prediction.*/
-  mcenc->mvapw1[OC_FRAME_PREV]=(ogg_int32_t)1<<17;
-  mcenc->mvapw2[OC_FRAME_PREV]=(ogg_int32_t)1<<16;
-
+  _mcenc->mvapw1[OC_FRAME_PREV]=(ogg_int32_t)1<<17;
+  _mcenc->mvapw2[OC_FRAME_PREV]=(ogg_int32_t)1<<16;
   /*Set up the accelerated MV weights for golden frame prediction.*/
   nframes=cpi->LastKeyFrame;
-
-  mcenc->mvapw1[OC_FRAME_GOLD]=(ogg_int32_t)(nframes!=1?(nframes<<17)/(nframes-1):0);
-  mcenc->mvapw2[OC_FRAME_GOLD]=(ogg_int32_t)(nframes!=2?(nframes<<16)/(nframes-2):0);
-
+  _mcenc->mvapw1[OC_FRAME_GOLD]=(ogg_int32_t)(
+   nframes!=1?(nframes<<17)/(nframes-1):0);
+  _mcenc->mvapw2[OC_FRAME_GOLD]=(ogg_int32_t)(
+   nframes!=2?(nframes<<16)/(nframes-2):0);
 }

Modified: branches/theora-thusnelda/lib/enc/mode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/mode.c	2009-03-19 17:10:23 UTC (rev 15801)
+++ branches/theora-thusnelda/lib/enc/mode.c	2009-03-20 03:32:25 UTC (rev 15802)
@@ -20,200 +20,189 @@
 #include "mode_select.h"
 #include "encoder_lookup.h"
 
-/* Mode decision is done by exhaustively examining all potential
-   choices.  Since we use a minimum-quality encoding strategy, this
-   amounts to simply selecting the mode which uses the smallest number
-   of bits, since the minimum quality will be met in any mode.
-   Obviously, doing the motion compensation, fDCT, tokenization, and
-   then counting the bits each token uses is computationally
-   expensive.  Theora's EOB runs can also split the cost of these
-   tokens across multiple fragments, and naturally we don't know what
-   the optimal choice of Huffman codes will be until we know all the
-   tokens we're going to encode in all the fragments.
-
-   So we use a simple approach to estimating the bit cost of each mode
-   based upon the SAD value of the residual.  The mathematics behind
-   the technique are outlined by Kim \cite{Kim03}, but the process is
-   very simple.  For each quality index and SAD value, we have a table
-   containing the average number of bits needed to code a fragment.
-   The SAD values are placed into a small number of bins (currently
-   16).  The bit counts are obtained by examining actual encoded
-   frames, with optimal Huffman codes selected and EOB bits
-   appropriately divided among all the blocks they involve.  A
-   separate QIxSAD table is kept for each mode and color plane.  It
-   may be possible to combine many of these, but only experimentation
+/*Mode decision is done by exhaustively examining all potential choices.
+  Obviously, doing the motion compensation, fDCT, tokenization, and then
+   counting the bits each token uses is computationally expensive.
+  Theora's EOB runs can also split the cost of these tokens across multiple
+   fragments, and naturally we don't know what the optimal choice of Huffman
+   codes will be until we know all the tokens we're going to encode in all the
+   fragments.
+  So we use a simple approach to estimating the bit cost of each mode based
+   upon the SAD value of the residual.
+  The mathematics behind the technique are outlined by Kim \cite{Kim03}, but
+   the process is very simple.
+  For each quality index and SAD value, we have a table containing the average
+   number of bits needed to code a fragment.
+  The SAD values are placed into a small number of bins (currently 24).
+  TODO: The remaining portion is no longer current.
+  The bit counts are obtained by examining actual encoded frames, with optimal
+   Huffman codes selected and EOB bits appropriately divided among all the
+   blocks they involve.
+  A separate QIxSAD table is kept for each mode and color plane.
+  It may be possible to combine many of these, but only experimentation
    will tell which ones truly represent the same distribution.
 
-   @ARTICLE{Kim03,
-     author="Hyun Mun Kim",
-     title="Adaptive Rate Control Using Nonlinear Regression",
-     journal="IEEE Transactions on Circuits and Systems for Video
-     Technology",
-     volume=13,
-     number=5,
-     pages="432--439",
-     month="May",
-     year=2003
-   }
+  @ARTICLE{Kim03,
+    author="Hyun Mun Kim",
+    title="Adaptive Rate Control Using Nonlinear Regression",
+    journal="IEEE Transactions on Circuits and Systems for Video
+    Technology",
+    volume=13,
+    number=5,
+    pages="432--439",
+    month="May",
+    year=2003
+  }*/
 
-*/
+/*Pointers to the lists of code bit lengths used by each mode scheme.
+  Schemes 0-6 share one variable-length code (VLC), while scheme 7 uses a
+   fixed-length code (FLC).*/
+static const unsigned char *OC_MODE_SCHEME_BITS[8]={
+  ModeBitLengths,
+  ModeBitLengths,
+  ModeBitLengths,
+  ModeBitLengths,
+  ModeBitLengths,
+  ModeBitLengths,
+  ModeBitLengths,
+  ModeBitLengthsD,
+};
 
-/* Initialize the mode scheme chooser.
-   
-   Schemes 0-6 use a highly unbalanced Huffman code to code each of
-   the modes.  The same set of Huffman codes is used for each of these
-   7 schemes, but the mode assigned to each code varies.
+/*Initialize the mode scheme chooser.
+  This need only be called once per encoder.
+  This is probably the best place to describe the various schemes Theora uses
+   to encode macro block modes.
+  There are 8 possible schemes.
+  Schemes 0-6 use a highly unbalanced Huffman code to code each of the modes.
+  The same set of Huffman codes is used for each of these 7 schemes, but the
+   mode assigned to each code varies.
+  Schemes 1-6 have a fixed mapping from Huffman code to MB mode, while scheme 0
+   writes a custom mapping to the bitstream before all the modes.
+  Finally, scheme 7 just encodes each mode directly in 3 bits.*/
+void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser){
+  int si;
+  _chooser->mode_ranks[0]=_chooser->scheme0_ranks;
+  for(si=1;si<8;si++)_chooser->mode_ranks[si]=ModeSchemes[si-1];
+}
 
-   Schemes 1-6 have a fixed mapping from Huffman code to MB mode,
-   while scheme 0 writes a custom mapping to the bitstream before all
-   the modes.  Finally, scheme 7 just encodes each mode directly in 3
-   bits. 
-
-*/
-
-void oc_mode_scheme_chooser_init(CP_INSTANCE *cpi){
-  oc_mode_scheme_chooser *chooser = &cpi->chooser;
-  int i;
-
-  for(i=0;i<7;i++)
-    chooser->mode_bits[i] = ModeBitLengths;
-  chooser->mode_bits[7] = ModeBitLengthsD;
-  
-  chooser->mode_ranks[0] = chooser->scheme0_ranks;
-  for(i=1;i<8;i++)
-    chooser->mode_ranks[i] = ModeSchemes[i-1];
-
-  memset(chooser->mode_counts,0,OC_NMODES*sizeof(*chooser->mode_counts));
-  
-  /* Scheme 0 starts with 24 bits to store the mode list in. */
-  chooser->scheme_bits[0] = 24;
-  memset(chooser->scheme_bits+1,0,7*sizeof(*chooser->scheme_bits));
-  for(i=0;i<8;i++){
-    /* Scheme 7 should always start first, and scheme 0 should always start
-       last. */
-    chooser->scheme_list[i] = 7-i;
-    chooser->scheme0_list[i] = chooser->scheme0_ranks[i] = i;
+/*Reset the mode scheme chooser.
+  This needs to be called once for each frame, including the first.*/
+static void oc_mode_scheme_chooser_reset(oc_mode_scheme_chooser *_chooser){
+  int si;
+  memset(_chooser->mode_counts,0,OC_NMODES*sizeof(*_chooser->mode_counts));
+  /*Scheme 0 starts with 24 bits to store the mode list in.*/
+  _chooser->scheme_bits[0]=24;
+  memset(_chooser->scheme_bits+1,0,7*sizeof(*_chooser->scheme_bits));
+  for(si=0;si<8;si++){
+    /*Scheme 7 should always start first, and scheme 0 should always start
+       last.*/
+    _chooser->scheme_list[si]=7-si;
+    _chooser->scheme0_list[si]=_chooser->scheme0_ranks[si]=si;
   }
 }
 
-/* This is the real purpose of this data structure: not actually
-   selecting a mode scheme, but estimating the cost of coding a given
-   mode given all the modes selected so far.
-
-   This is done via opportunity cost: the cost is defined as the number of bits
+/*This is the real purpose of this data structure: not actually selecting a
+   mode scheme, but estimating the cost of coding a given mode given all the
+   modes selected so far.
+  This is done via opportunity cost: the cost is defined as the number of bits
    required to encode all the modes selected so far including the current one
    using the best possible scheme, minus the number of bits required to encode
    all the modes selected so far not including the current one using the best
    possible scheme.
-  
-   The computational expense of doing this probably makes it overkill.
-   Just be happy we take a greedy approach instead of trying to solve the
+  The computational expense of doing this probably makes it overkill.
+  Just be happy we take a greedy approach instead of trying to solve the
    global mode-selection problem (which is NP-hard).
-   _mode: The mode to determine the cost of.
-   Return: The number of bits required to code this mode.*/
-
-int oc_mode_cost(CP_INSTANCE *cpi,
-		 int _mode){
-
-  oc_mode_scheme_chooser *chooser = &cpi->chooser;
-  int scheme0 = chooser->scheme_list[0];
-  int scheme1 = chooser->scheme_list[1];
-  int best_bits = chooser->scheme_bits[scheme0];
-  int mode_bits = chooser->mode_bits[scheme0][chooser->mode_ranks[scheme0][_mode]];
+  _mode: The mode to determine the cost of.
+  Return: The number of bits required to code this mode.*/
+static int oc_mode_scheme_chooser_cost(oc_mode_scheme_chooser *_chooser,
+ int _mode){
+  int scheme0;
+  int scheme1;
+  int best_bits;
+  int mode_bits;
   int si;
   int scheme_bits;
-
-  /*Typical case: If the difference between the best scheme and the
-     next best is greater than 6 bits, then adding just one mode
-     cannot change which scheme we use.*/
-
-  if(chooser->scheme_bits[scheme1]-best_bits > 6) return mode_bits;
-
-  /*Otherwise, check to see if adding this mode selects a different scheme as the best.*/
-  si = 1;
-  best_bits += mode_bits;
-
+  scheme0=_chooser->scheme_list[0];
+  scheme1=_chooser->scheme_list[1];
+  best_bits=_chooser->scheme_bits[scheme0];
+  mode_bits=OC_MODE_SCHEME_BITS[scheme0][_chooser->mode_ranks[scheme0][_mode]];
+  /*Typical case: If the difference between the best scheme and the next best
+     is greater than 6 bits, then adding just one mode cannot change which
+     scheme we use.*/
+  if(_chooser->scheme_bits[scheme1]-best_bits>6)return mode_bits;
+  /*Otherwise, check to see if adding this mode selects a different scheme as
+     the best.*/
+  si=1;
+  best_bits+=mode_bits;
   do{
-    /* For any scheme except 0, we can just use the bit cost of the mode's rank in that scheme.*/
+    /*For any scheme except 0, we can just use the bit cost of the mode's rank
+       in that scheme.*/
     if(scheme1!=0){
-
-      scheme_bits = chooser->scheme_bits[scheme1]+
-	chooser->mode_bits[scheme1][chooser->mode_ranks[scheme1][_mode]];
-
-    }else{
+      scheme_bits=_chooser->scheme_bits[scheme1]+
+       OC_MODE_SCHEME_BITS[scheme1][_chooser->mode_ranks[scheme1][_mode]];
+    }
+    else{
       int ri;
-
-      /* For scheme 0, incrementing the mode count could potentially
-         change the mode's rank.
-
-        Find the index where the mode would be moved to in the optimal
-        list, and use its bit cost instead of the one for the mode's
-        current position in the list. */
-
-      /* don't recompute scheme bits; this is computing opportunity
-	 cost, not an update. */
-
-      for(ri = chooser->scheme0_ranks[_mode] ; ri>0 &&
-	    chooser->mode_counts[_mode]>=
-	    chooser->mode_counts[chooser->scheme0_list[ri-1]] ; ri--);
-
-      scheme_bits = chooser->scheme_bits[0] + ModeBitLengths[ri];
+      /*For scheme 0, incrementing the mode count could potentially change the
+         mode's rank.
+        Find the index where the mode would be moved to in the optimal list,
+         and use its bit cost instead of the one for the mode's current
+         position in the list.*/
+      /*We don't recompute scheme bits; this is computing opportunity cost, not
+         an update.*/
+      for(ri=_chooser->scheme0_ranks[_mode];ri>0&&
+       _chooser->mode_counts[_mode]>=
+       _chooser->mode_counts[_chooser->scheme0_list[ri-1]];ri--);
+      scheme_bits=_chooser->scheme_bits[0]+ModeBitLengths[ri];
     }
-
-    if(scheme_bits<best_bits) best_bits = scheme_bits;
-    if(++si>=8) break;
-    scheme1 = chooser->scheme_list[si];
-  } while(chooser->scheme_bits[scheme1] - chooser->scheme_bits[scheme0] <= 6);
-
-  return best_bits - chooser->scheme_bits[scheme0];
+    if(scheme_bits<best_bits)best_bits=scheme_bits;
+    if(++si>=8)break;
+    scheme1=_chooser->scheme_list[si];
+  }
+  while(_chooser->scheme_bits[scheme1]-_chooser->scheme_bits[scheme0]<=6);
+  return best_bits-_chooser->scheme_bits[scheme0];
 }
 
-/* Incrementally update the mode counts and per-scheme bit counts and re-order the scheme
-   lists once a mode has been selected.
-
+/*Incrementally update the mode counts and per-scheme bit counts and re-order
+   the scheme lists once a mode has been selected.
   _mode: The mode that was chosen.*/
-
-static void oc_mode_set( CP_INSTANCE *cpi,
-			 macroblock_t *mb,
-			 int _mode){
-
-  oc_mode_scheme_chooser *chooser = &cpi->chooser;
+static void oc_mode_scheme_chooser_update(oc_mode_scheme_chooser *_chooser,
+ int _mode){
   int ri;
   int si;
-
-  chooser->mode_counts[_mode]++;
-
-  /* Re-order the scheme0 mode list if necessary. */
-  for(ri = chooser->scheme0_ranks[_mode]; ri>0; ri--){
+  _chooser->mode_counts[_mode]++;
+  /*Re-order the scheme0 mode list if necessary.*/
+  for(ri=_chooser->scheme0_ranks[_mode];ri>0;ri--){
     int pmode;
-    pmode=chooser->scheme0_list[ri-1];
-    if(chooser->mode_counts[pmode] >= chooser->mode_counts[_mode])break;
-
-    /* reorder the mode ranking */
-    chooser->scheme0_ranks[pmode]++;
-    chooser->scheme0_list[ri]=pmode;
-
+    pmode=_chooser->scheme0_list[ri-1];
+    if(_chooser->mode_counts[pmode]>=_chooser->mode_counts[_mode])break;
+    /*Reorder the mode ranking.*/
+    _chooser->scheme0_ranks[pmode]++;
+    _chooser->scheme0_list[ri]=pmode;
   }
-  chooser->scheme0_ranks[_mode]=ri;
-  chooser->scheme0_list[ri]=_mode;
-
+  _chooser->scheme0_ranks[_mode]=ri;
+  _chooser->scheme0_list[ri]=_mode;
   /*Now add the bit cost for the mode to each scheme.*/
   for(si=0;si<8;si++){
-    chooser->scheme_bits[si]+=
-      chooser->mode_bits[si][chooser->mode_ranks[si][_mode]];
+    _chooser->scheme_bits[si]+=
+     OC_MODE_SCHEME_BITS[si][_chooser->mode_ranks[si][_mode]];
   }
-
-  /* Finally, re-order the list of schemes. */
+  /*Finally, re-order the list of schemes.*/
   for(si=1;si<8;si++){
-    int sj = si;
-    int scheme0 = chooser->scheme_list[si];
-    int bits0 = chooser->scheme_bits[scheme0];
+    int sj;
+    int scheme0;
+    int bits0;
+    sj=si;
+    scheme0=_chooser->scheme_list[si];
+    bits0=_chooser->scheme_bits[scheme0];
     do{
-      int scheme1 = chooser->scheme_list[sj-1];
-      if(bits0 >= chooser->scheme_bits[scheme1]) break;
-      chooser->scheme_list[sj] = scheme1;
-    } while(--sj>0);
-    chooser->scheme_list[sj]=scheme0;
+      int scheme1;
+      scheme1=_chooser->scheme_list[sj-1];
+      if(bits0>=_chooser->scheme_bits[scheme1])break;
+      _chooser->scheme_list[sj]=scheme1;
+    }
+    while(--sj>0);
+    _chooser->scheme_list[sj]=scheme0;
   }
 }
 
@@ -223,17 +212,17 @@
   ogg_int32_t acc = 0;
   int stride = cpi->stride[plane];
   int j,k;
-  
+
   for(j=0;j<8;j++){
     for(k=0;k<8;k++)
-      acc += b[k]; 
+      acc += b[k];
     b += stride;
   }
-  
+
   b = cpi->frame + cpi->frag_buffer_index[fi];
   for(j=0;j<8;j++){
     for(k=0;k<8;k++)
-      sad += abs ((b[k]<<6)-acc); 
+      sad += abs ((b[k]<<6)-acc);
     b += stride;
   }
 
@@ -249,10 +238,10 @@
   return (ret>0?ret:0);
 }
 
-static const int mvmap[2][63] = {
+static const signed char OC_MVMAP[2][64]={
   {     -15,-15,-14, -14,-13,-13,-12, -12,-11,-11,-10, -10, -9, -9, -8,
      -8, -7, -7, -6,  -6, -5, -5, -4,  -4, -3, -3, -2,  -2, -1, -1,  0,
-      0,  0,  1,  1,   2,  2,  3,  3,   4,  4,  5,  5,   6,  6,  7,  7, 
+      0,  0,  1,  1,   2,  2,  3,  3,   4,  4,  5,  5,   6,  6,  7,  7,
       8,  8,  9,  9,  10, 10, 11, 11,  12, 12, 13, 13,  14, 14, 15, 15 },
   {      -7, -7, -7,  -7, -6, -6, -6,  -6, -5, -5, -5,  -5, -4, -4, -4,
      -4, -3, -3, -3,  -3, -2, -2, -2,  -2, -1, -1, -1,  -1,  0,  0,  0,
@@ -260,7 +249,7 @@
       4,  4,  4,  4,   5,  5,  5,  5,   6,  6,  6,  6,   7,  7,  7,  7 }
 };
 
-static const int mvmap2[2][63] = {
+static const signed char OC_MVMAP2[2][63]={
   {   -1, 0,-1,  0,-1, 0,-1,  0,-1, 0,-1,  0,-1, 0,-1,
     0,-1, 0,-1,  0,-1, 0,-1,  0,-1, 0,-1,  0,-1, 0,-1,
     0, 1, 0, 1,  0, 1, 0, 1,  0, 1, 0, 1,  0, 1, 0, 1,
@@ -271,205 +260,230 @@
     0, 1, 1, 1,  0, 1, 1, 1,  0, 1, 1, 1,  0, 1, 1, 1 }
 };
 
-static int BInterSAD(CP_INSTANCE *cpi, int fi, int plane, int goldenp, mv_t mv){
-  int sad = 0;
-  unsigned char *b = cpi->frame + cpi->frag_buffer_index[fi];
-  unsigned char *r = (goldenp ? cpi->golden : cpi->lastrecon ) + 
-    cpi->frag_buffer_index[fi];
-  int stride = cpi->stride[plane];
+int oc_get_mv_offsets(int _offsets[2],int _dx,int _dy,
+ int _ystride,int _pli,int _pf){
+  int qpx;
+  int qpy;
+  int mx;
+  int my;
+  int mx2;
+  int my2;
+  int offs;
+  qpy=!(_pf&2)&&_pli;
+  my=OC_MVMAP[qpy][_dy+31];
+  my2=OC_MVMAP2[qpy][_dy+31];
+  qpx=!(_pf&1)&&_pli;
+  mx=OC_MVMAP[qpx][_dx+31];
+  mx2=OC_MVMAP2[qpx][_dx+31];
+  offs=my*_ystride+mx;
+  if(mx2||my2){
+    _offsets[1]=offs+my2*_ystride+mx2;
+    _offsets[0]=offs;
+    return 2;
+  }
+  _offsets[0]=offs;
+  return 1;
+}
 
-  if(mv.x || mv.y){
-    int qp = (plane>0);
-    int mx = mvmap[qp][mv.x+31];
-    int my = mvmap[qp][mv.y+31];
-    int mx2 = mvmap2[qp][mv.x+31];
-    int my2 = mvmap2[qp][mv.y+31];
-    
-    r += my * stride + mx;
-    
-    if(mx2 || my2){
-      unsigned char *r2 = r + my2 * stride + mx2;
-      sad =  dsp_sad8x8_xy2_thres (cpi->dsp, b, r, r2, stride, 9999999);
-    }else{
-      sad =  dsp_sad8x8 (cpi->dsp, b, r, stride);
+static int BInterSAD(CP_INSTANCE *cpi,int _fi,int _dx,int _dy,
+ int _pli,int _goldenp){
+  unsigned char *b;
+  unsigned char *r;
+  int            stride;
+  int            sad;
+  b=cpi->frame+cpi->frag_buffer_index[_fi];
+  r=(_goldenp?cpi->golden:cpi->lastrecon)+cpi->frag_buffer_index[_fi];
+  stride=cpi->stride[_pli];
+  sad=0;
+  if(_dx||_dy){
+    int offs[2];
+    if(oc_get_mv_offsets(offs,_dx,_dy,
+     cpi->stride[_pli],_pli,cpi->info.pixelformat)>1){
+      sad=dsp_sad8x8_xy2_thres(cpi->dsp,b,r+offs[0],r+offs[1],stride,16384);
     }
-  }else
-    sad =  dsp_sad8x8 (cpi->dsp, b, r, stride);
-
-  if(plane)
-    return sad<<2;
-  else
-    return sad;
+    else sad=dsp_sad8x8(cpi->dsp,b,r+offs[0],stride);
+  }
+  /*TODO: Is this special case worth it?*/
+  else sad=dsp_sad8x8(cpi->dsp,b,r,stride);
+  /*TODO: <<2? Really? Why?*/
+  if(_pli)return sad<<2;
+  else return sad;
 }
 
-static int cost_intra(CP_INSTANCE *cpi, int qi, int mbi, ogg_uint32_t *intrabits, int *overhead){
-  macroblock_t *mb = &cpi->macro[mbi];
-  int i,j;
-  int cost = 0;
-  for(i=0;i<3;i++){
-    for(j=0;j<4;j++){
-      int fi=mb->Ryuv[i][j];
+static int cost_intra(CP_INSTANCE *cpi,int _qi,int _mbi,
+ ogg_uint32_t *_intrabits,int *_overhead){
+  macroblock_t *mb;
+  int           pli;
+  int           bi;
+  int           cost;
+  int           overhead;
+  mb=cpi->macro+_mbi;
+  cost=0;
+  for(pli=0;pli<3;pli++){
+    for(bi=0;bi<4;bi++){
+      int fi;
+      fi=mb->Ryuv[pli][bi];
       if(fi<cpi->frag_total){
-	int sad = BIntraSAD(cpi,fi,i);
-	cost += BINMAP(mode_rate[qi][i][1],sad);
+        int sad;
+        sad=BIntraSAD(cpi,fi,pli);
+        cost+=BINMAP(mode_rate[_qi][pli][1],sad);
       }
     }
   }
- 
-  *intrabits+=cost;
-  *overhead = (oc_mode_cost(cpi,CODE_INTRA) << OC_BIT_SCALE);
-  return cost + *overhead;
+  *_intrabits+=cost;
+  overhead=oc_mode_scheme_chooser_cost(&cpi->chooser,CODE_INTRA)<<OC_BIT_SCALE;
+  *_overhead=overhead;
+  return cost+overhead;
 }
 
-static int cost_inter(CP_INSTANCE *cpi, int qi, int mbi, mv_t mv, int mode, int *overhead){
-  macroblock_t *mb = &cpi->macro[mbi];
-  int i,j;
-  int cost = 0;
-  for(i=0;i<3;i++){
-    for(j=0;j<4;j++){
-      int fi=mb->Ryuv[i][j];
+static int cost_inter(CP_INSTANCE *cpi,int _qi,int _mbi,int _dx, int _dy,
+ int _mode,int *_overhead){
+  macroblock_t *mb;
+  int           goldenp;
+  int           pli;
+  int           bi;
+  int           cost;
+  int           overhead;
+  mb=cpi->macro+_mbi;
+  goldenp=_mode==CODE_USING_GOLDEN;
+  cost=0;
+  for(pli=0;pli<3;pli++){
+    for(bi=0;bi<4;bi++){
+      int fi;
+      fi=mb->Ryuv[pli][bi];
       if(fi<cpi->frag_total){
-	int sad = BInterSAD(cpi,fi,i,mode==CODE_USING_GOLDEN,mv);
-	cost += BINMAP(mode_rate[qi][i][0],sad);
+        int sad;
+        sad=BInterSAD(cpi,fi,_dx,_dy,pli,goldenp);
+        cost+=BINMAP(mode_rate[_qi][pli][0],sad);
       }
     }
   }
-  *overhead = (oc_mode_cost(cpi,mode) << OC_BIT_SCALE);
-  return cost + *overhead;
+  overhead=oc_mode_scheme_chooser_cost(&cpi->chooser,_mode)<<OC_BIT_SCALE;
+  *_overhead=overhead;
+  return cost+overhead;
 }
 
-static int cost_inter_nomv(CP_INSTANCE *cpi, int qi, int mbi, int *overhead){
-  macroblock_t *mb = &cpi->macro[mbi];
-  int i,j;
-  int cost = 0;
-  for(i=0;i<3;i++){
-    for(j=0;j<4;j++){
-      int fi=mb->Ryuv[i][j];
+static int cost_inter_nomv(CP_INSTANCE *cpi,int _qi,int _mbi,int *_overhead){ /*Cost of CODE_INTER_NO_MV for macroblock _mbi.*/
+  macroblock_t *mb;
+  int           pli;
+  int           bi;
+  int           cost;
+  int           overhead;
+  mb=cpi->macro+_mbi;
+  cost=0;
+  for(pli=0;pli<3;pli++){
+    int stride;
+    stride=cpi->stride[pli]; /*Per-plane stride, fetched once per plane.*/
+    for(bi=0;bi<4;bi++){
+      int fi;
+      fi=mb->Ryuv[pli][bi]; /*Fragment index; values >=cpi->frag_total are skipped below.*/
       if(fi<cpi->frag_total){
-	int bi = cpi->frag_buffer_index[fi];
-	int stride = cpi->stride[i];  
-	int sad =  dsp_sad8x8 (cpi->dsp, cpi->frame+bi, cpi->lastrecon+bi, stride);
-      
-	if(i)sad<<=2;
-	cost += BINMAP(mode_rate[qi][i][0],sad);
+        int offs;
+        int sad;
+        offs=cpi->frag_buffer_index[fi]; /*Same buffer offset is used for both frame and lastrecon.*/
+        sad=dsp_sad8x8(cpi->dsp,cpi->frame+offs,cpi->lastrecon+offs,stride); /*SAD against cpi->lastrecon with no displacement.*/
+        if(pli)sad<<=2; /*SADs of planes 1 and 2 are multiplied by 4.*/
+        cost+=BINMAP(mode_rate[_qi][pli][0],sad);
       }
     }
   }
-  *overhead = (oc_mode_cost(cpi,CODE_INTER_NO_MV) << OC_BIT_SCALE);
-  return cost + *overhead;
+  overhead=
+   oc_mode_scheme_chooser_cost(&cpi->chooser,CODE_INTER_NO_MV)<<OC_BIT_SCALE; /*Chooser cost for the mode, shifted left by OC_BIT_SCALE.*/
+  *_overhead=overhead; /*The overhead is also reported separately through _overhead.*/
+  return cost+overhead; /*The return value already includes the overhead.*/
 }
 
-static int cost_inter1mv(CP_INSTANCE *cpi, int qi, int mbi, int golden, int *bits0, int *overhead){
-  macroblock_t *mb = &cpi->macro[mbi];
-  int i,j;
-  int cost = 0;
-  for(i=0;i<3;i++){
-    for(j=0;j<4;j++){
-      int fi=mb->Ryuv[i][j];
+static int cost_inter1mv(CP_INSTANCE *cpi,int _qi,int _mbi,int _goldenp,
+ signed char *_mv,int *_bits0,int *_overhead){ /*Cost of a single-MV mode for macroblock _mbi using the MV in _mv.*/
+  macroblock_t *mb;
+  int           dx;
+  int           dy;
+  int           pli;
+  int           bi;
+  int           bits0;
+  int           cost;
+  int           overhead;
+  mb=cpi->macro+_mbi;
+  dx=_mv[0]; /*_mv holds two components: [0] is used as dx, [1] as dy.*/
+  dy=_mv[1];
+  cost=0;
+  for(pli=0;pli<3;pli++){
+    for(bi=0;bi<4;bi++){
+      int fi;
+      fi=mb->Ryuv[pli][bi]; /*Fragment index; values >=cpi->frag_total are skipped below.*/
       if(fi<cpi->frag_total){
-	int sad = BInterSAD(cpi,fi,i,golden,mb->analysis_mv[0][golden]);
-	cost += BINMAP(mode_rate[qi][i][0],sad);
+        int          sad;
+        sad=BInterSAD(cpi,fi,dx,dy,pli,_goldenp); /*The same MV is used for all three planes.*/
+        cost+=BINMAP(mode_rate[_qi][pli][0],sad);
       }
     }
   }
-  
-  *bits0  = 
-    MvBits[mb->analysis_mv[0][golden].x + MAX_MV_EXTENT] + 
-    MvBits[mb->analysis_mv[0][golden].y + MAX_MV_EXTENT];
-  
-  *overhead = (oc_mode_cost(cpi,golden?CODE_GOLDEN_MV:CODE_INTER_PLUS_MV) +
-	       (OC_MINI(cpi->MVBits_0 + *bits0, cpi->MVBits_1+12)-
-		OC_MINI(cpi->MVBits_0, cpi->MVBits_1))) << OC_BIT_SCALE;
-  
-  return cost + *overhead;
+  bits0=MvBits[dx+MAX_MV_EXTENT]+MvBits[dy+MAX_MV_EXTENT]; /*MvBits table lookup per component, index offset by MAX_MV_EXTENT.*/
+  overhead=oc_mode_scheme_chooser_cost(&cpi->chooser,
+   _goldenp?CODE_GOLDEN_MV:CODE_INTER_PLUS_MV) /*The mode being costed is selected by _goldenp.*/
+   +OC_MINI(cpi->MVBits_0+bits0,cpi->MVBits_1+12) /*min of the two running MV totals if this MV were added (bits0 vs. a flat 12)...*/
+   -OC_MINI(cpi->MVBits_0,cpi->MVBits_1)<<OC_BIT_SCALE; /*...minus the current min; << binds looser than + and -, so the whole sum is shifted.*/
+  *_bits0=bits0; /*Also returned through _bits0.*/
+  *_overhead=overhead; /*The overhead is also reported separately through _overhead.*/
+  return cost+overhead; /*The return value already includes the overhead.*/
 }
 
-static int cost_inter4mv(CP_INSTANCE *cpi, int qi, int mbi, int *bits0, int *bits1, int *overhead){
-  int pf = cpi->info.pixelformat;
-  macroblock_t *mb = &cpi->macro[mbi];
-  int i,j;
-  int cost = 0;
+static void oc_set_chroma_mvs00(oc_mv _cbmvs[4],oc_mv _lbmvs[4]){ /*Derives one MV from the four in _lbmvs; writes _cbmvs[0] only.*/
+  int dx;
+  int dy;
+  dx=_lbmvs[0][0]+_lbmvs[1][0]+_lbmvs[2][0]+_lbmvs[3][0]; /*Sum of the four [0] components.*/
+  dy=_lbmvs[0][1]+_lbmvs[1][1]+_lbmvs[2][1]+_lbmvs[3][1]; /*Sum of the four [1] components.*/
+  _cbmvs[0][0]=(signed char)OC_DIV_ROUND_POW2(dx,2,2); /*OC_DIV_ROUND_POW2 is defined elsewhere; presumably a rounded divide by 4 - TODO confirm.*/
+  _cbmvs[0][1]=(signed char)OC_DIV_ROUND_POW2(dy,2,2); /*_cbmvs[1..3] are left untouched.*/
+}
 
-  *bits0 = *bits1 = 0;
-
-  for(j=0;j<4;j++){
-    int fi=mb->Ryuv[0][j];
+static int cost_inter4mv(CP_INSTANCE *cpi,int _qi,int _mbi,
+ oc_mv _mv[4],int *_bits0,int *_bits1,int *_overhead){ /*Cost of CODE_INTER_FOURMV for macroblock _mbi with one MV per block in _mv.*/
+  macroblock_t *mb;
+  int           pli;
+  int           bi;
+  int           cost;
+  int           overhead;
+  int           bits0;
+  int           bits1;
+  mb=cpi->macro+_mbi;
+  cost=bits0=bits1=0;
+  memcpy(mb->mv,_mv,sizeof(mb->mv)); /*Side effect: mb->mv is overwritten with the candidate MVs.*/
+  for(bi=0;bi<4;bi++){
+    int fi;
+    fi=mb->Ryuv[0][bi]; /*Plane 0 only in this loop; values >=cpi->frag_total are skipped below.*/
     if(fi<cpi->frag_total){
-      int sad = BInterSAD(cpi,fi,0,0,mb->mv[j]);
-      cost += BINMAP(mode_rate[qi][0][0],sad);
-      
-      *bits0 += 
-	MvBits[mb->mv[j].x + MAX_MV_EXTENT] + 
-	MvBits[mb->mv[j].y + MAX_MV_EXTENT];
-      *bits1 += 12;
+      int dx;
+      int dy;
+      int sad;
+      dx=_mv[bi][0];
+      dy=_mv[bi][1];
+      sad=BInterSAD(cpi,fi,dx,dy,0,0); /*Last two arguments: plane 0, goldenp 0.*/
+      cost+=BINMAP(mode_rate[_qi][0][0],sad);
+      bits0+=MvBits[dx+MAX_MV_EXTENT]+MvBits[dy+MAX_MV_EXTENT]; /*Counted only for blocks that pass the fi check.*/
+      bits1+=12; /*Flat 12 per counted MV.*/
     }
   }
-  
-  switch(pf){
-  case OC_PF_420:
-    {
-      mv_t ch;
-      
-      ch.x = mb->mv[0].x + mb->mv[1].x + mb->mv[2].x + mb->mv[3].x;
-      ch.y = mb->mv[0].y + mb->mv[1].y + mb->mv[2].y + mb->mv[3].y;
-      
-      ch.x = ( ch.x >= 0 ? (ch.x + 2) / 4 : (ch.x - 2) / 4);
-      ch.y = ( ch.y >= 0 ? (ch.y + 2) / 4 : (ch.y - 2) / 4);
-      
-      for(i=1;i<3;i++){
-	int fi=mb->Ryuv[i][0];
-	if(fi<cpi->frag_total){
-	  int sad = BInterSAD(cpi,fi,i,0,ch);
-	  cost += BINMAP(mode_rate[qi][i][0],sad);
-	}
+  /*TODO: Use OC_SET_CHROMA_MVS_TABLE from decoder; 4:2:0 only for now.*/
+  oc_set_chroma_mvs00(mb->cbmvs,_mv); /*Side effect: writes mb->cbmvs[0]; NOTE(review): entries 1..3 are not set by this call.*/
+  for(pli=1;pli<3;pli++){
+    for(bi=0;bi<4;bi++){
+      int fi;
+      fi=mb->Ryuv[pli][bi];
+      if(fi<cpi->frag_total){
+        int sad;
+        sad=BInterSAD(cpi,fi,mb->cbmvs[bi][0],mb->cbmvs[bi][1],pli,0); /*Reads cbmvs[bi] for every bi that passes the fi check - confirm bi>0 cannot pass under the 4:2:0 assumption in the TODO above.*/
+        cost+=BINMAP(mode_rate[_qi][pli][0],sad);
       }
     }
-    break;
-
-  case OC_PF_422:
-    {
-      mv_t mv[2];
-      
-      mv[0].x = mb->mv[0].x + mb->mv[1].x;
-      mv[0].y = mb->mv[0].y + mb->mv[1].y;
-      mv[0].x = ( mv[0].x >= 0 ? (mv[0].x + 1) / 2 : (mv[0].x - 1) / 2);
-      mv[0].y = ( mv[0].y >= 0 ? (mv[0].y + 1) / 2 : (mv[0].y - 1) / 2);
-
-      mv[1].x = mb->mv[2].x + mb->mv[3].x;
-      mv[1].y = mb->mv[2].y + mb->mv[3].y;
-      mv[1].x = ( mv[1].x >= 0 ? (mv[1].x + 1) / 2 : (mv[1].x - 1) / 2);
-      mv[1].y = ( mv[1].y >= 0 ? (mv[1].y + 1) / 2 : (mv[1].y - 1) / 2);
-      
-      for(i=1;i<3;i++){
-	for(j=0;j<2;j++){
-	  int fi=mb->Ryuv[i][j];
-	  if(fi<cpi->frag_total){
-	    int sad = BInterSAD(cpi,fi,i,0,mv[j]);
-	    cost += BINMAP(mode_rate[qi][i][0],sad);
-	  }
-	}
-      }
-    }
-    break;
-    
-  case OC_PF_444:
-    for(i=1;i<3;i++){
-      for(j=0;j<4;j++){
-	int fi=mb->Ryuv[i][j];
-	if(fi<cpi->frag_total){
-	  int sad = BInterSAD(cpi,fi,i,0,mb->mv[j]);
-	  cost += BINMAP(mode_rate[qi][i][0],sad);
-	}
-      }
-    }
-    break;
-    
   }
-  
-  *overhead = (oc_mode_cost(cpi,CODE_INTER_FOURMV) +
-	       (OC_MINI(cpi->MVBits_0 + *bits0, cpi->MVBits_1 + *bits1)-
-		OC_MINI(cpi->MVBits_0, cpi->MVBits_1))) << OC_BIT_SCALE;
-  return cost + *overhead;
+  overhead=oc_mode_scheme_chooser_cost(&cpi->chooser,CODE_INTER_FOURMV)
+   +OC_MINI(cpi->MVBits_0+bits0,cpi->MVBits_1+bits1) /*min of the two running MV totals if these MVs were added...*/
+   -OC_MINI(cpi->MVBits_0,cpi->MVBits_1)<<OC_BIT_SCALE; /*...minus the current min; << binds looser than + and -, so the whole sum is shifted.*/
+  *_overhead=overhead; /*The overhead is also reported separately through _overhead.*/
+  *_bits0=bits0; /*Both MV bit totals are returned to the caller.*/
+  *_bits1=bits1;
+  return cost+overhead; /*The return value already includes the overhead.*/
 }
 
 #include "quant_lookup.h"
@@ -480,7 +494,7 @@
 
   cpi->frag_coded[fi]=0;
   dsp_copy8x8 (cpi->dsp, cpi->lastrecon+bi, cpi->recon+bi, stride);
-}      
+}
 
 typedef struct{
   int uncoded_ac_ssd;
@@ -493,7 +507,7 @@
   int plane;
   int qi;
   ogg_int16_t re_q[2][3][64];
-  ogg_int32_t *iq[2];
+  oc_iquant *iq[2];
   quant_tables *qq[2];
   ogg_int32_t *mode_rate[2];
   int xqp;
@@ -509,7 +523,7 @@
   for(i=0;i<2;i++)
     for(j=0;j<3;j++)
       for(k=0;k<64;k++)
-	ps->re_q[i][j][k]=cpi->quant_tables[i][j][k][qi];
+        ps->re_q[i][j][k]=cpi->quant_tables[i][j][k][qi];
 }
 
 static void ps_setup_plane(CP_INSTANCE *cpi, plane_state_t *ps, int plane){
@@ -527,20 +541,19 @@
 
 /* coding overhead is unscaled */
 #include<stdio.h>
-static int TQB (CP_INSTANCE *cpi, plane_state_t *ps, int mode, int fi, mv_t mv, 
-		int coding_overhead, rd_metric_t *mo, long *rho_count,
-		token_checkpoint_t **stack){
-  
+static int TQB (CP_INSTANCE *cpi,plane_state_t *ps,int mode,int fi,
+ int _dx,int _dy,int coding_overhead,rd_metric_t *mo,long *rho_count,
+ token_checkpoint_t **stack){
   const int keyframe = (cpi->FrameType == KEY_FRAME);
-  const ogg_int32_t *iq = ps->iq[mode != CODE_INTRA];
+  const oc_iquant *iq = ps->iq[mode != CODE_INTRA];
   ogg_int16_t buffer[64];
   ogg_int16_t data[64];
   const int bi = cpi->frag_buffer_index[fi];
   const int stride = cpi->stride[ps->plane];
   const unsigned char *frame_ptr = &cpi->frame[bi];
-  unsigned char *lastrecon = ((mode == CODE_USING_GOLDEN || 
-			       mode == CODE_GOLDEN_MV) ? 
-			      cpi->golden : cpi->lastrecon)+bi;
+  unsigned char *lastrecon = ((mode == CODE_USING_GOLDEN ||
+                               mode == CODE_GOLDEN_MV) ?
+                              cpi->golden : cpi->lastrecon)+bi;
   unsigned char *thisrecon = cpi->recon+bi;
   int nonzero=0;
   const ogg_int16_t *dequant = ps->re_q[mode != CODE_INTRA][ps->plane];
@@ -549,7 +562,8 @@
   int lambda = cpi->lambda;
   token_checkpoint_t *checkpoint=*stack;
   int cost;
-  int i;
+  int ci;
+  int pi;
 
   cpi->frag_coded[fi]=1;
 
@@ -560,7 +574,7 @@
      encourage coding through negative coding overhead deltas is
      useful.  For that reason, we disallow negative
      coding_overheads */
-  if(coding_overhead<0)coding_overhead = 0; 
+  if(coding_overhead<0)coding_overhead = 0;
 
   /* motion comp */
   switch(mode){
@@ -569,26 +583,22 @@
   case CODE_INTER_PRIOR_LAST:
   case CODE_GOLDEN_MV:
   case CODE_INTER_FOURMV:
-    
-    {    
-      int mx = mvmap[ps->xqp][mv.x+31];
-      int my = mvmap[ps->yqp][mv.y+31];
-      int mx2 = mvmap2[ps->xqp][mv.x+31];
-      int my2 = mvmap2[ps->yqp][mv.y+31];
-      
-      unsigned char *r1 = lastrecon + my * stride + mx;
-      
-      if(mx2 || my2){
-	unsigned char *r2 = r1 + my2 * stride + mx2;
-	dsp_copy8x8_half (cpi->dsp, r1, r2, thisrecon, stride);
-	dsp_sub8x8(cpi->dsp, frame_ptr, thisrecon, data, stride);
-      }else{
-	dsp_copy8x8 (cpi->dsp, r1, thisrecon, stride);
-	dsp_sub8x8(cpi->dsp, frame_ptr, r1, data, stride);
+
+    {
+      int offs[2];
+      if(oc_get_mv_offsets(offs,_dx,_dy,
+       stride,ps->plane,cpi->info.pixelformat)>1){
+        dsp_copy8x8_half(cpi->dsp,
+         lastrecon+offs[0],lastrecon+offs[1],thisrecon,stride);
+        dsp_sub8x8(cpi->dsp,frame_ptr,thisrecon,data,stride);
       }
+      else{
+        dsp_copy8x8(cpi->dsp,lastrecon+offs[0],thisrecon,stride);
+        dsp_sub8x8(cpi->dsp,frame_ptr,lastrecon+offs[0],data,stride);
+      }
     }
     break;
-    
+
   case CODE_USING_GOLDEN:
   case CODE_INTER_NO_MV:
     dsp_copy8x8 (cpi->dsp, lastrecon, thisrecon, stride);
@@ -604,15 +614,15 @@
   int sad=0;
   if(mode==CODE_INTRA){
     int acc=0;
-    for(i=0;i<64;i++)
-      acc += data[i];
-    for(i=0;i<64;i++)
-      sad += abs((data[i]<<6)-acc);
+    for(pi=0;pi<64;pi++)
+      acc += data[pi];
+    for(pi=0;pi<64;pi++)
+      sad += abs((data[pi]<<6)-acc);
     sad >>=6;
   }else{
-    for(i=0;i<64;i++)
-      sad += abs(data[i]);
-    
+    for(pi=0;pi<64;pi++)
+      sad += abs(data[pi]);
+
     if(ps->plane)sad<<=2;
   }
 
@@ -621,15 +631,15 @@
 
   if(!keyframe){
     if(mode==CODE_INTER_NO_MV){
-      for(i=0;i<64;i++){
-	uncoded_ssd += data[i]*data[i];
-	uncoded_dc += data[i];
+      for(pi=0;pi<64;pi++){
+        uncoded_ssd += data[pi]*data[pi];
+        uncoded_dc += data[pi];
       }
     }else{
       dsp_sub8x8(cpi->dsp, frame_ptr, cpi->lastrecon+bi, buffer, stride);
-      for(i=0;i<64;i++){
-	uncoded_ssd += buffer[i]*buffer[i];
-	uncoded_dc += buffer[i];
+      for(pi=0;pi<64;pi++){
+        uncoded_ssd += buffer[pi]*buffer[pi];
+        uncoded_dc += buffer[pi];
       }
     }
     uncoded_ssd <<= 4; /* scale to match DCT domain */
@@ -640,34 +650,49 @@
 
   /* collect rho metrics, quantize */
   {
-    int i;
+    int          zzi;
+#if 0
     quant_tables *qq = ps->qq[mode != CODE_INTRA];
-    
-    for(i=0;i<64;i++){
-      int v = buffer[dezigzag_index[i]];
-      int pos;
-      int val = abs(v)<<1;
-      ogg_int16_t *qqq = (*qq)[i];
-      for(pos=64;pos>0;pos--)
-      if(val < qqq[pos-1])break;
-      
+#endif
+    for(zzi=0;zzi<64;zzi++){
+      int v;
+      int val;
+      int d;
+      ci=dezigzag_index[zzi];
+      v=buffer[ci];
+      d=dequant[zzi];
       /* rho-domain distribution */
-      rho_count[pos]++;
-
-      if(val>=dequant[i]){
-	val = (((iq[i]>>15)*v) + (1<<15) + (((iq[i]&0x7fff)*v)>>15)) >>16;
-	data[i] = (val>511?511:(val<-511?-511:val));
-	nonzero=i;
-      }else{
-	data[i] = 0;
+      val=v<<1;
+      v=abs(val);
+#if 0
+      {
+        ogg_int16_t *qqq = (*qq)[zzi];
+        int pos;
+        for(pos=64;pos>0;pos--)if(v<qqq[pos-1])break;
+        rho_count[pos]++;
       }
+#endif
+      if(v>=d){
+        int s;
+        s=OC_SIGNMASK(val);
+        /*The bias added here rounds ties away from zero, since token
+           optimization can only decrease the magnitude of the quantized
+           value.*/
+        val+=(d+s)^s;
+        /*Note the arithmetic right shift is not guaranteed by ANSI C.
+          Hopefully no one still uses ones-complement architectures.*/
+        val=((iq[zzi].m*(ogg_int32_t)val>>16)+val>>iq[zzi].l)-s;
+        data[zzi]=OC_CLAMPI(-580,val,580);
+        nonzero=zzi;
+      }
+      else data[zzi]=0;
     }
   }
   cpi->frag_dc[fi] = data[0];
 
   /* tokenize */
   cost = dct_tokenize_AC(cpi, fi, data, dequant, buffer, fi>=cpi->frag_n[0], stack);
-  
+
   /* reconstruct */
   switch(nonzero){
   case 0:
@@ -682,57 +707,52 @@
   default:
     dsp_IDctSlow(cpi->dsp, data, dequant, buffer );
   }
-  
+
   dsp_recon8x8 (cpi->dsp, thisrecon, buffer, stride);
 
   if(!keyframe){
-    int i;
-
     /* in retrospect, should we have skipped this block? */
-    dsp_sub8x8(cpi->dsp, frame_ptr, thisrecon, buffer, stride);    
-    for(i=0;i<64;i++){
-      coded_ssd += buffer[i]*buffer[i];
-      coded_dc += buffer[i];
+    dsp_sub8x8(cpi->dsp, frame_ptr, thisrecon, buffer, stride);
+    for(pi=0;pi<64;pi++){
+      coded_ssd+=buffer[pi]*buffer[pi];
+      coded_dc+=buffer[pi];
     }
     coded_ssd <<= 4; /* scale to match DCT domain */
-    
     /* We actually only want the AC contribution to the SSDs */
     uncoded_ssd -= ((uncoded_dc*uncoded_dc)>>2);
     coded_ssd -= ((coded_dc*coded_dc)>>2);
-
     /* for undersampled planes */
-    //coded_ssd*=ps->ssdmul; 
-    //uncoded_ssd*=ps->ssdmul;
+    /*coded_ssd*=ps->ssdmul;*/
+    /*uncoded_ssd*=ps->ssdmul;*/
+    mo->uncoded_ac_ssd+=uncoded_ssd;
 
-    mo->uncoded_ac_ssd+=uncoded_ssd;  
-
     /* DC is a special case; if there's more than a full-quantizer
        improvement in the effective DC component, always force-code
        the block */
     if( abs(uncoded_dc)-abs(coded_dc) > (dequant[0]<<1)){
       mo->dc_flag = dc_flag = 1;
     }
-       
+
     if(!dc_flag && uncoded_ssd <= coded_ssd+(coding_overhead+cost)*lambda){
       /* Hm, not worth it.  roll back */
       tokenlog_rollback(cpi, checkpoint, (*stack)-checkpoint);
       *stack = checkpoint;
       uncode_frag(cpi,fi,ps->plane);
-      
+
       mo->coded_ac_ssd+=uncoded_ssd;
       //fprintf(stderr,"skip(%d:%d)",coding_overhead,cost);
-      
+
       return 0;
     }else{
-      
+
       //fprintf(stderr,"*****(%d:%d)",coding_overhead,cost);
 
       mo->coded_ac_ssd+=coded_ssd;
       mo->ac_cost+=cost;
-      
+
     }
   }
-  
+
   //for(i=0;i<64;i++)
   //if(data[i]!=0)cpi->rho_postop++;
 
@@ -742,8 +762,8 @@
 static int macroblock_phase_Y[4][4] = {{0,1,3,2},{0,2,3,1},{0,2,3,1},{3,2,0,1}};
 
 /* mode_overhead is scaled by << OC_BIT_SCALE */
-static int TQMB_Y ( CP_INSTANCE *cpi, macroblock_t *mb, int mb_phase, plane_state_t *ps, long *rc, 
-		    int mode_overhead, fr_state_t *fr){
+static int TQMB_Y(CP_INSTANCE *cpi,macroblock_t *mb,int mb_phase,
+ plane_state_t *ps,long *rc,int mode_overhead,int *mb_mv_bits_0,fr_state_t *fr){
 
   int full_checkpoint = cpi->fr_full_count;
   int partial_checkpoint = cpi->fr_partial_count;
@@ -767,81 +787,79 @@
     int bi = macroblock_phase_Y[mb_phase][i];
     int fi = mb->Ryuv[0][bi];
 
-    if(TQB(cpi,ps,mode,fi,mb->mv[bi],fr_cost1(fr),&mo,rc,&stackptr)){
+    if(TQB(cpi,ps,mode,fi,mb->mv[bi][0],mb->mv[bi][1],
+     fr_cost1(fr),&mo,rc,&stackptr)){
       fr_codeblock(cpi,fr);
       coded++;
-    }else{
-      fr_skipblock(cpi,fr);
-      if(mode == CODE_INTER_FOURMV) 
-	mb->mv[bi]=(mv_t){0,0};
     }
+    else fr_skipblock(cpi,fr);
   }
-  
 
+
   if(cpi->FrameType != KEY_FRAME){
+    int bi;
     if(coded && !mo.dc_flag){
       /* block by block, still coding the MB.  Now consider the
-	 macroblock coding cost as a whole (mode and MV) */ 
+         macroblock coding cost as a whole (mode and MV) */
       int codecost = mo.ac_cost+fr_cost4(&fr_checkpoint,fr)+(mode_overhead>>OC_BIT_SCALE);
       if(mo.uncoded_ac_ssd <= mo.coded_ac_ssd+cpi->lambda*codecost){
-	
-	/* taking macroblock overhead into account, it is not worth coding this MB */
-	tokenlog_rollback(cpi, stack, stackptr-stack);
-	memcpy(fr,&fr_checkpoint,sizeof(fr_checkpoint));
-	cpi->fr_full_count = full_checkpoint;
-	cpi->fr_partial_count = partial_checkpoint;
-	cpi->fr_block_count = block_checkpoint;
-	//cpi->rho_postop = rho_check;
 
-	for(i=0;i<4;i++){
-	  int fi = mb->Ryuv[0][i];
-	  if(cp[fi])
-	    uncode_frag(cpi,fi,0);
-	  fr_skipblock(cpi,fr);
-	}
-	coded=0;
+        /* taking macroblock overhead into account, it is not worth coding this MB */
+        tokenlog_rollback(cpi, stack, stackptr-stack);
+        memcpy(fr,&fr_checkpoint,sizeof(fr_checkpoint));
+        cpi->fr_full_count = full_checkpoint;
+        cpi->fr_partial_count = partial_checkpoint;
+        cpi->fr_block_count = block_checkpoint;
+        /*cpi->rho_postop = rho_check;*/
 
+        for(i=0;i<4;i++){
+          int fi = mb->Ryuv[0][i];
+          if(cp[fi])
+            uncode_frag(cpi,fi,0);
+          fr_skipblock(cpi,fr);
+        }
+        coded=0;
+
       }
     }
 
     if(coded==0){
       mb->mode = CODE_INTER_NO_MV; /* No luma blocks coded, mode is forced */
       mb->coded = 0;
-      mb->mv[0] = mb->mv[1] = mb->mv[2] = mb->mv[3] = (mv_t){0,0};
-      return 0; 
-
+      memset(mb->mv,0,sizeof(mb->mv));
+      memset(mb->cbmvs,0,sizeof(mb->cbmvs));
+      return 0;
     }
-
-    /* assume that a 1mv with a single coded block is always cheaper than a 4mv with a single coded block */
-    if(coded==1 && mode==CODE_INTER_FOURMV){
-      mode = mb->mode = CODE_INTER_PLUS_MV;
-      if(cp[mb->Ryuv[0][0]])
-	mb->mv[1] = mb->mv[2] = mb->mv[3] = mb->mv[0];
-      else if(cp[mb->Ryuv[0][1]])
-	mb->mv[0] = mb->mv[2] = mb->mv[3] = mb->mv[1];
-      else if(cp[mb->Ryuv[0][2]])
-	mb->mv[0] = mb->mv[1] = mb->mv[3] = mb->mv[2];
-      else
-	mb->mv[0] = mb->mv[1] = mb->mv[2] = mb->mv[3];
+    /*Assume that a 1mv with a single coded block is always cheaper than a 4mv
+       with a single coded block.
+      This may not be strictly true: a 4MV computes chroma MVs using (0,0) for
+       skipped blocks, while a 1MV does not.*/
+    else if(coded==1&&mode==CODE_INTER_FOURMV){
+      int dx;
+      int dy;
+      mode=mb->mode=CODE_INTER_PLUS_MV;
+      for(bi=0;!cp[mb->Ryuv[0][bi]];bi++);
+      dx=mb->mv[bi][0];
+      dy=mb->mv[bi][1];
+      mb->cbmvs[0][0]=mb->cbmvs[1][0]=mb->cbmvs[2][0]=mb->cbmvs[3][0]=
+       mb->mv[0][0]=mb->mv[1][0]=mb->mv[2][0]=mb->mv[3][0]=(signed char)dx;
+      mb->cbmvs[0][1]=mb->cbmvs[1][1]=mb->cbmvs[2][1]=mb->cbmvs[3][1]=
+       mb->mv[0][1]=mb->mv[1][1]=mb->mv[2][1]=mb->mv[3][1]=(signed char)dy;
+      *mb_mv_bits_0=MvBits[dx+MAX_MV_EXTENT]+MvBits[dy+MAX_MV_EXTENT];
     }
-    
-    /* replace the block MVs for not-coded blocks with (0,0).*/   
-    mb->coded = 0;
-    for ( i=0; i<4; i++ ){
-      int fi = mb->Ryuv[0][i];
-      if(cp[fi]) 
-	mb->coded |= (1<<i);
-    }
+    mb->coded=0;
+    for(bi=0;bi<4;bi++)mb->coded|=cp[mb->Ryuv[0][bi]]<<bi;
   }
 
   /* Commit tokenization */
   tokenlog_commit(cpi, stack, stackptr-stack);
 
-  return coded;  
+  return coded;
 }
 
-static int macroblock_phase_422[16] = {0,0,2,2,0,2,2,0,0,2,2,0,2,2,0,0};
-static int macroblock_phase_444[16] = {0,1,3,2,0,2,3,1,0,2,3,1,3,2,0,1};
+static const unsigned char OC_MACROBLOCK_PHASE[16]={ /*Indexed by the fragment's position (0..15) within a superblock in TQSB_UV.*/
+  0,1,3,2,0,2,3,1,0,2,3,1,3,2,0,1 /*TQSB_UV masks the entry with the pixel format to pick an mb->cbmvs index.*/
+};
 
 static int TQSB_UV ( CP_INSTANCE *cpi, superblock_t *sb, plane_state_t *ps, long *rc, fr_state_t *fr){
   int pf = cpi->info.pixelformat;
@@ -853,70 +871,44 @@
 
   for(i=0;i<16;i++){
     int fi = sb->f[i];
-    int mb_phase;
 
     if(fi<cpi->frag_total){
-      token_checkpoint_t *stackptr = stack;
-      macroblock_t *mb = &cpi->macro[sb->m[i]];
-      mv_t mv;
-      if(mb->mode == CODE_INTER_FOURMV){
-	
-	switch(pf){
-	case OC_PF_420:
-	  /* sixteen blocks/macroblocks per chroma superblock */
-	  
-	  mv.x = mb->mv[0].x + mb->mv[1].x + mb->mv[2].x + mb->mv[3].x;
-	  mv.y = mb->mv[0].y + mb->mv[1].y + mb->mv[2].y + mb->mv[3].y;
-	  
-	  mv.x = ( mv.x >= 0 ? (mv.x + 2) / 4 : (mv.x - 2) / 4);
-	  mv.y = ( mv.y >= 0 ? (mv.y + 2) / 4 : (mv.y - 2) / 4);
-	  break;
-	  
-	case OC_PF_422:
-	  /* sixteen blocks / eight macroblocks per chroma superblock */
-	  mb_phase = macroblock_phase_422[i];
-	  mv.x = mb->mv[mb_phase].x + mb->mv[mb_phase+1].x;
-	  mv.y = mb->mv[mb_phase].y + mb->mv[mb_phase+1].y;
-	  mv.x = ( mv.x >= 0 ? (mv.x + 1) / 2 : (mv.x - 1) / 2);
-	  mv.y = ( mv.y >= 0 ? (mv.y + 1) / 2 : (mv.y - 1) / 2);
-	  break;
-	default: /*case OC_PF_444: */
-	  /* sixteen blocks / eight macroblocks per chroma superblock */
-	  mb_phase = macroblock_phase_444[i];
-	  mv = mb->mv[mb_phase];
-	  break;
-	}
-      }else
-	mv = mb->mv[0];
-      
-      if(TQB(cpi,ps,mb->mode,fi,mv,fr_cost1(fr),&mo,rc,&stackptr)){
-	fr_codeblock(cpi,fr);
-	tokenlog_commit(cpi, stack, stackptr-stack);
-	coded++;
+      token_checkpoint_t *stackptr;
+      macroblock_t       *mb;
+      int                 bi;
+      stackptr = stack;
+      mb=cpi->macro+sb->m[i];
+      bi=OC_MACROBLOCK_PHASE[i]&pf;
+      if(TQB(cpi,ps,mb->mode,fi,mb->cbmvs[bi][0],mb->cbmvs[bi][1],
+       fr_cost1(fr),&mo,rc,&stackptr)){
+        fr_codeblock(cpi,fr);
+        tokenlog_commit(cpi, stack, stackptr-stack);
+        coded++;
       }else{
-	fr_skipblock(cpi,fr);
+        fr_skipblock(cpi,fr);
       }
     }
   }
 
-  return coded;  
+  return coded;
 }
 
 int PickModes(CP_INSTANCE *cpi, int recode){
-  unsigned char qi = cpi->BaseQ; // temporary
-  superblock_t *sb = cpi->super[0];
+  int qi;
+  superblock_t *sb;
   superblock_t *sb_end;
   int i,j;
-  ogg_uint32_t interbits = 0;
-  ogg_uint32_t intrabits = 0;
+  ogg_uint32_t interbits;
+  ogg_uint32_t intrabits;
   mc_state mcenc;
-  mv_t last_mv = {0,0};
-  mv_t prior_mv = {0,0};
+  oc_mv last_mv;
+  oc_mv prior_mv;
   long rho_count[65];
   plane_state_t ps;
   fr_state_t fr;
-
-  oc_mode_scheme_chooser_init(cpi);
+  interbits=intrabits=0;
+  last_mv[0]=last_mv[1]=prior_mv[0]=prior_mv[1]=0;
+  oc_mode_scheme_chooser_reset(&cpi->chooser);
   ps_setup_frame(cpi,&ps);
   ps_setup_plane(cpi,&ps,0);
   fr_clear(cpi,&fr);
@@ -929,14 +921,15 @@
   memset(rho_count,0,sizeof(rho_count));
   cpi->MVBits_0 = 0;
   cpi->MVBits_1 = 0;
- 
+
   if(!recode)
-    oc_mcenc_start(cpi, &mcenc); 
+    oc_mcenc_start(cpi, &mcenc);
 
   dct_tokenize_init(cpi);
 
   /* Choose mvs, modes; must be done in Hilbert order */
   /* quantize and code Luma */
+  qi=cpi->BaseQ;
   sb = cpi->super[0];
   sb_end = sb + cpi->super_n[0];
   for(; sb<sb_end; sb++){
@@ -960,30 +953,30 @@
       macroblock_t *mb = &cpi->macro[mbi];
 
       if(!recode){
-	/* Motion estimation */
+        /* Motion estimation */
 
-	/* Move the motion vector predictors back a frame */
-	memmove(mb->analysis_mv+1,mb->analysis_mv,2*sizeof(mb->analysis_mv[0]));
-	
-	/* basic 1MV search always done for all macroblocks, coded or not, keyframe or not */
-	oc_mcenc_search(cpi, &mcenc, mbi, 0, mb->mv, &aerror, block_err);
-	
-	/* search golden frame */
-	oc_mcenc_search(cpi, &mcenc, mbi, 1, NULL, &gerror, NULL);
-	
+        /* Move the motion vector predictors back a frame */
+        memmove(mb->analysis_mv+1,mb->analysis_mv,2*sizeof(mb->analysis_mv[0]));
+
+        /* basic 1MV search always done for all macroblocks, coded or not, keyframe or not */
+        oc_mcenc_search(cpi, &mcenc, mbi, 0, mb->block_mv, &aerror, block_err);
+
+        /* search golden frame */
+        oc_mcenc_search(cpi, &mcenc, mbi, 1, NULL, &gerror, NULL);
+
       }else{
-	aerror = mb->aerror;
-	gerror = mb->gerror;
+        aerror = mb->aerror;
+        gerror = mb->gerror;
       }
 
       if(cpi->FrameType == KEY_FRAME){
-	mb->mode = CODE_INTRA;
-	/* Transform, quantize, collect rho metrics */
-	TQMB_Y(cpi, mb, j, &ps, rho_count, 0, &fr);
-	
+        mb->mode = CODE_INTRA;
+        /* Transform, quantize, collect rho metrics */
+        TQMB_Y(cpi, mb, j, &ps, rho_count, 0, NULL, &fr);
+
       }else{
 
-	/**************************************************************
+        /**************************************************************
            Find the block choice with the lowest estimated coding cost
 
            NOTE THAT if U or V is coded but no Y from a macro block then
@@ -991,122 +984,157 @@
            state to which the mode data structure is initialised in
            encoder and decoder at the start of each frame. */
 
-	/* block coding cost is estimated from correlated SAD metrics */
-	/* At this point, all blocks that are in frame are still marked coded */
+        /* block coding cost is estimated from correlated SAD metrics */
+        /* At this point, all blocks that are in frame are still marked coded */
+        if(!recode){
+          memcpy(mb->unref_mv,mb->analysis_mv[0],sizeof(mb->unref_mv));
+          mb->refined=0;
+        }
+        cost[CODE_INTER_NO_MV] =
+          cost_inter_nomv(cpi, qi, mbi, &overhead[CODE_INTER_NO_MV]);
+        cost[CODE_INTRA] =
+          cost_intra(cpi, qi, mbi, &intrabits, &overhead[CODE_INTRA]);
+        cost[CODE_INTER_PLUS_MV] =
+          cost_inter1mv(cpi,qi,mbi,0,mb->unref_mv[0],
+           &mb_mv_bits_0,&overhead[CODE_INTER_PLUS_MV]);
+        cost[CODE_INTER_LAST_MV] =
+          cost_inter(cpi, qi, mbi, last_mv[0], last_mv[1], CODE_INTER_LAST_MV, &overhead[CODE_INTER_LAST_MV]);
+        cost[CODE_INTER_PRIOR_LAST] =
+          cost_inter(cpi, qi, mbi, prior_mv[0], prior_mv[1], CODE_INTER_PRIOR_LAST, &overhead[CODE_INTER_PRIOR_LAST]);
+        cost[CODE_USING_GOLDEN] =
+          cost_inter(cpi, qi, mbi, 0, 0, CODE_USING_GOLDEN, &overhead[CODE_USING_GOLDEN]);
+        cost[CODE_GOLDEN_MV] =
+          cost_inter1mv(cpi,qi,mbi,1,mb->unref_mv[1],
+           &mb_gmv_bits_0, &overhead[CODE_GOLDEN_MV]);
+        cost[CODE_INTER_FOURMV] =
+          cost_inter4mv(cpi, qi, mbi, mb->block_mv, &mb_4mv_bits_0, &mb_4mv_bits_1, &overhead[CODE_INTER_FOURMV]);
 
-	cost[CODE_INTER_NO_MV] = 
-	  cost_inter_nomv(cpi, qi, mbi, &overhead[CODE_INTER_NO_MV]);
-	cost[CODE_INTRA] = 
-	  cost_intra(cpi, qi, mbi, &intrabits, &overhead[CODE_INTRA]);
-	cost[CODE_INTER_PLUS_MV] = 
-	  cost_inter1mv(cpi, qi, mbi, 0, &mb_mv_bits_0, &overhead[CODE_INTER_PLUS_MV]);
-	cost[CODE_INTER_LAST_MV] = 
-	  cost_inter(cpi, qi, mbi, last_mv, CODE_INTER_LAST_MV, &overhead[CODE_INTER_LAST_MV]);
-	cost[CODE_INTER_PRIOR_LAST] = 
-	  cost_inter(cpi, qi, mbi, prior_mv, CODE_INTER_PRIOR_LAST, &overhead[CODE_INTER_PRIOR_LAST]);
-	cost[CODE_USING_GOLDEN] = 
-	  cost_inter(cpi, qi, mbi, (mv_t){0,0},CODE_USING_GOLDEN, &overhead[CODE_USING_GOLDEN]);
-	cost[CODE_GOLDEN_MV] = 
-	  cost_inter1mv(cpi, qi, mbi, 1, &mb_gmv_bits_0, &overhead[CODE_GOLDEN_MV]);
-	cost[CODE_INTER_FOURMV] = 
-	  cost_inter4mv(cpi, qi, mbi, &mb_4mv_bits_0, &mb_4mv_bits_1, &overhead[CODE_INTER_FOURMV]);
-	
-	
-	/* the explicit MV modes (2,6,7) have not yet gone through
-	   halfpel refinement. We choose the explicit mv mode that's
-	   already furthest ahead on bits and refine only that one */
-	if(cost[CODE_INTER_FOURMV]<cost[CODE_INTER_PLUS_MV] && cost[CODE_INTER_FOURMV]<cost[CODE_GOLDEN_MV]){
-	  oc_mcenc_refine4mv(cpi, mbi, block_err);
-	  cost[CODE_INTER_FOURMV] = 
-	    cost_inter4mv(cpi, qi, mbi, &mb_4mv_bits_0, &mb_4mv_bits_1, &overhead[CODE_INTER_FOURMV]);
-	}else if (cost[CODE_GOLDEN_MV]<cost[CODE_INTER_PLUS_MV]-384){
-	  oc_mcenc_refine1mv(cpi, mbi, 1, gerror);
-	  cost[CODE_GOLDEN_MV] = 
-	    cost_inter1mv(cpi, qi, mbi, 1, &mb_gmv_bits_0, &overhead[CODE_GOLDEN_MV]);
-	}
-	oc_mcenc_refine1mv(cpi, mbi, 0, aerror);
-	cost[CODE_INTER_PLUS_MV] = 
-	  cost_inter1mv(cpi, qi, mbi, 0, &mb_mv_bits_0, &overhead[CODE_INTER_PLUS_MV]);
+        /*The explicit MV modes (2,6,7) have not yet gone through halfpel
+           refinement.
+          We choose the explicit MV mode that's already furthest ahead on bits
+           and refine only that one.
+          We have to be careful to remember which ones we've refined so that
+           we don't refine it again if we re-encode this frame.*/
+        if(cost[CODE_INTER_FOURMV]<cost[CODE_INTER_PLUS_MV] && cost[CODE_INTER_FOURMV]<cost[CODE_GOLDEN_MV]){
+          if(!(mb->refined&0x80)){
+            oc_mcenc_refine4mv(cpi, mbi, block_err);
+            mb->refined|=0x80;
+          }
+          cost[CODE_INTER_FOURMV] =
+            cost_inter4mv(cpi, qi, mbi, mb->ref_mv,&mb_4mv_bits_0, &mb_4mv_bits_1, &overhead[CODE_INTER_FOURMV]);
+        }else if (cost[CODE_GOLDEN_MV]<cost[CODE_INTER_PLUS_MV]-384){
+          if(!(mb->refined&0x40)){
+            oc_mcenc_refine1mv(cpi,mbi,1,gerror);
+            mb->refined|=0x40;
+          }
+          cost[CODE_GOLDEN_MV] =
+            cost_inter1mv(cpi,qi,mbi,1,mb->analysis_mv[0][1],
+             &mb_gmv_bits_0,&overhead[CODE_GOLDEN_MV]);
+        }
+        if(!(mb->refined&0x04)){
+          oc_mcenc_refine1mv(cpi,mbi,0,aerror);
+          mb->refined|=0x04;
+        }
+        cost[CODE_INTER_PLUS_MV] =
+          cost_inter1mv(cpi,qi,mbi,0,mb->analysis_mv[0][0],
+           &mb_mv_bits_0, &overhead[CODE_INTER_PLUS_MV]);
 
-	/* Finally, pick the mode with the cheapest estimated bit cost.*/
-	/* prefer CODE_INTER_PLUS_MV, but not over LAST and LAST2 */
-	mode=0;
-	if(cost[1] < cost[0])mode=1;
-	if(cost[3] < cost[mode])mode=3;
-	if(cost[4] < cost[mode])mode=4;
-	if(cost[5] < cost[mode])mode=5;
-	if(cost[6] < cost[mode])mode=6;
-	if(cost[7] < cost[mode])mode=7;
-	if(mode == CODE_INTER_LAST_MV || mode == CODE_INTER_PRIOR_LAST){
-	  if(cost[2] < cost[mode])mode=2;
-	}else{
-	  if(cost[2]-384 < cost[mode])mode=2;
-	}
-
-	switch(mode){
-	case CODE_INTER_PLUS_MV:
-	  mb->mv[0] = mb->mv[1] = mb->mv[2] = mb->mv[3] = mb->analysis_mv[0][0];
-	  break;
-	case CODE_INTER_LAST_MV:
-	  mb->mv[0] = mb->mv[1] = mb->mv[2] = mb->mv[3] = last_mv;
-	  break;
-	case CODE_INTER_PRIOR_LAST:
-	  mb->mv[0] = mb->mv[1] = mb->mv[2] = mb->mv[3] = prior_mv;
-	  break;
-	case CODE_INTER_FOURMV:
-	  break;
-	case CODE_GOLDEN_MV:
-	  mb->mv[0] = mb->mv[1] = mb->mv[2] = mb->mv[3] = mb->analysis_mv[0][1];
-	  break;
-	default:
-	  mb->mv[0] = mb->mv[1] = mb->mv[2] = mb->mv[3] = (mv_t){0,0};
-	  break;
-	}
-	mb->mode = mode;
-	
-	/* Transform, quantize, collect rho metrics */
-	if(TQMB_Y(cpi, mb, j, &ps, rho_count, overhead[mode], &fr)){
-
-	  switch(mb->mode){
-	  case CODE_INTER_PLUS_MV:
-	    prior_mv = last_mv;
-	    last_mv = mb->mv[0]; /* not the same as analysis_mv[0][0]
-				    if we're backing out from a 4mv */
-
-	    cpi->MVBits_0 += mb_mv_bits_0;
-	    cpi->MVBits_1 += 12;
-	    break;
-	  case CODE_INTER_PRIOR_LAST:
-	    {
-	      mv_t temp = prior_mv;
-	      prior_mv = last_mv;
-	      last_mv = temp;
-	    }
-	    break;
-	  case CODE_GOLDEN_MV:
-	    cpi->MVBits_0 += mb_gmv_bits_0;
-	    cpi->MVBits_1 += 12;
-	    break;
-	  case CODE_INTER_FOURMV:
-	    prior_mv = last_mv;
-
-	    for(i=0;i<4;i++)
-	      if(mb->coded & (1<<i)){
-		cpi->MVBits_0 += 
-		  MvBits[mb->mv[i].x + MAX_MV_EXTENT] + 
-		  MvBits[mb->mv[i].y + MAX_MV_EXTENT];
-		cpi->MVBits_1 += 12;
-		last_mv = mb->mv[i];
-	      }
-	    break;
-	  default:
-	    break;
-	  }
-
-	  oc_mode_set(cpi,mb,mb->mode);      
-	  
-	  interbits += cost[mb->mode];
-	}
+        /* Finally, pick the mode with the cheapest estimated bit cost.*/
+        /* prefer CODE_INTER_PLUS_MV, but not over LAST and LAST2 */
+        mode=0;
+        if(cost[1] < cost[0])mode=1;
+        if(cost[3] < cost[mode])mode=3;
+        if(cost[4] < cost[mode])mode=4;
+        if(cost[5] < cost[mode])mode=5;
+        if(cost[6] < cost[mode])mode=6;
+        if(cost[7] < cost[mode])mode=7;
+        if(mode == CODE_INTER_LAST_MV || mode == CODE_INTER_PRIOR_LAST){
+          if(cost[2] < cost[mode])mode=2;
+        }else{
+          if(cost[2]-384 < cost[mode])mode=2;
+        }
+        /*If we picked something other than 4MV, propagate the MV to the
+           blocks.*/
+        if(mode!=CODE_INTER_FOURMV){
+          int dx;
+          int dy;
+          switch(mode){
+            case CODE_INTER_PLUS_MV:{
+              dx=mb->analysis_mv[0][0][0];
+              dy=mb->analysis_mv[0][0][1];
+            }break;
+            case CODE_INTER_LAST_MV:{
+              dx=last_mv[0];
+              dy=last_mv[1];
+            }break;
+            case CODE_INTER_PRIOR_LAST:{
+              dx=prior_mv[0];
+              dy=prior_mv[1];
+            }break;
+            case CODE_GOLDEN_MV:{
+              dx=mb->analysis_mv[0][1][0];
+              dy=mb->analysis_mv[0][1][1];
+            }break;
+            default:dx=dy=0;break;
+          }
+          mb->cbmvs[0][0]=mb->cbmvs[1][0]=mb->cbmvs[2][0]=mb->cbmvs[3][0]=
+           mb->mv[0][0]=mb->mv[1][0]=mb->mv[2][0]=mb->mv[3][0]=(signed char)dx;
+          mb->cbmvs[0][1]=mb->cbmvs[1][1]=mb->cbmvs[2][1]=mb->cbmvs[3][1]=
+           mb->mv[0][1]=mb->mv[1][1]=mb->mv[2][1]=mb->mv[3][1]=(signed char)dy;
+        }
+        mb->mode=mode;
+        /* Transform, quantize, collect rho metrics */
+        if(TQMB_Y(cpi,mb,j,&ps,rho_count,overhead[mode],&mb_mv_bits_0,&fr)){
+          switch(mb->mode){
+            case CODE_INTER_PLUS_MV:{
+              prior_mv[0]=last_mv[0];
+              prior_mv[1]=last_mv[1];
+              /*mb->mv[0] is not the same as analysis_mv[0][0] if we're
+                 backing out from a 4MV.*/
+              last_mv[0]=mb->mv[0][0];
+              last_mv[1]=mb->mv[0][1];
+              cpi->MVBits_0+=mb_mv_bits_0;
+              cpi->MVBits_1+=12;
+            }break;
+            case CODE_INTER_PRIOR_LAST:{
+              oc_mv temp;
+              temp[0]=prior_mv[0];
+              temp[1]=prior_mv[1];
+              prior_mv[0]=last_mv[0];
+              prior_mv[1]=last_mv[1];
+              last_mv[0]=temp[0];
+              last_mv[1]=temp[1];
+            }break;
+            case CODE_GOLDEN_MV:{
+              cpi->MVBits_0 += mb_gmv_bits_0;
+              cpi->MVBits_1 += 12;
+            }break;
+            case CODE_INTER_FOURMV:{
+              int bi;
+              prior_mv[0]=last_mv[0];
+              prior_mv[1]=last_mv[1];
+              for(bi=0;bi<4;bi++){
+                if(mb->coded&(1<<bi)){
+                  cpi->MVBits_0+=MvBits[mb->mv[bi][0]+MAX_MV_EXTENT]
+                   +MvBits[mb->mv[bi][1]+MAX_MV_EXTENT];
+                  cpi->MVBits_1+=12;
+                  last_mv[0]=mb->mv[bi][0];
+                  last_mv[1]=mb->mv[bi][1];
+                }
+                /*Replace the block MVs for not-coded blocks with (0,0).*/
+                else mb->mv[bi][0]=mb->mv[bi][1]=0;
+              }
+              if(mb->coded!=0xF){
+                /*TODO: Use OC_SET_CHROMA_MVS_TABLE from decoder; 4:2:0 only
+                   for now.*/
+                oc_set_chroma_mvs00(mb->cbmvs,mb->mv);
+              }
+            }break;
+            default:break;
+          }
+          oc_mode_scheme_chooser_update(&cpi->chooser,mb->mode);
+          interbits+=cost[mb->mode];
+        }
       }
     }
     fr_finishsb(cpi,&fr);
@@ -1137,18 +1165,18 @@
 
   memcpy(cpi->rho_count,rho_count,sizeof(rho_count));
   if(cpi->FrameType != KEY_FRAME){
-    
+
     if(interbits>intrabits) return 1; /* short circuit */
-    
+
     /* finish adding flagging overhead costs to inter bit counts */
-    
+
     if(cpi->MVBits_0 < cpi->MVBits_1)
       interbits += (cpi->MVBits_0 << OC_BIT_SCALE);
     else
       interbits += (cpi->MVBits_1 << OC_BIT_SCALE);
-    
+
     interbits += (cpi->chooser.scheme_bits[cpi->chooser.scheme_list[0]] << OC_BIT_SCALE);
-    
+
     if(interbits>intrabits) return 1; /* short circuit */
 
     /* The easiest way to count the bits needed for coded/not coded fragments is
@@ -1158,11 +1186,10 @@
       fr_write(cpi,&fr);
       interbits += ((oggpackB_bits(cpi->oggbuffer) - bits) << OC_BIT_SCALE);
     }
-    
-    if(interbits>intrabits) return 1; 
-    
+
+    if(interbits>intrabits) return 1;
+
   }
-
   return 0;
 }
 #ifdef COLLECT_METRICS
@@ -1195,32 +1222,32 @@
       ogg_int64_t frags=0;
       int rbin=0;
       for(bin=0;bin<OC_SAD_BINS;bin++){
-	sadx += mode_metric[qi][plane][mode].sad[bin];
-	bity += mode_metric[qi][plane][mode].bits[bin];
-	frags += mode_metric[qi][plane][mode].frag[bin];
-	if(frags > ZWEIGHT){
-	  sadx = (sadx + (frags>>1))/frags;
-	  bity = (bity + (frags>>1))/frags;
-	  if(lastx != -1LL){
-	    b = ((bity - lasty)<<8)/(sadx-lastx);
-	    a = lasty - (((lastx * b) + (1<<7))>>8);
-	    
-	    for(;rbin<<OC_SAD_SHIFT <= sadx && rbin <= OC_SAD_BINS;rbin++)
-	      mode_rate[qi][plane][mode][rbin] = a + ((b * (rbin<<OC_SAD_SHIFT) + (1<<7))>>8);
-	    
-	  }
-	  lastx = sadx;
-	  lasty = bity;
-	  frags = 0;
-	}
+        sadx += mode_metric[qi][plane][mode].sad[bin];
+        bity += mode_metric[qi][plane][mode].bits[bin];
+        frags += mode_metric[qi][plane][mode].frag[bin];
+        if(frags > ZWEIGHT){
+          sadx = (sadx + (frags>>1))/frags;
+          bity = (bity + (frags>>1))/frags;
+          if(lastx != -1LL){
+            b = ((bity - lasty)<<8)/(sadx-lastx);
+            a = lasty - (((lastx * b) + (1<<7))>>8);
+
+            for(;rbin<<OC_SAD_SHIFT <= sadx && rbin <= OC_SAD_BINS;rbin++)
+              mode_rate[qi][plane][mode][rbin] = a + ((b * (rbin<<OC_SAD_SHIFT) + (1<<7))>>8);
+
+          }
+          lastx = sadx;
+          lasty = bity;
+          frags = 0;
+        }
       }
       if(lastx!=-1LL){
-	for(;rbin <= OC_SAD_BINS;rbin++)
-	  mode_rate[qi][plane][mode][rbin] = mode_rate[qi][plane][mode][rbin-1];
+        for(;rbin <= OC_SAD_BINS;rbin++)
+          mode_rate[qi][plane][mode][rbin] = mode_rate[qi][plane][mode][rbin-1];
       }else{
-	for(;rbin <= OC_SAD_BINS;rbin++)
-	  mode_rate[qi][plane][mode][rbin] = 0;
-	
+        for(;rbin <= OC_SAD_BINS;rbin++)
+          mode_rate[qi][plane][mode][rbin] = 0;
+
       }
     }
 }
@@ -1253,11 +1280,11 @@
   int *tfi = cpi->dct_token_frag[group];
   int ty = cpi->dct_token_ycount[group];
   int tn = cpi->dct_token_count[group];
-  
+
   for(ti=0;ti<tn;ti++){
     int token = cpi->dct_token[group][ti];
     int bits = cpi->HuffCodeLengthArray_VP3x[(ti<ty ? huffY : huffC)][token] + cpi->ExtraBitLengths_VP3x[token];
-      
+
     if(token>DCT_REPEAT_RUN4_TOKEN){
       /* not an EOB run; this token belongs to a single fragment */
       int fi = tfi[ti];
@@ -1267,25 +1294,25 @@
       int run = parse_eob_run(token, cpi->dct_token_eb[group][ti]);
       int fi = stack[eobcounts[group]];
       actual_bits[fi]+=(bits<<OC_BIT_SCALE);
-      
+
       if(ti+1<tn){
-	/* tokens follow EOB so it must be entirely ensconced within this plane/group */
-	eobcounts[group]+=run;
+        /* tokens follow EOB so it must be entirely ensconced within this plane/group */
+        eobcounts[group]+=run;
       }else{
-	/* EOB is the last token in this plane/group, so it may span into the next plane/group */
-	int n = cpi->dct_eob_fi_count[group];
-	while(run){
-	  int rem = n - eobcounts[group];
-	  if(rem>run)rem=run;
+        /* EOB is the last token in this plane/group, so it may span into the next plane/group */
+        int n = cpi->dct_eob_fi_count[group];
+        while(run){
+          int rem = n - eobcounts[group];
+          if(rem>run)rem=run;
 
-	  eobcounts[group]+=rem;
-	  run -= rem;
-	  if(run){
-	    group++;
-	    n = cpi->dct_eob_fi_count[group];
-	    stack = cpi->dct_eob_fi_stack[group];
-	  }
-	}
+          eobcounts[group]+=rem;
+          run -= rem;
+          if(run){
+            group++;
+            n = cpi->dct_eob_fi_count[group];
+            stack = cpi->dct_eob_fi_stack[group];
+          }
+        }
       }
     }
   }
@@ -1331,38 +1358,32 @@
     ModeMetricsGroup(cpi, gi, huff[2]+AC_HUFF_CHOICES*3, huff[3]+AC_HUFF_CHOICES*3, eobcounts, actual_bits);
 
   /* accumulate */
-  for(fi=0;fi<v;fi++)
-    if(cp[fi]){
-      int mbi = mp[fi];
-      macroblock_t *mb = &cpi->macro[mbi];
-      int mode = mb->mode;
-      int plane = (fi<y ? 0 : (fi<u ? 1 : 2));
-      int bin = BIN(sp[fi]);
-      mode_metric[qi][plane][mode==CODE_INTRA].frag[bin]++;
-      mode_metric[qi][plane][mode==CODE_INTRA].sad[bin] += sp[fi];
-      mode_metric[qi][plane][mode==CODE_INTRA].bits[bin] += actual_bits[fi];
-      
-      if(0){
-	int bi = cpi->frag_buffer_index[fi];
-	unsigned char *frame = cpi->frame+bi;
-	unsigned char *recon = cpi->lastrecon+bi;
-	int stride = cpi->stride[plane];
-	int lssd=0;
-	int xi,yi;
-	
-	for(yi=0;yi<8;yi++){
-	  for(xi=0;xi<8;xi++)
-	    lssd += (frame[xi]-recon[xi])*(frame[xi]-recon[xi]);
-	  frame+=stride;
-	  recon+=stride;
-	}
-	cpi->dist_dist[plane][mode] += lssd;
-	cpi->dist_bits[plane][mode] += actual_bits[fi];
+  for(fi=0;fi<v;fi++)if(cp[fi]){
+    int mbi = mp[fi];
+    macroblock_t *mb = &cpi->macro[mbi];
+    int mode = mb->mode;
+    int plane = (fi<y ? 0 : (fi<u ? 1 : 2));
+    int bin = BIN(sp[fi]);
+    mode_metric[qi][plane][mode==CODE_INTRA].frag[bin]++;
+    mode_metric[qi][plane][mode==CODE_INTRA].sad[bin] += sp[fi];
+    mode_metric[qi][plane][mode==CODE_INTRA].bits[bin] += actual_bits[fi];
+    if(0){
+      int bi = cpi->frag_buffer_index[fi];
+      unsigned char *frame = cpi->frame+bi;
+      unsigned char *recon = cpi->lastrecon+bi;
+      int stride = cpi->stride[plane];
+      int lssd=0;
+      int xi,yi;
+      for(yi=0;yi<8;yi++){
+        for(xi=0;xi<8;xi++)
+          lssd += (frame[xi]-recon[xi])*(frame[xi]-recon[xi]);
+        frame+=stride;
+        recon+=stride;
       }
+      cpi->dist_dist[plane][mode] += lssd;
+      cpi->dist_bits[plane][mode] += actual_bits[fi];
     }
-
-
-
+  }
   /* update global SAD/rate estimation matrix */
   UpdateModeEstimation(cpi);
 }
@@ -1371,49 +1392,49 @@
   int qi,plane,mode,bin;
 
   fprintf(stdout,
-	  "/* file generated by libtheora with COLLECT_METRICS defined at compile time */\n\n"
+          "/* file generated by libtheora with COLLECT_METRICS defined at compile time */\n\n"
 
-	  "#define OC_BIT_SCALE (7)\n"
-	  "#define OC_SAD_BINS (%d)\n"
-	  "#define OC_SAD_SHIFT (%d)\n"
-	  "\n"
+          "#define OC_BIT_SCALE (7)\n"
+          "#define OC_SAD_BINS (%d)\n"
+          "#define OC_SAD_SHIFT (%d)\n"
+          "\n"
 
-	  "#ifdef COLLECT_METRICS\n"
-	  "typedef struct {\n"
-	  "  ogg_int64_t      bits[OC_SAD_BINS];\n"
-	  "  ogg_int64_t      frag[OC_SAD_BINS];\n"
-	  "  ogg_int64_t      sad[OC_SAD_BINS];\n"
-	  "} mode_metric_t;\n"
+          "#ifdef COLLECT_METRICS\n"
+          "typedef struct {\n"
+          "  ogg_int64_t      bits[OC_SAD_BINS];\n"
+          "  ogg_int64_t      frag[OC_SAD_BINS];\n"
+          "  ogg_int64_t      sad[OC_SAD_BINS];\n"
+          "} mode_metric_t;\n"
 
-	  "int              mode_metrics = 1;\n"
-	  "mode_metric_t    mode_metric[64][3][2]={\n",OC_SAD_BINS,OC_SAD_SHIFT);
-  
+          "int              mode_metrics = 1;\n"
+          "mode_metric_t    mode_metric[64][3][2]={\n",OC_SAD_BINS,OC_SAD_SHIFT);
+
   for(qi=0;qi<64;qi++){
     fprintf(stdout,"  {\n");
     for(plane=0;plane<3;plane++){
       fprintf(stdout,"    {\n");
       for(mode=0;mode<2;mode++){
-	fprintf(stdout,"      { /* qi=%d %c %s */\n",qi,(plane?(plane==1?'U':'V'):'Y'),(mode?"INTRA":"INTER"));
+        fprintf(stdout,"      { /* qi=%d %c %s */\n",qi,(plane?(plane==1?'U':'V'):'Y'),(mode?"INTRA":"INTER"));
 
-	fprintf(stdout,"        { ");
-	for(bin=0;bin<OC_SAD_BINS;bin++){
-	  if(bin && !(bin&0x3))fprintf(stdout,"\n          ");
-	  fprintf(stdout,"%12ldLL,",mode_metric[qi][plane][mode].bits[bin]);
-	}
-	fprintf(stdout," },\n");
-	fprintf(stdout,"        { ");
-	for(bin=0;bin<OC_SAD_BINS;bin++){
-	  if(bin && !(bin&0x3))fprintf(stdout,"\n          ");
-	  fprintf(stdout,"%12ldLL,",mode_metric[qi][plane][mode].frag[bin]);
-	}
-	fprintf(stdout," },\n");
-	fprintf(stdout,"        { ");
-	for(bin=0;bin<OC_SAD_BINS;bin++){
-	  if(bin && !(bin&0x3))fprintf(stdout,"\n          ");
-	  fprintf(stdout,"%12ldLL,",mode_metric[qi][plane][mode].sad[bin]);
-	}
-	fprintf(stdout," },\n");
-	fprintf(stdout,"      },\n");
+        fprintf(stdout,"        { ");
+        for(bin=0;bin<OC_SAD_BINS;bin++){
+          if(bin && !(bin&0x3))fprintf(stdout,"\n          ");
+          fprintf(stdout,"%12ldLL,",mode_metric[qi][plane][mode].bits[bin]);
+        }
+        fprintf(stdout," },\n");
+        fprintf(stdout,"        { ");
+        for(bin=0;bin<OC_SAD_BINS;bin++){
+          if(bin && !(bin&0x3))fprintf(stdout,"\n          ");
+          fprintf(stdout,"%12ldLL,",mode_metric[qi][plane][mode].frag[bin]);
+        }
+        fprintf(stdout," },\n");
+        fprintf(stdout,"        { ");
+        for(bin=0;bin<OC_SAD_BINS;bin++){
+          if(bin && !(bin&0x3))fprintf(stdout,"\n          ");
+          fprintf(stdout,"%12ldLL,",mode_metric[qi][plane][mode].sad[bin]);
+        }
+        fprintf(stdout," },\n");
+        fprintf(stdout,"      },\n");
 
       }
       fprintf(stdout,"    },\n");
@@ -1423,20 +1444,20 @@
   fprintf(stdout,"};\n\n#endif\n\n");
 
   fprintf(stdout,
-	  "ogg_int32_t     mode_rate[64][3][2][OC_SAD_BINS+1]={\n");
+          "ogg_int32_t     mode_rate[64][3][2][OC_SAD_BINS+1]={\n");
   for(qi=0;qi<64;qi++){
     fprintf(stdout,"  {\n");
     for(plane=0;plane<3;plane++){
       fprintf(stdout,"    {\n");
       for(mode=0;mode<2;mode++){
-	fprintf(stdout,"      { /* qi=%d %c %s */\n        ",qi,(plane?(plane==1?'U':'V'):'Y'),(mode?"INTRA":"INTER"));
+        fprintf(stdout,"      { /* qi=%d %c %s */\n        ",qi,(plane?(plane==1?'U':'V'):'Y'),(mode?"INTRA":"INTER"));
 
-	for(bin=0;bin<OC_SAD_BINS+1;bin++){
-	  if(bin && !(bin&0x7))fprintf(stdout,"\n        ");
-	  fprintf(stdout,"%6d,",mode_rate[qi][plane][mode][bin]);
-	}
+        for(bin=0;bin<OC_SAD_BINS+1;bin++){
+          if(bin && !(bin&0x7))fprintf(stdout,"\n        ");
+          fprintf(stdout,"%6d,",mode_rate[qi][plane][mode][bin]);
+        }
 
-	fprintf(stdout," },\n");
+        fprintf(stdout," },\n");
       }
       fprintf(stdout,"    },\n");
     }

Modified: branches/theora-thusnelda/tests/Makefile.am
===================================================================
--- branches/theora-thusnelda/tests/Makefile.am	2009-03-19 17:10:23 UTC (rev 15801)
+++ branches/theora-thusnelda/tests/Makefile.am	2009-03-20 03:32:25 UTC (rev 15802)
@@ -5,7 +5,7 @@
 AM_CFLAGS = $(OGG_CFLAGS)
 
 THEORADIR = ../lib
-THEORA_LIBS = $(THEORADIR)/libtheora.la $(OGG_LIBS)
+THEORA_LIBS = $(THEORADIR)/libtheora.la $(OGG_LIBS) -lm
 
 test: check