[xiph-commits] r16102 - in branches/theora-thusnelda/lib: . dec dec/x86 enc enc/x86

Sat Jun 13 09:04:11 PDT 2009

Author: tterribe
Date: 2009-06-13 09:04:06 -0700 (Sat, 13 Jun 2009)
New Revision: 16102

Added:
   branches/theora-thusnelda/lib/enc/analyze.c
   branches/theora-thusnelda/lib/enc/encapiwrapper.c
   branches/theora-thusnelda/lib/enc/encinfo.c
   branches/theora-thusnelda/lib/enc/encint.h
   branches/theora-thusnelda/lib/enc/enquant.c
   branches/theora-thusnelda/lib/enc/rate.c
   branches/theora-thusnelda/lib/enc/tokenize.c
Removed:
   branches/theora-thusnelda/lib/enc/codec_internal.h
   branches/theora-thusnelda/lib/enc/dct_decode.c
   branches/theora-thusnelda/lib/enc/dct_encode.c
   branches/theora-thusnelda/lib/enc/encapiwrapper.c
   branches/theora-thusnelda/lib/enc/encoder_huffman.h
   branches/theora-thusnelda/lib/enc/encoder_lookup.h
   branches/theora-thusnelda/lib/enc/encoder_quant.c
   branches/theora-thusnelda/lib/enc/encoder_toplevel.c
   branches/theora-thusnelda/lib/enc/frarray.c
   branches/theora-thusnelda/lib/enc/frinit.c
   branches/theora-thusnelda/lib/enc/hufftables.h
   branches/theora-thusnelda/lib/enc/mode.c
   branches/theora-thusnelda/lib/enc/quant_lookup.h
   branches/theora-thusnelda/lib/enc/x86/mmxenc.c
Modified:
   branches/theora-thusnelda/lib/Makefile.am
   branches/theora-thusnelda/lib/dec/apiwrapper.h
   branches/theora-thusnelda/lib/dec/decint.h
   branches/theora-thusnelda/lib/dec/decode.c
   branches/theora-thusnelda/lib/dec/internal.c
   branches/theora-thusnelda/lib/dec/ocintrin.h
   branches/theora-thusnelda/lib/dec/quant.c
   branches/theora-thusnelda/lib/dec/quant.h
   branches/theora-thusnelda/lib/dec/state.c
   branches/theora-thusnelda/lib/dec/x86/mmxstate.c
   branches/theora-thusnelda/lib/enc/dct.c
   branches/theora-thusnelda/lib/enc/encfrag.c
   branches/theora-thusnelda/lib/enc/encode.c
   branches/theora-thusnelda/lib/enc/encoder_disabled.c
   branches/theora-thusnelda/lib/enc/enquant.h
   branches/theora-thusnelda/lib/enc/mathops.c
   branches/theora-thusnelda/lib/enc/mcenc.c
   branches/theora-thusnelda/lib/enc/modedec.h
   branches/theora-thusnelda/lib/enc/toplevel_lookup.h
   branches/theora-thusnelda/lib/enc/x86/x86enc.c
   branches/theora-thusnelda/lib/enc/x86/x86enc.h
   branches/theora-thusnelda/lib/internal.h
Log:
Major encoder refactoring.
This eliminates the remaining duplicate code between encoder and decoder, as
 well as rewriting the entire tokenization engine, and completes support for
 4:2:2 and 4:4:4 pixel formats.
Although no major new functionality has been introduced for 4:2:0, results will
 not be bit-exact identical to the previous encoder since a) several
 tokenization and motion estimation bugs were fixed and b) tokens are now split
 by color planes, in anticipation of future cache locality improvements.

As a side effect, this makes the decoder about 2.4% faster and the encoder more
 than 13% faster (measured on a single clip at a single rate, x86-32 only).


Modified: branches/theora-thusnelda/lib/Makefile.am
===================================================================

--- branches/theora-thusnelda/lib/Makefile.am	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/Makefile.am	2009-06-13 16:04:06 UTC (rev 16102)
@@ -4,7 +4,6 @@
 EXTRA_DIST = \
 	cpu.c \
 	enc/x86/dsp_sse2.c \
-	enc/x86/mmxenc.c \
 	enc/x86/mmxencfrag.c \
 	enc/x86/mmxfdct.c \
 	enc/x86/sse2fdct.c \
@@ -13,9 +12,11 @@
 	enc/x86_32_vs/dsp_mmx.c \
 	enc/x86_32_vs/fdct_mmx.c \
 	enc/x86_32_vs/recon_mmx.c \
-	enc/dct_encode.c \
+	enc/analyze.c \
+	enc/encinfo.c \
 	enc/encode.c \
-	enc/encoder_toplevel.c \
+	enc/rate.c \
+	enc/tokenize.c \
 	dec/x86/mmxfrag.c \
 	dec/x86/mmxfrag.h \
 	dec/x86/mmxidct.c \
@@ -28,12 +29,13 @@
 lib_LTLIBRARIES = libtheoradec.la libtheoraenc.la libtheora.la
 
 if THEORA_DISABLE_ENCODE
-encoder_sources = \
-	enc/encapiwrapper.c \
+encoder_uniq_sources = \
 	enc/encoder_disabled.c
+
+encoder_sources = \
+	$(encoder_uniq_sources)
 else
 encoder_uniq_x86_sources = \
-	enc/x86/mmxenc.c \
 	enc/x86/mmxencfrag.c \
 	enc/x86/mmxfdct.c \
 	enc/x86/x86enc.c
@@ -43,7 +45,9 @@
 
 encoder_shared_x86_sources = \
 	dec/x86/mmxfrag.c \
-	dec/x86/mmxidct.c
+	dec/x86/mmxidct.c \
+	dec/x86/mmxstate.c \
+	dec/x86/x86state.c
 
 encoder_shared_x86_64_sources =
 
@@ -65,26 +69,27 @@
 endif
 
 encoder_uniq_sources = \
+	enc/analyze.c \
 	enc/dct.c \
-	enc/dct_decode.c \
-	enc/dct_encode.c \
 	enc/encfrag.c \
 	enc/encapiwrapper.c \
+	enc/encinfo.c \
 	enc/encode.c \
-	enc/encoder_toplevel.c \
-	enc/encoder_quant.c \
-	enc/frarray.c \
-	enc/frinit.c \
+	enc/enquant.c \
 	enc/huffenc.c \
 	enc/mathops.c \
 	enc/mcenc.c \
-	enc/mode.c \
+	enc/rate.c \
+	enc/tokenize.c \
 	$(encoder_uniq_arch_sources)
 
 encoder_sources = \
+	dec/apiwrapper.c \
 	dec/fragment.c \
 	dec/idct.c \
 	dec/internal.c \
+	dec/state.c \
+	dec/quant.c \
 	$(encoder_shared_arch_sources) \
 	$(encoder_uniq_sources)
 
@@ -124,16 +129,10 @@
 noinst_HEADERS = \
 	cpu.h \
 	internal.h \
-	enc/codec_internal.h \
-	enc/encoder_huffman.h \
-	enc/encoder_lookup.h \
 	enc/enquant.h \
 	enc/huffenc.h \
-	enc/hufftables.h \
 	enc/mathops.h \
 	enc/modedec.h \
-	enc/quant_lookup.h \
-	enc/toplevel_lookup.h \
 	dec/apiwrapper.h \
 	dec/bitpack.h \
 	dec/dct.h \

Modified: branches/theora-thusnelda/lib/dec/apiwrapper.h
===================================================================
--- branches/theora-thusnelda/lib/dec/apiwrapper.h	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/dec/apiwrapper.h	2009-06-13 16:04:06 UTC (rev 16102)
@@ -20,8 +20,7 @@
 # include <ogg/ogg.h>
 # include <theora/theora.h>
 # include "theora/theoradec.h"
-/*# include "theora/theoraenc.h"*/
-typedef struct th_enc_ctx th_enc_ctx;
+# include "theora/theoraenc.h"
 # include "../internal.h"
 
 typedef struct th_api_wrapper th_api_wrapper;

Modified: branches/theora-thusnelda/lib/dec/decint.h
===================================================================
--- branches/theora-thusnelda/lib/dec/decint.h	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/dec/decint.h	2009-06-13 16:04:06 UTC (rev 16102)
@@ -85,15 +85,19 @@
   unsigned char       *pp_frame_data;
   /*Whether or not the post-processsed frame buffer has space for chroma.*/
   int                  pp_frame_has_chroma;
-  /*The buffer used for the post-processed frame.*/
+  /*The buffer used for the post-processed frame.
+    Note that this is _not_ guaranteed to have the same strides and offsets as
+     the reference frame buffers.*/
   th_ycbcr_buffer      pp_frame_buf;
   /*The striped decode callback function.*/
   th_stripe_callback   stripe_cb;
+# if defined(HAVE_CAIRO)
   /*Output metrics for debugging.*/
   int                  telemetry;
   int                  telemetry_mbmode;
   int                  telemetry_mv;
   unsigned char       *telemetry_frame_data;
+# endif
 };
 
 #endif

Modified: branches/theora-thusnelda/lib/dec/decode.c
===================================================================
--- branches/theora-thusnelda/lib/dec/decode.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/dec/decode.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -23,7 +23,7 @@
 # include <stdio.h>
 # include "png.h"
 #endif
-#ifdef HAVE_CAIRO
+#if defined(HAVE_CAIRO)
 # include <cairo.h>
 #endif
 
@@ -165,13 +165,13 @@
   int pli;
   int qi;
   int ret;
-  ret=oc_state_init(&_dec->state,_info);
+  ret=oc_state_init(&_dec->state,_info,3);
   if(ret<0)return ret;
   oc_huff_trees_copy(_dec->huff_tables,
    (const oc_huff_node *const *)_setup->huff_tables);
-  for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
-    _dec->state.dequant_tables[qti][pli]=
-     _dec->state.dequant_table_data[qti][pli];
+  for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+    _dec->state.dequant_tables[qi][pli][qti]=
+     _dec->state.dequant_table_data[qi][pli][qti];
   }
   oc_dequant_tables_init(_dec->state.dequant_tables,_dec->pp_dc_scale,
    &_setup->qinfo);
@@ -196,19 +196,22 @@
   _dec->dc_qis=NULL;
   _dec->variances=NULL;
   _dec->pp_frame_data=NULL;
-  _dec->telemetry_frame_data=NULL;
+  _dec->stripe_cb.ctx=NULL;
+  _dec->stripe_cb.stripe_decoded=NULL;
+#if defined(HAVE_CAIRO)
   _dec->telemetry=0;
+  _dec->telemetry_mbmode=0;
   _dec->telemetry_mv=0;
-  _dec->telemetry_mbmode=0;
-  _dec->stripe_cb.ctx=NULL;
-  _dec->stripe_cb.stripe_decoded=NULL;
+  _dec->telemetry_frame_data=NULL;
+#endif
   return 0;
 }
 
 static void oc_dec_clear(oc_dec_ctx *_dec){
+#if defined(HAVE_CAIRO)
+  _ogg_free(_dec->telemetry_frame_data);
+#endif
   _ogg_free(_dec->pp_frame_data);
-  if(_dec->telemetry_frame_data)
-    _ogg_free(_dec->telemetry_frame_data);
   _ogg_free(_dec->variances);
   _ogg_free(_dec->dc_qis);
   oc_free_2d(_dec->extra_bits);
@@ -294,14 +297,13 @@
     }
     _dec->state.ncoded_fragis[pli]=ncoded_fragis-prev_ncoded_fragis;
     prev_ncoded_fragis=ncoded_fragis;
-    _dec->state.nuncoded_fragis[pli]=0;
   }
 }
 
 /*Decodes the bit flags indicating whether each super block is partially coded
    or not.
   Return: The number of partially coded super blocks.*/
-static int oc_dec_partial_sb_flags_unpack(oc_dec_ctx *_dec){
+static unsigned oc_dec_partial_sb_flags_unpack(oc_dec_ctx *_dec){
   oc_sb_flags *sb_flags;
   unsigned     nsbs;
   unsigned     sbi;
@@ -385,10 +387,11 @@
   int                pli;
   int                flag;
   int                run_count;
+  ptrdiff_t         *coded_fragis;
+  ptrdiff_t         *uncoded_fragis;
   ptrdiff_t          ncoded_fragis;
+  ptrdiff_t          nuncoded_fragis;
   ptrdiff_t          prev_ncoded_fragis;
-  ptrdiff_t          nuncoded_fragis;
-  ptrdiff_t          prev_nuncoded_fragis;
   npartial=oc_dec_partial_sb_flags_unpack(_dec);
   if(npartial<_dec->state.nsbs)oc_dec_coded_sb_flags_unpack(_dec);
   if(npartial>0){
@@ -400,11 +403,11 @@
   sb_flags=_dec->state.sb_flags;
   frags=_dec->state.frags;
   sbi=nsbs=run_count=0;
-  prev_ncoded_fragis=ncoded_fragis=prev_nuncoded_fragis=nuncoded_fragis=0;
+  coded_fragis=_dec->state.coded_fragis;
+  uncoded_fragis=coded_fragis+_dec->state.nfrags;
+  prev_ncoded_fragis=ncoded_fragis=nuncoded_fragis=0;
   for(pli=0;pli<3;pli++){
-    const oc_fragment_plane *fplane;
-    fplane=_dec->state.fplanes+pli;
-    nsbs+=fplane->nsbs;
+    nsbs+=_dec->state.fplanes[pli].nsbs;
     for(;sbi<nsbs;sbi++){
       int quadi;
       for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
@@ -424,8 +427,8 @@
               run_count--;
               coded=flag;
             }
-            if(coded)_dec->state.coded_fragis[ncoded_fragis++]=fragi;
-            else *(_dec->state.uncoded_fragis-++nuncoded_fragis)=fragi;
+            if(coded)coded_fragis[ncoded_fragis++]=fragi;
+            else *(uncoded_fragis-++nuncoded_fragis)=fragi;
             frags[fragi].coded=coded;
           }
         }
@@ -433,8 +436,6 @@
     }
     _dec->state.ncoded_fragis[pli]=ncoded_fragis-prev_ncoded_fragis;
     prev_ncoded_fragis=ncoded_fragis;
-    _dec->state.nuncoded_fragis[pli]=nuncoded_fragis-prev_nuncoded_fragis;
-    prev_nuncoded_fragis=nuncoded_fragis;
   }
   /*TODO: run_count should be 0 here.
     If it's not, we should issue a warning of some kind.*/
@@ -464,6 +465,7 @@
 static void oc_dec_mb_modes_unpack(oc_dec_ctx *_dec){
   const oc_mb_map     *mb_maps;
   signed char         *mb_modes;
+  const oc_fragment   *frags;
   const unsigned char *alphabet;
   unsigned char        scheme0_alphabet[8];
   oc_mode_unpack_func  mode_unpack;
@@ -493,15 +495,12 @@
   mb_modes=_dec->state.mb_modes;
   mb_maps=(const oc_mb_map *)_dec->state.mb_maps;
   nmbs=_dec->state.nmbs;
+  frags=_dec->state.frags;
   for(mbi=0;mbi<nmbs;mbi++){
     if(mb_modes[mbi]!=OC_MODE_INVALID){
       int bi;
       /*Check for a coded luma block in this macro block.*/
-      for(bi=0;bi<4;bi++){
-        ptrdiff_t fragi;
-        fragi=mb_maps[mbi][0][bi];
-        if(fragi>=0&&_dec->state.frags[fragi].coded)break;
-      }
+      for(bi=0;bi<4&&!frags[mb_maps[mbi][0][bi]].coded;bi++);
       /*We found one, decode a mode.*/
       if(bi<4)mb_modes[mbi]=alphabet[(*mode_unpack)(&_dec->opb)];
       /*There were none: INTER_NOMV is forced.*/
@@ -539,7 +538,7 @@
     }break;
   }
   mask=-(int)bits;
-  return (mv+mask)^mask;
+  return mv+mask^mask;
 }
 
 static int oc_clc_mv_comp_unpack(oggpack_buffer *_opb){
@@ -549,7 +548,7 @@
   theorapackB_read(_opb,6,&bits);
   mv=(int)bits>>1;
   mask=-((int)bits&1);
-  return (mv+mask)^mask;
+  return mv+mask^mask;
 }
 
 /*Unpacks the list of motion vectors for INTER frames, and propagtes the macro
@@ -595,7 +594,7 @@
       do{
         mapi=map_idxs[mapii];
         fragi=mb_maps[mbi][mapi>>2][mapi&3];
-        if(fragi>=0&&_dec->state.frags[fragi].coded)coded[ncoded++]=mapi;
+        if(frags[fragi].coded)coded[ncoded++]=mapi;
       }
       while(++mapii<map_nidxs);
       if(ncoded<=0)continue;
@@ -666,20 +665,17 @@
   oc_fragment     *frags;
   const ptrdiff_t *coded_fragis;
   ptrdiff_t        ncoded_fragis;
-  ptrdiff_t        coded_fragii;
+  ptrdiff_t        fragii;
   ptrdiff_t        fragi;
-  ncoded_fragis=_dec->state.ncoded_fragis[0]+
-   _dec->state.ncoded_fragis[1]+_dec->state.ncoded_fragis[2];
+  ncoded_fragis=_dec->state.ntotal_coded_fragis;
   if(ncoded_fragis<=0)return;
   frags=_dec->state.frags;
   coded_fragis=_dec->state.coded_fragis;
   if(_dec->state.nqis==1){
-    int qi0;
-    /*If this frame has only a single qi value, then just set it in all coded
+    /*If this frame has only a single qi value, then just use it for all coded
        fragments.*/
-    qi0=_dec->state.qis[0];
-    for(coded_fragii=0;coded_fragii<ncoded_fragis;coded_fragii++){
-      frags[coded_fragis[coded_fragii]].qi=qi0;
+    for(fragii=0;fragii<ncoded_fragis;fragii++){
+      frags[coded_fragis[fragii]].qii=0;
     }
   }
   else{
@@ -698,17 +694,17 @@
     theorapackB_read1(&_dec->opb,&val);
     flag=(int)val;
     run_count=nqi1=0;
-    coded_fragii=0;
-    while(coded_fragii<ncoded_fragis){
+    fragii=0;
+    while(fragii<ncoded_fragis){
       int full_run;
       run_count=oc_sb_run_unpack(&_dec->opb);
       full_run=run_count>=4129;
       do{
-        frags[coded_fragis[coded_fragii++]].qi=flag;
+        frags[coded_fragis[fragii++]].qii=flag;
         nqi1+=flag;
       }
-      while(--run_count>0&&coded_fragii<ncoded_fragis);
-      if(full_run&&coded_fragii<ncoded_fragis){
+      while(--run_count>0&&fragii<ncoded_fragis);
+      if(full_run&&fragii<ncoded_fragis){
         theorapackB_read1(&_dec->opb,&val);
         flag=(int)val;
       }
@@ -720,35 +716,29 @@
        fragment with a non-zero qi, make the second pass.*/
     if(_dec->state.nqis==3&&nqi1>0){
       /*Skip qii==0 fragments.*/
-      for(coded_fragii=0;
-       frags[coded_fragis[coded_fragii]].qi==0;coded_fragii++);
+      for(fragii=0;frags[coded_fragis[fragii]].qii==0;fragii++);
       theorapackB_read1(&_dec->opb,&val);
       flag=(int)val;
       do{
         int full_run;
         run_count=oc_sb_run_unpack(&_dec->opb);
         full_run=run_count>=4129;
-        for(;coded_fragii<ncoded_fragis;coded_fragii++){
-          fragi=coded_fragis[coded_fragii];
-          if(frags[fragi].qi==0)continue;
+        for(;fragii<ncoded_fragis;fragii++){
+          fragi=coded_fragis[fragii];
+          if(frags[fragi].qii==0)continue;
           if(run_count--<=0)break;
-          frags[fragi].qi+=flag;
+          frags[fragi].qii+=flag;
         }
-        if(full_run&&coded_fragii<ncoded_fragis){
+        if(full_run&&fragii<ncoded_fragis){
           theorapackB_read1(&_dec->opb,&val);
           flag=(int)val;
         }
         else flag=!flag;
       }
-      while(coded_fragii<ncoded_fragis);
+      while(fragii<ncoded_fragis);
       /*TODO: run_count should be 0 here.
         If it's not, we should issue a warning of some kind.*/
     }
-    /*Finally, translate qii's to qi's.*/
-    for(coded_fragii=0;coded_fragii<ncoded_fragis;coded_fragii++){
-      fragi=coded_fragis[coded_fragii];
-      frags[fragi].qi=_dec->state.qis[frags[fragi].qi];
-    }
   }
 }
 
@@ -762,6 +752,12 @@
   Return: The decoded coefficient value.*/
 typedef int (*oc_token_dec1val_func)(int _token,int _extra_bits);
 
+/*We want to avoid accessing arrays of constants in these functions, because
+   we take the address of them, which means that when compiling with -fPIC,
+   an expensive prolog is added to set up the PIC register in any functions
+   which access a global symbol (even if it has file scope or smaller).
+  Thus a lot of what would be tables are packed into 32-bit constants.*/
+
 /*Handles zero run tokens.*/
 static int oc_token_dec1val_zrl(void){
   return 0;
@@ -769,38 +765,44 @@
 
 /*Handles 1, -1, 2 and -2 tokens.*/
 static int oc_token_dec1val_const(int _token){
-  static const int CONST_VALS[4]={1,-1,2,-2};
-  return CONST_VALS[_token-OC_NDCT_ZRL_TOKEN_MAX];
+  return OC_BYTE_TABLE32(1,-1,2,-2,_token-OC_NDCT_ZRL_TOKEN_MAX);
 }
 
 /*Handles DCT value tokens category 2.*/
 static int oc_token_dec1val_cat2(int _token,int _extra_bits){
   int mask;
   mask=-_extra_bits;
-  return (_token-OC_DCT_VAL_CAT2+3+mask)^mask;
+  return _token-OC_DCT_VAL_CAT2+3+mask^mask;
 }
 
-/*Handles DCT value tokens categories 3 through 8.*/
-static int oc_token_dec1val_cati(int _token,int _extra_bits){
-  static const unsigned char  VAL_CAT_OFFS[6]={
-    OC_NDCT_VAL_CAT2_SIZE+3,
-    OC_NDCT_VAL_CAT2_SIZE+5,
-    OC_NDCT_VAL_CAT2_SIZE+9,
-    OC_NDCT_VAL_CAT2_SIZE+17,
-    OC_NDCT_VAL_CAT2_SIZE+33,
-    OC_NDCT_VAL_CAT2_SIZE+65
-  };
-  static const unsigned short VAL_CAT_MASKS[6]={
-    0x001,0x003,0x007,0x00F,0x01F,0x1FF
-  };
-  static const unsigned char  VAL_CAT_SHIFTS[6]={1,2,3,4,5,9};
+/*Handles DCT value tokens categories 3 through 6.*/
+static int oc_token_dec1val_cat3_6(int _token,int _extra_bits){
   int cati;
   int mask;
-  cati=_token-OC_NDCT_VAL_CAT2_MAX;
-  mask=-(_extra_bits>>VAL_CAT_SHIFTS[cati]);
-  return (VAL_CAT_OFFS[cati]+(_extra_bits&VAL_CAT_MASKS[cati])+mask)^mask;
+  int val_cat_offs;
+  int val_cat_shift;
+  cati=_token-OC_DCT_VAL_CAT3;
+  val_cat_shift=cati+1;
+  mask=-(_extra_bits>>val_cat_shift);
+  _extra_bits&=(1<<val_cat_shift)-1;
+  val_cat_offs=OC_BYTE_TABLE32(7,9,13,21,cati);
+  return val_cat_offs+_extra_bits+mask^mask;
 }
 
+/*Handles DCT value tokens categories 7 through 8.*/
+static int oc_token_dec1val_cat7_8(int _token,int _extra_bits){
+  int cati;
+  int mask;
+  int val_cat_offs;
+  int val_cat_shift;
+  cati=_token-OC_DCT_VAL_CAT7;
+  val_cat_shift=5+(cati<<2);
+  mask=-(_extra_bits>>val_cat_shift);
+  _extra_bits&=(1<<val_cat_shift)-1;
+  val_cat_offs=37+(cati<<5);
+  return val_cat_offs+_extra_bits+mask^mask;
+}
+
 /*A jump table for computing the first coefficient value the given token value
    represents.*/
 static const oc_token_dec1val_func OC_TOKEN_DEC1VAL_TABLE[TH_NDCT_TOKENS-
@@ -815,12 +817,12 @@
   oc_token_dec1val_cat2,
   oc_token_dec1val_cat2,
   oc_token_dec1val_cat2,
-  oc_token_dec1val_cati,
-  oc_token_dec1val_cati,
-  oc_token_dec1val_cati,
-  oc_token_dec1val_cati,
-  oc_token_dec1val_cati,
-  oc_token_dec1val_cati,
+  oc_token_dec1val_cat3_6,
+  oc_token_dec1val_cat3_6,
+  oc_token_dec1val_cat3_6,
+  oc_token_dec1val_cat3_6,
+  oc_token_dec1val_cat7_8,
+  oc_token_dec1val_cat7_8,
   (oc_token_dec1val_func)oc_token_dec1val_zrl,
   (oc_token_dec1val_func)oc_token_dec1val_zrl,
   (oc_token_dec1val_func)oc_token_dec1val_zrl,
@@ -851,74 +853,71 @@
                 each coefficient.
                This is updated as EOB tokens and zero run tokens are decoded.
   Return: The length of any outstanding EOB run.*/
-static ptrdiff_t oc_dec_dc_coeff_unpack(oc_dec_ctx *_dec,int _huff_idxs[3],
+static ptrdiff_t oc_dec_dc_coeff_unpack(oc_dec_ctx *_dec,int _huff_idxs[2],
  ptrdiff_t _ntoks_left[3][64]){
-  ptrdiff_t        run_counts[64];
+  unsigned char   *dct_tokens;
+  ogg_uint16_t    *extra_bits;
   oc_fragment     *frags;
   const ptrdiff_t *coded_fragis;
   ptrdiff_t        ncoded_fragis;
-  ptrdiff_t        coded_fragii;
-  ptrdiff_t        cfi;
-  ptrdiff_t        eobi;
+  ptrdiff_t        fragii;
   ptrdiff_t        eobs;
   ptrdiff_t        ti;
   ptrdiff_t        ebi;
-  long             val;
   int              pli;
-  int              rli;
+  dct_tokens=_dec->dct_tokens[0];
+  extra_bits=_dec->extra_bits[0];
   frags=_dec->state.frags;
   coded_fragis=_dec->state.coded_fragis;
-  eobs=0;
-  ti=ebi=0;
-  ncoded_fragis=coded_fragii=0;
+  ncoded_fragis=fragii=eobs=ti=ebi=0;
   for(pli=0;pli<3;pli++){
+    ptrdiff_t run_counts[64];
+    ptrdiff_t eob_count;
+    ptrdiff_t eobi;
+    int       rli;
     ncoded_fragis+=_dec->state.ncoded_fragis[pli];
     memset(run_counts,0,sizeof(run_counts));
     _dec->eob_runs[pli][0]=eobs;
     /*Continue any previous EOB run, if there was one.*/
-    for(eobi=eobs;eobi-->0&&coded_fragii<ncoded_fragis;){
-      frags[coded_fragis[coded_fragii++]].dc=0;
-    }
-    cfi=0;
-    while(eobs<_ntoks_left[pli][0]-cfi){
+    eobi=eobs;
+    if(ncoded_fragis-fragii<eobi)eobi=ncoded_fragis-fragii;
+    eob_count=eobi;
+    eobs-=eobi;
+    while(eobi-->0)frags[coded_fragis[fragii++]].dc=0;
+    while(fragii<ncoded_fragis){
       int token;
       int neb;
       int eb;
       int skip;
-      cfi+=eobs;
-      run_counts[63]+=eobs;
       token=oc_huff_token_decode(&_dec->opb,
-       _dec->huff_tables[_huff_idxs[pli]]);
-      _dec->dct_tokens[0][ti++]=(unsigned char)token;
+       _dec->huff_tables[_huff_idxs[pli+1>>1]]);
+      dct_tokens[ti++]=(unsigned char)token;
       neb=OC_DCT_TOKEN_EXTRA_BITS[token];
       if(neb){
+        long val;
         theorapackB_read(&_dec->opb,neb,&val);
         eb=(int)val;
-        _dec->extra_bits[0][ebi++]=(ogg_uint16_t)eb;
+        extra_bits[ebi++]=(ogg_uint16_t)eb;
       }
       else eb=0;
       skip=oc_dct_token_skip(token,eb);
       if(skip<0){
         eobs=eobi=-skip;
-        while(eobi-->0&&coded_fragii<ncoded_fragis){
-          frags[coded_fragis[coded_fragii++]].dc=0;
-        }
+        if(ncoded_fragis-fragii<eobi)eobi=ncoded_fragis-fragii;
+        eob_count+=eobi;
+        eobs-=eobi;
+        while(eobi-->0)frags[coded_fragis[fragii++]].dc=0;
       }
       else{
         run_counts[skip-1]++;
-        cfi++;
         eobs=0;
-        frags[coded_fragis[coded_fragii++]].dc=oc_dct_token_dec1val(token,eb);
+        frags[coded_fragis[fragii++]].dc=oc_dct_token_dec1val(token,eb);
       }
     }
     _dec->ti0[pli][0]=ti;
     _dec->ebi0[pli][0]=ebi;
-    /*Set the EOB count to the portion of the last EOB run which extends past
-       this coefficient.*/
-    eobs=eobs+cfi-_ntoks_left[pli][0];
-    /*Add the portion of the last EOB which was included in this coefficient to
-       to the longest run length.*/
-    run_counts[63]+=_ntoks_left[pli][0]-cfi;
+    /*Add the total EOB count to the longest run length.*/
+    run_counts[63]+=eob_count;
     /*And convert the run_counts array to a moment table.*/
     for(rli=63;rli-->0;)run_counts[rli]+=run_counts[rli+1];
     /*Finally, subtract off the number of coefficients that have been
@@ -938,53 +937,61 @@
   _eobs:       The length of any outstanding EOB run from previous
                 coefficients.
   Return: The length of any outstanding EOB run.*/
-static int oc_dec_ac_coeff_unpack(oc_dec_ctx *_dec,int _zzi,int _huff_idxs[3],
+static int oc_dec_ac_coeff_unpack(oc_dec_ctx *_dec,int _zzi,int _huff_idxs[2],
  ptrdiff_t _ntoks_left[3][64],ptrdiff_t _eobs){
-  ptrdiff_t run_counts[64];
-  ptrdiff_t cfi;
-  ptrdiff_t ti;
-  ptrdiff_t ebi;
-  long      val;
-  int       pli;
-  int       rli;
+  unsigned char *dct_tokens;
+  ogg_uint16_t  *extra_bits;
+  ptrdiff_t      ti;
+  ptrdiff_t      ebi;
+  int            pli;
+  dct_tokens=_dec->dct_tokens[_zzi];
+  extra_bits=_dec->extra_bits[_zzi];
   ti=ebi=0;
   for(pli=0;pli<3;pli++){
+    ptrdiff_t run_counts[64];
+    ptrdiff_t ntoks_left;
+    ptrdiff_t eob_count;
+    ptrdiff_t ntoks;
+    int       rli;
+    _dec->eob_runs[pli][_zzi]=_eobs;
+    ntoks_left=_ntoks_left[pli][_zzi];
     memset(run_counts,0,sizeof(run_counts));
-    _dec->eob_runs[pli][_zzi]=_eobs;
-    cfi=0;
-    while(_eobs<_ntoks_left[pli][_zzi]-cfi){
+    eob_count=0;
+    ntoks=0;
+    while(ntoks+_eobs<ntoks_left){
       int token;
       int neb;
       int eb;
       int skip;
-      cfi+=_eobs;
-      run_counts[63]+=_eobs;
+      ntoks+=_eobs;
+      eob_count+=_eobs;
       token=oc_huff_token_decode(&_dec->opb,
-       _dec->huff_tables[_huff_idxs[pli]]);
-      _dec->dct_tokens[_zzi][ti++]=(unsigned char)token;
+       _dec->huff_tables[_huff_idxs[pli+1>>1]]);
+      dct_tokens[ti++]=(unsigned char)token;
       neb=OC_DCT_TOKEN_EXTRA_BITS[token];
       if(neb){
+        long val;
         theorapackB_read(&_dec->opb,neb,&val);
         eb=(int)val;
-        _dec->extra_bits[_zzi][ebi++]=(ogg_uint16_t)eb;
+        extra_bits[ebi++]=(ogg_uint16_t)eb;
       }
       else eb=0;
       skip=oc_dct_token_skip(token,eb);
       if(skip<0)_eobs=-skip;
       else{
         run_counts[skip-1]++;
-        cfi++;
+        ntoks++;
         _eobs=0;
       }
     }
     _dec->ti0[pli][_zzi]=ti;
     _dec->ebi0[pli][_zzi]=ebi;
-    /*Set the EOB count to the portion of the last EOB run which extends past
-       this coefficient.*/
-    _eobs=_eobs+cfi-_ntoks_left[pli][_zzi];
-    /*Add the portion of the last EOB which was included in this coefficient to
-       to the longest run length.*/
-    run_counts[63]+=_ntoks_left[pli][_zzi]-cfi;
+    /*Add the portion of the last EOB run actually used by this coefficient.*/
+    eob_count+=ntoks_left-ntoks;
+    /*And remove it from the remaining EOB count.*/
+    _eobs-=ntoks_left-ntoks;
+    /*Add the total EOB count to the longest run length.*/
+    run_counts[63]+=eob_count;
     /*And convert the run_counts array to a moment table.*/
     for(rli=63;rli-->0;)run_counts[rli]+=run_counts[rli+1];
     /*Finally, subtract off the number of coefficients that have been
@@ -1020,33 +1027,29 @@
 static void oc_dec_residual_tokens_unpack(oc_dec_ctx *_dec){
   static const unsigned char OC_HUFF_LIST_MAX[5]={1,6,15,28,64};
   ptrdiff_t  ntoks_left[3][64];
-  int        huff_idxs[3];
+  int        huff_idxs[2];
   ptrdiff_t  eobs;
   long       val;
   int        pli;
   int        zzi;
   int        hgi;
-  int        huffi_y;
-  int        huffi_c;
   for(pli=0;pli<3;pli++)for(zzi=0;zzi<64;zzi++){
     ntoks_left[pli][zzi]=_dec->state.ncoded_fragis[pli];
   }
   theorapackB_read(&_dec->opb,4,&val);
-  huffi_y=(int)val;
+  huff_idxs[0]=(int)val;
   theorapackB_read(&_dec->opb,4,&val);
-  huffi_c=(int)val;
-  huff_idxs[0]=huffi_y;
-  huff_idxs[1]=huff_idxs[2]=huffi_c;
+  huff_idxs[1]=(int)val;
   _dec->eob_runs[0][0]=0;
   eobs=oc_dec_dc_coeff_unpack(_dec,huff_idxs,ntoks_left);
   theorapackB_read(&_dec->opb,4,&val);
-  huffi_y=(int)val;
+  huff_idxs[0]=(int)val;
   theorapackB_read(&_dec->opb,4,&val);
-  huffi_c=(int)val;
+  huff_idxs[1]=(int)val;
   zzi=1;
   for(hgi=1;hgi<5;hgi++){
-    huff_idxs[0]=huffi_y+(hgi<<4);
-    huff_idxs[1]=huff_idxs[2]=huffi_c+(hgi<<4);
+    huff_idxs[0]+=16;
+    huff_idxs[1]+=16;
     for(;zzi<OC_HUFF_LIST_MAX[hgi];zzi++){
       eobs=oc_dec_ac_coeff_unpack(_dec,zzi,huff_idxs,ntoks_left,eobs);
     }
@@ -1093,13 +1096,20 @@
   return _zzi;
 }
 
-/*Expands category 3 through 8 single-valued tokens.*/
-static int oc_token_expand_cati(int _token,int _extra_bits,
+/*Expands category 3 through 6 single-valued tokens.*/
+static int oc_token_expand_cat3_6(int _token,int _extra_bits,
  ogg_int16_t _dct_coeffs[128],int _zzi){
-  _dct_coeffs[_zzi++]=(ogg_int16_t)oc_token_dec1val_cati(_token,_extra_bits);
+  _dct_coeffs[_zzi++]=(ogg_int16_t)oc_token_dec1val_cat3_6(_token,_extra_bits);
   return _zzi;
 }
 
+/*Expands category 7 through 8 single-valued tokens.*/
+static int oc_token_expand_cat7_8(int _token,int _extra_bits,
+ ogg_int16_t _dct_coeffs[128],int _zzi){
+  _dct_coeffs[_zzi++]=(ogg_int16_t)oc_token_dec1val_cat7_8(_token,_extra_bits);
+  return _zzi;
+}
+
 /*Expands a category 1a zero run/value combo token.*/
 static int oc_token_expand_run_cat1a(int _token,int _extra_bits,
  ogg_int16_t _dct_coeffs[128],int _zzi){
@@ -1113,33 +1123,27 @@
 /*Expands all other zero run/value combo tokens.*/
 static int oc_token_expand_run(int _token,int _extra_bits,
  ogg_int16_t _dct_coeffs[128],int _zzi){
-  static const unsigned char NZEROS_ADJUST[OC_NDCT_RUN_MAX-OC_DCT_RUN_CAT1B]={
-    6,10,1,2
-  };
-  static const unsigned char NZEROS_MASK[OC_NDCT_RUN_MAX-OC_DCT_RUN_CAT1B]={
-    3,7,0,1
-  };
-  static const unsigned char VALUE_SHIFT[OC_NDCT_RUN_MAX-OC_DCT_RUN_CAT1B]={
-    0,0,0,1
-  };
-  static const unsigned char VALUE_MASK[OC_NDCT_RUN_MAX-OC_DCT_RUN_CAT1B]={
-    0,0,1,1
-  };
-  static const unsigned char VALUE_ADJUST[OC_NDCT_RUN_MAX-OC_DCT_RUN_CAT1B]={
-    1,1,2,2
-  };
-  static const unsigned char SIGN_SHIFT[OC_NDCT_RUN_MAX-OC_DCT_RUN_CAT1B]={
-    2,3,1,2
-  };
+  int nzeros_mask;
+  int nzeros_adjust;
+  int sign_shift;
+  int value_shift;
+  int value_mask;
+  int value_adjust;
   int mask;
   int rl;
   _token-=OC_DCT_RUN_CAT1B;
-  rl=(_extra_bits&NZEROS_MASK[_token])+NZEROS_ADJUST[_token];
+  nzeros_mask=OC_BYTE_TABLE32(3,7,0,1,_token);
+  nzeros_adjust=OC_BYTE_TABLE32(6,10,1,2,_token);
+  rl=(_extra_bits&nzeros_mask)+nzeros_adjust;
   /*LOOP VECTORIZES.*/
   while(rl-->0)_dct_coeffs[_zzi++]=0;
-  mask=-(_extra_bits>>SIGN_SHIFT[_token]);
-  _dct_coeffs[_zzi++]=(ogg_int16_t)((VALUE_ADJUST[_token]+
-   (_extra_bits>>VALUE_SHIFT[_token]&VALUE_MASK[_token])+mask)^mask);
+  sign_shift=OC_BYTE_TABLE32(2,3,1,2,_token);
+  mask=-(_extra_bits>>sign_shift);
+  value_shift=_token+1>>2;
+  value_mask=_token>>1;
+  value_adjust=value_mask+1;
+  _dct_coeffs[_zzi++]=
+   (ogg_int16_t)(value_adjust+(_extra_bits>>value_shift&value_mask)+mask^mask);
   return _zzi;
 }
 
@@ -1158,12 +1162,12 @@
   oc_token_expand_cat2,
   oc_token_expand_cat2,
   oc_token_expand_cat2,
-  oc_token_expand_cati,
-  oc_token_expand_cati,
-  oc_token_expand_cati,
-  oc_token_expand_cati,
-  oc_token_expand_cati,
-  oc_token_expand_cati,
+  oc_token_expand_cat3_6,
+  oc_token_expand_cat3_6,
+  oc_token_expand_cat3_6,
+  oc_token_expand_cat3_6,
+  oc_token_expand_cat7_8,
+  oc_token_expand_cat7_8,
   oc_token_expand_run_cat1a,
   oc_token_expand_run_cat1a,
   oc_token_expand_run_cat1a,
@@ -1217,7 +1221,7 @@
     unsigned char   *dc_qis;
     const ptrdiff_t *coded_fragis;
     ptrdiff_t        ncoded_fragis;
-    ptrdiff_t        coded_fragii;
+    ptrdiff_t        fragii;
     unsigned char    qi0;
     /*Update the DC quantization index of each coded block.*/
     dc_qis=_dec->dc_qis;
@@ -1225,8 +1229,8 @@
     ncoded_fragis=_dec->state.ncoded_fragis[0]+
      _dec->state.ncoded_fragis[1]+_dec->state.ncoded_fragis[2];
     qi0=(unsigned char)_dec->state.qis[0];
-    for(coded_fragii=0;coded_fragii<ncoded_fragis;coded_fragii++){
-      dc_qis[coded_fragis[coded_fragii]]=qi0;
+    for(fragii=0;fragii<ncoded_fragis;fragii++){
+      dc_qis[coded_fragis[fragii]]=qi0;
     }
   }
   /*pp_level 1: Stop after updating DC quantization indices.*/
@@ -1296,20 +1300,21 @@
 
 
 typedef struct{
-  int              bounding_values[256];
-  ptrdiff_t        ti[3][64];
-  ptrdiff_t        ebi[3][64];
-  ptrdiff_t        eob_runs[3][64];
-  const ptrdiff_t *coded_fragis[3];
-  const ptrdiff_t *uncoded_fragis[3];
-  ptrdiff_t        ncoded_fragis[3];
-  ptrdiff_t        nuncoded_fragis[3];
-  int              fragy0[3];
-  int              fragy_end[3];
-  int              pred_last[3][3];
-  int              mcu_nvfrags;
-  int              loop_filter;
-  int              pp_level;
+  int                 bounding_values[256];
+  ptrdiff_t           ti[3][64];
+  ptrdiff_t           ebi[3][64];
+  ptrdiff_t           eob_runs[3][64];
+  const ptrdiff_t    *coded_fragis[3];
+  const ptrdiff_t    *uncoded_fragis[3];
+  ptrdiff_t           ncoded_fragis[3];
+  ptrdiff_t           nuncoded_fragis[3];
+  const ogg_uint16_t *qtables[3][3][2];
+  int                 fragy0[3];
+  int                 fragy_end[3];
+  int                 pred_last[3][3];
+  int                 mcu_nvfrags;
+  int                 loop_filter;
+  int                 pp_level;
 }oc_dec_pipeline_state;
 
 
@@ -1317,9 +1322,11 @@
 /*Initialize the main decoding pipeline.*/
 static void oc_dec_pipeline_init(oc_dec_ctx *_dec,
  oc_dec_pipeline_state *_pipe){
-  const ptrdiff_t *coded_fragi_end;
-  const ptrdiff_t *uncoded_fragi_end;
+  const ptrdiff_t *coded_fragis;
+  const ptrdiff_t *uncoded_fragis;
   int              pli;
+  int              qii;
+  int              qti;
   /*If chroma is sub-sampled in the vertical direction, we have to decode two
      super block rows of Y' for each super block row of Cb and Cr.*/
   _pipe->mcu_nvfrags=4<<!(_dec->state.info.pixel_fmt&2);
@@ -1334,14 +1341,25 @@
   /*Also copy over the initial the EOB run counts.*/
   memcpy(_pipe->eob_runs,_dec->eob_runs,sizeof(_pipe->eob_runs));
   /*Set up per-plane pointers to the coded and uncoded fragments lists.*/
-  coded_fragi_end=_dec->state.coded_fragis;
-  uncoded_fragi_end=_dec->state.uncoded_fragis;
+  coded_fragis=_dec->state.coded_fragis;
+  uncoded_fragis=coded_fragis+_dec->state.nfrags;
   for(pli=0;pli<3;pli++){
-    _pipe->coded_fragis[pli]=coded_fragi_end;
-    _pipe->uncoded_fragis[pli]=uncoded_fragi_end;
-    coded_fragi_end+=_dec->state.ncoded_fragis[pli];
-    uncoded_fragi_end-=_dec->state.nuncoded_fragis[pli];
+    ptrdiff_t ncoded_fragis;
+    _pipe->coded_fragis[pli]=coded_fragis;
+    _pipe->uncoded_fragis[pli]=uncoded_fragis;
+    ncoded_fragis=_dec->state.ncoded_fragis[pli];
+    coded_fragis+=ncoded_fragis;
+    uncoded_fragis+=ncoded_fragis-_dec->state.fplanes[pli].nfrags;
   }
+  /*Set up condensed quantizer tables.*/
+  for(pli=0;pli<3;pli++){
+    for(qii=0;qii<_dec->state.nqis;qii++){
+      for(qti=0;qti<2;qti++){
+        _pipe->qtables[pli][qii][qti]=
+         _dec->state.dequant_tables[_dec->state.qis[qii]][pli][qti];
+      }
+    }
+  }
   /*Set the previous DC predictor to 0 for all color planes and frame types.*/
   memset(_pipe->pred_last,0,sizeof(_pipe->pred_last));
   /*Initialize the bounding value array for the loop filter.*/
@@ -1412,28 +1430,30 @@
    counts.*/
 static void oc_dec_frags_recon_mcu_plane(oc_dec_ctx *_dec,
  oc_dec_pipeline_state *_pipe,int _pli){
+  ogg_uint16_t       dc_quant[2];
   const oc_fragment *frags;
   const ptrdiff_t   *coded_fragis;
   ptrdiff_t          ncoded_fragis;
-  ptrdiff_t          coded_fragii;
+  ptrdiff_t          fragii;
   ptrdiff_t         *ti;
   ptrdiff_t         *ebi;
   ptrdiff_t         *eob_runs;
+  int                qti;
   frags=_dec->state.frags;
   coded_fragis=_pipe->coded_fragis[_pli];
   ncoded_fragis=_pipe->ncoded_fragis[_pli];
   ti=_pipe->ti[_pli];
   ebi=_pipe->ebi[_pli];
   eob_runs=_pipe->eob_runs[_pli];
-  for(coded_fragii=0;coded_fragii<ncoded_fragis;coded_fragii++){
+  for(qti=0;qti<2;qti++)dc_quant[qti]=_pipe->qtables[_pli][0][qti][0];
+  for(fragii=0;fragii<ncoded_fragis;fragii++){
     /*This array is made twice as large as necessary so that an invalid zero
        run cannot cause a buffer overflow.*/
-    ogg_int16_t     dct_coeffs[128];
-    oc_quant_table *quants;
-    ptrdiff_t       fragi;
-    int             last_zzi;
-    int             zzi;
-    fragi=coded_fragis[coded_fragii];
+    ogg_int16_t dct_coeffs[128];
+    ptrdiff_t   fragi;
+    int         last_zzi;
+    int         zzi;
+    fragi=coded_fragis[fragii];
     /*Decode the AC coefficients.*/
     for(zzi=0;zzi<64;){
       int token;
@@ -1459,12 +1479,11 @@
       If it's not, we should report some kind of warning.*/
     zzi=OC_MINI(zzi,64);
     dct_coeffs[0]=(ogg_int16_t)frags[fragi].dc;
-    quants=
-     _dec->state.dequant_tables[frags[fragi].mb_mode!=OC_MODE_INTRA][_pli];
+    qti=frags[fragi].mb_mode!=OC_MODE_INTRA;
     /*last_zzi is always initialized.
       If your compiler thinks otherwise, it is dumb.*/
     oc_state_frag_recon(&_dec->state,fragi,_pli,dct_coeffs,last_zzi,zzi,
-     quants[_dec->state.qis[0]][0],quants[frags[fragi].qi]);
+     dc_quant[qti],_pipe->qtables[_pli][frags[fragi].qii][qti]);
   }
   _pipe->coded_fragis[_pli]+=ncoded_fragis;
   /*Right now the reconstructed MCU has only the coded blocks in it.*/
@@ -1803,7 +1822,7 @@
       int b;
       int qi;
       int var;
-      qi=frag->qi;
+      qi=_dec->state.qis[frag->qii];
       var=*variance;
       b=(x<=0)|(x+8>=width)<<1|(y<=0)<<2|(y+8>=height)<<3;
       if(strong&&var>sthresh){
@@ -1880,10 +1899,10 @@
     granpos=*(ogg_int64_t *)_buf;
     if(granpos<0)return TH_EINVAL;
     _dec->state.granpos=granpos;
-    _dec->state.keyframe_num=
-      granpos>>_dec->state.info.keyframe_granule_shift;
-    _dec->state.curframe_num=_dec->state.keyframe_num+
-      (granpos&(1<<_dec->state.info.keyframe_granule_shift)-1);
+    _dec->state.keyframe_num=(granpos>>_dec->state.info.keyframe_granule_shift)
+     -_dec->state.granpos_bias;
+    _dec->state.curframe_num=_dec->state.keyframe_num
+     +(granpos&(1<<_dec->state.info.keyframe_granule_shift)-1);
     return 0;
   }break;
   case TH_DECCTL_SET_STRIPE_CB:{
@@ -1978,9 +1997,9 @@
     /*Update granule position.
       This must be done before the striped decode callbacks so that the
        application knows what to do with the frame data.*/
-    _dec->state.granpos=
-     (_dec->state.keyframe_num<<_dec->state.info.keyframe_granule_shift)+
-     (_dec->state.curframe_num-_dec->state.keyframe_num);
+    _dec->state.granpos=(_dec->state.keyframe_num+_dec->state.granpos_bias<<
+     _dec->state.info.keyframe_granule_shift)
+     +(_dec->state.curframe_num-_dec->state.keyframe_num);
     _dec->state.curframe_num++;
     if(_granpos!=NULL)*_granpos=_dec->state.granpos;
     /*All of the rest of the operations -- DC prediction reversal,
@@ -2114,9 +2133,9 @@
   }
   else{
     /*Just update the granule position and return.*/
-    _dec->state.granpos=
-     (_dec->state.keyframe_num<<_dec->state.info.keyframe_granule_shift)+
-     (_dec->state.curframe_num-_dec->state.keyframe_num);
+    _dec->state.granpos=(_dec->state.keyframe_num+_dec->state.granpos_bias<<
+     _dec->state.info.keyframe_granule_shift)
+     +(_dec->state.curframe_num-_dec->state.keyframe_num);
     _dec->state.curframe_num++;
     if(_granpos!=NULL)*_granpos=_dec->state.granpos;
     return TH_DUPFRAME;

Modified: branches/theora-thusnelda/lib/dec/internal.c
===================================================================
--- branches/theora-thusnelda/lib/dec/internal.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/dec/internal.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -121,9 +121,9 @@
 
 /*Handles the simple end of block tokens.*/
 static ptrdiff_t oc_token_skip_eob(int _token,int _extra_bits){
-  static const unsigned char NBLOCKS_ADJUST[OC_NDCT_EOB_TOKEN_MAX]=
-   {1,2,3,4,8,16,0};
-  return -_extra_bits-NBLOCKS_ADJUST[_token];
+  int nblocks_adjust;
+  nblocks_adjust=OC_UNIBBLE_TABLE32(0,1,2,3,7,15,0,0,_token)+1;
+  return -_extra_bits-nblocks_adjust;
 }
 
 /*The last EOB token has a special case, where an EOB run of size zero ends all
@@ -150,16 +150,15 @@
   return _token-OC_DCT_RUN_CAT1A+2;
 }
 
-/*Handles category 1b and 2 zero run/coefficient value combo tokens.*/
+/*Handles category 1b, 1c, 2a, and 2b zero run/coefficient value combo tokens.*/
 static ptrdiff_t oc_token_skip_run(int _token,int _extra_bits){
-  static const unsigned char NCOEFFS_ADJUST[OC_NDCT_RUN_MAX-OC_DCT_RUN_CAT1B]={
-    7,11,2,3
-  };
-  static const unsigned char NCOEFFS_MASK[OC_NDCT_RUN_MAX-OC_DCT_RUN_CAT1B]={
-    3,7,0,1
-  };
-  _token-=OC_DCT_RUN_CAT1B;
-  return (_extra_bits&NCOEFFS_MASK[_token])+NCOEFFS_ADJUST[_token];
+  int run_cati;
+  int ncoeffs_mask;
+  int ncoeffs_adjust;
+  run_cati=_token-OC_DCT_RUN_CAT1B;
+  ncoeffs_mask=OC_BYTE_TABLE32(3,7,0,1,run_cati);
+  ncoeffs_adjust=OC_BYTE_TABLE32(7,11,2,3,run_cati);
+  return (_extra_bits&ncoeffs_mask)+ncoeffs_adjust;
 }
 
 /*A jump table for computing the number of coefficients or blocks to skip for

Modified: branches/theora-thusnelda/lib/dec/ocintrin.h
===================================================================
--- branches/theora-thusnelda/lib/dec/ocintrin.h	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/dec/ocintrin.h	2009-06-13 16:04:06 UTC (rev 16102)
@@ -85,24 +85,35 @@
   When _rval is (1<<_shift-1), this is equivalent to division with rounding
    ties away from zero.*/
 #define OC_DIV_ROUND_POW2(_dividend,_shift,_rval)\
- ((_dividend)+OC_SIGNMASK(_dividend)+(_rval)>>(_shift))
+  ((_dividend)+OC_SIGNMASK(_dividend)+(_rval)>>(_shift))
 /*Divides a _x by 2, rounding towards even numbers.*/
 #define OC_DIV2_RE(_x) ((_x)+((_x)>>1&1)>>1)
 /*Divides a _x by (1<<(_shift)), rounding towards even numbers.*/
 #define OC_DIV_POW2_RE(_x,_shift) \
- ((_x)+((_x)>>(_shift)&1)+((1<<(_shift))-1>>1)>>(_shift))
+  ((_x)+((_x)>>(_shift)&1)+((1<<(_shift))-1>>1)>>(_shift))
 /*Swaps two integers _a and _b if _a>_b.*/
-#define OC_SORT2I(_a,_b)\
-  do{\
-    int t__;\
-    t__=((_a)^(_b))&-((_b)<(_a));\
-    (_a)^=t__;\
-    (_b)^=t__;\
-  }\
-  while(0)\
+#define OC_SORT2I(_a,_b) \
+  do{ \
+    int t__; \
+    t__=((_a)^(_b))&-((_b)<(_a)); \
+    (_a)^=t__; \
+    (_b)^=t__; \
+  } \
+  while(0)
 
+/*Accesses one of four (signed) bytes given an index.
+  This can be used to avoid small lookup tables.*/
+#define OC_BYTE_TABLE32(_a,_b,_c,_d,_i) \
+  ((signed char) \
+   (((_a)&0xFF|((_b)&0xFF)<<8|((_c)&0xFF)<<16|((_d)&0xFF)<<24)>>(_i)*8))
+/*Accesses one of eight (unsigned) nibbles given an index.
+  This can be used to avoid small lookup tables.*/
+#define OC_UNIBBLE_TABLE32(_a,_b,_c,_d,_e,_f,_g,_h,_i) \
+  ((((_a)&0xF|((_b)&0xF)<<4|((_c)&0xF)<<8|((_d)&0xF)<<12| \
+   ((_e)&0xF)<<16|((_f)&0xF)<<20|((_g)&0xF)<<24|((_h)&0xF)<<28)>>(_i)*4)&0xF)
 
 
+
 /*All of these macros should expect floats as arguments.*/
 #define OC_MAXF(_a,_b)      ((_a)<(_b)?(_b):(_a))
 #define OC_MINF(_a,_b)      ((_a)>(_b)?(_b):(_a))

Modified: branches/theora-thusnelda/lib/dec/quant.c
===================================================================
--- branches/theora-thusnelda/lib/dec/quant.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/dec/quant.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -37,7 +37,7 @@
   However, a much, much better option is to only store the quantization
    matrices being used for the current frame, and to recalculate these as the
    qi values change between frames (this is what VP3 did).*/
-void oc_dequant_tables_init(oc_quant_table *_dequant[2][3],
+void oc_dequant_tables_init(ogg_uint16_t *_dequant[64][3][2],
  int _pp_dc_scale[64],const th_quant_info *_qinfo){
   /*Coding mode: intra or inter.*/
   int          qti;
@@ -48,13 +48,11 @@
     int qi;
     /*Range iterator.*/
     int qri;
-    for(qi=0,qri=0; qri<=_qinfo->qi_ranges[qti][pli].nranges;qri++){
-      oc_quant_table *qtables;
-      th_quant_base   base;
-      ogg_uint32_t    q;
-      int             qi_start;
-      int             qi_end;
-      qtables=_dequant[qti][pli];
+    for(qi=0,qri=0;qri<=_qinfo->qi_ranges[qti][pli].nranges;qri++){
+      th_quant_base base;
+      ogg_uint32_t  q;
+      int           qi_start;
+      int           qi_end;
       memcpy(base,_qinfo->qi_ranges[qti][pli].base_matrices[qri],
        sizeof(base));
       qi_start=qi;
@@ -62,9 +60,9 @@
       else qi_end=qi+_qinfo->qi_ranges[qti][pli].sizes[qri];
       /*Iterate over quality indicies in this range.*/
       for(;;){
-        ogg_uint32_t  qfac;
-        int           zzi;
-        int           ci;
+        ogg_uint32_t qfac;
+        int          zzi;
+        int          ci;
         /*In the original VP3.2 code, the rounding offset and the size of the
            dead zone around 0 were controlled by a "sharpness" parameter.
           The size of our dead zone is now controlled by the per-coefficient
@@ -80,13 +78,32 @@
         /*Scale DC the coefficient from the proper table.*/
         q=(qfac/100)<<2;
         q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
-        qtables[qi][0]=(ogg_uint16_t)q;
+        _dequant[qi][pli][qti][0]=(ogg_uint16_t)q;
         /*Now scale AC coefficients from the proper table.*/
         for(zzi=1;zzi<64;zzi++){
           q=((ogg_uint32_t)_qinfo->ac_scale[qi]*base[OC_FZIG_ZAG[zzi]]/100)<<2;
           q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
-          qtables[qi][zzi]=(ogg_uint16_t)q;
+          _dequant[qi][pli][qti][zzi]=(ogg_uint16_t)q;
         }
+        /*If this is a duplicate of a previous matrix, use that instead.
+          This simple check helps us improve cache coherency later.*/
+        {
+          int dupe;
+          int qtj;
+          int plj;
+          dupe=0;
+          for(qtj=0;qtj<=qti;qtj++){
+            for(plj=0;plj<(qtj<qti?3:pli);plj++){
+              if(!memcmp(_dequant[qi][pli][qti],_dequant[qi][plj][qtj],
+               sizeof(oc_quant_table))){
+                dupe=1;
+                break;
+              }
+            }
+            if(dupe)break;
+          }
+          if(dupe)_dequant[qi][pli][qti]=_dequant[qi][plj][qtj];
+        }
         if(++qi>=qi_end)break;
         /*Interpolate the next base matrix.*/
         for(ci=0;ci<64;ci++){
@@ -98,25 +115,5 @@
         }
       }
     }
-    /*Staging matrices complete; commit to memory only if this isn't a
-       duplicate of a preceeding set of matrices.
-      This simple check helps us improve cache coherency later.*/
-    {
-      int dupe;
-      int qtj;
-      int plj;
-      dupe=0;
-      for(qtj=0;qtj<=qti;qtj++){
-        for(plj=0;plj<(qtj<qti?3:pli);plj++){
-          if(!memcmp(_dequant[qti][pli],_dequant[qtj][plj],
-           sizeof(oc_quant_tables))){
-            dupe=1;
-            break;
-          }
-        }
-        if(dupe)break;
-      }
-      if(dupe)_dequant[qti][pli]=_dequant[qtj][plj];
-    }
   }
 }

Modified: branches/theora-thusnelda/lib/dec/quant.h
===================================================================
--- branches/theora-thusnelda/lib/dec/quant.h	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/dec/quant.h	2009-06-13 16:04:06 UTC (rev 16102)
@@ -21,14 +21,13 @@
 # include "ocintrin.h"
 
 typedef ogg_uint16_t   oc_quant_table[64];
-typedef oc_quant_table oc_quant_tables[64];
 
 
 /*Maximum scaled quantizer value.*/
 #define OC_QUANT_MAX          (1024<<2)
 
 
-void oc_dequant_tables_init(oc_quant_table *_dequant[2][3],
+void oc_dequant_tables_init(ogg_uint16_t *_dequant[64][3][2],
  int _pp_dc_scale[64],const th_quant_info *_qinfo);
 
 #endif

Modified: branches/theora-thusnelda/lib/dec/state.c
===================================================================
--- branches/theora-thusnelda/lib/dec/state.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/dec/state.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -121,13 +121,9 @@
 static void oc_mb_fill_ymapping(oc_mb_map_plane _mb_map[3],
  const oc_fragment_plane *_fplane,int _xfrag0,int _yfrag0){
   int i;
-  for(i=0;i<2;i++){
-    int j;
-    if(_yfrag0+i>=_fplane->nvfrags)break;
-    for(j=0;j<2;j++){
-      if(_xfrag0+j>=_fplane->nhfrags)break;
-      _mb_map[0][i<<1|j]=(_yfrag0+i)*(ptrdiff_t)_fplane->nhfrags+_xfrag0+j;
-    }
+  int j;
+  for(i=0;i<2;i++)for(j=0;j<2;j++){
+    _mb_map[0][i<<1|j]=(_yfrag0+i)*(ptrdiff_t)_fplane->nhfrags+_xfrag0+j;
   }
 }
 
@@ -161,7 +157,6 @@
   _yfrag0>>=1;
   fragi=_yfrag0*(ptrdiff_t)_fplanes[1].nhfrags+_xfrag0;
   for(j=0;j<2;j++){
-    if(_xfrag0+j>=_fplanes[1].nhfrags)break;
     _mb_map[1][j]=fragi+_fplanes[1].froffset;
     _mb_map[2][j]=fragi+_fplanes[2].froffset;
     fragi++;
@@ -181,7 +176,6 @@
   _xfrag0>>=1;
   fragi=_yfrag0*(ptrdiff_t)_fplanes[1].nhfrags+_xfrag0;
   for(i=0;i<2;i++){
-    if(_yfrag0+i>=_fplanes[1].nvfrags)break;
     _mb_map[1][i<<1]=fragi+_fplanes[1].froffset;
     _mb_map[2][i<<1]=fragi+_fplanes[2].froffset;
     fragi+=_fplanes[1].nhfrags;
@@ -197,10 +191,8 @@
  const oc_fragment_plane _fplanes[3]){
   int k;
   for(k=0;k<4;k++){
-    if(_mb_map[0][k]>=0){
-      _mb_map[1][k]=_mb_map[0][k]+_fplanes[1].froffset;
-      _mb_map[2][k]=_mb_map[0][k]+_fplanes[2].froffset;
-    }
+    _mb_map[1][k]=_mb_map[0][k]+_fplanes[1].froffset;
+    _mb_map[2][k]=_mb_map[0][k]+_fplanes[2].froffset;
   }
 }
 
@@ -440,8 +432,6 @@
   _state->mb_maps=_ogg_calloc(nmbs,sizeof(*_state->mb_maps));
   _state->mb_modes=_ogg_calloc(nmbs,sizeof(*_state->mb_modes));
   _state->coded_fragis=_ogg_malloc(nfrags*sizeof(*_state->coded_fragis));
-  _state->uncoded_fragis=_state->coded_fragis+nfrags;
-  _state->coded_mbis=_ogg_malloc(nmbs*sizeof(*_state->coded_mbis));
   /*Create the mapping from super blocks to fragments.*/
   for(pli=0;pli<3;pli++){
     oc_fragment_plane *fplane;
@@ -459,7 +449,6 @@
 }
 
 static void oc_state_frarray_clear(oc_theora_state *_state){
-  _ogg_free(_state->coded_mbis);
   _ogg_free(_state->coded_fragis);
   _ogg_free(_state->mb_modes);
   _ogg_free(_state->mb_maps);
@@ -475,8 +464,8 @@
    unrestricted motion vectors without special casing the boundary.
   If chroma is decimated in either direction, the padding is reduced by a
    factor of 2 on the appropriate sides.
-  _enc: The encoding context to store the buffers in.*/
-static int oc_state_ref_bufs_init(oc_theora_state *_state){
+  _nrefs: The number of reference buffers to init; must be 3 or 4.*/
+static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
   th_info       *info;
   unsigned char *ref_frame_data;
   size_t         ref_frame_data_sz;
@@ -495,6 +484,7 @@
   int            vdec;
   int            rfi;
   int            pli;
+  if(_nrefs<3||_nrefs>4)return TH_EINVAL;
   info=&_state->info;
   /*Compute the image buffer parameters for each plane.*/
   hdec=!(info->pixel_fmt&1);
@@ -508,11 +498,11 @@
   yoffset=OC_UMV_PADDING+OC_UMV_PADDING*(ptrdiff_t)yhstride;
   coffset=(OC_UMV_PADDING>>hdec)+(OC_UMV_PADDING>>vdec)*(ptrdiff_t)chstride;
   ref_frame_sz=yplane_sz+2*cplane_sz;
-  ref_frame_data_sz=3*ref_frame_sz;
+  ref_frame_data_sz=_nrefs*ref_frame_sz;
   /*Check for overflow.
     The same caveats apply as for oc_state_frarray_init().*/
   if(yplane_sz/yhstride!=yheight||2*cplane_sz<cplane_sz||
-   ref_frame_sz<yplane_sz||ref_frame_data_sz/3!=ref_frame_sz){
+   ref_frame_sz<yplane_sz||ref_frame_data_sz/_nrefs!=ref_frame_sz){
     return TH_EIMPL;
   }
   ref_frame_data=_ogg_malloc(ref_frame_data_sz);
@@ -526,12 +516,12 @@
    info->frame_height>>vdec;
   _state->ref_frame_bufs[0][1].stride=_state->ref_frame_bufs[0][2].stride=
    chstride;
-  memcpy(_state->ref_frame_bufs[1],_state->ref_frame_bufs[0],
-   sizeof(_state->ref_frame_bufs[0]));
-  memcpy(_state->ref_frame_bufs[2],_state->ref_frame_bufs[0],
-   sizeof(_state->ref_frame_bufs[0]));
+  for(rfi=1;rfi<_nrefs;rfi++){
+    memcpy(_state->ref_frame_bufs[rfi],_state->ref_frame_bufs[0],
+     sizeof(_state->ref_frame_bufs[0]));
+  }
   /*Set up the data pointers for the image buffers.*/
-  for(rfi=0;rfi<3;rfi++){
+  for(rfi=0;rfi<_nrefs;rfi++){
     _state->ref_frame_data[rfi]=ref_frame_data;
     _state->ref_frame_bufs[rfi][0].data=ref_frame_data+yoffset;
     ref_frame_data+=yplane_sz;
@@ -545,6 +535,8 @@
     oc_ycbcr_buffer_flip(_state->ref_frame_bufs[rfi],
      _state->ref_frame_bufs[rfi]);
   }
+  _state->ref_ystride[0]=-yhstride;
+  _state->ref_ystride[1]=_state->ref_ystride[2]=-chstride;
   /*Initialize the fragment buffer offsets.*/
   ref_frame_data=_state->ref_frame_data[0];
   frag_buf_offs=_state->frag_buf_offs=
@@ -578,6 +570,7 @@
   _state->ref_frame_idx[OC_FRAME_GOLD]=
    _state->ref_frame_idx[OC_FRAME_PREV]=
    _state->ref_frame_idx[OC_FRAME_SELF]=-1;
+  _state->ref_frame_idx[OC_FRAME_IO]=_nrefs>3?3:-1;
   return 0;
 }
 
@@ -610,8 +603,7 @@
 }
 
 
-int oc_state_init(oc_theora_state *_state,const th_info *_info){
-  int old_granpos;
+int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs){
   int ret;
   /*First validate the parameters.*/
   if(_info==NULL)return TH_EFAULT;
@@ -648,22 +640,20 @@
   oc_state_vtable_init(_state);
   ret=oc_state_frarray_init(_state);
   if(ret<0)return ret;
-  ret=oc_state_ref_bufs_init(_state);
+  ret=oc_state_ref_bufs_init(_state,_nrefs);
   if(ret<0)return ret;
   /*If the keyframe_granule_shift is out of range, use the maximum allowable
      value.*/
   if(_info->keyframe_granule_shift<0||_info->keyframe_granule_shift>31){
     _state->info.keyframe_granule_shift=31;
   }
-  _state->keyframe_num=1;
-  _state->curframe_num=0;
+  _state->keyframe_num=0;
+  _state->curframe_num=-1;
   /*3.2.0 streams mark the frame index instead of the frame count.
     This was changed with stream version 3.2.1 to conform to other Ogg
      codecs.
-    We subtract an extra one from the frame number for old streams.*/
-  old_granpos=!TH_VERSION_CHECK(_info,3,2,1);
-  _state->curframe_num-=old_granpos;
-  _state->keyframe_num-=old_granpos;
+    We add an extra bias when computing granule positions for new streams.*/
+  _state->granpos_bias=TH_VERSION_CHECK(_info,3,2,1);
   return 0;
 }
 
@@ -748,28 +738,18 @@
   }
 }
 
-/*Returns the macro block index of the macro block in the given position.
-  _state: The Theora state the macro block is contained in.
-  _mbx:   The X coordinate of the macro block (in macro blocks, not pixels).
-  _mby:   The Y coordinate of the macro block (in macro blocks, not pixels).
-  Return: The index of the macro block in the given position.*/
-int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby){
-  return ((_mbx&~1)<<1)+(_mby&~1)*_state->nhmbs+OC_MB_MAP[_mby&1][_mbx&1];
-}
-
 /*Determines the offsets in an image buffer to use for motion compensation.
   _state:   The Theora state the offsets are to be computed with.
   _offsets: Returns the offset for the buffer(s).
             _offsets[0] is always set.
             _offsets[1] is set if the motion vector has non-zero fractional
              components.
+  _pli:     The color plane index.
   _dx:      The X component of the motion vector.
   _dy:      The Y component of the motion vector.
-  _ystride: The Y stride in the buffer the motion vector points into.
-  _pli:     The color plane index.
   Return: The number of offsets returned: 1 or 2.*/
 int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
- int _dx,int _dy,int _ystride,int _pli){
+ int _pli,int _dx,int _dy){
   /*Here is a brief description of how Theora handles motion vectors:
     Motion vector components are specified to half-pixel accuracy in
      undecimated directions of each plane, and quarter-pixel accuracy in
@@ -786,11 +766,13 @@
      appropriate amount, always truncating _away_ from zero.*/
 #if 0
   /*This version of the code doesn't use any tables, but is slower.*/
+  int ystride;
   int xprec;
   int yprec;
   int xfrac;
   int yfrac;
   int offs;
+  ystride=_state->ref_ystride[_pli];
   /*These two variables decide whether we are in half- or quarter-pixel
      precision in each component.*/
   xprec=1+(_pli!=0&&!(_state->info.pixel_fmt&1));
@@ -799,13 +781,13 @@
      if any of them are non-zero.*/
   xfrac=OC_SIGNMASK(-(_dx&(xprec|1)));
   yfrac=OC_SIGNMASK(-(_dy&(yprec|1)));
-  offs=(_dx>>xprec)+(_dy>>yprec)*_ystride;
+  offs=(_dx>>xprec)+(_dy>>yprec)*ystride;
   if(xfrac||yfrac){
     int xmask;
     int ymask;
     xmask=OC_SIGNMASK(_dx);
     ymask=OC_SIGNMASK(_dy);
-    yfrac&=_ystride;
+    yfrac&=ystride;
     _offsets[0]=offs-(xfrac&xmask)+(yfrac&ymask);
     _offsets[1]=offs-(xfrac&~xmask)+(yfrac&~ymask);
     return 2;
@@ -845,6 +827,7 @@
       0, 1, 1, 1,  0, 1, 1, 1,  0, 1, 1, 1,  0, 1, 1, 1
     }
   };
+  int ystride;
   int qpx;
   int qpy;
   int mx;
@@ -852,15 +835,16 @@
   int mx2;
   int my2;
   int offs;
+  ystride=_state->ref_ystride[_pli];
   qpy=_pli!=0&&!(_state->info.pixel_fmt&2);
   my=OC_MVMAP[qpy][_dy+31];
   my2=OC_MVMAP2[qpy][_dy+31];
   qpx=_pli!=0&&!(_state->info.pixel_fmt&1);
   mx=OC_MVMAP[qpx][_dx+31];
   mx2=OC_MVMAP2[qpx][_dx+31];
-  offs=my*_ystride+mx;
+  offs=my*ystride+mx;
   if(mx2||my2){
-    _offsets[1]=offs+my2*_ystride+mx2;
+    _offsets[1]=offs+my2*ystride+mx2;
     _offsets[0]=offs;
     return 2;
   }
@@ -890,7 +874,7 @@
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
   mb_mode=_state->frags[_fragi].mb_mode;
-  ystride=_state->ref_frame_bufs[0][_pli].stride;
+  ystride=_state->ref_ystride[_pli];
   dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
   if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra(_state,dst,ystride,res_buf);
   else{
@@ -900,7 +884,7 @@
      _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE[mb_mode]]]
      +frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,
-     _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1],ystride,_pli)>1){
+     _pli,_state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
       oc_frag_recon_inter2(_state,
        dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,res_buf);
     }
@@ -932,7 +916,7 @@
   int                  ystride;
   dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
   src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
-  ystride=_state->ref_frame_bufs[0][_pli].stride;
+  ystride=_state->ref_ystride[_pli];
   frag_buf_offs=_state->frag_buf_offs;
   for(fragii=0;fragii<_nfragis;fragii++){
     ptrdiff_t frag_buf_off;
@@ -1007,7 +991,6 @@
 
 void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,int *_bv,
  int _refi,int _pli,int _fragy0,int _fragy_end){
-  const th_img_plane      *iplane;
   const oc_fragment_plane *fplane;
   const oc_fragment       *frags;
   const ptrdiff_t         *frag_buf_offs;
@@ -1019,7 +1002,6 @@
   int                      ystride;
   int                      nhfrags;
   _bv+=127;
-  iplane=_state->ref_frame_bufs[_refi]+_pli;
   fplane=_state->fplanes+_pli;
   nhfrags=fplane->nhfrags;
   /*The following loops are constructed somewhat non-intuitively on purpose.
@@ -1031,7 +1013,7 @@
   fragi_bot=fragi_top+fplane->nfrags;
   fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
   fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
-  ystride=iplane->stride;
+  ystride=_state->ref_ystride[_pli];
   frags=_state->frags;
   frag_buf_offs=_state->frag_buf_offs;
   ref_frame_data=_state->ref_frame_data[_refi];
@@ -1174,6 +1156,7 @@
       png_set_cHRM_fixed(png,info,31271,32902,
        64000,33000,29000,60000,15000,6000);
     }break;
+    default:break;
   }
   png_set_pHYs(png,info,_state->info.aspect_numerator,
    _state->info.aspect_denominator,0);

Modified: branches/theora-thusnelda/lib/dec/x86/mmxstate.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/mmxstate.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/dec/x86/mmxstate.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -47,8 +47,8 @@
     ref=
      _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE[mb_mode]]]
      +frag_buf_off;
-    if(oc_state_get_mv_offsets(_state,mvoffsets,
-     _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1],ystride,_pli)>1){
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
       oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
        res_buf);
     }

Copied: branches/theora-thusnelda/lib/enc/analyze.c (from rev 16052, branches/theora-thusnelda/lib/enc/mode.c)
===================================================================
--- branches/theora-thusnelda/lib/enc/analyze.c	                        (rev 0)
+++ branches/theora-thusnelda/lib/enc/analyze.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -0,0 +1,1729 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function: mode selection code
+  last mod: $Id$
+
+ ********************************************************************/
+#include <limits.h>
+#include <string.h>
+#include "encint.h"
+#include "modedec.h"
+
+
+
+typedef struct oc_plane_state oc_plane_state;
+typedef struct oc_frag_state  oc_frag_state;
+typedef struct oc_mode_choice oc_mode_choice;
+typedef struct oc_rd_metric   oc_rd_metric;
+
+
+
+/*Temporary encoder state for a single color plane.*/
+struct oc_plane_state{
+  /*Condensed dequantization tables.*/
+  const ogg_uint16_t *dequant[3][2];
+  /*Condensed quantization tables.*/
+  const oc_iquant    *enquant[3][2];
+  /*Plane index.*/
+  int                 pli;
+};
+
+
+
+/*State to track coded block flags and their bit cost.*/
+struct oc_frag_state{
+  unsigned   sb_partial_count:16;
+  unsigned   sb_full_count:16;
+  unsigned   b_count:8;
+  unsigned   b_pend:8;
+  signed int sb_partial_last:2;
+  signed int sb_full_last:2;
+  signed int b_last:2;
+  unsigned   sb_partial:1;
+  unsigned   sb_coded:1;
+  unsigned   sb_partial_break:1;
+  unsigned   sb_full_break:1;
+  ptrdiff_t  bits;
+};
+
+
+
+/*Cost information about a MB mode.*/
+struct oc_mode_choice{
+  unsigned cost;
+  unsigned ssd;
+  unsigned rate;
+  unsigned overhead;
+};
+
+
+
+/*Cost information about the coded blocks in a MB.*/
+struct oc_rd_metric{
+  int uncoded_ac_ssd;
+  int coded_ac_ssd;
+  int ac_bits;
+  int dc_flag;
+};
+
+
+
+/*There are 8 possible schemes used to encode macro block modes.
+  Schemes 0-6 use a maximally-skewed Huffman code to code each of the modes.
+  The same set of Huffman codes is used for each of these 7 schemes, but the
+   mode assigned to each codeword varies.
+  Scheme 0 writes a custom mapping from codeword to MB mode to the bitstream,
+   while schemes 1-6 have a fixed mapping.
+  Scheme 7 just encodes each mode directly in 3 bits.*/
+
+/*The mode orderings for the various mode coding schemes.
+  Scheme 0 uses a custom alphabet, which is not stored in this table.
+  This is the inverse of the equivalent table OC_MODE_ALPHABETS in the
+   decoder.*/
+static const unsigned char OC_MODE_RANKS[7][OC_NMODES]={
+  /*Last MV dominates.*/ 
+  /*L P M N I G GM 4*/
+  {3,4,2,0,1,5,6,7},
+  /*L P N M I G GM 4*/
+  {2,4,3,0,1,5,6,7},
+  /*L M P N I G GM 4*/
+  {3,4,1,0,2,5,6,7},
+  /*L M N P I G GM 4*/
+  {2,4,1,0,3,5,6,7},
+  /*No MV dominates.*/
+  /*N L P M I G GM 4*/
+  {0,4,3,1,2,5,6,7},
+  /*N G L P M I GM 4*/
+  {0,5,4,2,3,1,6,7},
+  /*Default ordering.*/
+  /*N I M L P G GM 4*/
+  {0,1,2,3,4,5,6,7}
+};
+
+
+
+/*Initialize the mode scheme chooser.
+  This need only be called once per encoder.*/
+void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser){
+  int si;
+  _chooser->mode_ranks[0]=_chooser->scheme0_ranks;
+  for(si=1;si<8;si++)_chooser->mode_ranks[si]=OC_MODE_RANKS[si-1];
+}
+
+/*Reset the mode scheme chooser.
+  This needs to be called once for each frame, including the first.*/
+static void oc_mode_scheme_chooser_reset(oc_mode_scheme_chooser *_chooser){
+  int si;
+  memset(_chooser->mode_counts,0,OC_NMODES*sizeof(*_chooser->mode_counts));
+  /*Scheme 0 starts with 24 bits to store the mode list in.*/
+  _chooser->scheme_bits[0]=24;
+  memset(_chooser->scheme_bits+1,0,7*sizeof(*_chooser->scheme_bits));
+  for(si=0;si<8;si++){
+    /*Scheme 7 should always start first, and scheme 0 should always start
+       last.*/
+    _chooser->scheme_list[si]=7-si;
+    _chooser->scheme0_list[si]=_chooser->scheme0_ranks[si]=si;
+  }
+}
+
+
+/*This is the real purpose of this data structure: not actually selecting a
+   mode scheme, but estimating the cost of coding a given mode given all the
+   modes selected so far.
+  This is done via opportunity cost: the cost is defined as the number of bits
+   required to encode all the modes selected so far including the current one
+   using the best possible scheme, minus the number of bits required to encode
+   all the modes selected so far not including the current one using the best
+   possible scheme.
+  The computational expense of doing this probably makes it overkill.
+  Just be happy we take a greedy approach instead of trying to solve the
+   global mode-selection problem (which is NP-hard).
+  _mb_mode: The mode to determine the cost of.
+  Return: The number of bits required to code this mode.*/
+static int oc_mode_scheme_chooser_cost(oc_mode_scheme_chooser *_chooser,
+ int _mb_mode){
+  int scheme0;
+  int scheme1;
+  int best_bits;
+  int mode_bits;
+  int si;
+  int scheme_bits;
+  scheme0=_chooser->scheme_list[0];
+  scheme1=_chooser->scheme_list[1];
+  best_bits=_chooser->scheme_bits[scheme0];
+  mode_bits=OC_MODE_BITS[scheme0+1>>3][_chooser->mode_ranks[scheme0][_mb_mode]];
+  /*Typical case: If the difference between the best scheme and the next best
+     is greater than 6 bits, then adding just one mode cannot change which
+     scheme we use.*/
+  if(_chooser->scheme_bits[scheme1]-best_bits>6)return mode_bits;
+  /*Otherwise, check to see if adding this mode selects a different scheme as
+     the best.*/
+  si=1;
+  best_bits+=mode_bits;
+  do{
+    /*For any scheme except 0, we can just use the bit cost of the mode's rank
+       in that scheme.*/
+    if(scheme1!=0){
+      scheme_bits=_chooser->scheme_bits[scheme1]+
+       OC_MODE_BITS[scheme1+1>>3][_chooser->mode_ranks[scheme1][_mb_mode]];
+    }
+    else{
+      int ri;
+      /*For scheme 0, incrementing the mode count could potentially change the
+         mode's rank.
+        Find the index where the mode would be moved to in the optimal list,
+         and use its bit cost instead of the one for the mode's current
+         position in the list.*/
+      /*We don't recompute scheme bits; this is computing opportunity cost, not
+         an update.*/
+      for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0&&
+       _chooser->mode_counts[_mb_mode]>=
+       _chooser->mode_counts[_chooser->scheme0_list[ri-1]];ri--);
+      scheme_bits=_chooser->scheme_bits[0]+OC_MODE_BITS[0][ri];
+    }
+    if(scheme_bits<best_bits)best_bits=scheme_bits;
+    if(++si>=8)break;
+    scheme1=_chooser->scheme_list[si];
+  }
+  while(_chooser->scheme_bits[scheme1]-_chooser->scheme_bits[scheme0]<=6);
+  return best_bits-_chooser->scheme_bits[scheme0];
+}
+
+/*Incrementally update the mode counts and per-scheme bit counts and re-order
+   the scheme lists once a mode has been selected.
+  _mb_mode: The mode that was chosen.*/
+static void oc_mode_scheme_chooser_update(oc_mode_scheme_chooser *_chooser,
+ int _mb_mode){
+  int ri;
+  int si;
+  _chooser->mode_counts[_mb_mode]++;
+  /*Re-order the scheme0 mode list if necessary.*/
+  for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0;ri--){
+    int pmode;
+    pmode=_chooser->scheme0_list[ri-1];
+    if(_chooser->mode_counts[pmode]>=_chooser->mode_counts[_mb_mode])break;
+    /*Reorder the mode ranking.*/
+    _chooser->scheme0_ranks[pmode]++;
+    _chooser->scheme0_list[ri]=pmode;
+  }
+  _chooser->scheme0_ranks[_mb_mode]=ri;
+  _chooser->scheme0_list[ri]=_mb_mode;
+  /*Now add the bit cost for the mode to each scheme.*/
+  for(si=0;si<8;si++){
+    _chooser->scheme_bits[si]+=
+     OC_MODE_BITS[si+1>>3][_chooser->mode_ranks[si][_mb_mode]];
+  }
+  /*Finally, re-order the list of schemes.*/
+  for(si=1;si<8;si++){
+    int sj;
+    int scheme0;
+    int bits0;
+    sj=si;
+    scheme0=_chooser->scheme_list[si];
+    bits0=_chooser->scheme_bits[scheme0];
+    do{
+      int scheme1;
+      scheme1=_chooser->scheme_list[sj-1];
+      if(bits0>=_chooser->scheme_bits[scheme1])break;
+      _chooser->scheme_list[sj]=scheme1;
+    }
+    while(--sj>0);
+    _chooser->scheme_list[sj]=scheme0;
+  }
+}
+
+
+
+static void oc_plane_state_plane_setup(oc_enc_ctx *_enc,oc_plane_state *_ps,
+ int _pli){
+  int qii;
+  int qti;
+  _ps->pli=_pli;
+  for(qii=0;qii<_enc->state.nqis;qii++){
+    int qi;
+    qi=_enc->state.qis[qii];
+    for(qti=0;qti<2;qti++){
+      _ps->dequant[qii][qti]=_enc->state.dequant_tables[qi][_pli][qti];
+      _ps->enquant[qii][qti]=_enc->enquant_tables[qi][_pli][qti];
+    }
+  }
+}
+
+
+
+static int oc_sb_run_bits(int _run_count){
+  int i;
+  for(i=0;_run_count>=OC_SB_RUN_VAL_MIN[i+1];i++);
+  return OC_SB_RUN_CODE_NBITS[i];
+}
+
+static int oc_block_run_bits(int _run_count){
+  return OC_BLOCK_RUN_CODE_NBITS[_run_count-1];
+}
+
+
+
+static void oc_fr_state_init(oc_frag_state *_fr){
+  _fr->sb_partial_last=-1;
+  _fr->sb_partial_count=0;
+  _fr->sb_partial_break=0;
+  _fr->sb_full_last=-1;
+  _fr->sb_full_count=0;
+  _fr->sb_full_break=0;
+  _fr->b_last=-1;
+  _fr->b_count=0;
+  _fr->b_pend=0;
+  _fr->sb_partial=0;
+  _fr->sb_coded=0;
+  _fr->bits=0;
+}
+
+
+static void oc_fr_skip_block(oc_frag_state *_fr){
+  if(_fr->sb_coded){
+    if(!_fr->sb_partial){
+      /*The super block was previously fully coded.*/
+      if(_fr->b_last==-1){
+        /*First run of the frame...*/
+        _fr->bits++;
+        _fr->b_last=1;
+      }
+      if(_fr->b_last==1){
+        /*The in-progress run is also a coded run.*/
+        _fr->b_count+=_fr->b_pend;
+      }
+      else{
+        /*The in-progress run is an uncoded run; flush.*/
+        _fr->bits+=oc_block_run_bits(_fr->b_count);
+        _fr->b_count=_fr->b_pend;
+        _fr->b_last=1;
+      }
+    }
+    /*Add a skip block.*/
+    if(_fr->b_last==0)_fr->b_count++;
+    else{
+      if(_fr->b_count)_fr->bits+=oc_block_run_bits(_fr->b_count);
+      _fr->b_count=1;
+      _fr->b_last=0;
+    }
+  }
+  _fr->b_pend++;
+  _fr->sb_partial=1;
+}
+
+static void oc_fr_code_block(oc_frag_state *_fr){
+  if(_fr->sb_partial){
+    if(!_fr->sb_coded){
+      /*The super block was previously completely uncoded...*/
+      if(_fr->b_last==-1){
+        /*First run of the frame...*/
+        _fr->bits++;
+        _fr->b_last=0;
+      }
+      if(_fr->b_last==0){
+        /*The in-progress run is also an uncoded run.*/
+        _fr->b_count += _fr->b_pend;
+      }
+      else{
+        /*The in-progress run is a coded run; flush.*/
+        _fr->bits+=oc_block_run_bits(_fr->b_count);
+        _fr->b_count=_fr->b_pend;
+        _fr->b_last=0;
+      }
+    }
+    /*Add a coded block.*/
+    if(_fr->b_last==1)_fr->b_count++;
+    else{
+      _fr->bits+=oc_block_run_bits(_fr->b_count);
+      _fr->b_count=1;
+      _fr->b_last=1;
+    }
+  }
+  _fr->b_pend++;
+  _fr->sb_coded=1;
+}
+
+static void oc_fr_finish_sb(oc_frag_state *_fr){
+  /*Update the partial flag.*/
+  int partial;
+  partial=_fr->sb_partial&_fr->sb_coded;
+  if(_fr->sb_partial_last==-1){
+    _fr->bits++;
+    _fr->sb_partial_last=partial;
+  }
+  if(_fr->sb_partial_break){
+    _fr->bits++;
+    _fr->sb_partial_break=0;
+  }
+  if(_fr->sb_partial_last==partial&&_fr->sb_partial_count<4129){
+    _fr->sb_partial_count++;
+  }
+  else{
+    _fr->bits+=oc_sb_run_bits(_fr->sb_partial_count);
+    if(_fr->sb_partial_count>=4129)_fr->sb_partial_break=1;
+    _fr->sb_partial_count=1;
+  }
+  _fr->sb_partial_last=partial;
+  /*Fully coded/uncoded state.*/
+  if(!_fr->sb_partial||!_fr->sb_coded){
+    if(_fr->sb_full_last==-1){
+      _fr->bits++;
+      _fr->sb_full_last=_fr->sb_coded;
+    }
+    if(_fr->sb_full_break){
+      _fr->bits++;
+      _fr->sb_full_break=0;
+    }
+    if(_fr->sb_full_last==_fr->sb_coded&&_fr->sb_full_count<4129){
+      _fr->sb_full_count++;
+    }
+    else{
+      _fr->bits+=oc_sb_run_bits( _fr->sb_full_count);
+      if(_fr->sb_full_count>=4129)_fr->sb_full_break=1;
+      _fr->sb_full_count=1;
+    }
+    _fr->sb_full_last=_fr->sb_coded;
+  }
+  _fr->b_pend=0;
+  _fr->sb_partial=0;
+  _fr->sb_coded=0;
+}
+
+static void oc_fr_flush(oc_frag_state *_fr){
+  /*Flush any pending partial run.*/
+  if(_fr->sb_partial_break)_fr->bits++;
+  if(_fr->sb_partial_count)_fr->bits+=oc_sb_run_bits(_fr->sb_partial_count);
+  /*Flush any pending full run.*/
+  if(_fr->sb_full_break)_fr->bits++;
+  if(_fr->sb_full_count)_fr->bits+=oc_sb_run_bits(_fr->sb_full_count);
+  /*Flush any pending block run.*/
+  if(_fr->b_count)_fr->bits+=oc_block_run_bits(_fr->b_count);
+}
+
+static int oc_fr_cost1(const oc_frag_state *_fr){
+  oc_frag_state tmp;
+  int           bits;
+  *&tmp=*_fr;
+  oc_fr_skip_block(&tmp);
+  bits=tmp.bits;
+  *&tmp=*_fr;
+  oc_fr_code_block(&tmp);
+  return tmp.bits-bits;
+}
+
+static int oc_fr_cost4(const oc_frag_state *_pre,const oc_frag_state *_post){
+  oc_frag_state tmp;
+  *&tmp=*_pre;
+  oc_fr_skip_block(&tmp);
+  oc_fr_skip_block(&tmp);
+  oc_fr_skip_block(&tmp);
+  oc_fr_skip_block(&tmp);
+  return _post->bits-tmp.bits;
+}
+
+
+
+static void oc_enc_frag_uncode(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi){
+  const unsigned char *src;
+  unsigned char       *dst;
+  ptrdiff_t            frag_offs;
+  int                  ystride;
+  frag_offs=_enc->state.frag_buf_offs[_fragi];
+  ystride=_enc->state.ref_ystride[_pli];
+  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
+  dst=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_SELF]];
+  oc_frag_copy(&_enc->state,dst+frag_offs,src+frag_offs,ystride);
+  _enc->state.frags[_fragi].coded=0;
+  /*We do NOT update frags[_fragi].mb_mode or frag_mvs[_fragi], since they are
+     not subsequently referenced by uncoded fragments.*/
+}
+
+static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
+ oc_plane_state *_ps,ptrdiff_t _fragi,int _overhead_bits,
+ oc_rd_metric *_mo,oc_token_checkpoint **_stack){
+  ogg_int16_t          buffer[64]OC_ALIGN16;
+  ogg_int16_t          data[64]OC_ALIGN16;
+  const ogg_uint16_t  *dequant;
+  const oc_iquant     *enquant;
+  ptrdiff_t            frag_offs;
+  int                  ystride;
+  const unsigned char *src;
+  const unsigned char *ref;
+  unsigned char       *dst;
+  int                  frame_type;
+  int                  nonzero;
+  int                  uncoded_ssd;
+  int                  coded_ssd;
+  int                  uncoded_dc;
+  int                  coded_dc;
+  int                  dc_flag;
+  oc_token_checkpoint *checkpoint;
+  oc_fragment         *frags;
+  int                  mb_mode;
+  int                  mv_offs[2];
+  int                  nmv_offs;
+  int                  ac_bits;
+  int                  pi;
+  int                  zzi;
+  frags=_enc->state.frags;
+  frag_offs=_enc->state.frag_buf_offs[_fragi];
+  ystride=_enc->state.ref_ystride[_ps->pli];
+  mb_mode=frags[_fragi].mb_mode;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs;
+  ref=_enc->state.ref_frame_data[
+   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE[mb_mode]]]+frag_offs;
+  dst=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_SELF]]
+   +frag_offs;
+  /*Although the fragment coding overhead determination is accurate, it is
+     greedy, using very coarse-grained local information.
+    Allowing it to mildly discourage coding turns out to be beneficial, but
+     it's not clear that allowing it to encourage coding through negative
+     coding overhead deltas is useful.
+    For that reason, we clamp negative values of _overhead_bits to zero.*/
+  if(_overhead_bits<0)_overhead_bits=0;
+  /*Motion compensation:*/
+  switch(mb_mode){
+    case OC_MODE_INTRA:{
+      nmv_offs=0;
+      oc_enc_frag_sub_128(_enc,data,src,ystride);
+    }break;
+    case OC_MODE_GOLDEN_NOMV:
+    case OC_MODE_INTER_NOMV:{
+      nmv_offs=1;
+      mv_offs[0]=0;
+      oc_enc_frag_sub(_enc,data,src,ref,ystride);
+    }break;
+    default:{
+      const oc_mv *frag_mvs;
+      frag_mvs=(const oc_mv *)_enc->state.frag_mvs;
+      nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,_ps->pli,
+       frag_mvs[_fragi][0],frag_mvs[_fragi][1]);
+      if(nmv_offs>1){
+        oc_enc_frag_copy2(_enc,dst,
+         ref+mv_offs[0],ref+mv_offs[1],ystride);
+        oc_enc_frag_sub(_enc,data,src,dst,ystride);
+      }
+      else oc_enc_frag_sub(_enc,data,src,ref+mv_offs[0],ystride);
+    }break;
+  }
+#if defined(OC_COLLECT_METRICS)
+  {
+    unsigned satd;
+    switch(nmv_offs){
+      case 0:satd=oc_enc_frag_intra_satd(_enc,src,ystride);break;
+      case 1:{
+        satd=oc_enc_frag_satd_thresh(_enc,src,ref+mv_offs[0],ystride,UINT_MAX);
+      }break;
+      default:{
+        satd=oc_enc_frag_satd_thresh(_enc,src,dst,ystride,UINT_MAX);
+      }
+    }
+    _enc->frag_satd[_fragi]=satd;
+  }
+#endif
+  frame_type=_enc->state.frame_type;
+  uncoded_ssd=uncoded_dc=0;
+  if(frame_type!=OC_INTRA_FRAME){
+    if(mb_mode==OC_MODE_INTER_NOMV){
+      for(pi=0;pi<64;pi++){
+        uncoded_ssd+=data[pi]*data[pi];
+        uncoded_dc+=data[pi];
+      }
+    }
+    else{
+      oc_enc_frag_sub(_enc,buffer,src,
+       _enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]]
+       +frag_offs,ystride);
+      for(pi=0;pi<64;pi++){
+        uncoded_ssd+=buffer[pi]*buffer[pi];
+        uncoded_dc+=buffer[pi];
+      }
+    }
+    /*Scale to match DCT domain.*/
+    uncoded_ssd<<=4;
+  }
+  /*Transform:*/
+  oc_enc_fdct8x8(_enc,buffer,data);
+  /*Quantize:*/
+  dequant=_ps->dequant[0][mb_mode!=OC_MODE_INTRA];
+  enquant=_ps->enquant[0][mb_mode!=OC_MODE_INTRA];
+  nonzero=0;
+  for(zzi=0;zzi<64;zzi++){
+    int v;
+    int val;
+    int d;
+    v=buffer[OC_FZIG_ZAG[zzi]];
+    d=dequant[zzi];
+    val=v<<1;
+    v=abs(val);
+    if(v>=d){
+      int s;
+      s=OC_SIGNMASK(val);
+      /*The bias added here rounds ties away from zero, since token
+         optimization can only decrease the magnitude of the quantized
+         value.*/
+      val+=(d+s)^s;
+      /*Note the arithmetic right shift is not guaranteed by ANSI C.
+        Hopefully no one still uses ones-complement architectures.*/
+      val=((enquant[zzi].m*(ogg_int32_t)val>>16)+val>>enquant[zzi].l)-s;
+      data[zzi]=OC_CLAMPI(-580,val,580);
+      nonzero=zzi;
+    }
+    else data[zzi]=0;
+  }
+  frags[_fragi].dc=data[0];
+  frags[_fragi].coded=1;
+  /*Tokenize.*/
+  checkpoint=*_stack;
+  ac_bits=oc_enc_tokenize_ac(_enc,_fragi,data,dequant,buffer,
+   _ps->pli,_stack,mb_mode==OC_MODE_INTRA?3:0);
+  /*Reconstruct.
+    TODO: nonzero may need to be adjusted after tokenization.*/
+  oc_dequant_idct8x8(&_enc->state,buffer,data,
+   nonzero+1,nonzero+1,dequant[0],(ogg_uint16_t *)dequant);
+  if(mb_mode==OC_MODE_INTRA)oc_enc_frag_recon_intra(_enc,dst,ystride,buffer);
+  else{
+    oc_enc_frag_recon_inter(_enc,dst,
+     nmv_offs==1?ref+mv_offs[0]:dst,ystride,buffer);
+  }
+#if !defined(OC_COLLECT_METRICS)
+  if(frame_type!=OC_INTRA_FRAME)
+#endif
+  {
+    /*In retrospect, should we have skipped this block?*/
+    oc_enc_frag_sub(_enc,buffer,src,dst,ystride);
+    coded_ssd=coded_dc=0;
+    for(pi=0;pi<64;pi++){
+      coded_ssd+=buffer[pi]*buffer[pi];
+      coded_dc+=buffer[pi];
+    }
+    /*Scale to match DCT domain.*/
+    coded_ssd<<=4;
+    /*We actually only want the AC contribution to the SSDs.*/
+    uncoded_ssd-=uncoded_dc*uncoded_dc>>2;
+    coded_ssd-=coded_dc*coded_dc>>2;
+#if defined(OC_COLLECT_METRICS)
+    _enc->frag_ssd[_fragi]=coded_ssd;
+  }
+  if(frame_type!=OC_INTRA_FRAME){
+#endif
+    _mo->uncoded_ac_ssd+=uncoded_ssd;
+    /*DC is a special case; if there's more than a full-quantizer improvement
+       in the effective DC component, always force-code the block.
+      One might expect this to be abs(uncoded_dc-coded_dc), but this performs
+       slightly better, since coded_dc will always be near zero, but may be on
+       the opposite side of zero from uncoded_dc.*/
+    dc_flag=abs(uncoded_dc)-abs(coded_dc)>dequant[0]<<1;
+    if(!dc_flag&&uncoded_ssd<=coded_ssd+(_overhead_bits+ac_bits)*_enc->lambda&&
+     /*Don't allow luma blocks to be skipped in 4MV mode when VP3 compatibility
+        is enabled.*/
+     (!_enc->vp3_compatible||mb_mode!=OC_MODE_INTER_MV_FOUR)){
+      /*Hm, not worth it; roll back.*/
+      oc_enc_tokenlog_rollback(_enc,checkpoint,(*_stack)-checkpoint);
+      *_stack=checkpoint;
+      oc_enc_frag_uncode(_enc,_ps->pli,_fragi);
+      _mo->coded_ac_ssd+=uncoded_ssd;
+      return 0;
+    }
+    else{
+      _mo->dc_flag|=dc_flag;
+      _mo->coded_ac_ssd+=coded_ssd;
+      _mo->ac_bits+=ac_bits;
+    }
+  }
+  return 1;
+}
+
+/*_mode_overhead is a bit count scaled up by a factor of 1<<OC_BIT_SCALE.*/
+static int oc_enc_mb_transform_quantize_luma(oc_enc_ctx *_enc,
+ oc_plane_state *_ps,int _mbi,int _mode_overhead,oc_frag_state *_fr){
+  /*Worst case token stack usage for 4 fragments.*/
+  oc_token_checkpoint  stack[64*4];
+  oc_token_checkpoint *stackptr;
+  const oc_sb_map     *sb_maps;
+  signed char         *mb_modes;
+  oc_fragment         *frags;
+  ptrdiff_t           *coded_fragis;
+  ptrdiff_t            ncoded_fragis;
+  oc_rd_metric         mo;
+  oc_frag_state        fr_checkpoint;
+  int                  mb_mode;
+  int                  ncoded;
+  ptrdiff_t            fragi;
+  int                  bi;
+  *&fr_checkpoint=*_fr;
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  mb_modes=_enc->state.mb_modes;
+  frags=_enc->state.frags;
+  coded_fragis=_enc->state.coded_fragis;
+  ncoded_fragis=_enc->state.ncoded_fragis[0];
+  mb_mode=mb_modes[_mbi];
+  ncoded=0;
+  stackptr=stack;
+  memset(&mo,0,sizeof(mo));
+  for(bi=0;bi<4;bi++){
+    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
+    frags[fragi].mb_mode=mb_mode;
+    if(oc_enc_block_transform_quantize(_enc,
+     _ps,fragi,oc_fr_cost1(_fr),&mo,&stackptr)){
+      oc_fr_code_block(_fr);
+      coded_fragis[ncoded_fragis++]=fragi;
+      ncoded++;
+    }
+    else oc_fr_skip_block(_fr);
+  }
+  if(_enc->state.frame_type!=OC_INTRA_FRAME){
+    if(ncoded>0&&!mo.dc_flag){
+      int cost;
+      /*Some individual blocks were worth coding.
+        See if that's still true when accounting for mode and MV overhead.*/
+      cost=mo.coded_ac_ssd+_enc->lambda*(mo.ac_bits
+       +oc_fr_cost4(&fr_checkpoint,_fr)+(_mode_overhead>>OC_BIT_SCALE));
+      if(mo.uncoded_ac_ssd<=cost){
+        /*Taking macroblock overhead into account, it is not worth coding this
+           MB.*/
+        oc_enc_tokenlog_rollback(_enc,stack,stackptr-stack);
+        *_fr=*&fr_checkpoint;
+        for(bi=0;bi<4;bi++){
+          fragi=sb_maps[_mbi>>2][_mbi&3][bi];
+          if(frags[fragi].coded)oc_enc_frag_uncode(_enc,0,fragi);
+          oc_fr_skip_block(_fr);
+        }
+        ncoded_fragis-=ncoded;
+        ncoded=0;
+      }
+    }
+    if(ncoded==0){
+      /*No luma blocks coded, mode is forced.*/
+      mb_modes[_mbi]=OC_MODE_INTER_NOMV;
+      return 0;
+    }
+    /*Assume that a 1MV with a single coded block is always cheaper than a 4MV
+       with a single coded block.
+      This may not be strictly true: a 4MV computes chroma MVs using (0,0) for
+       skipped blocks, while a 1MV does not.*/
+    else if(ncoded==1&&mb_mode==OC_MODE_INTER_MV_FOUR){
+      mb_modes[_mbi]=OC_MODE_INTER_MV;
+    }
+  }
+  _enc->state.ncoded_fragis[0]=ncoded_fragis;
+  return ncoded;
+}
+
+static void oc_enc_sb_transform_quantize_chroma(oc_enc_ctx *_enc,
+ oc_plane_state *_ps,int _sbi_start,int _sbi_end,oc_frag_state *_fr){
+  const oc_sb_map *sb_maps;
+  oc_sb_flags     *sb_flags;
+  ptrdiff_t       *coded_fragis;
+  ptrdiff_t        ncoded_fragis;
+  int              sbi;
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  sb_flags=_enc->state.sb_flags;
+  coded_fragis=_enc->state.coded_fragis+_enc->state.fplanes[_ps->pli].froffset;
+  ncoded_fragis=_enc->state.ncoded_fragis[_ps->pli];
+  for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
+    /*Worst case token stack usage for 1 fragment.*/
+    oc_token_checkpoint stack[64];
+    oc_rd_metric        mo;
+    int                 quadi;
+    int                 bi;
+    memset(&mo,0,sizeof(mo));
+    for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
+      ptrdiff_t fragi;
+      fragi=sb_maps[sbi][quadi][bi];
+      if(fragi>=0){
+        oc_token_checkpoint *stackptr;
+        stackptr=stack;
+        if(oc_enc_block_transform_quantize(_enc,
+         _ps,fragi,oc_fr_cost1(_fr),&mo,&stackptr)){
+          coded_fragis[ncoded_fragis++]=fragi;
+          oc_fr_code_block(_fr);
+        }
+        else oc_fr_skip_block(_fr);
+      }
+    }
+    oc_fr_finish_sb(_fr);
+    sb_flags[sbi].coded_fully=_fr->sb_full_last;
+    sb_flags[sbi].coded_partially=_fr->sb_partial_last;
+  }
+  _enc->state.ncoded_fragis[_ps->pli]=ncoded_fragis;
+}
+
+/*Mode decision is done by exhaustively examining all potential choices.
+  Obviously, doing the motion compensation, fDCT, tokenization, and then
+   counting the bits each token uses is computationally expensive.
+  Theora's EOB runs can also split the cost of these tokens across multiple
+   fragments, and naturally we don't know what the optimal choice of Huffman
+   codes will be until we know all the tokens we're going to encode in all the
+   fragments.
+  So we use a simple approach to estimating the bit cost and distortion of each
+   mode based upon the SATD value of the residual before coding.
+  The mathematics behind the technique are outlined by Kim \cite{Kim03}, but
+   the process (modified somewhat from that of the paper) is very simple.
+  We build a non-linear regression of the mappings from
+   (pre-transform+quantization) SATD to (post-transform+quantization) bits and
+   SSD for each qi.
+  A separate set of mappings is kept for each quantization type and color
+   plane.
+  The mappings are constructed by partitioning the SATD values into a small
+   number of bins (currently 24) and using a linear regression in each bin
+   (as opposed to the 0th-order regression used by Kim).
+  The bit counts and SSD measurements are obtained by examining actual encoded
+   frames, with appropriate lambda values and optimal Huffman codes selected.
+  EOB bits are assigned to the fragment that started the EOB run (as opposed to
+   dividing them among all the blocks in the run; though the latter approach
+   seems more theoretically correct, Monty's testing showed a small improvement
+   with the former, though that may have been merely statistical noise).
+
+  @ARTICLE{Kim03,
+    author="Hyun Mun Kim",
+    title="Adaptive Rate Control Using Nonlinear Regression",
+    journal="IEEE Transactions on Circuits and Systems for Video Technology",
+    volume=13,
+    number=5,
+    pages="432--439",
+    month=May,
+    year=2003
+  }*/
+
+static void oc_mode_dct_cost_accum(oc_mode_choice *_modec,
+ int _qi,int _pli,int _qti,int _satd){
+  unsigned rmse;
+  int      bin;
+  int      dx;
+  int      y0;
+  int      z0;
+  int      dy;
+  int      dz;
+  /*SATD metrics for chroma planes vary much less than luma, so we scale them
+     by 4 to distribute them into the mode decision bins more evenly.*/
+  _satd<<=_pli+1&2;
+  bin=OC_MINI(_satd>>OC_SAD_SHIFT,OC_SAD_BINS-2);
+  dx=_satd-(bin<<OC_SAD_SHIFT);
+  y0=OC_MODE_RD[_qi][_pli][_qti][bin].rate;
+  z0=OC_MODE_RD[_qi][_pli][_qti][bin].rmse;
+  dy=OC_MODE_RD[_qi][_pli][_qti][bin+1].rate-y0;
+  dz=OC_MODE_RD[_qi][_pli][_qti][bin+1].rmse-z0;
+  _modec->rate+=OC_MAXI(y0+(dy*dx>>OC_SAD_SHIFT),0);
+  rmse=OC_MAXI(z0+(dz*dx>>OC_SAD_SHIFT),0);
+  _modec->ssd+=rmse*rmse>>2*OC_RMSE_SCALE-OC_BIT_SCALE;
+}
+
+static void oc_mode_set_cost(oc_mode_choice *_modec,int _lambda){
+ _modec->cost=_modec->ssd+(_modec->rate+_modec->overhead)*_lambda;
+}
+
+static void oc_cost_intra(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ int _mbi,int _qi){
+  const unsigned char   *src;
+  const ptrdiff_t       *frag_buf_offs;
+  const oc_mb_map_plane *mb_map;
+  const unsigned char   *map_idxs;
+  int                    map_nidxs;
+  int                    mapii;
+  int                    mapi;
+  int                    ystride;
+  int                    pli;
+  int                    bi;
+  ptrdiff_t              fragi;
+  ptrdiff_t              frag_offs;
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  _modec->rate=_modec->ssd=0;
+  ystride=_enc->state.ref_ystride[0];
+  for(bi=0;bi<4;bi++){
+    fragi=mb_map[0][bi];
+    frag_offs=frag_buf_offs[fragi];
+    oc_mode_dct_cost_accum(_modec,_qi,0,0,
+     oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride));
+  }
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
+  ystride=_enc->state.ref_ystride[1];
+  for(mapii=4;mapii<map_nidxs;mapii++){
+    mapi=map_idxs[mapii];
+    pli=mapi>>2;
+    bi=mapi&3;
+    fragi=mb_map[pli][bi];
+    frag_offs=frag_buf_offs[fragi];
+    oc_mode_dct_cost_accum(_modec,_qi,pli,0,
+     oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride));
+  }
+  _modec->overhead=
+   oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTRA)<<OC_BIT_SCALE;
+  oc_mode_set_cost(_modec,_enc->lambda);
+}
+
+static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ int _mbi,int _mb_mode,const signed char *_mv,int _qi){
+  const unsigned char   *src;
+  const unsigned char   *ref;
+  int                    ystride;
+  const ptrdiff_t       *frag_buf_offs;
+  const oc_mb_map_plane *mb_map;
+  const unsigned char   *map_idxs;
+  int                    map_nidxs;
+  int                    mapii;
+  int                    mapi;
+  int                    mv_offs[2];
+  int                    dx;
+  int                    dy;
+  int                    pli;
+  int                    bi;
+  ptrdiff_t              fragi;
+  ptrdiff_t              frag_offs;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[
+   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE[_mb_mode]]];
+  ystride=_enc->state.ref_ystride[0];
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
+  dx=_mv[0];
+  dy=_mv[1];
+  _modec->rate=_modec->ssd=0;
+  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,dx,dy)>1){
+    for(bi=0;bi<4;bi++){
+      fragi=mb_map[0][bi];
+      frag_offs=frag_buf_offs[fragi];
+      oc_mode_dct_cost_accum(_modec,_qi,0,1,oc_enc_frag_satd2_thresh(_enc,
+       src+frag_offs,ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,
+       UINT_MAX));
+    }
+  }
+  else{
+    for(bi=0;bi<4;bi++){
+      fragi=mb_map[0][bi];
+      frag_offs=frag_buf_offs[fragi];
+      oc_mode_dct_cost_accum(_modec,_qi,0,1,
+       oc_enc_frag_satd_thresh(_enc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ystride,UINT_MAX));
+    }
+  }
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
+  ystride=_enc->state.ref_ystride[1];
+  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,1,dx,dy)>1){
+    for(mapii=4;mapii<map_nidxs;mapii++){
+      mapi=map_idxs[mapii];
+      pli=mapi>>2;
+      bi=mapi&3;
+      fragi=mb_map[pli][bi];
+      frag_offs=frag_buf_offs[fragi];
+      oc_mode_dct_cost_accum(_modec,_qi,pli,1,oc_enc_frag_satd2_thresh(_enc,
+       src+frag_offs,ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,
+       UINT_MAX));
+    }
+  }
+  else{
+    for(mapii=4;mapii<map_nidxs;mapii++){
+      mapi=map_idxs[mapii];
+      pli=mapi>>2;
+      bi=mapi&3;
+      fragi=mb_map[pli][bi];
+      frag_offs=frag_buf_offs[fragi];
+      oc_mode_dct_cost_accum(_modec,_qi,pli,1,oc_enc_frag_satd_thresh(_enc,
+       src+frag_offs,ref+frag_offs+mv_offs[0],ystride,UINT_MAX));
+    }
+  }
+  _modec->overhead=
+   oc_mode_scheme_chooser_cost(&_enc->chooser,_mb_mode)<<OC_BIT_SCALE;
+  oc_mode_set_cost(_modec,_enc->lambda);
+}
+/*Estimates the cost of coding macro block _mbi in _mb_mode with a (0,0) MV.*/
+static void oc_cost_inter_nomv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ int _mbi,int _mb_mode,int _qi){
+  static const oc_mv zero_mv={0};
+  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,zero_mv,_qi);
+}
+/*Costs an explicit-MV mode, MV bits included; returns the OC_MV_BITS[0] size.*/
+static int oc_cost_inter1mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ int _mbi,int _mb_mode,const signed char *_mv,int _qi){
+  int mv_bits;
+  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,_mv,_qi);
+  mv_bits=OC_MV_BITS[0][_mv[0]+31]+OC_MV_BITS[0][_mv[1]+31];
+  _modec->overhead+=(OC_MINI(_enc->mv_bits[0]+mv_bits,_enc->mv_bits[1]+12)
+   -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1]))<<OC_BIT_SCALE;
+  oc_mode_set_cost(_modec,_enc->lambda);
+  return mv_bits;
+}
+/*Costs 4MV mode for _mbi; returns the luma MVs' total OC_MV_BITS[0] size.*/
+static int oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,int _mbi,
+ oc_mv _mv[4],int _qi){
+  const oc_mb_map_plane *map;
+  const ptrdiff_t       *buf_offs;
+  const unsigned char   *idxs;
+  const unsigned char   *cur;
+  const unsigned char   *prev;
+  oc_mv                 *mvs;
+  oc_mv                  chroma_mvs[4];
+  ptrdiff_t              fragi;
+  ptrdiff_t              off;
+  unsigned               satd;
+  int                    offs[2];
+  int                    stride;
+  int                    nidxs;
+  int                    mv_bits;
+  int                    mvx;
+  int                    mvy;
+  int                    pli;
+  int                    bi;
+  int                    ii;
+  int                    idx;
+  mv_bits=0;
+  _modec->rate=_modec->ssd=0;
+  map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
+  mvs=_enc->state.frag_mvs;
+  buf_offs=_enc->state.frag_buf_offs;
+  cur=_enc->state.ref_frame_data[OC_FRAME_IO];
+  prev=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
+  stride=_enc->state.ref_ystride[0];
+  for(bi=0;bi<4;bi++){
+    mvx=_mv[bi][0];
+    mvy=_mv[bi][1];
+    mv_bits+=OC_MV_BITS[0][mvx+31]+OC_MV_BITS[0][mvy+31];
+    fragi=map[0][bi];
+    off=buf_offs[fragi];
+    /*Record these as the fragment's MVs for now; they get overwritten later
+       if some mode other than 4MV ends up being chosen.*/
+    mvs[fragi][0]=(signed char)mvx;
+    mvs[fragi][1]=(signed char)mvy;
+    if(oc_state_get_mv_offsets(&_enc->state,offs,0,mvx,mvy)<=1){
+      satd=oc_enc_frag_satd_thresh(_enc,cur+off,
+       prev+off+offs[0],stride,UINT_MAX);
+    }
+    else{
+      satd=oc_enc_frag_satd2_thresh(_enc,cur+off,
+       prev+off+offs[0],prev+off+offs[1],stride,UINT_MAX);
+    }
+    oc_mode_dct_cost_accum(_modec,_qi,0,1,satd);
+  }
+  (*OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt])(chroma_mvs,
+   (const oc_mv *)_mv);
+  idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  /*One stride serves both chroma planes: ref_ystride[1]==ref_ystride[2].*/
+  stride=_enc->state.ref_ystride[1];
+  for(ii=4;ii<nidxs;ii++){
+    idx=idxs[ii];
+    bi=idx&3;
+    pli=idx>>2;
+    mvx=chroma_mvs[bi][0];
+    mvy=chroma_mvs[bi][1];
+    fragi=map[pli][bi];
+    off=buf_offs[fragi];
+    /*TODO: The Cb and Cr planes repeat the same offset computation; sharing
+       the result would halve these calls, if that is worth the trouble.*/
+    if(oc_state_get_mv_offsets(&_enc->state,offs,pli,mvx,mvy)<=1){
+      satd=oc_enc_frag_satd_thresh(_enc,cur+off,
+       prev+off+offs[0],stride,UINT_MAX);
+    }
+    else{
+      satd=oc_enc_frag_satd2_thresh(_enc,cur+off,
+       prev+off+offs[0],prev+off+offs[1],stride,UINT_MAX);
+    }
+    oc_mode_dct_cost_accum(_modec,_qi,pli,1,satd);
+  }
+  _modec->overhead=(oc_mode_scheme_chooser_cost(&_enc->chooser,
+   OC_MODE_INTER_MV_FOUR)+OC_MINI(_enc->mv_bits[0]+mv_bits,_enc->mv_bits[1]+48)
+   -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1]))<<OC_BIT_SCALE;
+  oc_mode_set_cost(_modec,_enc->lambda);
+  return mv_bits;
+}
+/*Chooses MB modes/MVs and quantizes; returns 1 if interbits>intrabits.*/
+int oc_enc_analyze(oc_enc_ctx *_enc,int _frame_type,int _recode){
+  oc_set_chroma_mvs_func  set_chroma_mvs;
+  oc_mcenc_ctx            mcenc;
+  oc_plane_state          ps;
+  oc_frag_state           fr;
+  oc_mv                   last_mv;
+  oc_mv                   prior_mv;
+  ogg_int64_t             interbits;
+  ogg_int64_t             intrabits;
+  const unsigned char    *map_idxs;
+  int                     nmap_idxs;
+  unsigned               *coded_mbis;
+  unsigned               *uncoded_mbis;
+  size_t                  ncoded_mbis;
+  size_t                  nuncoded_mbis;
+  oc_sb_flags            *sb_flags;
+  signed char            *mb_modes;
+  const oc_mb_map        *mb_maps;
+  oc_mb_enc_info         *embs;
+  oc_fragment            *frags;
+  oc_mv                  *frag_mvs;
+  int                     qi;
+  unsigned                sbi;
+  unsigned                sbi_end;
+  int                     pli;
+  set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt];
+  _enc->state.frame_type=_frame_type;
+  if(!_recode)oc_mcenc_start(_enc,&mcenc);
+  oc_fr_state_init(&fr);
+  oc_mode_scheme_chooser_reset(&_enc->chooser);
+  oc_enc_tokenize_start(_enc);
+  oc_plane_state_plane_setup(_enc,&ps,0);
+  _enc->mv_bits[0]=_enc->mv_bits[1]=0;
+  interbits=intrabits=0;
+  last_mv[0]=last_mv[1]=prior_mv[0]=prior_mv[1]=0;
+  /*Choose MVs and MB modes and quantize and code luma.
+    Must be done in Hilbert order.*/
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  qi=_enc->state.qis[0];
+  coded_mbis=_enc->coded_mbis;
+  uncoded_mbis=coded_mbis+_enc->state.nmbs;
+  ncoded_mbis=0;
+  nuncoded_mbis=0;
+  _enc->state.ncoded_fragis[0]=0;
+  _enc->state.ncoded_fragis[1]=0;
+  _enc->state.ncoded_fragis[2]=0;
+  sb_flags=_enc->state.sb_flags;
+  mb_modes=_enc->state.mb_modes;
+  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
+  embs=_enc->mb_info;
+  frags=_enc->state.frags;
+  frag_mvs=_enc->state.frag_mvs;
+  sbi_end=_enc->state.fplanes[0].nsbs;
+  for(sbi=0;sbi<sbi_end;sbi++){
+    int quadi;
+    /*Mode addressing is through Y plane, always 4 MB per SB.*/
+    for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+      unsigned  mbi;
+      int       mb_mode;
+      int       dx;
+      int       dy;
+      int       mapii;
+      int       mapi;
+      int       bi;
+      ptrdiff_t fragi;
+      mbi=sbi<<2|quadi;
+      if(!_recode&&_enc->state.curframe_num>0){
+        /*Motion estimation:
+          We always do a basic 1MV search for all macroblocks, coded or not,
+           keyframe or not.*/
+        /*Move the motion vector predictors back a frame.*/
+        memmove(embs[mbi].analysis_mv+1,
+         embs[mbi].analysis_mv,2*sizeof(embs[mbi].analysis_mv[0]));
+        /*Search the last frame.*/
+        oc_mcenc_search(_enc,&mcenc,mbi,OC_FRAME_PREV);
+        /*Search the golden frame.*/
+        oc_mcenc_search(_enc,&mcenc,mbi,OC_FRAME_GOLD);
+      }
+      dx=dy=0;
+      if(_enc->state.frame_type==OC_INTRA_FRAME){
+        mb_modes[mbi]=mb_mode=OC_MODE_INTRA;
+        oc_enc_mb_transform_quantize_luma(_enc,&ps,mbi,0,&fr);
+      }
+      else{
+        oc_mode_choice modes[8];
+        int            mb_mv_bits_0;
+        int            mb_gmv_bits_0;
+        int            mb_4mv_bits_0;
+        int            mb_4mv_bits_1;
+        int            inter_mv_pref;
+        /*Find the block choice with the lowest estimated coding cost.
+          If a Cb or Cr block is coded but no Y' block from a macro block then
+           the mode MUST be OC_MODE_INTER_NOMV.
+          This is the default state to which the mode data structure is
+           initialised in encoder and decoder at the start of each frame.*/
+        /*Block coding cost is estimated from correlated SATD metrics.*/
+        /*At this point, all blocks that are in frame are still marked coded.*/
+        if(!_recode){
+          memcpy(embs[mbi].unref_mv,
+           embs[mbi].analysis_mv[0],sizeof(embs[mbi].unref_mv));
+          embs[mbi].refined=0;
+        }
+        oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,OC_MODE_INTER_NOMV,qi);
+        oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,qi);
+        intrabits+=modes[OC_MODE_INTRA].rate;
+        mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
+         OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],qi);
+        oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
+         OC_MODE_INTER_MV_LAST,last_mv,qi);
+        oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
+         OC_MODE_INTER_MV_LAST2,prior_mv,qi);
+        oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
+         OC_MODE_GOLDEN_NOMV,qi);
+        mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
+         OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],qi);
+        mb_4mv_bits_0=oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
+         embs[mbi].block_mv,qi);
+        mb_4mv_bits_1=48;  /*NOTE(review): not read again below.*/
+        /*The explicit MV modes (2,6,7) have not yet gone through halfpel
+           refinement.
+          We choose the explicit MV mode that's already furthest ahead on bits
+           and refine only that one.
+          We have to be careful to remember which ones we've refined so that
+           we don't refine it again if we re-encode this frame.*/
+        inter_mv_pref=_enc->lambda*3<<OC_BIT_SCALE;
+        if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&&
+         modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
+          if(!(embs[mbi].refined&0x80)){  /*Flag looks like 1<<mode; confirm.*/
+            oc_mcenc_refine4mv(_enc,mbi);
+            embs[mbi].refined|=0x80;
+          }
+          mb_4mv_bits_0=oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
+           embs[mbi].ref_mv,qi);
+        }
+        else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
+         modes[OC_MODE_INTER_MV].cost){
+          if(!(embs[mbi].refined&0x40)){
+            oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_GOLD);
+            embs[mbi].refined|=0x40;
+          }
+          mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
+           OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],qi);
+        }
+        if(!(embs[mbi].refined&0x04)){
+          oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
+          embs[mbi].refined|=0x04;
+        }
+        mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
+         OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],qi);
+        /*Finally, pick the mode with the cheapest estimated bit cost.*/
+        mb_mode=0;
+        if(modes[1].cost<modes[0].cost)mb_mode=1;
+        if(modes[3].cost<modes[mb_mode].cost)mb_mode=3;
+        if(modes[4].cost<modes[mb_mode].cost)mb_mode=4;
+        if(modes[5].cost<modes[mb_mode].cost)mb_mode=5;
+        if(modes[6].cost<modes[mb_mode].cost)mb_mode=6;
+        if(modes[7].cost<modes[mb_mode].cost)mb_mode=7;
+        /*We prefer OC_MODE_INTER_MV, but not over LAST and LAST2.*/
+        if(mb_mode==OC_MODE_INTER_MV_LAST||mb_mode==OC_MODE_INTER_MV_LAST2){
+          inter_mv_pref=0;
+        }
+        if(modes[2].cost<modes[mb_mode].cost+inter_mv_pref)mb_mode=2;
+        mb_modes[mbi]=mb_mode;
+        /*Propagate the MVs to the luma blocks.*/
+        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
+          switch(mb_mode){
+            case OC_MODE_INTER_MV:{
+              dx=embs[mbi].analysis_mv[0][OC_FRAME_PREV][0];
+              dy=embs[mbi].analysis_mv[0][OC_FRAME_PREV][1];
+            }break;
+            case OC_MODE_INTER_MV_LAST:{
+              dx=last_mv[0];
+              dy=last_mv[1];
+            }break;
+            case OC_MODE_INTER_MV_LAST2:{
+              dx=prior_mv[0];
+              dy=prior_mv[1];
+            }break;
+            case OC_MODE_GOLDEN_MV:{
+              dx=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][0];
+              dy=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][1];
+            }break;
+          }
+          for(bi=0;bi<4;bi++){
+            fragi=mb_maps[mbi][0][bi];
+            frag_mvs[fragi][0]=(signed char)dx;
+            frag_mvs[fragi][1]=(signed char)dy;
+          }
+        }
+        if(oc_enc_mb_transform_quantize_luma(_enc,&ps,mbi,
+         modes[mb_mode].overhead,&fr)>0){
+          int orig_mb_mode;
+          orig_mb_mode=mb_mode;
+          mb_mode=mb_modes[mbi];
+          switch(mb_mode){
+            case OC_MODE_INTER_MV:{
+              memcpy(prior_mv,last_mv,sizeof(prior_mv));
+              /*If we're backing out from 4MV, find the MV we're actually
+                 using.*/
+              if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){
+                for(bi=0;;bi++){  /*Assumes a coded luma block exists.*/
+                  fragi=mb_maps[mbi][0][bi];
+                  if(frags[fragi].coded){
+                    memcpy(last_mv,frag_mvs[fragi],sizeof(last_mv));
+                    dx=frag_mvs[fragi][0];
+                    dy=frag_mvs[fragi][1];
+                    break;
+                  }
+                }
+                mb_mv_bits_0=OC_MV_BITS[0][dx+31]+OC_MV_BITS[0][dy+31];
+              }
+              /*Otherwise we used the original analysis MV.*/
+              else{
+                memcpy(last_mv,
+                 embs[mbi].analysis_mv[0][OC_FRAME_PREV],sizeof(last_mv));
+              }
+              _enc->mv_bits[0]+=mb_mv_bits_0;
+              _enc->mv_bits[1]+=12;
+            }break;
+            case OC_MODE_INTER_MV_LAST2:{
+              oc_mv tmp_mv;
+              memcpy(tmp_mv,prior_mv,sizeof(tmp_mv));
+              memcpy(prior_mv,last_mv,sizeof(prior_mv));
+              memcpy(last_mv,tmp_mv,sizeof(last_mv));
+            }break;
+            case OC_MODE_GOLDEN_MV:{
+              _enc->mv_bits[0]+=mb_gmv_bits_0;
+              _enc->mv_bits[1]+=12;
+            }break;
+            case OC_MODE_INTER_MV_FOUR:{
+              oc_mv lbmvs[4];
+              oc_mv cbmvs[4];
+              memcpy(prior_mv,last_mv,sizeof(prior_mv));
+              for(bi=0;bi<4;bi++){
+                fragi=mb_maps[mbi][0][bi];
+                if(frags[fragi].coded){
+                  memcpy(last_mv,frag_mvs[fragi],sizeof(last_mv));
+                  memcpy(lbmvs[bi],frag_mvs[fragi],sizeof(lbmvs[bi]));
+                  _enc->mv_bits[0]+=OC_MV_BITS[0][frag_mvs[fragi][0]+31]
+                   +OC_MV_BITS[0][frag_mvs[fragi][1]+31];
+                  _enc->mv_bits[1]+=12;
+                }
+                /*Replace the block MVs for not-coded blocks with (0,0).*/
+                else memset(lbmvs[bi],0,sizeof(lbmvs[bi]));
+              }
+              (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
+              for(mapii=4;mapii<nmap_idxs;mapii++){
+                mapi=map_idxs[mapii];
+                pli=mapi>>2;
+                bi=mapi&3;
+                fragi=mb_maps[mbi][pli][bi];
+                frags[fragi].mb_mode=mb_mode;
+                memcpy(frag_mvs[fragi],cbmvs[bi],sizeof(frag_mvs[fragi]));
+              }
+            }break;
+          }
+          coded_mbis[ncoded_mbis++]=mbi;
+          oc_mode_scheme_chooser_update(&_enc->chooser,mb_mode);
+          interbits+=modes[mb_mode].rate+modes[mb_mode].overhead;
+        }
+        else{
+          *(uncoded_mbis-++nuncoded_mbis)=mbi;  /*List grows downward.*/
+          mb_mode=OC_MODE_INTER_NOMV;
+          dx=dy=0;
+        }
+      }
+      /*Propagate final MB mode and MVs to the chroma blocks.
+        This has already been done for 4MV mode, since it requires individual
+         block motion vectors.*/
+      if(mb_mode!=OC_MODE_INTER_MV_FOUR){
+        for(mapii=4;mapii<nmap_idxs;mapii++){
+          mapi=map_idxs[mapii];
+          pli=mapi>>2;
+          bi=mapi&3;
+          fragi=mb_maps[mbi][pli][bi];
+          frags[fragi].mb_mode=mb_mode;
+          frag_mvs[fragi][0]=(signed char)dx;
+          frag_mvs[fragi][1]=(signed char)dy;
+        }
+      }
+    }
+    oc_fr_finish_sb(&fr);
+    sb_flags[sbi].coded_fully=fr.sb_full_last;
+    sb_flags[sbi].coded_partially=fr.sb_partial_last;
+  }
+  /*Code Cb plane.*/
+  oc_plane_state_plane_setup(_enc,&ps,1);
+  sbi=sbi_end;
+  sbi_end=sbi+_enc->state.fplanes[1].nsbs;
+  oc_enc_sb_transform_quantize_chroma(_enc,&ps,sbi,sbi_end,&fr);
+  /*Code Cr plane.*/
+  oc_plane_state_plane_setup(_enc,&ps,2);
+  sbi=sbi_end;
+  sbi_end=sbi+_enc->state.fplanes[2].nsbs;
+  oc_enc_sb_transform_quantize_chroma(_enc,&ps,sbi,sbi_end,&fr);
+  if(_enc->state.frame_type!=OC_INTRA_FRAME){
+    if(interbits>intrabits)return 1;  /*Early out before overheads.*/
+    /*Finish adding flagging overhead costs to inter bit counts.*/
+    oc_fr_flush(&fr);
+    interbits+=fr.bits<<OC_BIT_SCALE;
+    interbits+=OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
+    interbits+=
+     _enc->chooser.scheme_bits[_enc->chooser.scheme_list[0]]<<OC_BIT_SCALE;
+    if(interbits>intrabits)return 1;
+  }
+  _enc->ncoded_mbis=ncoded_mbis;
+  /*Compact the coded fragment list.*/
+  {
+    ptrdiff_t ncoded_fragis;
+    ncoded_fragis=_enc->state.ncoded_fragis[0];
+    for(pli=1;pli<3;pli++){
+      memmove(_enc->state.coded_fragis+ncoded_fragis,
+       _enc->state.coded_fragis+_enc->state.fplanes[pli].froffset,
+       _enc->state.ncoded_fragis[pli]*sizeof(*_enc->state.coded_fragis));
+      ncoded_fragis+=_enc->state.ncoded_fragis[pli];
+    }
+    _enc->state.ntotal_coded_fragis=ncoded_fragis;
+  }
+  return 0;
+}
+
+#if defined(OC_COLLECT_METRICS)
+# include <stdio.h>
+# include <math.h>
+/*Minimum total fragment weight per fit window, and the SATD-to-bin mapping.*/
+# define OC_ZWEIGHT   (0.25)
+# define OC_BIN(_satd) (OC_MINI((_satd)>>OC_SAD_SHIFT,OC_SAD_BINS-1))
+/*Folds one weighted (SATD,rate,RMSE) sample into a set of running statistics.*/
+static void oc_mode_metrics_add(oc_mode_metrics *_metrics,
+ double _w,int _satd,int _rate,double _rmse){
+  double bits;
+  /*Rates are stored unscaled so old data survives a change of OC_BIT_SCALE.*/
+  bits=ldexp(_rate,-OC_BIT_SCALE);
+  if(_metrics->fragw>0){
+    double dev_satd;
+    double dev_rate;
+    double dev_rmse;
+    double w;
+    /*Deviations of this sample from the means accumulated so far.*/
+    dev_rmse=_rmse-_metrics->rmse/_metrics->fragw;
+    dev_rate=bits-_metrics->rate/_metrics->fragw;
+    dev_satd=_satd-_metrics->satd/_metrics->fragw;
+    w=_metrics->fragw*_w/(_metrics->fragw+_w);
+    _metrics->rmse2+=dev_rmse*dev_rmse*w;
+    _metrics->rate2+=dev_rate*dev_rate*w;
+    _metrics->satd2+=dev_satd*dev_satd*w;
+    _metrics->satdrmse+=dev_satd*dev_rmse*w;
+    _metrics->satdrate+=dev_satd*dev_rate*w;
+  }
+  _metrics->rmse+=_rmse*_w;
+  _metrics->rate+=bits*_w;
+  _metrics->satd+=_satd*_w;
+  _metrics->fragw+=_w;
+}
+/*Combines the _n sets of statistics in _src into a single set in _dst.*/
+static void oc_mode_metrics_merge(oc_mode_metrics *_dst,
+ const oc_mode_metrics *_src,int _n){
+  int si;
+  /*Skip leading sets that carry no weight; all empty yields a zeroed _dst.*/
+  for(si=0;si<_n&&_src[si].fragw<=0;)si++;
+  if(si>=_n){
+    memset(_dst,0,sizeof(*_dst));
+    return;
+  }
+  memcpy(_dst,_src+si,sizeof(*_dst));
+  /*Fold each later weighted set into the one we started from.*/
+  for(si++;si<_n;si++)if(_src[si].fragw>0){
+    double wd;
+    double ws;
+    double dsatd;
+    double drate;
+    double drmse;
+    double w;
+    wd=_dst->fragw;
+    ws=_src[si].fragw;
+    dsatd=_src[si].satd/ws-_dst->satd/wd;
+    drate=_src[si].rate/ws-_dst->rate/wd;
+    drmse=_src[si].rmse/ws-_dst->rmse/wd;
+    w=wd*ws/(wd+ws);
+    _dst->satd2+=_src[si].satd2+dsatd*dsatd*w;
+    _dst->satdrate+=_src[si].satdrate+dsatd*drate*w;
+    _dst->rate2+=_src[si].rate2+drate*drate*w;
+    _dst->satdrmse+=_src[si].satdrmse+dsatd*drmse*w;
+    _dst->rmse2+=_src[si].rmse2+drmse*drmse*w;
+    _dst->fragw+=_src[si].fragw;
+    _dst->satd+=_src[si].satd;
+    _dst->rate+=_src[si].rate;
+    _dst->rmse+=_src[si].rmse;
+  }
+}
+/*Rebuilds OC_MODE_RD[_qi] from the raw statistics in OC_MODE_METRICS[_qi].*/
+static void oc_enc_mode_metrics_update(oc_enc_ctx *_enc,int _qi){
+  int pli;
+  int qti;
+  oc_restore_fpu(&_enc->state);
+  /*Compile collected SATD/rate/RMSE metrics into a form that's immediately
+     useful for mode decision.*/
+  /*Convert raw collected data into cleaned up sample points.*/
+  for(pli=0;pli<3;pli++){
+    for(qti=0;qti<2;qti++){
+      double fragw;
+      int    bin0;
+      int    bin1;
+      int    bin;
+      fragw=0;
+      bin0=bin1=0;
+      for(bin=0;bin<OC_SAD_BINS;bin++){
+        oc_mode_metrics metrics;
+        OC_MODE_RD[_qi][pli][qti][bin].rate=0;
+        OC_MODE_RD[_qi][pli][qti][bin].rmse=0;
+        /*Find some points on either side of the current bin.*/
+        while((bin1<bin+1||fragw<OC_ZWEIGHT)&&bin1<OC_SAD_BINS-1){
+          fragw+=OC_MODE_METRICS[_qi][pli][qti][bin1++].fragw;
+        }
+        while(bin0+1<bin&&bin0+1<bin1&&
+         fragw-OC_MODE_METRICS[_qi][pli][qti][bin0].fragw>=OC_ZWEIGHT){
+          fragw-=OC_MODE_METRICS[_qi][pli][qti][bin0++].fragw;
+        }
+        /*Merge statistics and fit lines.*/
+        oc_mode_metrics_merge(&metrics,
+         OC_MODE_METRICS[_qi][pli][qti]+bin0,bin1-bin0);
+        if(metrics.fragw>0&&metrics.satd2>0){  /*Both are divisors below.*/
+          double a;
+          double b;
+          double msatd;
+          double mrate;
+          double mrmse;
+          double rate;
+          double rmse;
+          msatd=metrics.satd/metrics.fragw;
+          mrate=metrics.rate/metrics.fragw;
+          mrmse=metrics.rmse/metrics.fragw;
+          /*Compute the points on these lines corresponding to the actual bin
+             value.*/
+          b=metrics.satdrate/metrics.satd2;  /*Slope.*/
+          a=mrate-b*msatd;  /*Intercept.*/
+          rate=ldexp(a+b*(bin<<OC_SAD_SHIFT),OC_BIT_SCALE);
+          OC_MODE_RD[_qi][pli][qti][bin].rate=
+           (ogg_int16_t)OC_CLAMPI(-32768,(int)(rate+0.5),32767);
+          b=metrics.satdrmse/metrics.satd2;
+          a=mrmse-b*msatd;
+          rmse=ldexp(a+b*(bin<<OC_SAD_SHIFT),OC_RMSE_SCALE);
+          OC_MODE_RD[_qi][pli][qti][bin].rmse=
+           (ogg_int16_t)OC_CLAMPI(-32768,(int)(rmse+0.5),32767);
+        }
+      }
+    }
+  }
+}
+
+/*Adds the bit cost of every token in one token group to its fragment's total.*/
+static void ModeMetricsGroup(oc_enc_ctx *_enc, int group, int huffY, int huffC, int eobcounts[64], int *actual_bits){
+  int       *stack;
+  ptrdiff_t *tfi;
+  int        ty;
+  int        tn;
+  int        ti;
+  stack=_enc->dct_eob_fi_stack[group];
+  tfi=_enc->dct_token_frag[group];
+  ty=_enc->dct_token_ycount[group];
+  tn=_enc->dct_token_count[group];
+  for(ti=0;ti<tn;ti++){
+    ptrdiff_t fragi;
+    int       token;
+    int       bits;
+    token=_enc->dct_token[group][ti];
+    bits=_enc->huff_codes[ti<ty?huffY:huffC][token].nbits
+     +OC_DCT_TOKEN_EXTRA_BITS[token];
+    /*Not an EOB run; this token belongs to a single fragment.*/
+    if(token>=OC_NDCT_EOB_TOKEN_MAX)fragi=tfi[ti];
+    else{
+      int run;
+      /*The run is charged to the fragment at the current EOB stack position.*/
+      run=-oc_dct_token_skip(token,_enc->dct_token_eb[group][ti]);
+      fragi=stack[eobcounts[group]];
+      /*Tokens follow EOB so it must be entirely contained within this
+         plane/group.*/
+      if(ti+1<tn)eobcounts[group]+=run;
+      /*EOB is the last token in this plane/group, so it may span into the
+         next plane/group.*/
+      else{
+        int n;
+        n=_enc->dct_eob_fi_count[group];
+        while(run){
+          int rem;
+          rem=n-eobcounts[group];
+          if(rem>run)rem=run;
+          eobcounts[group]+=rem;
+          run-=rem;
+          if(run){
+            group++;
+            n=_enc->dct_eob_fi_count[group];
+            stack=_enc->dct_eob_fi_stack[group];
+          }
+        }
+      }
+    }
+    actual_bits[fragi]+=bits<<OC_BIT_SCALE;
+  }
+}
+/*Adds the current frame's per-fragment bit counts to the mode statistics.*/
+/*TODO: This code has bitrotted and needs to be re-written.*/
+void ModeMetrics(oc_enc_ctx *_enc){
+  int actual_bits[_enc->frag_total];
+  int          eobcounts[64];
+  int          huff[4];
+  oc_fragment *frags;
+  int         *sp;
+  int         *mp;
+  double       fragw;
+  int          pli;
+  int          qti;
+  int          qi;
+  int          zzi;
+  ptrdiff_t    fragi;
+  qti=_enc->state.frame_type;
+  frags=_enc->state.frags;
+  sp=_enc->frag_satd;
+  mp=_enc->frag_mbi;
+  oc_restore_fpu(&_enc->state);
+  memset(actual_bits,0,sizeof(actual_bits));
+  memset(eobcounts,0,sizeof(eobcounts));
+  huff[0]=_enc->huff_idxs[qti][0][0];
+  huff[1]=_enc->huff_idxs[qti][0][1];
+  huff[2]=_enc->huff_idxs[qti][1][0];
+  huff[3]=_enc->huff_idxs[qti][1][1];
+  memset(_enc->dist_dist,0,sizeof(_enc->dist_dist));
+  memset(_enc->dist_bits,0,sizeof(_enc->dist_bits));
+  if(!oc_has_mode_metrics){
+    FILE *fmetrics;
+    memset(OC_MODE_METRICS,0,sizeof(OC_MODE_METRICS));
+    fmetrics=fopen("modedec.stats","rb");
+    if(fmetrics!=NULL){
+      /*A short read would leave partial statistics; start empty instead.*/
+      if(fread(OC_MODE_METRICS,sizeof(OC_MODE_METRICS),1,fmetrics)<1)
+        memset(OC_MODE_METRICS,0,sizeof(OC_MODE_METRICS));
+      fclose(fmetrics);
+    }
+    for(qi=0;qi<64;qi++)oc_enc_mode_metrics_update(_enc,qi);
+    oc_has_mode_metrics=1;
+  }
+  /*Count bits for tokens.*/
+  ModeMetricsGroup(_enc, 0, huff[0], huff[1], eobcounts, actual_bits);
+  for(zzi=1;zzi<6;zzi++)
+    ModeMetricsGroup(_enc, zzi,  huff[2]+16, huff[3]+16, eobcounts, actual_bits);
+  for(;zzi<15;zzi++)
+    ModeMetricsGroup(_enc, zzi, huff[2]+32, huff[3]+32, eobcounts, actual_bits);
+  for(;zzi<28;zzi++)
+    ModeMetricsGroup(_enc, zzi, huff[2]+48, huff[3]+48, eobcounts, actual_bits);
+  for(;zzi<64;zzi++)
+    ModeMetricsGroup(_enc, zzi, huff[2]+64, huff[3]+64, eobcounts, actual_bits);
+  /*Accumulate, weighted by inverse frame size so HD does not dominate.*/
+  fragw=1.0/_enc->state.nfrags;
+  qi=_enc->state.qis[0];
+  for(pli=0;pli<3;pli++){
+    ptrdiff_t fragi_end;
+    fragi=_enc->state.fplanes[pli].froffset;
+    fragi_end=fragi+_enc->state.fplanes[pli].nfrags;
+    for(;fragi<fragi_end;fragi++)if(frags[fragi].coded){
+      int mbi;
+      int mb_mode;
+      int bin;
+      mbi=mp[fragi];
+      mb_mode=_enc->state.mb_modes[mbi];
+      bin=OC_BIN(sp[fragi]);
+      oc_mode_metrics_add(OC_MODE_METRICS[qi][pli][mb_mode!=OC_MODE_INTRA]+bin,
+       fragw,sp[fragi],actual_bits[fragi],sqrt(_enc->frag_ssd[fragi]));
+    }
+  }
+  /*Update global SAD/rate estimation matrix.*/
+  oc_enc_mode_metrics_update(_enc,qi);
+}
+/*Saves OC_MODE_METRICS to modedec.stats; prints OC_MODE_RD as a C header.*/
+void oc_enc_mode_metrics_dump(oc_enc_ctx *_enc){
+  FILE *fmetrics;
+  int   qi;
+  /*Generate sample points for complete list of QI values.*/
+  for(qi=0;qi<64;qi++)oc_enc_mode_metrics_update(_enc,qi);
+  fmetrics=fopen("modedec.stats","wb");
+  if(fmetrics!=NULL){  /*An open failure is silently ignored.*/
+    fwrite(OC_MODE_METRICS,sizeof(OC_MODE_METRICS),1,fmetrics);
+    fclose(fmetrics);
+  }
+  fprintf(stdout,
+   "/*File generated by libtheora with OC_COLLECT_METRICS"
+   " defined at compile time.*/\n"
+   "#if !defined(_modedec_H)\n"
+   "# define _modedec_H (1)\n"
+   "\n"
+   "\n"
+   "\n"
+   "# if defined(OC_COLLECT_METRICS)\n"
+   "typedef struct oc_mode_metrics oc_mode_metrics;\n"
+   "# endif\n"
+   "typedef struct oc_mode_rd      oc_mode_rd;\n"
+   "\n"
+   "\n"
+   "\n"
+   "/*The number of extra bits of precision at which to store rate"
+   " metrics.*/\n"
+   "# define OC_BIT_SCALE  (%i)\n"
+   "/*The number of extra bits of precision at which to store RMSE metrics.\n"
+   "  This must be at least half OC_BIT_SCALE (rounded up).*/\n"
+   "# define OC_RMSE_SCALE (%i)\n"
+   "/*The number of bins to partition statistics into.*/\n"
+   "# define OC_SAD_BINS   (%i)\n"
+   "/*The number of bits of precision to drop"
+   " from SAD scores to assign them to a\n"
+   "   bin.*/\n"
+   "# define OC_SAD_SHIFT  (%i)\n"
+   "\n"
+   "\n"
+   "\n"
+   "# if defined(OC_COLLECT_METRICS)\n"
+   "struct oc_mode_metrics{\n"
+   "  double fragw;\n"
+   "  double satd;\n"
+   "  double rate;\n"
+   "  double rmse;\n"
+   "  double satd2;\n"
+   "  double satdrate;\n"
+   "  double rate2;\n"
+   "  double satdrmse;\n"
+   "  double rmse2;\n"
+   "};\n"
+   "\n"
+   "\n"
+   "int             oc_has_mode_metrics;\n"
+   "oc_mode_metrics OC_MODE_METRICS[64][3][2][OC_SAD_BINS];\n"
+   "# endif\n"
+   "\n"
+   "\n"
+   "\n"
+   "struct oc_mode_rd{\n"
+   "  ogg_int16_t rate;\n"
+   "  ogg_int16_t rmse;\n"
+   "};\n"
+   "\n"
+   "\n"
+   "# if !defined(OC_COLLECT_METRICS)\n"
+   "static const\n"
+   "# endif\n"
+   "oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={\n",
+   OC_BIT_SCALE,OC_RMSE_SCALE,OC_SAD_BINS,OC_SAD_SHIFT);
+  for(qi=0;qi<64;qi++){
+    int pli;
+    fprintf(stdout,"  {\n");
+    for(pli=0;pli<3;pli++){
+      int qti;
+      fprintf(stdout,"    {\n");
+      for(qti=0;qti<2;qti++){
+        int bin;
+        static const char *pl_names[3]={"Y'","Cb","Cr"};
+        static const char *qti_names[2]={"INTRA","INTER"};
+        fprintf(stdout,"      /*%s  qi=%i  %s*/\n",
+         pl_names[pli],qi,qti_names[qti]);
+        fprintf(stdout,"      {\n");
+        fprintf(stdout,"        ");
+        for(bin=0;bin<OC_SAD_BINS;bin++){
+          if(bin&&!(bin&0x3))fprintf(stdout,"\n        ");  /*4 per line.*/
+          fprintf(stdout,"{%5i,%5i}",
+           OC_MODE_RD[qi][pli][qti][bin].rate,
+           OC_MODE_RD[qi][pli][qti][bin].rmse);
+          if(bin+1<OC_SAD_BINS)fprintf(stdout,",");
+        }
+        fprintf(stdout,"\n      }");
+        if(qti<1)fprintf(stdout,",");
+        fprintf(stdout,"\n");
+      }
+      fprintf(stdout,"    }");
+      if(pli<2)fprintf(stdout,",");
+      fprintf(stdout,"\n");
+    }
+    fprintf(stdout,"  }");
+    if(qi<63)fprintf(stdout,",");
+    fprintf(stdout,"\n");
+  }
+  fprintf(stdout,
+   "};\n"
+   "\n"
+   "#endif\n");
+}
+#endif

Deleted: branches/theora-thusnelda/lib/enc/codec_internal.h
===================================================================
--- branches/theora-thusnelda/lib/enc/codec_internal.h	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/codec_internal.h	2009-06-13 16:04:06 UTC (rev 16102)
@@ -1,530 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id$
-
- ********************************************************************/
-
-#ifndef ENCODER_INTERNAL_H
-#define ENCODER_INTERNAL_H
-
-#ifdef HAVE_CONFIG_H
-# include "config.h"
-#endif
-
-/*#define OC_COLLECT_METRICS*/
-
-#include "theora/theora.h"
-#include "../internal.h"
-#include "encoder_huffman.h"
-#include "huffenc.h"
-#include "../dec/ocintrin.h"
-
-
-
-typedef struct oc_enc_opt_vtable oc_enc_opt_vtable;
-typedef struct CP_INSTANCE       CP_INSTANCE;
-
-
-struct oc_enc_opt_vtable{
-  unsigned (*frag_sad)(const unsigned char *_src,
-   const unsigned char *_ref,int _ystride);
-  unsigned (*frag_sad_thresh)(const unsigned char *_src,
-   const unsigned char *_ref,int _ystride,unsigned _thresh);
-  unsigned (*frag_sad2_thresh)(const unsigned char *_src,
-   const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
-   unsigned _thresh);
-  unsigned (*frag_satd_thresh)(const unsigned char *_src,
-   const unsigned char *_ref,int _ystride,unsigned _thresh);
-  unsigned (*frag_satd2_thresh)(const unsigned char *_src,
-   const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
-   unsigned _thresh);
-  unsigned (*frag_intra_satd)(const unsigned char *_src,int _ystride);
-  void     (*frag_sub)(ogg_int16_t _diff[64],const unsigned char *_src,
-   const unsigned char *_ref,int _ystride);
-  void     (*frag_sub_128)(ogg_int16_t _diff[64],
-   const unsigned char *_src,int _ystride);
-  void     (*frag_copy)(unsigned char *_dst,
-   const unsigned char *_src,int _ystride);
-  void     (*frag_copy2)(unsigned char *_dst,
-   const unsigned char *_src1,const unsigned char *_src2,int _ystride);
-  void     (*frag_recon_intra)(unsigned char *_dst,int _ystride,
-   const ogg_int16_t _residue[64]);
-  void     (*frag_recon_inter)(unsigned char *_dst,
-   const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
-  void     (*fdct8x8)(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
-  void     (*dequant_idct8x8)(ogg_int16_t _y[64],const ogg_int16_t _x[64],
-   int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
-   const ogg_uint16_t _ac_quant[64]);
-  void     (*enc_loop_filter)(CP_INSTANCE *cpi,int _flimit);
-  void     (*restore_fpu)(void);
-};
-
-
-void oc_enc_vtable_init(CP_INSTANCE *_cpi);
-
-/* Baseline dct height and width. */
-#define BLOCK_HEIGHT_WIDTH          8
-#define HFRAGPIXELS                 8
-#define VFRAGPIXELS                 8
-
-/* Baseline dct block size */
-#define BLOCK_SIZE              (BLOCK_HEIGHT_WIDTH * BLOCK_HEIGHT_WIDTH)
-
-/* Border is for unrestricted mv's */
-#define UMV_BORDER              16
-#define STRIDE_EXTRA            (UMV_BORDER * 2)
-
-#define KEY_FRAME               0
-#define DELTA_FRAME             1
-
-#define MAX_MODES               8
-#define MODE_BITS               3
-#define MODE_METHODS            8
-#define MODE_METHOD_BITS        3
-
-#define MAX_MV_EXTENT 31  /* Max search distance in half pixel increments */
-
-/** block coding modes */
-typedef enum{
-  CODE_INTER_NO_MV        = 0x0, /* INTER prediction, (0,0) motion
-                                    vector implied.  */
-    CODE_INTRA            = 0x1, /* INTRA i.e. no prediction. */
-    CODE_INTER_PLUS_MV    = 0x2, /* INTER prediction, non zero motion
-                                    vector. */
-    CODE_INTER_LAST_MV    = 0x3, /* Use Last Motion vector */
-    CODE_INTER_PRIOR_LAST = 0x4, /* Prior last motion vector */
-    CODE_USING_GOLDEN     = 0x5, /* 'Golden frame' prediction (no MV). */
-    CODE_GOLDEN_MV        = 0x6, /* 'Golden frame' prediction plus MV. */
-    CODE_INTER_FOURMV     = 0x7  /* Inter prediction 4MV per macro block. */
-} coding_mode_t;
-
-/** Huffman table entry */
-typedef struct HUFF_ENTRY {
-  struct HUFF_ENTRY *ZeroChild;
-  struct HUFF_ENTRY *OneChild;
-  struct HUFF_ENTRY *Previous;
-  struct HUFF_ENTRY *Next;
-  ogg_int32_t        Value;
-  ogg_uint32_t       Frequency;
-} HUFF_ENTRY;
-
-typedef struct mc_state mc_state;
-
-struct mc_state{
-  int                candidates[12][2];
-  int                setb0;
-  int                ncandidates;
-  ogg_int32_t        mvapw1[2];
-  ogg_int32_t        mvapw2[2];
-};
-
-typedef struct macroblock {
-  /* the blocks comprising this macroblock */
-  int Ryuv[3][4]; /* [Y,U,V][raster order] */
-  int Hyuv[3][4]; /* [Y,U,V][hilbert order] */
-  int ysb;
-  int usb;
-  int vsb;
-
-  int cneighbors[4];
-  int ncneighbors;
-  int pneighbors[4];
-  int npneighbors; 
-
-  coding_mode_t mode;
-
-  oc_mv block_mv[4];
-  oc_mv ref_mv[4];
-  /* per-block final motion vectors */
-  /* raster order */
-  oc_mv mv[4];
-  /*Per-block final chroma motion vectors.*/
-  oc_mv cbmvs[4];
-
-  /* Motion vectors for a macro block for the current frame and the
-     previous two frames.
-
-     Each is a set of 2 vectors against the previous frame and against
-     the golden frame, which can be used to judge constant velocity
-     and constant acceleration.
-
-     Uninitialized MVs are (0,0).*/
-  oc_mv analysis_mv[3][2]; /* [cur,prev,prev2][frame,golden] */
-  oc_mv unref_mv[2];
-  /*Minimum motion estimation error from the analysis stage.*/
-  int    aerror;
-  int    gerror;
-  int    asatd;
-  int    gsatd;
-  int    block_satd[4];
-
-  char coded;
-  char refined;
-} macroblock_t;
-
-#define SB_MB_BLFRAG(sb,mbnum) ((sb).f[ ((mbnum)<2? ((mbnum)==0?0:4) : ((mbnum)==2?8:14)) ])
-typedef struct superblock {
-  int f[16]; // hilbert order
-  int m[16]; // hilbert order: only 4 for luma, but 16 for U/V (to match f)
-} superblock_t;
-
-typedef ogg_int16_t    quant_table[64];
-typedef quant_table    quant_tables[64]; /* [zigzag][qi] */
-
-#include "enquant.h"
-
-typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
-
-struct oc_mode_scheme_chooser{
-  /*Pointers to the lists containing the index of each mode in the mode
-    alphabet used by each scheme.
-    The first entry points to the dynamic scheme0_ranks, while the remaining
-    7 point to the constant entries stored in OC_MODE_SCHEMES.*/
-  const unsigned char *mode_ranks[8];
-  /*The ranks for each mode when coded with scheme 0.
-    These are optimized so that the more frequent modes have lower ranks.*/
-  unsigned char        scheme0_ranks[OC_NMODES];
-  /*The list of modes, sorted in descending order of frequency, that
-    corresponds to the ranks above.*/
-  unsigned char        scheme0_list[OC_NMODES];
-  /*The number of times each mode has been chosen so far.*/
-  int                  mode_counts[OC_NMODES];
-  /*The list of mode coding schemes, sorted in ascending order of bit cost.*/
-  unsigned char        scheme_list[8];
-  /*The number of bits used by each mode coding scheme.*/
-  int                  scheme_bits[8];
-};
-
-void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser);
-
-typedef struct oc_rc_state oc_rc_state;
-
-struct oc_rc_state{
-  ogg_int64_t bits_per_frame;
-  ogg_int64_t fullness;
-  ogg_int64_t target;
-  ogg_int64_t max;
-  ogg_int64_t log_npixels;
-  unsigned    exp[2];
-  ogg_int64_t log_scale[2];
-  ogg_int64_t log_qtarget;
-  int         buf_delay;
-};
-
-/* Encoder (Compressor) instance -- installed in a theora_state */
-struct CP_INSTANCE {
-  /*This structure must be first.
-    It contains entry points accessed by the decoder library's API wrapper, and
-     is the only assumption that library makes about our internal format.*/
-  oc_state_dispatch_vtable  dispatch_vtbl;
-
-  theora_info               info;
-
-  /* ogg bitpacker for use in packet coding, other API state */
-  oggpack_buffer           *oggbuffer;
-  /*The number of duplicates to produce for the next frame.*/
-  int                       dup_count;
-  /*The number of duplicates remaining to be emitted for the current frame.*/
-  int                       nqueued_dups;
-
-  unsigned char            *frame;
-  unsigned char            *recon;
-  unsigned char            *golden;
-  unsigned char            *lastrecon;
-  ogg_uint32_t              frame_size;
-
-  /*Superblock, macroblock and fragment Information.*/
-  unsigned char            *frag_coded;
-  ogg_uint32_t             *frag_buffer_index;
-  ogg_int16_t              *frag_dc;
-  ogg_int16_t              *frag_dc_tmp;
-
-  macroblock_t             *macro;
-  superblock_t             *super[3];
-
-  ogg_uint32_t              frag_h[3];
-  ogg_uint32_t              frag_v[3];
-  ogg_uint32_t              frag_n[3];
-  ogg_uint32_t              frag_total;
-
-  ogg_uint32_t              macro_h;
-  ogg_uint32_t              macro_v;
-  ogg_uint32_t              macro_total;
-  
-  ogg_uint32_t              super_h[3];
-  ogg_uint32_t              super_v[3];
-  ogg_uint32_t              super_n[3];
-  ogg_uint32_t              super_total;
-
-  /*Stride of image and recon planes, accounting for borders.*/
-  ogg_uint32_t              stride[3];
-  /*Data offset of first coded pixel in plane.*/
-  ogg_uint32_t              offset[3];
-
-  /*********************************************************************/
-  /* state and stats */
-
-  int                       HeadersWritten;
-  ogg_uint32_t              LastKeyFrame;
-  ogg_int64_t               CurrentFrame;
-  unsigned char             FrameType;
-  int                       readyflag;
-  int                       packetflag;
-  int                       doneflag;
-  int                       first_inter_frame;
-
-  /*Indexed via [key/inter][dc/ac][luma/chroma].*/
-  int                       huffchoice[2][2][2];
-
-  ogg_uint32_t              dc_bits[2][DC_HUFF_CHOICES];
-  ogg_uint32_t              ac1_bits[2][AC_HUFF_CHOICES];
-  ogg_uint32_t              acN_bits[2][AC_HUFF_CHOICES];
-  /*Count of bits used by MV coding mode 0.*/
-  ogg_uint32_t              MVBits_0;
-  /*Count of bits used by MV coding mode 1.*/
-  ogg_uint32_t              MVBits_1;
-  oc_mode_scheme_chooser    chooser;
-
-  /*********************************************************************/
-  /* Token Buffers */
-  int                      *fr_partial;
-  unsigned char            *fr_partial_bits;
-  int                      *fr_full;
-  unsigned char            *fr_full_bits;
-  ogg_int16_t              *fr_block;
-  unsigned char            *fr_block_bits;
-  int                       fr_partial_count;
-  int                       fr_full_count;
-  int                       fr_block_count;
-
-
-  int                       stack_offset;
-  unsigned char            *dct_token_storage;
-  ogg_uint16_t             *dct_token_eb_storage;
-  unsigned char            *dct_token[64];
-  ogg_uint16_t             *dct_token_eb[64];
-
-  ogg_int32_t               dct_token_count[64];
-  ogg_int32_t               dct_token_ycount[64];
-
-  int                       eob_run[64];
-  int                       eob_pre[64];
-  int                       eob_ypre[64];
-
-  /********************************************************************/
-  /* Fragment SAD->bitrate estimation tracking metrics */
-  long                      rho_count[65]; 
-
-#if defined(OC_COLLECT_METRICS)
-  long                      rho_postop;
-  int                      *frag_mbi;
-  int                      *frag_sad;
-  int                      *frag_ssd;
-  int                      *dct_token_frag_storage;
-  int                      *dct_token_frag[64];
-  int                      *dct_eob_fi_storage;
-  int                      *dct_eob_fi_stack[64];
-  int                       dct_eob_fi_count[64];
-  ogg_int64_t               dist_dist[3][8];
-  ogg_int64_t               dist_bits[3][8];
-#endif
-
-  /********************************************************************/
-  /* Setup */
-  int                     keyframe_granule_shift;
-  int                     lambda;
-  int                     BaseQ;
-  int                     MinQ;
-  int                     GoldenFrameEnabled;
-  int                     InterPrediction;
-  int                     MotionCompensation;
-
-  /* hufftables and quant setup ****************************************/
-
-  th_huff_code            huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
-
-  th_quant_info           quant_info;
-  quant_tables            quant_tables[2][3];
-  oc_iquant_tables        iquant_tables[2][3];
-  /*An "average" quantizer for each quantizer type (INTRA or INTER) and QI
-     value.
-    This is used to parameterize the rate control decisions.
-    They are kept in the log domain to simplify later processing.
-    Keep in mind these are DCT domain quantizers, and so are scaled by an
-     additional factor of 4 from the pixel domain.*/
-  ogg_int64_t             log_qavg[2][64];
-  /*The buffer state used to drive rate control.*/
-  oc_rc_state             rc;
-  /*Table for encoder acceleration functions.*/
-  oc_enc_opt_vtable       opt_vtable;
-};
-
-extern void ReconRefFrames (CP_INSTANCE *cpi);
-
-typedef struct {
-  int coeff;
-  int count; /* -1 indicates no token, ie, midst of an EOB run */
-  int chroma;
-  int pre;
-  int run;
-#if defined(OC_COLLECT_METRICS)
-  int runstack;
-#endif
-} token_checkpoint_t;
-
-extern void tokenlog_commit(CP_INSTANCE *cpi, 
-			    token_checkpoint_t *stack, 
-			    int n);
-extern void tokenlog_rollback(CP_INSTANCE *cpi, 
-			      token_checkpoint_t *stack,
-			      int n);
-extern void dct_tokenize_init (CP_INSTANCE *cpi);
-extern int dct_tokenize_AC (CP_INSTANCE *cpi, 
-			    const int fi, 
-			    ogg_int16_t *dct, 
-			    const ogg_int16_t *dequant, 
-			    const ogg_int16_t *origdct, 
-			    const int chroma, 
-			    token_checkpoint_t **stack,int _acmin);
-extern void dct_tokenize_finish (CP_INSTANCE *cpi);
-extern void dct_tokenize_mark_ac_chroma (CP_INSTANCE *cpi);
-
-extern void InitQTables( CP_INSTANCE *cpi );
-
-extern void WriteFrameHeader( CP_INSTANCE *cpi) ;
-
-extern void EncodeData(CP_INSTANCE *cpi);
-
-extern void oc_mcenc_start(CP_INSTANCE *cpi,
-			   mc_state *mcenc);
-
-extern void oc_mcenc_search(CP_INSTANCE *cpi, 
-			    mc_state *_mcenc,
-			    int _mbi,
-			    int _goldenp,
-			    oc_mv _bmvs[4],
-			    int *best_err,
-			    int best_block_err[4]);
-
-extern void oc_mcenc_refine1mv(CP_INSTANCE *cpi, 
-			      int _mbi,
-			      int _goldenp,
-			      int err);
-
-extern void oc_mcenc_refine4mv(CP_INSTANCE *cpi, 
-			      int _mbi,
-			      int err[4]);
-
-extern int PickModes(CP_INSTANCE *cpi, int recode);
-
-extern void InitFrameInfo(CP_INSTANCE *cpi);
-
-extern void ClearFrameInfo (CP_INSTANCE *cpi);
-
-typedef struct {
-  ogg_uint16_t sb_partial_count;
-  ogg_uint16_t sb_full_count;
-
-  signed char sb_partial_last;
-  signed char sb_full_last;
-  signed char b_last;
-  signed char b_count;
-  signed char b_pend;
-
-  char sb_partial_break;
-  char sb_full_break;
-  char sb_partial;
-  char sb_coded;
-
-  int cost;
-} fr_state_t;
-
-extern void fr_clear(CP_INSTANCE *cpi, fr_state_t *fr);
-extern void fr_skipblock(CP_INSTANCE *cpi, fr_state_t *fr);
-extern void fr_codeblock(CP_INSTANCE *cpi, fr_state_t *fr);
-extern void fr_finishsb(CP_INSTANCE *cpi, fr_state_t *fr);
-extern void fr_write(CP_INSTANCE *cpi, fr_state_t *fr);
-
-extern int fr_cost1(fr_state_t *fr);
-extern int fr_cost4(fr_state_t *pre, fr_state_t *post);
-
-#if defined(OC_COLLECT_METRICS)
-extern void ModeMetrics(CP_INSTANCE *cpi);
-extern void oc_enc_mode_metrics_dump(CP_INSTANCE *cpi);
-#endif
-
-/*Encoder-specific accelerated functions.*/
-void oc_enc_frag_sub(const CP_INSTANCE *_cpi,ogg_int16_t _diff[64],
- const unsigned char *_src,const unsigned char *_ref,int _ystride);
-void oc_enc_frag_sub_128(const CP_INSTANCE *_cpi,ogg_int16_t _diff[64],
- const unsigned char *_src,int _ystride);
-unsigned oc_enc_frag_sad(const CP_INSTANCE *_cpi,const unsigned char *_src,
- const unsigned char *_ref,int _ystride);
-unsigned oc_enc_frag_sad_thresh(const CP_INSTANCE *_cpi,
- const unsigned char *_src,const unsigned char *_ref,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_sad2_thresh(const CP_INSTANCE *_cpi,
- const unsigned char *_src,const unsigned char *_ref1,
- const unsigned char *_ref2,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_satd_thresh(const CP_INSTANCE *_cpi,
- const unsigned char *_src,const unsigned char *_ref,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_satd2_thresh(const CP_INSTANCE *_cpi,
- const unsigned char *_src,const unsigned char *_ref1,
- const unsigned char *_ref2,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_intra_satd(const CP_INSTANCE *_cpi,
- const unsigned char *_src,int _ystride);
-void oc_enc_frag_copy(const CP_INSTANCE *_cpi,unsigned char *_dst,
- const unsigned char *_src,int _ystride);
-void oc_enc_frag_copy2(const CP_INSTANCE *_cpi,unsigned char *_dst,
- const unsigned char *_src1,const unsigned char *_src2,int _ystride);
-void oc_enc_frag_recon_intra(const CP_INSTANCE *_cpi,
- unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]);
-void oc_enc_frag_recon_inter(const CP_INSTANCE *_cpi,unsigned char *_dst,
- const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
-void oc_enc_fdct8x8(const CP_INSTANCE *_cpi,ogg_int16_t _y[64],
- const ogg_int16_t _x[64]);
-void oc_enc_dequant_idct8x8(const CP_INSTANCE *_cpi,ogg_int16_t _y[64],
- const ogg_int16_t _x[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
-void oc_enc_loop_filter(CP_INSTANCE *_cpi,int _flimit);
-void oc_enc_restore_fpu(const CP_INSTANCE *_cpi);
-
-/*Default pure-C implementations.*/
-void oc_enc_vtable_init_c(CP_INSTANCE *_cpi);
-
-void oc_enc_frag_sub_c(ogg_int16_t _diff[64],
- const unsigned char *_src,const unsigned char *_ref,int _ystride);
-void oc_enc_frag_sub_128_c(ogg_int16_t _diff[64],
- const unsigned char *_src,int _ystride);
-void oc_enc_frag_copy2_c(unsigned char *_dst,
- const unsigned char *_src1,const unsigned char *_src2,int _ystride);
-unsigned oc_enc_frag_sad_c(const unsigned char *_src,
- const unsigned char *_ref,int _ystride);
-unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_satd_thresh_c(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_satd2_thresh_c(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_intra_satd_c(const unsigned char *_src,int _ystride);
-void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
-void oc_enc_loop_filter_c(CP_INSTANCE *_cpi,int _flimit);
-void oc_enc_restore_fpu_c(void);
-
-#endif /* ENCODER_INTERNAL_H */

Modified: branches/theora-thusnelda/lib/enc/dct.c
===================================================================
--- branches/theora-thusnelda/lib/enc/dct.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/dct.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -15,7 +15,7 @@
 
  ********************************************************************/
 
-#include "codec_internal.h"
+#include "encint.h"
 #include "../dec/dct.h"
 
 
@@ -122,9 +122,9 @@
   _y[7]=v;
 }
 
-void oc_enc_fdct8x8(const CP_INSTANCE *_cpi,ogg_int16_t _y[64],
+void oc_enc_fdct8x8(const oc_enc_ctx *_enc,ogg_int16_t _y[64],
  const ogg_int16_t _x[64]){
-  (*_cpi->opt_vtable.fdct8x8)(_y,_x);
+  (*_enc->opt_vtable.fdct8x8)(_y,_x);
 }
 
 /*Performs a forward 8x8 Type-II DCT transform.

Deleted: branches/theora-thusnelda/lib/enc/dct_decode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/dct_decode.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/dct_decode.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -1,232 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id$
-
- ********************************************************************/
-
-#include <stdlib.h>
-#include <string.h>
-#include "codec_internal.h"
-#include "quant_lookup.h"
-
-static void SetupBoundingValueArray_Generic(ogg_int16_t *BoundingValuePtr,
-                                            ogg_int32_t FLimit){
-  /* Build the 256-entry response table; index 127 corresponds to offset 0. */
-  ogg_int32_t i;
-  /* Offset d maps to d while |d|<=FLimit, then ramps back to 0 at |d|==2*FLimit. */
-  /* Clear first: entries beyond the ramp stay 0. Indices fit only if FLimit<=64. */
-  memset ( BoundingValuePtr, 0, (256*sizeof(*BoundingValuePtr)) );
-  for ( i = 0; i < FLimit; i++ ){
-    BoundingValuePtr[127-i-FLimit] = (-FLimit+i);
-    BoundingValuePtr[127-i] = -i;
-    BoundingValuePtr[127+i] = i;
-    BoundingValuePtr[127+i+FLimit] = FLimit-i;
-  }
-}
-
-static void UpdateUMV_HBorders( CP_INSTANCE *cpi,
-                                unsigned char *DestReconPtr,
-				int plane){
-  ogg_uint32_t  i;
-  ogg_uint32_t  PixelIndex;
-  /* Fills the border rows above and below one plane of DestReconPtr by repeating an edge row. */
-  ogg_uint32_t  PlaneStride = cpi->stride[plane];
-  ogg_uint32_t  BlockVStep = cpi->stride[plane] * (VFRAGPIXELS - 1);
-  ogg_uint32_t  PlaneFragments = cpi->frag_n[plane];
-  ogg_uint32_t  LineFragments = cpi->frag_h[plane];
-  ogg_uint32_t  PlaneBorderWidth = (plane ? UMV_BORDER / 2 : UMV_BORDER );
-
-  unsigned char   *SrcPtr1;
-  unsigned char   *SrcPtr2;
-  unsigned char   *DestPtr1;
-  unsigned char   *DestPtr2;
-  ogg_uint32_t    *bp = cpi->frag_buffer_index;
-  /* Skip the buffer-index entries that belong to the preceding planes. */
-  if(plane) bp += cpi->frag_n[0];
-  if(plane>1) bp += cpi->frag_n[1];
-
-  /* Setup the source and destination pointers for the top and bottom
-     borders; both sources start PlaneBorderWidth bytes before the fragment */
-  PixelIndex = bp[0];
-  SrcPtr1 = &DestReconPtr[ PixelIndex - PlaneBorderWidth ];
-  DestPtr1 = SrcPtr1 - (PlaneBorderWidth * PlaneStride);
-  /* Bottom source: first fragment of the final fragment row, VFRAGPIXELS-1 lines down. */
-  PixelIndex = bp[PlaneFragments - LineFragments] + BlockVStep;
-  SrcPtr2 = &DestReconPtr[ PixelIndex - PlaneBorderWidth];
-  DestPtr2 = SrcPtr2 + PlaneStride;
-
-  /* Now copy the top and bottom source lines into each line of the
-     respective borders; every copy is a full PlaneStride bytes */
-  for ( i = 0; i < PlaneBorderWidth; i++ ) {
-    memcpy( DestPtr1, SrcPtr1, PlaneStride );
-    memcpy( DestPtr2, SrcPtr2, PlaneStride );
-    DestPtr1 += PlaneStride;
-    DestPtr2 += PlaneStride;
-  }
-}
-
-static void UpdateUMV_VBorders( CP_INSTANCE *cpi,
-                                unsigned char * DestReconPtr,
-                                int plane){
-  ogg_uint32_t   i;
-  ogg_uint32_t   PixelIndex;
-  /* Fills the left and right borders of one plane of DestReconPtr, row by row. */
-  ogg_uint32_t   PlaneStride = cpi->stride[plane];
-  ogg_uint32_t   LineFragments = cpi->frag_h[plane];
-  ogg_uint32_t   PlaneBorderWidth = (plane ? UMV_BORDER / 2 : UMV_BORDER );
-  ogg_uint32_t   PlaneHeight = (plane ? cpi->info.height/2 : cpi->info.height ); /* planes 1 and 2 are treated as half height */
-
-  unsigned char   *SrcPtr1;
-  unsigned char   *SrcPtr2;
-  unsigned char   *DestPtr1;
-  unsigned char   *DestPtr2;
-  ogg_uint32_t    *bp = cpi->frag_buffer_index;
-  /* Skip the buffer-index entries that belong to the preceding planes. */
-  if(plane) bp += cpi->frag_n[0];
-  if(plane>1) bp += cpi->frag_n[1];
-
-  /* Setup the source data values and destination pointers for the
-     left and right edge borders */
-  PixelIndex = bp[0];
-  SrcPtr1 = &DestReconPtr[ PixelIndex ];
-  DestPtr1 = &DestReconPtr[ PixelIndex - PlaneBorderWidth ];
-  /* Right edge: last pixel of the last fragment in the first fragment row. */
-  PixelIndex = bp[LineFragments - 1] + (HFRAGPIXELS - 1);
-  SrcPtr2 = &DestReconPtr[ PixelIndex ];
-  DestPtr2 = &DestReconPtr[ PixelIndex + 1 ];
-
-  /* Now replicate the first and last pixel of every row across the left
-     and right borders respectively */
-  for ( i = 0; i < PlaneHeight; i++ ) {
-    memset( DestPtr1, SrcPtr1[0], PlaneBorderWidth );
-    memset( DestPtr2, SrcPtr2[0], PlaneBorderWidth );
-    SrcPtr1 += PlaneStride;
-    SrcPtr2 += PlaneStride;
-    DestPtr1 += PlaneStride;
-    DestPtr2 += PlaneStride;
-  }
-}
-
-void UpdateUMVBorder( CP_INSTANCE *cpi,
-                      unsigned char * DestReconPtr ) {
-  /* Y plane: left/right borders first, then the top/bottom border rows */
-  UpdateUMV_VBorders( cpi, DestReconPtr, 0);
-  UpdateUMV_HBorders( cpi, DestReconPtr, 0);
-
-  /* Then the U and V Planes, in the same left/right-then-top/bottom order */
-  UpdateUMV_VBorders( cpi, DestReconPtr, 1);
-  UpdateUMV_HBorders( cpi, DestReconPtr, 1);
-
-  UpdateUMV_VBorders( cpi, DestReconPtr, 2);
-  UpdateUMV_HBorders( cpi, DestReconPtr, 2);
-}
-
-static void loop_filter_h(unsigned char * PixelPtr,
-			  ogg_int32_t LineLength,
-			  ogg_int16_t *BoundingValuePtr){
-  ogg_int32_t j;
-  ogg_int32_t FiltVal;
-  PixelPtr-=2;
-  /* Filter across a vertical edge: start 2 pixels to its left and walk down 8 rows. */
-  for ( j = 0; j < 8; j++ ){
-    FiltVal =
-      ( PixelPtr[0] ) -
-      ( PixelPtr[1] * 3 ) +
-      ( PixelPtr[2] * 3 ) -
-      ( PixelPtr[3] );
-    /* Add 4, shift right by 3, then look up; the index can be negative, so the pointer must aim inside the table. */
-    FiltVal = *(BoundingValuePtr+((FiltVal + 4) >> 3));
-    /* Adjust the two pixels next to the edge in opposite directions, clamped. */
-    PixelPtr[1] = OC_CLAMP255(PixelPtr[1] + FiltVal);
-    PixelPtr[2] = OC_CLAMP255(PixelPtr[2] - FiltVal);
-
-    PixelPtr += LineLength;
-  }
-}
-
-static void loop_filter_v(unsigned char * PixelPtr,
-			  ogg_int32_t LineLength,
-			  ogg_int16_t *BoundingValuePtr){
-  ogg_int32_t j;
-  ogg_int32_t FiltVal;
-  PixelPtr -= 2*LineLength;
-  /* Filter across a horizontal edge: start 2 rows above it and walk across 8 columns. */
-  for ( j = 0; j < 8; j++ ) {
-    FiltVal = ( (ogg_int32_t)PixelPtr[0] ) -
-      ( (ogg_int32_t)PixelPtr[LineLength] * 3 ) +
-      ( (ogg_int32_t)PixelPtr[2 * LineLength] * 3 ) -
-      ( (ogg_int32_t)PixelPtr[3 * LineLength] );
-    /* Add 4, shift right by 3, then look up; the index can be negative, so the pointer must aim inside the table. */
-    FiltVal = *(BoundingValuePtr+((FiltVal + 4) >> 3));
-    /* Adjust the two rows next to the edge in opposite directions, clamped. */
-    PixelPtr[LineLength] = OC_CLAMP255(PixelPtr[LineLength] + FiltVal);
-    PixelPtr[2 * LineLength] = OC_CLAMP255(PixelPtr[2*LineLength] - FiltVal);
-
-    PixelPtr ++;
-  }
-}
-
-void oc_enc_loop_filter_c(CP_INSTANCE *cpi, int FLimit){
-  /* Filters edges in cpi->lastrecon around every fragment whose cpi->frag_coded flag is set. */
-  int j;
-  ogg_int16_t BoundingValues[256];
-  ogg_int16_t *bvp = BoundingValues+127; /* center entry, so signed offsets index both ways */
-  unsigned char *cp = cpi->frag_coded;
-  ogg_uint32_t *bp = cpi->frag_buffer_index;
-  /* A limit of 0 means nothing is filtered. */
-  if ( FLimit == 0 ) return;
-  SetupBoundingValueArray_Generic(BoundingValues, FLimit);
-  /* cp and bp are not reset between planes: they run on through all three. */
-  for ( j = 0; j < 3 ; j++){
-    ogg_uint32_t *bp_begin = bp;
-    ogg_uint32_t *bp_end = bp + cpi->frag_n[j];
-    int stride = cpi->stride[j];
-    int h = cpi->frag_h[j];
-    /* Outer loop: one fragment row (h fragments) per iteration. */
-    while(bp<bp_end){
-      ogg_uint32_t *bp_left = bp;
-      ogg_uint32_t *bp_right = bp + h;
-      while(bp<bp_right){
-        if(cp[0]){
-          if(bp>bp_left) /* left edge, unless first fragment in the row */
-            loop_filter_h(&cpi->lastrecon[bp[0]],stride,bvp);
-          if(bp_left>bp_begin) /* top edge, unless in the plane's first row */
-            loop_filter_v(&cpi->lastrecon[bp[0]],stride,bvp);
-          if(bp+1<bp_right && !cp[1]) /* right edge, only when the right neighbor is not coded */
-            loop_filter_h(&cpi->lastrecon[bp[0]]+8,stride,bvp);
-          if(bp+h<bp_end && !cp[h]) /* bottom edge, only when the fragment below is not coded */
-            loop_filter_v(&cpi->lastrecon[bp[h]],stride,bvp);
-        }
-        bp++;
-        cp++;
-      }
-    }
-  }
-}
-
-void ReconRefFrames (CP_INSTANCE *cpi){
-  unsigned char *temp;
-  /*Swap recon and lastrecon; the C loop filter and the border update below both work on lastrecon.*/
-  temp=cpi->lastrecon;
-  cpi->lastrecon=cpi->recon;
-  cpi->recon=temp;
-  /* Apply a loop filter to edge pixels of updated blocks; the limit is looked up by BaseQ */
-  oc_enc_loop_filter(cpi,cpi->quant_info.loop_filter_limits[cpi->BaseQ]);
-  /* We may need to update the UMV border */
-  UpdateUMVBorder(cpi, cpi->lastrecon);
-  /*Swap back, restoring the original recon/lastrecon pointers.*/
-  temp=cpi->lastrecon;
-  cpi->lastrecon=cpi->recon;
-  cpi->recon=temp;
-}

Deleted: branches/theora-thusnelda/lib/enc/dct_encode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/dct_encode.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/dct_encode.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -1,783 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id$
-
- ********************************************************************/
-
-#include <stdlib.h>
-#include <string.h>
-#include "codec_internal.h"
-#include "quant_lookup.h"
-
-static void make_eobrun_token(int run, int *token, int *eb){
-  if ( run <= 3 ) {
-    if ( run == 1 ) {
-      *token = DCT_EOB_TOKEN;
-    } else if ( run == 2 ) {
-      *token = DCT_EOB_PAIR_TOKEN;
-    } else {
-      *token = DCT_EOB_TRIPLE_TOKEN;
-    }
-    *eb=0;
-    
-  } else {
-    
-    if ( run < 8 ) {
-      *token = DCT_REPEAT_RUN_TOKEN;
-      *eb = run-4;
-    } else if ( run < 16 ) {
-      *token = DCT_REPEAT_RUN2_TOKEN;
-      *eb = run-8;
-    } else if ( run < 32 ) {
-      *token = DCT_REPEAT_RUN3_TOKEN;
-      *eb = run-16;
-    } else if ( run < 4096) {
-      *token = DCT_REPEAT_RUN4_TOKEN;
-      *eb = run;
-    }
-  }
-}
-
-static int make_dct_token(CP_INSTANCE *cpi, 
-			  int coeff,
-			  int coeff2,
-			  int val,
-			  int *eb){
-  
-  ogg_uint32_t absval = abs(val);
-  int neg = (val<0);
-  int zero_run = coeff2-coeff;
-  int token;
-  *eb=0;
-
-  if (zero_run){
-    int adj = (coeff!=1); /* implement a minor restriction on
-			     stack 1 so that we know during DC
-			     fixups that an extended dctrun token
-			     from stack 1 will never overflow */
-    if ((absval==1) && (zero_run<17+adj)){
-      if ( zero_run <= 5 ) {
-	token = DCT_RUN_CATEGORY1+zero_run-1; 
-	*eb   = neg;
-      }else if ( zero_run <= 9 ) {
-	token = DCT_RUN_CATEGORY1B; 
-	*eb   = zero_run-6+(neg<<2);
-      }else{
-	token = DCT_RUN_CATEGORY1C;
-	*eb   = zero_run-10+(neg<<3);
-      }
-    }else if((absval==2 || absval==3) && (zero_run < 3+adj)){
-      if ( zero_run == 1 ) {
-	token = DCT_RUN_CATEGORY2;
-	*eb   = absval-2+(neg<<1);
-      }else{
-	token = DCT_RUN_CATEGORY2B;
-	*eb   = (neg<<2)+((absval-2)<<1)+zero_run-2;
-      }
-    }else{
-      if ( zero_run <= 8 )
-	token = DCT_SHORT_ZRL_TOKEN;
-      else
-	token = DCT_ZRL_TOKEN;
-      *eb = zero_run-1;
-    }
-  } else if ( absval == 1 ){
-    token = (neg ? MINUS_ONE_TOKEN : ONE_TOKEN);
-  } else if ( absval == 2 ) {
-    token = (neg ? MINUS_TWO_TOKEN : TWO_TOKEN);
-  } else if ( absval <= MAX_SINGLE_TOKEN_VALUE ) {
-    token = LOW_VAL_TOKENS + (absval - DCT_VAL_CAT2_MIN);
-    *eb   = neg;
-  } else if ( absval <= 8 ) {
-    token = DCT_VAL_CATEGORY3;
-    *eb   = (absval - DCT_VAL_CAT3_MIN) + (neg << 1);
-  } else if ( absval <= 12 ) {
-    token = DCT_VAL_CATEGORY4;
-    *eb   = (absval - DCT_VAL_CAT4_MIN) + (neg << 2);
-  } else if ( absval <= 20 ) {
-    token = DCT_VAL_CATEGORY5;
-    *eb   = (absval - DCT_VAL_CAT5_MIN) + (neg << 3);
-  } else if ( absval <= 36 ) {
-    token = DCT_VAL_CATEGORY6;
-    *eb   = (absval - DCT_VAL_CAT6_MIN) + (neg << 4);
-  } else if ( absval <= 68 ) {
-    token = DCT_VAL_CATEGORY7;
-    *eb   = (absval - DCT_VAL_CAT7_MIN) + (neg << 5);
-  } else {
-    token = DCT_VAL_CATEGORY8;
-    *eb   = (absval - DCT_VAL_CAT8_MIN) + (neg << 9);
-  } 
-
-  return token;
-}
-
-static int decode_eob_token(int token, int eb){
-  switch(token){
-  case DCT_EOB_TOKEN:
-    return 1;
-  case DCT_EOB_PAIR_TOKEN:
-    return 2; 
-  case DCT_EOB_TRIPLE_TOKEN:
-    return 3;
-  case DCT_REPEAT_RUN_TOKEN:
-    return eb+4;
-  case DCT_REPEAT_RUN2_TOKEN:
-    return eb+8;
-  case DCT_REPEAT_RUN3_TOKEN:
-    return eb+16;	
-  case DCT_REPEAT_RUN4_TOKEN:
-    return eb;
-  default:
-    return 0;
-  }
-}
-
-static int decode_token(int token, int eb, int *val){
-  switch(token){
-  case DCT_SHORT_ZRL_TOKEN:
-  case DCT_ZRL_TOKEN:
-    *val=0;
-    return eb+1;
-  case ONE_TOKEN:
-    *val = 1;
-    return 0;
-  case MINUS_ONE_TOKEN:
-    *val = -1;
-    return 0;
-  case TWO_TOKEN:
-    *val = 2;
-    return 0;
-  case MINUS_TWO_TOKEN:
-    *val = -2;
-    return 0;
-  case LOW_VAL_TOKENS:
-  case LOW_VAL_TOKENS+1:
-  case LOW_VAL_TOKENS+2:
-  case LOW_VAL_TOKENS+3:
-    *val = (eb ? -(DCT_VAL_CAT2_MIN+token-LOW_VAL_TOKENS) : DCT_VAL_CAT2_MIN+token-LOW_VAL_TOKENS);
-    return 0;
-  case DCT_VAL_CATEGORY3:
-    *val = ((eb & 0x2) ? -(DCT_VAL_CAT3_MIN+(eb&0x1)) : DCT_VAL_CAT3_MIN+(eb&0x1));
-    return 0;
-  case DCT_VAL_CATEGORY4:
-    *val = ((eb & 0x4) ? -(DCT_VAL_CAT4_MIN+(eb&0x3)) : DCT_VAL_CAT4_MIN+(eb&0x3));
-    return 0;
-  case DCT_VAL_CATEGORY5:
-    *val = ((eb & 0x8) ? -(DCT_VAL_CAT5_MIN+(eb&0x7)) : DCT_VAL_CAT5_MIN+(eb&0x7));
-    return 0;
-  case DCT_VAL_CATEGORY6:
-    *val = ((eb & 0x10) ? -(DCT_VAL_CAT6_MIN+(eb&0xf)) : DCT_VAL_CAT6_MIN+(eb&0xf));
-    return 0;
-  case DCT_VAL_CATEGORY7:
-    *val = ((eb & 0x20) ? -(DCT_VAL_CAT7_MIN+(eb&0x1f)) : DCT_VAL_CAT7_MIN+(eb&0x1f));
-    return 0;
-  case DCT_VAL_CATEGORY8:
-    *val = ((eb & 0x200) ? -(DCT_VAL_CAT8_MIN+(eb&0x1ff)) : DCT_VAL_CAT8_MIN+(eb&0x1ff));
-    return 0;
-  case DCT_RUN_CATEGORY1:
-  case DCT_RUN_CATEGORY1+1:
-  case DCT_RUN_CATEGORY1+2:
-  case DCT_RUN_CATEGORY1+3:
-  case DCT_RUN_CATEGORY1+4:
-    *val = (eb ? -1 : 1);
-    return token - DCT_RUN_CATEGORY1 + 1;
-  case DCT_RUN_CATEGORY1B:
-    *val = ((eb&0x4) ? -1 : 1);
-    return (eb&0x3)+6;
-  case DCT_RUN_CATEGORY1C:
-    *val = ((eb&0x8) ? -1 : 1);
-    return (eb&0x7)+10;
-  case DCT_RUN_CATEGORY2:
-    *val = ( (eb&0x2) ? -((eb&0x1)+2) : (eb&0x1)+2 );
-    return 1;
-  case DCT_RUN_CATEGORY2B:
-    *val = ( (eb&0x4) ? -(((eb&0x2)>>1)+2) : ((eb&0x2)>>1)+2);
-    return (eb&0x1)+2;
-  default:
-    *val = 0;
-    return 0;
-  }
-}
-
-/* token logging to allow a few fragments of efficient rollback.  SKIP
-   analysis is tied up in the tokenization process, so we need to be
-   able to undo a fragment's tokens on a whim */
-
-static int acoffset[64]={
-  00,00,00,00,00,00,16,16,
-  16,16,16,16,16,16,16,32,
-  32,32,32,32,32,32,32,32,
-  32,32,32,32,48,48,48,48,
-  48,48,48,48,48,48,48,48,
-  48,48,48,48,48,48,48,48,
-  48,48,48,48,48,48,48,48};
-
-/* only counts bits */
-static int tokencost(CP_INSTANCE *cpi, int huff, int coeff, int token){
-  huff += acoffset[coeff];
-  return cpi->huff_codes[huff][token].nbits+OC_DCT_TOKEN_EXTRA_BITS[token];
-}
-
-void tokenlog_rollback(CP_INSTANCE *cpi, token_checkpoint_t *stack,int n){
-  int i;
-  for(i=n-1;i>=0;i--){
-    int coeff = stack[i].coeff;
-    if(stack[i].count>=0) cpi->dct_token_count[coeff] = stack[i].count; 
-    cpi->eob_run[coeff] = stack[i].run;
-    cpi->eob_pre[coeff] = stack[i].pre;
-#if defined(OC_COLLECT_METRICS)
-    cpi->dct_eob_fi_count[coeff] = stack[i].runstack;
-#endif
-  }
-}
-
-static void tokenlog_metrics(CP_INSTANCE *cpi, int coeff, int chroma, int token){
-  if(coeff == 0){
-    /* DC */
-    int i;
-    for ( i = 0; i < DC_HUFF_CHOICES; i++)
-      cpi->dc_bits[chroma][i] += cpi->huff_codes[i][token].nbits;
-  }else if (coeff == 1){
-    /* AC == 1*/
-    int i,offset = acoffset[1]+AC_HUFF_OFFSET;
-    for ( i = 0; i < AC_HUFF_CHOICES; i++)
-      cpi->ac1_bits[chroma][i] += cpi->huff_codes[offset+i][token].nbits;
-  }else{
-    /* AC > 1*/
-    int i,offset = acoffset[coeff]+AC_HUFF_OFFSET;
-    for ( i = 0; i < AC_HUFF_CHOICES; i++)
-      cpi->acN_bits[chroma][i] += cpi->huff_codes[offset+i][token].nbits;
-  }
-}
-
-void tokenlog_commit(CP_INSTANCE *cpi, token_checkpoint_t *stack, int n){
-  int i;
-  for(i=0;i<n;i++){
-    int pos = stack[i].count;
-    if(pos>=0){
-      int coeff = stack[i].coeff;
-      int token = cpi->dct_token[coeff][pos];
-      int chroma = stack[i].chroma;
-      tokenlog_metrics(cpi,coeff,chroma,token);
-    }
-  }
-}
-
-static void tokenlog_mark(CP_INSTANCE *cpi, int coeff, token_checkpoint_t **stack){
-  (*stack)->coeff = coeff;
-  (*stack)->count = -1;
-  (*stack)->run = cpi->eob_run[coeff];
-  (*stack)->pre = cpi->eob_pre[coeff];
-#if defined(OC_COLLECT_METRICS)
-  (*stack)->runstack = cpi->dct_eob_fi_count[coeff];
-#endif
-  (*stack)++;
-}
-
-static void token_add(CP_INSTANCE *cpi, int chroma, int coeff, 
-			 unsigned char token, ogg_uint16_t eb,
-			 token_checkpoint_t **stack){
-  int pos = cpi->dct_token_count[coeff]++;
-  cpi->dct_token[coeff][pos] = token;
-  cpi->dct_token_eb[coeff][pos] = eb;
-  if(stack){
-    (*stack)->coeff = coeff;
-    (*stack)->count = pos;
-    (*stack)->chroma = chroma;
-    (*stack)->run = cpi->eob_run[coeff];
-    (*stack)->pre = cpi->eob_pre[coeff];
-#if defined(OC_COLLECT_METRICS)
-    (*stack)->runstack = cpi->dct_eob_fi_count[coeff];
-#endif
-    (*stack)++;
-  }else{
-    tokenlog_metrics(cpi,coeff,chroma,token);
-  }
-}
-
-/* does not offer logging option; only used in nonconditional EOBrun welding */
-static void token_prepend(CP_INSTANCE *cpi, int chroma, int coeff, 
-			  unsigned char token, ogg_uint16_t eb){
-  
-  cpi->dct_token[coeff]--;
-  cpi->dct_token_eb[coeff]--;
-#if defined(OC_COLLECT_METRICS)
-  cpi->dct_token_frag[coeff]--;
-#endif
-  cpi->dct_token[coeff][0] = token;
-  cpi->dct_token_eb[coeff][0] = eb;
-  cpi->dct_token_count[coeff]++;
-  tokenlog_metrics(cpi,coeff,chroma,token);
-}
-
-static int tokenize_eobrun(CP_INSTANCE *cpi, int pos, int run, token_checkpoint_t **stack){
-  int token=0,eb=0;
-  int chroma = !(run&0x8000);
-  int huff = cpi->huffchoice[cpi->FrameType!=KEY_FRAME][1][chroma];
-
-  make_eobrun_token(run&0x7fff, &token, &eb);
-  token_add(cpi, chroma, pos, token, eb, stack);
-
-  return tokencost(cpi,huff,pos,token);
-}
-
-
-static void tokenize_prepend_eobrun(CP_INSTANCE *cpi, int chroma, int pos, int run){
-  int token=0,eb=0;
-  make_eobrun_token(run, &token, &eb);
-  token_prepend(cpi, chroma, pos, token, eb);
-}
-
-/* only used in nonconditional DC/stack1 fixups */
-static void token_add_raw(CP_INSTANCE *cpi, 
-			  int chroma,
-			  int fi,
-			  int coeff,
-			  int token,
-			  int eb){
-  
-  /* Emit pending EOB run if any */
-  if(cpi->eob_run[coeff]){
-    tokenize_eobrun(cpi,coeff,cpi->eob_run[coeff],NULL);
-    cpi->eob_run[coeff]=0;
-  }
-#if defined(OC_COLLECT_METRICS)
-  cpi->dct_token_frag[coeff][cpi->dct_token_count[coeff]] = fi;
-#endif
-  token_add(cpi,chroma,coeff,token,eb,NULL);
-  
-}
-
-/* NULL stack to force commit */
-static int tokenize_dctval(CP_INSTANCE *cpi, 
-			   int chroma,
-			   int fi,
-			   int coeff,
-			   int coeff2,
-			   int val,
-			   token_checkpoint_t **stack){
-  int eb=0;
-  int token=make_dct_token(cpi,coeff,coeff2,val,&eb);
-
-  /* Emit pending EOB run if any */
-  if(cpi->eob_run[coeff]){
-    tokenize_eobrun(cpi,coeff,cpi->eob_run[coeff],stack);
-    cpi->eob_run[coeff]=0;
-  }
-#if defined(OC_COLLECT_METRICS)
-  cpi->dct_token_frag[coeff][cpi->dct_token_count[coeff]] = fi;
-#endif
-  
-  token_add(cpi,chroma,coeff,token,eb,stack);
-  
-  if( ((token==DCT_SHORT_ZRL_TOKEN) || (token==DCT_ZRL_TOKEN)) && val)
-    return 0; /* we only flushed a preceding zero run, not the value token. */
-  
-  return 1;
-}
-
-static int tokenize_mark_run(CP_INSTANCE *cpi, 
-			      int chroma,
-			      int fi,
-			      int pre,
-			      int coeff,
-			      token_checkpoint_t **stack){
-  int cost = 0;
-
-  if(pre && cpi->dct_token_count[coeff] == 0){
-    if(stack)tokenlog_mark(cpi,coeff,stack); /* log an undo without logging a token */
-    cpi->eob_pre[coeff]++;
-  }else{
-    if((cpi->eob_run[coeff]&0x7fff) == 4095){
-      cost += tokenize_eobrun(cpi,coeff,cpi->eob_run[coeff],stack);
-      cpi->eob_run[coeff] = 0;
-    }
-    
-    if(stack)tokenlog_mark(cpi,coeff,stack); /* log an undo without logging a token */
-    cpi->eob_run[coeff]++;
-    cpi->eob_run[coeff]|= !chroma<<15;
-  }	  
-#if defined(OC_COLLECT_METRICS)
-  cpi->dct_eob_fi_stack[coeff][cpi->dct_eob_fi_count[coeff]++]=fi;
-#endif
-  return cost;
-}
-
-static int tokenize_dctcost(CP_INSTANCE *cpi,int chroma,
-			     int coeff, int coeff2, int val){
-  int huff = cpi->huffchoice[cpi->FrameType!=KEY_FRAME][1][chroma];
-  int eb=0,token=0;
-  int cost = 0;
-  
-  /* if there was an EOB run pending, count the cost of flushing it */
-  if(cpi->eob_run[coeff]){
-    int rchroma = !(cpi->eob_run[coeff]&0x8000); 
-    int rhuff = cpi->huffchoice[cpi->FrameType!=KEY_FRAME][1][rchroma];
-    make_eobrun_token(cpi->eob_run[coeff]&0x7fff,&token,&eb);
-    cost += tokencost(cpi,rhuff,coeff,token);
-  }
-
-  /* count cost of token */
-  token = make_dct_token(cpi,coeff,coeff2,val,&eb);
-  cost += tokencost(cpi,huff, coeff, token);
-  
-  /* if token was a zero run, we've not yet coded up to the value */
-  if( (token==DCT_SHORT_ZRL_TOKEN) || (token==DCT_ZRL_TOKEN))
-    return cost + tokenize_dctcost(cpi,chroma,coeff2,coeff2,val);
-  else
-    return cost;
-}
-
-/* The opportunity cost of an in-progress EOB run is the cost to flush
-   the run up to 'n+1' minus the cost of flushing the run up to 'n' */
-static int tokenize_eobcost(CP_INSTANCE *cpi,int chroma, int coeff){
-  int n = cpi->eob_run[coeff];
-  int eb=0,token=0;
-  int cost0=0,cost1;
-  
-  if(n>0){
-    int huff = cpi->huffchoice[cpi->FrameType!=KEY_FRAME][1][!(n&0x8000)];
-
-    make_eobrun_token(n&0x7fff, &token, &eb);
-    cost0 = tokencost(cpi,huff,coeff,token);
-
-    make_eobrun_token((n+1)&0x7fff, &token, &eb);
-    cost1 = tokencost(cpi,huff,coeff,token);
-    
-  }else{
-    int huff = cpi->huffchoice[cpi->FrameType!=KEY_FRAME][1][chroma];
-    cost1 = tokencost(cpi,huff,coeff,DCT_EOB_TOKEN);
-  }    
-
-  return cost1-cost0;
-}
-
-/* No final DC to encode yet (DC prediction hasn't been done) So
-   simply assume there will be a nonzero DC value and code.  That's
-   not a true assumption but it can be fixed-up as DC is tokenized
-   later */
-int dct_tokenize_AC(CP_INSTANCE *cpi, const int fi, 
-		    ogg_int16_t *dct, const ogg_int16_t *dequant, 
-		    const ogg_int16_t *origdct, const int chroma, 
-		    token_checkpoint_t **stack,int _acmin){
-  int coeff = 1; /* skip DC for now */
-  int i = coeff;
-  int retcost = 0;
-
-  while( !dct[i] && (++i < BLOCK_SIZE) );
-    
-  while(i < BLOCK_SIZE){
-    int ret;
-    int od = origdct[dezigzag_index[i]];
-    int bestd=0,d = dct[i];
-    int bestmin;
-    int cost,cost2=0,bestcost=0;
-    int j=i+1,k;
-
-    while((j < BLOCK_SIZE) && !dct[j] ) j++;
-
-  if(i>=_acmin){
-    if(j==BLOCK_SIZE){
-      cost = tokenize_eobcost(cpi,chroma,coeff);
-      if(i+1<BLOCK_SIZE) 
-	cost2 = tokenize_eobcost(cpi,chroma,i+1);
-    }else{
-      cost = tokenize_dctcost(cpi,chroma,coeff,j,dct[j]);
-      cost2 = tokenize_dctcost(cpi,chroma,i+1,j,dct[j]);
-    }
-    bestmin = od*od+cost*cpi->lambda;
-    
-
-    for(k=1;k<=abs(d);k++){
-      int dval = (d>0 ? k : -k);
-      int dd = dval*dequant[i] - od;
-      int min = dd*dd;
-      cost = tokenize_dctcost(cpi,chroma,coeff,i,dval);
-
-      min += (cost+cost2)*cpi->lambda;
-      if(min<bestmin){
-	bestmin=min;
-	bestcost=cost;
-	bestd=dval;
-      }
-    }
-
-    dct[i]=bestd;
-    if(bestd==0){
-      if(j==BLOCK_SIZE) break;
-      i=j;
-      continue;
-    }
-  }
-  else{
-    bestcost = tokenize_dctcost(cpi,chroma,coeff,i,d);
-  }
-    
-    retcost+=bestcost;
-	
-    ret = tokenize_dctval(cpi, chroma, fi, coeff, i, dct[i], stack);
-    if(!ret)
-      tokenize_dctval(cpi, chroma, fi, i, i, dct[i], stack);
-    coeff=i+1;
-    i=j;
-    
-  }
-  if(coeff<BLOCK_SIZE) retcost+=tokenize_mark_run(cpi,chroma,fi,coeff>1,coeff,stack);
-  return retcost;
-}
-
-/* called after AC tokenization is complete, because DC coding has to
-   happen after DC predict, which has to happen after the
-   Hilbert-ordered TQT loop */
-/* Convention: All tokens and runs in the coeff1 stack are
-   'regenerated' as the stack is tracked. This can be done in-place;
-   stack 1 can only shrink or stay the same size */
-static void tokenize_DC(CP_INSTANCE *cpi, int fi, int chroma,
-			int *idx1, int *run1){
-  
-  int val = cpi->frag_dc[fi];
-  int token1 = cpi->dct_token[1][*idx1];
-  int eb1 = cpi->dct_token_eb[1][*idx1];
-  
-  if(!*run1) *run1 = decode_eob_token(token1, eb1);
-  
-  if(val){
-    /* nonzero DC val, no coeff 1 stack 'fixup'. */
-    
-    tokenize_dctval(cpi,chroma,fi,0,0,val,NULL);
-    
-    /* there was a nonzero DC value, so there's no alteration to the
-       track1 stack for this fragment; track/regenerate stack 1
-	 state unchanged */
-    if(*run1){
-      /* in the midst of an EOB run in stack 1 */
-      tokenize_mark_run(cpi,chroma,fi,1,1,NULL);
-      (*run1)--;
-      
-    }else{
-      
-      /* non-EOB run token to emit for stack 1 */
-      token_add_raw(cpi,chroma,fi,1,token1,eb1);
-      
-    }
-    
-  }else{
-
-    /* zero DC value; that means the entry in coeff position 1
-       should have been coded from the DC coeff position. This
-       requires a stack 1 fixup. */
-    
-    if(*run1){
-      
-      /* current stack 1 token is an EOB run; conceptually move this fragment's EOBness to stack 0 */
-      tokenize_mark_run(cpi,chroma,fi,0,0,NULL);
-      
-      /* decrement current EOB run for coeff 1 without adding to coded run */
-      (*run1)--;
-      
-    }else{
-      int run,val=0;
-      
-      /* stack 1 token is one of: zerorun, dctrun or dctval */
-      /* A zero-run token is expanded and moved to token stack 0 (stack 1 entry dropped) */
-      /* A dctval may be transformed into a single dctrun that is moved to stack 0,
-	 or if it does not fit in a dctrun, we leave the stack 1 entry alone and emit 
-	 a single length-1 zerorun token for stack 0 */
-      /* A dctrun is extended and moved to stack 0.  During AC
-	 coding, we restrict the run lengths on dctruns for stack 1
-	 so we know there's no chance of overrunning the
-	 representable range */
-      
-      run = decode_token(token1,eb1,&val)+1;
-      
-      if(!tokenize_dctval(cpi,chroma,fi,0,run,val,NULL)){
-	token_add_raw(cpi,chroma,fi,1,token1,eb1);
-      }
-    }
-  }
-  
-  /* update token counter if not in a run */
-  if (!*run1) (*idx1)++;
-}
-
-void dct_tokenize_init (CP_INSTANCE *cpi){
-  int i;
-
-  memset(cpi->eob_run, 0, sizeof(cpi->eob_run));
-  memset(cpi->eob_pre, 0, sizeof(cpi->eob_pre));
-  memset(cpi->dc_bits, 0, sizeof(cpi->dc_bits));
-  memset(cpi->ac1_bits, 0, sizeof(cpi->ac1_bits));
-  memset(cpi->acN_bits, 0, sizeof(cpi->acN_bits));
-  memset(cpi->dct_token_count, 0, sizeof(cpi->dct_token_count));
-#if defined(OC_COLLECT_METRICS)
-  memset(cpi->dct_eob_fi_count, 0, sizeof(cpi->dct_eob_fi_count));
-#endif
-
-  for(i=0;i<BLOCK_SIZE;i++){
-    cpi->dct_token[i] = cpi->dct_token_storage + cpi->stack_offset*i;
-    cpi->dct_token_eb[i] = cpi->dct_token_eb_storage + cpi->stack_offset*i;
-
-#if defined(OC_COLLECT_METRICS)
-    cpi->dct_eob_fi_stack[i] = cpi->dct_eob_fi_storage + cpi->frag_total*i;
-    cpi->dct_token_frag[i] = cpi->dct_token_frag_storage + cpi->stack_offset*i;
-#endif
-  }
-}
-
-void dct_tokenize_mark_ac_chroma (CP_INSTANCE *cpi){
-  int i;
-  for(i=1;i<64;i++){
-    cpi->dct_token_ycount[i]=cpi->dct_token_count[i];
-    if(cpi->eob_run[i])
-      cpi->dct_token_ycount[i]++; /* there will be another y plane token after welding */
-    cpi->eob_ypre[i]=cpi->eob_pre[i];
-  }
-}
-
-/* post-facto DC tokenization (has to be completed after DC predict)
-   coeff 1 fixups and eobrun welding */
-void dct_tokenize_finish (CP_INSTANCE *cpi){
-  int i,sbi;
-  int idx1=0,run1=0;
-  unsigned char *cp=cpi->frag_coded;
-  
-  /* we parse the token stack for coeff1 to stay in sync, and re-use
-     the token stack counters to track */
-  /* emit an eob run for the end run of stack 1; this is used to
-     reparse the stack in the DC code loop.  The current state will be
-     recreated by the end of DC encode */
-
-  if(cpi->eob_run[1]) tokenize_eobrun(cpi,1,cpi->eob_run[1],NULL);
-  memset(cpi->ac1_bits, 0, sizeof(cpi->ac1_bits));
-  cpi->dct_token_count[1]=0;
-  cpi->eob_pre[1]=cpi->eob_run[1]=0;
-#if defined(OC_COLLECT_METRICS)
-  /* reset and reuse as a counter */
-  cpi->dct_eob_fi_count[1]=0;
-#endif
-  
-  for (sbi=0; sbi < cpi->super_n[0]; sbi++ ){
-    superblock_t *sb = &cpi->super[0][sbi];
-    int bi;
-    for (bi=0; bi<16; bi++, i++ ) {
-      int fi = sb->f[bi];
-      if(cp[fi]) 
-        tokenize_DC(cpi, fi, 0, &idx1, &run1);
-    }
-  }
-
-  for(i=0;i<2;i++){
-    cpi->dct_token_ycount[i]=cpi->dct_token_count[i];
-    if(cpi->eob_run[i])
-      cpi->dct_token_ycount[i]++; /* there will be another y plane token after welding */
-    cpi->eob_ypre[i]=cpi->eob_pre[i];
-  }
-
-  for (; sbi < cpi->super_total; sbi++ ){
-    superblock_t *sb = &cpi->super[0][sbi];
-    int bi;
-    for (bi=0; bi<16; bi++,i++ ) {
-      int fi = sb->f[bi];
-      if(cp[fi]) 
-	tokenize_DC(cpi, fi, 1, &idx1, &run1);
-    }
-  }
-
-  /* DC coded, AC coeff 1 state fixed up/regenerated */
-
-  /* tie together eob runs at the beginnings/ends of coeff groups */
-  {
-    int coeff = 0;
-    int run = 0;
-    
-    for(i=0;i<BLOCK_SIZE;i++){
-      if(cpi->eob_pre[i]){
-	/* group begins with an EOB run */
-	
-	/* special case: the ongoing run + eob is at or over the max run size;
-	   we know the ongoing run is < 4095 or it would have been flushed already. */
-	if(run && (run&0x7fff) + cpi->eob_pre[i] >= 4095){ /* 1 */
-	  tokenize_eobrun(cpi,coeff,4095 | (run&0x8000),NULL);
-	  cpi->eob_pre[i] -= 4095-(run&0x7fff); 
-	  cpi->eob_ypre[i] -= 4095-(run&0x7fff); 
-	  run = 0;
-	  coeff = i;
-	}
-	
-	if(run){
-	  if(cpi->dct_token_count[i]){ /* 2 */
-	    /* group is not only an EOB run; emit the run token */
-	    tokenize_eobrun(cpi,coeff,run + cpi->eob_pre[i],NULL);
-	    cpi->eob_ypre[i] = 0;
-	    cpi->eob_pre[i] = 0;
-	    run = cpi->eob_run[i];
-	    coeff = i;
-	  }else{ /* 3 */
-	    /* group consists entirely of EOB run.  Add, iterate */
-	    run += cpi->eob_pre[i];
-	    cpi->eob_pre[i] = 0;
-	    cpi->eob_ypre[i] = 0;
-	  }
-	}else{
-	    
-	  if(cpi->dct_token_count[i]){
-	    /* there are other tokens in this group; work backwards as we need to prepend */
-	    while(cpi->eob_pre[i] >= 4095){ /* 4 */
-	      int lchroma = (cpi->eob_pre[i]-4095 >= cpi->eob_ypre[i]);
-	      tokenize_prepend_eobrun(cpi,lchroma,i,4095);
-	      if(!lchroma)cpi->dct_token_ycount[i]++;
-	      cpi->eob_pre[i] -= 4095;
-	    }
-	    if(cpi->eob_pre[i]){ /* 5 */
-	      int lchroma = (cpi->eob_ypre[i]<=0); /* possible when case 1 triggered */
-	      tokenize_prepend_eobrun(cpi, lchroma, i, cpi->eob_pre[i]);
-	      if(!lchroma)cpi->dct_token_ycount[i]++;
-	      cpi->eob_pre[i] = 0;
-	      cpi->eob_ypre[i] = 0;
-	    }
-	    run = cpi->eob_run[i];
-	    coeff = i;
-	  }else{
-	    /* group consists entirely of EOB run.  Add, flush overs, iterate */
-	    int lchroma = (cpi->eob_ypre[i]<=0);
-	    while(cpi->eob_pre[i] >= 4095){
-	      tokenize_eobrun(cpi,i,4095|(!lchroma<<15),NULL);
-	      if(!lchroma)cpi->dct_token_ycount[i]++;
-	      cpi->eob_pre[i] -= 4095;
-	      cpi->eob_ypre[i] -= 4095;
-	      lchroma = (cpi->eob_ypre[i]<=0);
-	    }
-	    run = cpi->eob_pre[i] | (!lchroma<<15);
-	    coeff = i;
-	    /* source is pre-run, so the eventual eob_emit_run also needs to increment ycount if coded into Y plane */
-	    if(!lchroma)cpi->dct_token_ycount[i]++;
-	  }
-	}
-      }else{
-	/* no eob run to begin group */
-	if(i==0 || cpi->dct_token_count[i]){
-	  if(run)
-	    tokenize_eobrun(cpi,coeff,run,NULL);
-	  
-	  run = cpi->eob_run[i];
-	  coeff = i;
-	}
-      }
-    }
-    
-    if(run)
-      tokenize_eobrun(cpi,coeff,run,NULL);
-    
-  }
-}

Deleted: branches/theora-thusnelda/lib/enc/encapiwrapper.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encapiwrapper.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/encapiwrapper.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -1,329 +0,0 @@
-#include <string.h>
-#include "theora/theoraenc.h"
-#include "theora/theora.h"
-#include "codec_internal.h"
-#include "mathops.h"
-#include "../dec/ocintrin.h"
-
-/*Wrapper to translate the new API into the old API.
-  Eventually we need to convert the old functions to support the new API
-   natively and do the translation the other way.
-  theora-exp already has the necessary code to do so.*/
-
-
-
-static void th_info2theora_info(theora_info *_ci,const th_info *_info){
-  _ci->version_major=_info->version_major;
-  _ci->version_minor=_info->version_minor;
-  _ci->version_subminor=_info->version_subminor;
-  _ci->width=_info->frame_width;
-  _ci->height=_info->frame_height;
-  _ci->frame_width=_info->pic_width;
-  _ci->frame_height=_info->pic_height;
-  _ci->offset_x=_info->pic_x;
-  _ci->offset_y=_info->pic_y;
-  _ci->fps_numerator=_info->fps_numerator;
-  _ci->fps_denominator=_info->fps_denominator;
-  _ci->aspect_numerator=_info->aspect_numerator;
-  _ci->aspect_denominator=_info->aspect_denominator;
-  switch(_info->colorspace){
-    case TH_CS_ITU_REC_470M:_ci->colorspace=OC_CS_ITU_REC_470M;break;
-    case TH_CS_ITU_REC_470BG:_ci->colorspace=OC_CS_ITU_REC_470BG;break;
-    default:_ci->colorspace=OC_CS_UNSPECIFIED;break;
-  }
-  switch(_info->pixel_fmt){
-    case TH_PF_420:_ci->pixelformat=OC_PF_420;break;
-    case TH_PF_422:_ci->pixelformat=OC_PF_422;break;
-    case TH_PF_444:_ci->pixelformat=OC_PF_444;break;
-    default:_ci->pixelformat=OC_PF_RSVD;
-  }
-  _ci->target_bitrate=_info->target_bitrate;
-  _ci->quality=_info->quality;
-  _ci->codec_setup=NULL;
-  /*Defaults from old encoder_example... eventually most of these should go
-     away when we make the encoder no longer use them.*/
-  _ci->dropframes_p=0;
-  _ci->keyframe_auto_p=1;
-  _ci->keyframe_frequency=1<<_info->keyframe_granule_shift;
-  _ci->keyframe_frequency_force=1<<_info->keyframe_granule_shift;
-  _ci->keyframe_data_target_bitrate=
-   _info->target_bitrate+(_info->target_bitrate>>1);
-  _ci->keyframe_auto_threshold=80;
-  _ci->keyframe_mindistance=8;
-  _ci->noise_sensitivity=1;
-  _ci->sharpness=0;
-  _ci->quick_p=1;
-}
-
-
-
-struct th_enc_ctx{
-  /*This is required at the start of the struct for the common functions to
-     work.*/
-  th_info        info;
-  /*The actual encoder.*/
-  theora_state   state;
-  /*A temporary buffer for input frames.
-    This is needed if the U and V strides differ, or padding is required.*/
-  unsigned char *buf;
-};
-
-
-th_enc_ctx *th_encode_alloc(const th_info *_info){
-  theora_info  ci;
-  th_enc_ctx  *enc;
-  th_info2theora_info(&ci,_info);
-  /*Do a bunch of checks the new API does, but the old one didn't.*/
-  if((_info->frame_width&0xF)||(_info->frame_height&0xF)||
-   _info->frame_width>=0x100000||_info->frame_height>=0x100000||
-   _info->pic_x+_info->pic_width>_info->frame_width||
-   _info->pic_y+_info->pic_height>_info->frame_height||
-   _info->pic_x>255||
-   _info->frame_height-_info->pic_height-_info->pic_y>255||
-   _info->colorspace<0||_info->colorspace>=TH_CS_NSPACES||
-   _info->pixel_fmt<0||_info->pixel_fmt>=TH_PF_NFORMATS){
-    enc=NULL;
-  }
-  else{
-    enc=(th_enc_ctx *)_ogg_malloc(sizeof(*enc));
-    if(theora_encode_init(&enc->state,&ci)<0){
-      _ogg_free(enc);
-      enc=NULL;
-    }
-    else{
-      if(_info->frame_width>_info->pic_width||
-       _info->frame_height>_info->pic_height){
-        enc->buf=_ogg_malloc((_info->frame_width*_info->frame_height+
-         ((_info->frame_width>>!(_info->pixel_fmt&1))*
-         (_info->frame_height>>!(_info->pixel_fmt&2))<<1))*sizeof(*enc->buf));
-      }
-      else enc->buf=NULL;
-      memcpy(&enc->info,_info,sizeof(enc->info));
-      /*Overwrite values theora_encode_init() can change; don't trust the user.*/
-      enc->info.version_major=ci.version_major;
-      enc->info.version_minor=ci.version_minor;
-      enc->info.version_subminor=ci.version_subminor;
-      enc->info.quality=ci.quality;
-      enc->info.target_bitrate=ci.target_bitrate;
-      enc->info.fps_numerator=ci.fps_numerator;
-      enc->info.fps_denominator=ci.fps_denominator;
-      enc->info.keyframe_granule_shift=
-       OC_ILOG_32(ci.keyframe_frequency_force-1);
-    }
-  }
-  return enc;
-}
-
-int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz){
-  return theora_control(&_enc->state,_req,_buf,_buf_sz);
-}
-
-int th_encode_flushheader(th_enc_ctx *_enc,th_comment *_comments,
- ogg_packet *_op){
-  theora_state *te;
-  CP_INSTANCE  *cpi;
-  if(_enc==NULL||_op==NULL)return OC_FAULT;
-  te=&_enc->state;
-  cpi=(CP_INSTANCE *)te->internal_encode;
-  switch(cpi->doneflag){
-    case -3:{
-      theora_encode_header(te,_op);
-      return -cpi->doneflag++;
-    }break;
-    case -2:{
-      if(_comments==NULL)return OC_FAULT;
-      theora_encode_comment((theora_comment *)_comments,_op);
-      /*The old API does not require a theora_state struct when writing the
-         comment header, so it can't use its internal buffer and relies on the
-         application to free it.
-        The old documentation is wrong on this subject, and this breaks on
-         Windows when linking against multiple versions of libc (which is
-         almost always done when, e.g., using DLLs built with mingw32).
-        The new API _does_ require a th_enc_ctx, and states that libtheora owns
-         the memory.
-        Thus we move the contents of this packet into our internal
-         oggpack_buffer so it can be properly reclaimed.*/
-      oggpackB_reset(cpi->oggbuffer);
-      oggpackB_writecopy(cpi->oggbuffer,_op->packet,_op->bytes*8);
-      _ogg_free(_op->packet);
-      _op->packet=oggpackB_get_buffer(cpi->oggbuffer);
-      return -cpi->doneflag++;
-    }break;
-    case -1:{
-      theora_encode_tables(te,_op);
-      return -cpi->doneflag++;
-    }break;
-    case 0:return 0;
-    default:return OC_EINVAL;
-  }
-}
-
-/*Copies the picture region of the _src image plane into _dst and pads the rest
-   of _dst using a diffusion extension method.
-  We could do much better (e.g., the DCT-based low frequency extension method
-   in theora-exp's fdct.c) if we were to pad after motion compensation, but
-   that would require significant changes to the encoder.*/
-static unsigned char *th_encode_copy_pad_plane(th_img_plane *_dst,
- unsigned char *_buf,th_img_plane *_src,
- ogg_uint32_t _pic_x,ogg_uint32_t _pic_y,
- ogg_uint32_t _pic_width,ogg_uint32_t _pic_height){
-  size_t buf_sz;
-  _dst->width=_src->width;
-  _dst->height=_src->height;
-  _dst->stride=_src->width;
-  _dst->data=_buf;
-  buf_sz=_dst->width*_dst->height*sizeof(*_dst->data);
-  /*If we have _no_ data, just encode a dull green.*/
-  if(_pic_width==0||_pic_height==0)memset(_dst->data,0,buf_sz);
-  else{
-    unsigned char *dst;
-    unsigned char *src;
-    ogg_uint32_t   x;
-    ogg_uint32_t   y;
-    int            dstride;
-    int            sstride;
-    /*Step 1: Copy the data we do have.*/
-    dstride=_dst->stride;
-    sstride=_src->stride;
-    dst=_dst->data+_pic_y*dstride+_pic_x;
-    src=_src->data+_pic_y*sstride+_pic_x;
-    for(y=0;y<_pic_height;y++){
-      memcpy(dst,src,_pic_width);
-      dst+=dstride;
-      src+=sstride;
-    }
-    /*Step 2: Copy the border into any blocks that are 100% padding.
-      There are probably smarter things we could do than this.*/
-    /*Left side.*/
-    for(x=_pic_x;x-->0;){
-      dst=_dst->data+_pic_y*dstride+x;
-      for(y=0;y<_pic_height;y++){
-        dst[0]=(dst[1]<<1)+(dst-(dstride&-(y>0)))[1]+
-         (dst+(dstride&-(y+1<_pic_height)))[1]+2>>2;
-        dst+=dstride;
-      }
-    }
-    /*Right side.*/
-    for(x=_pic_x+_pic_width;x<_dst->width;x++){
-      dst=_dst->data+_pic_y*dstride+x-1;
-      for(y=0;y<_pic_height;y++){
-        dst[1]=(dst[0]<<1)+(dst-(dstride&-(y>0)))[0]+
-         (dst+(dstride&-(y+1<_pic_height)))[0]+2>>2;
-        dst+=dstride;
-      }
-    }
-    /*Top.*/
-    dst=_dst->data+_pic_y*dstride;
-    for(y=_pic_y;y-->0;){
-      for(x=0;x<_dst->width;x++){
-        (dst-dstride)[x]=(dst[x]<<1)+dst[x-(x>0)]+dst[x+(x+1<_dst->width)]+2>>2;
-      }
-      dst-=dstride;
-    }
-    /*Bottom.*/
-    dst=_dst->data+(_pic_y+_pic_height)*dstride;
-    for(y=_pic_y+_pic_height;y<_dst->height;y++){
-      for(x=0;x<_dst->width;x++){
-        dst[x]=((dst-dstride)[x]<<1)+(dst-dstride)[x-(x>0)]+
-         (dst-dstride)[x+(x+1<_dst->width)]+2>>2;
-      }
-      dst+=dstride;
-    }
-  }
-  _buf+=buf_sz;
-  return _buf;
-}
-
-int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _ycbcr){
-  CP_INSTANCE     *cpi;
-  theora_state    *te;
-  th_img_plane    *pycbcr;
-  th_ycbcr_buffer  ycbcr;
-  yuv_buffer       yuv;
-  ogg_uint32_t     pic_width;
-  ogg_uint32_t     pic_height;
-  int              hdec;
-  int              vdec;
-  int              ret;
-  if(_enc==NULL||_ycbcr==NULL)return OC_FAULT;
-  te=&_enc->state;
-  /*theora_encode_YUVin() does not bother to check uv_width and uv_height, and
-     then uses them.
-    This is arguably okay (it will most likely lead to a crash if they're
-     wrong, which will make the developer who passed them fix the problem), but
-     our API promises to return an error code instead.*/
-  cpi=(CP_INSTANCE *)te->internal_encode;
-  hdec=!(cpi->info.pixelformat&1);
-  vdec=!(cpi->info.pixelformat&2);
-  if(_ycbcr[0].width!=cpi->info.width||
-   _ycbcr[0].height!=cpi->info.height||
-   _ycbcr[1].width!=_ycbcr[0].width>>hdec||
-   _ycbcr[1].height!=_ycbcr[0].height>>vdec||
-   _ycbcr[2].width!=_ycbcr[1].width||_ycbcr[2].height!=_ycbcr[1].height){
-    return OC_EINVAL;
-  }
-  pic_width=cpi->info.frame_width;
-  pic_height=cpi->info.frame_height;
-  /*We can only directly use the input buffer if no padding is required (since
-     the new API is documented not to use values outside the picture region)
-     and if the strides for the Cb and Cr planes are the same, since the old
-     API had no way to specify different ones.*/
-  if(_ycbcr[0].width==pic_width&&_ycbcr[0].height==pic_height&&
-   _ycbcr[1].stride==_ycbcr[2].stride){
-    pycbcr=_ycbcr;
-  }
-  else{
-    unsigned char *buf;
-    int            pic_x;
-    int            pic_y;
-    int            pli;
-    pic_x=cpi->info.offset_x;
-    pic_y=cpi->info.offset_y;
-    if(_ycbcr[0].width>pic_width||_ycbcr[0].height>pic_height){
-      buf=th_encode_copy_pad_plane(ycbcr+0,_enc->buf,_ycbcr+0,
-       pic_x,pic_y,pic_width,pic_height);
-    }
-    else{
-      /*If only the strides differ, we can still avoid copying the luma plane.*/
-      memcpy(ycbcr+0,_ycbcr+0,sizeof(ycbcr[0]));
-      if(_enc->buf==NULL){
-        _enc->buf=(unsigned char *)_ogg_malloc(
-         (_ycbcr[1].width*_ycbcr[1].height<<1)*sizeof(*_enc->buf));
-      }
-      buf=_enc->buf;
-    }
-    for(pli=1;pli<3;pli++){
-      int x0;
-      int y0;
-      x0=pic_x>>hdec;
-      y0=pic_y>>vdec;
-      buf=th_encode_copy_pad_plane(ycbcr+pli,buf,_ycbcr+pli,
-       x0,y0,(pic_x+pic_width+hdec>>hdec)-x0,(pic_y+pic_height+vdec>>vdec)-y0);
-    }
-    pycbcr=ycbcr;
-  }
-  yuv.y_width=pycbcr[0].width;
-  yuv.y_height=pycbcr[0].height;
-  yuv.uv_width=pycbcr[1].width;
-  yuv.uv_height=pycbcr[1].height;
-  yuv.y_stride=pycbcr[0].stride;
-  yuv.y=pycbcr[0].data;
-  yuv.uv_stride=pycbcr[1].stride;
-  yuv.u=pycbcr[1].data;
-  yuv.v=pycbcr[2].data;
-  ret=theora_encode_YUVin(te,&yuv);
-  return ret;
-}
-
-int th_encode_packetout(th_enc_ctx *_enc,int _last,ogg_packet *_op){
-  if(_enc==NULL)return OC_FAULT;
-  return theora_encode_packetout(&_enc->state,_last,_op);
-}
-
-void th_encode_free(th_enc_ctx *_enc){
-  if(_enc!=NULL){
-    theora_clear(&_enc->state);
-    _ogg_free(_enc->buf);
-    _ogg_free(_enc);
-  }
-}

Copied: branches/theora-thusnelda/lib/enc/encapiwrapper.c (from rev 15592, trunk/theora-exp/lib/encapiwrapper.c)
===================================================================
--- branches/theora-thusnelda/lib/enc/encapiwrapper.c	                        (rev 0)
+++ branches/theora-thusnelda/lib/enc/encapiwrapper.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -0,0 +1,154 @@
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "../dec/apiwrapper.h"
+#include "encint.h"
+#include "theora/theoraenc.h"
+
+
+
+/*Releases the encoder context owned by an API wrapper (if there is one) and
+   then zeroes the whole wrapper.*/
+static void th_enc_api_clear(th_api_wrapper *_api){
+  if(_api->encode!=NULL){
+    th_encode_free(_api->encode);
+  }
+  memset(_api,0,sizeof(*_api));
+}
+
+/*Clear callback stored in the dispatch table: clears the attached theora_info
+   (when one is present), then zeroes the theora_state itself.*/
+static void theora_encode_clear(theora_state *_te){
+  theora_info *info;
+  info=_te->i;
+  if(info!=NULL){
+    theora_info_clear(info);
+  }
+  memset(_te,0,sizeof(*_te));
+}
+
+/*Control callback stored in the dispatch table: forwards the request to
+   th_encode_ctl() on the encoder held by the API wrapper.*/
+static int theora_encode_control(theora_state *_te,int _req,
+ void *_buf,size_t _buf_sz){
+  th_api_wrapper *api;
+  api=(th_api_wrapper *)_te->i->codec_setup;
+  return th_encode_ctl(api->encode,_req,_buf,_buf_sz);
+}
+
+/*Granule-to-frame callback stored in the dispatch table: forwards to
+   th_granule_frame() using the encoder held by the API wrapper.*/
+static ogg_int64_t theora_encode_granule_frame(theora_state *_te,
+ ogg_int64_t _gp){
+  th_api_wrapper *api;
+  api=(th_api_wrapper *)_te->i->codec_setup;
+  return th_granule_frame(api->encode,_gp);
+}
+
+/*Granule-to-time callback stored in the dispatch table: forwards to
+   th_granule_time() using the encoder held by the API wrapper.*/
+static double theora_encode_granule_time(theora_state *_te,ogg_int64_t _gp){
+  th_api_wrapper *api;
+  api=(th_api_wrapper *)_te->i->codec_setup;
+  return th_granule_time(api->encode,_gp);
+}
+
+/*The encoder's dispatch table for the legacy API.
+  theora_encode_init() stores a pointer to this table in
+   theora_state::internal_encode.
+  The entries are, in order, the clear, control, granule-to-frame and
+   granule-to-time callbacks defined above.*/
+static const oc_state_dispatch_vtable OC_ENC_DISPATCH_VTBL={
+  (oc_state_clear_func)theora_encode_clear,
+  (oc_state_control_func)theora_encode_control,
+  (oc_state_granule_frame_func)theora_encode_granule_frame,
+  (oc_state_granule_time_func)theora_encode_granule_time,
+};
+
+/*Initializes an encoder instance for the legacy API.
+  _te: The theora_state to initialize.
+  _ci: The encoding parameters.
+       A private copy is made, so the caller's struct does not have to outlive
+        this call.
+  Return: 0 on success, TH_EFAULT if the wrapper could not be allocated, or
+   OC_EINVAL if th_encode_alloc() failed to create an encoder.*/
+int theora_encode_init(theora_state *_te,theora_info *_ci){
+  th_api_info *apiinfo;
+  th_info      info;
+  /*Allocate our own combined API wrapper/theora_info struct.
+    We put them both in one malloc'd block so that when the API wrapper is
+     freed, the info struct goes with it.
+    This avoids having to figure out whether or not we need to free the info
+     struct in either theora_info_clear() or theora_clear().*/
+  apiinfo=(th_api_info *)_ogg_malloc(sizeof(*apiinfo));
+  /*Bail out before touching the block if the allocation failed.*/
+  if(apiinfo==NULL)return TH_EFAULT;
+  /*Make our own copy of the info struct, since its lifetime should be
+     independent of the one we were passed in.*/
+  apiinfo->info=*_ci;
+  oc_theora_info2th_info(&info,_ci);
+  apiinfo->api.encode=th_encode_alloc(&info);
+  if(apiinfo->api.encode==NULL){
+    _ogg_free(apiinfo);
+    return OC_EINVAL;
+  }
+  apiinfo->api.clear=(oc_setup_clear_func)th_enc_api_clear;
+  /*Provide entry points for ABI compatibility with old decoder shared libs.*/
+  _te->internal_encode=(void *)&OC_ENC_DISPATCH_VTBL;
+  _te->internal_decode=NULL;
+  _te->granulepos=0;
+  _te->i=&apiinfo->info;
+  _te->i->codec_setup=&apiinfo->api;
+  /*TODO: Additional codec setup using the extra fields in theora_info.*/
+  return 0;
+}
+
+/*Legacy-API frame submission: repackages a yuv_buffer as a th_ycbcr_buffer
+   and hands it to th_encode_ycbcr_in().
+  When that call does not fail, the wrapped encoder's granule position is
+   mirrored into _te->granulepos.
+  Return: The value returned by th_encode_ycbcr_in().*/
+int theora_encode_YUVin(theora_state *_te,yuv_buffer *_yuv){
+  th_ycbcr_buffer  planes;
+  th_api_wrapper  *wrapper;
+  int              pli;
+  int              err;
+  wrapper=(th_api_wrapper *)_te->i->codec_setup;
+  /*Luma plane.*/
+  planes[0].width=_yuv->y_width;
+  planes[0].height=_yuv->y_height;
+  planes[0].stride=_yuv->y_stride;
+  planes[0].data=_yuv->y;
+  /*The old API describes both chroma planes with one set of dimensions and
+     one stride.*/
+  for(pli=1;pli<3;pli++){
+    planes[pli].width=_yuv->uv_width;
+    planes[pli].height=_yuv->uv_height;
+    planes[pli].stride=_yuv->uv_stride;
+  }
+  planes[1].data=_yuv->u;
+  planes[2].data=_yuv->v;
+  err=th_encode_ycbcr_in(wrapper->encode,planes);
+  if(err>=0)_te->granulepos=wrapper->encode->state.granpos;
+  return err;
+}
+
+/*Legacy-API packet retrieval: forwards to th_encode_packetout() on the encoder
+   held by the API wrapper and returns its result unchanged.*/
+int theora_encode_packetout(theora_state *_te,int _last_p,ogg_packet *_op){
+  oc_enc_ctx *enc;
+  enc=((th_api_wrapper *)_te->i->codec_setup)->encode;
+  return th_encode_packetout(enc,_last_p,_op);
+}
+
+/*Emits the info header packet through the legacy API.
+  Return: 0 on success, TH_EINVAL if encoding has already started, or the
+   negative value returned by th_encode_flushheader().*/
+int theora_encode_header(theora_state *_te,ogg_packet *_op){
+  oc_enc_ctx *enc;
+  int         ret;
+  enc=((th_api_wrapper *)_te->i->codec_setup)->encode;
+  /*If we've already started encoding, fail.*/
+  if(enc->state.granpos!=0||enc->packet_state>OC_PACKET_EMPTY)return TH_EINVAL;
+  /*Rewind the packet state machine so that the next header flushed is the
+     info header.*/
+  enc->packet_state=OC_PACKET_INFO_HDR;
+  ret=th_encode_flushheader(enc,NULL,_op);
+  if(ret<0)return ret;
+  return 0;
+}
+
+/*Encodes a comment header packet for the legacy API.
+  _tc: The comments to encode.
+  _op: Returns the packet.
+       On success, _op->packet points at a copy of the packet data allocated
+        with _ogg_malloc(); on allocation failure it is set to NULL.
+  Return: 0 on success, TH_EFAULT if the copy could not be allocated, or the
+   negative value returned by oc_state_flushheader().*/
+int theora_encode_comment(theora_comment *_tc,ogg_packet *_op){
+  oggpack_buffer  opb;
+  void           *buf;
+  int             packet_state;
+  int             ret;
+  packet_state=OC_PACKET_COMMENT_HDR;
+  oggpackB_writeinit(&opb);
+  /*The comment header embeds a vendor string and oc_state_flushheader() calls
+     strlen() on it, so a real string has to be supplied here, not NULL.*/
+  ret=oc_state_flushheader(NULL,&packet_state,&opb,NULL,NULL,
+   th_version_string(),(th_comment *)_tc,_op);
+  if(ret>=0){
+    /*The oggpack_buffer's lifetime ends with this function, so we have to
+       copy out the packet contents.
+      Presumably the application knows it is supposed to free this.
+      This part works nothing like the Vorbis API, and the documentation on it
+       has been wrong for some time, claiming libtheora owned the memory.*/
+    buf=_ogg_malloc(_op->bytes);
+    if(buf==NULL){
+      /*Do not hand back a pointer into the buffer we are about to free.*/
+      _op->packet=NULL;
+      ret=TH_EFAULT;
+    }
+    else{
+      memcpy(buf,_op->packet,_op->bytes);
+      _op->packet=buf;
+      ret=0;
+    }
+  }
+  oggpackB_writeclear(&opb);
+  return ret;
+}
+
+/*Emits the setup (tables) header packet through the legacy API.
+  Return: 0 on success, TH_EINVAL if encoding has already started, or the
+   negative value returned by th_encode_flushheader().*/
+int theora_encode_tables(theora_state *_te,ogg_packet *_op){
+  oc_enc_ctx *enc;
+  int         ret;
+  enc=((th_api_wrapper *)_te->i->codec_setup)->encode;
+  /*If we've already started encoding, fail.*/
+  if(enc->state.granpos!=0||enc->packet_state>OC_PACKET_EMPTY)return TH_EINVAL;
+  /*Rewind the packet state machine so that the next header flushed is the
+     setup header.*/
+  enc->packet_state=OC_PACKET_SETUP_HDR;
+  ret=th_encode_flushheader(enc,NULL,_op);
+  if(ret<0)return ret;
+  return 0;
+}

Modified: branches/theora-thusnelda/lib/enc/encfrag.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encfrag.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/encfrag.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -16,12 +16,12 @@
  ********************************************************************/
 #include <stdlib.h>
 #include <string.h>
-#include "codec_internal.h"
+#include "encint.h"
 
 
-void oc_enc_frag_sub(const CP_INSTANCE *_cpi,ogg_int16_t _diff[64],
+void oc_enc_frag_sub(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
  const unsigned char *_src,const unsigned char *_ref,int _ystride){
-  (*_cpi->opt_vtable.frag_sub)(_diff,_src,_ref,_ystride);
+  (*_enc->opt_vtable.frag_sub)(_diff,_src,_ref,_ystride);
 }
 
 void oc_enc_frag_sub_c(ogg_int16_t _diff[64],const unsigned char *_src,
@@ -35,9 +35,9 @@
   }
 }
 
-void oc_enc_frag_sub_128(const CP_INSTANCE *_cpi,ogg_int16_t _diff[64],
+void oc_enc_frag_sub_128(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
  const unsigned char *_src,int _ystride){
-  (*_cpi->opt_vtable.frag_sub_128)(_diff,_src,_ystride);
+  (*_enc->opt_vtable.frag_sub_128)(_diff,_src,_ystride);
 }
 
 void oc_enc_frag_sub_128_c(ogg_int16_t *_diff,
@@ -50,9 +50,9 @@
   }
 }
 
-unsigned oc_enc_frag_sad(const CP_INSTANCE *_cpi,const unsigned char *_x,
+unsigned oc_enc_frag_sad(const oc_enc_ctx *_enc,const unsigned char *_x,
  const unsigned char *_y,int _ystride){
-  return (*_cpi->opt_vtable.frag_sad)(_x,_y,_ystride);
+  return (*_enc->opt_vtable.frag_sad)(_x,_y,_ystride);
 }
 
 unsigned oc_enc_frag_sad_c(const unsigned char *_src,
@@ -69,10 +69,10 @@
   return sad;
 }
 
-unsigned oc_enc_frag_sad_thresh(const CP_INSTANCE *_cpi,
+unsigned oc_enc_frag_sad_thresh(const oc_enc_ctx *_enc,
  const unsigned char *_src,const unsigned char *_ref,int _ystride,
  unsigned _thresh){
-  return (*_cpi->opt_vtable.frag_sad_thresh)(_src,_ref,_ystride,_thresh);
+  return (*_enc->opt_vtable.frag_sad_thresh)(_src,_ref,_ystride,_thresh);
 }
 
 unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
@@ -90,10 +90,10 @@
   return sad;
 }
 
-unsigned oc_enc_frag_sad2_thresh(const CP_INSTANCE *_cpi,
+unsigned oc_enc_frag_sad2_thresh(const oc_enc_ctx *_enc,
  const unsigned char *_src,const unsigned char *_ref1,
  const unsigned char *_ref2,int _ystride,unsigned _thresh){
-  return (*_cpi->opt_vtable.frag_sad2_thresh)(_src,_ref1,_ref2,_ystride,
+  return (*_enc->opt_vtable.frag_sad2_thresh)(_src,_ref1,_ref2,_ystride,
    _thresh);
 }
 
@@ -320,10 +320,10 @@
   return sad;
 }
 
-unsigned oc_enc_frag_satd_thresh(const CP_INSTANCE *_cpi,
+unsigned oc_enc_frag_satd_thresh(const oc_enc_ctx *_enc,
  const unsigned char *_src,const unsigned char *_ref,int _ystride,
  unsigned _thresh){
-  return (*_cpi->opt_vtable.frag_satd_thresh)(_src,_ref,_ystride,_thresh);
+  return (*_enc->opt_vtable.frag_satd_thresh)(_src,_ref,_ystride,_thresh);
 }
 
 unsigned oc_enc_frag_satd_thresh_c(const unsigned char *_src,
@@ -334,10 +334,10 @@
    -abs(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
 }
 
-unsigned oc_enc_frag_satd2_thresh(const CP_INSTANCE *_cpi,
+unsigned oc_enc_frag_satd2_thresh(const oc_enc_ctx *_enc,
  const unsigned char *_src,const unsigned char *_ref1,
  const unsigned char *_ref2,int _ystride,unsigned _thresh){
-  return (*_cpi->opt_vtable.frag_satd2_thresh)(_src,_ref1,_ref2,_ystride,
+  return (*_enc->opt_vtable.frag_satd2_thresh)(_src,_ref1,_ref2,_ystride,
    _thresh);
 }
 
@@ -349,9 +349,9 @@
   return oc_hadamard_sad_thresh(buf,_thresh);
 }
 
-unsigned oc_enc_frag_intra_satd(const CP_INSTANCE *_cpi,
+unsigned oc_enc_frag_intra_satd(const oc_enc_ctx *_enc,
  const unsigned char *_src,int _ystride){
-  return (*_cpi->opt_vtable.frag_intra_satd)(_src,_ystride);
+  return (*_enc->opt_vtable.frag_intra_satd)(_src,_ystride);
 }
 
 unsigned oc_enc_frag_intra_satd_c(const unsigned char *_src,int _ystride){
@@ -360,14 +360,9 @@
   return oc_hadamard_sad_thresh(buf,0xFF000);
 }
 
-void oc_enc_frag_copy(const CP_INSTANCE *_cpi,unsigned char *_dst,
- const unsigned char *_src,int _ystride){
-  (*_cpi->opt_vtable.frag_copy)(_dst,_src,_ystride);
-}
-
-void oc_enc_frag_copy2(const CP_INSTANCE *_cpi,unsigned char *_dst,
+void oc_enc_frag_copy2(const oc_enc_ctx *_enc,unsigned char *_dst,
  const unsigned char *_src1,const unsigned char *_src2,int _ystride){
-  (*_cpi->opt_vtable.frag_copy2)(_dst,_src1,_src2,_ystride);
+  (*_enc->opt_vtable.frag_copy2)(_dst,_src1,_src2,_ystride);
 }
 
 void oc_enc_frag_copy2_c(unsigned char *_dst,
@@ -382,16 +377,12 @@
   }
 }
 
-void oc_enc_frag_recon_intra(const CP_INSTANCE *_cpi,
+void oc_enc_frag_recon_intra(const oc_enc_ctx *_enc,
  unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]){
-  (*_cpi->opt_vtable.frag_recon_intra)(_dst,_ystride,_residue);
+  (*_enc->opt_vtable.frag_recon_intra)(_dst,_ystride,_residue);
 }
 
-void oc_enc_frag_recon_inter(const CP_INSTANCE *_cpi,unsigned char *_dst,
+void oc_enc_frag_recon_inter(const oc_enc_ctx *_enc,unsigned char *_dst,
  const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
-  (*_cpi->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue);
+  (*_enc->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue);
 }
-
-void oc_enc_restore_fpu(const CP_INSTANCE *_cpi){
-  (*_cpi->opt_vtable.restore_fpu)();
-}

Copied: branches/theora-thusnelda/lib/enc/encinfo.c (from rev 15592, trunk/theora-exp/lib/encinfo.c)
===================================================================
--- branches/theora-thusnelda/lib/enc/encinfo.c	                        (rev 0)
+++ branches/theora-thusnelda/lib/enc/encinfo.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -0,0 +1,116 @@
+#include <stdlib.h>
+#include <string.h>
+#include "internal.h"
+#include "enquant.h"
+#include "huffenc.h"
+
+
+
+/*Appends a run of octets from a byte array to the pack buffer, 8 bits at a
+   time.
+  _opb: The pack buffer that receives the octets.
+  _buf: The byte array to read the octets from.
+  _len: How many octets to append; nothing is written if it is not
+         positive.*/
+static void oc_pack_octets(oggpack_buffer *_opb,const char *_buf,int _len){
+  while(_len-->0)oggpackB_write(_opb,*_buf++,8);
+}
+
+
+
+/*Writes the header packet selected by *_packet_state into _opb and describes
+   it in _op.
+  _state:        Supplies the th_info written to the info header; only read
+                  when *_packet_state is OC_PACKET_INFO_HDR.
+  _packet_state: Selects which header to emit, and is advanced to the next
+                  state when a packet is produced.
+  _opb:          The pack buffer the packet is assembled in; it is reset
+                  first.
+  _qinfo:        The quantization parameters written to the setup header.
+  _codes:        The Huffman codes written to the setup header.
+  _vendor:       The vendor string written to the comment header.
+  _tc:           The comments written to the comment header.
+  _op:           Returns the packet; its data pointer refers to _opb's
+                  storage.
+  Return: TH_EFAULT if _op is NULL or an input required by the selected header
+   is NULL, the negative value returned by oc_huff_codes_pack() if it fails,
+   0 if *_packet_state does not select a header, and otherwise the advanced
+   *_packet_state plus 3.*/
+int oc_state_flushheader(oc_theora_state *_state,int *_packet_state,
+ oggpack_buffer *_opb,const th_quant_info *_qinfo,
+ const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS],
+ const char *_vendor,th_comment *_tc,ogg_packet *_op){
+  if(_op==NULL)return TH_EFAULT;
+  switch(*_packet_state){
+    /*Codec info header.*/
+    case OC_PACKET_INFO_HDR:{
+      if(_state==NULL)return TH_EFAULT;
+      oggpackB_reset(_opb);
+      /*Mark this packet as the info header.*/
+      oggpackB_write(_opb,0x80,8);
+      /*Write the codec string.*/
+      oc_pack_octets(_opb,"theora",6);
+      /*Write the codec bitstream version.*/
+      oggpackB_write(_opb,TH_VERSION_MAJOR,8);
+      oggpackB_write(_opb,TH_VERSION_MINOR,8);
+      oggpackB_write(_opb,TH_VERSION_SUB,8);
+      /*Describe the encoded frame.*/
+      oggpackB_write(_opb,_state->info.frame_width>>4,16);
+      oggpackB_write(_opb,_state->info.frame_height>>4,16);
+      oggpackB_write(_opb,_state->info.pic_width,24);
+      oggpackB_write(_opb,_state->info.pic_height,24);
+      oggpackB_write(_opb,_state->info.pic_x,8);
+      oggpackB_write(_opb,_state->info.frame_height-
+       _state->info.pic_height-_state->info.pic_y,8);
+      oggpackB_write(_opb,_state->info.fps_numerator,32);
+      oggpackB_write(_opb,_state->info.fps_denominator,32);
+      oggpackB_write(_opb,_state->info.aspect_numerator,24);
+      oggpackB_write(_opb,_state->info.aspect_denominator,24);
+      oggpackB_write(_opb,_state->info.colorspace,8);
+      oggpackB_write(_opb,_state->info.target_bitrate,24);
+      oggpackB_write(_opb,_state->info.quality,6);
+      oggpackB_write(_opb,_state->info.keyframe_granule_shift,5);
+      oggpackB_write(_opb,_state->info.pixel_fmt,2);
+      /*Spare configuration bits.*/
+      oggpackB_write(_opb,0,3);
+      _op->b_o_s=1;
+    }break;
+    /*Comment header.*/
+    case OC_PACKET_COMMENT_HDR:{
+      int vendor_len;
+      int i;
+      /*Both the comments and the vendor string are dereferenced below, so
+         reject a missing one instead of crashing in strlen().*/
+      if(_tc==NULL||_vendor==NULL)return TH_EFAULT;
+      vendor_len=strlen(_vendor);
+      oggpackB_reset(_opb);
+      /*Mark this packet as the comment header.*/
+      oggpackB_write(_opb,0x81,8);
+      /*Write the codec string.*/
+      oc_pack_octets(_opb,"theora",6);
+      /*Write the vendor string.*/
+      oggpack_write(_opb,vendor_len,32);
+      oc_pack_octets(_opb,_vendor,vendor_len);
+      oggpack_write(_opb,_tc->comments,32);
+      for(i=0;i<_tc->comments;i++){
+        if(_tc->user_comments[i]!=NULL){
+          oggpack_write(_opb,_tc->comment_lengths[i],32);
+          oc_pack_octets(_opb,_tc->user_comments[i],_tc->comment_lengths[i]);
+        }
+        else oggpack_write(_opb,0,32);
+      }
+      _op->b_o_s=0;
+    }break;
+    /*Codec setup header.*/
+    case OC_PACKET_SETUP_HDR:{
+      int ret;
+      oggpackB_reset(_opb);
+      /*Mark this packet as the setup header.*/
+      oggpackB_write(_opb,0x82,8);
+      /*Write the codec string.*/
+      oc_pack_octets(_opb,"theora",6);
+      /*Write the quantizer tables.*/
+      oc_quant_params_pack(_opb,_qinfo);
+      /*Write the huffman codes.*/
+      ret=oc_huff_codes_pack(_opb,_codes);
+      /*This should never happen, because we validate the tables when they
+         are set.
+        If you see this, there's a good chance memory is being corrupted.*/
+      if(ret<0)return ret;
+      _op->b_o_s=0;
+    }break;
+    /*No more headers to emit.*/
+    default:return 0;
+  }
+  /*This is kind of fugly: we hand the user a buffer which they do not own.
+    We will overwrite it when the next packet is output, so the user better be
+     done with it by then.
+    Vorbis is little better: it hands back buffers that it will free the next
+     time the headers are requested, or when the encoder is cleared.
+    Hopefully libogg2 will make this much cleaner.*/
+  _op->packet=oggpackB_get_buffer(_opb);
+  _op->bytes=oggpackB_bytes(_opb);
+  _op->e_o_s=0;
+  _op->granulepos=0;
+  _op->packetno=*_packet_state+3;
+  return ++(*_packet_state)+3;
+}

Copied: branches/theora-thusnelda/lib/enc/encint.h (from rev 16019, branches/theora-thusnelda/lib/enc/codec_internal.h)
===================================================================
--- branches/theora-thusnelda/lib/enc/encint.h	                        (rev 0)
+++ branches/theora-thusnelda/lib/enc/encint.h	2009-06-13 16:04:06 UTC (rev 16102)
@@ -0,0 +1,404 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id$
+
+ ********************************************************************/
+#if !defined(_encint_H)
+# define _encint_H (1)
+# if defined(HAVE_CONFIG_H)
+#  include "config.h"
+# endif
+# include "theora/theoraenc.h"
+# include "../internal.h"
+# include "../dec/ocintrin.h"
+# include "mathops.h"
+# include "enquant.h"
+# include "huffenc.h"
+/*# define OC_COLLECT_METRICS*/
+
+
+
+typedef struct oc_enc_opt_vtable      oc_enc_opt_vtable;
+typedef struct oc_mcenc_ctx           oc_mcenc_ctx;
+typedef struct oc_mb_enc_info         oc_mb_enc_info;
+typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
+typedef struct oc_rc_state            oc_rc_state;
+typedef struct th_enc_ctx             oc_enc_ctx;
+typedef struct oc_token_checkpoint    oc_token_checkpoint;
+
+
+
+/*Constants for the packet-out state machine specific to the encoder.*/
+
+/*Next packet to emit: Data packet, but none are ready yet.*/
+#define OC_PACKET_EMPTY (0)
+/*Next packet to emit: Data packet, and one is ready.*/
+#define OC_PACKET_READY (1)
+
+
+
+/*The bits used for each of the MB mode codebooks.*/
+extern const unsigned char OC_MODE_BITS[2][OC_NMODES];
+
+/*The bits used for each of the MV codebooks.*/
+extern const unsigned char OC_MV_BITS[2][64];
+
+/*The minimum value that can be stored in a SB run for each codeword.
+  The last entry is the upper bound on the length of a single SB run.*/
+extern const ogg_uint16_t  OC_SB_RUN_VAL_MIN[8];
+/*The bits used for each SB run codeword.*/
+extern const unsigned char OC_SB_RUN_CODE_NBITS[7];
+
+/*The bits used for each block run length (starting with 1).*/
+extern const unsigned char OC_BLOCK_RUN_CODE_NBITS[30];
+
+
+
+/*Encoder specific functions with accelerated variants.
+  The frag_* entries are called through the oc_enc_frag_*() wrappers in
+   encfrag.c, which forward their arguments (minus the encoder context) to the
+   pointer stored here.*/
+struct oc_enc_opt_vtable{
+  unsigned (*frag_sad)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride);
+  unsigned (*frag_sad_thresh)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride,unsigned _thresh);
+  unsigned (*frag_sad2_thresh)(const unsigned char *_src,
+   const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+   unsigned _thresh);
+  unsigned (*frag_satd_thresh)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride,unsigned _thresh);
+  unsigned (*frag_satd2_thresh)(const unsigned char *_src,
+   const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+   unsigned _thresh);
+  unsigned (*frag_intra_satd)(const unsigned char *_src,int _ystride);
+  void     (*frag_sub)(ogg_int16_t _diff[64],const unsigned char *_src,
+   const unsigned char *_ref,int _ystride);
+  void     (*frag_sub_128)(ogg_int16_t _diff[64],
+   const unsigned char *_src,int _ystride);
+  void     (*frag_copy2)(unsigned char *_dst,
+   const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+  void     (*frag_recon_intra)(unsigned char *_dst,int _ystride,
+   const ogg_int16_t _residue[64]);
+  void     (*frag_recon_inter)(unsigned char *_dst,
+   const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
+  /*Presumably the forward 8x8 DCT, reading _x and writing _y; no caller is
+     visible in this header -- TODO confirm.*/
+  void     (*fdct8x8)(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+};
+
+
+void oc_enc_vtable_init(oc_enc_ctx *_enc);
+
+
+
+/*Encoder-specific macroblock information.
+  The encoder context refers to these records through th_enc_ctx::mb_info.*/
+struct oc_mb_enc_info{
+  /*Neighboring macro blocks that have MVs available from the current frame.*/
+  unsigned      cneighbors[4];
+  /*Neighboring macro blocks to use for MVs from the previous frame.*/
+  unsigned      pneighbors[4];
+  /*The number of current-frame neighbors.*/
+  unsigned char ncneighbors;
+  /*The number of previous-frame neighbors.*/
+  unsigned char npneighbors;
+  /*Flags indicating which MB modes have been refined.*/
+  unsigned char refined;
+  /*Motion vectors for a macro block for the current frame and the
+     previous two frames.
+    Each is a set of 2 vectors against OC_FRAME_GOLD and OC_FRAME_PREV, which
+     can be used to estimate constant velocity and constant acceleration
+     predictors.
+    Uninitialized MVs are (0,0).*/
+  oc_mv         analysis_mv[3][2]; /* [cur,prev,prev2][frame,golden] */
+  /*Current unrefined analysis MVs.*/
+  oc_mv         unref_mv[2];
+  /*Unrefined block MVs.*/
+  oc_mv         block_mv[4];
+  /*Refined block MVs.*/
+  oc_mv         ref_mv[4];
+  /*Minimum motion estimation error from the analysis stage.*/
+  ogg_uint16_t  error[2];
+  /*MB error for half-pel refinement for each frame type.*/
+  unsigned      satd[2];
+  /*Block error for half-pel refinement.*/
+  unsigned      block_satd[4];
+};
+
+
+
+/*State machine to estimate the opportunity cost of coding a MB mode.*/
+struct oc_mode_scheme_chooser{
+  /*Pointers to a list containing the index of each mode in the mode
+     alphabet used by each scheme.
+    The first entry points to the dynamic scheme0_ranks, while the remaining 7
+     point to the constant entries stored in OC_MODE_SCHEMES.*/
+  const unsigned char *mode_ranks[8];
+  /*The ranks for each mode when coded with scheme 0.
+    These are optimized so that the more frequent modes have lower ranks.*/
+  unsigned char        scheme0_ranks[OC_NMODES];
+  /*The list of modes, sorted in descending order of frequency, that
+     corresponds to the ranks above.*/
+  unsigned char        scheme0_list[OC_NMODES];
+  /*The number of times each mode has been chosen so far.*/
+  int                  mode_counts[OC_NMODES];
+  /*The list of mode coding schemes, sorted in ascending order of bit cost.*/
+  unsigned char        scheme_list[8];
+  /*The number of bits used by each mode coding scheme.*/
+  int                  scheme_bits[8];
+};
+
+
+void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser);
+
+
+
+/*Rate control state information.
+  The encoder context embeds one of these as th_enc_ctx::rc.*/
+struct oc_rc_state{
+  /*The target average bits per frame.*/
+  ogg_int64_t  bits_per_frame;
+  /*The current buffer fullness (bits available to be used).*/
+  ogg_int64_t  fullness;
+  /*The target buffer fullness.
+    This is where we'd like to be by the last keyframe that appears in the
+     next buf_delay frames.*/
+  ogg_int64_t  target;
+  /*The maximum buffer fullness (total size of the buffer).*/
+  ogg_int64_t  max;
+  /*The log of the number of pixels in a frame in Q57 format.*/
+  ogg_int64_t  log_npixels;
+  /*The exponent used in the rate model in Q8 format.*/
+  unsigned     exp[2];
+  /*The number of frames to distribute the buffer usage over.*/
+  int          buf_delay;
+  /*The total drop count from the previous frame.
+    This includes duplicates explicitly requested via the
+     TH_ENCCTL_SET_DUP_COUNT API as well as frames we chose to drop ourselves.*/
+  ogg_uint32_t prev_drop_count;
+  /*The log of an estimated scale factor used to obtain the real framerate, for
+     VFR sources or, e.g., 12 fps content doubled to 24 fps, etc.*/
+  ogg_int64_t  log_drop_scale;
+  /*The log of the estimated scale factor for the rate model in Q57 format.*/
+  ogg_int64_t  log_scale[2];
+  /*The log of the target quantizer level in Q57 format.*/
+  ogg_int64_t  log_qtarget;
+};
+
+
+void oc_enc_calc_lambda(oc_enc_ctx *_enc,int _frame_type);
+void oc_rc_state_init(oc_rc_state *_rc,const oc_enc_ctx *_enc);
+void oc_enc_update_rc_state(oc_enc_ctx *_enc,
+ long _bits,int _qti,int _qi,int _trial);
+int oc_enc_select_qi(oc_enc_ctx *_enc,int _qti,int _clamp);
+
+
+
+/*The internal encoder state.
+  This is the struct behind both the public th_enc_ctx handle and the internal
+   oc_enc_ctx typedef.*/
+struct th_enc_ctx{
+  /*Shared encoder/decoder state.*/
+  oc_theora_state          state;
+  /*Buffer in which to assemble packets.*/
+  oggpack_buffer           opb;
+  /*Encoder-specific macroblock information.*/
+  oc_mb_enc_info          *mb_info;
+  /*DC coefficients after prediction.*/
+  ogg_int16_t             *frag_dc;
+  /*The list of coded macro blocks, in coded order.*/
+  unsigned                *coded_mbis;
+  /*The number of coded macro blocks.*/
+  size_t                   ncoded_mbis;
+  /*Whether or not packets are ready to be emitted.
+    This takes on negative values while there are remaining header packets to
+     be emitted, reaches 0 when the codec is ready for input, and becomes
+     positive when a frame has been processed and data packets are ready.*/
+  int                      packet_state;
+  /*The maximum distance between keyframes.*/
+  ogg_uint32_t             keyframe_frequency_force;
+  /*The number of duplicates to produce for the next frame.*/
+  ogg_uint32_t             dup_count;
+  /*The number of duplicates remaining to be emitted for the current frame.*/
+  ogg_uint32_t             nqueued_dups;
+  /*The number of duplicates emitted for the last frame.*/
+  ogg_uint32_t             prev_dup_count;
+  /*Whether or not VP3 compatibility mode has been enabled.*/
+  unsigned char            vp3_compatible;
+  /*Whether or not any INTER frames have been coded.*/
+  unsigned char            coded_inter_frame;
+  /*Stores most recently chosen Huffman tables for each frame type, DC and AC
+     coefficients, and luma and chroma tokens.
+    The actual Huffman table used for a given coefficient depends not only on
+     the choice made here, but also its index in the zig-zag ordering.*/
+  unsigned char            huff_idxs[2][2][2];
+  /*Current count of bits used by each MV coding mode.*/
+  size_t                   mv_bits[2];
+  /*The mode scheme chooser for estimating mode coding costs.*/
+  oc_mode_scheme_chooser   chooser;
+  /*The DCT token lists for each coefficient and each plane.*/
+  unsigned char          **dct_tokens[3];
+  /*The extra bits associated with each DCT token.*/
+  ogg_uint16_t           **extra_bits[3];
+  /*The number of DCT tokens for each coefficient for each plane.*/
+  ptrdiff_t                ndct_tokens[3][64];
+  /*Pending EOB runs for each coefficient for each plane.*/
+  ogg_uint16_t             eob_run[3][64];
+  /*The offset of the first DCT token for each coefficient for each plane.*/
+  unsigned char            dct_token_offs[3][64];
+  /*The last DC coefficient for each plane and reference frame.*/
+  int                      dc_pred_last[3][3];
+#if defined(OC_COLLECT_METRICS)
+  /*Fragment SATD statistics for MB mode estimation metrics.*/
+  int                     *frag_satd;
+  /*Fragment SSD statistics for MB mode estimation metrics.*/
+  int                     *frag_ssd;
+#endif
+  /*The R-D optimization parameter.*/
+  int                      lambda;
+  /*The huffman tables in use.*/
+  th_huff_code             huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
+  /*The quantization parameters in use.*/
+  th_quant_info            qinfo;
+  /*NOTE(review): presumably the encoder-side quantization tables derived from
+     qinfo, with enquant_tables pointing into enquant_table_data and both
+     indexed as [qi][pli][qti] -- TODO confirm against enquant.c.*/
+  oc_iquant               *enquant_tables[64][3][2];
+  oc_iquant_table          enquant_table_data[64][3][2];
+  /*An "average" quantizer for each quantizer type (INTRA or INTER) and qi
+     value.
+    This is used to parameterize the rate control decisions.
+    They are kept in the log domain to simplify later processing.
+    Keep in mind these are DCT domain quantizers, and so are scaled by an
+     additional factor of 4 from the pixel domain.*/
+  ogg_int64_t              log_qavg[2][64];
+  /*The buffer state used to drive rate control.*/
+  oc_rc_state              rc;
+  /*Table for encoder acceleration functions.*/
+  oc_enc_opt_vtable        opt_vtable;
+};
+
+
+int oc_enc_analyze(oc_enc_ctx *_enc,int _frame_type,int _recode);
+#if defined(OC_COLLECT_METRICS)
+extern void ModeMetrics(oc_enc_ctx *_enc);
+extern void oc_enc_mode_metrics_dump(oc_enc_ctx *_enc);
+#endif
+
+
+
+/*Temporary state used for motion estimation.*/
+struct oc_mcenc_ctx{
+  /*The candidate motion vectors.*/
+  int                candidates[12][2];
+  /*The start of the Set B candidates.*/
+  int                setb0;
+  /*The total number of candidates.*/
+  int                ncandidates;
+  /*Accelerated predictor weights for each frame type.*/
+  ogg_int32_t        mvapw1[2];
+  ogg_int32_t        mvapw2[2];
+};
+
+
+/*Prep the motion search for the next frame.*/
+void oc_mcenc_start(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc);
+
+/*Search for a single MB MV (and with OC_FRAME_PREV, block MVs) in one frame.*/
+void oc_mcenc_search(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,int _mbi,int _frame);
+/*Refine a MB MV for one frame.*/
+void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame);
+/*Refine the block MVs.*/
+void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi);
+
+
+
+/*Used to roll back a tokenlog transaction when we retroactively decide to skip
+   a fragment.
+  A checkpoint is taken right before each token is added.*/
+struct oc_token_checkpoint{
+  /*The color plane the token was added to.*/
+  unsigned char pli;
+  /*The zig-zag index the token was added to.*/
+  unsigned char zzi;
+  /*The outstanding EOB run count before the token was added.*/
+  ogg_uint16_t  eob_run;
+  /*The token count before the token was added.*/
+  ptrdiff_t     ndct_tokens;
+};
+
+
+
+void oc_enc_tokenize_start(oc_enc_ctx *_enc);
+int oc_enc_tokenize_ac(oc_enc_ctx *_enc,ptrdiff_t _fragi,ogg_int16_t *_qdct,
+ const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,int _pli,
+ oc_token_checkpoint **_stack,int _acmin);
+void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
+ const oc_token_checkpoint *_stack,int _n);
+void oc_enc_tokenize_finish(oc_enc_ctx *_enc);
+
+
+
+/*Utility routine to encode one of the header packets.*/
+int oc_state_flushheader(oc_theora_state *_state,int *_packet_state,
+ oggpack_buffer *_opb,const th_quant_info *_qinfo,
+ const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS],
+ const char *_vendor,th_comment *_tc,ogg_packet *_op);
+
+
+
+/*Encoder-specific accelerated functions.*/
+void oc_enc_frag_sub(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
+ const unsigned char *_src,const unsigned char *_ref,int _ystride);
+void oc_enc_frag_sub_128(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
+ const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_sad(const oc_enc_ctx *_enc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_sad_thresh(const oc_enc_ctx *_enc,
+ const unsigned char *_src,const unsigned char *_ref,int _ystride,
+ unsigned _thresh);
+unsigned oc_enc_frag_sad2_thresh(const oc_enc_ctx *_enc,
+ const unsigned char *_src,const unsigned char *_ref1,
+ const unsigned char *_ref2,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_satd_thresh(const oc_enc_ctx *_enc,
+ const unsigned char *_src,const unsigned char *_ref,int _ystride,
+ unsigned _thresh);
+unsigned oc_enc_frag_satd2_thresh(const oc_enc_ctx *_enc,
+ const unsigned char *_src,const unsigned char *_ref1,
+ const unsigned char *_ref2,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_intra_satd(const oc_enc_ctx *_enc,
+ const unsigned char *_src,int _ystride);
+void oc_enc_frag_copy2(const oc_enc_ctx *_enc,unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+void oc_enc_frag_recon_intra(const oc_enc_ctx *_enc,
+ unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]);
+void oc_enc_frag_recon_inter(const oc_enc_ctx *_enc,unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
+void oc_enc_fdct8x8(const oc_enc_ctx *_enc,ogg_int16_t _y[64],
+ const ogg_int16_t _x[64]);
+
+/*Default pure-C implementations.*/
+void oc_enc_vtable_init_c(oc_enc_ctx *_enc);
+
+void oc_enc_frag_sub_c(ogg_int16_t _diff[64],
+ const unsigned char *_src,const unsigned char *_ref,int _ystride);
+void oc_enc_frag_sub_128_c(ogg_int16_t _diff[64],
+ const unsigned char *_src,int _ystride);
+void oc_enc_frag_copy2_c(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+unsigned oc_enc_frag_sad_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh);
+unsigned oc_enc_frag_satd_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_satd2_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh);
+unsigned oc_enc_frag_intra_satd_c(const unsigned char *_src,int _ystride);
+void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+
+#endif

Modified: branches/theora-thusnelda/lib/enc/encode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encode.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/encode.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -14,447 +14,1321 @@
   last mod: $Id$
 
  ********************************************************************/
-
 #include <stdlib.h>
 #include <string.h>
-#include "codec_internal.h"
-#include "encoder_lookup.h"
+#include "encint.h"
+#if defined(OC_X86_ASM)
+# include "x86/x86enc.h"
+#endif
 
-static int predict_frag(int wpc,
-                        ogg_int16_t *dc,
-                        ogg_int16_t *down,
-                        int *last){
 
-  if(wpc){
-    ogg_int16_t DC = 0;
 
-    if(wpc&0x1) DC += pc[wpc][0]* *(dc-1);
-    if(wpc&0x2) DC += pc[wpc][1]* *(down-1);
-    if(wpc&0x4) DC += pc[wpc][2]* *(down);
-    if(wpc&0x8) DC += pc[wpc][3]* *(down+1);
+/*The default quantization parameters used by VP3.1.*/
+static const int OC_VP31_RANGE_SIZES[1]={63};
+static const th_quant_base OC_VP31_BASES_INTRA_Y[2]={
+  {
+     16, 11, 10, 16, 24,  40, 51, 61,
+     12, 12, 14, 19, 26,  58, 60, 55,
+     14, 13, 16, 24, 40,  57, 69, 56,
+     14, 17, 22, 29, 51,  87, 80, 62,
+     18, 22, 37, 58, 68, 109,103, 77,
+     24, 35, 55, 64, 81, 104,113, 92,
+     49, 64, 78, 87,103, 121,120,101,
+     72, 92, 95, 98,112, 100,103, 99
+  },
+  {
+     16, 11, 10, 16, 24,  40, 51, 61,
+     12, 12, 14, 19, 26,  58, 60, 55,
+     14, 13, 16, 24, 40,  57, 69, 56,
+     14, 17, 22, 29, 51,  87, 80, 62,
+     18, 22, 37, 58, 68, 109,103, 77,
+     24, 35, 55, 64, 81, 104,113, 92,
+     49, 64, 78, 87,103, 121,120,101,
+     72, 92, 95, 98,112, 100,103, 99
+  }
+};
+static const th_quant_base OC_VP31_BASES_INTRA_C[2]={
+  {
+     17, 18, 24, 47, 99, 99, 99, 99,
+     18, 21, 26, 66, 99, 99, 99, 99,
+     24, 26, 56, 99, 99, 99, 99, 99,
+     47, 66, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99
+  },
+  {
+     17, 18, 24, 47, 99, 99, 99, 99,
+     18, 21, 26, 66, 99, 99, 99, 99,
+     24, 26, 56, 99, 99, 99, 99, 99,
+     47, 66, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99
+  }
+};
+static const th_quant_base OC_VP31_BASES_INTER[2]={
+  {
+     16, 16, 16, 20, 24, 28, 32, 40,
+     16, 16, 20, 24, 28, 32, 40, 48,
+     16, 20, 24, 28, 32, 40, 48, 64,
+     20, 24, 28, 32, 40, 48, 64, 64,
+     24, 28, 32, 40, 48, 64, 64, 64,
+     28, 32, 40, 48, 64, 64, 64, 96,
+     32, 40, 48, 64, 64, 64, 96,128,
+     40, 48, 64, 64, 64, 96,128,128
+  },
+  {
+     16, 16, 16, 20, 24, 28, 32, 40,
+     16, 16, 20, 24, 28, 32, 40, 48,
+     16, 20, 24, 28, 32, 40, 48, 64,
+     20, 24, 28, 32, 40, 48, 64, 64,
+     24, 28, 32, 40, 48, 64, 64, 64,
+     28, 32, 40, 48, 64, 64, 64, 96,
+     32, 40, 48, 64, 64, 64, 96,128,
+     40, 48, 64, 64, 64, 96,128,128
+  }
+};
 
-    /* if we need to do a shift */
-    if(pc[wpc][4]) {
-      /* If negative add in the negative correction factor */
-      DC += (HIGHBITDUPPED(DC) & pc[wpc][5]);
-      /* Shift in lieu of a divide */
-      DC >>= pc[wpc][4];
+const th_quant_info TH_VP31_QUANT_INFO={
+  {
+    220,200,190,180,170,170,160,160,
+    150,150,140,140,130,130,120,120,
+    110,110,100,100, 90, 90, 90, 80,
+     80, 80, 70, 70, 70, 60, 60, 60,
+     60, 50, 50, 50, 50, 40, 40, 40,
+     40, 40, 30, 30, 30, 30, 30, 30,
+     30, 20, 20, 20, 20, 20, 20, 20,
+     20, 10, 10, 10, 10, 10, 10, 10
+  },
+  {
+    500,450,400,370,340,310,285,265,
+    245,225,210,195,185,180,170,160,
+    150,145,135,130,125,115,110,107,
+    100, 96, 93, 89, 85, 82, 75, 74,
+     70, 68, 64, 60, 57, 56, 52, 50,
+     49, 45, 44, 43, 40, 38, 37, 35,
+     33, 32, 30, 29, 28, 25, 24, 22,
+     21, 19, 18, 17, 15, 13, 12, 10
+  },
+  {
+    30,25,20,20,15,15,14,14,
+    13,13,12,12,11,11,10,10,
+     9, 9, 8, 8, 7, 7, 7, 7,
+     6, 6, 6, 6, 5, 5, 5, 5,
+     4, 4, 4, 4, 3, 3, 3, 3,
+     2, 2, 2, 2, 2, 2, 2, 2,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0
+  },
+  {
+    {
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTRA_Y},
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTRA_C},
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTRA_C}
+    },
+    {
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTER},
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTER},
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTER}
     }
+  }
+};
 
-    /* check for outranging on the two predictors that can outrange */
-    if((wpc&(PU|PUL|PL)) == (PU|PUL|PL)){
-      if( abs(DC - *down) > 128) {
-        DC = *down;
-      } else if( abs(DC - *(dc-1)) > 128) {
-        DC = *(dc-1);
-      } else if( abs(DC - *(down-1)) > 128) {
-        DC = *(down-1);
-      }
-    }
 
-    *last = *dc;
-    return *dc - DC;
-  }else{
-    int ret = *dc - *last;
-    *last = *dc;
-    return ret;
-  }
-}
 
-static void PredictDC(CP_INSTANCE *cpi){
-  ogg_int32_t pi;
-  int last[3];  /* last value used for given frame */
-  int y,x,fi = 0;
-  unsigned char *cp = cpi->frag_coded;
+/*The Huffman codes used for macro block modes.*/
 
-  /* for y,u,v; handles arbitrary plane subsampling arrangement.  Shouldn't need to be altered for 4:2:2 or 4:4:4 */
-  for (pi=0; pi<3; pi++) {
-    int v = cpi->frag_v[pi];
-    int h = cpi->frag_h[pi];
-    int subh = !(pi && cpi->info.pixelformat != OC_PF_444);
-    int subv = !(pi && cpi->info.pixelformat == OC_PF_420);
-    ogg_int16_t *dc;
-    ogg_int16_t *down;
-    dc=cpi->frag_dc_tmp;
-    down=cpi->frag_dc_tmp+h;
+const unsigned char OC_MODE_BITS[2][OC_NMODES]={
+  /*Codebook 0: a maximally skewed prefix code.*/
+  {1,2,3,4,5,6,7,7},
+  /*Codebook 1: a fixed-length code.*/
+  {3,3,3,3,3,3,3,3}
+};
 
-    for(x=0;x<3;x++)last[x]=0;
+static const unsigned char OC_MODE_CODES[2][OC_NMODES]={
+  /*Codebook 0: a maximally skewed prefix code.*/
+  {0x00,0x02,0x06,0x0E,0x1E,0x3E,0x7E,0x7F},
+  /*Codebook 1: a fixed-length code.*/
+  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07}
+};
 
-    for (y=0; y<v ; y++) {
-      macroblock_t *mb_row = cpi->macro + (y>>subv)*cpi->macro_h;
-      macroblock_t *mb_down = cpi->macro + ((y-1)>>subv)*cpi->macro_h;
 
-      memcpy(down,dc,h*sizeof(*down));
-      memcpy(dc,cpi->frag_dc+fi,h*sizeof(*dc));
+/*The Huffman codes used for motion vectors.*/
 
-      for (x=0; x<h; x++, fi++) {
-        if(cp[fi]) {
-          int wpc=0;
-          int wf = Mode2Frame[mb_row[x>>subh].mode];
+const unsigned char OC_MV_BITS[2][64]={
+  /*Codebook 0: VLC code.*/
+  {
+      8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+    8,7,7,7,7,7,7,7,7,6,6,6,6,4,4,3,
+    3,
+    3,4,4,6,6,6,6,7,7,7,7,7,7,7,7,8,
+    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+  },
+  /*Codebook 1: (5 bit magnitude, 1 bit sign).
+    This wastes a code word (0x01, negative zero), or a bit (0x00, positive
+     zero, requires only 5 bits to uniquely decode), but is hopefully not used
+     very often.*/
+  {
+      6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+    6,
+    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
+  }
+};
 
-          if(x>0){
-            if(cp[fi-1] && Mode2Frame[mb_row[(x-1)>>subh].mode] == wf) wpc|=1; /* left */
-            if(y>0 && cp[fi-h-1] && Mode2Frame[mb_down[(x-1)>>subh].mode] == wf) wpc|=2; /* down left */
-          }
-          if(y>0){
-            if(cp[fi-h] && Mode2Frame[mb_down[x>>subh].mode] == wf) wpc|=4; /* down */
-            if(x+1<h && cp[fi-h+1] && Mode2Frame[mb_down[(x+1)>>subh].mode] == wf) wpc|=8; /* down right */
-          }
-          cpi->frag_dc[fi]=predict_frag(wpc,dc+x,down+x,last+wf);
-        }
-      }
-    }
+static const unsigned char OC_MV_CODES[2][64]={
+  /*Codebook 0: VLC code.*/
+  {
+         0xFF,0xFD,0xFB,0xF9,0xF7,0xF5,0xF3,
+    0xF1,0xEF,0xED,0xEB,0xE9,0xE7,0xE5,0xE3,
+    0xE1,0x6F,0x6D,0x6B,0x69,0x67,0x65,0x63,
+    0x61,0x2F,0x2D,0x2B,0x29,0x09,0x07,0x02,
+    0x00,
+    0x01,0x06,0x08,0x28,0x2A,0x2C,0x2E,0x60,
+    0x62,0x64,0x66,0x68,0x6A,0x6C,0x6E,0xE0,
+    0xE2,0xE4,0xE6,0xE8,0xEA,0xEC,0xEE,0xF0,
+    0xF2,0xF4,0xF6,0xF8,0xFA,0xFC,0xFE
+  },
+  /*Codebook 1: (5 bit magnitude, 1 bit sign).*/
+  {
+         0x3F,0x3D,0x3B,0x39,0x37,0x35,0x33,
+    0x31,0x2F,0x2D,0x2B,0x29,0x27,0x25,0x23,
+    0x21,0x1F,0x1D,0x1B,0x19,0x17,0x15,0x13,
+    0x11,0x0F,0x0D,0x0B,0x09,0x07,0x05,0x03,
+    0x00,
+    0x02,0x04,0x06,0x08,0x0A,0x0C,0x0E,0x10,
+    0x12,0x14,0x16,0x18,0x1A,0x1C,0x1E,0x20,
+    0x22,0x24,0x26,0x28,0x2A,0x2C,0x2E,0x30,
+    0x32,0x34,0x36,0x38,0x3A,0x3C,0x3E
   }
-}
+};
 
-static void ChooseTokenTables (CP_INSTANCE *cpi) {
-  int interp = (cpi->FrameType!=KEY_FRAME);
-  int i,plane;
-  int best;
 
-  for(plane = 0; plane<2; plane++){
 
-    /* Work out which table options are best for DC */
-    best = cpi->dc_bits[plane][0];
-    cpi->huffchoice[interp][0][plane] = DC_HUFF_OFFSET;
-    for ( i = 1; i < DC_HUFF_CHOICES; i++ ) {
-      if ( cpi->dc_bits[plane][i] < best ) {
-        best = cpi->dc_bits[plane][i];
-        cpi->huffchoice[interp][0][plane] = i + DC_HUFF_OFFSET;
-      }
-    }
+/*Super block run coding scheme:
+   Codeword             Run Length
+   0                       1
+   10x                     2-3
+   110x                    4-5
+   1110xx                  6-9
+   11110xxx                10-17
+   111110xxxx              18-33
+   111111xxxxxxxxxxxx      34-4129*/
+const ogg_uint16_t    OC_SB_RUN_VAL_MIN[8]={1,2,4,6,10,18,34,4130};
+static const unsigned OC_SB_RUN_CODE_PREFIX[7]={
+  0,4,0xC,0x38,0xF0,0x3E0,0x3F000
+};
+const unsigned char   OC_SB_RUN_CODE_NBITS[7]={1,3,4,6,8,10,18};
 
-    /* Work out which table options are best for AC */
-    best = cpi->ac1_bits[plane][0]+cpi->acN_bits[plane][0];
-    cpi->huffchoice[interp][1][plane] = AC_HUFF_OFFSET;
-    for ( i = 1; i < AC_HUFF_CHOICES; i++ ) {
-      int test = cpi->ac1_bits[plane][i] + cpi->acN_bits[plane][i];
-      if ( test < best ){
-        best = test;
-        cpi->huffchoice[interp][1][plane] = i + AC_HUFF_OFFSET;
-      }
-    }
-  }
-}
 
-static void EncodeTokenGroup(CP_INSTANCE *cpi,
-                             int group,
-                             int huffY,
-                             int huffC){
-
+/*Writes the bit pattern for the run length of a super block run to the given
+   oggpack_buffer.
+  _opb:       The buffer to write to.
+  _run_count: The length of the run, which must be positive.
+  _flag:      The current flag.
+  _done:      Whether or not this is the last run (no more flags follow).*/
+static void oc_sb_run_pack(oggpack_buffer *_opb,ptrdiff_t _run_count,
+ int _flag,int _done){
   int i;
-  oggpack_buffer *opb=cpi->oggbuffer;
-  unsigned char *token = cpi->dct_token[group];
-  ogg_uint16_t *eb = cpi->dct_token_eb[group];
-
-  for(i=0; i<cpi->dct_token_ycount[group]; i++){
-    if(token[i] < DCT_NOOP){
-      oggpackB_write(opb,cpi->huff_codes[huffY][token[i]].pattern,
-       cpi->huff_codes[huffY][token[i]].nbits);
-      if(OC_DCT_TOKEN_EXTRA_BITS[token[i]]>0){
-        oggpackB_write(opb,eb[i],OC_DCT_TOKEN_EXTRA_BITS[token[i]]);
-      }
+  if(_run_count>=4129){
+    do{
+      oggpackB_write(_opb,0x3FFFF,18);
+      _run_count-=4129;
+      if(_run_count>0)oggpackB_write(_opb,_flag,1);
+      else if(!_done)oggpackB_write(_opb,!_flag,1);
     }
+    while(_run_count>=4129);
+    if(_run_count<=0)return;
   }
-
-  for(; i<cpi->dct_token_count[group]; i++){
-    if(token[i] < DCT_NOOP){
-      oggpackB_write(opb,cpi->huff_codes[huffC][token[i]].pattern,
-       cpi->huff_codes[huffC][token[i]].nbits);
-      if (OC_DCT_TOKEN_EXTRA_BITS[token[i]] > 0)
-        oggpackB_write( opb, eb[i], OC_DCT_TOKEN_EXTRA_BITS[token[i]] );
-    }
-  }
+  for(i=0;_run_count>=OC_SB_RUN_VAL_MIN[i+1];i++);
+  oggpackB_write(_opb,OC_SB_RUN_CODE_PREFIX[i]+_run_count-OC_SB_RUN_VAL_MIN[i],
+   OC_SB_RUN_CODE_NBITS[i]);
 }
 
-static long EncodeTokenList (CP_INSTANCE *cpi) {
-  int i;
-  int interp = (cpi->FrameType!=KEY_FRAME);
-  oggpack_buffer *opb=cpi->oggbuffer;
-  long bits0,bits1;
 
-  /* DC tokens aren't special, they just come first */
-  oggpackB_write( opb, cpi->huffchoice[interp][0][0] - DC_HUFF_OFFSET, DC_HUFF_CHOICE_BITS );
-  oggpackB_write( opb, cpi->huffchoice[interp][0][1] - DC_HUFF_OFFSET, DC_HUFF_CHOICE_BITS );
 
-  bits0 = oggpackB_bits(opb);
-  EncodeTokenGroup(cpi, 0,  cpi->huffchoice[interp][0][0], cpi->huffchoice[interp][0][1]);
-  bits0 = oggpackB_bits(opb)-bits0;
+/*Block run coding scheme:
+   Codeword             Run Length
+   0x                      1-2
+   10x                     3-4
+   110x                    5-6
+   1110xx                  7-10
+   11110xx                 11-14
+   11111xxxx               15-30*/
+const unsigned char OC_BLOCK_RUN_CODE_NBITS[30]={
+  2,2,3,3,4,4,6,6,6,6,7,7,7,7,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
+};
+static const ogg_uint16_t  OC_BLOCK_RUN_CODE_PATTERN[30]={
+        0x000,0x001,0x004,0x005,0x00C,0x00D,0x038,
+  0x039,0x03A,0x03B,0x078,0x079,0x07A,0x07B,0x1F0,
+  0x1F1,0x1F2,0x1F3,0x1F4,0x1F5,0x1F6,0x1F7,0x1F8,
+  0x1F9,0x1FA,0x1FB,0x1FC,0x1FD,0x1FE,0x1FF
+};
 
-  /* AC tokens */
-  oggpackB_write( opb, cpi->huffchoice[interp][1][0] - AC_HUFF_OFFSET, AC_HUFF_CHOICE_BITS );
-  oggpackB_write( opb, cpi->huffchoice[interp][1][1] - AC_HUFF_OFFSET, AC_HUFF_CHOICE_BITS );
 
-  bits1 = oggpackB_bits(opb);
-  for(i=1;i<=AC_TABLE_2_THRESH;i++)
-    EncodeTokenGroup(cpi, i,  cpi->huffchoice[interp][1][0],
-                     cpi->huffchoice[interp][1][1]);
+/*Writes the bit pattern for the run length of a block run to the given
+   oggpack_buffer.
+  _opb:       The buffer to write to.
+  _run_count: The length of the run.
+              This must be positive, and no more than 30.*/
+static void oc_block_run_pack(oggpack_buffer *_opb,int _run_count){
+  oggpackB_write(_opb,OC_BLOCK_RUN_CODE_PATTERN[_run_count-1],
+   OC_BLOCK_RUN_CODE_NBITS[_run_count-1]);
+}
 
-  for(;i<=AC_TABLE_3_THRESH;i++)
-    EncodeTokenGroup(cpi, i,  cpi->huffchoice[interp][1][0]+AC_HUFF_CHOICES,
-                     cpi->huffchoice[interp][1][1]+AC_HUFF_CHOICES);
 
-  for(;i<=AC_TABLE_4_THRESH;i++)
-    EncodeTokenGroup(cpi, i,  cpi->huffchoice[interp][1][0]+AC_HUFF_CHOICES*2,
-                     cpi->huffchoice[interp][1][1]+AC_HUFF_CHOICES*2);
 
-  for(;i<BLOCK_SIZE;i++)
-    EncodeTokenGroup(cpi, i,  cpi->huffchoice[interp][1][0]+AC_HUFF_CHOICES*3,
-                     cpi->huffchoice[interp][1][1]+AC_HUFF_CHOICES*3);
-  bits1 = oggpackB_bits(opb)-bits1;
+static void oc_enc_frame_header_pack(oc_enc_ctx *_enc){
+  /*Mark this as a data packet.*/
+  oggpackB_write(&_enc->opb,0,1);
+  /*Output the frame type (key frame or delta frame).*/
+  oggpackB_write(&_enc->opb,_enc->state.frame_type,1);
+  /*Write out the current qi list.*/
+  oggpackB_write(&_enc->opb,_enc->state.qis[0],6);
+  if(_enc->state.nqis>1){
+    oggpackB_write(&_enc->opb,1,1);
+    oggpackB_write(&_enc->opb,_enc->state.qis[1],6);
+    if(_enc->state.nqis>2){
+      oggpackB_write(&_enc->opb,1,1);
+      oggpackB_write(&_enc->opb,_enc->state.qis[2],6);
+    }
+    else oggpackB_write(&_enc->opb,0,1);
+  }
+  else oggpackB_write(&_enc->opb,0,1);
+  if(_enc->state.frame_type==OC_INTRA_FRAME){
+    /*Key frames have 3 unused configuration bits, holdovers from the VP3 days.
+      Most of the other unused bits in the VP3 headers were eliminated.
+      Monty kept these to leave us some wiggle room for future expansion,
+       though a single bit in all frames would have been far more useful.*/
+    oggpackB_write(&_enc->opb,0,3);
+  }
+}
 
-  return bits1;
+/*Writes the bit flags indicating whether or not each super block is partially
+   coded.
+  These flags are run-length encoded, with the flag value alternating between
+   each run.
+  Return: The number of partially coded SBs.*/
+static unsigned oc_enc_partial_sb_flags_pack(oc_enc_ctx *_enc){
+  const oc_sb_flags *sb_flags;
+  unsigned           nsbs;
+  unsigned           sbi;
+  unsigned           npartial;
+  int                flag;
+  sb_flags=_enc->state.sb_flags;
+  nsbs=_enc->state.nsbs;
+  flag=sb_flags[0].coded_partially;
+  oggpackB_write(&_enc->opb,flag,1);
+  sbi=npartial=0;
+  do{
+    unsigned run_count;
+    for(run_count=0;sbi<nsbs;sbi++){
+      if(sb_flags[sbi].coded_partially!=flag)break;
+      run_count++;
+      npartial+=flag;
+    }
+    oc_sb_run_pack(&_enc->opb,run_count,flag,sbi>=nsbs);
+    flag=!flag;
+  }
+  while(sbi<nsbs);
+  return npartial;
 }
 
-static const unsigned char NoOpModeWords[8] = {0,1,2,3,4,5,6,7};
-static const unsigned char NoOpModeBits[8] = {3,3,3,3,3,3,3,3};
-static const unsigned char NoOpScheme[8] = {0,1,2,3,4,5,6,7};
+/*Writes the coded/not coded flags for each super block that is not partially
+   coded.
+  These flags are run-length encoded, with the flag value alternating between
+   each run.*/
+static void oc_enc_coded_sb_flags_pack(oc_enc_ctx *_enc){
+  const oc_sb_flags *sb_flags;
+  unsigned           nsbs;
+  unsigned           sbi;
+  int                flag;
+  sb_flags=_enc->state.sb_flags;
+  nsbs=_enc->state.nsbs;
+  /*Skip partially coded super blocks; their flags have already been coded.*/
+  for(sbi=0;sb_flags[sbi].coded_partially;sbi++);
+  flag=sb_flags[sbi].coded_fully;
+  oggpackB_write(&_enc->opb,flag,1);
+  do{
+    unsigned run_count;
+    for(run_count=0;sbi<nsbs;sbi++){
+      if(sb_flags[sbi].coded_partially)continue;
+      if(sb_flags[sbi].coded_fully!=flag)break;
+      run_count++;
+    }
+    oc_sb_run_pack(&_enc->opb,run_count,flag,sbi>=nsbs);
+    flag=!flag;
+  }
+  while(sbi<nsbs);
+}
 
-static void PackModes (CP_INSTANCE *cpi) {
-  ogg_uint32_t    j;
-  ogg_uint32_t    BestScheme = cpi->chooser.scheme_list[0];
+static void oc_enc_coded_flags_pack(oc_enc_ctx *_enc){
+  const oc_sb_map   *sb_maps;
+  const oc_sb_flags *sb_flags;
+  unsigned           nsbs;
+  const oc_fragment *frags;
+  unsigned           npartial;
+  int                run_count;
+  int                flag;
+  int                pli;
+  unsigned           sbi;
+  npartial=oc_enc_partial_sb_flags_pack(_enc);
+  if(npartial<_enc->state.nsbs)oc_enc_coded_sb_flags_pack(_enc);
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  sb_flags=_enc->state.sb_flags;
+  nsbs=_enc->state.nsbs;
+  frags=_enc->state.frags;
+  for(sbi=0;sbi<nsbs&&!sb_flags[sbi].coded_partially;sbi++);
+  /*If there's at least one partial SB, store individual coded block flags.*/
+  if(sbi<nsbs){
+    flag=frags[sb_maps[sbi][0][0]].coded;
+    oggpackB_write(&_enc->opb,flag,1);
+    run_count=0;
+    nsbs=sbi=0;
+    for(pli=0;pli<3;pli++){
+      nsbs+=_enc->state.fplanes[pli].nsbs;
+      for(;sbi<nsbs;sbi++){
+        int       quadi;
+        int       bi;
+        ptrdiff_t fragi;
+        if(sb_flags[sbi].coded_partially){
+          for(quadi=0;quadi<4;quadi++){
+            for(bi=0;bi<4;bi++){
+              fragi=sb_maps[sbi][quadi][bi];
+              if(fragi>=0){
+                if(frags[fragi].coded!=flag){
+                  oc_block_run_pack(&_enc->opb,run_count);
+                  flag=!flag;
+                  run_count=1;
+                }
+                else run_count++;
+              }
+            }
+          }
+        }
+      }
+    }
+    /*Flush any trailing block coded run.*/
+    if(run_count>0)oc_block_run_pack(&_enc->opb,run_count);
+  }
+}
 
-  const unsigned char *ModeWords;
-  const unsigned char *ModeBits;
-  const unsigned char  *ModeScheme;
-  int SB,MB;
+static void oc_enc_mb_modes_pack(oc_enc_ctx *_enc){
+  const unsigned char *mode_codes;
+  const unsigned char *mode_bits;
+  const unsigned char *mode_ranks;
+  unsigned            *coded_mbis;
+  size_t               ncoded_mbis;
+  const signed char   *mb_modes;
+  unsigned             mbii;
+  int                  scheme;
+  int                  mb_mode;
+  scheme=_enc->chooser.scheme_list[0];
+  /*Encode the best scheme.*/
+  oggpackB_write(&_enc->opb,scheme,3);
+  /*If the chosen scheme is scheme 0, send the mode frequency ordering.*/
+  if(scheme==0){
+    for(mb_mode=0;mb_mode<OC_NMODES;mb_mode++){
+      oggpackB_write(&_enc->opb,_enc->chooser.scheme0_ranks[mb_mode],3);
+    }
+  }
+  mode_ranks=_enc->chooser.mode_ranks[scheme];
+  mode_bits=OC_MODE_BITS[scheme+1>>3];
+  mode_codes=OC_MODE_CODES[scheme+1>>3];
+  coded_mbis=_enc->coded_mbis;
+  ncoded_mbis=_enc->ncoded_mbis;
+  mb_modes=_enc->state.mb_modes;
+  for(mbii=0;mbii<ncoded_mbis;mbii++){
+    int rank;
+    rank=mode_ranks[mb_modes[coded_mbis[mbii]]];
+    oggpackB_write(&_enc->opb,mode_codes[rank],mode_bits[rank]);
+  }
+}
 
-  oggpack_buffer *opb=cpi->oggbuffer;
+static void oc_enc_mv_pack(oc_enc_ctx *_enc,int _mv_scheme,int _dx,int _dy){
+  oggpackB_write(&_enc->opb,
+   OC_MV_CODES[_mv_scheme][_dx+31],OC_MV_BITS[_mv_scheme][_dx+31]);
+  oggpackB_write(&_enc->opb,
+   OC_MV_CODES[_mv_scheme][_dy+31],OC_MV_BITS[_mv_scheme][_dy+31]);
+}
 
-  /* Encode the best scheme. */
-  oggpackB_write( opb, BestScheme, (ogg_uint32_t)MODE_METHOD_BITS );
+static void oc_enc_mvs_pack(oc_enc_ctx *_enc){
+  const unsigned     *coded_mbis;
+  size_t              ncoded_mbis;
+  const oc_mb_map    *mb_maps;
+  const signed char  *mb_modes;
+  const oc_fragment  *frags;
+  const oc_mv        *frag_mvs;
+  unsigned            mbii;
+  int                 mv_scheme;
+  /*Choose the coding scheme.*/
+  mv_scheme=_enc->mv_bits[1]<_enc->mv_bits[0];
+  oggpackB_write(&_enc->opb,mv_scheme,1);
+  /*Encode the motion vectors.
+    Macro blocks are iterated in Hilbert scan order, but the MVs within the
+     macro block are coded in raster order.*/
+  coded_mbis=_enc->coded_mbis;
+  ncoded_mbis=_enc->ncoded_mbis;
+  mb_modes=_enc->state.mb_modes;
+  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
+  frags=_enc->state.frags;
+  frag_mvs=(const oc_mv *)_enc->state.frag_mvs;
+  for(mbii=0;mbii<ncoded_mbis;mbii++){
+    ptrdiff_t fragi;
+    unsigned  mbi;
+    int       bi;
+    mbi=coded_mbis[mbii];
+    switch(mb_modes[mbi]){
+      case OC_MODE_INTER_MV:
+      case OC_MODE_GOLDEN_MV:{
+        for(bi=0;;bi++){
+          fragi=mb_maps[mbi][0][bi];
+          if(frags[fragi].coded){
+            oc_enc_mv_pack(_enc,mv_scheme,
+             frag_mvs[fragi][0],frag_mvs[fragi][1]);
+            /*Only code a single MV for this macro block.*/
+            break;
+          }
+        }
+      }break;
+      case OC_MODE_INTER_MV_FOUR:{
+        for(bi=0;bi<4;bi++){
+          fragi=mb_maps[mbi][0][bi];
+          if(frags[fragi].coded){
+            oc_enc_mv_pack(_enc,mv_scheme,
+             frag_mvs[fragi][0],frag_mvs[fragi][1]);
+            /*Keep coding all the MVs for this macro block.*/
+          }
+        }
+      }break;
+    }
+  }
+}
 
-  /* If the chosen scheme is scheme 0 send details of the mode
-     frequency order */
-  if ( BestScheme == 0 ) {
-    for ( j = 0; j < MAX_MODES; j++ ){
-      /* Note that the last two entries are implicit */
-      oggpackB_write( opb, cpi->chooser.scheme0_ranks[j], (ogg_uint32_t)MODE_BITS );
+static void oc_enc_block_qis_pack(oc_enc_ctx *_enc){
+  const oc_fragment *frags;
+  ptrdiff_t         *coded_fragis;
+  ptrdiff_t          ncoded_fragis;
+  ptrdiff_t          fragii;
+  ptrdiff_t          run_count;
+  ptrdiff_t          nqi0;
+  int                flag;
+  if(_enc->state.nqis<=1)return;
+  ncoded_fragis=_enc->state.ntotal_coded_fragis;
+  if(ncoded_fragis<=0)return;
+  coded_fragis=_enc->state.coded_fragis;
+  frags=_enc->state.frags;
+  flag=!!frags[coded_fragis[0]].qii;
+  oggpackB_write(&_enc->opb,flag,1);
+  nqi0=0;
+  for(fragii=0;fragii<ncoded_fragis;){
+    for(run_count=0;fragii<ncoded_fragis;fragii++){
+      if(!!frags[coded_fragis[fragii]].qii!=flag)break;
+      run_count++;
+      nqi0+=!flag;
     }
-    ModeScheme = cpi->chooser.scheme0_ranks;
-    ModeWords = ModeBitPatterns;
-    ModeBits = ModeBitLengths;
+    oc_sb_run_pack(&_enc->opb,run_count,flag,fragii>=ncoded_fragis);
+    flag=!flag;
   }
-  else if ( BestScheme < (MODE_METHODS - 1)) {
-    ModeScheme = ModeSchemes[BestScheme-1];
-    ModeWords = ModeBitPatterns;
-    ModeBits = ModeBitLengths;
-  }else{
-    ModeScheme = NoOpScheme;
-    ModeWords = NoOpModeWords;
-    ModeBits = NoOpModeBits;
+  if(_enc->state.nqis<3||nqi0>=ncoded_fragis)return;
+  for(fragii=0;!frags[coded_fragis[fragii]].qii;fragii++);
+  flag=frags[coded_fragis[fragii]].qii-1;
+  oggpackB_write(&_enc->opb,flag,1);
+  while(fragii<ncoded_fragis){
+    for(run_count=0;fragii<ncoded_fragis;fragii++){
+      int qii;
+      qii=frags[coded_fragis[fragii]].qii;
+      if(!qii)continue;
+      if(qii-1!=flag)break;
+      run_count++;
+    }
+    oc_sb_run_pack(&_enc->opb,run_count,flag,fragii>=ncoded_fragis);
+    flag=!flag;
   }
+}
 
-  /* modes coded in hilbert order; use superblock addressing */
-  for ( SB=0 ; SB < cpi->super_n[0]; SB++ ){
-    superblock_t *sp = &cpi->super[0][SB];
-    for ( MB=0; MB<4; MB++ ) {
-      macroblock_t *mbp = &cpi->macro[sp->m[MB]];
-      if(mbp->coded){
-        /* Add the appropriate mode entropy token. */
-        int index = ModeScheme[mbp->mode];
-        oggpackB_write( opb, ModeWords[index],
-                        (ogg_uint32_t)ModeBits[index] );
+/*Tallies how often each DCT token value occurs in the token lists for a range
+   of coefficient indices in zig-zag order.
+  Both output arrays are cleared before counting starts.
+  _zzi_start:      The first zig-zag index to include.
+  _zzi_end:        One past the last zig-zag index to include.
+  _token_counts_y: Receives the per-token totals for plane 0 (Y').
+  _token_counts_c: Receives the combined per-token totals for planes 1 and 2
+                    (Cb and Cr).*/
+static void oc_enc_count_tokens(oc_enc_ctx *_enc,int _zzi_start,int _zzi_end,
+ ptrdiff_t _token_counts_y[32],ptrdiff_t _token_counts_c[32]){
+  int pli;
+  memset(_token_counts_y,0,32*sizeof(*_token_counts_y));
+  memset(_token_counts_c,0,32*sizeof(*_token_counts_c));
+  for(pli=0;pli<3;pli++){
+    ptrdiff_t *counts;
+    int        zzi;
+    /*Plane 0 has its own histogram; planes 1 and 2 share one.*/
+    counts=pli?_token_counts_c:_token_counts_y;
+    for(zzi=_zzi_start;zzi<_zzi_end;zzi++){
+      const unsigned char *tokens;
+      ptrdiff_t            ntokens;
+      ptrdiff_t            ti;
+      tokens=_enc->dct_tokens[pli][zzi];
+      ntokens=_enc->ndct_tokens[pli][zzi];
+      for(ti=_enc->dct_token_offs[pli][zzi];ti<ntokens;ti++){
+        counts[tokens[ti]]++;
       }
     }
   }
 }
 
-static void PackMotionVectors (CP_INSTANCE *cpi) {
-  const ogg_uint32_t * MvPatternPtr;
-  const ogg_uint32_t * MvBitsPtr;
+/*Accumulates, for each of the 16 candidate Huffman tables in a group, the
+   number of bits needed to code the given token histogram.
+  The totals are added to the existing contents of _bit_counts, which is not
+   cleared here.
+  _hgi:          The Huffman group; its tables start at index 16*_hgi.
+  _token_counts: The number of occurrences of each of the 32 token values.
+  _bit_counts:   The running bit totals, one per candidate table.*/
+static void oc_enc_count_bits(oc_enc_ctx *_enc,int _hgi,
+ const ptrdiff_t _token_counts[32],size_t _bit_counts[16]){
+  int table;
+  for(table=0;table<16;table++){
+    const th_huff_code *codes;
+    int                 tok;
+    codes=_enc->huff_codes[(_hgi<<4)+table];
+    for(tok=0;tok<32;tok++){
+      _bit_counts[table]+=_token_counts[tok]*codes[tok].nbits;
+    }
+  }
+}
 
-  ogg_uint32_t SB, MB, B;
-  oggpack_buffer *opb=cpi->oggbuffer;
+/*Finds the cheapest of 16 candidate Huffman tables.
+  _bit_counts: The number of bits each candidate would use.
+  Return: The index of the smallest entry; the lowest index wins a tie.*/
+static int oc_select_huff_idx(size_t _bit_counts[16]){
+  size_t min_bits;
+  int    best;
+  int    i;
+  best=0;
+  min_bits=_bit_counts[0];
+  for(i=1;i<16;i++){
+    if(_bit_counts[i]<min_bits){
+      min_bits=_bit_counts[i];
+      best=i;
+    }
+  }
+  return best;
+}
 
-  /* Choose the coding method */
-  if ( cpi->MVBits_0 < cpi->MVBits_1 ) {
-    oggpackB_write( opb, 0, 1 );
-    MvBitsPtr = &MvBits[MAX_MV_EXTENT];
-    MvPatternPtr = &MvPattern[MAX_MV_EXTENT];
-  }else{
-    oggpackB_write( opb, 1, 1 );
-    MvBitsPtr = &MvBits2[MAX_MV_EXTENT];
-    MvPatternPtr = &MvPattern2[MAX_MV_EXTENT];
+/*Writes the tokens, and any extra bits that go with them, for a range of
+   zig-zag indices to the packet buffer.
+  For each zig-zag index the three planes are written in order.
+  _zzi_start: The first zig-zag index to write.
+  _zzi_end:   One past the last zig-zag index to write.
+  _huff_idxs: The Huffman table to use for plane 0, followed by the table to
+               use for planes 1 and 2.*/
+static void oc_enc_huff_group_pack(oc_enc_ctx *_enc,
+ int _zzi_start,int _zzi_end,const int _huff_idxs[2]){
+  int zzi;
+  for(zzi=_zzi_start;zzi<_zzi_end;zzi++){
+    int pli;
+    for(pli=0;pli<3;pli++){
+      const th_huff_code  *codes;
+      const unsigned char *tokens;
+      const ogg_uint16_t  *eb;
+      ptrdiff_t            ti;
+      ptrdiff_t            ti_end;
+      /*Plane 0 uses the first table index; planes 1 and 2 use the second.*/
+      codes=_enc->huff_codes[_huff_idxs[pli>0]];
+      tokens=_enc->dct_tokens[pli][zzi];
+      eb=_enc->extra_bits[pli][zzi];
+      ti_end=_enc->ndct_tokens[pli][zzi];
+      ti=_enc->dct_token_offs[pli][zzi];
+      while(ti<ti_end){
+        const th_huff_code *code;
+        int                 nextra;
+        code=codes+tokens[ti];
+        oggpackB_write(&_enc->opb,code->pattern,code->nbits);
+        /*Only some tokens are followed by extra bits.*/
+        nextra=OC_DCT_TOKEN_EXTRA_BITS[tokens[ti]];
+        if(nextra)oggpackB_write(&_enc->opb,eb[ti],nextra);
+        ti++;
+      }
+    }
   }
+}
 
-  /* Pack and encode the motion vectors */
-  /* MBs are iterated in Hilbert scan order, but the MVs within the MB are coded in raster order */
+/*Chooses the Huffman tables for the residual tokens and writes the tokens to
+   the packet.
+  The DC token list (zig-zag index 0) is handled first: the cheapest of the 16
+   tables in group 0 is found separately for the Y' plane and for the chroma
+   planes, the two 4-bit choices are written, and then the tokens themselves.
+  The AC token lists share a single pair of 4-bit choices across the four
+   remaining groups, picked to minimize the bits summed over all of them.
+  The chosen indices are also recorded in _enc->huff_idxs for the current
+   frame type.*/
+static void oc_enc_residual_tokens_pack(oc_enc_ctx *_enc){
+  /*Zig-zag index boundaries of the Huffman groups: group hgi covers indices
+     OC_HUFF_GROUP_BOUNDS[hgi] up to, but not including,
+     OC_HUFF_GROUP_BOUNDS[hgi+1].*/
+  static const unsigned char OC_HUFF_GROUP_BOUNDS[6]={0,1,6,15,28,64};
+  ptrdiff_t counts_y[32];
+  ptrdiff_t counts_c[32];
+  size_t    cost_y[16];
+  size_t    cost_c[16];
+  int       best[2];
+  int       ftype;
+  int       hgi;
+  ftype=_enc->state.frame_type;
+  /*DC: tally the tokens, price them under each table of group 0, and keep the
+     cheapest table for each of luma and chroma.*/
+  memset(cost_y,0,sizeof(cost_y));
+  memset(cost_c,0,sizeof(cost_c));
+  oc_enc_count_tokens(_enc,0,1,counts_y,counts_c);
+  oc_enc_count_bits(_enc,0,counts_y,cost_y);
+  oc_enc_count_bits(_enc,0,counts_c,cost_c);
+  best[0]=oc_select_huff_idx(cost_y);
+  best[1]=oc_select_huff_idx(cost_c);
+  oggpackB_write(&_enc->opb,best[0],4);
+  oggpackB_write(&_enc->opb,best[1],4);
+  _enc->huff_idxs[ftype][0][0]=(unsigned char)best[0];
+  _enc->huff_idxs[ftype][0][1]=(unsigned char)best[1];
+  oc_enc_huff_group_pack(_enc,0,1,best);
+  /*AC: sum the cost over all four AC groups before choosing, since one pair
+     of choices covers them all.*/
+  memset(cost_y,0,sizeof(cost_y));
+  memset(cost_c,0,sizeof(cost_c));
+  for(hgi=1;hgi<5;hgi++){
+    oc_enc_count_tokens(_enc,OC_HUFF_GROUP_BOUNDS[hgi],
+     OC_HUFF_GROUP_BOUNDS[hgi+1],counts_y,counts_c);
+    oc_enc_count_bits(_enc,hgi,counts_y,cost_y);
+    oc_enc_count_bits(_enc,hgi,counts_c,cost_c);
+  }
+  best[0]=oc_select_huff_idx(cost_y);
+  best[1]=oc_select_huff_idx(cost_c);
+  oggpackB_write(&_enc->opb,best[0],4);
+  oggpackB_write(&_enc->opb,best[1],4);
+  _enc->huff_idxs[ftype][1][0]=(unsigned char)best[0];
+  _enc->huff_idxs[ftype][1][1]=(unsigned char)best[1];
+  for(hgi=1;hgi<5;hgi++){
+    int group_tables[2];
+    /*The tables for group hgi start at index 16*hgi.*/
+    group_tables[0]=best[0]+(hgi<<4);
+    group_tables[1]=best[1]+(hgi<<4);
+    oc_enc_huff_group_pack(_enc,OC_HUFF_GROUP_BOUNDS[hgi],
+     OC_HUFF_GROUP_BOUNDS[hgi+1],group_tables);
+  }
+}
 
-  for ( SB=0 ; SB < cpi->super_n[0]; SB++ ){
-    superblock_t *sp = &cpi->super[0][SB];
-    for ( MB=0; MB<4; MB++ ) {
-      macroblock_t *mbp = &cpi->macro[sp->m[MB]];
-      if(!mbp->coded) continue;
+/*Builds the packet for the current frame in the encoder's bit buffer.
+  The buffer is always reset first; a frame with no coded fragments is left as
+   an empty (0 byte) packet, which drops the frame.
+  On return the packet is marked as ready to be flushed.*/
+static void oc_enc_frame_pack(oc_enc_ctx *_enc){
+  int have_coded_blocks;
+  oggpackB_reset(&_enc->opb);
+  have_coded_blocks=_enc->state.ntotal_coded_fragis>0;
+  if(have_coded_blocks){
+    int is_inter;
+    oc_enc_frame_header_pack(_enc);
+    is_inter=_enc->state.frame_type==OC_INTER_FRAME;
+    /*Only delta frames need coded block flags, MB modes, and MVs.*/
+    if(is_inter){
+      oc_enc_coded_flags_pack(_enc);
+      oc_enc_mb_modes_pack(_enc);
+      oc_enc_mvs_pack(_enc);
+    }
+    oc_enc_block_qis_pack(_enc);
+    oc_enc_tokenize_finish(_enc);
+    oc_enc_residual_tokens_pack(_enc);
+  }
+  _enc->packet_state=OC_PACKET_READY;
+#if defined(OC_COLLECT_METRICS)
+  ModeMetrics(_enc);
+#endif
+}
 
-      if(mbp->mode==CODE_INTER_PLUS_MV || mbp->mode==CODE_GOLDEN_MV){
-        /* One MV for the macroblock */
-        for(B=0; B<4; B++ ){
-          if(mbp->coded & (1<<B)){
-            oggpackB_write( opb, MvPatternPtr[mbp->mv[B][0]], MvBitsPtr[mbp->mv[B][0]] );
-            oggpackB_write( opb, MvPatternPtr[mbp->mv[B][1]], MvBitsPtr[mbp->mv[B][1]] );
-            break;
-          }
-        }
 
-      }else if (mbp->mode == CODE_INTER_FOURMV){
-        /* MV for each codedblock */
-        for(B=0; B<4; B++ ){
-          if(mbp->coded & (1<<B)){
-            oggpackB_write( opb, MvPatternPtr[mbp->mv[B][0]], MvBitsPtr[mbp->mv[B][0]] );
-            oggpackB_write( opb, MvPatternPtr[mbp->mv[B][1]], MvBitsPtr[mbp->mv[B][1]] );
-          }
+/*Installs the portable C implementations into the encoder's table of
+   optimizable routines.*/
+void oc_enc_vtable_init_c(oc_enc_ctx *_enc){
+  /*Fragment reconstruction is re-used from the decoder.*/
+  _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
+  _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
+  /*Everything else is encoder-specific, as the oc_enc_ prefix indicates.*/
+  _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_c;
+  _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_c;
+  _enc->opt_vtable.frag_sub=oc_enc_frag_sub_c;
+  _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_c;
+  _enc->opt_vtable.frag_sad=oc_enc_frag_sad_c;
+  _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_c;
+  _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_c;
+  _enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_c;
+  _enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_c;
+  _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_c;
+}
+
+/*Initialize the macro block neighbor lists for MC analysis.
+  For every macro block whose mode is not OC_MODE_INVALID, this records the
+   indices of the neighboring macro blocks in two lists: cneighbors, drawn
+   from the per-position CDX/CDY offset tables, and pneighbors, drawn from the
+   four PDX/PDY offsets.
+  Neighbors that fall outside the frame or are themselves marked
+   OC_MODE_INVALID are left out.
+  This assumes that the entire mb_info memory region has been initialized with
+   zeros.*/
+static void oc_enc_mb_info_init(oc_enc_ctx *_enc){
+  oc_mb_enc_info    *embs;
+  const signed char *mb_modes;
+  unsigned           nhsbs;
+  unsigned           nvsbs;
+  unsigned           nhmbs;
+  unsigned           nvmbs;
+  unsigned           sby;
+  mb_modes=_enc->state.mb_modes;
+  embs=_enc->mb_info;
+  nhsbs=_enc->state.fplanes[0].nhsbs;
+  nvsbs=_enc->state.fplanes[0].nvsbs;
+  nhmbs=_enc->state.nhmbs;
+  nvmbs=_enc->state.nvmbs;
+  for(sby=0;sby<nvsbs;sby++){
+    unsigned sbx;
+    for(sbx=0;sbx<nhsbs;sbx++){
+      int quadi;
+      for(quadi=0;quadi<4;quadi++){
+        /*Because of the Hilbert curve ordering the macro blocks are
+           visited in, the available neighbors change depending on where in
+           a super block the macro block is located.
+          Only the first three vectors are used in the median calculation
+           for the optimal predictor, and so the most important should be
+           listed first.
+          Additional vectors are used, so there will always be at least 3,
+           except for in the upper-left most macro block.*/
+        /*The number of current neighbors for each macro block position.*/
+        static const unsigned char NCNEIGHBORS[4]={4,3,2,4};
+        /*The offset of each current neighbor in the X direction.*/
+        static const signed char   CDX[4][4]={
+          {-1,0,1,-1},
+          {-1,0,-1,},
+          {-1,-1},
+          {-1,0,0,1}
+        };
+        /*The offset of each current neighbor in the Y direction.*/
+        static const signed char   CDY[4][4]={
+          {0,-1,-1,-1},
+          {0,-1,-1},
+          {0,-1},
+          {0,-1,1,-1}
+        };
+        /*The offset of each previous neighbor in the X direction.*/
+        static const signed char   PDX[4]={-1,0,1,0};
+        /*The offset of each previous neighbor in the Y direction.*/
+        static const signed char   PDY[4]={0,-1,0,1};
+        unsigned mbi;
+        int      mbx;
+        int      mby;
+        unsigned nmbi;
+        int      nmbx;
+        int      nmby;
+        int      ni;
+        /*Macro blocks are indexed four to a super block, with the super blocks
+           in raster order.*/
+        mbi=(sby*nhsbs+sbx<<2)+quadi;
+        if(mb_modes[mbi]==OC_MODE_INVALID)continue;
+        /*Frame coordinates of this macro block: quadi 0,1,2,3 sits at offset
+           (0,0), (0,1), (1,1), (1,0) within its super block.*/
+        mbx=2*sbx+(quadi>>1);
+        mby=2*sby+(quadi+1>>1&1);
+        /*Fill in the neighbors with current motion vectors available.*/
+        for(ni=0;ni<NCNEIGHBORS[quadi];ni++){
+          nmbx=mbx+CDX[quadi][ni];
+          nmby=mby+CDY[quadi][ni];
+          if(nmbx<0||nmbx>=nhmbs||nmby<0||nmby>=nvmbs)continue;
+          /*Convert the neighbor's coordinates back to a macro block index.*/
+          nmbi=(nmby&~1)*nhmbs+((nmbx&~1)<<1)+OC_MB_MAP[nmby&1][nmbx&1];
+          if(mb_modes[nmbi]==OC_MODE_INVALID)continue;
+          embs[mbi].cneighbors[embs[mbi].ncneighbors++]=nmbi;
         }
+        /*Fill in the neighbors with previous motion vectors available.*/
+        for(ni=0;ni<4;ni++){
+          nmbx=mbx+PDX[ni];
+          nmby=mby+PDY[ni];
+          if(nmbx<0||nmbx>=nhmbs||nmby<0||nmby>=nvmbs)continue;
+          nmbi=(nmby&~1)*nhmbs+((nmbx&~1)<<1)+OC_MB_MAP[nmby&1][nmbx&1];
+          if(mb_modes[nmbi]==OC_MODE_INVALID)continue;
+          embs[mbi].pneighbors[embs[mbi].npneighbors++]=nmbi;
+        }
       }
     }
   }
 }
 
-#include <stdio.h>
-void EncodeData(CP_INSTANCE *cpi){
-  long modebits=0;
-  long mvbits=0;
-  long dctbits;
-  long bits;
+/*Sets the Huffman codes to use.
+  This may only be called before the setup header is written.
+  _codes: The Huffman codes.
+          This can be NULL, in which case TH_VP31_HUFF_CODES is used.
+  Return: 0 on success, TH_EFAULT if _enc is NULL, TH_EINVAL if it is too late
+           to change the codes, or the negative value oc_huff_codes_pack()
+           returned when it rejected the codes.*/
+static int oc_enc_set_huffman_codes(oc_enc_ctx *_enc,
+ const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]){
+  const th_huff_code (*codes)[TH_NDCT_TOKENS];
+  int                  ret;
+  if(_enc==NULL)return TH_EFAULT;
+  if(_enc->packet_state>OC_PACKET_SETUP_HDR)return TH_EINVAL;
+  codes=_codes!=NULL?_codes:TH_VP31_HUFF_CODES;
+  /*Packing the codes into the (reset) packet buffer serves as validation;
+     they are only adopted if that succeeds.*/
+  oggpackB_reset(&_enc->opb);
+  ret=oc_huff_codes_pack(&_enc->opb,codes);
+  if(ret>=0){
+    memcpy(_enc->huff_codes,codes,sizeof(_enc->huff_codes));
+    ret=0;
+  }
+  return ret;
+}
 
-  PredictDC(cpi);
-  dct_tokenize_finish(cpi);
+/*Installs the quantization parameters and rebuilds everything derived from
+   them.
+  This may only be called before the setup header is written.
+  If it is called multiple times, only the last call has any effect.
+  _qinfo: The quantization parameters.
+          These are described in more detail in theoraenc.h.
+          This can be NULL, in which case the default quantization parameters
+           will be used.
+  Return: 0 on success, TH_EFAULT if _enc is NULL, or TH_EINVAL if it is too
+           late to change the parameters.*/
+static int oc_enc_set_quant_params(oc_enc_ctx *_enc,
+ const th_quant_info *_qinfo){
+  const th_quant_info *qinfo;
+  int                  qi;
+  if(_enc==NULL)return TH_EFAULT;
+  if(_enc->packet_state>OC_PACKET_SETUP_HDR)return TH_EINVAL;
+  qinfo=_qinfo!=NULL?_qinfo:&TH_VP31_QUANT_INFO;
+  /*TODO: Analyze for packing purposes instead of just doing a shallow copy.*/
+  memcpy(&_enc->qinfo,qinfo,sizeof(_enc->qinfo));
+  /*Point every table slot at its own backing storage before filling it.*/
+  for(qi=0;qi<64;qi++){
+    int pli;
+    for(pli=0;pli<3;pli++){
+      int qti;
+      for(qti=0;qti<2;qti++){
+        _enc->state.dequant_tables[qi][pli][qti]=
+         _enc->state.dequant_table_data[qi][pli][qti];
+        _enc->enquant_tables[qi][pli][qti]=
+         _enc->enquant_table_data[qi][pli][qti];
+      }
+    }
+  }
+  oc_enquant_tables_init(_enc->state.dequant_tables,
+   _enc->enquant_tables,qinfo);
+  memcpy(_enc->state.loop_filter_limits,qinfo->loop_filter_limits,
+   sizeof(_enc->state.loop_filter_limits));
+  oc_enquant_qavg_init(_enc->log_qavg,_enc->state.dequant_tables,
+   _enc->state.info.pixel_fmt);
+  return 0;
+}
 
-  /* Mode and MV data not needed for key frames. */
-  if ( cpi->FrameType != KEY_FRAME ){
-    int prebits = oggpackB_bits(cpi->oggbuffer);
-    PackModes(cpi);
-    modebits = oggpackB_bits(cpi->oggbuffer)-prebits;
-    prebits = oggpackB_bits(cpi->oggbuffer);
-    PackMotionVectors (cpi);
-    mvbits = oggpackB_bits(cpi->oggbuffer)-prebits;
+/*Initializes an encoder context from the given stream parameters.
+  The quality setting is limited to at most 63, a negative quality is replaced
+   by 32, and a negative target bitrate is replaced by 0.
+  Rate control state is only set up when the target bitrate is positive.
+  Return: 0 on success, or the negative value returned by oc_state_init().*/
+static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){
+  int ret;
+  int pli;
+  /*Initialize the shared encoder/decoder state.*/
+  ret=oc_state_init(&_enc->state,_info,4);
+  if(ret<0)return ret;
+  if(_enc->state.info.quality>63)_enc->state.info.quality=63;
+  if(_enc->state.info.quality<0)_enc->state.info.quality=32;
+  if(_enc->state.info.target_bitrate<0)_enc->state.info.target_bitrate=0;
+  /*NOTE(review): none of the allocation results below are checked in this
+     function, and oc_enc_mb_info_init() writes through _enc->mb_info further
+     down; confirm whether allocation failure needs to be handled here.*/
+  _enc->mb_info=_ogg_calloc(_enc->state.nmbs,sizeof(*_enc->mb_info));
+  _enc->frag_dc=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_dc));
+  _enc->coded_mbis=
+   (unsigned *)_ogg_malloc(_enc->state.nmbs*sizeof(*_enc->coded_mbis));
+  /*One token list and one extra bits list per zig-zag index, for each plane.*/
+  for(pli=0;pli<3;pli++){
+    _enc->dct_tokens[pli]=(unsigned char **)oc_malloc_2d(64,
+     _enc->state.fplanes[pli].nfrags,sizeof(**_enc->dct_tokens));
+    _enc->extra_bits[pli]=(ogg_uint16_t **)oc_malloc_2d(64,
+     _enc->state.fplanes[pli].nfrags,sizeof(**_enc->extra_bits));
   }
-  ChooseTokenTables(cpi);
-  {
-    int prebits = oggpackB_bits(cpi->oggbuffer);
-    EncodeTokenList(cpi);
-    dctbits = oggpackB_bits(cpi->oggbuffer)-prebits;
-  }
-  bits = oggpackB_bits(cpi->oggbuffer);
-  ReconRefFrames(cpi);
+#if defined(OC_COLLECT_METRICS)
+  _enc->frag_satd=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_satd));
+  _enc->frag_ssd=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_ssd));
+#endif
+#if defined(OC_X86_ASM)
+  oc_enc_vtable_init_x86(_enc);
+#else
+  oc_enc_vtable_init_c(_enc);
+#endif
+  oc_mode_scheme_chooser_init(&_enc->chooser);
+  oc_enc_mb_info_init(_enc);
+  memset(_enc->huff_idxs,0,sizeof(_enc->huff_idxs));
+  /*Reset the packet-out state machine.*/
+  oggpackB_writeinit(&_enc->opb);
+  _enc->packet_state=OC_PACKET_INFO_HDR;
+  /*By default, force a keyframe as often as the granule shift allows.*/
+  _enc->keyframe_frequency_force=1<<_enc->state.info.keyframe_granule_shift;
+  _enc->dup_count=0;
+  _enc->nqueued_dups=0;
+  _enc->prev_dup_count=0;
+  /*Disable VP3 compatibility by default.*/
+  _enc->vp3_compatible=0;
+  /*No INTER frames coded yet.*/
+  _enc->coded_inter_frame=0;
+  /*Start from the default Huffman codes and quantization parameters.*/
+  memcpy(_enc->huff_codes,TH_VP31_HUFF_CODES,sizeof(_enc->huff_codes));
+  oc_enc_set_quant_params(_enc,NULL);
+  _enc->state.qis[0]=_enc->state.info.quality;
+  _enc->state.nqis=1;
+  if(_enc->state.info.target_bitrate>0)oc_rc_state_init(&_enc->rc,_enc);
+  return 0;
+}
 
+/*Releases everything owned by an encoder context, but not the context
+   structure itself.
+  Resources are released in the reverse of the order in which oc_enc_init()
+   sets them up.*/
+static void oc_enc_clear(th_enc_ctx *_enc){
+  int pli;
 #if defined(OC_COLLECT_METRICS)
-  ModeMetrics(cpi);
+  oc_enc_mode_metrics_dump(_enc);
+#endif
+  oggpackB_writeclear(&_enc->opb);
+#if defined(OC_COLLECT_METRICS)
+  _ogg_free(_enc->frag_ssd);
+  _ogg_free(_enc->frag_satd);
+#endif
+  for(pli=3;pli-->0;){
+    oc_free_2d(_enc->extra_bits[pli]);
+    oc_free_2d(_enc->dct_tokens[pli]);
+  }
+  _ogg_free(_enc->coded_mbis);
+  _ogg_free(_enc->frag_dc);
+  _ogg_free(_enc->mb_info);
+  oc_state_clear(&_enc->state);
+}
 
-#if 0
-  {
-    int total = cpi->frag_total*64;
-    int fi=0,pi,x,y;
-    ogg_int64_t ssd=0;
-    double minimize;
 
-    for(pi=0;pi<3;pi++){
-      int bi = cpi->frag_buffer_index[fi];
-      unsigned char *frame = cpi->frame+bi;
-      unsigned char *recon = cpi->lastrecon+bi;
-      int stride = cpi->stride[pi];
-      int h = cpi->frag_h[pi]*8;
-      int v = cpi->frag_v[pi]*8;
+/*Compresses the current frame as a key (INTRA) frame and packs the result.
+  When a target bitrate is set, a single quantizer index is selected for the
+   frame first.
+  _recode: Passed through to oc_enc_analyze().*/
+static void oc_enc_compress_keyframe(oc_enc_ctx *_enc,int _recode){
+  int rate_controlled;
+  rate_controlled=_enc->state.info.target_bitrate>0;
+  if(rate_controlled){
+    int not_first;
+    /*Set for every frame except frame number 0.*/
+    not_first=_enc->state.curframe_num>0;
+    _enc->state.qis[0]=oc_enc_select_qi(_enc,OC_INTRA_FRAME,not_first);
+    _enc->state.nqis=1;
+  }
+  oc_enc_calc_lambda(_enc,OC_INTRA_FRAME);
+  oc_enc_analyze(_enc,OC_INTRA_FRAME,_recode);
+  oc_enc_frame_pack(_enc);
+}
 
-      for(y=0;y<v;y++){
-        int lssd=0;
-        for(x=0;x<h;x++)
-          lssd += (frame[x]-recon[x])*(frame[x]-recon[x]);
-        ssd+=lssd;
-        frame+=stride;
-        recon+=stride;
+/*Compresses the current frame as a delta (INTER) frame and packs the result.
+  If oc_enc_analyze() returns non-zero, the frame is compressed as a keyframe
+   instead.
+  The very first INTER frame is compressed twice: once as a dry run, and once
+   more through a single recursive call.
+  _recode: Passed through to oc_enc_analyze().*/
+static void oc_enc_compress_frame(oc_enc_ctx *_enc,int _recode){
+  if(_enc->state.info.target_bitrate>0){
+    _enc->state.qis[0]=oc_enc_select_qi(_enc,OC_INTER_FRAME,1);
+    _enc->state.nqis=1;
+  }
+  oc_enc_calc_lambda(_enc,OC_INTER_FRAME);
+  if(oc_enc_analyze(_enc,OC_INTER_FRAME,_recode)){
+    /*Mode analysis thinks this should have been a keyframe; start over.*/
+    oc_enc_compress_keyframe(_enc,1);
+  }
+  else{
+    oc_enc_frame_pack(_enc);
+    if(!_enc->coded_inter_frame){
+      /*On the first INTER frame, the previous call was an initial dry-run to
+         prime feed-forward statistics.
+        The flag is set before recursing, so the recursion happens only
+         once.*/
+      _enc->coded_inter_frame=1;
+      if(_enc->state.info.target_bitrate>0){
+        oc_enc_update_rc_state(_enc,oggpackB_bytes(&_enc->opb)<<3,
+         OC_INTER_FRAME,_enc->state.qis[0],1);
       }
-      fi+=cpi->frag_n[pi];
+      oc_enc_compress_frame(_enc,1);
     }
+  }
+}
 
-    minimize = ssd + (float)bits*cpi->token_lambda*16;
+/*Set the granule position for the next packet to output based on the current
+   internal state.
+  The result is stored in _enc->state.granpos.
+  A frame number, offset by granpos_bias, goes in the bits above
+   keyframe_granule_shift, and a frame offset is added in below them.*/
+static void oc_enc_set_granpos(oc_enc_ctx *_enc){
+  unsigned dup_offs;
+  /*Add an offset for the number of duplicate frames we've emitted so far.*/
+  dup_offs=_enc->prev_dup_count-_enc->nqueued_dups;
+  /*If the current frame was a keyframe, use it for the high part.
+    The low part is then just the duplicate frame offset.*/
+  if(_enc->state.frame_type==OC_INTRA_FRAME){
+    _enc->state.granpos=(_enc->state.curframe_num+_enc->state.granpos_bias<<
+     _enc->state.info.keyframe_granule_shift)+dup_offs;
+  }
+  /*Otherwise use the last keyframe in the high part and put the current frame
+     in the low part, as a distance from that keyframe plus the duplicate
+     frame offset.*/
+  else{
+    _enc->state.granpos=
+     (_enc->state.keyframe_num+_enc->state.granpos_bias<<
+     _enc->state.info.keyframe_granule_shift)
+     +_enc->state.curframe_num-_enc->state.keyframe_num+dup_offs;
+  }
+}
 
-    fprintf(stdout,"%d %d %d %d %f %f %f %ld %ld %ld %ld %f %f  %.0f %.0f %.0f %.0f %.0f %.0f %.0f %.0f  %.0f %.0f %.0f %.0f %.0f %.0f %.0f %.0f  \n",
-            (int)cpi->CurrentFrame, // 0
-            cpi->BaseQ,             // 1
-            cpi->token_lambda,      // 2
-            cpi->skip_lambda,       // 3
-            (double)cpi->rho_count[cpi->BaseQ]/total,           // 4
-            (double)cpi->rho_postop/total,                      // 5
-            (double)cpi->rho_postop/cpi->rho_count[cpi->BaseQ], // 6
-            modebits,               // 7
-            mvbits,                 // 8
-            dctbits,                // 9
-            oggpackB_bits(cpi->oggbuffer), // 10
-            (double)ssd,              // 11
-            (double)0,
-            (double)cpi->dist_dist[0][0],//13
-            (double)cpi->dist_dist[0][1],
-            (double)cpi->dist_dist[0][2],
-            (double)cpi->dist_dist[0][3],
-            (double)cpi->dist_dist[0][4],
-            (double)cpi->dist_dist[0][5],
-            (double)cpi->dist_dist[0][6],
-            (double)cpi->dist_dist[0][7],
-            (double)(cpi->dist_bits[0][0]>>7),//21
-            (double)(cpi->dist_bits[0][1]>>7),
-            (double)(cpi->dist_bits[0][2]>>7),
-            (double)(cpi->dist_bits[0][3]>>7),
-            (double)(cpi->dist_bits[0][4]>>7),
-            (double)(cpi->dist_bits[0][5]>>7),
-            (double)(cpi->dist_bits[0][6]>>7),
-            (double)(cpi->dist_bits[0][7]>>7)
 
+/*Allocates and initializes an encoder context.
+  _info: The stream parameters to encode with.
+  Return: The new context, or NULL if _info is NULL, the context could not be
+           allocated, or oc_enc_init() failed.
+          A context returned here is released with th_encode_free().*/
+th_enc_ctx *th_encode_alloc(const th_info *_info){
+  oc_enc_ctx *enc;
+  if(_info==NULL)return NULL;
+  enc=_ogg_malloc(sizeof(*enc));
+  /*Do not hand a failed allocation to oc_enc_init(), which would write
+     through it.*/
+  if(enc==NULL)return NULL;
+  if(oc_enc_init(enc,_info)<0){
+    _ogg_free(enc);
+    return NULL;
+  }
+  return enc;
+}
 
-            );
+/*Releases an encoder context: first everything it owns, via oc_enc_clear(),
+   and then the context structure itself.
+  Passing NULL is allowed and does nothing.*/
+void th_encode_free(th_enc_ctx *_enc){
+  if(_enc!=NULL){
+    oc_enc_clear(_enc);
+    _ogg_free(_enc);
   }
-#endif
-#endif
-  oc_enc_restore_fpu(cpi);
 }
 
-void WriteFrameHeader( CP_INSTANCE *cpi) {
-  oggpack_buffer *opb=cpi->oggbuffer;
+/*Handles encoder control requests.
+  _req:    The request to perform (one of the TH_ENCCTL_* codes handled below).
+  _buf:    The request's argument buffer; several requests also write their
+            result back into it.
+  _buf_sz: The size of _buf in bytes; each request requires an exact size.
+  Return: 0 on success, TH_EFAULT if a required pointer is NULL, TH_EINVAL if
+           the buffer size or the requested value is not acceptable, TH_EIMPL
+           if the request is not recognized, or a negative value from the
+           Huffman code or quantization parameter setters.*/
+int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz){
+  switch(_req){
+    case TH_ENCCTL_SET_HUFFMAN_CODES:{
+      /*A NULL buffer of size 0 is passed through as NULL, which selects the
+         default codes.*/
+      if(_buf==NULL&&_buf_sz!=0||
+       _buf!=NULL&&_buf_sz!=sizeof(th_huff_table)*TH_NHUFFMAN_TABLES){
+        return TH_EINVAL;
+      }
+      return oc_enc_set_huffman_codes(_enc,(const th_huff_table *)_buf);
+    }break;
+    case TH_ENCCTL_SET_QUANT_PARAMS:{
+      /*A NULL buffer of size 0 is passed through as NULL, which selects the
+         default parameters.*/
+      if(_buf==NULL&&_buf_sz!=0||
+       _buf!=NULL&&_buf_sz!=sizeof(th_quant_info)){
+        return TH_EINVAL;
+      }
+      return oc_enc_set_quant_params(_enc,(th_quant_info *)_buf);
+    }break;
+    case TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE:{
+      ogg_uint32_t keyframe_frequency_force;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(keyframe_frequency_force))return TH_EINVAL;
+      keyframe_frequency_force=*(ogg_uint32_t *)_buf;
+      if(_enc->packet_state==OC_PACKET_INFO_HDR){
+        /*It's still early enough to enlarge keyframe_granule_shift.*/
+        _enc->state.info.keyframe_granule_shift=OC_CLAMPI(
+         _enc->state.info.keyframe_granule_shift,
+         OC_ILOG_32(keyframe_frequency_force-1),31);
+      }
+      /*The request is limited to what the granule shift can represent, and
+         the value actually adopted is reported back to the caller.*/
+      _enc->keyframe_frequency_force=OC_MINI(keyframe_frequency_force,
+       (ogg_uint32_t)1U<<_enc->state.info.keyframe_granule_shift);
+      *(ogg_uint32_t *)_buf=_enc->keyframe_frequency_force;
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_VP3_COMPATIBLE:{
+      int vp3_compatible;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(vp3_compatible))return TH_EINVAL;
+      vp3_compatible=*(int *)_buf;
+      /*NOTE(review): _enc->vp3_compatible keeps the requested value even when
+         the checks below report 0 back to the caller, and the default codes
+         and quantization parameters are installed whatever value was
+         requested; confirm both are intended.*/
+      _enc->vp3_compatible=vp3_compatible;
+      if(oc_enc_set_huffman_codes(_enc,TH_VP31_HUFF_CODES)<0)vp3_compatible=0;
+      if(oc_enc_set_quant_params(_enc,&TH_VP31_QUANT_INFO)<0)vp3_compatible=0;
+      if(_enc->state.info.pixel_fmt!=TH_PF_420||
+       _enc->state.info.pic_width<_enc->state.info.frame_width||
+       _enc->state.info.pic_height<_enc->state.info.frame_height||
+      /*If we have more than 4095 super blocks, VP3's RLE coding might
+         overflow.
+        We could overcome this by ensuring we flip the coded/not-coded flags on
+         at least one super block in the frame, but we pick the simple solution
+         of just telling the user the stream will be incompatible instead.
+        It's unlikely the old VP3 codec would be able to decode streams at this
+         resolution in real time in the first place.*/
+       _enc->state.nsbs>4095){
+        vp3_compatible=0;
+      }
+      *(int *)_buf=vp3_compatible;
+      return 0;
+    }break;
+    case TH_ENCCTL_GET_SPLEVEL_MAX:{
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(int))return TH_EINVAL;
+      *(int *)_buf=2;
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_SPLEVEL:{
+      int speed;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(speed))return TH_EINVAL;
+      speed=*(int *)_buf;
+      /*Levels 0 through 2 are accepted, but currently change nothing: the
+         settings they would apply are commented out.*/
+      switch(speed){
+        case 0:{
+          /*_enc->MotionCompensation=1;*/
+          /*_enc->info.quick_p=0;*/
+        }break;
+        case 1:{
+          /*_enc->MotionCompensation=1;*/
+          /*_enc->info.quick_p=1;*/
+        }break;
+        case 2:{
+          /*_enc->MotionCompensation=0;*/
+          /*_enc->info.quick_p=1;*/
+        }break;
+        default:{
+          return TH_EINVAL;
+        }
+      }
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_DUP_COUNT:{
+      int dup_count;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(dup_count))return TH_EINVAL;
+      dup_count=*(int *)_buf;
+      /*NOTE(review): if keyframe_frequency_force has an unsigned type, a
+         negative dup_count is converted to a large unsigned value by this
+         comparison and rejected, so the clamp to 0 below would never see a
+         negative value; confirm which behavior is intended.*/
+      if(dup_count>=_enc->keyframe_frequency_force)return TH_EINVAL;
+      _enc->dup_count=OC_MAXI(dup_count,0);
+      return 0;
+    }break;
+    default:return TH_EIMPL;
+  }
+}
 
-  /* Output the frame type (base/key frame or inter frame) */
-  oggpackB_write( opb, cpi->FrameType, 1 );
+/*Produces the next header packet by handing the encoder's state, quantization
+   parameters, Huffman codes, and the library version string to
+   oc_state_flushheader().
+  _tc: The comment data, passed through unchanged.
+  _op: The packet structure, passed through unchanged.
+  Return: TH_EFAULT if _enc is NULL; otherwise whatever oc_state_flushheader()
+           returns.*/
+int th_encode_flushheader(th_enc_ctx *_enc,th_comment *_tc,ogg_packet *_op){
+  const th_huff_table *codes;
+  const char          *vendor;
+  if(_enc==NULL)return TH_EFAULT;
+  codes=(const th_huff_table *)_enc->huff_codes;
+  vendor=th_version_string();
+  return oc_state_flushheader(&_enc->state,&_enc->packet_state,&_enc->opb,
+   &_enc->qinfo,codes,vendor,_tc,_op);
+}
 
-  /* Write out details of the current value of Q... variable resolution. */
-  oggpackB_write( opb, cpi->BaseQ, 6 ); // temporary
-
-  /* we only support one Q index per frame */
-  oggpackB_write( opb, 0, 1 );
-
-  /* If the frame was a base frame then write out the frame dimensions. */
-  if ( cpi->FrameType == KEY_FRAME ) {
-    /* all bits reserved! */
-    oggpackB_write( opb, 0, 3 );
+/*Copies the picture region of one image plane into a destination plane and
+   synthesizes the padding around it.
+  _dst:        The destination plane; its width and height give the full area
+                to fill.
+  _src:        The source plane; only the picture region is read from it.
+  _pic_x:      The column of the left edge of the picture region.
+  _pic_y:      The row of the top edge of the picture region.
+  _pic_width:  The width of the picture region.
+  _pic_height: The height of the picture region.
+  If either picture dimension is zero, the whole destination is filled with
+   zeros instead.*/
+static void oc_img_plane_copy_pad(th_img_plane *_dst,th_img_plane *_src,
+ ogg_uint32_t _pic_x,ogg_uint32_t _pic_y,
+ ogg_uint32_t _pic_width,ogg_uint32_t _pic_height){
+  unsigned char *dst;
+  int            dstride;
+  ogg_uint32_t   frame_width;
+  ogg_uint32_t   frame_height;
+  ogg_uint32_t   y;
+  frame_width=_dst->width;
+  frame_height=_dst->height;
+  /*If we have _no_ data, just encode a dull green.*/
+  if(_pic_width==0||_pic_height==0){
+    dst=_dst->data;
+    dstride=_dst->stride;
+    for(y=0;y<frame_height;y++){
+      memset(dst,0,frame_width*sizeof(*dst));
+      dst+=dstride;
+    }
   }
+  /*Otherwise, copy what we do have, and add our own padding.*/
+  else{
+    unsigned char *dst_data;
+    unsigned char *src_data;
+    unsigned char *src;
+    int            sstride;
+    ogg_uint32_t   x;
+    /*Step 1: Copy the data we do have.*/
+    dstride=_dst->stride;
+    sstride=_src->stride;
+    dst_data=_dst->data;
+    src_data=_src->data;
+    dst=dst_data+_pic_y*dstride+_pic_x;
+    src=src_data+_pic_y*sstride+_pic_x;
+    for(y=0;y<_pic_height;y++){
+      memcpy(dst,src,_pic_width);
+      dst+=dstride;
+      src+=sstride;
+    }
+    /*Step 2: Perform a low-pass extension into the padding region.
+      Each new pixel is a rounded (1,2,1)/4 average of the three nearest pixels
+       in the adjacent column or row that has already been filled.*/
+    /*Left side.*/
+    for(x=_pic_x;x-->0;){
+      dst=dst_data+_pic_y*dstride+x;
+      for(y=0;y<_pic_height;y++){
+        /*dstride&-(cond) is dstride when cond holds and 0 otherwise, so the
+           first and last picture rows use themselves in place of the missing
+           row above or below.*/
+        dst[0]=(dst[1]<<1)+(dst-(dstride&-(y>0)))[1]
+         +(dst+(dstride&-(y+1<_pic_height)))[1]+2>>2;
+        dst+=dstride;
+      }
+    }
+    /*Right side.*/
+    for(x=_pic_x+_pic_width;x<frame_width;x++){
+      dst=dst_data+_pic_y*dstride+x-1;
+      for(y=0;y<_pic_height;y++){
+        dst[1]=(dst[0]<<1)+(dst-(dstride&-(y>0)))[0]
+         +(dst+(dstride&-(y+1<_pic_height)))[0]+2>>2;
+        dst+=dstride;
+      }
+    }
+    /*Top.
+      These rows span the full width, including the columns just added on the
+       left and right; the first and last columns use themselves in place of
+       the missing neighbor.*/
+    dst=dst_data+_pic_y*dstride;
+    for(y=_pic_y;y-->0;){
+      for(x=0;x<frame_width;x++){
+        (dst-dstride)[x]=(dst[x]<<1)+dst[x-(x>0)]
+         +dst[x+(x+1<frame_width)]+2>>2;
+      }
+      dst-=dstride;
+    }
+    /*Bottom.*/
+    dst=dst_data+(_pic_y+_pic_height)*dstride;
+    for(y=_pic_y+_pic_height;y<frame_height;y++){
+      for(x=0;x<frame_width;x++){
+        dst[x]=((dst-dstride)[x]<<1)+(dst-dstride)[x-(x>0)]
+         +(dst-dstride)[x+(x+1<frame_width)]+2>>2;
+      }
+      dst+=dstride;
+    }
+  }
 }
 
-void oc_enc_dequant_idct8x8(const CP_INSTANCE *_cpi,ogg_int16_t _y[64],
- const ogg_int16_t _x[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]){
-  (*_cpi->opt_vtable.dequant_idct8x8)(_y,_x,_last_zzi,_ncoefs,
-   _dc_quant,_ac_quant);
+/*Submits one frame of image data to the encoder and compresses it.
+  _img: The three image planes.
+        Plane 0 must match the frame dimensions, and planes 1 and 2 must match
+         the frame dimensions after the decimation implied by the pixel format.
+  Return: 0 on success, TH_EFAULT if _enc or _img is NULL, or TH_EINVAL if the
+           encoder is in the OC_PACKET_DONE state or the plane dimensions do
+           not match.*/
+int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
+  th_ycbcr_buffer img;
+  int             cframe_width;
+  int             cframe_height;
+  int             cpic_width;
+  int             cpic_height;
+  int             cpic_x;
+  int             cpic_y;
+  int             hdec;
+  int             vdec;
+  int             pli;
+  int             refi;
+  /*Step 1: validate parameters.*/
+  if(_enc==NULL||_img==NULL)return TH_EFAULT;
+  if(_enc->packet_state==OC_PACKET_DONE)return TH_EINVAL;
+  if((ogg_uint32_t)_img[0].width!=_enc->state.info.frame_width||
+   (ogg_uint32_t)_img[0].height!=_enc->state.info.frame_height){
+    return TH_EINVAL;
+  }
+  /*The low two bits of the pixel format say whether the chroma planes are
+     decimated horizontally and vertically (a clear bit means decimated).*/
+  hdec=!(_enc->state.info.pixel_fmt&1);
+  vdec=!(_enc->state.info.pixel_fmt&2);
+  cframe_width=_enc->state.info.frame_width>>hdec;
+  cframe_height=_enc->state.info.frame_height>>vdec;
+  if(_img[1].width!=cframe_width||_img[2].width!=cframe_width||
+   _img[1].height!=cframe_height||_img[2].height!=cframe_height){
+    return TH_EINVAL;
+  }
+  /*Step 2: Copy the input to our internal buffer.
+    This lets us add padding, if necessary, so we don't have to worry about
+     dereferencing possibly invalid addresses, and allows us to use the same
+     strides and fragment offsets for both the input frame and the reference
+     frames.*/
+  /*Flip the input buffer upside down.*/
+  oc_ycbcr_buffer_flip(img,_img);
+  oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[OC_FRAME_IO]+0,img+0,
+   _enc->state.info.pic_x,_enc->state.info.pic_y,
+   _enc->state.info.pic_width,_enc->state.info.pic_height);
+  /*The chroma picture region is the luma region scaled by the decimation,
+     with its right and bottom edges rounded up.*/
+  cpic_x=_enc->state.info.pic_x>>hdec;
+  cpic_y=_enc->state.info.pic_y>>vdec;
+  cpic_width=(_enc->state.info.pic_x+_enc->state.info.pic_width+hdec>>hdec)
+   -cpic_x;
+  cpic_height=(_enc->state.info.pic_y+_enc->state.info.pic_height+vdec>>vdec)
+   -cpic_y;
+  for(pli=1;pli<3;pli++){
+    oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[OC_FRAME_IO]+pli,img+pli,
+     cpic_x,cpic_y,cpic_width,cpic_height);
+  }
+  /*Step 3: Update the buffer state.*/
+  if(_enc->state.ref_frame_idx[OC_FRAME_SELF]>=0){
+    _enc->state.ref_frame_idx[OC_FRAME_PREV]=
+     _enc->state.ref_frame_idx[OC_FRAME_SELF];
+    if(_enc->state.frame_type==OC_INTRA_FRAME){
+      /*The new frame becomes both the previous and gold reference frames.*/
+      _enc->state.keyframe_num=_enc->state.curframe_num;
+      _enc->state.ref_frame_idx[OC_FRAME_GOLD]=
+       _enc->state.ref_frame_idx[OC_FRAME_SELF];
+    }
+  }
+  /*Select a free buffer to use for the reconstructed version of this frame.*/
+  for(refi=0;refi==_enc->state.ref_frame_idx[OC_FRAME_GOLD]||
+   refi==_enc->state.ref_frame_idx[OC_FRAME_PREV];refi++);
+  _enc->state.ref_frame_idx[OC_FRAME_SELF]=refi;
+  /*Duplicates of the previous frame count as frames of their own.*/
+  _enc->state.curframe_num+=_enc->prev_dup_count+1;
+  /*Step 4: Compress the frame.*/
+  /*Don't allow the generation of invalid files that overflow the
+     keyframe_granule_shift.*/
+  if(_enc->state.curframe_num==0||
+   _enc->state.curframe_num-_enc->state.keyframe_num+_enc->dup_count>=
+   _enc->keyframe_frequency_force){
+    oc_enc_compress_keyframe(_enc,0);
+    /*On the first frame, the previous call was an initial dry-run to prime
+       feed-forward statistics.*/
+    if(_enc->state.curframe_num==0){
+      if(_enc->state.info.target_bitrate>0){
+        oc_enc_update_rc_state(_enc,oggpackB_bytes(&_enc->opb)<<3,
+         OC_INTRA_FRAME,_enc->state.qis[0],1);
+      }
+      oc_enc_compress_keyframe(_enc,1);
+    }
+  }
+  /*Compress the frame.*/
+  else oc_enc_compress_frame(_enc,0);
+  /*Step 5: Finish reconstruction.
+    TODO: Move this inline with compression process.*/
+  {
+    int bv[256];
+    int loop_filter;
+    /*The loop filter is applied only when its initialization returns 0.*/
+    loop_filter=!oc_state_loop_filter_init(&_enc->state,bv);
+    for(pli=0;pli<3;pli++){
+      if(loop_filter){
+        oc_state_loop_filter_frag_rows(&_enc->state,bv,refi,pli,
+         0,_enc->state.fplanes[pli].nvfrags);
+      }
+      oc_state_borders_fill_rows(&_enc->state,refi,pli,
+       0,_enc->state.ref_frame_bufs[refi][pli].height);
+      oc_state_borders_fill_caps(&_enc->state,refi,pli);
+    }
+  }
+  oc_restore_fpu(&_enc->state);
+  /*Update state variables.*/
+  _enc->packet_state=OC_PACKET_READY;
+  if(_enc->state.info.target_bitrate>0){
+    oc_enc_update_rc_state(_enc,oggpackB_bytes(&_enc->opb)<<3,
+     _enc->state.frame_type,_enc->state.qis[0],0);
+  }
+  /*The duplicate count requested for this frame is now queued for output, and
+     the request is reset for the next frame.*/
+  _enc->prev_dup_count=_enc->nqueued_dups=_enc->dup_count;
+  _enc->dup_count=0;
+#if defined(OC_DUMP_IMAGES)
+  oc_enc_set_granpos(_enc);
+  oc_state_dump_frame(&_enc->state,OC_FRAME_IO,"src");
+  oc_state_dump_frame(&_enc->state,OC_FRAME_SELF,"rec");
+#endif
+  return 0;
 }
 
-void oc_enc_loop_filter(CP_INSTANCE *_cpi,int _flimit){
-  (*_cpi->opt_vtable.enc_loop_filter)(_cpi,_flimit);
+/*Retrieves the next packet for the most recently compressed frame.
+  The frame's own packet is returned first; each queued duplicate frame is then
+   returned as a packet with no data (NULL pointer, 0 bytes).
+  _last_p: When non-zero, the final packet returned is flagged end-of-stream,
+            and the encoder enters the OC_PACKET_DONE state once nothing is
+            left to return.
+  _op:     The packet structure to fill in.
+           The packet data points into the encoder's own buffer.
+  Return: TH_EFAULT if _enc or _op is NULL, 0 if there is no packet to return,
+           or otherwise 1 plus the number of duplicate packets still queued.*/
+int th_encode_packetout(th_enc_ctx *_enc,int _last_p,ogg_packet *_op){
+  if(_enc==NULL||_op==NULL)return TH_EFAULT;
+  if(_enc->packet_state==OC_PACKET_READY){
+    _enc->packet_state=OC_PACKET_EMPTY;
+    _op->packet=oggpackB_get_buffer(&_enc->opb);
+    _op->bytes=oggpackB_bytes(&_enc->opb);
+  }
+  else if(_enc->packet_state==OC_PACKET_EMPTY){
+    if(_enc->nqueued_dups>0){
+      _enc->nqueued_dups--;
+      _op->packet=NULL;
+      _op->bytes=0;
+    }
+    else{
+      if(_last_p)_enc->packet_state=OC_PACKET_DONE;
+      return 0;
+    }
+  }
+  else return 0;
+  _op->b_o_s=0;
+  /*Only the very last packet, after all queued duplicates, ends the stream.*/
+  _op->e_o_s=_last_p=_last_p&&_enc->nqueued_dups<=0;
+  oc_enc_set_granpos(_enc);
+  _op->packetno=th_granule_frame(_enc,_enc->state.granpos)+3;
+  _op->granulepos=_enc->state.granpos;
+  return 1+_enc->nqueued_dups;
 }
-
-void oc_enc_vtable_init_c(CP_INSTANCE *_cpi){
-  /*The implementations prefixed with oc_enc_ are encoder-specific.
-    The rest we re-use from the decoder.*/
-  _cpi->opt_vtable.frag_sad=oc_enc_frag_sad_c;
-  _cpi->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_c;
-  _cpi->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_c;
-  _cpi->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_c;
-  _cpi->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_c;
-  _cpi->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_c;
-  _cpi->opt_vtable.frag_sub=oc_enc_frag_sub_c;
-  _cpi->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_c;
-  _cpi->opt_vtable.frag_copy=oc_frag_copy_c;
-  _cpi->opt_vtable.frag_copy2=oc_enc_frag_copy2_c;
-  _cpi->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
-  _cpi->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
-  _cpi->opt_vtable.fdct8x8=oc_enc_fdct8x8_c;
-  _cpi->opt_vtable.dequant_idct8x8=oc_dequant_idct8x8_c;
-  _cpi->opt_vtable.enc_loop_filter=oc_enc_loop_filter_c;
-  _cpi->opt_vtable.restore_fpu=oc_restore_fpu_c;
-}

Modified: branches/theora-thusnelda/lib/enc/encoder_disabled.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encoder_disabled.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/encoder_disabled.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -14,38 +14,54 @@
   last mod: $Id$
 
  ********************************************************************/
+#include "../dec/apiwrapper.h"
+#include "encint.h"
 
-#ifdef HAVE_CONFIG_H
-# include "config.h"
-#endif
+th_enc_ctx *th_encode_alloc(const th_info *_info){
+  return NULL;
+}
 
-#include "toplevel_lookup.h"
-#include "codec_internal.h"
+void th_encode_free(th_enc_ctx *_enc){}
 
-int theora_encode_init(theora_state *th, theora_info *c){
+
+int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz){
   return OC_DISABLED;
 }
 
-int theora_encode_YUVin(theora_state *t, yuv_buffer *yuv){
+int th_encode_flushheader(th_enc_ctx *_enc,th_comment *_tc,ogg_packet *_op){
   return OC_DISABLED;
 }
 
-int theora_encode_packetout( theora_state *t, int last_p, ogg_packet *op){
+int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
   return OC_DISABLED;
 }
 
-int theora_encode_header(theora_state *t, ogg_packet *op){
+int th_encode_packetout(th_enc_ctx *_enc,int _last_p,ogg_packet *_op){
   return OC_DISABLED;
 }
 
-int theora_encode_comment(theora_comment *tc, ogg_packet *op){
+
+
+int theora_encode_init(theora_state *_te,theora_info *_ci){
   return OC_DISABLED;
 }
 
-int theora_encode_tables(theora_state *t, ogg_packet *op){
+int theora_encode_YUVin(theora_state *_te,yuv_buffer *_yuv){
   return OC_DISABLED;
 }
 
-void theora_encoder_clear (CP_INSTANCE * cpi)
-{
+int theora_encode_packetout(theora_state *_te,int _last_p,ogg_packet *_op){
+  return OC_DISABLED;
 }
+
+int theora_encode_header(theora_state *_te,ogg_packet *_op){
+  return OC_DISABLED;
+}
+
+int theora_encode_comment(theora_comment *_tc,ogg_packet *_op){
+  return OC_DISABLED;
+}
+
+int theora_encode_tables(theora_state *_te,ogg_packet *_op){
+  return OC_DISABLED;
+}

Deleted: branches/theora-thusnelda/lib/enc/encoder_huffman.h
===================================================================
--- branches/theora-thusnelda/lib/enc/encoder_huffman.h	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/encoder_huffman.h	2009-06-13 16:04:06 UTC (rev 16102)
@@ -1,76 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id$
-
- ********************************************************************/
-
-/********************************************************************
- *  Constants
- ********************************************************************/
-#define NUM_HUFF_TABLES         80
-#define DC_HUFF_OFFSET          0
-#define AC_HUFF_OFFSET          16
-#define AC_TABLE_2_THRESH       5
-#define AC_TABLE_3_THRESH       14
-#define AC_TABLE_4_THRESH       27
-
-#define DC_HUFF_CHOICES         16
-#define DC_HUFF_CHOICE_BITS     4
-
-#define AC_HUFF_CHOICES         16
-#define AC_HUFF_CHOICE_BITS     4
-
-/* Constants assosciated with entropy tokenisation. */
-#define MAX_SINGLE_TOKEN_VALUE  6
-#define DCT_VAL_CAT2_MIN        3
-#define DCT_VAL_CAT3_MIN        7
-#define DCT_VAL_CAT4_MIN        9
-#define DCT_VAL_CAT5_MIN        13
-#define DCT_VAL_CAT6_MIN        21
-#define DCT_VAL_CAT7_MIN        37
-#define DCT_VAL_CAT8_MIN        69
-
-#define DCT_EOB_TOKEN           0
-#define DCT_EOB_PAIR_TOKEN      1
-#define DCT_EOB_TRIPLE_TOKEN    2
-#define DCT_REPEAT_RUN_TOKEN    3
-#define DCT_REPEAT_RUN2_TOKEN   4
-#define DCT_REPEAT_RUN3_TOKEN   5
-#define DCT_REPEAT_RUN4_TOKEN   6
-
-#define DCT_SHORT_ZRL_TOKEN     7
-#define DCT_ZRL_TOKEN           8
-
-#define ONE_TOKEN               9       /* Special tokens for -1,1,-2,2 */
-#define MINUS_ONE_TOKEN         10
-#define TWO_TOKEN               11
-#define MINUS_TWO_TOKEN         12
-
-#define LOW_VAL_TOKENS          (MINUS_TWO_TOKEN + 1)   /* 13-16 */
-#define DCT_VAL_CATEGORY3       (LOW_VAL_TOKENS + 4)    /* 17 */
-#define DCT_VAL_CATEGORY4       (DCT_VAL_CATEGORY3 + 1) /* 18 */
-#define DCT_VAL_CATEGORY5       (DCT_VAL_CATEGORY4 + 1) /* 19 */
-#define DCT_VAL_CATEGORY6       (DCT_VAL_CATEGORY5 + 1) /* 20 */
-#define DCT_VAL_CATEGORY7       (DCT_VAL_CATEGORY6 + 1) /* 21 */
-#define DCT_VAL_CATEGORY8       (DCT_VAL_CATEGORY7 + 1) /* 22 */
-
-#define DCT_RUN_CATEGORY1       (DCT_VAL_CATEGORY8 + 1) /* 23-27 */
-#define DCT_RUN_CATEGORY1B      (DCT_RUN_CATEGORY1 + 5) /* 28 */
-#define DCT_RUN_CATEGORY1C      (DCT_RUN_CATEGORY1B+ 1) /* 29 */
-#define DCT_RUN_CATEGORY2       (DCT_RUN_CATEGORY1C+ 1) /* 30 */
-#define DCT_RUN_CATEGORY2B      (DCT_RUN_CATEGORY2 + 1) /* 31 */
-
-/* 32 */
-#define MAX_ENTROPY_TOKENS      (DCT_RUN_CATEGORY2B + 1) /* 32 */
-#define DCT_NOOP                MAX_ENTROPY_TOKENS

Deleted: branches/theora-thusnelda/lib/enc/encoder_lookup.h
===================================================================
--- branches/theora-thusnelda/lib/enc/encoder_lookup.h	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/encoder_lookup.h	2009-06-13 16:04:06 UTC (rev 16102)
@@ -1,161 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function: simple static lookups for VP3 frame encoder
-  last mod: $Id$
-
- ********************************************************************/
-
-#include "codec_internal.h"
-
-static const ogg_uint32_t MvPattern[(MAX_MV_EXTENT * 2) + 1] = {
-  0x000000ff, 0x000000fd, 0x000000fb, 0x000000f9,
-  0x000000f7, 0x000000f5, 0x000000f3, 0x000000f1,
-  0x000000ef, 0x000000ed, 0x000000eb, 0x000000e9,
-  0x000000e7, 0x000000e5, 0x000000e3, 0x000000e1,
-  0x0000006f, 0x0000006d, 0x0000006b, 0x00000069,
-  0x00000067, 0x00000065, 0x00000063, 0x00000061,
-  0x0000002f, 0x0000002d, 0x0000002b, 0x00000029,
-  0x00000009, 0x00000007, 0x00000002, 0x00000000,
-  0x00000001, 0x00000006, 0x00000008, 0x00000028,
-  0x0000002a, 0x0000002c, 0x0000002e, 0x00000060,
-  0x00000062, 0x00000064, 0x00000066, 0x00000068,
-  0x0000006a, 0x0000006c, 0x0000006e, 0x000000e0,
-  0x000000e2, 0x000000e4, 0x000000e6, 0x000000e8,
-  0x000000ea, 0x000000ec, 0x000000ee, 0x000000f0,
-  0x000000f2, 0x000000f4, 0x000000f6, 0x000000f8,
-  0x000000fa, 0x000000fc, 0x000000fe,
-};
-
-static const ogg_uint32_t MvBits[(MAX_MV_EXTENT * 2) + 1] = {
-  8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8,
-  7, 7, 7, 7, 7, 7, 7, 7,
-  6, 6, 6, 6, 4, 4, 3, 3,
-  3, 4, 4, 6, 6, 6, 6, 7,
-  7, 7, 7, 7, 7, 7, 7, 8,
-  8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8,
-};
-
-static const ogg_uint32_t MvPattern2[(MAX_MV_EXTENT * 2) + 1] = {
-  0x0000003f, 0x0000003d, 0x0000003b, 0x00000039,
-  0x00000037, 0x00000035, 0x00000033, 0x00000031,
-  0x0000002f, 0x0000002d, 0x0000002b, 0x00000029,
-  0x00000027, 0x00000025, 0x00000023, 0x00000021,
-  0x0000001f, 0x0000001d, 0x0000001b, 0x00000019,
-  0x00000017, 0x00000015, 0x00000013, 0x00000011,
-  0x0000000f, 0x0000000d, 0x0000000b, 0x00000009,
-  0x00000007, 0x00000005, 0x00000003, 0x00000000,
-  0x00000002, 0x00000004, 0x00000006, 0x00000008,
-  0x0000000a, 0x0000000c, 0x0000000e, 0x00000010,
-  0x00000012, 0x00000014, 0x00000016, 0x00000018,
-  0x0000001a, 0x0000001c, 0x0000001e, 0x00000020,
-  0x00000022, 0x00000024, 0x00000026, 0x00000028,
-  0x0000002a, 0x0000002c, 0x0000002e, 0x00000030,
-  0x00000032, 0x00000034, 0x00000036, 0x00000038,
-  0x0000003a, 0x0000003c, 0x0000003e,
-};
-
-static const ogg_uint32_t MvBits2[(MAX_MV_EXTENT * 2) + 1] = {
-  6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 6, 6, 6, 6,
-  6, 6, 6, 6, 6, 6, 6,
-};
-
-static const unsigned char ModeBitPatterns[MAX_MODES] = {
-  0x00, 0x02, 0x06, 0x0E, 0x1E, 0x3E, 0x7E, 0x7F };
-
-static const unsigned char ModeBitLengths[MAX_MODES] =  {
-  1,    2,    3,    4,    5,    6,    7,    7 };
-
-static const unsigned char ModeBitLengthsD[MAX_MODES] =  {
-  3,    3,    3,    3,    3,    3,    3,    3 };
-
-static const unsigned char ModeSchemes[MODE_METHODS-1][MAX_MODES] =  {
-  /* Last Mv dominates */
-  { 3,    4,    2,    0,    1,    5,    6,    7 },    /* L P  M N I G GM 4 */
-  { 2,    4,    3,    0,    1,    5,    6,    7 },    /* L P  N M I G GM 4 */
-  { 3,    4,    1,    0,    2,    5,    6,    7 },    /* L M  P N I G GM 4 */
-  { 2,    4,    1,    0,    3,    5,    6,    7 },    /* L M  N P I G GM 4 */
-
-  /* No MV dominates */
-  { 0,    4,    3,    1,    2,    5,    6,    7 },    /* N L  P M I G GM 4 */
-  { 0,    5,    4,    2,    3,    1,    6,    7 },    /* N G  L P M I GM 4 */
-  
-  /* fallback */
-  { 0,    1,    2,    3,    4,    5,    6,    7 }
-};
-
-#define PUR 8
-#define PU 4
-#define PUL 2
-#define PL 1
-#define HIGHBITDUPPED(X) (((ogg_int16_t) X)  >> 15)
-
-/* predictor multiplier up-left, up, up-right,left, shift
-   Entries are unpacked in the order L, UL, U, UR */
-static const ogg_int16_t pc[16][6]={
-  {0,0,0,0,0,0},
-  {1,0,0,0,0,0},      /* PL */
-  {0,1,0,0,0,0},      /* PUL */
-  {1,0,0,0,0,0},      /* PUL|PL */
-  {0,0,1,0,0,0},      /* PU */
-  {1,0,1,0,1,1},      /* PU|PL */
-  {0,0,1,0,0,0},      /* PU|PUL */
-  {29,-26,29,0,5,31}, /* PU|PUL|PL */
-  {0,0,0,1,0,0},      /* PUR */
-  {75,0,0,53,7,127},  /* PUR|PL */
-  {0,1,0,1,1,1},      /* PUR|PUL */
-  {75,0,0,53,7,127},  /* PUR|PUL|PL */
-  {0,0,1,0,0,0},      /* PUR|PU */
-  {75,0,0,53,7,127},  /* PUR|PU|PL */
-  {0,3,10,3,4,15},    /* PUR|PU|PUL */
-  {29,-26,29,0,5,31}  /* PUR|PU|PUL|PL */
-};
-
-/* boundary case bit masks. */
-static const int bc_mask[8]={
-  /* normal case no boundary condition */
-  PUR|PU|PUL|PL,
-  /* left column */
-  PUR|PU,
-  /* top row */
-  PL,
-  /* top row, left column */
-  0,
-  /* right column */
-  PU|PUL|PL,
-  /* right and left column */
-  PU,
-  /* top row, right column */
-  PL,
-  /* top row, right and left column */
-  0
-};
-
-static const ogg_int16_t Mode2Frame[] = {
-  1,  /* CODE_INTER_NO_MV     0 => Encoded diff from same MB last frame  */
-  0,  /* CODE_INTRA           1 => DCT Encoded Block */
-  1,  /* CODE_INTER_PLUS_MV   2 => Encoded diff from included MV MB last frame */
-  1,  /* CODE_INTER_LAST_MV   3 => Encoded diff from MRU MV MB last frame */
-  1,  /* CODE_INTER_PRIOR_MV  4 => Encoded diff from included 4 separate MV blocks */
-  2,  /* CODE_USING_GOLDEN    5 => Encoded diff from same MB golden frame */
-  2,  /* CODE_GOLDEN_MV       6 => Encoded diff from included MV MB golden frame */
-  1   /* CODE_INTER_FOUR_MV   7 => Encoded diff from included 4 separate MV blocks */
-};
-

Deleted: branches/theora-thusnelda/lib/enc/encoder_quant.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encoder_quant.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/encoder_quant.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -1,293 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2005                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id$
-
- ********************************************************************/
-
-#include <stdlib.h>
-#include <string.h>
-#include "codec_internal.h"
-#include "quant_lookup.h"
-#include "mathops.h"
-
-#define OC_QUANT_MAX        (1024<<2)
-static const unsigned OC_DC_QUANT_MIN[2]={4<<2,8<<2};
-static const unsigned OC_AC_QUANT_MIN[2]={2<<2,4<<2};
-
-void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo){
-  const th_quant_ranges *qranges;
-  const th_quant_base   *base_mats[2*3*64];
-  int                    indices[2][3][64];
-  int                    nbase_mats;
-  int                    nbits;
-  int                    ci;
-  int                    qi;
-  int                    qri;
-  int                    qti;
-  int                    pli;
-  int                    qtj;
-  int                    plj;
-  int                    bmi;
-  int                    i;
-  /*Unlike the scale tables, we can't assume the maximum value will be in
-     index 0, so search for it here.*/
-  i=_qinfo->loop_filter_limits[0];
-  for(qi=1;qi<64;qi++)i=OC_MAXI(i,_qinfo->loop_filter_limits[qi]);
-  nbits=OC_ILOG_32(i);
-  oggpackB_write(_opb,nbits,3);
-  for(qi=0;qi<64;qi++){
-    oggpackB_write(_opb,_qinfo->loop_filter_limits[qi],nbits);
-  }
-  /*580 bits for VP3.*/
-  nbits=OC_MAXI(OC_ILOG_32(_qinfo->ac_scale[0]),1);
-  oggpackB_write(_opb,nbits-1,4);
-  for(qi=0;qi<64;qi++)oggpackB_write(_opb,_qinfo->ac_scale[qi],nbits);
-  /*516 bits for VP3.*/
-  nbits=OC_MAXI(OC_ILOG_32(_qinfo->dc_scale[0]),1);
-  oggpackB_write(_opb,nbits-1,4);
-  for(qi=0;qi<64;qi++)oggpackB_write(_opb,_qinfo->dc_scale[qi],nbits);
-  /*Consolidate any duplicate base matrices.*/
-  nbase_mats=0;
-  for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
-    qranges=_qinfo->qi_ranges[qti]+pli;
-    for(qri=0;qri<=qranges->nranges;qri++){
-      for(bmi=0;;bmi++){
-        if(bmi>=nbase_mats){
-          base_mats[bmi]=qranges->base_matrices+qri;
-          indices[qti][pli][qri]=nbase_mats++;
-          break;
-        }
-        else if(memcmp(base_mats[bmi][0],qranges->base_matrices[qri],
-         sizeof(base_mats[bmi][0]))==0){
-          indices[qti][pli][qri]=bmi;
-          break;
-        }
-      }
-    }
-  }
-  /*Write out the list of unique base matrices.
-    1545 bits for VP3 matrices.*/
-  oggpackB_write(_opb,nbase_mats-1,9);
-  for(bmi=0;bmi<nbase_mats;bmi++){
-    for(ci=0;ci<64;ci++)oggpackB_write(_opb,base_mats[bmi][0][ci],8);
-  }
-  /*Now store quant ranges and their associated indices into the base matrix
-     list.
-    46 bits for VP3 matrices.*/
-  nbits=OC_ILOG_32(nbase_mats-1);
-  for(i=0;i<6;i++){
-    qti=i/3;
-    pli=i%3;
-    qranges=_qinfo->qi_ranges[qti]+pli;
-    if(i>0){
-      if(qti>0){
-        if(qranges->nranges==_qinfo->qi_ranges[qti-1][pli].nranges&&
-         memcmp(qranges->sizes,_qinfo->qi_ranges[qti-1][pli].sizes,
-         qranges->nranges*sizeof(qranges->sizes[0]))==0&&
-         memcmp(indices[qti][pli],indices[qti-1][pli],
-         (qranges->nranges+1)*sizeof(indices[qti][pli][0]))==0){
-          oggpackB_write(_opb,1,2);
-          continue;
-        }
-      }
-      qtj=(i-1)/3;
-      plj=(i-1)%3;
-      if(qranges->nranges==_qinfo->qi_ranges[qtj][plj].nranges&&
-       memcmp(qranges->sizes,_qinfo->qi_ranges[qtj][plj].sizes,
-       qranges->nranges*sizeof(qranges->sizes[0]))==0&&
-       memcmp(indices[qti][pli],indices[qtj][plj],
-       (qranges->nranges+1)*sizeof(indices[qti][pli][0]))==0){
-        oggpackB_write(_opb,0,1+(qti>0));
-        continue;
-      }
-      oggpackB_write(_opb,1,1);
-    }
-    oggpackB_write(_opb,indices[qti][pli][0],nbits);
-    for(qi=qri=0;qi<63;qri++){
-      oggpackB_write(_opb,qranges->sizes[qri]-1,OC_ILOG_32(62-qi));
-      qi+=qranges->sizes[qri];
-      oggpackB_write(_opb,indices[qti][pli][qri+1],nbits);
-    }
-  }
-}
-
-static void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d){
-  ogg_uint32_t t;
-  int          l;
-  _d<<=1;
-  l=OC_ILOG_32(_d)-1;
-  t=1+((ogg_uint32_t)1<<16+l)/_d;
-  _this->m=(ogg_int16_t)(t-0x10000);
-  _this->l=l;
-}
-
-/*This table gives the square root of the fraction of the squared magnitude of
-   each DCT coefficient relative to the total, scaled by 2**16, for both INTRA
-   and INTER modes.
-  These values were measured after motion-compensated prediction, before
-   quantization, over a large set of test video (from QCIF to 1080p) encoded at
-   all possible rates.
-  The DC coefficient takes into account the DPCM prediction (using the
-   quantized values from neighboring blocks, as the encoder does, but still
-   before quantization of the coefficient in the current block).
-  The results differ significantly from the expected variance (e.g., using an
-   AR(1) model of the signal with rho=0.95, as is frequently done to compute
-   the coding gain of the DCT).
-  We use them to estimate an "average" quantizer for a given quantizer matrix,
-   as this is used to parameterize a number of the rate control decisions.
-  These values are themselves probably quantizer-matrix dependent, since the
-   shape of the matrix affects the noise distribution in the reference frames,
-   but they should at least give us _some_ amount of adaptivity to different
-   matrices, as opposed to hard-coding a table of average Q values for the
-   current set.
-  The main features they capture are that a) only a few of the quantizers in
-   the upper-left corner contribute anything significant at all (though INTER
-   mode is significantly flatter) and b) the DPCM prediction of the DC
-   coefficient gives a very minor improvement in the INTRA case and a quite
-   significant one in the INTER case (over the expected variance).*/
-static ogg_uint16_t OC_RPSD[2][64]={
-  {
-    52725,17370,10399, 6867, 5115, 3798, 2942, 2076,
-    17370, 9900, 6948, 4994, 3836, 2869, 2229, 1619,
-    10399, 6948, 5516, 4202, 3376, 2573, 2015, 1461,
-     6867, 4994, 4202, 3377, 2800, 2164, 1718, 1243,
-     5115, 3836, 3376, 2800, 2391, 1884, 1530, 1091,
-     3798, 2869, 2573, 2164, 1884, 1495, 1212,  873,
-     2942, 2229, 2015, 1718, 1530, 1212, 1001,  704,
-     2076, 1619, 1461, 1243, 1091,  873,  704,  474
-  },
-  {
-    23411,15604,13529,11601,10683, 8958, 7840, 6142,
-    15604,11901,10718, 9108, 8290, 6961, 6023, 4487,
-    13529,10718, 9961, 8527, 7945, 6689, 5742, 4333,
-    11601, 9108, 8527, 7414, 7084, 5923, 5175, 3743,
-    10683, 8290, 7945, 7084, 6771, 5754, 4793, 3504,
-     8958, 6961, 6689, 5923, 5754, 4679, 3936, 2989,
-     7840, 6023, 5742, 5175, 4793, 3936, 3522, 2558,
-     6142, 4487, 4333, 3743, 3504, 2989, 2558, 1829
-  }
-};
-
-/*The fraction of the squared magnitude of the residuals in each color channel
-   relative to the total, scaled by 2**16, for each pixel format.
-  These values were measured after motion-compensated prediction, before
-   quantization, over a large set of test video encoded at all possible rates.
-  TODO: These values are only from INTER frames; it should be re-measured for
-   INTRA frames.*/
-static ogg_uint16_t OC_PCD[4][3]={
-  {59926, 3038, 2572},
-  {55201, 5597, 4738},
-  {55201, 5597, 4738},
-  {47682, 9669, 8185}
-};
-
-
-
-/* a copied/reconciled version of derf's theora-exp code; redundancy
-   should be eliminated at some point */
-void InitQTables( CP_INSTANCE *cpi ){
-  /*Coding mode: intra or inter.*/
-  int qti;
-  /*Y', Cb, Cr.*/
-  int pli;
-  th_quant_info *qinfo = &cpi->quant_info;
-  for(qti=0;qti<2;qti++){
-    /*Quality index.*/
-    int qi;
-    int ci;
-    for(pli=0;pli<3;pli++){
-      /*Range iterator.*/
-      int qri;
-      for(qi=0,qri=0;qri<=qinfo->qi_ranges[qti][pli].nranges;qri++){
-        th_quant_base base;
-        ogg_uint32_t  q;
-        int           qi_start;
-        int           qi_end;
-        memcpy(base,qinfo->qi_ranges[qti][pli].base_matrices[qri],
-         sizeof(base));
-        qi_start=qi;
-        if(qri==qinfo->qi_ranges[qti][pli].nranges)qi_end=qi+1;
-        else qi_end=qi+qinfo->qi_ranges[qti][pli].sizes[qri];
-        /*Iterate over quality indicies in this range.*/
-        for(;;){
-          /*In the original VP3.2 code, the rounding offset and the size of the
-             dead zone around 0 were controlled by a "sharpness" parameter.
-            We now R-D optimize the tokens for each block after quantization,
-             so the rounding offset should always be 1/2, and an explicit dead
-             zone is unnecessary.
-            Hence, all of that VP3.2 code is gone from here, and the remaining
-             floating point code has been implemented as equivalent integer
-             code with exact precision.*/
-          /*Scale DC the coefficient from the proper table.*/
-          q=((ogg_uint32_t)qinfo->dc_scale[qi]*base[0]/100)<<2;
-          q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
-          cpi->quant_tables[qti][pli][0][qi]=(ogg_uint16_t)q;
-          oc_iquant_init(cpi->iquant_tables[qti][pli][qi]+0,(ogg_uint16_t)q);
-          /*Now scale AC coefficients from the proper table.*/
-          for(ci=1;ci<64;ci++){
-            int zzi;
-            q=((ogg_uint32_t)qinfo->ac_scale[qi]*base[ci]/100)<<2;
-            q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
-            zzi=zigzag_index[ci];
-            cpi->quant_tables[qti][pli][zzi][qi]=(ogg_uint16_t)q;
-            oc_iquant_init(cpi->iquant_tables[qti][pli][qi]+zzi,
-             (ogg_uint16_t)q);
-          }
-          if(++qi>=qi_end)break;
-          /*Interpolate the next base matrix.*/
-          for(ci=0;ci<64;ci++){
-            unsigned a;
-            unsigned b;
-            unsigned r;
-            unsigned s;
-            a=qinfo->qi_ranges[qti][pli].base_matrices[qri][ci];
-            b=qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci];
-            r=qi-qi_start;
-            s=qi_end-qi_start;
-            base[ci]=(unsigned char)((2*((s-r)*a+r*b)+s)/(2*s));
-          }
-        }
-      }
-    }
-    /*Now compute an "average" quantizer for each qi level.
-      We do one for INTER and one for INTRA, since their behavior is very
-       different, but average across chroma channels.
-      The basic approach is to compute a harmonic average of the squared
-       quantizer, weighted by the expected squared magnitude of the DCT
-       coefficients.
-      Under the (not quite true) assumption that DCT coefficients are
-       Laplacian-distributed, this preserves the product Q*lambda, where
-       lambda=sqrt(2/sigma**2) is the Laplacian distribution parameter.
-      The value Q*lambda completely determines the entropy of the
-       coefficients.*/
-    for(qi=0;qi<64;qi++){
-      ogg_int64_t q2;
-      q2=0;
-      for(pli=0;pli<3;pli++){
-        ogg_uint32_t qp;
-        qp=0;
-        for(ci=0;ci<64;ci++){
-          unsigned rq;
-          unsigned qd;
-          qd=cpi->quant_tables[qti][pli][zigzag_index[ci]][qi];
-          rq=(OC_RPSD[qti][ci]+(qd>>1))/qd;
-          qp+=rq*(ogg_uint32_t)rq;
-        }
-        q2+=OC_PCD[cpi->info.pixelformat][pli]*(ogg_int64_t)qp;
-      }
-      /*qavg=1.0/sqrt(q2).*/
-      cpi->log_qavg[qti][qi]=OC_Q57(48)-oc_blog64(q2)>>1;
-    }
-  }
-}

Deleted: branches/theora-thusnelda/lib/enc/encoder_toplevel.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encoder_toplevel.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/encoder_toplevel.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -1,835 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id$
-
- ********************************************************************/
-
-#ifdef HAVE_CONFIG_H
-# include "config.h"
-#endif
-
-#include <stdlib.h>
-#include <string.h>
-#include "toplevel_lookup.h"
-#include "../internal.h"
-#include "codec_internal.h"
-#include "mathops.h"
-#include "../dec/ocintrin.h"
-#if defined(OC_X86_ASM)
-# include "x86/x86enc.h"
-#endif
-
-
-
-static void oc_enc_calc_lambda(CP_INSTANCE *cpi){
-  ogg_int64_t l;
-  int         qti;
-  qti=cpi->FrameType!=KEY_FRAME;
-  /*For now, lambda is fixed depending on the qi value and frame type:
-      lambda=scale[qti]*(qavg[qti][qi]**1.5),
-     where scale={2.25,1.125}.
-    A more adaptive scheme might perform better, but Theora's behavior does not
-     seem to conform to existing models in the literature.*/
-  /*If rate control is active, use the lambda for the _target_ quantizer.
-    This allows us to scale to rates slightly lower than we'd normally be able
-     to reach, and give the rate control a semblance of "fractional QI"
-     precision.*/
-  if(cpi->info.target_bitrate>0)l=cpi->rc.log_qtarget;
-  else l=cpi->log_qavg[qti][cpi->BaseQ];
-  /*Raise to the 1.5 power.*/
-  l+=l>>1;
-  /*Multiply by 1.125.*/
-  l+=0x00570068E7EF5A1ELL;
-  /*And multiply by an extra factor of 2 for INTRA frames.*/
-  if(!qti)l+=OC_Q57(1);
-  /*The upper bound here is 0x48000.*/
-  cpi->lambda=(int)oc_bexp64(l);
-}
-
-
-
-static void oc_rc_state_init(oc_rc_state *_rc,const theora_info *_info){
-  ogg_int64_t npixels;
-  ogg_int64_t ibpp;
-  /*TODO: These parameters should be exposed in a th_enc_ctl() API.*/
-  _rc->bits_per_frame=(_info->target_bitrate*
-   (ogg_int64_t)_info->fps_denominator)/_info->fps_numerator;
-  /*Insane framerates or frame sizes mean insane bitrates.
-    Let's not get carried away.*/
-  if(_rc->bits_per_frame>0x40000000000000LL){
-    _rc->bits_per_frame=(ogg_int64_t)0x40000000000000LL;
-  }
-  else if(_rc->bits_per_frame<32)_rc->bits_per_frame=32;
-  /*The buffer size is set equal to the keyframe interval, clamped to the range
-     [8,256] frames.
-    The 8 frame minimum gives us some chance to distribute bit estimation
-     errors.
-    The 256 frame maximum means we'll require 8-10 seconds of pre-buffering at
-     24-30 fps, which is not unreasonable.*/
-  _rc->buf_delay=_info->keyframe_frequency_force>256?
-   256:_info->keyframe_frequency_force;
-  _rc->buf_delay=OC_MAXI(_rc->buf_delay,12);
-  _rc->max=_rc->bits_per_frame*_rc->buf_delay;
-  /*Start with a buffer fullness of 75%.
-    We can require fully half the buffer for a keyframe, and so this initial
-     level gives us maximum flexibility for over/under-shooting in subsequent
-     frames.*/
-  _rc->target=_rc->fullness=(_rc->max+1>>1)+(_rc->max+2>>2);
-  /*Pick exponents and initial scales for quantizer selection.
-    TODO: These still need to be tuned.*/
-  npixels=_info->width*(ogg_int64_t)_info->height;
-  _rc->log_npixels=oc_blog64(npixels);
-  ibpp=npixels/_rc->bits_per_frame;
-  if(ibpp<1){
-    _rc->exp[0]=59;
-    _rc->log_scale[0]=oc_blog64(1997)-OC_Q57(8);
-  }
-  else if(ibpp<2){
-    _rc->exp[0]=55;
-    _rc->log_scale[0]=oc_blog64(1604)-OC_Q57(8);
-  }
-  else{
-    _rc->exp[0]=48;
-    _rc->log_scale[0]=oc_blog64(834)-OC_Q57(8);
-  }
-  if(ibpp<4){
-    _rc->exp[1]=100;
-    _rc->log_scale[1]=oc_blog64(2249)-OC_Q57(8);
-  }
-  else if(ibpp<8){
-    _rc->exp[1]=95;
-    _rc->log_scale[1]=oc_blog64(1751)-OC_Q57(8);
-  }
-  else{
-    _rc->exp[1]=73;
-    _rc->log_scale[1]=oc_blog64(1260)-OC_Q57(8);
-  }
-}
-
-static void oc_enc_update_rc_state(CP_INSTANCE *cpi,
- long _bits,int _qti,int _qi,int _trial){
-  /*Note, setting OC_SCALE_SMOOTHING[1] to 0x80 (0.5), which one might expect
-     to be a reasonable value, actually causes a feedback loop with, e.g., 12
-     fps content encoded at 24 fps; use values near 0 or near 1 for now.
-    TODO: Should probably revisit using an exponential moving average in the
-     first place at some point.*/
-  static const unsigned OC_SCALE_SMOOTHING[2]={0x13,0x00};
-  ogg_int64_t   log_scale;
-  ogg_int64_t   log_bits;
-  ogg_int64_t   log_qexp;
-  /*Compute the estimated scale factor for this frame type.*/
-  log_bits=oc_blog64(_bits);
-  log_qexp=cpi->log_qavg[_qti][_qi]-OC_Q57(2);
-  log_qexp=(log_qexp>>6)*(cpi->rc.exp[_qti]);
-  log_scale=OC_MINI(log_bits-cpi->rc.log_npixels+log_qexp,OC_Q57(16));
-  /*Use it to set that factor directly if this was a trial.*/
-  if(_trial)cpi->rc.log_scale[_qti]=log_scale;
-  else{
-    /*Otherwise update an exponential moving average.*/
-    cpi->rc.log_scale[_qti]=log_scale
-     +(cpi->rc.log_scale[_qti]-log_scale+128>>8)*OC_SCALE_SMOOTHING[_qti];
-    /*And update the buffer fullness level.*/
-    cpi->rc.fullness+=cpi->rc.bits_per_frame*(1+cpi->dup_count)-_bits;
-    /*If we're too quick filling the buffer, that rate is lost forever.*/
-    if(cpi->rc.fullness>cpi->rc.max)cpi->rc.fullness=cpi->rc.max;
-  }
-}
-/*Chooses the quantizer index (qi) for the next frame when rate control is
-   active.
-  _qti:   Frame type index; the callers in this file pass 0 for key frames and
-           1 for delta frames.
-  _trial: When non-zero, the clamp on the change in quality relative to
-           cpi->BaseQ is skipped.
-  Return: The qi in [cpi->info.quality,63] whose cpi->log_qavg[_qti] entry is
-           closest to the computed target.
-  The target is also saved in cpi->rc.log_qtarget.*/
-static int oc_enc_select_qi(CP_INSTANCE *cpi,int _qti,int _trial){
-  ogg_int64_t  rate_total;
-  ogg_uint32_t next_key_frame;
-  int          nframes[2];
-  int          buf_delay;
-  ogg_int64_t  log_qtarget;
-  int          best_qi;
-  ogg_int64_t  best_qdiff;
-  int          qi;
-  /*Figure out how to re-distribute bits so that we hit our fullness target
-     before the last keyframe in our current buffer window (after the current
-     frame), or the end of the buffer window, whichever comes first.*/
-  next_key_frame=_qti?cpi->info.keyframe_frequency_force-cpi->LastKeyFrame:0;
-  nframes[0]=(cpi->rc.buf_delay-OC_MINI(next_key_frame,cpi->rc.buf_delay)
-   +cpi->info.keyframe_frequency_force-1)/cpi->info.keyframe_frequency_force;
-  if(nframes[0]+_qti>1){
-    buf_delay=next_key_frame+(nframes[0]-1)*cpi->info.keyframe_frequency_force;
-    nframes[0]--;
-  }
-  else buf_delay=cpi->rc.buf_delay;
-  nframes[1]=buf_delay-nframes[0];
-  rate_total=cpi->rc.fullness-cpi->rc.target
-   +buf_delay*cpi->rc.bits_per_frame;
-  /*Downgrade the delta frame rate to correspond to the current dup count.
-    This will way over-estimate the bits to use for an occasional dup (as
-     opposed to a consistent dup count, as used with VFR input), but the
-     hysteresis on the quantizer below will keep us from going out of control,
-     and we _do_ have more bits to spend after all.*/
-  if(cpi->dup_count>0)nframes[1]=(nframes[1]+cpi->dup_count)/(cpi->dup_count+1);
-  /*If there aren't enough bits to achieve our desired fullness level, use the
-     minimum quality permitted.*/
-  if(rate_total<=0)log_qtarget=OC_QUANT_MAX_LOG;
-  else{
-    static const unsigned char KEY_RATIO[2]={32,17};
-    ogg_int64_t   log_scale0;
-    ogg_int64_t   log_scale1;
-    ogg_int64_t   prevr;
-    ogg_int64_t   curr;
-    ogg_int64_t   realr;
-    int           i;
-    log_scale0=cpi->rc.log_scale[_qti]+cpi->rc.log_npixels;
-    log_scale1=cpi->rc.log_scale[1-_qti]+cpi->rc.log_npixels;
-    curr=(rate_total+(buf_delay>>1))/buf_delay;
-    realr=curr*KEY_RATIO[_qti]+16>>5; /*Parses as (curr*KEY_RATIO[_qti]+16)>>5.*/
-    for(i=0;i<10;i++){ /*At most 10 refinement steps.*/
-      ogg_int64_t rdiff;
-      ogg_int64_t rderiv;
-      ogg_int64_t log_rpow;
-      ogg_int64_t rscale;
-      ogg_int64_t drscale;
-      ogg_int64_t bias;
-      prevr=curr;
-      log_rpow=oc_blog64(prevr)-log_scale0;
-      log_rpow=(log_rpow+(cpi->rc.exp[_qti]>>1))/cpi->rc.exp[_qti]*
-       cpi->rc.exp[1-_qti];
-      rscale=nframes[1-_qti]*KEY_RATIO[1-_qti]*
-       oc_bexp64(log_scale1+log_rpow);
-      rdiff=nframes[_qti]*KEY_RATIO[_qti]*prevr+rscale-(rate_total<<5);
-      drscale=(rscale+(cpi->rc.exp[_qti]>>1))/cpi->rc.exp[_qti]*
-       cpi->rc.exp[1-_qti]/prevr;
-      rderiv=nframes[_qti]*KEY_RATIO[_qti]+drscale;
-      if(rderiv==0)break; /*Avoids dividing by zero in the update below.*/
-      bias=rderiv+OC_SIGNMASK(rdiff^rderiv)^OC_SIGNMASK(rdiff^rderiv);
-      curr=prevr-((rdiff<<1)+bias)/(rderiv<<1); /*NOTE(review): bias presumably
-                                                   rounds the quotient to
-                                                   nearest; confirm against
-                                                   OC_SIGNMASK.*/
-      realr=curr*KEY_RATIO[_qti]+16>>5;
-      if(curr<=0||realr>rate_total||prevr==curr)break;
-    }
-    log_qtarget=OC_Q57(2)-((oc_blog64(realr)-log_scale0+(cpi->rc.exp[_qti]>>1))/
-     cpi->rc.exp[_qti]<<6);
-    log_qtarget=OC_MINI(log_qtarget,OC_QUANT_MAX_LOG);
-  }
-  /*If this was not one of the initial frames, limit the change in quality.*/
-  if(!_trial){
-    ogg_int64_t log_qmin;
-    ogg_int64_t log_qmax;
-    /*Clamp the target quantizer to within [0.8*Q,1.2*Q], where Q is the
-       current quantizer.
-      TODO: With user-specified quant matrices, we need to enlarge these limits
-       if they don't actually let us change qi values.*/
-    log_qmin=cpi->log_qavg[_qti][cpi->BaseQ]-0x00A4D3C25E68DC58LL;
-    log_qmax=cpi->log_qavg[_qti][cpi->BaseQ]+0x00A4D3C25E68DC58LL;
-    log_qtarget=OC_CLAMPI(log_qmin,log_qtarget,log_qmax);
-  }
-  /*Search for the quantizer that matches the target most closely.
-    We don't assume a linear ordering, but when there are ties we do pick the
-     quantizer closest to the current one.*/
-  best_qi=cpi->info.quality;
-  best_qdiff=cpi->log_qavg[_qti][best_qi]-log_qtarget;
-  best_qdiff=best_qdiff+OC_SIGNMASK(best_qdiff)^OC_SIGNMASK(best_qdiff);
-  for(qi=cpi->info.quality+1;qi<64;qi++){
-    ogg_int64_t qdiff;
-    qdiff=cpi->log_qavg[_qti][qi]-log_qtarget;
-    qdiff=qdiff+OC_SIGNMASK(qdiff)^OC_SIGNMASK(qdiff);
-    if(qdiff<best_qdiff||
-     qdiff==best_qdiff&&abs(qi-cpi->BaseQ)<abs(best_qi-cpi->BaseQ)){
-      best_qi=qi;
-      best_qdiff=qdiff;
-    }
-  }
-  /*Save the quantizer target for lambda calculations.*/
-  cpi->rc.log_qtarget=log_qtarget;
-  return best_qi;
-}
-
-
-/*Encodes the current frame as a key frame into cpi->oggbuffer.
-  When a target bitrate is set, BaseQ is chosen by oc_enc_select_qi() first
-   (as a trial selection when CurrentFrame is 1).
-  recode is passed through to PickModes().*/
-static void CompressKeyFrame(CP_INSTANCE *cpi, int recode){
-  oggpackB_reset(cpi->oggbuffer);
-  cpi->FrameType = KEY_FRAME;
-  if(cpi->info.target_bitrate>0){
-    cpi->BaseQ=oc_enc_select_qi(cpi,0,cpi->CurrentFrame==1);
-  }
-  oc_enc_calc_lambda(cpi);
-  cpi->LastKeyFrame = 0;
-
-  /* mark as video frame */
-  oggpackB_write(cpi->oggbuffer,0,1);
-
-  WriteFrameHeader(cpi);
-  PickModes(cpi,recode);
-  EncodeData(cpi);
-
-  cpi->LastKeyFrame = 1; /* 0 while the frame is coded, 1 once it is done */
-}
-/*Encodes the current frame as a delta frame into cpi->oggbuffer.
-  If PickModes() returns non-zero, the buffer is reset and the frame is coded
-   as a key frame instead.
-  The first delta frame is encoded twice: the size of the first pass is handed
-   to oc_enc_update_rc_state() (when a target bitrate is set), and the
-   function then calls itself with recode set to 1, which resets the buffer
-   and encodes again.
-  Always returns 0.*/
-static int CompressFrame( CP_INSTANCE *cpi, int recode ) {
-  oggpackB_reset(cpi->oggbuffer);
-  cpi->FrameType = DELTA_FRAME;
-  if(cpi->info.target_bitrate>0){
-    cpi->BaseQ=oc_enc_select_qi(cpi,1,0);
-  }
-  oc_enc_calc_lambda(cpi);
-
-  /* mark as video frame */
-  oggpackB_write(cpi->oggbuffer,0,1);
-
-  WriteFrameHeader(cpi);
-  if(PickModes(cpi,recode)){
-    /* mode analysis thinks this should have been a keyframe; start over and code as a keyframe instead */
-
-    oggpackB_reset(cpi->oggbuffer);
-    cpi->FrameType = KEY_FRAME;
-    if(cpi->info.target_bitrate>0)cpi->BaseQ=oc_enc_select_qi(cpi,0,0);
-    oc_enc_calc_lambda(cpi);
-    cpi->LastKeyFrame = 0;
-
-    /* mark as video frame */
-    oggpackB_write(cpi->oggbuffer,0,1);
-
-    WriteFrameHeader(cpi);
-
-    PickModes(cpi,1);
-    EncodeData(cpi);
-
-    cpi->LastKeyFrame = 1;
-
-    return 0;
-  }
-
-  if(cpi->first_inter_frame == 0){
-    cpi->first_inter_frame = 1; /* set before recursing so this branch runs once */
-    EncodeData(cpi);
-    if(cpi->info.target_bitrate>0){
-      oc_enc_update_rc_state(cpi,oggpackB_bytes(cpi->oggbuffer)<<3,
-       1,cpi->BaseQ,1);
-    }
-    CompressFrame(cpi,1);
-    return 0;
-  }
-
-  cpi->LastKeyFrame++;
-  EncodeData(cpi);
-
-  return 0;
-}
-
-/********************** The toplevel: encode ***********************/
-
-static void theora_encode_dispatch_init(CP_INSTANCE *cpi);
-/*Initializes th for encoding using the settings in c.
-  Out-of-range fields of c (quality, target_bitrate, frame rate, key frame
-   spacing) are adjusted in place, and the version fields of c are
-   overwritten, before c is copied into the encoder.
-  Return: OC_IMPL if c->pixelformat is not OC_PF_420, 0 otherwise.
-  NOTE(review): The results of _ogg_calloc() and _ogg_malloc() are used
-   without a NULL check.*/
-int theora_encode_init(theora_state *th, theora_info *c){
-  CP_INSTANCE *cpi;
-
-  memset(th, 0, sizeof(*th));
-  /*Currently only the 4:2:0 format is supported.*/
-  if(c->pixelformat!=OC_PF_420)return OC_IMPL;
-  th->internal_encode=cpi=_ogg_calloc(1,sizeof(*cpi));
-  theora_encode_dispatch_init(cpi);
-  oc_mode_scheme_chooser_init(&cpi->chooser);
-#if defined(OC_X86_ASM)
-  oc_enc_vtable_init_x86(cpi);
-#else
-  oc_enc_vtable_init_c(cpi);
-#endif
-
-  c->version_major=TH_VERSION_MAJOR;
-  c->version_minor=TH_VERSION_MINOR;
-  c->version_subminor=TH_VERSION_SUB;
-
-  if(c->quality>63)c->quality=63;
-  if(c->quality<0)c->quality=32; /* a negative quality maps to 32, not to 0 */
-  if(c->target_bitrate<0)c->target_bitrate=0;
-  cpi->BaseQ = c->quality;
-
-  /* Set encoder flags. */
-  /* If auto-keyframing is off, force a key frame every keyframe_frequency
-     frames. */
-  if(!c->keyframe_auto_p)
-    c->keyframe_frequency_force = c->keyframe_frequency;
-
-  /* Set the frame rate variables. */
-  if ( c->fps_numerator < 1 )
-    c->fps_numerator = 1;
-  if ( c->fps_denominator < 1 )
-    c->fps_denominator = 1;
-
-  /* don't go too nuts on keyframe spacing; impose a high limit to
-     make certain the granulepos encoding strategy works */
-  if(c->keyframe_frequency_force>32768)c->keyframe_frequency_force=32768;
-  if(c->keyframe_mindistance>32768)c->keyframe_mindistance=32768;
-  if(c->keyframe_mindistance>c->keyframe_frequency_force)
-    c->keyframe_mindistance=c->keyframe_frequency_force;
-  cpi->keyframe_granule_shift=OC_ILOG_32(c->keyframe_frequency_force-1);
-
-
-  /* copy in config */
-  memcpy(&cpi->info,c,sizeof(*c));
-  th->i=&cpi->info;
-  th->granulepos=-1;
-
-  /* Set up an encode buffer */
-  cpi->oggbuffer = _ogg_malloc(sizeof(oggpack_buffer));
-  oggpackB_writeinit(cpi->oggbuffer);
-  cpi->dup_count=0;
-  cpi->nqueued_dups=0;
-  cpi->packetflag=0;
-
-  InitFrameInfo(cpi);
-
-  /* Initialise the compression process. */
-  /* We always start at frame 1 */
-  cpi->CurrentFrame = 1;
-
-  memcpy(cpi->huff_codes,TH_VP31_HUFF_CODES,sizeof(cpi->huff_codes));
-
-  /* This makes sure encoder version specific tables are initialised */
-  memcpy(&cpi->quant_info, &TH_VP31_QUANT_INFO, sizeof(th_quant_info));
-  InitQTables(cpi);
-  if(cpi->info.target_bitrate>0)oc_rc_state_init(&cpi->rc,&cpi->info);
-
-  /* Indicate that the next frame to be compressed is the first in the
-     current clip. */
-  cpi->LastKeyFrame = -1;
-  cpi->readyflag = 1;
-
-  cpi->HeadersWritten = 0;
-  /*We overload this flag to track header output.*/
-  cpi->doneflag=-3;
-
-  return 0;
-}
-/*Submits one frame of input to the encoder and compresses it.
-  The three planes of yuv are copied into cpi->frame with the row order
-   reversed, then the frame is coded as a key frame or a delta frame.
-  Return: OC_EINVAL if the encoder is not ready or is already done, -1 if the
-           luma size of yuv differs from cpi->info, and 0 otherwise.*/
-int theora_encode_YUVin(theora_state *t,
-                         yuv_buffer *yuv){
-  int dropped = 0; /* assigned from CompressFrame() below but never read */
-  ogg_int32_t i;
-  unsigned char *LocalDataPtr;
-  unsigned char *InputDataPtr;
-  CP_INSTANCE *cpi=(CP_INSTANCE *)(t->internal_encode);
-
-  if(!cpi->readyflag)return OC_EINVAL;
-  if(cpi->doneflag>0)return OC_EINVAL;
-
-  /* If frame size has changed, abort out for now */
-  if (yuv->y_height != (int)cpi->info.height ||
-      yuv->y_width != (int)cpi->info.width )
-    return(-1);
-
-  /* Copy over input YUV to internal YUV buffers. */
-  /* we invert the image for backward compatibility with VP3 */
-  /* First copy over the Y data */
-  /* The destination starts at the last row and steps back one stride per
-     input row. */
-  LocalDataPtr = cpi->frame + cpi->offset[0] + cpi->stride[0]*(yuv->y_height - 1);
-  InputDataPtr = yuv->y;
-  for ( i = 0; i < yuv->y_height; i++ ){
-    memcpy( LocalDataPtr, InputDataPtr, yuv->y_width );
-    LocalDataPtr -= cpi->stride[0];
-    InputDataPtr += yuv->y_stride;
-  }
-
-  /* Now copy over the U data */
-  LocalDataPtr = cpi->frame + cpi->offset[1] + cpi->stride[1]*(yuv->uv_height - 1);
-  InputDataPtr = yuv->u;
-  for ( i = 0; i < yuv->uv_height; i++ ){
-    memcpy( LocalDataPtr, InputDataPtr, yuv->uv_width );
-    LocalDataPtr -= cpi->stride[1];
-    InputDataPtr += yuv->uv_stride;
-  }
-
-  /* Now copy over the V data */
-  LocalDataPtr = cpi->frame + cpi->offset[2] + cpi->stride[2]*(yuv->uv_height - 1);
-  InputDataPtr = yuv->v;
-  for ( i = 0; i < yuv->uv_height; i++ ){
-    memcpy( LocalDataPtr, InputDataPtr, yuv->uv_width );
-    LocalDataPtr -= cpi->stride[2];
-    InputDataPtr += yuv->uv_stride;
-  }
-
-  /* don't allow generating invalid files that overflow the p-frame
-     shift, even if keyframe_auto_p is turned off */
-  if(cpi->LastKeyFrame==-1 || cpi->LastKeyFrame+cpi->dup_count>= (ogg_uint32_t)
-     cpi->info.keyframe_frequency_force){
-
-    CompressKeyFrame(cpi,0);
-    if(cpi->info.target_bitrate>0){
-      oc_enc_update_rc_state(cpi,oggpackB_bytes(cpi->oggbuffer)<<3,
-       0,cpi->BaseQ,1);
-    }
-
-    /* On first frame, the previous was an initial dry-run to prime
-       feed-forward statistics */
-    if(cpi->CurrentFrame==1)CompressKeyFrame(cpi,1);
-
-  }
-  else{
-    /*Compress the frame.*/
-    dropped=CompressFrame(cpi,0);
-  }
-  oc_enc_restore_fpu(cpi);
-
-
-  /* Update stats variables. */
-  {
-    /* swap */
-    unsigned char *temp;
-    temp=cpi->lastrecon;
-    cpi->lastrecon=cpi->recon;
-    cpi->recon=temp;
-  }
-  if(cpi->FrameType==KEY_FRAME){
-    /* after the swap above, lastrecon holds the frame that was just coded */
-    memcpy(cpi->golden,cpi->lastrecon,sizeof(*cpi->lastrecon)*cpi->frame_size);
-  }
-  cpi->CurrentFrame++;
-  cpi->packetflag=1;
-  if(cpi->info.target_bitrate>0){
-    oc_enc_update_rc_state(cpi,oggpackB_bytes(cpi->oggbuffer)<<3,
-     cpi->FrameType!=KEY_FRAME,cpi->BaseQ,0);
-  }
-
-  t->granulepos=
-    ((cpi->CurrentFrame - cpi->LastKeyFrame)<<cpi->keyframe_granule_shift)+
-    cpi->LastKeyFrame - 1;
-  cpi->nqueued_dups=cpi->dup_count;
-  cpi->dup_count=0;
-
-  return 0;
-}
-/*Fetches the next output packet, if any.
-  A pending coded frame is returned first; after that, one zero-length packet
-   is produced for each queued duplicate frame.
-  Return: -1 if the encoder has already been marked done, 0 if no packet is
-           available (marking the encoder done when _last_p is set), and
-           1 plus the number of duplicates still queued otherwise.*/
-int theora_encode_packetout(theora_state *_t,int _last_p,ogg_packet *_op){
-  CP_INSTANCE *cpi;
-  cpi=(CP_INSTANCE *)_t->internal_encode;
-  if(cpi->doneflag>0)return -1;
-  if(cpi->packetflag){
-    cpi->packetflag=0;
-    /*The packet data points into cpi->oggbuffer; it is not copied.*/
-    _op->packet=oggpackB_get_buffer(cpi->oggbuffer);
-    _op->bytes=oggpackB_bytes(cpi->oggbuffer);
-  }
-  else if(cpi->nqueued_dups>0){
-    cpi->nqueued_dups--;
-    cpi->CurrentFrame++;
-    cpi->LastKeyFrame++;
-    _t->granulepos=cpi->LastKeyFrame-1
-     +(cpi->CurrentFrame-cpi->LastKeyFrame<<cpi->keyframe_granule_shift);
-    _op->packet=NULL;
-    _op->bytes=0;
-  }
-  else{
-    if(_last_p){
-      cpi->doneflag=1;
-#if defined(OC_COLLECT_METRICS)
-      oc_enc_mode_metrics_dump(cpi);
-#endif
-    }
-    return 0;
-  }
-  /*Only flag end-of-stream once no duplicates remain queued.*/
-  _last_p=_last_p&&cpi->nqueued_dups<=0;
-  _op->b_o_s=0;
-  _op->e_o_s=_last_p;
-  _op->packetno=cpi->CurrentFrame;
-  _op->granulepos=_t->granulepos;
-  return 1+cpi->nqueued_dups;
-}
-/* writes len bytes from buf to opb, one 8-bit write per byte */
-static void _tp_writebuffer(oggpack_buffer *opb, const char *buf, const long len)
-{
-  long i;
-
-  for (i = 0; i < len; i++)
-    oggpackB_write(opb, *buf++, 8);
-}
-/* writes the low 32 bits of value to opb as four bytes, least significant
-   byte first */
-static void _tp_writelsbint(oggpack_buffer *opb, long value)
-{
-  oggpackB_write(opb, value&0xFF, 8);
-  oggpackB_write(opb, value>>8&0xFF, 8);
-  oggpackB_write(opb, value>>16&0xFF, 8);
-  oggpackB_write(opb, value>>24&0xFF, 8);
-}
-
-/* build the initial short header for stream recognition and format.
-   The header is written into cpi->oggbuffer, and op->packet points at that
-   buffer rather than at a copy.
-   Always returns 0. */
-int theora_encode_header(theora_state *t, ogg_packet *op){
-  CP_INSTANCE *cpi=(CP_INSTANCE *)(t->internal_encode);
-  int offset_y;
-
-  oggpackB_reset(cpi->oggbuffer);
-  oggpackB_write(cpi->oggbuffer,0x80,8);
-  _tp_writebuffer(cpi->oggbuffer, "theora", 6);
-
-  oggpackB_write(cpi->oggbuffer,TH_VERSION_MAJOR,8);
-  oggpackB_write(cpi->oggbuffer,TH_VERSION_MINOR,8);
-  oggpackB_write(cpi->oggbuffer,TH_VERSION_SUB,8);
-
-  /* width and height are written divided by 16; the frame_* fields are
-     written as-is */
-  oggpackB_write(cpi->oggbuffer,cpi->info.width>>4,16);
-  oggpackB_write(cpi->oggbuffer,cpi->info.height>>4,16);
-  oggpackB_write(cpi->oggbuffer,cpi->info.frame_width,24);
-  oggpackB_write(cpi->oggbuffer,cpi->info.frame_height,24);
-  oggpackB_write(cpi->oggbuffer,cpi->info.offset_x,8);
-  /* Applications use offset_y to mean offset from the top of the image; the
-   * meaning in the bitstream is the opposite (from the bottom). Transform.
-   */
-  offset_y = cpi->info.height - cpi->info.frame_height -
-    cpi->info.offset_y;
-  oggpackB_write(cpi->oggbuffer,offset_y,8);
-
-  oggpackB_write(cpi->oggbuffer,cpi->info.fps_numerator,32);
-  oggpackB_write(cpi->oggbuffer,cpi->info.fps_denominator,32);
-  oggpackB_write(cpi->oggbuffer,cpi->info.aspect_numerator,24);
-  oggpackB_write(cpi->oggbuffer,cpi->info.aspect_denominator,24);
-
-  oggpackB_write(cpi->oggbuffer,cpi->info.colorspace,8);
-
-  /* The header target_bitrate is limited to 24 bits, so we clamp here */
-  oggpackB_write(cpi->oggbuffer,(cpi->info.target_bitrate>(1<<24)-1) ? ((1<<24)-1) : cpi->info.target_bitrate ,24);
-
-  oggpackB_write(cpi->oggbuffer,cpi->info.quality,6);
-
-  oggpackB_write(cpi->oggbuffer,cpi->keyframe_granule_shift,5);
-
-  oggpackB_write(cpi->oggbuffer,cpi->info.pixelformat,2);
-
-  oggpackB_write(cpi->oggbuffer,0,3); /* spare config bits */
-
-  op->packet=oggpackB_get_buffer(cpi->oggbuffer);
-  op->bytes=oggpackB_bytes(cpi->oggbuffer);
-
-  op->b_o_s=1;
-  op->e_o_s=0;
-
-  op->packetno=0;
-
-  op->granulepos=0;
-  cpi->packetflag=0;
-
-  return(0);
-}
-
-/* build the comment header packet from the passed metadata.
-   The packet holds the vendor string from theora_version_string() followed
-   by each entry of tc, every string preceded by its length.
-   op->packet is allocated here with _ogg_malloc() and is not freed by this
-   function.
-   Always returns 0.
-   NOTE(review): neither allocation is checked for failure. */
-int theora_encode_comment(theora_comment *tc, ogg_packet *op)
-{
-  const char *vendor = theora_version_string();
-  const int vendor_length = strlen(vendor);
-  oggpack_buffer *opb;
-
-  opb = _ogg_malloc(sizeof(oggpack_buffer));
-  oggpackB_writeinit(opb);
-  oggpackB_write(opb, 0x81, 8);
-  _tp_writebuffer(opb, "theora", 6);
-
-  _tp_writelsbint(opb, vendor_length);
-  _tp_writebuffer(opb, vendor, vendor_length);
-
-  _tp_writelsbint(opb, tc->comments);
-  if(tc->comments){
-    int i;
-    for(i=0;i<tc->comments;i++){
-      if(tc->user_comments[i]){
-        _tp_writelsbint(opb,tc->comment_lengths[i]);
-        _tp_writebuffer(opb,tc->user_comments[i],tc->comment_lengths[i]);
-      }else{
-        oggpackB_write(opb,0,32); /* a NULL entry is written as length 0 */
-      }
-    }
-  }
-  /* NOTE(review): the buffer is written with oggpackB_*() but queried and
-     cleared with the oggpack_*() variants below; confirm these are
-     interchangeable for bytes/get_buffer/writeclear. */
-  op->bytes=oggpack_bytes(opb);
-
-  /* So we're expecting the application will free this? */
-  op->packet=_ogg_malloc(oggpack_bytes(opb));
-  memcpy(op->packet, oggpack_get_buffer(opb), oggpack_bytes(opb));
-  oggpack_writeclear(opb);
-
-  _ogg_free(opb);
-
-  op->b_o_s=0;
-  op->e_o_s=0;
-
-  op->packetno=0;
-  op->granulepos=0;
-
-  return (0);
-}
-
-/* build the final header packet with the tables required
-   for decode.
-   The quantization parameters are packed first, then the Huffman codes.
-   op->packet points into cpi->oggbuffer rather than at a copy.
-   Sets cpi->HeadersWritten to 1 and always returns 0. */
-int theora_encode_tables(theora_state *t, ogg_packet *op){
-  CP_INSTANCE *cpi=(CP_INSTANCE *)(t->internal_encode);
-
-  oggpackB_reset(cpi->oggbuffer);
-  oggpackB_write(cpi->oggbuffer,0x82,8);
-  _tp_writebuffer(cpi->oggbuffer,"theora",6);
-
-  oc_quant_params_pack(cpi->oggbuffer,&cpi->quant_info);
-  oc_huff_codes_pack(cpi->oggbuffer,(const th_huff_table *)cpi->huff_codes);
-
-  op->packet=oggpackB_get_buffer(cpi->oggbuffer);
-  op->bytes=oggpackB_bytes(cpi->oggbuffer);
-
-  op->b_o_s=0;
-  op->e_o_s=0;
-
-  op->packetno=0;
-
-  op->granulepos=0;
-  cpi->packetflag=0;
-
-  cpi->HeadersWritten = 1;
-
-  return(0);
-}
-/* releases the encoder instance attached to th, if any, and zeroes th */
-static void theora_encode_clear (theora_state  *th){
-  CP_INSTANCE *cpi;
-  cpi=(CP_INSTANCE *)th->internal_encode;
-  if(cpi){
-
-    ClearFrameInfo(cpi);
-
-    oggpackB_writeclear(cpi->oggbuffer);
-    _ogg_free(cpi->oggbuffer);
-
-    memset(cpi,0,sizeof(cpi)); /* NOTE(review): sizeof(cpi) is the size of the
-                                  pointer, not of *cpi, so only that many
-                                  bytes are zeroed before the free. */
-    _ogg_free(cpi);
-  }
-
-  memset(th,0,sizeof(*th));
-}
-
-
-/* returns, in seconds, absolute time of current packet in given
-   logical stream.
-   The granulepos is split at keyframe_granule_shift, the two parts are added
-   to give a frame count, and that count is multiplied by
-   fps_denominator/fps_numerator.
-   Returns -1 if granulepos is negative or THEORA_DISABLE_FLOAT is defined. */
-static double theora_encode_granule_time(theora_state *th,
- ogg_int64_t granulepos){
-#ifndef THEORA_DISABLE_FLOAT
-  CP_INSTANCE *cpi=(CP_INSTANCE *)(th->internal_encode);
-
-  if(granulepos>=0){
-    ogg_int64_t iframe=granulepos>>cpi->keyframe_granule_shift;
-    ogg_int64_t pframe=granulepos-(iframe<<cpi->keyframe_granule_shift);
-
-    return (iframe+pframe)*
-      ((double)cpi->info.fps_denominator/cpi->info.fps_numerator);
-
-  }
-#endif
-
-  return(-1); /* negative granulepos or float calculations disabled */
-}
-
-/* returns frame number of current packet in given logical stream.
-   The granulepos is split at keyframe_granule_shift; the result is the sum
-   of the two parts minus 1.
-   Returns -1 if granulepos is negative. */
-static ogg_int64_t theora_encode_granule_frame(theora_state *th,
- ogg_int64_t granulepos){
-  CP_INSTANCE *cpi=(CP_INSTANCE *)(th->internal_encode);
-
-  if(granulepos>=0){
-    ogg_int64_t iframe=granulepos>>cpi->keyframe_granule_shift;
-    ogg_int64_t pframe=granulepos-(iframe<<cpi->keyframe_granule_shift);
-
-    return (iframe+pframe-1);
-  }
-
-  return(-1);
-}
-
-/*Handles encoder control requests.
-  req selects the operation; buf and buf_sz describe its argument buffer.
-  Return: 0 on success, TH_EFAULT for a NULL th (or a NULL buf in the cases
-           that check for it), TH_EINVAL for a rejected argument, and
-           TH_EIMPL for an unrecognized req.*/
-static int theora_encode_control(theora_state *th,int req,
- void *buf,size_t buf_sz) {
-  CP_INSTANCE *cpi;
-  int value;
-
-  if(th == NULL)
-    return TH_EFAULT;
-
-  cpi = th->internal_encode;
-
-  switch(req) {
-    case TH_ENCCTL_SET_QUANT_PARAMS:
-      if( ( buf==NULL&&buf_sz!=0 )
-  	   || ( buf!=NULL&&buf_sz!=sizeof(th_quant_info) )
-  	   || cpi->HeadersWritten ){
-        return TH_EINVAL;
-      }
-      /*NOTE(review): buf==NULL with buf_sz==0 passes the check above and
-         reaches this memcpy() with a NULL source.*/
-      memcpy(&cpi->quant_info, buf, sizeof(th_quant_info));
-      InitQTables(cpi);
-
-      return 0;
-    case TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE:{
-      ogg_uint32_t keyframe_frequency_force;
-      if(buf==NULL)return TH_EFAULT;
-      if(buf_sz!=sizeof(keyframe_frequency_force))return TH_EINVAL;
-      keyframe_frequency_force=*(ogg_uint32_t *)buf;
-      if(cpi->HeadersWritten){
-        /*It's still early enough to enlarge keyframe_granule_shift.
-          NOTE(review): this branch runs when HeadersWritten is set, which
-           reads as the opposite of "early"; confirm the condition.*/
-        cpi->keyframe_granule_shift=OC_CLAMPI(cpi->keyframe_granule_shift,
-         OC_ILOG_32(keyframe_frequency_force-1),31);
-      }
-      cpi->info.keyframe_frequency_force=OC_MINI(keyframe_frequency_force,
-       (ogg_uint32_t)1U<<cpi->keyframe_granule_shift);
-      /*The value actually applied is written back to the caller.*/
-      *(ogg_uint32_t *)buf=cpi->info.keyframe_frequency_force;
-      return 0;
-    }
-    case TH_ENCCTL_SET_VP3_COMPATIBLE:
-      if(cpi->HeadersWritten)
-        return TH_EINVAL;
-
-      memcpy(&cpi->quant_info, &TH_VP31_QUANT_INFO, sizeof(th_quant_info));
-      InitQTables(cpi);
-
-      return 0;
-    case TH_ENCCTL_SET_SPLEVEL:
-      if(buf == NULL || buf_sz != sizeof(int))
-        return TH_EINVAL;
-
-      memcpy(&value, buf, sizeof(int));
-
-      switch(value) {
-        case 0:
-          cpi->MotionCompensation = 1;
-          cpi->info.quick_p = 0;
-        break;
-
-        case 1:
-          cpi->MotionCompensation = 1;
-          cpi->info.quick_p = 1;
-        break;
-
-        case 2:
-          cpi->MotionCompensation = 0;
-          cpi->info.quick_p = 1;
-        break;
-
-        default:
-          return TH_EINVAL;
-      }
-
-      return 0;
-    case TH_ENCCTL_GET_SPLEVEL_MAX:
-      value = 2;
-      memcpy(buf, &value, sizeof(int)); /*NOTE(review): buf and buf_sz are not
-                                           checked in this case.*/
-      return 0;
-    case TH_ENCCTL_SET_DUP_COUNT:{
-      int dup_count;
-      if(buf==NULL)return TH_EFAULT;
-      if(buf_sz!=sizeof(int))return TH_EINVAL;
-      dup_count=*(int *)buf;
-      if(dup_count>=cpi->info.keyframe_frequency_force)return TH_EINVAL;
-      cpi->dup_count=OC_MAXI(dup_count,0); /*Negative counts are stored as 0.*/
-      return 0;
-    }break; /*Unreachable: every path in this case returns above.*/
-    default:
-      return TH_EIMPL;
-  }
-}
-/* points the four dispatch_vtbl entries of cpi at the encoder functions */
-static void theora_encode_dispatch_init(CP_INSTANCE *cpi){
-  cpi->dispatch_vtbl.clear=theora_encode_clear;
-  cpi->dispatch_vtbl.control=theora_encode_control;
-  cpi->dispatch_vtbl.granule_frame=theora_encode_granule_frame;
-  cpi->dispatch_vtbl.granule_time=theora_encode_granule_time;
-}

Copied: branches/theora-thusnelda/lib/enc/enquant.c (from rev 16052, branches/theora-thusnelda/lib/enc/encoder_quant.c)
===================================================================
--- branches/theora-thusnelda/lib/enc/enquant.c	                        (rev 0)
+++ branches/theora-thusnelda/lib/enc/enquant.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -0,0 +1,274 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2005                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id$
+
+ ********************************************************************/
+#include <stdlib.h>
+#include <string.h>
+#include "encint.h"
+
+
+/*Writes the quantization parameters in _qinfo to _opb.
+  In order: the loop filter limits, the AC and DC scale tables, the list of
+   unique base matrices, and, for each of the 6 quantizer type/plane
+   combinations, either a flag marking it as a copy of an earlier combination
+   or its range sizes and base matrix indices.*/
+void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo){
+  const th_quant_ranges *qranges;
+  const th_quant_base   *base_mats[2*3*64];
+  int                    indices[2][3][64];
+  int                    nbase_mats;
+  int                    nbits;
+  int                    ci;
+  int                    qi;
+  int                    qri;
+  int                    qti;
+  int                    pli;
+  int                    qtj;
+  int                    plj;
+  int                    bmi;
+  int                    i;
+  /*The bit width is sized to the largest loop filter limit.*/
+  i=_qinfo->loop_filter_limits[0];
+  for(qi=1;qi<64;qi++)i=OC_MAXI(i,_qinfo->loop_filter_limits[qi]);
+  nbits=OC_ILOG_32(i);
+  oggpackB_write(_opb,nbits,3);
+  for(qi=0;qi<64;qi++){
+    oggpackB_write(_opb,_qinfo->loop_filter_limits[qi],nbits);
+  }
+  /*580 bits for VP3.*/
+  i=1;
+  for(qi=0;qi<64;qi++)i=OC_MAXI(_qinfo->ac_scale[qi],i);
+  nbits=OC_ILOGNZ_32(i);
+  oggpackB_write(_opb,nbits-1,4);
+  for(qi=0;qi<64;qi++)oggpackB_write(_opb,_qinfo->ac_scale[qi],nbits);
+  /*516 bits for VP3.*/
+  i=1;
+  for(qi=0;qi<64;qi++)i=OC_MAXI(_qinfo->dc_scale[qi],i);
+  nbits=OC_ILOGNZ_32(i);
+  oggpackB_write(_opb,nbits-1,4);
+  for(qi=0;qi<64;qi++)oggpackB_write(_opb,_qinfo->dc_scale[qi],nbits);
+  /*Consolidate any duplicate base matrices.*/
+  nbase_mats=0;
+  for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
+    qranges=_qinfo->qi_ranges[qti]+pli;
+    for(qri=0;qri<=qranges->nranges;qri++){
+      for(bmi=0;;bmi++){
+        if(bmi>=nbase_mats){ /*No match found: append as a new unique matrix.*/
+          base_mats[bmi]=qranges->base_matrices+qri;
+          indices[qti][pli][qri]=nbase_mats++;
+          break;
+        }
+        else if(memcmp(base_mats[bmi][0],qranges->base_matrices[qri],
+         sizeof(base_mats[bmi][0]))==0){
+          indices[qti][pli][qri]=bmi;
+          break;
+        }
+      }
+    }
+  }
+  /*Write out the list of unique base matrices.
+    1545 bits for VP3 matrices.*/
+  oggpackB_write(_opb,nbase_mats-1,9);
+  for(bmi=0;bmi<nbase_mats;bmi++){
+    for(ci=0;ci<64;ci++)oggpackB_write(_opb,base_mats[bmi][0][ci],8);
+  }
+  /*Now store quant ranges and their associated indices into the base matrix
+     list.
+    46 bits for VP3 matrices.*/
+  nbits=OC_ILOG_32(nbase_mats-1);
+  for(i=0;i<6;i++){
+    qti=i/3;
+    pli=i%3;
+    qranges=_qinfo->qi_ranges[qti]+pli;
+    if(i>0){
+      if(qti>0){
+        /*Same ranges and indices as this plane under the previous qti.*/
+        if(qranges->nranges==_qinfo->qi_ranges[qti-1][pli].nranges&&
+         memcmp(qranges->sizes,_qinfo->qi_ranges[qti-1][pli].sizes,
+         qranges->nranges*sizeof(qranges->sizes[0]))==0&&
+         memcmp(indices[qti][pli],indices[qti-1][pli],
+         (qranges->nranges+1)*sizeof(indices[qti][pli][0]))==0){
+          oggpackB_write(_opb,1,2);
+          continue;
+        }
+      }
+      /*Same ranges and indices as the immediately preceding combination.*/
+      qtj=(i-1)/3;
+      plj=(i-1)%3;
+      if(qranges->nranges==_qinfo->qi_ranges[qtj][plj].nranges&&
+       memcmp(qranges->sizes,_qinfo->qi_ranges[qtj][plj].sizes,
+       qranges->nranges*sizeof(qranges->sizes[0]))==0&&
+       memcmp(indices[qti][pli],indices[qtj][plj],
+       (qranges->nranges+1)*sizeof(indices[qti][pli][0]))==0){
+        oggpackB_write(_opb,0,1+(qti>0)); /*2 bits when qti>0, else 1 bit.*/
+        continue;
+      }
+      oggpackB_write(_opb,1,1);
+    }
+    oggpackB_write(_opb,indices[qti][pli][0],nbits);
+    for(qi=qri=0;qi<63;qri++){
+      oggpackB_write(_opb,qranges->sizes[qri]-1,OC_ILOG_32(62-qi));
+      qi+=qranges->sizes[qri];
+      oggpackB_write(_opb,indices[qti][pli][qri+1],nbits);
+    }
+  }
+}
+/*Fills in the m and l fields of _this from the quantizer value _d.
+  _d is doubled first; l is OC_ILOGNZ_32() of the doubled value minus 1, and m
+   is 1+2**(16+l)/_d (integer division) minus 0x10000, cast to 16 bits.*/
+static void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d){
+  ogg_uint32_t t;
+  int          l;
+  _d<<=1; /*NOTE(review): the doubled value is stored back into an
+             ogg_uint16_t; presumably inputs stay below 0x8000 - confirm.*/
+  l=OC_ILOGNZ_32(_d)-1;
+  t=1+((ogg_uint32_t)1<<16+l)/_d; /*+ binds tighter than <<: 1<<(16+l).*/
+  _this->m=(ogg_int16_t)(t-0x10000);
+  _this->l=l;
+}
+
+/*See comments at oc_dequant_tables_init() for how the quantization tables'
+   storage should be initialized.
+  Fills _dequant via oc_dequant_tables_init(), then builds one oc_iquant entry
+   per coefficient for each (qi,pli,qti) table.
+  Where two _dequant pointers for the same qi are equal, the matching _enquant
+   pointer is shared as well instead of being filled in again.*/
+void oc_enquant_tables_init(ogg_uint16_t *_dequant[64][3][2],
+ oc_iquant *_enquant[64][3][2],const th_quant_info *_qinfo){
+  int qi;
+  int pli;
+  int qti;
+  /*Initialize the dequantization tables first.*/
+  oc_dequant_tables_init(_dequant,NULL,_qinfo);
+  /*Derive the quantization tables directly from the dequantization tables.*/
+  for(qi=0;qi<64;qi++)for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
+    int zzi;
+    int plj;
+    int qtj;
+    int dupe;
+    dupe=0;
+    /*Look through the (qtj,plj) pairs visited earlier for this qi.
+      On a match, both loops exit with plj and qtj still naming the match.*/
+    for(qtj=0;qtj<=qti;qtj++){
+      for(plj=0;plj<(qtj<qti?3:pli);plj++){
+        if(_dequant[qi][pli][qti]==_dequant[qi][plj][qtj]){
+          dupe=1;
+          break;
+        }
+      }
+      if(dupe)break;
+    }
+    if(dupe){
+      _enquant[qi][pli][qti]=_enquant[qi][plj][qtj];
+      continue;
+    }
+    /*In the original VP3.2 code, the rounding offset and the size of the
+       dead zone around 0 were controlled by a "sharpness" parameter.
+      We now R-D optimize the tokens for each block after quantization,
+       so the rounding offset should always be 1/2, and an explicit dead
+       zone is unnecessary.
+      Hence, all of that VP3.2 code is gone from here, and the remaining
+       floating point code has been implemented as equivalent integer
+       code with exact precision.*/
+    for(zzi=0;zzi<64;zzi++){
+      oc_iquant_init(_enquant[qi][pli][qti]+zzi,
+       _dequant[qi][pli][qti][zzi]);
+    }
+    /*Now compute an "average" quantizer for each qi level.
+      We do one for INTER and one for INTRA, since their behavior is very
+       different, but average across chroma channels.
+      The basic approach is to compute a harmonic average of the squared
+       quantizer, weighted by the expected squared magnitude of the DCT
+       coefficients.
+      Under the (not quite true) assumption that DCT coefficients are
+       Laplacian-distributed, this preserves the product Q*lambda, where
+       lambda=sqrt(2/sigma**2) is the Laplacian distribution parameter.
+      The value Q*lambda completely determines the entropy of the
+       coefficients.
+      NOTE(review): No code follows this comment in this function; the
+       averaging it describes appears to be done in oc_enquant_qavg_init().*/
+  }
+}
+
+
+
+/*This table gives the square root of the fraction of the squared magnitude of
+   each DCT coefficient relative to the total, scaled by 2**16, for both INTRA
+   and INTER modes.
+  These values were measured after motion-compensated prediction, before
+   quantization, over a large set of test video (from QCIF to 1080p) encoded at
+   all possible rates.
+  The DC coefficient takes into account the DPCM prediction (using the
+   quantized values from neighboring blocks, as the encoder does, but still
+   before quantization of the coefficient in the current block).
+  The results differ significantly from the expected variance (e.g., using an
+   AR(1) model of the signal with rho=0.95, as is frequently done to compute
+   the coding gain of the DCT).
+  We use them to estimate an "average" quantizer for a given quantizer matrix,
+   as this is used to parameterize a number of the rate control decisions.
+  These values are themselves probably quantizer-matrix dependent, since the
+   shape of the matrix affects the noise distribution in the reference frames,
+   but they should at least give us _some_ amount of adaptivity to different
+   matrices, as opposed to hard-coding a table of average Q values for the
+   current set.
+  The main features they capture are that a) only a few of the quantizers in
+   the upper-left corner contribute anything significant at all (though INTER
+   mode is significantly flatter) and b) the DPCM prediction of the DC
+   coefficient gives a very minor improvement in the INTRA case and a quite
+   significant one in the INTER case (over the expected variance).
+  The table is indexed as OC_RPSD[qti][ci]; oc_enquant_qavg_init() pairs entry
+   ci with the dequantizer value at position OC_IZIG_ZAG[ci].*/
+static ogg_uint16_t OC_RPSD[2][64]={
+  {
+    52725,17370,10399, 6867, 5115, 3798, 2942, 2076,
+    17370, 9900, 6948, 4994, 3836, 2869, 2229, 1619,
+    10399, 6948, 5516, 4202, 3376, 2573, 2015, 1461,
+     6867, 4994, 4202, 3377, 2800, 2164, 1718, 1243,
+     5115, 3836, 3376, 2800, 2391, 1884, 1530, 1091,
+     3798, 2869, 2573, 2164, 1884, 1495, 1212,  873,
+     2942, 2229, 2015, 1718, 1530, 1212, 1001,  704,
+     2076, 1619, 1461, 1243, 1091,  873,  704,  474
+  },
+  {
+    23411,15604,13529,11601,10683, 8958, 7840, 6142,
+    15604,11901,10718, 9108, 8290, 6961, 6023, 4487,
+    13529,10718, 9961, 8527, 7945, 6689, 5742, 4333,
+    11601, 9108, 8527, 7414, 7084, 5923, 5175, 3743,
+    10683, 8290, 7945, 7084, 6771, 5754, 4793, 3504,
+     8958, 6961, 6689, 5923, 5754, 4679, 3936, 2989,
+     7840, 6023, 5742, 5175, 4793, 3936, 3522, 2558,
+     6142, 4487, 4333, 3743, 3504, 2989, 2558, 1829
+  }
+};
+
+/*The fraction of the squared magnitude of the residuals in each color channel
+   relative to the total, scaled by 2**16, for each pixel format.
+  These values were measured after motion-compensated prediction, before
+   quantization, over a large set of test video encoded at all possible rates.
+  The table is indexed as OC_PCD[pixel format][plane].
+  TODO: These values are only from INTER frames; they should be re-measured
+   for INTRA frames.*/
+static ogg_uint16_t OC_PCD[4][3]={
+  {59926, 3038, 2572},
+  {55201, 5597, 4738},
+  {55201, 5597, 4738},
+  {47682, 9669, 8185}
+};
+
+/*Computes a log-domain "average" quantizer for each frame type and qi.
+  _log_qavg:  Output table, written as _log_qavg[qti][qi].
+  _dequant:   The dequantization tables, read as _dequant[qi][pli][qti].
+  _pixel_fmt: Selects the row of OC_PCD used to weight the three planes.
+  NOTE(review): _pixel_fmt is used as an array index without a range check.*/
+void oc_enquant_qavg_init(ogg_int64_t _log_qavg[2][64],
+ ogg_uint16_t *_dequant[64][3][2],int _pixel_fmt){
+  int qi;
+  int pli;
+  int qti;
+  int ci;
+  for(qti=0;qti<2;qti++)for(qi=0;qi<64;qi++){
+    ogg_int64_t q2;
+    q2=0;
+    for(pli=0;pli<3;pli++){
+      ogg_uint32_t qp;
+      qp=0;
+      /*Sum the squares of OC_RPSD divided by the dequantizer value (rounded)
+         over all 64 coefficients of this plane.*/
+      for(ci=0;ci<64;ci++){
+        unsigned rq;
+        unsigned qd;
+        qd=_dequant[qi][pli][qti][OC_IZIG_ZAG[ci]];
+        rq=(OC_RPSD[qti][ci]+(qd>>1))/qd;
+        qp+=rq*(ogg_uint32_t)rq;
+      }
+      q2+=OC_PCD[_pixel_fmt][pli]*(ogg_int64_t)qp;
+    }
+    /*qavg=1.0/sqrt(q2).
+      The subtraction binds tighter than >>, so the whole difference is
+       halved.*/
+    _log_qavg[qti][qi]=OC_Q57(48)-oc_blog64(q2)>>1;
+  }
+}

Modified: branches/theora-thusnelda/lib/enc/enquant.h
===================================================================
--- branches/theora-thusnelda/lib/enc/enquant.h	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/enquant.h	2009-06-13 16:04:06 UTC (rev 16102)
@@ -15,12 +15,13 @@
 };
 
 typedef oc_iquant        oc_iquant_table[64];
-typedef oc_iquant_table  oc_iquant_tables[64];
 
 
 
 void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo);
-void oc_enquant_tables_init(oc_quant_table *_dequant[2][3],
- oc_quant_table *_enquant[2][3],const th_quant_info *_qinfo);
+void oc_enquant_tables_init(ogg_uint16_t *_dequant[64][3][2],
+ oc_iquant *_enquant[64][3][2],const th_quant_info *_qinfo);
+void oc_enquant_qavg_init(ogg_int64_t _log_qavg[2][64],
+ ogg_uint16_t *_dequant[64][3][2],int _pixel_fmt);
 
 #endif

Deleted: branches/theora-thusnelda/lib/enc/frarray.c
===================================================================
--- branches/theora-thusnelda/lib/enc/frarray.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/frarray.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -1,418 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id$
-
- ********************************************************************/
-
-#include <string.h>
-#include "codec_internal.h"
-#include <stdio.h>
-
-void fr_clear(CP_INSTANCE *cpi, fr_state_t *fr){
-  fr->sb_partial_last = -1;
-  fr->sb_partial_count = 0;
-  fr->sb_partial_break = 0;
-
-  fr->sb_full_last = -1;
-  fr->sb_full_count = 0;
-  fr->sb_full_break = 0;
-
-  fr->b_last = -1;
-  fr->b_count = 0;
-  fr->b_pend = 0;
-
-  fr->sb_partial=0;
-  fr->sb_coded=0;
-
-  fr->cost=0;
-}
-
-static int BRun( ogg_uint32_t value, ogg_int16_t *token) {
-  
-  /* Coding scheme:
-     Codeword                                RunLength
-     0x                                      1-2
-     10x                                     3-4
-     110x                                    5-6
-     1110xx                                  7-10
-     11110xx                                 11-14
-     11111xxxx                               15-30 */
-
-  if ( value <= 2 ) {
-    *token = value - 1;
-    return 2;
-  } else if ( value <= 4 ) {
-    *token = 0x0004 + (value - 3);
-    return 3;
-  } else if ( value <= 6 ) {
-    *token = 0x000C + (value - 5);
-    return 4;
-  } else if ( value <= 10 ) {
-    *token = 0x0038 + (value - 7);
-    return 6;
-  } else if ( value <= 14 ) {
-    *token = 0x0078 + (value - 11);
-    return 7;
-  } else {
-    *token = 0x01F0 + (value - 15);
-    return 9;
- }
-}
-
-static int BRunCost( ogg_uint32_t value ) {
-  
-  if ( value <= 0 ) {
-    return 0;
-  } else if ( value <= 2 ) {
-    return 2;
-  } else if ( value <= 4 ) {
-    return 3;
-  } else if ( value <= 6 ) {
-    return 4;
-  } else if ( value <= 10 ) {
-    return 6;
-  } else if ( value <= 14 ) {
-    return 7;
-  } else {
-    return 9;
- }
-}
-
-static int SBRun(ogg_uint32_t value, int *token){
-
-  /* Coding scheme:
-        Codeword              RunLength
-      0                       1
-      10x                     2-3
-      110x                    4-5
-      1110xx                  6-9
-      11110xxx                10-17
-      111110xxxx              18-33
-      111111xxxxxxxxxxxx      34-4129 */
-
-  if ( value == 1 ){
-    *token = 0;
-    return 1;
-  } else if ( value <= 3 ) {
-    *token = 0x0004 + (value - 2);
-    return 3;
-  } else if ( value <= 5 ) {
-    *token = 0x000C + (value - 4);
-    return 4;
-  } else if ( value <= 9 ) {
-    *token = 0x0038 + (value - 6);
-    return 6;
-  } else if ( value <= 17 ) {
-    *token = 0x00F0 + (value - 10);
-    return 8;
-  } else if ( value <= 33 ) {
-    *token = 0x03E0 + (value - 18);
-    return 10;
-  } else {
-    *token = 0x3F000 + (value - 34);
-    return 18;
-  }
-}
-
-static int SBRunCost(ogg_uint32_t value){
-
-  if ( value == 0 ){
-    return 0;
-  } else if ( value == 1 ){
-    return 1;
-  } else if ( value <= 3 ) {
-    return 3;
-  } else if ( value <= 5 ) {
-    return 4;
-  } else if ( value <= 9 ) {
-    return 6;
-  } else if ( value <= 17 ) {
-    return 8;
-  } else if ( value <= 33 ) {
-    return 10;
-  } else {
-    return 18;
-  }
-}
-
-void fr_skipblock(CP_INSTANCE *cpi, fr_state_t *fr){
-  if(fr->sb_coded){
-    if(!fr->sb_partial){
-
-      /* superblock was previously fully coded */
-
-      if(fr->b_last==-1){
-	/* first run of the frame */
-	if(cpi){
-	  cpi->fr_block[cpi->fr_block_count]=1;
-	  cpi->fr_block_bits[cpi->fr_block_count++]=1;
-	}
-	fr->cost++;
-	fr->b_last = 1;
-      }
-
-      if(fr->b_last==1){
-	/* in-progress run also a coded run */
-	fr->b_count += fr->b_pend;
-      }else{
-	/* in-progress run an uncoded run; flush */
-	if(cpi){
-	  fr->cost +=
-	    cpi->fr_block_bits[cpi->fr_block_count] = 
-	    BRun(fr->b_count, cpi->fr_block+cpi->fr_block_count);
-	  cpi->fr_block_count++;
-	}else
-	  fr->cost += BRunCost(fr->b_count);
-	  
-	fr->b_count=fr->b_pend;
-	fr->b_last = 1;
-      }
-    }
-
-    /* add a skip block */
-    if(fr->b_last == 0){
-      fr->b_count++;
-    }else{
-      if(cpi){
-	fr->cost+=
-	  cpi->fr_block_bits[cpi->fr_block_count] = 
-	  BRun(fr->b_count, cpi->fr_block+cpi->fr_block_count);
-	cpi->fr_block_count++;
-      }else
-	fr->cost+=BRunCost(fr->b_count);
-      fr->b_count = 1;
-      fr->b_last = 0;
-    }
-  }
-   
-  fr->b_pend++;
-  fr->sb_partial=1;
-}
-
-void fr_codeblock(CP_INSTANCE *cpi, fr_state_t *fr){
-  if(fr->sb_partial){
-    if(!fr->sb_coded){
-
-      /* superblock was previously completely uncoded */
-
-      if(fr->b_last==-1){
-	/* first run of the frame */
-	if(cpi){
-	  cpi->fr_block[cpi->fr_block_count]=0;
-	  cpi->fr_block_bits[cpi->fr_block_count++]=1;
-	}
-	fr->cost++;
-	fr->b_last = 0;
-      }
-
-      if(fr->b_last==0){
-	/* in-progress run also an uncoded run */
-	fr->b_count += fr->b_pend;
-      }else{
-	/* in-progress run a coded run; flush */
-	if(cpi){
-	  fr->cost+=
-	    cpi->fr_block_bits[cpi->fr_block_count] = 
-	    BRun(fr->b_count, cpi->fr_block+cpi->fr_block_count);
-	  cpi->fr_block_count++;
-	}else
-	  fr->cost+=BRunCost(fr->b_count);
-	fr->b_count=fr->b_pend;
-	fr->b_last = 0;
-      }
-    }
-    
-    /* add a coded block */
-    if(fr->b_last == 1){
-      fr->b_count++;
-    }else{
-      if(cpi){
-	fr->cost+=
-	  cpi->fr_block_bits[cpi->fr_block_count] = 
-	  BRun(fr->b_count, cpi->fr_block+cpi->fr_block_count);
-	cpi->fr_block_count++;
-      }else
-	fr->cost+=BRunCost(fr->b_count);
-      fr->b_count = 1;
-      fr->b_last = 1;
-    }
-  }
-   
-  fr->b_pend++;
-  fr->sb_coded=1;
-}
-
-void fr_finishsb(CP_INSTANCE *cpi, fr_state_t *fr){
-  /* update partial state */
-  int partial = (fr->sb_partial & fr->sb_coded); 
-  if(fr->sb_partial_last == -1){
-    if(cpi){
-      cpi->fr_partial[cpi->fr_partial_count] = partial;
-      cpi->fr_partial_bits[cpi->fr_partial_count++] = 1;
-    }
-    fr->cost++;
-    fr->sb_partial_last = partial;
-  }
-
-  if(fr->sb_partial_break){
-    if(cpi){
-      cpi->fr_partial[cpi->fr_partial_count] = partial;
-      cpi->fr_partial_bits[cpi->fr_partial_count++] = 1;
-    }
-    fr->cost++;
-    fr->sb_partial_break=0;
-  }
-  if(fr->sb_partial_last == partial && fr->sb_partial_count < 4129){
-    fr->sb_partial_count++;
-  }else{
-    if(cpi){
-      fr->cost+=
-	cpi->fr_partial_bits[cpi->fr_partial_count] = 
-	SBRun( fr->sb_partial_count, cpi->fr_partial+cpi->fr_partial_count);
-      cpi->fr_partial_count++;
-    }else
-      fr->cost+=SBRunCost(fr->sb_partial_count);
-    if(fr->sb_partial_count >= 4129) fr->sb_partial_break = 1;
-    fr->sb_partial_count=1;
-  }
-  fr->sb_partial_last=partial;
-  
-  /* fully coded/uncoded state */
-  if(!fr->sb_partial || !fr->sb_coded){
-    
-    if(fr->sb_full_last == -1){
-      if(cpi){
-	cpi->fr_full[cpi->fr_full_count] = fr->sb_coded;
-	cpi->fr_full_bits[cpi->fr_full_count++] = 1;
-      }
-      fr->cost++;
-      fr->sb_full_last = fr->sb_coded;
-    }
-    if(fr->sb_full_break){
-      if(cpi){
-        cpi->fr_full[cpi->fr_full_count] = fr->sb_coded;
-        cpi->fr_full_bits[cpi->fr_full_count++] = 1;
-      }
-      fr->cost++;
-      fr->sb_full_break=0;
-    }
-    if(fr->sb_full_last == fr->sb_coded && fr->sb_full_count < 4129){
-      fr->sb_full_count++;
-    }else{
-      if(cpi){
-	fr->cost+=
-	  cpi->fr_full_bits[cpi->fr_full_count] = 
-	  SBRun( fr->sb_full_count, cpi->fr_full+cpi->fr_full_count);
-	cpi->fr_full_count++;
-      }else
-	fr->cost+= SBRunCost( fr->sb_full_count);
-      if(fr->sb_full_count >= 4129) fr->sb_full_break = 1;
-      fr->sb_full_count=1;
-    }
-    fr->sb_full_last=fr->sb_coded;
-
-  }
-
-  fr->b_pend=0;
-  fr->sb_partial=0;
-  fr->sb_coded=0;
-}
-
-static void fr_flush(CP_INSTANCE *cpi, fr_state_t *fr){
-  /* flush any pending partial run */
-  if(fr->sb_partial_break){
-    if(cpi){
-      cpi->fr_partial[cpi->fr_partial_count] = fr->sb_partial_last;
-      cpi->fr_partial_bits[cpi->fr_partial_count++] = 1;
-    }
-    fr->cost++;
-  }
-  if(fr->sb_partial_count){
-    if(cpi){
-      fr->cost+=
-	cpi->fr_partial_bits[cpi->fr_partial_count] = 
-	SBRun( fr->sb_partial_count, cpi->fr_partial+cpi->fr_partial_count);
-      cpi->fr_partial_count++;
-    }else
-      fr->cost+=SBRunCost( fr->sb_partial_count );
-  }
-  
-  /* flush any pending full run */
-  if(fr->sb_full_break){
-    if(cpi){
-      cpi->fr_full[cpi->fr_full_count] = fr->sb_full_last;
-      cpi->fr_full_bits[cpi->fr_full_count++] = 1;
-    }
-    fr->cost++;
-  }
-  if(fr->sb_full_count){
-    if(cpi){
-      fr->cost+=
-	cpi->fr_full_bits[cpi->fr_full_count] = 
-	SBRun( fr->sb_full_count, cpi->fr_full+cpi->fr_full_count);
-      cpi->fr_full_count++;
-    }else
-      fr->cost+=SBRunCost(fr->sb_full_count);
-  }
-  
-  /* flush any pending block run */
-  if(fr->b_count){
-    if(cpi){
-      fr->cost+=
-	cpi->fr_block_bits[cpi->fr_block_count] = 
-	BRun(fr->b_count, cpi->fr_block+cpi->fr_block_count);
-      cpi->fr_block_count++;
-    }else
-      fr->cost+=BRunCost(fr->b_count);
-  }
-}
-
-void fr_write(CP_INSTANCE *cpi, fr_state_t *fr){
-  int i;
-
-  fr_flush(cpi,fr);
-
-  for(i=0;i<cpi->fr_partial_count;i++)
-    oggpackB_write( cpi->oggbuffer, cpi->fr_partial[i], cpi->fr_partial_bits[i]);      
-  for(i=0;i<cpi->fr_full_count;i++)
-    oggpackB_write( cpi->oggbuffer, cpi->fr_full[i], cpi->fr_full_bits[i]);      
-  for(i=0;i<cpi->fr_block_count;i++)
-    oggpackB_write( cpi->oggbuffer, cpi->fr_block[i], cpi->fr_block_bits[i]);      
-}
-
-int fr_cost1(fr_state_t *fr){
-  fr_state_t temp = *fr;
-  int cost;
-
-  fr_skipblock(NULL,&temp);
-  cost=temp.cost;
-  temp=*fr;
-  fr_codeblock(NULL,&temp);
-  return temp.cost - cost;
-}
-
-int fr_cost4(fr_state_t *pre, fr_state_t *post){
-  fr_state_t temp = *pre;
-  int cost;
-
-  fr_skipblock(NULL,&temp);
-  fr_skipblock(NULL,&temp);
-  fr_skipblock(NULL,&temp);
-  fr_skipblock(NULL,&temp);
-  //fr_finishsb(NULL,&temp);
-  cost=temp.cost;
-  temp=*post;
-  //fr_finishsb(NULL,&temp);
-  return temp.cost - cost;
-}

Deleted: branches/theora-thusnelda/lib/enc/frinit.c
===================================================================
--- branches/theora-thusnelda/lib/enc/frinit.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/frinit.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -1,417 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id$
-
- ********************************************************************/
-
-#include <stdlib.h>
-#include <string.h>
-#include "codec_internal.h"
-
-
-void ClearFrameInfo(CP_INSTANCE *cpi){
-
-  if(cpi->frame) _ogg_free(cpi->frame);
-  if(cpi->lastrecon ) _ogg_free(cpi->lastrecon );
-  if(cpi->golden) _ogg_free(cpi->golden);
-  if(cpi->recon) _ogg_free(cpi->recon);
-  if(cpi->dct_token_storage) _ogg_free(cpi->dct_token_storage);
-  if(cpi->dct_token_eb_storage) _ogg_free(cpi->dct_token_eb_storage);
-  if(cpi->frag_coded) _ogg_free(cpi->frag_coded);
-  if(cpi->frag_buffer_index) _ogg_free(cpi->frag_buffer_index);
-  if(cpi->frag_dc) _ogg_free(cpi->frag_dc);
-  if(cpi->frag_dc_tmp) _ogg_free(cpi->frag_dc_tmp);
-#if defined(OC_COLLECT_METRICS)
-  if(cpi->frag_mbi) _ogg_free(cpi->frag_mbi);
-  if(cpi->frag_sad) _ogg_free(cpi->frag_sad);
-  if(cpi->frag_ssd) _ogg_free(cpi->frag_ssd);
-  if(cpi->dct_token_frag_storage) _ogg_free(cpi->dct_token_frag_storage);
-  if(cpi->dct_eob_fi_storage) _ogg_free(cpi->dct_eob_fi_storage);
-#endif
-
-  if(cpi->macro) _ogg_free(cpi->macro);
-  if(cpi->super[0]) _ogg_free(cpi->super[0]);
-
-  if(cpi->fr_partial)_ogg_free(cpi->fr_partial);
-  if(cpi->fr_partial_bits)_ogg_free(cpi->fr_partial_bits);
-  if(cpi->fr_full)_ogg_free(cpi->fr_full);
-  if(cpi->fr_full_bits)_ogg_free(cpi->fr_full_bits);
-  if(cpi->fr_block)_ogg_free(cpi->fr_block);
-  if(cpi->fr_block_bits)_ogg_free(cpi->fr_block_bits);
-}
-
-/* A note to people reading and wondering why malloc returns aren't
-   checked:
-
-   lines like the following that implement a general strategy of
-   'check the return of malloc; a zero pointer means we're out of
-   memory!'...:
-
-   if(!cpi->extra_fragments) { EDeleteFragmentInfo(cpi); return FALSE; }
-
-   ...are not useful.  It's true that many platforms follow this
-   malloc behavior, but many do not.  The more modern malloc
-   strategy is only to allocate virtual pages, which are not mapped
-   until the memory on that page is touched.  At *that* point, if
-   the machine is out of heap, the page fails to be mapped and a
-   SEGV is generated.
-
-   That means that if we want to deal with out of memory conditions,
-   we *must* be prepared to process a SEGV.  If we implement the
-   SEGV handler, there's no reason to to check malloc return; it is
-   a waste of code. */
-
-void InitFrameInfo(CP_INSTANCE *cpi){
-
-  cpi->stride[0] = (cpi->info.width + STRIDE_EXTRA);
-  cpi->stride[1] = (cpi->info.width + STRIDE_EXTRA) / 2;
-  cpi->stride[2] = (cpi->info.width + STRIDE_EXTRA) / 2;
-
-  {
-    ogg_uint32_t ry_size = cpi->stride[0] * (cpi->info.height + STRIDE_EXTRA);
-    ogg_uint32_t ruv_size = ry_size / 4;
-
-    cpi->frame_size = ry_size + 2 * ruv_size;
-    cpi->offset[0] = (cpi->stride[0] * UMV_BORDER) + UMV_BORDER;
-    cpi->offset[1] = ry_size + cpi->stride[1] * (UMV_BORDER/2) + (UMV_BORDER/2);
-    cpi->offset[2] = ry_size + ruv_size + cpi->stride[2] * (UMV_BORDER/2) + (UMV_BORDER/2);
-  }
-
-  cpi->frag_h[0] = (cpi->info.width >> 3);
-  cpi->frag_v[0] = (cpi->info.height >> 3);
-  cpi->frag_n[0] = cpi->frag_h[0] * cpi->frag_v[0];
-  cpi->frag_h[1] = (cpi->info.width >> 4);
-  cpi->frag_v[1] = (cpi->info.height >> 4);
-  cpi->frag_n[1] = cpi->frag_h[1] * cpi->frag_v[1];
-  cpi->frag_h[2] = (cpi->info.width >> 4);
-  cpi->frag_v[2] = (cpi->info.height >> 4);
-  cpi->frag_n[2] = cpi->frag_h[2] * cpi->frag_v[2];
-  cpi->frag_total = cpi->frag_n[0] + cpi->frag_n[1] + cpi->frag_n[2];
-
-  cpi->macro_h = (cpi->frag_h[0] >> 1);
-  cpi->macro_v = (cpi->frag_v[0] >> 1);
-  cpi->macro_total = cpi->macro_h * cpi->macro_v;
-
-  cpi->super_h[0] = (cpi->info.width >> 5) + ((cpi->info.width & 0x1f) ? 1 : 0);
-  cpi->super_v[0] = (cpi->info.height >> 5) + ((cpi->info.height & 0x1f) ? 1 : 0);
-  cpi->super_n[0] = cpi->super_h[0] * cpi->super_v[0];
-  cpi->super_h[1] = (cpi->info.width >> 6) + ((cpi->info.width & 0x3f) ? 1 : 0);
-  cpi->super_v[1] = (cpi->info.height >> 6) + ((cpi->info.height & 0x3f) ? 1 : 0);
-  cpi->super_n[1] = cpi->super_h[1] * cpi->super_v[1];
-  cpi->super_h[2] = (cpi->info.width >> 6) + ((cpi->info.width & 0x3f) ? 1 : 0);
-  cpi->super_v[2] = (cpi->info.height >> 6) + ((cpi->info.height & 0x3f) ? 1 : 0);
-  cpi->super_n[2] = cpi->super_h[2] * cpi->super_v[2];
-  cpi->super_total = cpi->super_n[0] + cpi->super_n[1] + cpi->super_n[2];
-
-  /* +1; the last entry is the 'invalid' frag, which is always set to not coded as it doesn't really exist */
-  cpi->frag_coded = calloc(cpi->frag_total+1, sizeof(*cpi->frag_coded));
-  cpi->frag_buffer_index = calloc(cpi->frag_total, sizeof(*cpi->frag_buffer_index));
-  cpi->frag_dc = calloc(cpi->frag_total, sizeof(*cpi->frag_dc));
-  cpi->frag_dc_tmp = calloc(2*cpi->frag_h[0], sizeof(*cpi->frag_dc_tmp));
-
-  /* +1; the last entry is the 'invalid' mb, which contains only 'invalid' frags */
-  cpi->macro = calloc(cpi->macro_total+1, sizeof(*cpi->macro));
-
-  cpi->super[0] = calloc(cpi->super_total, sizeof(**cpi->super));
-  cpi->super[1] = cpi->super[0] + cpi->super_n[0];
-  cpi->super[2] = cpi->super[1] + cpi->super_n[1];
-
-  cpi->stack_offset = (cpi->frag_total + (cpi->frag_total+4094)/4095 + 1);
-  cpi->dct_token_storage = _ogg_malloc( cpi->stack_offset*BLOCK_SIZE*sizeof(*cpi->dct_token_storage));
-  cpi->dct_token_eb_storage = _ogg_malloc(cpi->stack_offset*BLOCK_SIZE*sizeof(*cpi->dct_token_eb_storage));
-
-  cpi->fr_partial = _ogg_calloc(cpi->super_total+1, sizeof(*cpi->fr_partial));
-  cpi->fr_partial_bits = _ogg_calloc(cpi->super_total+1, sizeof(*cpi->fr_partial_bits));
-  cpi->fr_full = _ogg_calloc(cpi->super_total+1, sizeof(*cpi->fr_full));
-  cpi->fr_full_bits = _ogg_calloc(cpi->super_total+1, sizeof(*cpi->fr_full_bits));
-  cpi->fr_block = _ogg_calloc(cpi->frag_total+1, sizeof(*cpi->fr_block));
-  cpi->fr_block_bits = _ogg_calloc(cpi->frag_total+1, sizeof(*cpi->fr_block_bits));
-
-#if defined(OC_COLLECT_METRICS)
- {
-   cpi->frag_mbi = _ogg_calloc(cpi->frag_total+1, sizeof(*cpi->frag_mbi));
-   cpi->frag_sad = _ogg_calloc(cpi->frag_total+1, sizeof(*cpi->frag_sad));
-   cpi->frag_ssd = _ogg_calloc(cpi->frag_total+1, sizeof(*cpi->frag_ssd));
-   cpi->dct_token_frag_storage = _ogg_malloc(cpi->stack_offset*BLOCK_SIZE*sizeof(*cpi->dct_token_frag_storage));
-   cpi->dct_eob_fi_storage = _ogg_malloc(cpi->frag_total*BLOCK_SIZE*sizeof(*cpi->dct_eob_fi_storage));
- }
-#endif
-
-  /* fill in superblock fragment pointers; hilbert order */
-  /* fill in macroblock superblock backpointers */
-  {
-    int row,col,frag,mb;
-    int fhilbertx[16] = {0,1,1,0,0,0,1,1,2,2,3,3,3,2,2,3};
-    int fhilberty[16] = {0,0,1,1,2,3,3,2,2,3,3,2,1,1,0,0};
-    int mhilbertx[4] = {0,0,1,1};
-    int mhilberty[4] = {0,1,1,0};
-    int offset = 0;
-    int plane;
-
-    for(plane=0;plane<3;plane++){
-
-      for(row=0;row<cpi->super_v[plane];row++){
-        for(col=0;col<cpi->super_h[plane];col++){
-          int superindex = row*cpi->super_h[plane] + col;
-          for(frag=0;frag<16;frag++){
-            /* translate to fragment index */
-            int frow = row*4 + fhilberty[frag];
-            int fcol = col*4 + fhilbertx[frag];
-            if(frow<cpi->frag_v[plane] && fcol<cpi->frag_h[plane]){
-              int fragindex = frow*cpi->frag_h[plane] + fcol + offset;
-              cpi->super[plane][superindex].f[frag] = fragindex;
-            }else
-              cpi->super[plane][superindex].f[frag] = cpi->frag_total; /* 'invalid' */
-          }
-        }
-      }
-      offset+=cpi->frag_n[plane];
-    }
-
-    /* Y */
-    for(row=0;row<cpi->super_v[0];row++){
-      for(col=0;col<cpi->super_h[0];col++){
-        int superindex = row*cpi->super_h[0] + col;
-        for(mb=0;mb<4;mb++){
-          /* translate to macroblock index */
-          int mrow = row*2 + mhilberty[mb];
-          int mcol = col*2 + mhilbertx[mb];
-          if(mrow<cpi->macro_v && mcol<cpi->macro_h){
-            int macroindex = mrow*cpi->macro_h + mcol;
-            cpi->super[0][superindex].m[mb] = macroindex;
-            cpi->macro[macroindex].ysb = superindex;
-          }else
-            cpi->super[0][superindex].m[mb] = cpi->macro_total;
-        }
-      }
-    }
-
-    /* U (assuming 4:2:0 for now) */
-    for(row=0;row<cpi->super_v[1];row++){
-      for(col=0;col<cpi->super_h[1];col++){
-        int superindex = row*cpi->super_h[1] + col;
-        for(mb=0;mb<16;mb++){
-          /* translate to macroblock index */
-          int mrow = row*4 + fhilberty[mb];
-          int mcol = col*4 + fhilbertx[mb];
-          if(mrow<cpi->macro_v && mcol<cpi->macro_h){
-            int macroindex = mrow*cpi->macro_h + mcol;
-            cpi->super[1][superindex].m[mb] = macroindex;
-            cpi->macro[macroindex].usb = superindex + cpi->super_n[0];
-          }else
-            cpi->super[1][superindex].m[mb] = cpi->macro_total;
-        }
-      }
-    }
-
-    /* V (assuming 4:2:0 for now) */
-    for(row=0;row<cpi->super_v[2];row++){
-      for(col=0;col<cpi->super_h[2];col++){
-        int superindex = row*cpi->super_h[2] + col;
-        for(mb=0;mb<16;mb++){
-          /* translate to macroblock index */
-          int mrow = row*4 + fhilberty[mb];
-          int mcol = col*4 + fhilbertx[mb];
-          if(mrow<cpi->macro_v && mcol<cpi->macro_h){
-            int macroindex = mrow*cpi->macro_h + mcol;
-            cpi->super[2][superindex].m[mb] = macroindex;
-            cpi->macro[macroindex].vsb = superindex + cpi->super_n[0] + cpi->super_n[1];
-          }else
-            cpi->super[2][superindex].m[mb] = cpi->macro_total;
-        }
-      }
-    }
-
-  }
-
-  /* fill in macroblock fragment pointers; raster (MV coding) order */
-  /* 4:2:0 only for now */
-  {
-    int row,col,frag;
-    int Hscanx[4][4] = { {0,1,1,0}, {1,0,0,1}, {0,0,1,1}, {0,0,1,1} };
-    int Hscany[4][4] = { {0,0,1,1}, {1,1,0,0}, {0,1,1,0}, {0,1,1,0} };
-
-    for(row=0;row<cpi->macro_v;row++){
-      int baserow = row*2;
-      for(col=0;col<cpi->macro_h;col++){
-        int basecol = col*2;
-        int macroindex = row*cpi->macro_h + col;
-        int hpos = (col&1) + (row&1)*2;
-
-        /* Y */
-        for(frag=0;frag<4;frag++){
-          int Hrow = baserow + Hscany[hpos][frag];
-          int Hcol = basecol + Hscanx[hpos][frag];
-          int Rrow = baserow + ((frag>>1)&1);
-          int Rcol = basecol + (frag&1);
-
-          cpi->macro[macroindex].Hyuv[0][frag] = cpi->frag_total; // default
-          cpi->macro[macroindex].Ryuv[0][frag] = cpi->frag_total; //default
-          if(Hrow<cpi->frag_v[0] && Hcol<cpi->frag_h[0]){
-            cpi->macro[macroindex].Hyuv[0][frag] = Hrow*cpi->frag_h[0] + Hcol;
-#if defined(OC_COLLECT_METRICS)
-            cpi->frag_mbi[Hrow*cpi->frag_h[0] + Hcol] = macroindex;
-#endif
-          }
-          if(Rrow<cpi->frag_v[0] && Rcol<cpi->frag_h[0])
-            cpi->macro[macroindex].Ryuv[0][frag] = Rrow*cpi->frag_h[0] + Rcol;
-        }
-
-        /* U */
-        cpi->macro[macroindex].Ryuv[1][0] = cpi->frag_total;
-        cpi->macro[macroindex].Ryuv[1][1] = cpi->frag_total;
-        cpi->macro[macroindex].Ryuv[1][2] = cpi->frag_total;
-        cpi->macro[macroindex].Ryuv[1][3] = cpi->frag_total;
-        cpi->macro[macroindex].Hyuv[1][0] = cpi->frag_total;
-        cpi->macro[macroindex].Hyuv[1][1] = cpi->frag_total;
-        cpi->macro[macroindex].Hyuv[1][2] = cpi->frag_total;
-        cpi->macro[macroindex].Hyuv[1][3] = cpi->frag_total;
-        if(row<cpi->frag_v[1] && col<cpi->frag_h[1]){
-          cpi->macro[macroindex].Hyuv[1][0] = cpi->frag_n[0] + macroindex;
-          cpi->macro[macroindex].Ryuv[1][0] = cpi->frag_n[0] + macroindex;
-#if defined(OC_COLLECT_METRICS)
-          cpi->frag_mbi[cpi->frag_n[0] + macroindex] = macroindex;
-#endif
-        }
-
-        /* V */
-        cpi->macro[macroindex].Ryuv[2][0] = cpi->frag_total;
-        cpi->macro[macroindex].Ryuv[2][1] = cpi->frag_total;
-        cpi->macro[macroindex].Ryuv[2][2] = cpi->frag_total;
-        cpi->macro[macroindex].Ryuv[2][3] = cpi->frag_total;
-        cpi->macro[macroindex].Hyuv[2][0] = cpi->frag_total;
-        cpi->macro[macroindex].Hyuv[2][1] = cpi->frag_total;
-        cpi->macro[macroindex].Hyuv[2][2] = cpi->frag_total;
-        cpi->macro[macroindex].Hyuv[2][3] = cpi->frag_total;
-        if(row<cpi->frag_v[2] && col<cpi->frag_h[2]){
-          cpi->macro[macroindex].Hyuv[2][0] = cpi->frag_n[0] + cpi->frag_n[1] + macroindex;
-          cpi->macro[macroindex].Ryuv[2][0] = cpi->frag_n[0] + cpi->frag_n[1] + macroindex;
-#if defined(OC_COLLECT_METRICS)
-          cpi->frag_mbi[cpi->frag_n[0] + cpi->frag_n[1] + macroindex] = macroindex;
-#endif
-        }
-      }
-    }
-  }
-
-  /* fill in macroblock neighbor information for MC analysis */
-  {
-    int row,col;
-
-    for(row=0;row<cpi->macro_v;row++){
-      for(col=0;col<cpi->macro_h;col++){
-        int macroindex = row*cpi->macro_h + col;
-        int count=0;
-
-        /* cneighbors are of four possible already-filled-in neighbors
-           from the eight-neighbor square for doing ME. The
-           macroblocks are scanned in Hilbert order and the corner
-           cases here are annoying, so we precompute. */
-        if(row&1){
-          if(col&1){
-            /* 2 */
-            cpi->macro[macroindex].cneighbors[count++]=macroindex-1;
-            cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h-1;
-          }else{
-            /* 1 */
-            if(col){
-              cpi->macro[macroindex].cneighbors[count++]=macroindex-1;
-              cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h-1;
-            }
-            cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h;
-          }
-        }else{
-          if(col&1){
-            /* 3; Could have up to six, fill in at most 4 */
-            if(row && col+1<cpi->macro_h)
-              cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h+1;
-            if(row)
-              cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h;
-            if(col && row)
-              cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h-1;
-            if(col)
-              cpi->macro[macroindex].cneighbors[count++]=macroindex-1;
-            if(col && row+1<cpi->macro_v && count<4)
-              cpi->macro[macroindex].cneighbors[count++]=macroindex+cpi->macro_h-1;
-            if(row+1<cpi->macro_v && count<4)
-              cpi->macro[macroindex].cneighbors[count++]=macroindex+cpi->macro_h;
-          }else{
-            /* 0; Could have up to five, fill in at most 4 */
-            if(row && col+1<cpi->macro_h)
-              cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h+1;
-            if(row)
-              cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h;
-            if(col && row)
-              cpi->macro[macroindex].cneighbors[count++]=macroindex-cpi->macro_h-1;
-            if(col)
-              cpi->macro[macroindex].cneighbors[count++]=macroindex-1;
-            if(col && row+1<cpi->macro_v && count<4)
-              cpi->macro[macroindex].cneighbors[count++]=macroindex+cpi->macro_h-1;
-          }
-        }
-        cpi->macro[macroindex].ncneighbors=count;
-
-        /* pneighbors are of the four possible direct neighbors (plus pattern), not the same as cneighbors */
-        count=0;
-        if(col)
-          cpi->macro[macroindex].pneighbors[count++]=macroindex-1;
-        if(row)
-          cpi->macro[macroindex].pneighbors[count++]=macroindex-cpi->macro_h;
-        if(col+1<cpi->macro_h)
-          cpi->macro[macroindex].pneighbors[count++]=macroindex+1;
-        if(row+1<cpi->macro_v)
-          cpi->macro[macroindex].pneighbors[count++]=macroindex+cpi->macro_h;
-        cpi->macro[macroindex].npneighbors=count;
-      }
-    }
-  }
-
-  /* fill in 'invalid' macroblock */
-  {
-    int p,f;
-    for(p=0;p<3;p++)
-      for(f=0;f<4;f++){
-        cpi->macro[cpi->macro_total].Ryuv[p][f] = cpi->frag_total;
-        cpi->macro[cpi->macro_total].Hyuv[p][f] = cpi->frag_total;
-      }
-    cpi->macro[cpi->macro_total].ncneighbors=0;
-    cpi->macro[cpi->macro_total].npneighbors=0;
-#if defined(OC_COLLECT_METRICS)
-    cpi->frag_mbi[cpi->frag_total] = cpi->macro_total;
-#endif
-  }
-
-  /* allocate frames */
-  cpi->frame = _ogg_calloc(cpi->frame_size,sizeof(*cpi->frame));
-  cpi->lastrecon = _ogg_calloc(cpi->frame_size,sizeof(*cpi->lastrecon));
-  cpi->golden = _ogg_calloc(cpi->frame_size,sizeof(*cpi->golden));
-  cpi->recon = _ogg_calloc(cpi->frame_size,sizeof(*cpi->recon));
-
-  /* Re-initialise the pixel index table. */
-  {
-    ogg_uint32_t plane,row,col;
-    ogg_uint32_t *bp = cpi->frag_buffer_index;
-
-    for(plane=0;plane<3;plane++){
-      ogg_uint32_t offset = cpi->offset[plane];
-      for(row=0;row<cpi->frag_v[plane];row++){
-        for(col=0;col<cpi->frag_h[plane];col++,bp++){
-          *bp = offset+col*8;
-        }
-        offset += cpi->stride[plane]*8;
-      }
-    }
-  }
-}
-

Deleted: branches/theora-thusnelda/lib/enc/hufftables.h
===================================================================
--- branches/theora-thusnelda/lib/enc/hufftables.h	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/hufftables.h	2009-06-13 16:04:06 UTC (rev 16102)
@@ -1,1034 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id$
-
- ********************************************************************/
-
-#include "../dec/huffman.h"
-#include "codec_internal.h"
-
-const unsigned char ExtraBitLengths_VP31[MAX_ENTROPY_TOKENS] = {
-  0, 0, 0, 2, 3, 4, 12,3, 6,                 /* EOB and Zero-run tokens. */
-  0, 0, 0, 0,                                /* Very low value tokens. */
-  1, 1, 1, 1, 2, 3, 4, 5, 6, 10,             /* Other value tokens */
-  1, 1, 1, 1, 1, 3, 4,                       /* Category 1 runs. */
-  2, 3,                                      /* Category 2 runs. */
-};
-
-#define NEW_FREQS 0 /* dbm - test new frequency tables */
-
-#if NEW_FREQS
-/* New baseline frequency tables for encoder version >= 2 */
-const ogg_uint32_t FrequencyCounts_VP3[NUM_HUFF_TABLES][MAX_ENTROPY_TOKENS] = {
-  /* DC Intra bias  */
-  {  272,    84,    31,    36,    10,    2,    1,    92,    1,
-     701,   872,   410,   478,
-     630,   502,   417,   356,   582,   824,   985,   965,   697,   606,
-     125,   119,    40,    3,    9,    15,    10,
-     73,    37,
-  },
-  {  311,   107,    41,    51,    18,    4,    2,   120,    1,
-     824,  1037,   468,   541,
-     714,   555,   451,   374,   595,   819,   929,   817,   474,   220,
-     172,   142,    27,    4,    9,    10,    2,
-     98,    48,
-  },
-  {  353,   125,    49,    66,    24,    6,    2,   124,    1,
-     926,  1172,   512,   594,
-     766,   581,   458,   379,   590,   789,   849,   665,   306,    80,
-     204,   147,    25,    5,    12,    9,    2,
-     108,    54,
-  },
-  {  392,   141,    57,    75,    31,    7,    4,   138,    1,
-     1050,  1321,   559,   649,
-     806,   594,   460,   372,   568,   727,   710,   475,   155,    19,
-     251,   174,    27,    7,    16,    8,    2,
-     126,    62,
-  },
-  {  455,   168,    66,    87,    39,    10,    6,   124,    2,
-     1143,  1455,   592,   692,
-     824,   596,   453,   361,   542,   657,   592,   329,    78,    5,
-     269,   184,    27,    9,    19,    7,    2,
-     127,    66,
-  },
-  {  544,   201,    80,   102,    45,    11,    6,    99,    1,
-     1236,  1587,   610,   720,
-     833,   590,   444,   348,   506,   588,   487,   226,    39,    2,
-     253,   178,    27,    10,    20,    7,    2,
-     118,    65,
-  },
-  {  649,   241,    98,   121,    54,    14,    8,    84,    1,
-     1349,  1719,   634,   763,
-     847,   583,   428,   323,   456,   492,   349,   120,    13,    1,
-     231,   170,    24,    8,    19,    7,    1,
-     109,    67,
-  },
-  {  824,   304,   129,   158,    66,    19,    10,    44,    2,
-     1476,  1925,   644,   794,
-     838,   559,   396,   289,   392,   384,   223,    53,    3,    1,
-     159,   121,    17,    6,    16,    6,    2,
-     69,    53,
-  },
-
-  /* DC Inter Bias */
-  {  534,   174,    71,    68,    10,    1,    1,    68,   119,
-     1674,  1526,   560,   536,
-     539,   331,   229,   168,   233,   262,   231,   149,    71,    51,
-     629,   530,   284,   126,   182,   208,   184,
-     148,    87,
-  },
-  {  594,   195,    77,    71,    9,    1,    1,    47,    89,
-     1723,  1592,   595,   570,
-     574,   351,   241,   176,   243,   271,   234,   144,    65,    37,
-     534,   449,   240,   117,   167,   277,   153,
-     96,    54,
-  },
-  {  642,   213,    88,    83,    12,    1,    1,    40,    80,
-     1751,  1630,   621,   600,
-     598,   367,   250,   183,   251,   276,   235,   143,    62,    28,
-     485,   397,   212,   110,   161,   193,   141,
-     84,    48,
-  },
-  {  693,   258,   114,   131,    27,    3,    1,    44,    79,
-     1794,  1644,   550,   533,
-     518,   314,   213,   154,   209,   223,   174,    97,    40,    14,
-     584,   463,   236,   138,   196,   249,   143,
-     94,    54,
-  },
-  {  758,   303,   144,   189,    53,    8,    1,    37,    69,
-     1842,  1732,   513,   504,
-     478,   287,   191,   137,   182,   186,   137,    72,    31,    6,
-     589,   469,   199,   128,   177,   264,   161,
-     89,    49,
-  },
-  {  817,   344,   170,   243,    84,    18,    2,    30,    65,
-     1836,  1733,   518,   511,
-     477,   281,   185,   130,   169,   166,   117,    59,    25,    3,
-     572,   450,   185,   121,   173,   232,   146,
-     80,    43,
-  },
-  {  865,   389,   204,   322,   139,    42,    9,    26,    51,
-     1848,  1766,   531,   522,
-     477,   275,   177,   122,   153,   144,    97,    50,    16,    1,
-     485,   378,   167,   115,   164,   203,   128,
-     74,    42,
-  },
-  {  961,   447,   243,   407,   196,    74,    26,    12,    34,
-     2003,  1942,   571,   565,
-     494,   278,   173,   116,   141,   129,    85,    44,    8,    1,
-     285,   223,   101,    66,   104,   120,    74,
-     35,    22,
-  },
-
-  /* AC INTRA Tables  */
-  /* AC Intra bias group 1 tables */
-  {  245,    68,    25,    28,    5,    1,    1,   359,    4,
-     910,   904,   570,   571,
-     766,   620,   478,   375,   554,   684,   652,   441,   182,    30,
-     535,   206,   118,    77,    69,    90,    16,
-     299,   100,
-  },
-  {  302,    86,    32,    36,    8,    1,    1,   362,    3,
-     974,   968,   599,   599,
-     774,   635,   469,   365,   528,   628,   557,   337,   118,    14,
-     577,   219,   136,    82,    69,    65,    13,
-     317,   112,
-  },
-  {  348,   102,    39,    44,    9,    2,    1,   363,    3,
-     1062,  1055,   607,   609,
-     787,   626,   457,   348,   494,   550,   452,   233,    60,    2,
-     636,   244,   159,    92,    74,    68,    12,
-     327,   119,
-  },
-  {  400,   121,    47,    51,    11,    2,    1,   366,    3,
-     1109,  1102,   620,   622,
-     786,   624,   450,   331,   459,   490,   366,   163,    29,    1,
-     673,   257,   175,    98,    77,    63,    14,
-     344,   131,
-  },
-  {  470,   151,    59,    67,    15,    3,    1,   354,    4,
-     1198,  1189,   640,   643,
-     769,   603,   410,   294,   386,   381,   240,    78,    5,    1,
-     746,   282,   205,   113,    87,    64,    15,
-     368,   145,
-  },
-  {  553,   189,    77,    94,    24,    6,    1,   347,    4,
-     1244,  1232,   650,   653,
-     739,   551,   360,   249,   303,   261,   129,    24,    1,    1,
-     828,   313,   245,   135,   108,    77,    17,
-     403,   169,
-  },
-  {  701,   253,   109,   140,    42,    12,    2,   350,    6,
-     1210,  1197,   652,   647,
-     673,   495,   299,   189,   211,   151,    50,    2,    1,    1,
-     892,   336,   284,   162,   134,   101,    25,
-     455,   205,
-  },
-  {  924,   390,   180,   248,    85,    31,    13,   286,    14,
-     1242,  1206,   601,   577,
-     519,   342,   175,   100,    85,    36,    1,    1,    1,    1,
-     1031,   348,   346,   204,   166,   131,    34,
-     473,   197,
-  },
-  /* AC Inter bias group 1 tables */
-  {  459,   128,    50,    48,    8,    1,    1,   224,    69,
-     1285,  1227,   587,   565,
-     573,   406,   261,   180,   228,   213,   130,    47,    11,    3,
-     1069,   540,   309,   231,   147,   279,   157,
-     383,   165,
-  },
-  {  524,   155,    62,    64,    14,    2,    1,   209,    63,
-     1345,  1288,   523,   507,
-     515,   358,   225,   153,   183,   160,    87,    29,    7,    2,
-     1151,   591,   365,   282,   179,   308,   133,
-     344,   157,
-  },
-  {  588,   181,    75,    81,    19,    3,    1,   204,    68,
-     1344,  1288,   517,   503,
-     505,   346,   216,   141,   169,   139,    71,    21,    5,    1,
-     1146,   584,   366,   286,   170,   298,   153,
-     342,   157,
-  },
-  {  634,   196,    82,    89,    22,    4,    1,   194,    60,
-     1356,  1312,   515,   502,
-     489,   331,   199,   127,   145,   111,    51,    14,    3,    1,
-     1156,   589,   393,   300,   182,   285,   144,
-     340,   159,
-  },
-  {  715,   231,    98,   113,    31,    7,    1,   181,    57,
-     1345,  1303,   498,   490,
-     448,   291,   166,   101,   106,    75,    30,    9,    1,    1,
-     1175,   584,   416,   321,   209,   333,   164,
-     330,   159,
-  },
-  {  825,   283,   125,   149,    44,    11,    2,   160,    59,
-     1343,  1308,   476,   469,
-     405,   247,   131,    75,    76,    47,    18,    5,    1,    1,
-     1192,   579,   432,   332,   217,   327,   176,
-     320,   154,
-  },
-  {  961,   361,   170,   215,    70,    20,    5,   161,    55,
-     1250,  1218,   463,   460,
-     354,   204,   101,    52,    48,    28,    11,    1,    1,    1,
-     1172,   570,   449,   350,   222,   332,   169,
-     338,   174,
-  },
-  {  1139,   506,   266,   387,   156,    57,    26,   114,    48,
-     1192,  1170,   366,   366,
-     226,   113,    47,    22,    22,    12,    1,    1,    1,    1,
-     1222,   551,   462,   391,   220,   322,   156,
-     290,   136,
-  },
-
-  /* AC Intra bias group 2 tables */
-  {  245,    49,    15,    11,    1,    1,    1,   332,    38,
-     1163,  1162,   685,   683,
-     813,   623,   437,   318,   421,   424,   288,   109,    14,    1,
-     729,   303,   179,   112,    87,   199,    46,
-     364,   135,
-  },
-  {  305,    67,    22,    17,    2,    1,    1,   329,    39,
-     1250,  1245,   706,   705,
-     801,   584,   385,   267,   330,   296,   165,    40,    3,    1,
-     798,   340,   206,   131,   108,   258,    52,
-     382,   154,
-  },
-  {  356,    82,    28,    23,    3,    1,    1,   312,    42,
-     1340,  1334,   701,   703,
-     770,   545,   346,   227,   269,   223,   100,    17,    1,    1,
-     846,   359,   222,   142,   120,   284,    55,
-     379,   157,
-  },
-  {  402,    95,    33,    30,    4,    1,    1,   300,    43,
-     1379,  1371,   710,   714,
-     724,   486,   289,   182,   202,   144,    47,    5,    1,    1,
-     908,   394,   250,   161,   141,   350,    60,
-     391,   171,
-  },
-  {  499,   122,    44,    42,    7,    1,    1,   267,    45,
-     1439,  1436,   690,   694,
-     628,   385,   213,   122,   117,    62,    14,    1,    1,    1,
-     992,   441,   288,   187,   167,   446,    82,
-     378,   176,
-  },
-  {  641,   168,    62,    60,    12,    1,    1,   247,    49,
-     1435,  1436,   662,   669,
-     527,   298,   142,    71,    55,    22,    3,    1,    1,    1,
-     1036,   470,   319,   208,   193,   548,   106,
-     362,   184,
-  },
-  {  860,   274,   111,   113,    23,    4,    1,   229,    59,
-     1331,  1323,   629,   645,
-     419,   192,    72,    30,    19,    6,    1,    1,    1,    1,
-     1022,   478,   339,   225,   213,   690,   142,
-     342,   198,
-  },
-  {  1059,   437,   218,   285,    84,    17,    2,   152,    44,
-     1284,  1313,   530,   561,
-     212,    66,    17,    6,    3,    1,    1,    1,    1,    1,
-     1034,   485,   346,   226,   207,   819,   185,
-     248,   145,
-  },
-  /* AC Inter bias group 2 tables */
-  {  407,    93,    31,    24,    2,    1,    1,   232,   108,
-     1365,  1349,   581,   578,
-     498,   305,   170,   100,   103,    67,    24,    5,    1,    1,
-     1175,   604,   393,   268,   209,   506,   217,
-     379,   193,
-  },
-  {  521,   129,    46,    39,    4,    1,    1,   199,   116,
-     1419,  1403,   543,   540,
-     446,   263,   138,    78,    75,    44,    13,    2,    1,    1,
-     1201,   605,   392,   267,   214,   533,   252,
-     334,   167,
-  },
-  {  575,   144,    52,    46,    6,    1,    1,   193,   124,
-     1394,  1384,   528,   528,
-     406,   227,   112,    59,    54,    28,    7,    1,    1,    1,
-     1210,   621,   412,   284,   235,   604,   265,
-     320,   167,
-  },
-  {  673,   174,    64,    59,    9,    1,    1,   177,   128,
-     1392,  1385,   499,   499,
-     352,   183,    85,    42,    35,    16,    3,    1,    1,    1,
-     1210,   626,   418,   289,   246,   675,   297,
-     292,   158,
-  },
-  {  804,   225,    85,    77,    12,    1,    1,   150,   129,
-     1387,  1384,   455,   455,
-     277,   129,    53,    23,    17,    7,    1,    1,    1,    1,
-     1212,   635,   433,   306,   268,   760,   313,
-     249,   137,
-  },
-  {  975,   305,   123,   117,    20,    2,    1,   135,   140,
-     1312,  1310,   401,   399,
-     201,    80,    28,    11,    8,    2,    1,    1,    1,    1,
-     1162,   623,   439,   314,   283,   906,   368,
-     203,   121,
-  },
-  {  1205,   452,   208,   231,    50,    6,    1,   123,   149,
-     1161,  1164,   370,   370,
-     137,    45,    14,    4,    2,    1,    1,    1,    1,    1,
-     1047,   562,   413,   300,   277,  1020,   404,
-     168,   105,
-  },
-  {  1297,   662,   389,   574,   200,    39,    4,    55,   120,
-     1069,  1076,   273,   265,
-     66,    14,    2,    1,    1,    1,    1,    1,    1,    1,
-     930,   475,   345,   249,   236,  1124,   376,
-     91,    56,
-  },
-
-  /* AC Intra bias group 3 tables */
-  {  278,    55,    17,    12,    1,    1,    1,   288,    71,
-     1315,  1304,   725,   724,
-     733,   506,   307,   195,   225,   175,    77,    12,    1,    1,
-     904,   414,   246,   170,   126,   290,   205,
-     423,   185,
-  },
-  {  382,    80,    26,    21,    2,    1,    1,   239,    64,
-     1442,  1429,   706,   701,
-     664,   420,   239,   146,   152,   105,    34,    2,    1,    1,
-     975,   440,   263,   185,   140,   332,   229,
-     397,   169,
-  },
-  {  451,    97,    32,    27,    4,    1,    1,   223,    75,
-     1462,  1454,   682,   680,
-     574,   343,   179,   101,    98,    54,    9,    1,    1,    1,
-     1031,   482,   293,   210,   163,   400,   297,
-     384,   181,
-  },
-  {  551,   128,    43,    37,    5,    1,    1,   201,    78,
-     1497,  1487,   642,   651,
-     493,   269,   133,    70,    60,    24,    2,    1,    1,    1,
-     1065,   504,   312,   228,   178,   451,   352,
-     351,   174,
-  },
-  {  693,   179,    63,    54,    8,    1,    1,   169,    78,
-     1502,  1497,   580,   591,
-     375,   186,    77,    35,    21,    4,    1,    1,    1,    1,
-     1099,   533,   341,   253,   206,   542,   432,
-     306,   164,
-  },
-  {  867,   263,   105,    96,    16,    2,    1,   152,    81,
-     1435,  1439,   521,   525,
-     270,   107,    32,    8,    3,    1,    1,    1,    1,    1,
-     1085,   537,   361,   277,   223,   616,   549,
-     258,   156,
-  },
-  {  1022,   385,   182,   207,    46,    7,    1,   158,    88,
-     1290,  1318,   501,   502,
-     184,    38,    6,    1,    1,    1,    1,    1,    1,    1,
-     1023,   480,   345,   301,   232,   665,   661,
-     210,   133,
-  },
-  {  1184,   555,   307,   457,   185,    44,    6,   115,    41,
-     1236,  1253,   329,   340,
-     32,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-     1017,   385,   316,   370,   246,   672,   788,
-     85,    23,
-  },
-  /* AC Inter bias group 3 tables */
-  {  502,   106,    33,    22,    1,    1,    1,   151,   132,
-     1446,  1451,   502,   499,
-     343,   181,    84,    42,    36,    16,    3,    1,    1,    1,
-     1211,   661,   429,   312,   242,   637,   498,
-     288,   156,
-  },
-  {  651,   147,    48,    35,    3,    1,    1,   145,   140,
-     1419,  1420,   469,   466,
-     281,   132,    56,    25,    18,    6,    1,    1,    1,    1,
-     1175,   656,   435,   328,   260,   715,   556,
-     252,   147,
-  },
-  {  749,   179,    59,    43,    4,    1,    1,   123,   135,
-     1423,  1431,   413,   409,
-     221,    95,    36,    15,    9,    2,    1,    1,    1,    1,
-     1159,   658,   444,   340,   272,   782,   656,
-     205,   124,
-  },
-  {  902,   243,    86,    67,    7,    1,    1,   114,   141,
-     1385,  1385,   387,   383,
-     178,    67,    22,    7,    4,    1,    1,    1,    1,    1,
-     1096,   632,   434,   339,   277,   813,   735,
-     171,   109,
-  },
-  {  1081,   337,   133,   112,    15,    1,    1,    92,   137,
-     1350,  1349,   311,   309,
-     115,    34,    8,    2,    1,    1,    1,    1,    1,    1,
-     1016,   595,   418,   342,   283,   870,   883,
-     114,    78,
-  },
-  {  1253,   467,   210,   205,    34,    3,    1,    80,   130,
-     1318,  1313,   258,   260,
-     68,    12,    2,    1,    1,    1,    1,    1,    1,    1,
-     874,   516,   378,   330,   273,   877,  1000,
-     72,    53,
-  },
-  {  1362,   626,   333,   423,   100,    10,    1,    73,   106,
-     1311,  1313,   241,   231,
-     31,    3,    1,    1,    1,    1,    1,    1,    1,    1,
-     620,   368,   286,   302,   245,   814,  1127,
-     34,    28,
-  },
-  {  1203,   743,   460,   774,   284,    36,    1,    13,    25,
-     1956,  1961,   103,   106,
-     3,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-     248,   131,   149,   272,   165,   535,   813,
-     3,    3,
-  },
-
-  /* AC Intra bias group 4 tables */
-  {  599,   150,    55,    50,    9,    1,    1,   181,    19,
-     1487,  1487,   625,   625,
-     473,   271,   138,    74,    71,    42,    11,    1,    1,    1,
-     1187,   591,   356,   239,   170,   351,   137,
-     395,   194,
-  },
-  {  758,   209,    79,    74,    15,    2,    1,   147,    25,
-     1514,  1514,   521,   520,
-     334,   165,    74,    36,    30,    11,    1,    1,    1,    1,
-     1252,   644,   409,   279,   211,   472,   203,
-     318,   171,
-  },
-  {  852,   252,   100,    98,    20,    3,    1,   130,    26,
-     1493,  1498,   481,   473,
-     268,   123,    51,    23,    15,    3,    1,    1,    1,    1,
-     1256,   652,   426,   294,   231,   543,   242,
-     278,   156,
-  },
-  {  971,   309,   130,   136,    30,    5,    1,   113,    28,
-     1458,  1467,   443,   435,
-     215,    90,    31,    12,    5,    1,    1,    1,    1,    1,
-     1232,   643,   426,   303,   243,   590,   300,
-     235,   136,
-  },
-  {  1100,   399,   180,   206,    53,    9,    1,   101,    29,
-     1419,  1425,   375,   374,
-     158,    47,    10,    1,    1,    1,    1,    1,    1,    1,
-     1193,   609,   426,   319,   256,   643,   383,
-     166,   103,
-  },
-  {  1195,   505,   249,   326,    98,    20,    3,   102,    25,
-     1370,  1356,   355,   347,
-     104,    11,    1,    1,    1,    1,    1,    1,    1,    1,
-     1100,   568,   381,   330,   261,   642,   466,
-     105,    69,
-  },
-  {  1176,   608,   345,   559,   244,    57,    6,   110,    9,
-     1370,  1332,   372,   367,
-     29,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-     859,   427,   269,   359,   375,   608,   451,
-     35,    20,
-  },
-  {  1140,   613,   391,   797,   458,   180,    37,    2,    1,
-     2037,  1697,    95,    31,
-     1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-     360,    49,    23,   198,  1001,   719,   160,
-     1,    1,
-  },
-  /* AC Inter bias group 4 tables */
-  {  931,   272,   105,    96,    16,    1,    1,    91,    52,
-     1481,  1489,   347,   349,
-     174,    74,    28,    12,    8,    3,    1,    1,    1,    1,
-     1247,   719,   490,   356,   279,   706,   363,
-     187,   110,
-  },
-  {  1095,   358,   148,   143,    25,    3,    1,    74,    61,
-     1439,  1457,   304,   302,
-     127,    46,    15,    5,    3,    1,    1,    1,    1,    1,
-     1138,   664,   469,   347,   282,   768,   487,
-     139,    87,
-  },
-  {  1192,   423,   188,   189,    36,    4,    1,    64,    61,
-     1457,  1475,   284,   282,
-     106,    35,    10,    3,    1,    1,    1,    1,    1,    1,
-     1078,   624,   440,   329,   264,   744,   507,
-     117,    73,
-  },
-  {  1275,   496,   231,   258,    52,    6,    1,    53,    55,
-     1458,  1470,   248,   245,
-     77,    20,    5,    1,    1,    1,    1,    1,    1,    1,
-     984,   576,   414,   323,   260,   771,   569,
-     84,    54,
-  },
-  {  1377,   603,   302,   367,    87,    11,    1,    37,    52,
-     1522,  1532,   207,   204,
-     47,    8,    1,    1,    1,    1,    1,    1,    1,    1,
-     840,   493,   366,   291,   231,   690,   636,
-     52,    32,
-  },
-  {  1409,   708,   385,   529,   148,    24,    1,    23,    37,
-     1672,  1670,   163,   162,
-     22,    2,    1,    1,    1,    1,    1,    1,    1,    1,
-     647,   364,   291,   262,   210,   574,   643,
-     26,    14,
-  },
-  {  1348,   778,   481,   755,   245,    53,    4,    13,    19,
-     2114,  2089,   141,   139,
-     7,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-     302,   183,   162,   181,   182,   344,   437,
-     8,    3,
-  },
-  {  1560,   769,   410,   664,   243,    58,    1,    1,    1,
-     3017,  2788,    17,    24,
-     3,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-     34,    16,    8,    55,   134,   105,    86,
-     1,    1,
-  },
-};
-
-#else /* Frequency tables for encoder version < 2 */
-
-const ogg_uint32_t FrequencyCounts_VP3[NUM_HUFF_TABLES][MAX_ENTROPY_TOKENS] = {
-  /* DC Intra bias */
-  {  198,    62,    22,    31,    14,     6,     6,   205,     3,
-     843,   843,   415,   516,
-     660,   509,   412,   347,   560,   779,   941,   930,   661,   377,
-     170,   155,    39,     2,     9,    15,    11,
-     128,    86,
-  },
-  {  299,    92,    34,    39,    15,     6,     6,   132,     1,
-     851,   851,   484,   485,
-     666,   514,   416,   351,   567,   788,   953,   943,   670,   383,
-     117,   119,    26,     4,    17,     7,     1,
-      93,    56,
-  },
-  {  367,   115,   42,   47,   16,    6,    6,   105,    1,
-     896,   896,   492,   493,
-     667,   510,   408,   342,   547,   760,   932,   927,   656,   379,
-     114,   103,   10,    3,    6,    2,    1,
-     88,   49,
-  },
-  {  462,   158,   63,   76,   28,    9,    8,   145,    1,
-     1140,  1140,   573,   574,
-     754,   562,   435,   357,   555,   742,   793,   588,   274,   81,
-     154,   117,   13,    6,   12,    2,    1,
-     104,   62,
-  },
-  {  558,   196,   81,   99,   36,   11,    9,   135,    1,
-     1300,  1301,   606,   607,
-     779,   560,   429,   349,   536,   680,   644,   405,   153,   30,
-     171,   120,   12,    5,   14,    3,    1,
-     104,   53,
-  },
-  {  635,   233,   100,   122,   46,   14,   12,   113,    1,
-     1414,  1415,   631,   631,
-     785,   555,   432,   335,   513,   611,   521,   284,   89,   13,
-     170,   113,   10,    5,   14,    3,    1,
-     102,   62,
-  },
-  {  720,   276,   119,   154,   62,   20,   16,   101,    1,
-     1583,  1583,   661,   661,
-     794,   556,   407,   318,   447,   472,   343,   153,   35,    1,
-     172,   115,   11,    7,   14,    3,    1,
-     112,   70,
-  },
-  {  853,   326,   144,   184,   80,   27,   19,   52,    1,
-     1739,  1740,   684,   685,
-     800,   540,   381,   277,   364,   352,   218,   78,   13,    1,
-     139,   109,    9,    6,   20,    2,    1,
-     94,   50,
-  },
-
-  /* DC Inter Bias */
-  {  490,   154,   57,   53,   10,    2,    1,   238,   160,
-     1391,  1390,   579,   578,
-     491,   273,   172,   118,   152,   156,   127,   79,   41,   39,
-     712,   547,   316,   125,   183,   306,   237,
-     451,   358,
-  },
-  {  566,   184,   70,   65,   11,    2,    1,   235,   51,
-     1414,  1414,   599,   598,
-     510,   285,   180,   124,   157,   161,   131,   82,   42,   40,
-     738,   551,   322,   138,   195,   188,   93,
-     473,   365,
-  },
-  {  711,   261,   111,   126,   27,    4,    1,   137,   52,
-     1506,  1505,   645,   645,
-     567,   316,   199,   136,   172,   175,   142,   88,   45,   48,
-     548,   449,   255,   145,   184,   174,   121,
-     260,   227,
-  },
-  {  823,   319,   144,   175,   43,    7,    1,   53,   42,
-     1648,  1648,   653,   652,
-     583,   329,   205,   139,   175,   176,   139,   84,   44,   34,
-     467,   389,   211,   137,   181,   186,   107,
-     106,   85,
-  },
-  {  948,   411,   201,   276,   85,   16,    2,   39,   33,
-     1778,  1777,   584,   583,
-     489,   265,   162,   111,   140,   140,   108,   64,   38,   23,
-     428,   356,   201,   139,   186,   165,   94,
-     78,   63,
-  },
-  {  1002,   470,   248,   386,   153,   39,    6,   23,   23,
-     1866,  1866,   573,   573,
-     467,   249,   155,   103,   130,   128,   94,   60,   38,   14,
-     323,   263,   159,   111,   156,   153,   74,
-     46,   34,
-  },
-  {  1020,   518,   291,   504,   242,   78,   18,   14,   14,
-     1980,  1979,   527,   526,
-     408,   219,   132,   87,   110,   104,   79,   55,   31,    7,
-     265,   213,   129,   91,   131,   111,   50,
-     31,   20,
-  },
-  {  1018,   544,   320,   591,   338,   139,   47,    5,    2,
-     2123,  2123,   548,   547,
-     414,   212,   126,   83,   101,   96,   79,   60,   23,    1,
-     120,   97,   55,   39,   60,   38,   15,
-     11,    8,
-  },
-
-  /* AC INTRA Tables  */
-  /* AC Intra bias group 1 tables */
-  {  242,   62,   22,   20,    4,    1,    1,   438,    1,
-     593,   593,   489,   490,
-     657,   580,   471,   374,   599,   783,   869,   770,   491,   279,
-     358,   144,   82,   54,   49,   70,    5,
-     289,   107,
-  },
-  {  317,   95,   38,   41,    8,    1,    1,   479,    1,
-     653,   654,   500,   501,
-     682,   611,   473,   376,   582,   762,   806,   656,   358,   155,
-     419,   162,   86,   58,   36,   34,    1,
-     315,   126,
-  },
-  {  382,   121,   49,   59,   15,    3,    1,   496,    1,
-     674,   674,   553,   554,
-     755,   636,   487,   391,   576,   718,   701,   488,   221,   72,
-     448,   161,   107,   56,   37,   29,    1,
-     362,   156,
-  },
-  {  415,   138,   57,   73,   21,    5,    1,   528,    1,
-     742,   741,   562,   563,
-     753,   669,   492,   388,   563,   664,   589,   340,   129,   26,
-     496,   184,   139,   71,   48,   33,    2,
-     387,   166,
-  },
-  {  496,   170,   73,   94,   31,    8,    2,   513,    1,
-     855,   855,   604,   604,
-     769,   662,   477,   356,   486,   526,   381,   183,   51,    5,
-     590,   214,   160,   85,   60,   39,    3,
-     427,   203,
-  },
-  {  589,   207,   89,   116,   40,   13,    3,   491,    1,
-     919,   919,   631,   631,
-     769,   633,   432,   308,   408,   378,   247,   94,   17,    1,
-     659,   247,   201,   105,   73,   51,    3,
-     466,   242,
-  },
-  {  727,   266,   115,   151,   49,   17,    6,   439,    1,
-     977,   977,   642,   642,
-     718,   572,   379,   243,   285,   251,   133,   40,    1,    1,
-     756,   287,   253,   126,   94,   66,    4,
-     492,   280,
-  },
-  {  940,   392,   180,   247,   82,   30,   14,   343,    1,
-     1064,  1064,   615,   616,
-     596,   414,   235,   146,   149,   108,   41,    1,    1,    1,
-     882,   314,   346,   172,   125,   83,    6,
-     489,   291,
-  },
-  /* AC Inter bias group 1 tables */
-  {  440,   102,   33,   23,    2,    1,    1,   465,   85,
-     852,   852,   744,   743,
-     701,   496,   297,   193,   225,   200,   129,   58,   18,    2,
-     798,   450,   269,   202,   145,   308,   154,
-     646,   389,
-  },
-  {  592,   151,   53,   43,    6,    1,    1,   409,   34,
-     875,   875,   748,   747,
-     723,   510,   305,   196,   229,   201,   130,   59,   18,    2,
-     800,   436,   253,   185,   115,   194,   88,
-     642,   368,
-  },
-  {  759,   222,   86,   85,   17,    2,    1,   376,   46,
-     888,   888,   689,   688,
-     578,   408,   228,   143,   165,   141,   84,   35,    7,    1,
-     878,   488,   321,   244,   147,   266,   124,
-     612,   367,
-  },
-  {  912,   298,   122,   133,   34,    7,    1,   261,   44,
-     1092,  1091,   496,   496,
-     409,   269,   150,   95,   106,   87,   49,   16,    1,    1,
-     1102,   602,   428,   335,   193,   323,   157,
-     423,   253,
-  },
-  {  1072,   400,   180,   210,   60,   16,    3,   210,   40,
-     1063,  1063,   451,   451,
-     345,   221,   121,   73,   79,   64,   31,    6,    1,    1,
-     1105,   608,   462,   358,   202,   330,   155,
-     377,   228,
-  },
-  {  1164,   503,   254,   330,   109,   34,    9,   167,   35,
-     1038,  1037,   390,   390,
-     278,   170,   89,   54,   56,   40,   13,    1,    1,    1,
-     1110,   607,   492,   401,   218,   343,   141,
-     323,   192,
-  },
-  {  1173,   583,   321,   486,   196,   68,   23,   124,   23,
-     1037,  1037,   347,   346,
-     232,   139,   69,   40,   37,   20,    2,    1,    1,    1,
-     1128,   584,   506,   410,   199,   301,   113,
-     283,   159,
-  },
-  {  1023,   591,   366,   699,   441,   228,   113,   79,    5,
-     1056,  1056,   291,   291,
-     173,   96,   38,   19,    8,    1,    1,    1,    1,    1,
-     1187,   527,   498,   409,   147,   210,   56,
-     263,   117,
-  },
-
-  /* AC Intra bias group 2 tables */
-  {  311,   74,   27,   27,    5,    1,    1,   470,   24,
-     665,   667,   637,   638,
-     806,   687,   524,   402,   585,   679,   609,   364,   127,   20,
-     448,   210,   131,   76,   52,   111,   19,
-     393,   195,
-  },
-  {  416,   104,   39,   38,    8,    1,    1,   545,   33,
-     730,   731,   692,   692,
-     866,   705,   501,   365,   495,   512,   387,   168,   39,    2,
-     517,   240,   154,   86,   64,   127,   19,
-     461,   247,
-  },
-  {  474,   117,   43,   42,    9,    1,    1,   560,   40,
-     783,   783,   759,   760,
-     883,   698,   466,   318,   404,   377,   215,   66,    7,    1,
-     559,   259,   176,   110,   87,   170,   22,
-     520,   278,
-  },
-  {  582,   149,   53,   53,   12,    2,    1,   473,   39,
-     992,   993,   712,   713,
-     792,   593,   373,   257,   299,   237,   114,   25,    1,    1,
-     710,   329,   221,   143,   116,   226,   26,
-     490,   259,
-  },
-  {  744,   210,   78,   77,   16,    2,    1,   417,   37,
-     1034,  1035,   728,   728,
-     718,   509,   296,   175,   184,   122,   42,    3,    1,    1,
-     791,   363,   255,   168,   145,   311,   35,
-     492,   272,
-  },
-  {  913,   291,   121,   128,   28,    4,    1,   334,   40,
-     1083,  1084,   711,   712,
-     624,   378,   191,   107,   95,   50,    7,    1,    1,    1,
-     876,   414,   288,   180,   164,   382,   39,
-     469,   275,
-  },
-  {  1065,   405,   184,   216,   53,    8,    1,   236,   36,
-     1134,  1134,   685,   686,
-     465,   253,   113,   48,   41,    9,    1,    1,    1,    1,
-     965,   451,   309,   179,   166,   429,   53,
-     414,   249,
-  },
-  {  1148,   548,   301,   438,   160,   42,    6,   84,   17,
-     1222,  1223,   574,   575,
-     272,   111,   23,    6,    2,    1,    1,    1,    1,    1,
-     1060,   502,   328,   159,   144,   501,   54,
-     302,   183,
-  },
-  /* AC Inter bias group 2 tables */
-  {  403,   80,   24,   17,    1,    1,    1,   480,   90,
-     899,   899,   820,   819,
-     667,   413,   228,   133,   139,   98,   42,   10,    1,    1,
-     865,   470,   316,   222,   171,   419,   213,
-     645,   400,
-  },
-  {  698,   169,   59,   49,    6,    1,    1,   414,   101,
-     894,   893,   761,   761,
-     561,   338,   171,   96,   97,   64,   26,    6,    1,    1,
-     896,   494,   343,   239,   192,   493,   215,
-     583,   366,
-  },
-  {  914,   255,   94,   80,   10,    1,    1,   345,   128,
-     935,   935,   670,   671,
-     415,   222,   105,   55,   51,   30,   10,    1,    1,    1,
-     954,   530,   377,   274,   232,   641,   295,
-     456,   298,
-  },
-  {  1103,   359,   146,   135,   20,    1,    1,   235,   119,
-     1042,  1042,   508,   507,
-     293,   146,   65,   33,   30,   16,    4,    1,    1,    1,
-     1031,   561,   407,   296,   265,   813,   317,
-     301,   192,
-  },
-  {  1255,   504,   238,   265,   51,    5,    1,   185,   113,
-     1013,  1013,   437,   438,
-     212,   92,   41,   18,   15,    6,    1,    1,    1,    1,
-     976,   530,   386,   276,   260,   927,   357,
-     224,   148,
-  },
-  {  1292,   610,   332,   460,   127,   16,    1,   136,   99,
-     1014,  1015,   384,   384,
-     153,   65,   25,   11,    6,    1,    1,    1,    1,    1,
-     942,   487,   343,   241,   238,   970,   358,
-     174,   103,
-  },
-  {  1219,   655,   407,   700,   280,   55,    2,   100,   60,
-     1029,  1029,   337,   336,
-     119,   43,   11,    3,    2,    1,    1,    1,    1,    1,
-     894,   448,   305,   199,   213,  1005,   320,
-     136,   77,
-  },
-  {  1099,   675,   435,   971,   581,   168,   12,   37,   16,
-     1181,  1081,   319,   318,
-     66,   11,    6,    1,    1,    1,    1,    1,    1,    1,
-     914,   370,   235,   138,   145,   949,   128,
-     94,   41,
-  },
-
-  /* AC Intra bias group 3 tables */
-  {  486,   112,   39,   34,    6,    1,    1,   541,   67,
-     819,   818,   762,   763,
-     813,   643,   403,   280,   332,   295,   164,   53,    6,    1,
-     632,   294,   180,   131,   105,   208,   109,
-     594,   295,
-  },
-  {  723,   191,   69,   65,   12,    1,    1,   445,   79,
-     865,   865,   816,   816,
-     750,   515,   290,   172,   184,   122,   46,    5,    1,    1,
-     740,   340,   213,   165,   129,   270,   168,
-     603,   326,
-  },
-  {  884,   264,   102,   103,   21,    3,    1,   382,   68,
-     897,   897,   836,   836,
-     684,   427,   227,   119,   119,   70,   16,    1,    1,    1,
-     771,   367,   234,   184,   143,   272,   178,
-     555,   326,
-  },
-  {  1028,   347,   153,   161,   36,    8,    1,   251,   44,
-     1083,  1084,   735,   735,
-     541,   289,   144,   77,   57,   23,    3,    1,    1,    1,
-     926,   422,   270,   215,   176,   301,   183,
-     443,   248,
-  },
-  {  1155,   465,   224,   264,   71,   14,    3,   174,   27,
-     1110,  1111,   730,   731,
-     429,   206,   79,   30,   19,    4,    1,    1,    1,    1,
-     929,   443,   279,   225,   194,   298,   196,
-     354,   223,
-  },
-  {  1191,   576,   296,   415,   144,   36,    8,   114,   16,
-     1162,  1162,   749,   749,
-     338,   108,   29,    8,    5,    1,    1,    1,    1,    1,
-     947,   458,   273,   207,   194,   248,   145,
-     258,   152,
-  },
-  {  1169,   619,   366,   603,   247,   92,   23,   46,    1,
-     1236,  1236,   774,   775,
-     191,   35,   14,    1,    1,    1,    1,    1,    1,    1,
-     913,   449,   260,   214,   194,   180,   82,
-     174,   98,
-  },
-  {  1006,   537,   381,   897,   504,   266,   101,   39,    1,
-     1307,  1307,   668,   667,
-     116,    3,    1,    1,    1,    1,    1,    1,    1,    1,
-     1175,   261,   295,   70,   164,   107,   31,
-     10,   76,
-  },
-  /* AC Inter bias group 3 tables */
-  {  652,   156,   53,   43,    5,    1,    1,   368,   128,
-     983,   984,   825,   825,
-     583,   331,   163,   88,   84,   48,   15,    1,    1,    1,
-     870,   480,   316,   228,   179,   421,   244,
-     562,   349,
-  },
-  {  988,   280,   104,   87,   12,    1,    1,   282,   194,
-     980,   981,   738,   739,
-     395,   189,   80,   37,   31,   12,    2,    1,    1,    1,
-     862,   489,   333,   262,   214,   600,   446,
-     390,   260,
-  },
-  {  1176,   399,   165,   154,   24,    2,    1,   218,   224,
-     1017,  1018,   651,   651,
-     280,   111,   42,   16,    9,    3,    1,    1,    1,    1,
-     787,   469,   324,   269,   229,   686,   603,
-     267,   194,
-  },
-  {  1319,   530,   255,   268,   47,    4,    1,   113,   183,
-     1149,  1150,   461,   461,
-     173,   58,   17,    5,    3,    1,    1,    1,    1,    1,
-     768,   450,   305,   261,   221,   716,   835,
-     136,   97,
-  },
-  {  1362,   669,   355,   465,   104,    9,    1,   76,   153,
-     1253,  1253,   398,   397,
-     102,   21,    5,    1,    1,    1,    1,    1,    1,    1,
-     596,   371,   238,   228,   196,   660,   954,
-     68,   53,
-  },
-  {  1354,   741,   446,   702,   174,   15,    1,   38,   87,
-     1498,  1498,   294,   294,
-     43,    7,    1,    1,    1,    1,    1,    1,    1,    1,
-     381,   283,   165,   181,   155,   544,  1039,
-     25,   21,
-  },
-  {  1262,   885,   546,   947,   263,   18,    1,   18,   27,
-     1908,  1908,   163,   162,
-     14,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-     195,   152,   83,   125,   109,   361,   827,
-     7,    5,
-  },
-  {  2539,   951,   369,   554,   212,   18,    1,    1,    1,
-     2290,  2289,   64,   64,
-     1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-     18,   18,    9,   55,   36,   184,   323,
-     1,    1,
-  },
-
-
-  /* AC Intra bias group 4 tables */
-  {  921,   264,   101,   100,   19,    2,    1,   331,   98,
-     1015,  1016,   799,   799,
-     512,   269,   119,   60,   50,   17,    1,    1,    1,    1,
-     841,   442,   307,   222,   182,   493,   256,
-     438,   310,
-  },
-  {  1147,   412,   184,   206,   50,    6,    1,   242,   141,
-     977,   976,   808,   807,
-     377,   135,   40,   10,    7,    1,    1,    1,    1,    1,
-     788,   402,   308,   223,   205,   584,   406,
-     316,   227,
-  },
-  {  1243,   504,   238,   310,   79,   11,    1,   184,   150,
-     983,   984,   814,   813,
-     285,   56,   10,    1,    1,    1,    1,    1,    1,    1,
-     713,   377,   287,   217,   180,   615,   558,
-     208,   164,
-  },
-  {  1266,   606,   329,   484,   161,   27,    1,   79,   92,
-     1187,  1188,   589,   588,
-     103,   10,    1,    1,    1,    1,    1,    1,    1,    1,
-     680,   371,   278,   221,   244,   614,   728,
-     80,   62,
-  },
-  {  1126,   828,   435,   705,   443,   90,    8,   10,   55,
-     1220,  1219,   350,   350,
-     28,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-     602,   330,   222,   168,   158,   612,   919,
-     104,    5,
-  },
-  {  1210,   506,  1014,   926,   474,   240,    4,    1,    44,
-     1801,  1801,   171,   171,
-     1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-     900,   132,    36,    11,    47,   191,   316,
-     2,    1,
-  },
-  {  1210,   506,  1014,   926,   474,   240,    4,    1,    44,
-     1801,  1801,   171,   171,
-     1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-     900,   132,    36,    11,    47,   191,   316,
-     2,    1,
-  },
-  {  1210,   506,  1014,   926,   474,   240,    4,    1,    44,
-     1801,  1801,   171,   171,
-     1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-     900,   132,    36,    11,    47,   191,   316,
-     2,    1,
-  },
-  /* AC Inter bias group 4 tables */
-  {  1064,   325,   129,   117,    20,    2,    1,   266,   121,
-     1000,  1000,   706,   706,
-     348,   162,    67,    32,    25,    11,    1,    1,    1,    1,
-     876,   513,   363,   274,   225,   627,   384,
-     370,   251,
-  },
-  {  1311,   517,   238,   254,    45,    3,    1,   188,   160,
-     1070,  1070,   635,   635,
-     239,    85,    30,    11,    6,    1,    1,    1,    1,    1,
-     744,   420,   313,   239,   206,   649,   541,
-     221,   155,
-  },
-  {  1394,   632,   322,   385,    78,    7,    1,   134,   152,
-     1163,  1164,   607,   607,
-     185,    51,    12,    3,    1,    1,    1,    1,    1,    1,
-     631,   331,   275,   203,   182,   604,   620,
-     146,    98,
-  },
-  {  1410,   727,   407,   546,   146,    19,    1,    67,    88,
-     1485,  1486,   419,   418,
-     103,    18,    3,    1,    1,    1,    1,    1,    1,    1,
-     555,   261,   234,   164,   148,   522,   654,
-      67,    39,
-  },
-  {  1423,   822,   492,   719,   216,    22,    1,    28,    59,
-     1793,  1793,   323,   324,
-     37,    2,    1,    1,    1,    1,    1,    1,    1,    1,
-     376,   138,   158,   102,   119,   400,   604,
-     28,    9,
-  },
-  {  1585,   923,   563,   918,   207,    25,    1,    5,    20,
-     2229,  2230,   172,   172,
-     7,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-     191,    40,    56,    22,    65,   243,   312,
-     2,    1,
-  },
-  {  2225,  1100,   408,   608,   133,    8,    1,    1,    1,
-     2658,  2658,    25,    24,
-     1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-     8,    1,    1,    1,    1,   125,    16,
-     1,    1,
-  },
-  {  2539,   951,   369,   554,   212,    18,    1,    1,    1,
-     2290,  2289,    64,    64,
-     1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
-     18,    18,    9,    55,    36,   184,   323,
-     1,    1,
-  },
-};
-
-#endif /* NEW_FREQS */

Modified: branches/theora-thusnelda/lib/enc/mathops.c
===================================================================
--- branches/theora-thusnelda/lib/enc/mathops.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/mathops.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -147,7 +147,7 @@
   0x2E2A8ECA5705FC2FLL,0x2E2A8ECA5705FC2FLL
 };
 
-/*Computes the binary exponential of _log2, a log base 2 in Q57 format.*/
+/*Computes the binary exponential of _z, a log base 2 in Q57 format.*/
 ogg_int64_t oc_bexp64(ogg_int64_t _z){
   ogg_int64_t w;
   ogg_int64_t z;

Modified: branches/theora-thusnelda/lib/enc/mcenc.c
===================================================================
--- branches/theora-thusnelda/lib/enc/mcenc.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/mcenc.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -14,11 +14,10 @@
   last mod: $Id$
 
  ********************************************************************/
-
 #include <stdlib.h>
 #include <limits.h>
 #include <string.h>
-#include "codec_internal.h"
+#include "encint.h"
 
 /*The maximum Y plane SAD value for accepting the median predictor.*/
 #define OC_YSAD_THRESH1            (256)
@@ -71,31 +70,44 @@
 };
 
 
-static void oc_mcenc_find_candidates(CP_INSTANCE *cpi,mc_state *_mcenc,
- int _mbi,int _goldenp){
-  macroblock_t *nemb;
-  macroblock_t *emb;
-  ogg_int32_t   mvapw1;
-  ogg_int32_t   mvapw2;
-  int           a[3][2];
-  int           ncandidates;
-  int           i;
-  emb=cpi->macro+_mbi;
-  if(emb->ncneighbors>0){
+void oc_mcenc_start(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc){
+  ogg_int64_t nframes;
+  /*Set up the accelerated MV weights for previous frame prediction.*/
+  _mcenc->mvapw1[OC_FRAME_PREV]=(ogg_int32_t)1<<17;
+  _mcenc->mvapw2[OC_FRAME_PREV]=(ogg_int32_t)1<<16;
+  /*Set up the accelerated MV weights for golden frame prediction.*/
+  nframes=_enc->state.curframe_num-_enc->state.keyframe_num;
+  _mcenc->mvapw1[OC_FRAME_GOLD]=(ogg_int32_t)(
+   nframes!=1?(nframes<<17)/(nframes-1):0);
+  _mcenc->mvapw2[OC_FRAME_GOLD]=(ogg_int32_t)(
+   nframes!=2?(nframes<<16)/(nframes-2):0);
+}
+
+static void oc_mcenc_find_candidates(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
+ int _mbi,int _frame){
+  oc_mb_enc_info *embs;
+  ogg_int32_t     mvapw1;
+  ogg_int32_t     mvapw2;
+  int             a[3][2];
+  int             ncandidates;
+  unsigned        nmbi;
+  int             i;
+  embs=_enc->mb_info;
+  if(embs[_mbi].ncneighbors>0){
     /*Fill in the first part of set A: the last motion vectors used and the
        vectors from adjacent blocks.*/
     /*Skip a position to store the median predictor in.*/
     ncandidates=1;
-    for(i=0;i<emb->ncneighbors;i++){
-      nemb=cpi->macro+emb->cneighbors[i];
-      _mcenc->candidates[ncandidates][0]=nemb->analysis_mv[0][_goldenp][0];
-      _mcenc->candidates[ncandidates][1]=nemb->analysis_mv[0][_goldenp][1];
+    for(i=0;i<embs[_mbi].ncneighbors;i++){
+      nmbi=embs[_mbi].cneighbors[i];
+      _mcenc->candidates[ncandidates][0]=embs[nmbi].analysis_mv[0][_frame][0];
+      _mcenc->candidates[ncandidates][1]=embs[nmbi].analysis_mv[0][_frame][1];
       ncandidates++;
     }
     /*Add a few additional vectors to set A: the vector used in the
        previous frame and the (0,0) vector.*/
-    _mcenc->candidates[ncandidates][0]=emb->analysis_mv[1][_goldenp][0];
-    _mcenc->candidates[ncandidates][1]=emb->analysis_mv[1][_goldenp][1];
+    _mcenc->candidates[ncandidates][0]=embs[_mbi].analysis_mv[1][_frame][0];
+    _mcenc->candidates[ncandidates][1]=embs[_mbi].analysis_mv[1][_frame][1];
     ncandidates++;
     _mcenc->candidates[ncandidates][0]=0;
     _mcenc->candidates[ncandidates][1]=0;
@@ -115,30 +127,30 @@
   else{
     /*The upper-left most macro block has no neighbors at all
       We just use 0,0 as the median predictor and its previous motion vector
-      for set A.*/
+       for set A.*/
     _mcenc->candidates[0][0]=0;
     _mcenc->candidates[0][1]=1;
-    _mcenc->candidates[1][0]=emb->analysis_mv[1][_goldenp][0];
-    _mcenc->candidates[1][1]=emb->analysis_mv[1][_goldenp][1];
+    _mcenc->candidates[1][0]=embs[_mbi].analysis_mv[1][_frame][0];
+    _mcenc->candidates[1][1]=embs[_mbi].analysis_mv[1][_frame][1];
     ncandidates=2;
   }
   /*Fill in set B: accelerated predictors for this and adjacent macro
      blocks.*/
   _mcenc->setb0=ncandidates;
-  mvapw1=_mcenc->mvapw1[_goldenp];
-  mvapw2=_mcenc->mvapw2[_goldenp];
+  mvapw1=_mcenc->mvapw1[_frame];
+  mvapw2=_mcenc->mvapw2[_frame];
   /*The first time through the loop use the current macro block.*/
-  nemb=emb;
+  nmbi=_mbi;
   for(i=0;;i++){
     _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,
-     OC_DIV_POW2_RE(nemb->analysis_mv[1][_goldenp][0]*mvapw1
-     -nemb->analysis_mv[2][_goldenp][0]*mvapw2,16),31);
+     OC_DIV_POW2_RE(embs[nmbi].analysis_mv[1][_frame][0]*mvapw1
+     -embs[nmbi].analysis_mv[2][_frame][0]*mvapw2,16),31);
     _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,
-     OC_DIV_POW2_RE(nemb->analysis_mv[1][_goldenp][1]*mvapw1
-     -nemb->analysis_mv[2][_goldenp][1]*mvapw2,16),31);
+     OC_DIV_POW2_RE(embs[nmbi].analysis_mv[1][_frame][1]*mvapw1
+     -embs[nmbi].analysis_mv[2][_frame][1]*mvapw2,16),31);
     ncandidates++;
-    if(i>=emb->npneighbors)break;
-    nemb=cpi->macro+emb->pneighbors[i];
+    if(i>=embs[_mbi].npneighbors)break;
+    nmbi=embs[_mbi].pneighbors[i];
   }
   /*Truncate to full-pel positions.*/
   for(i=0;i<ncandidates;i++){
@@ -149,328 +161,84 @@
 }
 
 #if 0
-static int oc_sad16_halfpel(CP_INSTANCE *cpi,int mbi,
- int _mvoffset0,int _mvoffset1,int _goldenp,int _best_err){
-  macroblock_t *mb;
-  int           err;
-  int           bi;
-  mb=cpi->macro+mbi;
+static unsigned oc_sad16_halfpel(const oc_enc_ctx *_enc,
+ const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],
+ int _mvoffset0,int _mvoffset1,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _best_err){
+  unsigned err;
+  int      bi;
   err=0;
   for(bi=0;bi<4;bi++){
-    const unsigned char *cur;
-    const unsigned char *ref;
-    ogg_uint32_t         base_offset;
-    int                  fi;
-    fi=mb->Ryuv[0][bi];
-    base_offset=cpi->frag_buffer_index[fi];
-    cur=cpi->frame+base_offset;
-    ref=(_goldenp?cpi->golden:cpi->lastrecon)+base_offset;
-    err+=oc_enc_frag_sad2_thresh(cpi,cur,
-     ref+_mvoffset0,ref+_mvoffset1,cpi->stride[0],_best_err-err);
+    ptrdiff_t frag_offs;
+    frag_offs=_frag_buf_offs[_fragis[bi]];
+    err+=oc_enc_frag_sad2_thresh(_enc,_src+frag_offs,_ref+frag_offs+_mvoffset0,
+     _ref+frag_offs+_mvoffset1,_ystride,_best_err-err);
   }
   return err;
 }
 #endif
 
-static int oc_satd16_halfpel(CP_INSTANCE *cpi,int mbi,
- int _mvoffset0,int _mvoffset1,int _goldenp,int _best_err){
-  macroblock_t *mb;
-  int           err;
-  int           bi;
-  mb=cpi->macro+mbi;
+static unsigned oc_satd16_halfpel(const oc_enc_ctx *_enc,
+ const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],
+ int _mvoffset0,int _mvoffset1,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _best_err){
+  unsigned err;
+  int      bi;
   err=0;
   for(bi=0;bi<4;bi++){
-    const unsigned char *cur;
-    const unsigned char *ref;
-    ogg_uint32_t         base_offset;
-    int                  fi;
-    fi=mb->Ryuv[0][bi];
-    base_offset=cpi->frag_buffer_index[fi];
-    cur=cpi->frame+base_offset;
-    ref=(_goldenp?cpi->golden:cpi->lastrecon)+base_offset;
-    err+=oc_enc_frag_satd2_thresh(cpi,cur,
-     ref+_mvoffset0,ref+_mvoffset1,cpi->stride[0],_best_err-err);
+    ptrdiff_t frag_offs;
+    frag_offs=_frag_buf_offs[_fragis[bi]];
+    err+=oc_enc_frag_satd2_thresh(_enc,_src+frag_offs,_ref+frag_offs+_mvoffset0,
+     _ref+frag_offs+_mvoffset1,_ystride,_best_err-err);
   }
   return err;
 }
 
-static int oc_mcenc_ysad_check_mbcandidate_fullpel(CP_INSTANCE *cpi,
- int _mbi,int _dx,int _dy,int _goldenp,int _block_err[4]){
-  int           stride;
-  int           mvoffset;
-  int           err;
-  int           bi;
-  macroblock_t *mb;
-  mb=cpi->macro+_mbi;
-  /*TODO: customize error function for speed/(quality+size) tradeoff.*/
-  stride=cpi->stride[0];
-  mvoffset=_dx+_dy*stride;
+static unsigned oc_mcenc_ysad_check_mbcandidate_fullpel(const oc_enc_ctx *_enc,
+ const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],int _dx,int _dy,
+ const unsigned char *_src,const unsigned char *_ref,int _ystride,
+ unsigned _block_err[4]){
+  unsigned err;
+  int      mvoffset;
+  int      bi;
+  mvoffset=_dx+_dy*_ystride;
   err=0;
   for(bi=0;bi<4;bi++){
-    const unsigned char *cur;
-    const unsigned char *ref;
-    ogg_uint32_t         base_offset;
-    int                  fi;
-    fi=mb->Ryuv[0][bi];
-    base_offset=cpi->frag_buffer_index[fi];
-    cur=cpi->frame+base_offset;
-    ref=(_goldenp?cpi->golden:cpi->lastrecon)+base_offset;
-    _block_err[bi]=oc_enc_frag_sad(cpi,cur,ref+mvoffset,stride);
-    err+=_block_err[bi];
+    ptrdiff_t frag_offs;
+    unsigned  block_err;
+    frag_offs=_frag_buf_offs[_fragis[bi]];
+    block_err=oc_enc_frag_sad(_enc,
+     _src+frag_offs,_ref+frag_offs+mvoffset,_ystride);
+    _block_err[bi]=block_err;
+    err+=block_err;
   }
   return err;
 }
 
-static int oc_mcenc_ysatd_check_mbcandidate_fullpel(CP_INSTANCE *cpi,
- int _mbi,int _dx,int _dy,int _goldenp){
-  int           stride;
-  int           mvoffset;
-  int           err;
-  int           bi;
-  macroblock_t *mb;
-  mb=cpi->macro+_mbi;
-  stride=cpi->stride[0];
-  mvoffset=_dx+_dy*stride;
+static int oc_mcenc_ysatd_check_mbcandidate_fullpel(const oc_enc_ctx *_enc,
+ const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],int _dx,int _dy,
+ const unsigned char *_src,const unsigned char *_ref,int _ystride){
+  int mvoffset;
+  int err;
+  int bi;
+  mvoffset=_dx+_dy*_ystride;
   err=0;
   for(bi=0;bi<4;bi++){
-    const unsigned char *cur;
-    const unsigned char *ref;
-    ogg_uint32_t         base_offset;
-    int                  fi;
-    fi=mb->Ryuv[0][bi];
-    base_offset=cpi->frag_buffer_index[fi];
-    cur=cpi->frame+base_offset;
-    ref=(_goldenp?cpi->golden:cpi->lastrecon)+base_offset;
-    err+=oc_enc_frag_satd_thresh(cpi,cur,ref+mvoffset,stride,0xFF000);
+    ptrdiff_t frag_offs;
+    frag_offs=_frag_buf_offs[_fragis[bi]];
+    err+=oc_enc_frag_satd_thresh(_enc,
+     _src+frag_offs,_ref+frag_offs+mvoffset,_ystride,UINT_MAX);
   }
   return err;
 }
 
-static int oc_mcenc_ysatd_check_bcandidate_fullpel(CP_INSTANCE *cpi,
- int _mbi,int _bi,int _dx,int _dy,int _goldenp){
-  macroblock_t        *mb;
-  const unsigned char *cur;
-  const unsigned char *ref;
-  ogg_uint32_t         base_offset;
-  int                  stride;
-  int                  mvoffset;
-  int                  fi;
-  mb=cpi->macro+_mbi;
-  stride=cpi->stride[0];
-  mvoffset=_dx+_dy*stride;
-  fi=mb->Ryuv[0][_bi];
-  base_offset=cpi->frag_buffer_index[fi];
-  cur=cpi->frame+base_offset;
-  ref=(_goldenp?cpi->golden:cpi->lastrecon)+base_offset;
-  return oc_enc_frag_satd_thresh(cpi,cur,ref+mvoffset,stride,0xFF000);
+static unsigned oc_mcenc_ysatd_check_bcandidate_fullpel(const oc_enc_ctx *_enc,
+ ptrdiff_t _frag_offs,int _dx,int _dy,
+ const unsigned char *_src,const unsigned char *_ref,int _ystride){
+  return oc_enc_frag_satd_thresh(_enc,
+   _src+_frag_offs,_ref+_frag_offs+_dx+_dy*_ystride,_ystride,UINT_MAX);
 }
 
-#if 0
-static int oc_mcenc_ysad_halfpel_mbrefine(CP_INSTANCE *cpi,int _mbi,
- int _vec[2],int _best_err,int _goldenp){
-  int offset_y[9];
-  int stride;
-  int mvoffset_base;
-  int best_site;
-  int sitei;
-  int err;
-  stride=cpi->stride[0];
-  mvoffset_base=_vec[0]+_vec[1]*stride;
-  offset_y[0]=offset_y[1]=offset_y[2]=-stride;
-  offset_y[3]=offset_y[5]=0;
-  offset_y[6]=offset_y[7]=offset_y[8]=stride;
-  best_site=4;
-  for(sitei=0;sitei<8;sitei++){
-    int site;
-    int xmask;
-    int ymask;
-    int dx;
-    int dy;
-    int mvoffset0;
-    int mvoffset1;
-    site=OC_SQUARE_SITES[0][sitei];
-    dx=OC_SQUARE_DX[site];
-    dy=OC_SQUARE_DY[site];
-    /*The following code SHOULD be equivalent to
-        oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1,
-         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0);
-      However, it should also be much faster, as it involves no multiplies and
-       doesn't have to handle chroma vectors.*/
-    xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
-    ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
-    mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
-    mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
-    err=oc_sad16_halfpel(cpi,_mbi,mvoffset0,mvoffset1,_goldenp,_best_err);
-    if(err<_best_err){
-      _best_err=err;
-      best_site=site;
-    }
-  }
-  _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
-  _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
-  return _best_err;
-}
-#endif
-
-static int oc_mcenc_ysatd_halfpel_mbrefine(CP_INSTANCE *cpi,int _mbi,
- int _vec[2],int _best_err,int _goldenp){
-  int offset_y[9];
-  int stride;
-  int mvoffset_base;
-  int best_site;
-  int sitei;
-  int err;
-  stride=cpi->stride[0];
-  mvoffset_base=_vec[0]+_vec[1]*stride;
-  offset_y[0]=offset_y[1]=offset_y[2]=-stride;
-  offset_y[3]=offset_y[5]=0;
-  offset_y[6]=offset_y[7]=offset_y[8]=stride;
-  best_site=4;
-  for(sitei=0;sitei<8;sitei++){
-    int site;
-    int xmask;
-    int ymask;
-    int dx;
-    int dy;
-    int mvoffset0;
-    int mvoffset1;
-    site=OC_SQUARE_SITES[0][sitei];
-    dx=OC_SQUARE_DX[site];
-    dy=OC_SQUARE_DY[site];
-    /*The following code SHOULD be equivalent to
-        oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1,
-         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0);
-      However, it should also be much faster, as it involves no multiplies and
-       doesn't have to handle chroma vectors.*/
-    xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
-    ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
-    mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
-    mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
-    err=oc_satd16_halfpel(cpi,_mbi,mvoffset0,mvoffset1,_goldenp,_best_err);
-    if(err<_best_err){
-      _best_err=err;
-      best_site=site;
-    }
-  }
-  _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
-  _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
-  return _best_err;
-}
-
-#if 0
-static int oc_mcenc_ysad_halfpel_brefine(CP_INSTANCE *cpi,int _mbi,
- int _bi,int _vec[2],int _best_err,int _goldenp){
-  macroblock_t *mb;
-  int           offset_y[9];
-  int           stride;
-  int           mvoffset_base;
-  int           best_site;
-  int           sitei;
-  int           err;
-  int           fi;
-  mb=cpi->macro+_mbi;
-  stride=cpi->stride[0];
-  fi=mb->Ryuv[0][_bi];
-  mvoffset_base=_vec[0]+_vec[1]*stride;
-  offset_y[0]=offset_y[1]=offset_y[2]=-stride;
-  offset_y[3]=offset_y[5]=0;
-  offset_y[6]=offset_y[7]=offset_y[8]=stride;
-  best_site=4;
-  for(sitei=0;sitei<8;sitei++){
-    ogg_uint32_t         base_offset;
-    const unsigned char *cur;
-    const unsigned char *ref;
-    int                  site;
-    int                  xmask;
-    int                  ymask;
-    int                  dx;
-    int                  dy;
-    int                  mvoffset0;
-    int                  mvoffset1;
-    base_offset=cpi->frag_buffer_index[fi];
-    cur=cpi->frame+base_offset;
-    ref=(_goldenp?cpi->golden:cpi->lastrecon)+base_offset;
-    site=OC_SQUARE_SITES[0][sitei];
-    dx=OC_SQUARE_DX[site];
-    dy=OC_SQUARE_DY[site];
-    /*The following code SHOULD be equivalent to
-        oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1,
-         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0);
-      However, it should also be much faster, as it involves no multiplies and
-       doesn't have to handle chroma vectors.*/
-    xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
-    ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
-    mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
-    mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
-    err=oc_enc_frag_sad2_thresh(cpi,cur,
-     ref+mvoffset0,ref+mvoffset1,stride,_best_err);
-    if(err<_best_err){
-      _best_err=err;
-      best_site=site;
-    }
-  }
-  _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
-  _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
-  return _best_err;
-}
-#endif
-
-static int oc_mcenc_ysatd_halfpel_brefine(CP_INSTANCE *cpi,int _mbi,
- int _bi,int _vec[2],int _best_err,int _goldenp){
-  macroblock_t *mb;
-  int           offset_y[9];
-  int           stride;
-  int           mvoffset_base;
-  int           best_site;
-  int           sitei;
-  int           err;
-  int           fi;
-  mb=cpi->macro+_mbi;
-  stride=cpi->stride[0];
-  fi=mb->Ryuv[0][_bi];
-  mvoffset_base=_vec[0]+_vec[1]*stride;
-  offset_y[0]=offset_y[1]=offset_y[2]=-stride;
-  offset_y[3]=offset_y[5]=0;
-  offset_y[6]=offset_y[7]=offset_y[8]=stride;
-  best_site=4;
-  for(sitei=0;sitei<8;sitei++){
-    ogg_uint32_t         base_offset;
-    const unsigned char *cur;
-    const unsigned char *ref;
-    int                  site;
-    int                  xmask;
-    int                  ymask;
-    int                  dx;
-    int                  dy;
-    int                  mvoffset0;
-    int                  mvoffset1;
-    base_offset=cpi->frag_buffer_index[fi];
-    cur=cpi->frame+base_offset;
-    ref=(_goldenp?cpi->golden:cpi->lastrecon)+base_offset;
-    site=OC_SQUARE_SITES[0][sitei];
-    dx=OC_SQUARE_DX[site];
-    dy=OC_SQUARE_DY[site];
-    /*The following code SHOULD be equivalent to
-        oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1,
-         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0);
-      However, it should also be much faster, as it involves no multiplies and
-       doesn't have to handle chroma vectors.*/
-    xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
-    ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
-    mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
-    mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
-    err=oc_enc_frag_satd2_thresh(cpi,cur,
-     ref+mvoffset0,ref+mvoffset1,stride,_best_err);
-    if(err<_best_err){
-      _best_err=err;
-      best_site=site;
-    }
-  }
-  _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
-  _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
-  return _best_err;
-}
-
 /*Perform a motion vector search for this macro block against a single
    reference frame.
   As a bonus, individual block motion vectors are computed as well, as much of
@@ -479,10 +247,8 @@
    oc_mb_enc_info structure.
   _mcenc:    The motion compensation context.
   _mbi:      The macro block index.
-  _frame:    The frame to search, either OC_FRAME_PREV or OC_FRAME_GOLD.
-  _bmvs:     Returns the individual block motion vectors.*/
-void oc_mcenc_search(CP_INSTANCE *cpi,mc_state *_mcenc,int _mbi,
- int _goldenp,oc_mv _bmvs[4],int *_best_err,int _best_block_err[4]){
+  _frame:    The frame to search, either OC_FRAME_PREV or OC_FRAME_GOLD.*/
+void oc_mcenc_search(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,int _mbi,int _frame){
   /*Note: Traditionally this search is done using a rate-distortion objective
      function of the form D+lambda*R.
     However, xiphmont tested this and found it produced a small degredation,
@@ -498,47 +264,60 @@
      may cause increased degredation in many blocks to come.
     We could artificially reduce lambda to compensate, but it's faster to just
      disable it entirely, and use D (the distortion) as the sole criterion.*/
-  macroblock_t *mb;
-  ogg_int32_t   hit_cache[31];
-  ogg_int32_t   hitbit;
-  int           block_err[4];
-  int           best_vec[2];
-  int           best_err;
-  int           best_block_vec[4][2];
-  int           candx;
-  int           candy;
-  int           bi;
-  mb=cpi->macro+_mbi;
+  const ptrdiff_t       *frag_buf_offs;
+  const ptrdiff_t       *fragis;
+  const unsigned char   *src;
+  const unsigned char   *ref;
+  int                    ystride;
+  oc_mb_enc_info        *embs;
+  ogg_int32_t            hit_cache[31];
+  ogg_int32_t            hitbit;
+  unsigned               best_block_err[4];
+  unsigned               block_err[4];
+  unsigned               best_err;
+  int                    best_vec[2];
+  int                    best_block_vec[4][2];
+  int                    candx;
+  int                    candy;
+  int                    bi;
+  embs=_enc->mb_info;
   /*Find some candidate motion vectors.*/
-  oc_mcenc_find_candidates(cpi,_mcenc,_mbi,_goldenp);
+  oc_mcenc_find_candidates(_enc,_mcenc,_mbi,_frame);
   /*Clear the cache of locations we've examined.*/
   memset(hit_cache,0,sizeof(hit_cache));
   /*Start with the median predictor.*/
   candx=_mcenc->candidates[0][0];
   candy=_mcenc->candidates[0][1];
   hit_cache[candy+15]|=(ogg_int32_t)1<<candx+15;
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  fragis=_enc->state.mb_maps[_mbi][0];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]];
+  ystride=_enc->state.ref_ystride[0];
   /*TODO: customize error function for speed/(quality+size) tradeoff.*/
-  best_err=oc_mcenc_ysad_check_mbcandidate_fullpel(cpi,
-   _mbi,candx,candy,_goldenp,block_err);
+  best_err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
+   frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
   best_vec[0]=candx;
   best_vec[1]=candy;
-  if(_bmvs){
+  if(_frame==OC_FRAME_PREV){
     for(bi=0;bi<4;bi++){
-      _best_block_err[bi]=block_err[bi];
+      best_block_err[bi]=block_err[bi];
       best_block_vec[bi][0]=candx;
       best_block_vec[bi][1]=candy;
     }
   }
   /*If this predictor fails, move on to set A.*/
   if(best_err>OC_YSAD_THRESH1){
-    int err;
-    int ci;
-    int ncs;
-    int t2;
+    unsigned err;
+    unsigned t2;
+    int      ncs;
+    int      ci;
     /*Compute the early termination threshold for set A.*/
-    t2=mb->aerror;
-    ncs=OC_MINI(3,mb->ncneighbors);
-    for(ci=0;ci<ncs;ci++)t2=OC_MAXI(t2,cpi->macro[mb->cneighbors[ci]].aerror);
+    t2=embs[_mbi].error[_frame];
+    ncs=OC_MINI(3,embs[_mbi].ncneighbors);
+    for(ci=0;ci<ncs;ci++){
+      t2=OC_MAXI(t2,embs[embs[_mbi].cneighbors[ci]].error[_frame]);
+    }
     t2+=(t2>>OC_YSAD_THRESH2_SCALE_BITS)+OC_YSAD_THRESH2_OFFSET;
     /*Examine the candidates in set A.*/
     for(ci=1;ci<_mcenc->setb0;ci++){
@@ -549,16 +328,16 @@
       hitbit=(ogg_int32_t)1<<candx+15;
       if(hit_cache[candy+15]&hitbit)continue;
       hit_cache[candy+15]|=hitbit;
-      err=oc_mcenc_ysad_check_mbcandidate_fullpel(cpi,
-       _mbi,candx,candy,_goldenp,block_err);
+      err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
+       frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
       if(err<best_err){
         best_err=err;
         best_vec[0]=candx;
         best_vec[1]=candy;
       }
-      if(_bmvs){
-        for(bi=0;bi<4;bi++)if(block_err[bi]<_best_block_err[bi]){
-          _best_block_err[bi]=block_err[bi];
+      if(_frame==OC_FRAME_PREV){
+        for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){
+          best_block_err[bi]=block_err[bi];
           best_block_vec[bi][0]=candx;
           best_block_vec[bi][1]=candy;
         }
@@ -572,16 +351,16 @@
         hitbit=(ogg_int32_t)1<<candx+15;
         if(hit_cache[candy+15]&hitbit)continue;
         hit_cache[candy+15]|=hitbit;
-        err=oc_mcenc_ysad_check_mbcandidate_fullpel(cpi,
-         _mbi,candx,candy,_goldenp,block_err);
+        err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
+         frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
         if(err<best_err){
           best_err=err;
           best_vec[0]=candx;
           best_vec[1]=candy;
         }
-        if(_bmvs){
-          for(bi=0;bi<4;bi++)if(block_err[bi]<_best_block_err[bi]){
-            _best_block_err[bi]=block_err[bi];
+        if(_frame==OC_FRAME_PREV){
+          for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){
+            best_block_err[bi]=block_err[bi];
             best_block_vec[bi][0]=candx;
             best_block_vec[bi][1]=candy;
           }
@@ -608,15 +387,15 @@
             hitbit=(ogg_int32_t)1<<candx+15;
             if(hit_cache[candy+15]&hitbit)continue;
             hit_cache[candy+15]|=hitbit;
-            err=oc_mcenc_ysad_check_mbcandidate_fullpel(cpi,
-             _mbi,candx,candy,_goldenp,block_err);
+            err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
+             frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
             if(err<best_err){
               best_err=err;
               best_site=site;
             }
-            if(_bmvs){
-              for(bi=0;bi<4;bi++)if(block_err[bi]<_best_block_err[bi]){
-                _best_block_err[bi]=block_err[bi];
+            if(_frame==OC_FRAME_PREV){
+              for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){
+                best_block_err[bi]=block_err[bi];
                 best_block_vec[bi][0]=candx;
                 best_block_vec[bi][1]=candy;
               }
@@ -629,10 +408,10 @@
         /*Final 4-MV search.*/
         /*Simply use 1/4 of the macro block set A and B threshold as the
            individual block threshold.*/
-        if(_bmvs){
+        if(_frame==OC_FRAME_PREV){
           t2>>=2;
           for(bi=0;bi<4;bi++){
-            if(_best_block_err[bi]>t2){
+            if(best_block_err[bi]>t2){
               /*Square pattern search.
                 We do this in a slightly interesting manner.
                 We continue to check the SAD of all four blocks in the
@@ -673,15 +452,15 @@
                   hitbit=(ogg_int32_t)1<<candx+15;
                   if(hit_cache[candy+15]&hitbit)continue;
                   hit_cache[candy+15]|=hitbit;
-                  err=oc_mcenc_ysad_check_mbcandidate_fullpel(cpi,
-                   _mbi,candx,candy,_goldenp,block_err);
+                  err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
+                   frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
                   if(err<best_err){
                     best_err=err;
                     best_vec[0]=candx;
                     best_vec[1]=candy;
                   }
-                  for(bj=0;bj<4;bj++)if(block_err[bj]<_best_block_err[bj]){
-                    _best_block_err[bj]=block_err[bj];
+                  for(bj=0;bj<4;bj++)if(block_err[bj]<best_block_err[bj]){
+                    best_block_err[bj]=block_err[bj];
                     best_block_vec[bj][0]=candx;
                     best_block_vec[bj][1]=candy;
                   }
@@ -696,62 +475,258 @@
       }
     }
   }
-  if(!_goldenp)mb->aerror=best_err;
-  else mb->gerror=best_err;
+  embs[_mbi].error[_frame]=(ogg_uint16_t)best_err;
   candx=best_vec[0];
   candy=best_vec[1];
-  *_best_err=oc_mcenc_ysatd_check_mbcandidate_fullpel(cpi,
-   _mbi,candx,candy,_goldenp);
-  mb->analysis_mv[0][_goldenp][0]=(signed char)(candx<<1);
-  mb->analysis_mv[0][_goldenp][1]=(signed char)(candy<<1);
-  if(_bmvs!=NULL){
+  embs[_mbi].satd[_frame]=oc_mcenc_ysatd_check_mbcandidate_fullpel(_enc,
+   frag_buf_offs,fragis,candx,candy,src,ref,ystride);
+  embs[_mbi].analysis_mv[0][_frame][0]=(signed char)(candx<<1);
+  embs[_mbi].analysis_mv[0][_frame][1]=(signed char)(candy<<1);
+  if(_frame==OC_FRAME_PREV){
     for(bi=0;bi<4;bi++){
       candx=best_block_vec[bi][0];
       candy=best_block_vec[bi][1];
-      _best_block_err[bi]=oc_mcenc_ysatd_check_bcandidate_fullpel(cpi,
-       _mbi,bi,candx,candy,_goldenp);
-      _bmvs[bi][0]=(signed char)(candx<<1);
-      _bmvs[bi][1]=(signed char)(candy<<1);
+      embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_check_bcandidate_fullpel(_enc,
+       frag_buf_offs[fragis[bi]],candx,candy,src,ref,ystride);
+      embs[_mbi].block_mv[bi][0]=(signed char)(candx<<1);
+      embs[_mbi].block_mv[bi][1]=(signed char)(candy<<1);
     }
   }
 }
 
-void oc_mcenc_refine1mv(CP_INSTANCE *cpi,int _mbi,int _goldenp,int _err){
-  macroblock_t *mb;
-  int           vec[2];
-  mb=cpi->macro+_mbi;
-  vec[0]=OC_DIV2(mb->analysis_mv[0][_goldenp][0]);
-  vec[1]=OC_DIV2(mb->analysis_mv[0][_goldenp][1]);
-  _err=oc_mcenc_ysatd_halfpel_mbrefine(cpi,_mbi,vec,_err,_goldenp);
-  mb->analysis_mv[0][_goldenp][0]=(signed char)vec[0];
-  mb->analysis_mv[0][_goldenp][1]=(signed char)vec[1];
-  if(!_goldenp)mb->asatd=_err;
-  else mb->gsatd=_err;
+#if 0
+static int oc_mcenc_ysad_halfpel_mbrefine(const oc_enc_ctx *_enc,int _mbi,
+ int _vec[2],int _best_err,int _frame){
+  const unsigned char *src;
+  const unsigned char *ref;
+  const ptrdiff_t     *frag_buf_offs;
+  const ptrdiff_t     *fragis;
+  int                  offset_y[9];
+  int                  ystride;
+  int                  mvoffset_base;
+  int                  best_site;
+  int                  sitei;
+  int                  err;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]];
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  fragis=_enc->state.mb_maps[_mbi][0];
+  ystride=_enc->state.ref_ystride[0];
+  mvoffset_base=_vec[0]+_vec[1]*ystride;
+  offset_y[0]=offset_y[1]=offset_y[2]=-ystride;
+  offset_y[3]=offset_y[5]=0;
+  offset_y[6]=offset_y[7]=offset_y[8]=ystride;
+  best_site=4;
+  for(sitei=0;sitei<8;sitei++){
+    int site;
+    int xmask;
+    int ymask;
+    int dx;
+    int dy;
+    int mvoffset0;
+    int mvoffset1;
+    site=OC_SQUARE_SITES[0][sitei];
+    dx=OC_SQUARE_DX[site];
+    dy=OC_SQUARE_DY[site];
+    /*The following code SHOULD be equivalent to
+        oc_state_get_mv_offsets(&_enc->state,&mvoffset0,&mvoffset1,
+         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ystride,0);
+      However, it should also be much faster, as it involves no multiplies and
+       doesn't have to handle chroma vectors.*/
+    xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
+    ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
+    mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
+    mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
+    err=oc_sad16_halfpel(_enc,frag_buf_offs,fragis,
+     mvoffset0,mvoffset1,src,ref,ystride,_best_err);
+    if(err<_best_err){
+      _best_err=err;
+      best_site=site;
+    }
+  }
+  _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
+  _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
+  return _best_err;
 }
+#endif
 
-void oc_mcenc_refine4mv(CP_INSTANCE *cpi,int _mbi,int _err[4]){
-  macroblock_t *mb;
-  int           bi;
-  mb=cpi->macro+_mbi;
-  for(bi=0;bi<4;bi++){
-    int vec[2];
-    vec[0]=OC_DIV2(mb->block_mv[bi][0]);
-    vec[1]=OC_DIV2(mb->block_mv[bi][1]);
-    oc_mcenc_ysatd_halfpel_brefine(cpi,_mbi,bi,vec,_err[bi],0);
-    mb->ref_mv[bi][0]=(signed char)vec[0];
-    mb->ref_mv[bi][1]=(signed char)vec[1];
+static unsigned oc_mcenc_ysatd_halfpel_mbrefine(const oc_enc_ctx *_enc,
+ int _mbi,int _vec[2],unsigned _best_err,int _frame){
+  const unsigned char *src;
+  const unsigned char *ref;
+  const ptrdiff_t     *frag_buf_offs;
+  const ptrdiff_t     *fragis;
+  int                  offset_y[9];
+  int                  ystride;
+  int                  mvoffset_base;
+  int                  best_site;
+  int                  sitei;
+  unsigned             err;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]];
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  fragis=_enc->state.mb_maps[_mbi][0];
+  ystride=_enc->state.ref_ystride[0];
+  mvoffset_base=_vec[0]+_vec[1]*ystride;
+  offset_y[0]=offset_y[1]=offset_y[2]=-ystride;
+  offset_y[3]=offset_y[5]=0;
+  offset_y[6]=offset_y[7]=offset_y[8]=ystride;
+  best_site=4;
+  for(sitei=0;sitei<8;sitei++){
+    int site;
+    int xmask;
+    int ymask;
+    int dx;
+    int dy;
+    int mvoffset0;
+    int mvoffset1;
+    site=OC_SQUARE_SITES[0][sitei];
+    dx=OC_SQUARE_DX[site];
+    dy=OC_SQUARE_DY[site];
+    /*The following code SHOULD be equivalent to
+        oc_state_get_mv_offsets(&_enc->state,&mvoffset0,&mvoffset1,
+         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ystride,0);
+      However, it should also be much faster, as it involves no multiplies and
+       doesn't have to handle chroma vectors.*/
+    xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
+    ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
+    mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
+    mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
+    err=oc_satd16_halfpel(_enc,frag_buf_offs,fragis,
+     mvoffset0,mvoffset1,src,ref,ystride,_best_err);
+    if(err<_best_err){
+      _best_err=err;
+      best_site=site;
+    }
   }
+  _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
+  _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
+  return _best_err;
 }
 
-void oc_mcenc_start(CP_INSTANCE *cpi,mc_state *_mcenc){
-  ogg_int64_t nframes;
-  /*Set up the accelerated MV weights for previous frame prediction.*/
-  _mcenc->mvapw1[OC_FRAME_PREV]=(ogg_int32_t)1<<17;
-  _mcenc->mvapw2[OC_FRAME_PREV]=(ogg_int32_t)1<<16;
-  /*Set up the accelerated MV weights for golden frame prediction.*/
-  nframes=cpi->LastKeyFrame;
-  _mcenc->mvapw1[OC_FRAME_GOLD]=(ogg_int32_t)(
-   nframes!=1?(nframes<<17)/(nframes-1):0);
-  _mcenc->mvapw2[OC_FRAME_GOLD]=(ogg_int32_t)(
-   nframes!=2?(nframes<<16)/(nframes-2):0);
+void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame){
+  oc_mb_enc_info *embs;
+  int             vec[2];
+  embs=_enc->mb_info;
+  vec[0]=OC_DIV2(embs[_mbi].analysis_mv[0][_frame][0]);
+  vec[1]=OC_DIV2(embs[_mbi].analysis_mv[0][_frame][1]);
+  embs[_mbi].satd[_frame]=oc_mcenc_ysatd_halfpel_mbrefine(_enc,
+   _mbi,vec,embs[_mbi].satd[_frame],_frame);
+  embs[_mbi].analysis_mv[0][_frame][0]=(signed char)vec[0];
+  embs[_mbi].analysis_mv[0][_frame][1]=(signed char)vec[1];
 }
+
+#if 0
+static int oc_mcenc_ysad_halfpel_brefine(const oc_enc_ctx *_enc,
+ int _vec[2],const unsigned char *_src,const unsigned char *_ref,int _ystride,
+ int _offset_y[9],unsigned _best_err){
+  int mvoffset_base;
+  int best_site;
+  int sitei;
+  mvoffset_base=_vec[0]+_vec[1]*_ystride;
+  best_site=4;
+  for(sitei=0;sitei<8;sitei++){
+    unsigned err;
+    int      site;
+    int      xmask;
+    int      ymask;
+    int      dx;
+    int      dy;
+    int      mvoffset0;
+    int      mvoffset1;
+    site=OC_SQUARE_SITES[0][sitei];
+    dx=OC_SQUARE_DX[site];
+    dy=OC_SQUARE_DY[site];
+    /*The following code SHOULD be equivalent to
+        oc_state_get_mv_offsets(&_enc->state,&mvoffset0,&mvoffset1,
+         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,_ystride,0);
+      However, it should also be much faster, as it involves no multiplies and
+       doesn't have to handle chroma vectors.*/
+    xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
+    ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
+    mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask);
+    mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask);
+    err=oc_enc_frag_sad2_thresh(_enc,_src,
+     _ref+mvoffset0,_ref+mvoffset1,_ystride,_best_err);
+    if(err<_best_err){
+      _best_err=err;
+      best_site=site;
+    }
+  }
+  _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
+  _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
+  return _best_err;
+}
+#endif
+
+static unsigned oc_mcenc_ysatd_halfpel_brefine(const oc_enc_ctx *_enc,
+ int _vec[2],const unsigned char *_src,const unsigned char *_ref,int _ystride,
+ int _offset_y[9],unsigned _best_err){
+  int mvoffset_base;
+  int best_site;
+  int sitei;
+  mvoffset_base=_vec[0]+_vec[1]*_ystride;
+  best_site=4;
+  for(sitei=0;sitei<8;sitei++){
+    unsigned err;
+    int      site;
+    int      xmask;
+    int      ymask;
+    int      dx;
+    int      dy;
+    int      mvoffset0;
+    int      mvoffset1;
+    site=OC_SQUARE_SITES[0][sitei];
+    dx=OC_SQUARE_DX[site];
+    dy=OC_SQUARE_DY[site];
+    /*The following code SHOULD be equivalent to
+        oc_state_get_mv_offsets(&_enc->state,&mvoffset0,&mvoffset1,
+         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,_ystride,0);
+      However, it should also be much faster, as it involves no multiplies and
+       doesn't have to handle chroma vectors.*/
+    xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
+    ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
+    mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask);
+    mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask);
+    err=oc_enc_frag_satd2_thresh(_enc,_src,
+     _ref+mvoffset0,_ref+mvoffset1,_ystride,_best_err);
+    if(err<_best_err){
+      _best_err=err;
+      best_site=site;
+    }
+  }
+  _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
+  _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
+  return _best_err;
+}
+
+void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi){
+  oc_mb_enc_info        *embs;
+  const ptrdiff_t       *frag_buf_offs;
+  const ptrdiff_t       *fragis;
+  const unsigned char   *src;
+  const unsigned char   *ref;
+  int                    offset_y[9];
+  int                    ystride;
+  int                    bi;
+  ystride=_enc->state.ref_ystride[0];
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  fragis=_enc->state.mb_maps[_mbi][0];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
+  offset_y[0]=offset_y[1]=offset_y[2]=-ystride;
+  offset_y[3]=offset_y[5]=0;
+  offset_y[6]=offset_y[7]=offset_y[8]=ystride;
+  embs=_enc->mb_info;
+  for(bi=0;bi<4;bi++){
+    ptrdiff_t frag_offs;
+    int       vec[2];
+    frag_offs=frag_buf_offs[fragis[bi]];
+    vec[0]=OC_DIV2(embs[_mbi].block_mv[bi][0]);
+    vec[1]=OC_DIV2(embs[_mbi].block_mv[bi][1]);
+    embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_halfpel_brefine(_enc,vec,
+     src+frag_offs,ref+frag_offs,ystride,offset_y,embs[_mbi].block_satd[bi]);
+    embs[_mbi].ref_mv[bi][0]=(signed char)vec[0];
+    embs[_mbi].ref_mv[bi][1]=(signed char)vec[1];
+  }
+}

Deleted: branches/theora-thusnelda/lib/enc/mode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/mode.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/mode.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -1,1497 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function: mode selection code
-  last mod: $Id$
-
- ********************************************************************/
-
-#include <string.h>
-#include "codec_internal.h"
-#include "modedec.h"
-#include "encoder_lookup.h"
-
-/*Mode decision is done by exhaustively examining all potential choices.
-  Obviously, doing the motion compensation, fDCT, tokenization, and then
-   counting the bits each token uses is computationally expensive.
-  Theora's EOB runs can also split the cost of these tokens across multiple
-   fragments, and naturally we don't know what the optimal choice of Huffman
-   codes will be until we know all the tokens we're going to encode in all the
-   fragments.
-  So we use a simple approach to estimating the bit cost of each mode based
-   upon the SAD value of the residual.
-  The mathematics behind the technique are outlined by Kim \cite{Kim03}, but
-   the process is very simple.
-  For each quality index and SAD value, we have a table containing the average
-   number of bits needed to code a fragment.
-  The SAD values are placed into a small number of bins (currently 24).
-  TODO: The remaining portion is no longer current.
-  The bit counts are obtained by examining actual encoded frames, with optimal
-   Huffman codes selected and EOB bits appropriately divided among all the
-   blocks they involve.
-  A separate QIxSAD table is kept for each mode and color plane.
-  It may be possible to combine many of these, but only experimentation
-   will tell which ones truly represent the same distribution.
-
-  @ARTICLE{Kim03,
-    author="Hyun Mun Kim",
-    title="Adaptive Rate Control Using Nonlinear Regression",
-    journal="IEEE Transactions on Circuits and Systems for Video
-    Technology",
-    volume=13,
-    number=5,
-    pages="432--439",
-    month="May",
-    year=2003
-  }*/
-
-/*Pointers to the list of bit lengths for the VLC codes used for each mode
-   scheme.
-  Schemes 0-6 use the same VLC, while scheme 7 uses a FLC.*/
-static const unsigned char *OC_MODE_SCHEME_BITS[8]={
-  ModeBitLengths,
-  ModeBitLengths,
-  ModeBitLengths,
-  ModeBitLengths,
-  ModeBitLengths,
-  ModeBitLengths,
-  ModeBitLengths,
-  ModeBitLengthsD,
-};
-
-/*Initialize the mode scheme chooser.
-  This need only be called once per encoder.
-  This is probably the best place to describe the various schemes Theora uses
-   to encode macro block modes.
-  There are 8 possible schemes.
-  Schemes 0-6 use a highly unbalanced Huffman code to code each of the modes.
-  The same set of Huffman codes is used for each of these 7 schemes, but the
-   mode assigned to each code varies.
-  Schemes 1-6 have a fixed mapping from Huffman code to MB mode, while scheme 0
-   writes a custom mapping to the bitstream before all the modes.
-  Finally, scheme 7 just encodes each mode directly in 3 bits.*/
-void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser){
-  int si;
-  _chooser->mode_ranks[0]=_chooser->scheme0_ranks;
-  for(si=1;si<8;si++)_chooser->mode_ranks[si]=ModeSchemes[si-1];
-}
-
-/*Reset the mode scheme chooser.
-  This needs to be called once for each frame, including the first.*/
-static void oc_mode_scheme_chooser_reset(oc_mode_scheme_chooser *_chooser){
-  int si;
-  memset(_chooser->mode_counts,0,OC_NMODES*sizeof(*_chooser->mode_counts));
-  /*Scheme 0 starts with 24 bits to store the mode list in.*/
-  _chooser->scheme_bits[0]=24;
-  memset(_chooser->scheme_bits+1,0,7*sizeof(*_chooser->scheme_bits));
-  for(si=0;si<8;si++){
-    /*Scheme 7 should always start first, and scheme 0 should always start
-       last.*/
-    _chooser->scheme_list[si]=7-si;
-    _chooser->scheme0_list[si]=_chooser->scheme0_ranks[si]=si;
-  }
-}
-
-/*This is the real purpose of this data structure: not actually selecting a
-   mode scheme, but estimating the cost of coding a given mode given all the
-   modes selected so far.
-  This is done via opportunity cost: the cost is defined as the number of bits
-   required to encode all the modes selected so far including the current one
-   using the best possible scheme, minus the number of bits required to encode
-   all the modes selected so far not including the current one using the best
-   possible scheme.
-  The computational expense of doing this probably makes it overkill.
-  Just be happy we take a greedy approach instead of trying to solve the
-   global mode-selection problem (which is NP-hard).
-  _mode: The mode to determine the cost of.
-  Return: The number of bits required to code this mode.*/
-static int oc_mode_scheme_chooser_cost(oc_mode_scheme_chooser *_chooser,
- int _mode){
-  int scheme0;
-  int scheme1;
-  int best_bits;
-  int mode_bits;
-  int si;
-  int scheme_bits;
-  scheme0=_chooser->scheme_list[0];
-  scheme1=_chooser->scheme_list[1];
-  best_bits=_chooser->scheme_bits[scheme0];
-  mode_bits=OC_MODE_SCHEME_BITS[scheme0][_chooser->mode_ranks[scheme0][_mode]];
-  /*Typical case: If the difference between the best scheme and the next best
-     is greater than 6 bits, then adding just one mode cannot change which
-     scheme we use.*/
-  if(_chooser->scheme_bits[scheme1]-best_bits>6)return mode_bits;
-  /*Otherwise, check to see if adding this mode selects a different scheme as
-     the best.*/
-  si=1;
-  best_bits+=mode_bits;
-  do{
-    /*For any scheme except 0, we can just use the bit cost of the mode's rank
-       in that scheme.*/
-    if(scheme1!=0){
-      scheme_bits=_chooser->scheme_bits[scheme1]+
-       OC_MODE_SCHEME_BITS[scheme1][_chooser->mode_ranks[scheme1][_mode]];
-    }
-    else{
-      int ri;
-      /*For scheme 0, incrementing the mode count could potentially change the
-         mode's rank.
-        Find the index where the mode would be moved to in the optimal list,
-         and use its bit cost instead of the one for the mode's current
-         position in the list.*/
-      /*We don't recompute scheme bits; this is computing opportunity cost, not
-         an update.*/
-      for(ri=_chooser->scheme0_ranks[_mode];ri>0&&
-       _chooser->mode_counts[_mode]>=
-       _chooser->mode_counts[_chooser->scheme0_list[ri-1]];ri--);
-      scheme_bits=_chooser->scheme_bits[0]+ModeBitLengths[ri];
-    }
-    if(scheme_bits<best_bits)best_bits=scheme_bits;
-    if(++si>=8)break;
-    scheme1=_chooser->scheme_list[si];
-  }
-  while(_chooser->scheme_bits[scheme1]-_chooser->scheme_bits[scheme0]<=6);
-  return best_bits-_chooser->scheme_bits[scheme0];
-}
-
-/*Incrementally update the mode counts and per-scheme bit counts and re-order
-   the scheme lists once a mode has been selected.
-  _mode: The mode that was chosen.*/
-static void oc_mode_scheme_chooser_update(oc_mode_scheme_chooser *_chooser,
- int _mode){
-  int ri;
-  int si;
-  _chooser->mode_counts[_mode]++;
-  /*Re-order the scheme0 mode list if necessary.*/
-  for(ri=_chooser->scheme0_ranks[_mode];ri>0;ri--){
-    int pmode;
-    pmode=_chooser->scheme0_list[ri-1];
-    if(_chooser->mode_counts[pmode]>=_chooser->mode_counts[_mode])break;
-    /*Reorder the mode ranking.*/
-    _chooser->scheme0_ranks[pmode]++;
-    _chooser->scheme0_list[ri]=pmode;
-  }
-  _chooser->scheme0_ranks[_mode]=ri;
-  _chooser->scheme0_list[ri]=_mode;
-  /*Now add the bit cost for the mode to each scheme.*/
-  for(si=0;si<8;si++){
-    _chooser->scheme_bits[si]+=
-     OC_MODE_SCHEME_BITS[si][_chooser->mode_ranks[si][_mode]];
-  }
-  /*Finally, re-order the list of schemes.*/
-  for(si=1;si<8;si++){
-    int sj;
-    int scheme0;
-    int bits0;
-    sj=si;
-    scheme0=_chooser->scheme_list[si];
-    bits0=_chooser->scheme_bits[scheme0];
-    do{
-      int scheme1;
-      scheme1=_chooser->scheme_list[sj-1];
-      if(bits0>=_chooser->scheme_bits[scheme1])break;
-      _chooser->scheme_list[sj]=scheme1;
-    }
-    while(--sj>0);
-    _chooser->scheme_list[sj]=scheme0;
-  }
-}
-
-typedef struct oc_mode_choice oc_mode_choice;
-
-struct oc_mode_choice{
-  unsigned cost;
-  unsigned ssd;
-  unsigned rate;
-  unsigned overhead;
-};
-
-static void oc_mode_dct_cost_accum(oc_mode_choice *_mode,
- int _qi,int _pli,int _qti,int _sad){
-  int      bin;
-  int      dx;
-  int      y0;
-  int      z0;
-  int      dy;
-  int      dz;
-  unsigned rmse;
-  bin=OC_MINI(_sad>>OC_SAD_SHIFT,OC_SAD_BINS-2);
-  dx=_sad-(bin<<OC_SAD_SHIFT);
-  y0=OC_MODE_RD[_qi][_pli][_qti][bin].rate;
-  z0=OC_MODE_RD[_qi][_pli][_qti][bin].rmse;
-  dy=OC_MODE_RD[_qi][_pli][_qti][bin+1].rate-y0;
-  dz=OC_MODE_RD[_qi][_pli][_qti][bin+1].rmse-z0;
-  _mode->rate+=OC_MAXI(y0+(dy*dx>>OC_SAD_SHIFT),0);
-  rmse=OC_MAXI(z0+(dz*dx>>OC_SAD_SHIFT),0);
-  _mode->ssd+=rmse*rmse>>2*OC_RMSE_SCALE-OC_BIT_SCALE;
-}
-
-static void oc_mode_set_cost(oc_mode_choice *_mode,int _lambda){
- _mode->cost=_mode->ssd+(_mode->rate+_mode->overhead)*_lambda;
-}
-
-
-static const signed char OC_MVMAP[2][64]={
-  {     -15,-15,-14, -14,-13,-13,-12, -12,-11,-11,-10, -10, -9, -9, -8,
-     -8, -7, -7, -6,  -6, -5, -5, -4,  -4, -3, -3, -2,  -2, -1, -1,  0,
-      0,  0,  1,  1,   2,  2,  3,  3,   4,  4,  5,  5,   6,  6,  7,  7,
-      8,  8,  9,  9,  10, 10, 11, 11,  12, 12, 13, 13,  14, 14, 15, 15 },
-  {      -7, -7, -7,  -7, -6, -6, -6,  -6, -5, -5, -5,  -5, -4, -4, -4,
-     -4, -3, -3, -3,  -3, -2, -2, -2,  -2, -1, -1, -1,  -1,  0,  0,  0,
-      0,  0,  0,  0,   1,  1,  1,  1,   2,  2,  2,  2,   3,  3,  3,  3,
-      4,  4,  4,  4,   5,  5,  5,  5,   6,  6,  6,  6,   7,  7,  7,  7 }
-};
-
-static const signed char OC_MVMAP2[2][63]={
-  {   -1, 0,-1,  0,-1, 0,-1,  0,-1, 0,-1,  0,-1, 0,-1,
-    0,-1, 0,-1,  0,-1, 0,-1,  0,-1, 0,-1,  0,-1, 0,-1,
-    0, 1, 0, 1,  0, 1, 0, 1,  0, 1, 0, 1,  0, 1, 0, 1,
-    0, 1, 0, 1,  0, 1, 0, 1,  0, 1, 0, 1,  0, 1, 0, 1 },
-  {   -1,-1,-1,  0,-1,-1,-1,  0,-1,-1,-1,  0,-1,-1,-1,
-    0,-1,-1,-1,  0,-1,-1,-1,  0,-1,-1,-1,  0,-1,-1,-1,
-    0, 1, 1, 1,  0, 1, 1, 1,  0, 1, 1, 1,  0, 1, 1, 1,
-    0, 1, 1, 1,  0, 1, 1, 1,  0, 1, 1, 1,  0, 1, 1, 1 }
-};
-
-int oc_get_mv_offsets(int _offsets[2],int _dx,int _dy,
- int _ystride,int _pli,int _pf){
-  int qpx;
-  int qpy;
-  int mx;
-  int my;
-  int mx2;
-  int my2;
-  int offs;
-  qpy=!(_pf&2)&&_pli;
-  my=OC_MVMAP[qpy][_dy+31];
-  my2=OC_MVMAP2[qpy][_dy+31];
-  qpx=!(_pf&1)&&_pli;
-  mx=OC_MVMAP[qpx][_dx+31];
-  mx2=OC_MVMAP2[qpx][_dx+31];
-  offs=my*_ystride+mx;
-  if(mx2||my2){
-    _offsets[1]=offs+my2*_ystride+mx2;
-    _offsets[0]=offs;
-    return 2;
-  }
-  _offsets[0]=offs;
-  return 1;
-}
-
-static int BIntraSAD(CP_INSTANCE *cpi, int fi, int plane){
-  int satd;
-  satd=oc_enc_frag_intra_satd(cpi,
-   cpi->frame+cpi->frag_buffer_index[fi],cpi->stride[plane]);
-  if(plane)satd<<=2;
-  return satd;
-}
-
-static int BInterSAD(CP_INSTANCE *cpi,int _fi,int _dx,int _dy,
- int _pli,int _goldenp){
-  unsigned char *b;
-  unsigned char *r;
-  int            offs[2];
-  int            stride;
-  int            sad;
-  b=cpi->frame+cpi->frag_buffer_index[_fi];
-  r=(_goldenp?cpi->golden:cpi->lastrecon)+cpi->frag_buffer_index[_fi];
-  stride=cpi->stride[_pli];
-  sad=0;
-  if(oc_get_mv_offsets(offs,_dx,_dy,
-   cpi->stride[_pli],_pli,cpi->info.pixelformat)>1){
-    sad=oc_enc_frag_satd2_thresh(cpi,b,r+offs[0],r+offs[1],stride,0xFF000);
-  }
-  else sad=oc_enc_frag_satd_thresh(cpi,b,r+offs[0],stride,0xFF000);
-  /*TODO: <<2? Really? Why?*/
-  if(_pli)return sad<<2;
-  else return sad;
-}
-
-static void oc_cost_intra(CP_INSTANCE *cpi,oc_mode_choice *_mode,
- int _mbi,int _qi){
-  macroblock_t *mb;
-  int           pli;
-  int           bi;
-  mb=cpi->macro+_mbi;
-  _mode->rate=_mode->ssd=0;
-  for(pli=0;pli<3;pli++){
-    for(bi=0;bi<4;bi++){
-      int fi;
-      fi=mb->Ryuv[pli][bi];
-      if(fi<cpi->frag_total){
-        oc_mode_dct_cost_accum(_mode,_qi,pli,0,BIntraSAD(cpi,fi,pli));
-      }
-    }
-  }
-  _mode->overhead=
-   oc_mode_scheme_chooser_cost(&cpi->chooser,CODE_INTRA)<<OC_BIT_SCALE;
-  oc_mode_set_cost(_mode,cpi->lambda);
-}
-
-static void oc_cost_inter(CP_INSTANCE *cpi,oc_mode_choice *_mode,int _mbi,
- int _modei,const signed char *_mv,int _qi){
-  macroblock_t *mb;
-  int           goldenp;
-  int           pli;
-  int           bi;
-  int           dx;
-  int           dy;
-  goldenp=OC_FRAME_FOR_MODE[_modei]==OC_FRAME_GOLD;
-  mb=cpi->macro+_mbi;
-  _mode->rate=_mode->ssd=0;
-  dx=_mv[0];
-  dy=_mv[1];
-  for(pli=0;pli<3;pli++){
-    for(bi=0;bi<4;bi++){
-      int fi;
-      fi=mb->Ryuv[pli][bi];
-      if(fi<cpi->frag_total){
-        oc_mode_dct_cost_accum(_mode,_qi,pli,1,
-         BInterSAD(cpi,fi,dx,dy,pli,goldenp));
-      }
-    }
-  }
-  _mode->overhead=
-   oc_mode_scheme_chooser_cost(&cpi->chooser,_modei)<<OC_BIT_SCALE;
-  oc_mode_set_cost(_mode,cpi->lambda);
-}
-
-static void oc_cost_inter_nomv(CP_INSTANCE *cpi,oc_mode_choice *_mode,int _mbi,
- int _modei,int _qi){
-  const unsigned char *ref;
-  macroblock_t        *mb;
-  int                  pli;
-  int                  bi;
-  ref=_modei==CODE_INTER_NO_MV?cpi->lastrecon:cpi->golden;
-  mb=cpi->macro+_mbi;
-  _mode->rate=_mode->ssd=0;
-  for(pli=0;pli<3;pli++){
-    int stride;
-    stride=cpi->stride[pli];
-    for(bi=0;bi<4;bi++){
-      int fi;
-      fi=mb->Ryuv[pli][bi];
-      if(fi<cpi->frag_total){
-        int offs;
-        int sad;
-        offs=cpi->frag_buffer_index[fi];
-        sad=oc_enc_frag_satd_thresh(cpi,
-         cpi->frame+offs,ref+offs,stride,0xFF000);
-        if(pli)sad<<=2;
-        oc_mode_dct_cost_accum(_mode,_qi,pli,1,sad);
-      }
-    }
-  }
-  _mode->overhead=
-   oc_mode_scheme_chooser_cost(&cpi->chooser,_modei)<<OC_BIT_SCALE;
-  oc_mode_set_cost(_mode,cpi->lambda);
-}
-
-static int oc_cost_inter1mv(CP_INSTANCE *cpi,oc_mode_choice *_mode,int _mbi,
- int _modei,const signed char *_mv,int _qi){
-  int bits0;
-  oc_cost_inter(cpi,_mode,_mbi,_modei,_mv,_qi);
-  bits0=MvBits[_mv[0]+MAX_MV_EXTENT]+MvBits[_mv[1]+MAX_MV_EXTENT];
-  _mode->overhead+=OC_MINI(cpi->MVBits_0+bits0,cpi->MVBits_1+12)
-   -OC_MINI(cpi->MVBits_0,cpi->MVBits_1)<<OC_BIT_SCALE;
-  oc_mode_set_cost(_mode,cpi->lambda);
-  return bits0;
-}
-
-static int oc_cost_inter4mv(CP_INSTANCE *cpi,oc_mode_choice *_mode,int _mbi,
- oc_mv _mv[4],int _qi){
-  macroblock_t *mb;
-  int           pli;
-  int           bi;
-  int           bits0;
-  mb=cpi->macro+_mbi;
-  memcpy(mb->mv,_mv,sizeof(mb->mv));
-  _mode->rate=_mode->ssd=0;
-  bits0=0;
-  for(bi=0;bi<4;bi++){
-    int fi;
-    fi=mb->Ryuv[0][bi];
-    if(fi<cpi->frag_total){
-      int dx;
-      int dy;
-      dx=_mv[bi][0];
-      dy=_mv[bi][1];
-      bits0+=MvBits[dx+MAX_MV_EXTENT]+MvBits[dy+MAX_MV_EXTENT];
-      oc_mode_dct_cost_accum(_mode,_qi,0,1,
-       BInterSAD(cpi,fi,dx,dy,0,0));
-    }
-  }
-  (*OC_SET_CHROMA_MVS_TABLE[cpi->info.pixelformat])(mb->cbmvs,
-   (const oc_mv *)_mv);
-  for(pli=1;pli<3;pli++){
-    for(bi=0;bi<4;bi++){
-      int fi;
-      fi=mb->Ryuv[pli][bi];
-      if(fi<cpi->frag_total){
-        int dx;
-        int dy;
-        dx=mb->cbmvs[bi][0];
-        dy=mb->cbmvs[bi][1];
-        oc_mode_dct_cost_accum(_mode,_qi,pli,1,
-         BInterSAD(cpi,fi,dx,dy,pli,0));
-      }
-    }
-  }
-  _mode->overhead=oc_mode_scheme_chooser_cost(&cpi->chooser,CODE_INTER_FOURMV)
-   +OC_MINI(cpi->MVBits_0+bits0,cpi->MVBits_1+48)
-   -OC_MINI(cpi->MVBits_0,cpi->MVBits_1)<<OC_BIT_SCALE;
-  oc_mode_set_cost(_mode,cpi->lambda);
-  return bits0;
-}
-
-#include "quant_lookup.h"
-
-static void uncode_frag(CP_INSTANCE *cpi, int fi, int plane){
-  int bi;
-  int stride;
-  bi=cpi->frag_buffer_index[fi];
-  stride=cpi->stride[plane];
-  cpi->frag_coded[fi]=0;
-  oc_enc_frag_copy(cpi,cpi->recon+bi,cpi->lastrecon+bi,stride);
-}
-
-typedef struct{
-  int uncoded_ac_ssd;
-  int coded_ac_ssd;
-  int ac_cost;
-  int dc_flag;
-} rd_metric_t;
-
-typedef struct{
-  int plane;
-  int qi;
-  ogg_int16_t re_q[2][3][64];
-  oc_iquant *iq[2];
-  quant_tables *qq[2];
-  int xqp;
-  int yqp;
-  int ssdmul;
-} plane_state_t;
-
-static void ps_setup_frame(CP_INSTANCE *cpi, plane_state_t *ps){
-  int i,j,k;
-  int qi = cpi->BaseQ; /* temporary */;
-
-  ps->qi = qi;
-  for(i=0;i<2;i++)
-    for(j=0;j<3;j++)
-      for(k=0;k<64;k++)
-        ps->re_q[i][j][k]=cpi->quant_tables[i][j][k][qi];
-}
-
-static void ps_setup_plane(CP_INSTANCE *cpi, plane_state_t *ps, int plane){
-  ps->plane = plane;
-  ps->iq[0] = cpi->iquant_tables[0][plane][ps->qi];
-  ps->iq[1] = cpi->iquant_tables[1][plane][ps->qi];
-  ps->qq[0] = &(cpi->quant_tables[0][plane]);
-  ps->qq[1] = &(cpi->quant_tables[1][plane]);
-  ps->xqp = (plane && cpi->info.pixelformat != OC_PF_444);
-  ps->yqp = (plane && cpi->info.pixelformat == OC_PF_420);
-  ps->ssdmul = (ps->xqp+1)*(ps->yqp+1);
-}
-
-/* coding overhead is unscaled */
-#include<stdio.h>
-static int TQB (CP_INSTANCE *cpi,plane_state_t *ps,int mode,int fi,
- int _dx,int _dy,int coding_overhead,rd_metric_t *mo,long *rho_count,
- token_checkpoint_t **stack){
-  const int keyframe = (cpi->FrameType == KEY_FRAME);
-  const oc_iquant *iq = ps->iq[mode != CODE_INTRA];
-  ogg_int16_t buffer[64]OC_ALIGN16;
-  ogg_int16_t data[64]OC_ALIGN16;
-  const int bi = cpi->frag_buffer_index[fi];
-  const int stride = cpi->stride[ps->plane];
-  const unsigned char *frame_ptr = &cpi->frame[bi];
-  unsigned char *lastrecon = ((mode == CODE_USING_GOLDEN ||
-                               mode == CODE_GOLDEN_MV) ?
-                              cpi->golden : cpi->lastrecon)+bi;
-  unsigned char *thisrecon = cpi->recon+bi;
-  int nonzero=0;
-  const ogg_int16_t *dequant = ps->re_q[mode != CODE_INTRA][ps->plane];
-  int uncoded_ssd=0,coded_ssd=0;
-  int uncoded_dc=0,coded_dc=0,dc_flag=0;
-  int lambda = cpi->lambda;
-  token_checkpoint_t *checkpoint=*stack;
-  int mv_offs[2];
-  int nmv_offs;
-  int cost;
-  int ci;
-  int pi;
-
-  cpi->frag_coded[fi]=1;
-
-  /* by way of explanation: although the f_array coding overhead
-     determination is accurate, it is greedy using very coarse-grained
-     local information.  Allowing it to mildly discourage coding turns
-     out to be beneficial, but it's not clear that allowing it to
-     encourage coding through negative coding overhead deltas is
-     useful.  For that reason, we disallow negative
-     coding_overheads */
-  if(coding_overhead<0)coding_overhead = 0;
-
-  /* motion comp */
-  switch(mode){
-    case CODE_INTRA:{
-      nmv_offs=0;
-      oc_enc_frag_sub_128(cpi,data,frame_ptr,stride);
-    }break;
-    case CODE_USING_GOLDEN:
-    case CODE_INTER_NO_MV:{
-      nmv_offs=1;
-      mv_offs[0]=0;
-      oc_enc_frag_sub(cpi,data,frame_ptr,lastrecon,stride);
-    }break;
-    default:{
-      nmv_offs=oc_get_mv_offsets(mv_offs,_dx,_dy,
-       stride,ps->plane,cpi->info.pixelformat);
-      if(nmv_offs>1){
-        oc_enc_frag_copy2(cpi,thisrecon,
-         lastrecon+mv_offs[0],lastrecon+mv_offs[1],stride);
-        oc_enc_frag_sub(cpi,data,frame_ptr,thisrecon,stride);
-      }
-      else oc_enc_frag_sub(cpi,data,frame_ptr,lastrecon+mv_offs[0],stride);
-    }break;
-  }
-
-#if defined(OC_COLLECT_METRICS)
-  int sad=0;
-  if(mode==CODE_INTRA)sad=BIntraSAD(cpi,fi,ps->plane);
-  else{
-    sad=BInterSAD(cpi,fi,_dx,_dy,ps->plane,
-     OC_FRAME_FOR_MODE[mode]==OC_FRAME_GOLD);
-  }
-  cpi->frag_sad[fi]=sad;
-#endif
-
-  if(!keyframe){
-    if(mode==CODE_INTER_NO_MV){
-      for(pi=0;pi<64;pi++){
-        uncoded_ssd += data[pi]*data[pi];
-        uncoded_dc += data[pi];
-      }
-    }else{
-      oc_enc_frag_sub(cpi,buffer,frame_ptr,cpi->lastrecon+bi,stride);
-      for(pi=0;pi<64;pi++){
-        uncoded_ssd += buffer[pi]*buffer[pi];
-        uncoded_dc += buffer[pi];
-      }
-    }
-    uncoded_ssd <<= 4; /* scale to match DCT domain */
-  }
-
-  /* transform */
-  oc_enc_fdct8x8(cpi,buffer,data);
-
-  /* collect rho metrics, quantize */
-  {
-    int          zzi;
-#if 0
-    quant_tables *qq = ps->qq[mode != CODE_INTRA];
-#endif
-    for(zzi=0;zzi<64;zzi++){
-      int v;
-      int val;
-      int d;
-      ci=dezigzag_index[zzi];
-      v=buffer[ci];
-      d=dequant[zzi];
-      /* rho-domain distribution */
-      val=v<<1;
-      v=abs(val);
-#if 0
-      {
-        ogg_int16_t *qqq = (*qq)[zzi];
-        int pos;
-        for(pos=64;pos>0;pos--)if(v<qqq[pos-1])break;
-        rho_count[pos]++;
-      }
-#endif
-      if(v>=d){
-        int s;
-        s=OC_SIGNMASK(val);
-        /*The bias added here rounds ties away from zero, since token
-           optimization can only decrease the magnitude of the quantized
-           value.*/
-        val+=(d+s)^s;
-        /*Note the arithmetic right shift is not guaranteed by ANSI C.
-          Hopefully no one still uses ones-complement architectures.*/
-        val=((iq[zzi].m*(ogg_int32_t)val>>16)+val>>iq[zzi].l)-s;
-        data[zzi]=OC_CLAMPI(-580,val,580);
-        nonzero=zzi;
-      }
-      else data[zzi]=0;
-    }
-  }
-  cpi->frag_dc[fi] = data[0];
-
-  /* tokenize */
-  cost = dct_tokenize_AC(cpi, fi, data, dequant, buffer, fi>=cpi->frag_n[0], stack,mode==CODE_INTRA?3:0);
-
-  /*Reconstruct.*/
-  oc_enc_dequant_idct8x8(cpi,buffer,data,
-   nonzero+1,nonzero+1,dequant[0],(ogg_uint16_t *)dequant);
-  if(mode==CODE_INTRA)oc_enc_frag_recon_intra(cpi,thisrecon,stride,buffer);
-  else{
-    oc_enc_frag_recon_inter(cpi,thisrecon,
-     nmv_offs==1?lastrecon+mv_offs[0]:thisrecon,stride,buffer);
-  }
-
-#if defined(OC_COLLECT_METRICS)
-  {
-#else
-  if(!keyframe){
-#endif
-    /* in retrospect, should we have skipped this block? */
-    oc_enc_frag_sub(cpi,buffer,frame_ptr,thisrecon,stride);
-    for(pi=0;pi<64;pi++){
-      coded_ssd+=buffer[pi]*buffer[pi];
-      coded_dc+=buffer[pi];
-    }
-    coded_ssd <<= 4; /* scale to match DCT domain */
-    /* We actually only want the AC contribution to the SSDs */
-    uncoded_ssd -= ((uncoded_dc*uncoded_dc)>>2);
-    coded_ssd -= ((coded_dc*coded_dc)>>2);
-#if defined(OC_COLLECT_METRICS)
-    cpi->frag_ssd[fi]=coded_ssd;
-  }
-  if(!keyframe){
-#endif
-    /* for undersampled planes */
-    /*coded_ssd*=ps->ssdmul;*/
-    /*uncoded_ssd*=ps->ssdmul;*/
-    mo->uncoded_ac_ssd+=uncoded_ssd;
-
-    /* DC is a special case; if there's more than a full-quantizer
-       improvement in the effective DC component, always force-code
-       the block */
-    if( abs(uncoded_dc)-abs(coded_dc) > (dequant[0]<<1)){
-      mo->dc_flag = dc_flag = 1;
-    }
-
-    if(!dc_flag && uncoded_ssd <= coded_ssd+(coding_overhead+cost)*lambda){
-      /* Hm, not worth it.  roll back */
-      tokenlog_rollback(cpi, checkpoint, (*stack)-checkpoint);
-      *stack = checkpoint;
-      uncode_frag(cpi,fi,ps->plane);
-
-      mo->coded_ac_ssd+=uncoded_ssd;
-      //fprintf(stderr,"skip(%d:%d)",coding_overhead,cost);
-
-      return 0;
-    }else{
-
-      //fprintf(stderr,"*****(%d:%d)",coding_overhead,cost);
-
-      mo->coded_ac_ssd+=coded_ssd;
-      mo->ac_cost+=cost;
-
-    }
-  }
-
-  //for(i=0;i<64;i++)
-  //if(data[i]!=0)cpi->rho_postop++;
-
-  return 1;
-}
-
-static int macroblock_phase_Y[4][4] = {{0,1,3,2},{0,2,3,1},{0,2,3,1},{3,2,0,1}};
-
-/* mode_overhead is scaled by << OC_BIT_SCALE */
-static int TQMB_Y(CP_INSTANCE *cpi,macroblock_t *mb,int mb_phase,
- plane_state_t *ps,long *rc,int mode_overhead,int *mb_mv_bits_0,fr_state_t *fr){
-
-  int full_checkpoint = cpi->fr_full_count;
-  int partial_checkpoint = cpi->fr_partial_count;
-  int block_checkpoint = cpi->fr_block_count;
-  fr_state_t fr_checkpoint = *fr;
-  unsigned char *cp=cpi->frag_coded;
-  int mode = mb->mode;
-  int coded = 0;
-  int i;
-  token_checkpoint_t stack[64*5]; /* worst case token usage for 4 fragments*/
-  token_checkpoint_t *stackptr = stack;
-  //int rho_check = cpi->rho_postop;
-
-  rd_metric_t mo;
-  memset(&mo,0,sizeof(mo));
-
-  for(i=0;i<4;i++){
-    /* Blocks must be handled in Hilbert order which is defined by MB
-       position within the SB.  And, of course, the MVs have to be in
-       raster order just to make it more difficult. */
-    int bi = macroblock_phase_Y[mb_phase][i];
-    int fi = mb->Ryuv[0][bi];
-
-    if(TQB(cpi,ps,mode,fi,mb->mv[bi][0],mb->mv[bi][1],
-     fr_cost1(fr),&mo,rc,&stackptr)){
-      fr_codeblock(cpi,fr);
-      coded++;
-    }
-    else fr_skipblock(cpi,fr);
-  }
-
-
-  if(cpi->FrameType != KEY_FRAME){
-    int bi;
-    if(coded && !mo.dc_flag){
-      /* block by block, still coding the MB.  Now consider the
-         macroblock coding cost as a whole (mode and MV) */
-      int codecost = mo.ac_cost+fr_cost4(&fr_checkpoint,fr)+(mode_overhead>>OC_BIT_SCALE);
-      if(mo.uncoded_ac_ssd <= mo.coded_ac_ssd+cpi->lambda*codecost){
-
-        /* taking macroblock overhead into account, it is not worth coding this MB */
-        tokenlog_rollback(cpi, stack, stackptr-stack);
-        memcpy(fr,&fr_checkpoint,sizeof(fr_checkpoint));
-        cpi->fr_full_count = full_checkpoint;
-        cpi->fr_partial_count = partial_checkpoint;
-        cpi->fr_block_count = block_checkpoint;
-        /*cpi->rho_postop = rho_check;*/
-
-        for(i=0;i<4;i++){
-          int fi = mb->Ryuv[0][i];
-          if(cp[fi])
-            uncode_frag(cpi,fi,0);
-          fr_skipblock(cpi,fr);
-        }
-        coded=0;
-
-      }
-    }
-
-    if(coded==0){
-      mb->mode = CODE_INTER_NO_MV; /* No luma blocks coded, mode is forced */
-      mb->coded = 0;
-      memset(mb->mv,0,sizeof(mb->mv));
-      memset(mb->cbmvs,0,sizeof(mb->cbmvs));
-      return 0;
-    }
-    /*Assume that a 1mv with a single coded block is always cheaper than a 4mv
-       with a single coded block.
-      This may not be strictly true: a 4MV computes chroma MVs using (0,0) for
-       skipped blocks, while a 1MV does not.*/
-    else if(coded==1&&mode==CODE_INTER_FOURMV){
-      int dx;
-      int dy;
-      mode=mb->mode=CODE_INTER_PLUS_MV;
-      for(bi=0;!cp[mb->Ryuv[0][bi]];bi++);
-      dx=mb->mv[bi][0];
-      dy=mb->mv[bi][1];
-      mb->cbmvs[0][0]=mb->cbmvs[1][0]=mb->cbmvs[2][0]=mb->cbmvs[3][0]=
-       mb->mv[0][0]=mb->mv[1][0]=mb->mv[2][0]=mb->mv[3][0]=(signed char)dx;
-      mb->cbmvs[0][1]=mb->cbmvs[1][1]=mb->cbmvs[2][1]=mb->cbmvs[3][1]=
-       mb->mv[0][1]=mb->mv[1][1]=mb->mv[2][1]=mb->mv[3][1]=(signed char)dy;
-      *mb_mv_bits_0=MvBits[dx+MAX_MV_EXTENT]+MvBits[dy+MAX_MV_EXTENT];
-    }
-    mb->coded=0;
-    for(bi=0;bi<4;bi++)mb->coded|=cp[mb->Ryuv[0][bi]]<<bi;
-  }
-
-  /* Commit tokenization */
-  tokenlog_commit(cpi, stack, stackptr-stack);
-
-  return coded;
-}
-
-static const unsigned char OC_MACROBLOCK_PHASE[16]={
-  0,1,3,2,0,2,3,1,0,2,3,1,3,2,0,1
-};
-
-static int TQSB_UV ( CP_INSTANCE *cpi, superblock_t *sb, plane_state_t *ps, long *rc, fr_state_t *fr){
-  int pf = cpi->info.pixelformat;
-  int i;
-  int coded = 0;
-  rd_metric_t mo;
-  token_checkpoint_t stack[64*2]; /* worst case token usage for 1 fragment*/
-  memset(&mo,0,sizeof(mo));
-
-  for(i=0;i<16;i++){
-    int fi = sb->f[i];
-
-    if(fi<cpi->frag_total){
-      token_checkpoint_t *stackptr;
-      macroblock_t       *mb;
-      int                 bi;
-      stackptr = stack;
-      mb=cpi->macro+sb->m[i];
-      bi=OC_MACROBLOCK_PHASE[i]&pf;
-      if(TQB(cpi,ps,mb->mode,fi,mb->cbmvs[bi][0],mb->cbmvs[bi][1],
-       fr_cost1(fr),&mo,rc,&stackptr)){
-        fr_codeblock(cpi,fr);
-        tokenlog_commit(cpi, stack, stackptr-stack);
-        coded++;
-      }else{
-        fr_skipblock(cpi,fr);
-      }
-    }
-  }
-
-  return coded;
-}
-
-int PickModes(CP_INSTANCE *cpi, int recode){
-  int qi;
-  superblock_t *sb;
-  superblock_t *sb_end;
-  int i,j;
-  ogg_uint32_t interbits;
-  ogg_uint32_t intrabits;
-  mc_state mcenc;
-  oc_mv last_mv;
-  oc_mv prior_mv;
-  long rho_count[65];
-  plane_state_t ps;
-  fr_state_t fr;
-  interbits=intrabits=0;
-  last_mv[0]=last_mv[1]=prior_mv[0]=prior_mv[1]=0;
-  oc_mode_scheme_chooser_reset(&cpi->chooser);
-  ps_setup_frame(cpi,&ps);
-  ps_setup_plane(cpi,&ps,0);
-  fr_clear(cpi,&fr);
-  cpi->fr_full_count=0;
-  cpi->fr_partial_count=0;
-  cpi->fr_block_count=0;
-
-  //cpi->rho_postop=0;
-
-  memset(rho_count,0,sizeof(rho_count));
-  cpi->MVBits_0 = 0;
-  cpi->MVBits_1 = 0;
-
-  if(!recode)
-    oc_mcenc_start(cpi, &mcenc);
-
-  dct_tokenize_init(cpi);
-
-  /* Choose mvs, modes; must be done in Hilbert order */
-  /* quantize and code Luma */
-  qi=cpi->BaseQ;
-  sb = cpi->super[0];
-  sb_end = sb + cpi->super_n[0];
-  for(; sb<sb_end; sb++){
-
-    for(j = 0; j<4; j++){ /* mode addressing is through Y plane, always 4 MB per SB */
-      macroblock_t *mb;
-      int           mbi;
-      mbi=sb->m[j];
-      if(mbi>=cpi->macro_total)continue;
-      mb=cpi->macro+mbi;
-      if(!recode){
-        /*Motion estimation:
-          We always do a basic 1MV search for all macroblocks, coded or not,
-           keyframe or not.*/
-        /*Move the motion vector predictors back a frame.*/
-        memmove(mb->analysis_mv+1,mb->analysis_mv,2*sizeof(mb->analysis_mv[0]));
-        /*Search the last frame.*/
-        oc_mcenc_search(cpi,&mcenc,mbi,0,
-         mb->block_mv,&mb->asatd,mb->block_satd);
-        /*Search the golden frame.*/
-        oc_mcenc_search(cpi,&mcenc,mbi,1,NULL,&mb->gsatd,NULL);
-      }
-      if(cpi->FrameType==KEY_FRAME){
-        mb->mode=CODE_INTRA;
-        /* Transform, quantize, collect rho metrics */
-        TQMB_Y(cpi,mb,j,&ps,rho_count,0,NULL,&fr);
-      }
-      else{
-        oc_mode_choice modes[8];
-        int            mb_mv_bits_0;
-        int            mb_gmv_bits_0;
-        int            mb_4mv_bits_0;
-        int            mb_4mv_bits_1;
-        int            inter_mv_pref;
-        int            mode;
-        /*Find the block choice with the lowest estimated coding cost.
-          If a Cb or Cr block is coded but no Y' block from a macro block then
-           the mode MUST be CODE_INTER_NO_MV.
-          This is the default state to which the mode data structure is
-           initialised in encoder and decoder at the start of each frame.*/
-        /*Block coding cost is estimated from correlated SATD metrics.*/
-        /*At this point, all blocks that are in frame are still marked coded.*/
-        if(!recode){
-          memcpy(mb->unref_mv,mb->analysis_mv[0],sizeof(mb->unref_mv));
-          mb->refined=0;
-        }
-        oc_cost_inter_nomv(cpi,modes+CODE_INTER_NO_MV,mbi,CODE_INTER_NO_MV,qi);
-        oc_cost_intra(cpi,modes+CODE_INTRA,mbi,qi);
-        intrabits+=modes[CODE_INTRA].rate;
-        mb_mv_bits_0=oc_cost_inter1mv(cpi,modes+CODE_INTER_PLUS_MV,mbi,
-         CODE_INTER_PLUS_MV,mb->unref_mv[0],qi);
-        oc_cost_inter(cpi,modes+CODE_INTER_LAST_MV,mbi,
-         CODE_INTER_LAST_MV,last_mv,qi);
-        oc_cost_inter(cpi,modes+CODE_INTER_PRIOR_LAST,mbi,
-         CODE_INTER_PRIOR_LAST,prior_mv,qi);
-        oc_cost_inter_nomv(cpi,modes+CODE_USING_GOLDEN,mbi,
-         CODE_USING_GOLDEN,qi);
-        mb_gmv_bits_0=oc_cost_inter1mv(cpi,modes+CODE_GOLDEN_MV,mbi,
-         CODE_GOLDEN_MV,mb->unref_mv[1],qi);
-        mb_4mv_bits_0=oc_cost_inter4mv(cpi,modes+CODE_INTER_FOURMV,mbi,
-         mb->block_mv,qi);
-        mb_4mv_bits_1=48;
-        /*The explicit MV modes (2,6,7) have not yet gone through halfpel
-           refinement.
-          We choose the explicit MV mode that's already furthest ahead on bits
-           and refine only that one.
-          We have to be careful to remember which ones we've refined so that
-           we don't refine it again if we re-encode this frame.*/
-        inter_mv_pref=cpi->lambda*3<<OC_BIT_SCALE;
-        if(modes[CODE_INTER_FOURMV].cost<modes[CODE_INTER_PLUS_MV].cost&&
-         modes[CODE_INTER_FOURMV].cost<modes[CODE_GOLDEN_MV].cost){
-          if(!(mb->refined&0x80)){
-            oc_mcenc_refine4mv(cpi, mbi, mb->block_satd);
-            mb->refined|=0x80;
-          }
-          mb_4mv_bits_0=oc_cost_inter4mv(cpi,modes+CODE_INTER_FOURMV,mbi,
-           mb->ref_mv,qi);
-        }
-        else if(modes[CODE_GOLDEN_MV].cost+inter_mv_pref<
-         modes[CODE_INTER_PLUS_MV].cost){
-          if(!(mb->refined&0x40)){
-            oc_mcenc_refine1mv(cpi,mbi,1,mb->gsatd);
-            mb->refined|=0x40;
-          }
-          mb_gmv_bits_0=oc_cost_inter1mv(cpi,modes+CODE_GOLDEN_MV,mbi,
-           CODE_GOLDEN_MV,mb->analysis_mv[0][1],qi);
-        }
-        if(!(mb->refined&0x04)){
-          oc_mcenc_refine1mv(cpi,mbi,0,mb->asatd);
-          mb->refined|=0x04;
-        }
-        mb_mv_bits_0=oc_cost_inter1mv(cpi,modes+CODE_INTER_PLUS_MV,mbi,
-         CODE_INTER_PLUS_MV,mb->analysis_mv[0][0],qi);
-        /*Finally, pick the mode with the cheapest estimated bit cost.*/
-        /*We prefer CODE_INTER_PLUS_MV, but not over LAST and LAST2.*/
-        mode=0;
-        if(modes[1].cost<modes[0].cost)mode=1;
-        if(modes[3].cost<modes[mode].cost)mode=3;
-        if(modes[4].cost<modes[mode].cost)mode=4;
-        if(modes[5].cost<modes[mode].cost)mode=5;
-        if(modes[6].cost<modes[mode].cost)mode=6;
-        if(modes[7].cost<modes[mode].cost)mode=7;
-        if(mode==CODE_INTER_LAST_MV||mode==CODE_INTER_PRIOR_LAST){
-          inter_mv_pref=0;
-        }
-        if(modes[2].cost<modes[mode].cost+inter_mv_pref)mode=2;
-        /*If we picked something other than 4MV, propagate the MV to the
-           blocks.*/
-        if(mode!=CODE_INTER_FOURMV){
-          int dx;
-          int dy;
-          switch(mode){
-            case CODE_INTER_PLUS_MV:{
-              dx=mb->analysis_mv[0][0][0];
-              dy=mb->analysis_mv[0][0][1];
-            }break;
-            case CODE_INTER_LAST_MV:{
-              dx=last_mv[0];
-              dy=last_mv[1];
-            }break;
-            case CODE_INTER_PRIOR_LAST:{
-              dx=prior_mv[0];
-              dy=prior_mv[1];
-            }break;
-            case CODE_GOLDEN_MV:{
-              dx=mb->analysis_mv[0][1][0];
-              dy=mb->analysis_mv[0][1][1];
-            }break;
-            default:dx=dy=0;break;
-          }
-          mb->cbmvs[0][0]=mb->cbmvs[1][0]=mb->cbmvs[2][0]=mb->cbmvs[3][0]=
-           mb->mv[0][0]=mb->mv[1][0]=mb->mv[2][0]=mb->mv[3][0]=(signed char)dx;
-          mb->cbmvs[0][1]=mb->cbmvs[1][1]=mb->cbmvs[2][1]=mb->cbmvs[3][1]=
-           mb->mv[0][1]=mb->mv[1][1]=mb->mv[2][1]=mb->mv[3][1]=(signed char)dy;
-        }
-        mb->mode=mode;
-        /* Transform, quantize, collect rho metrics */
-        if(TQMB_Y(cpi,mb,j,&ps,rho_count,modes[mode].overhead,&mb_mv_bits_0,&fr)){
-          switch(mb->mode){
-            case CODE_INTER_PLUS_MV:{
-              prior_mv[0]=last_mv[0];
-              prior_mv[1]=last_mv[1];
-              /*mb->mv[0] is not the same as analysis_mv[0][0] if we're
-                 backing out from a 4MV.*/
-              last_mv[0]=mb->mv[0][0];
-              last_mv[1]=mb->mv[0][1];
-              cpi->MVBits_0+=mb_mv_bits_0;
-              cpi->MVBits_1+=12;
-            }break;
-            case CODE_INTER_PRIOR_LAST:{
-              oc_mv temp;
-              temp[0]=prior_mv[0];
-              temp[1]=prior_mv[1];
-              prior_mv[0]=last_mv[0];
-              prior_mv[1]=last_mv[1];
-              last_mv[0]=temp[0];
-              last_mv[1]=temp[1];
-            }break;
-            case CODE_GOLDEN_MV:{
-              cpi->MVBits_0 += mb_gmv_bits_0;
-              cpi->MVBits_1 += 12;
-            }break;
-            case CODE_INTER_FOURMV:{
-              int bi;
-              prior_mv[0]=last_mv[0];
-              prior_mv[1]=last_mv[1];
-              for(bi=0;bi<4;bi++){
-                if(mb->coded&(1<<bi)){
-                  cpi->MVBits_0+=MvBits[mb->mv[bi][0]+MAX_MV_EXTENT]
-                   +MvBits[mb->mv[bi][1]+MAX_MV_EXTENT];
-                  cpi->MVBits_1+=12;
-                  last_mv[0]=mb->mv[bi][0];
-                  last_mv[1]=mb->mv[bi][1];
-                }
-                /*Replace the block MVs for not-coded blocks with (0,0).*/
-                else mb->mv[bi][0]=mb->mv[bi][1]=0;
-              }
-              if(mb->coded!=0xF){
-                (*OC_SET_CHROMA_MVS_TABLE[cpi->info.pixelformat])(mb->cbmvs,
-                 (const oc_mv *)mb->mv);
-              }
-            }break;
-            default:break;
-          }
-          oc_mode_scheme_chooser_update(&cpi->chooser,mb->mode);
-          interbits+=modes[mb->mode].rate+modes[mb->mode].overhead;
-        }
-      }
-    }
-    fr_finishsb(cpi,&fr);
-  }
-
-  dct_tokenize_mark_ac_chroma(cpi);
-
-  /* code chroma U */
-  sb = cpi->super[1];
-  sb_end = sb + cpi->super_n[1];
-  ps_setup_plane(cpi,&ps,1);
-  for(; sb<sb_end; sb++){
-    TQSB_UV(cpi, sb, &ps, rho_count, &fr);
-    fr_finishsb(cpi,&fr);
-  }
-
-  /* code chroma V */
-  sb = cpi->super[2];
-  sb_end = sb + cpi->super_n[2];
-  ps_setup_plane(cpi,&ps,2);
-  for(; sb<sb_end; sb++){
-    TQSB_UV(cpi, sb, &ps, rho_count, &fr);
-    fr_finishsb(cpi,&fr);
-  }
-
-  for(i=1;i<65;i++)
-  rho_count[i]+=rho_count[i-1];
-
-  memcpy(cpi->rho_count,rho_count,sizeof(rho_count));
-  if(cpi->FrameType != KEY_FRAME){
-
-    if(interbits>intrabits) return 1; /* short circuit */
-
-    /* finish adding flagging overhead costs to inter bit counts */
-
-    if(cpi->MVBits_0 < cpi->MVBits_1)
-      interbits += (cpi->MVBits_0 << OC_BIT_SCALE);
-    else
-      interbits += (cpi->MVBits_1 << OC_BIT_SCALE);
-
-    interbits += (cpi->chooser.scheme_bits[cpi->chooser.scheme_list[0]] << OC_BIT_SCALE);
-
-    if(interbits>intrabits) return 1; /* short circuit */
-
-    /* The easiest way to count the bits needed for coded/not coded fragments is
-       to code them. */
-    {
-      ogg_uint32_t bits = oggpackB_bits(cpi->oggbuffer);
-      fr_write(cpi,&fr);
-      interbits += ((oggpackB_bits(cpi->oggbuffer) - bits) << OC_BIT_SCALE);
-    }
-
-    if(interbits>intrabits) return 1;
-
-  }
-  return 0;
-}
-
-#if defined(OC_COLLECT_METRICS)
-# include <stdio.h>
-# include <math.h>
-
-# define OC_ZWEIGHT   (0.25)
-# define OC_BIN(_sad) (OC_MINI((_sad)>>OC_SAD_SHIFT,OC_SAD_BINS-1))
-
-static void oc_mode_metrics_add(oc_mode_metrics *_metrics,
- double _w,int _sad,int _rate,double _rmse){
-  double rate;
-  /*Accumulate statistics without the scaling; this lets us change the scale
-     factor yet still use old data.*/
-  rate=ldexp(_rate,-OC_BIT_SCALE);
-  if(_metrics->fragw>0){
-    double dsad;
-    double drate;
-    double drmse;
-    double w;
-    dsad=_sad-_metrics->sad/_metrics->fragw;
-    drate=rate-_metrics->rate/_metrics->fragw;
-    drmse=_rmse-_metrics->rmse/_metrics->fragw;
-    w=_metrics->fragw*_w/(_metrics->fragw+_w);
-    _metrics->sad2+=dsad*dsad*w;
-    _metrics->sadrate+=dsad*drate*w;
-    _metrics->rate2+=drate*drate*w;
-    _metrics->sadrmse+=dsad*drmse*w;
-    _metrics->rmse2+=drmse*drmse*w;
-  }
-  _metrics->fragw+=_w;
-  _metrics->sad+=_sad*_w;
-  _metrics->rate+=rate*_w;
-  _metrics->rmse+=_rmse*_w;
-}
-
-static void oc_mode_metrics_merge(oc_mode_metrics *_dst,
- const oc_mode_metrics *_src,int _n){
-  int i;
-  /*Find a non-empty set of metrics.*/
-  for(i=0;i<_n&&_src[i].fragw<=0;i++);
-  if(i>=_n){
-    memset(_dst,0,sizeof(*_dst));
-    return;
-  }
-  memcpy(_dst,_src+i,sizeof(*_dst));
-  /*And iterate over the remaining non-empty sets of metrics.*/
-  for(i++;i<_n;i++)if(_src[i].fragw>0){
-    double wa;
-    double wb;
-    double dsad;
-    double drate;
-    double drmse;
-    double w;
-    wa=_dst->fragw;
-    wb=_src[i].fragw;
-    dsad=_src[i].sad/wb-_dst->sad/wa;
-    drate=_src[i].rate/wb-_dst->rate/wa;
-    drmse=_src[i].rmse/wb-_dst->rmse/wa;
-    w=wa*wb/(wa+wb);
-    _dst->fragw+=_src[i].fragw;
-    _dst->sad+=_src[i].sad;
-    _dst->rate+=_src[i].rate;
-    _dst->rmse+=_src[i].rmse;
-    _dst->sad2+=_src[i].sad2+dsad*dsad*w;
-    _dst->sadrate+=_src[i].sadrate+dsad*drate*w;
-    _dst->rate2+=_src[i].rate2+drate*drate*w;
-    _dst->sadrmse+=_src[i].sadrmse+dsad*drmse*w;
-    _dst->rmse2+=_src[i].rmse2+drmse*drmse*w;
-  }
-}
-
-static void oc_enc_mode_metrics_update(CP_INSTANCE *cpi,int _qi){
-  int pli;
-  int qti;
-  oc_enc_restore_fpu(cpi);
-  /*Compile collected SAD/rate/RMSE metrics into a form that's immediately
-     useful for mode decision.*/
-  /*Convert raw collected data into cleaned up sample points.*/
-  for(pli=0;pli<3;pli++){
-    for(qti=0;qti<2;qti++){
-      double fragw;
-      int    bin0;
-      int    bin1;
-      int    bin;
-      fragw=0;
-      bin0=bin1=0;
-      for(bin=0;bin<OC_SAD_BINS;bin++){
-        oc_mode_metrics metrics;
-        OC_MODE_RD[_qi][pli][qti][bin].rate=0;
-        OC_MODE_RD[_qi][pli][qti][bin].rmse=0;
-        /*Find some points on either side of the current bin.*/
-        while((bin1<bin+1||fragw<OC_ZWEIGHT)&&bin1<OC_SAD_BINS-1){
-          fragw+=OC_MODE_METRICS[_qi][pli][qti][bin1++].fragw;
-        }
-        while(bin0+1<bin&&bin0+1<bin1&&
-         fragw-OC_MODE_METRICS[_qi][pli][qti][bin0].fragw>=OC_ZWEIGHT){
-          fragw-=OC_MODE_METRICS[_qi][pli][qti][bin0++].fragw;
-        }
-        /*Merge statistics and fit lines.*/
-        oc_mode_metrics_merge(&metrics,
-         OC_MODE_METRICS[_qi][pli][qti]+bin0,bin1-bin0);
-        if(metrics.fragw>0&&metrics.sad2>0){
-          double a;
-          double b;
-          double msad;
-          double mrate;
-          double mrmse;
-          double rate;
-          double rmse;
-          msad=metrics.sad/metrics.fragw;
-          mrate=metrics.rate/metrics.fragw;
-          mrmse=metrics.rmse/metrics.fragw;
-          /*Compute the points on these lines corresponding to the actual bin
-             value.*/
-          b=metrics.sadrate/metrics.sad2;
-          a=mrate-b*msad;
-          rate=ldexp(a+b*(bin<<OC_SAD_SHIFT),OC_BIT_SCALE);
-          OC_MODE_RD[_qi][pli][qti][bin].rate=
-           (ogg_int16_t)OC_CLAMPI(-32768,(int)(rate+0.5),32767);
-          b=metrics.sadrmse/metrics.sad2;
-          a=mrmse-b*msad;
-          rmse=ldexp(a+b*(bin<<OC_SAD_SHIFT),OC_RMSE_SCALE);
-          OC_MODE_RD[_qi][pli][qti][bin].rmse=
-           (ogg_int16_t)OC_CLAMPI(-32768,(int)(rmse+0.5),32767);
-        }
-      }
-    }
-  }
-}
-
-static int parse_eob_run(int token, int eb){
-  switch(token){
-  case DCT_EOB_TOKEN:
-    return 1;
-  case DCT_EOB_PAIR_TOKEN:
-    return 2;
-  case DCT_EOB_TRIPLE_TOKEN:
-    return 3;
-  case DCT_REPEAT_RUN_TOKEN:
-    return eb+4;
-  case DCT_REPEAT_RUN2_TOKEN:
-    return eb+8;
-  case DCT_REPEAT_RUN3_TOKEN:
-    return eb+16;
-  case DCT_REPEAT_RUN4_TOKEN:
-    return eb;
-  default:
-    return 0;
-  }
-}
-
-
-static void ModeMetricsGroup(CP_INSTANCE *cpi, int group, int huffY, int huffC, int eobcounts[64], int *actual_bits){
-  int ti=0;
-  int *stack = cpi->dct_eob_fi_stack[group];
-  int *tfi = cpi->dct_token_frag[group];
-  int ty = cpi->dct_token_ycount[group];
-  int tn = cpi->dct_token_count[group];
-
-  for(ti=0;ti<tn;ti++){
-    int token = cpi->dct_token[group][ti];
-    int bits = cpi->huff_codes[(ti<ty ? huffY : huffC)][token].nbits + OC_DCT_TOKEN_EXTRA_BITS[token];
-
-    if(token>DCT_REPEAT_RUN4_TOKEN){
-      /* not an EOB run; this token belongs to a single fragment */
-      int fi = tfi[ti];
-      actual_bits[fi] += (bits<<OC_BIT_SCALE);
-    }else{
-
-      int run = parse_eob_run(token, cpi->dct_token_eb[group][ti]);
-      int fi = stack[eobcounts[group]];
-      actual_bits[fi]+=(bits<<OC_BIT_SCALE);
-
-      if(ti+1<tn){
-        /* tokens follow EOB so it must be entirely ensconced within this plane/group */
-        eobcounts[group]+=run;
-      }else{
-        /* EOB is the last token in this plane/group, so it may span into the next plane/group */
-        int n = cpi->dct_eob_fi_count[group];
-        while(run){
-          int rem = n - eobcounts[group];
-          if(rem>run)rem=run;
-
-          eobcounts[group]+=rem;
-          run -= rem;
-          if(run){
-            group++;
-            n = cpi->dct_eob_fi_count[group];
-            stack = cpi->dct_eob_fi_stack[group];
-          }
-        }
-      }
-    }
-  }
-}
-
-void ModeMetrics(CP_INSTANCE *cpi){
-  double fragw;
-  int interp = (cpi->FrameType!=KEY_FRAME);
-  int huff[4];
-  int fi,gi;
-  int y = cpi->frag_n[0];
-  int u = y + cpi->frag_n[1];
-  int v = cpi->frag_total;
-  unsigned char *cp = cpi->frag_coded;
-  int *sp = cpi->frag_sad;
-  int *mp = cpi->frag_mbi;
-  int eobcounts[64];
-  int qi = cpi->BaseQ; /* temporary */
-  int actual_bits[cpi->frag_total];
-  oc_enc_restore_fpu(cpi);
-  /*Weight the fragments by the inverse frame size; this prevents HD content
-     from dominating the statistics.*/
-  fragw=1.0/cpi->frag_n[0];
-  memset(actual_bits,0,sizeof(actual_bits));
-  memset(eobcounts,0,sizeof(eobcounts));
-  huff[0] = cpi->huffchoice[interp][0][0];
-  huff[1] = cpi->huffchoice[interp][0][1];
-  huff[2] = cpi->huffchoice[interp][1][0];
-  huff[3] = cpi->huffchoice[interp][1][1];
-
-  memset(cpi->dist_dist,0,sizeof(cpi->dist_dist));
-  memset(cpi->dist_bits,0,sizeof(cpi->dist_bits));
-
-  if(!oc_has_mode_metrics){
-    FILE *fmetrics;
-    int   qi;
-    memset(OC_MODE_METRICS,0,sizeof(OC_MODE_METRICS));
-    fmetrics=fopen("modedec.stats","rb");
-    if(fmetrics!=NULL){
-      fread(OC_MODE_METRICS,sizeof(OC_MODE_METRICS),1,fmetrics);
-      fclose(fmetrics);
-    }
-    for(qi=0;qi<64;qi++)oc_enc_mode_metrics_update(cpi,qi);
-    oc_has_mode_metrics=1;
-  }
-
-  /* count bits for tokens */
-  ModeMetricsGroup(cpi, 0, huff[0], huff[1], eobcounts, actual_bits);
-  for(gi=1;gi<=AC_TABLE_2_THRESH;gi++)
-    ModeMetricsGroup(cpi, gi,  huff[2], huff[3], eobcounts, actual_bits);
-  for(;gi<=AC_TABLE_3_THRESH;gi++)
-    ModeMetricsGroup(cpi, gi, huff[2]+AC_HUFF_CHOICES, huff[3]+AC_HUFF_CHOICES, eobcounts, actual_bits);
-  for(;gi<=AC_TABLE_4_THRESH;gi++)
-    ModeMetricsGroup(cpi, gi, huff[2]+AC_HUFF_CHOICES*2, huff[3]+AC_HUFF_CHOICES*2, eobcounts, actual_bits);
-  for(;gi<BLOCK_SIZE;gi++)
-    ModeMetricsGroup(cpi, gi, huff[2]+AC_HUFF_CHOICES*3, huff[3]+AC_HUFF_CHOICES*3, eobcounts, actual_bits);
-
-  /* accumulate */
-  for(fi=0;fi<v;fi++)if(cp[fi]){
-    int mbi = mp[fi];
-    macroblock_t *mb = &cpi->macro[mbi];
-    int mode = mb->mode;
-    int plane = (fi<y ? 0 : (fi<u ? 1 : 2));
-    int bin = OC_BIN(sp[fi]);
-    oc_mode_metrics_add(OC_MODE_METRICS[qi][plane][mode!=CODE_INTRA]+bin,
-     fragw,sp[fi],actual_bits[fi],sqrt(cpi->frag_ssd[fi]));
-  }
-  /* update global SAD/rate estimation matrix */
-  oc_enc_mode_metrics_update(cpi,qi);
-}
-
-void oc_enc_mode_metrics_dump(CP_INSTANCE *cpi){
-  FILE *fmetrics;
-  int   qi;
-  /*Generate sample points for complete list of QI values.*/
-  for(qi=0;qi<64;qi++)oc_enc_mode_metrics_update(cpi,qi);
-  fmetrics=fopen("modedec.stats","wb");
-  if(fmetrics!=NULL){
-    fwrite(OC_MODE_METRICS,sizeof(OC_MODE_METRICS),1,fmetrics);
-    fclose(fmetrics);
-  }
-  fprintf(stdout,
-   "/*File generated by libtheora with OC_COLLECT_METRICS"
-   " defined at compile time.*/\n"
-   "#if !defined(_modedec_H)\n"
-   "# define _modedec_H (1)\n"
-   "\n"
-   "\n"
-   "\n"
-   "# if defined(OC_COLLECT_METRICS)\n"
-   "typedef struct oc_mode_metrics oc_mode_metrics;\n"
-   "# endif\n"
-   "typedef struct oc_mode_rd      oc_mode_rd;\n"
-   "\n"
-   "\n"
-   "\n"
-   "/*The number of extra bits of precision at which to store rate"
-   " metrics.*/\n"
-   "# define OC_BIT_SCALE  (%i)\n"
-   "/*The number of extra bits of precision at which to store RMSE metrics.\n"
-   "  This must be at least half OC_BIT_SCALE (rounded up).*/\n"
-   "# define OC_RMSE_SCALE (%i)\n"
-   "/*The number of bins to partition statistics into.*/\n"
-   "# define OC_SAD_BINS   (%i)\n"
-   "/*The number of bits of precision to drop"
-   " from SAD scores to assign them to a\n"
-   "   bin.*/\n"
-   "# define OC_SAD_SHIFT  (%i)\n"
-   "\n"
-   "\n"
-   "\n"
-   "# if defined(OC_COLLECT_METRICS)\n"
-   "struct oc_mode_metrics{\n"
-   "  double fragw;\n"
-   "  double sad;\n"
-   "  double rate;\n"
-   "  double rmse;\n"
-   "  double sad2;\n"
-   "  double sadrate;\n"
-   "  double rate2;\n"
-   "  double sadrmse;\n"
-   "  double rmse2;\n"
-   "};\n"
-   "\n"
-   "\n"
-   "int             oc_has_mode_metrics;\n"
-   "oc_mode_metrics OC_MODE_METRICS[64][3][2][OC_SAD_BINS];\n"
-   "# endif\n"
-   "\n"
-   "\n"
-   "\n"
-   "struct oc_mode_rd{\n"
-   "  ogg_int16_t rate;\n"
-   "  ogg_int16_t rmse;\n"
-   "};\n"
-   "\n"
-   "\n"
-   "# if !defined(OC_COLLECT_METRICS)\n"
-   "static const\n"
-   "# endif\n"
-   "oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={\n",
-   OC_BIT_SCALE,OC_RMSE_SCALE,OC_SAD_BINS,OC_SAD_SHIFT);
-  for(qi=0;qi<64;qi++){
-    int pli;
-    fprintf(stdout,"  {\n");
-    for(pli=0;pli<3;pli++){
-      int qti;
-      fprintf(stdout,"    {\n");
-      for(qti=0;qti<2;qti++){
-        int bin;
-        static const char *pl_names[3]={"Y'","Cb","Cr"};
-        static const char *qti_names[2]={"INTRA","INTER"};
-        fprintf(stdout,"      /*%s  qi=%i  %s*/\n",
-         pl_names[pli],qi,qti_names[qti]);
-        fprintf(stdout,"      {\n");
-        fprintf(stdout,"        ");
-        for(bin=0;bin<OC_SAD_BINS;bin++){
-          if(bin&&!(bin&0x3))fprintf(stdout,"\n        ");
-          fprintf(stdout,"{%5i,%5i}",
-           OC_MODE_RD[qi][pli][qti][bin].rate,
-           OC_MODE_RD[qi][pli][qti][bin].rmse);
-          if(bin+1<OC_SAD_BINS)fprintf(stdout,",");
-        }
-        fprintf(stdout,"\n      }");
-        if(qti<1)fprintf(stdout,",");
-        fprintf(stdout,"\n");
-      }
-      fprintf(stdout,"    }");
-      if(pli<2)fprintf(stdout,",");
-      fprintf(stdout,"\n");
-    }
-    fprintf(stdout,"  }");
-    if(qi<63)fprintf(stdout,",");
-    fprintf(stdout,"\n");
-  }
-  fprintf(stdout,
-   "};\n"
-   "\n"
-   "#endif\n");
-}
-#endif

Modified: branches/theora-thusnelda/lib/enc/modedec.h
===================================================================
--- branches/theora-thusnelda/lib/enc/modedec.h	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/modedec.h	2009-06-13 16:04:06 UTC (rev 16102)
@@ -1,4 +1,4 @@
-//*File generated by libtheora with OC_COLLECT_METRICS defined at compile time.*/
+/*File generated by libtheora with OC_COLLECT_METRICS defined at compile time.*/
 #if !defined(_modedec_H)
 # define _modedec_H (1)
 
@@ -27,13 +27,13 @@
 # if defined(OC_COLLECT_METRICS)
 struct oc_mode_metrics{
   double fragw;
-  double sad;
+  double satd;
   double rate;
   double rmse;
-  double sad2;
-  double sadrate;
+  double satd2;
+  double satdrate;
   double rate2;
-  double sadrmse;
+  double satdrmse;
   double rmse2;
 };
 

Deleted: branches/theora-thusnelda/lib/enc/quant_lookup.h
===================================================================
--- branches/theora-thusnelda/lib/enc/quant_lookup.h	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/quant_lookup.h	2009-06-13 16:04:06 UTC (rev 16102)
@@ -1,54 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id$
-
- ********************************************************************/
-
-#include "codec_internal.h"
-
-#define MIN16 ((1<<16)-1)
-#define SHIFT16 (1<<16)
-
-#define MIN_LEGAL_QUANT_ENTRY 8
-#define MIN_DEQUANT_VAL       2
-#define IDCT_SCALE_FACTOR     2 /* Shift left bits to improve IDCT precision */
-#define OLD_SCHEME            1
-
-
-/******************************
- * lookup table for DCT coefficient zig-zag ordering
- * ****************************/
-
-static const ogg_uint32_t zigzag_index[64] = {
-   0,  1,  5,  6, 14, 15, 27, 28,
-   2,  4,  7, 13, 16, 26, 29, 42,
-   3,  8, 12, 17, 25, 30, 41, 43,
-   9, 11, 18, 24, 31, 40, 44, 53,
-  10, 19, 23, 32, 39, 45, 52, 54,
-  20, 22, 33, 38, 46, 51, 55, 60,
-  21, 34, 37, 47, 50, 56, 59, 61,
-  35, 36, 48, 49, 57, 58, 62, 63
-};
-
-static const ogg_uint32_t dezigzag_index[64] = {
-  0,  1,  8,  16,  9,  2,  3, 10,
-  17, 24, 32, 25, 18, 11,  4,  5,
-  12, 19, 26, 33, 40, 48, 41, 34,
-  27, 20, 13,  6,  7, 14, 21, 28,
-  35, 42, 49, 56, 57, 50, 43, 36,
-  29, 22, 15, 23, 30, 37, 44, 51,
-  58, 59, 52, 45, 38, 31, 39, 46,
-  53, 60, 61, 54, 47, 55, 62, 63
-};
-

Copied: branches/theora-thusnelda/lib/enc/rate.c (from rev 16052, branches/theora-thusnelda/lib/enc/encoder_toplevel.c)
===================================================================
--- branches/theora-thusnelda/lib/enc/rate.c	                        (rev 0)
+++ branches/theora-thusnelda/lib/enc/rate.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -0,0 +1,269 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id$
+
+ ********************************************************************/
+#include <stdlib.h>
+#include <string.h>
+#include "encint.h"
+
+
+
+/*Computes the Lagrange multiplier used for R-D decisions and stores it in
+   _enc->lambda.
+  All of the arithmetic is performed on a Q57 logarithm (OC_Q57(1) is a
+   factor of 2), so raising to a power is a multiply and scaling is an add.
+  _frame_type: The frame type index; 0 is treated as an INTRA frame.*/
+void oc_enc_calc_lambda(oc_enc_ctx *_enc,int _frame_type){
+  ogg_int64_t log_lambda;
+  /*For now, lambda is fixed depending on the qi value and frame type:
+      lambda=scale[qti]*(qavg[qti][qi]**1.5),
+     where scale={2.25,1.125}.
+    A more adaptive scheme might perform better, but Theora's behavior does not
+     seem to conform to existing models in the literature.*/
+  if(_enc->state.info.target_bitrate>0){
+    /*Rate control is active: start from the _target_ quantizer instead of the
+       one actually selected.
+      This lets us reach rates slightly below what the coarsest usable qi
+       would allow, and gives rate control a semblance of "fractional qi"
+       precision.*/
+    log_lambda=_enc->rc.log_qtarget;
+  }
+  else{
+    log_lambda=_enc->log_qavg[_frame_type][_enc->state.qis[0]];
+  }
+  /*x**1.5: add half of the logarithm to itself.*/
+  log_lambda+=log_lambda>>1;
+  /*x*1.125: add the corresponding Q57 logarithm.*/
+  log_lambda+=0x00570068E7EF5A1ELL;
+  /*INTRA frames get an extra factor of 2.*/
+  if(_frame_type==0)log_lambda+=OC_Q57(1);
+  /*The upper bound here is 0x48000.*/
+  _enc->lambda=(int)oc_bexp64(log_lambda);
+}
+
+
+
+/*Initializes the rate control state from the encoder's stream parameters.
+  This sets the per-frame bit budget, the buffer size and its initial
+   fullness/target, the logarithm of the frame's pixel count, and the initial
+   exponent/scale pair for each of the two frame type indices used for
+   quantizer selection.
+  _rc:  The rate control state to initialize.
+  _enc: The encoder context supplying the target bitrate, frame rate, frame
+         size and forced keyframe interval.*/
+void oc_rc_state_init(oc_rc_state *_rc,const oc_enc_ctx *_enc){
+  ogg_int64_t npixels;
+  ogg_int64_t ibpp;
+  /*TODO: These parameters should be exposed in a th_encode_ctl() API.*/
+  _rc->bits_per_frame=(_enc->state.info.target_bitrate*
+   (ogg_int64_t)_enc->state.info.fps_denominator)/
+   _enc->state.info.fps_numerator;
+  /*Insane framerates or frame sizes mean insane bitrates.
+    Let's not get carried away.*/
+  if(_rc->bits_per_frame>0x400000000000LL){
+    _rc->bits_per_frame=(ogg_int64_t)0x400000000000LL;
+  }
+  else if(_rc->bits_per_frame<32)_rc->bits_per_frame=32;
+  /*The buffer size is set equal to the keyframe interval, clamped to the range
+     [12,256] frames.
+    The 12 frame minimum gives us some chance to distribute bit estimation
+     errors.
+    The 256 frame maximum means we'll require 8-10 seconds of pre-buffering at
+     24-30 fps, which is not unreasonable.*/
+  _rc->buf_delay=_enc->keyframe_frequency_force>256?
+   256:_enc->keyframe_frequency_force;
+  _rc->buf_delay=OC_MAXI(_rc->buf_delay,12);
+  _rc->max=_rc->bits_per_frame*_rc->buf_delay;
+  /*Start with a buffer fullness of 75%.
+    We can require fully half the buffer for a keyframe, and so this initial
+     level gives us maximum flexibility for over/under-shooting in subsequent
+     frames.*/
+  /*Note: '+' binds tighter than '>>', so this is
+     ((max+1)>>1)+((max+2)>>2), i.e., a rounded 1/2 plus a rounded 1/4.*/
+  _rc->target=_rc->fullness=(_rc->max+1>>1)+(_rc->max+2>>2);
+  /*Pick exponents and initial scales for quantizer selection.*/
+  npixels=_enc->state.info.frame_width*
+   (ogg_int64_t)_enc->state.info.frame_height;
+  _rc->log_npixels=oc_blog64(npixels);
+  /*The inverse bits per pixel: pixels in a frame per bit of budget.
+    bits_per_frame was clamped to at least 32 above, so this cannot divide by
+     zero.*/
+  ibpp=npixels/_rc->bits_per_frame;
+  /*Parameters for frame type index 0, chosen by ibpp range.*/
+  if(ibpp<1){
+    _rc->exp[0]=59;
+    _rc->log_scale[0]=oc_blog64(1997)-OC_Q57(8);
+  }
+  else if(ibpp<2){
+    _rc->exp[0]=55;
+    _rc->log_scale[0]=oc_blog64(1604)-OC_Q57(8);
+  }
+  else{
+    _rc->exp[0]=48;
+    _rc->log_scale[0]=oc_blog64(834)-OC_Q57(8);
+  }
+  /*Parameters for frame type index 1, chosen by ibpp range.*/
+  if(ibpp<4){
+    _rc->exp[1]=100;
+    _rc->log_scale[1]=oc_blog64(2249)-OC_Q57(8);
+  }
+  else if(ibpp<8){
+    _rc->exp[1]=95;
+    _rc->log_scale[1]=oc_blog64(1751)-OC_Q57(8);
+  }
+  else{
+    _rc->exp[1]=73;
+    _rc->log_scale[1]=oc_blog64(1260)-OC_Q57(8);
+  }
+  /*No frames have been dropped yet.*/
+  _rc->prev_drop_count=0;
+  _rc->log_drop_scale=OC_Q57(0);
+}
+
+/*Updates the rate control state after a frame has been coded or dropped.
+  _bits:  The number of bits used by the frame; a value that is not positive
+           is treated as a dropped frame.
+  _qti:   The frame type index whose scale estimate is updated.
+  _qi:    The quantizer index the frame was coded with (used to look up
+           _enc->log_qavg).
+  _trial: If non-zero, the measured scale replaces the current estimate
+           outright (or halves it for a dropped frame), and neither the drop
+           statistics of a coded frame nor the buffer fullness are updated.*/
+void oc_enc_update_rc_state(oc_enc_ctx *_enc,
+ long _bits,int _qti,int _qi,int _trial){
+  /*Note, setting OC_SCALE_SMOOTHING[1] to 0x80 (0.5), which one might expect
+     to be a reasonable value, actually causes a feedback loop with, e.g., 12
+     fps content encoded at 24 fps; use values near 0 or near 1 for now.
+    TODO: Should probably revisit using an exponential moving average in the
+     first place at some point; dup tracking should help as well.*/
+  static const unsigned OC_SCALE_SMOOTHING[2]={0x13,0x00};
+  if(_bits>0){
+    ogg_int64_t log_scale;
+    ogg_int64_t log_bits;
+    ogg_int64_t log_qexp;
+    /*Compute the estimated scale factor for this frame type.*/
+    log_bits=oc_blog64(_bits);
+    log_qexp=_enc->log_qavg[_qti][_qi]-OC_Q57(2);
+    /*The exponent is stored scaled by 64, hence the shift by 6 before the
+       multiply.*/
+    log_qexp=(log_qexp>>6)*(_enc->rc.exp[_qti]);
+    log_scale=OC_MINI(log_bits-_enc->rc.log_npixels+log_qexp,OC_Q57(16));
+    /*Use it to set that factor directly if this was a trial.*/
+    if(_trial)_enc->rc.log_scale[_qti]=log_scale;
+    else{
+      /*Otherwise update an exponential moving average.*/
+      /*The old estimate's weight is OC_SCALE_SMOOTHING[_qti]/256; the
+         difference is rounded (+128) before the shift by 8.*/
+      _enc->rc.log_scale[_qti]=log_scale
+       +(_enc->rc.log_scale[_qti]-log_scale+128>>8)*OC_SCALE_SMOOTHING[_qti];
+      /*And update a simple exponential moving average to estimate the "real"
+         frame rate taking drops and duplicates into account.*/
+      /*Note: '+' binds tighter than '>>', so this is the average of the old
+         value and the log of the new count.*/
+      _enc->rc.log_drop_scale=_enc->rc.log_drop_scale
+       +oc_blog64(_enc->rc.prev_drop_count+1)>>1;
+      _enc->rc.prev_drop_count=_enc->dup_count;
+    }
+  }
+  else{
+    /*We dropped this frame.*/
+    /*Add it to the previous frame's dup count.*/
+    _enc->rc.prev_drop_count+=1+_enc->dup_count;
+    /*If this was the first frame of this type, lower the expected scale, but
+       don't set it to zero outright.*/
+    if(_trial)_enc->rc.log_scale[_qti]>>=1;
+  }
+  if(!_trial){
+    /*And update the buffer fullness level.*/
+    /*The buffer gains one frame's budget for this frame and for each of its
+       duplicates, and loses the bits actually spent.*/
+    _enc->rc.fullness+=_enc->rc.bits_per_frame*(1+_enc->dup_count)-_bits;
+    /*If we're too quick filling the buffer, that rate is lost forever.*/
+    if(_enc->rc.fullness>_enc->rc.max)_enc->rc.fullness=_enc->rc.max;
+  }
+}
+
+/*Selects the quantizer index to use for the next frame under rate control.
+  The target quantizer (as a Q57 logarithm) is also saved in
+   _enc->rc.log_qtarget for the later lambda calculation.
+  _qti:   The frame type index of the frame about to be coded (0 or 1; it is
+           used directly as an array index and in "1-_qti").
+  _clamp: If non-zero, limit how far the target may move from the quantizer
+           currently stored in _enc->state.qis[0].
+  Return: The chosen qi, in the range [_enc->state.info.quality,63].*/
+int oc_enc_select_qi(oc_enc_ctx *_enc,int _qti,int _clamp){
+  ogg_int64_t  rate_total;
+  ogg_uint32_t next_key_frame;
+  int          nframes[2];
+  int          buf_delay;
+  ogg_int64_t  log_qtarget;
+  int          best_qi;
+  ogg_int64_t  best_qdiff;
+  int          old_qi;
+  int          qi;
+  /*Figure out how to re-distribute bits so that we hit our fullness target
+     before the last keyframe in our current buffer window (after the current
+     frame), or the end of the buffer window, whichever comes first.*/
+  next_key_frame=_qti?_enc->keyframe_frequency_force
+   -(_enc->state.curframe_num-_enc->state.keyframe_num):0;
+  /*nframes[0] counts the frames of type index 0 and nframes[1] those of type
+     index 1 in the window being budgeted.*/
+  nframes[0]=(_enc->rc.buf_delay-OC_MINI(next_key_frame,_enc->rc.buf_delay)
+   +_enc->keyframe_frequency_force-1)/_enc->keyframe_frequency_force;
+  if(nframes[0]+_qti>1){
+    buf_delay=next_key_frame+(nframes[0]-1)*_enc->keyframe_frequency_force;
+    nframes[0]--;
+  }
+  else buf_delay=_enc->rc.buf_delay;
+  nframes[1]=buf_delay-nframes[0];
+  /*The total budget: the surplus (or deficit) relative to the fullness target
+     plus the bits that will arrive over the window.*/
+  rate_total=_enc->rc.fullness-_enc->rc.target
+   +buf_delay*_enc->rc.bits_per_frame;
+  /*Downgrade the delta frame rate to correspond to the recent drop count
+     history.*/
+  if(_enc->rc.prev_drop_count>0||_enc->rc.log_drop_scale>OC_Q57(0)){
+    ogg_int64_t dup_scale;
+    /*The OC_Q57(8) term scales the result by 256, matching the <<8 applied to
+       nframes[1] below.*/
+    dup_scale=oc_bexp64((_enc->rc.log_drop_scale
+     +oc_blog64(_enc->rc.prev_drop_count+1)>>1)+OC_Q57(8));
+    if(dup_scale<nframes[1]<<8){
+      int dup_scalei;
+      dup_scalei=(int)dup_scale;
+      /*Divide, rounding up; the guard avoids a division by zero.*/
+      if(dup_scalei>0)nframes[1]=((nframes[1]<<8)+dup_scalei-1)/dup_scalei;
+    }
+    else nframes[1]=!!nframes[1];
+  }
+  /*If there aren't enough bits to achieve our desired fullness level, use the
+     minimum quality permitted.*/
+  if(rate_total<=buf_delay)log_qtarget=OC_QUANT_MAX_LOG;
+  else{
+    /*Weights for the two frame type indices, in units of 1/32 (see the
+       "+16>>5" rounding below).*/
+    static const unsigned char KEY_RATIO[2]={32,17};
+    ogg_int64_t   log_scale0;
+    ogg_int64_t   log_scale1;
+    ogg_int64_t   prevr;
+    ogg_int64_t   curr;
+    ogg_int64_t   realr;
+    int           i;
+    log_scale0=_enc->rc.log_scale[_qti]+_enc->rc.log_npixels;
+    log_scale1=_enc->rc.log_scale[1-_qti]+_enc->rc.log_npixels;
+    /*Start from an even split of the budget over the window (rounded).*/
+    curr=(rate_total+(buf_delay>>1))/buf_delay;
+    realr=curr*KEY_RATIO[_qti]+16>>5;
+    /*Refine the per-frame rate with at most 10 Newton-style steps:
+       rdiff is the budget mismatch at the current guess and rderiv is its
+       derivative with respect to that guess.*/
+    for(i=0;i<10;i++){
+      ogg_int64_t rdiff;
+      ogg_int64_t rderiv;
+      ogg_int64_t log_rpow;
+      ogg_int64_t rscale;
+      ogg_int64_t drscale;
+      ogg_int64_t bias;
+      prevr=curr;
+      log_rpow=oc_blog64(prevr)-log_scale0;
+      log_rpow=(log_rpow+(_enc->rc.exp[_qti]>>1))/_enc->rc.exp[_qti]*
+       _enc->rc.exp[1-_qti];
+      rscale=nframes[1-_qti]*KEY_RATIO[1-_qti]*
+       oc_bexp64(log_scale1+log_rpow);
+      rdiff=nframes[_qti]*KEY_RATIO[_qti]*prevr+rscale-(rate_total<<5);
+      drscale=(rscale+(_enc->rc.exp[_qti]>>1))/_enc->rc.exp[_qti]*
+       _enc->rc.exp[1-_qti]/prevr;
+      rderiv=nframes[_qti]*KEY_RATIO[_qti]+drscale;
+      if(rderiv==0)break;
+      /*bias is rderiv with the sign of the quotient rdiff/rderiv, so that
+         adding it to 2*rdiff before dividing by 2*rderiv rounds to nearest.*/
+      bias=rderiv+OC_SIGNMASK(rdiff^rderiv)^OC_SIGNMASK(rdiff^rderiv);
+      curr=prevr-((rdiff<<1)+bias)/(rderiv<<1);
+      realr=curr*KEY_RATIO[_qti]+16>>5;
+      /*Stop on a non-positive or over-budget rate, or on convergence.*/
+      if(curr<=0||realr>rate_total||prevr==curr)break;
+    }
+    log_qtarget=OC_Q57(2)-((oc_blog64(realr)-log_scale0+(_enc->rc.exp[_qti]>>1))/
+     _enc->rc.exp[_qti]<<6);
+    log_qtarget=OC_MINI(log_qtarget,OC_QUANT_MAX_LOG);
+  }
+  /*If this was not one of the initial frames, limit the change in quality.*/
+  old_qi=_enc->state.qis[0];
+  if(_clamp){
+    ogg_int64_t log_qmin;
+    ogg_int64_t log_qmax;
+    /*Clamp the target quantizer to within [0.8*Q,1.25*Q], where Q is the
+       current quantizer.
+      The constant below is log2(1.25) in Q57 format, applied symmetrically in
+       the log domain.
+      TODO: With user-specified quant matrices, we need to enlarge these limits
+       if they don't actually let us change qi values.*/
+    log_qmin=_enc->log_qavg[_qti][old_qi]-0x00A4D3C25E68DC58LL;
+    log_qmax=_enc->log_qavg[_qti][old_qi]+0x00A4D3C25E68DC58LL;
+    log_qtarget=OC_CLAMPI(log_qmin,log_qtarget,log_qmax);
+  }
+  /*Search for the quantizer that matches the target most closely.
+    We don't assume a linear ordering, but when there are ties we do pick the
+     quantizer closest to the current one.*/
+  best_qi=_enc->state.info.quality;
+  best_qdiff=_enc->log_qavg[_qti][best_qi]-log_qtarget;
+  /*Branchless absolute value: (x+mask)^mask with mask the sign of x.*/
+  best_qdiff=best_qdiff+OC_SIGNMASK(best_qdiff)^OC_SIGNMASK(best_qdiff);
+  for(qi=_enc->state.info.quality+1;qi<64;qi++){
+    ogg_int64_t qdiff;
+    qdiff=_enc->log_qavg[_qti][qi]-log_qtarget;
+    qdiff=qdiff+OC_SIGNMASK(qdiff)^OC_SIGNMASK(qdiff);
+    if(qdiff<best_qdiff||
+     qdiff==best_qdiff&&abs(qi-old_qi)<abs(best_qi-old_qi)){
+      best_qi=qi;
+      best_qdiff=qdiff;
+    }
+  }
+  /*Save the quantizer target for lambda calculations.*/
+  _enc->rc.log_qtarget=log_qtarget;
+  return best_qi;
+}

Copied: branches/theora-thusnelda/lib/enc/tokenize.c (from rev 16052, branches/theora-thusnelda/lib/enc/dct_encode.c)
===================================================================
--- branches/theora-thusnelda/lib/enc/tokenize.c	                        (rev 0)
+++ branches/theora-thusnelda/lib/enc/tokenize.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -0,0 +1,722 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id$
+
+ ********************************************************************/
+#include <stdlib.h>
+#include <string.h>
+#include "encint.h"
+
+
+
+/*Returns the token used to code an EOB run of the given length, without
+   computing its extra bits.
+  _run_count: The length of the run (expected to be positive).*/
+static int oc_make_eob_token(int _run_count){
+  int cat;
+  /*Runs of length 1, 2 and 3 each have a dedicated token.*/
+  if(_run_count<4)return OC_DCT_EOB1_TOKEN+_run_count-1;
+  /*Longer runs select a repeat-run category from the position of the run
+     length's top bit, capped at the last category.*/
+  cat=OC_ILOGNZ_32(_run_count)-3;
+  cat=OC_MINI(cat,3);
+  return OC_DCT_REPEAT_RUN0_TOKEN+cat;
+}
+
+/*Returns the token used to code an EOB run of the given length, along with
+   its extra bits.
+  _run_count: The length of the run (expected to be positive).
+  _eb:        Returns the extra bits value that accompanies the token.*/
+static int oc_make_eob_token_full(int _run_count,int *_eb){
+  int cat;
+  if(_run_count<4){
+    /*The short run tokens carry no extra bits.*/
+    *_eb=0;
+    return OC_DCT_EOB1_TOKEN+_run_count-1;
+  }
+  cat=OC_ILOGNZ_32(_run_count)-3;
+  cat=OC_MINI(cat,3);
+  /*The extra bits hold the run length minus the category's base length
+     (4, 8, 16, or 0 for the last category).*/
+  *_eb=_run_count-OC_BYTE_TABLE32(4,8,16,0,cat);
+  return OC_DCT_REPEAT_RUN0_TOKEN+cat;
+}
+
+/*Returns the negated result of oc_dct_token_skip() for the given token and
+   extra bits.
+  The caller in this file only uses it on tokens it has already identified as
+   EOB run tokens, and treats the result as the length of the run
+   (presumably oc_dct_token_skip() reports EOB runs as negative counts; confirm
+   against its definition).*/
+static int oc_decode_eob_token(int _token,int _eb){
+  return -oc_dct_token_skip(_token,_eb);
+}
+
+/*Returns the token that would be used to code a coefficient value, possibly
+   combined with the zero run that precedes it, without computing its extra
+   bits.
+  _zzi: The zig-zag index at which the token starts.
+  _zzj: The zig-zag index of the coefficient; _zzj-_zzi is the length of the
+         zero run preceding it.
+  _val: The quantized coefficient value.
+  Return: The token.
+          If this is a pure zero run token, the value itself is not covered
+           and still has to be coded separately.*/
+static int oc_make_dct_token(int _zzi,int _zzj,int _val){
+  int run;
+  int mag;
+  mag=abs(_val);
+  run=_zzj-_zzi;
+  if(run>0){
+    int adj;
+    /*Implement a minor restriction so that we know that extending a combo
+       token from stack 1 will never overflow during DC fix-ups.*/
+    adj=_zzi!=1;
+    if(mag<2&&run<17+adj){
+      /*A zero run followed by a value of magnitude less than 2.*/
+      if(run<6)return OC_DCT_RUN_CAT1A+run-1;
+      if(run<10)return OC_DCT_RUN_CAT1B;
+      return OC_DCT_RUN_CAT1C;
+    }
+    if(mag<4&&run<3+adj){
+      /*A short zero run followed by a value of magnitude less than 4.*/
+      return run<2?OC_DCT_RUN_CAT2A:OC_DCT_RUN_CAT2B;
+    }
+    /*No combo token fits: code just the zero run.*/
+    return run<9?OC_DCT_SHORT_ZRL_TOKEN:OC_DCT_ZRL_TOKEN;
+  }
+  /*No zero run: pick the value token by magnitude.*/
+  if(mag<3)return OC_ONE_TOKEN+(mag-1<<1)+(_val<0);
+  if(mag<7)return OC_DCT_VAL_CAT2+mag-3;
+  if(mag<9)return OC_DCT_VAL_CAT3;
+  if(mag<13)return OC_DCT_VAL_CAT4;
+  if(mag<21)return OC_DCT_VAL_CAT5;
+  if(mag<37)return OC_DCT_VAL_CAT6;
+  if(mag<69)return OC_DCT_VAL_CAT7;
+  return OC_DCT_VAL_CAT8;
+}
+
+/*Returns the token used to code a coefficient value, possibly combined with
+   the zero run that precedes it, along with its extra bits.
+  _zzi: The zig-zag index at which the token starts.
+  _zzj: The zig-zag index of the coefficient; _zzj-_zzi is the length of the
+         zero run preceding it.
+  _val: The quantized coefficient value.
+  _eb:  Returns the extra bits value that accompanies the token.
+  Return: The token.
+          If this is a pure zero run token, the value itself is not covered
+           and still has to be coded separately.*/
+static int oc_make_dct_token_full(int _zzi,int _zzj,int _val,int *_eb){
+  int sign;
+  int mag;
+  int run;
+  sign=_val<0;
+  mag=abs(_val);
+  run=_zzj-_zzi;
+  if(run>0){
+    int adj;
+    /*Implement a minor restriction on stack 1 so that we know during DC fixups
+       that extending a dctrun token from stack 1 will never overflow.*/
+    adj=_zzi!=1;
+    if(mag<2&&run<17+adj){
+      /*Zero run plus a small value: the extra bits hold the run offset in the
+         low bits and the sign just above them.*/
+      if(run<6){
+        *_eb=sign;
+        return OC_DCT_RUN_CAT1A+run-1;
+      }
+      if(run<10){
+        *_eb=run-6+(sign<<2);
+        return OC_DCT_RUN_CAT1B;
+      }
+      *_eb=run-10+(sign<<3);
+      return OC_DCT_RUN_CAT1C;
+    }
+    if(mag<4&&run<3+adj){
+      /*Short zero run plus a value: the extra bits also hold the magnitude
+         offset.*/
+      if(run<2){
+        *_eb=mag-2+(sign<<1);
+        return OC_DCT_RUN_CAT2A;
+      }
+      *_eb=run-2+(mag-2<<1)+(sign<<2);
+      return OC_DCT_RUN_CAT2B;
+    }
+    /*No combo token fits: code just the zero run.*/
+    *_eb=run-1;
+    return run<9?OC_DCT_SHORT_ZRL_TOKEN:OC_DCT_ZRL_TOKEN;
+  }
+  /*No zero run: pick the value token by magnitude.
+    The extra bits hold the offset from the category's smallest magnitude,
+     with the sign in the bit just above it.*/
+  if(mag<3){
+    *_eb=0;
+    return OC_ONE_TOKEN+(mag-1<<1)+sign;
+  }
+  if(mag<7){
+    *_eb=sign;
+    return OC_DCT_VAL_CAT2+mag-3;
+  }
+  if(mag<9){
+    *_eb=mag-7+(sign<<1);
+    return OC_DCT_VAL_CAT3;
+  }
+  if(mag<13){
+    *_eb=mag-9+(sign<<2);
+    return OC_DCT_VAL_CAT4;
+  }
+  if(mag<21){
+    *_eb=mag-13+(sign<<3);
+    return OC_DCT_VAL_CAT5;
+  }
+  if(mag<37){
+    *_eb=mag-21+(sign<<4);
+    return OC_DCT_VAL_CAT6;
+  }
+  if(mag<69){
+    *_eb=mag-37+(sign<<5);
+    return OC_DCT_VAL_CAT7;
+  }
+  *_eb=mag-69+(sign<<9);
+  return OC_DCT_VAL_CAT8;
+}
+
+/*Token logging to allow a few fragments of efficient rollback.
+  Late SKIP analysis is tied up in the tokenization process, so we need to be
+   able to undo a fragment's tokens on a whim.*/
+
+/*The offset added to a base Huffman table index to select the table group
+   for a given zig-zag index:
+    zzi 0:       offset  0
+    zzi 1...5:   offset 16
+    zzi 6...14:  offset 32
+    zzi 15...27: offset 48
+    zzi 28...63: offset 64
+  All 64 entries must be listed explicitly: any entry left out would be
+   zero-initialized and silently select the zzi 0 group.*/
+static const unsigned char OC_ZZI_HUFF_OFFSET[64]={
+   0,16,16,16,16,16,32,32,
+  32,32,32,32,32,32,32,48,
+  48,48,48,48,48,48,48,48,
+  48,48,48,48,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
+};
+
+/*Returns the number of bits needed to code a token: the length of its
+   Huffman code plus the number of extra bits it carries.
+  _huffi: The base Huffman table index.
+  _zzi:   The zig-zag index the token is coded at; it selects the table group
+           via OC_ZZI_HUFF_OFFSET.
+  _token: The token.*/
+static int oc_token_bits(oc_enc_ctx *_enc,int _huffi,int _zzi,int _token){
+  int tablei;
+  tablei=_huffi+OC_ZZI_HUFF_OFFSET[_zzi];
+  return _enc->huff_codes[tablei][_token].nbits
+   +OC_DCT_TOKEN_EXTRA_BITS[_token];
+}
+
+/*Records the current token count and pending EOB run for one (plane,
+   zig-zag index) token list, so that it can be restored later by
+   oc_enc_tokenlog_rollback().
+  _cp:  The checkpoint to fill in.
+  _pli: The color plane index.
+  _zzi: The zig-zag index.*/
+static void oc_enc_tokenlog_checkpoint(oc_enc_ctx *_enc,
+ oc_token_checkpoint *_cp,int _pli,int _zzi){
+  /*The saved state.*/
+  _cp->ndct_tokens=_enc->ndct_tokens[_pli][_zzi];
+  _cp->eob_run=_enc->eob_run[_pli][_zzi];
+  /*Where it was saved from.*/
+  _cp->zzi=_zzi;
+  _cp->pli=_pli;
+}
+
+/*Restores the token counts and pending EOB runs saved in a list of
+   checkpoints.
+  The checkpoints are applied from the last to the first, so when several of
+   them refer to the same token list, the earliest one determines the final
+   state.
+  _stack: The list of checkpoints.
+  _n:     The number of checkpoints in the list.*/
+void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
+ const oc_token_checkpoint *_stack,int _n){
+  int ci;
+  for(ci=_n-1;ci>=0;ci--){
+    const oc_token_checkpoint *cp;
+    cp=_stack+ci;
+    _enc->eob_run[cp->pli][cp->zzi]=cp->eob_run;
+    _enc->ndct_tokens[cp->pli][cp->zzi]=cp->ndct_tokens;
+  }
+}
+
+/*Appends a token and its extra bits to the token list for the given plane
+   and zig-zag index.
+  _token: The token; stored as an unsigned char.
+  _eb:    The extra bits value; stored as an ogg_uint16_t.*/
+static void oc_enc_token_log(oc_enc_ctx *_enc,
+ int _pli,int _zzi,int _token,int _eb){
+  ptrdiff_t slot;
+  /*The next free slot is the current token count.*/
+  slot=_enc->ndct_tokens[_pli][_zzi];
+  _enc->extra_bits[_pli][_zzi][slot]=(ogg_uint16_t)_eb;
+  _enc->dct_tokens[_pli][_zzi][slot]=(unsigned char)_token;
+  _enc->ndct_tokens[_pli][_zzi]++;
+}
+
+/*Appends the token for an EOB run of the given length to the token list for
+   the given plane and zig-zag index.
+  _run_count: The length of the run.*/
+static void oc_enc_eob_log(oc_enc_ctx *_enc,
+ int _pli,int _zzi,int _run_count){
+  int extra;
+  int tok;
+  tok=oc_make_eob_token_full(_run_count,&extra);
+  oc_enc_token_log(_enc,_pli,_zzi,tok,extra);
+}
+
+/*Logs the token for a coefficient value (and any zero run preceding it) in
+   the token list for index _zzi, first flushing any EOB run pending there.
+  _zzi: The zig-zag index at which the token starts.
+  _zzj: The zig-zag index of the coefficient.
+  _val: The quantized coefficient value.
+  Return: 0 if only the zero run preceding the value was tokenized, so the
+           value must still be coded at index _zzj, or a non-zero value
+           otherwise.*/
+static int oc_enc_tokenize_dctval(oc_enc_ctx *_enc,int _pli,
+ int _zzi,int _zzj,int _val){
+  int pending;
+  int extra;
+  int tok;
+  /*Emit pending EOB run if any.*/
+  pending=_enc->eob_run[_pli][_zzi];
+  if(pending>0){
+    oc_enc_eob_log(_enc,_pli,_zzi,pending);
+    _enc->eob_run[_pli][_zzi]=0;
+  }
+  tok=oc_make_dct_token_full(_zzi,_zzj,_val,&extra);
+  oc_enc_token_log(_enc,_pli,_zzi,tok,extra);
+  /*A zero value never needs a token of its own.*/
+  if(_val==0)return 1;
+  /*A pure zero run token leaves the value still to be coded.*/
+  return tok!=OC_DCT_SHORT_ZRL_TOKEN&&tok!=OC_DCT_ZRL_TOKEN;
+}
+
+/*Adds one block to the EOB run pending for the given plane and zig-zag
+   index, flushing the run to the token list once it reaches 4095 blocks
+   (presumably the longest run a single EOB token can represent; confirm
+   against the token definitions).*/
+static void oc_enc_tokenize_eobrun(oc_enc_ctx *_enc,int _pli,int _zzi){
+  int run;
+  run=_enc->eob_run[_pli][_zzi];
+  run+=1;
+  if(run>=4095){
+    oc_enc_eob_log(_enc,_pli,_zzi,run);
+    /*Start a fresh run.*/
+    run=0;
+  }
+  _enc->eob_run[_pli][_zzi]=run;
+}
+
+/*Estimates the opportunity cost, in bits, of coding a DCT coefficient.
+  This is the cost of flushing any pending EOB run plus the cost of the
+   coefficient's token(s), which encourages us to keep long EOB runs going in
+   the higher/chroma coefficients.
+  Technically the flush cost should be weighted by the probability that we
+   expect a future fragment to continue the run, but that's qi- and
+   zzi-dependent.
+  Nothing is actually logged.
+  Note: Assumes AC coefficients only (_zzi>0).
+  _zzi: The zig-zag index at which the token starts.
+  _zzj: The zig-zag index of the coefficient.
+  _val: The quantized coefficient value.*/
+static int oc_enc_tokenize_dctval_bits(oc_enc_ctx *_enc,int _pli,
+ int _zzi,int _zzj,int _val){
+  int huffi;
+  int pending;
+  int tok;
+  int nbits;
+  huffi=_enc->huff_idxs[_enc->state.frame_type][1][_pli+1>>1];
+  nbits=0;
+  /*If there was an EOB run pending, count the cost of flushing it.*/
+  pending=_enc->eob_run[_pli][_zzi];
+  if(pending){
+    nbits+=oc_token_bits(_enc,huffi,_zzi,oc_make_eob_token(pending));
+  }
+  /*Count the cost of the token.*/
+  tok=oc_make_dct_token(_zzi,_zzj,_val);
+  nbits+=oc_token_bits(_enc,huffi,_zzi,tok);
+  /*Anything but a pure zero run has already covered the value.*/
+  if(tok!=OC_DCT_SHORT_ZRL_TOKEN&&tok!=OC_DCT_ZRL_TOKEN)return nbits;
+  /*Otherwise the value is coded on its own at index _zzj, which may first
+     require flushing an EOB run pending there.*/
+  pending=_enc->eob_run[_pli][_zzj];
+  if(pending){
+    nbits+=oc_token_bits(_enc,huffi,_zzj,oc_make_eob_token(pending));
+  }
+  nbits+=oc_token_bits(_enc,huffi,_zzj,oc_make_dct_token(_zzj,_zzj,_val));
+  return nbits;
+}
+
+/*Estimates the opportunity cost, in bits, of adding one more block to the
+   EOB run pending at the given plane and zig-zag index.
+  For an in-progress run of size N, this is the cost of flushing a run of size
+   N+1 minus the cost of flushing a run of size N.
+  With no run in progress, it is the cost of a single-block EOB token.
+  Note: Assumes AC coefficients only (_zzi>0).*/
+static int oc_enc_tokenize_eobrun_bits(oc_enc_ctx *_enc,int _pli,int _zzi){
+  int huffi;
+  int run;
+  huffi=_enc->huff_idxs[_enc->state.frame_type][1][_pli+1>>1];
+  run=_enc->eob_run[_pli][_zzi];
+  if(run<=0)return oc_token_bits(_enc,huffi,_zzi,OC_DCT_EOB1_TOKEN);
+  /*Note: We must be able to add another block to this run, or we would have
+     flushed it already.*/
+  return oc_token_bits(_enc,huffi,_zzi,oc_make_eob_token(run+1))
+   -oc_token_bits(_enc,huffi,_zzi,oc_make_eob_token(run));
+}
+
+
+/*Resets the tokenizer state: the DC prediction history, the token offsets,
+   the pending EOB runs, and the token counts are all cleared to zero.*/
+void oc_enc_tokenize_start(oc_enc_ctx *_enc){
+  memset(_enc->dc_pred_last,0,sizeof(_enc->dc_pred_last));
+  memset(_enc->dct_token_offs,0,sizeof(_enc->dct_token_offs));
+  memset(_enc->eob_run,0,sizeof(_enc->eob_run));
+  memset(_enc->ndct_tokens,0,sizeof(_enc->ndct_tokens));
+}
+
+/*Tokenizes the AC coefficients of a single block, optionally applying R-D
+   optimization to each quantized value.
+  No final DC to encode yet (DC prediction hasn't been done), so simply assume
+   there will be a nonzero DC value and code.
+  That's not a true assumption but it can be fixed-up as DC is being tokenized
+   later.
+  _fragi:   The fragment index (not used by this function's body).
+  _qdct:    The quantized coefficients, indexed by zig-zag index.
+            Values at or past _acmin may be reduced in magnitude or zeroed in
+             place.
+  _dequant: The dequantization factors, indexed by zig-zag index.
+  _dct:     The unquantized coefficients, looked up through OC_FZIG_ZAG.
+  _pli:     The color plane index.
+  _stack:   On entry, the next free token checkpoint; on return, advanced
+             past every checkpoint recorded here, so the caller can roll the
+             block back.
+  _acmin:   The first zig-zag index at which R-D optimization is allowed.
+  Return: The estimated number of bits used by the tokens logged (excluding
+           the block's final EOB).*/
+int oc_enc_tokenize_ac(oc_enc_ctx *_enc,ptrdiff_t _fragi,ogg_int16_t *_qdct,
+ const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,int _pli,
+ oc_token_checkpoint **_stack,int _acmin){
+  oc_token_checkpoint *stack;
+  int                  zzi;
+  int                  zzj;
+  int                  total_bits;
+  int                  lambda;
+  stack=*_stack;
+  lambda=_enc->lambda;
+  total_bits=0;
+  /*Skip DC for now.*/
+  zzi=1;
+  /*zzi is where the next token starts; zzj is the next non-zero coefficient
+     at or after it (64 if there is none).*/
+  for(zzj=zzi;!_qdct[zzj]&&++zzj<64;);
+  while(zzj<64){
+    int v;
+    int d;
+    int mask;
+    int best_bits;
+    int best_d;
+    int zzk;
+    int k;
+    v=_dct[OC_FZIG_ZAG[zzj]];
+    d=_qdct[zzj];
+    /*zzk is the non-zero coefficient following zzj (64 if there is none).*/
+    for(zzk=zzj+1;zzk<64&&!_qdct[zzk];zzk++);
+    /*Only apply R-D optimization if we're past the minimum allowed.*/
+    if(zzj>=_acmin){
+      int best_cost;
+      int bits2;
+      /*Start with the cost of zeroing this coefficient: best_bits is the cost
+         of whatever would then follow directly from zzi, and bits2 is the
+         cost of that same follower when this coefficient is kept.*/
+      if(zzk>=64){
+        best_bits=oc_enc_tokenize_eobrun_bits(_enc,_pli,zzi);
+        if(zzj+1<64)bits2=oc_enc_tokenize_eobrun_bits(_enc,_pli,zzj+1);
+        else bits2=0;
+      }
+      else{
+        best_bits=oc_enc_tokenize_dctval_bits(_enc,_pli,zzi,zzk,_qdct[zzk]);
+        bits2=oc_enc_tokenize_dctval_bits(_enc,_pli,zzj+1,zzk,_qdct[zzk]);
+      }
+      /*Cost is squared error plus lambda times the bits.*/
+      best_cost=v*v+best_bits*lambda;
+      best_d=0;
+      mask=OC_SIGNMASK(d);
+      /*Try every magnitude from the original down to 1, keeping the sign.*/
+      for(k=abs(d);k>0;k--){
+        int dk;
+        int dd;
+        int bits;
+        int cost;
+        /*Apply the sign of d to k.*/
+        dk=k+mask^mask;
+        dd=dk*_dequant[zzj]-v;
+        bits=oc_enc_tokenize_dctval_bits(_enc,_pli,zzi,zzj,dk);
+        cost=dd*dd+(bits+bits2)*lambda;
+        /*On ties, the smaller magnitude (tried later) wins.*/
+        if(cost<=best_cost){
+          best_cost=cost;
+          best_bits=bits;
+          best_d=dk;
+        }
+      }
+      _qdct[zzj]=best_d;
+      if(best_d==0){
+        /*The coefficient was zeroed: extend the zero run to the next non-zero
+           coefficient without emitting anything.*/
+        zzj=zzk;
+        continue;
+      }
+    }
+    else{
+      best_d=d;
+      best_bits=oc_enc_tokenize_dctval_bits(_enc,_pli,zzi,zzj,best_d);
+    }
+    total_bits+=best_bits;
+    oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
+    if(!oc_enc_tokenize_dctval(_enc,_pli,zzi,zzj,best_d)){
+      /*Only the zero run was coded; the value goes in the list for zzj.*/
+      oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzj);
+      oc_enc_tokenize_dctval(_enc,_pli,zzj,zzj,best_d);
+    }
+    zzi=zzj+1;
+    zzj=zzk;
+  }
+  if(zzi<64){
+    /*We don't include the actual EOB cost for this block.
+      It will be paid for by the fragment that terminates the EOB run.
+    total_bits+=oc_enc_tokenize_eobrun_bits(_enc,_pli,zzi);*/
+    oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
+    oc_enc_tokenize_eobrun(_enc,_pli,zzi);
+  }
+  *_stack=stack;
+  return total_bits;
+}
+
+/*Computes the DC prediction residual for every coded fragment in a range of
+   fragment rows of one plane.
+  The residuals are stored in _enc->frag_dc, and _enc->dc_pred_last[_pli] is
+   updated with the DC value of each coded fragment, indexed by the reference
+   frame its coding mode uses.
+  _pli:  The color plane index.
+  _y0:   The first fragment row to process.
+  _yend: One past the last fragment row to process.*/
+static void oc_enc_pred_dc_rows(oc_enc_ctx *_enc,int _pli,int _y0,int _yend){
+  const oc_fragment_plane *fplane;
+  const oc_fragment       *frags;
+  ogg_int16_t             *frag_dc;
+  ptrdiff_t                fragi;
+  int                     *pred_last;
+  int                      nhfrags;
+  int                      fragx;
+  int                      fragy;
+  fplane=_enc->state.fplanes+_pli;
+  frags=_enc->state.frags;
+  frag_dc=_enc->frag_dc;
+  pred_last=_enc->dc_pred_last[_pli];
+  nhfrags=fplane->nhfrags;
+  /*Fragments are visited in raster order starting from row _y0.*/
+  fragi=fplane->froffset+_y0*nhfrags;
+  for(fragy=_y0;fragy<_yend;fragy++){
+    for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+      /*Uncoded fragments get no residual and leave the history untouched.*/
+      if(frags[fragi].coded){
+        frag_dc[fragi]=frags[fragi].dc
+         -oc_frag_pred_dc(frags+fragi,fplane,fragx,fragy,pred_last);
+        pred_last[OC_FRAME_FOR_MODE[frags[fragi].mb_mode]]=frags[fragi].dc;
+      }
+    }
+  }
+}
+
+static void oc_enc_tokenize_dc(oc_enc_ctx *_enc){
+  const ogg_int16_t *frag_dc;
+  const ptrdiff_t   *coded_fragis;
+  ptrdiff_t          ncoded_fragis;
+  ptrdiff_t          fragii;
+  int                pli;
+  frag_dc=_enc->frag_dc;
+  coded_fragis=_enc->state.coded_fragis;
+  ncoded_fragis=fragii=0;
+  for(pli=0;pli<3;pli++){
+    unsigned char *dct_tokens0;
+    unsigned char *dct_tokens1;
+    ogg_uint16_t  *extra_bits0;
+    ogg_uint16_t  *extra_bits1;
+    ptrdiff_t      ti0;
+    ptrdiff_t      ti1r;
+    ptrdiff_t      ti1w;
+    int            eob_run0;
+    int            eob_run1;
+    int            neobs1;
+    int            token;
+    int            eb;
+    int            token1;
+    int            eb1;
+    /*TODO: Move this inline with reconstruction.*/
+    oc_enc_pred_dc_rows(_enc,pli,0,_enc->state.fplanes[pli].nvfrags);
+    dct_tokens0=_enc->dct_tokens[pli][0];
+    dct_tokens1=_enc->dct_tokens[pli][1];
+    extra_bits0=_enc->extra_bits[pli][0];
+    extra_bits1=_enc->extra_bits[pli][1];
+    ncoded_fragis+=_enc->state.ncoded_fragis[pli];
+    ti0=ti1w=ti1r=0;
+    eob_run0=eob_run1=neobs1=0;
+    for(;fragii<ncoded_fragis;fragii++){
+      int val;
+      /*All tokens in the 1st AC coefficient stack are regenerated as the DC
+         coefficients are produced.
+        This can be done in-place; stack 1 cannot get larger.*/
+      if(!neobs1){
+        /*There's no active EOB run in stack 1; read the next token.*/
+        token1=dct_tokens1[ti1r];
+        eb1=extra_bits1[ti1r];
+        ti1r++;
+        if(token1<OC_NDCT_EOB_TOKEN_MAX){
+          neobs1=oc_decode_eob_token(token1,eb1);
+          /*It's an EOB run; add it to the current (inactive) one.
+            Because we may have moved entries to stack 0, we may have an
+             opportunity to merge two EOB runs in stack 1.*/
+          eob_run1+=neobs1;
+        }
+      }
+      val=frag_dc[coded_fragis[fragii]];
+      if(val){
+        /*There was a non-zero DC value, so there's no alteration to stack 1
+           for this fragment; just code the stack 0 token.*/
+        /*Flush any pending EOB run.*/
+        if(eob_run0>0){
+          token=oc_make_eob_token_full(eob_run0,&eb);
+          dct_tokens0[ti0]=(unsigned char)token;
+          extra_bits0[ti0]=(ogg_uint16_t)eb;
+          ti0++;
+          eob_run0=0;
+        }
+        token=oc_make_dct_token_full(0,0,val,&eb);
+        dct_tokens0[ti0]=(unsigned char)token;
+        extra_bits0[ti0]=(ogg_uint16_t)eb;
+        ti0++;
+      }
+      else{
+        /*Zero DC value; that means the entry in stack 1 might need to be coded
+           from stack 0.
+          This requires a stack 1 fixup.*/
+        if(neobs1){
+          /*We're in the middle of an active EOB run in stack 1.
+            Move it to stack 0.*/
+          if(++eob_run0>=4095){
+            token=oc_make_eob_token_full(eob_run0,&eb);
+            dct_tokens0[ti0]=(unsigned char)token;
+            extra_bits0[ti0]=(ogg_uint16_t)eb;
+            ti0++;
+            eob_run0=0;
+          }
+          eob_run1--;
+        }
+        else{
+          /*No active EOB run in stack 1, so we can't extend one in stack 0.
+            Flush it if we've got it.*/
+          if(eob_run0>0){
+            token=oc_make_eob_token_full(eob_run0,&eb);
+            dct_tokens0[ti0]=(unsigned char)token;
+            extra_bits0[ti0]=(ogg_uint16_t)eb;
+            ti0++;
+            eob_run0=0;
+          }
+          /*Stack 1 token is one of: a pure zero run token, a single
+             coefficient token, or a zero run/coefficient combo token.
+            A zero run token is expanded and moved to token stack 0, and the
+             stack 1 entry dropped.
+            A single coefficient value may be transformed into combo token that
+             is moved to stack 0, or if it cannot be combined, it is left alone
+             and a single length-1 zero run is emitted in stack 0.
+            A combo token is extended and moved to stack 0.
+            During AC coding, we restrict the run lengths on combo tokens for
+             stack 1 to guarantee we can extend them.*/
+          switch(token1){
+            case OC_DCT_SHORT_ZRL_TOKEN:{
+              if(eb1<7){
+                dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
+                extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
+                ti0++;
+                /*Don't write the AC coefficient back out.*/
+                continue;
+              }
+              /*Fall through.*/
+            }
+            case OC_DCT_ZRL_TOKEN:{
+              dct_tokens0[ti0]=OC_DCT_ZRL_TOKEN;
+              extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
+              ti0++;
+              /*Don't write the AC coefficient back out.*/
+            }continue;
+            case OC_ONE_TOKEN:
+            case OC_MINUS_ONE_TOKEN:{
+              dct_tokens0[ti0]=OC_DCT_RUN_CAT1A;
+              extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_ONE_TOKEN);
+              ti0++;
+              /*Don't write the AC coefficient back out.*/
+            }continue;
+            case OC_TWO_TOKEN:
+            case OC_MINUS_TWO_TOKEN:{
+              dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
+              extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_TWO_TOKEN<<1);
+              ti0++;
+              /*Don't write the AC coefficient back out.*/
+            }continue;
+            case OC_DCT_VAL_CAT2:{
+              dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
+              extra_bits0[ti0]=(ogg_uint16_t)((eb1<<1)+1);
+              ti0++;
+              /*Don't write the AC coefficient back out.*/
+            }continue;
+            case OC_DCT_RUN_CAT1A:
+            case OC_DCT_RUN_CAT1A+1:
+            case OC_DCT_RUN_CAT1A+2:
+            case OC_DCT_RUN_CAT1A+3:{
+              dct_tokens0[ti0]=(unsigned char)(token1+1);
+              extra_bits0[ti0]=(ogg_uint16_t)eb1;
+              ti0++;
+              /*Don't write the AC coefficient back out.*/
+            }continue;
+            case OC_DCT_RUN_CAT1A+4:{
+              dct_tokens0[ti0]=OC_DCT_RUN_CAT1B;
+              extra_bits0[ti0]=(ogg_uint16_t)(eb1<<2);
+              ti0++;
+              /*Don't write the AC coefficient back out.*/
+            }continue;
+            case OC_DCT_RUN_CAT1B:{
+              if((eb1&3)<3){
+                dct_tokens0[ti0]=OC_DCT_RUN_CAT1B;
+                extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
+                ti0++;
+                /*Don't write the AC coefficient back out.*/
+                continue;
+              }
+              eb1=((eb1&4)<<1)-1;
+              /*Fall through.*/
+            }
+            case OC_DCT_RUN_CAT1C:{
+              dct_tokens0[ti0]=OC_DCT_RUN_CAT1C;
+              extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
+              ti0++;
+              /*Don't write the AC coefficient back out.*/
+            }continue;
+            case OC_DCT_RUN_CAT2A:{
+              eb1=(eb1<<1)-1;
+              /*Fall through.*/
+            }
+            case OC_DCT_RUN_CAT2B:{
+              dct_tokens0[ti0]=OC_DCT_RUN_CAT2B;
+              extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
+              ti0++;
+              /*Don't write the AC coefficient back out.*/
+            }continue;
+          }
+          /*We can't merge tokens, write a short zero run and keep going.*/
+          dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
+          extra_bits0[ti0]=0;
+          ti0++;
+        }
+      }
+      if(!neobs1){
+        /*Flush any (inactive) EOB run.*/
+        if(eob_run1>0){
+          token=oc_make_eob_token_full(eob_run1,&eb);
+          dct_tokens1[ti1w]=(unsigned char)token;
+          extra_bits1[ti1w]=(ogg_uint16_t)eb;
+          ti1w++;
+          eob_run1=0;
+        }
+        /*There's no active EOB run, so log the current token.*/
+        dct_tokens1[ti1w]=(unsigned char)token1;
+        extra_bits1[ti1w]=(ogg_uint16_t)eb1;
+        ti1w++;
+      }
+      else{
+        /*Otherwise consume one EOB from the current run.*/
+        neobs1--;
+        /*If we have more than 4095 EOBs outstanding in stack1, flush the run.*/
+        if(eob_run1-neobs1>=4095){
+          token=oc_make_eob_token_full(4095,&eb);
+          dct_tokens1[ti1w]=(unsigned char)token;
+          extra_bits1[ti1w]=(ogg_uint16_t)eb;
+          ti1w++;
+          eob_run1-=4095;
+        }
+      }
+    }
+    /*Flush the trailing EOB runs.*/
+    if(eob_run0>0){
+      token=oc_make_eob_token_full(eob_run0,&eb);
+      dct_tokens0[ti0]=(unsigned char)token;
+      extra_bits0[ti0]=(ogg_uint16_t)eb;
+      ti0++;
+    }
+    if(eob_run1>0){
+      token=oc_make_eob_token_full(eob_run1,&eb);
+      dct_tokens1[ti1w]=(unsigned char)token;
+      extra_bits1[ti1w]=(ogg_uint16_t)eb;
+      ti1w++;
+    }
+    _enc->ndct_tokens[pli][0]=ti0;
+    _enc->ndct_tokens[pli][1]=ti1w;
+  }
+}
+
+/*DC prediction, post-facto DC tokenization (has to be completed after DC
+   predict), AC coefficient fix-ups and EOB run welding.*/
+void oc_enc_tokenize_finish(oc_enc_ctx *_enc){
+  int pli;
+  int zzi;
+  /*Emit final EOB runs for the AC coefficients.
+    This must be done before we tokenize the DC coefficients, so we can
+     properly track the 1st AC coefficient to the end of the list.*/
+  for(pli=0;pli<3;pli++)for(zzi=1;zzi<64;zzi++){
+    int eob_run;
+    eob_run=_enc->eob_run[pli][zzi];
+    if(eob_run>0)oc_enc_eob_log(_enc,pli,zzi,eob_run);
+  }
+  /*Fill in the DC token list and fix-up the 1st AC coefficient.*/
+  oc_enc_tokenize_dc(_enc);
+  /*Merge the final EOB run of one token list with the start of the next, if
+     possible, visiting the lists in coefficient-major (zzi, then pli) order.*/
+  for(zzi=0;zzi<64;zzi++)for(pli=0;pli<3;pli++){
+    int       old_tok1;
+    int       old_tok2;
+    int       old_eb1;
+    int       old_eb2;
+    int       new_tok;
+    int       new_eb;
+    int       zzj;
+    int       plj;
+    ptrdiff_t ti;
+    int       run_count;
+    /*Skip this (plane, coefficient) list if it has no tokens at all.*/
+    if(_enc->ndct_tokens[pli][zzi]<=0)continue;
+    /*Ensure the first token is an EOB run (below OC_NDCT_EOB_TOKEN_MAX).*/
+    old_tok2=_enc->dct_tokens[pli][zzi][0];
+    if(old_tok2>=OC_NDCT_EOB_TOKEN_MAX)continue;
+    /*Search backwards for the nearest earlier list that still has tokens.*/
+    old_tok1=OC_NDCT_EOB_TOKEN_MAX;
+    for(zzj=zzi,plj=pli;zzj>=0;zzj--){
+      while(plj-->0){
+        ti=_enc->ndct_tokens[plj][zzj]-1;
+        if(ti>=_enc->dct_token_offs[plj][zzj]){
+          old_tok1=_enc->dct_tokens[plj][zzj][ti];
+          break;
+        }
+      }
+      if(plj>=0)break;
+      plj=3;
+    }
+    /*Ensure its last token was an EOB run (also skips if no list was found).*/
+    if(old_tok1>=OC_NDCT_EOB_TOKEN_MAX)continue;
+    /*Pull off the associated extra bits, if any, and decode the runs.*/
+    /*ti is always set here: old_tok1 only changes after ti is assigned.*/
+    old_eb1=_enc->extra_bits[plj][zzj][ti];
+    old_eb2=_enc->extra_bits[pli][zzi][0];
+    run_count=oc_decode_eob_token(old_tok1,old_eb1)
+     +oc_decode_eob_token(old_tok2,old_eb2);
+    /*We can't possibly combine these into one run.
+      It might be possible to split them more optimally, but we'll just leave
+       them as-is.*/
+    if(run_count>=4096)continue;
+    /*We CAN combine them: grow the earlier run and skip our first token.*/
+    new_tok=oc_make_eob_token_full(run_count,&new_eb);
+    _enc->dct_tokens[plj][zzj][ti]=(unsigned char)new_tok;
+    _enc->extra_bits[plj][zzj][ti]=(ogg_uint16_t)new_eb;
+    _enc->dct_token_offs[pli][zzi]++;
+  }
+}

Modified: branches/theora-thusnelda/lib/enc/toplevel_lookup.h
===================================================================
--- branches/theora-thusnelda/lib/enc/toplevel_lookup.h	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/toplevel_lookup.h	2009-06-13 16:04:06 UTC (rev 16102)
@@ -15,7 +15,7 @@
 
  ********************************************************************/
 
-#include "codec_internal.h"
+#include "encint.h"
 
 /*The default quantization parameters used by VP3.1.*/
 static const int OC_VP31_RANGE_SIZES[1]={63};

Deleted: branches/theora-thusnelda/lib/enc/x86/mmxenc.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/mmxenc.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/x86/mmxenc.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -1,64 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
- * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: dct_decode_mmx.c 15078 2008-06-27 22:07:19Z xiphmont $
-
- ********************************************************************/
-#include <string.h>
-#include "x86enc.h"
-#include "../../dec/x86/mmxloop.h"
-
-#if defined(OC_X86_ASM)
-
-/*Apply the loop filter.*/
-void oc_enc_loop_filter_mmx(CP_INSTANCE *cpi,int _flimit){
-  unsigned char OC_ALIGN8  ll[8];
-  unsigned char           *cp;
-  ogg_uint32_t            *bp;
-  int                      pli;
-  cp=cpi->frag_coded;
-  bp=cpi->frag_buffer_index;
-  if(_flimit==0)return;
-  memset(ll,_flimit,sizeof(ll));
-  for(pli=0;pli<3;pli++){
-    ogg_uint32_t *bp_begin;
-    ogg_uint32_t *bp_end;
-    int           stride;
-    int           h;
-    bp_begin=bp;
-    bp_end=bp+cpi->frag_n[pli];
-    stride=cpi->stride[pli];
-    h=cpi->frag_h[pli];
-    while(bp<bp_end){
-      ogg_uint32_t *bp_left;
-      ogg_uint32_t *bp_right;
-      bp_left=bp;
-      bp_right=bp+h;
-      for(;bp<bp_right;bp++,cp++)if(*cp){
-        if(bp>bp_left)OC_LOOP_FILTER_H_MMX(cpi->lastrecon+bp[0],stride,ll);
-        if(bp_left>bp_begin){
-          OC_LOOP_FILTER_V_MMX(cpi->lastrecon+bp[0],stride,ll);
-        }
-        if(bp+1<bp_right&&!cp[1]){
-          OC_LOOP_FILTER_H_MMX(cpi->lastrecon+bp[0]+8,stride,ll);
-        }
-        if(bp+h<bp_end&&!cp[h]){
-          OC_LOOP_FILTER_V_MMX(cpi->lastrecon+bp[h],stride,ll);
-        }
-      }
-    }
-  }
-  __asm__ __volatile__("emms\n\t");
-}
-
-#endif

Modified: branches/theora-thusnelda/lib/enc/x86/x86enc.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/x86enc.c	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/x86/x86enc.c	2009-06-13 16:04:06 UTC (rev 16102)
@@ -20,33 +20,29 @@
 
 #include "../../cpu.c"
 
-void oc_enc_vtable_init_x86(CP_INSTANCE *_cpi){
+void oc_enc_vtable_init_x86(oc_enc_ctx *_enc){
   ogg_uint32_t cpu_flags;
   cpu_flags=oc_cpu_flags_get();
-  oc_enc_vtable_init_c(_cpi);
+  oc_enc_vtable_init_c(_enc);
   if(cpu_flags&OC_CPU_X86_MMX){
-    _cpi->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
-    _cpi->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
-    _cpi->opt_vtable.frag_copy=oc_frag_copy_mmx;
-    _cpi->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
-    _cpi->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
-    _cpi->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx;
-    _cpi->opt_vtable.dequant_idct8x8=oc_dequant_idct8x8_mmx;
-    _cpi->opt_vtable.enc_loop_filter=oc_enc_loop_filter_mmx;
-    _cpi->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+    _enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
+    _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
+    _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
+    _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
+    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx;
   }
   if(cpu_flags&OC_CPU_X86_MMXEXT){
-    _cpi->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
-    _cpi->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
-    _cpi->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
-    _cpi->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_mmxext;
-    _cpi->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_mmxext;
-    _cpi->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
-    _cpi->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
+    _enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
+    _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
+    _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
+    _enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_mmxext;
+    _enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_mmxext;
+    _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
+    _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
   }
   if(cpu_flags&OC_CPU_X86_SSE2){
 # if defined(OC_X86_64_ASM)
-    _cpi->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;
+    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;
 # endif
   }
 }

Modified: branches/theora-thusnelda/lib/enc/x86/x86enc.h
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/x86enc.h	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/enc/x86/x86enc.h	2009-06-13 16:04:06 UTC (rev 16102)
@@ -17,10 +17,10 @@
 
 #if !defined(_x86_x86enc_H)
 # define _x86_x86enc_H (1)
-# include "../codec_internal.h"
+# include "../encint.h"
 # include "../../dec/x86/x86int.h"
 
-void oc_enc_vtable_init_x86(CP_INSTANCE *_cpi);
+void oc_enc_vtable_init_x86(oc_enc_ctx *_enc);
 
 unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
  const unsigned char *_ref,int _ystride);
@@ -43,6 +43,5 @@
  const unsigned char *_src1,const unsigned char *_src2,int _ystride);
 void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
 void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
-void oc_enc_loop_filter_mmx(CP_INSTANCE *_cpi,int _flimit);
 
 #endif

Modified: branches/theora-thusnelda/lib/internal.h
===================================================================
--- branches/theora-thusnelda/lib/internal.h	2009-06-13 08:04:20 UTC (rev 16101)
+++ branches/theora-thusnelda/lib/internal.h	2009-06-13 16:04:06 UTC (rev 16102)
@@ -14,7 +14,6 @@
     last mod: $Id$
 
  ********************************************************************/
-
 #if !defined(_internal_H)
 # define _internal_H (1)
 # include <stdlib.h>
@@ -25,10 +24,10 @@
 # include "theora/theora.h"
 
 # if defined(_MSC_VER)
-/*Thank you Microsoft, I know the order of operations.*/
-#  pragma warning(disable:4554)
 /*Disable missing EMMS warnings.*/
 #  pragma warning(disable:4799)
+/*Thank you Microsoft, I know the order of operations.*/
+#  pragma warning(disable:4554)
 # endif
 /*You, too, gcc.*/
 # if defined(__GNUC_PREREQ)
@@ -210,10 +209,7 @@
 
 /*Information about a fragment which intersects the border of the displayable
    region.
-  This marks which pixels belong to the displayable region, and is used to
-   ensure that pixels outside of this region are never referenced.
-  This allows applications to pass in buffers that are really the size of the
-   displayable region without causing a segfault.*/
+  This marks which pixels belong to the displayable region.*/
 struct oc_border_info{
   /*A bit mask marking which pixels are in the displayable region.
     Pixel (x,y) corresponds to bit (y<<3|x).*/
@@ -235,8 +231,8 @@
      frame, not just the displayable one.
     There are no fragments outside the coded frame by construction.*/
   unsigned   invalid:1;
-  /*The quality index used for this fragment's AC coefficients.*/
-  unsigned   qi:6;
+  /*The index of the quality index used for this fragment's AC coefficients.*/
+  unsigned   qii:6;
   /*The mode of the macroblock this fragment belongs to.*/
   unsigned   mb_mode:3;
   /*The index of the associated border information for fragments which lie
@@ -301,7 +297,7 @@
 
 
 
-/*Common state information between the encoder and decoder.*/
+/*State information common to both the encoder and decoder.*/
 struct oc_theora_state{
   /*The stream information.*/
   th_info             info;
@@ -326,12 +322,6 @@
   oc_sb_flags        *sb_flags;
   /*The total number of super blocks in a single frame.*/
   unsigned            nsbs;
-  /*The number of macro blocks in the X direction.*/
-  unsigned            nhmbs;
-  /*The number of macro blocks in the Y direction.*/
-  unsigned            nvmbs;
-  /*The total number of macro blocks.*/
-  size_t              nmbs;
   /*The fragments from each color plane that belong to each macro block.
     Fragments are stored in image order (left to right then top to bottom).
     When chroma components are decimated, the extra fragments have an index of
@@ -341,32 +331,33 @@
     A negative number indicates the macro block lies entirely outside the
      coded frame.*/
   signed char        *mb_modes;
-  /*The list of coded fragments, in coded order.*/
+  /*The number of macro blocks in the X direction.*/
+  unsigned            nhmbs;
+  /*The number of macro blocks in the Y direction.*/
+  unsigned            nvmbs;
+  /*The total number of macro blocks.*/
+  size_t              nmbs;
+  /*The list of coded fragments, in coded order.
+    Uncoded fragments are stored in reverse order from the end of the list.*/
   ptrdiff_t          *coded_fragis;
   /*The number of coded fragments in each plane.*/
   ptrdiff_t           ncoded_fragis[3];
-  /*The list of uncoded fragments.
-    This just past the end of the list, which is in reverse order, and
-     uses the same block of allocated storage as the coded_fragis list.*/
-  ptrdiff_t          *uncoded_fragis;
-  /*The number of uncoded fragments in each plane.*/
-  ptrdiff_t           nuncoded_fragis[3];
-  /*The list of coded macro blocks in the Y plane, in coded order.*/
-  unsigned           *coded_mbis;
-  /*The number of coded macro blocks in the Y plane.*/
-  size_t              ncoded_mbis;
+  /*The total number of coded fragments.*/
+  ptrdiff_t           ntotal_coded_fragis;
+  /*The index of the buffers being used for each OC_FRAME_* reference frame.*/
+  int                 ref_frame_idx[4];
+  /*The actual buffers used for the previously decoded frames.*/
+  th_ycbcr_buffer     ref_frame_bufs[4];
+  /*The storage for the reference frame buffers.*/
+  unsigned char      *ref_frame_data[4];
+  /*The strides for each plane in the reference frames.*/
+  int                 ref_ystride[3];
   /*The number of unique border patterns.*/
   int                 nborders;
   /*The unique border patterns for all border fragments.
     The borderi field of fragments which straddle the border indexes this
      list.*/
   oc_border_info      borders[16];
-  /*The index of the buffers being used for each OC_FRAME_* reference frame.*/
-  int                 ref_frame_idx[3];
-  /*The actual buffers used for the previously decoded frames.*/
-  th_ycbcr_buffer     ref_frame_bufs[3];
-  /*The storage for the reference frame buffers.*/
-  unsigned char      *ref_frame_data[3];
   /*The frame number of the last keyframe.*/
   ogg_int64_t         keyframe_num;
   /*The frame number of the current frame.*/
@@ -374,15 +365,17 @@
   /*The granpos of the current frame.*/
   ogg_int64_t         granpos;
   /*The type of the current frame.*/
-  int                 frame_type;
-  /*The quality indices of the current frame.*/
-  unsigned char       qis[3];
+  unsigned char       frame_type;
+  /*The bias to add to the frame count when computing granule positions.*/
+  unsigned char       granpos_bias;
   /*The number of quality indices used in the current frame.*/
   unsigned char       nqis;
-  /*The dequantization tables.
-    Note that these are stored in zig-zag order.*/
-  oc_quant_table     *dequant_tables[2][3];
-  oc_quant_tables     dequant_table_data[2][3]OC_ALIGN16;
+  /*The quality indices of the current frame.*/
+  unsigned char       qis[3];
+  /*The dequantization tables, stored in zig-zag order, and indexed by
+     qi, pli, qti, and zzi.*/
+  ogg_uint16_t       *dequant_tables[64][3][2];
+  oc_quant_table      dequant_table_data[64][3][2]OC_ALIGN16;
   /*Loop filter strength parameters.*/
   unsigned char       loop_filter_limits[64];
 };
@@ -437,7 +430,7 @@
 int oc_frag_pred_dc(const oc_fragment *_frag,
  const oc_fragment_plane *_fplane,int _x,int _y,int _pred_last[3]);
 
-int oc_state_init(oc_theora_state *_state,const th_info *_info);
+int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs);
 void oc_state_clear(oc_theora_state *_state);
 void oc_state_vtable_init_c(oc_theora_state *_state);
 void oc_state_borders_fill_rows(oc_theora_state *_state,int _refi,int _pli,
@@ -447,8 +440,8 @@
 void oc_state_fill_buffer_ptrs(oc_theora_state *_state,int _buf_idx,
  th_ycbcr_buffer _img);
 int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby);
-int oc_state_get_mv_offsets(const oc_theora_state *_state,int *_offsets,
- int _dx,int _dy,int _ystride,int _pli);
+int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
+ int _pli,int _dx,int _dy);
 
 int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv);
 void oc_state_loop_filter(oc_theora_state *_state,int _frame);