[xiph-commits] r16130 - in branches/theora-thusnelda/lib: dec dec/x86 enc

tterribe at svn.xiph.org tterribe at svn.xiph.org
Sun Jun 14 11:50:23 PDT 2009


Author: tterribe
Date: 2009-06-14 11:50:22 -0700 (Sun, 14 Jun 2009)
New Revision: 16130

Modified:
   branches/theora-thusnelda/lib/dec/decode.c
   branches/theora-thusnelda/lib/dec/state.c
   branches/theora-thusnelda/lib/dec/x86/mmxstate.c
   branches/theora-thusnelda/lib/enc/analyze.c
   branches/theora-thusnelda/lib/enc/encint.h
   branches/theora-thusnelda/lib/enc/encode.c
   branches/theora-thusnelda/lib/enc/mcenc.c
   branches/theora-thusnelda/lib/enc/tokenize.c
Log:
Pipeline encode so that MB mode decision, transform, quantization,
 tokenization, reconstruction, loop filtering, and boundary extension are all
 performed on a couple of super block rows before moving on to subsequent rows.
This means we only have to load the frame data into cache once, and gives a
 3.1% speed improvement on x86-32, and a 1% improvement on x86-64 (measured for
 a single 1080p file at a single rate).


Modified: branches/theora-thusnelda/lib/dec/decode.c
===================================================================
--- branches/theora-thusnelda/lib/dec/decode.c	2009-06-14 18:45:42 UTC (rev 16129)
+++ branches/theora-thusnelda/lib/dec/decode.c	2009-06-14 18:50:22 UTC (rev 16130)
@@ -1308,7 +1308,7 @@
   const ptrdiff_t    *uncoded_fragis[3];
   ptrdiff_t           ncoded_fragis[3];
   ptrdiff_t           nuncoded_fragis[3];
-  const ogg_uint16_t *qtables[3][3][2];
+  const ogg_uint16_t *dequant[3][3][2];
   int                 fragy0[3];
   int                 fragy_end[3];
   int                 pred_last[3][3];
@@ -1355,7 +1355,7 @@
   for(pli=0;pli<3;pli++){
     for(qii=0;qii<_dec->state.nqis;qii++){
       for(qti=0;qti<2;qti++){
-        _pipe->qtables[pli][qii][qti]=
+        _pipe->dequant[pli][qii][qti]=
          _dec->state.dequant_tables[_dec->state.qis[qii]][pli][qti];
       }
     }
@@ -1445,7 +1445,7 @@
   ti=_pipe->ti[_pli];
   ebi=_pipe->ebi[_pli];
   eob_runs=_pipe->eob_runs[_pli];
-  for(qti=0;qti<2;qti++)dc_quant[qti]=_pipe->qtables[_pli][0][qti][0];
+  for(qti=0;qti<2;qti++)dc_quant[qti]=_pipe->dequant[_pli][0][qti][0];
   for(fragii=0;fragii<ncoded_fragis;fragii++){
     /*This array is made twice as large as necessary so that an invalid zero
        run cannot cause a buffer overflow.*/
@@ -1483,7 +1483,7 @@
     /*last_zzi is always initialized.
       If your compiler thinks otherwise, it is dumb.*/
     oc_state_frag_recon(&_dec->state,fragi,_pli,dct_coeffs,last_zzi,zzi,
-     dc_quant[qti],_pipe->qtables[_pli][frags[fragi].qii][qti]);
+     dc_quant[qti],_pipe->dequant[_pli][frags[fragi].qii][qti]);
   }
   _pipe->coded_fragis[_pli]+=ncoded_fragis;
   /*Right now the reconstructed MCU has only the coded blocks in it.*/
@@ -2028,7 +2028,7 @@
     oc_ycbcr_buffer_flip(stripe_buf,_dec->pp_frame_buf);
     notstart=0;
     notdone=1;
-    for(stripe_fragy=notstart=0;notdone;stripe_fragy+=pipe.mcu_nvfrags){
+    for(stripe_fragy=0;notdone;stripe_fragy+=pipe.mcu_nvfrags){
       int avail_fragy0;
       int avail_fragy_end;
       avail_fragy0=avail_fragy_end=_dec->state.fplanes[0].nvfrags;

Modified: branches/theora-thusnelda/lib/dec/state.c
===================================================================
--- branches/theora-thusnelda/lib/dec/state.c	2009-06-14 18:45:42 UTC (rev 16129)
+++ branches/theora-thusnelda/lib/dec/state.c	2009-06-14 18:50:22 UTC (rev 16130)
@@ -1004,11 +1004,6 @@
   _bv+=127;
   fplane=_state->fplanes+_pli;
   nhfrags=fplane->nhfrags;
-  /*The following loops are constructed somewhat non-intuitively on purpose.
-    The main idea is: if a block boundary has at least one coded fragment on
-     it, the filter is applied to it.
-    However, the order that the filters are applied in matters, and VP3 chose
-     the somewhat strange ordering used below.*/
   fragi_top=fplane->froffset;
   fragi_bot=fragi_top+fplane->nfrags;
   fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
@@ -1017,6 +1012,11 @@
   frags=_state->frags;
   frag_buf_offs=_state->frag_buf_offs;
   ref_frame_data=_state->ref_frame_data[_refi];
+  /*The following loops are constructed somewhat non-intuitively on purpose.
+    The main idea is: if a block boundary has at least one coded fragment on
+     it, the filter is applied to it.
+    However, the order that the filters are applied in matters, and VP3 chose
+     the somewhat strange ordering used below.*/
   while(fragi0<fragi0_end){
     ptrdiff_t fragi;
     ptrdiff_t fragi_end;

Modified: branches/theora-thusnelda/lib/dec/x86/mmxstate.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/mmxstate.c	2009-06-14 18:45:42 UTC (rev 16129)
+++ branches/theora-thusnelda/lib/dec/x86/mmxstate.c	2009-06-14 18:50:22 UTC (rev 16130)
@@ -97,7 +97,6 @@
 void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
  int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
   unsigned char OC_ALIGN8  ll[8];
-  const th_img_plane      *iplane;
   const oc_fragment_plane *fplane;
   const oc_fragment       *frags;
   const ptrdiff_t         *frag_buf_offs;
@@ -109,22 +108,21 @@
   int                      ystride;
   int                      nhfrags;
   memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
-  iplane=_state->ref_frame_bufs[_refi]+_pli;
   fplane=_state->fplanes+_pli;
   nhfrags=fplane->nhfrags;
-  /*The following loops are constructed somewhat non-intuitively on purpose.
-    The main idea is: if a block boundary has at least one coded fragment on
-     it, the filter is applied to it.
-    However, the order that the filters are applied in matters, and VP3 chose
-     the somewhat strange ordering used below.*/
   fragi_top=fplane->froffset;
   fragi_bot=fragi_top+fplane->nfrags;
   fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
   fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
-  ystride=iplane->stride;
+  ystride=_state->ref_ystride[_pli];
   frags=_state->frags;
   frag_buf_offs=_state->frag_buf_offs;
   ref_frame_data=_state->ref_frame_data[_refi];
+  /*The following loops are constructed somewhat non-intuitively on purpose.
+    The main idea is: if a block boundary has at least one coded fragment on
+     it, the filter is applied to it.
+    However, the order that the filters are applied in matters, and VP3 chose
+     the somewhat strange ordering used below.*/
   while(fragi0<fragi0_end){
     ptrdiff_t fragi;
     ptrdiff_t fragi_end;

Modified: branches/theora-thusnelda/lib/enc/analyze.c
===================================================================
--- branches/theora-thusnelda/lib/enc/analyze.c	2009-06-14 18:45:42 UTC (rev 16129)
+++ branches/theora-thusnelda/lib/enc/analyze.c	2009-06-14 18:50:22 UTC (rev 16130)
@@ -21,63 +21,13 @@
 
 
 
-typedef struct oc_plane_state oc_plane_state;
-typedef struct oc_frag_state  oc_frag_state;
-typedef struct oc_mode_choice oc_mode_choice;
-typedef struct oc_rd_metric   oc_rd_metric;
+typedef struct oc_fr_state           oc_fr_state;
+typedef struct oc_enc_pipeline_state oc_enc_pipeline_state;
+typedef struct oc_rd_metric          oc_rd_metric;
+typedef struct oc_mode_choice        oc_mode_choice;
 
 
 
-/*Temporary encoder state for a single color plane.*/
-struct oc_plane_state{
-  /*Condensed dequantization tables.*/
-  const ogg_uint16_t *dequant[3][2];
-  /*Condensed quantization tables.*/
-  const oc_iquant    *enquant[3][2];
-  /*Plane index.*/
-  int                 pli;
-};
-
-
-
-/*State to track coded block flags and their bit cost.*/
-struct oc_frag_state{
-  unsigned   sb_partial_count:16;
-  unsigned   sb_full_count:16;
-  unsigned   b_count:8;
-  unsigned   b_pend:8;
-  signed int sb_partial_last:2;
-  signed int sb_full_last:2;
-  signed int b_last:2;
-  unsigned   sb_partial:1;
-  unsigned   sb_coded:1;
-  unsigned   sb_partial_break:1;
-  unsigned   sb_full_break:1;
-  ptrdiff_t  bits;
-};
-
-
-
-/*Cost information about a MB mode.*/
-struct oc_mode_choice{
-  unsigned cost;
-  unsigned ssd;
-  unsigned rate;
-  unsigned overhead;
-};
-
-
-
-/*Cost information about the coded blocks in a MB.*/
-struct oc_rd_metric{
-  int uncoded_ac_ssd;
-  int coded_ac_ssd;
-  int ac_bits;
-  int dc_flag;
-};
-
-
-
 /*There are 8 possible schemes used to encode macro block modes.
   Schemes 0-6 use a maximally-skewed Huffman code to code each of the modes.
   The same set of Huffman codes is used for each of these 7 schemes, but the
@@ -244,36 +194,41 @@
 
 
 
-static void oc_plane_state_plane_setup(oc_enc_ctx *_enc,oc_plane_state *_ps,
- int _pli){
-  int qii;
-  int qti;
-  _ps->pli=_pli;
-  for(qii=0;qii<_enc->state.nqis;qii++){
-    int qi;
-    qi=_enc->state.qis[qii];
-    for(qti=0;qti<2;qti++){
-      _ps->dequant[qii][qti]=_enc->state.dequant_tables[qi][_pli][qti];
-      _ps->enquant[qii][qti]=_enc->enquant_tables[qi][_pli][qti];
-    }
-  }
-}
-
-
-
+/*The number of bits required to encode a super block run.
+  _run_count: The desired run count; must be positive and less than 4130.*/
 static int oc_sb_run_bits(int _run_count){
   int i;
   for(i=0;_run_count>=OC_SB_RUN_VAL_MIN[i+1];i++);
   return OC_SB_RUN_CODE_NBITS[i];
 }
 
+/*The number of bits required to encode a block run.
+  _run_count: The desired run count; must be positive and less than 30.*/
 static int oc_block_run_bits(int _run_count){
   return OC_BLOCK_RUN_CODE_NBITS[_run_count-1];
 }
 
 
 
-static void oc_fr_state_init(oc_frag_state *_fr){
+/*State to track coded block flags and their bit cost.*/
+struct oc_fr_state{
+  unsigned   sb_partial_count:16;
+  unsigned   sb_full_count:16;
+  unsigned   b_count:8;
+  unsigned   b_pend:8;
+  signed int sb_partial_last:2;
+  signed int sb_full_last:2;
+  signed int b_last:2;
+  unsigned   sb_partial:1;
+  unsigned   sb_coded:1;
+  unsigned   sb_partial_break:1;
+  unsigned   sb_full_break:1;
+  ptrdiff_t  bits;
+};
+
+
+
+static void oc_fr_state_init(oc_fr_state *_fr){
   _fr->sb_partial_last=-1;
   _fr->sb_partial_count=0;
   _fr->sb_partial_break=0;
@@ -289,7 +244,7 @@
 }
 
 
-static void oc_fr_skip_block(oc_frag_state *_fr){
+static void oc_fr_skip_block(oc_fr_state *_fr){
   if(_fr->sb_coded){
     if(!_fr->sb_partial){
       /*The super block was previously fully coded.*/
@@ -321,7 +276,7 @@
   _fr->sb_partial=1;
 }
 
-static void oc_fr_code_block(oc_frag_state *_fr){
+static void oc_fr_code_block(oc_fr_state *_fr){
   if(_fr->sb_partial){
     if(!_fr->sb_coded){
       /*The super block was previously completely uncoded...*/
@@ -353,7 +308,7 @@
   _fr->sb_coded=1;
 }
 
-static void oc_fr_finish_sb(oc_frag_state *_fr){
+static void oc_fr_finish_sb(oc_fr_state *_fr){
   /*Update the partial flag.*/
   int partial;
   partial=_fr->sb_partial&_fr->sb_coded;
@@ -399,7 +354,7 @@
   _fr->sb_coded=0;
 }
 
-static void oc_fr_flush(oc_frag_state *_fr){
+static void oc_fr_flush(oc_fr_state *_fr){
   /*Flush any pending partial run.*/
   if(_fr->sb_partial_break)_fr->bits++;
   if(_fr->sb_partial_count)_fr->bits+=oc_sb_run_bits(_fr->sb_partial_count);
@@ -410,9 +365,9 @@
   if(_fr->b_count)_fr->bits+=oc_block_run_bits(_fr->b_count);
 }
 
-static int oc_fr_cost1(const oc_frag_state *_fr){
-  oc_frag_state tmp;
-  int           bits;
+static int oc_fr_cost1(const oc_fr_state *_fr){
+  oc_fr_state tmp;
+  int         bits;
   *&tmp=*_fr;
   oc_fr_skip_block(&tmp);
   bits=tmp.bits;
@@ -421,8 +376,8 @@
   return tmp.bits-bits;
 }
 
-static int oc_fr_cost4(const oc_frag_state *_pre,const oc_frag_state *_post){
-  oc_frag_state tmp;
+static int oc_fr_cost4(const oc_fr_state *_pre,const oc_fr_state *_post){
+  oc_fr_state tmp;
   *&tmp=*_pre;
   oc_fr_skip_block(&tmp);
   oc_fr_skip_block(&tmp);
@@ -433,23 +388,132 @@
 
 
 
-static void oc_enc_frag_uncode(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi){
-  const unsigned char *src;
-  unsigned char       *dst;
-  ptrdiff_t            frag_offs;
-  int                  ystride;
-  frag_offs=_enc->state.frag_buf_offs[_fragi];
-  ystride=_enc->state.ref_ystride[_pli];
-  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
-  dst=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_SELF]];
-  oc_frag_copy(&_enc->state,dst+frag_offs,src+frag_offs,ystride);
-  _enc->state.frags[_fragi].coded=0;
-  /*We do NOT update frags[_fragi].mb_mode or frag_mvs[_fragi], since they are
-     not subsequently referenced by uncoded fragments.*/
+/*Temporary encoder state for the analysis pipeline.*/
+struct oc_enc_pipeline_state{
+  int                 bounding_values[256];
+  oc_fr_state         fr[3];
+  /*Condensed dequantization tables.*/
+  const ogg_uint16_t *dequant[3][3][2];
+  /*Condensed quantization tables.*/
+  const oc_iquant    *enquant[3][3][2];
+  /*Coded/uncoded fragment lists for each plane for the current MCU.*/
+  ptrdiff_t          *coded_fragis[3];
+  ptrdiff_t          *uncoded_fragis[3];
+  ptrdiff_t           ncoded_fragis[3];
+  ptrdiff_t           nuncoded_fragis[3];
+  /*The starting row for the current MCU in each plane.*/
+  int                 fragy0[3];
+  /*The ending row for the current MCU in each plane.*/
+  int                 fragy_end[3];
+  /*The number of tokens for zzi=1 for each color plane.*/
+  int                 ndct_tokens1[3];
+  /*The outstanding eob_run count for zzi=1 for each color plane.*/
+  int                 eob_run1[3];
+  /*The number of vertical super blocks in an MCU.*/
+  int                 mcu_nvsbs;
+  /*Whether or not the loop filter is enabled.*/
+  int                 loop_filter;
+};
+
+
+static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){
+  ptrdiff_t *coded_fragis;
+  int        pli;
+  int        qii;
+  int        qti;
+  /*Initialize the per-plane coded block flag trackers.
+    These are used for bit-estimation purposes only; the real flag bits span
+     all three planes, so we can't compute them in parallel.*/
+  for(pli=0;pli<3;pli++)oc_fr_state_init(_pipe->fr+pli);
+  /*Set up per-plane pointers to the coded and uncoded fragments lists.
+    Unlike the decoder, each planes' coded and uncoded fragment list is kept
+     separate during the analysis stage; we only make the coded list for all
+     three planes contiguous right before the final packet is output
+     (destroying the uncoded lists, which are no longer needed).*/
+  coded_fragis=_enc->state.coded_fragis;
+  for(pli=0;pli<3;pli++){
+    _pipe->coded_fragis[pli]=coded_fragis;
+    coded_fragis+=_enc->state.fplanes[pli].nfrags;
+    _pipe->uncoded_fragis[pli]=coded_fragis;
+  }
+  memset(_pipe->ncoded_fragis,0,sizeof(_pipe->ncoded_fragis));
+  memset(_pipe->nuncoded_fragis,0,sizeof(_pipe->nuncoded_fragis));
+  /*Set up condensed quantizer tables.*/
+  for(pli=0;pli<3;pli++){
+    for(qii=0;qii<_enc->state.nqis;qii++){
+      int qi;
+      qi=_enc->state.qis[qii];
+      for(qti=0;qti<2;qti++){
+        _pipe->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
+        _pipe->enquant[pli][qii][qti]=_enc->enquant_tables[qi][pli][qti];
+      }
+    }
+  }
+  /*Initialize the tokenization state.*/
+  for(pli=0;pli<3;pli++){
+    _pipe->ndct_tokens1[pli]=0;
+    _pipe->eob_run1[pli]=0;
+  }
+  /*If chroma is sub-sampled in the vertical direction, we have to encode two
+     super block rows of Y' for each super block row of Cb and Cr.*/
+  _pipe->mcu_nvsbs=1<<!(_enc->state.info.pixel_fmt&2);
+  /*Initialize the bounding value array for the loop filter.*/
+  _pipe->loop_filter=!oc_state_loop_filter_init(&_enc->state,
+   _pipe->bounding_values);
 }
 
+static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,int _pli,int _sdelay,int _edelay){
+  int refi;
+  /*Copy over all the uncoded fragments from this plane and advance the uncoded
+     fragment list.*/
+  _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
+  oc_state_frag_copy_list(&_enc->state,_pipe->uncoded_fragis[_pli],
+   _pipe->nuncoded_fragis[_pli],OC_FRAME_SELF,OC_FRAME_PREV,_pli);
+  _pipe->nuncoded_fragis[_pli]=0;
+  /*Perform DC prediction.*/
+  oc_enc_pred_dc_frag_rows(_enc,_pli,
+   _pipe->fragy0[_pli],_pipe->fragy_end[_pli]);
+  /*Finish DC tokenization.*/
+  oc_enc_tokenize_dc_frag_list(_enc,_pli,
+   _pipe->coded_fragis[_pli],_pipe->ncoded_fragis[_pli],
+   _pipe->ndct_tokens1[_pli],_pipe->eob_run1[_pli]);
+  _pipe->ndct_tokens1[_pli]=_enc->ndct_tokens[_pli][1];
+  _pipe->eob_run1[_pli]=_enc->eob_run[_pli][1];
+  /*And advance the coded fragment list.*/
+  _enc->state.ncoded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
+  _pipe->coded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
+  _pipe->ncoded_fragis[_pli]=0;
+  /*Apply the loop filter if necessary.*/
+  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
+  if(_pipe->loop_filter){
+    oc_state_loop_filter_frag_rows(&_enc->state,_pipe->bounding_values,
+     refi,_pli,_pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
+  }
+  else _sdelay=_edelay=0;
+  /*To fill borders, we have an additional two pixel delay, since a fragment
+     in the next row could filter its top edge, using two pixels from a
+     fragment in this row.
+    But there's no reason to delay a full fragment between the two.*/
+  oc_state_borders_fill_rows(&_enc->state,refi,_pli,
+   (_pipe->fragy0[_pli]-_sdelay<<3)-(_sdelay<<1),
+   (_pipe->fragy_end[_pli]-_edelay<<3)-(_edelay<<1));
+}
+
+
+
+/*Cost information about the coded blocks in a MB.*/
+struct oc_rd_metric{
+  int uncoded_ac_ssd;
+  int coded_ac_ssd;
+  int ac_bits;
+  int dc_flag;
+};
+
+
+
 static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
- oc_plane_state *_ps,ptrdiff_t _fragi,int _overhead_bits,
+ oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,int _overhead_bits,
  oc_rd_metric *_mo,oc_token_checkpoint **_stack){
   ogg_int16_t          buffer[64]OC_ALIGN16;
   ogg_int16_t          data[64]OC_ALIGN16;
@@ -478,7 +542,7 @@
   int                  zzi;
   frags=_enc->state.frags;
   frag_offs=_enc->state.frag_buf_offs[_fragi];
-  ystride=_enc->state.ref_ystride[_ps->pli];
+  ystride=_enc->state.ref_ystride[_pli];
   mb_mode=frags[_fragi].mb_mode;
   src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs;
   ref=_enc->state.ref_frame_data[
@@ -507,7 +571,7 @@
     default:{
       const oc_mv *frag_mvs;
       frag_mvs=(const oc_mv *)_enc->state.frag_mvs;
-      nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,_ps->pli,
+      nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,_pli,
        frag_mvs[_fragi][0],frag_mvs[_fragi][1]);
       if(nmv_offs>1){
         oc_enc_frag_copy2(_enc,dst,
@@ -577,8 +641,9 @@
   /*Transform:*/
   oc_enc_fdct8x8(_enc,buffer,data);
   /*Quantize:*/
-  dequant=_ps->dequant[0][mb_mode!=OC_MODE_INTRA];
-  enquant=_ps->enquant[0][mb_mode!=OC_MODE_INTRA];
+  /*TODO: Block-level quantizers.*/
+  dequant=_pipe->dequant[_pli][0][mb_mode!=OC_MODE_INTRA];
+  enquant=_pipe->enquant[_pli][0][mb_mode!=OC_MODE_INTRA];
   nonzero=0;
   for(zzi=0;zzi<64;zzi++){
     int v;
@@ -603,12 +668,10 @@
     }
     else data[zzi]=0;
   }
-  frags[_fragi].dc=data[0];
-  frags[_fragi].coded=1;
   /*Tokenize.*/
   checkpoint=*_stack;
   ac_bits=oc_enc_tokenize_ac(_enc,_fragi,data,dequant,buffer,
-   _ps->pli,_stack,mb_mode==OC_MODE_INTRA?3:0);
+   _pli,_stack,mb_mode==OC_MODE_INTRA?3:0);
   /*Reconstruct.
     TODO: nonzero may need to be adjusted after tokenization.*/
   oc_dequant_idct8x8(&_enc->state,buffer,data,
@@ -617,7 +680,7 @@
   else{
     oc_enc_frag_recon_inter(_enc,dst,
      nmv_offs==1?ref+mv_offs[0]:dst,ystride,buffer);
-  }
+ }
 #if !defined(OC_COLLECT_METRICS)
   if(frame_type!=OC_INTRA_FRAME)
 #endif
@@ -663,8 +726,8 @@
       /*Hm, not worth it; roll back.*/
       oc_enc_tokenlog_rollback(_enc,checkpoint,(*_stack)-checkpoint);
       *_stack=checkpoint;
-      oc_enc_frag_uncode(_enc,_ps->pli,_fragi);
       _mo->coded_ac_ssd+=uncoded_ssd;
+      frags[_fragi].coded=0;
       return 0;
     }
     else{
@@ -673,12 +736,14 @@
       _mo->ac_bits+=ac_bits;
     }
   }
+  frags[_fragi].dc=data[0];
+  frags[_fragi].coded=1;
   return 1;
 }
 
 /* mode_overhead is scaled by << OC_BIT_SCALE */
 static int oc_enc_mb_transform_quantize_luma(oc_enc_ctx *_enc,
- oc_plane_state *_ps,int _mbi,int _mode_overhead,oc_frag_state *_fr){
+ oc_enc_pipeline_state *_pipe,int _mbi,int _mode_overhead){
   /*Worst case token stack usage for 4 fragments.*/
   oc_token_checkpoint  stack[64*4];
   oc_token_checkpoint *stackptr;
@@ -687,18 +752,22 @@
   oc_fragment         *frags;
   ptrdiff_t           *coded_fragis;
   ptrdiff_t            ncoded_fragis;
+  ptrdiff_t           *uncoded_fragis;
+  ptrdiff_t            nuncoded_fragis;
   oc_rd_metric         mo;
-  oc_frag_state        fr_checkpoint;
+  oc_fr_state          fr_checkpoint;
   int                  mb_mode;
   int                  ncoded;
   ptrdiff_t            fragi;
   int                  bi;
-  *&fr_checkpoint=*_fr;
+  *&fr_checkpoint=*(_pipe->fr+0);
   sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
   mb_modes=_enc->state.mb_modes;
   frags=_enc->state.frags;
-  coded_fragis=_enc->state.coded_fragis;
-  ncoded_fragis=_enc->state.ncoded_fragis[0];
+  coded_fragis=_pipe->coded_fragis[0];
+  ncoded_fragis=_pipe->ncoded_fragis[0];
+  uncoded_fragis=_pipe->uncoded_fragis[0];
+  nuncoded_fragis=_pipe->nuncoded_fragis[0];
   mb_mode=mb_modes[_mbi];
   ncoded=0;
   stackptr=stack;
@@ -707,12 +776,15 @@
     fragi=sb_maps[_mbi>>2][_mbi&3][bi];
     frags[fragi].mb_mode=mb_mode;
     if(oc_enc_block_transform_quantize(_enc,
-     _ps,fragi,oc_fr_cost1(_fr),&mo,&stackptr)){
-      oc_fr_code_block(_fr);
+     _pipe,0,fragi,oc_fr_cost1(_pipe->fr+0),&mo,&stackptr)){
+      oc_fr_code_block(_pipe->fr+0);
       coded_fragis[ncoded_fragis++]=fragi;
       ncoded++;
     }
-    else oc_fr_skip_block(_fr);
+    else{
+      *(uncoded_fragis-++nuncoded_fragis)=fragi;
+      oc_fr_skip_block(_pipe->fr+0);
+    }
   }
   if(_enc->state.frame_type!=OC_INTRA_FRAME){
     if(ncoded>0&&!mo.dc_flag){
@@ -720,27 +792,27 @@
       /*Some individual blocks were worth coding.
         See if that's still true when accounting for mode and MV overhead.*/
       cost=mo.coded_ac_ssd+_enc->lambda*(mo.ac_bits
-       +oc_fr_cost4(&fr_checkpoint,_fr)+(_mode_overhead>>OC_BIT_SCALE));
+       +oc_fr_cost4(&fr_checkpoint,_pipe->fr+0)+(_mode_overhead>>OC_BIT_SCALE));
       if(mo.uncoded_ac_ssd<=cost){
         /*Taking macroblock overhead into account, it is not worth coding this
            MB.*/
         oc_enc_tokenlog_rollback(_enc,stack,stackptr-stack);
-        *_fr=*&fr_checkpoint;
+        *(_pipe->fr+0)=*&fr_checkpoint;
         for(bi=0;bi<4;bi++){
           fragi=sb_maps[_mbi>>2][_mbi&3][bi];
-          if(frags[fragi].coded)oc_enc_frag_uncode(_enc,0,fragi);
-          oc_fr_skip_block(_fr);
+          if(frags[fragi].coded){
+            *(uncoded_fragis-++nuncoded_fragis)=fragi;
+            frags[fragi].coded=0;
+          }
+          oc_fr_skip_block(_pipe->fr+0);
         }
         ncoded_fragis-=ncoded;
         ncoded=0;
       }
     }
-    if(ncoded==0){
-      /*No luma blocks coded, mode is forced.*/
-      mb_modes[_mbi]=OC_MODE_INTER_NOMV;
-      return 0;
-    }
-    /*Assume that a 1mv with a single coded block is always cheaper than a 4mv
+    /*If no luma blocks coded, the mode is forced.*/
+    if(ncoded==0)mb_modes[_mbi]=OC_MODE_INTER_NOMV;
+    /*Assume that a 1MV with a single coded block is always cheaper than a 4MV
        with a single coded block.
       This may not be strictly true: a 4MV computes chroma MVs using (0,0) for
        skipped blocks, while a 1MV does not.*/
@@ -748,21 +820,26 @@
       mb_modes[_mbi]=OC_MODE_INTER_MV;
     }
   }
-  _enc->state.ncoded_fragis[0]=ncoded_fragis;
+  _pipe->ncoded_fragis[0]=ncoded_fragis;
+  _pipe->nuncoded_fragis[0]=nuncoded_fragis;
   return ncoded;
 }
 
 static void oc_enc_sb_transform_quantize_chroma(oc_enc_ctx *_enc,
- oc_plane_state *_ps,int _sbi_start,int _sbi_end,oc_frag_state *_fr){
+ oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
   const oc_sb_map *sb_maps;
   oc_sb_flags     *sb_flags;
   ptrdiff_t       *coded_fragis;
   ptrdiff_t        ncoded_fragis;
+  ptrdiff_t       *uncoded_fragis;
+  ptrdiff_t        nuncoded_fragis;
   int              sbi;
   sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
   sb_flags=_enc->state.sb_flags;
-  coded_fragis=_enc->state.coded_fragis+_enc->state.fplanes[_ps->pli].froffset;
-  ncoded_fragis=_enc->state.ncoded_fragis[_ps->pli];
+  coded_fragis=_pipe->coded_fragis[_pli];
+  ncoded_fragis=_pipe->ncoded_fragis[_pli];
+  uncoded_fragis=_pipe->uncoded_fragis[_pli];
+  nuncoded_fragis=_pipe->nuncoded_fragis[_pli];
   for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
     /*Worst case token stack usage for 1 fragment.*/
     oc_token_checkpoint stack[64];
@@ -777,18 +854,22 @@
         oc_token_checkpoint *stackptr;
         stackptr=stack;
         if(oc_enc_block_transform_quantize(_enc,
-         _ps,fragi,oc_fr_cost1(_fr),&mo,&stackptr)){
+         _pipe,_pli,fragi,oc_fr_cost1(_pipe->fr+_pli),&mo,&stackptr)){
           coded_fragis[ncoded_fragis++]=fragi;
-          oc_fr_code_block(_fr);
+          oc_fr_code_block(_pipe->fr+_pli);
         }
-        else oc_fr_skip_block(_fr);
+        else{
+          *(uncoded_fragis-++nuncoded_fragis)=fragi;
+          oc_fr_skip_block(_pipe->fr+_pli);
+        }
       }
     }
-    oc_fr_finish_sb(_fr);
-    sb_flags[sbi].coded_fully=_fr->sb_full_last;
-    sb_flags[sbi].coded_partially=_fr->sb_partial_last;
+    oc_fr_finish_sb(_pipe->fr+_pli);
+    sb_flags[sbi].coded_fully=_pipe->fr[_pli].sb_full_last;
+    sb_flags[sbi].coded_partially=_pipe->fr[_pli].sb_partial_last;
   }
-  _enc->state.ncoded_fragis[_ps->pli]=ncoded_fragis;
+  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
+  _pipe->nuncoded_fragis[_pli]=nuncoded_fragis;
 }
 
 /*Mode decision is done by exhaustively examining all potential choices.
@@ -828,6 +909,16 @@
     year=2003
   }*/
 
+/*Cost information about a MB mode.*/
+struct oc_mode_choice{
+  unsigned cost;
+  unsigned ssd;
+  unsigned rate;
+  unsigned overhead;
+};
+
+
+
 static void oc_mode_dct_cost_accum(oc_mode_choice *_modec,
  int _qi,int _pli,int _qti,int _satd){
   unsigned rmse;
@@ -852,7 +943,7 @@
 }
 
 static void oc_mode_set_cost(oc_mode_choice *_modec,int _lambda){
- _modec->cost=_modec->ssd+(_modec->rate+_modec->overhead)*_lambda;
+  _modec->cost=_modec->ssd+(_modec->rate+_modec->overhead)*_lambda;
 }
 
 static void oc_cost_intra(oc_enc_ctx *_enc,oc_mode_choice *_modec,
@@ -1078,8 +1169,7 @@
 int oc_enc_analyze(oc_enc_ctx *_enc,int _frame_type,int _recode){
   oc_set_chroma_mvs_func  set_chroma_mvs;
   oc_mcenc_ctx            mcenc;
-  oc_plane_state          ps;
-  oc_frag_state           fr;
+  oc_enc_pipeline_state   pipe;
   oc_mv                   last_mv;
   oc_mv                   prior_mv;
   ogg_int64_t             interbits;
@@ -1097,16 +1187,20 @@
   oc_fragment            *frags;
   oc_mv                  *frag_mvs;
   int                     qi;
+  unsigned                stripe_sby;
+  int                     notstart;
+  int                     notdone;
+  int                     vdec;
   unsigned                sbi;
   unsigned                sbi_end;
+  int                     refi;
   int                     pli;
   set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt];
   _enc->state.frame_type=_frame_type;
-  if(!_recode)oc_mcenc_start(_enc,&mcenc);
-  oc_fr_state_init(&fr);
+  if(!_recode&&_enc->state.curframe_num>0)oc_mcenc_start(_enc,&mcenc);
   oc_mode_scheme_chooser_reset(&_enc->chooser);
   oc_enc_tokenize_start(_enc);
-  oc_plane_state_plane_setup(_enc,&ps,0);
+  oc_enc_pipeline_init(_enc,&pipe);
   _enc->mv_bits[0]=_enc->mv_bits[1]=0;
   interbits=intrabits=0;
   last_mv[0]=last_mv[1]=prior_mv[0]=prior_mv[1]=0;
@@ -1129,253 +1223,285 @@
   frags=_enc->state.frags;
   frag_mvs=_enc->state.frag_mvs;
   sbi_end=_enc->state.fplanes[0].nsbs;
-  for(sbi=0;sbi<sbi_end;sbi++){
-    int quadi;
-    /*Mode addressing is through Y plane, always 4 MB per SB.*/
-    for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
-      unsigned  mbi;
-      int       mb_mode;
-      int       dx;
-      int       dy;
-      int       mapii;
-      int       mapi;
-      int       bi;
-      ptrdiff_t fragi;
-      mbi=sbi<<2|quadi;
-      if(!_recode&&_enc->state.curframe_num>0){
-        /*Motion estimation:
-          We always do a basic 1MV search for all macroblocks, coded or not,
-           keyframe or not.*/
-        /*Move the motion vector predictors back a frame.*/
-        memmove(embs[mbi].analysis_mv+1,
-         embs[mbi].analysis_mv,2*sizeof(embs[mbi].analysis_mv[0]));
-        /*Search the last frame.*/
-        oc_mcenc_search(_enc,&mcenc,mbi,OC_FRAME_PREV);
-        /*Search the golden frame.*/
-        oc_mcenc_search(_enc,&mcenc,mbi,OC_FRAME_GOLD);
-      }
-      dx=dy=0;
-      if(_enc->state.frame_type==OC_INTRA_FRAME){
-        mb_modes[mbi]=mb_mode=OC_MODE_INTRA;
-        oc_enc_mb_transform_quantize_luma(_enc,&ps,mbi,0,&fr);
-      }
-      else{
-        oc_mode_choice modes[8];
-        int            mb_mv_bits_0;
-        int            mb_gmv_bits_0;
-        int            mb_4mv_bits_0;
-        int            mb_4mv_bits_1;
-        int            inter_mv_pref;
-        /*Find the block choice with the lowest estimated coding cost.
-          If a Cb or Cr block is coded but no Y' block from a macro block then
-           the mode MUST be OC_MODE_INTER_NOMV.
-          This is the default state to which the mode data structure is
-           initialised in encoder and decoder at the start of each frame.*/
-        /*Block coding cost is estimated from correlated SATD metrics.*/
-        /*At this point, all blocks that are in frame are still marked coded.*/
-        if(!_recode){
-          memcpy(embs[mbi].unref_mv,
-           embs[mbi].analysis_mv[0],sizeof(embs[mbi].unref_mv));
-          embs[mbi].refined=0;
+  vdec=!(_enc->state.info.pixel_fmt&2);
+  notstart=0;
+  notdone=1;
+  for(stripe_sby=0;notdone;stripe_sby+=pipe.mcu_nvsbs){
+    const oc_fragment_plane *fplane;
+    int                      sby_end;
+    fplane=_enc->state.fplanes+0;
+    sby_end=fplane->nvsbs;
+    notdone=stripe_sby+pipe.mcu_nvsbs<sby_end;
+    if(notdone)sby_end=stripe_sby+pipe.mcu_nvsbs;
+    sbi=stripe_sby*fplane->nhsbs;
+    sbi_end=sby_end*fplane->nhsbs;
+    pipe.fragy0[0]=stripe_sby<<2;
+    pipe.fragy_end[0]=sby_end<<2;
+    for(;sbi<sbi_end;sbi++){
+      int quadi;
+      /*Mode addressing is through Y plane, always 4 MB per SB.*/
+      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+        unsigned  mbi;
+        int       mb_mode;
+        int       dx;
+        int       dy;
+        int       mapii;
+        int       mapi;
+        int       bi;
+        ptrdiff_t fragi;
+        mbi=sbi<<2|quadi;
+        if(!_recode&&_enc->state.curframe_num>0){
+          /*Motion estimation:
+            We always do a basic 1MV search for all macroblocks, coded or not,
+             keyframe or not.*/
+          /*Move the motion vector predictors back a frame.*/
+          memmove(embs[mbi].analysis_mv+1,
+           embs[mbi].analysis_mv,2*sizeof(embs[mbi].analysis_mv[0]));
+          /*Search the last frame.*/
+          oc_mcenc_search(_enc,&mcenc,mbi,OC_FRAME_PREV);
+          /*Search the golden frame.*/
+          oc_mcenc_search(_enc,&mcenc,mbi,OC_FRAME_GOLD);
         }
-        oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,OC_MODE_INTER_NOMV,qi);
-        oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,qi);
-        intrabits+=modes[OC_MODE_INTRA].rate;
-        mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
-         OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],qi);
-        oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
-         OC_MODE_INTER_MV_LAST,last_mv,qi);
-        oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
-         OC_MODE_INTER_MV_LAST2,prior_mv,qi);
-        oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
-         OC_MODE_GOLDEN_NOMV,qi);
-        mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
-         OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],qi);
-        mb_4mv_bits_0=oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
-         embs[mbi].block_mv,qi);
-        mb_4mv_bits_1=48;
-        /*The explicit MV modes (2,6,7) have not yet gone through halfpel
-           refinement.
-          We choose the explicit MV mode that's already furthest ahead on bits
-           and refine only that one.
-          We have to be careful to remember which ones we've refined so that
-           we don't refine it again if we re-encode this frame.*/
-        inter_mv_pref=_enc->lambda*3<<OC_BIT_SCALE;
-        if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&&
-         modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
-          if(!(embs[mbi].refined&0x80)){
-            oc_mcenc_refine4mv(_enc,mbi);
-            embs[mbi].refined|=0x80;
-          }
-          mb_4mv_bits_0=oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
-           embs[mbi].ref_mv,qi);
+        dx=dy=0;
+        if(_enc->state.frame_type==OC_INTRA_FRAME){
+          mb_modes[mbi]=mb_mode=OC_MODE_INTRA;
+          oc_enc_mb_transform_quantize_luma(_enc,&pipe,mbi,0);
         }
-        else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
-         modes[OC_MODE_INTER_MV].cost){
-          if(!(embs[mbi].refined&0x40)){
-            oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_GOLD);
-            embs[mbi].refined|=0x40;
+        else{
+          oc_mode_choice modes[8];
+          int            mb_mv_bits_0;
+          int            mb_gmv_bits_0;
+          int            mb_4mv_bits_0;
+          int            mb_4mv_bits_1;
+          int            inter_mv_pref;
+          /*Find the block choice with the lowest estimated coding cost.
+            If a Cb or Cr block is coded but no Y' block from a macro block then
+             the mode MUST be OC_MODE_INTER_NOMV.
+            This is the default state to which the mode data structure is
+             initialised in encoder and decoder at the start of each frame.*/
+          /*Block coding cost is estimated from correlated SATD metrics.*/
+          /*At this point, all blocks that are in frame are still marked coded.*/
+          if(!_recode){
+            memcpy(embs[mbi].unref_mv,
+             embs[mbi].analysis_mv[0],sizeof(embs[mbi].unref_mv));
+            embs[mbi].refined=0;
           }
+          oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,OC_MODE_INTER_NOMV,qi);
+          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,qi);
+          intrabits+=modes[OC_MODE_INTRA].rate;
+          mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
+           OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],qi);
+          oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
+           OC_MODE_INTER_MV_LAST,last_mv,qi);
+          oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
+           OC_MODE_INTER_MV_LAST2,prior_mv,qi);
+          oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
+           OC_MODE_GOLDEN_NOMV,qi);
           mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
-           OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],qi);
-        }
-        if(!(embs[mbi].refined&0x04)){
-          oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
-          embs[mbi].refined|=0x04;
-        }
-        mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
-         OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],qi);
-        /*Finally, pick the mode with the cheapest estimated bit cost.*/
-        mb_mode=0;
-        if(modes[1].cost<modes[0].cost)mb_mode=1;
-        if(modes[3].cost<modes[mb_mode].cost)mb_mode=3;
-        if(modes[4].cost<modes[mb_mode].cost)mb_mode=4;
-        if(modes[5].cost<modes[mb_mode].cost)mb_mode=5;
-        if(modes[6].cost<modes[mb_mode].cost)mb_mode=6;
-        if(modes[7].cost<modes[mb_mode].cost)mb_mode=7;
-        /*We prefer OC_MODE_INTER_MV, but not over LAST and LAST2.*/
-        if(mb_mode==OC_MODE_INTER_MV_LAST||mb_mode==OC_MODE_INTER_MV_LAST2){
-          inter_mv_pref=0;
-        }
-        if(modes[2].cost<modes[mb_mode].cost+inter_mv_pref)mb_mode=2;
-        mb_modes[mbi]=mb_mode;
-        /*Propagate the MVs to the luma blocks.*/
-        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
-          switch(mb_mode){
-            case OC_MODE_INTER_MV:{
-              dx=embs[mbi].analysis_mv[0][OC_FRAME_PREV][0];
-              dy=embs[mbi].analysis_mv[0][OC_FRAME_PREV][1];
-            }break;
-            case OC_MODE_INTER_MV_LAST:{
-              dx=last_mv[0];
-              dy=last_mv[1];
-            }break;
-            case OC_MODE_INTER_MV_LAST2:{
-              dx=prior_mv[0];
-              dy=prior_mv[1];
-            }break;
-            case OC_MODE_GOLDEN_MV:{
-              dx=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][0];
-              dy=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][1];
-            }break;
+           OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],qi);
+          mb_4mv_bits_0=oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
+           embs[mbi].block_mv,qi);
+          mb_4mv_bits_1=48;
+          /*The explicit MV modes (2,6,7) have not yet gone through halfpel
+             refinement.
+            We choose the explicit MV mode that's already furthest ahead on bits
+             and refine only that one.
+            We have to be careful to remember which ones we've refined so that
+             we don't refine it again if we re-encode this frame.*/
+          inter_mv_pref=_enc->lambda*3<<OC_BIT_SCALE;
+          if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&&
+           modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
+            if(!(embs[mbi].refined&0x80)){
+              oc_mcenc_refine4mv(_enc,mbi);
+              embs[mbi].refined|=0x80;
+            }
+            mb_4mv_bits_0=oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
+             embs[mbi].ref_mv,qi);
           }
-          for(bi=0;bi<4;bi++){
-            fragi=mb_maps[mbi][0][bi];
-            frag_mvs[fragi][0]=(signed char)dx;
-            frag_mvs[fragi][1]=(signed char)dy;
+          else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
+           modes[OC_MODE_INTER_MV].cost){
+            if(!(embs[mbi].refined&0x40)){
+              oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_GOLD);
+              embs[mbi].refined|=0x40;
+            }
+            mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
+             OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],qi);
           }
-        }
-        if(oc_enc_mb_transform_quantize_luma(_enc,&ps,mbi,
-         modes[mb_mode].overhead,&fr)>0){
-          int orig_mb_mode;
-          orig_mb_mode=mb_mode;
-          mb_mode=mb_modes[mbi];
-          switch(mb_mode){
-            case OC_MODE_INTER_MV:{
-              memcpy(prior_mv,last_mv,sizeof(prior_mv));
-              /*If we're backing out from 4MV, find the MV we're actually
-                 using.*/
-              if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){
-                for(bi=0;;bi++){
+          if(!(embs[mbi].refined&0x04)){
+            oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
+            embs[mbi].refined|=0x04;
+          }
+          mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
+           OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],qi);
+          /*Finally, pick the mode with the cheapest estimated bit cost.*/
+          mb_mode=0;
+          if(modes[1].cost<modes[0].cost)mb_mode=1;
+          if(modes[3].cost<modes[mb_mode].cost)mb_mode=3;
+          if(modes[4].cost<modes[mb_mode].cost)mb_mode=4;
+          if(modes[5].cost<modes[mb_mode].cost)mb_mode=5;
+          if(modes[6].cost<modes[mb_mode].cost)mb_mode=6;
+          if(modes[7].cost<modes[mb_mode].cost)mb_mode=7;
+          /*We prefer OC_MODE_INTER_MV, but not over LAST and LAST2.*/
+          if(mb_mode==OC_MODE_INTER_MV_LAST||mb_mode==OC_MODE_INTER_MV_LAST2){
+            inter_mv_pref=0;
+          }
+          if(modes[2].cost<modes[mb_mode].cost+inter_mv_pref)mb_mode=2;
+          mb_modes[mbi]=mb_mode;
+          /*Propagate the MVs to the luma blocks.*/
+          if(mb_mode!=OC_MODE_INTER_MV_FOUR){
+            switch(mb_mode){
+              case OC_MODE_INTER_MV:{
+                dx=embs[mbi].analysis_mv[0][OC_FRAME_PREV][0];
+                dy=embs[mbi].analysis_mv[0][OC_FRAME_PREV][1];
+              }break;
+              case OC_MODE_INTER_MV_LAST:{
+                dx=last_mv[0];
+                dy=last_mv[1];
+              }break;
+              case OC_MODE_INTER_MV_LAST2:{
+                dx=prior_mv[0];
+                dy=prior_mv[1];
+              }break;
+              case OC_MODE_GOLDEN_MV:{
+                dx=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][0];
+                dy=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][1];
+              }break;
+            }
+            for(bi=0;bi<4;bi++){
+              fragi=mb_maps[mbi][0][bi];
+              frag_mvs[fragi][0]=(signed char)dx;
+              frag_mvs[fragi][1]=(signed char)dy;
+            }
+          }
+          if(oc_enc_mb_transform_quantize_luma(_enc,&pipe,mbi,
+           modes[mb_mode].overhead)>0){
+            int orig_mb_mode;
+            orig_mb_mode=mb_mode;
+            mb_mode=mb_modes[mbi];
+            switch(mb_mode){
+              case OC_MODE_INTER_MV:{
+                memcpy(prior_mv,last_mv,sizeof(prior_mv));
+                /*If we're backing out from 4MV, find the MV we're actually
+                   using.*/
+                if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){
+                  for(bi=0;;bi++){
+                    fragi=mb_maps[mbi][0][bi];
+                    if(frags[fragi].coded){
+                      memcpy(last_mv,frag_mvs[fragi],sizeof(last_mv));
+                      dx=frag_mvs[fragi][0];
+                      dy=frag_mvs[fragi][1];
+                      break;
+                    }
+                  }
+                  mb_mv_bits_0=OC_MV_BITS[0][dx+31]+OC_MV_BITS[0][dy+31];
+                }
+                /*Otherwise we used the original analysis MV.*/
+                else{
+                  memcpy(last_mv,
+                   embs[mbi].analysis_mv[0][OC_FRAME_PREV],sizeof(last_mv));
+                }
+                _enc->mv_bits[0]+=mb_mv_bits_0;
+                _enc->mv_bits[1]+=12;
+              }break;
+              case OC_MODE_INTER_MV_LAST2:{
+                oc_mv tmp_mv;
+                memcpy(tmp_mv,prior_mv,sizeof(tmp_mv));
+                memcpy(prior_mv,last_mv,sizeof(prior_mv));
+                memcpy(last_mv,tmp_mv,sizeof(last_mv));
+              }break;
+              case OC_MODE_GOLDEN_MV:{
+                _enc->mv_bits[0]+=mb_gmv_bits_0;
+                _enc->mv_bits[1]+=12;
+              }break;
+              case OC_MODE_INTER_MV_FOUR:{
+                oc_mv lbmvs[4];
+                oc_mv cbmvs[4];
+                memcpy(prior_mv,last_mv,sizeof(prior_mv));
+                for(bi=0;bi<4;bi++){
                   fragi=mb_maps[mbi][0][bi];
                   if(frags[fragi].coded){
                     memcpy(last_mv,frag_mvs[fragi],sizeof(last_mv));
-                    dx=frag_mvs[fragi][0];
-                    dy=frag_mvs[fragi][1];
-                    break;
+                    memcpy(lbmvs[bi],frag_mvs[fragi],sizeof(lbmvs[bi]));
+                    _enc->mv_bits[0]+=OC_MV_BITS[0][frag_mvs[fragi][0]+31]
+                     +OC_MV_BITS[0][frag_mvs[fragi][1]+31];
+                    _enc->mv_bits[1]+=12;
                   }
+                  /*Replace the block MVs for not-coded blocks with (0,0).*/
+                  else memset(lbmvs[bi],0,sizeof(lbmvs[bi]));
                 }
-                mb_mv_bits_0=OC_MV_BITS[0][dx+31]+OC_MV_BITS[0][dy+31];
-              }
-              /*Otherwise we used the original analysis MV.*/
-              else{
-                memcpy(last_mv,
-                 embs[mbi].analysis_mv[0][OC_FRAME_PREV],sizeof(last_mv));
-              }
-              _enc->mv_bits[0]+=mb_mv_bits_0;
-              _enc->mv_bits[1]+=12;
-            }break;
-            case OC_MODE_INTER_MV_LAST2:{
-              oc_mv tmp_mv;
-              memcpy(tmp_mv,prior_mv,sizeof(tmp_mv));
-              memcpy(prior_mv,last_mv,sizeof(prior_mv));
-              memcpy(last_mv,tmp_mv,sizeof(last_mv));
-            }break;
-            case OC_MODE_GOLDEN_MV:{
-              _enc->mv_bits[0]+=mb_gmv_bits_0;
-              _enc->mv_bits[1]+=12;
-            }break;
-            case OC_MODE_INTER_MV_FOUR:{
-              oc_mv lbmvs[4];
-              oc_mv cbmvs[4];
-              memcpy(prior_mv,last_mv,sizeof(prior_mv));
-              for(bi=0;bi<4;bi++){
-                fragi=mb_maps[mbi][0][bi];
-                if(frags[fragi].coded){
-                  memcpy(last_mv,frag_mvs[fragi],sizeof(last_mv));
-                  memcpy(lbmvs[bi],frag_mvs[fragi],sizeof(lbmvs[bi]));
-                  _enc->mv_bits[0]+=OC_MV_BITS[0][frag_mvs[fragi][0]+31]
-                   +OC_MV_BITS[0][frag_mvs[fragi][1]+31];
-                  _enc->mv_bits[1]+=12;
+                (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
+                for(mapii=4;mapii<nmap_idxs;mapii++){
+                  mapi=map_idxs[mapii];
+                  pli=mapi>>2;
+                  bi=mapi&3;
+                  fragi=mb_maps[mbi][pli][bi];
+                  frags[fragi].mb_mode=mb_mode;
+                  memcpy(frag_mvs[fragi],cbmvs[bi],sizeof(frag_mvs[fragi]));
                 }
-                /*Replace the block MVs for not-coded blocks with (0,0).*/
-                else memset(lbmvs[bi],0,sizeof(lbmvs[bi]));
-              }
-              (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
-              for(mapii=4;mapii<nmap_idxs;mapii++){
-                mapi=map_idxs[mapii];
-                pli=mapi>>2;
-                bi=mapi&3;
-                fragi=mb_maps[mbi][pli][bi];
-                frags[fragi].mb_mode=mb_mode;
-                memcpy(frag_mvs[fragi],cbmvs[bi],sizeof(frag_mvs[fragi]));
-              }
-            }break;
+              }break;
+            }
+            coded_mbis[ncoded_mbis++]=mbi;
+            oc_mode_scheme_chooser_update(&_enc->chooser,mb_mode);
+            interbits+=modes[mb_mode].rate+modes[mb_mode].overhead;
           }
-          coded_mbis[ncoded_mbis++]=mbi;
-          oc_mode_scheme_chooser_update(&_enc->chooser,mb_mode);
-          interbits+=modes[mb_mode].rate+modes[mb_mode].overhead;
+          else{
+            *(uncoded_mbis-++nuncoded_mbis)=mbi;
+            mb_mode=OC_MODE_INTER_NOMV;
+            dx=dy=0;
+          }
         }
-        else{
-          *(uncoded_mbis-++nuncoded_mbis)=mbi;
-          mb_mode=OC_MODE_INTER_NOMV;
-          dx=dy=0;
+        /*Propagate final MB mode and MVs to the chroma blocks.
+          This has already been done for 4MV mode, since it requires individual
+           block motion vectors.*/
+        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
+          for(mapii=4;mapii<nmap_idxs;mapii++){
+            mapi=map_idxs[mapii];
+            pli=mapi>>2;
+            bi=mapi&3;
+            fragi=mb_maps[mbi][pli][bi];
+            frags[fragi].mb_mode=mb_mode;
+            frag_mvs[fragi][0]=(signed char)dx;
+            frag_mvs[fragi][1]=(signed char)dy;
+          }
         }
       }
-      /*Propagate final MB mode and MVs to the chroma blocks.
-        This has already been done for 4MV mode, since it requires individual
-         block motion vectors.*/
-      if(mb_mode!=OC_MODE_INTER_MV_FOUR){
-        for(mapii=4;mapii<nmap_idxs;mapii++){
-          mapi=map_idxs[mapii];
-          pli=mapi>>2;
-          bi=mapi&3;
-          fragi=mb_maps[mbi][pli][bi];
-          frags[fragi].mb_mode=mb_mode;
-          frag_mvs[fragi][0]=(signed char)dx;
-          frag_mvs[fragi][1]=(signed char)dy;
-        }
+      oc_fr_finish_sb(pipe.fr+0);
+      sb_flags[sbi].coded_fully=pipe.fr[0].sb_full_last;
+      sb_flags[sbi].coded_partially=pipe.fr[0].sb_partial_last;
+    }
+    oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,0,notstart,notdone);
+    /*Code chroma planes.*/
+    for(pli=1;pli<3;pli++){
+      fplane=_enc->state.fplanes+pli;
+      sbi=fplane->sboffset+(stripe_sby>>vdec)*fplane->nhsbs;
+      pipe.fragy0[pli]=stripe_sby<<2-vdec;
+      if(notdone){
+        sbi_end=sbi+(sby_end-stripe_sby>>vdec)*fplane->nhsbs;
+        pipe.fragy_end[pli]=sby_end<<2-vdec;
       }
+      else{
+        sbi_end=fplane->sboffset+fplane->nsbs;
+        pipe.fragy_end[pli]=fplane->nvfrags;
+      }
+      oc_enc_sb_transform_quantize_chroma(_enc,&pipe,pli,sbi,sbi_end);
+      oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,pli,notstart,notdone);
     }
-    oc_fr_finish_sb(&fr);
-    sb_flags[sbi].coded_fully=fr.sb_full_last;
-    sb_flags[sbi].coded_partially=fr.sb_partial_last;
+    notstart=1;
   }
-  /*Code Cb plane.*/
-  oc_plane_state_plane_setup(_enc,&ps,1);
-  sbi=sbi_end;
-  sbi_end=sbi+_enc->state.fplanes[1].nsbs;
-  oc_enc_sb_transform_quantize_chroma(_enc,&ps,sbi,sbi_end,&fr);
-  /*Code Cr plane.*/
-  oc_plane_state_plane_setup(_enc,&ps,2);
-  sbi=sbi_end;
-  sbi_end=sbi+_enc->state.fplanes[2].nsbs;
-  oc_enc_sb_transform_quantize_chroma(_enc,&ps,sbi,sbi_end,&fr);
+  /*Finish filling in the reference frame borders.*/
+  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
+  for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
+  /*Finish adding flagging overhead costs to inter bit counts to determine if
+     we should have coded a key frame instead.*/
   if(_enc->state.frame_type!=OC_INTRA_FRAME){
     if(interbits>intrabits)return 1;
-    /*Finish adding flagging overhead costs to inter bit counts.*/
-    oc_fr_flush(&fr);
-    interbits+=fr.bits<<OC_BIT_SCALE;
+    /*Technically the chroma plane counts are over-estimations, because they
+       don't account for continuing runs from the luma planes, but the
+       inaccuracy is small.*/
+    for(pli=0;pli<3;pli++){
+      oc_fr_flush(pipe.fr+pli);
+      interbits+=pipe.fr[pli].bits<<OC_BIT_SCALE;
+    }
     interbits+=OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
     interbits+=
      _enc->chooser.scheme_bits[_enc->chooser.scheme_list[0]]<<OC_BIT_SCALE;

Modified: branches/theora-thusnelda/lib/enc/encint.h
===================================================================
--- branches/theora-thusnelda/lib/enc/encint.h	2009-06-14 18:45:42 UTC (rev 16129)
+++ branches/theora-thusnelda/lib/enc/encint.h	2009-06-14 18:50:22 UTC (rev 16130)
@@ -335,6 +335,10 @@
  oc_token_checkpoint **_stack,int _acmin);
 void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
  const oc_token_checkpoint *_stack,int _n);
+void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,int _pli,int _y0,int _yend);
+void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
+ const ptrdiff_t *_coded_fragis,ptrdiff_t _ncoded_fragis,
+ int _prev_ndct_tokens1,int _prev_eob_run1);
 void oc_enc_tokenize_finish(oc_enc_ctx *_enc);
 
 

Modified: branches/theora-thusnelda/lib/enc/encode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encode.c	2009-06-14 18:45:42 UTC (rev 16129)
+++ branches/theora-thusnelda/lib/enc/encode.c	2009-06-14 18:50:22 UTC (rev 16130)
@@ -946,6 +946,15 @@
   oc_enc_calc_lambda(_enc,OC_INTRA_FRAME);
   oc_enc_analyze(_enc,OC_INTRA_FRAME,_recode);
   oc_enc_frame_pack(_enc);
+  /*On the first frame, the previous call was an initial dry-run to prime
+     feed-forward statistics.*/
+  if(!_recode&&_enc->state.curframe_num==0){
+    if(_enc->state.info.target_bitrate>0){
+      oc_enc_update_rc_state(_enc,oggpackB_bytes(&_enc->opb)<<3,
+       OC_INTRA_FRAME,_enc->state.qis[0],1);
+    }
+    oc_enc_compress_keyframe(_enc,1);
+  }
 }
 
 static void oc_enc_compress_frame(oc_enc_ctx *_enc,int _recode){
@@ -1261,40 +1270,15 @@
   _enc->state.ref_frame_idx[OC_FRAME_SELF]=refi;
   _enc->state.curframe_num+=_enc->prev_dup_count+1;
   /*Step 4: Compress the frame.*/
-  /*Don't allow the generation of invalid files that overflow the
-     keyframe_granule_shift.*/
+  /*Start with a keyframe, and don't allow the generation of invalid files that
+     overflow the keyframe_granule_shift.*/
   if(_enc->state.curframe_num==0||
    _enc->state.curframe_num-_enc->state.keyframe_num+_enc->dup_count>=
    _enc->keyframe_frequency_force){
     oc_enc_compress_keyframe(_enc,0);
-    /*On the first frame, the previous call was an initial dry-run to prime
-       feed-forward statistics.*/
-    if(_enc->state.curframe_num==0){
-      if(_enc->state.info.target_bitrate>0){
-        oc_enc_update_rc_state(_enc,oggpackB_bytes(&_enc->opb)<<3,
-         OC_INTRA_FRAME,_enc->state.qis[0],1);
-      }
-      oc_enc_compress_keyframe(_enc,1);
-    }
   }
   /*Compress the frame.*/
   else oc_enc_compress_frame(_enc,0);
-  /*Step 5: Finish reconstruction.
-    TODO: Move this inline with compression process.*/
-  {
-    int bv[256];
-    int loop_filter;
-    loop_filter=!oc_state_loop_filter_init(&_enc->state,bv);
-    for(pli=0;pli<3;pli++){
-      if(loop_filter){
-        oc_state_loop_filter_frag_rows(&_enc->state,bv,refi,pli,
-         0,_enc->state.fplanes[pli].nvfrags);
-      }
-      oc_state_borders_fill_rows(&_enc->state,refi,pli,
-       0,_enc->state.ref_frame_bufs[refi][pli].height);
-      oc_state_borders_fill_caps(&_enc->state,refi,pli);
-    }
-  }
   oc_restore_fpu(&_enc->state);
   /*Update state variables.*/
   _enc->packet_state=OC_PACKET_READY;

Modified: branches/theora-thusnelda/lib/enc/mcenc.c
===================================================================
--- branches/theora-thusnelda/lib/enc/mcenc.c	2009-06-14 18:45:42 UTC (rev 16129)
+++ branches/theora-thusnelda/lib/enc/mcenc.c	2009-06-14 18:50:22 UTC (rev 16130)
@@ -264,22 +264,22 @@
     may cause increased degradation in many blocks to come.
     We could artificially reduce lambda to compensate, but it's faster to just
      disable it entirely, and use D (the distortion) as the sole criterion.*/
-  const ptrdiff_t       *frag_buf_offs;
-  const ptrdiff_t       *fragis;
-  const unsigned char   *src;
-  const unsigned char   *ref;
-  int                    ystride;
-  oc_mb_enc_info        *embs;
-  ogg_int32_t            hit_cache[31];
-  ogg_int32_t            hitbit;
-  unsigned               best_block_err[4];
-  unsigned               block_err[4];
-  unsigned               best_err;
-  int                    best_vec[2];
-  int                    best_block_vec[4][2];
-  int                    candx;
-  int                    candy;
-  int                    bi;
+  const ptrdiff_t     *frag_buf_offs;
+  const ptrdiff_t     *fragis;
+  const unsigned char *src;
+  const unsigned char *ref;
+  int                  ystride;
+  oc_mb_enc_info      *embs;
+  ogg_int32_t          hit_cache[31];
+  ogg_int32_t          hitbit;
+  unsigned             best_block_err[4];
+  unsigned             block_err[4];
+  unsigned             best_err;
+  int                  best_vec[2];
+  int                  best_block_vec[4][2];
+  int                  candx;
+  int                  candy;
+  int                  bi;
   embs=_enc->mb_info;
   /*Find some candidate motion vectors.*/
   oc_mcenc_find_candidates(_enc,_mcenc,_mbi,_frame);
@@ -701,14 +701,14 @@
 }
 
 void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi){
-  oc_mb_enc_info        *embs;
-  const ptrdiff_t       *frag_buf_offs;
-  const ptrdiff_t       *fragis;
-  const unsigned char   *src;
-  const unsigned char   *ref;
-  int                    offset_y[9];
-  int                    ystride;
-  int                    bi;
+  oc_mb_enc_info      *embs;
+  const ptrdiff_t     *frag_buf_offs;
+  const ptrdiff_t     *fragis;
+  const unsigned char *src;
+  const unsigned char *ref;
+  int                  offset_y[9];
+  int                  ystride;
+  int                  bi;
   ystride=_enc->state.ref_ystride[0];
   frag_buf_offs=_enc->state.frag_buf_offs;
   fragis=_enc->state.mb_maps[_mbi][0];

Modified: branches/theora-thusnelda/lib/enc/tokenize.c
===================================================================
--- branches/theora-thusnelda/lib/enc/tokenize.c	2009-06-14 18:45:42 UTC (rev 16129)
+++ branches/theora-thusnelda/lib/enc/tokenize.c	2009-06-14 18:50:22 UTC (rev 16130)
@@ -392,7 +392,7 @@
   return total_bits;
 }
 
-static void oc_enc_pred_dc_rows(oc_enc_ctx *_enc,int _pli,int _y0,int _yend){
+void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,int _pli,int _y0,int _yend){
   const oc_fragment_plane *fplane;
   const oc_fragment       *frags;
   ogg_int16_t             *frag_dc;
@@ -420,239 +420,242 @@
   }
 }
 
-static void oc_enc_tokenize_dc(oc_enc_ctx *_enc){
+void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
+ const ptrdiff_t *_coded_fragis,ptrdiff_t _ncoded_fragis,
+ int _prev_ndct_tokens1,int _prev_eob_run1){
   const ogg_int16_t *frag_dc;
-  const ptrdiff_t   *coded_fragis;
-  ptrdiff_t          ncoded_fragis;
   ptrdiff_t          fragii;
-  int                pli;
+  unsigned char     *dct_tokens0;
+  unsigned char     *dct_tokens1;
+  ogg_uint16_t      *extra_bits0;
+  ogg_uint16_t      *extra_bits1;
+  ptrdiff_t          ti0;
+  ptrdiff_t          ti1r;
+  ptrdiff_t          ti1w;
+  int                eob_run0;
+  int                eob_run1;
+  int                neobs1;
+  int                token;
+  int                eb;
+  /*eb1 and token1 are always initialized before use; if your compiler thinks
+     otherwise, it is dumb.*/
+  int                token1;
+  int                eb1;
+  /*Return immediately if there are no coded fragments; otherwise we'd flush
+     any trailing EOB run into the AC 1 list and never read it back out.*/
+  if(_ncoded_fragis<=0)return;
   frag_dc=_enc->frag_dc;
-  coded_fragis=_enc->state.coded_fragis;
-  ncoded_fragis=fragii=0;
-  for(pli=0;pli<3;pli++){
-    unsigned char *dct_tokens0;
-    unsigned char *dct_tokens1;
-    ogg_uint16_t  *extra_bits0;
-    ogg_uint16_t  *extra_bits1;
-    ptrdiff_t      ti0;
-    ptrdiff_t      ti1r;
-    ptrdiff_t      ti1w;
-    int            eob_run0;
-    int            eob_run1;
-    int            neobs1;
-    int            token;
-    int            eb;
-    int            token1;
-    int            eb1;
-    /*TODO: Move this inline with reconstruction.*/
-    oc_enc_pred_dc_rows(_enc,pli,0,_enc->state.fplanes[pli].nvfrags);
-    dct_tokens0=_enc->dct_tokens[pli][0];
-    dct_tokens1=_enc->dct_tokens[pli][1];
-    extra_bits0=_enc->extra_bits[pli][0];
-    extra_bits1=_enc->extra_bits[pli][1];
-    ncoded_fragis+=_enc->state.ncoded_fragis[pli];
-    ti0=ti1w=ti1r=0;
-    eob_run0=eob_run1=neobs1=0;
-    for(;fragii<ncoded_fragis;fragii++){
-      int val;
-      /*All tokens in the 1st AC coefficient stack are regenerated as the DC
-         coefficients are produced.
-        This can be done in-place; stack 1 cannot get larger.*/
-      if(!neobs1){
-        /*There's no active EOB run in stack 1; read the next token.*/
-        token1=dct_tokens1[ti1r];
-        eb1=extra_bits1[ti1r];
-        ti1r++;
-        if(token1<OC_NDCT_EOB_TOKEN_MAX){
-          neobs1=oc_decode_eob_token(token1,eb1);
-          /*It's an EOB run; add it to the current (inactive) one.
-            Because we may have moved entries to stack 0, we may have an
-             opportunity to merge two EOB runs in stack 1.*/
-          eob_run1+=neobs1;
-        }
+  dct_tokens0=_enc->dct_tokens[_pli][0];
+  dct_tokens1=_enc->dct_tokens[_pli][1];
+  extra_bits0=_enc->extra_bits[_pli][0];
+  extra_bits1=_enc->extra_bits[_pli][1];
+  ti0=_enc->ndct_tokens[_pli][0];
+  ti1w=ti1r=_prev_ndct_tokens1;
+  eob_run0=_enc->eob_run[_pli][0];
+  /*Flush any trailing EOB run for the 1st AC coefficient.
+    This is needed to allow us to track tokens to the end of the list.*/
+  eob_run1=_enc->eob_run[_pli][1];
+  if(eob_run1>0)oc_enc_eob_log(_enc,_pli,1,eob_run1);
+  /*If there was an active EOB run at the start of the 1st AC stack, read it
+     in and decode it.*/
+  if(_prev_eob_run1>0){
+    token1=dct_tokens1[ti1r];
+    eb1=extra_bits1[ti1r];
+    ti1r++;
+    eob_run1=oc_decode_eob_token(token1,eb1);
+    /*Consume the portion of the run that came before these fragments.*/
+    neobs1=eob_run1-_prev_eob_run1;
+  }
+  else eob_run1=neobs1=0;
+  for(fragii=0;fragii<_ncoded_fragis;fragii++){
+    int val;
+    /*All tokens in the 1st AC coefficient stack are regenerated as the DC
+       coefficients are produced.
+      This can be done in-place; stack 1 cannot get larger.*/
+    if(!neobs1){
+      /*There's no active EOB run in stack 1; read the next token.*/
+      token1=dct_tokens1[ti1r];
+      eb1=extra_bits1[ti1r];
+      ti1r++;
+      if(token1<OC_NDCT_EOB_TOKEN_MAX){
+        neobs1=oc_decode_eob_token(token1,eb1);
+        /*It's an EOB run; add it to the current (inactive) one.
+          Because we may have moved entries to stack 0, we may have an
+           opportunity to merge two EOB runs in stack 1.*/
+        eob_run1+=neobs1;
       }
-      val=frag_dc[coded_fragis[fragii]];
-      if(val){
-        /*There was a non-zero DC value, so there's no alteration to stack 1
-           for this fragment; just code the stack 0 token.*/
-        /*Flush any pending EOB run.*/
-        if(eob_run0>0){
+    }
+    val=frag_dc[_coded_fragis[fragii]];
+    if(val){
+      /*There was a non-zero DC value, so there's no alteration to stack 1
+         for this fragment; just code the stack 0 token.*/
+      /*Flush any pending EOB run.*/
+      if(eob_run0>0){
+        token=oc_make_eob_token_full(eob_run0,&eb);
+        dct_tokens0[ti0]=(unsigned char)token;
+        extra_bits0[ti0]=(ogg_uint16_t)eb;
+        ti0++;
+        eob_run0=0;
+      }
+      token=oc_make_dct_token_full(0,0,val,&eb);
+      dct_tokens0[ti0]=(unsigned char)token;
+      extra_bits0[ti0]=(ogg_uint16_t)eb;
+      ti0++;
+    }
+    else{
+      /*Zero DC value; that means the entry in stack 1 might need to be coded
+         from stack 0.
+        This requires a stack 1 fixup.*/
+      if(neobs1>0){
+        /*We're in the middle of an active EOB run in stack 1.
+          Move it to stack 0.*/
+        if(++eob_run0>=4095){
           token=oc_make_eob_token_full(eob_run0,&eb);
           dct_tokens0[ti0]=(unsigned char)token;
           extra_bits0[ti0]=(ogg_uint16_t)eb;
           ti0++;
           eob_run0=0;
         }
-        token=oc_make_dct_token_full(0,0,val,&eb);
-        dct_tokens0[ti0]=(unsigned char)token;
-        extra_bits0[ti0]=(ogg_uint16_t)eb;
-        ti0++;
+        eob_run1--;
       }
       else{
-        /*Zero DC value; that means the entry in stack 1 might need to be coded
-           from stack 0.
-          This requires a stack 1 fixup.*/
-        if(neobs1){
-          /*We're in the middle of an active EOB run in stack 1.
-            Move it to stack 0.*/
-          if(++eob_run0>=4095){
-            token=oc_make_eob_token_full(eob_run0,&eb);
-            dct_tokens0[ti0]=(unsigned char)token;
-            extra_bits0[ti0]=(ogg_uint16_t)eb;
-            ti0++;
-            eob_run0=0;
-          }
-          eob_run1--;
+        /*No active EOB run in stack 1, so we can't extend one in stack 0.
+          Flush it if we've got it.*/
+        if(eob_run0>0){
+          token=oc_make_eob_token_full(eob_run0,&eb);
+          dct_tokens0[ti0]=(unsigned char)token;
+          extra_bits0[ti0]=(ogg_uint16_t)eb;
+          ti0++;
+          eob_run0=0;
         }
-        else{
-          /*No active EOB run in stack 1, so we can't extend one in stack 0.
-            Flush it if we've got it.*/
-          if(eob_run0>0){
-            token=oc_make_eob_token_full(eob_run0,&eb);
-            dct_tokens0[ti0]=(unsigned char)token;
-            extra_bits0[ti0]=(ogg_uint16_t)eb;
-            ti0++;
-            eob_run0=0;
-          }
-          /*Stack 1 token is one of: a pure zero run token, a single
-             coefficient token, or a zero run/coefficient combo token.
-            A zero run token is expanded and moved to token stack 0, and the
-             stack 1 entry dropped.
-            A single coefficient value may be transformed into combo token that
-             is moved to stack 0, or if it cannot be combined, it is left alone
-             and a single length-1 zero run is emitted in stack 0.
-            A combo token is extended and moved to stack 0.
-            During AC coding, we restrict the run lengths on combo tokens for
-             stack 1 to guarantee we can extend them.*/
-          switch(token1){
-            case OC_DCT_SHORT_ZRL_TOKEN:{
-              if(eb1<7){
-                dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
-                extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
-                ti0++;
-                /*Don't write the AC coefficient back out.*/
-                continue;
-              }
-              /*Fall through.*/
-            }
-            case OC_DCT_ZRL_TOKEN:{
-              dct_tokens0[ti0]=OC_DCT_ZRL_TOKEN;
+        /*Stack 1 token is one of: a pure zero run token, a single
+           coefficient token, or a zero run/coefficient combo token.
+          A zero run token is expanded and moved to token stack 0, and the
+           stack 1 entry dropped.
+          A single coefficient value may be transformed into combo token that
+           is moved to stack 0, or if it cannot be combined, it is left alone
+           and a single length-1 zero run is emitted in stack 0.
+          A combo token is extended and moved to stack 0.
+          During AC coding, we restrict the run lengths on combo tokens for
+           stack 1 to guarantee we can extend them.*/
+        switch(token1){
+          case OC_DCT_SHORT_ZRL_TOKEN:{
+            if(eb1<7){
+              dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
               extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
               ti0++;
               /*Don't write the AC coefficient back out.*/
-            }continue;
-            case OC_ONE_TOKEN:
-            case OC_MINUS_ONE_TOKEN:{
-              dct_tokens0[ti0]=OC_DCT_RUN_CAT1A;
-              extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_ONE_TOKEN);
-              ti0++;
-              /*Don't write the AC coefficient back out.*/
-            }continue;
-            case OC_TWO_TOKEN:
-            case OC_MINUS_TWO_TOKEN:{
-              dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
-              extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_TWO_TOKEN<<1);
-              ti0++;
-              /*Don't write the AC coefficient back out.*/
-            }continue;
-            case OC_DCT_VAL_CAT2:{
-              dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
-              extra_bits0[ti0]=(ogg_uint16_t)((eb1<<1)+1);
-              ti0++;
-              /*Don't write the AC coefficient back out.*/
-            }continue;
-            case OC_DCT_RUN_CAT1A:
-            case OC_DCT_RUN_CAT1A+1:
-            case OC_DCT_RUN_CAT1A+2:
-            case OC_DCT_RUN_CAT1A+3:{
-              dct_tokens0[ti0]=(unsigned char)(token1+1);
-              extra_bits0[ti0]=(ogg_uint16_t)eb1;
-              ti0++;
-              /*Don't write the AC coefficient back out.*/
-            }continue;
-            case OC_DCT_RUN_CAT1A+4:{
+              continue;
+            }
+            /*Fall through.*/
+          }
+          case OC_DCT_ZRL_TOKEN:{
+            dct_tokens0[ti0]=OC_DCT_ZRL_TOKEN;
+            extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
+            ti0++;
+            /*Don't write the AC coefficient back out.*/
+          }continue;
+          case OC_ONE_TOKEN:
+          case OC_MINUS_ONE_TOKEN:{
+            dct_tokens0[ti0]=OC_DCT_RUN_CAT1A;
+            extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_ONE_TOKEN);
+            ti0++;
+            /*Don't write the AC coefficient back out.*/
+          }continue;
+          case OC_TWO_TOKEN:
+          case OC_MINUS_TWO_TOKEN:{
+            dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
+            extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_TWO_TOKEN<<1);
+            ti0++;
+            /*Don't write the AC coefficient back out.*/
+          }continue;
+          case OC_DCT_VAL_CAT2:{
+            dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
+            extra_bits0[ti0]=(ogg_uint16_t)((eb1<<1)+1);
+            ti0++;
+            /*Don't write the AC coefficient back out.*/
+          }continue;
+          case OC_DCT_RUN_CAT1A:
+          case OC_DCT_RUN_CAT1A+1:
+          case OC_DCT_RUN_CAT1A+2:
+          case OC_DCT_RUN_CAT1A+3:{
+            dct_tokens0[ti0]=(unsigned char)(token1+1);
+            extra_bits0[ti0]=(ogg_uint16_t)eb1;
+            ti0++;
+            /*Don't write the AC coefficient back out.*/
+          }continue;
+          case OC_DCT_RUN_CAT1A+4:{
+            dct_tokens0[ti0]=OC_DCT_RUN_CAT1B;
+            extra_bits0[ti0]=(ogg_uint16_t)(eb1<<2);
+            ti0++;
+            /*Don't write the AC coefficient back out.*/
+          }continue;
+          case OC_DCT_RUN_CAT1B:{
+            if((eb1&3)<3){
               dct_tokens0[ti0]=OC_DCT_RUN_CAT1B;
-              extra_bits0[ti0]=(ogg_uint16_t)(eb1<<2);
-              ti0++;
-              /*Don't write the AC coefficient back out.*/
-            }continue;
-            case OC_DCT_RUN_CAT1B:{
-              if((eb1&3)<3){
-                dct_tokens0[ti0]=OC_DCT_RUN_CAT1B;
-                extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
-                ti0++;
-                /*Don't write the AC coefficient back out.*/
-                continue;
-              }
-              eb1=((eb1&4)<<1)-1;
-              /*Fall through.*/
-            }
-            case OC_DCT_RUN_CAT1C:{
-              dct_tokens0[ti0]=OC_DCT_RUN_CAT1C;
               extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
               ti0++;
               /*Don't write the AC coefficient back out.*/
-            }continue;
-            case OC_DCT_RUN_CAT2A:{
-              eb1=(eb1<<1)-1;
-              /*Fall through.*/
+              continue;
             }
-            case OC_DCT_RUN_CAT2B:{
-              dct_tokens0[ti0]=OC_DCT_RUN_CAT2B;
-              extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
-              ti0++;
-              /*Don't write the AC coefficient back out.*/
-            }continue;
+            eb1=((eb1&4)<<1)-1;
+            /*Fall through.*/
           }
-          /*We can't merge tokens, write a short zero run and keep going.*/
-          dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
-          extra_bits0[ti0]=0;
-          ti0++;
+          case OC_DCT_RUN_CAT1C:{
+            dct_tokens0[ti0]=OC_DCT_RUN_CAT1C;
+            extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
+            ti0++;
+            /*Don't write the AC coefficient back out.*/
+          }continue;
+          case OC_DCT_RUN_CAT2A:{
+            eb1=(eb1<<1)-1;
+            /*Fall through.*/
+          }
+          case OC_DCT_RUN_CAT2B:{
+            dct_tokens0[ti0]=OC_DCT_RUN_CAT2B;
+            extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
+            ti0++;
+            /*Don't write the AC coefficient back out.*/
+          }continue;
         }
+        /*We can't merge tokens, write a short zero run and keep going.*/
+        dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
+        extra_bits0[ti0]=0;
+        ti0++;
       }
-      if(!neobs1){
-        /*Flush any (inactive) EOB run.*/
-        if(eob_run1>0){
-          token=oc_make_eob_token_full(eob_run1,&eb);
-          dct_tokens1[ti1w]=(unsigned char)token;
-          extra_bits1[ti1w]=(ogg_uint16_t)eb;
-          ti1w++;
-          eob_run1=0;
-        }
-        /*There's no active EOB run, so log the current token.*/
-        dct_tokens1[ti1w]=(unsigned char)token1;
-        extra_bits1[ti1w]=(ogg_uint16_t)eb1;
+    }
+    if(!neobs1){
+      /*Flush any (inactive) EOB run.*/
+      if(eob_run1>0){
+        token=oc_make_eob_token_full(eob_run1,&eb);
+        dct_tokens1[ti1w]=(unsigned char)token;
+        extra_bits1[ti1w]=(ogg_uint16_t)eb;
         ti1w++;
+        eob_run1=0;
       }
-      else{
-        /*Otherwise consume one EOB from the current run.*/
-        neobs1--;
-        /*If we have more than 4095 EOBs outstanding in stack1, flush the run.*/
-        if(eob_run1-neobs1>=4095){
-          token=oc_make_eob_token_full(4095,&eb);
-          dct_tokens1[ti1w]=(unsigned char)token;
-          extra_bits1[ti1w]=(ogg_uint16_t)eb;
-          ti1w++;
-          eob_run1-=4095;
-        }
+      /*There's no active EOB run, so log the current token.*/
+      dct_tokens1[ti1w]=(unsigned char)token1;
+      extra_bits1[ti1w]=(ogg_uint16_t)eb1;
+      ti1w++;
+    }
+    else{
+      /*Otherwise consume one EOB from the current run.*/
+      neobs1--;
+      /*If we have more than 4095 EOBs outstanding in stack1, flush the run.*/
+      if(eob_run1-neobs1>=4095){
+        token=oc_make_eob_token_full(4095,&eb);
+        dct_tokens1[ti1w]=(unsigned char)token;
+        extra_bits1[ti1w]=(ogg_uint16_t)eb;
+        ti1w++;
+        eob_run1-=4095;
       }
     }
-    /*Flush the trailing EOB runs.*/
-    if(eob_run0>0){
-      token=oc_make_eob_token_full(eob_run0,&eb);
-      dct_tokens0[ti0]=(unsigned char)token;
-      extra_bits0[ti0]=(ogg_uint16_t)eb;
-      ti0++;
-    }
-    if(eob_run1>0){
-      token=oc_make_eob_token_full(eob_run1,&eb);
-      dct_tokens1[ti1w]=(unsigned char)token;
-      extra_bits1[ti1w]=(ogg_uint16_t)eb;
-      ti1w++;
-    }
-    _enc->ndct_tokens[pli][0]=ti0;
-    _enc->ndct_tokens[pli][1]=ti1w;
   }
+  /*Save the current state.*/
+  _enc->ndct_tokens[_pli][0]=ti0;
+  _enc->ndct_tokens[_pli][1]=ti1w;
+  _enc->eob_run[_pli][0]=eob_run0;
+  _enc->eob_run[_pli][1]=eob_run1;
 }
 
 /*DC prediction, post-facto DC tokenization (has to be completed after DC
@@ -660,16 +663,12 @@
 void oc_enc_tokenize_finish(oc_enc_ctx *_enc){
   int pli;
   int zzi;
-  /*Emit final EOB runs for the AC coefficients.
-    This must be done before we tokenize the DC coefficients, so we can
-     properly track the 1st AC coefficient to the end of the list.*/
-  for(pli=0;pli<3;pli++)for(zzi=1;zzi<64;zzi++){
+  /*Emit final EOB runs.*/
+  for(pli=0;pli<3;pli++)for(zzi=0;zzi<64;zzi++){
     int eob_run;
     eob_run=_enc->eob_run[pli][zzi];
     if(eob_run>0)oc_enc_eob_log(_enc,pli,zzi,eob_run);
   }
-  /*Fill in the DC token list and fix-up the 1st AC coefficient.*/
-  oc_enc_tokenize_dc(_enc);
   /*Merge the final EOB run of one token list with the start of the next, if
      possible.*/
   for(zzi=0;zzi<64;zzi++)for(pli=0;pli<3;pli++){
@@ -681,6 +680,8 @@
     int       new_eb;
     int       zzj;
     int       plj;
+    /*ti is always initialized before use; if your compiler thinks otherwise,
+       it is dumb.*/
     ptrdiff_t ti;
     int       run_count;
     /*Make sure this coefficient has tokens at all.*/
@@ -704,7 +705,6 @@
     /*Ensure its last token was an EOB run.*/
     if(old_tok1>=OC_NDCT_EOB_TOKEN_MAX)continue;
     /*Pull off the associated extra bits, if any, and decode the runs.*/
-    /*ti is always initialized; if your compiler thinks otherwise, it is dumb.*/
     old_eb1=_enc->extra_bits[plj][zzj][ti];
     old_eb2=_enc->extra_bits[pli][zzi][0];
     run_count=oc_decode_eob_token(old_tok1,old_eb1)



More information about the commits mailing list