[xiph-commits] r15555 - branches/theora-thusnelda/lib/enc

Wed Dec 3 02:34:41 PST 2008

Author: xiphmont
Date: 2008-12-03 02:34:41 -0800 (Wed, 03 Dec 2008)
New Revision: 15555

Modified:
   branches/theora-thusnelda/lib/enc/codec_internal.h
   branches/theora-thusnelda/lib/enc/dct_encode.c
   branches/theora-thusnelda/lib/enc/encode.c
   branches/theora-thusnelda/lib/enc/encoder_toplevel.c
   branches/theora-thusnelda/lib/enc/mode.c
Log:
Ongoing skip refinement work as prompted by OVA slideshow transcodes



Modified: branches/theora-thusnelda/lib/enc/codec_internal.h
===================================================================

--- branches/theora-thusnelda/lib/enc/codec_internal.h	2008-12-03 00:21:45 UTC (rev 15554)
+++ branches/theora-thusnelda/lib/enc/codec_internal.h	2008-12-03 10:34:41 UTC (rev 15555)
@@ -344,7 +344,7 @@
 			      token_checkpoint_t *stack,
 			      int n);
 extern void dct_tokenize_init (CP_INSTANCE *cpi);
-extern void dct_tokenize_AC (CP_INSTANCE *cpi, 
+extern int dct_tokenize_AC (CP_INSTANCE *cpi, 
 			     int fi, 
 			     ogg_int16_t *dct, 
 			     ogg_int16_t *dequant, 

Modified: branches/theora-thusnelda/lib/enc/dct_encode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/dct_encode.c	2008-12-03 00:21:45 UTC (rev 15554)
+++ branches/theora-thusnelda/lib/enc/dct_encode.c	2008-12-03 10:34:41 UTC (rev 15555)
@@ -5,7 +5,7 @@
  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
  * by the Xiph.Org Foundation http://www.xiph.org/                  *
  *                                                                  *
  ********************************************************************
@@ -224,6 +224,12 @@
   48,48,48,48,48,48,48,48,
   48,48,48,48,48,48,48,48};
 
+/* only counts bits */
+static int tokencost(CP_INSTANCE *cpi, int huff, int coeff, int token){
+  huff += acoffset[coeff];
+  return cpi->HuffCodeLengthArray_VP3x[huff][token] + cpi->ExtraBitLengths_VP3x[token];
+}
+
 void tokenlog_rollback(CP_INSTANCE *cpi, token_checkpoint_t *stack,int n){
   int i;
   for(i=n-1;i>=0;i--){
@@ -316,11 +322,15 @@
   tokenlog_metrics(cpi,coeff,chroma,token);
 }
 
-static void tokenize_eobrun(CP_INSTANCE *cpi, int pos, int run, token_checkpoint_t **stack){
+static int tokenize_eobrun(CP_INSTANCE *cpi, int pos, int run, token_checkpoint_t **stack){
   int token=0,eb=0;
   int chroma = !(run&0x8000);
+  int huff = cpi->huffchoice[cpi->FrameType!=KEY_FRAME][1][chroma];
+
   make_eobrun_token(run&0x7fff, &token, &eb);
   token_add(cpi, chroma, pos, token, eb, stack);
+
+  return tokencost(cpi,huff,pos,token);
 }
 
 
@@ -360,7 +370,7 @@
 			   token_checkpoint_t **stack){
   int eb=0;
   int token=make_dct_token(cpi,coeff,coeff2,val,&eb);
-  
+
   /* Emit pending EOB run if any */
   if(cpi->eob_run[coeff]){
     tokenize_eobrun(cpi,coeff,cpi->eob_run[coeff],stack);
@@ -378,19 +388,20 @@
   return 1;
 }
 
-static void tokenize_mark_run(CP_INSTANCE *cpi, 
+static int tokenize_mark_run(CP_INSTANCE *cpi, 
 			      int chroma,
 			      int fi,
 			      int pre,
 			      int coeff,
 			      token_checkpoint_t **stack){
-  
+  int cost = 0;
+
   if(pre && cpi->dct_token_count[coeff] == 0){
     if(stack)tokenlog_mark(cpi,coeff,stack); /* log an undo without logging a token */
     cpi->eob_pre[coeff]++;
   }else{
     if((cpi->eob_run[coeff]&0x7fff) == 4095){
-      tokenize_eobrun(cpi,coeff,cpi->eob_run[coeff],stack);
+      cost += tokenize_eobrun(cpi,coeff,cpi->eob_run[coeff],stack);
       cpi->eob_run[coeff] = 0;
     }
     
@@ -401,14 +412,9 @@
 #ifdef COLLECT_METRICS
   cpi->dct_eob_fi_stack[coeff][cpi->dct_eob_fi_count[coeff]++]=fi;
 #endif
+  return cost;
 }
 
-/* only counts bits */
-static int tokencost(CP_INSTANCE *cpi, int huff, int coeff, int token){
-  huff += acoffset[coeff];
-  return cpi->HuffCodeLengthArray_VP3x[huff][token] + cpi->ExtraBitLengths_VP3x[token];
-}
-
 static int tokenize_dctcost(CP_INSTANCE *cpi,int chroma,
 			     int coeff, int coeff2, int val){
   int huff = cpi->huffchoice[cpi->FrameType!=KEY_FRAME][1][chroma];
@@ -462,10 +468,11 @@
    simply assume there will be a nonzero DC value and code.  That's
    not a true assumption but it can be fixed-up as DC is tokenized
    later */
-void dct_tokenize_AC(CP_INSTANCE *cpi, int fi, 
+int dct_tokenize_AC(CP_INSTANCE *cpi, int fi, 
 		     ogg_int16_t *dct, ogg_int16_t *dequant, ogg_int16_t *origdct, 
 		     int chroma, token_checkpoint_t **stack){
   int coeff = 1; /* skip DC for now */
+  int retcost = 0;
   while(coeff < BLOCK_SIZE){
     int i = coeff;
     int ret;
@@ -474,45 +481,48 @@
     
     if ( i == BLOCK_SIZE ){
       
-      tokenize_mark_run(cpi,chroma,fi,coeff>1,coeff,stack);
+      retcost += tokenize_mark_run(cpi,chroma,fi,coeff>1,coeff,stack);
       coeff = BLOCK_SIZE;
     }else{
 
       /* determine costs for encoding this value (and any preceeding
 	 eobrun/zerorun) as well as the cost for encoding a demoted token */
-      int cost = tokenize_dctcost(cpi,chroma,coeff,i,dct[i]);
+      int costA = tokenize_dctcost(cpi,chroma,coeff,i,dct[i]),costB;
+      int costD = costA;
       int dval = (dct[i]>0 ? dct[i]-1 : dct[i]+1);
       int j=i;
       if(dval){
 	/* demoting will not produce a zero. */
-	cost -= tokenize_dctcost(cpi,chroma,coeff,i,dval);
+	costD -= costB = tokenize_dctcost(cpi,chroma,coeff,i,dval);
       }else{
 	/* demoting token will produce a zero. */
 	j=i+1;
+	costB = 0;
 	while((j < BLOCK_SIZE) && !dct[j] ) j++;
 	if(j==BLOCK_SIZE){
-	  cost += tokenize_eobcost(cpi,chroma,i+1);
-	  cost -= tokenize_eobcost(cpi,chroma,coeff);
+	  costD += tokenize_eobcost(cpi,chroma,i+1);
+	  costD -= tokenize_eobcost(cpi,chroma,coeff);
 	}else{
-	  cost += tokenize_dctcost(cpi,chroma,i+1,j,dct[j]);
-	  cost -= tokenize_dctcost(cpi,chroma,coeff,j,dct[j]);
+	  costD += tokenize_dctcost(cpi,chroma,i+1,j,dct[j]);
+	  costD -= tokenize_dctcost(cpi,chroma,coeff,j,dct[j]);
 	}
       }
 
-      if(cost>0){
+      if(costD>0){
 	/* demoting results in a cheaper token cost.  Is the bit savings worth the added distortion? */
 	int ii = dezigzag_index[i];
 	int od = dct[i]*dequant[i] - origdct[ii];
 	int dd = dval*dequant[i] - origdct[ii];
 	int delta = dd*dd - od*od;
 
-	if(delta < cost*cpi->token_lambda){
+	if(delta < costD*cpi->token_lambda){
 	  /* we have a winner.  Demote token */
 	  dct[i]=dval;
+	  costA=costB;
 
 	  if(dval==0){
 	    if(j==BLOCK_SIZE){
-	      tokenize_mark_run(cpi,chroma,fi,coeff>1,coeff,stack);
+	      retcost += tokenize_mark_run(cpi,chroma,fi,coeff>1,coeff,stack);
 	      coeff = BLOCK_SIZE;
 	      break;
 	    }else{
@@ -522,6 +532,7 @@
 	  }
 	}
       }
+      retcost+=costA;
 	
       ret = tokenize_dctval(cpi, chroma, fi, coeff, i, dct[i], stack);
       if(!ret)
@@ -530,6 +541,7 @@
 
     }
   }
+  return retcost;
 }
 
 /* called after AC tokenization is complete, because DC coding has to
@@ -547,7 +559,7 @@
     int val = cpi->frag_dc[fi];
     int token1 = cpi->dct_token[1][*idx1];
     int eb1 = cpi->dct_token_eb[1][*idx1];
-    
+
     if(!*run1) *run1 = decode_eob_token(token1, eb1);
     
     if(val){

Modified: branches/theora-thusnelda/lib/enc/encode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encode.c	2008-12-03 00:21:45 UTC (rev 15554)
+++ branches/theora-thusnelda/lib/enc/encode.c	2008-12-03 10:34:41 UTC (rev 15555)
@@ -20,24 +20,18 @@
 #include "codec_internal.h"
 #include "encoder_lookup.h"
 
-static int predict_frag(CP_INSTANCE  *cpi,
-			int wpc,
-			int fi,
-			int fi_down,
-			int fixup){
+static int predict_frag(int wpc,
+			ogg_int16_t *dc,
+			ogg_int16_t *down,
+			int *last){
   
-  ogg_int16_t   *dc = cpi->frag_dc;
-
-  if(fixup>=0)
-    dc[fixup] -= dc[fi];
-  
   if(wpc){
     ogg_int16_t DC = 0;
     
-    if(wpc&0x1) DC += pc[wpc][0]*dc[fi-1];
-    if(wpc&0x2) DC += pc[wpc][1]*dc[fi_down-1];
-    if(wpc&0x4) DC += pc[wpc][2]*dc[fi_down];
-    if(wpc&0x8) DC += pc[wpc][3]*dc[fi_down+1];
+    if(wpc&0x1) DC += pc[wpc][0]* *(dc-1);
+    if(wpc&0x2) DC += pc[wpc][1]* *(down-1);
+    if(wpc&0x4) DC += pc[wpc][2]* *(down);
+    if(wpc&0x8) DC += pc[wpc][3]* *(down+1);
     
     /* if we need to do a shift */
     if(pc[wpc][4]) {
@@ -49,41 +43,49 @@
     
     /* check for outranging on the two predictors that can outrange */
     if((wpc&(PU|PUL|PL)) == (PU|PUL|PL)){
-      if( abs(DC - dc[fi_down]) > 128) {
-	DC = dc[fi_down];
-      } else if( abs(DC - dc[fi-1]) > 128) {
-	DC = dc[fi-1];
-      } else if( abs(DC - dc[fi_down-1]) > 128) {
-	DC = dc[fi_down-1];
+      if( abs(DC - *down) > 128) {
+	DC = *down;
+      } else if( abs(DC - *(dc-1)) > 128) {
+	DC = *(dc-1);
+      } else if( abs(DC - *(down-1)) > 128) {
+	DC = *(down-1);
       }
     }
     
-    dc[fi] -= DC;
-    return -1;
+    *last = *dc;
+    return *dc - DC;
   }else{
-    return fi;
+    int ret = *dc - *last;
+    *last = *dc;
+    return ret;
   }
 }
 
 static void PredictDC(CP_INSTANCE *cpi){
   ogg_int32_t pi;
-  int fixup[3];  /* last value used for given frame */
-  int y,x,fi = cpi->frag_total-1;
+  int last[3];  /* last value used for given frame */
+  int y,x,fi = 0;
   unsigned char *cp = cpi->frag_coded;
 
   /* for y,u,v; handles arbitrary plane subsampling arrangement.  Shouldn't need to be altered for 4:2:2 or 4:4:4 */
-  for (pi=2; pi>=0; pi--) {
+  for (pi=0; pi<3; pi++) {
     int v = cpi->frag_v[pi];
     int h = cpi->frag_h[pi];
     int subh = !(pi && cpi->info.pixelformat != OC_PF_444);
     int subv = !(pi && cpi->info.pixelformat == OC_PF_420);
+    ogg_int16_t dc[h];
+    ogg_int16_t down[h];
 
-    for(x=0;x<3;x++)fixup[x]=-1;
+    for(x=0;x<3;x++)last[x]=0;
 
-    for (y=v-1; y>=0 ; y--) {
+    for (y=0; y<v ; y++) {
       macroblock_t *mb_row = cpi->macro + (y>>subv)*cpi->macro_h;
       macroblock_t *mb_down = cpi->macro + ((y-1)>>subv)*cpi->macro_h;
-      for (x=h-1; x>=0; x--, fi--) {
+
+      memcpy(down,dc,sizeof(down));
+      memcpy(dc,cpi->frag_dc+fi,sizeof(dc));
+
+      for (x=0; x<h; x++, fi++) {
 	if(cp[fi]) {
 	  int wpc=0;
 	  int wf = Mode2Frame[mb_row[x>>subh].mode];
@@ -97,7 +99,7 @@
 	    if(x+1<h && cp[fi-h+1] && Mode2Frame[mb_down[(x+1)>>subh].mode] == wf) wpc|=8; /* down right */
 	  }
 
-	  fixup[wf]=predict_frag(cpi,wpc,fi,fi-h,fixup[wf]);
+	  cpi->frag_dc[fi]=predict_frag(wpc,dc+x,down+x,last+wf);
 	}
       }
     }
@@ -306,7 +308,11 @@
   }
 }
 
+#include <stdio.h>
 void EncodeData(CP_INSTANCE *cpi){
+  long modebits=0;
+  long mvbits=0;
+  long dctbits;
   long bits;
 
   PredictDC(cpi);
@@ -314,21 +320,88 @@
 
   /* Mode and MV data not needed for key frames. */
   if ( cpi->FrameType != KEY_FRAME ){
+    int prebits = oggpackB_bits(cpi->oggbuffer);
     PackModes(cpi);
-    bits = oggpackB_bits(cpi->oggbuffer);
+    modebits = oggpackB_bits(cpi->oggbuffer)-prebits;
+    prebits = oggpackB_bits(cpi->oggbuffer);
     PackMotionVectors (cpi);
-    bits = oggpackB_bits(cpi->oggbuffer);
+    mvbits = oggpackB_bits(cpi->oggbuffer)-prebits;
   }
 
   ChooseTokenTables(cpi);
+  {
+    int prebits = oggpackB_bits(cpi->oggbuffer);
+    EncodeTokenList(cpi);
+    dctbits = oggpackB_bits(cpi->oggbuffer)-prebits;
+  }
+  
+  bits = oggpackB_bits(cpi->oggbuffer);
+  ReconRefFrames(cpi);
+
 #ifdef COLLECT_METRICS
   ModeMetrics(cpi);
+  {
+    int total = cpi->frag_total*64;
+    int fi=0,pi,x,y;
+    ogg_int64_t ssd=0;
+    double minimize;
+
+    for(pi=0;pi<3;pi++){
+      int bi = cpi->frag_buffer_index[fi];
+      unsigned char *frame = cpi->frame+bi;
+      unsigned char *recon = cpi->lastrecon+bi;
+      int stride = cpi->stride[pi];
+      int h = cpi->frag_h[pi]*8;
+      int v = cpi->frag_v[pi]*8;
+      
+      for(y=0;y<v;y++){
+	int lssd=0;
+	for(x=0;x<h;x++)
+	  lssd += (frame[x]-recon[x])*(frame[x]-recon[x]);
+	ssd+=lssd;
+	frame+=stride;
+	recon+=stride;
+      }
+      fi+=cpi->frag_n[pi];
+    }
+
+    minimize = ssd + (float)bits*cpi->token_lambda*16;
+
+    fprintf(stdout,"%d %d %d %d %f %f %f %ld %ld %ld %ld %f %f  %.0f %.0f %.0f %.0f %.0f %.0f %.0f %.0f  %.0f %.0f %.0f %.0f %.0f %.0f %.0f %.0f  \n",
+	    (int)cpi->CurrentFrame, // 0
+	    cpi->BaseQ,             // 1
+	    cpi->token_lambda,      // 2
+	    cpi->skip_lambda,       // 3
+	    (double)cpi->rho_count[cpi->BaseQ]/total,           // 4
+	    (double)cpi->rho_postop/total,                      // 5
+	    (double)cpi->rho_postop/cpi->rho_count[cpi->BaseQ], // 6
+ 	    modebits,               // 7
+	    mvbits,                 // 8
+	    dctbits,                // 9
+	    oggpackB_bits(cpi->oggbuffer), // 10
+	    (double)ssd,              // 11
+	    (double)0,
+	    (double)cpi->dist_dist[0][0],//13
+	    (double)cpi->dist_dist[0][1],
+	    (double)cpi->dist_dist[0][2],
+	    (double)cpi->dist_dist[0][3],
+	    (double)cpi->dist_dist[0][4],
+	    (double)cpi->dist_dist[0][5],
+	    (double)cpi->dist_dist[0][6],
+	    (double)cpi->dist_dist[0][7],
+	    (double)(cpi->dist_bits[0][0]>>7),//21
+	    (double)(cpi->dist_bits[0][1]>>7),
+	    (double)(cpi->dist_bits[0][2]>>7),
+	    (double)(cpi->dist_bits[0][3]>>7),
+	    (double)(cpi->dist_bits[0][4]>>7),
+	    (double)(cpi->dist_bits[0][5]>>7),
+	    (double)(cpi->dist_bits[0][6]>>7),
+	    (double)(cpi->dist_bits[0][7]>>7)
+	    
+
+	    );               
+  }
 #endif
-  EncodeTokenList(cpi);
-  bits = oggpackB_bits(cpi->oggbuffer);
-  
-  ReconRefFrames(cpi);
-
   dsp_restore_fpu (cpi->dsp);
 }
 

Modified: branches/theora-thusnelda/lib/enc/encoder_toplevel.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encoder_toplevel.c	2008-12-03 00:21:45 UTC (rev 15554)
+++ branches/theora-thusnelda/lib/enc/encoder_toplevel.c	2008-12-03 10:34:41 UTC (rev 15555)
@@ -33,10 +33,6 @@
   cpi->FrameType = KEY_FRAME;
   cpi->LastKeyFrame = 0;
 
-  /* code all blocks */
-  for(i=0;i<cpi->frag_total;i++)
-    cpi->frag_coded[i]=1;
-  
   /* mark as video frame */
   oggpackB_write(cpi->oggbuffer,0,1);
   
@@ -53,9 +49,6 @@
   oggpackB_reset(cpi->oggbuffer);
   cpi->FrameType = DELTA_FRAME;
 
-  for ( i = 0; i < cpi->frag_total; i++ ) 
-    cpi->frag_coded[i] = 1; /* TEMPORARY */
-  
   /* mark as video frame */
   oggpackB_write(cpi->oggbuffer,0,1);
 
@@ -67,10 +60,6 @@
     cpi->FrameType = KEY_FRAME;
     cpi->LastKeyFrame = 0;
 
-    /* code all blocks */
-    for(i=0;i<cpi->frag_total;i++)
-      cpi->frag_coded[i]=1;
-  
     /* mark as video frame */
     oggpackB_write(cpi->oggbuffer,0,1);
     
@@ -130,8 +119,8 @@
   cpi->BaseQ = c->quality;
 
   /* temporary while the RD code is only partially complete */
-  cpi->skip_lambda=24;
-  cpi->token_lambda=24;
+  cpi->skip_lambda=50;
+  cpi->token_lambda=50;
   cpi->mv_lambda=0;
 
   /* Set encoder flags. */
@@ -280,7 +269,7 @@
   if(last_p){
     cpi->doneflag=1;
 #ifdef COLLECT_METRICS
-    DumpMetrics(cpi);
+    //DumpMetrics(cpi);
 #endif
   }
   return 1;

Modified: branches/theora-thusnelda/lib/enc/mode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/mode.c	2008-12-03 00:21:45 UTC (rev 15554)
+++ branches/theora-thusnelda/lib/enc/mode.c	2008-12-03 10:34:41 UTC (rev 15555)
@@ -71,6 +71,7 @@
 
 */
 
+#include<stdio.h>
 void oc_mode_scheme_chooser_init(CP_INSTANCE *cpi){
   oc_mode_scheme_chooser *chooser = &cpi->chooser;
   int i;
@@ -298,14 +299,13 @@
 }
 
 static int cost_intra(CP_INSTANCE *cpi, int qi, int mbi, ogg_uint32_t *intrabits, int *overhead){
-  unsigned char *cp = cpi->frag_coded;
   macroblock_t *mb = &cpi->macro[mbi];
   int i,j;
   int cost = 0;
   for(i=0;i<3;i++){
     for(j=0;j<4;j++){
       int fi=mb->Ryuv[i][j];
-      if(cp[fi]){
+      if(fi<cpi->frag_total){
 	int sad = BIntraSAD(cpi,fi,i);
 	cost += BINMAP(mode_rate[qi][i][1],sad);
       }
@@ -318,14 +318,13 @@
 }
 
 static int cost_inter(CP_INSTANCE *cpi, int qi, int mbi, mv_t mv, int mode, int *overhead){
-  unsigned char *cp = cpi->frag_coded;
   macroblock_t *mb = &cpi->macro[mbi];
   int i,j;
   int cost = 0;
   for(i=0;i<3;i++){
     for(j=0;j<4;j++){
       int fi=mb->Ryuv[i][j];
-      if(cp[fi]){
+      if(fi<cpi->frag_total){
 	int sad = BInterSAD(cpi,fi,i,mode==CODE_USING_GOLDEN,mv);
 	cost += BINMAP(mode_rate[qi][i][0],sad);
       }
@@ -336,18 +335,17 @@
 }
 
 static int cost_inter_nomv(CP_INSTANCE *cpi, int qi, int mbi, int *overhead){
-  unsigned char *cp = cpi->frag_coded;
   macroblock_t *mb = &cpi->macro[mbi];
   int i,j;
   int cost = 0;
   for(i=0;i<3;i++){
     for(j=0;j<4;j++){
       int fi=mb->Ryuv[i][j];
-      if(cp[fi]){
+      if(fi<cpi->frag_total){
 	int bi = cpi->frag_buffer_index[fi];
 	int stride = cpi->stride[i];  
 	int sad =  dsp_sad8x8 (cpi->dsp, cpi->frame+bi, cpi->lastrecon+bi, stride);
-
+      
 	if(i)sad<<=2;
 	cost += BINMAP(mode_rate[qi][i][0],sad);
       }
@@ -358,7 +356,6 @@
 }
 
 static int cost_inter1mv(CP_INSTANCE *cpi, int qi, int mbi, int golden, int *bits0, int *overhead){
-  unsigned char *cp = cpi->frag_coded;
   macroblock_t *mb = &cpi->macro[mbi];
   int i,j;
   int cost = 0;
@@ -366,7 +363,7 @@
   for(i=0;i<3;i++){
     for(j=0;j<4;j++){
       int fi=mb->Ryuv[i][j];
-      if(cp[fi]){
+      if(fi<cpi->frag_total){
 	int sad = BInterSAD(cpi,fi,i,golden,mb->analysis_mv[0][golden]);
 	cost += BINMAP(mode_rate[qi][i][0],sad);
       }
@@ -386,7 +383,6 @@
 
 static int cost_inter4mv(CP_INSTANCE *cpi, int qi, int mbi, int *bits0, int *bits1, int *overhead){
   int pf = cpi->info.pixelformat;
-  unsigned char *cp = cpi->frag_coded;
   macroblock_t *mb = &cpi->macro[mbi];
   int i,j;
   int cost = 0;
@@ -395,15 +391,14 @@
 
   for(j=0;j<4;j++){
     int fi=mb->Ryuv[0][j];
-    if(cp[fi]){
+    if(fi<cpi->frag_total){
       int sad = BInterSAD(cpi,fi,0,0,mb->mv[j]);
       cost += BINMAP(mode_rate[qi][0][0],sad);
-
+      
       *bits0 += 
 	MvBits[mb->mv[j].x + MAX_MV_EXTENT] + 
 	MvBits[mb->mv[j].y + MAX_MV_EXTENT];
       *bits1 += 12;
-      
     }
   }
   
@@ -420,10 +415,9 @@
       
       for(i=1;i<3;i++){
 	int fi=mb->Ryuv[i][0];
-	if(cp[fi]){
+	if(fi<cpi->frag_total){
 	  int sad = BInterSAD(cpi,fi,i,0,ch);
 	  cost += BINMAP(mode_rate[qi][i][0],sad);
-
 	}
       }
     }
@@ -446,7 +440,7 @@
       for(i=1;i<3;i++){
 	for(j=0;j<2;j++){
 	  int fi=mb->Ryuv[i][j];
-	  if(cp[fi]){
+	  if(fi<cpi->frag_total){
 	    int sad = BInterSAD(cpi,fi,i,0,mv[j]);
 	    cost += BINMAP(mode_rate[qi][i][0],sad);
 	  }
@@ -459,7 +453,7 @@
     for(i=1;i<3;i++){
       for(j=0;j<4;j++){
 	int fi=mb->Ryuv[i][j];
-	if(cp[fi]){
+	if(fi<cpi->frag_total){
 	  int sad = BInterSAD(cpi,fi,i,0,mb->mv[j]);
 	  cost += BINMAP(mode_rate[qi][i][0],sad);
 	}
@@ -486,9 +480,10 @@
 }      
 
 typedef struct{
-  int uncoded_ssd;
-  int coded_ssd;
-  int sad_cost;
+  int uncoded_ac_ssd;
+  int coded_ac_ssd;
+  int ac_cost;
+  int dc_flag;
 } rd_metric_t;
 
 typedef struct{
@@ -547,11 +542,14 @@
   int nonzero=63;
   ogg_int16_t *dequant = ps->re_q[mode != CODE_INTRA][ps->plane];
   int uncoded_ssd=0,coded_ssd=0,coded_partial_ssd=0,sad=0;
+  int uncoded_dc=0,coded_dc=0,dc_flag=0;
   int lambda = cpi->skip_lambda;
   token_checkpoint_t *checkpoint=*stack;
-  int sad_cost=0;
+  int sad_cost=0,cost;
   int i;
 
+  cpi->frag_coded[fi]=1;
+
   /* motion comp */
   switch(mode){
   case CODE_INTER_PLUS_MV:
@@ -590,18 +588,6 @@
     break;
   }
 
-  if(!keyframe){
-    if(mode==CODE_INTER_NO_MV){
-      for(i=0;i<64;i++)
-	uncoded_ssd += data[i]*data[i];
-    }else{
-      dsp_sub8x8(cpi->dsp, frame_ptr, cpi->lastrecon+bi, buffer, stride);
-      for(i=0;i<64;i++)
-	uncoded_ssd += buffer[i]*buffer[i];
-    }
-  }
-  uncoded_ssd <<= 4; /* scale to match DCT domain */
-
   if(mode==CODE_INTRA){
     int acc=0;
     for(i=0;i<64;i++)
@@ -622,15 +608,31 @@
   cpi->frag_sad[fi]=sad;
 #endif
 
+  if(!keyframe){
+    if(mode==CODE_INTER_NO_MV){
+      for(i=0;i<64;i++){
+	uncoded_ssd += data[i]*data[i];
+	uncoded_dc += data[i];
+      }
+    }else{
+      dsp_sub8x8(cpi->dsp, frame_ptr, cpi->lastrecon+bi, buffer, stride);
+      for(i=0;i<64;i++){
+	uncoded_ssd += buffer[i]*buffer[i];
+	uncoded_dc += buffer[i];
+      }
+    }
+    uncoded_ssd*=ps->ssdmul;
+    uncoded_ssd <<= 4; /* scale to match DCT domain */
+    sad_cost = BINMAP(mode_rate[qi][ps->plane][mode==CODE_INTRA],sad);  
+  }
+
   /* transform */
   dsp_fdct_short(cpi->dsp, data, buffer);
 
   /* collect rho metrics, quantize */
   {
     int i;
-    int dcshift = (mode==CODE_INTRA?1:0); /* temporary hysteresis
-					     until DC opt is in */
-    quant_tables *qq = ps->qq[mode != CODE_INTRA];
+    //quant_tables *qq = ps->qq[mode != CODE_INTRA];
     
     {
       int d;
@@ -642,9 +644,9 @@
       //for(pos=64;pos>0;pos--)
       //if(val < qqq[pos-1])break;
       
-      rho_count[pos]++;
+      //rho_count[pos]++;
 
-      if((abs(buffer[0])<<dcshift)>=dequant[0]){
+      if(abs(buffer[0])>=dequant[0]){
 	int val = (((iq[0]>>15)*buffer[0]) + (1<<15) + (((iq[0]&0x7fff)*buffer[0])>>15)) >>16;
 	val = (val>511?511:(val<-511?-511:val));
 	
@@ -679,10 +681,14 @@
 	data[i] = val;
       }
     }
+
+    /* for undersampled planes */
+    coded_partial_ssd*=ps->ssdmul;
+
   }
+  cpi->frag_dc[fi] = data[0];
   
-  cpi->frag_dc[fi] = data[0];
-
+#if 0
   /* small performance short-circuit:
 
      Because roundoff error means that C2 preservation can't really be
@@ -692,32 +698,31 @@
      expect it to be... off... especially at low energies.
 
      If the partial_ssd indicates this block is not worth the bits by
-     some large margin, don't proceed / bother to get a more precise
+     some large margin, don't proceed or bother to get a more precise
      determination */
+
   if(!keyframe){
-    sad_cost = BINMAP(mode_rate[qi][ps->plane][mode==CODE_INTRA],sad);
-    if(sad_cost<0)sad_cost=0; /* some of the trained fits can return a negative cost for zero entropy */
 
-    /* for undersampled planes */
-    coded_partial_ssd*=ps->ssdmul;
-    uncoded_ssd*=ps->ssdmul;
+    /* Don't short circuit if there's a chance of coding a DC component */
+    if( (mode != CODE_INTRA && data[0]==0) ||
+	(mode == CODE_INTRA && abs( buffer[0] - (uncoded_dc>>1) + 4096 ) < (dequant[0]>>1))){
 
-    mo->uncoded_ssd+=uncoded_ssd;
-
-    /* the partial_ssd underreports distortion, so this comparison
-       will only yield false negatives, which are harmless */
-    if(uncoded_ssd <= coded_partial_ssd+coding_overhead*lambda+((sad_cost*lambda)>>OC_BIT_SCALE)){ 
-      /* SKIP */
-      
-      uncode_frag(cpi,fi,ps->plane);
-      mo->coded_ssd+=uncoded_ssd; /* We may still be coding the MB even if not this block */
-      return 0;
-
+      /* the partial_ssd underreports distortion, so this comparison
+	 will only yield false negatives, which are harmless */
+      if(uncoded_ssd <= coded_partial_ssd+coding_overhead*lambda+((sad_cost*lambda)>>OC_BIT_SCALE)){ 
+	/* SKIP */
+	
+	uncode_frag(cpi,fi,ps->plane);
+	mo->coded_ssd+=uncoded_ssd; /* We may still be coding the MB even if not this block */
+	return 0;
+	
+      }
     }
   }
+#endif
 
   /* tokenize */
-  dct_tokenize_AC(cpi, fi, data, dequant, buffer, fi>=cpi->frag_n[0], stack);
+  cost = dct_tokenize_AC(cpi, fi, data, dequant, buffer, fi>=cpi->frag_n[0], stack);
   
   /* reconstruct */
   while(!data[nonzero] && --nonzero);
@@ -746,33 +751,45 @@
        which to do so.*/
     /* for now, straight up SSD */
     dsp_sub8x8(cpi->dsp, frame_ptr, thisrecon, buffer, stride);    
-    for(i=0;i<64;i++)
+    for(i=0;i<64;i++){
       coded_ssd += buffer[i]*buffer[i];
+      coded_dc += buffer[i];
+    }
     coded_ssd <<= 4; /* scale to match DCT domain */
+    coded_ssd*=ps->ssdmul; /* for undersampled planes */
+    
+    /* We actually only want the AC contribution to the SSDs */
+    uncoded_ssd -= ((uncoded_dc*uncoded_dc)>>2);
+    coded_ssd -= ((coded_dc*coded_dc)>>2);
+    mo->uncoded_ac_ssd+=uncoded_ssd;  
 
-    /* for undersampled planes */
-    coded_ssd*=ps->ssdmul;
-    
-    if(uncoded_ssd <= coded_ssd+coding_overhead*lambda+((sad_cost*lambda)>>OC_BIT_SCALE)){ 
+    /* DC is a special visual case; if there's more than a
+       half-quantizer improvement in the effective DC component, code
+       the block */
+    if( abs(uncoded_dc)-abs(coded_dc) > dequant[0]){
+      mo->dc_flag = dc_flag = 1;
+    }
+       
+    if(!dc_flag && uncoded_ssd <= coded_ssd+(coding_overhead+cost)*lambda){
       /* Hm, not worth it.  roll back */
       tokenlog_rollback(cpi, checkpoint, (*stack)-checkpoint);
       *stack = checkpoint;
       uncode_frag(cpi,fi,ps->plane);
-
-      mo->coded_ssd+=uncoded_ssd;
-
+      
+      mo->coded_ac_ssd+=uncoded_ssd;
+      
       return 0;
     }else{
-
-      mo->coded_ssd+=coded_ssd;
-      mo->sad_cost+=sad_cost;
-
+      
+      mo->coded_ac_ssd+=coded_ssd;
+      mo->ac_cost+=cost;
+      
     }
   }
-
+  
   //for(i=0;i<64;i++)
   //if(data[i]!=0)cpi->rho_postop++;
-    
+
   return 1;
 }
 
@@ -816,11 +833,11 @@
   
 
   if(cpi->FrameType != KEY_FRAME){
-    if(coded){
+    if(coded && !mo.dc_flag){
       /* block by block, still coding the MB.  Now consider the
 	 macroblock coding cost as a whole (mode and MV) */ 
-      int codecost = mo.sad_cost+(fr_cost4(&fr_checkpoint,fr)<<OC_BIT_SCALE)+mode_overhead;
-      if(mo.uncoded_ssd <= mo.coded_ssd+((cpi->skip_lambda*codecost)>>(OC_BIT_SCALE))){     
+      int codecost = mo.ac_cost+fr_cost4(&fr_checkpoint,fr)+(mode_overhead>>OC_BIT_SCALE);
+      if(mo.uncoded_ac_ssd <= mo.coded_ac_ssd+cpi->skip_lambda*codecost){
 	
 	/* taking macroblock overhead into account, it is not worth coding this MB */
 	tokenlog_rollback(cpi, stack, stackptr-stack);
@@ -884,7 +901,6 @@
   int pf = cpi->info.pixelformat;
   int i;
   int coded = 0;
-  unsigned char *cp=cpi->frag_coded;
   rd_metric_t mo;
   token_checkpoint_t stack[64*2]; /* worst case token usage for 1 fragment*/
   memset(&mo,0,sizeof(mo));
@@ -892,12 +908,13 @@
   for(i=0;i<16;i++){
     int fi = sb->f[i];
     int mb_phase;
-    if(cp[fi]){
+
+    if(fi<cpi->frag_total){
       token_checkpoint_t *stackptr = stack;
       macroblock_t *mb = &cpi->macro[sb->m[i]];
       mv_t mv;
       if(mb->mode == CODE_INTER_FOURMV){
-
+	
 	switch(pf){
 	case OC_PF_420:
 	  /* sixteen blocks/macroblocks per chroma superblock */
@@ -908,7 +925,7 @@
 	  mv.x = ( mv.x >= 0 ? (mv.x + 2) / 4 : (mv.x - 2) / 4);
 	  mv.y = ( mv.y >= 0 ? (mv.y + 2) / 4 : (mv.y - 2) / 4);
 	  break;
-
+	  
 	case OC_PF_422:
 	  /* sixteen blocks / eight macroblocks per chroma superblock */
 	  mb_phase = macroblock_phase_422[i];
@@ -925,7 +942,7 @@
 	}
       }else
 	mv = mb->mv[0];
-	
+      
       if(TQB(cpi,ps,mb->mode,fi,mv,fr_cost1(fr),&mo,rc,&stackptr)){
 	fr_codeblock(cpi,fr);
 	tokenlog_commit(cpi, stack, stackptr-stack);
@@ -1167,11 +1184,10 @@
     fr_finishsb(cpi,&fr);
   }
 
-  for(i=1;i<65;i++)
-    rho_count[i]+=rho_count[i-1];
+  //for(i=1;i<65;i++)
+  //rho_count[i]+=rho_count[i-1];
 
-  memcpy(cpi->rho_count,rho_count,sizeof(rho_count));
-
+  //memcpy(cpi->rho_count,rho_count,sizeof(rho_count));
   if(cpi->FrameType != KEY_FRAME){
     
     if(interbits>intrabits) return 1; /* short circuit */