[xiph-commits] r15556 - in branches/theora-thusnelda/lib: . enc enc/x86_64

Thu Dec 4 21:40:38 PST 2008

Author: xiphmont
Date: 2008-12-04 21:40:36 -0800 (Thu, 04 Dec 2008)
New Revision: 15556

Modified:
   branches/theora-thusnelda/lib/Makefile.am
   branches/theora-thusnelda/lib/enc/codec_internal.h
   branches/theora-thusnelda/lib/enc/dct_encode.c
   branches/theora-thusnelda/lib/enc/dsp.c
   branches/theora-thusnelda/lib/enc/dsp.h
   branches/theora-thusnelda/lib/enc/encoder_idct.c
   branches/theora-thusnelda/lib/enc/encoder_quant.c
   branches/theora-thusnelda/lib/enc/encoder_toplevel.c
   branches/theora-thusnelda/lib/enc/frinit.c
   branches/theora-thusnelda/lib/enc/mcenc.c
   branches/theora-thusnelda/lib/enc/mode.c
   branches/theora-thusnelda/lib/enc/reconstruct.c
   branches/theora-thusnelda/lib/enc/x86_64/dsp_mmx.c
   branches/theora-thusnelda/lib/enc/x86_64/dsp_mmxext.c
   branches/theora-thusnelda/lib/enc/x86_64/fdct_mmx.c
   branches/theora-thusnelda/lib/enc/x86_64/idct_mmx.c
   branches/theora-thusnelda/lib/enc/x86_64/recon_mmx.c
   branches/theora-thusnelda/lib/internal.h
Log:
A rousing round of functional and build cleanup, some const-ing, some more 
skip block refinement.



Modified: branches/theora-thusnelda/lib/Makefile.am
===================================================================

--- branches/theora-thusnelda/lib/Makefile.am	2008-12-03 10:34:41 UTC (rev 15555)
+++ branches/theora-thusnelda/lib/Makefile.am	2008-12-05 05:40:36 UTC (rev 15556)
@@ -3,17 +3,12 @@
 LIBADD = $(OGG_LIBS) 
 
 EXTRA_DIST = \
-        enc/x86_32/dct_decode_mmx.c \
-        enc/x86_32/dsp_mmx.c \
-        enc/x86_32/dsp_mmxext.c \
-        enc/x86_32/recon_mmx.c \
-        enc/x86_32/fdct_mmx.c \
-        enc/x86_32/idct_mmx.c \
-        enc/x86_64/dsp_mmx.c \
-        enc/x86_64/dsp_mmxext.c \
-        enc/x86_64/recon_mmx.c \
-        enc/x86_64/fdct_mmx.c \
-        enc/x86_64/idct_mmx.c \
+        enc/x86/dct_decode_mmx.c \
+        enc/x86/dsp_mmx.c \
+        enc/x86/dsp_mmxext.c \
+        enc/x86/recon_mmx.c \
+        enc/x86/fdct_mmx.c \
+        enc/x86/idct_mmx.c \
         enc/x86_32_vs/dsp_mmx.c \
         enc/x86_32_vs/fdct_mmx.c \
         enc/x86_32_vs/recon_mmx.c \
@@ -43,25 +38,19 @@
 	enc/reconstruct.c \
 	enc/dsp.c
 
+encoder_x86_sources = \
+	enc/x86/dct_decode_mmx.c \
+	enc/x86/dsp_mmx.c \
+	enc/x86/dsp_mmxext.c \
+	enc/x86/recon_mmx.c \
+	enc/x86/idct_mmx.c \
+	enc/x86/fdct_mmx.c
+
 if CPU_x86_64
-enc_arch_dir = enc/x86_64
-encoder_arch_sources= \
-	$(enc_arch_dir)/dct_decode_mmx.c \
-	$(enc_arch_dir)/dsp_mmx.c \
-	$(enc_arch_dir)/dsp_mmxext.c \
-	$(enc_arch_dir)/recon_mmx.c \
-	$(enc_arch_dir)/idct_mmx.c \
-	$(enc_arch_dir)/fdct_mmx.c
+encoder_arch_sources = $(encoder_x86_sources)
 else
 if CPU_x86_32
-enc_arch_dir = enc/x86_32
-encoder_arch_sources= \
-	$(enc_arch_dir)/dct_decode_mmx.c \
-	$(enc_arch_dir)/dsp_mmx.c \
-	$(enc_arch_dir)/dsp_mmxext.c \
-	$(enc_arch_dir)/recon_mmx.c \
-	$(enc_arch_dir)/idct_mmx.c \
-	$(enc_arch_dir)/fdct_mmx.c
+encoder_arch_sources = $(encoder_x86_sources)
 endif
 endif
 

Modified: branches/theora-thusnelda/lib/enc/codec_internal.h
===================================================================
--- branches/theora-thusnelda/lib/enc/codec_internal.h	2008-12-03 10:34:41 UTC (rev 15555)
+++ branches/theora-thusnelda/lib/enc/codec_internal.h	2008-12-05 05:40:36 UTC (rev 15556)
@@ -318,8 +318,8 @@
 
 #define clamp255(x) ((unsigned char)((((x)<0)-1) & ((x) | -((x)>255))))
 
-extern void IDct1( ogg_int16_t *InputData,
-                   ogg_int16_t *QuantMatrix,
+extern void IDct1( const ogg_int16_t *InputData,
+                   const ogg_int16_t *QuantMatrix,
                    ogg_int16_t *OutputData );
 
 extern void ReconRefFrames (CP_INSTANCE *cpi);
@@ -345,12 +345,12 @@
 			      int n);
 extern void dct_tokenize_init (CP_INSTANCE *cpi);
 extern int dct_tokenize_AC (CP_INSTANCE *cpi, 
-			     int fi, 
-			     ogg_int16_t *dct, 
-			     ogg_int16_t *dequant, 
-			     ogg_int16_t *origdct, 
-			     int chroma, 
-			     token_checkpoint_t **stack);
+			    const int fi, 
+			    ogg_int16_t *dct, 
+			    const ogg_int16_t *dequant, 
+			    const ogg_int16_t *origdct, 
+			    const int chroma, 
+			    token_checkpoint_t **stack);
 extern void dct_tokenize_finish (CP_INSTANCE *cpi);
 extern void dct_tokenize_mark_ac_chroma (CP_INSTANCE *cpi);
 

Modified: branches/theora-thusnelda/lib/enc/dct_encode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/dct_encode.c	2008-12-03 10:34:41 UTC (rev 15555)
+++ branches/theora-thusnelda/lib/enc/dct_encode.c	2008-12-05 05:40:36 UTC (rev 15556)
@@ -468,80 +468,72 @@
    simply assume there will be a nonzero DC value and code.  That's
    not a true assumption but it can be fixed-up as DC is tokenized
    later */
-int dct_tokenize_AC(CP_INSTANCE *cpi, int fi, 
-		     ogg_int16_t *dct, ogg_int16_t *dequant, ogg_int16_t *origdct, 
-		     int chroma, token_checkpoint_t **stack){
+int dct_tokenize_AC(CP_INSTANCE *cpi, const int fi, 
+		    ogg_int16_t *dct, const ogg_int16_t *dequant, 
+		    const ogg_int16_t *origdct, const int chroma, 
+		    token_checkpoint_t **stack){
   int coeff = 1; /* skip DC for now */
+  int i = coeff;
   int retcost = 0;
-  while(coeff < BLOCK_SIZE){
-    int i = coeff;
-    int ret;
 
-    while( !dct[i] && (++i < BLOCK_SIZE) );
+  while( !dct[i] && (++i < BLOCK_SIZE) );
     
-    if ( i == BLOCK_SIZE ){
-      
-      retcost += tokenize_mark_run(cpi,chroma,fi,coeff>1,coeff,stack);
-      coeff = BLOCK_SIZE;
-    }else{
+  while(i < BLOCK_SIZE){
+    int ret;
 
-      /* determine costs for encoding this value (and any preceeding
-	 eobrun/zerorun) as well as the cost for encoding a demoted token */
-      int costA = tokenize_dctcost(cpi,chroma,coeff,i,dct[i]),costB;
-      int costD = costA;
-      int dval = (dct[i]>0 ? dct[i]-1 : dct[i]+1);
-      int j=i;
-      if(dval){
-	/* demoting will not produce a zero. */
-	costD -= costB = tokenize_dctcost(cpi,chroma,coeff,i,dval);
+    /* determine costs for encoding this value (and any preceding
+       eobrun/zerorun) as well as the cost for encoding a demoted token */
+    int costA = tokenize_dctcost(cpi,chroma,coeff,i,dct[i]),costB;
+    int costD = costA;
+    int dval = (dct[i]>0 ? dct[i]-1 : dct[i]+1);
+    int j=i+1;
+    while((j < BLOCK_SIZE) && !dct[j] ) j++;
+
+    if(dval){
+      /* demoting will not produce a zero. */
+      costD -= costB = tokenize_dctcost(cpi,chroma,coeff,i,dval);
+    }else{
+      /* demoting token will produce a zero. */
+      costB = 0;
+      if(j==BLOCK_SIZE){
+	costD += tokenize_eobcost(cpi,chroma,i+1);
+	costD -= tokenize_eobcost(cpi,chroma,coeff);
       }else{
-	/* demoting token will produce a zero. */
-	j=i+1;
-	costB = 0;
-	while((j < BLOCK_SIZE) && !dct[j] ) j++;
-	if(j==BLOCK_SIZE){
-	  costD += tokenize_eobcost(cpi,chroma,i+1);
-	  costD -= tokenize_eobcost(cpi,chroma,coeff);
-	}else{
-	  costD += tokenize_dctcost(cpi,chroma,i+1,j,dct[j]);
-	  costD -= tokenize_dctcost(cpi,chroma,coeff,j,dct[j]);
-	}
+	costD += tokenize_dctcost(cpi,chroma,i+1,j,dct[j]);
+	costD -= tokenize_dctcost(cpi,chroma,coeff,j,dct[j]);
       }
+    }
 
-      if(costD>0){
-	/* demoting results in a cheaper token cost.  Is the bit savings worth the added distortion? */
-	int ii = dezigzag_index[i];
-	int od = dct[i]*dequant[i] - origdct[ii];
-	int dd = dval*dequant[i] - origdct[ii];
-	int delta = dd*dd - od*od;
-
-	if(delta < costD*cpi->token_lambda){
-	  /* we have a winner.  Demote token */
-	  dct[i]=dval;
-	  costA=costB;
-
-	  if(dval==0){
-	    if(j==BLOCK_SIZE){
-	      retcost += tokenize_mark_run(cpi,chroma,fi,coeff>1,coeff,stack);
-	      coeff = BLOCK_SIZE;
-	      break;
-	    }else{
-	      i=j;
-	      continue;
-	    }
-	  }
+    if(costD>0){
+      /* demoting results in a cheaper token cost.  Is the bit savings worth the added distortion? */
+      int ii = dezigzag_index[i];
+      int od = dct[i]*dequant[i] - origdct[ii];
+      int dd = dval*dequant[i] - origdct[ii];
+      int delta = dd*dd - od*od;
+      
+      if(delta < costD*cpi->token_lambda){
+	/* we have a winner.  Demote token */
+	dct[i]=dval;
+	costA=costB;
+	
+	if(dval==0){
+	  if(j==BLOCK_SIZE) break;
+	  i=j;
+	  continue;
 	}
       }
-      retcost+=costA;
-	
-      ret = tokenize_dctval(cpi, chroma, fi, coeff, i, dct[i], stack);
-      if(!ret)
-	tokenize_dctval(cpi, chroma, fi, i, i, dct[i], stack);
-      coeff=i+1;
-
     }
+
+    retcost+=costA;
+	
+    ret = tokenize_dctval(cpi, chroma, fi, coeff, i, dct[i], stack);
+    if(!ret)
+      tokenize_dctval(cpi, chroma, fi, i, i, dct[i], stack);
+    coeff=i+1;
+    i=j;
+    
   }
-  return retcost;
+  return retcost+tokenize_mark_run(cpi,chroma,fi,coeff>1,coeff,stack);
 }
 
 /* called after AC tokenization is complete, because DC coding has to

Modified: branches/theora-thusnelda/lib/enc/dsp.c
===================================================================
--- branches/theora-thusnelda/lib/enc/dsp.c	2008-12-03 10:34:41 UTC (rev 15555)
+++ branches/theora-thusnelda/lib/enc/dsp.c	2008-12-05 05:40:36 UTC (rev 15556)
@@ -43,7 +43,7 @@
   memset(ptr,val,8);
 }
 
-static void sub8x8__c (unsigned char *FiltPtr, unsigned char *ReconPtr,
+static void sub8x8__c (const unsigned char *FiltPtr, const unsigned char *ReconPtr,
 		       ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine){
   int i;
 
@@ -65,7 +65,7 @@
   }
 }
 
-static void sub8x8_128__c (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
+static void sub8x8_128__c (const unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
 			   ogg_uint32_t PixelsPerLine) {
   int i;
   /* For each block row */
@@ -89,99 +89,8 @@
   }
 }
 
-static void sub8x8avg2__c (unsigned char *FiltPtr, unsigned char *ReconPtr1,
-			   unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
-			   ogg_uint32_t PixelsPerLine) {
-
-  int i;
-
-  /* For each block row */
-  for (i=8; i; i--) {
-    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], DSP_OP_AVG (ReconPtr1[0], ReconPtr2[0]));
-    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], DSP_OP_AVG (ReconPtr1[1], ReconPtr2[1]));
-    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], DSP_OP_AVG (ReconPtr1[2], ReconPtr2[2]));
-    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], DSP_OP_AVG (ReconPtr1[3], ReconPtr2[3]));
-    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], DSP_OP_AVG (ReconPtr1[4], ReconPtr2[4]));
-    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], DSP_OP_AVG (ReconPtr1[5], ReconPtr2[5]));
-    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], DSP_OP_AVG (ReconPtr1[6], ReconPtr2[6]));
-    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], DSP_OP_AVG (ReconPtr1[7], ReconPtr2[7]));
-
-    /* Start next row */
-    FiltPtr += PixelsPerLine;
-    ReconPtr1 += PixelsPerLine;
-    ReconPtr2 += PixelsPerLine;
-    DctInputPtr += 8;
-  }
-}
-
-static ogg_uint32_t row_sad8__c (unsigned char *Src1, unsigned char *Src2)
-{
-  ogg_uint32_t SadValue;
-  ogg_uint32_t SadValue1;
-
-  SadValue    = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) + 
-                DSP_OP_ABS_DIFF (Src1[1], Src2[1]) +
-                DSP_OP_ABS_DIFF (Src1[2], Src2[2]) +
-                DSP_OP_ABS_DIFF (Src1[3], Src2[3]);
-
-  SadValue1   = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) + 
-                DSP_OP_ABS_DIFF (Src1[5], Src2[5]) +
-                DSP_OP_ABS_DIFF (Src1[6], Src2[6]) +
-                DSP_OP_ABS_DIFF (Src1[7], Src2[7]);
-
-  SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1;
-
-  return SadValue;
-}
-
-static ogg_uint32_t col_sad8x8__c (unsigned char *Src1, unsigned char *Src2,
-				   ogg_uint32_t stride)
-{
-  ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0};
-  ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0};
-  ogg_uint32_t MaxSad = 0;
-  ogg_uint32_t i;
-
-  for ( i = 0; i < 4; i++ ){
-    SadValue[0] += abs(Src1[0] - Src2[0]);
-    SadValue[1] += abs(Src1[1] - Src2[1]);
-    SadValue[2] += abs(Src1[2] - Src2[2]);
-    SadValue[3] += abs(Src1[3] - Src2[3]);
-    SadValue[4] += abs(Src1[4] - Src2[4]);
-    SadValue[5] += abs(Src1[5] - Src2[5]);
-    SadValue[6] += abs(Src1[6] - Src2[6]);
-    SadValue[7] += abs(Src1[7] - Src2[7]);
-    
-    Src1 += stride;
-    Src2 += stride;
-  }
-
-  for ( i = 0; i < 4; i++ ){
-    SadValue2[0] += abs(Src1[0] - Src2[0]);
-    SadValue2[1] += abs(Src1[1] - Src2[1]);
-    SadValue2[2] += abs(Src1[2] - Src2[2]);
-    SadValue2[3] += abs(Src1[3] - Src2[3]);
-    SadValue2[4] += abs(Src1[4] - Src2[4]);
-    SadValue2[5] += abs(Src1[5] - Src2[5]);
-    SadValue2[6] += abs(Src1[6] - Src2[6]);
-    SadValue2[7] += abs(Src1[7] - Src2[7]);
-    
-    Src1 += stride;
-    Src2 += stride;
-  }
-    
-  for ( i = 0; i < 8; i++ ){
-    if ( SadValue[i] > MaxSad )
-      MaxSad = SadValue[i];
-    if ( SadValue2[i] > MaxSad )
-      MaxSad = SadValue2[i];
-  }
-    
-  return MaxSad;
-}
-
-static ogg_uint32_t sad8x8__c (unsigned char *ptr1, 
-			       unsigned char *ptr2, 
+static ogg_uint32_t sad8x8__c (const unsigned char *ptr1, 
+			       const unsigned char *ptr2, 
 			       ogg_uint32_t stride)
 {
   ogg_uint32_t  i;
@@ -205,8 +114,8 @@
   return sad;
 }
 
-static ogg_uint32_t sad8x8_thres__c (unsigned char *ptr1, 
-				     unsigned char *ptr2, 
+static ogg_uint32_t sad8x8_thres__c (const unsigned char *ptr1, 
+				     const unsigned char *ptr2, 
 				     ogg_uint32_t stride, 
 				     ogg_uint32_t thres)
 {
@@ -234,9 +143,9 @@
   return sad;
 }
 
-static ogg_uint32_t sad8x8_xy2_thres__c (unsigned char *SrcData, 
-					 unsigned char *RefDataPtr1,
-					 unsigned char *RefDataPtr2, 
+static ogg_uint32_t sad8x8_xy2_thres__c (const unsigned char *SrcData, 
+					 const unsigned char *RefDataPtr1,
+					 const unsigned char *RefDataPtr2, 
 					 ogg_uint32_t Stride,
 					 ogg_uint32_t thres)
 {
@@ -265,143 +174,6 @@
   return sad;
 }
 
-static ogg_uint32_t intra8x8_err__c (unsigned char *DataPtr, ogg_uint32_t Stride)
-{
-  ogg_uint32_t  i;
-  ogg_uint32_t  XSum=0;
-  ogg_uint32_t  XXSum=0;
-
-  for (i=8; i; i--) {
-     /* Examine alternate pixel locations. */
-     XSum += DataPtr[0];
-     XXSum += DataPtr[0]*DataPtr[0];
-     XSum += DataPtr[1];
-     XXSum += DataPtr[1]*DataPtr[1];
-     XSum += DataPtr[2];
-     XXSum += DataPtr[2]*DataPtr[2];
-     XSum += DataPtr[3];
-     XXSum += DataPtr[3]*DataPtr[3];
-     XSum += DataPtr[4];
-     XXSum += DataPtr[4]*DataPtr[4];
-     XSum += DataPtr[5];
-     XXSum += DataPtr[5]*DataPtr[5];
-     XSum += DataPtr[6];
-     XXSum += DataPtr[6]*DataPtr[6];
-     XSum += DataPtr[7];
-     XXSum += DataPtr[7]*DataPtr[7];
-
-     /* Step to next row of block. */
-     DataPtr += Stride;
-   }
-
-   /* Compute population variance as mis-match metric. */
-   return (( (XXSum<<6) - XSum*XSum ) );
-}
-
-static ogg_uint32_t inter8x8_err__c (unsigned char *SrcData, 
-				     unsigned char *RefDataPtr, 
-				     ogg_uint32_t Stride)
-{
-  ogg_uint32_t  i;
-  ogg_uint32_t  XSum=0;
-  ogg_uint32_t  XXSum=0;
-  ogg_int32_t   DiffVal;
-
-  for (i=8; i; i--) {
-    DiffVal = DSP_OP_DIFF (SrcData[0], RefDataPtr[0]);
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF (SrcData[1], RefDataPtr[1]);
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF (SrcData[2], RefDataPtr[2]);
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF (SrcData[3], RefDataPtr[3]);
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-        
-    DiffVal = DSP_OP_DIFF (SrcData[4], RefDataPtr[4]);
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-        
-    DiffVal = DSP_OP_DIFF (SrcData[5], RefDataPtr[5]);
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-        
-    DiffVal = DSP_OP_DIFF (SrcData[6], RefDataPtr[6]);
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-        
-    DiffVal = DSP_OP_DIFF (SrcData[7], RefDataPtr[7]);
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-        
-    /* Step to next row of block. */
-    SrcData += Stride;
-    RefDataPtr += Stride;
-  }
-
-  /* Compute and return population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ));
-}
-
-static ogg_uint32_t inter8x8_err_xy2__c (unsigned char *SrcData, 
-					 unsigned char *RefDataPtr1,
-					 unsigned char *RefDataPtr2, 
-					 ogg_uint32_t Stride)
-{
-  ogg_uint32_t  i;
-  ogg_uint32_t  XSum=0;
-  ogg_uint32_t  XXSum=0;
-  ogg_int32_t   DiffVal;
-
-  for (i=8; i; i--) {
-    DiffVal = DSP_OP_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0]));
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1]));
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2]));
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3]));
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4]));
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5]));
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6]));
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7]));
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    /* Step to next row of block. */
-    SrcData += Stride;
-    RefDataPtr1 += Stride;
-    RefDataPtr2 += Stride;
-  }
-
-  /* Compute and return population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ));
-}
-
 static void nop (void) { /* NOP */ }
 
 void dsp_init(DspFunctions *funcs)
@@ -411,15 +183,9 @@
   funcs->set8x8 = set8x8__c;
   funcs->sub8x8 = sub8x8__c;
   funcs->sub8x8_128 = sub8x8_128__c;
-  funcs->sub8x8avg2 = sub8x8avg2__c;
-  funcs->row_sad8 = row_sad8__c;
-  funcs->col_sad8x8 = col_sad8x8__c;
   funcs->sad8x8 = sad8x8__c;
   funcs->sad8x8_thres = sad8x8_thres__c;
   funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__c;
-  funcs->intra8x8_err = intra8x8_err__c;
-  funcs->inter8x8_err = inter8x8_err__c;
-  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__c;
 }
 
 void dsp_static_init(DspFunctions *funcs)

Modified: branches/theora-thusnelda/lib/enc/dsp.h
===================================================================
--- branches/theora-thusnelda/lib/enc/dsp.h	2008-12-03 10:34:41 UTC (rev 15555)
+++ branches/theora-thusnelda/lib/enc/dsp.h	2008-12-05 05:40:36 UTC (rev 15556)
@@ -29,63 +29,49 @@
   void   (*set8x8)                (unsigned char val, unsigned char *ptr,
 				   ogg_uint32_t stride);
 
-  void   (*sub8x8)                (unsigned char *FiltPtr, unsigned char *ReconPtr,
+  void   (*sub8x8)                (const unsigned char *FiltPtr, const unsigned char *ReconPtr,
 				   ogg_int16_t *DctInputPtr, ogg_uint32_t stride);
 
-  void   (*sub8x8_128)            (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
+  void   (*sub8x8_128)            (const unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
 				   ogg_uint32_t stride);
 
-  void   (*sub8x8avg2)            (unsigned char *FiltPtr, unsigned char *ReconPtr1,
-				   unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
+  void   (*copy8x8)               (const unsigned char *src, unsigned char *dest, 
 				   ogg_uint32_t stride);
-
-  void   (*copy8x8)               (unsigned char *src, unsigned char *dest, 
-				   ogg_uint32_t stride);
   
-  void   (*copy8x8_half)          (unsigned char *src1, unsigned char *src2, 
+  void   (*copy8x8_half)          (const unsigned char *src1, const unsigned char *src2, 
 				   unsigned char *dest, ogg_uint32_t stride);
 
-  void   (*recon8x8)              (unsigned char *ReconPtr, ogg_int16_t *ChangePtr, 
+  void   (*recon8x8)              (unsigned char *ReconPtr, const ogg_int16_t *ChangePtr, 
 				   ogg_uint32_t stride);
 
-  void   (*fdct_short)            (ogg_int16_t *InputData, ogg_int16_t *OutputData);
+  void   (*fdct_short)            (const ogg_int16_t *InputData, ogg_int16_t *OutputData);
 
-  ogg_uint32_t (*row_sad8)        (unsigned char *Src1, unsigned char *Src2);
-
-  ogg_uint32_t (*col_sad8x8)      (unsigned char *Src1, unsigned char *Src2,
+  ogg_uint32_t (*sad8x8)          (const unsigned char *ptr1, const unsigned char *ptr2, 
 				   ogg_uint32_t stride);
-  
-  ogg_uint32_t (*sad8x8)          (unsigned char *ptr1, unsigned char *ptr2, 
-				   ogg_uint32_t stride);
 
-  ogg_uint32_t (*sad8x8_thres)    (unsigned char *ptr1, unsigned char *ptr2, 
+  ogg_uint32_t (*sad8x8_thres)    (const unsigned char *ptr1, const unsigned char *ptr2, 
 				   ogg_uint32_t stride, ogg_uint32_t thres);
 
-  ogg_uint32_t (*sad8x8_xy2_thres)(unsigned char *SrcData, unsigned char *RefDataPtr1,
-				   unsigned char *RefDataPtr2, ogg_uint32_t stride,
+  ogg_uint32_t (*sad8x8_xy2_thres)(const unsigned char *SrcData, const unsigned char *RefDataPtr1,
+				   const unsigned char *RefDataPtr2, ogg_uint32_t stride,
 				   ogg_uint32_t thres);
-  
-  ogg_uint32_t (*intra8x8_err)    (unsigned char *DataPtr, ogg_uint32_t stride);
-  
-  ogg_uint32_t (*inter8x8_err)    (unsigned char *SrcData, unsigned char *RefData, 
-				   ogg_uint32_t stride);
-
-  ogg_uint32_t (*inter8x8_err_xy2)(unsigned char *SrcData, unsigned char *RefDataPtr1,
-				   unsigned char *RefDataPtr2, ogg_uint32_t stride);
-               
+                 
   void (*LoopFilter)              (CP_INSTANCE *cpi, int FLimit);
 
   void (*FilterVert)              (unsigned char * PixelPtr,
 				   ogg_int32_t LineLength, ogg_int16_t *BoundingValuePtr);
   
-  void (*IDctSlow)                (ogg_int16_t *InputData, 
-				   ogg_int16_t *QuantMatrix, ogg_int16_t *OutputData);
+  void (*IDctSlow)                (const ogg_int16_t *InputData, 
+				   const ogg_int16_t *QuantMatrix, 
+				   ogg_int16_t *OutputData);
 
-  void (*IDct3)                   (ogg_int16_t *InputData, 
-				   ogg_int16_t *QuantMatrix, ogg_int16_t *OutputData);
+  void (*IDct3)                   (const ogg_int16_t *InputData, 
+				   const ogg_int16_t *QuantMatrix, 
+				   ogg_int16_t *OutputData);
   
-  void (*IDct10)                  (ogg_int16_t *InputData, 
-				   ogg_int16_t *QuantMatrix, ogg_int16_t *OutputData);
+  void (*IDct10)                  (const ogg_int16_t *InputData, 
+				   const ogg_int16_t *QuantMatrix, 
+				   ogg_int16_t *OutputData);
 } DspFunctions;
 
 extern void dsp_dct_init(DspFunctions *funcs, ogg_uint32_t cpu_flags);
@@ -114,8 +100,6 @@
 
 #define dsp_sub8x8_128(funcs,a1,a2,a3) (funcs.sub8x8_128 (a1,a2,a3))
 
-#define dsp_sub8x8avg2(funcs,a1,a2,a3,a4,a5) (funcs.sub8x8avg2 (a1,a2,a3,a4,a5))
-
 #define dsp_copy8x8(funcs,ptr1,ptr2,str1) (funcs.copy8x8 (ptr1,ptr2,str1))
 
 #define dsp_copy8x8_half(funcs,ptr1,ptr2,ptr3,str1) (funcs.copy8x8_half (ptr1,ptr2,ptr3,str1))
@@ -124,10 +108,6 @@
 
 #define dsp_fdct_short(funcs,in,out) (funcs.fdct_short (in,out))
 
-#define dsp_row_sad8(funcs,ptr1,ptr2) (funcs.row_sad8 (ptr1,ptr2))
-
-#define dsp_col_sad8x8(funcs,ptr1,ptr2,str1) (funcs.col_sad8x8 (ptr1,ptr2,str1))
-
 #define dsp_sad8x8(funcs,ptr1,ptr2,str) (funcs.sad8x8 (ptr1,ptr2,str))
 
 #define dsp_sad8x8_thres(funcs,ptr1,ptr2,str,t) (funcs.sad8x8_thres (ptr1,ptr2,str,t))
@@ -135,14 +115,6 @@
 #define dsp_sad8x8_xy2_thres(funcs,ptr1,ptr2,ptr3,str,t) \
   (funcs.sad8x8_xy2_thres (ptr1,ptr2,ptr3,str,t))
 
-#define dsp_intra8x8_err(funcs,ptr1,str1) (funcs.intra8x8_err (ptr1,str1))
-
-#define dsp_inter8x8_err(funcs,ptr1,ptr2,str) \
-  (funcs.inter8x8_err (ptr1,ptr2,str))
-
-#define dsp_inter8x8_err_xy2(funcs,ptr1,ptr2,ptr3,str) \
-  (funcs.inter8x8_err_xy2 (ptr1,ptr2,ptr3,str))
-
 #define dsp_LoopFilter(funcs, ptr1, i) \
   (funcs.LoopFilter(ptr1, i))
 

Modified: branches/theora-thusnelda/lib/enc/encoder_idct.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encoder_idct.c	2008-12-03 10:34:41 UTC (rev 15555)
+++ branches/theora-thusnelda/lib/enc/encoder_idct.c	2008-12-05 05:40:36 UTC (rev 15556)
@@ -111,9 +111,9 @@
 }
 */
 
-static void dequant_slow( ogg_int16_t * dequant_coeffs,
-                   ogg_int16_t * quantized_list,
-                   ogg_int32_t * DCT_block) {
+static void dequant_slow( const ogg_int16_t * dequant_coeffs,
+			  const ogg_int16_t * quantized_list,
+			  ogg_int32_t * DCT_block) {
   int i;
   for(i=0;i<64;i++)
     DCT_block[dezigzag_index[i]] = quantized_list[i] * dequant_coeffs[i];
@@ -121,9 +121,9 @@
 
 
 
-void IDctSlow__c(  ogg_int16_t * InputData,
-                ogg_int16_t *QuantMatrix,
-                ogg_int16_t * OutputData ) {
+void IDctSlow__c(  const ogg_int16_t * InputData,
+		   const ogg_int16_t *QuantMatrix,
+		   ogg_int16_t * OutputData ) {
   ogg_int32_t IntermediateData[64];
   ogg_int32_t * ip = IntermediateData;
   ogg_int16_t * op = OutputData;
@@ -340,9 +340,9 @@
   0  0  0  0  0  0  0  0
 *************************/
 
-static void dequant_slow10( ogg_int16_t * dequant_coeffs,
-                     ogg_int16_t * quantized_list,
-                     ogg_int32_t * DCT_block){
+static void dequant_slow10( const ogg_int16_t * dequant_coeffs,
+			    const ogg_int16_t * quantized_list,
+			    ogg_int32_t * DCT_block){
   int i;
   memset(DCT_block,0, 128);
   for(i=0;i<10;i++)
@@ -350,9 +350,9 @@
 
 }
 
-void IDct10__c( ogg_int16_t * InputData,
-             ogg_int16_t *QuantMatrix,
-             ogg_int16_t * OutputData ){
+void IDct10__c( const ogg_int16_t * InputData,
+		const ogg_int16_t *QuantMatrix,
+		ogg_int16_t * OutputData ){
   ogg_int32_t IntermediateData[64];
   ogg_int32_t * ip = IntermediateData;
   ogg_int16_t * op = OutputData;
@@ -542,18 +542,18 @@
   0   0   0  0  0  0  0  0
 **************************/
 
-void IDct1( ogg_int16_t * InputData,
-            ogg_int16_t *QuantMatrix,
+void IDct1( const ogg_int16_t * InputData,
+            const ogg_int16_t *QuantMatrix,
             ogg_int16_t * OutputData ){
   int loop;
 
   ogg_int16_t  OutD;
-
+  
   OutD=(ogg_int16_t) ((ogg_int32_t)(InputData[0]*QuantMatrix[0]+15)>>5);
-
+  
   for(loop=0;loop<64;loop++)
     OutputData[loop]=OutD;
-
+  
 }
 
 void dsp_idct_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)

Modified: branches/theora-thusnelda/lib/enc/encoder_quant.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encoder_quant.c	2008-12-03 10:34:41 UTC (rev 15555)
+++ branches/theora-thusnelda/lib/enc/encoder_quant.c	2008-12-05 05:40:36 UTC (rev 15556)
@@ -171,7 +171,7 @@
 	    q=((ogg_uint32_t)qinfo->ac_scale[qi]*base[ci]/100)<<2;
 	    q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
 	    cpi->quant_tables[qti][pli][zigzag_index[ci]][qi]=(ogg_uint16_t)q;
-	    cpi->iquant_tables[qti][pli][qi][ci]=(ogg_int32_t)(((1<<31))/q+1);
+	    cpi->iquant_tables[qti][pli][qi][zigzag_index[ci]]=(ogg_int32_t)(((1<<31))/q+1);
 	  }
 	  
 	  if(++qi>=qi_end)break;

Modified: branches/theora-thusnelda/lib/enc/encoder_toplevel.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encoder_toplevel.c	2008-12-03 10:34:41 UTC (rev 15555)
+++ branches/theora-thusnelda/lib/enc/encoder_toplevel.c	2008-12-05 05:40:36 UTC (rev 15556)
@@ -119,8 +119,8 @@
   cpi->BaseQ = c->quality;
 
   /* temporary while the RD code is only partially complete */
-  cpi->skip_lambda=50;
-  cpi->token_lambda=50;
+  cpi->skip_lambda=1000;
+  cpi->token_lambda=2000;
   cpi->mv_lambda=0;
 
   /* Set encoder flags. */

Modified: branches/theora-thusnelda/lib/enc/frinit.c
===================================================================
--- branches/theora-thusnelda/lib/enc/frinit.c	2008-12-03 10:34:41 UTC (rev 15555)
+++ branches/theora-thusnelda/lib/enc/frinit.c	2008-12-05 05:40:36 UTC (rev 15556)
@@ -112,13 +112,11 @@
   cpi->super_n[2] = cpi->super_h[2] * cpi->super_v[2];
   cpi->super_total = cpi->super_n[0] + cpi->super_n[1] + cpi->super_n[2];
 
-  /* +1; the last entry is the 'invalid' frag, which is always set to not coded as it doesn't really exist */
-  cpi->frag_coded = calloc(cpi->frag_total+1, sizeof(*cpi->frag_coded)); 
+  cpi->frag_coded = calloc(cpi->frag_total, sizeof(*cpi->frag_coded)); 
   cpi->frag_buffer_index = calloc(cpi->frag_total, sizeof(*cpi->frag_buffer_index));
   cpi->frag_dc = calloc(cpi->frag_total, sizeof(*cpi->frag_dc));
 
-  /* +1; the last entry is the 'invalid' mb, which contains only 'invalid' frags */
-  cpi->macro = calloc(cpi->macro_total+1, sizeof(*cpi->macro));
+  cpi->macro = calloc(cpi->macro_total, sizeof(*cpi->macro));
 
   cpi->super[0] = calloc(cpi->super_total, sizeof(**cpi->super));
   cpi->super[1] = cpi->super[0] + cpi->super_n[0];
@@ -138,8 +136,8 @@
 #ifdef COLLECT_METRICS
  {
    int i;
-   cpi->frag_mbi = _ogg_calloc(cpi->frag_total+1, sizeof(*cpi->frag_mbi));
-   cpi->frag_sad = _ogg_calloc(cpi->frag_total+1, sizeof(*cpi->frag_sad));
+   cpi->frag_mbi = _ogg_calloc(cpi->frag_total, sizeof(*cpi->frag_mbi));
+   cpi->frag_sad = _ogg_calloc(cpi->frag_total, sizeof(*cpi->frag_sad));
    cpi->dct_token_frag_storage = _ogg_malloc(cpi->stack_offset*BLOCK_SIZE*sizeof(*cpi->dct_token_frag_storage));
    cpi->dct_eob_fi_storage = _ogg_malloc(cpi->frag_total*BLOCK_SIZE*sizeof(*cpi->dct_eob_fi_storage));
  }
@@ -169,7 +167,7 @@
 	      int fragindex = frow*cpi->frag_h[plane] + fcol + offset;
 	      cpi->super[plane][superindex].f[frag] = fragindex;
 	    }else
-	      cpi->super[plane][superindex].f[frag] = cpi->frag_total; /* 'invalid' */
+	      cpi->super[plane][superindex].f[frag] = -1; /* 'invalid' */
 	  }
 	}
       }
@@ -189,7 +187,7 @@
 	    cpi->super[0][superindex].m[mb] = macroindex;
 	    cpi->macro[macroindex].ysb = superindex;
 	  }else
-	    cpi->super[0][superindex].m[mb] = cpi->macro_total;
+	    cpi->super[0][superindex].m[mb] = -1;
 	}
       }
     }
@@ -207,7 +205,7 @@
 	    cpi->super[1][superindex].m[mb] = macroindex;
 	    cpi->macro[macroindex].usb = superindex + cpi->super_n[0];
 	  }else
-	    cpi->super[1][superindex].m[mb] = cpi->macro_total;
+	    cpi->super[1][superindex].m[mb] = -1;
 	}
       }
     }
@@ -225,7 +223,7 @@
 	    cpi->super[2][superindex].m[mb] = macroindex;
 	    cpi->macro[macroindex].vsb = superindex + cpi->super_n[0] + cpi->super_n[1];
 	  }else
-	    cpi->super[2][superindex].m[mb] = cpi->macro_total;
+	    cpi->super[2][superindex].m[mb] = -1;
 	}
       }
     }
@@ -253,8 +251,8 @@
 	  int Rrow = baserow + ((frag>>1)&1);
 	  int Rcol = basecol + (frag&1);
 
-	  cpi->macro[macroindex].Hyuv[0][frag] = cpi->frag_total; // default
-	  cpi->macro[macroindex].Ryuv[0][frag] = cpi->frag_total; //default
+	  cpi->macro[macroindex].Hyuv[0][frag] = -1;
+	  cpi->macro[macroindex].Ryuv[0][frag] = -1;
 	  if(Hrow<cpi->frag_v[0] && Hcol<cpi->frag_h[0]){
 	    cpi->macro[macroindex].Hyuv[0][frag] = Hrow*cpi->frag_h[0] + Hcol;	    
 #ifdef COLLECT_METRICS
@@ -266,14 +264,14 @@
 	}
 
 	/* U */
-	cpi->macro[macroindex].Ryuv[1][0] = cpi->frag_total;
-	cpi->macro[macroindex].Ryuv[1][1] = cpi->frag_total;
-	cpi->macro[macroindex].Ryuv[1][2] = cpi->frag_total;
-	cpi->macro[macroindex].Ryuv[1][3] = cpi->frag_total;
-	cpi->macro[macroindex].Hyuv[1][0] = cpi->frag_total;
-	cpi->macro[macroindex].Hyuv[1][1] = cpi->frag_total;
-	cpi->macro[macroindex].Hyuv[1][2] = cpi->frag_total;
-	cpi->macro[macroindex].Hyuv[1][3] = cpi->frag_total;
+	cpi->macro[macroindex].Ryuv[1][0] = -1;
+	cpi->macro[macroindex].Ryuv[1][1] = -1;
+	cpi->macro[macroindex].Ryuv[1][2] = -1;
+	cpi->macro[macroindex].Ryuv[1][3] = -1;
+	cpi->macro[macroindex].Hyuv[1][0] = -1;
+	cpi->macro[macroindex].Hyuv[1][1] = -1;
+	cpi->macro[macroindex].Hyuv[1][2] = -1;
+	cpi->macro[macroindex].Hyuv[1][3] = -1;
 	if(row<cpi->frag_v[1] && col<cpi->frag_h[1]){
 	  cpi->macro[macroindex].Hyuv[1][0] = cpi->frag_n[0] + macroindex;
 	  cpi->macro[macroindex].Ryuv[1][0] = cpi->frag_n[0] + macroindex;
@@ -283,14 +281,14 @@
 	}
 	
 	/* V */
-	cpi->macro[macroindex].Ryuv[2][0] = cpi->frag_total;
-	cpi->macro[macroindex].Ryuv[2][1] = cpi->frag_total;
-	cpi->macro[macroindex].Ryuv[2][2] = cpi->frag_total;
-	cpi->macro[macroindex].Ryuv[2][3] = cpi->frag_total;
-	cpi->macro[macroindex].Hyuv[2][0] = cpi->frag_total;
-	cpi->macro[macroindex].Hyuv[2][1] = cpi->frag_total;
-	cpi->macro[macroindex].Hyuv[2][2] = cpi->frag_total;
-	cpi->macro[macroindex].Hyuv[2][3] = cpi->frag_total;
+	cpi->macro[macroindex].Ryuv[2][0] = -1;
+	cpi->macro[macroindex].Ryuv[2][1] = -1;
+	cpi->macro[macroindex].Ryuv[2][2] = -1;
+	cpi->macro[macroindex].Ryuv[2][3] = -1;
+	cpi->macro[macroindex].Hyuv[2][0] = -1;
+	cpi->macro[macroindex].Hyuv[2][1] = -1;
+	cpi->macro[macroindex].Hyuv[2][2] = -1;
+	cpi->macro[macroindex].Hyuv[2][3] = -1;
 	if(row<cpi->frag_v[2] && col<cpi->frag_h[2]){
 	  cpi->macro[macroindex].Hyuv[2][0] = cpi->frag_n[0] + cpi->frag_n[1] + macroindex;
 	  cpi->macro[macroindex].Ryuv[2][0] = cpi->frag_n[0] + cpi->frag_n[1] + macroindex;
@@ -374,21 +372,6 @@
     }
   }
 
-  /* fill in 'invalid' macroblock */
-  {
-    int p,f;
-    for(p=0;p<3;p++)
-      for(f=0;f<4;f++){
-	cpi->macro[cpi->macro_total].Ryuv[p][f] = cpi->frag_total;
-	cpi->macro[cpi->macro_total].Hyuv[p][f] = cpi->frag_total;
-      }
-    cpi->macro[cpi->macro_total].ncneighbors=0;
-    cpi->macro[cpi->macro_total].npneighbors=0;
-#ifdef COLLECT_METRICS
-    cpi->frag_mbi[cpi->frag_total] = cpi->macro_total;
-#endif
-  }
-
   /* allocate frames */
   cpi->frame = _ogg_calloc(cpi->frame_size,sizeof(*cpi->frame));
   cpi->lastrecon = _ogg_calloc(cpi->frame_size,sizeof(*cpi->lastrecon));

Modified: branches/theora-thusnelda/lib/enc/mcenc.c
===================================================================
--- branches/theora-thusnelda/lib/enc/mcenc.c	2008-12-03 10:34:41 UTC (rev 15555)
+++ branches/theora-thusnelda/lib/enc/mcenc.c	2008-12-05 05:40:36 UTC (rev 15556)
@@ -207,7 +207,6 @@
     const unsigned char *ref = (_goldenp ? cpi->golden : cpi->lastrecon) + base_offset;
     
     err+=  dsp_sad8x8_xy2_thres (cpi->dsp, cur, ref+_mvoffset0, ref+_mvoffset1, cpi->stride[0], _best_err-err);
-    //err+=  dsp_inter8x8_err_xy2 (cpi->dsp, cur, ref+_mvoffset0, ref+_mvoffset1, cpi->stride[0]);
 
   }
   
@@ -231,13 +230,12 @@
   err=0;
   for(bi=0;bi<4;bi++){
     int fi = mb->Ryuv[0][bi];
-    if(fi < cpi->frag_total){ /* last fragment is the 'invalid fragment' */
+    if(fi >= 0){ /* a negative index marks a fragment that does not exist */
       ogg_uint32_t base_offset = cpi->frag_buffer_index[fi];
       const unsigned char *cur = cpi->frame + base_offset;
       const unsigned char *ref = (_goldenp ? cpi->golden : cpi->lastrecon) + base_offset;
       
       _block_err[bi] = dsp_sad8x8_thres (cpi->dsp, cur, ref+mvoffset,stride,9999999); 
-      //_block_err[bi] = dsp_inter8x8_err (cpi->dsp, cur, ref+mvoffset,stride); 
 
       err += _block_err[bi];
     }
@@ -310,7 +308,7 @@
   int err;
   int fi = mb->Ryuv[0][_bi];
 
-  if(fi == cpi->frag_total) return _best_err;
+  if(fi < 0) return _best_err;
 
   mvoffset_base=_vec->x+_vec->y*stride;
   offset_y[0]=offset_y[1]=offset_y[2]=-stride;
@@ -342,7 +340,6 @@
     mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
 
     err=dsp_sad8x8_xy2_thres (cpi->dsp, cur, ref+mvoffset0, ref+mvoffset1, stride, _best_err);
-    //err=dsp_inter8x8_err_xy2 (cpi->dsp, cur, ref+mvoffset0, ref+mvoffset1, stride);
 
     if(err<_best_err){
       _best_err=err;

Modified: branches/theora-thusnelda/lib/enc/mode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/mode.c	2008-12-03 10:34:41 UTC (rev 15555)
+++ branches/theora-thusnelda/lib/enc/mode.c	2008-12-05 05:40:36 UTC (rev 15556)
@@ -71,7 +71,6 @@
 
 */
 
-#include<stdio.h>
 void oc_mode_scheme_chooser_init(CP_INSTANCE *cpi){
   oc_mode_scheme_chooser *chooser = &cpi->chooser;
   int i;
@@ -275,22 +274,27 @@
 static int BInterSAD(CP_INSTANCE *cpi, int fi, int plane, int goldenp, mv_t mv){
   int sad = 0;
   unsigned char *b = cpi->frame + cpi->frag_buffer_index[fi];
-  int qp = (plane>0);
-  int mx = mvmap[qp][mv.x+31];
-  int my = mvmap[qp][mv.y+31];
-  int mx2 = mvmap2[qp][mv.x+31];
-  int my2 = mvmap2[qp][mv.y+31];
-
-  int stride = cpi->stride[plane];
   unsigned char *r = (goldenp ? cpi->golden : cpi->lastrecon ) + 
-    cpi->frag_buffer_index[fi] + my * stride + mx;
-  
-  if(mx2 || my2){
-    unsigned char *r2 = r + my2 * stride + mx2;
-    sad =  dsp_sad8x8_xy2_thres (cpi->dsp, b, r, r2, stride, 9999999);
-  }else{
+    cpi->frag_buffer_index[fi];
+  int stride = cpi->stride[plane];
+
+  if(mv.x || mv.y){
+    int qp = (plane>0);
+    int mx = mvmap[qp][mv.x+31];
+    int my = mvmap[qp][mv.y+31];
+    int mx2 = mvmap2[qp][mv.x+31];
+    int my2 = mvmap2[qp][mv.y+31];
+    
+    r += my * stride + mx;
+    
+    if(mx2 || my2){
+      unsigned char *r2 = r + my2 * stride + mx2;
+      sad =  dsp_sad8x8_xy2_thres (cpi->dsp, b, r, r2, stride, 9999999);
+    }else{
+      sad =  dsp_sad8x8 (cpi->dsp, b, r, stride);
+    }
+  }else
     sad =  dsp_sad8x8 (cpi->dsp, b, r, stride);
-  }
 
   if(plane)
     return sad<<2;
@@ -305,7 +309,7 @@
   for(i=0;i<3;i++){
     for(j=0;j<4;j++){
       int fi=mb->Ryuv[i][j];
-      if(fi<cpi->frag_total){
+      if(fi>=0){
 	int sad = BIntraSAD(cpi,fi,i);
 	cost += BINMAP(mode_rate[qi][i][1],sad);
       }
@@ -324,7 +328,7 @@
   for(i=0;i<3;i++){
     for(j=0;j<4;j++){
       int fi=mb->Ryuv[i][j];
-      if(fi<cpi->frag_total){
+      if(fi>=0){
 	int sad = BInterSAD(cpi,fi,i,mode==CODE_USING_GOLDEN,mv);
 	cost += BINMAP(mode_rate[qi][i][0],sad);
       }
@@ -341,7 +345,7 @@
   for(i=0;i<3;i++){
     for(j=0;j<4;j++){
       int fi=mb->Ryuv[i][j];
-      if(fi<cpi->frag_total){
+      if(fi>=0){
 	int bi = cpi->frag_buffer_index[fi];
 	int stride = cpi->stride[i];  
 	int sad =  dsp_sad8x8 (cpi->dsp, cpi->frame+bi, cpi->lastrecon+bi, stride);
@@ -363,7 +367,7 @@
   for(i=0;i<3;i++){
     for(j=0;j<4;j++){
       int fi=mb->Ryuv[i][j];
-      if(fi<cpi->frag_total){
+      if(fi>=0){
 	int sad = BInterSAD(cpi,fi,i,golden,mb->analysis_mv[0][golden]);
 	cost += BINMAP(mode_rate[qi][i][0],sad);
       }
@@ -391,7 +395,7 @@
 
   for(j=0;j<4;j++){
     int fi=mb->Ryuv[0][j];
-    if(fi<cpi->frag_total){
+    if(fi>=0){
       int sad = BInterSAD(cpi,fi,0,0,mb->mv[j]);
       cost += BINMAP(mode_rate[qi][0][0],sad);
       
@@ -415,7 +419,7 @@
       
       for(i=1;i<3;i++){
 	int fi=mb->Ryuv[i][0];
-	if(fi<cpi->frag_total){
+	if(fi>=0){
 	  int sad = BInterSAD(cpi,fi,i,0,ch);
 	  cost += BINMAP(mode_rate[qi][i][0],sad);
 	}
@@ -440,7 +444,7 @@
       for(i=1;i<3;i++){
 	for(j=0;j<2;j++){
 	  int fi=mb->Ryuv[i][j];
-	  if(fi<cpi->frag_total){
+	  if(fi>=0){
 	    int sad = BInterSAD(cpi,fi,i,0,mv[j]);
 	    cost += BINMAP(mode_rate[qi][i][0],sad);
 	  }
@@ -453,7 +457,7 @@
     for(i=1;i<3;i++){
       for(j=0;j<4;j++){
 	int fi=mb->Ryuv[i][j];
-	if(fi<cpi->frag_total){
+	if(fi>=0){
 	  int sad = BInterSAD(cpi,fi,i,0,mb->mv[j]);
 	  cost += BINMAP(mode_rate[qi][i][0],sad);
 	}
@@ -527,29 +531,38 @@
 		int coding_overhead, rd_metric_t *mo, long *rho_count,
 		token_checkpoint_t **stack){
   
-  int keyframe = (cpi->FrameType == KEY_FRAME);
-  int qi = ps->qi;
-  ogg_int32_t *iq = ps->iq[mode != CODE_INTRA];
+  const int keyframe = (cpi->FrameType == KEY_FRAME);
+  const int qi = ps->qi;
+  const ogg_int32_t *iq = ps->iq[mode != CODE_INTRA];
   ogg_int16_t buffer[64];
   ogg_int16_t data[64];
-  int bi = cpi->frag_buffer_index[fi];
-  int stride = cpi->stride[ps->plane];
-  unsigned char *frame_ptr = &cpi->frame[bi];
+  const int bi = cpi->frag_buffer_index[fi];
+  const int stride = cpi->stride[ps->plane];
+  const unsigned char *frame_ptr = &cpi->frame[bi];
   unsigned char *lastrecon = ((mode == CODE_USING_GOLDEN || 
 			       mode == CODE_GOLDEN_MV) ? 
 			      cpi->golden : cpi->lastrecon)+bi;
   unsigned char *thisrecon = cpi->recon+bi;
-  int nonzero=63;
-  ogg_int16_t *dequant = ps->re_q[mode != CODE_INTRA][ps->plane];
-  int uncoded_ssd=0,coded_ssd=0,coded_partial_ssd=0,sad=0;
+  int nonzero=0;
+  const ogg_int16_t *dequant = ps->re_q[mode != CODE_INTRA][ps->plane];
+  int uncoded_ssd=0,coded_ssd=0,coded_partial_ssd=0;
   int uncoded_dc=0,coded_dc=0,dc_flag=0;
   int lambda = cpi->skip_lambda;
   token_checkpoint_t *checkpoint=*stack;
-  int sad_cost=0,cost;
+  int cost;
   int i;
 
   cpi->frag_coded[fi]=1;
 
+  /* by way of explanation: although the f_array coding overhead
+     determination is accurate, it is greedy using very coarse-grained
+     local information.  Allowing it to mildly discourage coding turns
+     out to be beneficial, but it's not clear that allowing it to
+     encourage coding through negative coding overhead deltas is
+     useful.  For that reason, we disallow negative
+     coding_overheads */
+  if(coding_overhead<0)coding_overhead = 0; 
+
   /* motion comp */
   switch(mode){
   case CODE_INTER_PLUS_MV:
@@ -588,14 +601,14 @@
     break;
   }
 
+#ifdef COLLECT_METRICS
+  int sad=0;
   if(mode==CODE_INTRA){
     int acc=0;
-    for(i=0;i<64;i++)
+    for(i=0;i<64;i++){
       acc += data[i];
-
-    for(i=0;i<64;i++)
       sad += abs((data[i]<<6)-acc);
-
+    }
     sad >>=6;
   }else{
     for(i=0;i<64;i++)
@@ -604,7 +617,6 @@
     if(ps->plane)sad<<=2;
   }
 
-#ifdef COLLECT_METRICS
   cpi->frag_sad[fi]=sad;
 #endif
 
@@ -623,7 +635,6 @@
     }
     uncoded_ssd*=ps->ssdmul;
     uncoded_ssd <<= 4; /* scale to match DCT domain */
-    sad_cost = BINMAP(mode_rate[qi][ps->plane][mode==CODE_INTRA],sad);  
   }
 
   /* transform */
@@ -634,34 +645,9 @@
     int i;
     //quant_tables *qq = ps->qq[mode != CODE_INTRA];
     
-    {
-      int d;
-
-      // rho-domain distribution 
+    for(i=0;i<64;i++){
+      int v = buffer[dezigzag_index[i]];
       //int pos;
-      //int val = (abs(buffer[0])<<dcshift);
-      //ogg_int16_t *qqq = (*qq)[0];
-      //for(pos=64;pos>0;pos--)
-      //if(val < qqq[pos-1])break;
-      
-      //rho_count[pos]++;
-
-      if(abs(buffer[0])>=dequant[0]){
-	int val = (((iq[0]>>15)*buffer[0]) + (1<<15) + (((iq[0]&0x7fff)*buffer[0])>>15)) >>16;
-	val = (val>511?511:(val<-511?-511:val));
-	
-	d = val*dequant[0]-buffer[0];
-	coded_partial_ssd += d*d;
-	data[0] = val;
-      }else{
-	coded_partial_ssd += buffer[0]*buffer[0];
-	data[0] = 0;
-      }
-    }
-    
-    for(i=1;i<64;i++){
-      int ii = dezigzag_index[i];
-      //int pos;
       //int val = abs(buffer[ii])<<1;
       //ogg_int16_t *qqq = (*qq)[i];
       //for(pos=64;pos>0;pos--)
@@ -670,15 +656,18 @@
       /* rho-domain distribution */
       //rho_count[pos]++;
 
-      {
+      if((abs(v)<<1)>=dequant[i]){
 	int d;
-	int val = (((iq[ii]>>15)*buffer[ii]) + (1<<15) + (((iq[ii]&0x7fff)*buffer[ii])>>15)) >>16;
+	int val = (((iq[i]>>15)*v) + (1<<15) + (((iq[i]&0x7fff)*v)>>15)) >>16;
 	val = (val>511?511:(val<-511?-511:val));
 
-
-	d = val*dequant[i]-buffer[ii];
+	d = val*dequant[i]-v;
 	coded_partial_ssd += d*d;
 	data[i] = val;
+	nonzero=i;
+      }else{
+	coded_partial_ssd += v*v;
+	data[i] = 0;
       }
     }
 
@@ -688,44 +677,10 @@
   }
   cpi->frag_dc[fi] = data[0];
   
-#if 0
-  /* small performance short-circuit:
-
-     Because roundoff error means that C2 preservation can't really be
-     trusted at low energy levels (and Theora's intentionally leaky
-     fDCT makes this way way worse), we shouldn't reply on SSD
-     gathered in the frequency domain.  We can still use it if we
-     expect it to be... off... especially at low energies.
-
-     If the partial_ssd indicates this block is not worth the bits by
-     some large margin, don't proceed or bother to get a more precise
-     determination */
-
-  if(!keyframe){
-
-    /* Don't short circuit if there's a chance of coding a DC component */
-    if( (mode != CODE_INTRA && data[0]==0) ||
-	(mode == CODE_INTRA && abs( buffer[0] - (uncoded_dc>>1) + 4096 ) < (dequant[0]>>1))){
-
-      /* the partial_ssd underreports distortion, so this comparison
-	 will only yield false negatives, which are harmless */
-      if(uncoded_ssd <= coded_partial_ssd+coding_overhead*lambda+((sad_cost*lambda)>>OC_BIT_SCALE)){ 
-	/* SKIP */
-	
-	uncode_frag(cpi,fi,ps->plane);
-	mo->coded_ssd+=uncoded_ssd; /* We may still be coding the MB even if not this block */
-	return 0;
-	
-      }
-    }
-  }
-#endif
-
   /* tokenize */
   cost = dct_tokenize_AC(cpi, fi, data, dequant, buffer, fi>=cpi->frag_n[0], stack);
   
   /* reconstruct */
-  while(!data[nonzero] && --nonzero);
   switch(nonzero){
   case 0:
     IDct1( data, dequant, buffer );
@@ -909,7 +864,7 @@
     int fi = sb->f[i];
     int mb_phase;
 
-    if(fi<cpi->frag_total){
+    if(fi>=0){
       token_checkpoint_t *stackptr = stack;
       macroblock_t *mb = &cpi->macro[sb->m[i]];
       mv_t mv;
@@ -999,7 +954,7 @@
     for(j = 0; j<4; j++){ /* mode addressing is through Y plane, always 4 MB per SB */
       int mbi = sb->m[j];
 
-      if(mbi >= cpi->macro_total) continue;
+      if(mbi < 0) continue;
 
       int cost[8] = {0,0,0,0, 0,0,0,0};
       int overhead[8] = {0,0,0,0, 0,0,0,0};

Modified: branches/theora-thusnelda/lib/enc/reconstruct.c
===================================================================
--- branches/theora-thusnelda/lib/enc/reconstruct.c	2008-12-03 10:34:41 UTC (rev 15555)
+++ branches/theora-thusnelda/lib/enc/reconstruct.c	2008-12-05 05:40:36 UTC (rev 15556)
@@ -17,7 +17,7 @@
 
 #include "codec_internal.h"
 
-static void copy8x8__c (unsigned char *src,
+static void copy8x8__c (const unsigned char *src,
 			unsigned char *dest,
 			unsigned int stride)
 {
@@ -30,8 +30,8 @@
   }
 }
 
-static void copy8x8_half__c (unsigned char *src1,
-			     unsigned char *src2, 
+static void copy8x8_half__c (const unsigned char *src1,
+			     const unsigned char *src2, 
 			     unsigned char *dest,
 			     unsigned int stride)
 {
@@ -55,7 +55,7 @@
 
 
 static void recon8x8__c (unsigned char *ReconPtr, 
-			 ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
+			 const ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
 {
   ogg_uint32_t i;
 

Modified: branches/theora-thusnelda/lib/enc/x86_64/dsp_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86_64/dsp_mmx.c	2008-12-03 10:34:41 UTC (rev 15555)
+++ branches/theora-thusnelda/lib/enc/x86_64/dsp_mmx.c	2008-12-05 05:40:36 UTC (rev 15556)
@@ -22,15 +22,13 @@
 
 #if defined(USE_ASM)
 
-typedef unsigned long long ogg_uint64_t;
-
 static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x0080008000800080LL;
 
 #define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
 #define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
 #define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
 
-static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
+static void sub8x8__mmx (const unsigned char *FiltPtr, const unsigned char *ReconPtr,
 			 ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine) 
 {
   __asm__ __volatile__ (
@@ -62,15 +60,15 @@
      : "+r" (FiltPtr),
        "+r" (ReconPtr),
        "+r" (DctInputPtr)
-     : "r" ((ogg_uint64_t)PixelsPerLine)
+
+     : "r" ((unsigned long)PixelsPerLine)
      : "memory"
   );
 }
 
-static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
+static void sub8x8_128__mmx (const unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
 			     ogg_uint32_t PixelsPerLine) 
 {
-  ogg_uint64_t ppl = PixelsPerLine;
 
   __asm__ __volatile__ (
     "  .balign 16                   \n\t"
@@ -96,188 +94,12 @@
 
      : "+r" (FiltPtr),
        "+r" (DctInputPtr)
-     : "r" (ppl), /* gcc bug? a cast won't work here, e.g. (ogg_uint64_t)PixelsPerLine */
+     : "r" ((unsigned long)PixelsPerLine),
        [V128] "m" (V128)
      : "memory"
   );
 }
 
-static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
-			     unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
-			     ogg_uint32_t PixelsPerLine)
-{
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm7, %%mm7     \n\t" 
-
-    ".rept 8                        \n\t"
-    "  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */
-    "  movq        (%1), %%mm1      \n\t" /* mm1 = ReconPtr1 */
-    "  movq        (%2), %%mm4      \n\t" /* mm1 = ReconPtr2 */
-    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
-    "  movq        %%mm1, %%mm3     \n\t" /* dup to prepare for up conversion */
-    "  movq        %%mm4, %%mm5     \n\t" /* dup to prepare for up conversion */
-    /* convert from UINT8 to INT16 */
-    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr) */
-    "  punpcklbw   %%mm7, %%mm1     \n\t" /* mm1 = INT16(ReconPtr1) */
-    "  punpcklbw   %%mm7, %%mm4     \n\t" /* mm1 = INT16(ReconPtr2) */
-    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr) */
-    "  punpckhbw   %%mm7, %%mm3     \n\t" /* mm3 = INT16(ReconPtr1) */
-    "  punpckhbw   %%mm7, %%mm5     \n\t" /* mm3 = INT16(ReconPtr2) */
-    /* average ReconPtr1 and ReconPtr2 */
-    "  paddw       %%mm4, %%mm1     \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */
-    "  paddw       %%mm5, %%mm3     \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */
-    "  psrlw       $1, %%mm1        \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
-    "  psrlw       $1, %%mm3        \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
-    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-    "  psubw       %%mm3, %%mm2     \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-    "  movq        %%mm0,  (%3)     \n\t" /* write answer out */
-    "  movq        %%mm2, 8(%3)     \n\t" /* write answer out */
-    /* Increment pointers */
-    "  add         $16, %3           \n\t"
-    "  add         %4, %0           \n\t"
-    "  add         %4, %1           \n\t"
-    "  add         %4, %2           \n\t"
-    ".endr                          \n\t"
-
-     : "+r" (FiltPtr),
-       "+r" (ReconPtr1),
-       "+r" (ReconPtr2),
-       "+r" (DctInputPtr)
-     : "r" ((ogg_uint64_t)PixelsPerLine)
-     : "memory"
-  );
-}
-
-static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
-{
-  ogg_uint64_t  XSum;
-  ogg_uint64_t  XXSum;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm5, %%mm5     \n\t"
-    "  pxor        %%mm6, %%mm6     \n\t"
-    "  pxor        %%mm7, %%mm7     \n\t"
-    "  mov         $8, %%rdi        \n\t"
-    "1:                             \n\t"
-    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        %%mm0, %%mm2     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"
-    "  punpckhbw   %%mm6, %%mm2     \n\t"
-
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  paddw       %%mm2, %%mm5     \n\t"
-
-    "  pmaddwd     %%mm0, %%mm0     \n\t"
-    "  pmaddwd     %%mm2, %%mm2     \n\t"
-    
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  paddd       %%mm2, %%mm7     \n\t"
-
-    "  add         %3, %2           \n\t"	/* Inc pointer into src data */
-
-    "  dec         %%rdi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $32, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $16, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movd        %%mm5, %%rdi     \n\t"
-    "  movsx       %%di, %%rdi      \n\t"
-    "  mov         %%rdi, %0        \n\t"
-
-    "  movq        %%mm7, %%mm0     \n\t"
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %1        \n\t"
-
-     : "=r" (XSum),
-       "=r" (XXSum),
-       "+r" (DataPtr) 
-     : "r" ((ogg_uint64_t)Stride)
-     : "rdi", "memory"
-  );
-
-  /* Compute population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ) );
-}
-
-static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, unsigned char *RefDataPtr, 
-				       ogg_uint32_t Stride)
-{
-  ogg_uint64_t  XSum;
-  ogg_uint64_t  XXSum;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm5, %%mm5     \n\t"
-    "  pxor        %%mm6, %%mm6     \n\t"
-    "  pxor        %%mm7, %%mm7     \n\t"
-    "  mov         $8, %%rdi        \n\t"
-    "1:                             \n\t"
-    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%3), %%mm1      \n\t"
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  movq        %%mm1, %%mm3     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"
-    "  punpcklbw   %%mm6, %%mm1     \n\t"
-    "  punpckhbw   %%mm6, %%mm2     \n\t"
-    "  punpckhbw   %%mm6, %%mm3     \n\t"
-
-    "  psubsw      %%mm1, %%mm0     \n\t"
-    "  psubsw      %%mm3, %%mm2     \n\t"
-
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  paddw       %%mm2, %%mm5     \n\t"
-
-    "  pmaddwd     %%mm0, %%mm0     \n\t"
-    "  pmaddwd     %%mm2, %%mm2     \n\t"
-    
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  paddd       %%mm2, %%mm7     \n\t"
-
-    "  add         %4, %2           \n\t"	/* Inc pointer into src data */
-    "  add         %4, %3           \n\t"	/* Inc pointer into ref data */
-
-    "  dec         %%rdi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $32, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $16, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movd        %%mm5, %%rdi     \n\t"
-    "  movsx       %%di, %%rdi      \n\t"
-    "  mov         %%rdi, %0        \n\t"
-
-    "  movq        %%mm7, %%mm0     \n\t"
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %1        \n\t"
-
-     : "=m" (XSum),
-       "=m" (XXSum),
-       "+r" (SrcData), 
-       "+r" (RefDataPtr) 
-     : "r" ((ogg_uint64_t)Stride)
-     : "rdi", "memory"
-  );
-
-  /* Compute and return population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ));
-}
-
 static void restore_fpu (void)
 {
   __asm__ __volatile__ (
@@ -290,9 +112,6 @@
   funcs->restore_fpu = restore_fpu;
   funcs->sub8x8 = sub8x8__mmx;
   funcs->sub8x8_128 = sub8x8_128__mmx;
-  funcs->sub8x8avg2 = sub8x8avg2__mmx;
-  funcs->intra8x8_err = intra8x8_err__mmx;
-  funcs->inter8x8_err = inter8x8_err__mmx;
 }
 
 #endif /* USE_ASM */

Modified: branches/theora-thusnelda/lib/enc/x86_64/dsp_mmxext.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86_64/dsp_mmxext.c	2008-12-03 10:34:41 UTC (rev 15555)
+++ branches/theora-thusnelda/lib/enc/x86_64/dsp_mmxext.c	2008-12-05 05:40:36 UTC (rev 15556)
@@ -22,9 +22,7 @@
 
 #if defined(USE_ASM)
 
-typedef unsigned long long ogg_uint64_t;
-
-static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, unsigned char *ptr2, 
+static ogg_uint32_t sad8x8__mmxext (const unsigned char *ptr1, const unsigned char *ptr2, 
 				    ogg_uint32_t stride)
 {
   ogg_uint32_t  DiffVal;
@@ -51,14 +49,14 @@
      : "=r" (DiffVal),
        "+r" (ptr1), 
        "+r" (ptr2) 
-     : "r" ((ogg_uint64_t)stride)
+     : "r" ((unsigned long)stride)
      : "memory"
   );
 
   return DiffVal;
 }
 
-static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, unsigned char *ptr2, 
+static ogg_uint32_t sad8x8_thres__mmxext (const unsigned char *ptr1, const unsigned char *ptr2, 
 					  ogg_uint32_t stride, ogg_uint32_t thres)
 {
   ogg_uint32_t  DiffVal;
@@ -81,15 +79,15 @@
      : "=r" (DiffVal),
        "+r" (ptr1), 
        "+r" (ptr2) 
-     : "r" ((ogg_uint64_t)stride)
+     : "r" ((unsigned long)stride)
      : "memory"
   );
 
   return DiffVal;
 }
 
-static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, unsigned char *RefDataPtr1,
-                                              unsigned char *RefDataPtr2, ogg_uint32_t Stride,
+static ogg_uint32_t sad8x8_xy2_thres__mmxext (const unsigned char *SrcData, const unsigned char *RefDataPtr1,
+                                              const unsigned char *RefDataPtr2, ogg_uint32_t Stride,
                                               ogg_uint32_t thres)
 {
   ogg_uint32_t  DiffVal;
@@ -115,202 +113,18 @@
        "+r" (SrcData), 
        "+r" (RefDataPtr1), 
        "+r" (RefDataPtr2) 
-     : "r" ((ogg_uint64_t)Stride)
+     : "r" ((unsigned long)Stride)
      : "memory"
   );
 
   return DiffVal;
 }
 		
-static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
-{
-  ogg_uint32_t MaxSad;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  movd        (%1), %%mm0      \n\t"
-    "  movd        (%2), %%mm1      \n\t"
-    "  psadbw      %%mm0, %%mm1     \n\t"
-    "  movd        4(%1), %%mm2     \n\t"
-    "  movd        4(%2), %%mm3     \n\t"
-    "  psadbw      %%mm2, %%mm3     \n\t"
-
-    "  pmaxsw      %%mm1, %%mm3     \n\t"
-    "  movd        %%mm3, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t"
-
-     : "=m" (MaxSad),
-       "+r" (Src1), 
-       "+r" (Src2) 
-     :
-     : "memory"
-  );
-
-  return MaxSad;
-}
-
-static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
-					ogg_uint32_t stride)
-{
-  ogg_uint32_t MaxSad;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm3, %%mm3     \n\t"	/* zero out mm3 for unpack */
-    "  pxor        %%mm4, %%mm4     \n\t"	/* mm4 low sum */
-    "  pxor        %%mm5, %%mm5     \n\t" 	/* mm5 high sum */
-    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 low sum */
-    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 high sum */
-    "  mov         $4, %%rdi        \n\t"	/* 4 rows */
-    "1:                             \n\t"
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
-    "  paddw       %%mm0, %%mm4     \n\t"	/* accumulate difference... */
-    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
-    "  paddw       %%mm1, %%mm5     \n\t"	/* accumulate difference... */
-    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
-    "  add         %3, %2           \n\t"	/* Inc pointer into the new data */
-
-    "  dec         %%rdi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  mov         $4, %%rdi        \n\t"	/* 4 rows */
-    "2:                             \n\t"
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
-    "  paddw       %%mm0, %%mm6     \n\t"	/* accumulate difference... */
-    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
-    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
-    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
-    "  add         %3, %2           \n\t"	/* Inc pointer into the new data */
-
-    "  dec         %%rdi            \n\t"
-    "  jnz 2b                       \n\t"
-
-    "  pmaxsw      %%mm6, %%mm7     \n\t"
-    "  pmaxsw      %%mm4, %%mm5     \n\t"
-    "  pmaxsw      %%mm5, %%mm7     \n\t"
-    "  movq        %%mm7, %%mm6     \n\t"
-    "  psrlq       $32, %%mm6       \n\t"
-    "  pmaxsw      %%mm6, %%mm7     \n\t"
-    "  movq        %%mm7, %%mm6     \n\t"
-    "  psrlq       $16, %%mm6       \n\t"
-    "  pmaxsw      %%mm6, %%mm7     \n\t"
-    "  movd        %%mm7, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t"
-
-     : "=r" (MaxSad),
-       "+r" (Src1), 
-       "+r" (Src2) 
-     : "r" ((ogg_uint64_t)stride)
-     : "memory", "rdi"
-  );
-
-  return MaxSad;
-}
-
-static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, unsigned char *RefDataPtr1,
-                                              unsigned char *RefDataPtr2, ogg_uint32_t Stride)
-{
-  ogg_uint64_t XSum;
-  ogg_uint64_t XXSum;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm4, %%mm4     \n\t"
-    "  pxor        %%mm5, %%mm5     \n\t"
-    "  pxor        %%mm6, %%mm6     \n\t"
-    "  pxor        %%mm7, %%mm7     \n\t"
-    "  mov         $8, %%rdi        \n\t"
-    "1:                             \n\t"
-    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
-
-    "  movq        (%3), %%mm2      \n\t"
-    "  movq        (%4), %%mm1      \n\t"	/* take average of mm2 and mm1 */
-    "  pavgb       %%mm2, %%mm1     \n\t"
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  movq        %%mm1, %%mm3     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"
-    "  punpcklbw   %%mm4, %%mm1     \n\t"
-    "  punpckhbw   %%mm6, %%mm2     \n\t"
-    "  punpckhbw   %%mm4, %%mm3     \n\t"
-
-    "  psubsw      %%mm1, %%mm0     \n\t"
-    "  psubsw      %%mm3, %%mm2     \n\t"
-
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  paddw       %%mm2, %%mm5     \n\t"
-
-    "  pmaddwd     %%mm0, %%mm0     \n\t"
-    "  pmaddwd     %%mm2, %%mm2     \n\t"
-    
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  paddd       %%mm2, %%mm7     \n\t"
-
-    "  add         %5, %2           \n\t"	/* Inc pointer into src data */
-    "  add         %5, %3           \n\t"	/* Inc pointer into ref data */
-    "  add         %5, %4           \n\t"	/* Inc pointer into ref data */
-
-    "  dec         %%rdi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $32, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $16, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movd        %%mm5, %%edi     \n\t"
-    "  movsx       %%di, %%edi      \n\t"
-    "  movl        %%edi, %0        \n\t"
-
-    "  movq        %%mm7, %%mm0     \n\t"
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %1        \n\t"
-
-     : "=m" (XSum),
-       "=m" (XXSum),
-       "+r" (SrcData), 
-       "+r" (RefDataPtr1),
-       "+r" (RefDataPtr2) 
-     : "r" ((ogg_uint64_t)Stride)
-     : "rdi", "memory"
-  );
-
-  /* Compute and return population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ));
-}
-
 void dsp_mmxext_init(DspFunctions *funcs)
 {
-  funcs->row_sad8 = row_sad8__mmxext;
-  funcs->col_sad8x8 = col_sad8x8__mmxext;
   funcs->sad8x8 = sad8x8__mmxext;
   funcs->sad8x8_thres = sad8x8_thres__mmxext;
   funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;
-  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext;
 }
 
 #endif /* USE_ASM */

Modified: branches/theora-thusnelda/lib/enc/x86_64/fdct_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86_64/fdct_mmx.c	2008-12-03 10:34:41 UTC (rev 15555)
+++ branches/theora-thusnelda/lib/enc/x86_64/fdct_mmx.c	2008-12-05 05:40:36 UTC (rev 15556)
@@ -27,13 +27,6 @@
 static const __attribute__ ((aligned(8),used)) ogg_int64_t xC6S2 = 0x061f861f861f861f8LL;
 static const __attribute__ ((aligned(8),used)) ogg_int64_t xC7S1 = 0x031f131f131f131f1LL;
 
-#if defined(__MINGW32__) || defined(__CYGWIN__) || \
-    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
-# define M(a) "_" #a
-#else
-# define M(a) #a
-#endif
-
 /* execute stage 1 of forward DCT */
 #define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp)                        \
   "  movq      " #ip0 ", %%mm0      \n\t"                                     \

Modified: branches/theora-thusnelda/lib/enc/x86_64/idct_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86_64/idct_mmx.c	2008-12-03 10:34:41 UTC (rev 15555)
+++ branches/theora-thusnelda/lib/enc/x86_64/idct_mmx.c	2008-12-05 05:40:36 UTC (rev 15556)
@@ -356,8 +356,8 @@
  *
  ***************************************************************************************
  */
-void IDctSlow__mmx(ogg_int16_t *in,
-		   ogg_int16_t *q,
+void IDctSlow__mmx(const ogg_int16_t *in,
+		   const ogg_int16_t *q,
 		   ogg_int16_t *out ) {
 
 #   define MID(M,I)     MtoSTR(M+(I)*8)"(%[c])"
@@ -810,9 +810,9 @@
 
 /* --------------------------------------------------------------- */
 /* IDCT 10 */
-void IDct10__mmx( ogg_int16_t *in,
-             ogg_int16_t *q,
-             ogg_int16_t *out ) {
+void IDct10__mmx( const ogg_int16_t *in,
+		  const ogg_int16_t *q,
+		  ogg_int16_t *out ) {
 
     __asm__ __volatile__ (
 
@@ -1063,9 +1063,9 @@
     "#end ColumnIDCT_3\n"
 //End of ColumnIDCT_3
 
-void IDct3__mmx( ogg_int16_t *in,
-            ogg_int16_t *q,
-            ogg_int16_t *out ) {
+void IDct3__mmx( const ogg_int16_t *in,
+		 const ogg_int16_t *q,
+		 ogg_int16_t *out ) {
 
     __asm__ __volatile__ (
 

Modified: branches/theora-thusnelda/lib/enc/x86_64/recon_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86_64/recon_mmx.c	2008-12-03 10:34:41 UTC (rev 15555)
+++ branches/theora-thusnelda/lib/enc/x86_64/recon_mmx.c	2008-12-05 05:40:36 UTC (rev 15556)
@@ -19,11 +19,9 @@
 
 #if defined(USE_ASM)
 
-typedef unsigned long long ogg_uint64_t;
-
 static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x8080808080808080LL;
 
-static void copy8x8__mmx (unsigned char *src,
+static void copy8x8__mmx (const unsigned char *src,
                           unsigned char *dest,
                           ogg_uint32_t stride)
 {
@@ -57,13 +55,13 @@
     "  movq        %%mm3, (%0, %%rdi)  \n\t"
       : "+a" (dest)
       : "c" (src),
-        "d" ((ogg_uint64_t)stride)
+        "d" ((unsigned long)stride)
       : "memory", "rdi"
   );
 }
 
 static void recon8x8__mmx (unsigned char *ReconPtr, 
-			   ogg_int16_t *ChangePtr, 
+			   const ogg_int16_t *ChangePtr, 
 			   ogg_uint32_t LineStep)
 {
   __asm__ __volatile__ (
@@ -93,7 +91,7 @@
     "  jc          1b                  \n\t"
       : "+r" (ReconPtr)
       : "r" (ChangePtr),
-        "r" ((ogg_uint64_t)LineStep)
+        "r" ((unsigned long)LineStep)
       : "memory", "rdi"
   );
 }

Modified: branches/theora-thusnelda/lib/internal.h
===================================================================
--- branches/theora-thusnelda/lib/internal.h	2008-12-03 10:34:41 UTC (rev 15555)
+++ branches/theora-thusnelda/lib/internal.h	2008-12-05 05:40:36 UTC (rev 15556)
@@ -33,7 +33,7 @@
 # endif
 
 /*This library's version.*/
-# define OC_VENDOR_STRING "Xiph.Org libThusnelda I 20080310"
+# define OC_VENDOR_STRING "Xiph.Org libThusnelda I 20081201"
 
 /*Theora bitstream version.*/
 # define TH_VERSION_MAJOR (3)