[xiph-commits] r11494 - branches/theora-playtime/lib/x86_32_vs

illiminable at svn.xiph.org illiminable at svn.xiph.org
Thu Jun 1 12:42:31 PDT 2006


Author: illiminable
Date: 2006-06-01 12:42:23 -0700 (Thu, 01 Jun 2006)
New Revision: 11494

Added:
   branches/theora-playtime/lib/x86_32_vs/recon_sse2.c
Modified:
   branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c
   branches/theora-playtime/lib/x86_32_vs/recon_mmx.c
Log:
* A couple of SSE2 functions to start with

Modified: branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c	2006-06-01 15:15:18 UTC (rev 11493)
+++ branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c	2006-06-01 19:42:23 UTC (rev 11494)
@@ -37,23 +37,23 @@
 
     //Make non-zero to use the C-version
 #if 0
-  int i;
-
-  /* For each block row */
-  for (i=8; i; i--) {
-    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], ReconPtr[0]);
-    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], ReconPtr[1]);
-    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], ReconPtr[2]);
-    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], ReconPtr[3]);
-    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], ReconPtr[4]);
-    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], ReconPtr[5]);
-    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], ReconPtr[6]);
-    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], ReconPtr[7]);
-
-    /* Start next row */
-    FiltPtr += PixelsPerLine;
-    ReconPtr += ReconPixelsPerLine;
-    DctInputPtr += 8;
+  int i;
+
+  /* For each block row */
+  for (i=8; i; i--) {
+    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], ReconPtr[0]);
+    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], ReconPtr[1]);
+    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], ReconPtr[2]);
+    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], ReconPtr[3]);
+    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], ReconPtr[4]);
+    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], ReconPtr[5]);
+    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], ReconPtr[6]);
+    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], ReconPtr[7]);
+
+    /* Start next row */
+    FiltPtr += PixelsPerLine;
+    ReconPtr += ReconPixelsPerLine;
+    DctInputPtr += 8;
   }
 #else
     __asm {
@@ -248,25 +248,25 @@
 {
 
 #if 0
-  int i;
-  /* For each block row */
-  for (i=8; i; i--) {
-    /* INTRA mode so code raw image data */
-    /* We convert the data to 8 bit signed (by subtracting 128) as
-       this reduces the internal precision requirments in the DCT
-       transform. */
-    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], 128);
-    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], 128);
-    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], 128);
-    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], 128);
-    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], 128);
-    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], 128);
-    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], 128);
-    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], 128);
-
-    /* Start next row */
-    FiltPtr += PixelsPerLine;
-    DctInputPtr += 8;
+  int i;
+  /* For each block row */
+  for (i=8; i; i--) {
+    /* INTRA mode so code raw image data */
+    /* We convert the data to 8 bit signed (by subtracting 128) as
+       this reduces the internal precision requirments in the DCT
+       transform. */
+    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], 128);
+    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], 128);
+    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], 128);
+    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], 128);
+    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], 128);
+    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], 128);
+    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], 128);
+    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], 128);
+
+    /* Start next row */
+    FiltPtr += PixelsPerLine;
+    DctInputPtr += 8;
   }
 
 #else
@@ -422,24 +422,24 @@
 {
 
 #if 0
-  int i;
-
-  /* For each block row */
-  for (i=8; i; i--) {
-    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], DSP_OP_AVG (ReconPtr1[0], ReconPtr2[0]));
-    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], DSP_OP_AVG (ReconPtr1[1], ReconPtr2[1]));
-    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], DSP_OP_AVG (ReconPtr1[2], ReconPtr2[2]));
-    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], DSP_OP_AVG (ReconPtr1[3], ReconPtr2[3]));
-    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], DSP_OP_AVG (ReconPtr1[4], ReconPtr2[4]));
-    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], DSP_OP_AVG (ReconPtr1[5], ReconPtr2[5]));
-    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], DSP_OP_AVG (ReconPtr1[6], ReconPtr2[6]));
-    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], DSP_OP_AVG (ReconPtr1[7], ReconPtr2[7]));
-
-    /* Start next row */
-    FiltPtr += PixelsPerLine;
-    ReconPtr1 += ReconPixelsPerLine;
-    ReconPtr2 += ReconPixelsPerLine;
-    DctInputPtr += 8;
+  int i;
+
+  /* For each block row */
+  for (i=8; i; i--) {
+    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], DSP_OP_AVG (ReconPtr1[0], ReconPtr2[0]));
+    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], DSP_OP_AVG (ReconPtr1[1], ReconPtr2[1]));
+    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], DSP_OP_AVG (ReconPtr1[2], ReconPtr2[2]));
+    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], DSP_OP_AVG (ReconPtr1[3], ReconPtr2[3]));
+    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], DSP_OP_AVG (ReconPtr1[4], ReconPtr2[4]));
+    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], DSP_OP_AVG (ReconPtr1[5], ReconPtr2[5]));
+    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], DSP_OP_AVG (ReconPtr1[6], ReconPtr2[6]));
+    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], DSP_OP_AVG (ReconPtr1[7], ReconPtr2[7]));
+
+    /* Start next row */
+    FiltPtr += PixelsPerLine;
+    ReconPtr1 += ReconPixelsPerLine;
+    ReconPtr2 += ReconPixelsPerLine;
+    DctInputPtr += 8;
   }
 #else
 
@@ -705,21 +705,21 @@
 {
 
 #if 0
-  ogg_uint32_t SadValue;
-  ogg_uint32_t SadValue1;
-
-  SadValue    = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) + 
-	        DSP_OP_ABS_DIFF (Src1[1], Src2[1]) +
-	        DSP_OP_ABS_DIFF (Src1[2], Src2[2]) +
-	        DSP_OP_ABS_DIFF (Src1[3], Src2[3]);
-
-  SadValue1   = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) + 
-	        DSP_OP_ABS_DIFF (Src1[5], Src2[5]) +
-	        DSP_OP_ABS_DIFF (Src1[6], Src2[6]) +
-	        DSP_OP_ABS_DIFF (Src1[7], Src2[7]);
-
-  SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1;
-
+  ogg_uint32_t SadValue;
+  ogg_uint32_t SadValue1;
+
+  SadValue    = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) + 
+	        DSP_OP_ABS_DIFF (Src1[1], Src2[1]) +
+	        DSP_OP_ABS_DIFF (Src1[2], Src2[2]) +
+	        DSP_OP_ABS_DIFF (Src1[3], Src2[3]);
+
+  SadValue1   = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) + 
+	        DSP_OP_ABS_DIFF (Src1[5], Src2[5]) +
+	        DSP_OP_ABS_DIFF (Src1[6], Src2[6]) +
+	        DSP_OP_ABS_DIFF (Src1[7], Src2[7]);
+
+  SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1;
+
   return SadValue;
 
 #else
@@ -784,46 +784,46 @@
 {
 
 #if 0
-  ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0};
-  ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0};
-  ogg_uint32_t MaxSad = 0;
-  ogg_uint32_t i;
-
-  for ( i = 0; i < 4; i++ ){
-    SadValue[0] += abs(Src1[0] - Src2[0]);
-    SadValue[1] += abs(Src1[1] - Src2[1]);
-    SadValue[2] += abs(Src1[2] - Src2[2]);
-    SadValue[3] += abs(Src1[3] - Src2[3]);
-    SadValue[4] += abs(Src1[4] - Src2[4]);
-    SadValue[5] += abs(Src1[5] - Src2[5]);
-    SadValue[6] += abs(Src1[6] - Src2[6]);
-    SadValue[7] += abs(Src1[7] - Src2[7]);
-    
-    Src1 += stride;
-    Src2 += stride;
-  }
-
-  for ( i = 0; i < 4; i++ ){
-    SadValue2[0] += abs(Src1[0] - Src2[0]);
-    SadValue2[1] += abs(Src1[1] - Src2[1]);
-    SadValue2[2] += abs(Src1[2] - Src2[2]);
-    SadValue2[3] += abs(Src1[3] - Src2[3]);
-    SadValue2[4] += abs(Src1[4] - Src2[4]);
-    SadValue2[5] += abs(Src1[5] - Src2[5]);
-    SadValue2[6] += abs(Src1[6] - Src2[6]);
-    SadValue2[7] += abs(Src1[7] - Src2[7]);
-    
-    Src1 += stride;
-    Src2 += stride;
-  }
-    
-  for ( i = 0; i < 8; i++ ){
-    if ( SadValue[i] > MaxSad )
-      MaxSad = SadValue[i];
-    if ( SadValue2[i] > MaxSad )
-      MaxSad = SadValue2[i];
-  }
-    
+  ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0};
+  ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0};
+  ogg_uint32_t MaxSad = 0;
+  ogg_uint32_t i;
+
+  for ( i = 0; i < 4; i++ ){
+    SadValue[0] += abs(Src1[0] - Src2[0]);
+    SadValue[1] += abs(Src1[1] - Src2[1]);
+    SadValue[2] += abs(Src1[2] - Src2[2]);
+    SadValue[3] += abs(Src1[3] - Src2[3]);
+    SadValue[4] += abs(Src1[4] - Src2[4]);
+    SadValue[5] += abs(Src1[5] - Src2[5]);
+    SadValue[6] += abs(Src1[6] - Src2[6]);
+    SadValue[7] += abs(Src1[7] - Src2[7]);
+    
+    Src1 += stride;
+    Src2 += stride;
+  }
+
+  for ( i = 0; i < 4; i++ ){
+    SadValue2[0] += abs(Src1[0] - Src2[0]);
+    SadValue2[1] += abs(Src1[1] - Src2[1]);
+    SadValue2[2] += abs(Src1[2] - Src2[2]);
+    SadValue2[3] += abs(Src1[3] - Src2[3]);
+    SadValue2[4] += abs(Src1[4] - Src2[4]);
+    SadValue2[5] += abs(Src1[5] - Src2[5]);
+    SadValue2[6] += abs(Src1[6] - Src2[6]);
+    SadValue2[7] += abs(Src1[7] - Src2[7]);
+    
+    Src1 += stride;
+    Src2 += stride;
+  }
+    
+  for ( i = 0; i < 8; i++ ){
+    if ( SadValue[i] > MaxSad )
+      MaxSad = SadValue[i];
+    if ( SadValue2[i] > MaxSad )
+      MaxSad = SadValue2[i];
+  }
+    
   return MaxSad;
 #else
   ogg_uint32_t MaxSad;
@@ -912,24 +912,24 @@
 {
 
 #if 0
-  ogg_uint32_t  i;
-  ogg_uint32_t  sad = 0;
-
-  for (i=8; i; i--) {
-    sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
-    sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
-    sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]);
-    sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]);
-    sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]);
-    sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]);
-    sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]);
-    sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]);
-
-    /* Step to next row of block. */
-    ptr1 += stride1;
-    ptr2 += stride2;
-  }
-
+  ogg_uint32_t  i;
+  ogg_uint32_t  sad = 0;
+
+  for (i=8; i; i--) {
+    sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
+    sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
+    sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]);
+    sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]);
+    sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]);
+    sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]);
+    sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]);
+    sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]);
+
+    /* Step to next row of block. */
+    ptr1 += stride1;
+    ptr2 += stride2;
+  }
+
   return sad;
 #else
   ogg_uint32_t  DiffVal;
@@ -1113,27 +1113,27 @@
 			   	  ogg_uint32_t thres)
 {
 #if 0
-  ogg_uint32_t  i;
-  ogg_uint32_t  sad = 0;
-
-  for (i=8; i; i--) {
-    sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
-    sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
-    sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]);
-    sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]);
-    sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]);
-    sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]);
-    sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]);
-    sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]);
-
-    if (sad > thres )
-      break;
-
-    /* Step to next row of block. */
-    ptr1 += stride1;
-    ptr2 += stride2;
-  }
-
+  ogg_uint32_t  i;
+  ogg_uint32_t  sad = 0;
+
+  for (i=8; i; i--) {
+    sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
+    sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
+    sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]);
+    sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]);
+    sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]);
+    sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]);
+    sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]);
+    sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]);
+
+    if (sad > thres )
+      break;
+
+    /* Step to next row of block. */
+    ptr1 += stride1;
+    ptr2 += stride2;
+  }
+
   return sad;
 #else
   return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
@@ -1147,28 +1147,28 @@
 			              ogg_uint32_t thres)
 {
 #if 0
-  ogg_uint32_t  i;
-  ogg_uint32_t  sad = 0;
-
-  for (i=8; i; i--) {
-    sad += DSP_OP_ABS_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0]));
-    sad += DSP_OP_ABS_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1]));
-    sad += DSP_OP_ABS_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2]));
-    sad += DSP_OP_ABS_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3]));
-    sad += DSP_OP_ABS_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4]));
-    sad += DSP_OP_ABS_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5]));
-    sad += DSP_OP_ABS_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6]));
-    sad += DSP_OP_ABS_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7]));
-
-    if ( sad > thres )
-      break;
-
-    /* Step to next row of block. */
-    SrcData += SrcStride;
-    RefDataPtr1 += RefStride;
-    RefDataPtr2 += RefStride;
-  }
-
+  ogg_uint32_t  i;
+  ogg_uint32_t  sad = 0;
+
+  for (i=8; i; i--) {
+    sad += DSP_OP_ABS_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0]));
+    sad += DSP_OP_ABS_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1]));
+    sad += DSP_OP_ABS_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2]));
+    sad += DSP_OP_ABS_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3]));
+    sad += DSP_OP_ABS_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4]));
+    sad += DSP_OP_ABS_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5]));
+    sad += DSP_OP_ABS_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6]));
+    sad += DSP_OP_ABS_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7]));
+
+    if ( sad > thres )
+      break;
+
+    /* Step to next row of block. */
+    SrcData += SrcStride;
+    RefDataPtr1 += RefStride;
+    RefDataPtr2 += RefStride;
+  }
+
   return sad;
 #else
   ogg_uint32_t  DiffVal;
@@ -1239,34 +1239,34 @@
 static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
 {
 #if 0
-  ogg_uint32_t  i;
-  ogg_uint32_t  XSum=0;
-  ogg_uint32_t  XXSum=0;
-
-  for (i=8; i; i--) {
-     /* Examine alternate pixel locations. */
-     XSum += DataPtr[0];
-     XXSum += DataPtr[0]*DataPtr[0];
-     XSum += DataPtr[1];
-     XXSum += DataPtr[1]*DataPtr[1];
-     XSum += DataPtr[2];
-     XXSum += DataPtr[2]*DataPtr[2];
-     XSum += DataPtr[3];
-     XXSum += DataPtr[3]*DataPtr[3];
-     XSum += DataPtr[4];
-     XXSum += DataPtr[4]*DataPtr[4];
-     XSum += DataPtr[5];
-     XXSum += DataPtr[5]*DataPtr[5];
-     XSum += DataPtr[6];
-     XXSum += DataPtr[6]*DataPtr[6];
-     XSum += DataPtr[7];
-     XXSum += DataPtr[7]*DataPtr[7];
-
-     /* Step to next row of block. */
-     DataPtr += Stride;
-   }
-
-   /* Compute population variance as mis-match metric. */
+  ogg_uint32_t  i;
+  ogg_uint32_t  XSum=0;
+  ogg_uint32_t  XXSum=0;
+
+  for (i=8; i; i--) {
+     /* Examine alternate pixel locations. */
+     XSum += DataPtr[0];
+     XXSum += DataPtr[0]*DataPtr[0];
+     XSum += DataPtr[1];
+     XXSum += DataPtr[1]*DataPtr[1];
+     XSum += DataPtr[2];
+     XXSum += DataPtr[2]*DataPtr[2];
+     XSum += DataPtr[3];
+     XXSum += DataPtr[3]*DataPtr[3];
+     XSum += DataPtr[4];
+     XXSum += DataPtr[4]*DataPtr[4];
+     XSum += DataPtr[5];
+     XXSum += DataPtr[5]*DataPtr[5];
+     XSum += DataPtr[6];
+     XXSum += DataPtr[6]*DataPtr[6];
+     XSum += DataPtr[7];
+     XXSum += DataPtr[7]*DataPtr[7];
+
+     /* Step to next row of block. */
+     DataPtr += Stride;
+   }
+
+   /* Compute population variance as mis-match metric. */
    return (( (XXSum<<6) - XSum*XSum ) );
 #else
   ogg_uint32_t  XSum;
@@ -1334,50 +1334,50 @@
 {
 
 #if 0
-  ogg_uint32_t  i;
-  ogg_uint32_t  XSum=0;
-  ogg_uint32_t  XXSum=0;
-  ogg_int32_t   DiffVal;
-
-  for (i=8; i; i--) {
-    DiffVal = DSP_OP_DIFF (SrcData[0], RefDataPtr[0]);
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF (SrcData[1], RefDataPtr[1]);
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF (SrcData[2], RefDataPtr[2]);
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF (SrcData[3], RefDataPtr[3]);
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-        
-    DiffVal = DSP_OP_DIFF (SrcData[4], RefDataPtr[4]);
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-        
-    DiffVal = DSP_OP_DIFF (SrcData[5], RefDataPtr[5]);
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-        
-    DiffVal = DSP_OP_DIFF (SrcData[6], RefDataPtr[6]);
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-        
-    DiffVal = DSP_OP_DIFF (SrcData[7], RefDataPtr[7]);
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-        
-    /* Step to next row of block. */
-    SrcData += SrcStride;
-    RefDataPtr += RefStride;
-  }
-
-  /* Compute and return population variance as mis-match metric. */
+  ogg_uint32_t  i;
+  ogg_uint32_t  XSum=0;
+  ogg_uint32_t  XXSum=0;
+  ogg_int32_t   DiffVal;
+
+  for (i=8; i; i--) {
+    DiffVal = DSP_OP_DIFF (SrcData[0], RefDataPtr[0]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF (SrcData[1], RefDataPtr[1]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF (SrcData[2], RefDataPtr[2]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF (SrcData[3], RefDataPtr[3]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+        
+    DiffVal = DSP_OP_DIFF (SrcData[4], RefDataPtr[4]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+        
+    DiffVal = DSP_OP_DIFF (SrcData[5], RefDataPtr[5]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+        
+    DiffVal = DSP_OP_DIFF (SrcData[6], RefDataPtr[6]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+        
+    DiffVal = DSP_OP_DIFF (SrcData[7], RefDataPtr[7]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+        
+    /* Step to next row of block. */
+    SrcData += SrcStride;
+    RefDataPtr += RefStride;
+  }
+
+  /* Compute and return population variance as mis-match metric. */
   return (( (XXSum<<6) - XSum*XSum ));
 #else
   ogg_uint32_t  XSum;
@@ -1455,51 +1455,51 @@
 				     unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
 {
 #if 0
-  ogg_uint32_t  i;
-  ogg_uint32_t  XSum=0;
-  ogg_uint32_t  XXSum=0;
-  ogg_int32_t   DiffVal;
-
-  for (i=8; i; i--) {
-    DiffVal = DSP_OP_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0]));
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1]));
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2]));
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3]));
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4]));
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5]));
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6]));
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    DiffVal = DSP_OP_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7]));
-    XSum += DiffVal;
-    XXSum += DiffVal*DiffVal;
-
-    /* Step to next row of block. */
-    SrcData += SrcStride;
-    RefDataPtr1 += RefStride;
-    RefDataPtr2 += RefStride;
-  }
-
-  /* Compute and return population variance as mis-match metric. */
+  ogg_uint32_t  i;
+  ogg_uint32_t  XSum=0;
+  ogg_uint32_t  XXSum=0;
+  ogg_int32_t   DiffVal;
+
+  for (i=8; i; i--) {
+    DiffVal = DSP_OP_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    /* Step to next row of block. */
+    SrcData += SrcStride;
+    RefDataPtr1 += RefStride;
+    RefDataPtr2 += RefStride;
+  }
+
+  /* Compute and return population variance as mis-match metric. */
   return (( (XXSum<<6) - XSum*XSum ));
 #else
   ogg_uint32_t XSum;

Modified: branches/theora-playtime/lib/x86_32_vs/recon_mmx.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/recon_mmx.c	2006-06-01 15:15:18 UTC (rev 11493)
+++ branches/theora-playtime/lib/x86_32_vs/recon_mmx.c	2006-06-01 19:42:23 UTC (rev 11494)
@@ -8,11 +8,6 @@
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
  * by the Xiph.Org Foundation http://www.xiph.org/                  *
  *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: reconstruct.c,v 1.6 2003/12/03 08:59:41 arc Exp $
-
  ********************************************************************/
 
 #include "codec_internal.h"

Added: branches/theora-playtime/lib/x86_32_vs/recon_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/recon_sse2.c	2006-06-01 15:15:18 UTC (rev 11493)
+++ branches/theora-playtime/lib/x86_32_vs/recon_sse2.c	2006-06-01 19:42:23 UTC (rev 11494)
@@ -0,0 +1,273 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************/
+
+#include "codec_internal.h"
+#include "dsp.h"
+#include "cpu.h"
+/* Sixteen 0x80 bytes: xor'd onto packed signed bytes to add 128 to each. */
+static const unsigned int V128x16[4] = { 0x80808080, 0x80808080, 0x80808080, 0x80808080 };
+static const unsigned int* V128x16Ptr = V128x16;
+
+static void copy8x8__sse2 (unsigned char *src,
+	                unsigned char *dest,
+	                unsigned int stride)
+{
+#if 0
+  int j;
+  for ( j = 0; j < 8; j++ ){
+    ((ogg_uint32_t*)dest)[0] = ((ogg_uint32_t*)src)[0];
+    ((ogg_uint32_t*)dest)[1] = ((ogg_uint32_t*)src)[1];
+    src+=stride;
+    dest+=stride;
+  }
+
+#else
+
+    /*
+    Copies 8 rows of 8 bytes from src to dest; both advance by stride.
+            @src
+            <----    stride    ---->
+    0       FFFF FFFF .... .... ....
+    1       FFFF FFFF .... .... ....
+    ...
+    7       FFFF FFFF .... .... ....
+
+
+            @dest
+            <----    stride    ---->
+    0       TTTT TTTT .... .... ....
+    1       TTTT TTTT .... .... ....
+    ...
+    7       TTTT TTTT .... .... ....
+
+
+    */
+    __asm {
+        align 16
+
+        /* Load the parameters into the general registers */
+        mov         eax, src
+        mov         ebx, dest
+        mov         ecx, stride
+
+        /* edi = 3*stride */
+        /* edx = 5*stride */
+        /* esi = 7*stride (stride + 2 * edi) */
+        lea		    edi, [ecx + ecx * 2]
+        lea         edx, [ecx + ecx * 4]
+        lea         esi, [ecx + edi * 2]
+
+        /*
+            TODO: if every addressed row of src and dest were known to
+            be 16 byte aligned, movdqa might be usable and might be
+            faster. That requires that the base pointer is aligned,
+            and that the stride is a multiple of 16.
+            */
+
+        /* Load all 8 rows, 8 bytes each, into xmm0-xmm7 */
+        movq      xmm0, QWORD PTR [eax]
+        movq      xmm1, QWORD PTR [eax + ecx]
+        movq      xmm2, QWORD PTR [eax + ecx * 2]
+        movq      xmm3, QWORD PTR [eax + edi]
+
+        movq      xmm4, QWORD PTR [eax + ecx * 4]
+        movq      xmm5, QWORD PTR [eax + edx]
+        movq      xmm6, QWORD PTR [eax + edi * 2]
+        movq      xmm7, QWORD PTR [eax + esi]
+
+
+        /* Write the 8 rows out to dest at the same row offsets */
+        movq      QWORD PTR [ebx], xmm0
+        movq      QWORD PTR [ebx + ecx], xmm1
+        movq      QWORD PTR [ebx + ecx * 2], xmm2
+        movq      QWORD PTR [ebx + edi], xmm3
+
+        movq      QWORD PTR [ebx + ecx * 4], xmm4
+        movq      QWORD PTR [ebx + edx], xmm5
+        movq      QWORD PTR [ebx + edi * 2], xmm6
+        movq      QWORD PTR [ebx + esi], xmm7
+
+
+
+    };
+
+#endif
+}
+
+static void recon_intra8x8__sse2 (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
+		      ogg_uint32_t LineStep)
+{
+  /* Writes clamp255(ChangePtr[i] + 128) for an 8x8 block into ReconPtr. */
+#if 0
+  ogg_uint32_t i;
+
+  for (i = 8; i; i--){
+    /* Convert the data back to 8 bit unsigned */
+    /* Saturate the output to unsigned 8 bit values */
+    ReconPtr[0] = clamp255( ChangePtr[0] + 128 );
+    ReconPtr[1] = clamp255( ChangePtr[1] + 128 );
+    ReconPtr[2] = clamp255( ChangePtr[2] + 128 );
+    ReconPtr[3] = clamp255( ChangePtr[3] + 128 );
+    ReconPtr[4] = clamp255( ChangePtr[4] + 128 );
+    ReconPtr[5] = clamp255( ChangePtr[5] + 128 );
+    ReconPtr[6] = clamp255( ChangePtr[6] + 128 );
+    ReconPtr[7] = clamp255( ChangePtr[7] + 128 );
+
+    ReconPtr += LineStep;
+    ChangePtr += 8;
+  }
+
+#else
+
+    /*
+        @ChangePtr
+        <--- 8 int16's --->
+    0   HLHL HLHL HLHL HLHL
+    ...
+    7   HLHL HLHL HLHL HLHL
+
+
+        @ReconPtr
+        <----- LineStep ------->
+    0   CCCC CCCC .... .... ....
+    ...
+    7   CCCC CCCC .... .... ....
+    */
+
+    __asm {
+
+        align 16
+
+        mov     eax, ReconPtr
+        mov     ebx, ChangePtr
+        mov     ecx, LineStep
+        mov     edx, V128x16Ptr
+
+        /* Nothing here guarantees 16 byte alignment, so all 128 bit loads */
+        /* use movdqu; a misaligned memory operand to packsswb would fault. */
+        movdqu      xmm7, [edx]
+        /* Each row is 8 int16s, i.e. one full register. packsswb */
+        /* saturates two rows to 16 signed bytes, and the xor with 0x80 */
+        /* then adds the 128 bias, giving the clamped unsigned pixels. */
+        /* Fixed offsets from ebx are used instead of advancing it. */
+
+        /* Rows 0-1 -> xmm0 */
+        movdqu      xmm0, [ebx]
+        movdqu      xmm4, [ebx + 16]
+        packsswb    xmm0, xmm4
+        pxor        xmm0, xmm7
+
+        /* Rows 2-3 -> xmm1 */
+        movdqu      xmm1, [ebx + 32]
+        movdqu      xmm5, [ebx + 48]
+        packsswb    xmm1, xmm5
+        pxor        xmm1, xmm7
+
+        /* Rows 4-5 -> xmm2 */
+        movdqu      xmm2, [ebx + 64]
+        movdqu      xmm6, [ebx + 80]
+        packsswb    xmm2, xmm6
+        pxor        xmm2, xmm7
+
+        /* Rows 6-7 -> xmm3 (xmm4 is free again by now) */
+        movdqu      xmm3, [ebx + 96]
+        movdqu      xmm4, [ebx + 112]
+        packsswb    xmm3, xmm4
+        pxor        xmm3, xmm7
+
+
+        /* Output: low 8 bytes are the even row; shift to reach the odd row */
+
+        /* Rows 0-1 - xmm0 */
+        movq        QWORD PTR [eax], xmm0
+        psrldq      xmm0, 8
+        movq        QWORD PTR [eax + ecx], xmm0
+        lea         eax, [eax + ecx * 2]
+
+        /* Rows 2-3 - xmm1 */
+        movq        QWORD PTR [eax], xmm1
+        psrldq      xmm1, 8
+        movq        QWORD PTR [eax + ecx], xmm1
+        lea         eax, [eax + ecx * 2]
+
+        /* Rows 4-5 - xmm2 */
+        movq        QWORD PTR [eax], xmm2
+        psrldq      xmm2, 8
+        movq        QWORD PTR [eax + ecx], xmm2
+        lea         eax, [eax + ecx * 2]
+
+        /* Rows 6-7 - xmm3 */
+        movq        QWORD PTR [eax], xmm3
+        psrldq      xmm3, 8
+        movq        QWORD PTR [eax + ecx], xmm3
+        /* Last row written; eax does not need advancing again. */
+
+
+    };
+
+#endif
+}
+
+static void recon_inter8x8__sse2 (unsigned char *ReconPtr, unsigned char *RefPtr,
+		      ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
+{
+  /* Plain C despite the __sse2 suffix: adds the residual in ChangePtr
+     to the reference block and passes each sum through clamp255. */
+  ogg_uint32_t row;
+  ogg_uint32_t col;
+
+  for (row = 0; row < 8; row++){
+    /* One row: eight reference pixels plus eight residuals. */
+    for (col = 0; col < 8; col++){
+      ReconPtr[col] = clamp255(RefPtr[col] + ChangePtr[col]);
+    }
+
+    /* Residual rows are 8 wide; both images advance by LineStep. */
+    RefPtr += LineStep;
+    ReconPtr += LineStep;
+    ChangePtr += 8;
+  }
+}
+
+static void recon_inter8x8_half__sse2 (unsigned char *ReconPtr, unsigned char *RefPtr1,
+		           unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
+			   ogg_uint32_t LineStep)
+{
+  /* Plain C despite the __sse2 suffix: the prediction is the truncating
+     average of the two references; the residual is added and clamped. */
+  ogg_uint32_t row;
+  ogg_uint32_t col;
+
+  for (row = 0; row < 8; row++){
+    for (col = 0; col < 8; col++){
+      int Predicted = ((int)RefPtr1[col] + (int)RefPtr2[col]) >> 1;
+      ReconPtr[col] = clamp255(Predicted + ChangePtr[col] );
+    }
+
+    /* Next row: residual rows are 8 wide, image rows LineStep apart. */
+    RefPtr1 += LineStep;
+    RefPtr2 += LineStep;
+    ReconPtr += LineStep;
+    ChangePtr += 8;
+  }
+}
+
+/* Points the recon entries of funcs at the routines defined in this file. */
+void dsp_sse2_recon_init(DspFunctions *funcs)
+{
+  TH_DEBUG("enabling accelerated x86_32 sse2 recon functions.\n");
+  funcs->copy8x8 = copy8x8__sse2;
+  funcs->recon_intra8x8 = recon_intra8x8__sse2;
+  funcs->recon_inter8x8 = recon_inter8x8__sse2;
+  funcs->recon_inter8x8_half = recon_inter8x8_half__sse2;
+}
+



More information about the commits mailing list