[xiph-commits] r11494 - branches/theora-playtime/lib/x86_32_vs
illiminable at svn.xiph.org
illiminable at svn.xiph.org
Thu Jun 1 12:42:31 PDT 2006
Author: illiminable
Date: 2006-06-01 12:42:23 -0700 (Thu, 01 Jun 2006)
New Revision: 11494
Added:
branches/theora-playtime/lib/x86_32_vs/recon_sse2.c
Modified:
branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c
branches/theora-playtime/lib/x86_32_vs/recon_mmx.c
Log:
* A couple of SSE2 functions to start with
Modified: branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c 2006-06-01 15:15:18 UTC (rev 11493)
+++ branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c 2006-06-01 19:42:23 UTC (rev 11494)
@@ -37,23 +37,23 @@
//Make non-zero to use the C-version
#if 0
- int i;
-
- /* For each block row */
- for (i=8; i; i--) {
- DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], ReconPtr[0]);
- DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], ReconPtr[1]);
- DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], ReconPtr[2]);
- DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], ReconPtr[3]);
- DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], ReconPtr[4]);
- DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], ReconPtr[5]);
- DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], ReconPtr[6]);
- DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], ReconPtr[7]);
-
- /* Start next row */
- FiltPtr += PixelsPerLine;
- ReconPtr += ReconPixelsPerLine;
- DctInputPtr += 8;
+ int i;
+
+ /* For each block row */
+ for (i=8; i; i--) {
+ DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], ReconPtr[0]);
+ DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], ReconPtr[1]);
+ DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], ReconPtr[2]);
+ DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], ReconPtr[3]);
+ DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], ReconPtr[4]);
+ DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], ReconPtr[5]);
+ DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], ReconPtr[6]);
+ DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], ReconPtr[7]);
+
+ /* Start next row */
+ FiltPtr += PixelsPerLine;
+ ReconPtr += ReconPixelsPerLine;
+ DctInputPtr += 8;
}
#else
__asm {
@@ -248,25 +248,25 @@
{
#if 0
- int i;
- /* For each block row */
- for (i=8; i; i--) {
- /* INTRA mode so code raw image data */
- /* We convert the data to 8 bit signed (by subtracting 128) as
- this reduces the internal precision requirments in the DCT
- transform. */
- DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], 128);
- DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], 128);
- DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], 128);
- DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], 128);
- DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], 128);
- DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], 128);
- DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], 128);
- DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], 128);
-
- /* Start next row */
- FiltPtr += PixelsPerLine;
- DctInputPtr += 8;
+ int i;
+ /* For each block row */
+ for (i=8; i; i--) {
+ /* INTRA mode so code raw image data */
+ /* We convert the data to 8 bit signed (by subtracting 128) as
+ this reduces the internal precision requirments in the DCT
+ transform. */
+ DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], 128);
+ DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], 128);
+ DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], 128);
+ DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], 128);
+ DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], 128);
+ DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], 128);
+ DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], 128);
+ DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], 128);
+
+ /* Start next row */
+ FiltPtr += PixelsPerLine;
+ DctInputPtr += 8;
}
#else
@@ -422,24 +422,24 @@
{
#if 0
- int i;
-
- /* For each block row */
- for (i=8; i; i--) {
- DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], DSP_OP_AVG (ReconPtr1[0], ReconPtr2[0]));
- DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], DSP_OP_AVG (ReconPtr1[1], ReconPtr2[1]));
- DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], DSP_OP_AVG (ReconPtr1[2], ReconPtr2[2]));
- DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], DSP_OP_AVG (ReconPtr1[3], ReconPtr2[3]));
- DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], DSP_OP_AVG (ReconPtr1[4], ReconPtr2[4]));
- DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], DSP_OP_AVG (ReconPtr1[5], ReconPtr2[5]));
- DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], DSP_OP_AVG (ReconPtr1[6], ReconPtr2[6]));
- DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], DSP_OP_AVG (ReconPtr1[7], ReconPtr2[7]));
-
- /* Start next row */
- FiltPtr += PixelsPerLine;
- ReconPtr1 += ReconPixelsPerLine;
- ReconPtr2 += ReconPixelsPerLine;
- DctInputPtr += 8;
+ int i;
+
+ /* For each block row */
+ for (i=8; i; i--) {
+ DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], DSP_OP_AVG (ReconPtr1[0], ReconPtr2[0]));
+ DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], DSP_OP_AVG (ReconPtr1[1], ReconPtr2[1]));
+ DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], DSP_OP_AVG (ReconPtr1[2], ReconPtr2[2]));
+ DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], DSP_OP_AVG (ReconPtr1[3], ReconPtr2[3]));
+ DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], DSP_OP_AVG (ReconPtr1[4], ReconPtr2[4]));
+ DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], DSP_OP_AVG (ReconPtr1[5], ReconPtr2[5]));
+ DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], DSP_OP_AVG (ReconPtr1[6], ReconPtr2[6]));
+ DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], DSP_OP_AVG (ReconPtr1[7], ReconPtr2[7]));
+
+ /* Start next row */
+ FiltPtr += PixelsPerLine;
+ ReconPtr1 += ReconPixelsPerLine;
+ ReconPtr2 += ReconPixelsPerLine;
+ DctInputPtr += 8;
}
#else
@@ -705,21 +705,21 @@
{
#if 0
- ogg_uint32_t SadValue;
- ogg_uint32_t SadValue1;
-
- SadValue = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) +
- DSP_OP_ABS_DIFF (Src1[1], Src2[1]) +
- DSP_OP_ABS_DIFF (Src1[2], Src2[2]) +
- DSP_OP_ABS_DIFF (Src1[3], Src2[3]);
-
- SadValue1 = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) +
- DSP_OP_ABS_DIFF (Src1[5], Src2[5]) +
- DSP_OP_ABS_DIFF (Src1[6], Src2[6]) +
- DSP_OP_ABS_DIFF (Src1[7], Src2[7]);
-
- SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1;
-
+ ogg_uint32_t SadValue;
+ ogg_uint32_t SadValue1;
+
+ SadValue = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) +
+ DSP_OP_ABS_DIFF (Src1[1], Src2[1]) +
+ DSP_OP_ABS_DIFF (Src1[2], Src2[2]) +
+ DSP_OP_ABS_DIFF (Src1[3], Src2[3]);
+
+ SadValue1 = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) +
+ DSP_OP_ABS_DIFF (Src1[5], Src2[5]) +
+ DSP_OP_ABS_DIFF (Src1[6], Src2[6]) +
+ DSP_OP_ABS_DIFF (Src1[7], Src2[7]);
+
+ SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1;
+
return SadValue;
#else
@@ -784,46 +784,46 @@
{
#if 0
- ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0};
- ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0};
- ogg_uint32_t MaxSad = 0;
- ogg_uint32_t i;
-
- for ( i = 0; i < 4; i++ ){
- SadValue[0] += abs(Src1[0] - Src2[0]);
- SadValue[1] += abs(Src1[1] - Src2[1]);
- SadValue[2] += abs(Src1[2] - Src2[2]);
- SadValue[3] += abs(Src1[3] - Src2[3]);
- SadValue[4] += abs(Src1[4] - Src2[4]);
- SadValue[5] += abs(Src1[5] - Src2[5]);
- SadValue[6] += abs(Src1[6] - Src2[6]);
- SadValue[7] += abs(Src1[7] - Src2[7]);
-
- Src1 += stride;
- Src2 += stride;
- }
-
- for ( i = 0; i < 4; i++ ){
- SadValue2[0] += abs(Src1[0] - Src2[0]);
- SadValue2[1] += abs(Src1[1] - Src2[1]);
- SadValue2[2] += abs(Src1[2] - Src2[2]);
- SadValue2[3] += abs(Src1[3] - Src2[3]);
- SadValue2[4] += abs(Src1[4] - Src2[4]);
- SadValue2[5] += abs(Src1[5] - Src2[5]);
- SadValue2[6] += abs(Src1[6] - Src2[6]);
- SadValue2[7] += abs(Src1[7] - Src2[7]);
-
- Src1 += stride;
- Src2 += stride;
- }
-
- for ( i = 0; i < 8; i++ ){
- if ( SadValue[i] > MaxSad )
- MaxSad = SadValue[i];
- if ( SadValue2[i] > MaxSad )
- MaxSad = SadValue2[i];
- }
-
+ ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0};
+ ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0};
+ ogg_uint32_t MaxSad = 0;
+ ogg_uint32_t i;
+
+ for ( i = 0; i < 4; i++ ){
+ SadValue[0] += abs(Src1[0] - Src2[0]);
+ SadValue[1] += abs(Src1[1] - Src2[1]);
+ SadValue[2] += abs(Src1[2] - Src2[2]);
+ SadValue[3] += abs(Src1[3] - Src2[3]);
+ SadValue[4] += abs(Src1[4] - Src2[4]);
+ SadValue[5] += abs(Src1[5] - Src2[5]);
+ SadValue[6] += abs(Src1[6] - Src2[6]);
+ SadValue[7] += abs(Src1[7] - Src2[7]);
+
+ Src1 += stride;
+ Src2 += stride;
+ }
+
+ for ( i = 0; i < 4; i++ ){
+ SadValue2[0] += abs(Src1[0] - Src2[0]);
+ SadValue2[1] += abs(Src1[1] - Src2[1]);
+ SadValue2[2] += abs(Src1[2] - Src2[2]);
+ SadValue2[3] += abs(Src1[3] - Src2[3]);
+ SadValue2[4] += abs(Src1[4] - Src2[4]);
+ SadValue2[5] += abs(Src1[5] - Src2[5]);
+ SadValue2[6] += abs(Src1[6] - Src2[6]);
+ SadValue2[7] += abs(Src1[7] - Src2[7]);
+
+ Src1 += stride;
+ Src2 += stride;
+ }
+
+ for ( i = 0; i < 8; i++ ){
+ if ( SadValue[i] > MaxSad )
+ MaxSad = SadValue[i];
+ if ( SadValue2[i] > MaxSad )
+ MaxSad = SadValue2[i];
+ }
+
return MaxSad;
#else
ogg_uint32_t MaxSad;
@@ -912,24 +912,24 @@
{
#if 0
- ogg_uint32_t i;
- ogg_uint32_t sad = 0;
-
- for (i=8; i; i--) {
- sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
- sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
- sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]);
- sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]);
- sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]);
- sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]);
- sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]);
- sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]);
-
- /* Step to next row of block. */
- ptr1 += stride1;
- ptr2 += stride2;
- }
-
+ ogg_uint32_t i;
+ ogg_uint32_t sad = 0;
+
+ for (i=8; i; i--) {
+ sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
+ sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
+ sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]);
+ sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]);
+ sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]);
+ sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]);
+ sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]);
+ sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]);
+
+ /* Step to next row of block. */
+ ptr1 += stride1;
+ ptr2 += stride2;
+ }
+
return sad;
#else
ogg_uint32_t DiffVal;
@@ -1113,27 +1113,27 @@
ogg_uint32_t thres)
{
#if 0
- ogg_uint32_t i;
- ogg_uint32_t sad = 0;
-
- for (i=8; i; i--) {
- sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
- sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
- sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]);
- sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]);
- sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]);
- sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]);
- sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]);
- sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]);
-
- if (sad > thres )
- break;
-
- /* Step to next row of block. */
- ptr1 += stride1;
- ptr2 += stride2;
- }
-
+ ogg_uint32_t i;
+ ogg_uint32_t sad = 0;
+
+ for (i=8; i; i--) {
+ sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
+ sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
+ sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]);
+ sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]);
+ sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]);
+ sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]);
+ sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]);
+ sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]);
+
+ if (sad > thres )
+ break;
+
+ /* Step to next row of block. */
+ ptr1 += stride1;
+ ptr2 += stride2;
+ }
+
return sad;
#else
return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
@@ -1147,28 +1147,28 @@
ogg_uint32_t thres)
{
#if 0
- ogg_uint32_t i;
- ogg_uint32_t sad = 0;
-
- for (i=8; i; i--) {
- sad += DSP_OP_ABS_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0]));
- sad += DSP_OP_ABS_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1]));
- sad += DSP_OP_ABS_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2]));
- sad += DSP_OP_ABS_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3]));
- sad += DSP_OP_ABS_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4]));
- sad += DSP_OP_ABS_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5]));
- sad += DSP_OP_ABS_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6]));
- sad += DSP_OP_ABS_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7]));
-
- if ( sad > thres )
- break;
-
- /* Step to next row of block. */
- SrcData += SrcStride;
- RefDataPtr1 += RefStride;
- RefDataPtr2 += RefStride;
- }
-
+ ogg_uint32_t i;
+ ogg_uint32_t sad = 0;
+
+ for (i=8; i; i--) {
+ sad += DSP_OP_ABS_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0]));
+ sad += DSP_OP_ABS_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1]));
+ sad += DSP_OP_ABS_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2]));
+ sad += DSP_OP_ABS_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3]));
+ sad += DSP_OP_ABS_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4]));
+ sad += DSP_OP_ABS_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5]));
+ sad += DSP_OP_ABS_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6]));
+ sad += DSP_OP_ABS_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7]));
+
+ if ( sad > thres )
+ break;
+
+ /* Step to next row of block. */
+ SrcData += SrcStride;
+ RefDataPtr1 += RefStride;
+ RefDataPtr2 += RefStride;
+ }
+
return sad;
#else
ogg_uint32_t DiffVal;
@@ -1239,34 +1239,34 @@
static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
{
#if 0
- ogg_uint32_t i;
- ogg_uint32_t XSum=0;
- ogg_uint32_t XXSum=0;
-
- for (i=8; i; i--) {
- /* Examine alternate pixel locations. */
- XSum += DataPtr[0];
- XXSum += DataPtr[0]*DataPtr[0];
- XSum += DataPtr[1];
- XXSum += DataPtr[1]*DataPtr[1];
- XSum += DataPtr[2];
- XXSum += DataPtr[2]*DataPtr[2];
- XSum += DataPtr[3];
- XXSum += DataPtr[3]*DataPtr[3];
- XSum += DataPtr[4];
- XXSum += DataPtr[4]*DataPtr[4];
- XSum += DataPtr[5];
- XXSum += DataPtr[5]*DataPtr[5];
- XSum += DataPtr[6];
- XXSum += DataPtr[6]*DataPtr[6];
- XSum += DataPtr[7];
- XXSum += DataPtr[7]*DataPtr[7];
-
- /* Step to next row of block. */
- DataPtr += Stride;
- }
-
- /* Compute population variance as mis-match metric. */
+ ogg_uint32_t i;
+ ogg_uint32_t XSum=0;
+ ogg_uint32_t XXSum=0;
+
+ for (i=8; i; i--) {
+ /* Examine alternate pixel locations. */
+ XSum += DataPtr[0];
+ XXSum += DataPtr[0]*DataPtr[0];
+ XSum += DataPtr[1];
+ XXSum += DataPtr[1]*DataPtr[1];
+ XSum += DataPtr[2];
+ XXSum += DataPtr[2]*DataPtr[2];
+ XSum += DataPtr[3];
+ XXSum += DataPtr[3]*DataPtr[3];
+ XSum += DataPtr[4];
+ XXSum += DataPtr[4]*DataPtr[4];
+ XSum += DataPtr[5];
+ XXSum += DataPtr[5]*DataPtr[5];
+ XSum += DataPtr[6];
+ XXSum += DataPtr[6]*DataPtr[6];
+ XSum += DataPtr[7];
+ XXSum += DataPtr[7]*DataPtr[7];
+
+ /* Step to next row of block. */
+ DataPtr += Stride;
+ }
+
+ /* Compute population variance as mis-match metric. */
return (( (XXSum<<6) - XSum*XSum ) );
#else
ogg_uint32_t XSum;
@@ -1334,50 +1334,50 @@
{
#if 0
- ogg_uint32_t i;
- ogg_uint32_t XSum=0;
- ogg_uint32_t XXSum=0;
- ogg_int32_t DiffVal;
-
- for (i=8; i; i--) {
- DiffVal = DSP_OP_DIFF (SrcData[0], RefDataPtr[0]);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = DSP_OP_DIFF (SrcData[1], RefDataPtr[1]);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = DSP_OP_DIFF (SrcData[2], RefDataPtr[2]);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = DSP_OP_DIFF (SrcData[3], RefDataPtr[3]);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = DSP_OP_DIFF (SrcData[4], RefDataPtr[4]);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = DSP_OP_DIFF (SrcData[5], RefDataPtr[5]);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = DSP_OP_DIFF (SrcData[6], RefDataPtr[6]);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = DSP_OP_DIFF (SrcData[7], RefDataPtr[7]);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- /* Step to next row of block. */
- SrcData += SrcStride;
- RefDataPtr += RefStride;
- }
-
- /* Compute and return population variance as mis-match metric. */
+ ogg_uint32_t i;
+ ogg_uint32_t XSum=0;
+ ogg_uint32_t XXSum=0;
+ ogg_int32_t DiffVal;
+
+ for (i=8; i; i--) {
+ DiffVal = DSP_OP_DIFF (SrcData[0], RefDataPtr[0]);
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF (SrcData[1], RefDataPtr[1]);
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF (SrcData[2], RefDataPtr[2]);
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF (SrcData[3], RefDataPtr[3]);
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF (SrcData[4], RefDataPtr[4]);
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF (SrcData[5], RefDataPtr[5]);
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF (SrcData[6], RefDataPtr[6]);
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF (SrcData[7], RefDataPtr[7]);
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ /* Step to next row of block. */
+ SrcData += SrcStride;
+ RefDataPtr += RefStride;
+ }
+
+ /* Compute and return population variance as mis-match metric. */
return (( (XXSum<<6) - XSum*XSum ));
#else
ogg_uint32_t XSum;
@@ -1455,51 +1455,51 @@
unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
{
#if 0
- ogg_uint32_t i;
- ogg_uint32_t XSum=0;
- ogg_uint32_t XXSum=0;
- ogg_int32_t DiffVal;
-
- for (i=8; i; i--) {
- DiffVal = DSP_OP_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0]));
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = DSP_OP_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1]));
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = DSP_OP_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2]));
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = DSP_OP_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3]));
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = DSP_OP_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4]));
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = DSP_OP_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5]));
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = DSP_OP_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6]));
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = DSP_OP_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7]));
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- /* Step to next row of block. */
- SrcData += SrcStride;
- RefDataPtr1 += RefStride;
- RefDataPtr2 += RefStride;
- }
-
- /* Compute and return population variance as mis-match metric. */
+ ogg_uint32_t i;
+ ogg_uint32_t XSum=0;
+ ogg_uint32_t XXSum=0;
+ ogg_int32_t DiffVal;
+
+ for (i=8; i; i--) {
+ DiffVal = DSP_OP_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0]));
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1]));
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2]));
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3]));
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4]));
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5]));
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6]));
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ DiffVal = DSP_OP_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7]));
+ XSum += DiffVal;
+ XXSum += DiffVal*DiffVal;
+
+ /* Step to next row of block. */
+ SrcData += SrcStride;
+ RefDataPtr1 += RefStride;
+ RefDataPtr2 += RefStride;
+ }
+
+ /* Compute and return population variance as mis-match metric. */
return (( (XXSum<<6) - XSum*XSum ));
#else
ogg_uint32_t XSum;
Modified: branches/theora-playtime/lib/x86_32_vs/recon_mmx.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/recon_mmx.c 2006-06-01 15:15:18 UTC (rev 11493)
+++ branches/theora-playtime/lib/x86_32_vs/recon_mmx.c 2006-06-01 19:42:23 UTC (rev 11494)
@@ -8,11 +8,6 @@
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
* by the Xiph.Org Foundation http://www.xiph.org/ *
* *
- ********************************************************************
-
- function:
- last mod: $Id: reconstruct.c,v 1.6 2003/12/03 08:59:41 arc Exp $
-
********************************************************************/
#include "codec_internal.h"
Added: branches/theora-playtime/lib/x86_32_vs/recon_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/recon_sse2.c 2006-06-01 15:15:18 UTC (rev 11493)
+++ branches/theora-playtime/lib/x86_32_vs/recon_sse2.c 2006-06-01 19:42:23 UTC (rev 11494)
@@ -0,0 +1,273 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************/
+
+#include "codec_internal.h"
+#include "dsp.h"
+#include "cpu.h"
+
+static const unsigned int V128x16[4] = { 0x80808080, 0x80808080, 0x80808080, 0x80808080 }; /* 16 bytes, each 0x80 (128) */
+static const unsigned int* V128x16Ptr = V128x16; /* address of the table; the inline asm in recon_intra8x8__sse2 loads it into edx */
+
+static void copy8x8__sse2 (unsigned char *src,
+ unsigned char *dest,
+ unsigned int stride)
+{
+#if 0
+ int j;
+ for ( j = 0; j < 8; j++ ){
+ ((ogg_uint32_t*)dest)[0] = ((ogg_uint32_t*)src)[0];
+ ((ogg_uint32_t*)dest)[1] = ((ogg_uint32_t*)src)[1];
+ src+=stride;
+ dest+=stride;
+ }
+ /* Disabled C reference above: copies each 8 byte row as two 32 bit words */
+#else
+ /* Active path: inline assembly that moves each row with one 8 byte movq */
+ /*
+ Copies an 8x8 block of bytes from src to dest: 8 rows of 8 bytes each,
+ with consecutive rows `stride` bytes apart in both buffers.
+ @src
+ <---- stride ---->
+ 0 FFFF FFFF .... .... ....
+ 1 FFFF FFFF .... .... ....
+ ...
+ 7 FFFF FFFF .... .... ....
+
+ @dest
+ <---- stride ---->
+ 0 TTTT TTTT .... .... ....
+ 1 TTTT TTTT .... .... ....
+ ...
+ 7 TTTT TTTT .... .... ....
+
+ All eight rows are loaded into xmm0-xmm7 before any row is stored.
+ */
+ __asm {
+ align 16
+
+ /* Load the parameters into the general registers */
+ mov eax, src
+ mov ebx, dest
+ mov ecx, stride
+
+ /* edi = 3*stride */
+ /* edx = 5*stride */
+ /* esi = 7*stride (computed as stride + edi * 2) */
+ lea edi, [ecx + ecx * 2]
+ lea edx, [ecx + ecx * 4]
+ lea esi, [ecx + edi * 2]
+ /* 6*stride needs no register of its own: it is addressed as edi * 2 */
+ /*
+ TODO: If every addressed row of src and dest could be guaranteed to be
+ 16 byte aligned, movdqa might be usable and might be faster. That
+ requires that the base pointers are aligned and that the stride is a
+ multiple of 16.
+ */
+ /* NOTE(review): ebx, esi and edi are modified in this block - confirm the compiler saves and restores them around __asm */
+ /* Load all 8 rows: xmm0..xmm7 = rows 0..7 of src */
+ movq xmm0, QWORD PTR [eax]
+ movq xmm1, QWORD PTR [eax + ecx]
+ movq xmm2, QWORD PTR [eax + ecx * 2]
+ movq xmm3, QWORD PTR [eax + edi]
+
+ movq xmm4, QWORD PTR [eax + ecx * 4]
+ movq xmm5, QWORD PTR [eax + edx]
+ movq xmm6, QWORD PTR [eax + edi * 2]
+ movq xmm7, QWORD PTR [eax + esi]
+
+
+ /* Write out all 8 registers to rows 0..7 of dest, at the same row offsets */
+ movq QWORD PTR [ebx], xmm0
+ movq QWORD PTR [ebx + ecx], xmm1
+ movq QWORD PTR [ebx + ecx * 2], xmm2
+ movq QWORD PTR [ebx + edi], xmm3
+
+ movq QWORD PTR [ebx + ecx * 4], xmm4
+ movq QWORD PTR [ebx + edx], xmm5
+ movq QWORD PTR [ebx + edi * 2], xmm6
+ movq QWORD PTR [ebx + esi], xmm7
+
+
+
+ };
+
+#endif
+}
+
+static void recon_intra8x8__sse2 (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
+                                  ogg_uint32_t LineStep)
+{
+  /* Intra reconstruction of one 8x8 block: adds 128 to each of the 64 int16
+     values at ChangePtr, saturates to 0..255 and stores them as 8 rows of
+     8 bytes at ReconPtr, consecutive rows LineStep bytes apart. */
+#if 0
+  /* Plain C reference version */
+  ogg_uint32_t i;
+  for (i = 8; i; i--){
+    /* Convert back to 8 bit unsigned, saturating the result to 0..255 */
+    ReconPtr[0] = clamp255( ChangePtr[0] + 128 );
+    ReconPtr[1] = clamp255( ChangePtr[1] + 128 );
+    ReconPtr[2] = clamp255( ChangePtr[2] + 128 );
+    ReconPtr[3] = clamp255( ChangePtr[3] + 128 );
+    ReconPtr[4] = clamp255( ChangePtr[4] + 128 );
+    ReconPtr[5] = clamp255( ChangePtr[5] + 128 );
+    ReconPtr[6] = clamp255( ChangePtr[6] + 128 );
+    ReconPtr[7] = clamp255( ChangePtr[7] + 128 );
+
+    ReconPtr += LineStep;
+    ChangePtr += 8;
+  }
+#else
+  /*
+    @ChangePtr
+    <--- 8 int16's --->
+    0 HLHL HLHL HLHL HLHL
+    ...
+    7 HLHL HLHL HLHL HLHL
+
+    @ReconPtr
+    <----- LineStep ------->
+    0 CCCC CCCC .... .... ....
+    ...
+    7 CCCC CCCC .... .... ....
+
+    packsswb saturates each int16 to -128..127; XOR with 0x80 then applies
+    the +128 bias, giving the same bytes as saturating (x + 128) to 0..255.
+  */
+  __asm {
+    align 16
+
+    mov eax, ReconPtr
+    mov ebx, ChangePtr
+    mov ecx, LineStep
+    mov edx, V128x16Ptr
+
+    /* xmm7 = sixteen 0x80 bytes */
+    movdqu xmm7, [edx]
+
+    /* Each result register receives two rows: 8 int16 are loaded, then
+       packed together with the next 8 int16 into 16 bytes. */
+    /* Every 128 bit memory access uses movdqu, so ChangePtr needs no 16 byte
+       alignment; packsswb with a memory operand would fault on an
+       unaligned address, so the second row goes through xmm4 first. */
+    /* TODO: if ChangePtr is known to be 16 byte aligned, movdqa and a
+       packsswb memory operand could be used instead. */
+
+    /* Rows 0 and 1 -> xmm0 */
+    movdqu xmm0, [ebx]
+    movdqu xmm4, [ebx + 16]
+    packsswb xmm0, xmm4
+    pxor xmm0, xmm7
+
+    /* Rows 2 and 3 -> xmm1 */
+    movdqu xmm1, [ebx + 32]
+    movdqu xmm4, [ebx + 48]
+    packsswb xmm1, xmm4
+    pxor xmm1, xmm7
+
+    /* Rows 4 and 5 -> xmm2 */
+    movdqu xmm2, [ebx + 64]
+    movdqu xmm4, [ebx + 80]
+    packsswb xmm2, xmm4
+    pxor xmm2, xmm7
+
+    /* Rows 6 and 7 -> xmm3 */
+    movdqu xmm3, [ebx + 96]
+    movdqu xmm4, [ebx + 112]
+    packsswb xmm3, xmm4
+    pxor xmm3, xmm7
+
+    /* Store: the low 8 bytes are the even row; the high 8 bytes are then
+       shifted down and stored as the odd row. */
+
+    /* Rows 0 and 1 */
+    movq QWORD PTR [eax], xmm0
+    psrldq xmm0, 8
+    movq QWORD PTR [eax + ecx], xmm0
+    lea eax, [eax + ecx * 2]
+
+    /* Rows 2 and 3 */
+    movq QWORD PTR [eax], xmm1
+    psrldq xmm1, 8
+    movq QWORD PTR [eax + ecx], xmm1
+    lea eax, [eax + ecx * 2]
+
+    /* Rows 4 and 5 */
+    movq QWORD PTR [eax], xmm2
+    psrldq xmm2, 8
+    movq QWORD PTR [eax + ecx], xmm2
+    lea eax, [eax + ecx * 2]
+
+    /* Rows 6 and 7 */
+    movq QWORD PTR [eax], xmm3
+    psrldq xmm3, 8
+    movq QWORD PTR [eax + ecx], xmm3
+
+  };
+
+#endif
+}
+
+static void recon_inter8x8__sse2 (unsigned char *ReconPtr, unsigned char *RefPtr,
+                                  ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
+{
+  /* Inter reconstruction of one 8x8 block, written in plain C (no SSE2 code
+     here): ReconPtr[x] = clamp255(RefPtr[x] + ChangePtr[x]) for 8 rows of
+     8 pixels. ReconPtr and RefPtr rows are LineStep bytes apart; ChangePtr
+     rows are 8 consecutive int16 values. */
+  ogg_uint32_t row, col;
+
+  for (row = 0; row < 8; row++){
+    for (col = 0; col < 8; col++){
+      ReconPtr[col] = clamp255(RefPtr[col] + ChangePtr[col]);
+    }
+
+    /* Advance every pointer to the start of its next row */
+    ChangePtr += 8;
+    ReconPtr += LineStep;
+    RefPtr += LineStep;
+  }
+}
+
+static void recon_inter8x8_half__sse2 (unsigned char *ReconPtr, unsigned char *RefPtr1,
+                                       unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
+                                       ogg_uint32_t LineStep)
+{
+  /* Two-reference inter reconstruction of one 8x8 block, written in plain C
+     (no SSE2 code here): each output pixel is clamp255 of the truncating
+     average of the two reference pixels plus the int16 value from ChangePtr.
+     ReconPtr, RefPtr1 and RefPtr2 rows are LineStep bytes apart. */
+  ogg_uint32_t row, col;
+
+  for (row = 0; row < 8; row++){
+    for (col = 0; col < 8; col++){
+      int avg = ((int)RefPtr1[col] + (int)RefPtr2[col]) >> 1;
+      ReconPtr[col] = clamp255(avg + ChangePtr[col] );
+    }
+
+    ChangePtr += 8;
+    ReconPtr += LineStep;
+    RefPtr1 += LineStep;
+    RefPtr2 += LineStep;
+  }
+}
+
+
+void dsp_sse2_recon_init(DspFunctions *funcs)
+{ /* Points the copy/recon entries of *funcs at the routines defined in this file. */
+ TH_DEBUG("enabling accelerated x86_32 sse2 recon functions.\n");
+ funcs->copy8x8 = copy8x8__sse2; /* inline-asm SSE2 implementation */
+ funcs->recon_intra8x8 = recon_intra8x8__sse2; /* inline-asm SSE2 implementation */
+ funcs->recon_inter8x8 = recon_inter8x8__sse2; /* NOTE: body is currently plain C */
+ funcs->recon_inter8x8_half = recon_inter8x8_half__sse2; /* NOTE: body is currently plain C */
+}
+
More information about the commits
mailing list