[xiph-commits] r11545 - in branches/theora-playtime: lib
lib/x86_32_vs win32/VS2005/libtheora
illiminable at svn.xiph.org
Thu Jun 8 07:50:22 PDT 2006
Author: illiminable
Date: 2006-06-08 07:50:13 -0700 (Thu, 08 Jun 2006)
New Revision: 11545
Modified:
branches/theora-playtime/lib/scan.c
branches/theora-playtime/lib/x86_32_vs/perf_helper.h
branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
Log:
* Temporary work on RowDiffScan + ApplyPakLowPass in scan.c; the implementation is still slightly incorrect.
Modified: branches/theora-playtime/lib/scan.c
===================================================================
--- branches/theora-playtime/lib/scan.c 2006-06-08 10:49:12 UTC (rev 11544)
+++ branches/theora-playtime/lib/scan.c 2006-06-08 14:50:13 UTC (rev 11545)
@@ -20,7 +20,13 @@
#include <string.h>
#include "codec_internal.h"
#include "dsp.h"
+#include "perf_helper.h"
+
+
+static unsigned __int64 perf_rds_datmf_time = 0;
+static unsigned __int64 perf_rds_datmf_count = 0;
+static unsigned __int64 perf_rds_datmf_min = -1;
//#include "perf_helper.h"
#define MAX_SEARCH_LINE_LEN 7
@@ -682,6 +688,316 @@
}
+
+static void ApplyPakLowPass_Vectorised( PP_INSTANCE *ppi,
+ unsigned char * SrcPtr,
+ unsigned char * OutputPtr)
+{
+
+#if 0
+ /* Scalar version, kept for reference: each output is the average of the
+    8 neighbours of pixel i (the centre pixel itself is excluded). */
+ long s = ppi->PlaneStride; /* Note the use of stride not width. */
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ OutputPtr[i] = ( ( (ogg_uint32_t)SrcPtr[i-s-1] +
+ (ogg_uint32_t)SrcPtr[i-s] +
+ (ogg_uint32_t)SrcPtr[i-s+1] +
+ (ogg_uint32_t)SrcPtr[i-1] +
+ (ogg_uint32_t)SrcPtr[i+1] +
+ (ogg_uint32_t)SrcPtr[i+s-1] +
+ (ogg_uint32_t)SrcPtr[i+s] +
+ (ogg_uint32_t)SrcPtr[i+s+1] ) >> 3 );
+ }
+#else
+
+ /*
+ .... .... .... .... XXXX XXXX .... .... .... ....
+
+
+
+
+ .... .... .... ...1 23.. ..ab c... .... .... ....
+ .... .... .... ...4 X5.. ..dY e... .... .... ....
+ .... .... .... ...6 78.. ..fg h... .... .... ....
+
+
+ //Different numbering below
+ //Showing per row for the top and bottom rows
+
+ 1234567abc
+
+ desired,
+ 1+2+3 = A
+ 2+3+4 = B
+ 3+4+5 = C
+ 4+5+6 = D
+ 5+6+7 = E
+ 6+7+a = F
+ 7+a+b = G
+ a+b+c = H
+
+ 1 2 3 4 5 6 7 a | b c
+ + _ 1 2 3 4 5 6 7 | a b c
+ -------------------------------------------------------
+ 1 1+2 2+3 3+4 4+5 5+6 6+7 7+a |a+b b+c c
+
+ + 2 3 4 5 6 7 a | b c _
+ -------------------------------------------------------
+ 1+2 A B C D E F | G H
+
+
+
+ //Showing per row for the middle row
+
+ 1234567abc
+
+ desired,
+ 1+3 = A
+ 2+4 = B
+ 3+5 = C
+ 4+6 = D
+ 5+7 = E
+ 6+a = F
+ 7+b = G
+ a+c = H
+
+
+ 1 2 3 4 5 6 7 a | b c
+ + _ _ 1 2 3 4 5 6 | 7 a b c
+ -------------------------------------------------------
+ A B C D E F G H
+
+
+ */
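+ /* In C terms the diagram works out, per output pixel i (i = 0..7), to the
+    following (illustrative sketch only; 16-bit intermediates assumed, with
+    rowN = SrcPtr - 1 + (N-1)*PlaneStride):
+
+        top[i] = row0[i] + row0[i+1] + row0[i+2];
+        mid[i] = row1[i] + row1[i+2];             (centre pixel excluded)
+        bot[i] = row2[i] + row2[i+1] + row2[i+2];
+        OutputPtr[i] = (top[i] + mid[i] + bot[i]) >> 3;
+
+    The shift-and-add register sequences below build these sums for all 8
+    outputs at once. */
+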
+
+ static __declspec(align(16)) unsigned long Low6BytesMask[4] = { 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 };
+ static unsigned char* Low6BytesMaskPtr = (unsigned char*)Low6BytesMask;
+ long stride = ppi->PlaneStride;
+ unsigned char* SrcPtrTopLeft = SrcPtr - stride - 1;
+ __asm {
+ align 16
+
+ mov esi, SrcPtrTopLeft
+ mov eax, Low6BytesMaskPtr
+ mov ecx, stride
+ mov edi, OutputPtr
+
+ pxor xmm0, xmm0
+ pcmpeqw xmm6, xmm6 /* All 1's */
+
+ /***************************************/
+ /* TOP ROW OF THE 8 SURROUNDING PIXELS */
+ /***************************************/
+
+ /* There are 10 bytes. Read the first 8 into the first register; after the
+    shifting there will be 6 usable results. Start the second register at
+    plus 2 so it also holds 8 bytes, 6 of which overlap the first. This
+    stops us reading past the block we are supposed to be looking at, and
+    since we operate on the whole register anyway, it doesn't matter that
+    only 2 of its results are new */
+
+ movq xmm1, QWORD PTR [esi]
+ movq xmm2, QWORD PTR [esi + 2] /* this one partly overlaps */
+ movdqa xmm7, [eax]
+
+
+ /* Expand to 16 bits */
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+ /* Shift all 8 items right by 1 lot of 16 bits to get the intermediate sums */
+ psrldq xmm1, 2
+ psrldq xmm2, 2
+ paddw xmm1, xmm3
+ paddw xmm2, xmm4
+
+ /* Shift right by 1 lot of 16 to get the intermediate triple sums */
+ pslldq xmm3, 2
+ pslldq xmm4, 2
+ paddw xmm1, xmm3
+ paddw xmm2, xmm4
+
+ /* Now have 6 of the triple sums in words 1-6 of the first register
+    (words 0 and 7 hold junk), and the final 2 triple sums in words 5
+    and 6 of the second register */
+
+
+ /* Merge the 8 results into 1 register */
+ /* Shift words 1-6 to positions 0-5 */
+ psrldq xmm1, 2
+ /* Shift words 5 and 6 to positions 6 and 7 - since we don't care about
+    any of the other positions in this register we can use the 64-bit
+    qword shift, which is faster than the 128-bit dq one */
+ psllq xmm2, 16
+
+ /* Clear the high 32 bits in the first register */
+ pand xmm1, xmm7
+
+ /* Create the inverse mask -- xmm6 = ~xmm7 */
+ pxor xmm6, xmm7
+
+ /* Clear the low 6 words of the second register */
+ pand xmm2, xmm6
+
+ /* First register now contains all 8 triple sums ie. the sum of the top 3 pixels
+ in each of the eight 3x3 adjacent blocks */
+ por xmm1, xmm2
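+
+ /* In C terms this mask merge is (sketch):
+        result = (sums_0_to_5 & mask) | (sums_6_and_7 & ~mask);
+    keeping words 0-5 from the first register and words 6-7 from the second */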
+
+
+ /***************************************/
+ /* BOTTOM ROW OF THE 8 SURROUNDING PIXELS */
+ /***************************************/
+
+ /* Jump down 2 lines */
+ lea esi, [esi + ecx*2]
+
+ /* Same 10-byte read strategy as for the top row above */
+
+ movq xmm5, QWORD PTR [esi]
+ movq xmm2, QWORD PTR [esi + 2] /* this one partly overlaps */
+ movdqa xmm7, [eax]
+
+
+ /* Expand to 16 bits */
+ punpcklbw xmm5, xmm0
+ punpcklbw xmm2, xmm0
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm2
+
+ /* Shift all 8 items right by 1 lot of 16 bits to get the intermediate sums */
+ psrldq xmm5, 2
+ psrldq xmm2, 2
+ paddw xmm5, xmm3
+ paddw xmm2, xmm4
+
+ /* Shift right by 1 lot of 16 to get the intermediate triple sums */
+ pslldq xmm3, 2
+ pslldq xmm4, 2
+ paddw xmm5, xmm3
+ paddw xmm2, xmm4
+
+ /* As for the top row: 6 triple sums in words 1-6 of the first register,
+    and the final 2 in words 5 and 6 of the second */
+
+
+ /* Merge the 8 results into 1 register */
+ /* Shift words 1-6 to positions 0-5 */
+ psrldq xmm5, 2
+ /* Shift words 5 and 6 to positions 6 and 7 using the 64-bit qword
+    shift, as above */
+ psllq xmm2, 16
+
+ /* Clear the high 32 bits in the first register */
+ pand xmm5, xmm7
+
+ /* xmm6 still holds the inverse mask (~xmm7) from the top row */
+
+ /* Clear the low 6 words of the second register */
+ pand xmm2, xmm6
+
+ /* First register now contains all 8 triple sums */
+ por xmm5, xmm2
+
+
+ /* xmm1 contains the top rows, and xmm5 the bottom rows
+ now sum the top rows into the bottom rows.
+ */
+ paddw xmm5, xmm1
+
+
+
+ /***************************************/
+ /* MIDDLE ROW OF THE 8 SURROUNDING PIXELS */
+ /***************************************/
+
+ /* Go back one row to the middle row */
+ sub esi, ecx
+
+ /* In this row, the middle pixel of each consecutive 3 is not to be summed */
+
+
+ movq xmm1, QWORD PTR [esi]
+ movq xmm2, QWORD PTR [esi + 2] /* this one partly overlaps */
+ movdqa xmm7, [eax]
+
+
+ /* Expand to 16 bits */
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+ /* Shift all 8 items right by 2 lots of 16 bits to get the left/right pair sums */
+ psrldq xmm1, 4
+ psrldq xmm2, 4
+ paddw xmm1, xmm3
+ paddw xmm2, xmm4
+
+ /* Merge the 8 results into 1 register */
+ /* The pair sums for outputs 0-5 already sit in words 0-5 of the first
+    register, so no shift is needed there */
+
+ /* Shift words 4 and 5 of the second register (the pair sums for
+    outputs 6 and 7) to positions 6 and 7 */
+ pslldq xmm2, 4
+
+ /* Clear the high 32 bits in the first register */
+ pand xmm1, xmm7
+
+ /* xmm6 still holds the inverse mask (~xmm7) from the rows above */
+
+ /* Clear the low 6 words of the second register */
+ pand xmm2, xmm6
+
+ /* First register now contains all 8 pair sums ie. the left and right
+    pixels of the middle row in each of the eight 3x3 neighbourhoods */
+ por xmm1, xmm2
+
+
+ /* ---------------------- */
+
+ /* Final 8 sums */
+ paddw xmm1, xmm5
+
+ /* Divide by 8 */
+ psrlw xmm1, 3
+
+ movdqa [edi], xmm1
+
+
+
+ }
+
+
+
+#endif
+}
+
/* This is a new function factored out of RowDiffScan, maybe needs a better name */
static ogg_int32_t RowDiffScan_DiffAndThresholding(PP_INSTANCE *ppi,
unsigned char * YuvPtr1,
@@ -721,6 +1037,7 @@
unsigned char * bits_map_ptr,
signed char * SgcPtr)
{
+
ogg_int16_t Diff; /* Temp local workspace. */
ogg_int32_t j;
ogg_int32_t FragChangedPixels = 0;
@@ -797,9 +1114,16 @@
unsigned char * bits_map_ptr,
signed char * SgcPtr)
{
+#if 0
+
+ /* 10% of all encode execution time is spent in this function; it is the
+    most heavily used function in alpha 6 */
+
ogg_int16_t Diff; /* Temp local workspace. */
ogg_int32_t j;
ogg_int32_t FragChangedPixels = 0;
+
+
for ( j = 0; j < HFRAGPIXELS; j++ ){
/* Take a local copy of the measured difference. */
Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
@@ -820,7 +1144,186 @@
FragChangedPixels += ppi->SrfThreshTable[Diff+255];
}
return FragChangedPixels;
+ //PERF_BLOCK_END("RowDiffScan_DiffAndThresholdingMiddleFrag", perf_rds_datmf_time, perf_rds_datmf_count, perf_rds_datmf_min, 10000);
+#else
+ /*
+ xmm1 = yuvptr1[0..7]
+ xmm2 = yuvptr2[0..7]
+ xmm3 = TO_16BIT(xmm1)
+ xmm4 = TO_16BIT(xmm2)
+ xmm5 = xmm3 - xmm4 [wordwise]
+ YUVDiffsPtr[0..7] = xmm5
+
+ //Sum into SgcPtr[0] --- this needs the old Diffs. Make sure SgcPtr doesn't
+ //alias onto the other data we're using
+
+ xmm6 = DoVectorisedPakLowPass //xmm6 now has the "new" updated Diffs
+
+ //Use the new Diffs to update bits_map_ptr while summing up the changed
+ //pixel count
+ */
+
+ static __declspec(align(16)) unsigned long Some255s[4] = { 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff };
+ static __declspec(align(16)) unsigned char temp[48];
+ static unsigned char* temp_ptr = temp;
+
+ static unsigned char* some_255s_ptr = (unsigned char*)Some255s;
+ unsigned char* local_sgc_thresh_table = ppi->SgcThreshTable;
+ unsigned char* local_srf_thresh_table = ppi->SrfThreshTable;
+ unsigned char* local_srf_pak_thresh_table = ppi->SrfPakThreshTable;
+
+
+ ogg_uint16_t val; /* the diffs carry a +255 bias, so they need 16 bits */
+ unsigned char thresh_val;
+ int i, FragChangedPixels = 0;
+
+
+ __asm {
+ align 16
+ mov esi, YuvPtr1
+ mov edx, YuvPtr2
+ mov edi, YUVDiffsPtr /* Not aligned */
+ mov eax, some_255s_ptr
+ mov ecx, temp_ptr
+
+ movdqa xmm7, [eax]
+ pxor xmm0, xmm0
+
+ /* Load yuvptr1[0..7] into low 8 bytes */
+ movq xmm1, QWORD PTR [esi]
+ /* Load yuvptr2[0..7] into low 8 bytes */
+ movq xmm2, QWORD PTR [edx]
+
+ /* Unpack to 16 bits */
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+
+ /* Subtract the YUV Ptr values */
+ psubsw xmm1, xmm2
+
+ /* Write out to YUVDiffs */
+ movdqu [edi], xmm1
+
+ /* Add 255 to them all */
+ paddw xmm1, xmm7
+
+ /* Write them to the temp area */
+ movdqa [ecx], xmm1
+
+
+
+ }
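+
+ /* Scalar model of the asm block above (illustrative sketch only; the temp
+    area holds 16-bit words):
+
+        for (j = 0; j < 8; j++) {
+            Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
+            YUVDiffsPtr[j] = Diff;
+            ((ogg_uint16_t *)temp_ptr)[j] = (ogg_uint16_t)(Diff + 255);
+        }
+ */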
+
+ ApplyPakLowPass_Vectorised(ppi, YuvPtr1, temp_ptr + 16);
+ ApplyPakLowPass_Vectorised(ppi, YuvPtr2, temp_ptr + 32);
+
+ __asm {
+ align 16
+
+ mov esi, temp_ptr
+ mov ecx, some_255s_ptr
+
+
+ movdqa xmm1, [esi + 16]
+ movdqa xmm2, [esi + 32]
+ movdqa xmm6, [ecx]
+
+ /* New diffs after PakLowPass */
+ psubw xmm1, xmm2
+
+ /* Add 255 to the diffs */
+ paddw xmm1, xmm6
+
+ /* Write back out to temp */
+ movdqa [esi +16], xmm1
+
+ /* The rest of the processing is done with normal register ops */
+
+
+
+ /* At this point
+    temp_ptr[0..15]  = 8 words of early-loop diffs + 255
+    temp_ptr[16..31] = 8 words of low-passed diffs + 255
+    temp_ptr[32..47] = don't care */
+
+ }
+
+
+ /* Look up the pak thresholds from the early-loop (old) diffs + 255. The
+    diffs are 16-bit words, so index and store word-wise so that the SIMD
+    compare below sees one threshold per lane */
+ ((ogg_uint16_t *)temp_ptr)[16] = local_srf_pak_thresh_table[((ogg_uint16_t *)temp_ptr)[0]];
+ ((ogg_uint16_t *)temp_ptr)[17] = local_srf_pak_thresh_table[((ogg_uint16_t *)temp_ptr)[1]];
+ ((ogg_uint16_t *)temp_ptr)[18] = local_srf_pak_thresh_table[((ogg_uint16_t *)temp_ptr)[2]];
+ ((ogg_uint16_t *)temp_ptr)[19] = local_srf_pak_thresh_table[((ogg_uint16_t *)temp_ptr)[3]];
+ ((ogg_uint16_t *)temp_ptr)[20] = local_srf_pak_thresh_table[((ogg_uint16_t *)temp_ptr)[4]];
+ ((ogg_uint16_t *)temp_ptr)[21] = local_srf_pak_thresh_table[((ogg_uint16_t *)temp_ptr)[5]];
+ ((ogg_uint16_t *)temp_ptr)[22] = local_srf_pak_thresh_table[((ogg_uint16_t *)temp_ptr)[6]];
+ ((ogg_uint16_t *)temp_ptr)[23] = local_srf_pak_thresh_table[((ogg_uint16_t *)temp_ptr)[7]];
+
+ __asm {
+ align 16
+
+ mov esi, temp_ptr
+
+ /* Read back the old diffs (+255 bias, stored by the first block) */
+ movdqa xmm4, [esi]
+
+ /* Read back the new diffs (+255 bias) */
+ movdqa xmm3, [esi + 16]
+
+ /* Read back the pak_threshed values used in the if statement */
+ movdqa xmm6, [esi + 32]
+
+ pxor xmm0, xmm0
+ pcmpeqw xmm7, xmm7 /* All 1's */
+
+ /* Compare the pak_thresh values to 0; any word which was 0 will now be
+    all 1's in xmm0. The if basically said: if the threshold is zero,
+    leave the diff alone, otherwise replace it with the new diff */
+ pcmpeqw xmm0, xmm6
+
+ /* On the old diffs, keep all the words where the pak_thresh is zero */
+ pand xmm4, xmm0
+
+ /* Flip the bits so that the places that were 0 are now all zeros */
+ pxor xmm0, xmm7
+
+ /* This zeros out all the words in the new diffs which were 0 in the pak_thresh */
+ pand xmm3, xmm0
+
+ /* Merge the old and new diffs; both already carry the +255 bias needed
+    for the table lookups */
+ por xmm3, xmm4
+
+ /* Write back out to temp */
+ movdqa [esi + 32], xmm3
+ }
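+
+ /* The pcmpeqw/pand/pxor/por sequence above is the usual SIMD select idiom.
+    Per 16-bit lane it computes, in C terms (illustrative sketch only):
+
+        mask   = (pak_thresh == 0) ? 0xffff : 0x0000;
+        merged = (old_diff & mask) | (new_diff & ~mask);
+
+    i.e. keep the old diff where the pak threshold was zero, otherwise take
+    the low-passed diff. */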
+
+ for (i = 0; i < 8; i++)
+ {
+ val = ((ogg_uint16_t *)temp)[16 + i]; /* merged diff + 255 */
+ thresh_val = local_srf_thresh_table[val];
+ /* The Sgc sum uses the original (pre low-pass) diffs, as noted above */
+ SgcPtr[0] += local_sgc_thresh_table[((ogg_uint16_t *)temp)[i]];
+ bits_map_ptr[i] = thresh_val;
+ FragChangedPixels += thresh_val;
+ }
+
+
+ return FragChangedPixels;
+
+#endif
}
Modified: branches/theora-playtime/lib/x86_32_vs/perf_helper.h
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/perf_helper.h 2006-06-08 10:49:12 UTC (rev 11544)
+++ branches/theora-playtime/lib/x86_32_vs/perf_helper.h 2006-06-08 14:50:13 UTC (rev 11545)
@@ -8,7 +8,7 @@
extern unsigned __int64 GetCPUTime();
-
+//#define PERF_DATA_ON
#ifdef PERF_DATA_ON
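+/* Uncommenting PERF_DATA_ON enables the timing macros used in scan.c, e.g.
+   (signature inferred from the call site in scan.c):
+   PERF_BLOCK_END("label", total_cycles, call_count, min_cycles, 10000); */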
Modified: branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
===================================================================
--- branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj 2006-06-08 10:49:12 UTC (rev 11544)
+++ branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj 2006-06-08 14:50:13 UTC (rev 11545)
@@ -41,7 +41,7 @@
<Tool
Name="VCCLCompilerTool"
Optimization="0"
- AdditionalIncludeDirectories="..\..\..\include;..\..\..\..\libogg\include;..\..\..\lib"
+ AdditionalIncludeDirectories="..\..\..\include;..\..\..\..\libogg\include;..\..\..\lib;G:\Dev\xiph\zens_sdk\lib\libtheora-playtime\lib\x86_32_vs"
PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS; USE_ASM"
MinimalRebuild="true"
BasicRuntimeChecks="3"
@@ -129,7 +129,7 @@
EnableIntrinsicFunctions="true"
FavorSizeOrSpeed="1"
OmitFramePointers="true"
- AdditionalIncludeDirectories="..\..\..\include;..\..\..\..\libogg\include;..\..\..\lib"
+ AdditionalIncludeDirectories="..\..\..\include;..\..\..\..\libogg\include;..\..\..\lib;G:\Dev\xiph\zens_sdk\lib\libtheora-playtime\lib\x86_32_vs"
PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS; USE_ASM"
StringPooling="true"
ExceptionHandling="0"