[xiph-commits] r11545 - in branches/theora-playtime: lib lib/x86_32_vs win32/VS2005/libtheora

illiminable at svn.xiph.org
Thu Jun 8 07:50:22 PDT 2006


Author: illiminable
Date: 2006-06-08 07:50:13 -0700 (Thu, 08 Jun 2006)
New Revision: 11545

Modified:
   branches/theora-playtime/lib/scan.c
   branches/theora-playtime/lib/x86_32_vs/perf_helper.h
   branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
Log:
* Temporary work in RowDiffScan+ApplyPakLowPass in scan.c; the implementation is still slightly incorrect.

Modified: branches/theora-playtime/lib/scan.c
===================================================================
--- branches/theora-playtime/lib/scan.c	2006-06-08 10:49:12 UTC (rev 11544)
+++ branches/theora-playtime/lib/scan.c	2006-06-08 14:50:13 UTC (rev 11545)
@@ -20,7 +20,13 @@
 #include <string.h>
 #include "codec_internal.h"
 #include "dsp.h"
+#include "perf_helper.h"
 
+
+
+static unsigned __int64 perf_rds_datmf_time = 0;
+static unsigned __int64 perf_rds_datmf_count = 0;
+static unsigned __int64 perf_rds_datmf_min = -1;   /* -1 wraps to the maximum unsigned value */
 //#include "perf_helper.h"
 
 #define MAX_SEARCH_LINE_LEN                   7
@@ -682,6 +688,316 @@
 
 }
 
+
+static void ApplyPakLowPass_Vectorised( PP_INSTANCE *ppi,
+                                      unsigned char * SrcPtr,
+                                      unsigned char * OutputPtr)
+{
+
+#if 0
+  /* Scalar reference: average the 8 pixels surrounding each of the 8
+     pixels in the row (the centre pixel itself is excluded). */
+  long s = ppi->PlaneStride;    /* Note the use of stride, not width. */
+  ogg_int32_t i;
+
+  for (i = 0; i < 8; i++)
+  {
+      OutputPtr[i] = ( ( (ogg_uint32_t)SrcPtr[i-s-1] +
+                  (ogg_uint32_t)SrcPtr[i-s] +
+                  (ogg_uint32_t)SrcPtr[i-s+1] +
+                  (ogg_uint32_t)SrcPtr[i-1] +
+                  (ogg_uint32_t)SrcPtr[i+1] +
+                  (ogg_uint32_t)SrcPtr[i+s-1] +
+                  (ogg_uint32_t)SrcPtr[i+s] +
+                  (ogg_uint32_t)SrcPtr[i+s+1]   ) >> 3 );
+  }
+#else
+
+  /*                                                            
+            .... .... .... .... XXXX XXXX .... .... .... ....
+
+
+
+
+            .... .... .... ...1 23.. ..ab c... .... .... ....
+            .... .... .... ...4 X5.. ..dY e... .... .... ....
+            .... .... .... ...6 78.. ..fg h... .... .... ....
+
+
+            //Different numbering below
+            //Showing per row for the top and bottom rows
+
+            1234567abc
+
+            desired,
+            1+2+3 = A
+            2+3+4 = B
+            3+4+5 = C
+            4+5+6 = D
+            5+6+7 = E
+            6+7+a = F
+            7+a+b = G
+            a+b+c = H
+
+                    1    2    3    4    5    6    7    a  | b    c
+                +   _    1    2    3    4    5    6    7  | a    b    c
+                -------------------------------------------------------
+                    1   1+2  2+3  3+4  4+5  5+6  6+7  7+a |a+b  b+c   c
+
+                +   2    3    4    5    6    7    a  | b    c    _
+                -------------------------------------------------------
+                   1+2   A    B    C    D    E    F  | G    H
+
+
+
+            //Showing per row for the middle row
+
+            1234567abc
+
+            desired,
+            1+3 = A
+            2+4 = B
+            3+5 = C
+            4+6 = D
+            5+7 = E
+            6+a = F
+            7+b = G
+            a+c = H
+
+
+                    1    2    3    4    5    6    7    a  | b    c
+                +   _    _    1    2    3    4    5    6  | 7    a    b    c
+                -------------------------------------------------------
+                              A    B    C   D     E    F    G    H
+
+
+  */
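+    /* For reference, the sliding-sum trick diagrammed above in scalar form:
+        two shifted adds turn one row of 10 pixels into 8 triple sums. A
+        sketch only -- `row` and `sum` are hypothetical names, and the pixels
+        are assumed already widened to 16 bits:
+
+            ogg_uint16_t row[10], sum[8];
+            int k;
+            for (k = 0; k < 8; k++)
+                sum[k] = row[k] + row[k+1] + row[k+2];
+
+        The SIMD code below computes the same thing in parallel: with the
+        pixels packed as words in a register x, the wordwise sum
+        (x shifted right one word) + x + (x shifted left one word) leaves
+        the triple sum centred on word k in word k. */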
+
+    static __declspec(align(16)) unsigned long Low6BytesMask[4] = { 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 };  /* keeps the low 6 words of results */
+    static unsigned char* Low6BytesMaskPtr = (unsigned char*)Low6BytesMask;
+    long    stride = ppi->PlaneStride;
+    unsigned char* SrcPtrTopLeft = SrcPtr - stride - 1;
+    __asm {
+        align           16
+
+        mov         esi, SrcPtrTopLeft
+        mov         eax, Low6BytesMaskPtr
+        mov         ecx, stride
+        mov         edi, OutputPtr
+
+        pxor        xmm0, xmm0
+        pcmpeqw     xmm6, xmm6  /* All 1's */
+
+        /***************************************/
+        /* TOP ROW OF THE 8 SURROUNDING PIXELS */
+        /***************************************/
+
+        /* There are 10 bytes to process. Read the first 8 into the first
+            register; after the shifting there will be 6 usable results. Start
+            the second register at plus 2, so it also reads 8 bytes, 6 of which
+            overlap. This stops us reading past the block we are supposed to be
+            looking at, and since we operate on the whole register anyway it
+            doesn't matter whether 8 of the values are usable or only 2 */
+
+        movq        xmm1, QWORD PTR [esi]
+        movq        xmm2, QWORD PTR [esi + 2]       /* this one partly overlaps */
+        movdqa      xmm7, [eax]
+
+
+        /* Expand to 16 bits */
+        punpcklbw   xmm1, xmm0
+        punpcklbw   xmm2, xmm0
+        movdqa      xmm3, xmm1
+        movdqa      xmm4, xmm2
+
+        /* Shift all 8 items right by 1 lot of 16 bits to get the intermediate sums */
+        psrldq      xmm1, 2
+        psrldq      xmm2, 2
+        paddw       xmm1, xmm3
+        paddw       xmm2, xmm4
+
+        /* Shift right by 1 lot of 16 to get the intermediate triple sums */
+        pslldq      xmm3, 2
+        pslldq      xmm4, 2
+        paddw       xmm1, xmm3
+        paddw       xmm2, xmm4
+
+        /* We now have 6 triple sums in words 1-6 of the first register
+             (words 0 and 7 hold junk), and words 5 and 6 of the second
+             register hold the final 2 triple sums */
+
+
+        /* Merge the 8 results into 1 register */
+            /* Shift words 1-6 to positions 0-5 */
+            psrldq      xmm1, 2
+            /* Shift words 5 and 6 to positions 6 and 7 - since
+                we don't care about any of the other positions in this
+                register, use the 64-bit quadword shift (psllq), which is
+                twice as fast as the 128-bit dq byte shift */
+            psllq       xmm2, 16
+
+            /* Clear the high 32 bits in the first register */
+            pand        xmm1, xmm7
+
+            /* Create the inverse mask -- xmm6 = ~xmm7 */
+            pxor        xmm6, xmm7
+
+            /* Clear the low 6 words of the second register */
+            pand        xmm2, xmm6
+
+            /* First register now contains all 8 triple sums ie. the sum of the top 3 pixels
+                in each of the eight 3x3 adjacent blocks */
+            por         xmm1, xmm2
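+            /* (The pand/pxor/pand/por sequence above is the standard SSE2
+                branchless select: result = (a & M) | (b & ~M), where the
+                mask M picks which register supplies each word.) */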
+
+        /***************************************/
+        /* BOTTOM ROW OF THE 8 SURROUNDING PIXELS */
+        /***************************************/
+
+        /* Jump down 2 lines */
+        lea     esi, [esi + ecx*2]
+        
+        /* There are 10 bytes to process. Read the first 8 into the first
+            register; after the shifting there will be 6 usable results. Start
+            the second register at plus 2, so it also reads 8 bytes, 6 of which
+            overlap. This stops us reading past the block we are supposed to be
+            looking at, and since we operate on the whole register anyway it
+            doesn't matter whether 8 of the values are usable or only 2 */
+
+        movq        xmm5, QWORD PTR [esi]
+        movq        xmm2, QWORD PTR [esi + 2]       /* this one partly overlaps */
+        movdqa      xmm7, [eax]
+
+
+        /* Expand to 16 bits */
+        punpcklbw   xmm5, xmm0
+        punpcklbw   xmm2, xmm0
+        movdqa      xmm3, xmm5
+        movdqa      xmm4, xmm2
+
+        /* Shift all 8 items right by 1 lot of 16 bits to get the intermediate sums */
+        psrldq      xmm5, 2
+        psrldq      xmm2, 2
+        paddw       xmm5, xmm3
+        paddw       xmm2, xmm4
+
+        /* Shift right by 1 lot of 16 to get the intermediate triple sums */
+        pslldq      xmm3, 2
+        pslldq      xmm4, 2
+        paddw       xmm5, xmm3
+        paddw       xmm2, xmm4
+
+        /* We now have 6 triple sums in words 1-6 of the first register
+             (words 0 and 7 hold junk), and words 5 and 6 of the second
+             register hold the final 2 triple sums */
+
+
+        /* Merge the 8 results into 1 register */
+            /* Shift words 1-6 to positions 0-5 */
+            psrldq      xmm5, 2
+            /* Shift words 5 and 6 to positions 6 and 7 - since
+                we don't care about any of the other positions in this
+                register, use the 64-bit quadword shift (psllq), which is
+                twice as fast as the 128-bit dq byte shift */
+            psllq       xmm2, 16
+
+            /* Clear the high 32 bits in the first register */
+            pand        xmm5, xmm7
+
+            /* xmm6 still holds the inverse mask (~xmm7) computed above */
+
+            /* Clear the low 6 words of the second register */
+            pand        xmm2, xmm6
+
+            /* First register now contains all 8 triple sums */
+            por         xmm5, xmm2
+
+
+        /* xmm1 contains the top-row sums and xmm5 the bottom-row sums;
+            now add the top rows into the bottom rows. */
+        paddw           xmm5, xmm1
+
+
+
+        /***************************************/
+        /* MIDDLE ROW OF THE 8 SURROUNDING PIXELS */
+        /***************************************/
+
+        /* Go back one row to the middle row */
+        sub     esi, ecx
+
+        /* In this row, the middle pixel of each consecutive 3 is not to be summed */
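+        /* Scalar sketch of this row (hypothetical names, values assumed
+            widened to 16 bits):
+
+                for (k = 0; k < 8; k++)
+                    sum[k] = row[k] + row[k+2];
+
+            i.e. one shifted add instead of the two used for the top and
+            bottom rows. */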
+
+
+        movq        xmm1, QWORD PTR [esi]
+        movq        xmm2, QWORD PTR [esi + 2]       /* this one partly overlaps */
+        movdqa      xmm7, [eax]
+
+
+        /* Expand to 16 bits */
+        punpcklbw   xmm1, xmm0
+        punpcklbw   xmm2, xmm0
+        movdqa      xmm3, xmm1
+        movdqa      xmm4, xmm2
+
+        /* Shift all 8 items right by 2 lots of 16 bits to get the intermediate sums */
+        psrldq      xmm1, 4
+        psrldq      xmm2, 4
+        paddw       xmm1, xmm3
+        paddw       xmm2, xmm4
+        
+        /* Merge the 8 results into 1 register */
+            /* Shift words 2-7 to positions 0-5 */
+            psrldq      xmm1, 2
+            /* Shift words 5 and 6 to positions 6 and 7 - since
+                we don't care about any of the other positions in this
+                register, use the 64-bit quadword shift (psllq), which is
+                twice as fast as the 128-bit dq byte shift */
+            //psllq       xmm2, 16
+
+            /* Clear the high 32 bits in the first register */
+            pand        xmm1, xmm7
+
+            /* xmm6 still holds the inverse mask (~xmm7) computed above */
+
+            /* Clear the low 6 words of the second register */
+            pand        xmm2, xmm6
+
+            /* First register now contains all 8 pair sums, ie. the sum of the
+                left and right neighbours of each of the eight pixels */
+            por         xmm1, xmm2
+
+
+        /* ---------------------- */
+
+        /* Final 8 sums */
+        paddw           xmm1, xmm5
+
+        /* Divide by 8 */
+        psrlw           xmm1, 3
+
+        movdqa          [edi], xmm1
+
+
+
+    }
+
+#endif
+}
+
 /* This is a new function factored out of RowDiffScan; it may need a better name */
 static ogg_int32_t RowDiffScan_DiffAndThresholding(PP_INSTANCE *ppi,
                          unsigned char * YuvPtr1,
@@ -721,6 +1037,7 @@
                          unsigned char * bits_map_ptr,
                          signed char   * SgcPtr)
 {
+
   ogg_int16_t Diff;     /* Temp local workspace. */
   ogg_int32_t j; 
   ogg_int32_t    FragChangedPixels = 0;
@@ -797,9 +1114,16 @@
                          unsigned char * bits_map_ptr,
                          signed char   * SgcPtr)
 {
+#if 0
+
+    /* About 10% of all encode execution time is spent in this function --
+        the most heavily used function in alpha 6 */
+
   ogg_int16_t Diff;     /* Temp local workspace. */
   ogg_int32_t j; 
   ogg_int32_t    FragChangedPixels = 0;
+
+  
     for ( j = 0; j < HFRAGPIXELS; j++ ){
       /* Take a local copy of the measured difference. */
       Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
@@ -820,7 +1144,186 @@
       FragChangedPixels += ppi->SrfThreshTable[Diff+255];
     }
     return FragChangedPixels;
+  //PERF_BLOCK_END("RowDiffScan_DiffAndThresholdingMiddleFrag", perf_rds_datmf_time, perf_rds_datmf_count, perf_rds_datmf_min, 10000);
 
+#else
+   /*
+        xmm1 = yuvptr1[0..7]
+        xmm2 = yuvptr2[0..7]
+        xmm3 = TO_16BIT(xmm1)
+        xmm4 = TO_16BIT(xmm2)
+        xmm5 = xmm3 - xmm4 [wordwise]
+        YUVDiffsPtr[0..7] = xmm5
+
+        // Sum into SgcPtr[0] --- this needs the old diffs. Make sure SgcPtr
+        // doesn't alias onto the other data we're using.
+
+        xmm6 = DoVectorisedPakLowPass   // xmm6 now has the "new" updated diffs
+
+        // Use the new diffs to update bits_map_ptr while summing up the
+        // changed-pixel count.
+    */
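+    /* A sketch of the per-pixel scalar logic being vectorised here,
+        reconstructed from the plan above and the scalar loop in the #if 0
+        branch; `LowPass` is a stand-in name for the scalar 3x3 average that
+        ApplyPakLowPass_Vectorised implements:
+
+            Diff = YuvPtr1[j] - YuvPtr2[j];
+            YUVDiffsPtr[j] = Diff;
+            SgcPtr[0] += SgcThreshTable[Diff + 255];
+            if (SrfPakThreshTable[Diff + 255])
+                Diff = LowPass(YuvPtr1 + j) - LowPass(YuvPtr2 + j);
+            bits_map_ptr[j] = SrfThreshTable[Diff + 255];
+            FragChangedPixels += SrfThreshTable[Diff + 255];
+    */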
+
+    static __declspec(align(16)) unsigned long Some255s[4] = { 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff };
+    static __declspec(align(16)) unsigned char temp[48];
+    static unsigned char* temp_ptr = temp;
+    
+    static unsigned char* some_255s_ptr = (unsigned char*)Some255s;
+    unsigned char* local_sgc_thresh_table = ppi->SgcThreshTable;
+    unsigned char* local_srf_thresh_table = ppi->SrfThreshTable;
+    unsigned char* local_srf_pak_thresh_table = ppi->SrfPakThreshTable;
+
+    
+    unsigned char val, thresh_val;
+    int i, FragChangedPixels = 0;
+
+
+    __asm {
+        align       16
+        mov         esi, YuvPtr1
+        mov         edx, YuvPtr2
+        mov         edi, YUVDiffsPtr        /* Not aligned */
+        mov         eax, some_255s_ptr;
+        mov         ecx, temp_ptr
+
+        movdqa      xmm7, [eax]
+        pxor        xmm0, xmm0
+
+        /* Load yuvptr1[0..7] into low 8 bytes */
+        movq        xmm1, QWORD PTR [esi]
+        /* Load yuvptr2[0..7] into low 8 bytes */
+        movq        xmm2, QWORD PTR [edx]
+
+        /* Unpack to 16 bits */
+        punpcklbw   xmm1, xmm0
+        punpcklbw   xmm2, xmm0
+
+        /* Subtract the YUV Ptr values */
+        psubsw      xmm1, xmm2
+
+        /* Write out to YUVDiffs */
+        movdqu      [edi], xmm1
+
+        /* Add 255 to them all */
+        paddw       xmm1, xmm7
+
+        /* Write them to the temp area */
+        movdqa      [ecx], xmm1 
+
+        
+
+    }
+
+    ApplyPakLowPass_Vectorised(ppi, YuvPtr1, temp_ptr + 16);
+    ApplyPakLowPass_Vectorised(ppi, YuvPtr2, temp_ptr + 32);
+
+    __asm {
+        align 16
+
+        mov         esi, temp_ptr
+        mov         edi, YUVDiffsPtr  /* Not aligned */
+        mov         ecx, some_255s_ptr
+
+        
+        movdqa      xmm1, [esi + 16]
+        movdqa      xmm2, [esi + 32]
+        movdqu      xmm3, [edi]  /* Old diffs */
+        movdqa      xmm6, [ecx]
+
+        /* New diffs after PakLowPass */
+        psubw       xmm1, xmm2
+
+        /* Add 255 to the diffs */
+        paddw       xmm1, xmm6
+
+        /* Write back out to temp */
+        movdqa      [esi +16], xmm1
+
+        /* Now we need to process with normal (non-SIMD) register ops */
+
+
+
+        /* At this point
+                temp_ptr[0..15] = 8 lots of Early loop diffs - 255
+                temp_ptr[16..31] = 8 lots of late loop diffs - 255
+                temp_ptr[32..47] = who cares */
+
+    }
+
+
+    temp_ptr[32] = local_srf_pak_thresh_table[temp_ptr[0]];
+    temp_ptr[33] = local_srf_pak_thresh_table[temp_ptr[1]];
+    temp_ptr[34] = local_srf_pak_thresh_table[temp_ptr[2]];
+    temp_ptr[35] = local_srf_pak_thresh_table[temp_ptr[3]];
+    temp_ptr[36] = local_srf_pak_thresh_table[temp_ptr[4]];
+    temp_ptr[37] = local_srf_pak_thresh_table[temp_ptr[5]];
+    temp_ptr[38] = local_srf_pak_thresh_table[temp_ptr[6]];
+    temp_ptr[39] = local_srf_pak_thresh_table[temp_ptr[7]];
+
+    __asm {
+        align       16
+
+        mov         edx, YUVDiffsPtr
+        mov         esi, temp_ptr
+
+        /* Read back the old diffs */
+        movdqu     xmm4, [edx]
+
+        /* Read back the new diffs */
+        movdqa     xmm3, [esi + 16]
+        
+        /* Read back the pak_threshed values used in the if statement */
+        movdqa     xmm6, [esi + 32]
+
+        pxor        xmm0, xmm0
+        pcmpeqw     xmm7, xmm7      /* All 1's */
+
+
+        /* Compare the pak_thresh values to 0: any word that was 0 is now
+                set to all 1's in xmm0. The original if statement said: if
+                the value is zero leave the old diff alone, otherwise
+                replace it with the new diff */
+        pcmpeqw     xmm0, xmm3
+
+        /* On the old diffs, keep all the words where the pak_thresh is zero */
+        pand        xmm4, xmm0
+
+        /* Invert the mask so that the words which compared equal to zero are now all zeros */
+        pxor        xmm0, xmm7
+
+        /* This zeros out all the words in the new diffs which were 0 in the pak_thresh */
+        pand        xmm3, xmm0
+
+        /* Merge the old and new diffs */
+        por         xmm3, xmm4
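+        /* In C terms, per word: diff = pak_thresh ? new_diff : old_diff --
+            the same branchless-select idiom used in the low-pass function
+            above. */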
+
+        /* Add 255 to the diffs */
+        paddw       xmm3, xmm6
+
+        /* Write back out to temp */
+        movdqa      [esi + 32], xmm3
+    }
+
+    for (i = 0; i < 8; i++)
+    {
+        val = temp[32 + i];
+        thresh_val = local_srf_thresh_table[val];
+        SgcPtr[0] += local_sgc_thresh_table[val];
+        bits_map_ptr[i] = thresh_val;
+        FragChangedPixels += thresh_val;
+
+    }
+
+
+    return FragChangedPixels;
+
+#endif
 }
 
 

Modified: branches/theora-playtime/lib/x86_32_vs/perf_helper.h
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/perf_helper.h	2006-06-08 10:49:12 UTC (rev 11544)
+++ branches/theora-playtime/lib/x86_32_vs/perf_helper.h	2006-06-08 14:50:13 UTC (rev 11545)
@@ -8,7 +8,7 @@
 
 
 extern unsigned __int64 GetCPUTime();
-
+//#define PERF_DATA_ON
 #ifdef PERF_DATA_ON
 
 

Modified: branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
===================================================================
--- branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj	2006-06-08 10:49:12 UTC (rev 11544)
+++ branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj	2006-06-08 14:50:13 UTC (rev 11545)
@@ -41,7 +41,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..\..\..\include;..\..\..\..\libogg\include;..\..\..\lib"
+				AdditionalIncludeDirectories="..\..\..\include;..\..\..\..\libogg\include;..\..\..\lib;&quot;G:\Dev\xiph\zens_sdk\lib\libtheora-playtime\lib\x86_32_vs&quot;"
 				PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS; USE_ASM"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -129,7 +129,7 @@
 				EnableIntrinsicFunctions="true"
 				FavorSizeOrSpeed="1"
 				OmitFramePointers="true"
-				AdditionalIncludeDirectories="..\..\..\include;..\..\..\..\libogg\include;..\..\..\lib"
+				AdditionalIncludeDirectories="..\..\..\include;..\..\..\..\libogg\include;..\..\..\lib;G:\Dev\xiph\zens_sdk\lib\libtheora-playtime\lib\x86_32_vs"
 				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS; USE_ASM"
 				StringPooling="true"
 				ExceptionHandling="0"


