[xiph-commits] r11546 - branches/theora-playtime/lib

Fri Jun 9 02:22:15 PDT 2006

Author: illiminable
Date: 2006-06-09 02:22:10 -0700 (Fri, 09 Jun 2006)
New Revision: 11546

Modified:
   branches/theora-playtime/lib/scan.c
Log:
* sse Implementation of RowDiffScan which is fast *and* gives correct output

Modified: branches/theora-playtime/lib/scan.c
===================================================================

--- branches/theora-playtime/lib/scan.c	2006-06-08 14:50:13 UTC (rev 11545)
+++ branches/theora-playtime/lib/scan.c	2006-06-09 09:22:10 UTC (rev 11546)
@@ -691,32 +691,42 @@
 
 static void ApplyPakLowPass_Vectorised( PP_INSTANCE *ppi,
                                       unsigned char * SrcPtr,
-                                      unsigned char * OutputPtr)
+                                      unsigned short * OutputPtr)
 {
 
 #if 0
     //static __declspec(align(16)) unsigned char temp[8];
     //static unsigned char* temp_ptr = temp;
+    int i;
   for (i = 0; i < 8; i++)
   {
-      unsigned char * SrcPtr1 = SrcPtr[i] - 1;
+      unsigned char * SrcPtr1 = SrcPtr - 1;
       unsigned char * SrcPtr0 = SrcPtr1 - ppi->PlaneStride; /* Note the
                                                                use of
                                                                stride not
                                                                width. */
       unsigned char * SrcPtr2 = SrcPtr1 + ppi->PlaneStride;
 
-      OutputPtr[i] = ( ( (ogg_uint32_t)SrcPtr[i-1-s] +
-                  (ogg_uint32_t)SrcPtr[i-s] +
-                  (ogg_uint32_t)SrcPtr[i-s+1] +
-                  (ogg_uint32_t)SrcPtr[i-1] +
-                  (ogg_uint32_t)SrcPtr[i+1] +
-                  (ogg_uint32_t)SrcPtr[i+s-1] +
-                  (ogg_uint32_t)SrcPtr[i+s] +
-                  (ogg_uint32_t)SrcPtr[i+s+1]   ) >> 3 );
+      //OutputPtr[i] = ( ( (ogg_uint32_t)SrcPtr[i-1-s] +
+      //            (ogg_uint32_t)SrcPtr[i-s] +
+      //            (ogg_uint32_t)SrcPtr[i-s+1] +
+      //            (ogg_uint32_t)SrcPtr[i-1] +
+      //            (ogg_uint32_t)SrcPtr[i+1] +
+      //            (ogg_uint32_t)SrcPtr[i+s-1] +
+      //            (ogg_uint32_t)SrcPtr[i+s] +
+      //            (ogg_uint32_t)SrcPtr[i+s+1]   ) >> 3 );
+
+  OutputPtr[i] =   (unsigned char)( ( (ogg_uint32_t)SrcPtr0[0 + i] +
+              (ogg_uint32_t)SrcPtr0[1 + i] +
+              (ogg_uint32_t)SrcPtr0[2 + i] +
+              (ogg_uint32_t)SrcPtr1[0 + i] +
+              (ogg_uint32_t)SrcPtr1[2 + i] +
+              (ogg_uint32_t)SrcPtr2[0 + i] +
+              (ogg_uint32_t)SrcPtr2[1 + i] +
+              (ogg_uint32_t)SrcPtr2[2 + i]   ) >> 3 );
   }
 
-  return temp_ptr;
+  //return temp_ptr;
 #else
 
   /*                                                            
@@ -779,21 +789,25 @@
 
   */
 
-    static __declspec(align(16)) unsigned long Low6BytesMask[4] = { 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 };
-    static unsigned char* Low6BytesMaskPtr = (unsigned char*)Low6BytesMask;
+    static __declspec(align(16)) unsigned long Low6WordsMask[4] = { 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 };
+    static unsigned char* Low6WordsMaskPtr = (unsigned char*)Low6WordsMask;
     long    stride = ppi->PlaneStride;
     unsigned char* SrcPtrTopLeft = SrcPtr - stride - 1;
     __asm {
         align           16
 
         mov         esi, SrcPtrTopLeft
-        mov         eax, Low6BytesMaskPtr
+        mov         eax, Low6WordsMaskPtr
         mov         ecx, stride
         mov         edi, OutputPtr
 
+        movdqa      xmm7, [eax]
         pxor        xmm0, xmm0
         pcmpeqw     xmm6, xmm6  /* All 1's */
 
+        /* Create the inverse mask -- xmm6 = ~xmm7 */
+        pxor        xmm6, xmm7
+
         /***************************************/
         /* TOP ROW OF THE 8 SURROUNDING PIXELS */
         /***************************************/
@@ -806,9 +820,9 @@
 
         movq        xmm1, QWORD PTR [esi]
         movq        xmm2, QWORD PTR [esi + 2]       /* this one partly overlaps */
-        movdqa      xmm7, [eax]
 
 
+
         /* Expand to 16 bits */
         punpcklbw   xmm1, xmm0
         punpcklbw   xmm2, xmm0
@@ -837,7 +851,7 @@
             psrldq      xmm1, 2
             /* Shift words 5 and 6 to positions 6 and 7 - since
                 we don't care about any of the other positions in this regsiter
-                use the dword 32 bitwise shift which is twice as fast as the 
+                use the qword 64 bitwise shift which is twice as fast as the 
                 dq 128 bitwise one */
             psllq       xmm2, 16
 
@@ -845,7 +859,7 @@
             pand        xmm1, xmm7
 
             /* Create the inverse mask -- xmm6 = ~xmm7 */
-            pxor        xmm6, xmm7
+            //pxor        xmm6, xmm7
 
             /* Clear the low 6 bytes of the second register */
             pand        xmm2, xmm6
@@ -874,13 +888,15 @@
 
         movq        xmm5, QWORD PTR [esi]
         movq        xmm2, QWORD PTR [esi + 2]       /* this one partly overlaps */
-        movdqa      xmm7, [eax]
+        //movdqa      xmm7, [eax]
 
 
+
+
         /* Expand to 16 bits */
         punpcklbw   xmm5, xmm0
         punpcklbw   xmm2, xmm0
-        movdqa      xmm3, xmm1
+        movdqa      xmm3, xmm5
         movdqa      xmm4, xmm2
 
         /* Shift all 8 items right by 1 lot of 16 bits to get the intermediate sums */
@@ -913,7 +929,7 @@
             pand        xmm5, xmm7
 
             /* Create the inverse mask -- xmm6 = ~xmm7 */
-            pxor        xmm6, xmm7
+            //pxor        xmm6, xmm7
 
             /* Clear the low 6 bytes of the second register */
             pand        xmm2, xmm6
@@ -941,7 +957,7 @@
 
         movq        xmm1, QWORD PTR [esi]
         movq        xmm2, QWORD PTR [esi + 2]       /* this one partly overlaps */
-        movdqa      xmm7, [eax]
+        //movdqa      xmm7, [eax]
 
 
         /* Expand to 16 bits */
@@ -957,19 +973,19 @@
         paddw       xmm2, xmm4
         
         /* Merge the 8 results into 1 register */
-            /* Shift words 2-7 to positions 0-5 */
-            psrldq      xmm1, 2
-            /* Shift words 5 and 6 to positions 6 and 7 - since
+            /* First register has words 0-5 filled with sums */
+
+            /* Shift words 4 and 5 to positions 6 and 7 - since
                 we don't care about any of the other positions in this regsiter
-                use the dword 32 bitwise shift which is twice as fast as the 
+                use the qword 64 bitwise shift which is twice as fast as the 
                 dq 128 bitwise one */
-            //psllq       xmm2, 16
+            psllq       xmm2, 32
 
             /* Clear the high 32 bits in the first register */
             pand        xmm1, xmm7
 
             /* Create the inverse mask -- xmm6 = ~xmm7 */
-            pxor        xmm6, xmm7
+            //pxor        xmm6, xmm7
 
             /* Clear the low 6 bytes of the second register */
             pand        xmm2, xmm6
@@ -1147,27 +1163,10 @@
   //PERF_BLOCK_END("RowDiffScan_DiffAndThresholdingMiddleFrag", perf_rds_datmf_time, perf_rds_datmf_time,perf_rds_datmf_time, 10000);
 
 #else
-   /*
-        xmm1 = yuvptr1[0..7]
-        xmm2 = yuvptr2[0..7]
-        xmm3 = TO_16BIT(xmm1)
-        xmm4 = TO_16BIT(xmm4)
-        xmm5 = xmm3 - xmm4 [wordwise]
-        YUVDiffsPtr[0..7] = xmm5
 
-        //Sum into SgcPtr[0] --- need the old Diff's for this. Make sure SgcPtr doesn't alias onto this other stuff we're using
-
-        xmm6 = DoVectorisedPakLowPass  //xmm6 now has the "new" updated Diff's
-
-        //Use the new Diff's to update bits_map_ptr while summing up frag changed pix
-
-
-
-    */
-
     static __declspec(align(16)) unsigned long Some255s[4] = { 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff };
     static __declspec(align(16)) unsigned char temp[48];
-    static unsigned char* temp_ptr = temp;
+    static unsigned short* temp_ptr = (unsigned short*)temp;
     
     static unsigned char* some_255s_ptr = (unsigned char*)Some255s;
     unsigned char* local_sgc_thresh_table = ppi->SgcThreshTable;
@@ -1175,7 +1174,7 @@
     unsigned char* local_srf_pak_thresh_table = ppi->SrfPakThreshTable;
 
     
-    unsigned char val, thresh_val;
+    unsigned char thresh_val;
     int i, FragChangedPixels = 0;
 
 
@@ -1200,7 +1199,7 @@
         punpcklbw   xmm2, xmm0
 
         /* Subtract the YUV Ptr values */
-        psubsw      xmm1, xmm2
+        psubw      xmm1, xmm2   /*should it be subsw?? */
 
         /* Write out to YUVDiffs */
         movdqu      [edi], xmm1
@@ -1215,20 +1214,20 @@
 
     }
 
-    ApplyPakLowPass_Vectorised(ppi, YuvPtr1, temp_ptr + 16);
-    ApplyPakLowPass_Vectorised(ppi, YuvPtr2, temp_ptr + 32);
+    ApplyPakLowPass_Vectorised(ppi, YuvPtr1, temp_ptr + 8); /* Bytes 16-31 */
+    ApplyPakLowPass_Vectorised(ppi, YuvPtr2, temp_ptr + 16); /* Bytes 32 - 47 */
 
     __asm {
         align 16
 
         mov         esi, temp_ptr
-        mov         edi, YUVDiffsPtr  /* Not aligned */
+        //mov         edi, YUVDiffsPtr  /* Not aligned */
         mov         ecx, some_255s_ptr
 
-        
+        //movdqu      xmm3, [esi]  /* Old diffs +255 */
         movdqa      xmm1, [esi + 16]
         movdqa      xmm2, [esi + 32]
-        movdqu      xmm3, [edi]  /* Old diffs */
+        
         movdqa      xmm6, [ecx]
 
         /* New diffs after PakLowPass */
@@ -1245,32 +1244,33 @@
 
 
         /* At this point
-                temp_ptr[0..15] = 8 lots of Early loop diffs - 255
-                temp_ptr[16..31] = 8 lots of late loop diffs - 255
+                temp_ptr[0..15] = 8 lots of Early loop diffs + 255
+                temp_ptr[16..31] = 8 lots of late loop diffs + 255
                 temp_ptr[32..47] = who cares */
 
     }
 
 
-    temp_ptr[32] = local_srf_pak_thresh_table[temp_ptr[0]];
-    temp_ptr[33] = local_srf_pak_thresh_table[temp_ptr[1]];
-    temp_ptr[34] = local_srf_pak_thresh_table[temp_ptr[2]];
-    temp_ptr[35] = local_srf_pak_thresh_table[temp_ptr[3]];
-    temp_ptr[36] = local_srf_pak_thresh_table[temp_ptr[4]];
-    temp_ptr[37] = local_srf_pak_thresh_table[temp_ptr[5]];
-    temp_ptr[38] = local_srf_pak_thresh_table[temp_ptr[6]];
-    temp_ptr[39] = local_srf_pak_thresh_table[temp_ptr[7]];
+    /* Apply the pak threash_table and write into temp[32..47] */
+    temp_ptr[16] = local_srf_pak_thresh_table[temp_ptr[0]];
+    temp_ptr[17] = local_srf_pak_thresh_table[temp_ptr[1]];
+    temp_ptr[18] = local_srf_pak_thresh_table[temp_ptr[2]];
+    temp_ptr[19] = local_srf_pak_thresh_table[temp_ptr[3]];
+    temp_ptr[20] = local_srf_pak_thresh_table[temp_ptr[4]];
+    temp_ptr[21] = local_srf_pak_thresh_table[temp_ptr[5]];
+    temp_ptr[22] = local_srf_pak_thresh_table[temp_ptr[6]];
+    temp_ptr[23] = local_srf_pak_thresh_table[temp_ptr[7]];
 
     __asm {
         align       16
 
-        mov         edx, YUVDiffsPtr
+        //mov         edx, YUVDiffsPtr
         mov         esi, temp_ptr
 
-        /* Read back the old diffs */
-        movdqu     xmm4, [edx]
+        /* Read back the old diffs+255 */
+        movdqu     xmm4, [esi]
 
-        /* Read back the new diffs */
+        /* Read back the new diffs+255 */
         movdqa     xmm3, [esi + 16]
         
         /* Read back the pak_threshed values used in the if statement */
@@ -1283,7 +1283,7 @@
         /* Compare the pak_thresh values to 0, any word which was 0, will now be set to all 1's in xmm0 
                 the if basically said, if it's zero, leave it alone, otherwise, replace it
                 with the new diff */
-        pcmpeqw     xmm0, xmm3
+        pcmpeqw     xmm0, xmm6
 
         /* On the old diffs, keep all the words where the pak_thresh is zero */
         pand        xmm4, xmm0
@@ -1297,18 +1297,15 @@
         /* Merge the old and new diffs */
         por         xmm3, xmm4
 
-        /* Add 255 to the diffs */
-        paddw       xmm3, xmm6
-
         /* Write back out to temp */
         movdqa      [esi + 32], xmm3
     }
 
     for (i = 0; i < 8; i++)
     {
-        val = temp[32 + i];
-        thresh_val = local_srf_thresh_table[val];
-        SgcPtr[0] += local_sgc_thresh_table[val];
+
+        thresh_val = local_srf_thresh_table[temp_ptr[16 + i]];
+        SgcPtr[0] += local_sgc_thresh_table[temp_ptr[i]];
         bits_map_ptr[i] = thresh_val;
         FragChangedPixels += thresh_val;
 
@@ -1316,13 +1313,9 @@
 
 
     return FragChangedPixels;
+  
 
 
-
-
-   
-
-
 #endif
 }