[xiph-commits] r11508 - branches/theora-playtime/lib/x86_32_vs

Sat Jun 3 10:40:43 PDT 2006

Author: illiminable
Date: 2006-06-03 10:40:39 -0700 (Sat, 03 Jun 2006)
New Revision: 11508

Modified:
   branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c
Log:
* Implement col_sad8x8__sse2

Modified: branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c
===================================================================

--- branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c	2006-06-03 17:32:27 UTC (rev 11507)
+++ branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c	2006-06-03 17:40:39 UTC (rev 11508)
@@ -584,6 +584,7 @@
     movdqa      xmm1, xmm3
     psrldq      xmm3, 8
 
+    /* eax = max(SadValue, SadValue1) */
     psubusw     xmm1, xmm3
     paddw       xmm1, xmm3
 
@@ -595,54 +596,7 @@
 
   return SadValue;
 
-  
-  //__asm {
-  //  align       16
-  //  mov         ebx, Src1
-  //  mov         ecx, Src2
 
-
-  //  pxor		mm6, mm6		;	/* zero out mm6 for unpack */
-  //  pxor		mm7, mm7		;	/* zero out mm7 for unpack */
-  //  movq		mm0, [ebx]		;	/* take 8 bytes */
-  //  movq		mm1, [ecx]		;	
-
-  //  movq		mm2, mm0		;	
-  //  psubusb		mm0, mm1		;	/* A - B */
-  //  psubusb		mm1, mm2		;	/* B - A */
-  //  por		mm0, mm1		;	/* and or gives abs difference */
-
-  //  movq		mm1, mm0		;	
-
-  //  punpcklbw		mm0, mm6		;	/* ; unpack low four bytes to higher precision */
-  //  punpckhbw		mm1, mm7		;	/* ; unpack high four bytes to higher precision */
-
-  //  movq		mm2, mm0		;	
-  //  movq		mm3, mm1		;	
-  //  psrlq		mm2, 32		;	/* fold and add */
-  //  psrlq		mm3, 32		;	
-  //  paddw		mm0, mm2		;	
-  //  paddw		mm1, mm3		;	
-  //  movq		mm2, mm0		;	
-  //  movq		mm3, mm1		;	
-  //  psrlq		mm2, 16		;	
-  //  psrlq		mm3, 16		;	
-  //  paddw		mm0, mm2		;	
-  //  paddw		mm1, mm3		;	
-
-  //  psubusw		mm1, mm0		;	
-  //  paddw		mm1, mm0		;	/* mm1 = max(mm1, mm0) */
-  //  movd		eax, mm1		;
-
-  //  and         eax, 0xffff
-  //  mov         MaxSad, eax
-  //};
-  // return MaxSad;
-  
-  
-  
- 
-
 #endif
 }
 
@@ -696,84 +650,215 @@
     
   return MaxSad;
 #else
-  ogg_uint32_t MaxSad;
 
+    static __declspec(align(16)) unsigned int temp_regs[8];
+    static unsigned int* const temp_reg_ptr = temp_regs;
+    static unsigned int* const temp_reg_result_ptr = &temp_regs[4];
+ 
+    ogg_uint32_t SadValue;
 
+
     __asm {
         align       16
+        
+        /* Setup the paramters */
         mov         ebx, Src1
         mov         ecx, Src2
+        mov         edx, stride
+        lea         eax, [edx + edx*2]
+        mov         edi, temp_reg_ptr
+        mov         esi, temp_reg_result_ptr
 
-        pxor		mm3, mm3		;	/* zero out mm3 for unpack */
-        pxor		mm4, mm4		;	/* mm4 low sum */
-        pxor		mm5, mm5		;	/* mm5 high sum */
-        pxor		mm6, mm6		;	/* mm6 low sum */
-        pxor		mm7, mm7		;	/* mm7 high sum */
-        mov		edi, 4		;	/* 4 rows */
-        label_1:				;	
-        movq		mm0, [ebx]		;	/* take 8 bytes */
-        movq		mm1, [ecx]		;	/* take 8 bytes */
+        /* Read the first 4 iterations */
+        movq      xmm0, QWORD PTR [ebx]
+        movq      xmm4, QWORD PTR [ebx + edx]
+        movq      xmm2, QWORD PTR [ebx + edx*2]
+        movq      xmm6, QWORD PTR [ebx + eax]
 
-        movq		mm2, mm0		;	
-        psubusb		mm0, mm1		;	/* A - B */
-        psubusb		mm1, mm2		;	/* B - A */
-        por		mm0, mm1		;	/* and or gives abs difference */
-        movq		mm1, mm0		;	
+        movq      xmm1, QWORD PTR [ecx]
+        movq      xmm5, QWORD PTR [ecx + edx]
+        movq      xmm3, QWORD PTR [ecx + edx*2]
+        movq      xmm7, QWORD PTR [ecx + eax]
 
-        punpcklbw		mm0, mm3		;	/* unpack to higher precision for accumulation */
-        paddw		mm4, mm0		;	/* accumulate difference... */
-        punpckhbw		mm1, mm3		;	/* unpack high four bytes to higher precision */
-        paddw		mm5, mm1		;	/* accumulate difference... */
-        add		ebx, stride		;	/* Inc pointer into the new data */
-        add		ecx, stride		;	/* Inc pointer into the new data */
+        /* Consolidate the results from 8 registers of 8x16bits to 4 of 16x8bits */
+            movdqa      [edi], xmm7 /* Save xmm7 */
+            pxor        xmm7, xmm7
 
-        dec		edi		;	
-        jnz		label_1		;	
+            /* Expand everything to 16 bits */
+            punpcklbw   xmm0, xmm7
+            punpcklbw   xmm1, xmm7
+            punpcklbw   xmm2, xmm7
+            punpcklbw   xmm3, xmm7
+            punpcklbw   xmm4, xmm7
+            punpcklbw   xmm5, xmm7
+            punpcklbw   xmm6, xmm7
 
-        mov		edi, 4		;	/* 4 rows */
-        label_2:				;	
-        movq		mm0, [ebx]		;	/* take 8 bytes */
-        movq		mm1, [ecx]		;	/* take 8 bytes */
+            /* Now merge the first 3 */
+            packuswb    xmm0, xmm4
+            packuswb    xmm1, xmm5
+            packuswb    xmm2, xmm6
 
-        movq		mm2, mm0		;	
-        psubusb		mm0, mm1		;	/* A - B */
-        psubusb		mm1, mm2		;	/* B - A */
-        por		mm0, mm1		;	/* and or gives abs difference */
-        movq		mm1, mm0		;	
+            /* Restore xmm7 for the final merge into xmm3 */
+            movdqa      xmm6, [edi]
+            punpcklbw   xmm6, xmm7
+            packuswb    xmm3, xmm6
 
-        punpcklbw		mm0, mm3		;	/* unpack to higher precision for accumulation */
-        paddw		mm6, mm0		;	/* accumulate difference... */
-        punpckhbw		mm1, mm3		;	/* unpack high four bytes to higher precision */
-        paddw		mm7, mm1		;	/* accumulate difference... */
-        add		ebx, stride		;	/* Inc pointer into the new data */
-        add		ecx, stride		;	/* Inc pointer into the new data */
+        /* Duplicate all the registers */
+        movdqa      xmm4, xmm0
+        movdqa      xmm5, xmm1
+        movdqa      xmm6, xmm2
+        movdqa      xmm7, xmm3
 
-        dec		edi		;	
-        jnz		label_2		;	
+        /* result = abs_diff(a,b) = (a-b)|(b-a) */
+        psubusb     xmm0, xmm1
+        psubusb     xmm2, xmm3
 
-        psubusw		mm7, mm6		;	
-        paddw		mm7, mm6		;	/* mm7 = max(mm7, mm6) */
-        psubusw		mm5, mm4		;	
-        paddw		mm5, mm4		;	/* mm5 = max(mm5, mm4) */
-        psubusw		mm7, mm5		;	
-        paddw		mm7, mm5		;	/* mm7 = max(mm5, mm7) */
-        movq		mm6, mm7		;	
-        psrlq		mm6, 32		;	
-        psubusw		mm7, mm6		;	
-        paddw		mm7, mm6		;	/* mm7 = max(mm5, mm7) */
-        movq		mm6, mm7		;	
-        psrlq		mm6, 16		;	
-        psubusw		mm7, mm6		;	
-        paddw		mm7, mm6		;	/* mm7 = max(mm5, mm7) */
-        movd		eax, mm7		;	
-        and		    eax, 0xffff		;
+        psubusb     xmm1, xmm4
+        psubusb     xmm3, xmm6
 
-        mov         MaxSad, eax
+        por         xmm0, xmm1
+        por         xmm2, xmm3
+
+        /* Expand the 32x8bits in 2 registers to 32x16bits in 4 registers */
+        pxor        xmm7, xmm7
+
+        movdqa      xmm1, xmm0
+        movdqa      xmm3, xmm2
+        
+        punpcklbw   xmm0, xmm7
+        punpckhbw   xmm1, xmm7
+        punpcklbw   xmm2, xmm7
+        punpckhbw   xmm3, xmm7
+
+        /* Add them up and then xmm0 contains the 8x16bit SadValue array*/
+        paddw       xmm0, xmm1
+        paddw       xmm2, xmm3
+        paddw       xmm0, xmm2
+
+        /* Save xmm0 for later so we can use all 8 registers again in the memread */
+        /* push        xmm0 */
+        movdqa      [esi], xmm0
+
+        /* Advance the read pointers */
+        lea         ebx, [ebx + edx*4]
+        lea         ecx, [ecx + edx*4]
+
+        /* ----- Repeat of above for the second sad array ------ */
+
+        /* Read the first 4 iterations */
+        movq      xmm0, QWORD PTR [ebx]
+        movq      xmm4, QWORD PTR [ebx + edx]
+        movq      xmm2, QWORD PTR [ebx + edx*2]
+        movq      xmm6, QWORD PTR [ebx + eax]
+
+        movq      xmm1, QWORD PTR [ecx]
+        movq      xmm5, QWORD PTR [ecx + edx]
+        movq      xmm3, QWORD PTR [ecx + edx*2]
+        movq      xmm7, QWORD PTR [ecx + eax]
+
+        /* Consolidate the results from 8 registers of 8x16bits to 4 of 16x8bits */
+            movdqa      [edi], xmm7 /* Save xmm7 */
+            pxor        xmm7, xmm7
+
+            /* Expand everything to 16 bits */
+            punpcklbw   xmm0, xmm7
+            punpcklbw   xmm1, xmm7
+            punpcklbw   xmm2, xmm7
+            punpcklbw   xmm3, xmm7
+            punpcklbw   xmm4, xmm7
+            punpcklbw   xmm5, xmm7
+            punpcklbw   xmm6, xmm7
+
+            /* Now merge the first 3 */
+            packuswb    xmm0, xmm4
+            packuswb    xmm1, xmm5
+            packuswb    xmm2, xmm6
+
+            /* Restore xmm7 for the final merge into xmm3 */
+            movdqa      xmm6, [edi]
+            punpcklbw   xmm6, xmm7
+            packuswb    xmm3, xmm6
+
+
+        /* Duplicate all the registers */
+        movdqa      xmm4, xmm0
+        movdqa      xmm5, xmm1
+        movdqa      xmm6, xmm2
+        movdqa      xmm7, xmm3
+
+        /* result = abs_diff(a,b) = (a-b)|(b-a) */
+        psubusb     xmm0, xmm1
+        psubusb     xmm2, xmm3
+
+        psubusb     xmm1, xmm4
+        psubusb     xmm3, xmm6
+
+        por         xmm0, xmm1
+        por         xmm2, xmm3
+
+        /* Expand the 32x8bits in 2 registers to 32x16bits in 4 registers */
+        pxor        xmm7, xmm7
+
+        movdqa      xmm1, xmm0
+        movdqa      xmm3, xmm2
+        
+        punpcklbw   xmm0, xmm7
+        punpckhbw   xmm1, xmm7
+        punpcklbw   xmm2, xmm7
+        punpckhbw   xmm3, xmm7
+
+        /* Add them up and then xmm0 contains the 8x16bit SadValue array*/
+        paddw       xmm0, xmm1
+        paddw       xmm2, xmm3
+        paddw       xmm0, xmm2
+
+        /* --------------- End of repeat ---------- */
+
+        /* Restore the save sadarray - then xmm0 has sad1, and xmm1 has sad2*/
+        /* pop         xmm1 */
+        movdqa      xmm1, [esi]
+
+                /* Find the maximum sad value */
+
+        /* 
+            Eliminate sad values from each array if they are not max.
+            If any posistion in xmm1 was greater than the one in xmm0
+            It's value is now in xmm0.
+         */
+        psubusw     xmm0, xmm1
+        paddw       xmm0, xmm1
+
+        /* reduce from 8 possibles to 4 with a shift-max */
+        movdqa      xmm1, xmm0
+        psrlq		xmm0, 32
+
+        psubusw     xmm0, xmm1
+        paddw       xmm0, xmm1
+
+        /* reduce from 4 to 2 with another shift-max */
+        movdqa      xmm1, xmm0
+        psrlq		xmm0, 16
+
+        psubusw     xmm0, xmm1
+        paddw       xmm0, xmm1
+        /* reduce to final value with another full register shift-max */
+        movdqa      xmm1, xmm0
+        psrldq      xmm0, 8
+
+        psubusw     xmm0, xmm1
+        paddw       xmm0, xmm1
+
+        /* Put it in the return variable */
+        movd         eax, xmm0
+        and         eax, 0xffff
+        mov         SadValue, eax
+
+
     };
 
-    return MaxSad;
+    return SadValue;
+ 
 
-
 #endif
 }
 
@@ -1457,7 +1542,7 @@
   funcs->sub8x8_128 = sub8x8_128__sse2;
   funcs->sub8x8avg2 = sub8x8avg2__sse2;
   funcs->row_sad8 = row_sad8__sse2;
-  //funcs->col_sad8x8 = col_sad8x8__sse2;
+  funcs->col_sad8x8 = col_sad8x8__sse2;
   //funcs->sad8x8 = sad8x8__sse2;
   //funcs->sad8x8_thres = sad8x8_thres__sse2;
   //funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__sse2;