[xiph-commits] r11510 - branches/theora-playtime/lib/x86_32_vs
illiminable at svn.xiph.org
Sat Jun 3 11:29:22 PDT 2006
Author: illiminable
Date: 2006-06-03 11:29:18 -0700 (Sat, 03 Jun 2006)
New Revision: 11510
Modified:
branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c
Log:
* Implement sad8x8__sse2
Modified: branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c 2006-06-03 17:47:52 UTC (rev 11509)
+++ branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c 2006-06-03 18:29:18 UTC (rev 11510)
@@ -657,7 +657,10 @@
ogg_uint32_t SadValue;
+ /* TODO::: It may not be worth contracting to 8 bit in the middle
+ The conversion back and forth possibly outweighs the saving */
+
__asm {
align 16
@@ -887,177 +890,163 @@
return sad;
#else
+
ogg_uint32_t DiffVal;
__asm {
align 16
- mov ebx, ptr1
- mov edx, ptr2
+ mov eax, ptr1
+ mov ebx, ptr2
+ mov ecx, stride1
+ mov edx, stride2
+ lea edi, [ecx + ecx*2]
+ lea esi, [edx + edx*2]
- pxor mm6, mm6 ; /* zero out mm6 for unpack */
- pxor mm7, mm7 ; /* mm7 contains the result */
-
- ; /* ITERATION 1 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ pxor xmm2, xmm2 /* Result */
+ pxor xmm3, xmm3
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ /* Iteration 1-4 */
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ /*Read 2 lots of 8 bytes from each */
+ movq xmm0, QWORD PTR [eax]
+ movq xmm1, QWORD PTR [eax + ecx]
- ; /* ITERATION 2 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq xmm4, QWORD PTR [ebx]
+ movq xmm5, QWORD PTR [ebx + edx]
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ /* Absolute difference */
+ movq xmm6, xmm0
+ movq xmm7, xmm1
+ psubusb xmm0, xmm4
+ psubusb xmm1, xmm5
+ psubusb xmm4, xmm6
+ psubusb xmm5, xmm7
+ por xmm0, xmm4
+ por xmm1, xmm5
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ /* Expand to 16 bits */
+ punpcklbw xmm0, xmm3
+ punpcklbw xmm1, xmm3
+ /* Accumulate */
+ paddw xmm0, xmm1
+ paddw xmm2, xmm0
- ; /* ITERATION 3 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ /* ----- half ----- */
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ /*Read second 2 lots of 8 bytes from each */
+ movq xmm0, QWORD PTR [eax + ecx * 2]
+ movq xmm1, QWORD PTR [eax + edi]
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
- ; /* ITERATION 4 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq xmm4, QWORD PTR [ebx + edx * 2]
+ movq xmm5, QWORD PTR [ebx + esi]
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ /* Absolute difference */
+ movq xmm6, xmm0
+ movq xmm7, xmm1
+ psubusb xmm0, xmm4
+ psubusb xmm1, xmm5
+ psubusb xmm4, xmm6
+ psubusb xmm5, xmm7
+ por xmm0, xmm4
+ por xmm1, xmm5
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ /* Expand to 16 bits */
+ punpcklbw xmm0, xmm3
+ punpcklbw xmm1, xmm3
+ /* Accumulate */
+ paddw xmm0, xmm1
+ paddw xmm2, xmm0
- ; /* ITERATION 5 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ /* Advance read ptrs */
+ lea eax, [eax + ecx*4]
+ lea ebx, [ebx + edx*4]
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ /* Iteration 5-8 */
+ /*Read 2 lots of 8 bytes from each */
+ movq xmm0, QWORD PTR [eax]
+ movq xmm1, QWORD PTR [eax + ecx]
- ; /* ITERATION 6 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ movq xmm4, QWORD PTR [ebx]
+ movq xmm5, QWORD PTR [ebx + edx]
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ /* Absolute difference */
+ movq xmm6, xmm0
+ movq xmm7, xmm1
+ psubusb xmm0, xmm4
+ psubusb xmm1, xmm5
+ psubusb xmm4, xmm6
+ psubusb xmm5, xmm7
+ por xmm0, xmm4
+ por xmm1, xmm5
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ /* Expand to 16 bits */
+ punpcklbw xmm0, xmm3
+ punpcklbw xmm1, xmm3
+ /* Accumulate */
+ paddw xmm0, xmm1
+ paddw xmm2, xmm0
- ; /* ITERATION 7 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ /* ----- half ----- */
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ /*Read second 2 lots of 8 bytes from each */
+ movq xmm0, QWORD PTR [eax + ecx * 2]
+ movq xmm1, QWORD PTR [eax + edi]
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ movq xmm4, QWORD PTR [ebx + edx * 2]
+ movq xmm5, QWORD PTR [ebx + esi]
+ /* Absolute difference */
+ movq xmm6, xmm0
+ movq xmm7, xmm1
+ psubusb xmm0, xmm4
+ psubusb xmm1, xmm5
+ psubusb xmm4, xmm6
+ psubusb xmm5, xmm7
+ por xmm0, xmm4
+ por xmm1, xmm5
- ; /* ITERATION 8 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
+ /* Expand to 16 bits */
+ punpcklbw xmm0, xmm3
+ punpcklbw xmm1, xmm3
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
+ /* Accumulate */
+ paddw xmm0, xmm1
+ paddw xmm2, xmm0
+
+ /*---------------------------*/
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
+ /* Add the items in the result */
+ movdqa xmm0, xmm2
+ psrlq xmm2, 32
+ paddw xmm0, xmm2
- ; /* ------ */
+ movdqa xmm2, xmm0
+ psrlq xmm0, 16
- movq mm0, mm7 ;
- psrlq mm7, 32 ;
- paddw mm7, mm0 ;
- movq mm0, mm7 ;
- psrlq mm7, 16 ;
- paddw mm7, mm0 ;
- movd eax, mm7 ;
- and eax, 0xffff ;
+ paddw xmm2, xmm0
+ movdqa xmm0, xmm2
+ psrldq xmm2, 8
+ paddw xmm0, xmm2
+
+ /* Put it in the return variable */
+ movd eax, xmm0
+ and eax, 0xffff
mov DiffVal, eax
- };
- return DiffVal;
+ };
+ return DiffVal;
#endif
@@ -1543,8 +1532,8 @@
funcs->sub8x8avg2 = sub8x8avg2__sse2;
funcs->row_sad8 = row_sad8__sse2;
funcs->col_sad8x8 = col_sad8x8__sse2;
- //funcs->sad8x8 = sad8x8__sse2;
- //funcs->sad8x8_thres = sad8x8_thres__sse2;
+ funcs->sad8x8 = sad8x8__sse2;
+ funcs->sad8x8_thres = sad8x8_thres__sse2;
//funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__sse2;
//funcs->intra8x8_err = intra8x8_err__sse2;
//funcs->inter8x8_err = inter8x8_err__sse2;
More information about the commits mailing list