[xiph-commits] r11504 - branches/theora-playtime/lib/x86_32_vs
illiminable at svn.xiph.org
illiminable at svn.xiph.org
Sat Jun 3 07:22:14 PDT 2006
Author: illiminable
Date: 2006-06-03 07:22:10 -0700 (Sat, 03 Jun 2006)
New Revision: 11504
Modified:
branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c
Log:
* Implement sub8x8avg2__sse2
Modified: branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c 2006-06-03 12:33:36 UTC (rev 11503)
+++ branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c 2006-06-03 14:22:10 UTC (rev 11504)
@@ -322,262 +322,182 @@
DctInputPtr += 8;
}
#else
-
__asm {
align 16
- pxor mm7, mm7
+ pxor xmm0, xmm0
- mov eax, FiltPtr
- mov ebx, ReconPtr1
- mov ecx, ReconPtr2
- mov edx, DctInputPtr
+ /* Setup input params */
+ mov eax, ReconPtr1
+ mov ebx, ReconPtr2
+ mov ecx, PixelsPerLine
+ mov edx, ReconPixelsPerLine
+ mov esi, FiltPtr
+ mov edi, DctInputPtr
- /* ITERATION 1 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
-
- /* ITERATION 2 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* ITERATION 1&2 */
+ /* Read 2 iterations worth of the input arrays */
+ movq xmm1, QWORD PTR [eax]
+ movq xmm2, QWORD PTR [eax + edx]
+ movq xmm3, QWORD PTR [ebx]
+ movq xmm4, QWORD PTR [ebx + edx]
+ movq xmm5, QWORD PTR [esi]
+ movq xmm6, QWORD PTR [esi + ecx]
- /* ITERATION 3 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* Extend out to int16's */
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm4, xmm0
+ punpcklbw xmm5, xmm0
+ punpcklbw xmm6, xmm0
+ /* Average ReconPtr1 and 2 */
+ paddw xmm1, xmm3
+ paddw xmm2, xmm4
+ psrlw xmm1, 1
+ psrlw xmm2, 1
- /* ITERATION 4 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* Do Result = FiltPtr[i] - avg(ReconPtr1[i], ReconPtr2[i]) */
+ psubw xmm5, xmm1
+ psubw xmm6, xmm2
+ /* Write out two iterations worth */
+ movdqa [edi], xmm5
+ movdqa [edi + 16], xmm6
- /* ITERATION 5 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* Update pointers */
+ lea eax, [eax + edx*2]
+ lea ebx, [ebx + edx*2]
+ lea esi, [esi + ecx*2]
+ add edi, 32
- /* ITERATION 6 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* ITERATION 3&4 */
+ /* Read 2 iterations worth of the input arrays */
+ movq xmm1, QWORD PTR [eax]
+ movq xmm2, QWORD PTR [eax + edx]
+ movq xmm3, QWORD PTR [ebx]
+ movq xmm4, QWORD PTR [ebx + edx]
+ movq xmm5, QWORD PTR [esi]
+ movq xmm6, QWORD PTR [esi + ecx]
- /* ITERATION 7 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* Extend out to int16's */
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm4, xmm0
+ punpcklbw xmm5, xmm0
+ punpcklbw xmm6, xmm0
+ /* Average ReconPtr1 and 2 */
+ paddw xmm1, xmm3
+ paddw xmm2, xmm4
+ psrlw xmm1, 1
+ psrlw xmm2, 1
- /* ITERATION 8 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
+ /* Do Result = FiltPtr[i] - avg(ReconPtr1[i], ReconPtr2[i]) */
+ psubw xmm5, xmm1
+ psubw xmm6, xmm2
- };
+ /* Write out two iterations worth */
+ movdqa [edi], xmm5
+ movdqa [edi + 16], xmm6
+ /* Update pointers */
+ lea eax, [eax + edx*2]
+ lea ebx, [ebx + edx*2]
+ lea esi, [esi + ecx*2]
+ add edi, 32
+
+ /* ITERATION 5&6 */
+ /* Read 2 iterations worth of the input arrays */
+ movq xmm1, QWORD PTR [eax]
+ movq xmm2, QWORD PTR [eax + edx]
+ movq xmm3, QWORD PTR [ebx]
+ movq xmm4, QWORD PTR [ebx + edx]
+ movq xmm5, QWORD PTR [esi]
+ movq xmm6, QWORD PTR [esi + ecx]
+
+ /* Extend out to int16's */
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm4, xmm0
+ punpcklbw xmm5, xmm0
+ punpcklbw xmm6, xmm0
+
+ /* Average ReconPtr1 and 2 */
+ paddw xmm1, xmm3
+ paddw xmm2, xmm4
+ psrlw xmm1, 1
+ psrlw xmm2, 1
+
+ /* Do Result = FiltPtr[i] - avg(ReconPtr1[i], ReconPtr2[i]) */
+ psubw xmm5, xmm1
+ psubw xmm6, xmm2
+
+ /* Write out two iterations worth */
+ movdqa [edi], xmm5
+ movdqa [edi + 16], xmm6
+
+ /* Update pointers */
+ lea eax, [eax + edx*2]
+ lea ebx, [ebx + edx*2]
+ lea esi, [esi + ecx*2]
+ add edi, 32
+
+
+
+
+ /* ITERATION 7&8 */
+
+ /* Read 2 iterations worth of the input arrays */
+ movq xmm1, QWORD PTR [eax]
+ movq xmm2, QWORD PTR [eax + edx]
+ movq xmm3, QWORD PTR [ebx]
+ movq xmm4, QWORD PTR [ebx + edx]
+ movq xmm5, QWORD PTR [esi]
+ movq xmm6, QWORD PTR [esi + ecx]
+
+ /* Extend out to int16's */
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm4, xmm0
+ punpcklbw xmm5, xmm0
+ punpcklbw xmm6, xmm0
+
+ /* Average ReconPtr1 and 2 */
+ paddw xmm1, xmm3
+ paddw xmm2, xmm4
+ psrlw xmm1, 1
+ psrlw xmm2, 1
+
+ /* Do Result = FiltPtr[i] - avg(ReconPtr1[i], ReconPtr2[i]) */
+ psubw xmm5, xmm1
+ psubw xmm6, xmm2
+
+ /* Write out two iterations worth */
+ movdqa [edi], xmm5
+ movdqa [edi + 16], xmm6
+
+ /* Update pointers -- not needed after the final pair of rows, so left disabled */
+ //lea eax, [eax + edx*2]
+ //lea ebx, [ebx + edx*2]
+ //lea esi, [esi + ecx*2]
+ //add edi, 32
+};
+
+
+
#endif
}
@@ -1465,7 +1385,7 @@
TH_DEBUG("enabling accelerated x86_32 mmx dsp functions.\n");
funcs->sub8x8 = sub8x8__sse2;
funcs->sub8x8_128 = sub8x8_128__sse2;
- //funcs->sub8x8avg2 = sub8x8avg2__sse2;
+ funcs->sub8x8avg2 = sub8x8avg2__sse2;
//funcs->row_sad8 = row_sad8__sse2;
//funcs->col_sad8x8 = col_sad8x8__sse2;
//funcs->sad8x8 = sad8x8__sse2;
More information about the commits
mailing list