[xiph-commits] r11501 - branches/theora-playtime/lib/x86_32_vs
illiminable at svn.xiph.org
illiminable at svn.xiph.org
Sat Jun 3 05:17:30 PDT 2006
Author: illiminable
Date: 2006-06-03 05:17:26 -0700 (Sat, 03 Jun 2006)
New Revision: 11501
Modified:
branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c
Log:
* Implement sub8x8_128__sse2
Modified: branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c 2006-06-03 10:08:41 UTC (rev 11500)
+++ branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c 2006-06-03 12:17:26 UTC (rev 11501)
@@ -24,8 +24,11 @@
#endif
-static const ogg_int64_t V128 = 0x0080008000800080LL;
+//static const ogg_int64_t V128 = 0x0080008000800080LL;
+static __declspec(align(16)) const unsigned int V128_8x16bits[4] = { 0x00800080, 0x00800080, 0x00800080, 0x00800080 };
+static const unsigned int* V128_8x16bitsPtr = V128_8x16bits;
+
static void sub8x8__sse2 (unsigned char *FiltPtr, unsigned char *ReconPtr,
ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
ogg_uint32_t ReconPixelsPerLine)
@@ -216,142 +219,214 @@
#else
__asm {
align 16
+
+ pxor xmm0, xmm0
+ mov edx, V128_8x16bitsPtr
+ movdqa xmm7, [edx]
- pxor mm7, mm7
+ /* Setup the parameters */
+ mov esi, FiltPtr
+ mov edi, DctInputPtr
+ mov eax, PixelsPerLine
+ lea ebx, [eax + eax*2] ; /* ebx = 3 * PixelsPerLine */
- mov eax, FiltPtr
- mov ebx, DctInputPtr
+ /*
+ Read the first 4 lots of 8x8bits from FiltPtr into the
+ low 64 bits of the registers. Then expand out into
+ 8x16bits to fill all 128 bits of the register
+ */
+ movq xmm1, QWORD PTR [esi]
+ punpcklbw xmm1, xmm0
- movq mm1, V128
+ movq xmm2, QWORD PTR [esi + eax]
+ punpcklbw xmm2, xmm0
- /* ITERATION 1 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
- /* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ movq xmm3, QWORD PTR [esi + eax * 2]
+ punpcklbw xmm3, xmm0
+ movq xmm4, QWORD PTR [esi + ebx]
+ punpcklbw xmm4, xmm0
- /* ITERATION 2 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
- /* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ /* Subtract 128 16bitwise and write*/
+ psubw xmm1, xmm7
+ movdqa [edi], xmm1
+ psubw xmm2, xmm7
+ movdqa [edi + 16], xmm2
+ psubw xmm3, xmm7
+ movdqa [edi + 32], xmm3
+ psubw xmm4, xmm7
+ movdqa [edi + 48], xmm4
- /* ITERATION 3 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
- /* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ /* Advance the source and dest pointer for the next 4 iterations */
+ lea esi, [esi + eax * 4]
+ add edi, 64
+ /* Repeat of above for second round */
+ movq xmm1, QWORD PTR [esi]
+ punpcklbw xmm1, xmm0
- /* ITERATION 4 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
- /* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ movq xmm2, QWORD PTR [esi + eax]
+ punpcklbw xmm2, xmm0
+ movq xmm3, QWORD PTR [esi + eax * 2]
+ punpcklbw xmm3, xmm0
- /* ITERATION 5 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
- /* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ movq xmm4, QWORD PTR [esi + ebx]
+ punpcklbw xmm4, xmm0
+ /* Subtract 128 16bitwise and write*/
+ psubw xmm1, xmm7
+ movdqa [edi], xmm1
+ psubw xmm2, xmm7
+ movdqa [edi + 16], xmm2
+ psubw xmm3, xmm7
+ movdqa [edi + 32], xmm3
+ psubw xmm4, xmm7
+ movdqa [edi + 48], xmm4
- /* ITERATION 6 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
- /* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ };
- /* ITERATION 7 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
- /* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ //__asm {
+ // align 16
- /* ITERATION 8 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
- /* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
+ // pxor mm7, mm7
- };
+ // mov eax, FiltPtr
+ // mov ebx, DctInputPtr
+
+ // movq mm1, V128
+
+ // /* ITERATION 1 */
+ // movq mm0, [eax] /* mm0 = FiltPtr */
+ // movq mm2, mm0 /* dup to prepare for up conversion */
+ // /* convert from UINT8 to INT16 */
+ // punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ // punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ // /* start calculation */
+ // psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ // psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ // movq [ebx], mm0 /* write answer out */
+ // movq [8 + ebx], mm2 /* write answer out */
+ // /* Increment pointers */
+ // add ebx, 16
+ // add eax, PixelsPerLine
+
+
+ // /* ITERATION 2 */
+ // movq mm0, [eax] /* mm0 = FiltPtr */
+ // movq mm2, mm0 /* dup to prepare for up conversion */
+ // /* convert from UINT8 to INT16 */
+ // punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ // punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ // /* start calculation */
+ // psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ // psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ // movq [ebx], mm0 /* write answer out */
+ // movq [8 + ebx], mm2 /* write answer out */
+ // /* Increment pointers */
+ // add ebx, 16
+ // add eax, PixelsPerLine
+
+
+ // /* ITERATION 3 */
+ // movq mm0, [eax] /* mm0 = FiltPtr */
+ // movq mm2, mm0 /* dup to prepare for up conversion */
+ // /* convert from UINT8 to INT16 */
+ // punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ // punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ // /* start calculation */
+ // psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ // psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ // movq [ebx], mm0 /* write answer out */
+ // movq [8 + ebx], mm2 /* write answer out */
+ // /* Increment pointers */
+ // add ebx, 16
+ // add eax, PixelsPerLine
+
+
+ // /* ITERATION 4 */
+ // movq mm0, [eax] /* mm0 = FiltPtr */
+ // movq mm2, mm0 /* dup to prepare for up conversion */
+ // /* convert from UINT8 to INT16 */
+ // punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ // punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ // /* start calculation */
+ // psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ // psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ // movq [ebx], mm0 /* write answer out */
+ // movq [8 + ebx], mm2 /* write answer out */
+ // /* Increment pointers */
+ // add ebx, 16
+ // add eax, PixelsPerLine
+
+
+ // /* ITERATION 5 */
+ // movq mm0, [eax] /* mm0 = FiltPtr */
+ // movq mm2, mm0 /* dup to prepare for up conversion */
+ // /* convert from UINT8 to INT16 */
+ // punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ // punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ // /* start calculation */
+ // psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ // psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ // movq [ebx], mm0 /* write answer out */
+ // movq [8 + ebx], mm2 /* write answer out */
+ // /* Increment pointers */
+ // add ebx, 16
+ // add eax, PixelsPerLine
+
+
+ // /* ITERATION 6 */
+ // movq mm0, [eax] /* mm0 = FiltPtr */
+ // movq mm2, mm0 /* dup to prepare for up conversion */
+ // /* convert from UINT8 to INT16 */
+ // punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ // punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ // /* start calculation */
+ // psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ // psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ // movq [ebx], mm0 /* write answer out */
+ // movq [8 + ebx], mm2 /* write answer out */
+ // /* Increment pointers */
+ // add ebx, 16
+ // add eax, PixelsPerLine
+
+
+ // /* ITERATION 7 */
+ // movq mm0, [eax] /* mm0 = FiltPtr */
+ // movq mm2, mm0 /* dup to prepare for up conversion */
+ // /* convert from UINT8 to INT16 */
+ // punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ // punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ // /* start calculation */
+ // psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ // psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ // movq [ebx], mm0 /* write answer out */
+ // movq [8 + ebx], mm2 /* write answer out */
+ // /* Increment pointers */
+ // add ebx, 16
+ // add eax, PixelsPerLine
+
+
+ // /* ITERATION 8 */
+ // movq mm0, [eax] /* mm0 = FiltPtr */
+ // movq mm2, mm0 /* dup to prepare for up conversion */
+ // /* convert from UINT8 to INT16 */
+ // punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ // punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ // /* start calculation */
+ // psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ // psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ // movq [ebx], mm0 /* write answer out */
+ // movq [8 + ebx], mm2 /* write answer out */
+ // /* Increment pointers */
+ // add ebx, 16
+ // add eax, PixelsPerLine
+
+ //};
#endif
}
@@ -1528,7 +1603,7 @@
{
TH_DEBUG("enabling accelerated x86_32 mmx dsp functions.\n");
funcs->sub8x8 = sub8x8__sse2;
- //funcs->sub8x8_128 = sub8x8_128__sse2;
+ funcs->sub8x8_128 = sub8x8_128__sse2;
//funcs->sub8x8avg2 = sub8x8avg2__sse2;
//funcs->row_sad8 = row_sad8__sse2;
//funcs->col_sad8x8 = col_sad8x8__sse2;
More information about the commits mailing list