[xiph-commits] r11517 - branches/theora-playtime/lib/x86_32_vs
illiminable at svn.xiph.org
illiminable at svn.xiph.org
Sun Jun 4 10:03:42 PDT 2006
Author: illiminable
Date: 2006-06-04 10:03:37 -0700 (Sun, 04 Jun 2006)
New Revision: 11517
Modified:
branches/theora-playtime/lib/x86_32_vs/idct_sse2.c
Log:
* Roll the loops back up in dequant
Modified: branches/theora-playtime/lib/x86_32_vs/idct_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/idct_sse2.c 2006-06-04 13:29:50 UTC (rev 11516)
+++ branches/theora-playtime/lib/x86_32_vs/idct_sse2.c 2006-06-04 17:03:37 UTC (rev 11517)
@@ -65,6 +65,9 @@
*/
/* 16 Iterations at a time */
+ mov ecx, 4 /* 4 lots of 16 */
+
+ calc_loop_start:
/* Read 16x16 bits of quatized_list and dequant_coeffs */
movdqu xmm1, [esi]
movdqu xmm5, [esi + 16]
@@ -98,115 +101,128 @@
movdqa [eax + 32], xmm5
movdqa [eax + 48], xmm2
+ /* Update the pointers */
+ add esi, 32
+ add ebx, 32
+ add eax, 64
+ /* Loop check */
+ sub ecx, 1
+ jnz calc_loop_start
- /* 16 Iterations at a time */
- /* Read 16x16 bits of quatized_list and dequant_coeffs */
- movdqu xmm1, [esi + 32]
- movdqu xmm5, [esi + 48]
+ /* Restore the pointer to the start of the temp buffer */
+ sub eax, 256
+
- movdqa xmm2, [ebx + 32]
- movdqa xmm6, [ebx + 48]
- /* Make a copy of xmm1 and xmm5 */
- movdqa xmm7, xmm1
- movdqa xmm0, xmm5
- /* Multiply */
- pmullw xmm1, xmm2
- pmulhw xmm2, xmm7
+ ///* 16 Iterations at a time */
+ // /* Read 16x16 bits of quatized_list and dequant_coeffs */
+ // movdqu xmm1, [esi + 32]
+ // movdqu xmm5, [esi + 48]
- pmullw xmm5, xmm6
- pmulhw xmm6, xmm0
+ // movdqa xmm2, [ebx + 32]
+ // movdqa xmm6, [ebx + 48]
- /* Interleave the multiplicataion results */
- movdqa xmm0, xmm1
- punpcklwd xmm1, xmm2 /* Now the low 4 x 32 bits */
- punpckhwd xmm0, xmm2 /* The high 4x32 bits */
+ // /* Make a copy of xmm1 and xmm5 */
+ // movdqa xmm7, xmm1
+ // movdqa xmm0, xmm5
- movdqa xmm2, xmm5
- punpcklwd xmm5, xmm6
- punpckhwd xmm2, xmm6
+ // /* Multiply */
+ // pmullw xmm1, xmm2
+ // pmulhw xmm2, xmm7
- /* Write the 16x32 bits of output to temp space */
- movdqa [eax + 64], xmm1
- movdqa [eax + 80], xmm0
- movdqa [eax + 96], xmm5
- movdqa [eax + 112], xmm2
+ // pmullw xmm5, xmm6
+ // pmulhw xmm6, xmm0
- /* 16 Iterations at a time */
- /* Read 16x16 bits of quatized_list and dequant_coeffs */
- movdqu xmm1, [esi + 64]
- movdqu xmm5, [esi + 80]
+ // /* Interleave the multiplicataion results */
+ // movdqa xmm0, xmm1
+ // punpcklwd xmm1, xmm2 /* Now the low 4 x 32 bits */
+ // punpckhwd xmm0, xmm2 /* The high 4x32 bits */
- movdqa xmm2, [ebx + 64]
- movdqa xmm6, [ebx + 80]
+ // movdqa xmm2, xmm5
+ // punpcklwd xmm5, xmm6
+ // punpckhwd xmm2, xmm6
- /* Make a copy of xmm1 and xmm5 */
- movdqa xmm7, xmm1
- movdqa xmm0, xmm5
+ // /* Write the 16x32 bits of output to temp space */
+ // movdqa [eax + 64], xmm1
+ // movdqa [eax + 80], xmm0
+ // movdqa [eax + 96], xmm5
+ // movdqa [eax + 112], xmm2
- /* Multiply */
- pmullw xmm1, xmm2
- pmulhw xmm2, xmm7
+ ///* 16 Iterations at a time */
+ // /* Read 16x16 bits of quatized_list and dequant_coeffs */
+ // movdqu xmm1, [esi + 64]
+ // movdqu xmm5, [esi + 80]
- pmullw xmm5, xmm6
- pmulhw xmm6, xmm0
+ // movdqa xmm2, [ebx + 64]
+ // movdqa xmm6, [ebx + 80]
- /* Interleave the multiplicataion results */
- movdqa xmm0, xmm1
- punpcklwd xmm1, xmm2 /* Now the low 4 x 32 bits */
- punpckhwd xmm0, xmm2 /* The high 4x32 bits */
+ // /* Make a copy of xmm1 and xmm5 */
+ // movdqa xmm7, xmm1
+ // movdqa xmm0, xmm5
- movdqa xmm2, xmm5
- punpcklwd xmm5, xmm6
- punpckhwd xmm2, xmm6
+ // /* Multiply */
+ // pmullw xmm1, xmm2
+ // pmulhw xmm2, xmm7
- /* Write the 16x32 bits of output to temp space */
- movdqa [eax + 128], xmm1
- movdqa [eax + 144], xmm0
- movdqa [eax + 160], xmm5
- movdqa [eax + 176], xmm2
+ // pmullw xmm5, xmm6
+ // pmulhw xmm6, xmm0
+ // /* Interleave the multiplicataion results */
+ // movdqa xmm0, xmm1
+ // punpcklwd xmm1, xmm2 /* Now the low 4 x 32 bits */
+ // punpckhwd xmm0, xmm2 /* The high 4x32 bits */
+ // movdqa xmm2, xmm5
+ // punpcklwd xmm5, xmm6
+ // punpckhwd xmm2, xmm6
- /* 16 Iterations at a time */
- /* Read 16x16 bits of quatized_list and dequant_coeffs */
- movdqu xmm1, [esi + 96]
- movdqu xmm5, [esi + 112]
+ // /* Write the 16x32 bits of output to temp space */
+ // movdqa [eax + 128], xmm1
+ // movdqa [eax + 144], xmm0
+ // movdqa [eax + 160], xmm5
+ // movdqa [eax + 176], xmm2
- movdqa xmm2, [ebx + 96]
- movdqa xmm6, [ebx + 112]
- /* Make a copy of xmm1 and xmm5 */
- movdqa xmm7, xmm1
- movdqa xmm0, xmm5
- /* Multiply */
- pmullw xmm1, xmm2
- pmulhw xmm2, xmm7
+ ///* 16 Iterations at a time */
+ // /* Read 16x16 bits of quatized_list and dequant_coeffs */
+ // movdqu xmm1, [esi + 96]
+ // movdqu xmm5, [esi + 112]
- pmullw xmm5, xmm6
- pmulhw xmm6, xmm0
+ // movdqa xmm2, [ebx + 96]
+ // movdqa xmm6, [ebx + 112]
- /* Interleave the multiplicataion results */
- movdqa xmm0, xmm1
- punpcklwd xmm1, xmm2 /* Now the low 4 x 32 bits */
- punpckhwd xmm0, xmm2 /* The high 4x32 bits */
+ // /* Make a copy of xmm1 and xmm5 */
+ // movdqa xmm7, xmm1
+ // movdqa xmm0, xmm5
- movdqa xmm2, xmm5
- punpcklwd xmm5, xmm6
- punpckhwd xmm2, xmm6
+ // /* Multiply */
+ // pmullw xmm1, xmm2
+ // pmulhw xmm2, xmm7
- /* Write the 16x32 bits of output to temp space */
- movdqa [eax + 192], xmm1
- movdqa [eax + 208], xmm0
- movdqa [eax + 224], xmm5
- movdqa [eax + 240], xmm2
+ // pmullw xmm5, xmm6
+ // pmulhw xmm6, xmm0
+ // /* Interleave the multiplicataion results */
+ // movdqa xmm0, xmm1
+ // punpcklwd xmm1, xmm2 /* Now the low 4 x 32 bits */
+ // punpckhwd xmm0, xmm2 /* The high 4x32 bits */
+
+ // movdqa xmm2, xmm5
+ // punpcklwd xmm5, xmm6
+ // punpckhwd xmm2, xmm6
+
+ // /* Write the 16x32 bits of output to temp space */
+ // movdqa [eax + 192], xmm1
+ // movdqa [eax + 208], xmm0
+ // movdqa [eax + 224], xmm5
+ // movdqa [eax + 240], xmm2
+
/* Now follow the pattern to write - can't use simd */
- mov ebx, 4
- loop_start:
+ mov ebx, 8
+ write_loop_start:
mov ecx , [edx]
mov esi , [eax]
mov [edi + ecx*4] , esi
@@ -233,39 +249,47 @@
mov esi , [eax + 28]
mov [edi + ecx*4] , esi
- mov ecx , [edx + 32]
- mov esi , [eax + 32]
- mov [edi + ecx*4] , esi
- mov ecx , [edx + 36]
- mov esi , [eax + 36]
- mov [edi + ecx*4] , esi
- mov ecx , [edx + 40]
- mov esi , [eax + 40]
- mov [edi + ecx*4] , esi
- mov ecx , [edx + 44]
- mov esi , [eax + 44]
- mov [edi + ecx*4] , esi
+ /* Update the pointers */
+ add eax, 32
+ add edx, 32
- mov ecx , [edx + 48]
- mov esi , [eax + 48]
- mov [edi + ecx*4] , esi
- mov ecx , [edx + 52]
- mov esi , [eax + 52]
- mov [edi + ecx*4] , esi
- mov ecx , [edx + 56]
- mov esi , [eax + 56]
- mov [edi + ecx*4] , esi
- mov ecx , [edx + 60]
- mov esi , [eax + 60]
- mov [edi + ecx*4] , esi
+ /* Check the loop */
+ sub ebx, 1
+ jnz write_loop_start
- add eax, 64
- add edx, 64
+ //mov ecx , [edx + 32]
+ //mov esi , [eax + 32]
+ //mov [edi + ecx*4] , esi
+ //mov ecx , [edx + 36]
+ //mov esi , [eax + 36]
+ //mov [edi + ecx*4] , esi
+ //mov ecx , [edx + 40]
+ //mov esi , [eax + 40]
+ //mov [edi + ecx*4] , esi
+ //mov ecx , [edx + 44]
+ //mov esi , [eax + 44]
+ //mov [edi + ecx*4] , esi
- sub ebx, 1
- jnz loop_start
+ //mov ecx , [edx + 48]
+ //mov esi , [eax + 48]
+ //mov [edi + ecx*4] , esi
+ //mov ecx , [edx + 52]
+ //mov esi , [eax + 52]
+ //mov [edi + ecx*4] , esi
+ //mov ecx , [edx + 56]
+ //mov esi , [eax + 56]
+ //mov [edi + ecx*4] , esi
+ //mov ecx , [edx + 60]
+ //mov esi , [eax + 60]
+ //mov [edi + ecx*4] , esi
+ //add eax, 64
+ //add edx, 64
+ //sub ebx, 1
+ //jnz write_loop_start
+
+
};
#endif
}
More information about the commits mailing list