[xiph-commits] r11550 - branches/theora-playtime/lib/x86_32_vs
illiminable at svn.xiph.org
illiminable at svn.xiph.org
Fri Jun 9 05:15:43 PDT 2006
Author: illiminable
Date: 2006-06-09 05:15:37 -0700 (Fri, 09 Jun 2006)
New Revision: 11550
Modified:
branches/theora-playtime/lib/x86_32_vs/idct_sse2.c
branches/theora-playtime/lib/x86_32_vs/perf_helper.h
Log:
* dequant_slow10__sse2 from 100 cycles to 84
Modified: branches/theora-playtime/lib/x86_32_vs/idct_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/idct_sse2.c 2006-06-09 10:09:58 UTC (rev 11549)
+++ branches/theora-playtime/lib/x86_32_vs/idct_sse2.c 2006-06-09 12:15:37 UTC (rev 11550)
@@ -40,7 +40,11 @@
static unsigned __int64 perf_idct1_count;
static unsigned __int64 perf_idct1_min;
+static unsigned __int64 perf_dequant_slow10_time;
+static unsigned __int64 perf_dequant_slow10_count;
+static unsigned __int64 perf_dequant_slow10_min;
+
static void dequant_slow__sse2( ogg_int16_t * dequant_coeffs,
ogg_int16_t * quantized_list,
ogg_int32_t * DCT_block)
@@ -391,17 +395,126 @@
static void dequant_slow10__sse2( ogg_int16_t * dequant_coeffs,
ogg_int16_t * quantized_list,
ogg_int32_t * DCT_block){
+#if 0
int i;
+ PERF_BLOCK_START();
memset(DCT_block,0, 128);
for(i=0;i<10;i++)
DCT_block[dezigzag_index[i]] = quantized_list[i] * dequant_coeffs[i];
+ PERF_BLOCK_END("dequant_slow10 C", perf_dequant_slow10_time, perf_dequant_slow10_count,perf_dequant_slow10_min, 10000);
+#else
+ /* SSE2 path: zero the first 128 bytes of DCT_block, form 16 32-bit products, then scatter products 0..9 through dezigzag_index. */
+ static __declspec(align(16)) unsigned char temp_block[64]; /* 64, not 40: four 16-byte stores land here */
+ static unsigned char* temp_block_ptr = temp_block;
+ static ogg_int32_t* zigzag_ptr = dezigzag_index;
+ /* NOTE(review): temp_block is a shared static, so concurrent calls would race on it -- confirm single-threaded use. */
+ PERF_BLOCK_START();
+ __asm {
+
+ align 16
+ mov edi, DCT_block
+ mov esi, quantized_list
+ mov edx, dequant_coeffs
+ mov eax, temp_block_ptr
+ /* xmm0 = 0, then clear DCT_block[0..31] (same 128 bytes as the memset in the C path) */
+ pxor xmm0, xmm0
+ /* NOTE(review): movdqa assumes DCT_block is 16-byte aligned -- confirm at every call site */
+ movdqa [edi], xmm0
+ movdqa [edi+16], xmm0
+ movdqa [edi+32], xmm0
+ movdqa [edi+48], xmm0
+ movdqa [edi+64], xmm0
+ movdqa [edi+80], xmm0
+ movdqa [edi+96], xmm0
+ movdqa [edi+112], xmm0
+ /* Load 16 quantized values (unaligned) and 16 coefficients; NOTE(review): movdqa assumes dequant_coeffs is 16-byte aligned -- confirm */
+ movdqu xmm1, [esi]
+ movdqu xmm2, [esi + 16] /* Only products 8 and 9 are read back from this half, so a narrower load (movq) may suffice */
+ movdqa xmm3, [edx]
+ movdqa xmm4, [edx + 16]
+
+
+
+ /* Make a copy of xmm1 and xmm2 */
+ movdqa xmm5, xmm1
+ movdqa xmm6, xmm2
+
+ /* Multiply: pmullw keeps the low 16 bits of each product, pmulhw the signed high 16 bits */
+ pmullw xmm1, xmm3
+ pmulhw xmm3, xmm5
+
+ pmullw xmm2, xmm4
+ pmulhw xmm4, xmm6
+
+ /* Interleave the multiplication results */
+ movdqa xmm0, xmm1
+ punpcklwd xmm1, xmm3 /* Now the low 4 x 32 bits */
+ punpckhwd xmm0, xmm3 /* The high 4x32 bits */
+
+ movdqa xmm6, xmm2
+ punpcklwd xmm2, xmm4
+ punpckhwd xmm6, xmm4
+
+ /* Write to temp */
+ /* 4 x 16 = 64 bytes are stored; only the first 40 (products 0..9) are read back below */
+ movdqa [eax], xmm1
+ movdqa [eax + 16], xmm0
+ movdqa [eax + 32], xmm2
+ movdqa [eax + 48], xmm6
+
+ /* Get the zigzag pointer */
+ mov edx, zigzag_ptr
+ /* For i = 0..9: ecx = zigzag index i, esi = product i, DCT_block[ecx] = esi */
+
+
+ mov ecx , [edx]
+ mov esi , [eax]
+ mov [edi + ecx*4] , esi
+
+ mov ecx , [edx + 4]
+ mov esi , [eax + 4]
+ mov [edi + ecx*4] , esi
+
+ mov ecx , [edx + 8]
+ mov esi , [eax + 8]
+ mov [edi + ecx*4] , esi
+
+ mov ecx , [edx + 12]
+ mov esi , [eax + 12]
+ mov [edi + ecx*4] , esi
+
+ mov ecx , [edx + 16]
+ mov esi , [eax + 16]
+ mov [edi + ecx*4] , esi
+ mov ecx , [edx + 20]
+ mov esi , [eax + 20]
+ mov [edi + ecx*4] , esi
+ mov ecx , [edx + 24]
+ mov esi , [eax + 24]
+ mov [edi + ecx*4] , esi
+ mov ecx , [edx + 28]
+ mov esi , [eax + 28]
+ mov [edi + ecx*4] , esi
+
+ mov ecx , [edx + 32]
+ mov esi , [eax + 32]
+ mov [edi + ecx*4] , esi
+ mov ecx , [edx + 36]
+ mov esi , [eax + 36]
+ mov [edi + ecx*4] , esi
+
+
+ }
+ PERF_BLOCK_END("dequant_slow10 sse2", perf_dequant_slow10_time, perf_dequant_slow10_count,perf_dequant_slow10_min, 5000);
+#endif
+
}
void IDct10__sse2( Q_LIST_ENTRY * InputData,
ogg_int16_t *QuantMatrix,
ogg_int16_t * OutputData ){
- ogg_int32_t IntermediateData[64];
+ __declspec(align(16)) ogg_int32_t IntermediateData[64];
ogg_int32_t * ip = IntermediateData;
ogg_int16_t * op = OutputData;
@@ -654,7 +767,7 @@
}
- PERF_BLOCK_END("IDct1 C", perf_idct1_time, perf_idct1_count,perf_idct1_min, 10000);
+ PERF_BLOCK_END("IDct1 sse2", perf_idct1_time, perf_idct1_count,perf_idct1_min, 10000);
#endif
}
@@ -668,6 +781,10 @@
perf_dequant_slow_count = 0;
perf_dequant_slow_min = -1;
+ perf_dequant_slow10_time = 0;
+ perf_dequant_slow10_count = 0;
+ perf_dequant_slow10_min = -1;
+
perf_idct1_time = 0;
perf_idct1_count = 0;
perf_idct1_min = -1;
Modified: branches/theora-playtime/lib/x86_32_vs/perf_helper.h
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/perf_helper.h 2006-06-09 10:09:58 UTC (rev 11549)
+++ branches/theora-playtime/lib/x86_32_vs/perf_helper.h 2006-06-09 12:15:37 UTC (rev 11550)
@@ -6,6 +6,14 @@
static unsigned __int64 perf_temp;
static unsigned long depth = 0;
+/*
+//typedef struct {
+// unsigned __int64 sum;
+// unsigned __int64 count;
+// unsigned __int64 min;
+//
+//} perf_info;
+*/
extern unsigned __int64 GetCPUTime();
#define PERF_DATA_ON
@@ -14,10 +22,14 @@
#define PERF_BLOCK_START() perf_start_time[depth++] = GetCPUTime();
-#define PERF_BLOCK_END(s, x, y, l, z) perf_temp = (GetCPUTime() - perf_start_time[--depth]); (l) = ((l) > perf_temp) ? perf_temp : (l); x += perf_temp; (y)++; \
+#define PERF_BLOCK_END(s, x, y, l, z) /* s: label literal, x: cycle sum, y: call count, l: running minimum, z: print every z calls */ \
+ perf_temp = (GetCPUTime() - perf_start_time[--depth]); /* elapsed since the matching PERF_BLOCK_START */ \
+ (l) = ((l) > perf_temp) ? perf_temp : (l); \
+ (x) += perf_temp; \
+ (y)++; \
if (((y) % (z)) == 0) \
{ \
- printf(s " - %lld from %lld iterations -- @%lld cycles -- min(%lld)\n", x, y, (x) / (y), l); \
+ printf(s " - %llu from %llu iterations -- @%llu cycles -- min(%llu)\n", (x), (y), (x) / (y), (l)); /* counters are unsigned 64-bit */ \
}
#else
More information about the commits
mailing list