[xiph-commits] r11536 - in branches/theora-playtime: lib
lib/x86_32_vs win32/VS2005/libtheora
illiminable at svn.xiph.org
illiminable at svn.xiph.org
Wed Jun 7 08:55:49 PDT 2006
Author: illiminable
Date: 2006-06-07 08:55:33 -0700 (Wed, 07 Jun 2006)
New Revision: 11536
Added:
branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c
branches/theora-playtime/lib/x86_32_vs/perf_helper.c
branches/theora-playtime/lib/x86_32_vs/perf_helper.h
Modified:
branches/theora-playtime/lib/dct_decode.c
branches/theora-playtime/lib/dsp.h
branches/theora-playtime/lib/x86_32_vs/idct_sse2.c
branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
Log:
* Init function in dct_decode
* Added some performance data scaffolding
* dequant_slow sse2 implementation best case 293 cycles vs 357 per iteration
* FilterHoriz implementation - still pretty dodgy; a bit faster on AMD, slightly slower on Pentium M
Modified: branches/theora-playtime/lib/dct_decode.c
===================================================================
--- branches/theora-playtime/lib/dct_decode.c 2006-06-07 09:46:15 UTC (rev 11535)
+++ branches/theora-playtime/lib/dct_decode.c 2006-06-07 15:55:33 UTC (rev 11536)
@@ -1243,8 +1243,8 @@
// dsp_mmx_idct_init(funcs);
//}
- //if (cpu_flags & CPU_X86_SSE2) {
- // dsp_sse2_dct_decode_init(funcs);
- //}
+ if (cpu_flags & CPU_X86_SSE2) {
+ dsp_sse2_dct_decode_init(funcs);
+ }
#endif
}
Modified: branches/theora-playtime/lib/dsp.h
===================================================================
--- branches/theora-playtime/lib/dsp.h 2006-06-07 09:46:15 UTC (rev 11535)
+++ branches/theora-playtime/lib/dsp.h 2006-06-07 15:55:33 UTC (rev 11536)
@@ -103,12 +103,12 @@
ogg_int32_t * DCT_block);
/* dct_decode */
- void (*FilterHoriz)(unsigned char * PixelPtr,
- ogg_int32_t LineLength,
+ void (*FilterHoriz)(unsigned char * PixelPtr,
+ ogg_int32_t LineLength,
ogg_int32_t *BoundingValuePtr);
- void (*FilterVert)(unsigned char * PixelPtr,
- ogg_int32_t LineLength,
+ void (*FilterVert)(unsigned char * PixelPtr,
+ ogg_int32_t LineLength,
ogg_int32_t *BoundingValuePtr);
@@ -133,6 +133,7 @@
extern void dsp_sse2_init(DspFunctions *funcs);
extern void dsp_sse2_recon_init(DspFunctions *funcs);
extern void dsp_sse2_idct_init(DspFunctions *funcs);
+extern void dsp_sse2_dct_decode_init(DspFunctions *funcs);
#endif
Added: branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c 2006-06-07 09:46:15 UTC (rev 11535)
+++ branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c 2006-06-07 15:55:33 UTC (rev 11536)
@@ -0,0 +1,195 @@
+
+
+#include "codec_internal.h"
+#include "dsp.h"
+#include "cpu.h"
+
+#include "perf_helper.h"
+
+//static __declspec(align(16)) const unsigned int PixMask[4] = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
+//static const unsigned int* PixMaskPtr = PixMask;
+//
+//static __declspec(align(16)) const unsigned int TripleMask[4] = { 0x0000FFFF, 0xFFFF0000, 0x0000FFFF, 0xFFFF0000 };
+//static const unsigned int* PTripleMaskPtr = TripleMask;
+
+static unsigned __int64 perf_filter_horiz_time;
+static unsigned __int64 perf_filter_horiz_min;
+static unsigned __int64 perf_filter_horiz_count;
+
+static void FilterHoriz__sse2(unsigned char * PixelPtr, /* first of the 4 pixels read on the first row */
+ ogg_int32_t LineLength, /* step added to PixelPtr to reach the next row */
+ ogg_int32_t *BoundingValuePtr){ /* table indexed by the scaled filter value */
+
+#if 1
+ /* Active build: plain C filter over 8 rows, timed with the PERF_BLOCK macros. */
+ /* Per row: FiltVal = p0 - 3*p1 + 3*p2 - p3, mapped through BoundingValuePtr, then added to p1 and subtracted from p2. */
+ /* The inline-assembly variant in the #else branch is compiled out while this condition is 1. */
+ ogg_int32_t j;
+ ogg_int32_t FiltVal;
+ PERF_BLOCK_START();
+ for ( j = 0; j < 8; j++ ){ /* one iteration per row */
+ FiltVal =
+ ( PixelPtr[0] ) -
+ ( PixelPtr[1] * 3 ) +
+ ( PixelPtr[2] * 3 ) -
+ ( PixelPtr[3] );
+
+ FiltVal = *(BoundingValuePtr+((FiltVal + 4) >> 3)); /* add 4, shift right by 3, then look up; the index can be negative, so presumably the pointer targets the middle of a table - TODO confirm */
+
+ PixelPtr[1] = clamp255(PixelPtr[1] + FiltVal); /* only the two middle pixels are rewritten */
+ PixelPtr[2] = clamp255(PixelPtr[2] - FiltVal);
+
+ PixelPtr += LineLength; /* next row */
+
+ }
+ PERF_BLOCK_END("filter horiz C", perf_filter_horiz_time, perf_filter_horiz_count,perf_filter_horiz_min, 10000); /* accumulates elapsed time and prints every 10000 calls */
+
+#else
+ static __declspec(align(16)) unsigned char temp[128]; /* scratch: bytes 0-63 receive per-pixel delta words, bytes 64-127 receive the pixels widened to words */
+ static unsigned char* temp_ptr = temp;
+
+ PERF_BLOCK_START();
+ __asm {
+ align 16
+ mov esi, PixelPtr
+ mov edi, temp_ptr
+ mov ecx, BoundingValuePtr
+
+ push ebp /* ebp and ebx are saved here and restored at the end of the block */
+ push ebx
+ mov ebp, LineLength
+ mov ebx, 8 /* row counter */
+
+ /* Per-row pass: widen the 4 pixels to words and build the matching delta words */
+ loop_start:
+ movzx eax, BYTE PTR [esi]
+ mov [edi+64], ax /* pixel 0 as a word */
+
+ movzx edx, BYTE PTR [esi+1]
+ mov [edi+66], dx /* pixel 1 as a word */
+ sub eax, edx
+ add edx, edx
+ sub eax, edx /* eax = p0 - 3*p1 */
+
+ movzx edx, BYTE PTR [esi+2]
+ mov [edi+68], dx /* pixel 2 as a word */
+ add eax, edx
+ add edx, edx
+ add eax, edx /* eax = p0 - 3*p1 + 3*p2 */
+
+ movzx edx, BYTE PTR [esi+3]
+ mov [edi+70], dx /* pixel 3 as a word */
+ sub eax, edx /* eax = p0 - 3*p1 + 3*p2 - p3 */
+
+ add eax, 4
+ sar eax, 3
+ sal eax, 2 /* scale the index by 4 to form a byte offset into the table */
+
+ mov eax, [eax + ecx] /* table lookup relative to BoundingValuePtr */
+
+ mov WORD PTR [edi], 0 /* delta for pixel 0 is zero */
+ mov [edi + 2], ax /* delta for pixel 1 is +FiltVal */
+ neg ax
+ mov [edi + 4], ax /* delta for pixel 2 is -FiltVal */
+ mov WORD PTR [edi + 6], 0 /* delta for pixel 3 is zero */
+
+
+
+ add edi, 8
+ add esi, ebp
+
+ sub ebx, 1
+ jnz loop_start
+
+ sub edi, 64 /* rewind edi to the start of temp */
+ shl ebp, 3
+ sub esi, ebp /* rewind esi by 8 rows */
+ shr ebp, 3 /* restore ebp to LineLength */
+
+ movdqa xmm1, [edi] /* xmm1-xmm4: delta words, two rows per register */
+ movdqa xmm2, [edi + 16]
+ movdqa xmm3, [edi + 32]
+ movdqa xmm4, [edi + 48]
+
+
+ movdqa xmm5, [edi + 64] /* xmm5-xmm7 and xmm0: pixel words, two rows per register */
+ movdqa xmm6, [edi + 80]
+ movdqa xmm7, [edi + 96]
+ movdqa xmm0, [edi + 112]
+
+ paddsw xmm1, xmm5 /* signed saturating word add: pixel + delta */
+ paddsw xmm2, xmm6
+ paddsw xmm3, xmm7
+ paddsw xmm4, xmm0
+
+ packuswb xmm1, xmm1 /* pack words to bytes with unsigned saturation */
+ movdqa [edi], xmm1
+ packuswb xmm2, xmm2
+ movdqa [edi + 16], xmm2
+ packuswb xmm3, xmm3
+ movdqa [edi + 32], xmm3
+ packuswb xmm4, xmm4
+
+ movdqa [edi + 48], xmm4
+
+ /* Write-back: each 16-byte group in temp now starts with two rows of 4 packed bytes */
+
+ mov ebx, 4 /* 4 iterations, two rows each */
+ write_loop_start:
+ mov eax, [edi]
+ mov edx, [edi + 4]
+ mov [esi], eax /* 4 bytes to the current row */
+ mov [esi + ebp], edx /* 4 bytes to the following row */
+
+ lea esi, [esi + 2*ebp]
+ add edi, 16
+
+ sub ebx, 1
+ jnz write_loop_start
+
+
+
+ pop ebx
+ pop ebp
+ }
+ PERF_BLOCK_END("filter horiz sse2", perf_filter_horiz_time, perf_filter_horiz_count, perf_filter_horiz_min, 10000);
+#endif
+}
+
+static void FilterVert__sse2(unsigned char * PixelPtr, /* stepped back two rows before filtering, see below */
+ ogg_int32_t LineLength, /* step between vertically adjacent pixels */
+ ogg_int32_t *BoundingValuePtr){ /* table indexed by the scaled filter value */
+ ogg_int32_t j;
+ ogg_int32_t FiltVal;
+ /* Plain C filter across 8 columns; each column reads 4 vertically adjacent pixels and rewrites the middle two. */
+ /* the math was correct, but negative array indices are forbidden
+ by ANSI/C99 and will break optimization on several modern
+ compilers */
+ /* Move the base up two rows so that every offset used below is non-negative. */
+ PixelPtr -= 2*LineLength;
+
+ for ( j = 0; j < 8; j++ ) { /* one iteration per column */
+ FiltVal = ( (ogg_int32_t)PixelPtr[0] ) -
+ ( (ogg_int32_t)PixelPtr[LineLength] * 3 ) +
+ ( (ogg_int32_t)PixelPtr[2 * LineLength] * 3 ) -
+ ( (ogg_int32_t)PixelPtr[3 * LineLength] );
+
+ FiltVal = *(BoundingValuePtr+((FiltVal + 4) >> 3)); /* add 4, shift right by 3, then look up; the index can be negative, so presumably the pointer targets the middle of a table - TODO confirm */
+
+ PixelPtr[LineLength] = clamp255(PixelPtr[LineLength] + FiltVal);
+ PixelPtr[2 * LineLength] = clamp255(PixelPtr[2*LineLength] - FiltVal);
+
+ PixelPtr ++; /* next column */
+ }
+}
+
+void dsp_sse2_dct_decode_init(DspFunctions *funcs) /* resets the FilterHoriz perf counters and installs FilterHoriz__sse2 into funcs */
+{
+ TH_DEBUG("enabling accelerated x86_32 mmx dsp functions.\n"); /* NOTE(review): message says mmx although this is the sse2 init - confirm intended wording */
+
+ perf_filter_horiz_time = 0;
+ perf_filter_horiz_min = -1; /* -1 converts to the largest unsigned value, so the first sample becomes the minimum */
+ perf_filter_horiz_count = 0;
+ funcs->FilterHoriz = FilterHoriz__sse2;
+ /* NOTE(review): FilterVert__sse2 is defined in this file but is not installed here. */
+}
\ No newline at end of file
Modified: branches/theora-playtime/lib/x86_32_vs/idct_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/idct_sse2.c 2006-06-07 09:46:15 UTC (rev 11535)
+++ branches/theora-playtime/lib/x86_32_vs/idct_sse2.c 2006-06-07 15:55:33 UTC (rev 11536)
@@ -19,6 +19,8 @@
#include "codec_internal.h"
#include "quant_lookup.h"
+#include "perf_helper.h"
+
#define IdctAdjustBeforeShift 8
/* cos(n*pi/16) or sin(8-n)*pi/16) */
@@ -30,16 +32,23 @@
#define xC6S2 25080
#define xC7S1 12785
+static unsigned __int64 perf_dequant_slow_time;
+static unsigned __int64 perf_dequant_slow_count;
+static unsigned __int64 perf_dequant_slow_min;
static void dequant_slow__sse2( ogg_int16_t * dequant_coeffs,
ogg_int16_t * quantized_list,
ogg_int32_t * DCT_block)
{
-#if 1
+#if 0
+
int i;
+ PERF_BLOCK_START();
for(i=0;i<64;i++)
DCT_block[dezigzag_index[i]] = quantized_list[i] * dequant_coeffs[i];
+
+ PERF_BLOCK_END("dequant_slow C", perf_dequant_slow_time, perf_dequant_slow_count,perf_dequant_slow_min, 5000);
#else
static __declspec(align(16)) ogg_int32_t temp_block[64];
@@ -48,7 +57,7 @@
/* quantized list is not aligned */
-
+ PERF_BLOCK_START();
__asm {
align 16
@@ -147,6 +156,7 @@
jnz write_loop_start
};
+ PERF_BLOCK_END("dequant_slow sse2", perf_dequant_slow_time, perf_dequant_slow_count,perf_dequant_slow_min, 5000);
#endif
}
@@ -590,6 +600,11 @@
void dsp_sse2_idct_init (DspFunctions *funcs)
{
+
+
+ perf_dequant_slow_time = 0;
+ perf_dequant_slow_count = 0;
+ perf_dequant_slow_min = -1;
/* TODO::: Match function order */
funcs->dequant_slow = dequant_slow__sse2;
funcs->IDct1 = IDct1__sse2;
Added: branches/theora-playtime/lib/x86_32_vs/perf_helper.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/perf_helper.c 2006-06-07 09:46:15 UTC (rev 11535)
+++ branches/theora-playtime/lib/x86_32_vs/perf_helper.c 2006-06-07 15:55:33 UTC (rev 11536)
@@ -0,0 +1,19 @@
+#include "perf_helper.h"
+unsigned __int64 GetCPUTime() /* returns the 64-bit value produced by RDTSC */
+{
+ unsigned long upper; /* receives EDX */
+ unsigned long lower; /* receives EAX */
+ unsigned __int64 ret;
+ __asm {
+ RDTSC
+ mov upper, edx
+ mov lower, eax
+ }
+ /* Combine the two 32-bit halves: upper goes into bits 63..32, lower into bits 31..0. */
+ ret = upper;
+ ret <<= 32;
+ ret += lower;
+ return ret;
+
+
+}
Added: branches/theora-playtime/lib/x86_32_vs/perf_helper.h
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/perf_helper.h 2006-06-07 09:46:15 UTC (rev 11535)
+++ branches/theora-playtime/lib/x86_32_vs/perf_helper.h 2006-06-07 15:55:33 UTC (rev 11536)
@@ -0,0 +1,24 @@
+#ifdef WIN32 /* everything in this header is defined only when WIN32 is defined */
+ /* Cycle-count timing helpers built on GetCPUTime. */
+#include <windows.h>
+#include <stdio.h>
+static unsigned __int64 perf_start_time[64]; /* start times of currently open timed blocks, indexed by nesting depth */
+static unsigned __int64 perf_temp; /* elapsed time of the block being closed */
+static unsigned long depth = 0; /* number of currently open timed blocks */
+ /* NOTE(review): these are static definitions in a header, so each source file that includes it gets its own copies. */
+ /* NOTE(review): depth is not range-checked against the 64-entry array. */
+extern unsigned __int64 GetCPUTime();
+
+ /* PERF_BLOCK_START: record the current counter value and increase the nesting depth. */
+#define PERF_BLOCK_START() perf_start_time[depth++] = GetCPUTime();
+ /* PERF_BLOCK_END(s, x, y, l, z): s = label string, x = running total, y = call count, l = running minimum, z = print interval in calls. It pops the matching start time, updates l, x and y, and prints total, count, average and minimum whenever y is a multiple of z. */
+#define PERF_BLOCK_END(s, x, y, l, z) perf_temp = (GetCPUTime() - perf_start_time[--depth]); (l) = ((l) > perf_temp) ? perf_temp : (l); x += perf_temp; (y)++; \
+ if (((y) % (z)) == 0) \
+ { \
+ printf(s " - %lld from %lld iterations -- @%lld cycles -- min(%lld)\n", x, y, (x) / (y), l); \
+ }
+ /* NOTE(review): the macro expands to several statements and is not wrapped in a do-while block, so it is unsafe as the body of an unbraced if or else. */
+ /* NOTE(review): the lld conversions are signed while the values passed at the call sites are unsigned __int64 - confirm the intended format. */
+
+
+#endif /* WIN32 */
\ No newline at end of file
Modified: branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
===================================================================
--- branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj 2006-06-07 09:46:15 UTC (rev 11535)
+++ branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj 2006-06-07 15:55:33 UTC (rev 11536)
@@ -401,6 +401,10 @@
>
</File>
<File
+ RelativePath="..\..\..\lib\x86_32_vs\dct_decode_sse2.c"
+ >
+ </File>
+ <File
RelativePath="..\..\..\lib\dct_encode.c"
>
</File>
@@ -473,6 +477,10 @@
>
</File>
<File
+ RelativePath="..\..\..\lib\x86_32_vs\perf_helper.c"
+ >
+ </File>
+ <File
RelativePath="..\..\..\lib\pp.c"
>
</File>
@@ -543,6 +551,10 @@
>
</File>
<File
+ RelativePath="..\..\..\lib\x86_32_vs\perf_helper.h"
+ >
+ </File>
+ <File
RelativePath="..\..\..\lib\pp.h"
>
</File>
More information about the commits
mailing list