[xiph-commits] r11536 - in branches/theora-playtime: lib lib/x86_32_vs win32/VS2005/libtheora

illiminable at svn.xiph.org illiminable at svn.xiph.org
Wed Jun 7 08:55:49 PDT 2006


Author: illiminable
Date: 2006-06-07 08:55:33 -0700 (Wed, 07 Jun 2006)
New Revision: 11536

Added:
   branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c
   branches/theora-playtime/lib/x86_32_vs/perf_helper.c
   branches/theora-playtime/lib/x86_32_vs/perf_helper.h
Modified:
   branches/theora-playtime/lib/dct_decode.c
   branches/theora-playtime/lib/dsp.h
   branches/theora-playtime/lib/x86_32_vs/idct_sse2.c
   branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
Log:
* Init function in dct_decode
* Some performance data scaffolding added
* dequant_slow sse2 implementation best case 293 cycles vs 357 per iteration
* FilterHoriz implementation - still pretty dodgy; a bit faster on AMD, slightly slower on Pentium M

Modified: branches/theora-playtime/lib/dct_decode.c
===================================================================
--- branches/theora-playtime/lib/dct_decode.c	2006-06-07 09:46:15 UTC (rev 11535)
+++ branches/theora-playtime/lib/dct_decode.c	2006-06-07 15:55:33 UTC (rev 11536)
@@ -1243,8 +1243,8 @@
   //  dsp_mmx_idct_init(funcs);
   //}
 
-  //if (cpu_flags & CPU_X86_SSE2) {
-  //  dsp_sse2_dct_decode_init(funcs);
-  //}
+  if (cpu_flags & CPU_X86_SSE2) {
+    dsp_sse2_dct_decode_init(funcs);
+  }
 #endif
 }

Modified: branches/theora-playtime/lib/dsp.h
===================================================================
--- branches/theora-playtime/lib/dsp.h	2006-06-07 09:46:15 UTC (rev 11535)
+++ branches/theora-playtime/lib/dsp.h	2006-06-07 15:55:33 UTC (rev 11536)
@@ -103,12 +103,12 @@
                    ogg_int32_t * DCT_block);
 
   /* dct_decode */
-  void (*FilterHoriz)(unsigned char * PixelPtr,
-                        ogg_int32_t LineLength,
+  void (*FilterHoriz)(unsigned char * PixelPtr,
+                        ogg_int32_t LineLength,
                         ogg_int32_t *BoundingValuePtr);
 
-  void (*FilterVert)(unsigned char * PixelPtr,
-                ogg_int32_t LineLength,
+  void (*FilterVert)(unsigned char * PixelPtr,
+                ogg_int32_t LineLength,
                 ogg_int32_t *BoundingValuePtr);
 
 
@@ -133,6 +133,7 @@
 extern void dsp_sse2_init(DspFunctions *funcs);
 extern void dsp_sse2_recon_init(DspFunctions *funcs);
 extern void dsp_sse2_idct_init(DspFunctions *funcs);
+extern void dsp_sse2_dct_decode_init(DspFunctions *funcs);
 
 #endif
 

Added: branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c	2006-06-07 09:46:15 UTC (rev 11535)
+++ branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c	2006-06-07 15:55:33 UTC (rev 11536)
@@ -0,0 +1,195 @@
+
+
+#include "codec_internal.h"
+#include "dsp.h"
+#include "cpu.h"
+
+#include "perf_helper.h"
+
+//static __declspec(align(16)) const unsigned int PixMask[4] = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
+//static const unsigned int* PixMaskPtr = PixMask;
+//
+//static __declspec(align(16)) const unsigned int TripleMask[4] = { 0x0000FFFF, 0xFFFF0000, 0x0000FFFF, 0xFFFF0000 };
+//static const unsigned int* PTripleMaskPtr = TripleMask;
+
+static unsigned __int64 perf_filter_horiz_time;   /* ticks accumulated by PERF_BLOCK_END over all timed FilterHoriz__sse2 calls */
+static unsigned __int64 perf_filter_horiz_min;    /* smallest single-call tick count recorded so far */
+static unsigned __int64 perf_filter_horiz_count;  /* number of timed FilterHoriz__sse2 calls */
+
+static void FilterHoriz__sse2(unsigned char * PixelPtr,
+                        ogg_int32_t LineLength,
+                        ogg_int32_t *BoundingValuePtr){
+  /* Filters 8 rows, LineLength bytes apart, starting at PixelPtr: each row's pixels [1] and [2] get a table-bounded correction. */
+#if 1  /* 1 selects the plain C path below; the inline-asm SSE2 path in the #else branch is compiled out */
+  /* FiltVal = p[0] - 3*p[1] + 3*p[2] - p[3] can be negative, so the table index (FiltVal + 4) >> 3 */
+  /* can be negative too: BoundingValuePtr must have valid entries at negative offsets.             */
+  /* Presumably it points into the middle of the caller's table - TODO confirm against the caller.  */
+  ogg_int32_t j;
+  ogg_int32_t FiltVal;
+  PERF_BLOCK_START();
+  for ( j = 0; j < 8; j++ ){
+    FiltVal =
+      ( PixelPtr[0] ) -
+      ( PixelPtr[1] * 3 ) +
+      ( PixelPtr[2] * 3 ) -
+      ( PixelPtr[3] );
+    /* add 4 and shift right by 3, then replace the value by its table entry */
+    FiltVal = *(BoundingValuePtr+((FiltVal + 4) >> 3));
+    /* clamp255 is defined outside this file; presumably it saturates to 0..255 - TODO confirm */
+    PixelPtr[1] = clamp255(PixelPtr[1] + FiltVal);
+    PixelPtr[2] = clamp255(PixelPtr[2] - FiltVal);
+
+    PixelPtr += LineLength;
+    
+  }
+  PERF_BLOCK_END("filter horiz C", perf_filter_horiz_time, perf_filter_horiz_count,perf_filter_horiz_min, 10000);
+  /* The #else branch stages per-row deltas and pixels as 16-bit words in temp, adds and packs them with SSE2, then writes the rows back. */
+#else
+    static __declspec(align(16)) unsigned char temp[128];
+    static unsigned char* temp_ptr = temp; 
+    /* NOTE(review): temp is one static buffer shared by every call. Bytes 0..63 hold delta words, bytes 64..127 hold pixel words. */
+  PERF_BLOCK_START();
+    __asm {
+        align           16
+        mov             esi, PixelPtr
+        mov             edi, temp_ptr
+        mov             ecx, BoundingValuePtr
+        /* ebp and ebx are pushed here and popped at the end of the block; in between ebp = LineLength and ebx = loop counter */
+        push            ebp
+        push            ebx
+        mov             ebp, LineLength
+        mov             ebx, 8
+        /* Pass 1, once per row: save p[0..3] as words at edi+64.., build FiltVal in eax, */
+        /* look up the correction and store the row's delta words at edi.                */
+    loop_start:
+        movzx           eax, BYTE PTR [esi]
+        mov             [edi+64], ax
+        /* save p[1]; eax = p[0] - 3*p[1] */
+        movzx           edx, BYTE PTR [esi+1]
+        mov             [edi+66], dx
+        sub             eax, edx
+        add             edx, edx
+        sub             eax, edx
+        /* save p[2]; eax += 3*p[2] */
+        movzx           edx, BYTE PTR [esi+2]
+        mov             [edi+68], dx
+        add             eax, edx
+        add             edx, edx
+        add             eax, edx
+        /* save p[3]; eax -= p[3] */
+        movzx           edx, BYTE PTR [esi+3]
+        mov             [edi+70], dx
+        sub             eax, edx
+        /* eax = ((FiltVal + 4) >> 3) * 4, i.e. a byte offset into a table of 32-bit entries */
+        add             eax, 4
+        sar             eax, 3
+        sal             eax, 2
+        /* eax = table entry at that offset from BoundingValuePtr (ecx) */
+        mov             eax, [eax + ecx]
+        /* delta words for this row: { 0, +value, -value, 0 } (only the low 16 bits of the entry are used) */
+        mov             WORD PTR [edi], 0
+        mov             [edi + 2], ax
+        neg             ax
+        mov             [edi + 4], ax
+        mov             WORD PTR [edi + 6], 0
+
+        
+        /* advance to the next 8-byte temp slot and the next image row */
+        add             edi, 8
+        add             esi, ebp
+
+        sub             ebx, 1
+        jnz     loop_start
+        /* rewind edi to the start of temp and esi to the first row (ebp is temporarily multiplied by 8) */
+        sub             edi, 64
+        shl             ebp, 3
+        sub             esi, ebp
+        shr             ebp, 3
+        /* xmm1..xmm4 = delta words for row pairs 0-1, 2-3, 4-5, 6-7 */
+        movdqa          xmm1, [edi]
+        movdqa          xmm2, [edi + 16]
+        movdqa          xmm3, [edi + 32]
+        movdqa          xmm4, [edi + 48]
+
+        /* xmm5, xmm6, xmm7, xmm0 = the saved pixel words for the same row pairs */
+        movdqa          xmm5, [edi + 64]
+        movdqa          xmm6, [edi + 80]
+        movdqa          xmm7, [edi + 96]
+        movdqa          xmm0, [edi + 112]
+        /* pixel + delta, word-wise with signed saturation */
+        paddsw            xmm1, xmm5
+        paddsw            xmm2, xmm6
+        paddsw            xmm3, xmm7
+        paddsw            xmm4, xmm0
+        /* pack words to bytes with unsigned saturation and store the packed rows back into temp[0..63] */
+        packuswb        xmm1, xmm1
+        movdqa          [edi], xmm1
+        packuswb        xmm2, xmm2
+        movdqa          [edi + 16], xmm2
+        packuswb        xmm3, xmm3
+        movdqa          [edi + 32], xmm3
+        packuswb        xmm4, xmm4
+
+        movdqa          [edi + 48], xmm4
+
+        /* Pass 2, 4 iterations: copy two packed 4-byte rows from temp to the image per iteration. */
+        /* Unlike the C path, all four bytes of each row are stored; bytes [0] and [3] had a zero delta. */
+        mov             ebx, 4
+    write_loop_start:
+        mov             eax, [edi]
+        mov             edx, [edi + 4]
+        mov             [esi], eax
+        mov             [esi + ebp], edx
+
+        lea             esi, [esi + 2*ebp]
+        add             edi, 16
+
+        sub             ebx, 1
+        jnz     write_loop_start
+
+
+        /* restore the registers saved at the top of the block */
+        pop             ebx
+        pop             ebp
+    }
+    PERF_BLOCK_END("filter horiz sse2", perf_filter_horiz_time, perf_filter_horiz_count, perf_filter_horiz_min, 10000);
+#endif
+}
+
+static void FilterVert__sse2(unsigned char * PixelPtr,
+                ogg_int32_t LineLength,
+                ogg_int32_t *BoundingValuePtr){
+  ogg_int32_t j;
+  ogg_int32_t FiltVal;
+  /* For each of 8 adjacent columns, applies a table-bounded correction to the row above PixelPtr and the row at PixelPtr. */
+  /* the math was correct, but negative array indices are forbidden
+     by ANSI/C99 and will break optimization on several modern
+     compilers */
+  /* NOTE(review): plain C despite the __sse2 suffix, and dsp_sse2_dct_decode_init does not install this function. */
+  PixelPtr -= 2*LineLength;
+  /* PixelPtr now addresses the row two lines above the caller's pointer, so every row offset below is non-negative. */
+  for ( j = 0; j < 8; j++ ) {
+    FiltVal = ( (ogg_int32_t)PixelPtr[0] ) -
+      ( (ogg_int32_t)PixelPtr[LineLength] * 3 ) +
+      ( (ogg_int32_t)PixelPtr[2 * LineLength] * 3 ) -
+      ( (ogg_int32_t)PixelPtr[3 * LineLength] );
+
+    FiltVal = *(BoundingValuePtr+((FiltVal + 4) >> 3));
+
+    PixelPtr[LineLength] = clamp255(PixelPtr[LineLength] + FiltVal);
+    PixelPtr[2 * LineLength] = clamp255(PixelPtr[2*LineLength] - FiltVal);
+
+    PixelPtr ++;
+  }
+}
+/* Resets the FilterHoriz perf counters and installs FilterHoriz__sse2 into funcs; no other entry of funcs is written. */
+void dsp_sse2_dct_decode_init(DspFunctions *funcs)
+{
+  TH_DEBUG("enabling accelerated x86_32 sse2 dsp functions.\n");
+  /* -1 converts to the largest unsigned __int64 value, so the first timed call always becomes the minimum. */
+  perf_filter_horiz_time = 0;
+  perf_filter_horiz_min = -1;
+  perf_filter_horiz_count = 0;
+  funcs->FilterHoriz = FilterHoriz__sse2;
+  /* funcs->FilterVert is left as the caller set it; FilterVert__sse2 is not installed here. */
+}
\ No newline at end of file

Modified: branches/theora-playtime/lib/x86_32_vs/idct_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/idct_sse2.c	2006-06-07 09:46:15 UTC (rev 11535)
+++ branches/theora-playtime/lib/x86_32_vs/idct_sse2.c	2006-06-07 15:55:33 UTC (rev 11536)
@@ -19,6 +19,8 @@
 #include "codec_internal.h"
 #include "quant_lookup.h"
 
+#include "perf_helper.h"
+
 #define IdctAdjustBeforeShift 8
 
 /* cos(n*pi/16) or sin(8-n)*pi/16) */
@@ -30,16 +32,23 @@
 #define xC6S2 25080
 #define xC7S1 12785
 
+static unsigned __int64 perf_dequant_slow_time;
+static unsigned __int64 perf_dequant_slow_count;
+static unsigned __int64 perf_dequant_slow_min;
 
 
 static void dequant_slow__sse2( ogg_int16_t * dequant_coeffs,
                    ogg_int16_t * quantized_list,
                    ogg_int32_t * DCT_block) 
 {
-#if 1
+#if 0
+
   int i;
+    PERF_BLOCK_START();
   for(i=0;i<64;i++)
     DCT_block[dezigzag_index[i]] = quantized_list[i] * dequant_coeffs[i];
+
+  PERF_BLOCK_END("dequant_slow C", perf_dequant_slow_time, perf_dequant_slow_count,perf_dequant_slow_min, 5000);
 #else
 
     static __declspec(align(16)) ogg_int32_t temp_block[64];
@@ -48,7 +57,7 @@
 
     /*      quantized list is not aligned */
 
-
+    PERF_BLOCK_START();
     __asm {
         align       16
 
@@ -147,6 +156,7 @@
     jnz         write_loop_start
 
     };
+    PERF_BLOCK_END("dequant_slow sse2", perf_dequant_slow_time, perf_dequant_slow_count,perf_dequant_slow_min, 5000);
 #endif
 }
 
@@ -590,6 +600,11 @@
 
 void dsp_sse2_idct_init (DspFunctions *funcs)
 {
+
+
+    perf_dequant_slow_time = 0;
+    perf_dequant_slow_count = 0;
+    perf_dequant_slow_min = -1;
     /* TODO::: Match function order */
   funcs->dequant_slow = dequant_slow__sse2;
   funcs->IDct1 = IDct1__sse2;

Added: branches/theora-playtime/lib/x86_32_vs/perf_helper.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/perf_helper.c	2006-06-07 09:46:15 UTC (rev 11535)
+++ branches/theora-playtime/lib/x86_32_vs/perf_helper.c	2006-06-07 15:55:33 UTC (rev 11536)
@@ -0,0 +1,19 @@
+#include "perf_helper.h"
+unsigned __int64 GetCPUTime()   /* Returns the counter read by the RDTSC instruction as one 64-bit value. */
+{
+    unsigned long upper;        /* receives EDX after RDTSC; becomes the high 32 bits of the result */
+    unsigned long lower;        /* receives EAX after RDTSC; becomes the low 32 bits of the result */
+    unsigned __int64 ret;
+    __asm {
+        RDTSC
+        mov     upper, edx
+        mov     lower, eax
+    }
+    /* Recombine the two halves: ret = (upper << 32) + lower. */
+    ret = upper;
+    ret <<= 32;
+    ret += lower;
+    return ret;
+
+
+}

Added: branches/theora-playtime/lib/x86_32_vs/perf_helper.h
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/perf_helper.h	2006-06-07 09:46:15 UTC (rev 11535)
+++ branches/theora-playtime/lib/x86_32_vs/perf_helper.h	2006-06-07 15:55:33 UTC (rev 11536)
@@ -0,0 +1,24 @@
+#ifdef WIN32
+
+#include <windows.h>
+#include <stdio.h>
+static unsigned __int64 perf_start_time[64];  /* stack of start stamps; PERF_BLOCK_START pushes, PERF_BLOCK_END pops */
+static unsigned __int64 perf_temp;            /* scratch: elapsed GetCPUTime() ticks of the block that just ended */
+static unsigned long depth = 0;               /* number of open timed blocks, i.e. index of the next free stack slot */
+/* NOTE(review): these are static definitions in a header, so each .c file that includes it gets its own */
+/* private copies. depth is never range-checked against the 64-entry perf_start_time array.             */
+extern unsigned __int64 GetCPUTime();
+
+
+#define PERF_BLOCK_START()  perf_start_time[depth++] = GetCPUTime();
+
+#define PERF_BLOCK_END(s, x, y, l, z)    perf_temp = (GetCPUTime() - perf_start_time[--depth]); (l) = ((l) > perf_temp) ? perf_temp : (l); x += perf_temp; (y)++;     \
+  if (((y) % (z)) == 0)                                                                             \
+  {                                                                                                 \
+    printf(s " - %lld from %lld iterations -- @%lld cycles -- min(%lld)\n", x, y, (x) / (y), l);                           \
+  }                                                                                                 
+
+
+
+
+#endif
\ No newline at end of file

Modified: branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
===================================================================
--- branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj	2006-06-07 09:46:15 UTC (rev 11535)
+++ branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj	2006-06-07 15:55:33 UTC (rev 11536)
@@ -401,6 +401,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\..\lib\x86_32_vs\dct_decode_sse2.c"
+				>
+			</File>
+			<File
 				RelativePath="..\..\..\lib\dct_encode.c"
 				>
 			</File>
@@ -473,6 +477,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\..\lib\x86_32_vs\perf_helper.c"
+				>
+			</File>
+			<File
 				RelativePath="..\..\..\lib\pp.c"
 				>
 			</File>
@@ -543,6 +551,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\..\lib\x86_32_vs\perf_helper.h"
+				>
+			</File>
+			<File
 				RelativePath="..\..\..\lib\pp.h"
 				>
 			</File>



More information about the commits mailing list