[xiph-commits] r11563 - in branches/theora-playtime: lib/x86_32_vs win32/VS2005/encoder_example win32/VS2005/libtheora

illiminable at svn.xiph.org illiminable at svn.xiph.org
Mon Jun 12 09:55:17 PDT 2006


Author: illiminable
Date: 2006-06-12 09:54:54 -0700 (Mon, 12 Jun 2006)
New Revision: 11563

Modified:
   branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c
   branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c
   branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c
   branches/theora-playtime/lib/x86_32_vs/idct_sse2.c
   branches/theora-playtime/lib/x86_32_vs/perf_helper.c
   branches/theora-playtime/lib/x86_32_vs/perf_helper.h
   branches/theora-playtime/lib/x86_32_vs/quant_sse2.c
   branches/theora-playtime/lib/x86_32_vs/scan_sse2.c
   branches/theora-playtime/win32/VS2005/encoder_example/encoder_example.vcproj
   branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
Log:
* Loop version of sad8x8 is just as fast as the unrolled version

Modified: branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c	2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c	2006-06-12 16:54:54 UTC (rev 11563)
@@ -26,7 +26,7 @@
 
   ogg_int32_t j;
   ogg_int32_t FiltVal;
-  PERF_BLOCK_START();
+  //PERF_BLOCK_START();
   for ( j = 0; j < 8; j++ ){
     FiltVal =
       ( PixelPtr[0] ) -
@@ -42,7 +42,7 @@
     PixelPtr += LineLength;
     
   }
-  PERF_BLOCK_END("filter horiz C", perf_filter_horiz_time, perf_filter_horiz_count,perf_filter_horiz_min, 10000);
+  //PERF_BLOCK_END("filter horiz C", perf_filter_horiz_time, perf_filter_horiz_count,perf_filter_horiz_min, 10000);
 
 #else
     static __declspec(align(16)) unsigned char temp[128];

Modified: branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c	2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c	2006-06-12 16:54:54 UTC (rev 11563)
@@ -19,6 +19,7 @@
 
 #include "codec_internal.h"
 #include "dsp.h"
+#include "perf_helper.h"
 
 #if 0
 //These are to let me selectively enable the C versions, these are needed
@@ -30,6 +31,11 @@
 
 static const ogg_int64_t V128 = 0x0080008000800080LL;
 
+
+static unsigned __int64 perf_sad8x8_time;
+static unsigned __int64 perf_sad8x8_count;
+static unsigned __int64 perf_sad8x8_min;
+
 static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
                   ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
                   ogg_uint32_t ReconPixelsPerLine) 
@@ -934,6 +940,7 @@
 #else
   ogg_uint32_t  DiffVal;
 
+  PERF_BLOCK_START();
   __asm {
     align  16
 
@@ -1101,6 +1108,7 @@
     mov         DiffVal, eax
   };
 
+  PERF_BLOCK_END("sad8x8 mmx - ", perf_sad8x8_time, perf_sad8x8_count,perf_sad8x8_min, 50000);
   return DiffVal;
 
  
@@ -1602,5 +1610,11 @@
   funcs->intra8x8_err = intra8x8_err__mmx;
   funcs->inter8x8_err = inter8x8_err__mmx;
   funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
+
+  perf_sad8x8_time = 0;
+ perf_sad8x8_count = 0;
+perf_sad8x8_min = -1;
 }
 
+
+

Modified: branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c	2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c	2006-06-12 16:54:54 UTC (rev 11563)
@@ -15,6 +15,7 @@
 
 #include "codec_internal.h"
 #include "dsp.h"
+#include "perf_helper.h"
 
 #if 1
 //These are to let me selectively enable the C versions, these are needed
@@ -24,6 +25,11 @@
 #endif
 
 
+
+static unsigned __int64 perf_sad8x8_time;
+static unsigned __int64 perf_sad8x8_count;
+static unsigned __int64 perf_sad8x8_min;
+
 //static const ogg_int64_t V128 = 0x0080008000800080LL;
 
 static __declspec(align(16)) const unsigned int V128_8x16bits[4] = { 0x00800080, 0x00800080, 0x00800080, 0x00800080 };
@@ -873,6 +879,8 @@
   ogg_uint32_t  i;
   ogg_uint32_t  sad = 0;
 
+  PERF_BLOCK_START();
+
   for (i=8; i; i--) {
     sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
     sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
@@ -888,18 +896,112 @@
     ptr2 += stride2;
   }
 
+  PERF_BLOCK_END("sad8x8 C - ", perf_sad8x8_time, perf_sad8x8_count,perf_sad8x8_min, 50000);
+
   return sad;
+#elif 1
+  ogg_uint32_t  DiffVal;
+
+ PERF_BLOCK_START();
+  __asm {
+    align  16
+
+    mov         eax, ptr1
+    mov         ebx, ptr2
+
+
+    mov         ecx, stride1
+    mov         edx, stride2
+
+    pxor      xmm2, xmm2 /* Result */    
+    pxor      xmm3, xmm3
+
+    mov         edi, 4
+
+loop_start:
+        movq      xmm0, QWORD PTR [eax]
+        movq      xmm1, QWORD PTR [eax + ecx]
+
+
+        movq      xmm4, QWORD PTR [ebx]
+        movq      xmm5, QWORD PTR [ebx + edx]
+
+        /* Absolute difference */
+        movq        xmm6, xmm0
+        movq        xmm7, xmm1
+        psubusb     xmm0, xmm4
+        psubusb     xmm1, xmm5
+        psubusb     xmm4, xmm6
+        psubusb     xmm5, xmm7
+        por         xmm0, xmm4
+        por         xmm1, xmm5
+
+        /* Expand to 16 bits */
+        punpcklbw   xmm0, xmm3
+        punpcklbw   xmm1, xmm3
+
+        /* Accumulate */
+        paddw       xmm0, xmm1
+        paddw       xmm2, xmm0
+
+        lea         eax, [eax + 2*ecx]
+        lea         ebx, [ebx + 2*edx]
+        sub     edi, 1
+        jnz     loop_start
+
+        
+    /*---------------------------*/
+
+
+    /* Add the items in the result */
+    movdqa      xmm0, xmm2
+    psrlq       xmm2, 32
+
+    paddw       xmm0, xmm2
+
+
+    movdqa      xmm2, xmm0
+    psrlq       xmm0, 16
+
+    paddw       xmm2, xmm0
+
+    movdqa      xmm0, xmm2
+    psrldq      xmm2, 8
+    paddw       xmm0, xmm2
+
+    /* Put it in the return variable */
+
+    movd        eax, xmm0
+    and         eax, 0xffff
+    mov         DiffVal, eax
+
+
+  };
+
+ PERF_BLOCK_END("sad8x8 sse2 - ", perf_sad8x8_time, perf_sad8x8_count,perf_sad8x8_min, 50000);
+    return DiffVal;
+   
+ 
+
+
 #else
 
+   
+
   ogg_uint32_t  DiffVal;
 
+ PERF_BLOCK_START();
   __asm {
     align  16
 
     mov         eax, ptr1
     mov         ebx, ptr2
+
+
     mov         ecx, stride1
     mov         edx, stride2
+
+
     lea         edi, [ecx + ecx*2]
     lea         esi, [edx + edx*2]
 
@@ -1023,6 +1125,9 @@
         
     /*---------------------------*/
 
+    /* Load the address of temp */
+    //mov         edx, temp_ptr
+
     /* Add the items in the result */
     movdqa      xmm0, xmm2
     psrlq       xmm2, 32
@@ -1040,13 +1145,17 @@
     paddw       xmm0, xmm2
 
     /* Put it in the return variable */
+
     movd        eax, xmm0
     and         eax, 0xffff
     mov         DiffVal, eax
 
 
   };
+
+ PERF_BLOCK_END("sad8x8 sse2 - ", perf_sad8x8_time, perf_sad8x8_count,perf_sad8x8_min, 50000);
     return DiffVal;
+   
  
 
 #endif
@@ -1532,11 +1641,24 @@
   funcs->sub8x8avg2 = sub8x8avg2__sse2;
   funcs->row_sad8 = row_sad8__sse2;
   funcs->col_sad8x8 = col_sad8x8__sse2;
+  
+  
+  /* The mmx versions are faster right now */
   funcs->sad8x8 = sad8x8__sse2;
   funcs->sad8x8_thres = sad8x8_thres__sse2;
+ 
+  
+  
   //funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__sse2;
   //funcs->intra8x8_err = intra8x8_err__sse2;
   //funcs->inter8x8_err = inter8x8_err__sse2;
   //funcs->inter8x8_err_xy2 = inter8x8_err_xy2__sse2;
+
+
+
+  
+perf_sad8x8_time = 0;
+ perf_sad8x8_count = 0;
+perf_sad8x8_min = -1;
 }
 

Modified: branches/theora-playtime/lib/x86_32_vs/idct_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/idct_sse2.c	2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/lib/x86_32_vs/idct_sse2.c	2006-06-12 16:54:54 UTC (rev 11563)
@@ -52,7 +52,7 @@
 #if 0
 
   int i;
-    PERF_BLOCK_START();
+    //PERF_BLOCK_START();
   for(i=0;i<64;i++)
     DCT_block[dezigzag_index[i]] = quantized_list[i] * dequant_coeffs[i];
 
@@ -65,7 +65,7 @@
 
     /*      quantized list is not aligned */
 
-    PERF_BLOCK_START();
+   // PERF_BLOCK_START();
     __asm {
         align       16
 
@@ -167,7 +167,7 @@
 
     pop     ebx
     };
-    PERF_BLOCK_END("dequant_slow sse2", perf_dequant_slow_time, perf_dequant_slow_count,perf_dequant_slow_min, 5000);
+   // PERF_BLOCK_END("dequant_slow sse2", perf_dequant_slow_time, perf_dequant_slow_count,perf_dequant_slow_min, 5000);
 #endif
 }
 
@@ -400,7 +400,7 @@
 
 #if 0
   int i;
-  PERF_BLOCK_START();
+  //PERF_BLOCK_START();
   memset(DCT_block,0, 128);
   for(i=0;i<10;i++)
     DCT_block[dezigzag_index[i]] = quantized_list[i] * dequant_coeffs[i];
@@ -412,7 +412,7 @@
     static unsigned char* temp_block_ptr = temp_block;
     static ogg_int32_t* zigzag_ptr = dezigzag_index;
 
-    PERF_BLOCK_START();
+    //PERF_BLOCK_START();
      __asm {
 
         align       16
@@ -509,7 +509,7 @@
 
 
      }
-     PERF_BLOCK_END("dequant_slow10 sse2", perf_dequant_slow10_time, perf_dequant_slow10_count,perf_dequant_slow10_min, 5000);
+     //PERF_BLOCK_END("dequant_slow10 sse2", perf_dequant_slow10_time, perf_dequant_slow10_count,perf_dequant_slow10_min, 5000);
 #endif
 
 }
@@ -725,7 +725,7 @@
     static __declspec(align(16)) unsigned char temp[16];
     static unsigned char* temp_ptr = temp;
 
-    PERF_BLOCK_START();
+    //PERF_BLOCK_START();
     __asm {
         align       16
 
@@ -770,7 +770,7 @@
 
 
     }
-    PERF_BLOCK_END("IDct1 sse2", perf_idct1_time, perf_idct1_count,perf_idct1_min, 10000);
+    //PERF_BLOCK_END("IDct1 sse2", perf_idct1_time, perf_idct1_count,perf_idct1_min, 10000);
 #endif
 
 }

Modified: branches/theora-playtime/lib/x86_32_vs/perf_helper.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/perf_helper.c	2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/lib/x86_32_vs/perf_helper.c	2006-06-12 16:54:54 UTC (rev 11563)
@@ -5,6 +5,7 @@
     unsigned long lower;
     unsigned __int64 ret;
     __asm {
+        align 16
         RDTSC
         mov     upper, edx
         mov     lower, eax

Modified: branches/theora-playtime/lib/x86_32_vs/perf_helper.h
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/perf_helper.h	2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/lib/x86_32_vs/perf_helper.h	2006-06-12 16:54:54 UTC (rev 11563)
@@ -2,7 +2,7 @@
 
 #include <windows.h>
 #include <stdio.h>
-static unsigned __int64 perf_start_time[64];
+static unsigned __int64 perf_start_time[4096];
 static unsigned __int64 perf_temp;
 static unsigned long depth = 0;
 
@@ -16,7 +16,7 @@
 */
 
 extern unsigned __int64 GetCPUTime();
-//#define PERF_DATA_ON
+#define PERF_DATA_ON
 #ifdef PERF_DATA_ON
 
 

Modified: branches/theora-playtime/lib/x86_32_vs/quant_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/quant_sse2.c	2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/lib/x86_32_vs/quant_sse2.c	2006-06-12 16:54:54 UTC (rev 11563)
@@ -31,7 +31,7 @@
                ogg_int16_t * DCT_block,
                Q_LIST_ENTRY * quantized_list){
 
-#if 0
+#if 1
   ogg_uint32_t  i;              /* Row index */
   Q_LIST_ENTRY  val;            /* Quantised value. */
 
@@ -175,7 +175,7 @@
   ogg_uint32_t * ZigZagPtr = (ogg_uint32_t *)pbi->zigzag_index;
   
 
-  PERF_BLOCK_START();
+  //PERF_BLOCK_START();
  
 
   __asm {
@@ -430,7 +430,7 @@
 
   }
 
-  PERF_BLOCK_END("quantize sse2", perf_quant_time, perf_quant_count, perf_quant_min, 20000);
+  //PERF_BLOCK_END("quantize sse2", perf_quant_time, perf_quant_count, perf_quant_min, 20000);
 
 #endif
 }

Modified: branches/theora-playtime/lib/x86_32_vs/scan_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/scan_sse2.c	2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/lib/x86_32_vs/scan_sse2.c	2006-06-12 16:54:54 UTC (rev 11563)
@@ -722,7 +722,7 @@
   ogg_int32_t    FragChangedPixels;
 
   ogg_int16_t Diff;     /* Temp local workspace. */
-  PERF_BLOCK_START();
+  //PERF_BLOCK_START();
   /* Cannot use kernel if at edge or if PAK disabled */
   if ( (!ppi->PAKEnabled) || EdgeRow ){
     for ( i = 0; i < ppi->PlaneWidth; i += HFRAGPIXELS ){
@@ -921,7 +921,7 @@
 
   }
 
-  PERF_BLOCK_END("RowDiffScan ", perf_rds_datmf_time, perf_rds_datmf_count, perf_rds_datmf_min, 10000);
+  //PERF_BLOCK_END("RowDiffScan ", perf_rds_datmf_time, perf_rds_datmf_count, perf_rds_datmf_min, 10000);
 }
 
 

Modified: branches/theora-playtime/win32/VS2005/encoder_example/encoder_example.vcproj
===================================================================
--- branches/theora-playtime/win32/VS2005/encoder_example/encoder_example.vcproj	2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/win32/VS2005/encoder_example/encoder_example.vcproj	2006-06-12 16:54:54 UTC (rev 11563)
@@ -119,14 +119,14 @@
 				InlineFunctionExpansion="2"
 				EnableIntrinsicFunctions="true"
 				FavorSizeOrSpeed="1"
-				WholeProgramOptimization="false"
+				WholeProgramOptimization="true"
 				AdditionalIncludeDirectories="..\..\..\include;..\..\..\..\libogg\include;..\..\..\..\libvorbis\include"
 				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
 				RuntimeLibrary="0"
 				UsePrecompiledHeader="0"
 				WarningLevel="4"
 				Detect64BitPortabilityProblems="true"
-				DebugInformationFormat="3"
+				DebugInformationFormat="0"
 			/>
 			<Tool
 				Name="VCManagedResourceCompilerTool"
@@ -140,7 +140,7 @@
 			<Tool
 				Name="VCLinkerTool"
 				LinkIncremental="1"
-				GenerateDebugInformation="true"
+				GenerateDebugInformation="false"
 				SubSystem="1"
 				OptimizeReferences="2"
 				EnableCOMDATFolding="2"

Modified: branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
===================================================================
--- branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj	2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj	2006-06-12 16:54:54 UTC (rev 11563)
@@ -19,7 +19,7 @@
 			Name="Debug|Win32"
 			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
 			IntermediateDirectory="Debug"
-			ConfigurationType="2"
+			ConfigurationType="4"
 			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
 			CharacterSet="2"
 			>
@@ -63,23 +63,12 @@
 				Name="VCPreLinkEventTool"
 			/>
 			<Tool
-				Name="VCLinkerTool"
-				OutputFile="$(OutDir)/libtheora.dll"
-				LinkIncremental="2"
-				ModuleDefinitionFile="..\..\libtheora.def"
-				GenerateDebugInformation="true"
-				ProgramDatabaseFile="$(OutDir)/libtheora.pdb"
-				SubSystem="2"
-				ImportLibrary="$(OutDir)/libtheora.lib"
-				TargetMachine="1"
+				Name="VCLibrarianTool"
 			/>
 			<Tool
 				Name="VCALinkTool"
 			/>
 			<Tool
-				Name="VCManifestTool"
-			/>
-			<Tool
 				Name="VCXDCMakeTool"
 			/>
 			<Tool
@@ -89,12 +78,6 @@
 				Name="VCFxCopTool"
 			/>
 			<Tool
-				Name="VCAppVerifierTool"
-			/>
-			<Tool
-				Name="VCWebDeploymentTool"
-			/>
-			<Tool
 				Name="VCPostBuildEventTool"
 			/>
 		</Configuration>
@@ -129,9 +112,9 @@
 				EnableIntrinsicFunctions="true"
 				FavorSizeOrSpeed="1"
 				OmitFramePointers="true"
-				WholeProgramOptimization="false"
+				WholeProgramOptimization="true"
 				AdditionalIncludeDirectories="..\..\..\include;..\..\..\..\libogg\include;..\..\..\lib;G:\Dev\xiph\zens_sdk\lib\libtheora-playtime\lib\x86_32_vs"
-				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS; USE_ASM"
+				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS; USE_ASM; USE_NO_SSE"
 				StringPooling="true"
 				ExceptionHandling="0"
 				RuntimeLibrary="0"



More information about the commits mailing list