[xiph-commits] r11563 - in branches/theora-playtime: lib/x86_32_vs win32/VS2005/encoder_example win32/VS2005/libtheora
illiminable at svn.xiph.org
illiminable at svn.xiph.org
Mon Jun 12 09:55:17 PDT 2006
Author: illiminable
Date: 2006-06-12 09:54:54 -0700 (Mon, 12 Jun 2006)
New Revision: 11563
Modified:
branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c
branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c
branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c
branches/theora-playtime/lib/x86_32_vs/idct_sse2.c
branches/theora-playtime/lib/x86_32_vs/perf_helper.c
branches/theora-playtime/lib/x86_32_vs/perf_helper.h
branches/theora-playtime/lib/x86_32_vs/quant_sse2.c
branches/theora-playtime/lib/x86_32_vs/scan_sse2.c
branches/theora-playtime/win32/VS2005/encoder_example/encoder_example.vcproj
branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
Log:
* Loop version of sad8x8 is just as fast as the unrolled version
Modified: branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c 2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/lib/x86_32_vs/dct_decode_sse2.c 2006-06-12 16:54:54 UTC (rev 11563)
@@ -26,7 +26,7 @@
ogg_int32_t j;
ogg_int32_t FiltVal;
- PERF_BLOCK_START();
+ //PERF_BLOCK_START();
for ( j = 0; j < 8; j++ ){
FiltVal =
( PixelPtr[0] ) -
@@ -42,7 +42,7 @@
PixelPtr += LineLength;
}
- PERF_BLOCK_END("filter horiz C", perf_filter_horiz_time, perf_filter_horiz_count,perf_filter_horiz_min, 10000);
+ //PERF_BLOCK_END("filter horiz C", perf_filter_horiz_time, perf_filter_horiz_count,perf_filter_horiz_min, 10000);
#else
static __declspec(align(16)) unsigned char temp[128];
Modified: branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c 2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/lib/x86_32_vs/dsp_mmx.c 2006-06-12 16:54:54 UTC (rev 11563)
@@ -19,6 +19,7 @@
#include "codec_internal.h"
#include "dsp.h"
+#include "perf_helper.h"
#if 0
//These are to let me selectively enable the C versions, these are needed
@@ -30,6 +31,11 @@
static const ogg_int64_t V128 = 0x0080008000800080LL;
+
+static unsigned __int64 perf_sad8x8_time;
+static unsigned __int64 perf_sad8x8_count;
+static unsigned __int64 perf_sad8x8_min;
+
static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
ogg_uint32_t ReconPixelsPerLine)
@@ -934,6 +940,7 @@
#else
ogg_uint32_t DiffVal;
+ PERF_BLOCK_START();
__asm {
align 16
@@ -1101,6 +1108,7 @@
mov DiffVal, eax
};
+ PERF_BLOCK_END("sad8x8 mmx - ", perf_sad8x8_time, perf_sad8x8_count,perf_sad8x8_min, 50000);
return DiffVal;
@@ -1602,5 +1610,11 @@
funcs->intra8x8_err = intra8x8_err__mmx;
funcs->inter8x8_err = inter8x8_err__mmx;
funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
+
+ perf_sad8x8_time = 0;
+ perf_sad8x8_count = 0;
+perf_sad8x8_min = -1;
}
+
+
Modified: branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c 2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c 2006-06-12 16:54:54 UTC (rev 11563)
@@ -15,6 +15,7 @@
#include "codec_internal.h"
#include "dsp.h"
+#include "perf_helper.h"
#if 1
//These are to let me selectively enable the C versions, these are needed
@@ -24,6 +25,11 @@
#endif
+
+static unsigned __int64 perf_sad8x8_time;
+static unsigned __int64 perf_sad8x8_count;
+static unsigned __int64 perf_sad8x8_min;
+
//static const ogg_int64_t V128 = 0x0080008000800080LL;
static __declspec(align(16)) const unsigned int V128_8x16bits[4] = { 0x00800080, 0x00800080, 0x00800080, 0x00800080 };
@@ -873,6 +879,8 @@
ogg_uint32_t i;
ogg_uint32_t sad = 0;
+ PERF_BLOCK_START();
+
for (i=8; i; i--) {
sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
@@ -888,18 +896,112 @@
ptr2 += stride2;
}
+ PERF_BLOCK_END("sad8x8 C - ", perf_sad8x8_time, perf_sad8x8_count,perf_sad8x8_min, 50000);
+
return sad;
+#elif 1
+ ogg_uint32_t DiffVal;
+
+ PERF_BLOCK_START();
+ __asm {
+ align 16
+
+ mov eax, ptr1
+ mov ebx, ptr2
+
+
+ mov ecx, stride1
+ mov edx, stride2
+
+ pxor xmm2, xmm2 /* Result */
+ pxor xmm3, xmm3
+
+ mov edi, 4
+
+loop_start:
+ movq xmm0, QWORD PTR [eax]
+ movq xmm1, QWORD PTR [eax + ecx]
+
+
+ movq xmm4, QWORD PTR [ebx]
+ movq xmm5, QWORD PTR [ebx + edx]
+
+ /* Absolute difference */
+ movq xmm6, xmm0
+ movq xmm7, xmm1
+ psubusb xmm0, xmm4
+ psubusb xmm1, xmm5
+ psubusb xmm4, xmm6
+ psubusb xmm5, xmm7
+ por xmm0, xmm4
+ por xmm1, xmm5
+
+ /* Expand to 16 bits */
+ punpcklbw xmm0, xmm3
+ punpcklbw xmm1, xmm3
+
+ /* Accumulate */
+ paddw xmm0, xmm1
+ paddw xmm2, xmm0
+
+ lea eax, [eax + 2*ecx]
+ lea ebx, [ebx + 2*edx]
+ sub edi, 1
+ jnz loop_start
+
+
+ /*---------------------------*/
+
+
+ /* Add the items in the result */
+ movdqa xmm0, xmm2
+ psrlq xmm2, 32
+
+ paddw xmm0, xmm2
+
+
+ movdqa xmm2, xmm0
+ psrlq xmm0, 16
+
+ paddw xmm2, xmm0
+
+ movdqa xmm0, xmm2
+ psrldq xmm2, 8
+ paddw xmm0, xmm2
+
+ /* Put it in the return variable */
+
+ movd eax, xmm0
+ and eax, 0xffff
+ mov DiffVal, eax
+
+
+ };
+
+ PERF_BLOCK_END("sad8x8 sse2 - ", perf_sad8x8_time, perf_sad8x8_count,perf_sad8x8_min, 50000);
+ return DiffVal;
+
+
+
+
#else
+
+
ogg_uint32_t DiffVal;
+ PERF_BLOCK_START();
__asm {
align 16
mov eax, ptr1
mov ebx, ptr2
+
+
mov ecx, stride1
mov edx, stride2
+
+
lea edi, [ecx + ecx*2]
lea esi, [edx + edx*2]
@@ -1023,6 +1125,9 @@
/*---------------------------*/
+ /* Load the address of temp */
+ //mov edx, temp_ptr
+
/* Add the items in the result */
movdqa xmm0, xmm2
psrlq xmm2, 32
@@ -1040,13 +1145,17 @@
paddw xmm0, xmm2
/* Put it in the return variable */
+
movd eax, xmm0
and eax, 0xffff
mov DiffVal, eax
};
+
+ PERF_BLOCK_END("sad8x8 sse2 - ", perf_sad8x8_time, perf_sad8x8_count,perf_sad8x8_min, 50000);
return DiffVal;
+
#endif
@@ -1532,11 +1641,24 @@
funcs->sub8x8avg2 = sub8x8avg2__sse2;
funcs->row_sad8 = row_sad8__sse2;
funcs->col_sad8x8 = col_sad8x8__sse2;
+
+
+ /* The mmx versions are faster right now */
funcs->sad8x8 = sad8x8__sse2;
funcs->sad8x8_thres = sad8x8_thres__sse2;
+
+
+
//funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__sse2;
//funcs->intra8x8_err = intra8x8_err__sse2;
//funcs->inter8x8_err = inter8x8_err__sse2;
//funcs->inter8x8_err_xy2 = inter8x8_err_xy2__sse2;
+
+
+
+
+perf_sad8x8_time = 0;
+ perf_sad8x8_count = 0;
+perf_sad8x8_min = -1;
}
Modified: branches/theora-playtime/lib/x86_32_vs/idct_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/idct_sse2.c 2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/lib/x86_32_vs/idct_sse2.c 2006-06-12 16:54:54 UTC (rev 11563)
@@ -52,7 +52,7 @@
#if 0
int i;
- PERF_BLOCK_START();
+ //PERF_BLOCK_START();
for(i=0;i<64;i++)
DCT_block[dezigzag_index[i]] = quantized_list[i] * dequant_coeffs[i];
@@ -65,7 +65,7 @@
/* quantized list is not aligned */
- PERF_BLOCK_START();
+ // PERF_BLOCK_START();
__asm {
align 16
@@ -167,7 +167,7 @@
pop ebx
};
- PERF_BLOCK_END("dequant_slow sse2", perf_dequant_slow_time, perf_dequant_slow_count,perf_dequant_slow_min, 5000);
+ // PERF_BLOCK_END("dequant_slow sse2", perf_dequant_slow_time, perf_dequant_slow_count,perf_dequant_slow_min, 5000);
#endif
}
@@ -400,7 +400,7 @@
#if 0
int i;
- PERF_BLOCK_START();
+ //PERF_BLOCK_START();
memset(DCT_block,0, 128);
for(i=0;i<10;i++)
DCT_block[dezigzag_index[i]] = quantized_list[i] * dequant_coeffs[i];
@@ -412,7 +412,7 @@
static unsigned char* temp_block_ptr = temp_block;
static ogg_int32_t* zigzag_ptr = dezigzag_index;
- PERF_BLOCK_START();
+ //PERF_BLOCK_START();
__asm {
align 16
@@ -509,7 +509,7 @@
}
- PERF_BLOCK_END("dequant_slow10 sse2", perf_dequant_slow10_time, perf_dequant_slow10_count,perf_dequant_slow10_min, 5000);
+ //PERF_BLOCK_END("dequant_slow10 sse2", perf_dequant_slow10_time, perf_dequant_slow10_count,perf_dequant_slow10_min, 5000);
#endif
}
@@ -725,7 +725,7 @@
static __declspec(align(16)) unsigned char temp[16];
static unsigned char* temp_ptr = temp;
- PERF_BLOCK_START();
+ //PERF_BLOCK_START();
__asm {
align 16
@@ -770,7 +770,7 @@
}
- PERF_BLOCK_END("IDct1 sse2", perf_idct1_time, perf_idct1_count,perf_idct1_min, 10000);
+ //PERF_BLOCK_END("IDct1 sse2", perf_idct1_time, perf_idct1_count,perf_idct1_min, 10000);
#endif
}
Modified: branches/theora-playtime/lib/x86_32_vs/perf_helper.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/perf_helper.c 2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/lib/x86_32_vs/perf_helper.c 2006-06-12 16:54:54 UTC (rev 11563)
@@ -5,6 +5,7 @@
unsigned long lower;
unsigned __int64 ret;
__asm {
+ align 16
RDTSC
mov upper, edx
mov lower, eax
Modified: branches/theora-playtime/lib/x86_32_vs/perf_helper.h
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/perf_helper.h 2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/lib/x86_32_vs/perf_helper.h 2006-06-12 16:54:54 UTC (rev 11563)
@@ -2,7 +2,7 @@
#include <windows.h>
#include <stdio.h>
-static unsigned __int64 perf_start_time[64];
+static unsigned __int64 perf_start_time[4096];
static unsigned __int64 perf_temp;
static unsigned long depth = 0;
@@ -16,7 +16,7 @@
*/
extern unsigned __int64 GetCPUTime();
-//#define PERF_DATA_ON
+#define PERF_DATA_ON
#ifdef PERF_DATA_ON
Modified: branches/theora-playtime/lib/x86_32_vs/quant_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/quant_sse2.c 2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/lib/x86_32_vs/quant_sse2.c 2006-06-12 16:54:54 UTC (rev 11563)
@@ -31,7 +31,7 @@
ogg_int16_t * DCT_block,
Q_LIST_ENTRY * quantized_list){
-#if 0
+#if 1
ogg_uint32_t i; /* Row index */
Q_LIST_ENTRY val; /* Quantised value. */
@@ -175,7 +175,7 @@
ogg_uint32_t * ZigZagPtr = (ogg_uint32_t *)pbi->zigzag_index;
- PERF_BLOCK_START();
+ //PERF_BLOCK_START();
__asm {
@@ -430,7 +430,7 @@
}
- PERF_BLOCK_END("quantize sse2", perf_quant_time, perf_quant_count, perf_quant_min, 20000);
+ //PERF_BLOCK_END("quantize sse2", perf_quant_time, perf_quant_count, perf_quant_min, 20000);
#endif
}
Modified: branches/theora-playtime/lib/x86_32_vs/scan_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/scan_sse2.c 2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/lib/x86_32_vs/scan_sse2.c 2006-06-12 16:54:54 UTC (rev 11563)
@@ -722,7 +722,7 @@
ogg_int32_t FragChangedPixels;
ogg_int16_t Diff; /* Temp local workspace. */
- PERF_BLOCK_START();
+ //PERF_BLOCK_START();
/* Cannot use kernel if at edge or if PAK disabled */
if ( (!ppi->PAKEnabled) || EdgeRow ){
for ( i = 0; i < ppi->PlaneWidth; i += HFRAGPIXELS ){
@@ -921,7 +921,7 @@
}
- PERF_BLOCK_END("RowDiffScan ", perf_rds_datmf_time, perf_rds_datmf_count, perf_rds_datmf_min, 10000);
+ //PERF_BLOCK_END("RowDiffScan ", perf_rds_datmf_time, perf_rds_datmf_count, perf_rds_datmf_min, 10000);
}
Modified: branches/theora-playtime/win32/VS2005/encoder_example/encoder_example.vcproj
===================================================================
--- branches/theora-playtime/win32/VS2005/encoder_example/encoder_example.vcproj 2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/win32/VS2005/encoder_example/encoder_example.vcproj 2006-06-12 16:54:54 UTC (rev 11563)
@@ -119,14 +119,14 @@
InlineFunctionExpansion="2"
EnableIntrinsicFunctions="true"
FavorSizeOrSpeed="1"
- WholeProgramOptimization="false"
+ WholeProgramOptimization="true"
AdditionalIncludeDirectories="..\..\..\include;..\..\..\..\libogg\include;..\..\..\..\libvorbis\include"
PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
RuntimeLibrary="0"
UsePrecompiledHeader="0"
WarningLevel="4"
Detect64BitPortabilityProblems="true"
- DebugInformationFormat="3"
+ DebugInformationFormat="0"
/>
<Tool
Name="VCManagedResourceCompilerTool"
@@ -140,7 +140,7 @@
<Tool
Name="VCLinkerTool"
LinkIncremental="1"
- GenerateDebugInformation="true"
+ GenerateDebugInformation="false"
SubSystem="1"
OptimizeReferences="2"
EnableCOMDATFolding="2"
Modified: branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
===================================================================
--- branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj 2006-06-12 15:01:49 UTC (rev 11562)
+++ branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj 2006-06-12 16:54:54 UTC (rev 11563)
@@ -19,7 +19,7 @@
Name="Debug|Win32"
OutputDirectory="$(SolutionDir)$(ConfigurationName)"
IntermediateDirectory="Debug"
- ConfigurationType="2"
+ ConfigurationType="4"
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
CharacterSet="2"
>
@@ -63,23 +63,12 @@
Name="VCPreLinkEventTool"
/>
<Tool
- Name="VCLinkerTool"
- OutputFile="$(OutDir)/libtheora.dll"
- LinkIncremental="2"
- ModuleDefinitionFile="..\..\libtheora.def"
- GenerateDebugInformation="true"
- ProgramDatabaseFile="$(OutDir)/libtheora.pdb"
- SubSystem="2"
- ImportLibrary="$(OutDir)/libtheora.lib"
- TargetMachine="1"
+ Name="VCLibrarianTool"
/>
<Tool
Name="VCALinkTool"
/>
<Tool
- Name="VCManifestTool"
- />
- <Tool
Name="VCXDCMakeTool"
/>
<Tool
@@ -89,12 +78,6 @@
Name="VCFxCopTool"
/>
<Tool
- Name="VCAppVerifierTool"
- />
- <Tool
- Name="VCWebDeploymentTool"
- />
- <Tool
Name="VCPostBuildEventTool"
/>
</Configuration>
@@ -129,9 +112,9 @@
EnableIntrinsicFunctions="true"
FavorSizeOrSpeed="1"
OmitFramePointers="true"
- WholeProgramOptimization="false"
+ WholeProgramOptimization="true"
AdditionalIncludeDirectories="..\..\..\include;..\..\..\..\libogg\include;..\..\..\lib;G:\Dev\xiph\zens_sdk\lib\libtheora-playtime\lib\x86_32_vs"
- PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS; USE_ASM"
+ PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS; USE_ASM; USE_NO_SSE"
StringPooling="true"
ExceptionHandling="0"
RuntimeLibrary="0"
More information about the commits mailing list