[xiph-commits] r11497 - in branches/theora-playtime: lib
lib/x86_32_vs win32/VS2005/libtheora
illiminable at svn.xiph.org
illiminable at svn.xiph.org
Fri Jun 2 09:08:39 PDT 2006
Author: illiminable
Date: 2006-06-02 09:08:20 -0700 (Fri, 02 Jun 2006)
New Revision: 11497
Modified:
branches/theora-playtime/lib/codec_internal.h
branches/theora-playtime/lib/dsp.h
branches/theora-playtime/lib/frinit.c
branches/theora-playtime/lib/pb.c
branches/theora-playtime/lib/reconstruct.c
branches/theora-playtime/lib/x86_32_vs/recon_sse2.c
branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
Log:
* make recon_intra8x8_sse2 use aligned loads
* add recon_inter8x8_sse2
* Temporary 16 byte aligned malloc
* macro'd some mallocs that should be aligned
* Add sse2 setup function to dsp and reconstruct
Modified: branches/theora-playtime/lib/codec_internal.h
===================================================================
--- branches/theora-playtime/lib/codec_internal.h 2006-06-02 00:09:55 UTC (rev 11496)
+++ branches/theora-playtime/lib/codec_internal.h 2006-06-02 16:08:20 UTC (rev 11497)
@@ -26,6 +26,13 @@
#include "huffman.h"
#include "dsp.h"
+/* Allocator pair for buffers that should be 16 byte aligned.
+   Memory obtained from _theora_16_byte_aligned_malloc must always be
+   released with _theora_16_byte_aligned_free, never with _ogg_free
+   directly: under MSVC the two allocators are not interchangeable. */
+#if defined(USE_ASM) && defined(_MSC_VER)
+#define _theora_16_byte_aligned_malloc(x) _aligned_malloc((x),16)
+#define _theora_16_byte_aligned_free(x) _aligned_free((x))
+#else
+/* Fallback: use the ordinary ogg allocator pair.  Both macros have to be
+   defined here because frinit.c and pb.c call the free macro
+   unconditionally.  NOTE(review): this branch presumably gives no 16 byte
+   alignment guarantee - confirm before enabling SSE2 code on other
+   compilers. */
+#define _theora_16_byte_aligned_malloc _ogg_malloc
+#define _theora_16_byte_aligned_free _ogg_free
+#endif
+
#ifndef LIBOGG2
#define theora_read(x,y,z) ( *z = oggpackB_read(x,y) )
#else
Modified: branches/theora-playtime/lib/dsp.h
===================================================================
--- branches/theora-playtime/lib/dsp.h 2006-06-02 00:09:55 UTC (rev 11496)
+++ branches/theora-playtime/lib/dsp.h 2006-06-02 16:08:20 UTC (rev 11497)
@@ -90,6 +90,7 @@
extern void dsp_mmxext_init(DspFunctions *funcs);
extern void dsp_mmx_fdct_init(DspFunctions *funcs);
extern void dsp_mmx_recon_init(DspFunctions *funcs);
+extern void dsp_sse2_recon_init(DspFunctions *funcs);
#endif
#define dsp_save_fpu(funcs) (funcs.save_fpu ())
Modified: branches/theora-playtime/lib/frinit.c
===================================================================
--- branches/theora-playtime/lib/frinit.c 2006-06-02 00:09:55 UTC (rev 11496)
+++ branches/theora-playtime/lib/frinit.c 2006-06-02 16:08:20 UTC (rev 11497)
@@ -277,11 +277,11 @@
void ClearFrameInfo(PB_INSTANCE * pbi){
if(pbi->ThisFrameRecon )
- _ogg_free(pbi->ThisFrameRecon );
+ _theora_16_byte_aligned_free(pbi->ThisFrameRecon );
if(pbi->GoldenFrame)
_ogg_free(pbi->GoldenFrame);
if(pbi->LastFrameRecon)
- _ogg_free(pbi->LastFrameRecon);
+ _theora_16_byte_aligned_free(pbi->LastFrameRecon);
if(pbi->PostProcessBuffer)
_ogg_free(pbi->PostProcessBuffer);
@@ -306,13 +306,13 @@
/* allocate frames */
pbi->ThisFrameRecon =
- _ogg_malloc(FrameSize*sizeof(*pbi->ThisFrameRecon));
+ _theora_16_byte_aligned_malloc(FrameSize*sizeof(*pbi->ThisFrameRecon));
pbi->GoldenFrame =
_ogg_malloc(FrameSize*sizeof(*pbi->GoldenFrame));
pbi->LastFrameRecon =
- _ogg_malloc(FrameSize*sizeof(*pbi->LastFrameRecon));
+ _theora_16_byte_aligned_malloc(FrameSize*sizeof(*pbi->LastFrameRecon));
pbi->PostProcessBuffer =
_ogg_malloc(FrameSize*sizeof(*pbi->PostProcessBuffer));
Modified: branches/theora-playtime/lib/pb.c
===================================================================
--- branches/theora-playtime/lib/pb.c 2006-06-02 00:09:55 UTC (rev 11496)
+++ branches/theora-playtime/lib/pb.c 2006-06-02 16:08:20 UTC (rev 11497)
@@ -22,7 +22,7 @@
void ClearTmpBuffers(PB_INSTANCE * pbi){
if(pbi->ReconDataBuffer)
- _ogg_free(pbi->ReconDataBuffer);
+ _theora_16_byte_aligned_free(pbi->ReconDataBuffer);
if(pbi->DequantBuffer)
_ogg_free(pbi->DequantBuffer);
if(pbi->TmpDataBuffer)
@@ -63,7 +63,7 @@
/* Adjust the position of all of our temporary */
pbi->ReconDataBuffer =
- _ogg_malloc(64*sizeof(*pbi->ReconDataBuffer));
+ _theora_16_byte_aligned_malloc(64*sizeof(*pbi->ReconDataBuffer));
pbi->DequantBuffer =
_ogg_malloc(64 * sizeof(*pbi->DequantBuffer));
Modified: branches/theora-playtime/lib/reconstruct.c
===================================================================
--- branches/theora-playtime/lib/reconstruct.c 2006-06-02 00:09:55 UTC (rev 11496)
+++ branches/theora-playtime/lib/reconstruct.c 2006-06-02 16:08:20 UTC (rev 11497)
@@ -105,7 +105,9 @@
funcs->recon_inter8x8 = recon_inter8x8__c;
funcs->recon_inter8x8_half = recon_inter8x8_half__c;
#if defined(USE_ASM)
- if (cpu_flags & CPU_X86_MMX) {
+ if (cpu_flags & CPU_X86_SSE2) {
+ dsp_sse2_recon_init(funcs);
+ } else if (cpu_flags & CPU_X86_MMX) {
dsp_mmx_recon_init(funcs);
}
#endif
Modified: branches/theora-playtime/lib/x86_32_vs/recon_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/recon_sse2.c 2006-06-02 00:09:55 UTC (rev 11496)
+++ branches/theora-playtime/lib/x86_32_vs/recon_sse2.c 2006-06-02 16:08:20 UTC (rev 11497)
@@ -14,7 +14,7 @@
#include "dsp.h"
#include "cpu.h"
-static const unsigned int V128x16[4] = { 0x80808080, 0x80808080, 0x80808080, 0x80808080 };
+static __declspec(align(16)) const unsigned int V128x16[4] = { 0x80808080, 0x80808080, 0x80808080, 0x80808080 };
static const unsigned int* V128x16Ptr = V128x16;
static void copy8x8__sse2 (unsigned char *src,
@@ -66,12 +66,6 @@
lea edx, [ecx + ecx * 4]
lea esi, [ecx + edi * 2]
- /*
- TODO::: If we can somehow ensure each addressed element of src
- and dest, were 16 byte aligned could maybe use movdqa which might be
- faster. That requires that the base pointer is aligned,
- and that the stride is a multiple of 16
- */
/* Load all 8 registers */
movq xmm0, QWORD PTR [eax]
@@ -143,6 +137,12 @@
7 CCCC CCCC .... .... ....
*/
+
+
+ /* ChangePtr must be 16 byte aligned */
+ /* LineStep must be a multiple of 16 */
+
+
__asm {
align 16
@@ -152,34 +152,34 @@
mov ecx, LineStep
mov edx, V128x16Ptr
- /* Check whether we can use movdqa for 16 byte alignment */
- movdqu xmm7, [edx]
+ movdqa xmm7, [edx]
/* 8 lots of int16 per register on the first mov */
/* Then packs those 8 + another 8 down to 16x 8 bits */
/* Loads the data in only 4 iterations into different registers */
/* Maybe just make all the loads use offset addresses and no lea? */
/* Iteration 1 - xmm0 */
- movdqu xmm0, [ebx]
- packsswb xmm0, [ebx + 16]
+ movdqa xmm0, [ebx]
+ movdqa xmm6, [ebx + 16]
+ packsswb xmm0, xmm6 /*[ebx + 16]*/
pxor xmm0, xmm7
lea ebx, [ebx + 32]
/* Iteration 2 - xmm1*/
- movdqu xmm1, [ebx]
+ movdqa xmm1, [ebx]
packsswb xmm1, [ebx + 16]
pxor xmm1, xmm7
lea ebx, [ebx + 32]
/* Iteration 3 - xmm2 */
- movdqu xmm2, [ebx]
+ movdqa xmm2, [ebx]
packsswb xmm2, [ebx + 16]
pxor xmm2, xmm7
lea ebx, [ebx + 32]
/* Iteration 4 - xmm3 */
- movdqu xmm3, [ebx]
+ movdqa xmm3, [ebx]
packsswb xmm3, [ebx + 16]
pxor xmm3, xmm7
/* lea ebx, [ebx + 16] */
@@ -220,6 +220,8 @@
static void recon_inter8x8__sse2 (unsigned char *ReconPtr, unsigned char *RefPtr,
ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
{
+
+#if 0
ogg_uint32_t i;
for (i = 8; i; i--){
@@ -236,6 +238,175 @@
ReconPtr += LineStep;
RefPtr += LineStep;
}
+#else
+
+ /*
+
+ @RefPtr
+ <--- Line Step -------->
+ 0 RRRR RRRR .... .... ....
+ ...
+ 7 RRRR RRRR .... .... ....
+
+
+ @ChangePtr
+ 0 HLHL HLHL HLHL HLHL
+ ...
+ 7 HLHL HLHL HLHL HLHL
+
+
+ @ReconPtr
+ <--- Line Step -------->
+ 0 XXXX XXXX .... .... ....
+ ...
+ 7 XXXX XXXX .... .... ....
+
+
+ Y = HL (16 bits, H-high 8, L-low 8)
+
+ X = clamp255(Y+R)
+
+ */
+
+
+
+ /* TODO - It could be better on this one to interleave the writes
+ as it goes */
+
+ __asm {
+ align 16
+
+ /* Setup params */
+ mov eax, ReconPtr
+ mov ebx, ChangePtr
+ mov ecx, LineStep
+ mov edx, RefPtr /* This pointer isn't properly aligned - recheck this*/
+
+ /* xmm0 = 0 */
+ pxor xmm0, xmm0
+
+
+ /* Iteration 1&2 */
+ movdqa xmm1, [ebx]
+ movq xmm2, QWORD PTR [edx]
+ punpcklbw xmm2, xmm0
+ paddsw xmm1, xmm2
+
+ /* xmm1 holds 8x8bit of output spread into 8x16bit */
+
+ movdqa xmm2, [ebx + 16]
+ movq xmm3, QWORD PTR [edx + ecx]
+ punpcklbw xmm3, xmm0
+ paddsw xmm2, xmm3
+
+ /* xmm2 holds 8x8bit of output spread into 8x16bit */
+
+
+ /* Advance Pointers */
+ lea ebx, [ebx + 32]
+ lea edx, [edx + ecx * 2]
+
+
+ /* Iteration 3&4 */
+ movdqa xmm3, [ebx]
+ movq xmm4, QWORD PTR [edx]
+ punpcklbw xmm4, xmm0
+ paddsw xmm3, xmm4
+
+ /* xmm3 holds 8x8bit of output spread into 8x16bit */
+
+ movdqa xmm4, [ebx + 16]
+ movq xmm5, QWORD PTR [edx + ecx]
+ punpcklbw xmm5, xmm0
+ paddsw xmm4, xmm5
+
+ /* xmm4 holds 8x8bit of output spread into 8x16bit */
+
+
+ /* Advance Pointers */
+ lea ebx, [ebx + 32]
+ lea edx, [edx + ecx * 2]
+
+
+ /* Consolidate the results from 4 registers of 16bits to 2 of 8bits */
+ packuswb xmm1, xmm2
+ packuswb xmm3, xmm4
+
+
+
+ /* Iteration 5&6 */
+ movdqa xmm6, [ebx]
+ movq xmm2, QWORD PTR [edx]
+ punpcklbw xmm2, xmm0
+ paddsw xmm6, xmm2
+
+ /* xmm6 holds 8x8bit of output spread into 8x16bit */
+
+ movdqa xmm2, [ebx + 16]
+ movq xmm7, QWORD PTR [edx + ecx]
+ punpcklbw xmm7, xmm0
+ paddsw xmm2, xmm7
+
+ /* xmm2 holds 8x8bit of output spread into 8x16bit */
+
+
+ /* Advance Pointers */
+ lea ebx, [ebx + 32]
+ lea edx, [edx + ecx * 2]
+
+
+ /* Iteration 7&8 */
+ movdqa xmm7, [ebx]
+ movq xmm4, QWORD PTR [edx]
+ punpcklbw xmm4, xmm0
+ paddsw xmm7, xmm4
+
+ /* xmm7 holds 8x8bit of output spread into 8x16bit */
+
+ movdqa xmm4, [ebx + 16]
+ movq xmm5, QWORD PTR [edx + ecx]
+ punpcklbw xmm5, xmm0
+ paddsw xmm4, xmm5
+
+ /* xmm4 holds 8x8bit of output spread into 8x16bit */
+
+
+ /* Consolidate the results from 4 registers of 16bits to 2 of 8bits */
+ packuswb xmm6, xmm2
+ packuswb xmm7, xmm4
+
+
+ /* Write the results out */
+
+ /* Iteration 1&2 - xmm1 */
+ movq QWORD PTR [eax], xmm1
+ psrldq xmm1, 8
+ movq QWORD PTR [eax + ecx], xmm1
+ lea eax, [eax + ecx * 2]
+
+ /* Iteration 3&4 - xmm3 */
+ movq QWORD PTR [eax], xmm3
+ psrldq xmm3, 8
+ movq QWORD PTR [eax + ecx], xmm3
+ lea eax, [eax + ecx * 2]
+
+ /* Iteration 5&6 - xmm6 */
+ movq QWORD PTR [eax], xmm6
+ psrldq xmm6, 8
+ movq QWORD PTR [eax + ecx], xmm6
+ lea eax, [eax + ecx * 2]
+
+ /* Iteration 7&8 - xmm7 */
+ movq QWORD PTR [eax], xmm7
+ psrldq xmm7, 8
+ movq QWORD PTR [eax + ecx], xmm7
+ /* lea eax, [eax + ecx]*/
+
+
+
+ };
+
+#endif
}
static void recon_inter8x8_half__sse2 (unsigned char *ReconPtr, unsigned char *RefPtr1,
Modified: branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
===================================================================
--- branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj 2006-06-02 00:09:55 UTC (rev 11496)
+++ branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj 2006-06-02 16:08:20 UTC (rev 11497)
@@ -46,6 +46,7 @@
MinimalRebuild="true"
BasicRuntimeChecks="3"
RuntimeLibrary="1"
+ EnableEnhancedInstructionSet="0"
UsePrecompiledHeader="0"
WarningLevel="3"
Detect64BitPortabilityProblems="true"
@@ -470,6 +471,10 @@
>
</File>
<File
+ RelativePath="..\..\..\lib\x86_32_vs\recon_sse2.c"
+ >
+ </File>
+ <File
RelativePath="..\..\..\lib\reconstruct.c"
>
</File>
@@ -492,6 +497,10 @@
>
</File>
<File
+ RelativePath="..\..\..\lib\codec_internal.h"
+ >
+ </File>
+ <File
RelativePath="..\..\..\lib\cpu.h"
>
</File>
More information about the commits mailing list