[xiph-commits] r11497 - in branches/theora-playtime: lib lib/x86_32_vs win32/VS2005/libtheora

illiminable at svn.xiph.org illiminable at svn.xiph.org
Fri Jun 2 09:08:39 PDT 2006


Author: illiminable
Date: 2006-06-02 09:08:20 -0700 (Fri, 02 Jun 2006)
New Revision: 11497

Modified:
   branches/theora-playtime/lib/codec_internal.h
   branches/theora-playtime/lib/dsp.h
   branches/theora-playtime/lib/frinit.c
   branches/theora-playtime/lib/pb.c
   branches/theora-playtime/lib/reconstruct.c
   branches/theora-playtime/lib/x86_32_vs/recon_sse2.c
   branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
Log:
* make recon_intra8x8_sse2 use aligned loads
* add recon_inter8x8_sse2
* Temporary 16 byte aligned malloc
* macro'd some mallocs that should be aligned
* Add sse2 setup function to dsp and reconstruct


Modified: branches/theora-playtime/lib/codec_internal.h
===================================================================
--- branches/theora-playtime/lib/codec_internal.h	2006-06-02 00:09:55 UTC (rev 11496)
+++ branches/theora-playtime/lib/codec_internal.h	2006-06-02 16:08:20 UTC (rev 11497)
@@ -26,6 +26,19 @@
 #include "huffman.h"
 #include "dsp.h"
 
+/* Allocation pair for buffers that the SSE2 code reads with aligned loads
+   (movdqa). Memory obtained from MSVC's _aligned_malloc() must be released
+   with _aligned_free(), so both macros are always defined together; every
+   other build maps the pair onto the ordinary _ogg_malloc/_ogg_free, which
+   keeps callers of _theora_16_byte_aligned_free() compiling there too. */
+#if defined(USE_ASM) && defined(_MSC_VER)
+#define _theora_16_byte_aligned_malloc(x) _aligned_malloc((x),16)
+#define _theora_16_byte_aligned_free(x) _aligned_free((x))
+#else
+#define _theora_16_byte_aligned_malloc _ogg_malloc
+#define _theora_16_byte_aligned_free _ogg_free
+#endif
+
 #ifndef LIBOGG2
 #define theora_read(x,y,z) ( *z = oggpackB_read(x,y) )
 #else

Modified: branches/theora-playtime/lib/dsp.h
===================================================================
--- branches/theora-playtime/lib/dsp.h	2006-06-02 00:09:55 UTC (rev 11496)
+++ branches/theora-playtime/lib/dsp.h	2006-06-02 16:08:20 UTC (rev 11497)
@@ -90,6 +90,7 @@
 extern void dsp_mmxext_init(DspFunctions *funcs);
 extern void dsp_mmx_fdct_init(DspFunctions *funcs);
 extern void dsp_mmx_recon_init(DspFunctions *funcs);
+extern void dsp_sse2_recon_init(DspFunctions *funcs);
 #endif
 
 #define dsp_save_fpu(funcs) (funcs.save_fpu ())

Modified: branches/theora-playtime/lib/frinit.c
===================================================================
--- branches/theora-playtime/lib/frinit.c	2006-06-02 00:09:55 UTC (rev 11496)
+++ branches/theora-playtime/lib/frinit.c	2006-06-02 16:08:20 UTC (rev 11497)
@@ -277,11 +277,11 @@
 
 void ClearFrameInfo(PB_INSTANCE * pbi){
   if(pbi->ThisFrameRecon )
-    _ogg_free(pbi->ThisFrameRecon );
+    _theora_16_byte_aligned_free(pbi->ThisFrameRecon );
   if(pbi->GoldenFrame)
     _ogg_free(pbi->GoldenFrame);
   if(pbi->LastFrameRecon)
-    _ogg_free(pbi->LastFrameRecon);
+    _theora_16_byte_aligned_free(pbi->LastFrameRecon);
   if(pbi->PostProcessBuffer)
     _ogg_free(pbi->PostProcessBuffer);
 
@@ -306,13 +306,13 @@
 
   /* allocate frames */
   pbi->ThisFrameRecon =
-    _ogg_malloc(FrameSize*sizeof(*pbi->ThisFrameRecon));
+    _theora_16_byte_aligned_malloc(FrameSize*sizeof(*pbi->ThisFrameRecon));
 
   pbi->GoldenFrame =
     _ogg_malloc(FrameSize*sizeof(*pbi->GoldenFrame));
 
   pbi->LastFrameRecon =
-    _ogg_malloc(FrameSize*sizeof(*pbi->LastFrameRecon));
+    _theora_16_byte_aligned_malloc(FrameSize*sizeof(*pbi->LastFrameRecon));
 
   pbi->PostProcessBuffer =
     _ogg_malloc(FrameSize*sizeof(*pbi->PostProcessBuffer));

Modified: branches/theora-playtime/lib/pb.c
===================================================================
--- branches/theora-playtime/lib/pb.c	2006-06-02 00:09:55 UTC (rev 11496)
+++ branches/theora-playtime/lib/pb.c	2006-06-02 16:08:20 UTC (rev 11497)
@@ -22,7 +22,7 @@
 void ClearTmpBuffers(PB_INSTANCE * pbi){
 
   if(pbi->ReconDataBuffer)
-    _ogg_free(pbi->ReconDataBuffer);
+    _theora_16_byte_aligned_free(pbi->ReconDataBuffer);
   if(pbi->DequantBuffer)
     _ogg_free(pbi->DequantBuffer);
   if(pbi->TmpDataBuffer)
@@ -63,7 +63,7 @@
 
   /* Adjust the position of all of our temporary */
   pbi->ReconDataBuffer      =
-    _ogg_malloc(64*sizeof(*pbi->ReconDataBuffer));
+    _theora_16_byte_aligned_malloc(64*sizeof(*pbi->ReconDataBuffer));
 
   pbi->DequantBuffer        =
     _ogg_malloc(64 * sizeof(*pbi->DequantBuffer));

Modified: branches/theora-playtime/lib/reconstruct.c
===================================================================
--- branches/theora-playtime/lib/reconstruct.c	2006-06-02 00:09:55 UTC (rev 11496)
+++ branches/theora-playtime/lib/reconstruct.c	2006-06-02 16:08:20 UTC (rev 11497)
@@ -105,7 +105,9 @@
   funcs->recon_inter8x8 = recon_inter8x8__c;
   funcs->recon_inter8x8_half = recon_inter8x8_half__c;
 #if defined(USE_ASM)
-  if (cpu_flags & CPU_X86_MMX) {
+  if (cpu_flags & CPU_X86_SSE2) {
+    dsp_sse2_recon_init(funcs);
+  } else if (cpu_flags & CPU_X86_MMX) {
     dsp_mmx_recon_init(funcs);
   }
 #endif

Modified: branches/theora-playtime/lib/x86_32_vs/recon_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/recon_sse2.c	2006-06-02 00:09:55 UTC (rev 11496)
+++ branches/theora-playtime/lib/x86_32_vs/recon_sse2.c	2006-06-02 16:08:20 UTC (rev 11497)
@@ -14,7 +14,7 @@
 #include "dsp.h"
 #include "cpu.h"
 
-static const unsigned int V128x16[4] = { 0x80808080, 0x80808080, 0x80808080, 0x80808080 };
+static __declspec(align(16)) const unsigned int V128x16[4] = { 0x80808080, 0x80808080, 0x80808080, 0x80808080 };
 static const unsigned int* V128x16Ptr = V128x16;
 
 static void copy8x8__sse2 (unsigned char *src,
@@ -66,12 +66,6 @@
         lea         edx, [ecx + ecx * 4]
         lea         esi, [ecx + edi * 2]
 
-        /* 
-            TODO::: If we can somehow ensure each addressed element of src 
-            and dest, were 16 byte aligned could maybe use movdqa which might be
-            faster. That requires that the base pointer is aligned,
-            and that the stride is a multiple of 16
-            */
 
         /* Load all 8 registers */
         movq      xmm0, QWORD PTR [eax]
@@ -143,6 +137,12 @@
     7   CCCC CCCC .... .... .... 
     */
 
+
+
+    /* ChangePtr must be 16 byte aligned */
+    /* LineStep must be a multiple of 16 */
+
+
     __asm {
 
         align 16
@@ -152,34 +152,34 @@
         mov     ecx, LineStep
         mov     edx, V128x16Ptr
 
-        /* Check whether we can use movdqa for 16 byte alignment */
 
-        movdqu      xmm7, [edx]
+        movdqa      xmm7, [edx]
         /* 8 lots of int16 per register on the first mov */
         /* Then packs those 8 + another 8 down to 16x 8 bits */
         /* Loads the data in only 4 iterations into different registers */
         /* Maybe just make all the loads offsetted adress and no lea? */
         
         /* Iteration 1 - xmm0 */
-        movdqu      xmm0, [ebx]
-        packsswb    xmm0, [ebx + 16]
+        movdqa      xmm0, [ebx]
+        movdqa      xmm6, [ebx + 16]
+        packsswb    xmm0, xmm6  /*[ebx + 16]*/
         pxor        xmm0, xmm7
         lea         ebx, [ebx + 32]
 
         /* Iteration 2 - xmm1*/
-        movdqu      xmm1, [ebx]
+        movdqa      xmm1, [ebx]
         packsswb    xmm1, [ebx + 16]
         pxor        xmm1, xmm7
         lea         ebx, [ebx + 32]
 
         /* Iteration 3 - xmm2 */
-        movdqu      xmm2, [ebx]
+        movdqa      xmm2, [ebx]
         packsswb    xmm2, [ebx + 16]
         pxor        xmm2, xmm7
         lea         ebx, [ebx + 32]
 
         /* Iteration 4 - xmm3 */
-        movdqu      xmm3, [ebx]
+        movdqa      xmm3, [ebx]
         packsswb    xmm3, [ebx + 16]
         pxor        xmm3, xmm7
         /* lea         ebx, [ebx + 16] */
@@ -220,6 +220,8 @@
 static void recon_inter8x8__sse2 (unsigned char *ReconPtr, unsigned char *RefPtr,
 		      ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
 {
+
+#if 0
   ogg_uint32_t i;
 
   for (i = 8; i; i--){
@@ -236,6 +238,175 @@
     ReconPtr += LineStep;
     RefPtr += LineStep;
   }
+#else
+    
+    /*
+
+            @RefPtr
+            <--- Line Step -------->
+    0       RRRR RRRR .... .... ....
+    ...
+    7       RRRR RRRR .... .... ....
+
+
+            @ChangePtr
+    0       HLHL HLHL HLHL HLHL
+    ...
+    7       HLHL HLHL HLHL HLHL
+
+
+            @ReconPtr
+            <--- Line Step -------->
+    0       XXXX XXXX .... .... ....
+    ...
+    7       XXXX XXXX .... .... ....
+
+
+    Y = HL (16 bits, H-high 8, L-low 8)
+
+    X = clamp255(Y+R)
+
+    */
+
+
+
+    /* TODO - It could be better on this one to interleave the writes
+        as it goes */
+
+    __asm {
+        align   16
+        
+        /* Setup params */
+        mov         eax, ReconPtr
+        mov         ebx, ChangePtr
+        mov         ecx, LineStep
+        mov         edx, RefPtr     /* This pointer isn't properly aligned - recheck this*/
+
+        /* xmm0 = 0 */
+        pxor        xmm0, xmm0
+
+
+        /* Iteration 1&2 */
+        movdqa      xmm1, [ebx]
+        movq        xmm2, QWORD PTR [edx]
+        punpcklbw   xmm2, xmm0
+        paddsw      xmm1, xmm2
+
+        /* xmm1 holds 8x8bit of output spread into 8x16bit */
+
+        movdqa      xmm2, [ebx + 16]
+        movq        xmm3, QWORD PTR [edx + ecx]
+        punpcklbw   xmm3, xmm0
+        paddsw      xmm2, xmm3
+
+        /* xmm2 holds 8x8bit of output spread into 8x16bit */
+
+        
+        /* Advance Pointers */
+        lea         ebx, [ebx + 32]
+        lea         edx, [edx + ecx * 2]
+
+
+        /* Iteration 3&4 */
+        movdqa      xmm3, [ebx]
+        movq        xmm4, QWORD PTR [edx]
+        punpcklbw   xmm4, xmm0
+        paddsw      xmm3, xmm4
+
+        /* xmm3 holds 8x8bit of output spread into 8x16bit */
+
+        movdqa      xmm4, [ebx + 16]
+        movq        xmm5, QWORD PTR [edx + ecx]
+        punpcklbw   xmm5, xmm0
+        paddsw      xmm4, xmm5
+
+        /* xmm4 holds 8x8bit of output spread into 8x16bit */
+
+        
+        /* Advance Pointers */
+        lea         ebx, [ebx + 32]
+        lea         edx, [edx + ecx * 2]
+
+
+        /* Consolidate the results from 4 registers of 16bits to 2 of 8bits */
+        packuswb    xmm1, xmm2
+        packuswb    xmm3, xmm4
+
+
+
+        /* Iteration 5&6 */
+        movdqa      xmm6, [ebx]
+        movq        xmm2, QWORD PTR [edx]
+        punpcklbw   xmm2, xmm0
+        paddsw      xmm6, xmm2
+
+        /* xmm6 holds 8x8bit of output spread into 8x16bit */
+
+        movdqa      xmm2, [ebx + 16]
+        movq        xmm7, QWORD PTR [edx + ecx]
+        punpcklbw   xmm7, xmm0
+        paddsw      xmm2, xmm7
+
+        /* xmm2 holds 8x8bit of output spread into 8x16bit */
+
+        
+        /* Advance Pointers */
+        lea         ebx, [ebx + 32]
+        lea         edx, [edx + ecx * 2]
+
+
+        /* Iteration 7&8 */
+        movdqa      xmm7, [ebx]
+        movq        xmm4, QWORD PTR [edx]
+        punpcklbw   xmm4, xmm0
+        paddsw      xmm7, xmm4
+
+        /* xmm7 holds 8x8bit of output spread into 8x16bit */
+
+        movdqa      xmm4, [ebx + 16]
+        movq        xmm5, QWORD PTR [edx + ecx]
+        punpcklbw   xmm5, xmm0
+        paddsw      xmm4, xmm5
+
+        /* xmm4 holds 8x8bit of output spread into 8x16bit */
+
+        
+        /* Consolidate the results from 4 registers of 16bits to 2 of 8bits */
+        packuswb    xmm6, xmm2
+        packuswb    xmm7, xmm4
+        
+
+        /* Write the results out */
+
+        /* Iteration 1&2 - xmm1 */
+        movq        QWORD PTR [eax], xmm1
+        psrldq      xmm1, 8
+        movq        QWORD PTR [eax + ecx], xmm1
+        lea         eax, [eax + ecx * 2]
+        
+        /* Iteration 3&4 - xmm3 */
+        movq        QWORD PTR [eax], xmm3
+        psrldq      xmm3, 8
+        movq        QWORD PTR [eax + ecx], xmm3
+        lea         eax, [eax + ecx * 2]
+
+        /* Iteration 5&6 - xmm6 */
+        movq        QWORD PTR [eax], xmm6
+        psrldq      xmm6, 8
+        movq        QWORD PTR [eax + ecx], xmm6
+        lea         eax, [eax + ecx * 2]
+
+        /* Iteration 7&8 - xmm7 */
+        movq        QWORD PTR [eax], xmm7
+        psrldq      xmm7, 8
+        movq        QWORD PTR [eax + ecx], xmm7
+        /* lea         eax, [eax + ecx]*/
+
+
+ 
+    };
+
+#endif
 }
 
 static void recon_inter8x8_half__sse2 (unsigned char *ReconPtr, unsigned char *RefPtr1,

Modified: branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
===================================================================
--- branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj	2006-06-02 00:09:55 UTC (rev 11496)
+++ branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj	2006-06-02 16:08:20 UTC (rev 11497)
@@ -46,6 +46,7 @@
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
 				RuntimeLibrary="1"
+				EnableEnhancedInstructionSet="0"
 				UsePrecompiledHeader="0"
 				WarningLevel="3"
 				Detect64BitPortabilityProblems="true"
@@ -470,6 +471,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\..\lib\x86_32_vs\recon_sse2.c"
+				>
+			</File>
+			<File
 				RelativePath="..\..\..\lib\reconstruct.c"
 				>
 			</File>
@@ -492,6 +497,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\..\lib\codec_internal.h"
+				>
+			</File>
+			<File
 				RelativePath="..\..\..\lib\cpu.h"
 				>
 			</File>



More information about the commits mailing list