[xiph-commits] r11516 - in branches/theora-playtime: lib lib/x86_32_vs win32/VS2005/libtheora

Sun Jun 4 06:30:09 PDT 2006

Author: illiminable
Date: 2006-06-04 06:29:50 -0700 (Sun, 04 Jun 2006)
New Revision: 11516

Added:
   branches/theora-playtime/lib/x86_32_vs/idct_sse2.c
Modified:
   branches/theora-playtime/lib/dsp.h
   branches/theora-playtime/lib/encoder_toplevel.c
   branches/theora-playtime/lib/idct.c
   branches/theora-playtime/lib/pb.c
   branches/theora-playtime/lib/quant_lookup.h
   branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
Log:
* Force alignment of memory used by idct.c
* Align the dezigzag table
* Implement dequant_slow__sse2


Modified: branches/theora-playtime/lib/dsp.h
===================================================================

--- branches/theora-playtime/lib/dsp.h	2006-06-04 11:16:48 UTC (rev 11515)
+++ branches/theora-playtime/lib/dsp.h	2006-06-04 13:29:50 UTC (rev 11516)
@@ -83,23 +83,23 @@
   /* iDCT Functions */
 
 
-  void (*IDct1)( ogg_int16_t/*Q_LIST_ENTRY*/ * InputData, ogg_int16_t *QuantMatrix,
+  void (*IDct1)( ogg_int16_t/*Q_LIST_ENTRY*/ * InputData, ogg_int16_t *QuantMatrix,
                         ogg_int16_t * OutputData );
 
-  void (*dequant_slow10)( ogg_int16_t * dequant_coeffs,
-                     ogg_int16_t * quantized_list,
+  void (*dequant_slow10)( ogg_int16_t * dequant_coeffs,
+                     ogg_int16_t * quantized_list,
                      ogg_int32_t * DCT_block);
 
-  void (*IDct10)( ogg_int16_t/*Q_LIST_ENTRY*/ * InputData,
-             ogg_int16_t *QuantMatrix,
+  void (*IDct10)( ogg_int16_t/*Q_LIST_ENTRY*/ * InputData,
+             ogg_int16_t *QuantMatrix,
              ogg_int16_t * OutputData );
 
-  void (*IDctSlow)(  ogg_int16_t/*Q_LIST_ENTRY*/ * InputData,
-                ogg_int16_t *QuantMatrix,
+  void (*IDctSlow)(  ogg_int16_t/*Q_LIST_ENTRY*/ * InputData,
+                ogg_int16_t *QuantMatrix,
                 ogg_int16_t * OutputData );
 
-  void (*dequant_slow)( ogg_int16_t * dequant_coeffs,
-                   ogg_int16_t * quantized_list,
+  void (*dequant_slow)( ogg_int16_t * dequant_coeffs,
+                   ogg_int16_t * quantized_list,
                    ogg_int32_t * DCT_block);
 
 

Modified: branches/theora-playtime/lib/encoder_toplevel.c
===================================================================
--- branches/theora-playtime/lib/encoder_toplevel.c	2006-06-04 11:16:48 UTC (rev 11515)
+++ branches/theora-playtime/lib/encoder_toplevel.c	2006-06-04 13:29:50 UTC (rev 11516)
@@ -51,7 +51,7 @@
   if(cpi->DCTDataBuffer )
     _theora_16_byte_aligned_free( cpi->DCTDataBuffer);
   if(cpi->quantized_list)
-    _ogg_free( cpi->quantized_list);
+    _theora_16_byte_aligned_free( cpi->quantized_list);
   if(cpi->OriginalDC)
     _ogg_free( cpi->OriginalDC);
   if(cpi->PartiallyCodedFlags)
@@ -143,7 +143,7 @@
     _theora_16_byte_aligned_malloc(64*
                 sizeof(*cpi->DCTDataBuffer));
   cpi->quantized_list =
-    _ogg_malloc(64*
+    _theora_16_byte_aligned_malloc(64*
                 sizeof(*cpi->quantized_list));
   cpi->PartiallyCodedFlags =
     _ogg_malloc(cpi->pb.MacroBlocks*

Modified: branches/theora-playtime/lib/idct.c
===================================================================
--- branches/theora-playtime/lib/idct.c	2006-06-04 11:16:48 UTC (rev 11515)
+++ branches/theora-playtime/lib/idct.c	2006-06-04 13:29:50 UTC (rev 11516)
@@ -16,6 +16,8 @@
  ********************************************************************/
 
 #include <string.h>
+#include "dsp.h"
+#include "cpu.h"
 #include "codec_internal.h"
 #include "quant_lookup.h"
 
@@ -571,8 +573,8 @@
   //  dsp_mmx_idct_init(funcs);
   //}
 
-  //if (cpu_flags & CPU_X86_SSE2) {
-  //  dsp_sse2_idct_init(funcs);
-  //}
+  if (cpu_flags & CPU_X86_SSE2) {
+    dsp_sse2_idct_init(funcs);
+  }
 #endif
 }
\ No newline at end of file

Modified: branches/theora-playtime/lib/pb.c
===================================================================
--- branches/theora-playtime/lib/pb.c	2006-06-04 11:16:48 UTC (rev 11515)
+++ branches/theora-playtime/lib/pb.c	2006-06-04 13:29:50 UTC (rev 11516)
@@ -24,23 +24,23 @@
   if(pbi->ReconDataBuffer)
     _theora_16_byte_aligned_free(pbi->ReconDataBuffer);
   if(pbi->DequantBuffer)
-    _ogg_free(pbi->DequantBuffer);
+    _theora_16_byte_aligned_free(pbi->DequantBuffer);
   if(pbi->TmpDataBuffer)
-    _ogg_free(pbi->TmpDataBuffer);
+    _theora_16_byte_aligned_free(pbi->TmpDataBuffer);
   if(pbi->TmpReconBuffer)
-    _ogg_free(pbi->TmpReconBuffer);
+    _theora_16_byte_aligned_free(pbi->TmpReconBuffer);
   if(pbi->dequant_Y_coeffs)
-    _ogg_free(pbi->dequant_Y_coeffs);
+    _theora_16_byte_aligned_free(pbi->dequant_Y_coeffs);
   if(pbi->dequant_U_coeffs)
-    _ogg_free(pbi->dequant_U_coeffs);
+    _theora_16_byte_aligned_free(pbi->dequant_U_coeffs);
   if(pbi->dequant_V_coeffs)
-    _ogg_free(pbi->dequant_V_coeffs);
+    _theora_16_byte_aligned_free(pbi->dequant_V_coeffs);
   if(pbi->dequant_InterY_coeffs)
-    _ogg_free(pbi->dequant_InterY_coeffs);
+    _theora_16_byte_aligned_free(pbi->dequant_InterY_coeffs);
   if(pbi->dequant_InterU_coeffs)
-    _ogg_free(pbi->dequant_InterU_coeffs);
+    _theora_16_byte_aligned_free(pbi->dequant_InterU_coeffs);
   if(pbi->dequant_InterV_coeffs)
-    _ogg_free(pbi->dequant_InterV_coeffs);
+    _theora_16_byte_aligned_free(pbi->dequant_InterV_coeffs);
 
 
   pbi->ReconDataBuffer=0;
@@ -66,31 +66,31 @@
     _theora_16_byte_aligned_malloc(64*sizeof(*pbi->ReconDataBuffer));
 
   pbi->DequantBuffer        =
-    _ogg_malloc(64 * sizeof(*pbi->DequantBuffer));
+    _theora_16_byte_aligned_malloc(64 * sizeof(*pbi->DequantBuffer));
 
   pbi->TmpDataBuffer        =
-    _ogg_malloc(64 * sizeof(*pbi->TmpDataBuffer));
+    _theora_16_byte_aligned_malloc(64 * sizeof(*pbi->TmpDataBuffer));
 
   pbi->TmpReconBuffer       =
-    _ogg_malloc(64 * sizeof(*pbi->TmpReconBuffer));
+    _theora_16_byte_aligned_malloc(64 * sizeof(*pbi->TmpReconBuffer));
 
   pbi->dequant_Y_coeffs     =
-    _ogg_malloc(64 * sizeof(*pbi->dequant_Y_coeffs));
+    _theora_16_byte_aligned_malloc(64 * sizeof(*pbi->dequant_Y_coeffs));
 
   pbi->dequant_U_coeffs    =
-    _ogg_malloc(64 * sizeof(*pbi->dequant_U_coeffs));
+    _theora_16_byte_aligned_malloc(64 * sizeof(*pbi->dequant_U_coeffs));
 
   pbi->dequant_V_coeffs    =
-    _ogg_malloc(64 * sizeof(*pbi->dequant_V_coeffs));
+    _theora_16_byte_aligned_malloc(64 * sizeof(*pbi->dequant_V_coeffs));
 
   pbi->dequant_InterY_coeffs =
-    _ogg_malloc(64 * sizeof(*pbi->dequant_InterY_coeffs));
+    _theora_16_byte_aligned_malloc(64 * sizeof(*pbi->dequant_InterY_coeffs));
 
   pbi->dequant_InterU_coeffs =
-    _ogg_malloc(64 * sizeof(*pbi->dequant_InterU_coeffs));
+    _theora_16_byte_aligned_malloc(64 * sizeof(*pbi->dequant_InterU_coeffs));
 
   pbi->dequant_InterV_coeffs =
-    _ogg_malloc(64 * sizeof(*pbi->dequant_InterV_coeffs));
+    _theora_16_byte_aligned_malloc(64 * sizeof(*pbi->dequant_InterV_coeffs));
 
 }
 

Modified: branches/theora-playtime/lib/quant_lookup.h
===================================================================
--- branches/theora-playtime/lib/quant_lookup.h	2006-06-04 11:16:48 UTC (rev 11515)
+++ branches/theora-playtime/lib/quant_lookup.h	2006-06-04 13:29:50 UTC (rev 11516)
@@ -25,7 +25,21 @@
 #define IDCT_SCALE_FACTOR     2 /* Shift left bits to improve IDCT precision */
 #define OLD_SCHEME            1
 
+#if defined(USE_ASM) && defined(WIN32)
 /* lookup table for DCT coefficient zig-zag ordering */
+static const __declspec(align(16)) ogg_uint32_t dezigzag_index[64] = {
+  0,  1,  8,  16,  9,  2,  3, 10,
+  17, 24, 32, 25, 18, 11,  4,  5,
+  12, 19, 26, 33, 40, 48, 41, 34,
+  27, 20, 13,  6,  7, 14, 21, 28,
+  35, 42, 49, 56, 57, 50, 43, 36,
+  29, 22, 15, 23, 30, 37, 44, 51,
+  58, 59, 52, 45, 38, 31, 39, 46,
+  53, 60, 61, 54, 47, 55, 62, 63
+};
+
+#else
+/* lookup table for DCT coefficient zig-zag ordering */
 static const ogg_uint32_t dezigzag_index[64] = {
   0,  1,  8,  16,  9,  2,  3, 10,
   17, 24, 32, 25, 18, 11,  4,  5,
@@ -36,3 +50,4 @@
   58, 59, 52, 45, 38, 31, 39, 46,
   53, 60, 61, 54, 47, 55, 62, 63
 };
+#endif

Added: branches/theora-playtime/lib/x86_32_vs/idct_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/idct_sse2.c	2006-06-04 11:16:48 UTC (rev 11515)
+++ branches/theora-playtime/lib/x86_32_vs/idct_sse2.c	2006-06-04 13:29:50 UTC (rev 11516)
@@ -0,0 +1,721 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function: SSE2 implementation of the Theora iDCT
+  last mod: $Id: idct_sse2.c 11513 2006-06-04 09:46:34Z illiminable $
+
+ ********************************************************************/
+
+#include <string.h>
+#include "codec_internal.h"
+#include "quant_lookup.h"
+
+#define IdctAdjustBeforeShift 8
+
+/* cos(n*pi/16), equivalently sin((8-n)*pi/16), scaled by 2^16 */
+#define xC1S7 64277
+#define xC2S6 60547
+#define xC3S5 54491
+#define xC4S4 46341
+#define xC5S3 36410
+#define xC6S2 25080
+#define xC7S1 12785
+/* Dequantise a full 8x8 block: DCT_block[dezigzag_index[i]] = quantized_list[i] * dequant_coeffs[i] for i = 0..63. */
+/* dequant_coeffs is loaded with movdqa and so must be 16-byte aligned; quantized_list is loaded with movdqu. */
+/* NOTE(review): temp_block is a function-level static shared by every call, so concurrent calls would race. */
+static void dequant_slow__sse2( ogg_int16_t * dequant_coeffs,
+                   ogg_int16_t * quantized_list,
+                   ogg_int32_t * DCT_block) 
+{
+#if 0
+  int i;
+  for(i=0;i<64;i++)
+    DCT_block[dezigzag_index[i]] = quantized_list[i] * dequant_coeffs[i];
+#else
+    /* The disabled branch above is the scalar reference for the assembly below. */
+    static __declspec(align(16)) ogg_int32_t temp_block[64];
+    static ogg_int32_t* temp_block_ptr = temp_block;
+    static ogg_int32_t* zigzag_ptr = dezigzag_index;
+    /* NOTE(review): zigzag_ptr drops the const and the unsigned type of dezigzag_index (static const ogg_uint32_t). */
+    /* quantized_list is not assumed to be 16-byte aligned, so it is loaded with movdqu */
+
+
+    __asm {
+        align       16
+
+        mov     edi, DCT_block          /* int32 */
+        mov     edx, zigzag_ptr          /* int32 */
+        mov     esi, quantized_list     /* int16 */
+        mov     ebx, dequant_coeffs     /* int16 */
+        mov     eax, temp_block_ptr
+
+
+        /*
+                The four blocks below each handle 16 coefficients and are identical
+                except for the offsets of the reads at the start and the writes at the end
+         */
+
+        /* 16 Iterations at a time  */
+            /* Read 16x16 bits of quantized_list and dequant_coeffs */
+            movdqu      xmm1, [esi]
+            movdqu      xmm5, [esi + 16]
+
+            movdqa      xmm2, [ebx]
+            movdqa      xmm6, [ebx + 16]
+
+            /* Make a copy of xmm1 and xmm5: pmullw overwrites its destination */
+            movdqa      xmm7, xmm1
+            movdqa      xmm0, xmm5
+
+            /* Multiply: pmullw keeps the low 16 bits of each signed product, pmulhw the high 16 bits */
+            pmullw      xmm1, xmm2
+            pmulhw      xmm2, xmm7
+
+            pmullw      xmm5, xmm6
+            pmulhw      xmm6, xmm0
+
+            /* Interleave the multiplication results (low word, high word) into 32-bit products */
+            movdqa      xmm0, xmm1
+            punpcklwd   xmm1, xmm2      /* Now the low 4 x 32 bits */
+            punpckhwd   xmm0, xmm2      /* The high 4x32 bits */
+
+            movdqa      xmm2, xmm5
+            punpcklwd   xmm5, xmm6
+            punpckhwd   xmm2, xmm6
+
+            /* Write the 16x32 bits of output to temp space */
+            movdqa      [eax], xmm1
+            movdqa      [eax + 16], xmm0
+            movdqa      [eax + 32], xmm5
+            movdqa      [eax + 48], xmm2
+
+
+
+        /* 16 Iterations at a time  */
+            /* Read 16x16 bits of quantized_list and dequant_coeffs */
+            movdqu      xmm1, [esi + 32]
+            movdqu      xmm5, [esi + 48]
+
+            movdqa      xmm2, [ebx + 32]
+            movdqa      xmm6, [ebx + 48]
+
+            /* Make a copy of xmm1 and xmm5 */
+            movdqa      xmm7, xmm1
+            movdqa      xmm0, xmm5
+
+            /* Multiply */
+            pmullw      xmm1, xmm2
+            pmulhw      xmm2, xmm7
+
+            pmullw      xmm5, xmm6
+            pmulhw      xmm6, xmm0
+
+            /* Interleave the multiplication results */
+            movdqa      xmm0, xmm1
+            punpcklwd   xmm1, xmm2      /* Now the low 4 x 32 bits */
+            punpckhwd   xmm0, xmm2      /* The high 4x32 bits */
+
+            movdqa      xmm2, xmm5
+            punpcklwd   xmm5, xmm6
+            punpckhwd   xmm2, xmm6
+
+            /* Write the 16x32 bits of output to temp space */
+            movdqa      [eax + 64], xmm1
+            movdqa      [eax + 80], xmm0
+            movdqa      [eax + 96], xmm5
+            movdqa      [eax + 112], xmm2
+
+        /* 16 Iterations at a time  */
+            /* Read 16x16 bits of quantized_list and dequant_coeffs */
+            movdqu      xmm1, [esi + 64]
+            movdqu      xmm5, [esi + 80]
+
+            movdqa      xmm2, [ebx + 64]
+            movdqa      xmm6, [ebx + 80]
+
+            /* Make a copy of xmm1 and xmm5 */
+            movdqa      xmm7, xmm1
+            movdqa      xmm0, xmm5
+
+            /* Multiply */
+            pmullw      xmm1, xmm2
+            pmulhw      xmm2, xmm7
+
+            pmullw      xmm5, xmm6
+            pmulhw      xmm6, xmm0
+
+            /* Interleave the multiplication results */
+            movdqa      xmm0, xmm1
+            punpcklwd   xmm1, xmm2      /* Now the low 4 x 32 bits */
+            punpckhwd   xmm0, xmm2      /* The high 4x32 bits */
+
+            movdqa      xmm2, xmm5
+            punpcklwd   xmm5, xmm6
+            punpckhwd   xmm2, xmm6
+
+            /* Write the 16x32 bits of output to temp space */
+            movdqa      [eax + 128], xmm1
+            movdqa      [eax + 144], xmm0
+            movdqa      [eax + 160], xmm5
+            movdqa      [eax + 176], xmm2
+
+
+
+        /* 16 Iterations at a time  */
+            /* Read 16x16 bits of quantized_list and dequant_coeffs */
+            movdqu      xmm1, [esi + 96]
+            movdqu      xmm5, [esi + 112]
+
+            movdqa      xmm2, [ebx + 96]
+            movdqa      xmm6, [ebx + 112]
+
+            /* Make a copy of xmm1 and xmm5 */
+            movdqa      xmm7, xmm1
+            movdqa      xmm0, xmm5
+
+            /* Multiply */
+            pmullw      xmm1, xmm2
+            pmulhw      xmm2, xmm7
+
+            pmullw      xmm5, xmm6
+            pmulhw      xmm6, xmm0
+
+            /* Interleave the multiplication results */
+            movdqa      xmm0, xmm1
+            punpcklwd   xmm1, xmm2      /* Now the low 4 x 32 bits */
+            punpckhwd   xmm0, xmm2      /* The high 4x32 bits */
+
+            movdqa      xmm2, xmm5
+            punpcklwd   xmm5, xmm6
+            punpckhwd   xmm2, xmm6
+
+            /* Write the 16x32 bits of output to temp space */
+            movdqa      [eax + 192], xmm1
+            movdqa      [eax + 208], xmm0
+            movdqa      [eax + 224], xmm5
+            movdqa      [eax + 240], xmm2
+
+        /* Scatter temp_block into DCT_block in de-zigzag order; done with scalar moves, 16 entries per pass */
+        mov         ebx, 4              /* 4 passes x 16 coefficients = 64; ebx and esi are reused as scratch from here */
+    loop_start:
+        mov         ecx         , [edx]         /* ecx = destination index from the de-zigzag table */
+        mov         esi         , [eax]         /* esi = dequantised 32-bit value */
+        mov         [edi + ecx*4] , esi         /* DCT_block[ecx] = value */
+        mov         ecx         , [edx + 4]
+        mov         esi         , [eax + 4]
+        mov         [edi + ecx*4] , esi
+        mov         ecx         , [edx + 8]
+        mov         esi         , [eax + 8]
+        mov         [edi + ecx*4] , esi
+        mov         ecx         , [edx + 12]
+        mov         esi         , [eax + 12]
+        mov         [edi + ecx*4] , esi
+
+        mov         ecx         , [edx + 16]
+        mov         esi         , [eax + 16]
+        mov         [edi + ecx*4] , esi
+        mov         ecx         , [edx + 20]
+        mov         esi         , [eax + 20]
+        mov         [edi + ecx*4] , esi
+        mov         ecx         , [edx + 24]
+        mov         esi         , [eax + 24]
+        mov         [edi + ecx*4] , esi
+        mov         ecx         , [edx + 28]
+        mov         esi         , [eax + 28]
+        mov         [edi + ecx*4] , esi
+
+        mov         ecx         , [edx + 32]
+        mov         esi         , [eax + 32]
+        mov         [edi + ecx*4] , esi
+        mov         ecx         , [edx + 36]
+        mov         esi         , [eax + 36]
+        mov         [edi + ecx*4] , esi
+        mov         ecx         , [edx + 40]
+        mov         esi         , [eax + 40]
+        mov         [edi + ecx*4] , esi
+        mov         ecx         , [edx + 44]
+        mov         esi         , [eax + 44]
+        mov         [edi + ecx*4] , esi
+
+        mov         ecx         , [edx + 48]
+        mov         esi         , [eax + 48]
+        mov         [edi + ecx*4] , esi
+        mov         ecx         , [edx + 52]
+        mov         esi         , [eax + 52]
+        mov         [edi + ecx*4] , esi
+        mov         ecx         , [edx + 56]
+        mov         esi         , [eax + 56]
+        mov         [edi + ecx*4] , esi
+        mov         ecx         , [edx + 60]
+        mov         esi         , [eax + 60]
+        mov         [edi + ecx*4] , esi
+
+        add         eax, 64             /* advance to the next 16 dequantised values */
+        add         edx, 64             /* advance to the next 16 table entries */
+
+        sub         ebx, 1
+        jnz         loop_start
+
+
+    };
+#endif
+}
+/* Full 8x8 inverse DCT: dequantises InputData with QuantMatrix, then runs a 1-D pass over rows and one over columns. */
+/* The column pass adds IdctAdjustBeforeShift and shifts right by 4 before storing 16-bit results to OutputData. */
+/* Only the dequantisation step (dequant_slow__sse2) uses SSE2 here; the butterflies below are plain C. */
+void IDctSlow__sse2(  Q_LIST_ENTRY * InputData,
+                ogg_int16_t *QuantMatrix,
+                ogg_int16_t * OutputData ) {
+  __declspec(align(16)) ogg_int32_t IntermediateData[64];
+  ogg_int32_t * ip = IntermediateData;
+  ogg_int16_t * op = OutputData;
+
+  ogg_int32_t _A, _B, _C, _D, _Ad, _Bd, _Cd, _Dd, _E, _F, _G, _H;
+  ogg_int32_t _Ed, _Gd, _Add, _Bdd, _Fd, _Hd;
+  ogg_int32_t t1, t2;
+
+  int loop;
+  /* De-zigzag and dequantise all 64 coefficients into the 32-bit work block */
+  dequant_slow__sse2( QuantMatrix, InputData, IntermediateData);
+
+  /* Inverse DCT on the rows now; results are written back into IntermediateData */
+  for ( loop = 0; loop < 8; loop++){
+    /* Check for non-zero values: an all-zero row is left untouched */
+    if ( ip[0] | ip[1] | ip[2] | ip[3] | ip[4] | ip[5] | ip[6] | ip[7] ) {
+      t1 = (xC1S7 * ip[1]);
+      t2 = (xC7S1 * ip[7]);
+      t1 >>= 16;
+      t2 >>= 16;
+      _A = t1 + t2;
+
+      t1 = (xC7S1 * ip[1]);
+      t2 = (xC1S7 * ip[7]);
+      t1 >>= 16;
+      t2 >>= 16;
+      _B = t1 - t2;
+
+      t1 = (xC3S5 * ip[3]);
+      t2 = (xC5S3 * ip[5]);
+      t1 >>= 16;
+      t2 >>= 16;
+      _C = t1 + t2;
+
+      t1 = (xC3S5 * ip[5]);
+      t2 = (xC5S3 * ip[3]);
+      t1 >>= 16;
+      t2 >>= 16;
+      _D = t1 - t2;
+      /* The (ogg_int16_t) casts truncate the operand to 16 bits before the xC4S4 multiply */
+      t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
+      t1 >>= 16;
+      _Ad = t1;
+
+      t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
+      t1 >>= 16;
+      _Bd = t1;
+
+
+      _Cd = _A + _C;
+      _Dd = _B + _D;
+
+      t1 = (xC4S4 * (ogg_int16_t)(ip[0] + ip[4]));
+      t1 >>= 16;
+      _E = t1;
+
+      t1 = (xC4S4 * (ogg_int16_t)(ip[0] - ip[4]));
+      t1 >>= 16;
+      _F = t1;
+
+      t1 = (xC2S6 * ip[2]);
+      t2 = (xC6S2 * ip[6]);
+      t1 >>= 16;
+      t2 >>= 16;
+      _G = t1 + t2;
+
+      t1 = (xC6S2 * ip[2]);
+      t2 = (xC2S6 * ip[6]);
+      t1 >>= 16;
+      t2 >>= 16;
+      _H = t1 - t2;
+
+
+      _Ed = _E - _G;
+      _Gd = _E + _G;
+
+      _Add = _F + _Ad;
+      _Bdd = _Bd - _H;
+
+      _Fd = _F - _Ad;
+      _Hd = _Bd + _H;
+
+      /* Final sequence of operations over-write original inputs. */
+      ip[0] = (ogg_int16_t)((_Gd + _Cd )   >> 0);
+      ip[7] = (ogg_int16_t)((_Gd - _Cd )   >> 0);
+
+      ip[1] = (ogg_int16_t)((_Add + _Hd )  >> 0);
+      ip[2] = (ogg_int16_t)((_Add - _Hd )  >> 0);
+
+      ip[3] = (ogg_int16_t)((_Ed + _Dd )   >> 0);
+      ip[4] = (ogg_int16_t)((_Ed - _Dd )   >> 0);
+
+      ip[5] = (ogg_int16_t)((_Fd + _Bdd )  >> 0);
+      ip[6] = (ogg_int16_t)((_Fd - _Bdd )  >> 0);
+
+    }
+
+    ip += 8;                    /* next row */
+  }
+  /* Second pass: walk the columns of the row-transformed block */
+  ip = IntermediateData;
+
+  for ( loop = 0; loop < 8; loop++){
+    /* Check for non-zero values (bitwise or faster than ||) */
+    if ( ip[0 * 8] | ip[1 * 8] | ip[2 * 8] | ip[3 * 8] |
+         ip[4 * 8] | ip[5 * 8] | ip[6 * 8] | ip[7 * 8] ) {
+
+      t1 = (xC1S7 * ip[1*8]);
+      t2 = (xC7S1 * ip[7*8]);
+      t1 >>= 16;
+      t2 >>= 16;
+      _A = t1 + t2;
+
+      t1 = (xC7S1 * ip[1*8]);
+      t2 = (xC1S7 * ip[7*8]);
+      t1 >>= 16;
+      t2 >>= 16;
+      _B = t1 - t2;
+
+      t1 = (xC3S5 * ip[3*8]);
+      t2 = (xC5S3 * ip[5*8]);
+      t1 >>= 16;
+      t2 >>= 16;
+      _C = t1 + t2;
+
+      t1 = (xC3S5 * ip[5*8]);
+      t2 = (xC5S3 * ip[3*8]);
+      t1 >>= 16;
+      t2 >>= 16;
+      _D = t1 - t2;
+
+      t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
+      t1 >>= 16;
+      _Ad = t1;
+
+      t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
+      t1 >>= 16;
+      _Bd = t1;
+
+
+      _Cd = _A + _C;
+      _Dd = _B + _D;
+
+      t1 = (xC4S4 * (ogg_int16_t)(ip[0*8] + ip[4*8]));
+      t1 >>= 16;
+      _E = t1;
+
+      t1 = (xC4S4 * (ogg_int16_t)(ip[0*8] - ip[4*8]));
+      t1 >>= 16;
+      _F = t1;
+
+      t1 = (xC2S6 * ip[2*8]);
+      t2 = (xC6S2 * ip[6*8]);
+      t1 >>= 16;
+      t2 >>= 16;
+      _G = t1 + t2;
+
+      t1 = (xC6S2 * ip[2*8]);
+      t2 = (xC2S6 * ip[6*8]);
+      t1 >>= 16;
+      t2 >>= 16;
+      _H = t1 - t2;
+
+      _Ed = _E - _G;
+      _Gd = _E + _G;
+
+      _Add = _F + _Ad;
+      _Bdd = _Bd - _H;
+
+      _Fd = _F - _Ad;
+      _Hd = _Bd + _H;
+      /* Rounding bias: add 8 (half of 1<<4) to each term that feeds the >>4 stores below */
+      _Gd += IdctAdjustBeforeShift;
+      _Add += IdctAdjustBeforeShift;
+      _Ed += IdctAdjustBeforeShift;
+      _Fd += IdctAdjustBeforeShift;
+
+      /* Final sequence of operations writes the 16-bit results to the output column. */
+      op[0*8] = (ogg_int16_t)((_Gd + _Cd )   >> 4);
+      op[7*8] = (ogg_int16_t)((_Gd - _Cd )   >> 4);
+
+      op[1*8] = (ogg_int16_t)((_Add + _Hd )  >> 4);
+      op[2*8] = (ogg_int16_t)((_Add - _Hd )  >> 4);
+
+      op[3*8] = (ogg_int16_t)((_Ed + _Dd )   >> 4);
+      op[4*8] = (ogg_int16_t)((_Ed - _Dd )   >> 4);
+
+      op[5*8] = (ogg_int16_t)((_Fd + _Bdd )  >> 4);
+      op[6*8] = (ogg_int16_t)((_Fd - _Bdd )  >> 4);
+    }else{                      /* all-zero column: the output column is zero too */
+      op[0*8] = 0;
+      op[7*8] = 0;
+      op[1*8] = 0;
+      op[2*8] = 0;
+      op[3*8] = 0;
+      op[4*8] = 0;
+      op[5*8] = 0;
+      op[6*8] = 0;
+    }
+
+    ip++;                       /* next column */
+    op++;
+  }
+}
+
+/************************
+  x  x  x  x  0  0  0  0
+  x  x  x  0  0  0  0  0
+  x  x  0  0  0  0  0  0
+  x  0  0  0  0  0  0  0
+  0  0  0  0  0  0  0  0
+  0  0  0  0  0  0  0  0
+  0  0  0  0  0  0  0  0
+  0  0  0  0  0  0  0  0
+*************************/
+/* Dequantise only the first 10 zig-zag coefficients (the pattern drawn above) into DCT_block. */
+static void dequant_slow10__sse2( ogg_int16_t * dequant_coeffs,
+                     ogg_int16_t * quantized_list,
+                     ogg_int32_t * DCT_block){
+  int pos;
+  memset(DCT_block,0, 128);  /* clears just the first 128 bytes; IDct10__sse2 only reads indices 0..31 */
+  for(pos=0;pos!=10;++pos){
+    DCT_block[dezigzag_index[pos]] = quantized_list[pos] * dequant_coeffs[pos];
+  }
+}
+/* Reduced 8x8 inverse DCT used with dequant_slow10__sse2: only rows 0-3 / columns 0-3 of the input are read. */
+void IDct10__sse2( Q_LIST_ENTRY * InputData,
+             ogg_int16_t *QuantMatrix,
+             ogg_int16_t * OutputData ){
+  ogg_int32_t IntermediateData[64];
+  ogg_int32_t * ip = IntermediateData;
+  ogg_int16_t * op = OutputData;
+
+  ogg_int32_t _A, _B, _C, _D, _Ad, _Bd, _Cd, _Dd, _E, _F, _G, _H;
+  ogg_int32_t _Ed, _Gd, _Add, _Bdd, _Fd, _Hd;
+  ogg_int32_t t1, t2;
+
+  int loop;
+  /* dequant_slow10__sse2 zeroes the start of the block and fills in at most 10 coefficients */
+  dequant_slow10__sse2( QuantMatrix, InputData, IntermediateData);
+
+  /* Inverse DCT on the rows now: only the first four rows are processed */
+  for ( loop = 0; loop < 4; loop++){
+    /* Check for non-zero values (only columns 0-3 are inspected) */
+    if ( ip[0] | ip[1] | ip[2] | ip[3] ){
+      t1 = (xC1S7 * ip[1]);
+      t1 >>= 16;
+      _A = t1;
+
+      t1 = (xC7S1 * ip[1]);
+      t1 >>= 16;
+      _B = t1 ;
+
+      t1 = (xC3S5 * ip[3]);
+      t1 >>= 16;
+      _C = t1;
+
+      t2 = (xC5S3 * ip[3]);
+      t2 >>= 16;
+      _D = -t2;
+
+      /* The (ogg_int16_t) casts truncate the operand to 16 bits before the xC4S4 multiply */
+      t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
+      t1 >>= 16;
+      _Ad = t1;
+
+      t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
+      t1 >>= 16;
+      _Bd = t1;
+
+
+      _Cd = _A + _C;
+      _Dd = _B + _D;
+
+      t1 = (xC4S4 * ip[0] );
+      t1 >>= 16;
+      _E = t1;
+      /* The ip[4]..ip[7] terms of IDctSlow__sse2 are dropped in this variant, so _F equals _E */
+      _F = t1;
+
+      t1 = (xC2S6 * ip[2]);
+      t1 >>= 16;
+      _G = t1;
+
+      t1 = (xC6S2 * ip[2]);
+      t1 >>= 16;
+      _H = t1 ;
+
+
+      _Ed = _E - _G;
+      _Gd = _E + _G;
+
+      _Add = _F + _Ad;
+      _Bdd = _Bd - _H;
+
+      _Fd = _F - _Ad;
+      _Hd = _Bd + _H;
+
+      /* Final sequence of operations over-write original inputs. */
+      ip[0] = (ogg_int16_t)((_Gd + _Cd )   >> 0);
+      ip[7] = (ogg_int16_t)((_Gd - _Cd )   >> 0);
+
+      ip[1] = (ogg_int16_t)((_Add + _Hd )  >> 0);
+      ip[2] = (ogg_int16_t)((_Add - _Hd )  >> 0);
+
+      ip[3] = (ogg_int16_t)((_Ed + _Dd )   >> 0);
+      ip[4] = (ogg_int16_t)((_Ed - _Dd )   >> 0);
+
+      ip[5] = (ogg_int16_t)((_Fd + _Bdd )  >> 0);
+      ip[6] = (ogg_int16_t)((_Fd - _Bdd )  >> 0);
+
+    }
+
+    ip += 8;                    /* next row */
+  }
+  /* Second pass: all eight columns, each reading only its first four rows */
+  ip = IntermediateData;
+
+  for ( loop = 0; loop < 8; loop++) {
+    /* Check for non-zero values (bitwise or faster than ||) */
+    if ( ip[0 * 8] | ip[1 * 8] | ip[2 * 8] | ip[3 * 8] ) {
+
+      t1 = (xC1S7 * ip[1*8]);
+      t1 >>= 16;
+      _A = t1 ;
+
+      t1 = (xC7S1 * ip[1*8]);
+      t1 >>= 16;
+      _B = t1 ;
+
+      t1 = (xC3S5 * ip[3*8]);
+      t1 >>= 16;
+      _C = t1 ;
+
+      t2 = (xC5S3 * ip[3*8]);
+      t2 >>= 16;
+      _D = - t2;
+
+
+      t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
+      t1 >>= 16;
+      _Ad = t1;
+
+      t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
+      t1 >>= 16;
+      _Bd = t1;
+
+
+      _Cd = _A + _C;
+      _Dd = _B + _D;
+
+      t1 = (xC4S4 * ip[0*8]);
+      t1 >>= 16;
+      _E = t1;
+      _F = t1;
+
+      t1 = (xC2S6 * ip[2*8]);
+      t1 >>= 16;
+      _G = t1;
+
+      t1 = (xC6S2 * ip[2*8]);
+      t1 >>= 16;
+      _H = t1;
+
+
+      _Ed = _E - _G;
+      _Gd = _E + _G;
+
+      _Add = _F + _Ad;
+      _Bdd = _Bd - _H;
+
+      _Fd = _F - _Ad;
+      _Hd = _Bd + _H;
+      /* Rounding bias: add 8 (half of 1<<4) to each term that feeds the >>4 stores below */
+      _Gd += IdctAdjustBeforeShift;
+      _Add += IdctAdjustBeforeShift;
+      _Ed += IdctAdjustBeforeShift;
+      _Fd += IdctAdjustBeforeShift;
+
+      /* Final sequence of operations writes the 16-bit results to the output column. */
+      op[0*8] = (ogg_int16_t)((_Gd + _Cd )   >> 4);
+      op[7*8] = (ogg_int16_t)((_Gd - _Cd )   >> 4);
+
+      op[1*8] = (ogg_int16_t)((_Add + _Hd )  >> 4);
+      op[2*8] = (ogg_int16_t)((_Add - _Hd )  >> 4);
+
+      op[3*8] = (ogg_int16_t)((_Ed + _Dd )   >> 4);
+      op[4*8] = (ogg_int16_t)((_Ed - _Dd )   >> 4);
+
+      op[5*8] = (ogg_int16_t)((_Fd + _Bdd )  >> 4);
+      op[6*8] = (ogg_int16_t)((_Fd - _Bdd )  >> 4);
+    }else{                      /* first four rows of this column are zero: emit zeros */
+      op[0*8] = 0;
+      op[7*8] = 0;
+      op[1*8] = 0;
+      op[2*8] = 0;
+      op[3*8] = 0;
+      op[4*8] = 0;
+      op[5*8] = 0;
+      op[6*8] = 0;
+    }
+
+    ip++;                       /* next column */
+    op++;
+  }
+}
+
+/***************************
+  x   0   0  0  0  0  0  0
+  0   0   0  0  0  0  0  0
+  0   0   0  0  0  0  0  0
+  0   0   0  0  0  0  0  0
+  0   0   0  0  0  0  0  0
+  0   0   0  0  0  0  0  0
+  0   0   0  0  0  0  0  0
+  0   0   0  0  0  0  0  0
+**************************/
+/* DC-only inverse DCT: all 64 outputs get (InputData[0]*QuantMatrix[0] + 15) >> 5. */
+void IDct1__sse2( Q_LIST_ENTRY * InputData,
+            ogg_int16_t *QuantMatrix,
+            ogg_int16_t * OutputData ){
+  /* Dequantise the DC coefficient and add the bias that precedes the shift. */
+  ogg_int32_t biased_dc = (ogg_int32_t)(InputData[0]*QuantMatrix[0]+15);
+  /* Scale down by 5 bits and narrow to the 16-bit output type. */
+  ogg_int16_t flat = (ogg_int16_t)(biased_dc>>5);
+  ogg_int16_t *dst = OutputData;
+  ogg_int16_t *end = OutputData + 64;
+  /* Every sample of the 8x8 output block receives the same value. */
+  while(dst != end)
+    *dst++ = flat;
+}
+/* Point the iDCT and dequantisation entries of funcs at the routines defined in this file. */
+/* NOTE(review): idct.c in this revision calls dsp_sse2_idct_init(funcs) with one argument - confirm the prototype. */
+void dsp_sse2_idct_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
+{
+  /* The caller has already tested CPU_X86_SSE2, so the flags are not consulted here. */
+  (void)cpu_flags;
+  /* Same order as the iDCT members declared in dsp.h; each entry is assigned exactly once. */
+  funcs->IDct1 = IDct1__sse2;
+  funcs->dequant_slow10 = dequant_slow10__sse2;
+  funcs->IDct10 = IDct10__sse2;
+  funcs->IDctSlow = IDctSlow__sse2;
+  funcs->dequant_slow = dequant_slow__sse2;
+}
\ No newline at end of file

Modified: branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
===================================================================
--- branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj	2006-06-04 11:16:48 UTC (rev 11515)
+++ branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj	2006-06-04 13:29:50 UTC (rev 11516)
@@ -447,6 +447,14 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\..\lib\x86_32_vs\idct_mmx.c"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\lib\x86_32_vs\idct_sse2.c"
+				>
+			</File>
+			<File
 				RelativePath="..\..\libtheora.def"
 				>
 			</File>