[xiph-commits] r14714 - in trunk/theora: lib lib/dec lib/dec/x86_vc lib/enc win32/VS2005/libtheora
giles at svn.xiph.org
giles at svn.xiph.org
Fri Apr 11 18:04:43 PDT 2008
Author: giles
Date: 2008-04-11 18:04:43 -0700 (Fri, 11 Apr 2008)
New Revision: 14714
Added:
trunk/theora/lib/dec/x86_vc/
trunk/theora/lib/dec/x86_vc/mmxfrag.c
trunk/theora/lib/dec/x86_vc/mmxidct.c
trunk/theora/lib/dec/x86_vc/mmxloopfilter.c
trunk/theora/lib/dec/x86_vc/mmxstate.c
trunk/theora/lib/dec/x86_vc/x86int.h
trunk/theora/lib/dec/x86_vc/x86state.c
Modified:
trunk/theora/lib/cpu.c
trunk/theora/lib/dec/state.c
trunk/theora/lib/enc/dct_decode.c
trunk/theora/lib/enc/encoder_idct.c
trunk/theora/lib/internal.h
trunk/theora/win32/VS2005/libtheora/libtheora.vcproj
Log:
Untested merge of Nils Pipenbrinck's translation of the inline assembly
to MSVC syntax.
Modified: trunk/theora/lib/cpu.c
===================================================================
--- trunk/theora/lib/cpu.c 2008-04-11 23:36:00 UTC (rev 14713)
+++ trunk/theora/lib/cpu.c 2008-04-12 01:04:43 UTC (rev 14714)
@@ -5,7 +5,7 @@
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008 *
* by the Xiph.Org Foundation http://www.xiph.org/ *
* *
********************************************************************
@@ -18,18 +18,42 @@
********************************************************************/
-#include "cpu.h"
+#if !defined(USE_ASM)
+ogg_uint32_t oc_cpu_flags_get(void){
+ return 0;
+}
-ogg_uint32_t oc_cpu_flags_get(void){
- ogg_uint32_t flags = 0;
-#if defined(USE_ASM)
- ogg_uint32_t eax;
- ogg_uint32_t ebx;
- ogg_uint32_t ecx;
- ogg_uint32_t edx;
-#if (defined(__amd64__) || defined(__x86_64__))
-# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+#else /* USE_ASM */
+
+# if defined(_MSC_VER)
+/* Visual C cpuid helper function. For VS2005 we could
+ as well use the __cpuid intrinsic, but that wouldn't work
+ for VS2003 users, so we do it in inline assembler */
+
+static void oc_cpuid_helper (ogg_uint32_t * CpuInfo, ogg_uint32_t op){
+ _asm {
+ mov eax, [op]
+ mov esi, CpuInfo
+ cpuid
+ mov [esi + 0], eax
+ mov [esi + 4], ebx
+ mov [esi + 8], ecx
+ mov [esi +12], edx
+ }
+}
+
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+ { \
+ ogg_uint32_t nfo[4]; \
+ oc_cpuid_helper (nfo, (_op)); \
+ (_eax) = nfo[0],(_ebx) = nfo[1]; \
+ (_ecx) = nfo[2],(_edx) = nfo[3]; \
+ }
+
+# elif (defined(__amd64__) || defined(__x86_64__))
+
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
__asm__ __volatile__( \
"push %%rbx\n\t" \
"cpuid\n\t" \
@@ -42,8 +66,9 @@
:"a" (_op) \
:"cc" \
)
-#else
-# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+# else /* x86_32, GCC */
+
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
__asm__ __volatile__( \
"pushl %%ebx\n\t" \
"cpuid\n\t" \
@@ -56,6 +81,18 @@
:"a" (_op) \
:"cc" \
)
+
+# endif /* arch switch */
+
+ogg_uint32_t oc_cpu_flags_get(void){
+ ogg_uint32_t flags = 0;
+ ogg_uint32_t eax;
+ ogg_uint32_t ebx;
+ ogg_uint32_t ecx;
+ ogg_uint32_t edx;
+
+# if !defined(_MSC_VER) && !defined(__amd64__) && !defined(__x86_64__)
+ /* check for cpuid */
__asm__ __volatile__(
"pushfl\n\t"
"pushfl\n\t"
@@ -74,7 +111,8 @@
);
/*No cpuid.*/
if(eax==ebx)return 0;
-#endif
+# endif /* GCC, x86_32 */
+
cpuid(0,eax,ebx,ecx,edx);
if(ebx==0x756e6547&&edx==0x49656e69&&ecx==0x6c65746e){
/*Intel:*/
@@ -102,8 +140,8 @@
/*Implement me.*/
flags=0;
}
-
-#ifdef DEBUG
+
+# ifdef DEBUG
if (flags) {
TH_DEBUG("vectorized instruction sets supported:");
if (flags & OC_CPU_X86_MMX) TH_DEBUG(" mmx");
@@ -114,9 +152,9 @@
if (flags & OC_CPU_X86_3DNOWEXT) TH_DEBUG(" 3dnowext");
TH_DEBUG("\n");
}
-#endif
-#endif
-
+# endif
+
return flags;
}
+#endif /* USE_ASM */
Modified: trunk/theora/lib/dec/state.c
===================================================================
--- trunk/theora/lib/dec/state.c 2008-04-11 23:36:00 UTC (rev 14713)
+++ trunk/theora/lib/dec/state.c 2008-04-12 01:04:43 UTC (rev 14714)
@@ -20,8 +20,12 @@
#include "../internal.h"
#include "idct.h"
#if defined(USE_ASM)
+#if defined(_MSC_VER)
+# include "x86_vc/x86int.h"
+#else
# include "x86/x86int.h"
#endif
+#endif
#if defined(OC_DUMP_IMAGES)
# include <stdio.h>
# include "png.h"
Added: trunk/theora/lib/dec/x86_vc/mmxfrag.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/mmxfrag.c (rev 0)
+++ trunk/theora/lib/dec/x86_vc/mmxfrag.c 2008-04-12 01:04:43 UTC (rev 14714)
@@ -0,0 +1,430 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id:
+
+ ********************************************************************/
+#include "../../internal.h"
+
+/* ------------------------------------------------------------------------
+ MMX reconstruction fragment routines for Visual Studio.
+ Tested with VS2005. Should compile for VS2003 and VC6 as well.
+
+ Initial implementation 2007 by Nils Pipenbrinck.
+ ---------------------------------------------------------------------*/
+
+#if defined(USE_ASM)
+
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t *_residue){
+ /* ---------------------------------------------------------------------
+ This function does the intra reconstruction step with 8 iterations
+ unrolled. The iteration for each instruction is noted by the #id in the
+ comments (in case you want to reconstruct it)
+ --------------------------------------------------------------------- */
+ _asm{
+ mov edi, [_residue] /* load residue ptr */
+ mov eax, 0x00800080 /* generate constant */
+ mov ebx, [_dst_ystride] /* load dst-stride */
+ mov edx, [_dst] /* load dest pointer */
+
+ /* unrolled loop begins here */
+
+ movd mm0, eax /* load constant */
+ movq mm1, [edi+ 8*0] /* #1 load low residue */
+ movq mm2, [edi+ 8*1] /* #1 load high residue */
+ punpckldq mm0, mm0 /* build constant */
+ movq mm3, [edi+ 8*2] /* #2 load low residue */
+ movq mm4, [edi+ 8*3] /* #2 load high residue */
+ movq mm5, [edi+ 8*4] /* #3 load low residue */
+ movq mm6, [edi+ 8*5] /* #3 load high residue */
+ paddsw mm1, mm0 /* #1 bias low residue */
+ paddsw mm2, mm0 /* #1 bias high residue */
+ packuswb mm1, mm2 /* #1 pack to byte */
+ paddsw mm3, mm0 /* #2 bias low residue */
+ paddsw mm4, mm0 /* #2 bias high residue */
+ packuswb mm3, mm4 /* #2 pack to byte */
+ paddsw mm5, mm0 /* #3 bias low residue */
+ paddsw mm6, mm0 /* #3 bias high residue */
+ packuswb mm5, mm6 /* #3 pack to byte */
+ movq [edx], mm1 /* #1 write row */
+ movq [edx + ebx], mm3 /* #2 write row */
+ movq [edx + ebx*2], mm5 /* #3 write row */
+ movq mm1, [edi+ 8*6] /* #4 load low residue */
+ lea ecx, [ebx + ebx*2] /* make dst_ystride * 3 */
+ movq mm2, [edi+ 8*7] /* #4 load high residue */
+ movq mm3, [edi+ 8*8] /* #5 load low residue */
+ lea esi, [ebx*4 + ebx] /* make dst_ystride * 5 */
+ movq mm4, [edi+ 8*9] /* #5 load high residue */
+ movq mm5, [edi+ 8*10] /* #6 load low residue */
+ lea eax, [ecx*2 + ebx] /* make dst_ystride * 7 */
+ movq mm6, [edi+ 8*11] /* #6 load high residue */
+ paddsw mm1, mm0 /* #4 bias low residue */
+ paddsw mm2, mm0 /* #4 bias high residue */
+ packuswb mm1, mm2 /* #4 pack to byte */
+ paddsw mm3, mm0 /* #5 bias low residue */
+ paddsw mm4, mm0 /* #5 bias high residue */
+ packuswb mm3, mm4 /* #5 pack to byte */
+ paddsw mm5, mm0 /* #6 bias low residue */
+ paddsw mm6, mm0 /* #6 bias high residue */
+ packuswb mm5, mm6 /* #6 pack to byte */
+ movq [edx + ecx], mm1 /* #4 write row */
+ movq [edx + ebx*4], mm3 /* #5 write row */
+ movq [edx + esi], mm5 /* #6 write row */
+ movq mm1, [edi+ 8*12] /* #7 load low residue */
+ movq mm2, [edi+ 8*13] /* #7 load high residue */
+ movq mm3, [edi+ 8*14] /* #8 load low residue */
+ movq mm4, [edi+ 8*15] /* #8 load high residue */
+ paddsw mm1, mm0 /* #7 bias low residue */
+ paddsw mm2, mm0 /* #7 bias high residue */
+ packuswb mm1, mm2 /* #7 pack to byte */
+ paddsw mm3, mm0 /* #8 bias low residue */
+ paddsw mm4, mm0 /* #8 bias high residue */
+ packuswb mm3, mm4 /* #8 pack to byte */
+ movq [edx + ecx*2], mm1 /* #7 write row */
+ movq [edx + eax], mm3 /* #8 write row */
+ }
+}
+
+
+
+void oc_frag_recon_inter_mmx (unsigned char *_dst, int _dst_ystride,
+ const unsigned char *_src, int _src_ystride, const ogg_int16_t *_residue){
+ /* ---------------------------------------------------------------------
+ This function does the inter reconstruction step with two iterations
+ running in parallel to hide some load-latencies and break the dependency
+ chains. The iteration for each instruction is noted by the #id in the
+ comments (in case you want to reconstruct it)
+ --------------------------------------------------------------------- */
+ _asm{
+ pxor mm0, mm0 /* generate constant 0 */
+ mov esi, [_src]
+ mov edi, [_residue]
+ mov eax, [_src_ystride]
+ mov edx, [_dst]
+ mov ebx, [_dst_ystride]
+ mov ecx, 4
+
+ align 16
+
+nextchunk:
+ movq mm3, [esi] /* #1 load source */
+ movq mm1, [edi+0] /* #1 load residue low */
+ movq mm2, [edi+8] /* #1 load residue high */
+ movq mm7, [esi+eax] /* #2 load source */
+ movq mm4, mm3 /* #1 get copy of src */
+ movq mm5, [edi+16] /* #2 load residue low */
+ punpckhbw mm4, mm0 /* #1 expand high source */
+ movq mm6, [edi+24] /* #2 load residue high */
+ punpcklbw mm3, mm0 /* #1 expand low source */
+ paddsw mm4, mm2 /* #1 add residue high */
+ movq mm2, mm7 /* #2 get copy of src */
+ paddsw mm3, mm1 /* #1 add residue low */
+ punpckhbw mm2, mm0 /* #2 expand high source */
+ packuswb mm3, mm4 /* #1 final row pixels */
+ punpcklbw mm7, mm0 /* #2 expand low source */
+ movq [edx], mm3 /* #1 write row */
+ paddsw mm2, mm6 /* #2 add residue high */
+ add edi, 32 /* residue += 4 */
+ paddsw mm7, mm5 /* #2 add residue low */
+ sub ecx, 1 /* update loop counter */
+ packuswb mm7, mm2 /* #2 final row */
+ lea esi, [esi+eax*2] /* src += stride * 2 */
+ movq [edx + ebx], mm7 /* #2 write row */
+ lea edx, [edx+ebx*2] /* dst += stride * 2 */
+ jne nextchunk
+ }
+}
+
+
+void oc_frag_recon_inter2_mmx(unsigned char *_dst, int _dst_ystride,
+ const unsigned char *_src1, int _src1_ystride, const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue){
+ /* ---------------------------------------------------------------------
+ This function does the inter2 reconstruction step. The building of the
+ average is done with a bit-twiddling trick to avoid excessive register
+ copy work during byte to word conversion.
+
+ average = (a & b) + (((a ^ b) & 0xfe) >> 1);
+
+ (shown for a single byte; it's done with 8 of them at a time)
+
+ Slightly faster than the obvious method using add and shift, but not
+ earthshaking improvement either.
+
+ If anyone comes up with a way that produces bit-identical outputs
+ using the pavgb instruction let me know and I'll do the 3dnow codepath.
+ --------------------------------------------------------------------- */
+ _asm{
+ mov eax, 0xfefefefe
+ mov esi, [_src1]
+ mov edi, [_src2]
+ movd mm1, eax
+ mov ebx, [_residue]
+ mov edx, [_dst]
+ mov eax, [_dst_ystride]
+ punpckldq mm1, mm1 /* replicate lsb32 */
+ mov ecx, 8 /* init loop counter */
+ pxor mm0, mm0 /* constant zero */
+ sub edx, eax /* dst -= dst_stride */
+
+ align 16
+
+nextrow:
+ movq mm2, [esi] /* load source1 */
+ movq mm3, [edi] /* load source2 */
+ movq mm5, [ebx + 0] /* load lower residue */
+ movq mm6, [ebx + 8] /* load higher residue */
+ add esi, _src1_ystride /* src1 += src1_stride */
+ add edi, _src2_ystride /* src2 += src2_stride */
+ movq mm4, mm2 /* get copy of source1 */
+ pand mm2, mm3 /* s1 & s2 (avg part) */
+ pxor mm3, mm4 /* s1 ^ s2 (avg part) */
+ add ebx, 16 /* residue++ */
+ pand mm3, mm1 /* mask out low bits */
+ psrlq mm3, 1 /* shift xor avg-part */
+ paddd mm3, mm2 /* build final average */
+ add edx, eax /* dst += dst_stride */
+ movq mm2, mm3 /* get copy of average */
+ punpckhbw mm3, mm0 /* average high */
+ punpcklbw mm2, mm0 /* average low */
+ paddsw mm3, mm6 /* high + residue */
+ paddsw mm2, mm5 /* low + residue */
+ sub ecx, 1 /* update loop counter */
+ packuswb mm2, mm3 /* pack and saturate */
+ movq [edx], mm2 /* write row */
+ jne nextrow
+ }
+}
+
+void oc_restore_fpu_mmx(void){
+ _asm { emms }
+}
+
+#endif
+
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id:
+
+ ********************************************************************/
+#include "../../internal.h"
+
+/* ------------------------------------------------------------------------
+ MMX reconstruction fragment routines for Visual Studio.
+ Tested with VS2005. Should compile for VS2003 and VC6 as well.
+
+ Initial implementation 2007 by Nils Pipenbrinck.
+ ---------------------------------------------------------------------*/
+
+#if defined(USE_ASM)
+
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t *_residue){
+ /* ---------------------------------------------------------------------
+ This function does the intra reconstruction step with 8 iterations
+ unrolled. The iteration for each instruction is noted by the #id in the
+ comments (in case you want to reconstruct it)
+ --------------------------------------------------------------------- */
+ _asm{
+ mov edi, [_residue] /* load residue ptr */
+ mov eax, 0x00800080 /* generate constant */
+ mov ebx, [_dst_ystride] /* load dst-stride */
+ mov edx, [_dst] /* load dest pointer */
+
+ /* unrolled loop begins here */
+
+ movd mm0, eax /* load constant */
+ movq mm1, [edi+ 8*0] /* #1 load low residue */
+ movq mm2, [edi+ 8*1] /* #1 load high residue */
+ punpckldq mm0, mm0 /* build constant */
+ movq mm3, [edi+ 8*2] /* #2 load low residue */
+ movq mm4, [edi+ 8*3] /* #2 load high residue */
+ movq mm5, [edi+ 8*4] /* #3 load low residue */
+ movq mm6, [edi+ 8*5] /* #3 load high residue */
+ paddsw mm1, mm0 /* #1 bias low residue */
+ paddsw mm2, mm0 /* #1 bias high residue */
+ packuswb mm1, mm2 /* #1 pack to byte */
+ paddsw mm3, mm0 /* #2 bias low residue */
+ paddsw mm4, mm0 /* #2 bias high residue */
+ packuswb mm3, mm4 /* #2 pack to byte */
+ paddsw mm5, mm0 /* #3 bias low residue */
+ paddsw mm6, mm0 /* #3 bias high residue */
+ packuswb mm5, mm6 /* #3 pack to byte */
+ movq [edx], mm1 /* #1 write row */
+ movq [edx + ebx], mm3 /* #2 write row */
+ movq [edx + ebx*2], mm5 /* #3 write row */
+ movq mm1, [edi+ 8*6] /* #4 load low residue */
+ lea ecx, [ebx + ebx*2] /* make dst_ystride * 3 */
+ movq mm2, [edi+ 8*7] /* #4 load high residue */
+ movq mm3, [edi+ 8*8] /* #5 load low residue */
+ lea esi, [ebx*4 + ebx] /* make dst_ystride * 5 */
+ movq mm4, [edi+ 8*9] /* #5 load high residue */
+ movq mm5, [edi+ 8*10] /* #6 load low residue */
+ lea eax, [ecx*2 + ebx] /* make dst_ystride * 7 */
+ movq mm6, [edi+ 8*11] /* #6 load high residue */
+ paddsw mm1, mm0 /* #4 bias low residue */
+ paddsw mm2, mm0 /* #4 bias high residue */
+ packuswb mm1, mm2 /* #4 pack to byte */
+ paddsw mm3, mm0 /* #5 bias low residue */
+ paddsw mm4, mm0 /* #5 bias high residue */
+ packuswb mm3, mm4 /* #5 pack to byte */
+ paddsw mm5, mm0 /* #6 bias low residue */
+ paddsw mm6, mm0 /* #6 bias high residue */
+ packuswb mm5, mm6 /* #6 pack to byte */
+ movq [edx + ecx], mm1 /* #4 write row */
+ movq [edx + ebx*4], mm3 /* #5 write row */
+ movq [edx + esi], mm5 /* #6 write row */
+ movq mm1, [edi+ 8*12] /* #7 load low residue */
+ movq mm2, [edi+ 8*13] /* #7 load high residue */
+ movq mm3, [edi+ 8*14] /* #8 load low residue */
+ movq mm4, [edi+ 8*15] /* #8 load high residue */
+ paddsw mm1, mm0 /* #7 bias low residue */
+ paddsw mm2, mm0 /* #7 bias high residue */
+ packuswb mm1, mm2 /* #7 pack to byte */
+ paddsw mm3, mm0 /* #8 bias low residue */
+ paddsw mm4, mm0 /* #8 bias high residue */
+ packuswb mm3, mm4 /* #8 pack to byte */
+ movq [edx + ecx*2], mm1 /* #7 write row */
+ movq [edx + eax], mm3 /* #8 write row */
+ }
+}
+
+
+
+void oc_frag_recon_inter_mmx (unsigned char *_dst, int _dst_ystride,
+ const unsigned char *_src, int _src_ystride, const ogg_int16_t *_residue){
+ /* ---------------------------------------------------------------------
+ This function does the inter reconstruction step with two iterations
+ running in parallel to hide some load-latencies and break the dependency
+ chains. The iteration for each instruction is noted by the #id in the
+ comments (in case you want to reconstruct it)
+ --------------------------------------------------------------------- */
+ _asm{
+ pxor mm0, mm0 /* generate constant 0 */
+ mov esi, [_src]
+ mov edi, [_residue]
+ mov eax, [_src_ystride]
+ mov edx, [_dst]
+ mov ebx, [_dst_ystride]
+ mov ecx, 4
+
+ align 16
+
+nextchunk:
+ movq mm3, [esi] /* #1 load source */
+ movq mm1, [edi+0] /* #1 load residue low */
+ movq mm2, [edi+8] /* #1 load residue high */
+ movq mm7, [esi+eax] /* #2 load source */
+ movq mm4, mm3 /* #1 get copy of src */
+ movq mm5, [edi+16] /* #2 load residue low */
+ punpckhbw mm4, mm0 /* #1 expand high source */
+ movq mm6, [edi+24] /* #2 load residue high */
+ punpcklbw mm3, mm0 /* #1 expand low source */
+ paddsw mm4, mm2 /* #1 add residue high */
+ movq mm2, mm7 /* #2 get copy of src */
+ paddsw mm3, mm1 /* #1 add residue low */
+ punpckhbw mm2, mm0 /* #2 expand high source */
+ packuswb mm3, mm4 /* #1 final row pixels */
+ punpcklbw mm7, mm0 /* #2 expand low source */
+ movq [edx], mm3 /* #1 write row */
+ paddsw mm2, mm6 /* #2 add residue high */
+ add edi, 32 /* residue += 4 */
+ paddsw mm7, mm5 /* #2 add residue low */
+ sub ecx, 1 /* update loop counter */
+ packuswb mm7, mm2 /* #2 final row */
+ lea esi, [esi+eax*2] /* src += stride * 2 */
+ movq [edx + ebx], mm7 /* #2 write row */
+ lea edx, [edx+ebx*2] /* dst += stride * 2 */
+ jne nextchunk
+ }
+}
+
+
+void oc_frag_recon_inter2_mmx(unsigned char *_dst, int _dst_ystride,
+ const unsigned char *_src1, int _src1_ystride, const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue){
+ /* ---------------------------------------------------------------------
+ This function does the inter2 reconstruction step. The building of the
+ average is done with a bit-twiddling trick to avoid excessive register
+ copy work during byte to word conversion.
+
+ average = (a & b) + (((a ^ b) & 0xfe) >> 1);
+
+ (shown for a single byte; it's done with 8 of them at a time)
+
+ Slightly faster than the obvious method using add and shift, but not
+ earthshaking improvement either.
+
+ If anyone comes up with a way that produces bit-identical outputs
+ using the pavgb instruction let me know and I'll do the 3dnow codepath.
+ --------------------------------------------------------------------- */
+ _asm{
+ mov eax, 0xfefefefe
+ mov esi, [_src1]
+ mov edi, [_src2]
+ movd mm1, eax
+ mov ebx, [_residue]
+ mov edx, [_dst]
+ mov eax, [_dst_ystride]
+ punpckldq mm1, mm1 /* replicate lsb32 */
+ mov ecx, 8 /* init loop counter */
+ pxor mm0, mm0 /* constant zero */
+ sub edx, eax /* dst -= dst_stride */
+
+ align 16
+
+nextrow:
+ movq mm2, [esi] /* load source1 */
+ movq mm3, [edi] /* load source2 */
+ movq mm5, [ebx + 0] /* load lower residue */
+ movq mm6, [ebx + 8] /* load higher residue */
+ add esi, _src1_ystride /* src1 += src1_stride */
+ add edi, _src2_ystride /* src2 += src2_stride */
+ movq mm4, mm2 /* get copy of source1 */
+ pand mm2, mm3 /* s1 & s2 (avg part) */
+ pxor mm3, mm4 /* s1 ^ s2 (avg part) */
+ add ebx, 16 /* residue++ */
+ pand mm3, mm1 /* mask out low bits */
+ psrlq mm3, 1 /* shift xor avg-part */
+ paddd mm3, mm2 /* build final average */
+ add edx, eax /* dst += dst_stride */
+ movq mm2, mm3 /* get copy of average */
+ punpckhbw mm3, mm0 /* average high */
+ punpcklbw mm2, mm0 /* average low */
+ paddsw mm3, mm6 /* high + residue */
+ paddsw mm2, mm5 /* low + residue */
+ sub ecx, 1 /* update loop counter */
+ packuswb mm2, mm3 /* pack and saturate */
+ movq [edx], mm2 /* write row */
+ jne nextrow
+ }
+}
+
+void oc_restore_fpu_mmx(void){
+ _asm { emms }
+}
+
+#endif
+
Property changes on: trunk/theora/lib/dec/x86_vc/mmxfrag.c
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Added: trunk/theora/lib/dec/x86_vc/mmxidct.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/mmxidct.c (rev 0)
+++ trunk/theora/lib/dec/x86_vc/mmxidct.c 2008-04-12 01:04:43 UTC (rev 14714)
@@ -0,0 +1,2014 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id:
+
+ ********************************************************************/
+
+/* -------------------------------------------------------------------
+ MMX based IDCT for the theora codec.
+
+ Originally written by Rudolf Marek, based on code from On2's VP3.
+ Converted to Visual Studio inline assembly by Nils Pipenbrinck.
+
+ ---------------------------------------------------------------------*/
+#if defined(USE_ASM)
+
+#include <ogg/ogg.h>
+#include "../dct.h"
+#include "../idct.h"
+#include "x86int.h"
+
+/*A table of constants used by the MMX routines.*/
+static const __declspec(align(16)) ogg_uint16_t
+ OC_IDCT_CONSTS[(7+1)*4]={
+ (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
+ (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
+ (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
+ (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
+ (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
+ (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
+ (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
+ (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
+ (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
+ (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
+ (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
+ (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
+ (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
+ (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
+ 8, 8, 8, 8
+};
+
+
+void oc_idct8x8_10_mmx(ogg_int16_t _y[64]){
+ _asm {
+ mov edx, [_y]
+ mov eax, offset OC_IDCT_CONSTS
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 18H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 38H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 28H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 08H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 20H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 10H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 08H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 18H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 10H]
+ movq mm0, mm4
+ movq [edx + 38H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 28H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 10H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 30H], mm4
+ movq [edx + 20H], mm2
+ movq mm2, [edx + 70H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 50H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 60H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 50H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 60H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 40H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 50H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 60H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 50H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx + 40H], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 48H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 58H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx + 40H]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 50H]
+ movq mm0, mm4
+ movq [edx + 78H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 68H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx + 40H], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 50H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 70H], mm4
+ movq [edx + 60H], mm2
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 50H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 70H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 60H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 40H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 20H]
+ paddw mm7, mm7
+ movq [edx + 20H], mm2
+ paddw mm7, mm4
+ movq [edx + 10H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 40H], mm4
+ psraw mm5, 4
+ movq [edx + 30H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 60H], mm6
+ psraw mm0, 4
+ movq [edx + 50H], mm5
+ movq [edx + 70H], mm7
+ movq [edx], mm0
+ movq mm2, [edx + 38H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 18H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 28H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 18H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 28H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 08H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 18H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 28H]
+ paddw mm7, mm7
+ movq [edx + 28H], mm2
+ paddw mm7, mm4
+ movq [edx + 18H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 48H], mm4
+ psraw mm5, 4
+ movq [edx + 38H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 68H], mm6
+ psraw mm0, 4
+ movq [edx + 58H], mm5
+ movq [edx + 78H], mm7
+ movq [edx + 08H], mm0
+ /* emms */
+ }
+}
+
+
+void oc_idct8x8_mmx(ogg_int16_t _y[64]){
+ _asm {
+ mov edx, [_y]
+ mov eax, offset OC_IDCT_CONSTS
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 18H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 38H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 28H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 08H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 20H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 10H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 08H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 18H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 10H]
+ movq mm0, mm4
+ movq [edx + 38H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 28H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 10H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 30H], mm4
+ movq [edx + 20H], mm2
+ movq mm2, [edx + 70H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 50H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 60H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 50H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 60H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 40H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 50H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 60H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 50H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx + 40H], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 48H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 58H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx + 40H]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 50H]
+ movq mm0, mm4
+ movq [edx + 78H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 68H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx + 40H], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 50H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 70H], mm4
+ movq [edx + 60H], mm2
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 50H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 70H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 60H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 40H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 20H]
+ paddw mm7, mm7
+ movq [edx + 20H], mm2
+ paddw mm7, mm4
+ movq [edx + 10H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 40H], mm4
+ psraw mm5, 4
+ movq [edx + 30H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 60H], mm6
+ psraw mm0, 4
+ movq [edx + 50H], mm5
+ movq [edx + 70H], mm7
+ movq [edx], mm0
+ movq mm2, [edx + 38H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 18H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 28H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 18H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 28H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 08H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 18H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 28H]
+ paddw mm7, mm7
+ movq [edx + 28H], mm2
+ paddw mm7, mm4
+ movq [edx + 18H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 48H], mm4
+ psraw mm5, 4
+ movq [edx + 38H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 68H], mm6
+ psraw mm0, 4
+ movq [edx + 58H], mm5
+ movq [edx + 78H], mm7
+ movq [edx + 08H], mm0
+ /* emms */
+ }
+}
+
+#endif
+
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+/* -------------------------------------------------------------------
+ MMX based IDCT for the theora codec.
+
+ Originally written by Rudolf Marek, based on code from On2's VP3.
+ Converted to Visual Studio inline assembly by Nils Pipenbrinck.
+
+ ---------------------------------------------------------------------*/
+#if defined(USE_ASM)
+
+#include <ogg/ogg.h>
+#include "../dct.h"
+#include "../idct.h"
+#include "x86int.h"
+
+/*A table of constants used by the MMX routines.*/
+static const __declspec(align(16)) ogg_uint16_t
+ OC_IDCT_CONSTS[(7+1)*4]={
+ (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
+ (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
+ (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
+ (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
+ (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
+ (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
+ (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
+ (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
+ (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
+ (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
+ (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
+ (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
+ (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
+ (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
+ 8, 8, 8, 8
+};
+
+
+void oc_idct8x8_10_mmx(ogg_int16_t _y[64]){
+ _asm {
+ mov edx, [_y]
+ mov eax, offset OC_IDCT_CONSTS
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 18H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 38H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 28H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 08H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 20H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 10H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 08H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 18H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 10H]
+ movq mm0, mm4
+ movq [edx + 38H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 28H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 10H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 30H], mm4
+ movq [edx + 20H], mm2
+ movq mm2, [edx + 70H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 50H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 60H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 50H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 60H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 40H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 50H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 60H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 50H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx + 40H], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 48H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 58H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx + 40H]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 50H]
+ movq mm0, mm4
+ movq [edx + 78H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 68H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx + 40H], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 50H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 70H], mm4
+ movq [edx + 60H], mm2
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 50H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 70H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 60H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 40H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 20H]
+ paddw mm7, mm7
+ movq [edx + 20H], mm2
+ paddw mm7, mm4
+ movq [edx + 10H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 40H], mm4
+ psraw mm5, 4
+ movq [edx + 30H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 60H], mm6
+ psraw mm0, 4
+ movq [edx + 50H], mm5
+ movq [edx + 70H], mm7
+ movq [edx], mm0
+ movq mm2, [edx + 38H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 18H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 28H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 18H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 28H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 08H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 18H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 28H]
+ paddw mm7, mm7
+ movq [edx + 28H], mm2
+ paddw mm7, mm4
+ movq [edx + 18H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 48H], mm4
+ psraw mm5, 4
+ movq [edx + 38H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 68H], mm6
+ psraw mm0, 4
+ movq [edx + 58H], mm5
+ movq [edx + 78H], mm7
+ movq [edx + 08H], mm0
+ /* emms */
+ }
+}
+
+
+void oc_idct8x8_mmx(ogg_int16_t _y[64]){
+ _asm {
+ mov edx, [_y]
+ mov eax, offset OC_IDCT_CONSTS
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 18H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 38H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 28H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 08H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 20H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 10H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 08H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 18H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 10H]
+ movq mm0, mm4
+ movq [edx + 38H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 28H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 10H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 30H], mm4
+ movq [edx + 20H], mm2
+ movq mm2, [edx + 70H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 50H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 60H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 50H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 60H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 40H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 50H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 60H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 50H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx + 40H], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 48H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 58H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx + 40H]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 50H]
+ movq mm0, mm4
+ movq [edx + 78H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 68H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx + 40H], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 50H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 70H], mm4
+ movq [edx + 60H], mm2
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 50H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 70H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 60H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 40H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 20H]
+ paddw mm7, mm7
+ movq [edx + 20H], mm2
+ paddw mm7, mm4
+ movq [edx + 10H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 40H], mm4
+ psraw mm5, 4
+ movq [edx + 30H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 60H], mm6
+ psraw mm0, 4
+ movq [edx + 50H], mm5
+ movq [edx + 70H], mm7
+ movq [edx], mm0
+ movq mm2, [edx + 38H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 18H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 28H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 18H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 28H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 08H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 18H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 28H]
+ paddw mm7, mm7
+ movq [edx + 28H], mm2
+ paddw mm7, mm4
+ movq [edx + 18H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 48H], mm4
+ psraw mm5, 4
+ movq [edx + 38H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 68H], mm6
+ psraw mm0, 4
+ movq [edx + 58H], mm5
+ movq [edx + 78H], mm7
+ movq [edx + 08H], mm0
+ /* emms */
+ }
+}
+
+#endif
+
Property changes on: trunk/theora/lib/dec/x86_vc/mmxidct.c
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Added: trunk/theora/lib/dec/x86_vc/mmxloopfilter.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/mmxloopfilter.c (rev 0)
+++ trunk/theora/lib/dec/x86_vc/mmxloopfilter.c 2008-04-12 01:04:43 UTC (rev 14714)
@@ -0,0 +1,756 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id:
+
+ ********************************************************************/
+
+/* -------------------------------------------------------------------
+ MMX based loop filter for the theora codec.
+
+ Originally written by Rudolf Marek, based on code from On2's VP3.
+ Converted to Visual Studio inline assembly by Nils Pipenbrinck.
+
+ Note: I can't test these since my example files never get into the
+ loop filters, but the code has been converted semi-automatically from
+ the GCC sources, so it ought to work.
+ ---------------------------------------------------------------------*/
+#include "../../internal.h"
+#include "x86int.h"
+#include <mmintrin.h>
+
+#if defined(USE_ASM)
+
+
+
+static void loop_filter_v(unsigned char *_pix,int _ystride,
+ const ogg_int16_t *_ll){
+ _asm {
+ mov eax, [_pix]
+ mov edx, [_ystride]
+ mov ebx, [_ll]
+
+ /* _pix -= ystride */
+ sub eax, edx
+ /* mm0=0 */
+ pxor mm0, mm0
+ /* _pix -= ystride */
+ sub eax, edx
+ /* esi=_ystride*3 */
+ lea esi, [edx + edx*2]
+
+ /* mm7=_pix[0...8]*/
+ movq mm7, [eax]
+ /* mm4=_pix[0...8+_ystride*3]*/
+ movq mm4, [eax + esi]
+ /* mm6=_pix[0...8]*/
+ movq mm6, mm7
+ /* Expand unsigned _pix[0...3] to 16 bits.*/
+ punpcklbw mm6, mm0
+ movq mm5, mm4
+ /* Expand unsigned _pix[4...7] to 16 bits.*/
+ punpckhbw mm7, mm0
+ punpcklbw mm4, mm0
+ /* Expand other arrays too.*/
+ punpckhbw mm5, mm0
+ /*mm7:mm6=_p[0...7]-_p[0...7+_ystride*3]:*/
+ psubw mm6, mm4
+ psubw mm7, mm5
+ /*mm5=mm4=_pix[0...7+_ystride]*/
+ movq mm4, [eax + edx]
+ /*mm1=mm3=mm2=_pix[0...7+_ystride*2]*/
+ movq mm2, [eax + edx*2]
+ movq mm5, mm4
+ movq mm3, mm2
+ movq mm1, mm2
+ /*Expand these arrays.*/
+ punpckhbw mm5, mm0
+ punpcklbw mm4, mm0
+ punpckhbw mm3, mm0
+ punpcklbw mm2, mm0
+ pcmpeqw mm0, mm0
+ /*mm0=3 3 3 3
+ mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
+ psubw mm3, mm5
+ psrlw mm0, 14
+ psubw mm2, mm4
+ /*Scale by 3.*/
+ pmullw mm3, mm0
+ pmullw mm2, mm0
+ /*mm0=4 4 4 4
+ f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
+ 3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
+ psrlw mm0, 1
+ paddw mm3, mm7
+ psllw mm0, 2
+ paddw mm2, mm6
+ /*Add 4.*/
+ paddw mm3, mm0
+ paddw mm2, mm0
+ /*"Divide" by 8.*/
+ psraw mm3, 3
+ psraw mm2, 3
+ /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the spec.*/
+ /*Free up mm5.*/
+ packuswb mm4, mm5
+ /*mm0=L L L L*/
+ movq mm0, [ebx]
+ /*if(R_i<-2L||R_i>2L)R_i=0:*/
+ movq mm5, mm2
+ pxor mm6, mm6
+ movq mm7, mm0
+ psubw mm6, mm0
+ psllw mm7, 1
+ psllw mm6, 1
+ /*mm2==R_3 R_2 R_1 R_0*/
+ /*mm5==R_3 R_2 R_1 R_0*/
+ /*mm6==-2L -2L -2L -2L*/
+ /*mm7==2L 2L 2L 2L*/
+ pcmpgtw mm7, mm2
+ pcmpgtw mm5, mm6
+ pand mm2, mm7
+ movq mm7, mm0
+ pand mm2, mm5
+ psllw mm7, 1
+ movq mm5, mm3
+ /*mm3==R_7 R_6 R_5 R_4*/
+ /*mm5==R_7 R_6 R_5 R_4*/
+ /*mm6==-2L -2L -2L -2L*/
+ /*mm7==2L 2L 2L 2L*/
+ pcmpgtw mm7, mm3
+ pcmpgtw mm5, mm6
+ pand mm3, mm7
+ movq mm7, mm0
+ pand mm3, mm5
+ /*if(R_i<-L)R_i'=R_i+2L;
+ if(R_i>L)R_i'=R_i-2L;
+ if(R_i<-L||R_i>L)R_i=-R_i':*/
+ psraw mm6, 1
+ movq mm5, mm2
+ psllw mm7, 1
+ /*mm2==R_3 R_2 R_1 R_0*/
+ /*mm5==R_3 R_2 R_1 R_0*/
+ /*mm6==-L -L -L -L*/
+ /*mm0==L L L L*/
+ /*mm5=R_i>L?FF:00*/
+ pcmpgtw mm5, mm0
+ /*mm6=-L>R_i?FF:00*/
+ pcmpgtw mm6, mm2
+ /*mm7=R_i>L?2L:0*/
+ pand mm7, mm5
+ /*mm2=R_i>L?R_i-2L:R_i*/
+ psubw mm2, mm7
+ movq mm7, mm0
+ /*mm5=-L>R_i||R_i>L*/
+ por mm5, mm6
+ psllw mm7, 1
+ /*mm7=-L>R_i?2L:0*/
+ pand mm7, mm6
+ pxor mm6, mm6
+ /*mm2=-L>R_i?R_i+2L:R_i*/
+ paddw mm2, mm7
+ psubw mm6, mm0
+ /*mm5=-L>R_i||R_i>L?-R_i':0*/
+ pand mm5, mm2
+ movq mm7, mm0
+ /*mm2=-L>R_i||R_i>L?0:R_i*/
+ psubw mm2, mm5
+ psllw mm7, 1
+ /*mm2=-L>R_i||R_i>L?-R_i':R_i*/
+ psubw mm2, mm5
+ movq mm5, mm3
+ /*mm3==R_7 R_6 R_5 R_4*/
+ /*mm5==R_7 R_6 R_5 R_4*/
+ /*mm6==-L -L -L -L*/
+ /*mm0==L L L L*/
+ /*mm6=-L>R_i?FF:00*/
+ pcmpgtw mm6, mm3
+ /*mm5=R_i>L?FF:00*/
+ pcmpgtw mm5, mm0
+ /*mm7=R_i>L?2L:0*/
+ pand mm7, mm5
+ /*mm3=R_i>L?R_i-2L:R_i*/
+ psubw mm3, mm7
+ psllw mm0, 1
+ /*mm5=-L>R_i||R_i>L*/
+ por mm5, mm6
+ /*mm0=-L>R_i?2L:0*/
+ pand mm0, mm6
+ /*mm3=-L>R_i?R_i+2L:R_i*/
+ paddw mm3, mm0
+ /*mm5=-L>R_i||R_i>L?-R_i':0*/
+ pand mm5, mm3
+ /*mm3=-L>R_i||R_i>L?0:R_i*/
+ psubw mm3, mm5
+ /*mm3=-L>R_i||R_i>L?-R_i':R_i*/
+ psubw mm3, mm5
+ /*Unfortunately, there's no unsigned byte+signed byte with unsigned
+ saturation op code, so we have to promote things back to 16 bits.*/
+ pxor mm0, mm0
+ movq mm5, mm4
+ punpcklbw mm4, mm0
+ punpckhbw mm5, mm0
+ movq mm6, mm1
+ punpcklbw mm1, mm0
+ punpckhbw mm6, mm0
+ /*_pix[0...8+_ystride]+=R_i*/
+ paddw mm4, mm2
+ paddw mm5, mm3
+ /*_pix[0...8+_ystride*2]-=R_i*/
+ psubw mm1, mm2
+ psubw mm6, mm3
+ packuswb mm4, mm5
+ packuswb mm1, mm6
+ /*Write it back out.*/
+ movq [eax + edx], mm4
+ movq [eax + edx*2], mm1
+ }
+}
+
+/*This code implements the bulk of loop_filter_h().
+ Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
+ four p0's to one register we must transpose the values in four mmx regs.
+ When half is done we repeat this for the rest.*/
+static void loop_filter_h4(unsigned char *_pix,long _ystride,
+ const ogg_int16_t *_ll){
+ /* todo: merge the comments from the GCC sources */
+ _asm {
+ mov ecx, [_pix]
+ mov edx, [_ystride]
+ mov eax, [_ll]
+ /*esi=_ystride*3*/
+ lea esi, [edx + edx*2]
+
+ movd mm0, dword ptr [ecx]
+ movd mm1, dword ptr [ecx + edx]
+ movd mm2, dword ptr [ecx + edx*2]
+ movd mm3, dword ptr [ecx + esi]
+ punpcklbw mm0, mm1
+ punpcklbw mm2, mm3
+ movq mm1, mm0
+ punpckhwd mm0, mm2
+ punpcklwd mm1, mm2
+ pxor mm7, mm7
+ movq mm5, mm1
+ punpcklbw mm1, mm7
+ punpckhbw mm5, mm7
+ movq mm3, mm0
+ punpcklbw mm0, mm7
+ punpckhbw mm3, mm7
+ psubw mm1, mm3
+ movq mm4, mm0
+ pcmpeqw mm2, mm2
+ psubw mm0, mm5
+ psrlw mm2, 14
+ pmullw mm0, mm2
+ psrlw mm2, 1
+ paddw mm0, mm1
+ psllw mm2, 2
+ paddw mm0, mm2
+ psraw mm0, 3
+ movq mm6, qword ptr [eax]
+ movq mm1, mm0
+ pxor mm2, mm2
+ movq mm3, mm6
+ psubw mm2, mm6
+ psllw mm3, 1
+ psllw mm2, 1
+ pcmpgtw mm3, mm0
+ pcmpgtw mm1, mm2
+ pand mm0, mm3
+ pand mm0, mm1
+ psraw mm2, 1
+ movq mm1, mm0
+ movq mm3, mm6
+ pcmpgtw mm2, mm0
+ pcmpgtw mm1, mm6
+ psllw mm3, 1
+ psllw mm6, 1
+ pand mm3, mm1
+ pand mm6, mm2
+ psubw mm0, mm3
+ por mm1, mm2
+ paddw mm0, mm6
+ pand mm1, mm0
+ psubw mm0, mm1
+ psubw mm0, mm1
+ paddw mm5, mm0
+ psubw mm4, mm0
+ packuswb mm5, mm7
+ packuswb mm4, mm7
+ punpcklbw mm5, mm4
+ movd edi, mm5
+ mov word ptr [ecx + 01H], di
+ psrlq mm5, 32
+ shr edi, 16
+ mov word ptr [ecx + edx + 01H], di
+ movd edi, mm5
+ mov word ptr [ecx + edx*2 + 01H], di
+ shr edi, 16
+ mov word ptr [ecx + esi + 01H], di
+ }
+}
+
+static void loop_filter_h(unsigned char *_pix,int _ystride,
+ const ogg_int16_t *_ll){
+ _pix-=2;
+ loop_filter_h4(_pix,_ystride,_ll);
+ loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll);
+}
+
+
+/*We copy the whole function because the MMX routines will be inlined 4 times,
+ and we can do just a single emms call at the end this way.
+ We also do not use the _bv lookup table, instead computing the values that
+ would lie in it on the fly.*/
+
+/*Apply the loop filter to a given set of fragment rows in the given plane.
+ The filter may be run on the bottom edge, affecting pixels in the next row of
+ fragments, so this row also needs to be available.
+ _bv: The bounding values array.
+ _refi: The index of the frame buffer to filter.
+ _pli: The color plane to filter.
+ _fragy0: The Y coordinate of the first fragment row to filter.
+ _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end){
+ ogg_int16_t __declspec(align(8)) ll[4];
+ th_img_plane *iplane;
+ oc_fragment_plane *fplane;
+ oc_fragment *frag_top;
+ oc_fragment *frag0;
+ oc_fragment *frag;
+ oc_fragment *frag_end;
+ oc_fragment *frag0_end;
+ oc_fragment *frag_bot;
+ ll[0]=ll[1]=ll[2]=ll[3]=
+ (ogg_int16_t)_state->loop_filter_limits[_state->qis[0]];
+ iplane=_state->ref_frame_bufs[_refi]+_pli;
+ fplane=_state->fplanes+_pli;
+ /*The following loops are constructed somewhat non-intuitively on purpose.
+ The main idea is: if a block boundary has at least one coded fragment on
+ it, the filter is applied to it.
+ However, the order that the filters are applied in matters, and VP3 chose
+ the somewhat strange ordering used below.*/
+ frag_top=_state->frags+fplane->froffset;
+ frag0=frag_top+_fragy0*fplane->nhfrags;
+ frag0_end=frag0+(_fragy_end-_fragy0)*fplane->nhfrags;
+ frag_bot=_state->frags+fplane->froffset+fplane->nfrags;
+ while(frag0<frag0_end){
+ frag=frag0;
+ frag_end=frag+fplane->nhfrags;
+ while(frag<frag_end){
+ if(frag->coded){
+ if(frag>frag0){
+ loop_filter_h(frag->buffer[_refi],iplane->ystride,ll);
+ }
+ if(frag0>frag_top){
+ loop_filter_v(frag->buffer[_refi],iplane->ystride,ll);
+ }
+ if(frag+1<frag_end&&!(frag+1)->coded){
+ loop_filter_h(frag->buffer[_refi]+8,iplane->ystride,ll);
+ }
+ if(frag+fplane->nhfrags<frag_bot&&!(frag+fplane->nhfrags)->coded){
+ loop_filter_v((frag+fplane->nhfrags)->buffer[_refi],
+ iplane->ystride,ll);
+ }
+ }
+ frag++;
+ }
+ frag0+=fplane->nhfrags;
+ }
+
+ /*This needs to be removed when decode specific functions are implemented:*/
+ _mm_empty();
+}
+
+#endif
+
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id:
+
+ ********************************************************************/
+
+/* -------------------------------------------------------------------
+ MMX based loop filter for the theora codec.
+
+ Originally written by Rudolf Marek, based on code from On2's VP3.
+ Converted to Visual Studio inline assembly by Nils Pipenbrinck.
+
+ Note: I can't test these since my example files never get into the
+ loop filters, but the code has been converted semi-automatically from
+ the GCC sources, so it ought to work.
+ ---------------------------------------------------------------------*/
+#include "../../internal.h"
+#include "x86int.h"
+#include <mmintrin.h>
+
+#if defined(USE_ASM)
+
+
+
+static void loop_filter_v(unsigned char *_pix,int _ystride,
+ const ogg_int16_t *_ll){
+ _asm {
+ mov eax, [_pix]
+ mov edx, [_ystride]
+ mov ebx, [_ll]
+
+ /* _pix -= ystride */
+ sub eax, edx
+ /* mm0=0 */
+ pxor mm0, mm0
+ /* _pix -= ystride */
+ sub eax, edx
+ /* esi=_ystride*3 */
+ lea esi, [edx + edx*2]
+
+ /* mm7=_pix[0...8]*/
+ movq mm7, [eax]
+ /* mm4=_pix[0...8+_ystride*3]*/
+ movq mm4, [eax + esi]
+ /* mm6=_pix[0...8]*/
+ movq mm6, mm7
+ /* Expand unsigned _pix[0...3] to 16 bits.*/
+ punpcklbw mm6, mm0
+ movq mm5, mm4
+ /* Expand unsigned _pix[4...7] to 16 bits.*/
+ punpckhbw mm7, mm0
+ punpcklbw mm4, mm0
+ /* Expand other arrays too.*/
+ punpckhbw mm5, mm0
+ /*mm7:mm6=_p[0...7]-_p[0...7+_ystride*3]:*/
+ psubw mm6, mm4
+ psubw mm7, mm5
+ /*mm5=mm4=_pix[0...7+_ystride]*/
+ movq mm4, [eax + edx]
+ /*mm1=mm3=mm2=_pix[0...7+_ystride*2]*/
+ movq mm2, [eax + edx*2]
+ movq mm5, mm4
+ movq mm3, mm2
+ movq mm1, mm2
+ /*Expand these arrays.*/
+ punpckhbw mm5, mm0
+ punpcklbw mm4, mm0
+ punpckhbw mm3, mm0
+ punpcklbw mm2, mm0
+ pcmpeqw mm0, mm0
+ /*mm0=3 3 3 3
+ mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
+ psubw mm3, mm5
+ psrlw mm0, 14
+ psubw mm2, mm4
+ /*Scale by 3.*/
+ pmullw mm3, mm0
+ pmullw mm2, mm0
+ /*mm0=4 4 4 4
+ f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
+ 3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
+ psrlw mm0, 1
+ paddw mm3, mm7
+ psllw mm0, 2
+ paddw mm2, mm6
+ /*Add 4.*/
+ paddw mm3, mm0
+ paddw mm2, mm0
+ /*"Divide" by 8.*/
+ psraw mm3, 3
+ psraw mm2, 3
+ /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the spec.*/
+ /*Free up mm5.*/
+ packuswb mm4, mm5
+ /*mm0=L L L L*/
+ movq mm0, [ebx]
+ /*if(R_i<-2L||R_i>2L)R_i=0:*/
+ movq mm5, mm2
+ pxor mm6, mm6
+ movq mm7, mm0
+ psubw mm6, mm0
+ psllw mm7, 1
+ psllw mm6, 1
+ /*mm2==R_3 R_2 R_1 R_0*/
+ /*mm5==R_3 R_2 R_1 R_0*/
+ /*mm6==-2L -2L -2L -2L*/
+ /*mm7==2L 2L 2L 2L*/
+ pcmpgtw mm7, mm2
+ pcmpgtw mm5, mm6
+ pand mm2, mm7
+ movq mm7, mm0
+ pand mm2, mm5
+ psllw mm7, 1
+ movq mm5, mm3
+ /*mm3==R_7 R_6 R_5 R_4*/
+ /*mm5==R_7 R_6 R_5 R_4*/
+ /*mm6==-2L -2L -2L -2L*/
+ /*mm7==2L 2L 2L 2L*/
+ pcmpgtw mm7, mm3
+ pcmpgtw mm5, mm6
+ pand mm3, mm7
+ movq mm7, mm0
+ pand mm3, mm5
+ /*if(R_i<-L)R_i'=R_i+2L;
+ if(R_i>L)R_i'=R_i-2L;
+ if(R_i<-L||R_i>L)R_i=-R_i':*/
+ psraw mm6, 1
+ movq mm5, mm2
+ psllw mm7, 1
+ /*mm2==R_3 R_2 R_1 R_0*/
+ /*mm5==R_3 R_2 R_1 R_0*/
+ /*mm6==-L -L -L -L*/
+ /*mm0==L L L L*/
+ /*mm5=R_i>L?FF:00*/
+ pcmpgtw mm5, mm0
+ /*mm6=-L>R_i?FF:00*/
+ pcmpgtw mm6, mm2
+ /*mm7=R_i>L?2L:0*/
+ pand mm7, mm5
+ /*mm2=R_i>L?R_i-2L:R_i*/
+ psubw mm2, mm7
+ movq mm7, mm0
+ /*mm5=-L>R_i||R_i>L*/
+ por mm5, mm6
+ psllw mm7, 1
+ /*mm7=-L>R_i?2L:0*/
+ pand mm7, mm6
+ pxor mm6, mm6
+ /*mm2=-L>R_i?R_i+2L:R_i*/
+ paddw mm2, mm7
+ psubw mm6, mm0
+ /*mm5=-L>R_i||R_i>L?-R_i':0*/
+ pand mm5, mm2
+ movq mm7, mm0
+ /*mm2=-L>R_i||R_i>L?0:R_i*/
+ psubw mm2, mm5
+ psllw mm7, 1
+ /*mm2=-L>R_i||R_i>L?-R_i':R_i*/
+ psubw mm2, mm5
+ movq mm5, mm3
+ /*mm3==R_7 R_6 R_5 R_4*/
+ /*mm5==R_7 R_6 R_5 R_4*/
+ /*mm6==-L -L -L -L*/
+ /*mm0==L L L L*/
+ /*mm6=-L>R_i?FF:00*/
+ pcmpgtw mm6, mm3
+ /*mm5=R_i>L?FF:00*/
+ pcmpgtw mm5, mm0
+ /*mm7=R_i>L?2L:0*/
+ pand mm7, mm5
+ /*mm3=R_i>L?R_i-2L:R_i*/
+ psubw mm3, mm7
+ psllw mm0, 1
+ /*mm5=-L>R_i||R_i>L*/
+ por mm5, mm6
+ /*mm0=-L>R_i?2L:0*/
+ pand mm0, mm6
+ /*mm3=-L>R_i?R_i+2L:R_i*/
+ paddw mm3, mm0
+ /*mm5=-L>R_i||R_i>L?-R_i':0*/
+ pand mm5, mm3
+ /*mm3=-L>R_i||R_i>L?0:R_i*/
+ psubw mm3, mm5
+ /*mm3=-L>R_i||R_i>L?-R_i':R_i*/
+ psubw mm3, mm5
+ /*Unfortunately, there's no unsigned byte+signed byte with unsigned
+ saturation op code, so we have to promote things back to 16 bits.*/
+ pxor mm0, mm0
+ movq mm5, mm4
+ punpcklbw mm4, mm0
+ punpckhbw mm5, mm0
+ movq mm6, mm1
+ punpcklbw mm1, mm0
+ punpckhbw mm6, mm0
+ /*_pix[0...8+_ystride]+=R_i*/
+ paddw mm4, mm2
+ paddw mm5, mm3
+ /*_pix[0...8+_ystride*2]-=R_i*/
+ psubw mm1, mm2
+ psubw mm6, mm3
+ packuswb mm4, mm5
+ packuswb mm1, mm6
+ /*Write it back out.*/
+ movq [eax + edx], mm4
+ movq [eax + edx*2], mm1
+ }
+}
+
+/*This code implements the bulk of loop_filter_h().
+ Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
+ four p0's to one register we must transpose the values in four mmx regs.
+ When half is done we repeat this for the rest.*/
+static void loop_filter_h4(unsigned char *_pix,long _ystride,
+ const ogg_int16_t *_ll){
+ /* todo: merge the comments from the GCC sources */
+ _asm {
+ mov ecx, [_pix]
+ mov edx, [_ystride]
+ mov eax, [_ll]
+ /*esi=_ystride*3*/
+ lea esi, [edx + edx*2]
+
+ movd mm0, dword ptr [ecx]
+ movd mm1, dword ptr [ecx + edx]
+ movd mm2, dword ptr [ecx + edx*2]
+ movd mm3, dword ptr [ecx + esi]
+ punpcklbw mm0, mm1
+ punpcklbw mm2, mm3
+ movq mm1, mm0
+ punpckhwd mm0, mm2
+ punpcklwd mm1, mm2
+ pxor mm7, mm7
+ movq mm5, mm1
+ punpcklbw mm1, mm7
+ punpckhbw mm5, mm7
+ movq mm3, mm0
+ punpcklbw mm0, mm7
+ punpckhbw mm3, mm7
+ psubw mm1, mm3
+ movq mm4, mm0
+ pcmpeqw mm2, mm2
+ psubw mm0, mm5
+ psrlw mm2, 14
+ pmullw mm0, mm2
+ psrlw mm2, 1
+ paddw mm0, mm1
+ psllw mm2, 2
+ paddw mm0, mm2
+ psraw mm0, 3
+ movq mm6, qword ptr [eax]
+ movq mm1, mm0
+ pxor mm2, mm2
+ movq mm3, mm6
+ psubw mm2, mm6
+ psllw mm3, 1
+ psllw mm2, 1
+ pcmpgtw mm3, mm0
+ pcmpgtw mm1, mm2
+ pand mm0, mm3
+ pand mm0, mm1
+ psraw mm2, 1
+ movq mm1, mm0
+ movq mm3, mm6
+ pcmpgtw mm2, mm0
+ pcmpgtw mm1, mm6
+ psllw mm3, 1
+ psllw mm6, 1
+ pand mm3, mm1
+ pand mm6, mm2
+ psubw mm0, mm3
+ por mm1, mm2
+ paddw mm0, mm6
+ pand mm1, mm0
+ psubw mm0, mm1
+ psubw mm0, mm1
+ paddw mm5, mm0
+ psubw mm4, mm0
+ packuswb mm5, mm7
+ packuswb mm4, mm7
+ punpcklbw mm5, mm4
+ movd edi, mm5
+ mov word ptr [ecx + 01H], di
+ psrlq mm5, 32
+ shr edi, 16
+ mov word ptr [ecx + edx + 01H], di
+ movd edi, mm5
+ mov word ptr [ecx + edx*2 + 01H], di
+ shr edi, 16
+ mov word ptr [ecx + esi + 01H], di
+ }
+}
+
+static void loop_filter_h(unsigned char *_pix,int _ystride,
+ const ogg_int16_t *_ll){
+ _pix-=2;
+ loop_filter_h4(_pix,_ystride,_ll);
+ loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll);
+}
+
+
+/*We copy the whole function because the MMX routines will be inlined 4 times,
+ and we can do just a single emms call at the end this way.
+ We also do not use the _bv lookup table, instead computing the values that
+ would lie in it on the fly.*/
+
+/*Apply the loop filter to a given set of fragment rows in the given plane.
+ The filter may be run on the bottom edge, affecting pixels in the next row of
+ fragments, so this row also needs to be available.
+ _bv: The bounding values array.
+ _refi: The index of the frame buffer to filter.
+ _pli: The color plane to filter.
+ _fragy0: The Y coordinate of the first fragment row to filter.
+ _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end){
+ ogg_int16_t __declspec(align(8)) ll[4];
+ th_img_plane *iplane;
+ oc_fragment_plane *fplane;
+ oc_fragment *frag_top;
+ oc_fragment *frag0;
+ oc_fragment *frag;
+ oc_fragment *frag_end;
+ oc_fragment *frag0_end;
+ oc_fragment *frag_bot;
+ ll[0]=ll[1]=ll[2]=ll[3]=
+ (ogg_int16_t)_state->loop_filter_limits[_state->qis[0]];
+ iplane=_state->ref_frame_bufs[_refi]+_pli;
+ fplane=_state->fplanes+_pli;
+ /*The following loops are constructed somewhat non-intuitively on purpose.
+ The main idea is: if a block boundary has at least one coded fragment on
+ it, the filter is applied to it.
+ However, the order that the filters are applied in matters, and VP3 chose
+ the somewhat strange ordering used below.*/
+ frag_top=_state->frags+fplane->froffset;
+ frag0=frag_top+_fragy0*fplane->nhfrags;
+ frag0_end=frag0+(_fragy_end-_fragy0)*fplane->nhfrags;
+ frag_bot=_state->frags+fplane->froffset+fplane->nfrags;
+ while(frag0<frag0_end){
+ frag=frag0;
+ frag_end=frag+fplane->nhfrags;
+ while(frag<frag_end){
+ if(frag->coded){
+ if(frag>frag0){
+ loop_filter_h(frag->buffer[_refi],iplane->ystride,ll);
+ }
+ if(frag0>frag_top){
+ loop_filter_v(frag->buffer[_refi],iplane->ystride,ll);
+ }
+ if(frag+1<frag_end&&!(frag+1)->coded){
+ loop_filter_h(frag->buffer[_refi]+8,iplane->ystride,ll);
+ }
+ if(frag+fplane->nhfrags<frag_bot&&!(frag+fplane->nhfrags)->coded){
+ loop_filter_v((frag+fplane->nhfrags)->buffer[_refi],
+ iplane->ystride,ll);
+ }
+ }
+ frag++;
+ }
+ frag0+=fplane->nhfrags;
+ }
+
+ /*This needs to be removed when decode specific functions are implemented:*/
+ _mm_empty();
+}
+
+#endif
+
Property changes on: trunk/theora/lib/dec/x86_vc/mmxloopfilter.c
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Added: trunk/theora/lib/dec/x86_vc/mmxstate.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/mmxstate.c (rev 0)
+++ trunk/theora/lib/dec/x86_vc/mmxstate.c 2008-04-12 01:04:43 UTC (rev 14714)
@@ -0,0 +1,382 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id:
+
+ ********************************************************************/
+
+/* ------------------------------------------------------------------------
+ MMX acceleration of complete fragment reconstruction algorithm.
+ Originally written by Rudolf Marek.
+
+ Conversion to MSC intrinsics by Nils Pipenbrinck.
+ ---------------------------------------------------------------------*/
+#if defined(USE_ASM)
+
+#include "../../internal.h"
+#include "../idct.h"
+#include "x86int.h"
+#include <mmintrin.h>
+
+static const unsigned char OC_FZIG_ZAGMMX[64]=
+{
+ 0, 8, 1, 2, 9,16,24,17,
+ 10, 3,32,11,18,25, 4,12,
+ 5,26,19,40,33,34,41,48,
+ 27, 6,13,20,28,21,14, 7,
+ 56,49,42,35,43,50,57,36,
+ 15,22,29,30,23,44,37,58,
+ 51,59,38,45,52,31,60,53,
+ 46,39,47,54,61,62,55,63
+};
+
+/* Fill a block with value */
+static __inline void loc_fill_mmx_value (__m64 * _dst, __m64 _value){
+ __m64 t = _value;
+ _dst[0] = t; _dst[1] = t; _dst[2] = t; _dst[3] = t;
+ _dst[4] = t; _dst[5] = t; _dst[6] = t; _dst[7] = t;
+ _dst[8] = t; _dst[9] = t; _dst[10] = t; _dst[11] = t;
+ _dst[12] = t; _dst[13] = t; _dst[14] = t; _dst[15] = t;
+}
+
+/* copy a block of 8 byte elements using different strides */
+static __inline void loc_blockcopy_mmx (unsigned char * _dst, int _dst_ystride,
+ unsigned char * _src, int _src_ystride){
+ __m64 a,b,c,d,e,f,g,h;
+ a = *(__m64*)(_src + 0 * _src_ystride);
+ b = *(__m64*)(_src + 1 * _src_ystride);
+ c = *(__m64*)(_src + 2 * _src_ystride);
+ d = *(__m64*)(_src + 3 * _src_ystride);
+ e = *(__m64*)(_src + 4 * _src_ystride);
+ f = *(__m64*)(_src + 5 * _src_ystride);
+ g = *(__m64*)(_src + 6 * _src_ystride);
+ h = *(__m64*)(_src + 7 * _src_ystride);
+ *(__m64*)(_dst + 0 * _dst_ystride) = a;
+ *(__m64*)(_dst + 1 * _dst_ystride) = b;
+ *(__m64*)(_dst + 2 * _dst_ystride) = c;
+ *(__m64*)(_dst + 3 * _dst_ystride) = d;
+ *(__m64*)(_dst + 4 * _dst_ystride) = e;
+ *(__m64*)(_dst + 5 * _dst_ystride) = f;
+ *(__m64*)(_dst + 6 * _dst_ystride) = g;
+ *(__m64*)(_dst + 7 * _dst_ystride) = h;
+}
+
+void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
+ ogg_int16_t __declspec(align(16)) res_buf[64];
+ int dst_framei;
+ int dst_ystride;
+ int zzi;
+ /*_last_zzi is subtly different from an actual count of the number of
+ coefficients we decoded for this block.
+ It contains the value of zzi BEFORE the final token in the block was
+ decoded.
+ In most cases this is an EOB token (the continuation of an EOB run from a
+ previous block counts), and so this is the same as the coefficient count.
+ However, in the case that the last token was NOT an EOB token, but filled
+ the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+ Provided the last token was not a pure zero run, the minimum value it can
+ be is 46, and so that doesn't affect any of the cases in this routine.
+ However, if the last token WAS a pure zero run of length 63, then _last_zzi
+ will be 1 while the number of coefficients decoded is 64.
+ Thus, we will trigger the following special case, where the real
+ coefficient count would not.
+ Note also that a zero run of length 64 will give _last_zzi a value of 0,
+ but we still process the DC coefficient, which might have a non-zero value
+ due to DC prediction.
+ Although convoluted, this is arguably the correct behavior: it allows us to
+ dequantize fewer coefficients and use a smaller transform when the block
+ ends with a long zero run instead of a normal EOB token.
+ It could be smarter... multiple separate zero runs at the end of a block
+ will fool it, but an encoder that generates these really deserves what it
+ gets.
+ Needless to say we inherited this approach from VP3.*/
+ /*Special case only having a DC component.*/
+ if(_last_zzi<2){
+ __m64 p;
+ /*Why is the iquant product rounded in this case and no others? Who knows.*/
+ p = _m_from_int((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
+ /* broadcast 16 bits into all 4 mmx subregisters */
+ p = _m_punpcklwd (p,p);
+ p = _m_punpckldq (p,p);
+ loc_fill_mmx_value ((__m64 *)res_buf, p);
+ }
+ else{
+ /*Then, fill in the remainder of the coefficients with 0's, and perform
+ the iDCT.*/
+ /*First zero the buffer.*/
+ /*On K7, etc., this could be replaced with movntq and sfence.*/
+ loc_fill_mmx_value ((__m64 *)res_buf, _mm_setzero_si64());
+
+ res_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
+ /*This is planned to be rewritten in MMX.*/
+ for(zzi=1;zzi<_ncoefs;zzi++)
+ {
+ int ci;
+ ci=OC_FZIG_ZAG[zzi];
+ res_buf[OC_FZIG_ZAGMMX[zzi]]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]*
+ _ac_iquant[ci]);
+ }
+
+ if(_last_zzi<10){
+ oc_idct8x8_10_mmx(res_buf);
+ }
+ else {
+ oc_idct8x8_mmx(res_buf);
+ }
+ }
+ /*Fill in the target buffer.*/
+ dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
+ dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+ /*For now ystride values in all ref frames assumed to be equal.*/
+ if(_frag->mbmode==OC_MODE_INTRA){
+ oc_frag_recon_intra_mmx(_frag->buffer[dst_framei],dst_ystride,res_buf);
+ }
+ else{
+ int ref_framei;
+ int ref_ystride;
+ int mvoffset0;
+ int mvoffset1;
+ ref_framei=_state->ref_frame_idx[OC_FRAME_FOR_MODE[_frag->mbmode]];
+ ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].ystride;
+ if(oc_state_get_mv_offsets(_state,&mvoffset0,&mvoffset1,_frag->mv[0],
+ _frag->mv[1],ref_ystride,_pli)>1){
+ oc_frag_recon_inter2_mmx(_frag->buffer[dst_framei],dst_ystride,
+ _frag->buffer[ref_framei]+mvoffset0,ref_ystride,
+ _frag->buffer[ref_framei]+mvoffset1,ref_ystride,res_buf);
+ }
+ else{
+ oc_frag_recon_inter_mmx(_frag->buffer[dst_framei],dst_ystride,
+ _frag->buffer[ref_framei]+mvoffset0,ref_ystride,res_buf);
+ }
+ }
+
+ _mm_empty();
+}
+
+
+void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
+ int _nfragis,int _dst_frame,int _src_frame,int _pli){
+ const int *fragi;
+ const int *fragi_end;
+ int dst_framei;
+ int dst_ystride;
+ int src_framei;
+ int src_ystride;
+ dst_framei=_state->ref_frame_idx[_dst_frame];
+ src_framei=_state->ref_frame_idx[_src_frame];
+ dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+ src_ystride=_state->ref_frame_bufs[src_framei][_pli].ystride;
+ fragi_end=_fragis+_nfragis;
+ for(fragi=_fragis;fragi<fragi_end;fragi++){
+ oc_fragment *frag = _state->frags+*fragi;
+ loc_blockcopy_mmx (frag->buffer[dst_framei], dst_ystride,
+ frag->buffer[src_framei], src_ystride);
+ }
+ _m_empty();
+}
+
+#endif
+
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id:
+
+ ********************************************************************/
+
+/* ------------------------------------------------------------------------
+ MMX acceleration of complete fragment reconstruction algorithm.
+ Originally written by Rudolf Marek.
+
+ Conversion to MSC intrinsics by Nils Pipenbrinck.
+ ---------------------------------------------------------------------*/
+#if defined(USE_ASM)
+
+#include "../../internal.h"
+#include "../idct.h"
+#include "x86int.h"
+#include <mmintrin.h>
+
+static const unsigned char OC_FZIG_ZAGMMX[64]=
+{
+ 0, 8, 1, 2, 9,16,24,17,
+ 10, 3,32,11,18,25, 4,12,
+ 5,26,19,40,33,34,41,48,
+ 27, 6,13,20,28,21,14, 7,
+ 56,49,42,35,43,50,57,36,
+ 15,22,29,30,23,44,37,58,
+ 51,59,38,45,52,31,60,53,
+ 46,39,47,54,61,62,55,63
+};
+
+/* Fill a block with value */
+static __inline void loc_fill_mmx_value (__m64 * _dst, __m64 _value){
+ __m64 t = _value;
+ _dst[0] = t; _dst[1] = t; _dst[2] = t; _dst[3] = t;
+ _dst[4] = t; _dst[5] = t; _dst[6] = t; _dst[7] = t;
+ _dst[8] = t; _dst[9] = t; _dst[10] = t; _dst[11] = t;
+ _dst[12] = t; _dst[13] = t; _dst[14] = t; _dst[15] = t;
+}
+
+/* copy a block of 8 byte elements using different strides */
+static __inline void loc_blockcopy_mmx (unsigned char * _dst, int _dst_ystride,
+ unsigned char * _src, int _src_ystride){
+ __m64 a,b,c,d,e,f,g,h;
+ a = *(__m64*)(_src + 0 * _src_ystride);
+ b = *(__m64*)(_src + 1 * _src_ystride);
+ c = *(__m64*)(_src + 2 * _src_ystride);
+ d = *(__m64*)(_src + 3 * _src_ystride);
+ e = *(__m64*)(_src + 4 * _src_ystride);
+ f = *(__m64*)(_src + 5 * _src_ystride);
+ g = *(__m64*)(_src + 6 * _src_ystride);
+ h = *(__m64*)(_src + 7 * _src_ystride);
+ *(__m64*)(_dst + 0 * _dst_ystride) = a;
+ *(__m64*)(_dst + 1 * _dst_ystride) = b;
+ *(__m64*)(_dst + 2 * _dst_ystride) = c;
+ *(__m64*)(_dst + 3 * _dst_ystride) = d;
+ *(__m64*)(_dst + 4 * _dst_ystride) = e;
+ *(__m64*)(_dst + 5 * _dst_ystride) = f;
+ *(__m64*)(_dst + 6 * _dst_ystride) = g;
+ *(__m64*)(_dst + 7 * _dst_ystride) = h;
+}
+
+void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
+ ogg_int16_t __declspec(align(16)) res_buf[64];
+ int dst_framei;
+ int dst_ystride;
+ int zzi;
+ /*_last_zzi is subtly different from an actual count of the number of
+ coefficients we decoded for this block.
+ It contains the value of zzi BEFORE the final token in the block was
+ decoded.
+ In most cases this is an EOB token (the continuation of an EOB run from a
+ previous block counts), and so this is the same as the coefficient count.
+ However, in the case that the last token was NOT an EOB token, but filled
+ the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+ Provided the last token was not a pure zero run, the minimum value it can
+ be is 46, and so that doesn't affect any of the cases in this routine.
+ However, if the last token WAS a pure zero run of length 63, then _last_zzi
+ will be 1 while the number of coefficients decoded is 64.
+ Thus, we will trigger the following special case, where the real
+ coefficient count would not.
+ Note also that a zero run of length 64 will give _last_zzi a value of 0,
+ but we still process the DC coefficient, which might have a non-zero value
+ due to DC prediction.
+ Although convoluted, this is arguably the correct behavior: it allows us to
+ dequantize fewer coefficients and use a smaller transform when the block
+ ends with a long zero run instead of a normal EOB token.
+ It could be smarter... multiple separate zero runs at the end of a block
+ will fool it, but an encoder that generates these really deserves what it
+ gets.
+ Needless to say we inherited this approach from VP3.*/
+ /*Special case only having a DC component.*/
+ if(_last_zzi<2){
+ __m64 p;
+ /*Why is the iquant product rounded in this case and no others? Who knows.*/
+ p = _m_from_int((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
+ /* broadcast 16 bits into all 4 mmx subregisters */
+ p = _m_punpcklwd (p,p);
+ p = _m_punpckldq (p,p);
+ loc_fill_mmx_value ((__m64 *)res_buf, p);
+ }
+ else{
+ /*Then, fill in the remainder of the coefficients with 0's, and perform
+ the iDCT.*/
+ /*First zero the buffer.*/
+ /*On K7, etc., this could be replaced with movntq and sfence.*/
+ loc_fill_mmx_value ((__m64 *)res_buf, _mm_setzero_si64());
+
+ res_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
+ /*This is planned to be rewritten in MMX.*/
+ for(zzi=1;zzi<_ncoefs;zzi++)
+ {
+ int ci;
+ ci=OC_FZIG_ZAG[zzi];
+ res_buf[OC_FZIG_ZAGMMX[zzi]]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]*
+ _ac_iquant[ci]);
+ }
+
+ if(_last_zzi<10){
+ oc_idct8x8_10_mmx(res_buf);
+ }
+ else {
+ oc_idct8x8_mmx(res_buf);
+ }
+ }
+ /*Fill in the target buffer.*/
+ dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
+ dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+ /*For now ystride values in all ref frames assumed to be equal.*/
+ if(_frag->mbmode==OC_MODE_INTRA){
+ oc_frag_recon_intra_mmx(_frag->buffer[dst_framei],dst_ystride,res_buf);
+ }
+ else{
+ int ref_framei;
+ int ref_ystride;
+ int mvoffset0;
+ int mvoffset1;
+ ref_framei=_state->ref_frame_idx[OC_FRAME_FOR_MODE[_frag->mbmode]];
+ ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].ystride;
+ if(oc_state_get_mv_offsets(_state,&mvoffset0,&mvoffset1,_frag->mv[0],
+ _frag->mv[1],ref_ystride,_pli)>1){
+ oc_frag_recon_inter2_mmx(_frag->buffer[dst_framei],dst_ystride,
+ _frag->buffer[ref_framei]+mvoffset0,ref_ystride,
+ _frag->buffer[ref_framei]+mvoffset1,ref_ystride,res_buf);
+ }
+ else{
+ oc_frag_recon_inter_mmx(_frag->buffer[dst_framei],dst_ystride,
+ _frag->buffer[ref_framei]+mvoffset0,ref_ystride,res_buf);
+ }
+ }
+
+ _mm_empty();
+}
+
+
+void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
+ int _nfragis,int _dst_frame,int _src_frame,int _pli){
+ const int *fragi;
+ const int *fragi_end;
+ int dst_framei;
+ int dst_ystride;
+ int src_framei;
+ int src_ystride;
+ dst_framei=_state->ref_frame_idx[_dst_frame];
+ src_framei=_state->ref_frame_idx[_src_frame];
+ dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+ src_ystride=_state->ref_frame_bufs[src_framei][_pli].ystride;
+ fragi_end=_fragis+_nfragis;
+ for(fragi=_fragis;fragi<fragi_end;fragi++){
+ oc_fragment *frag = _state->frags+*fragi;
+ loc_blockcopy_mmx (frag->buffer[dst_framei], dst_ystride,
+ frag->buffer[src_framei], src_ystride);
+ }
+ _m_empty();
+}
+
+#endif
+
Property changes on: trunk/theora/lib/dec/x86_vc/mmxstate.c
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Added: trunk/theora/lib/dec/x86_vc/x86int.h
===================================================================
--- trunk/theora/lib/dec/x86_vc/x86int.h (rev 0)
+++ trunk/theora/lib/dec/x86_vc/x86int.h 2008-04-12 01:04:43 UTC (rev 14714)
@@ -0,0 +1,49 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#if !defined(_x86_x86int_vc_H)
+# define _x86_x86int_vc_H (1)
+# include "../../internal.h"
+
+void oc_state_vtable_init_x86(oc_theora_state *_state);
+
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t *_residue);
+
+void oc_frag_recon_inter_mmx(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue);
+
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue);
+
+void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
+ int _nfragis,int _dst_frame,int _src_frame,int _pli);
+
+void oc_restore_fpu_mmx(void);
+
+void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
+
+void oc_idct8x8_mmx(ogg_int16_t _y[64]);
+void oc_idct8x8_10_mmx(ogg_int16_t _y[64]);
+
+void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end);
+
+#endif
Property changes on: trunk/theora/lib/dec/x86_vc/x86int.h
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Added: trunk/theora/lib/dec/x86_vc/x86state.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/x86state.c (rev 0)
+++ trunk/theora/lib/dec/x86_vc/x86state.c 2008-04-12 01:04:43 UTC (rev 14714)
@@ -0,0 +1,41 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#if defined(USE_ASM)
+
+#include "x86int.h"
+#include "../../cpu.h"
+
+void oc_state_vtable_init_x86(oc_theora_state *_state){
+ _state->cpu_flags=oc_cpu_flags_get();
+
+ /* fill with defaults */
+ oc_state_vtable_init_c(_state);
+
+ /* patch MMX functions */
+ if(_state->cpu_flags&OC_CPU_X86_MMX){
+ _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
+ _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
+ _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
+ _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+ _state->opt_vtable.state_frag_copy=oc_state_frag_copy_mmx;
+ _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
+ _state->opt_vtable.state_loop_filter_frag_rows=oc_state_loop_filter_frag_rows_mmx;
+ }
+}
+
+#endif
Property changes on: trunk/theora/lib/dec/x86_vc/x86state.c
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Modified: trunk/theora/lib/enc/dct_decode.c
===================================================================
--- trunk/theora/lib/enc/dct_decode.c 2008-04-11 23:36:00 UTC (rev 14713)
+++ trunk/theora/lib/enc/dct_decode.c 2008-04-12 01:04:43 UTC (rev 14714)
@@ -1309,8 +1309,11 @@
funcs->FilterVert = FilterVert__c;
funcs->FilterHoriz = FilterHoriz__c;
#if defined(USE_ASM)
+ /* TODO: Port the dct for MSC one day. */
+#if !defined (_MSC_VER)
if (cpu_flags & OC_CPU_X86_MMX) {
dsp_mmx_dct_decode_init(funcs);
}
#endif
+#endif
}
Modified: trunk/theora/lib/enc/encoder_idct.c
===================================================================
--- trunk/theora/lib/enc/encoder_idct.c 2008-04-11 23:36:00 UTC (rev 14713)
+++ trunk/theora/lib/enc/encoder_idct.c 2008-04-12 01:04:43 UTC (rev 14714)
@@ -562,8 +562,11 @@
funcs->IDct10 = IDct10__c;
funcs->IDct3 = IDct10__c;
#if defined(USE_ASM)
+ /* TODO: make mmx encoder idct for MSC one day... */
+#if !defined (_MSC_VER)
if (cpu_flags & OC_CPU_X86_MMX) {
dsp_mmx_idct_init(funcs);
}
#endif
+#endif
}
Modified: trunk/theora/lib/internal.h
===================================================================
--- trunk/theora/lib/internal.h 2008-04-11 23:36:00 UTC (rev 14713)
+++ trunk/theora/lib/internal.h 2008-04-12 01:04:43 UTC (rev 14714)
@@ -39,7 +39,8 @@
/*Thank you Microsoft, I know the order of operations.*/
# if defined(_MSC_VER)
-# pragma warning(disable:4554)
+# pragma warning(disable:4554) /* order of operations */
+# pragma warning(disable:4799) /* disable missing EMMS warnings */
# endif
/*This library's version.*/
@@ -501,15 +502,4 @@
oc_state_granule_time_func granule_time;
};
-#if defined(_MSC_VER) && !defined(TH_REALLY_NO_ASSEMBLY)
-# error You are compiling theora without inline assembly.\
- This is probably not what you want. Instead, please either\
- (1) download the assembly .lib binaries or\
- (2) compile them yourself using MinGW, and make Visual Studio\
- link against them.\
- Please seriously consider this before defining TH_REALLY_NO_ASSEMBLY\
- to disable this message and compile without inline assembly.\
- Thank you!
#endif
-
-#endif
Modified: trunk/theora/win32/VS2005/libtheora/libtheora.vcproj
===================================================================
--- trunk/theora/win32/VS2005/libtheora/libtheora.vcproj 2008-04-11 23:36:00 UTC (rev 14713)
+++ trunk/theora/win32/VS2005/libtheora/libtheora.vcproj 2008-04-12 01:04:43 UTC (rev 14714)
@@ -42,7 +42,7 @@
Name="VCCLCompilerTool"
Optimization="0"
AdditionalIncludeDirectories="..\..\..\include;..\..\..\lib;..\..\..\lib\enc;..\..\..\..\libogg\include;"
- PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;"
+ PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;USE_ASM;DEBUG;"
MinimalRebuild="true"
BasicRuntimeChecks="3"
RuntimeLibrary="1"
@@ -129,7 +129,7 @@
EnableIntrinsicFunctions="true"
FavorSizeOrSpeed="1"
AdditionalIncludeDirectories="..\..\..\include;..\..\..\lib;..\..\..\lib\enc;..\..\..\..\libogg\include;"
- PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;"
+ PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;USE_ASM;"
StringPooling="true"
ExceptionHandling="0"
RuntimeLibrary="0"
@@ -221,7 +221,7 @@
EnableIntrinsicFunctions="true"
FavorSizeOrSpeed="1"
AdditionalIncludeDirectories="..\..\..\include;..\..\..\lib;..\..\..\lib\enc;..\..\..\..\libogg\include;"
- PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;"
+ PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;USE_ASM"
StringPooling="true"
ExceptionHandling="0"
RuntimeLibrary="0"
@@ -314,7 +314,7 @@
EnableIntrinsicFunctions="true"
FavorSizeOrSpeed="1"
AdditionalIncludeDirectories="..\..\..\include;..\..\..\lib;..\..\..\lib\enc;..\..\..\..\libogg\include;"
- PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;"
+ PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;USE_ASM"
StringPooling="true"
ExceptionHandling="0"
RuntimeLibrary="0"
@@ -2287,25 +2287,29 @@
Name="x86"
>
<File
- RelativePath="..\..\..\lib\dec\x86\mmxfrag.c"
+ RelativePath="..\..\..\lib\dec\x86_vc\mmxfrag.c"
>
</File>
<File
- RelativePath="..\..\..\lib\dec\x86\mmxidct.c"
+ RelativePath="..\..\..\lib\dec\x86_vc\mmxloopfilter.c"
>
</File>
<File
- RelativePath="..\..\..\lib\dec\x86\mmxstate.c"
+ RelativePath="..\..\..\lib\dec\x86_vc\mmxidct.c"
>
</File>
<File
- RelativePath="..\..\..\lib\dec\x86\x86int.h"
+ RelativePath="..\..\..\lib\dec\x86_vc\mmxstate.c"
>
</File>
<File
- RelativePath="..\..\..\lib\dec\x86\x86state.c"
+ RelativePath="..\..\..\lib\dec\x86_vc\x86int.h"
>
</File>
+ <File
+ RelativePath="..\..\..\lib\dec\x86_vc\x86state.c"
+ >
+ </File>
</Filter>
</Filter>
</Filter>
More information about the commits
mailing list