[xiph-commits] r14722 - trunk/theora/lib/dec/x86_vc

giles at svn.xiph.org giles at svn.xiph.org
Sat Apr 12 23:28:01 PDT 2008


Author: giles
Date: 2008-04-12 23:28:00 -0700 (Sat, 12 Apr 2008)
New Revision: 14722

Modified:
   trunk/theora/lib/dec/x86_vc/mmxfrag.c
   trunk/theora/lib/dec/x86_vc/mmxidct.c
   trunk/theora/lib/dec/x86_vc/mmxloopfilter.c
   trunk/theora/lib/dec/x86_vc/mmxstate.c
   trunk/theora/lib/dec/x86_vc/x86int.h
Log:
Remove duplicate code in the new MSVC inline asm. Patch from Andrew Chew.


Modified: trunk/theora/lib/dec/x86_vc/mmxfrag.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/mmxfrag.c	2008-04-13 06:23:04 UTC (rev 14721)
+++ trunk/theora/lib/dec/x86_vc/mmxfrag.c	2008-04-13 06:28:00 UTC (rev 14722)
@@ -211,220 +211,4 @@
   _asm { emms }
 }
 
-#endif
-
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-    last mod: $Id:
-
- ********************************************************************/
-#include "../../internal.h"
-
-/* ------------------------------------------------------------------------
-  MMX reconstruction fragment routines for Visual Studio.
-  Tested with VS2005. Should compile for VS2003 and VC6 as well.
-
-  Initial implementation 2007 by Nils Pipenbrinck.
-  ---------------------------------------------------------------------*/
-
-#if defined(USE_ASM)
-
-void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
- const ogg_int16_t *_residue){
-  /* ---------------------------------------------------------------------
-  This function does the inter reconstruction step with 8 iterations
-  unrolled. The iteration for each instruction is noted by the #id in the
-  comments (in case you want to reconstruct it)
-  --------------------------------------------------------------------- */
-  _asm{
-    mov       edi, [_residue]     /* load residue ptr     */
-    mov       eax, 0x00800080     /* generate constant    */
-    mov       ebx, [_dst_ystride] /* load dst-stride      */
-    mov       edx, [_dst]         /* load dest pointer    */
-
-    /* unrolled loop begins here */
-
-    movd      mm0, eax            /* load constant        */
-    movq      mm1, [edi+ 8*0]     /* #1 load low residue  */
-    movq      mm2, [edi+ 8*1]     /* #1 load high residue */
-    punpckldq mm0, mm0            /* build constant       */
-    movq      mm3, [edi+ 8*2]     /* #2 load low residue  */
-    movq      mm4, [edi+ 8*3]     /* #2 load high residue */
-    movq      mm5, [edi+ 8*4]     /* #3 load low residue  */
-    movq      mm6, [edi+ 8*5]     /* #3 load high residue */
-    paddsw    mm1, mm0            /* #1 bias low  residue */
-    paddsw    mm2, mm0            /* #1 bias high residue */
-    packuswb  mm1, mm2            /* #1 pack to byte      */
-    paddsw    mm3, mm0            /* #2 bias low  residue */
-    paddsw    mm4, mm0            /* #2 bias high residue */
-    packuswb  mm3, mm4            /* #2 pack to byte      */
-    paddsw    mm5, mm0            /* #3 bias low  residue */
-    paddsw    mm6, mm0            /* #3 bias high residue */
-    packuswb  mm5, mm6            /* #3 pack to byte      */
-    movq      [edx], mm1          /* #1 write row         */
-    movq      [edx + ebx], mm3    /* #2 write row         */
-    movq      [edx + ebx*2], mm5  /* #3 write row         */
-    movq      mm1, [edi+ 8*6]     /* #4 load low residue  */
-    lea       ecx, [ebx + ebx*2]  /* make dst_ystride * 3 */
-    movq      mm2, [edi+ 8*7]     /* #4 load high residue */
-    movq      mm3, [edi+ 8*8]     /* #5 load low residue  */
-    lea       esi, [ebx*4 + ebx]  /* make dst_ystride * 5 */
-    movq      mm4, [edi+ 8*9]     /* #5 load high residue */
-    movq      mm5, [edi+ 8*10]    /* #6 load low residue  */
-    lea       eax, [ecx*2 + ebx]  /* make dst_ystride * 7 */
-    movq      mm6, [edi+ 8*11]    /* #6 load high residue */
-    paddsw    mm1, mm0            /* #4 bias low  residue */
-    paddsw    mm2, mm0            /* #4 bias high residue */
-    packuswb  mm1, mm2            /* #4 pack to byte      */
-    paddsw    mm3, mm0            /* #5 bias low  residue */
-    paddsw    mm4, mm0            /* #5 bias high residue */
-    packuswb  mm3, mm4            /* #5 pack to byte      */
-    paddsw    mm5, mm0            /* #6 bias low  residue */
-    paddsw    mm6, mm0            /* #6 bias high residue */
-    packuswb  mm5, mm6            /* #6 pack to byte      */
-    movq      [edx + ecx], mm1    /* #4 write row         */
-    movq      [edx + ebx*4], mm3  /* #5 write row         */
-    movq      [edx + esi], mm5    /* #6 write row         */
-    movq      mm1, [edi+ 8*12]    /* #7 load low residue  */
-    movq      mm2, [edi+ 8*13]    /* #7 load high residue */
-    movq      mm3, [edi+ 8*14]    /* #8 load low residue  */
-    movq      mm4, [edi+ 8*15]    /* #8 load high residue */
-    paddsw    mm1, mm0            /* #7 bias low  residue */
-    paddsw    mm2, mm0            /* #7 bias high residue */
-    packuswb  mm1, mm2            /* #7 pack to byte      */
-    paddsw    mm3, mm0            /* #8 bias low  residue */
-    paddsw    mm4, mm0            /* #8 bias high residue */
-    packuswb  mm3, mm4            /* #8 pack to byte      */
-    movq      [edx + ecx*2], mm1  /* #7 write row         */
-    movq      [edx + eax], mm3    /* #8 write row         */
-  }
-}
-
-
-
-void oc_frag_recon_inter_mmx (unsigned char *_dst, int _dst_ystride,
- const unsigned char *_src, int _src_ystride, const ogg_int16_t *_residue){
-  /* ---------------------------------------------------------------------
-  This function does the inter reconstruction step with two iterations
-  running in parallel to hide some load-latencies and break the dependency
-  chains. The iteration for each instruction is noted by the #id in the
-  comments (in case you want to reconstruct it)
-  --------------------------------------------------------------------- */
-  _asm{
-    pxor      mm0, mm0          /* generate constant 0 */
-    mov       esi, [_src]
-    mov       edi, [_residue]
-    mov       eax, [_src_ystride]
-    mov       edx, [_dst]
-    mov       ebx, [_dst_ystride]
-    mov       ecx, 4
-
-    align 16
-
-nextchunk:
-    movq      mm3, [esi]        /* #1 load source        */
-    movq      mm1, [edi+0]      /* #1 load residium low  */
-    movq      mm2, [edi+8]      /* #1 load residium high */
-    movq      mm7, [esi+eax]    /* #2 load source        */
-    movq      mm4, mm3          /* #1 get copy of src    */
-    movq      mm5, [edi+16]     /* #2 load residium low  */
-    punpckhbw mm4, mm0          /* #1 expand high source */
-    movq      mm6, [edi+24]     /* #2 load residium high */
-    punpcklbw mm3, mm0          /* #1 expand low  source */
-    paddsw    mm4, mm2          /* #1 add residium high  */
-    movq      mm2, mm7          /* #2 get copy of src    */
-    paddsw    mm3, mm1          /* #1 add residium low   */
-    punpckhbw mm2, mm0          /* #2 expand high source */
-    packuswb  mm3, mm4          /* #1 final row pixels   */
-    punpcklbw mm7, mm0          /* #2 expand low  source */
-    movq      [edx], mm3        /* #1 write row          */
-    paddsw    mm2, mm6          /* #2 add residium high  */
-    add       edi, 32           /* residue += 4          */
-    paddsw    mm7, mm5          /* #2 add residium low   */
-    sub       ecx, 1            /* update loop counter   */
-    packuswb  mm7, mm2          /* #2 final row          */
-    lea       esi, [esi+eax*2]  /* src += stride * 2     */
-    movq      [edx + ebx], mm7  /* #2 write row          */
-    lea       edx, [edx+ebx*2]  /* dst += stride * 2     */
-    jne       nextchunk
-  }
-}
-
-
-void oc_frag_recon_inter2_mmx(unsigned char *_dst,  int _dst_ystride,
- const unsigned char *_src1,  int _src1_ystride, const unsigned char *_src2,
- int _src2_ystride,const ogg_int16_t *_residue){
-  /* ---------------------------------------------------------------------
-  This function does the inter2 reconstruction step.The building of the
-  average is done with a bit-twiddeling trick to avoid excessive register
-  copy work during byte to word conversion.
-
-              average = (a & b) + (((a ^ b) & 0xfe) >> 1);
-
-  (shown for a single byte; it's done with 8 of them at a time)
-
-  Slightly faster than the obvious method using add and shift, but not
-  earthshaking improvement either.
-
-  If anyone comes up with a way that produces bit-identical outputs
-  using the pavgb instruction let me know and I'll do the 3dnow codepath.
-  --------------------------------------------------------------------- */
- _asm{
-   mov        eax, 0xfefefefe
-   mov        esi, [_src1]
-   mov        edi, [_src2]
-   movd       mm1, eax
-   mov        ebx, [_residue]
-   mov        edx, [_dst]
-   mov        eax, [_dst_ystride]
-   punpckldq  mm1, mm1            /* replicate lsb32     */
-   mov        ecx, 8              /* init loop counter   */
-   pxor       mm0, mm0            /* constant zero       */
-   sub        edx, eax            /* dst -= dst_stride   */
-
-   align      16
-
-nextrow:
-   movq       mm2,  [esi]         /* load source1        */
-   movq       mm3,  [edi]         /* load source2        */
-   movq       mm5,  [ebx + 0]     /* load lower residue  */
-   movq       mm6,  [ebx + 8]     /* load higer residue  */
-   add        esi,  _src1_ystride /* src1 += src1_stride */
-   add        edi,  _src2_ystride /* src2 += src1_stride */
-   movq       mm4,  mm2           /* get copy of source1 */
-   pand       mm2,  mm3           /* s1 & s2 (avg part)  */
-   pxor       mm3,  mm4           /* s1 ^ s2 (avg part)  */
-   add        ebx,  16            /* residue++           */
-   pand       mm3,  mm1           /* mask out low bits   */
-   psrlq      mm3,  1             /* shift xor avg-part  */
-   paddd      mm3,  mm2           /* build final average */
-   add        edx,  eax           /* dst += dst_stride   */
-   movq       mm2,  mm3           /* get copy of average */
-   punpckhbw  mm3,  mm0           /* average high        */
-   punpcklbw  mm2,  mm0           /* average low         */
-   paddsw     mm3,  mm6           /* high + residue      */
-   paddsw     mm2,  mm5           /* low  + residue      */
-   sub        ecx,  1             /* update loop counter */
-   packuswb   mm2,  mm3           /* pack and saturate   */
-   movq       [edx], mm2          /* write row           */
-   jne        nextrow
- }
-}
-
-void oc_restore_fpu_mmx(void){
-  _asm { emms }
-}
-
-#endif
-
+#endif
\ No newline at end of file

Modified: trunk/theora/lib/dec/x86_vc/mmxidct.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/mmxidct.c	2008-04-13 06:23:04 UTC (rev 14721)
+++ trunk/theora/lib/dec/x86_vc/mmxidct.c	2008-04-13 06:28:00 UTC (rev 14722)
@@ -1003,1012 +1003,4 @@
   }
 }
 
-#endif
-
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-    last mod: $Id: 
-
- ********************************************************************/
-
-/* -------------------------------------------------------------------
-  MMX based IDCT for the theora codec.
-
-  Originally written by Rudolf Marek, based on code from On2's VP3.
-  Converted to Visual Studio inline assembly by Nils Pipenbrinck.
-
-  ---------------------------------------------------------------------*/
-#if defined(USE_ASM)
-
-#include <ogg/ogg.h>
-#include "../dct.h"
-#include "../idct.h"
-#include "x86int.h"
-
-/*A table of constants used by the MMX routines.*/
-static const __declspec(align(16)) ogg_uint16_t 
- OC_IDCT_CONSTS[(7+1)*4]={
-  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
-  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
-  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
-  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
-  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
-  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
-  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
-  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
-  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
-  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
-  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
-  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
-  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
-  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
-      8,    8,    8,    8
-};
-
-
-void oc_idct8x8_10_mmx(ogg_int16_t _y[64]){
-  _asm {
-    mov     edx, [_y]
-    mov     eax, offset OC_IDCT_CONSTS             
-    movq    mm2, [edx + 30H]             
-    movq    mm6, [eax + 10H]             
-    movq    mm4, mm2                               
-    movq    mm7, [edx + 18H]             
-    pmulhw  mm4, mm6                               
-    movq    mm1, [eax + 20H]             
-    pmulhw  mm6, mm7                               
-    movq    mm5, mm1                               
-    pmulhw  mm1, mm2                               
-    movq    mm3, [edx + 10H]             
-    pmulhw  mm5, mm7                               
-    movq    mm0, [eax]                   
-    paddw   mm4, mm2                               
-    paddw   mm6, mm7                               
-    paddw   mm2, mm1                               
-    movq    mm1, [edx + 38H]             
-    paddw   mm7, mm5                               
-    movq    mm5, mm0                               
-    pmulhw  mm0, mm3                               
-    paddw   mm4, mm7                               
-    pmulhw  mm5, mm1                               
-    movq    mm7, [eax + 30H]             
-    psubw   mm6, mm2                               
-    paddw   mm0, mm3                               
-    pmulhw  mm3, mm7                               
-    movq    mm2, [edx + 20H]             
-    pmulhw  mm7, mm1                               
-    paddw   mm5, mm1                               
-    movq    mm1, mm2                               
-    pmulhw  mm2, [eax + 08H]             
-    psubw   mm3, mm5                               
-    movq    mm5, [edx + 28H]             
-    paddw   mm0, mm7                               
-    movq    mm7, mm5                               
-    psubw   mm0, mm4                               
-    pmulhw  mm5, [eax + 08H]             
-    paddw   mm2, mm1                               
-    pmulhw  mm1, [eax + 28H]             
-    paddw   mm4, mm4                               
-    paddw   mm4, mm0                               
-    psubw   mm3, mm6                               
-    paddw   mm5, mm7                               
-    paddw   mm6, mm6                               
-    pmulhw  mm7, [eax + 28H]             
-    paddw   mm6, mm3                               
-    movq    [edx + 10H], mm4             
-    psubw   mm1, mm5                               
-    movq    mm4, [eax + 18H]             
-    movq    mm5, mm3                               
-    pmulhw  mm3, mm4                               
-    paddw   mm7, mm2                               
-    movq    [edx + 20H], mm6             
-    movq    mm2, mm0                               
-    movq    mm6, [edx]                   
-    pmulhw  mm0, mm4                               
-    paddw   mm5, mm3                               
-    movq    mm3, [edx + 08H]             
-    psubw   mm5, mm1                               
-    paddw   mm2, mm0                               
-    psubw   mm6, mm3                               
-    movq    mm0, mm6                               
-    pmulhw  mm6, mm4                               
-    paddw   mm3, mm3                               
-    paddw   mm1, mm1                               
-    paddw   mm3, mm0                               
-    paddw   mm1, mm5                               
-    pmulhw  mm4, mm3                               
-    paddw   mm6, mm0                               
-    psubw   mm6, mm2                               
-    paddw   mm2, mm2                               
-    movq    mm0, [edx + 10H]             
-    paddw   mm2, mm6                               
-    paddw   mm4, mm3                               
-    psubw   mm2, mm1                               
-    movq    mm3, [edx + 20H]             
-    psubw   mm4, mm7                               
-    paddw   mm1, mm1                               
-    paddw   mm7, mm7                               
-    paddw   mm1, mm2                               
-    paddw   mm7, mm4                               
-    psubw   mm4, mm3                               
-    paddw   mm3, mm3                               
-    psubw   mm6, mm5                               
-    paddw   mm5, mm5                               
-    paddw   mm3, mm4                               
-    paddw   mm5, mm6                               
-    psubw   mm7, mm0                               
-    paddw   mm0, mm0                               
-    movq    [edx + 10H], mm1             
-    paddw   mm0, mm7                               
-    movq    mm1, mm4                               
-    punpcklwd mm4, mm5                             
-    movq    [edx], mm0                   
-    punpckhwd mm1, mm5                             
-    movq    mm0, mm6                               
-    punpcklwd mm6, mm7                             
-    movq    mm5, mm4                               
-    punpckldq mm4, mm6                             
-    punpckhdq mm5, mm6                             
-    movq    mm6, mm1                               
-    movq    [edx + 08H], mm4             
-    punpckhwd mm0, mm7                             
-    movq    [edx + 18H], mm5             
-    punpckhdq mm6, mm0                             
-    movq    mm4, [edx]                   
-    punpckldq mm1, mm0                             
-    movq    mm5, [edx + 10H]             
-    movq    mm0, mm4                               
-    movq    [edx + 38H], mm6             
-    punpcklwd mm0, mm5                             
-    movq    [edx + 28H], mm1             
-    punpckhwd mm4, mm5                             
-    movq    mm5, mm2                               
-    punpcklwd mm2, mm3                             
-    movq    mm1, mm0                               
-    punpckldq mm0, mm2                             
-    punpckhdq mm1, mm2                             
-    movq    mm2, mm4                               
-    movq    [edx], mm0                   
-    punpckhwd mm5, mm3                             
-    movq    [edx + 10H], mm1             
-    punpckhdq mm4, mm5                             
-    punpckldq mm2, mm5                             
-    movq    [edx + 30H], mm4             
-    movq    [edx + 20H], mm2             
-    movq    mm2, [edx + 70H]             
-    movq    mm6, [eax + 10H]             
-    movq    mm4, mm2                               
-    movq    mm7, [edx + 58H]             
-    pmulhw  mm4, mm6                               
-    movq    mm1, [eax + 20H]             
-    pmulhw  mm6, mm7                               
-    movq    mm5, mm1                               
-    pmulhw  mm1, mm2                               
-    movq    mm3, [edx + 50H]             
-    pmulhw  mm5, mm7                               
-    movq    mm0, [eax]                   
-    paddw   mm4, mm2                               
-    paddw   mm6, mm7                               
-    paddw   mm2, mm1                               
-    movq    mm1, [edx + 78H]             
-    paddw   mm7, mm5                               
-    movq    mm5, mm0                               
-    pmulhw  mm0, mm3                               
-    paddw   mm4, mm7                               
-    pmulhw  mm5, mm1                               
-    movq    mm7, [eax + 30H]             
-    psubw   mm6, mm2                               
-    paddw   mm0, mm3                               
-    pmulhw  mm3, mm7                               
-    movq    mm2, [edx + 60H]             
-    pmulhw  mm7, mm1                               
-    paddw   mm5, mm1                               
-    movq    mm1, mm2                               
-    pmulhw  mm2, [eax + 08H]             
-    psubw   mm3, mm5                               
-    movq    mm5, [edx + 68H]             
-    paddw   mm0, mm7                               
-    movq    mm7, mm5                               
-    psubw   mm0, mm4                               
-    pmulhw  mm5, [eax + 08H]             
-    paddw   mm2, mm1                               
-    pmulhw  mm1, [eax + 28H]             
-    paddw   mm4, mm4                               
-    paddw   mm4, mm0                               
-    psubw   mm3, mm6                               
-    paddw   mm5, mm7                               
-    paddw   mm6, mm6                               
-    pmulhw  mm7, [eax + 28H]             
-    paddw   mm6, mm3                               
-    movq    [edx + 50H], mm4             
-    psubw   mm1, mm5                               
-    movq    mm4, [eax + 18H]             
-    movq    mm5, mm3                               
-    pmulhw  mm3, mm4                               
-    paddw   mm7, mm2                               
-    movq    [edx + 60H], mm6             
-    movq    mm2, mm0                               
-    movq    mm6, [edx + 40H]             
-    pmulhw  mm0, mm4                               
-    paddw   mm5, mm3                               
-    movq    mm3, [edx + 48H]             
-    psubw   mm5, mm1                               
-    paddw   mm2, mm0                               
-    psubw   mm6, mm3                               
-    movq    mm0, mm6                               
-    pmulhw  mm6, mm4                               
-    paddw   mm3, mm3                               
-    paddw   mm1, mm1                               
-    paddw   mm3, mm0                               
-    paddw   mm1, mm5                               
-    pmulhw  mm4, mm3                               
-    paddw   mm6, mm0                               
-    psubw   mm6, mm2                               
-    paddw   mm2, mm2                               
-    movq    mm0, [edx + 50H]             
-    paddw   mm2, mm6                               
-    paddw   mm4, mm3                               
-    psubw   mm2, mm1                               
-    movq    mm3, [edx + 60H]             
-    psubw   mm4, mm7                               
-    paddw   mm1, mm1                               
-    paddw   mm7, mm7                               
-    paddw   mm1, mm2                               
-    paddw   mm7, mm4                               
-    psubw   mm4, mm3                               
-    paddw   mm3, mm3                               
-    psubw   mm6, mm5                               
-    paddw   mm5, mm5                               
-    paddw   mm3, mm4                               
-    paddw   mm5, mm6                               
-    psubw   mm7, mm0                               
-    paddw   mm0, mm0                               
-    movq    [edx + 50H], mm1             
-    paddw   mm0, mm7                               
-    movq    mm1, mm4                               
-    punpcklwd mm4, mm5                             
-    movq    [edx + 40H], mm0             
-    punpckhwd mm1, mm5                             
-    movq    mm0, mm6                               
-    punpcklwd mm6, mm7                             
-    movq    mm5, mm4                               
-    punpckldq mm4, mm6                             
-    punpckhdq mm5, mm6                             
-    movq    mm6, mm1                               
-    movq    [edx + 48H], mm4             
-    punpckhwd mm0, mm7                             
-    movq    [edx + 58H], mm5             
-    punpckhdq mm6, mm0                             
-    movq    mm4, [edx + 40H]             
-    punpckldq mm1, mm0                             
-    movq    mm5, [edx + 50H]             
-    movq    mm0, mm4                               
-    movq    [edx + 78H], mm6             
-    punpcklwd mm0, mm5                             
-    movq    [edx + 68H], mm1             
-    punpckhwd mm4, mm5                             
-    movq    mm5, mm2                               
-    punpcklwd mm2, mm3                             
-    movq    mm1, mm0                               
-    punpckldq mm0, mm2                             
-    punpckhdq mm1, mm2                             
-    movq    mm2, mm4                               
-    movq    [edx + 40H], mm0             
-    punpckhwd mm5, mm3                             
-    movq    [edx + 50H], mm1             
-    punpckhdq mm4, mm5                             
-    punpckldq mm2, mm5                             
-    movq    [edx + 70H], mm4             
-    movq    [edx + 60H], mm2             
-    movq    mm2, [edx + 30H]             
-    movq    mm6, [eax + 10H]             
-    movq    mm4, mm2                               
-    movq    mm7, [edx + 50H]             
-    pmulhw  mm4, mm6                               
-    movq    mm1, [eax + 20H]             
-    pmulhw  mm6, mm7                               
-    movq    mm5, mm1                               
-    pmulhw  mm1, mm2                               
-    movq    mm3, [edx + 10H]             
-    pmulhw  mm5, mm7                               
-    movq    mm0, [eax]                   
-    paddw   mm4, mm2                               
-    paddw   mm6, mm7                               
-    paddw   mm2, mm1                               
-    movq    mm1, [edx + 70H]             
-    paddw   mm7, mm5                               
-    movq    mm5, mm0                               
-    pmulhw  mm0, mm3                               
-    paddw   mm4, mm7                               
-    pmulhw  mm5, mm1                               
-    movq    mm7, [eax + 30H]             
-    psubw   mm6, mm2                               
-    paddw   mm0, mm3                               
-    pmulhw  mm3, mm7                               
-    movq    mm2, [edx + 20H]             
-    pmulhw  mm7, mm1                               
-    paddw   mm5, mm1                               
-    movq    mm1, mm2                               
-    pmulhw  mm2, [eax + 08H]             
-    psubw   mm3, mm5                               
-    movq    mm5, [edx + 60H]             
-    paddw   mm0, mm7                               
-    movq    mm7, mm5                               
-    psubw   mm0, mm4                               
-    pmulhw  mm5, [eax + 08H]             
-    paddw   mm2, mm1                               
-    pmulhw  mm1, [eax + 28H]             
-    paddw   mm4, mm4                               
-    paddw   mm4, mm0                               
-    psubw   mm3, mm6                               
-    paddw   mm5, mm7                               
-    paddw   mm6, mm6                               
-    pmulhw  mm7, [eax + 28H]             
-    paddw   mm6, mm3                               
-    movq    [edx + 10H], mm4             
-    psubw   mm1, mm5                               
-    movq    mm4, [eax + 18H]             
-    movq    mm5, mm3                               
-    pmulhw  mm3, mm4                               
-    paddw   mm7, mm2                               
-    movq    [edx + 20H], mm6             
-    movq    mm2, mm0                               
-    movq    mm6, [edx]                   
-    pmulhw  mm0, mm4                               
-    paddw   mm5, mm3                               
-    movq    mm3, [edx + 40H]             
-    psubw   mm5, mm1                               
-    paddw   mm2, mm0                               
-    psubw   mm6, mm3                               
-    movq    mm0, mm6                               
-    pmulhw  mm6, mm4                               
-    paddw   mm3, mm3                               
-    paddw   mm1, mm1                               
-    paddw   mm3, mm0                               
-    paddw   mm1, mm5                               
-    pmulhw  mm4, mm3                               
-    paddw   mm6, mm0                               
-    psubw   mm6, mm2                               
-    paddw   mm2, mm2                               
-    movq    mm0, [edx + 10H]             
-    paddw   mm2, mm6                               
-    paddw   mm4, mm3                               
-    psubw   mm2, mm1                               
-    paddw   mm2, [eax + 38H]             
-    paddw   mm1, mm1                               
-    paddw   mm1, mm2                               
-    psraw   mm2, 4                                 
-    psubw   mm4, mm7                               
-    psraw   mm1, 4                                 
-    movq    mm3, [edx + 20H]             
-    paddw   mm7, mm7                               
-    movq    [edx + 20H], mm2             
-    paddw   mm7, mm4                               
-    movq    [edx + 10H], mm1             
-    psubw   mm4, mm3                               
-    paddw   mm4, [eax + 38H]             
-    paddw   mm3, mm3                               
-    paddw   mm3, mm4                               
-    psraw   mm4, 4                                 
-    psubw   mm6, mm5                               
-    psraw   mm3, 4                                 
-    paddw   mm6, [eax + 38H]             
-    paddw   mm5, mm5                               
-    paddw   mm5, mm6                               
-    psraw   mm6, 4                                 
-    movq    [edx + 40H], mm4             
-    psraw   mm5, 4                                 
-    movq    [edx + 30H], mm3             
-    psubw   mm7, mm0                               
-    paddw   mm7, [eax + 38H]             
-    paddw   mm0, mm0                               
-    paddw   mm0, mm7                               
-    psraw   mm7, 4                                 
-    movq    [edx + 60H], mm6             
-    psraw   mm0, 4                                 
-    movq    [edx + 50H], mm5             
-    movq    [edx + 70H], mm7             
-    movq    [edx], mm0                   
-    movq    mm2, [edx + 38H]             
-    movq    mm6, [eax + 10H]             
-    movq    mm4, mm2                               
-    movq    mm7, [edx + 58H]             
-    pmulhw  mm4, mm6                               
-    movq    mm1, [eax + 20H]             
-    pmulhw  mm6, mm7                               
-    movq    mm5, mm1                               
-    pmulhw  mm1, mm2                               
-    movq    mm3, [edx + 18H]             
-    pmulhw  mm5, mm7                               
-    movq    mm0, [eax]                   
-    paddw   mm4, mm2                               
-    paddw   mm6, mm7                               
-    paddw   mm2, mm1                               
-    movq    mm1, [edx + 78H]             
-    paddw   mm7, mm5                               
-    movq    mm5, mm0                               
-    pmulhw  mm0, mm3                               
-    paddw   mm4, mm7                               
-    pmulhw  mm5, mm1                               
-    movq    mm7, [eax + 30H]             
-    psubw   mm6, mm2                               
-    paddw   mm0, mm3                               
-    pmulhw  mm3, mm7                               
-    movq    mm2, [edx + 28H]             
-    pmulhw  mm7, mm1                               
-    paddw   mm5, mm1                               
-    movq    mm1, mm2                               
-    pmulhw  mm2, [eax + 08H]             
-    psubw   mm3, mm5                               
-    movq    mm5, [edx + 68H]             
-    paddw   mm0, mm7                               
-    movq    mm7, mm5                               
-    psubw   mm0, mm4                               
-    pmulhw  mm5, [eax + 08H]             
-    paddw   mm2, mm1                               
-    pmulhw  mm1, [eax + 28H]             
-    paddw   mm4, mm4                               
-    paddw   mm4, mm0                               
-    psubw   mm3, mm6                               
-    paddw   mm5, mm7                               
-    paddw   mm6, mm6                               
-    pmulhw  mm7, [eax + 28H]             
-    paddw   mm6, mm3                               
-    movq    [edx + 18H], mm4             
-    psubw   mm1, mm5                               
-    movq    mm4, [eax + 18H]             
-    movq    mm5, mm3                               
-    pmulhw  mm3, mm4                               
-    paddw   mm7, mm2                               
-    movq    [edx + 28H], mm6             
-    movq    mm2, mm0                               
-    movq    mm6, [edx + 08H]             
-    pmulhw  mm0, mm4                               
-    paddw   mm5, mm3                               
-    movq    mm3, [edx + 48H]             
-    psubw   mm5, mm1                               
-    paddw   mm2, mm0                               
-    psubw   mm6, mm3                               
-    movq    mm0, mm6                               
-    pmulhw  mm6, mm4                               
-    paddw   mm3, mm3                               
-    paddw   mm1, mm1                               
-    paddw   mm3, mm0                               
-    paddw   mm1, mm5                               
-    pmulhw  mm4, mm3                               
-    paddw   mm6, mm0                               
-    psubw   mm6, mm2                               
-    paddw   mm2, mm2                               
-    movq    mm0, [edx + 18H]             
-    paddw   mm2, mm6                               
-    paddw   mm4, mm3                               
-    psubw   mm2, mm1                               
-    paddw   mm2, [eax + 38H]             
-    paddw   mm1, mm1                               
-    paddw   mm1, mm2                               
-    psraw   mm2, 4                                 
-    psubw   mm4, mm7                               
-    psraw   mm1, 4                                 
-    movq    mm3, [edx + 28H]             
-    paddw   mm7, mm7                               
-    movq    [edx + 28H], mm2             
-    paddw   mm7, mm4                               
-    movq    [edx + 18H], mm1             
-    psubw   mm4, mm3                               
-    paddw   mm4, [eax + 38H]             
-    paddw   mm3, mm3                               
-    paddw   mm3, mm4                               
-    psraw   mm4, 4                                 
-    psubw   mm6, mm5                               
-    psraw   mm3, 4                                 
-    paddw   mm6, [eax + 38H]             
-    paddw   mm5, mm5                               
-    paddw   mm5, mm6                               
-    psraw   mm6, 4                                 
-    movq    [edx + 48H], mm4             
-    psraw   mm5, 4                                 
-    movq    [edx + 38H], mm3             
-    psubw   mm7, mm0                               
-    paddw   mm7, [eax + 38H]             
-    paddw   mm0, mm0                               
-    paddw   mm0, mm7                               
-    psraw   mm7, 4                                 
-    movq    [edx + 68H], mm6             
-    psraw   mm0, 4                                 
-    movq    [edx + 58H], mm5             
-    movq    [edx + 78H], mm7             
-    movq    [edx + 08H], mm0             
-    /* emms  */
-  }
-}
-
-
-void oc_idct8x8_mmx(ogg_int16_t _y[64]){
-  _asm {
-    mov     edx, [_y]
-    mov     eax, offset OC_IDCT_CONSTS             
-    movq    mm2, [edx + 30H]             
-    movq    mm6, [eax + 10H]             
-    movq    mm4, mm2                               
-    movq    mm7, [edx + 18H]             
-    pmulhw  mm4, mm6                               
-    movq    mm1, [eax + 20H]             
-    pmulhw  mm6, mm7                               
-    movq    mm5, mm1                               
-    pmulhw  mm1, mm2                               
-    movq    mm3, [edx + 10H]             
-    pmulhw  mm5, mm7                               
-    movq    mm0, [eax]                   
-    paddw   mm4, mm2                               
-    paddw   mm6, mm7                               
-    paddw   mm2, mm1                               
-    movq    mm1, [edx + 38H]             
-    paddw   mm7, mm5                               
-    movq    mm5, mm0                               
-    pmulhw  mm0, mm3                               
-    paddw   mm4, mm7                               
-    pmulhw  mm5, mm1                               
-    movq    mm7, [eax + 30H]             
-    psubw   mm6, mm2                               
-    paddw   mm0, mm3                               
-    pmulhw  mm3, mm7                               
-    movq    mm2, [edx + 20H]             
-    pmulhw  mm7, mm1                               
-    paddw   mm5, mm1                               
-    movq    mm1, mm2                               
-    pmulhw  mm2, [eax + 08H]             
-    psubw   mm3, mm5                               
-    movq    mm5, [edx + 28H]             
-    paddw   mm0, mm7                               
-    movq    mm7, mm5                               
-    psubw   mm0, mm4                               
-    pmulhw  mm5, [eax + 08H]             
-    paddw   mm2, mm1                               
-    pmulhw  mm1, [eax + 28H]             
-    paddw   mm4, mm4                               
-    paddw   mm4, mm0                               
-    psubw   mm3, mm6                               
-    paddw   mm5, mm7                               
-    paddw   mm6, mm6                               
-    pmulhw  mm7, [eax + 28H]             
-    paddw   mm6, mm3                               
-    movq    [edx + 10H], mm4             
-    psubw   mm1, mm5                               
-    movq    mm4, [eax + 18H]             
-    movq    mm5, mm3                               
-    pmulhw  mm3, mm4                               
-    paddw   mm7, mm2                               
-    movq    [edx + 20H], mm6             
-    movq    mm2, mm0                               
-    movq    mm6, [edx]                   
-    pmulhw  mm0, mm4                               
-    paddw   mm5, mm3                               
-    movq    mm3, [edx + 08H]             
-    psubw   mm5, mm1                               
-    paddw   mm2, mm0                               
-    psubw   mm6, mm3                               
-    movq    mm0, mm6                               
-    pmulhw  mm6, mm4                               
-    paddw   mm3, mm3                               
-    paddw   mm1, mm1                               
-    paddw   mm3, mm0                               
-    paddw   mm1, mm5                               
-    pmulhw  mm4, mm3                               
-    paddw   mm6, mm0                               
-    psubw   mm6, mm2                               
-    paddw   mm2, mm2                               
-    movq    mm0, [edx + 10H]             
-    paddw   mm2, mm6                               
-    paddw   mm4, mm3                               
-    psubw   mm2, mm1                               
-    movq    mm3, [edx + 20H]             
-    psubw   mm4, mm7                               
-    paddw   mm1, mm1                               
-    paddw   mm7, mm7                               
-    paddw   mm1, mm2                               
-    paddw   mm7, mm4                               
-    psubw   mm4, mm3                               
-    paddw   mm3, mm3                               
-    psubw   mm6, mm5                               
-    paddw   mm5, mm5                               
-    paddw   mm3, mm4                               
-    paddw   mm5, mm6                               
-    psubw   mm7, mm0                               
-    paddw   mm0, mm0                               
-    movq    [edx + 10H], mm1             
-    paddw   mm0, mm7                               
-    movq    mm1, mm4                               
-    punpcklwd mm4, mm5                             
-    movq    [edx], mm0                   
-    punpckhwd mm1, mm5                             
-    movq    mm0, mm6                               
-    punpcklwd mm6, mm7                             
-    movq    mm5, mm4                               
-    punpckldq mm4, mm6                             
-    punpckhdq mm5, mm6                             
-    movq    mm6, mm1                               
-    movq    [edx + 08H], mm4             
-    punpckhwd mm0, mm7                             
-    movq    [edx + 18H], mm5             
-    punpckhdq mm6, mm0                             
-    movq    mm4, [edx]                   
-    punpckldq mm1, mm0                             
-    movq    mm5, [edx + 10H]             
-    movq    mm0, mm4                               
-    movq    [edx + 38H], mm6             
-    punpcklwd mm0, mm5                             
-    movq    [edx + 28H], mm1             
-    punpckhwd mm4, mm5                             
-    movq    mm5, mm2                               
-    punpcklwd mm2, mm3                             
-    movq    mm1, mm0                               
-    punpckldq mm0, mm2                             
-    punpckhdq mm1, mm2                             
-    movq    mm2, mm4                               
-    movq    [edx], mm0                   
-    punpckhwd mm5, mm3                             
-    movq    [edx + 10H], mm1             
-    punpckhdq mm4, mm5                             
-    punpckldq mm2, mm5                             
-    movq    [edx + 30H], mm4             
-    movq    [edx + 20H], mm2             
-    movq    mm2, [edx + 70H]             
-    movq    mm6, [eax + 10H]             
-    movq    mm4, mm2                               
-    movq    mm7, [edx + 58H]             
-    pmulhw  mm4, mm6                               
-    movq    mm1, [eax + 20H]             
-    pmulhw  mm6, mm7                               
-    movq    mm5, mm1                               
-    pmulhw  mm1, mm2                               
-    movq    mm3, [edx + 50H]             
-    pmulhw  mm5, mm7                               
-    movq    mm0, [eax]                   
-    paddw   mm4, mm2                               
-    paddw   mm6, mm7                               
-    paddw   mm2, mm1                               
-    movq    mm1, [edx + 78H]             
-    paddw   mm7, mm5                               
-    movq    mm5, mm0                               
-    pmulhw  mm0, mm3                               
-    paddw   mm4, mm7                               
-    pmulhw  mm5, mm1                               
-    movq    mm7, [eax + 30H]             
-    psubw   mm6, mm2                               
-    paddw   mm0, mm3                               
-    pmulhw  mm3, mm7                               
-    movq    mm2, [edx + 60H]             
-    pmulhw  mm7, mm1                               
-    paddw   mm5, mm1                               
-    movq    mm1, mm2                               
-    pmulhw  mm2, [eax + 08H]             
-    psubw   mm3, mm5                               
-    movq    mm5, [edx + 68H]             
-    paddw   mm0, mm7                               
-    movq    mm7, mm5                               
-    psubw   mm0, mm4                               
-    pmulhw  mm5, [eax + 08H]             
-    paddw   mm2, mm1                               
-    pmulhw  mm1, [eax + 28H]             
-    paddw   mm4, mm4                               
-    paddw   mm4, mm0                               
-    psubw   mm3, mm6                               
-    paddw   mm5, mm7                               
-    paddw   mm6, mm6                               
-    pmulhw  mm7, [eax + 28H]             
-    paddw   mm6, mm3                               
-    movq    [edx + 50H], mm4             
-    psubw   mm1, mm5                               
-    movq    mm4, [eax + 18H]             
-    movq    mm5, mm3                               
-    pmulhw  mm3, mm4                               
-    paddw   mm7, mm2                               
-    movq    [edx + 60H], mm6             
-    movq    mm2, mm0                               
-    movq    mm6, [edx + 40H]             
-    pmulhw  mm0, mm4                               
-    paddw   mm5, mm3                               
-    movq    mm3, [edx + 48H]             
-    psubw   mm5, mm1                               
-    paddw   mm2, mm0                               
-    psubw   mm6, mm3                               
-    movq    mm0, mm6                               
-    pmulhw  mm6, mm4                               
-    paddw   mm3, mm3                               
-    paddw   mm1, mm1                               
-    paddw   mm3, mm0                               
-    paddw   mm1, mm5                               
-    pmulhw  mm4, mm3                               
-    paddw   mm6, mm0                               
-    psubw   mm6, mm2                               
-    paddw   mm2, mm2                               
-    movq    mm0, [edx + 50H]             
-    paddw   mm2, mm6                               
-    paddw   mm4, mm3                               
-    psubw   mm2, mm1                               
-    movq    mm3, [edx + 60H]             
-    psubw   mm4, mm7                               
-    paddw   mm1, mm1                               
-    paddw   mm7, mm7                               
-    paddw   mm1, mm2                               
-    paddw   mm7, mm4                               
-    psubw   mm4, mm3                               
-    paddw   mm3, mm3                               
-    psubw   mm6, mm5                               
-    paddw   mm5, mm5                               
-    paddw   mm3, mm4                               
-    paddw   mm5, mm6                               
-    psubw   mm7, mm0                               
-    paddw   mm0, mm0                               
-    movq    [edx + 50H], mm1             
-    paddw   mm0, mm7                               
-    movq    mm1, mm4                               
-    punpcklwd mm4, mm5                             
-    movq    [edx + 40H], mm0             
-    punpckhwd mm1, mm5                             
-    movq    mm0, mm6                               
-    punpcklwd mm6, mm7                             
-    movq    mm5, mm4                               
-    punpckldq mm4, mm6                             
-    punpckhdq mm5, mm6                             
-    movq    mm6, mm1                               
-    movq    [edx + 48H], mm4             
-    punpckhwd mm0, mm7                             
-    movq    [edx + 58H], mm5             
-    punpckhdq mm6, mm0                             
-    movq    mm4, [edx + 40H]             
-    punpckldq mm1, mm0                             
-    movq    mm5, [edx + 50H]             
-    movq    mm0, mm4                               
-    movq    [edx + 78H], mm6             
-    punpcklwd mm0, mm5                             
-    movq    [edx + 68H], mm1             
-    punpckhwd mm4, mm5                             
-    movq    mm5, mm2                               
-    punpcklwd mm2, mm3                             
-    movq    mm1, mm0                               
-    punpckldq mm0, mm2                             
-    punpckhdq mm1, mm2                             
-    movq    mm2, mm4                               
-    movq    [edx + 40H], mm0             
-    punpckhwd mm5, mm3                             
-    movq    [edx + 50H], mm1             
-    punpckhdq mm4, mm5                             
-    punpckldq mm2, mm5                             
-    movq    [edx + 70H], mm4             
-    movq    [edx + 60H], mm2             
-    movq    mm2, [edx + 30H]             
-    movq    mm6, [eax + 10H]             
-    movq    mm4, mm2                               
-    movq    mm7, [edx + 50H]             
-    pmulhw  mm4, mm6                               
-    movq    mm1, [eax + 20H]             
-    pmulhw  mm6, mm7                               
-    movq    mm5, mm1                               
-    pmulhw  mm1, mm2                               
-    movq    mm3, [edx + 10H]             
-    pmulhw  mm5, mm7                               
-    movq    mm0, [eax]                   
-    paddw   mm4, mm2                               
-    paddw   mm6, mm7                               
-    paddw   mm2, mm1                               
-    movq    mm1, [edx + 70H]             
-    paddw   mm7, mm5                               
-    movq    mm5, mm0                               
-    pmulhw  mm0, mm3                               
-    paddw   mm4, mm7                               
-    pmulhw  mm5, mm1                               
-    movq    mm7, [eax + 30H]             
-    psubw   mm6, mm2                               
-    paddw   mm0, mm3                               
-    pmulhw  mm3, mm7                               
-    movq    mm2, [edx + 20H]             
-    pmulhw  mm7, mm1                               
-    paddw   mm5, mm1                               
-    movq    mm1, mm2                               
-    pmulhw  mm2, [eax + 08H]             
-    psubw   mm3, mm5                               
-    movq    mm5, [edx + 60H]             
-    paddw   mm0, mm7                               
-    movq    mm7, mm5                               
-    psubw   mm0, mm4                               
-    pmulhw  mm5, [eax + 08H]             
-    paddw   mm2, mm1                               
-    pmulhw  mm1, [eax + 28H]             
-    paddw   mm4, mm4                               
-    paddw   mm4, mm0                               
-    psubw   mm3, mm6                               
-    paddw   mm5, mm7                               
-    paddw   mm6, mm6                               
-    pmulhw  mm7, [eax + 28H]             
-    paddw   mm6, mm3                               
-    movq    [edx + 10H], mm4             
-    psubw   mm1, mm5                               
-    movq    mm4, [eax + 18H]             
-    movq    mm5, mm3                               
-    pmulhw  mm3, mm4                               
-    paddw   mm7, mm2                               
-    movq    [edx + 20H], mm6             
-    movq    mm2, mm0                               
-    movq    mm6, [edx]                   
-    pmulhw  mm0, mm4                               
-    paddw   mm5, mm3                               
-    movq    mm3, [edx + 40H]             
-    psubw   mm5, mm1                               
-    paddw   mm2, mm0                               
-    psubw   mm6, mm3                               
-    movq    mm0, mm6                               
-    pmulhw  mm6, mm4                               
-    paddw   mm3, mm3                               
-    paddw   mm1, mm1                               
-    paddw   mm3, mm0                               
-    paddw   mm1, mm5                               
-    pmulhw  mm4, mm3                               
-    paddw   mm6, mm0                               
-    psubw   mm6, mm2                               
-    paddw   mm2, mm2                               
-    movq    mm0, [edx + 10H]             
-    paddw   mm2, mm6                               
-    paddw   mm4, mm3                               
-    psubw   mm2, mm1                               
-    paddw   mm2, [eax + 38H]             
-    paddw   mm1, mm1                               
-    paddw   mm1, mm2                               
-    psraw   mm2, 4                                 
-    psubw   mm4, mm7                               
-    psraw   mm1, 4                                 
-    movq    mm3, [edx + 20H]             
-    paddw   mm7, mm7                               
-    movq    [edx + 20H], mm2             
-    paddw   mm7, mm4                               
-    movq    [edx + 10H], mm1             
-    psubw   mm4, mm3                               
-    paddw   mm4, [eax + 38H]             
-    paddw   mm3, mm3                               
-    paddw   mm3, mm4                               
-    psraw   mm4, 4                                 
-    psubw   mm6, mm5                               
-    psraw   mm3, 4                                 
-    paddw   mm6, [eax + 38H]             
-    paddw   mm5, mm5                               
-    paddw   mm5, mm6                               
-    psraw   mm6, 4                                 
-    movq    [edx + 40H], mm4             
-    psraw   mm5, 4                                 
-    movq    [edx + 30H], mm3             
-    psubw   mm7, mm0                               
-    paddw   mm7, [eax + 38H]             
-    paddw   mm0, mm0                               
-    paddw   mm0, mm7                               
-    psraw   mm7, 4                                 
-    movq    [edx + 60H], mm6             
-    psraw   mm0, 4                                 
-    movq    [edx + 50H], mm5             
-    movq    [edx + 70H], mm7             
-    movq    [edx], mm0                   
-    movq    mm2, [edx + 38H]             
-    movq    mm6, [eax + 10H]             
-    movq    mm4, mm2                               
-    movq    mm7, [edx + 58H]             
-    pmulhw  mm4, mm6                               
-    movq    mm1, [eax + 20H]             
-    pmulhw  mm6, mm7                               
-    movq    mm5, mm1                               
-    pmulhw  mm1, mm2                               
-    movq    mm3, [edx + 18H]             
-    pmulhw  mm5, mm7                               
-    movq    mm0, [eax]                   
-    paddw   mm4, mm2                               
-    paddw   mm6, mm7                               
-    paddw   mm2, mm1                               
-    movq    mm1, [edx + 78H]             
-    paddw   mm7, mm5                               
-    movq    mm5, mm0                               
-    pmulhw  mm0, mm3                               
-    paddw   mm4, mm7                               
-    pmulhw  mm5, mm1                               
-    movq    mm7, [eax + 30H]             
-    psubw   mm6, mm2                               
-    paddw   mm0, mm3                               
-    pmulhw  mm3, mm7                               
-    movq    mm2, [edx + 28H]             
-    pmulhw  mm7, mm1                               
-    paddw   mm5, mm1                               
-    movq    mm1, mm2                               
-    pmulhw  mm2, [eax + 08H]             
-    psubw   mm3, mm5                               
-    movq    mm5, [edx + 68H]             
-    paddw   mm0, mm7                               
-    movq    mm7, mm5                               
-    psubw   mm0, mm4                               
-    pmulhw  mm5, [eax + 08H]             
-    paddw   mm2, mm1                               
-    pmulhw  mm1, [eax + 28H]             
-    paddw   mm4, mm4                               
-    paddw   mm4, mm0                               
-    psubw   mm3, mm6                               
-    paddw   mm5, mm7                               
-    paddw   mm6, mm6                               
-    pmulhw  mm7, [eax + 28H]             
-    paddw   mm6, mm3                               
-    movq    [edx + 18H], mm4             
-    psubw   mm1, mm5                               
-    movq    mm4, [eax + 18H]             
-    movq    mm5, mm3                               
-    pmulhw  mm3, mm4                               
-    paddw   mm7, mm2                               
-    movq    [edx + 28H], mm6             
-    movq    mm2, mm0                               
-    movq    mm6, [edx + 08H]             
-    pmulhw  mm0, mm4                               
-    paddw   mm5, mm3                               
-    movq    mm3, [edx + 48H]             
-    psubw   mm5, mm1                               
-    paddw   mm2, mm0                               
-    psubw   mm6, mm3                               
-    movq    mm0, mm6                               
-    pmulhw  mm6, mm4                               
-    paddw   mm3, mm3                               
-    paddw   mm1, mm1                               
-    paddw   mm3, mm0                               
-    paddw   mm1, mm5                               
-    pmulhw  mm4, mm3                               
-    paddw   mm6, mm0                               
-    psubw   mm6, mm2                               
-    paddw   mm2, mm2                               
-    movq    mm0, [edx + 18H]             
-    paddw   mm2, mm6                               
-    paddw   mm4, mm3                               
-    psubw   mm2, mm1                               
-    paddw   mm2, [eax + 38H]             
-    paddw   mm1, mm1                               
-    paddw   mm1, mm2                               
-    psraw   mm2, 4                                 
-    psubw   mm4, mm7                               
-    psraw   mm1, 4                                 
-    movq    mm3, [edx + 28H]             
-    paddw   mm7, mm7                               
-    movq    [edx + 28H], mm2             
-    paddw   mm7, mm4                               
-    movq    [edx + 18H], mm1             
-    psubw   mm4, mm3                               
-    paddw   mm4, [eax + 38H]             
-    paddw   mm3, mm3                               
-    paddw   mm3, mm4                               
-    psraw   mm4, 4                                 
-    psubw   mm6, mm5                               
-    psraw   mm3, 4                                 
-    paddw   mm6, [eax + 38H]             
-    paddw   mm5, mm5                               
-    paddw   mm5, mm6                               
-    psraw   mm6, 4                                 
-    movq    [edx + 48H], mm4             
-    psraw   mm5, 4                                 
-    movq    [edx + 38H], mm3             
-    psubw   mm7, mm0                               
-    paddw   mm7, [eax + 38H]             
-    paddw   mm0, mm0                               
-    paddw   mm0, mm7                               
-    psraw   mm7, 4                                 
-    movq    [edx + 68H], mm6             
-    psraw   mm0, 4                                 
-    movq    [edx + 58H], mm5             
-    movq    [edx + 78H], mm7             
-    movq    [edx + 08H], mm0             
-    /* emms  */
-  }
-}
-
-#endif
-
+#endif
\ No newline at end of file

Modified: trunk/theora/lib/dec/x86_vc/mmxloopfilter.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/mmxloopfilter.c	2008-04-13 06:23:04 UTC (rev 14721)
+++ trunk/theora/lib/dec/x86_vc/mmxloopfilter.c	2008-04-13 06:28:00 UTC (rev 14722)
@@ -352,17 +352,17 @@
     while(frag<frag_end){
       if(frag->coded){
         if(frag>frag0){
-          loop_filter_h(frag->buffer[_refi],iplane->ystride,ll);
+          loop_filter_h(frag->buffer[_refi],iplane->stride,ll);
         }
         if(frag0>frag_top){
-          loop_filter_v(frag->buffer[_refi],iplane->ystride,ll);
+          loop_filter_v(frag->buffer[_refi],iplane->stride,ll);
         }
         if(frag+1<frag_end&&!(frag+1)->coded){
-          loop_filter_h(frag->buffer[_refi]+8,iplane->ystride,ll);
+          loop_filter_h(frag->buffer[_refi]+8,iplane->stride,ll);
         }
         if(frag+fplane->nhfrags<frag_bot&&!(frag+fplane->nhfrags)->coded){
           loop_filter_v((frag+fplane->nhfrags)->buffer[_refi],
-           iplane->ystride,ll);
+           iplane->stride,ll);
         }
       }
       frag++;
@@ -374,383 +374,4 @@
   _mm_empty();
 }
 
-#endif
-
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-    last mod: $Id: 
-
- ********************************************************************/
-
-/* -------------------------------------------------------------------
-  MMX based loop filter for the theora codec.
-
-  Originally written by Rudolf Marek, based on code from On2's VP3.
-  Converted to Visual Studio inline assembly by Nils Pipenbrinck.
-
-  Note: I can't test these since my example files never get into the 
-  loop filters, but the code has been converted semi-automatic from
-  the GCC sources, so it ought to work.
-  ---------------------------------------------------------------------*/
-#include "../../internal.h"
-#include "x86int.h"
-#include <mmintrin.h>
-
-#if defined(USE_ASM)
-
-
-
-static void loop_filter_v(unsigned char *_pix,int _ystride, 
-                          const ogg_int16_t *_ll){
-  _asm {
-    mov       eax,  [_pix]
-    mov       edx,  [_ystride]
-    mov       ebx,  [_ll]
-
-    /* _pix -= ystride */
-    sub       eax,   edx                    
-    /*  mm0=0          */
-    pxor      mm0,   mm0                    
-    /* _pix -= ystride */
-    sub       eax,   edx                    
-    /*  esi=_ystride*3 */
-    lea       esi, [edx + edx*2]            
-
-    /*  mm7=_pix[0...8]*/       
-    movq      mm7, [eax]          
-    /*  mm4=_pix[0...8+_ystride*3]*/          
-    movq      mm4, [eax + esi]    
-    /*  mm6=_pix[0...8]*/           
-    movq      mm6, mm7                      
-    /*  Expand unsigned _pix[0...3] to 16 bits.*/                      
-    punpcklbw mm6, mm0                    
-    movq      mm5, mm4                      
-    /*  Expand unsigned _pix[4...7] to 16 bits.*/
-    punpckhbw mm7, mm0                    
-    punpcklbw mm4, mm0                    
-    /*  Expand other arrays too.*/        
-    punpckhbw mm5, mm0                    
-    /*mm7:mm6=_p[0...7]-_p[0...7+_ystride*3]:*/          
-    psubw     mm6, mm4                      
-    psubw     mm7, mm5                      
-    /*mm5=mm4=_pix[0...7+_ystride]*/          
-    movq      mm4, [eax + edx]    
-    /*mm1=mm3=mm2=_pix[0..7]+_ystride*2]*/          
-    movq      mm2, [eax + edx*2]  
-    movq      mm5, mm4                                
-    movq      mm3, mm2                                
-    movq      mm1, mm2                      
-    /*Expand these arrays.*/                       
-    punpckhbw mm5, mm0                    
-    punpcklbw mm4, mm0                              
-    punpckhbw mm3, mm0                              
-    punpcklbw mm2, mm0                              
-    pcmpeqw   mm0, mm0                      
-    /*mm0=3 3 3 3   
-    mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
-    psubw     mm3, mm5                                
-    psrlw     mm0, 14                                 
-    psubw     mm2, mm4  
-    /*Scale by 3.*/
-    pmullw    mm3, mm0                                
-    pmullw    mm2, mm0  
-    /*mm0=4 4 4 4
-    f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
-     3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
-    psrlw     mm0, 1                                  
-    paddw     mm3, mm7                                
-    psllw     mm0, 2    
-    paddw     mm2, mm6
-    /*Add 4.*/
-    paddw     mm3, mm0                                  
-    paddw     mm2, mm0                                
-    /*"Divide" by 8.*/                                
-    psraw     mm3, 3                                    
-    psraw     mm2, 3                                  
-    /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the sepc.*/                         
-    /*Free up mm5.*/
-    packuswb  mm4, mm5  
-    /*mm0=L L L L*/
-    movq      mm0, [ebx]   
-    /*if(R_i<-2L||R_i>2L)R_i=0:*/
-    movq      mm5, mm2                                
-    pxor      mm6, mm6                                
-    movq      mm7, mm0                                
-    psubw     mm6, mm0                                
-    psllw     mm7, 1                                  
-    psllw     mm6, 1   
-    /*mm2==R_3 R_2 R_1 R_0*/
-    /*mm5==R_3 R_2 R_1 R_0*/
-    /*mm6==-2L -2L -2L -2L*/
-    /*mm7==2L 2L 2L 2L*/
-    pcmpgtw   mm7, mm2                                
-    pcmpgtw   mm5, mm6                                
-    pand      mm2, mm7                                
-    movq      mm7, mm0                                
-    pand      mm2, mm5                                
-    psllw     mm7, 1                                  
-    movq      mm5, mm3   
-    /*mm3==R_7 R_6 R_5 R_4*/
-    /*mm5==R_7 R_6 R_5 R_4*/
-    /*mm6==-2L -2L -2L -2L*/
-    /*mm7==2L 2L 2L 2L*/
-    pcmpgtw   mm7, mm3                                
-    pcmpgtw   mm5, mm6                                
-    pand      mm3, mm7                                
-    movq      mm7, mm0                                
-    pand      mm3, mm5    
-   /*if(R_i<-L)R_i'=R_i+2L;
-     if(R_i>L)R_i'=R_i-2L;
-     if(R_i<-L||R_i>L)R_i=-R_i':*/
-    psraw     mm6, 1                                  
-    movq      mm5, mm2                                
-    psllw     mm7, 1  
-    /*mm2==R_3 R_2 R_1 R_0*/
-    /*mm5==R_3 R_2 R_1 R_0*/
-    /*mm6==-L -L -L -L*/
-    /*mm0==L L L L*/
-    /*mm5=R_i>L?FF:00*/
-    pcmpgtw   mm5, mm0 
-    /*mm6=-L>R_i?FF:00*/
-    pcmpgtw   mm6, mm2   
-    /*mm7=R_i>L?2L:0*/
-    pand      mm7, mm5 
-    /*mm2=R_i>L?R_i-2L:R_i*/
-    psubw     mm2, mm7                                
-    movq      mm7, mm0 
-    /*mm5=-L>R_i||R_i>L*/
-    por       mm5, mm6                                
-    psllw     mm7, 1    
-    /*mm7=-L>R_i?2L:0*/
-    pand      mm7, mm6                                
-    pxor      mm6, mm6   
-    /*mm2=-L>R_i?R_i+2L:R_i*/
-    paddw     mm2, mm7                                
-    psubw     mm6, mm0 
-    /*mm5=-L>R_i||R_i>L?-R_i':0*/
-    pand      mm5, mm2                                
-    movq      mm7, mm0   
-    /*mm2=-L>R_i||R_i>L?0:R_i*/
-    psubw     mm2, mm5                                
-    psllw     mm7, 1  
-    /*mm2=-L>R_i||R_i>L?-R_i':R_i*/
-    psubw     mm2, mm5                                
-    movq      mm5, mm3   
-    /*mm3==R_7 R_6 R_5 R_4*/
-    /*mm5==R_7 R_6 R_5 R_4*/
-    /*mm6==-L -L -L -L*/
-    /*mm0==L L L L*/
-    /*mm6=-L>R_i?FF:00*/
-    pcmpgtw   mm6, mm3
-    /*mm5=R_i>L?FF:00*/
-    pcmpgtw   mm5, mm0    
-    /*mm7=R_i>L?2L:0*/
-    pand      mm7, mm5 
-    /*mm2=R_i>L?R_i-2L:R_i*/
-    psubw     mm3, mm7                                
-    psllw     mm0, 1      
-    /*mm5=-L>R_i||R_i>L*/
-    por       mm5, mm6     
-    /*mm0=-L>R_i?2L:0*/
-    pand      mm0, mm6   
-    /*mm3=-L>R_i?R_i+2L:R_i*/
-    paddw     mm3, mm0  
-    /*mm5=-L>R_i||R_i>L?-R_i':0*/
-    pand      mm5, mm3    
-    /*mm2=-L>R_i||R_i>L?0:R_i*/
-    psubw     mm3, mm5  
-    /*mm3=-L>R_i||R_i>L?-R_i':R_i*/
-    psubw     mm3, mm5   
-    /*Unfortunately, there's no unsigned byte+signed byte with unsigned
-       saturation op code, so we have to promote things back 16 bits.*/
-    pxor      mm0, mm0                                
-    movq      mm5, mm4                                
-    punpcklbw mm4, mm0                              
-    punpckhbw mm5, mm0                              
-    movq      mm6, mm1                                
-    punpcklbw mm1, mm0                              
-    punpckhbw mm6, mm0 
-    /*_pix[0...8+_ystride]+=R_i*/
-    paddw     mm4, mm2                                
-    paddw     mm5, mm3  
-    /*_pix[0...8+_ystride*2]-=R_i*/
-    psubw     mm1, mm2                                
-    psubw     mm6, mm3                                
-    packuswb  mm4, mm5                               
-    packuswb  mm1, mm6 
-    /*Write it back out.*/
-    movq    [eax + edx], mm4              
-    movq    [eax + edx*2], mm1            
-  }
-}
-
-/*This code implements the bulk of loop_filter_h().
-  Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
-   four p0's to one register we must transpose the values in four mmx regs.
-  When half is done we repeat this for the rest.*/
-static void loop_filter_h4(unsigned char *_pix,long _ystride, 
-                           const ogg_int16_t *_ll){
-  /* todo: merge the comments from the GCC sources */
-  _asm {
-    mov   ecx, [_pix]
-    mov   edx, [_ystride]
-    mov   eax, [_ll]
-    /*esi=_ystride*3*/
-    lea     esi, [edx + edx*2]              
-
-    movd    mm0, dword ptr [ecx]            
-    movd    mm1, dword ptr [ecx + edx]      
-    movd    mm2, dword ptr [ecx + edx*2]    
-    movd    mm3, dword ptr [ecx + esi]      
-    punpcklbw mm0, mm1                      
-    punpcklbw mm2, mm3                      
-    movq    mm1, mm0                        
-    punpckhwd mm0, mm2                      
-    punpcklwd mm1, mm2                      
-    pxor    mm7, mm7                        
-    movq    mm5, mm1                        
-    punpcklbw mm1, mm7                      
-    punpckhbw mm5, mm7                      
-    movq    mm3, mm0                        
-    punpcklbw mm0, mm7                      
-    punpckhbw mm3, mm7                      
-    psubw   mm1, mm3                        
-    movq    mm4, mm0                        
-    pcmpeqw mm2, mm2                        
-    psubw   mm0, mm5                        
-    psrlw   mm2, 14                         
-    pmullw  mm0, mm2                        
-    psrlw   mm2, 1                          
-    paddw   mm0, mm1                        
-    psllw   mm2, 2                          
-    paddw   mm0, mm2                        
-    psraw   mm0, 3                          
-    movq    mm6, qword ptr [eax]            
-    movq    mm1, mm0                        
-    pxor    mm2, mm2                        
-    movq    mm3, mm6                        
-    psubw   mm2, mm6                        
-    psllw   mm3, 1                          
-    psllw   mm2, 1                          
-    pcmpgtw mm3, mm0                        
-    pcmpgtw mm1, mm2                        
-    pand    mm0, mm3                        
-    pand    mm0, mm1                        
-    psraw   mm2, 1                          
-    movq    mm1, mm0                        
-    movq    mm3, mm6                        
-    pcmpgtw mm2, mm0                        
-    pcmpgtw mm1, mm6                        
-    psllw   mm3, 1                          
-    psllw   mm6, 1                          
-    pand    mm3, mm1                        
-    pand    mm6, mm2                        
-    psubw   mm0, mm3                        
-    por     mm1, mm2                        
-    paddw   mm0, mm6                        
-    pand    mm1, mm0                        
-    psubw   mm0, mm1                        
-    psubw   mm0, mm1                        
-    paddw   mm5, mm0                        
-    psubw   mm4, mm0                        
-    packuswb mm5, mm7                       
-    packuswb mm4, mm7                       
-    punpcklbw mm5, mm4                      
-    movd    edi, mm5                        
-    mov     word ptr [ecx + 01H], di        
-    psrlq   mm5, 32                         
-    shr     edi, 16                         
-    mov     word ptr [ecx + edx + 01H], di  
-    movd    edi, mm5                        
-    mov     word ptr [ecx + edx*2 + 01H], di
-    shr     edi, 16                         
-    mov     word ptr [ecx + esi + 01H], di  
-  }
-}
-
-static void loop_filter_h(unsigned char *_pix,int _ystride, 
-                          const ogg_int16_t *_ll){
-  _pix-=2;
-  loop_filter_h4(_pix,_ystride,_ll);
-  loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll);
-}
-
-
-/*We copy the whole function because the MMX routines will be inlined 4 times,
-   and we can do just a single emms call at the end this way.
-  We also do not use the _bv lookup table, instead computing the values that
-   would lie in it on the fly.*/
-
-/*Apply the loop filter to a given set of fragment rows in the given plane.
-  The filter may be run on the bottom edge, affecting pixels in the next row of
-   fragments, so this row also needs to be available.
-  _bv:        The bounding values array.
-  _refi:      The index of the frame buffer to filter.
-  _pli:       The color plane to filter.
-  _fragy0:    The Y coordinate of the first fragment row to filter.
-  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
-void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,
- int _refi,int _pli,int _fragy0,int _fragy_end){
-  ogg_int16_t __declspec(align(8))        ll[4];
-  th_img_plane                            *iplane;
-  oc_fragment_plane                       *fplane;
-  oc_fragment                             *frag_top;
-  oc_fragment                             *frag0;
-  oc_fragment                             *frag;
-  oc_fragment                             *frag_end;
-  oc_fragment                             *frag0_end;
-  oc_fragment                             *frag_bot;
-  ll[0]=ll[1]=ll[2]=ll[3]=
-   (ogg_int16_t)_state->loop_filter_limits[_state->qis[0]];
-  iplane=_state->ref_frame_bufs[_refi]+_pli;
-  fplane=_state->fplanes+_pli;
-  /*The following loops are constructed somewhat non-intuitively on purpose.
-    The main idea is: if a block boundary has at least one coded fragment on
-     it, the filter is applied to it.
-    However, the order that the filters are applied in matters, and VP3 chose
-     the somewhat strange ordering used below.*/
-  frag_top=_state->frags+fplane->froffset;
-  frag0=frag_top+_fragy0*fplane->nhfrags;
-  frag0_end=frag0+(_fragy_end-_fragy0)*fplane->nhfrags;
-  frag_bot=_state->frags+fplane->froffset+fplane->nfrags;
-  while(frag0<frag0_end){
-    frag=frag0;
-    frag_end=frag+fplane->nhfrags;
-    while(frag<frag_end){
-      if(frag->coded){
-        if(frag>frag0){
-          loop_filter_h(frag->buffer[_refi],iplane->ystride,ll);
-        }
-        if(frag0>frag_top){
-          loop_filter_v(frag->buffer[_refi],iplane->ystride,ll);
-        }
-        if(frag+1<frag_end&&!(frag+1)->coded){
-          loop_filter_h(frag->buffer[_refi]+8,iplane->ystride,ll);
-        }
-        if(frag+fplane->nhfrags<frag_bot&&!(frag+fplane->nhfrags)->coded){
-          loop_filter_v((frag+fplane->nhfrags)->buffer[_refi],
-           iplane->ystride,ll);
-        }
-      }
-      frag++;
-    }
-    frag0+=fplane->nhfrags;
-  }
-
-  /*This needs to be removed when decode specific functions are implemented:*/
-  _mm_empty();
-}
-
-#endif
-
+#endif
\ No newline at end of file

Modified: trunk/theora/lib/dec/x86_vc/mmxstate.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/mmxstate.c	2008-04-13 06:23:04 UTC (rev 14721)
+++ trunk/theora/lib/dec/x86_vc/mmxstate.c	2008-04-13 06:28:00 UTC (rev 14722)
@@ -138,7 +138,7 @@
   }
   /*Fill in the target buffer.*/
   dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
-  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].stride;
   /*For now ystride values in all ref frames assumed to be equal.*/
   if(_frag->mbmode==OC_MODE_INTRA){
     oc_frag_recon_intra_mmx(_frag->buffer[dst_framei],dst_ystride,res_buf);
@@ -149,7 +149,7 @@
     int mvoffset0;
     int mvoffset1;
     ref_framei=_state->ref_frame_idx[OC_FRAME_FOR_MODE[_frag->mbmode]];
-    ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].ystride;
+    ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].stride;
     if(oc_state_get_mv_offsets(_state,&mvoffset0,&mvoffset1,_frag->mv[0],
      _frag->mv[1],ref_ystride,_pli)>1){
       oc_frag_recon_inter2_mmx(_frag->buffer[dst_framei],dst_ystride,
@@ -176,8 +176,8 @@
   int        src_ystride;
   dst_framei=_state->ref_frame_idx[_dst_frame];
   src_framei=_state->ref_frame_idx[_src_frame];
-  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
-  src_ystride=_state->ref_frame_bufs[src_framei][_pli].ystride;
+  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].stride;
+  src_ystride=_state->ref_frame_bufs[src_framei][_pli].stride;
   fragi_end=_fragis+_nfragis;
   for(fragi=_fragis;fragi<fragi_end;fragi++){
     oc_fragment *frag = _state->frags+*fragi;
@@ -187,196 +187,4 @@
   _m_empty();
 }
 
-#endif
-
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-    last mod: $Id:
-
- ********************************************************************/
-
-/* ------------------------------------------------------------------------
-  MMX acceleration of complete fragment reconstruction algorithm.
-    Originally written by Rudolf Marek.
-
-  Conversion to MSC intrinsics by Nils Pipenbrinck.
-  ---------------------------------------------------------------------*/
-#if defined(USE_ASM)
-
-#include "../../internal.h"
-#include "../idct.h"
-#include "x86int.h"
-#include <mmintrin.h>
-
-static const unsigned char OC_FZIG_ZAGMMX[64]=
-{
-   0, 8, 1, 2, 9,16,24,17,
-  10, 3,32,11,18,25, 4,12,
-   5,26,19,40,33,34,41,48,
-  27, 6,13,20,28,21,14, 7,
-  56,49,42,35,43,50,57,36,
-  15,22,29,30,23,44,37,58,
-  51,59,38,45,52,31,60,53,
-  46,39,47,54,61,62,55,63
-};
-
-/* Fill a block with value */
-static __inline void loc_fill_mmx_value (__m64 * _dst, __m64 _value){
-	__m64 t	 = _value;
-	_dst[0]  = t;	_dst[1]  = t;	_dst[2]  = t;	_dst[3]  = t;
-	_dst[4]  = t;	_dst[5]  = t;	_dst[6]  = t;	_dst[7]  = t;
-	_dst[8]  = t;	_dst[9]  = t;	_dst[10] = t;	_dst[11] = t;
-	_dst[12] = t;	_dst[13] = t;	_dst[14] = t;	_dst[15] = t;
-}
-
-/* copy a block of 8 byte elements using different strides */
-static __inline void loc_blockcopy_mmx (unsigned char * _dst, int _dst_ystride, 
-																			  unsigned char * _src, int _src_ystride){
-	__m64 a,b,c,d,e,f,g,h;
-	a = *(__m64*)(_src + 0 * _src_ystride);
-	b = *(__m64*)(_src + 1 * _src_ystride);
-	c = *(__m64*)(_src + 2 * _src_ystride);
-	d = *(__m64*)(_src + 3 * _src_ystride);
-	e = *(__m64*)(_src + 4 * _src_ystride);
-	f = *(__m64*)(_src + 5 * _src_ystride);
-	g = *(__m64*)(_src + 6 * _src_ystride);
-	h = *(__m64*)(_src + 7 * _src_ystride);
-	*(__m64*)(_dst + 0 * _dst_ystride) = a;
-	*(__m64*)(_dst + 1 * _dst_ystride) = b;
-	*(__m64*)(_dst + 2 * _dst_ystride) = c;
-	*(__m64*)(_dst + 3 * _dst_ystride) = d;
-	*(__m64*)(_dst + 4 * _dst_ystride) = e;
-	*(__m64*)(_dst + 5 * _dst_ystride) = f;
-	*(__m64*)(_dst + 6 * _dst_ystride) = g;
-	*(__m64*)(_dst + 7 * _dst_ystride) = h;
-}
-
-void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
- int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
-  ogg_int16_t __declspec(align(16)) res_buf[64];
-  int dst_framei;
-  int dst_ystride;
-  int zzi;
-  /*_last_zzi is subtly different from an actual count of the number of
-     coefficients we decoded for this block.
-    It contains the value of zzi BEFORE the final token in the block was
-     decoded.
-    In most cases this is an EOB token (the continuation of an EOB run from a
-     previous block counts), and so this is the same as the coefficient count.
-    However, in the case that the last token was NOT an EOB token, but filled
-     the block up with exactly 64 coefficients, _last_zzi will be less than 64.
-    Provided the last token was not a pure zero run, the minimum value it can
-     be is 46, and so that doesn't affect any of the cases in this routine.
-    However, if the last token WAS a pure zero run of length 63, then _last_zzi
-     will be 1 while the number of coefficients decoded is 64.
-    Thus, we will trigger the following special case, where the real
-     coefficient count would not.
-    Note also that a zero run of length 64 will give _last_zzi a value of 0,
-     but we still process the DC coefficient, which might have a non-zero value
-     due to DC prediction.
-    Although convoluted, this is arguably the correct behavior: it allows us to
-     dequantize fewer coefficients and use a smaller transform when the block
-     ends with a long zero run instead of a normal EOB token.
-    It could be smarter... multiple separate zero runs at the end of a block
-     will fool it, but an encoder that generates these really deserves what it
-     gets.
-    Needless to say we inherited this approach from VP3.*/
-  /*Special case only having a DC component.*/
-  if(_last_zzi<2){
-    __m64 p;
-    /*Why is the iquant product rounded in this case and no others? Who knows.*/
-    p = _m_from_int((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
-    /* broadcast 16 bits into all 4 mmx subregisters */
-    p = _m_punpcklwd (p,p);
-    p = _m_punpckldq (p,p);
-    loc_fill_mmx_value ((__m64 *)res_buf, p);
-  }
-  else{
-    /*Then, fill in the remainder of the coefficients with 0's, and perform
-       the iDCT.*/
-    /*First zero the buffer.*/
-    /*On K7, etc., this could be replaced with movntq and sfence.*/
-		loc_fill_mmx_value ((__m64 *)res_buf, _mm_setzero_si64());
-
-    res_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
-    /*This is planned to be rewritten in MMX.*/
-    for(zzi=1;zzi<_ncoefs;zzi++)
-    {
-      int ci;
-      ci=OC_FZIG_ZAG[zzi];
-      res_buf[OC_FZIG_ZAGMMX[zzi]]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]*
-       _ac_iquant[ci]);
-    }
-
-    if(_last_zzi<10){
-      oc_idct8x8_10_mmx(res_buf);
-    }
-    else {
-      oc_idct8x8_mmx(res_buf);
-    }
-  }
-  /*Fill in the target buffer.*/
-  dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
-  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
-  /*For now ystride values in all ref frames assumed to be equal.*/
-  if(_frag->mbmode==OC_MODE_INTRA){
-    oc_frag_recon_intra_mmx(_frag->buffer[dst_framei],dst_ystride,res_buf);
-  }
-  else{
-    int ref_framei;
-    int ref_ystride;
-    int mvoffset0;
-    int mvoffset1;
-    ref_framei=_state->ref_frame_idx[OC_FRAME_FOR_MODE[_frag->mbmode]];
-    ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].ystride;
-    if(oc_state_get_mv_offsets(_state,&mvoffset0,&mvoffset1,_frag->mv[0],
-     _frag->mv[1],ref_ystride,_pli)>1){
-      oc_frag_recon_inter2_mmx(_frag->buffer[dst_framei],dst_ystride,
-       _frag->buffer[ref_framei]+mvoffset0,ref_ystride,
-       _frag->buffer[ref_framei]+mvoffset1,ref_ystride,res_buf);
-    }
-    else{
-			oc_frag_recon_inter_mmx(_frag->buffer[dst_framei],dst_ystride,
-       _frag->buffer[ref_framei]+mvoffset0,ref_ystride,res_buf);
-    }
-  }
-
-	_mm_empty();
-}
-
-
-void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
- int _nfragis,int _dst_frame,int _src_frame,int _pli){
-  const int *fragi;
-  const int *fragi_end;
-  int        dst_framei;
-  int        dst_ystride;
-  int        src_framei;
-  int        src_ystride;
-  dst_framei=_state->ref_frame_idx[_dst_frame];
-  src_framei=_state->ref_frame_idx[_src_frame];
-  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
-  src_ystride=_state->ref_frame_bufs[src_framei][_pli].ystride;
-  fragi_end=_fragis+_nfragis;
-  for(fragi=_fragis;fragi<fragi_end;fragi++){
-    oc_fragment *frag = _state->frags+*fragi;
-		loc_blockcopy_mmx (frag->buffer[dst_framei], dst_ystride, 
-											 frag->buffer[src_framei], src_ystride);
-  }
-  _m_empty();
-}
-
-#endif
-
+#endif
\ No newline at end of file

Modified: trunk/theora/lib/dec/x86_vc/x86int.h
===================================================================
--- trunk/theora/lib/dec/x86_vc/x86int.h	2008-04-13 06:23:04 UTC (rev 14721)
+++ trunk/theora/lib/dec/x86_vc/x86int.h	2008-04-13 06:28:00 UTC (rev 14722)
@@ -47,52 +47,3 @@
   int _refi,int _pli,int _fragy0,int _fragy_end);
 
 #endif
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-    last mod: $Id$
-
- ********************************************************************/
-
-#if !defined(_x86_x86int_vc_H)
-# define _x86_x86int_vc_H (1)
-# include "../../internal.h"
-
-void oc_state_vtable_init_x86(oc_theora_state *_state);
-
-void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
- const ogg_int16_t *_residue);
-
-void oc_frag_recon_inter_mmx(unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue);
-
-void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
- int _src2_ystride,const ogg_int16_t *_residue);
-
-void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
- int _nfragis,int _dst_frame,int _src_frame,int _pli);
-
-void oc_restore_fpu_mmx(void);
-
-void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,                                               
- int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,                                                             
- ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
-
-void oc_idct8x8_mmx(ogg_int16_t _y[64]);
-void oc_idct8x8_10_mmx(ogg_int16_t _y[64]);
-
-void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,                                                    
-  int _refi,int _pli,int _fragy0,int _fragy_end);
-
-#endif



More information about the commits mailing list