[xiph-commits] r13926 - trunk/theora/lib/enc/x86_32_vs

sping at svn.xiph.org sping at svn.xiph.org
Wed Oct 3 11:56:42 PDT 2007


Author: sping
Date: 2007-10-03 11:56:42 -0700 (Wed, 03 Oct 2007)
New Revision: 13926

Modified:
   trunk/theora/lib/enc/x86_32_vs/dsp_mmx.c
Log:
Fix inconsistent line endings


Modified: trunk/theora/lib/enc/x86_32_vs/dsp_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32_vs/dsp_mmx.c	2007-10-03 18:51:06 UTC (rev 13925)
+++ trunk/theora/lib/enc/x86_32_vs/dsp_mmx.c	2007-10-03 18:56:42 UTC (rev 13926)
@@ -1,42 +1,42 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include <stdlib.h>
-
-#include "codec_internal.h"
-#include "dsp.h"
-
-#if 0
-//These are to let me selectively enable the C versions, these are needed
-#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
-#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
-#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
-#endif
-
-
-static const ogg_int64_t V128 = 0x0080008000800080LL;
-
-static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
-                  ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
-                  ogg_uint32_t ReconPixelsPerLine) 
-{
-
-    //Make non-zero to use the C-version
-#if 0
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+
+#include "codec_internal.h"
+#include "dsp.h"
+
+#if 0
+//These are to let me selectively enable the C versions, these are needed
+#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
+#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
+#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
+#endif
+
+
+static const ogg_int64_t V128 = 0x0080008000800080LL;
+
+static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
+                  ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
+                  ogg_uint32_t ReconPixelsPerLine) 
+{
+
+    //Make non-zero to use the C-version
+#if 0
   int i;
 
   /* For each block row */
@@ -54,200 +54,200 @@
     FiltPtr += PixelsPerLine;
     ReconPtr += ReconPixelsPerLine;
     DctInputPtr += 8;
-  }
-#else
-    __asm {
-        align 16
-
-        pxor		mm7, mm7	
-
-        mov     eax, FiltPtr
-        mov     ebx, ReconPtr
-        mov     edx, DctInputPtr
-
-     /* You can't use rept in inline masm and macro parsing seems screwed with inline asm*/		
-     
-     /* ITERATION 1 */
-        movq		mm0, [eax]		/* mm0 = FiltPtr */
-        movq		mm1, [ebx]		/* mm1 = ReconPtr */
-        movq		mm2, mm0		/* dup to prepare for up conversion */
-        movq		mm3, mm1		/* dup to prepare for up conversion */
-        /* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
-        punpcklbw		mm1, mm7		/* mm1 = INT16(ReconPtr) */
-        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
-        punpckhbw		mm3, mm7		/* mm3 = INT16(ReconPtr) */
-        /* start calculation */
-        psubw		mm0, mm1		/* mm0 = FiltPtr - ReconPtr */
-        psubw		mm2, mm3		/* mm2 = FiltPtr - ReconPtr */
-        movq		[edx], mm0		/* write answer out */
-        movq		[8 + edx], mm2		/* write answer out */
-        /* Increment pointers */
-        add		edx, 16		
-        add		eax, PixelsPerLine		
-        add		ebx, ReconPixelsPerLine
-
-
-     /* ITERATION 2 */
-        movq		mm0, [eax]		/* mm0 = FiltPtr */
-        movq		mm1, [ebx]		/* mm1 = ReconPtr */
-        movq		mm2, mm0		/* dup to prepare for up conversion */
-        movq		mm3, mm1		/* dup to prepare for up conversion */
-        /* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
-        punpcklbw		mm1, mm7		/* mm1 = INT16(ReconPtr) */
-        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
-        punpckhbw		mm3, mm7		/* mm3 = INT16(ReconPtr) */
-        /* start calculation */
-        psubw		mm0, mm1		/* mm0 = FiltPtr - ReconPtr */
-        psubw		mm2, mm3		/* mm2 = FiltPtr - ReconPtr */
-        movq		[edx], mm0		/* write answer out */
-        movq		[8 + edx], mm2		/* write answer out */
-        /* Increment pointers */
-        add		edx, 16		
-        add		eax, PixelsPerLine		
-        add		ebx, ReconPixelsPerLine
-
-
-     /* ITERATION 3 */
-        movq		mm0, [eax]		/* mm0 = FiltPtr */
-        movq		mm1, [ebx]		/* mm1 = ReconPtr */
-        movq		mm2, mm0		/* dup to prepare for up conversion */
-        movq		mm3, mm1		/* dup to prepare for up conversion */
-        /* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
-        punpcklbw		mm1, mm7		/* mm1 = INT16(ReconPtr) */
-        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
-        punpckhbw		mm3, mm7		/* mm3 = INT16(ReconPtr) */
-        /* start calculation */
-        psubw		mm0, mm1		/* mm0 = FiltPtr - ReconPtr */
-        psubw		mm2, mm3		/* mm2 = FiltPtr - ReconPtr */
-        movq		[edx], mm0		/* write answer out */
-        movq		[8 + edx], mm2		/* write answer out */
-        /* Increment pointers */
-        add		edx, 16		
-        add		eax, PixelsPerLine		
-        add		ebx, ReconPixelsPerLine
-
-
-     /* ITERATION 4 */
-        movq		mm0, [eax]		/* mm0 = FiltPtr */
-        movq		mm1, [ebx]		/* mm1 = ReconPtr */
-        movq		mm2, mm0		/* dup to prepare for up conversion */
-        movq		mm3, mm1		/* dup to prepare for up conversion */
-        /* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
-        punpcklbw		mm1, mm7		/* mm1 = INT16(ReconPtr) */
-        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
-        punpckhbw		mm3, mm7		/* mm3 = INT16(ReconPtr) */
-        /* start calculation */
-        psubw		mm0, mm1		/* mm0 = FiltPtr - ReconPtr */
-        psubw		mm2, mm3		/* mm2 = FiltPtr - ReconPtr */
-        movq		[edx], mm0		/* write answer out */
-        movq		[8 + edx], mm2		/* write answer out */
-        /* Increment pointers */
-        add		edx, 16		
-        add		eax, PixelsPerLine		
-        add		ebx, ReconPixelsPerLine
-
-
-     /* ITERATION 5 */
-        movq		mm0, [eax]		/* mm0 = FiltPtr */
-        movq		mm1, [ebx]		/* mm1 = ReconPtr */
-        movq		mm2, mm0		/* dup to prepare for up conversion */
-        movq		mm3, mm1		/* dup to prepare for up conversion */
-        /* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
-        punpcklbw		mm1, mm7		/* mm1 = INT16(ReconPtr) */
-        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
-        punpckhbw		mm3, mm7		/* mm3 = INT16(ReconPtr) */
-        /* start calculation */
-        psubw		mm0, mm1		/* mm0 = FiltPtr - ReconPtr */
-        psubw		mm2, mm3		/* mm2 = FiltPtr - ReconPtr */
-        movq		[edx], mm0		/* write answer out */
-        movq		[8 + edx], mm2		/* write answer out */
-        /* Increment pointers */
-        add		edx, 16		
-        add		eax, PixelsPerLine		
-        add		ebx, ReconPixelsPerLine
-
-
-     /* ITERATION 6 */
-        movq		mm0, [eax]		/* mm0 = FiltPtr */
-        movq		mm1, [ebx]		/* mm1 = ReconPtr */
-        movq		mm2, mm0		/* dup to prepare for up conversion */
-        movq		mm3, mm1		/* dup to prepare for up conversion */
-        /* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
-        punpcklbw		mm1, mm7		/* mm1 = INT16(ReconPtr) */
-        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
-        punpckhbw		mm3, mm7		/* mm3 = INT16(ReconPtr) */
-        /* start calculation */
-        psubw		mm0, mm1		/* mm0 = FiltPtr - ReconPtr */
-        psubw		mm2, mm3		/* mm2 = FiltPtr - ReconPtr */
-        movq		[edx], mm0		/* write answer out */
-        movq		[8 + edx], mm2		/* write answer out */
-        /* Increment pointers */
-        add		edx, 16		
-        add		eax, PixelsPerLine		
-        add		ebx, ReconPixelsPerLine
-
-
-     /* ITERATION 7 */
-        movq		mm0, [eax]		/* mm0 = FiltPtr */
-        movq		mm1, [ebx]		/* mm1 = ReconPtr */
-        movq		mm2, mm0		/* dup to prepare for up conversion */
-        movq		mm3, mm1		/* dup to prepare for up conversion */
-        /* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
-        punpcklbw		mm1, mm7		/* mm1 = INT16(ReconPtr) */
-        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
-        punpckhbw		mm3, mm7		/* mm3 = INT16(ReconPtr) */
-        /* start calculation */
-        psubw		mm0, mm1		/* mm0 = FiltPtr - ReconPtr */
-        psubw		mm2, mm3		/* mm2 = FiltPtr - ReconPtr */
-        movq		[edx], mm0		/* write answer out */
-        movq		[8 + edx], mm2		/* write answer out */
-        /* Increment pointers */
-        add		edx, 16		
-        add		eax, PixelsPerLine		
-        add		ebx, ReconPixelsPerLine
-
-
-     /* ITERATION 8 */
-        movq		mm0, [eax]		/* mm0 = FiltPtr */
-        movq		mm1, [ebx]		/* mm1 = ReconPtr */
-        movq		mm2, mm0		/* dup to prepare for up conversion */
-        movq		mm3, mm1		/* dup to prepare for up conversion */
-        /* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
-        punpcklbw		mm1, mm7		/* mm1 = INT16(ReconPtr) */
-        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
-        punpckhbw		mm3, mm7		/* mm3 = INT16(ReconPtr) */
-        /* start calculation */
-        psubw		mm0, mm1		/* mm0 = FiltPtr - ReconPtr */
-        psubw		mm2, mm3		/* mm2 = FiltPtr - ReconPtr */
-        movq		[edx], mm0		/* write answer out */
-        movq		[8 + edx], mm2		/* write answer out */
-        /* Increment pointers */
-        add		edx, 16		
-        add		eax, PixelsPerLine		
-        add		ebx, ReconPixelsPerLine
-
-
-
-     
-
-    };
- 
-#endif
-}
-
-static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
-                      ogg_uint32_t PixelsPerLine) 
-{
-
-#if 0
+  }
+#else
+    __asm {
+        align 16
+
+        pxor		mm7, mm7	
+
+        mov     eax, FiltPtr
+        mov     ebx, ReconPtr
+        mov     edx, DctInputPtr
+
+     /* You can't use rept in inline masm and macro parsing seems screwed with inline asm*/		
+     
+     /* ITERATION 1 */
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		/* mm1 = ReconPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        movq		mm3, mm1		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		/* mm1 = INT16(ReconPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		/* mm3 = INT16(ReconPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - ReconPtr */
+        psubw		mm2, mm3		/* mm2 = FiltPtr - ReconPtr */
+        movq		[edx], mm0		/* write answer out */
+        movq		[8 + edx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		edx, 16		
+        add		eax, PixelsPerLine		
+        add		ebx, ReconPixelsPerLine
+
+
+     /* ITERATION 2 */
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		/* mm1 = ReconPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        movq		mm3, mm1		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		/* mm1 = INT16(ReconPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		/* mm3 = INT16(ReconPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - ReconPtr */
+        psubw		mm2, mm3		/* mm2 = FiltPtr - ReconPtr */
+        movq		[edx], mm0		/* write answer out */
+        movq		[8 + edx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		edx, 16		
+        add		eax, PixelsPerLine		
+        add		ebx, ReconPixelsPerLine
+
+
+     /* ITERATION 3 */
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		/* mm1 = ReconPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        movq		mm3, mm1		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		/* mm1 = INT16(ReconPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		/* mm3 = INT16(ReconPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - ReconPtr */
+        psubw		mm2, mm3		/* mm2 = FiltPtr - ReconPtr */
+        movq		[edx], mm0		/* write answer out */
+        movq		[8 + edx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		edx, 16		
+        add		eax, PixelsPerLine		
+        add		ebx, ReconPixelsPerLine
+
+
+     /* ITERATION 4 */
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		/* mm1 = ReconPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        movq		mm3, mm1		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		/* mm1 = INT16(ReconPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		/* mm3 = INT16(ReconPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - ReconPtr */
+        psubw		mm2, mm3		/* mm2 = FiltPtr - ReconPtr */
+        movq		[edx], mm0		/* write answer out */
+        movq		[8 + edx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		edx, 16		
+        add		eax, PixelsPerLine		
+        add		ebx, ReconPixelsPerLine
+
+
+     /* ITERATION 5 */
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		/* mm1 = ReconPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        movq		mm3, mm1		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		/* mm1 = INT16(ReconPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		/* mm3 = INT16(ReconPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - ReconPtr */
+        psubw		mm2, mm3		/* mm2 = FiltPtr - ReconPtr */
+        movq		[edx], mm0		/* write answer out */
+        movq		[8 + edx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		edx, 16		
+        add		eax, PixelsPerLine		
+        add		ebx, ReconPixelsPerLine
+
+
+     /* ITERATION 6 */
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		/* mm1 = ReconPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        movq		mm3, mm1		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		/* mm1 = INT16(ReconPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		/* mm3 = INT16(ReconPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - ReconPtr */
+        psubw		mm2, mm3		/* mm2 = FiltPtr - ReconPtr */
+        movq		[edx], mm0		/* write answer out */
+        movq		[8 + edx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		edx, 16		
+        add		eax, PixelsPerLine		
+        add		ebx, ReconPixelsPerLine
+
+
+     /* ITERATION 7 */
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		/* mm1 = ReconPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        movq		mm3, mm1		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		/* mm1 = INT16(ReconPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		/* mm3 = INT16(ReconPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - ReconPtr */
+        psubw		mm2, mm3		/* mm2 = FiltPtr - ReconPtr */
+        movq		[edx], mm0		/* write answer out */
+        movq		[8 + edx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		edx, 16		
+        add		eax, PixelsPerLine		
+        add		ebx, ReconPixelsPerLine
+
+
+     /* ITERATION 8 */
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		/* mm1 = ReconPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        movq		mm3, mm1		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		/* mm1 = INT16(ReconPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		/* mm3 = INT16(ReconPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - ReconPtr */
+        psubw		mm2, mm3		/* mm2 = FiltPtr - ReconPtr */
+        movq		[edx], mm0		/* write answer out */
+        movq		[8 + edx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		edx, 16		
+        add		eax, PixelsPerLine		
+        add		ebx, ReconPixelsPerLine
+
+
+
+     
+
+    };
+ 
+#endif
+}
+
+static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
+                      ogg_uint32_t PixelsPerLine) 
+{
+
+#if 0
   int i;
   /* For each block row */
   for (i=8; i; i--) {
@@ -267,161 +267,161 @@
     /* Start next row */
     FiltPtr += PixelsPerLine;
     DctInputPtr += 8;
-  }
-
-#else
-    __asm {
-        align 16
-
-        pxor		mm7, mm7		
-
-        mov         eax, FiltPtr
-        mov         ebx, DctInputPtr
-
-        movq		mm1, V128
-
-        /*  ITERATION 1 */		
-        movq		mm0, [eax]		/* mm0 = FiltPtr */
-        movq		mm2, mm0		/* dup to prepare for up conversion */
-        /* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
-        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
-        /* start calculation */
-        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
-        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
-        movq		[ebx], mm0		/* write answer out */
-        movq		[8 + ebx], mm2		/* write answer out */
-        /* Increment pointers */
-        add		ebx, 16		
-        add		eax, PixelsPerLine	
-
-
-        /*  ITERATION 2 */		
-        movq		mm0, [eax]		/* mm0 = FiltPtr */
-        movq		mm2, mm0		/* dup to prepare for up conversion */
-        /* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
-        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
-        /* start calculation */
-        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
-        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
-        movq		[ebx], mm0		/* write answer out */
-        movq		[8 + ebx], mm2		/* write answer out */
-        /* Increment pointers */
-        add		ebx, 16		
-        add		eax, PixelsPerLine	
-
-
-        /*  ITERATION 3 */		
-        movq		mm0, [eax]		/* mm0 = FiltPtr */
-        movq		mm2, mm0		/* dup to prepare for up conversion */
-        /* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
-        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
-        /* start calculation */
-        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
-        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
-        movq		[ebx], mm0		/* write answer out */
-        movq		[8 + ebx], mm2		/* write answer out */
-        /* Increment pointers */
-        add		ebx, 16		
-        add		eax, PixelsPerLine	
-
-
-        /*  ITERATION 4 */		
-        movq		mm0, [eax]		/* mm0 = FiltPtr */
-        movq		mm2, mm0		/* dup to prepare for up conversion */
-        /* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
-        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
-        /* start calculation */
-        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
-        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
-        movq		[ebx], mm0		/* write answer out */
-        movq		[8 + ebx], mm2		/* write answer out */
-        /* Increment pointers */
-        add		ebx, 16		
-        add		eax, PixelsPerLine	
-
-
-        /*  ITERATION 5 */		
-        movq		mm0, [eax]		/* mm0 = FiltPtr */
-        movq		mm2, mm0		/* dup to prepare for up conversion */
-        /* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
-        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
-        /* start calculation */
-        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
-        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
-        movq		[ebx], mm0		/* write answer out */
-        movq		[8 + ebx], mm2		/* write answer out */
-        /* Increment pointers */
-        add		ebx, 16		
-        add		eax, PixelsPerLine	
-
-
-        /*  ITERATION 6 */		
-        movq		mm0, [eax]		/* mm0 = FiltPtr */
-        movq		mm2, mm0		/* dup to prepare for up conversion */
-        /* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
-        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
-        /* start calculation */
-        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
-        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
-        movq		[ebx], mm0		/* write answer out */
-        movq		[8 + ebx], mm2		/* write answer out */
-        /* Increment pointers */
-        add		ebx, 16		
-        add		eax, PixelsPerLine	
-
-
-        /*  ITERATION 7 */		
-        movq		mm0, [eax]		/* mm0 = FiltPtr */
-        movq		mm2, mm0		/* dup to prepare for up conversion */
-        /* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
-        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
-        /* start calculation */
-        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
-        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
-        movq		[ebx], mm0		/* write answer out */
-        movq		[8 + ebx], mm2		/* write answer out */
-        /* Increment pointers */
-        add		ebx, 16		
-        add		eax, PixelsPerLine	
-
-
-        /*  ITERATION 8 */		
-        movq		mm0, [eax]		/* mm0 = FiltPtr */
-        movq		mm2, mm0		/* dup to prepare for up conversion */
-        /* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
-        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
-        /* start calculation */
-        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
-        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
-        movq		[ebx], mm0		/* write answer out */
-        movq		[8 + ebx], mm2		/* write answer out */
-        /* Increment pointers */
-        add		ebx, 16		
-        add		eax, PixelsPerLine	
-
-    };
- 
-#endif
-}
-
-
-
-
-static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
-                     unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
-                     ogg_uint32_t PixelsPerLine,
-                     ogg_uint32_t ReconPixelsPerLine) 
-{
-
-#if 0
+  }
+
+#else
+    __asm {
+        align 16
+
+        pxor		mm7, mm7		
+
+        mov         eax, FiltPtr
+        mov         ebx, DctInputPtr
+
+        movq		mm1, V128
+
+        /*  ITERATION 1 */		
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
+        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
+        movq		[ebx], mm0		/* write answer out */
+        movq		[8 + ebx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		ebx, 16		
+        add		eax, PixelsPerLine	
+
+
+        /*  ITERATION 2 */		
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
+        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
+        movq		[ebx], mm0		/* write answer out */
+        movq		[8 + ebx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		ebx, 16		
+        add		eax, PixelsPerLine	
+
+
+        /*  ITERATION 3 */		
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
+        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
+        movq		[ebx], mm0		/* write answer out */
+        movq		[8 + ebx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		ebx, 16		
+        add		eax, PixelsPerLine	
+
+
+        /*  ITERATION 4 */		
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
+        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
+        movq		[ebx], mm0		/* write answer out */
+        movq		[8 + ebx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		ebx, 16		
+        add		eax, PixelsPerLine	
+
+
+        /*  ITERATION 5 */		
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
+        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
+        movq		[ebx], mm0		/* write answer out */
+        movq		[8 + ebx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		ebx, 16		
+        add		eax, PixelsPerLine	
+
+
+        /*  ITERATION 6 */		
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
+        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
+        movq		[ebx], mm0		/* write answer out */
+        movq		[8 + ebx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		ebx, 16		
+        add		eax, PixelsPerLine	
+
+
+        /*  ITERATION 7 */		
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
+        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
+        movq		[ebx], mm0		/* write answer out */
+        movq		[8 + ebx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		ebx, 16		
+        add		eax, PixelsPerLine	
+
+
+        /*  ITERATION 8 */		
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
+        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
+        movq		[ebx], mm0		/* write answer out */
+        movq		[8 + ebx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		ebx, 16		
+        add		eax, PixelsPerLine	
+
+    };
+ 
+#endif
+}
+
+
+
+
+static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
+                     unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
+                     ogg_uint32_t PixelsPerLine,
+                     ogg_uint32_t ReconPixelsPerLine) 
+{
+
+#if 0
   int i;
 
   /* For each block row */
@@ -440,271 +440,271 @@
     ReconPtr1 += ReconPixelsPerLine;
     ReconPtr2 += ReconPixelsPerLine;
     DctInputPtr += 8;
-  }
-#else
-
-    __asm {
-        align 16
-
-            pxor        mm7, mm7
-
-        mov         eax, FiltPtr
-        mov         ebx, ReconPtr1
-        mov         ecx, ReconPtr2
-        mov         edx, DctInputPtr
-
-        /*  ITERATION 1 */	
-        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
-        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
-        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
-        movq		mm2, mm0		;	/* dup to prepare for up conversion */
-        movq		mm3, mm1		;	/* dup to prepare for up conversion */
-        movq		mm5, mm4		;	/* dup to prepare for up conversion */
-	        ;	/* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
-        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
-        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
-        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
-        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
-        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
-	        ;	/* average ReconPtr1 and ReconPtr2 */
-        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
-        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
-        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
-        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
-        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-        movq		[edx], mm0		;	/* write answer out */
-        movq		[8 + edx], mm2		;	/* write answer out */
-	        ;	/* Increment pointers */
-        add		edx, 16		;	
-        add		eax, PixelsPerLine		;	
-        add		ebx, ReconPixelsPerLine		;	
-        add		ecx, ReconPixelsPerLine		;	
-	
-
-        /*  ITERATION 2 */	
-        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
-        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
-        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
-        movq		mm2, mm0		;	/* dup to prepare for up conversion */
-        movq		mm3, mm1		;	/* dup to prepare for up conversion */
-        movq		mm5, mm4		;	/* dup to prepare for up conversion */
-	        ;	/* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
-        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
-        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
-        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
-        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
-        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
-	        ;	/* average ReconPtr1 and ReconPtr2 */
-        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
-        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
-        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
-        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
-        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-        movq		[edx], mm0		;	/* write answer out */
-        movq		[8 + edx], mm2		;	/* write answer out */
-	        ;	/* Increment pointers */
-        add		edx, 16		;	
-        add		eax, PixelsPerLine		;	
-        add		ebx, ReconPixelsPerLine		;	
-        add		ecx, ReconPixelsPerLine		;	
-
-
-        /*  ITERATION 3 */	
-        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
-        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
-        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
-        movq		mm2, mm0		;	/* dup to prepare for up conversion */
-        movq		mm3, mm1		;	/* dup to prepare for up conversion */
-        movq		mm5, mm4		;	/* dup to prepare for up conversion */
-	        ;	/* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
-        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
-        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
-        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
-        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
-        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
-	        ;	/* average ReconPtr1 and ReconPtr2 */
-        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
-        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
-        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
-        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
-        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-        movq		[edx], mm0		;	/* write answer out */
-        movq		[8 + edx], mm2		;	/* write answer out */
-	        ;	/* Increment pointers */
-        add		edx, 16		;	
-        add		eax, PixelsPerLine		;	
-        add		ebx, ReconPixelsPerLine		;	
-        add		ecx, ReconPixelsPerLine		;	
-
-
-        /*  ITERATION 4 */	
-        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
-        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
-        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
-        movq		mm2, mm0		;	/* dup to prepare for up conversion */
-        movq		mm3, mm1		;	/* dup to prepare for up conversion */
-        movq		mm5, mm4		;	/* dup to prepare for up conversion */
-	        ;	/* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
-        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
-        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
-        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
-        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
-        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
-	        ;	/* average ReconPtr1 and ReconPtr2 */
-        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
-        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
-        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
-        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
-        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-        movq		[edx], mm0		;	/* write answer out */
-        movq		[8 + edx], mm2		;	/* write answer out */
-	        ;	/* Increment pointers */
-        add		edx, 16		;	
-        add		eax, PixelsPerLine		;	
-        add		ebx, ReconPixelsPerLine		;	
-        add		ecx, ReconPixelsPerLine		;	
-
-
-        /*  ITERATION 5 */	
-        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
-        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
-        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
-        movq		mm2, mm0		;	/* dup to prepare for up conversion */
-        movq		mm3, mm1		;	/* dup to prepare for up conversion */
-        movq		mm5, mm4		;	/* dup to prepare for up conversion */
-	        ;	/* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
-        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
-        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
-        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
-        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
-        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
-	        ;	/* average ReconPtr1 and ReconPtr2 */
-        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
-        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
-        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
-        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
-        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-        movq		[edx], mm0		;	/* write answer out */
-        movq		[8 + edx], mm2		;	/* write answer out */
-	        ;	/* Increment pointers */
-        add		edx, 16		;	
-        add		eax, PixelsPerLine		;	
-        add		ebx, ReconPixelsPerLine		;	
-        add		ecx, ReconPixelsPerLine		;	
-
-
-        /*  ITERATION 6 */	
-        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
-        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
-        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
-        movq		mm2, mm0		;	/* dup to prepare for up conversion */
-        movq		mm3, mm1		;	/* dup to prepare for up conversion */
-        movq		mm5, mm4		;	/* dup to prepare for up conversion */
-	        ;	/* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
-        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
-        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
-        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
-        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
-        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
-	        ;	/* average ReconPtr1 and ReconPtr2 */
-        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
-        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
-        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
-        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
-        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-        movq		[edx], mm0		;	/* write answer out */
-        movq		[8 + edx], mm2		;	/* write answer out */
-	        ;	/* Increment pointers */
-        add		edx, 16		;	
-        add		eax, PixelsPerLine		;	
-        add		ebx, ReconPixelsPerLine		;	
-        add		ecx, ReconPixelsPerLine		;	
-
-
-        /*  ITERATION 7 */	
-        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
-        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
-        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
-        movq		mm2, mm0		;	/* dup to prepare for up conversion */
-        movq		mm3, mm1		;	/* dup to prepare for up conversion */
-        movq		mm5, mm4		;	/* dup to prepare for up conversion */
-	        ;	/* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
-        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
-        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
-        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
-        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
-        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
-	        ;	/* average ReconPtr1 and ReconPtr2 */
-        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
-        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
-        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
-        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
-        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-        movq		[edx], mm0		;	/* write answer out */
-        movq		[8 + edx], mm2		;	/* write answer out */
-	        ;	/* Increment pointers */
-        add		edx, 16		;	
-        add		eax, PixelsPerLine		;	
-        add		ebx, ReconPixelsPerLine		;	
-        add		ecx, ReconPixelsPerLine		;	
-
-
-        /*  ITERATION 8 */	
-        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
-        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
-        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
-        movq		mm2, mm0		;	/* dup to prepare for up conversion */
-        movq		mm3, mm1		;	/* dup to prepare for up conversion */
-        movq		mm5, mm4		;	/* dup to prepare for up conversion */
-	        ;	/* convert from UINT8 to INT16 */
-        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
-        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
-        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
-        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
-        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
-        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
-	        ;	/* average ReconPtr1 and ReconPtr2 */
-        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
-        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
-        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
-        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
-        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-        movq		[edx], mm0		;	/* write answer out */
-        movq		[8 + edx], mm2		;	/* write answer out */
-	        ;	/* Increment pointers */
-        add		edx, 16		;	
-        add		eax, PixelsPerLine		;	
-        add		ebx, ReconPixelsPerLine		;	
-        add		ecx, ReconPixelsPerLine		;	
-
-    };
-
-
-
-
- 
-#endif
-}
-
-static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
-{
-
-#if 0
+  }
+#else
+
+    __asm {
+        align 16
+
+            pxor        mm7, mm7
+
+        mov         eax, FiltPtr
+        mov         ebx, ReconPtr1
+        mov         ecx, ReconPtr2
+        mov         edx, DctInputPtr
+
+        /*  ITERATION 1 */	
+        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
+        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
+        movq		mm2, mm0		;	/* dup to prepare for up conversion */
+        movq		mm3, mm1		;	/* dup to prepare for up conversion */
+        movq		mm5, mm4		;	/* dup to prepare for up conversion */
+	        ;	/* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
+        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
+        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
+        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
+	        ;	/* average ReconPtr1 and ReconPtr2 */
+        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
+        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
+        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        movq		[edx], mm0		;	/* write answer out */
+        movq		[8 + edx], mm2		;	/* write answer out */
+	        ;	/* Increment pointers */
+        add		edx, 16		;	
+        add		eax, PixelsPerLine		;	
+        add		ebx, ReconPixelsPerLine		;	
+        add		ecx, ReconPixelsPerLine		;	
+	
+
+        /*  ITERATION 2 */	
+        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
+        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
+        movq		mm2, mm0		;	/* dup to prepare for up conversion */
+        movq		mm3, mm1		;	/* dup to prepare for up conversion */
+        movq		mm5, mm4		;	/* dup to prepare for up conversion */
+	        ;	/* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
+        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
+        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
+        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
+	        ;	/* average ReconPtr1 and ReconPtr2 */
+        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
+        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
+        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        movq		[edx], mm0		;	/* write answer out */
+        movq		[8 + edx], mm2		;	/* write answer out */
+	        ;	/* Increment pointers */
+        add		edx, 16		;	
+        add		eax, PixelsPerLine		;	
+        add		ebx, ReconPixelsPerLine		;	
+        add		ecx, ReconPixelsPerLine		;	
+
+
+        /*  ITERATION 3 */	
+        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
+        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
+        movq		mm2, mm0		;	/* dup to prepare for up conversion */
+        movq		mm3, mm1		;	/* dup to prepare for up conversion */
+        movq		mm5, mm4		;	/* dup to prepare for up conversion */
+	        ;	/* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
+        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
+        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
+        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
+	        ;	/* average ReconPtr1 and ReconPtr2 */
+        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
+        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
+        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        movq		[edx], mm0		;	/* write answer out */
+        movq		[8 + edx], mm2		;	/* write answer out */
+	        ;	/* Increment pointers */
+        add		edx, 16		;	
+        add		eax, PixelsPerLine		;	
+        add		ebx, ReconPixelsPerLine		;	
+        add		ecx, ReconPixelsPerLine		;	
+
+
+        /*  ITERATION 4 */	
+        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
+        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
+        movq		mm2, mm0		;	/* dup to prepare for up conversion */
+        movq		mm3, mm1		;	/* dup to prepare for up conversion */
+        movq		mm5, mm4		;	/* dup to prepare for up conversion */
+	        ;	/* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
+        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
+        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
+        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
+	        ;	/* average ReconPtr1 and ReconPtr2 */
+        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
+        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
+        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        movq		[edx], mm0		;	/* write answer out */
+        movq		[8 + edx], mm2		;	/* write answer out */
+	        ;	/* Increment pointers */
+        add		edx, 16		;	
+        add		eax, PixelsPerLine		;	
+        add		ebx, ReconPixelsPerLine		;	
+        add		ecx, ReconPixelsPerLine		;	
+
+
+        /*  ITERATION 5 */	
+        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
+        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
+        movq		mm2, mm0		;	/* dup to prepare for up conversion */
+        movq		mm3, mm1		;	/* dup to prepare for up conversion */
+        movq		mm5, mm4		;	/* dup to prepare for up conversion */
+	        ;	/* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
+        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
+        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
+        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
+	        ;	/* average ReconPtr1 and ReconPtr2 */
+        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
+        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
+        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        movq		[edx], mm0		;	/* write answer out */
+        movq		[8 + edx], mm2		;	/* write answer out */
+	        ;	/* Increment pointers */
+        add		edx, 16		;	
+        add		eax, PixelsPerLine		;	
+        add		ebx, ReconPixelsPerLine		;	
+        add		ecx, ReconPixelsPerLine		;	
+
+
+        /*  ITERATION 6 */	
+        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
+        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
+        movq		mm2, mm0		;	/* dup to prepare for up conversion */
+        movq		mm3, mm1		;	/* dup to prepare for up conversion */
+        movq		mm5, mm4		;	/* dup to prepare for up conversion */
+	        ;	/* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
+        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
+        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
+        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
+	        ;	/* average ReconPtr1 and ReconPtr2 */
+        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
+        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
+        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        movq		[edx], mm0		;	/* write answer out */
+        movq		[8 + edx], mm2		;	/* write answer out */
+	        ;	/* Increment pointers */
+        add		edx, 16		;	
+        add		eax, PixelsPerLine		;	
+        add		ebx, ReconPixelsPerLine		;	
+        add		ecx, ReconPixelsPerLine		;	
+
+
+        /*  ITERATION 7 */	
+        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
+        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
+        movq		mm2, mm0		;	/* dup to prepare for up conversion */
+        movq		mm3, mm1		;	/* dup to prepare for up conversion */
+        movq		mm5, mm4		;	/* dup to prepare for up conversion */
+	        ;	/* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
+        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
+        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
+        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
+	        ;	/* average ReconPtr1 and ReconPtr2 */
+        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
+        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
+        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        movq		[edx], mm0		;	/* write answer out */
+        movq		[8 + edx], mm2		;	/* write answer out */
+	        ;	/* Increment pointers */
+        add		edx, 16		;	
+        add		eax, PixelsPerLine		;	
+        add		ebx, ReconPixelsPerLine		;	
+        add		ecx, ReconPixelsPerLine		;	
+
+
+        /*  ITERATION 8 */	
+        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
+        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
+        movq		mm2, mm0		;	/* dup to prepare for up conversion */
+        movq		mm3, mm1		;	/* dup to prepare for up conversion */
+        movq		mm5, mm4		;	/* dup to prepare for up conversion */
+	        ;	/* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
+        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
+        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
+        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
+	        ;	/* average ReconPtr1 and ReconPtr2 */
+        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
+        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
+        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        movq		[edx], mm0		;	/* write answer out */
+        movq		[8 + edx], mm2		;	/* write answer out */
+	        ;	/* Increment pointers */
+        add		edx, 16		;	
+        add		eax, PixelsPerLine		;	
+        add		ebx, ReconPixelsPerLine		;	
+        add		ecx, ReconPixelsPerLine		;	
+
+    };
+
+
+
+
+ 
+#endif
+}
+
+static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
+{
+
+#if 0
   ogg_uint32_t SadValue;
   ogg_uint32_t SadValue1;
 
@@ -720,70 +720,70 @@
 
   SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1;
 
-  return SadValue;
-
-#else
-  ogg_uint32_t MaxSad;
-
-  
-  __asm {
-    align       16
-    mov         ebx, Src1
-    mov         ecx, Src2
-
-
-    pxor		mm6, mm6		;	/* zero out mm6 for unpack */
-    pxor		mm7, mm7		;	/* zero out mm7 for unpack */
-    movq		mm0, [ebx]		;	/* take 8 bytes */
-    movq		mm1, [ecx]		;	
-
-    movq		mm2, mm0		;	
-    psubusb		mm0, mm1		;	/* A - B */
-    psubusb		mm1, mm2		;	/* B - A */
-    por		mm0, mm1		;	/* and or gives abs difference */
-
-    movq		mm1, mm0		;	
-
-    punpcklbw		mm0, mm6		;	/* ; unpack low four bytes to higher precision */
-    punpckhbw		mm1, mm7		;	/* ; unpack high four bytes to higher precision */
-
-    movq		mm2, mm0		;	
-    movq		mm3, mm1		;	
-    psrlq		mm2, 32		;	/* fold and add */
-    psrlq		mm3, 32		;	
-    paddw		mm0, mm2		;	
-    paddw		mm1, mm3		;	
-    movq		mm2, mm0		;	
-    movq		mm3, mm1		;	
-    psrlq		mm2, 16		;	
-    psrlq		mm3, 16		;	
-    paddw		mm0, mm2		;	
-    paddw		mm1, mm3		;	
-
-    psubusw		mm1, mm0		;	
-    paddw		mm1, mm0		;	/* mm1 = max(mm1, mm0) */
-    movd		eax, mm1		;
-
-    and         eax, 0xffff
-    mov         MaxSad, eax
-  };
-   return MaxSad;
-  
-  
-  
- 
-
-#endif
-}
-
-
-
-
-static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
-		                    ogg_uint32_t stride)
-{
-
-#if 0
+  return SadValue;
+
+#else
+  ogg_uint32_t MaxSad;
+
+  
+  __asm {
+    align       16
+    mov         ebx, Src1
+    mov         ecx, Src2
+
+
+    pxor		mm6, mm6		;	/* zero out mm6 for unpack */
+    pxor		mm7, mm7		;	/* zero out mm7 for unpack */
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+    movq		mm1, [ecx]		;	
+
+    movq		mm2, mm0		;	
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* ; unpack low four bytes to higher precision */
+    punpckhbw		mm1, mm7		;	/* ; unpack high four bytes to higher precision */
+
+    movq		mm2, mm0		;	
+    movq		mm3, mm1		;	
+    psrlq		mm2, 32		;	/* fold and add */
+    psrlq		mm3, 32		;	
+    paddw		mm0, mm2		;	
+    paddw		mm1, mm3		;	
+    movq		mm2, mm0		;	
+    movq		mm3, mm1		;	
+    psrlq		mm2, 16		;	
+    psrlq		mm3, 16		;	
+    paddw		mm0, mm2		;	
+    paddw		mm1, mm3		;	
+
+    psubusw		mm1, mm0		;	
+    paddw		mm1, mm0		;	/* mm1 = max(mm1, mm0) */
+    movd		eax, mm1		;
+
+    and         eax, 0xffff
+    mov         MaxSad, eax
+  };
+   return MaxSad;
+  
+  
+  
+ 
+
+#endif
+}
+
+
+
+
+static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
+		                    ogg_uint32_t stride)
+{
+
+#if 0
   ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0};
   ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0};
   ogg_uint32_t MaxSad = 0;
@@ -824,94 +824,94 @@
       MaxSad = SadValue2[i];
   }
     
-  return MaxSad;
-#else
-  ogg_uint32_t MaxSad;
-
-
-    __asm {
-        align       16
-        mov         ebx, Src1
-        mov         ecx, Src2
-
-        pxor		mm3, mm3		;	/* zero out mm3 for unpack */
-        pxor		mm4, mm4		;	/* mm4 low sum */
-        pxor		mm5, mm5		;	/* mm5 high sum */
-        pxor		mm6, mm6		;	/* mm6 low sum */
-        pxor		mm7, mm7		;	/* mm7 high sum */
-        mov		edi, 4		;	/* 4 rows */
-        label_1:				;	
-        movq		mm0, [ebx]		;	/* take 8 bytes */
-        movq		mm1, [ecx]		;	/* take 8 bytes */
-
-        movq		mm2, mm0		;	
-        psubusb		mm0, mm1		;	/* A - B */
-        psubusb		mm1, mm2		;	/* B - A */
-        por		mm0, mm1		;	/* and or gives abs difference */
-        movq		mm1, mm0		;	
-
-        punpcklbw		mm0, mm3		;	/* unpack to higher precision for accumulation */
-        paddw		mm4, mm0		;	/* accumulate difference... */
-        punpckhbw		mm1, mm3		;	/* unpack high four bytes to higher precision */
-        paddw		mm5, mm1		;	/* accumulate difference... */
-        add		ebx, stride		;	/* Inc pointer into the new data */
-        add		ecx, stride		;	/* Inc pointer into the new data */
-
-        dec		edi		;	
-        jnz		label_1		;	
-
-        mov		edi, 4		;	/* 4 rows */
-        label_2:				;	
-        movq		mm0, [ebx]		;	/* take 8 bytes */
-        movq		mm1, [ecx]		;	/* take 8 bytes */
-
-        movq		mm2, mm0		;	
-        psubusb		mm0, mm1		;	/* A - B */
-        psubusb		mm1, mm2		;	/* B - A */
-        por		mm0, mm1		;	/* and or gives abs difference */
-        movq		mm1, mm0		;	
-
-        punpcklbw		mm0, mm3		;	/* unpack to higher precision for accumulation */
-        paddw		mm6, mm0		;	/* accumulate difference... */
-        punpckhbw		mm1, mm3		;	/* unpack high four bytes to higher precision */
-        paddw		mm7, mm1		;	/* accumulate difference... */
-        add		ebx, stride		;	/* Inc pointer into the new data */
-        add		ecx, stride		;	/* Inc pointer into the new data */
-
-        dec		edi		;	
-        jnz		label_2		;	
-
-        psubusw		mm7, mm6		;	
-        paddw		mm7, mm6		;	/* mm7 = max(mm7, mm6) */
-        psubusw		mm5, mm4		;	
-        paddw		mm5, mm4		;	/* mm5 = max(mm5, mm4) */
-        psubusw		mm7, mm5		;	
-        paddw		mm7, mm5		;	/* mm7 = max(mm5, mm7) */
-        movq		mm6, mm7		;	
-        psrlq		mm6, 32		;	
-        psubusw		mm7, mm6		;	
-        paddw		mm7, mm6		;	/* mm7 = max(mm5, mm7) */
-        movq		mm6, mm7		;	
-        psrlq		mm6, 16		;	
-        psubusw		mm7, mm6		;	
-        paddw		mm7, mm6		;	/* mm7 = max(mm5, mm7) */
-        movd		eax, mm7		;	
-        and		    eax, 0xffff		;
-
-        mov         MaxSad, eax
-    };
-
-    return MaxSad;
-
-
-#endif
-}
-
-static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
-		       	    unsigned char *ptr2, ogg_uint32_t stride2)
-{
-
-#if 0
+  return MaxSad;
+#else
+  ogg_uint32_t MaxSad;
+
+
+    __asm {
+        align       16
+        mov         ebx, Src1
+        mov         ecx, Src2
+
+        pxor		mm3, mm3		;	/* zero out mm3 for unpack */
+        pxor		mm4, mm4		;	/* mm4 low sum */
+        pxor		mm5, mm5		;	/* mm5 high sum */
+        pxor		mm6, mm6		;	/* mm6 low sum */
+        pxor		mm7, mm7		;	/* mm7 high sum */
+        mov		edi, 4		;	/* 4 rows */
+        label_1:				;	
+        movq		mm0, [ebx]		;	/* take 8 bytes */
+        movq		mm1, [ecx]		;	/* take 8 bytes */
+
+        movq		mm2, mm0		;	
+        psubusb		mm0, mm1		;	/* A - B */
+        psubusb		mm1, mm2		;	/* B - A */
+        por		mm0, mm1		;	/* and or gives abs difference */
+        movq		mm1, mm0		;	
+
+        punpcklbw		mm0, mm3		;	/* unpack to higher precision for accumulation */
+        paddw		mm4, mm0		;	/* accumulate difference... */
+        punpckhbw		mm1, mm3		;	/* unpack high four bytes to higher precision */
+        paddw		mm5, mm1		;	/* accumulate difference... */
+        add		ebx, stride		;	/* Inc pointer into the new data */
+        add		ecx, stride		;	/* Inc pointer into the new data */
+
+        dec		edi		;	
+        jnz		label_1		;	
+
+        mov		edi, 4		;	/* 4 rows */
+        label_2:				;	
+        movq		mm0, [ebx]		;	/* take 8 bytes */
+        movq		mm1, [ecx]		;	/* take 8 bytes */
+
+        movq		mm2, mm0		;	
+        psubusb		mm0, mm1		;	/* A - B */
+        psubusb		mm1, mm2		;	/* B - A */
+        por		mm0, mm1		;	/* and or gives abs difference */
+        movq		mm1, mm0		;	
+
+        punpcklbw		mm0, mm3		;	/* unpack to higher precision for accumulation */
+        paddw		mm6, mm0		;	/* accumulate difference... */
+        punpckhbw		mm1, mm3		;	/* unpack high four bytes to higher precision */
+        paddw		mm7, mm1		;	/* accumulate difference... */
+        add		ebx, stride		;	/* Inc pointer into the new data */
+        add		ecx, stride		;	/* Inc pointer into the new data */
+
+        dec		edi		;	
+        jnz		label_2		;	
+
+        psubusw		mm7, mm6		;	
+        paddw		mm7, mm6		;	/* mm7 = max(mm7, mm6) */
+        psubusw		mm5, mm4		;	
+        paddw		mm5, mm4		;	/* mm5 = max(mm5, mm4) */
+        psubusw		mm7, mm5		;	
+        paddw		mm7, mm5		;	/* mm7 = max(mm5, mm7) */
+        movq		mm6, mm7		;	
+        psrlq		mm6, 32		;	
+        psubusw		mm7, mm6		;	
+        paddw		mm7, mm6		;	/* mm7 = max(mm5, mm7) */
+        movq		mm6, mm7		;	
+        psrlq		mm6, 16		;	
+        psubusw		mm7, mm6		;	
+        paddw		mm7, mm6		;	/* mm7 = max(mm5, mm7) */
+        movd		eax, mm7		;	
+        and		    eax, 0xffff		;
+
+        mov         MaxSad, eax
+    };
+
+    return MaxSad;
+
+
+#endif
+}
+
+static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
+		       	    unsigned char *ptr2, ogg_uint32_t stride2)
+{
+
+#if 0
   ogg_uint32_t  i;
   ogg_uint32_t  sad = 0;
 
@@ -930,189 +930,189 @@
     ptr2 += stride2;
   }
 
-  return sad;
-#else
-  ogg_uint32_t  DiffVal;
-
-  __asm {
-    align  16
-
-    mov         ebx, ptr1
-    mov         edx, ptr2
-
-    pxor		mm6, mm6		;	/* zero out mm6 for unpack */
-    pxor		mm7, mm7		;	/* mm7 contains the result */
-    
-    ; /* ITERATION 1 */
-    movq		mm0, [ebx]		;	/* take 8 bytes */
-    movq		mm1, [edx]		;	
-    movq		mm2, mm0		;	
-
-    psubusb		mm0, mm1		;	/* A - B */
-    psubusb		mm1, mm2		;	/* B - A */
-    por		mm0, mm1		;	/* and or gives abs difference */
-    movq		mm1, mm0		;	
-
-    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
-    paddw		mm7, mm0		;	/* accumulate difference... */
-    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
-    add		ebx, stride1		;	/* Inc pointer into the new data */
-    paddw		mm7, mm1		;	/* accumulate difference... */
-    add		edx, stride2		;	/* Inc pointer into ref data */
-
-    ; /* ITERATION 2 */
-    movq		mm0, [ebx]		;	/* take 8 bytes */
-    movq		mm1, [edx]		;	
-    movq		mm2, mm0		;	
-
-    psubusb		mm0, mm1		;	/* A - B */
-    psubusb		mm1, mm2		;	/* B - A */
-    por		mm0, mm1		;	/* and or gives abs difference */
-    movq		mm1, mm0		;	
-
-    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
-    paddw		mm7, mm0		;	/* accumulate difference... */
-    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
-    add		ebx, stride1		;	/* Inc pointer into the new data */
-    paddw		mm7, mm1		;	/* accumulate difference... */
-    add		edx, stride2		;	/* Inc pointer into ref data */
-
-
-    ; /* ITERATION 3 */
-    movq		mm0, [ebx]		;	/* take 8 bytes */
-    movq		mm1, [edx]		;	
-    movq		mm2, mm0		;	
-
-    psubusb		mm0, mm1		;	/* A - B */
-    psubusb		mm1, mm2		;	/* B - A */
-    por		mm0, mm1		;	/* and or gives abs difference */
-    movq		mm1, mm0		;	
-
-    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
-    paddw		mm7, mm0		;	/* accumulate difference... */
-    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
-    add		ebx, stride1		;	/* Inc pointer into the new data */
-    paddw		mm7, mm1		;	/* accumulate difference... */
-    add		edx, stride2		;	/* Inc pointer into ref data */
-
-    ; /* ITERATION 4 */
-    movq		mm0, [ebx]		;	/* take 8 bytes */
-    movq		mm1, [edx]		;	
-    movq		mm2, mm0		;	
-
-    psubusb		mm0, mm1		;	/* A - B */
-    psubusb		mm1, mm2		;	/* B - A */
-    por		mm0, mm1		;	/* and or gives abs difference */
-    movq		mm1, mm0		;	
-
-    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
-    paddw		mm7, mm0		;	/* accumulate difference... */
-    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
-    add		ebx, stride1		;	/* Inc pointer into the new data */
-    paddw		mm7, mm1		;	/* accumulate difference... */
-    add		edx, stride2		;	/* Inc pointer into ref data */
-
-
-    ; /* ITERATION 5 */
-    movq		mm0, [ebx]		;	/* take 8 bytes */
-    movq		mm1, [edx]		;	
-    movq		mm2, mm0		;	
-
-    psubusb		mm0, mm1		;	/* A - B */
-    psubusb		mm1, mm2		;	/* B - A */
-    por		mm0, mm1		;	/* and or gives abs difference */
-    movq		mm1, mm0		;	
-
-    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
-    paddw		mm7, mm0		;	/* accumulate difference... */
-    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
-    add		ebx, stride1		;	/* Inc pointer into the new data */
-    paddw		mm7, mm1		;	/* accumulate difference... */
-    add		edx, stride2		;	/* Inc pointer into ref data */
-
-
-    ; /* ITERATION 6 */
-    movq		mm0, [ebx]		;	/* take 8 bytes */
-    movq		mm1, [edx]		;	
-    movq		mm2, mm0		;	
-
-    psubusb		mm0, mm1		;	/* A - B */
-    psubusb		mm1, mm2		;	/* B - A */
-    por		mm0, mm1		;	/* and or gives abs difference */
-    movq		mm1, mm0		;	
-
-    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
-    paddw		mm7, mm0		;	/* accumulate difference... */
-    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
-    add		ebx, stride1		;	/* Inc pointer into the new data */
-    paddw		mm7, mm1		;	/* accumulate difference... */
-    add		edx, stride2		;	/* Inc pointer into ref data */
-
-
-    ; /* ITERATION 7 */
-    movq		mm0, [ebx]		;	/* take 8 bytes */
-    movq		mm1, [edx]		;	
-    movq		mm2, mm0		;	
-
-    psubusb		mm0, mm1		;	/* A - B */
-    psubusb		mm1, mm2		;	/* B - A */
-    por		mm0, mm1		;	/* and or gives abs difference */
-    movq		mm1, mm0		;	
-
-    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
-    paddw		mm7, mm0		;	/* accumulate difference... */
-    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
-    add		ebx, stride1		;	/* Inc pointer into the new data */
-    paddw		mm7, mm1		;	/* accumulate difference... */
-    add		edx, stride2		;	/* Inc pointer into ref data */
-
-
-
-    ; /* ITERATION 8 */
-    movq		mm0, [ebx]		;	/* take 8 bytes */
-    movq		mm1, [edx]		;	
-    movq		mm2, mm0		;	
-
-    psubusb		mm0, mm1		;	/* A - B */
-    psubusb		mm1, mm2		;	/* B - A */
-    por		mm0, mm1		;	/* and or gives abs difference */
-    movq		mm1, mm0		;	
-
-    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
-    paddw		mm7, mm0		;	/* accumulate difference... */
-    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
-    add		ebx, stride1		;	/* Inc pointer into the new data */
-    paddw		mm7, mm1		;	/* accumulate difference... */
-    add		edx, stride2		;	/* Inc pointer into ref data */
-
-
-
-    ; /* ------ */
-
-    movq		mm0, mm7		;	
-    psrlq		mm7, 32		;	
-    paddw		mm7, mm0		;	
-    movq		mm0, mm7		;	
-    psrlq		mm7, 16		;	
-    paddw		mm7, mm0		;	
-    movd		eax, mm7		;	
-    and		    eax, 0xffff		;	
-
-    mov         DiffVal, eax
-  };
-
-  return DiffVal;
-
- 
-
-#endif
-}
-
-static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
-		       		  unsigned char *ptr2, ogg_uint32_t stride2, 
-			   	  ogg_uint32_t thres)
-{
-#if 0
+  return sad;
+#else
+  ogg_uint32_t  DiffVal;
+
+  __asm {
+    align  16
+
+    mov         ebx, ptr1
+    mov         edx, ptr2
+
+    pxor		mm6, mm6		;	/* zero out mm6 for unpack */
+    pxor		mm7, mm7		;	/* mm7 contains the result */
+    
+    ; /* ITERATION 1 */
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+    movq		mm1, [edx]		;	
+    movq		mm2, mm0		;	
+
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
+    paddw		mm7, mm0		;	/* accumulate difference... */
+    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
+    add		ebx, stride1		;	/* Inc pointer into the new data */
+    paddw		mm7, mm1		;	/* accumulate difference... */
+    add		edx, stride2		;	/* Inc pointer into ref data */
+
+    ; /* ITERATION 2 */
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+    movq		mm1, [edx]		;	
+    movq		mm2, mm0		;	
+
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
+    paddw		mm7, mm0		;	/* accumulate difference... */
+    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
+    add		ebx, stride1		;	/* Inc pointer into the new data */
+    paddw		mm7, mm1		;	/* accumulate difference... */
+    add		edx, stride2		;	/* Inc pointer into ref data */
+
+
+    ; /* ITERATION 3 */
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+    movq		mm1, [edx]		;	
+    movq		mm2, mm0		;	
+
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
+    paddw		mm7, mm0		;	/* accumulate difference... */
+    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
+    add		ebx, stride1		;	/* Inc pointer into the new data */
+    paddw		mm7, mm1		;	/* accumulate difference... */
+    add		edx, stride2		;	/* Inc pointer into ref data */
+
+    ; /* ITERATION 4 */
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+    movq		mm1, [edx]		;	
+    movq		mm2, mm0		;	
+
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
+    paddw		mm7, mm0		;	/* accumulate difference... */
+    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
+    add		ebx, stride1		;	/* Inc pointer into the new data */
+    paddw		mm7, mm1		;	/* accumulate difference... */
+    add		edx, stride2		;	/* Inc pointer into ref data */
+
+
+    ; /* ITERATION 5 */
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+    movq		mm1, [edx]		;	
+    movq		mm2, mm0		;	
+
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
+    paddw		mm7, mm0		;	/* accumulate difference... */
+    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
+    add		ebx, stride1		;	/* Inc pointer into the new data */
+    paddw		mm7, mm1		;	/* accumulate difference... */
+    add		edx, stride2		;	/* Inc pointer into ref data */
+
+
+    ; /* ITERATION 6 */
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+    movq		mm1, [edx]		;	
+    movq		mm2, mm0		;	
+
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
+    paddw		mm7, mm0		;	/* accumulate difference... */
+    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
+    add		ebx, stride1		;	/* Inc pointer into the new data */
+    paddw		mm7, mm1		;	/* accumulate difference... */
+    add		edx, stride2		;	/* Inc pointer into ref data */
+
+
+    ; /* ITERATION 7 */
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+    movq		mm1, [edx]		;	
+    movq		mm2, mm0		;	
+
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
+    paddw		mm7, mm0		;	/* accumulate difference... */
+    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
+    add		ebx, stride1		;	/* Inc pointer into the new data */
+    paddw		mm7, mm1		;	/* accumulate difference... */
+    add		edx, stride2		;	/* Inc pointer into ref data */
+
+
+
+    ; /* ITERATION 8 */
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+    movq		mm1, [edx]		;	
+    movq		mm2, mm0		;	
+
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
+    paddw		mm7, mm0		;	/* accumulate difference... */
+    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
+    add		ebx, stride1		;	/* Inc pointer into the new data */
+    paddw		mm7, mm1		;	/* accumulate difference... */
+    add		edx, stride2		;	/* Inc pointer into ref data */
+
+
+
+    ; /* ------ */
+
+    movq		mm0, mm7		;	
+    psrlq		mm7, 32		;	
+    paddw		mm7, mm0		;	
+    movq		mm0, mm7		;	
+    psrlq		mm7, 16		;	
+    paddw		mm7, mm0		;	
+    movd		eax, mm7		;	
+    and		    eax, 0xffff		;	
+
+    mov         DiffVal, eax
+  };
+
+  return DiffVal;
+
+ 
+
+#endif
+}
+
+static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
+		       		  unsigned char *ptr2, ogg_uint32_t stride2, 
+			   	  ogg_uint32_t thres)
+{
+#if 0
   ogg_uint32_t  i;
   ogg_uint32_t  sad = 0;
 
@@ -1134,19 +1134,19 @@
     ptr2 += stride2;
   }
 
-  return sad;
-#else
-  return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
-#endif
-}
-
-
-static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
-		                      unsigned char *RefDataPtr1,
-			              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
-			              ogg_uint32_t thres)
-{
-#if 0
+  return sad;
+#else
+  return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
+#endif
+}
+
+
+static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
+		                      unsigned char *RefDataPtr1,
+			              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
+			              ogg_uint32_t thres)
+{
+#if 0
   ogg_uint32_t  i;
   ogg_uint32_t  sad = 0;
 
@@ -1169,76 +1169,76 @@
     RefDataPtr2 += RefStride;
   }
 
-  return sad;
-#else
-  ogg_uint32_t  DiffVal;
-
-  __asm {
-    align 16
-
-        mov     ebx, SrcData
-        mov     ecx, RefDataPtr1
-        mov     edx, RefDataPtr2
-
-
-    pcmpeqd		mm5, mm5		;	/* fefefefefefefefe in mm5 */
-    paddb		mm5, mm5		;	
-				    ;	
-    pxor		mm6, mm6		;	/* zero out mm6 for unpack */
-    pxor		mm7, mm7		;	/* mm7 contains the result */
-    mov		edi, 8		;	/* 8 rows */
-    loop_start:				;	
-    movq		mm0, [ebx]		;	/* take 8 bytes */
-
-    movq		mm2, [ecx]		;	
-    movq		mm3, [edx]		;	/* take average of mm2 and mm3 */
-    movq		mm1, mm2		;	
-    pand		mm1, mm3		;	
-    pxor		mm3, mm2		;	
-    pand		mm3, mm5		;	
-    psrlq		mm3, 1		;	
-    paddb		mm1, mm3		;	
-
-    movq		mm2, mm0		;	
-
-    psubusb		mm0, mm1		;	/* A - B */
-    psubusb		mm1, mm2		;	/* B - A */
-    por		mm0, mm1		;	/* and or gives abs difference */
-    movq		mm1, mm0		;	
-
-    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
-    paddw		mm7, mm0		;	/* accumulate difference... */
-    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
-    add		ebx, SrcStride		;	/* Inc pointer into the new data */
-    paddw		mm7, mm1		;	/* accumulate difference... */
-    add		ecx, RefStride		;	/* Inc pointer into ref data */
-    add		edx, RefStride		;	/* Inc pointer into ref data */
-
-    dec		edi		;	
-    jnz		loop_start		;	
-
-    movq		mm0, mm7		;	
-    psrlq		mm7, 32		;	
-    paddw		mm7, mm0		;	
-    movq		mm0, mm7		;	
-    psrlq		mm7, 16		;	
-    paddw		mm7, mm0		;	
-    movd		eax, mm7		;	
-    and		eax, 0xffff		;	
-
-    mov DiffVal, eax
-  };
-
-  return DiffVal;
-
- 
-
-#endif
-}
-
-static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
-{
-#if 0
+  return sad;
+#else
+  ogg_uint32_t  DiffVal;
+
+  __asm {
+    align 16
+
+        mov     ebx, SrcData
+        mov     ecx, RefDataPtr1
+        mov     edx, RefDataPtr2
+
+
+    pcmpeqd		mm5, mm5		;	/* fefefefefefefefe in mm5 */
+    paddb		mm5, mm5		;	
+				    ;	
+    pxor		mm6, mm6		;	/* zero out mm6 for unpack */
+    pxor		mm7, mm7		;	/* mm7 contains the result */
+    mov		edi, 8		;	/* 8 rows */
+    loop_start:				;	
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+
+    movq		mm2, [ecx]		;	
+    movq		mm3, [edx]		;	/* take average of mm2 and mm3 */
+    movq		mm1, mm2		;	
+    pand		mm1, mm3		;	
+    pxor		mm3, mm2		;	
+    pand		mm3, mm5		;	
+    psrlq		mm3, 1		;	
+    paddb		mm1, mm3		;	
+
+    movq		mm2, mm0		;	
+
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
+    paddw		mm7, mm0		;	/* accumulate difference... */
+    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
+    add		ebx, SrcStride		;	/* Inc pointer into the new data */
+    paddw		mm7, mm1		;	/* accumulate difference... */
+    add		ecx, RefStride		;	/* Inc pointer into ref data */
+    add		edx, RefStride		;	/* Inc pointer into ref data */
+
+    dec		edi		;	
+    jnz		loop_start		;	
+
+    movq		mm0, mm7		;	
+    psrlq		mm7, 32		;	
+    paddw		mm7, mm0		;	
+    movq		mm0, mm7		;	
+    psrlq		mm7, 16		;	
+    paddw		mm7, mm0		;	
+    movd		eax, mm7		;	
+    and		eax, 0xffff		;	
+
+    mov DiffVal, eax
+  };
+
+  return DiffVal;
+
+ 
+
+#endif
+}
+
+static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
+{
+#if 0
   ogg_uint32_t  i;
   ogg_uint32_t  XSum=0;
   ogg_uint32_t  XXSum=0;
@@ -1267,73 +1267,73 @@
    }
 
    /* Compute population variance as mis-match metric. */
-   return (( (XXSum<<6) - XSum*XSum ) );
-#else
-  ogg_uint32_t  XSum;
-  ogg_uint32_t  XXSum;
-
-  __asm {
-    align 16
-
-        mov     ecx, DataPtr
-
-    pxor		mm5, mm5		;	
-    pxor		mm6, mm6		;	
-    pxor		mm7, mm7		;	
-    mov		edi, 8		;	
-    loop_start:		
-    movq		mm0, [ecx]		;	/* take 8 bytes */
-    movq		mm2, mm0		;	
-
-    punpcklbw		mm0, mm6		;	
-    punpckhbw		mm2, mm6		;	
-
-    paddw		mm5, mm0		;	
-    paddw		mm5, mm2		;	
-
-    pmaddwd		mm0, mm0		;	
-    pmaddwd		mm2, mm2		;	
-				    ;	
-    paddd		mm7, mm0		;	
-    paddd		mm7, mm2		;	
-
-    add		ecx, Stride		;	/* Inc pointer into src data */
-
-    dec		edi		;	
-    jnz		loop_start		;	
-
-    movq		mm0, mm5		;	
-    psrlq		mm5, 32		;	
-    paddw		mm5, mm0		;	
-    movq		mm0, mm5		;	
-    psrlq		mm5, 16		;	
-    paddw		mm5, mm0		;	
-    movd		edi, mm5		;	
-    movsx		edi, di		;	
-    mov		eax, edi		;	
-
-    movq		mm0, mm7		;	
-    psrlq		mm7, 32		;	
-    paddd		mm7, mm0		;	
-    movd		ebx, mm7		;	
-
-        mov         XSum, eax
-        mov         XXSum, ebx;
-
-  };
-    /* Compute population variance as mis-match metric. */
-    return (( (XXSum<<6) - XSum*XSum ) );
-
- 
-
-#endif
-}
-
-static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
-		                 unsigned char *RefDataPtr, ogg_uint32_t RefStride)
-{
-
-#if 0
+   return (( (XXSum<<6) - XSum*XSum ) );
+#else
+  ogg_uint32_t  XSum;
+  ogg_uint32_t  XXSum;
+
+  __asm {
+    align 16
+
+        mov     ecx, DataPtr
+
+    pxor		mm5, mm5		;	
+    pxor		mm6, mm6		;	
+    pxor		mm7, mm7		;	
+    mov		edi, 8		;	
+    loop_start:		
+    movq		mm0, [ecx]		;	/* take 8 bytes */
+    movq		mm2, mm0		;	
+
+    punpcklbw		mm0, mm6		;	
+    punpckhbw		mm2, mm6		;	
+
+    paddw		mm5, mm0		;	
+    paddw		mm5, mm2		;	
+
+    pmaddwd		mm0, mm0		;	
+    pmaddwd		mm2, mm2		;	
+				    ;	
+    paddd		mm7, mm0		;	
+    paddd		mm7, mm2		;	
+
+    add		ecx, Stride		;	/* Inc pointer into src data */
+
+    dec		edi		;	
+    jnz		loop_start		;	
+
+    movq		mm0, mm5		;	
+    psrlq		mm5, 32		;	
+    paddw		mm5, mm0		;	
+    movq		mm0, mm5		;	
+    psrlq		mm5, 16		;	
+    paddw		mm5, mm0		;	
+    movd		edi, mm5		;	
+    movsx		edi, di		;	
+    mov		eax, edi		;	
+
+    movq		mm0, mm7		;	
+    psrlq		mm7, 32		;	
+    paddd		mm7, mm0		;	
+    movd		ebx, mm7		;	
+
+        mov         XSum, eax
+        mov         XXSum, ebx;
+
+  };
+    /* Compute population variance as mis-match metric. */
+    return (( (XXSum<<6) - XSum*XSum ) );
+
+ 
+
+#endif
+}
+
+static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
+		                 unsigned char *RefDataPtr, ogg_uint32_t RefStride)
+{
+
+#if 0
   ogg_uint32_t  i;
   ogg_uint32_t  XSum=0;
   ogg_uint32_t  XXSum=0;
@@ -1378,83 +1378,83 @@
   }
 
   /* Compute and return population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ));
-#else
-  ogg_uint32_t  XSum;
-  ogg_uint32_t  XXSum;
-
-
-  __asm {
-    align 16
-
-        mov     ecx, SrcData
-        mov     edx, RefDataPtr
-
-    pxor		mm5, mm5		;	
-    pxor		mm6, mm6		;	
-    pxor		mm7, mm7		;	
-    mov		edi, 8		;	
-    loop_start:				;	
-    movq		mm0, [ecx]		;	/* take 8 bytes */
-    movq		mm1, [edx]		;	
-    movq		mm2, mm0		;	
-    movq		mm3, mm1		;	
-
-    punpcklbw		mm0, mm6		;	
-    punpcklbw		mm1, mm6		;	
-    punpckhbw		mm2, mm6		;	
-    punpckhbw		mm3, mm6		;	
-
-    psubsw		mm0, mm1		;	
-    psubsw		mm2, mm3		;	
-
-    paddw		mm5, mm0		;	
-    paddw		mm5, mm2		;	
-
-    pmaddwd		mm0, mm0		;	
-    pmaddwd		mm2, mm2		;	
-				    ;	
-    paddd		mm7, mm0		;	
-    paddd		mm7, mm2		;	
-
-    add		ecx, SrcStride		;	/* Inc pointer into src data */
-    add		edx, RefStride		;	/* Inc pointer into ref data */
-
-    dec		edi		;	
-    jnz		loop_start		;	
-
-    movq		mm0, mm5		;	
-    psrlq		mm5, 32		;	
-    paddw		mm5, mm0		;	
-    movq		mm0, mm5		;	
-    psrlq		mm5, 16		;	
-    paddw		mm5, mm0		;	
-    movd		edi, mm5		;	
-    movsx		edi, di		;	
-    mov		eax, edi		;	
-
-    movq		mm0, mm7		;	
-    psrlq		mm7, 32		;	
-    paddd		mm7, mm0		;	
-    movd		ebx, mm7		;	
-
-        mov     XSum, eax
-        mov     XXSum, ebx
-
-  };
-
-  /* Compute and return population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ));
-
- 
-#endif
-}
-
-static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
-		                     unsigned char *RefDataPtr1,
-				     unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
-{
-#if 0
+  return (( (XXSum<<6) - XSum*XSum ));
+#else
+  ogg_uint32_t  XSum;
+  ogg_uint32_t  XXSum;
+
+
+  __asm {
+    align 16
+
+        mov     ecx, SrcData
+        mov     edx, RefDataPtr
+
+    pxor		mm5, mm5		;	
+    pxor		mm6, mm6		;	
+    pxor		mm7, mm7		;	
+    mov		edi, 8		;	
+    loop_start:				;	
+    movq		mm0, [ecx]		;	/* take 8 bytes */
+    movq		mm1, [edx]		;	
+    movq		mm2, mm0		;	
+    movq		mm3, mm1		;	
+
+    punpcklbw		mm0, mm6		;	
+    punpcklbw		mm1, mm6		;	
+    punpckhbw		mm2, mm6		;	
+    punpckhbw		mm3, mm6		;	
+
+    psubsw		mm0, mm1		;	
+    psubsw		mm2, mm3		;	
+
+    paddw		mm5, mm0		;	
+    paddw		mm5, mm2		;	
+
+    pmaddwd		mm0, mm0		;	
+    pmaddwd		mm2, mm2		;	
+				    ;	
+    paddd		mm7, mm0		;	
+    paddd		mm7, mm2		;	
+
+    add		ecx, SrcStride		;	/* Inc pointer into src data */
+    add		edx, RefStride		;	/* Inc pointer into ref data */
+
+    dec		edi		;	
+    jnz		loop_start		;	
+
+    movq		mm0, mm5		;	
+    psrlq		mm5, 32		;	
+    paddw		mm5, mm0		;	
+    movq		mm0, mm5		;	
+    psrlq		mm5, 16		;	
+    paddw		mm5, mm0		;	
+    movd		edi, mm5		;	
+    movsx		edi, di		;	
+    mov		eax, edi		;	
+
+    movq		mm0, mm7		;	
+    psrlq		mm7, 32		;	
+    paddd		mm7, mm0		;	
+    movd		ebx, mm7		;	
+
+        mov     XSum, eax
+        mov     XXSum, ebx
+
+  };
+
+  /* Compute and return population variance as mis-match metric. */
+  return (( (XXSum<<6) - XSum*XSum ));
+
+ 
+#endif
+}
+
+static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
+		                     unsigned char *RefDataPtr1,
+				     unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
+{
+#if 0
   ogg_uint32_t  i;
   ogg_uint32_t  XSum=0;
   ogg_uint32_t  XXSum=0;
@@ -1500,107 +1500,107 @@
   }
 
   /* Compute and return population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ));
-#else
-  ogg_uint32_t XSum;
-  ogg_uint32_t XXSum;
-
-  __asm {
-    align 16
-
-        mov ebx, SrcData
-        mov ecx, RefDataPtr1
-        mov edx, RefDataPtr2
-
-    pcmpeqd		mm4, mm4		;	/* fefefefefefefefe in mm4 */
-    paddb		mm4, mm4		;	
-    pxor		mm5, mm5		;	
-    pxor		mm6, mm6		;	
-    pxor		mm7, mm7		;	
-    mov		edi, 8		;	
-    loop_start:				;	
-    movq		mm0, [ebx]		;	/* take 8 bytes */
-
-    movq		mm2, [ecx]		;	
-    movq		mm3, [edx]		;	/* take average of mm2 and mm3 */
-    movq		mm1, mm2		;	
-    pand		mm1, mm3		;	
-    pxor		mm3, mm2		;	
-    pand		mm3, mm4		;	
-    psrlq		mm3, 1		;	
-    paddb		mm1, mm3		;	
-
-    movq		mm2, mm0		;	
-    movq		mm3, mm1		;	
-
-    punpcklbw		mm0, mm6		;	
-    punpcklbw		mm1, mm6		;	
-    punpckhbw		mm2, mm6		;	
-    punpckhbw		mm3, mm6		;	
-
-    psubsw		mm0, mm1		;	
-    psubsw		mm2, mm3		;	
-
-    paddw		mm5, mm0		;	
-    paddw		mm5, mm2		;	
-
-    pmaddwd		mm0, mm0		;	
-    pmaddwd		mm2, mm2		;	
-				    ;	
-    paddd		mm7, mm0		;	
-    paddd		mm7, mm2		;	
-
-    add		ebx, SrcStride		;	/* Inc pointer into src data */
-    add		ecx, RefStride		;	/* Inc pointer into ref data */
-    add		edx, RefStride		;	/* Inc pointer into ref data */
-
-    dec		edi		;	
-    jnz		loop_start		;	
-
-    movq		mm0, mm5		;	
-    psrlq		mm5, 32		;	
-    paddw		mm5, mm0		;	
-    movq		mm0, mm5		;	
-    psrlq		mm5, 16		;	
-    paddw		mm5, mm0		;	
-    movd		edi, mm5		;	
-    movsx		edi, di		;	
-    mov         XSum, edi   ; /* movl		eax, edi		;	Modified for vc to resuse eax*/
-
-    movq		mm0, mm7		;	
-    psrlq		mm7, 32		;	
-    paddd		mm7, mm0		;	
-    movd        XXSum, mm7 ; /*movd		eax, mm7		; Modified for vc to reuse eax */
-  };
-
-    return (( (XXSum<<6) - XSum*XSum ));
-
-#endif
-}
-
-static void restore_fpu (void)
-{
-
-    __asm {
-        emms
-    }
-
-}
-
-void dsp_mmx_init(DspFunctions *funcs)
-{
-  TH_DEBUG("enabling accelerated x86_32 mmx dsp functions.\n");
-  funcs->restore_fpu = restore_fpu;
-  funcs->sub8x8 = sub8x8__mmx;
-  funcs->sub8x8_128 = sub8x8_128__mmx;
-  funcs->sub8x8avg2 = sub8x8avg2__mmx;
-  funcs->row_sad8 = row_sad8__mmx;
-  funcs->col_sad8x8 = col_sad8x8__mmx;
-  funcs->sad8x8 = sad8x8__mmx;
-  funcs->sad8x8_thres = sad8x8_thres__mmx;
-  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx;
-  funcs->intra8x8_err = intra8x8_err__mmx;
-  funcs->inter8x8_err = inter8x8_err__mmx;
-  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
-}
-
+  return (( (XXSum<<6) - XSum*XSum ));
+#else
+  ogg_uint32_t XSum;
+  ogg_uint32_t XXSum;
+
+  __asm {
+    align 16
+
+        mov ebx, SrcData
+        mov ecx, RefDataPtr1
+        mov edx, RefDataPtr2
+
+    pcmpeqd		mm4, mm4		;	/* fefefefefefefefe in mm4 */
+    paddb		mm4, mm4		;	
+    pxor		mm5, mm5		;	
+    pxor		mm6, mm6		;	
+    pxor		mm7, mm7		;	
+    mov		edi, 8		;	
+    loop_start:				;	
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+
+    movq		mm2, [ecx]		;	
+    movq		mm3, [edx]		;	/* take average of mm2 and mm3 */
+    movq		mm1, mm2		;	
+    pand		mm1, mm3		;	
+    pxor		mm3, mm2		;	
+    pand		mm3, mm4		;	
+    psrlq		mm3, 1		;	
+    paddb		mm1, mm3		;	
+
+    movq		mm2, mm0		;	
+    movq		mm3, mm1		;	
+
+    punpcklbw		mm0, mm6		;	
+    punpcklbw		mm1, mm6		;	
+    punpckhbw		mm2, mm6		;	
+    punpckhbw		mm3, mm6		;	
+
+    psubsw		mm0, mm1		;	
+    psubsw		mm2, mm3		;	
+
+    paddw		mm5, mm0		;	
+    paddw		mm5, mm2		;	
+
+    pmaddwd		mm0, mm0		;	
+    pmaddwd		mm2, mm2		;	
+				    ;	
+    paddd		mm7, mm0		;	
+    paddd		mm7, mm2		;	
+
+    add		ebx, SrcStride		;	/* Inc pointer into src data */
+    add		ecx, RefStride		;	/* Inc pointer into ref data */
+    add		edx, RefStride		;	/* Inc pointer into ref data */
+
+    dec		edi		;	
+    jnz		loop_start		;	
+
+    movq		mm0, mm5		;	
+    psrlq		mm5, 32		;	
+    paddw		mm5, mm0		;	
+    movq		mm0, mm5		;	
+    psrlq		mm5, 16		;	
+    paddw		mm5, mm0		;	
+    movd		edi, mm5		;	
+    movsx		edi, di		;	
+    mov         XSum, edi   ; /* movl		eax, edi		;	Modified for vc to resuse eax*/
+
+    movq		mm0, mm7		;	
+    psrlq		mm7, 32		;	
+    paddd		mm7, mm0		;	
+    movd        XXSum, mm7 ; /*movd		eax, mm7		; Modified for vc to reuse eax */
+  };
+
+    return (( (XXSum<<6) - XSum*XSum ));
+
+#endif
+}
+
+static void restore_fpu (void)
+{
+
+    __asm {
+        emms
+    }
+
+}
+
+void dsp_mmx_init(DspFunctions *funcs)
+{
+  TH_DEBUG("enabling accelerated x86_32 mmx dsp functions.\n");
+  funcs->restore_fpu = restore_fpu;
+  funcs->sub8x8 = sub8x8__mmx;
+  funcs->sub8x8_128 = sub8x8_128__mmx;
+  funcs->sub8x8avg2 = sub8x8avg2__mmx;
+  funcs->row_sad8 = row_sad8__mmx;
+  funcs->col_sad8x8 = col_sad8x8__mmx;
+  funcs->sad8x8 = sad8x8__mmx;
+  funcs->sad8x8_thres = sad8x8_thres__mmx;
+  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx;
+  funcs->intra8x8_err = intra8x8_err__mmx;
+  funcs->inter8x8_err = inter8x8_err__mmx;
+  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
+}
+


Property changes on: trunk/theora/lib/enc/x86_32_vs/dsp_mmx.c
___________________________________________________________________
Name: svn:eol-style
   + native



More information about the commits mailing list