[xiph-commits] r11500 - in branches/theora-playtime: lib lib/x86_32_vs win32/VS2005/libtheora

Sat Jun 3 03:08:55 PDT 2006

Author: illiminable
Date: 2006-06-03 03:08:41 -0700 (Sat, 03 Jun 2006)
New Revision: 11500

Added:
   branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c
Modified:
   branches/theora-playtime/lib/codec_internal.h
   branches/theora-playtime/lib/dsp.c
   branches/theora-playtime/lib/encoder_toplevel.c
   branches/theora-playtime/lib/reconstruct.c
   branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
Log:
* DCTDataBuffer malloc 16 byte aligned
* Add sse2 setup in reconstruct.c and dsp.c
* Add a #define for aligned free
* Implement sub8x8__sse2

Modified: branches/theora-playtime/lib/codec_internal.h
===================================================================

--- branches/theora-playtime/lib/codec_internal.h	2006-06-02 17:45:25 UTC (rev 11499)
+++ branches/theora-playtime/lib/codec_internal.h	2006-06-03 10:08:41 UTC (rev 11500)
@@ -27,10 +27,12 @@
 #include "dsp.h"
 
 #if defined(USE_ASM) && defined(_MSC_VER)
+
 #define _theora_16_byte_aligned_malloc(x) _aligned_malloc((x),16)
 #define _theora_16_byte_aligned_free(x) _aligned_free((x))
 #else
 #define _theora_16_byte_aligned_malloc _ogg_malloc
+#define _theora_16_byte_aligned_free _ogg_free
 #endif
 
 #ifndef LIBOGG2

Modified: branches/theora-playtime/lib/dsp.c
===================================================================
--- branches/theora-playtime/lib/dsp.c	2006-06-02 17:45:25 UTC (rev 11499)
+++ branches/theora-playtime/lib/dsp.c	2006-06-03 10:08:41 UTC (rev 11500)
@@ -413,12 +413,19 @@
   if (cpuflags & CPU_X86_MMX) {
     dsp_mmx_init(funcs);
   }
+
 # ifndef WIN32
-  /* This is implemented for win32 yet */
+  /* This is not implemented for win32 yet */
   if (cpuflags & CPU_X86_MMXEXT) {
     dsp_mmxext_init(funcs);
   }
 # endif
+
+  if (cpuflags & CPU_X86_SSE2) {
+    dsp_sse2_init(funcs);
+  }
+
+
 #endif
 }
 

Modified: branches/theora-playtime/lib/encoder_toplevel.c
===================================================================
--- branches/theora-playtime/lib/encoder_toplevel.c	2006-06-02 17:45:25 UTC (rev 11499)
+++ branches/theora-playtime/lib/encoder_toplevel.c	2006-06-03 10:08:41 UTC (rev 11500)
@@ -49,7 +49,7 @@
   if(cpi->DCT_codes )
     _ogg_free( cpi->DCT_codes );
   if(cpi->DCTDataBuffer )
-    _ogg_free( cpi->DCTDataBuffer);
+    _theora_16_byte_aligned_free( cpi->DCTDataBuffer);
   if(cpi->quantized_list)
     _ogg_free( cpi->quantized_list);
   if(cpi->OriginalDC)
@@ -140,7 +140,7 @@
     _ogg_malloc(64*
                 sizeof(*cpi->DCT_codes));
   cpi->DCTDataBuffer =
-    _ogg_malloc(64*
+    _theora_16_byte_aligned_malloc(64*
                 sizeof(*cpi->DCTDataBuffer));
   cpi->quantized_list =
     _ogg_malloc(64*

Modified: branches/theora-playtime/lib/reconstruct.c
===================================================================
--- branches/theora-playtime/lib/reconstruct.c	2006-06-02 17:45:25 UTC (rev 11499)
+++ branches/theora-playtime/lib/reconstruct.c	2006-06-03 10:08:41 UTC (rev 11500)
@@ -105,10 +105,12 @@
   funcs->recon_inter8x8 = recon_inter8x8__c;
   funcs->recon_inter8x8_half = recon_inter8x8_half__c;
 #if defined(USE_ASM)
+  if (cpu_flags & CPU_X86_MMX) {
+    dsp_mmx_recon_init(funcs);
+  }
+
   if (cpu_flags & CPU_X86_SSE2) {
     dsp_sse2_recon_init(funcs);
-  } else if (cpu_flags & CPU_X86_MMX) {
-    dsp_mmx_recon_init(funcs);
   }
 #endif
 }

Added: branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c
===================================================================
--- branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c	2006-06-02 17:45:25 UTC (rev 11499)
+++ branches/theora-playtime/lib/x86_32_vs/dsp_sse2.c	2006-06-03 10:08:41 UTC (rev 11500)
@@ -0,0 +1,1542 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************/
+
+
+#include <stdlib.h>
+
+#include "codec_internal.h"
+#include "dsp.h"
+
+#if 0
+//These are to let me selectively enable the C versions, these are needed
+#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
+#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
+#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
+#endif
+
+
+static const ogg_int64_t V128 = 0x0080008000800080LL;
+
+static void sub8x8__sse2 (unsigned char *FiltPtr, unsigned char *ReconPtr,
+                  ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
+                  ogg_uint32_t ReconPixelsPerLine) 
+{
+
+    //Make non-zero to use the C-version
+#if 0
+  int i;
+
+  /* For each block row */
+  for (i=8; i; i--) {
+    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], ReconPtr[0]);
+    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], ReconPtr[1]);
+    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], ReconPtr[2]);
+    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], ReconPtr[3]);
+    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], ReconPtr[4]);
+    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], ReconPtr[5]);
+    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], ReconPtr[6]);
+    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], ReconPtr[7]);
+
+    /* Start next row */
+    FiltPtr += PixelsPerLine;
+    ReconPtr += ReconPixelsPerLine;
+    DctInputPtr += 8;
+  }
+#else
+
+    __asm {
+        align 16
+
+        pxor		xmm0, xmm0	
+
+        mov     eax, FiltPtr
+        mov     ebx, ReconPtr
+        mov     edi, DctInputPtr
+        mov     ecx, PixelsPerLine
+        mov     edx, ReconPixelsPerLine
+
+        /* ITERATION 1 */
+        movq    xmm1, QWORD PTR [eax]     ;       /*  xmm1_L = FiltPtr[0..7] */
+        movq    xmm2, QWORD PTR [ebx]     ;       /*  xmm2_L = ReconPtr[0..7] */
+        punpcklbw   xmm1, xmm0  ;       /*  xmm1 = INT16(FiltPtr[0..7]) */
+        punpcklbw   xmm2, xmm0  ;       /*  xmm2 = INT16(ReconPtr[0..7]) */
+        psubw       xmm1, xmm2  ;       /*  xmm1 -= xmm2 {int16wise} */
+
+
+
+        /* Increment pointers */
+        add		eax, ecx		
+        add		ebx, edx
+
+        /* ITERATION 2 */
+        movq    xmm2, QWORD PTR [eax]     
+        movq    xmm3, QWORD PTR [ebx]     
+        punpcklbw   xmm2, xmm0  
+        punpcklbw   xmm3, xmm0  
+        psubw       xmm2, xmm3  
+
+
+
+        /* Increment pointers */
+        add		eax, ecx		
+        add		ebx, edx
+
+         /* ITERATION 3 */
+        movq    xmm3, QWORD PTR [eax]     
+        movq    xmm4, QWORD PTR [ebx]     
+        punpcklbw   xmm3, xmm0  
+        punpcklbw   xmm4, xmm0  
+        psubw       xmm3, xmm4  
+
+
+
+        /* Increment pointers */
+        add		eax, ecx		
+        add		ebx, edx
+
+        /* ITERATION 4 */
+        movq    xmm4, QWORD PTR [eax]     
+        movq    xmm5, QWORD PTR [ebx]     
+        punpcklbw   xmm4, xmm0  
+        punpcklbw   xmm5, xmm0  
+        psubw       xmm4, xmm5  
+
+
+        /* Write out the first 4 iterations */
+        movdqa      [edi], xmm1
+        movdqa      [edi + 16], xmm2
+        movdqa      [edi + 32], xmm3
+        movdqa      [edi + 48], xmm4
+
+        add         edi, 64
+
+
+        /* Increment pointers */
+        add		eax, ecx		
+        add		ebx, edx
+
+
+        /* ---------------------- Repeat of above -------------------- */
+
+        /* ITERATION 1 */
+        movq    xmm1, QWORD PTR [eax]     ;       /*  xmm1_L = FiltPtr[0..7] */
+        movq    xmm2, QWORD PTR [ebx]     ;       /*  xmm2_L = ReconPtr[0..7] */
+        punpcklbw   xmm1, xmm0  ;       /*  xmm1 = INT16(FiltPtr[0..7]) */
+        punpcklbw   xmm2, xmm0  ;       /*  xmm2 = INT16(ReconPtr[0..7]) */
+        psubw       xmm1, xmm2  ;       /*  xmm1 -= xmm2 {int16wise} */
+
+
+
+        /* Increment pointers */
+        add		eax, ecx		
+        add		ebx, edx
+
+        /* ITERATION 2 */
+        movq    xmm2, QWORD PTR [eax]     
+        movq    xmm3, QWORD PTR [ebx]     
+        punpcklbw   xmm2, xmm0  
+        punpcklbw   xmm3, xmm0  
+        psubw       xmm2, xmm3  
+
+
+
+        /* Increment pointers */
+        add		eax, ecx		
+        add		ebx, edx
+
+         /* ITERATION 3 */
+        movq    xmm3, QWORD PTR [eax]     
+        movq    xmm4, QWORD PTR [ebx]     
+        punpcklbw   xmm3, xmm0  
+        punpcklbw   xmm4, xmm0  
+        psubw       xmm3, xmm4  
+
+
+
+        /* Increment pointers */
+        add		eax, ecx		
+        add		ebx, edx
+
+        /* ITERATION 4 */
+        movq    xmm4, QWORD PTR [eax]     
+        movq    xmm5, QWORD PTR [ebx]     
+        punpcklbw   xmm4, xmm0  
+        punpcklbw   xmm5, xmm0  
+        psubw       xmm4, xmm5  
+
+
+        /* Write out the first 4 iterations */
+        movdqa      [edi], xmm1
+        movdqa      [edi + 16], xmm2
+        movdqa      [edi + 32], xmm3
+        movdqa      [edi + 48], xmm4
+
+
+    };
+
+
+#endif
+}
+
+static void sub8x8_128__sse2 (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
+                      ogg_uint32_t PixelsPerLine) 
+{
+
+#if 0
+  int i;
+  /* For each block row */
+  for (i=8; i; i--) {
+    /* INTRA mode so code raw image data */
+    /* We convert the data to 8 bit signed (by subtracting 128) as
+       this reduces the internal precision requirments in the DCT
+       transform. */
+    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], 128);
+    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], 128);
+    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], 128);
+    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], 128);
+    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], 128);
+    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], 128);
+    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], 128);
+    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], 128);
+
+    /* Start next row */
+    FiltPtr += PixelsPerLine;
+    DctInputPtr += 8;
+  }
+
+#else
+    __asm {
+        align 16
+
+        pxor		mm7, mm7		
+
+        mov         eax, FiltPtr
+        mov         ebx, DctInputPtr
+
+        movq		mm1, V128
+
+        /*  ITERATION 1 */		
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
+        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
+        movq		[ebx], mm0		/* write answer out */
+        movq		[8 + ebx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		ebx, 16		
+        add		eax, PixelsPerLine	
+
+
+        /*  ITERATION 2 */		
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
+        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
+        movq		[ebx], mm0		/* write answer out */
+        movq		[8 + ebx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		ebx, 16		
+        add		eax, PixelsPerLine	
+
+
+        /*  ITERATION 3 */		
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
+        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
+        movq		[ebx], mm0		/* write answer out */
+        movq		[8 + ebx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		ebx, 16		
+        add		eax, PixelsPerLine	
+
+
+        /*  ITERATION 4 */		
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
+        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
+        movq		[ebx], mm0		/* write answer out */
+        movq		[8 + ebx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		ebx, 16		
+        add		eax, PixelsPerLine	
+
+
+        /*  ITERATION 5 */		
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
+        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
+        movq		[ebx], mm0		/* write answer out */
+        movq		[8 + ebx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		ebx, 16		
+        add		eax, PixelsPerLine	
+
+
+        /*  ITERATION 6 */		
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
+        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
+        movq		[ebx], mm0		/* write answer out */
+        movq		[8 + ebx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		ebx, 16		
+        add		eax, PixelsPerLine	
+
+
+        /*  ITERATION 7 */		
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
+        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
+        movq		[ebx], mm0		/* write answer out */
+        movq		[8 + ebx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		ebx, 16		
+        add		eax, PixelsPerLine	
+
+
+        /*  ITERATION 8 */		
+        movq		mm0, [eax]		/* mm0 = FiltPtr */
+        movq		mm2, mm0		/* dup to prepare for up conversion */
+        /* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		/* mm0 = INT16(FiltPtr) */
+        punpckhbw		mm2, mm7		/* mm2 = INT16(FiltPtr) */
+        /* start calculation */
+        psubw		mm0, mm1		/* mm0 = FiltPtr - 128 */
+        psubw		mm2, mm1		/* mm2 = FiltPtr - 128 */
+        movq		[ebx], mm0		/* write answer out */
+        movq		[8 + ebx], mm2		/* write answer out */
+        /* Increment pointers */
+        add		ebx, 16		
+        add		eax, PixelsPerLine	
+
+    };
+ 
+#endif
+}
+
+
+
+
+static void sub8x8avg2__sse2 (unsigned char *FiltPtr, unsigned char *ReconPtr1,
+                     unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
+                     ogg_uint32_t PixelsPerLine,
+                     ogg_uint32_t ReconPixelsPerLine) 
+{
+
+#if 0
+  int i;
+
+  /* For each block row */
+  for (i=8; i; i--) {
+    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], DSP_OP_AVG (ReconPtr1[0], ReconPtr2[0]));
+    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], DSP_OP_AVG (ReconPtr1[1], ReconPtr2[1]));
+    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], DSP_OP_AVG (ReconPtr1[2], ReconPtr2[2]));
+    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], DSP_OP_AVG (ReconPtr1[3], ReconPtr2[3]));
+    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], DSP_OP_AVG (ReconPtr1[4], ReconPtr2[4]));
+    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], DSP_OP_AVG (ReconPtr1[5], ReconPtr2[5]));
+    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], DSP_OP_AVG (ReconPtr1[6], ReconPtr2[6]));
+    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], DSP_OP_AVG (ReconPtr1[7], ReconPtr2[7]));
+
+    /* Start next row */
+    FiltPtr += PixelsPerLine;
+    ReconPtr1 += ReconPixelsPerLine;
+    ReconPtr2 += ReconPixelsPerLine;
+    DctInputPtr += 8;
+  }
+#else
+
+    __asm {
+        align 16
+
+            pxor        mm7, mm7
+
+        mov         eax, FiltPtr
+        mov         ebx, ReconPtr1
+        mov         ecx, ReconPtr2
+        mov         edx, DctInputPtr
+
+        /*  ITERATION 1 */	
+        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
+        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
+        movq		mm2, mm0		;	/* dup to prepare for up conversion */
+        movq		mm3, mm1		;	/* dup to prepare for up conversion */
+        movq		mm5, mm4		;	/* dup to prepare for up conversion */
+	        ;	/* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
+        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
+        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
+        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
+	        ;	/* average ReconPtr1 and ReconPtr2 */
+        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
+        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
+        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        movq		[edx], mm0		;	/* write answer out */
+        movq		[8 + edx], mm2		;	/* write answer out */
+	        ;	/* Increment pointers */
+        add		edx, 16		;	
+        add		eax, PixelsPerLine		;	
+        add		ebx, ReconPixelsPerLine		;	
+        add		ecx, ReconPixelsPerLine		;	
+	
+
+        /*  ITERATION 2 */	
+        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
+        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
+        movq		mm2, mm0		;	/* dup to prepare for up conversion */
+        movq		mm3, mm1		;	/* dup to prepare for up conversion */
+        movq		mm5, mm4		;	/* dup to prepare for up conversion */
+	        ;	/* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
+        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
+        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
+        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
+	        ;	/* average ReconPtr1 and ReconPtr2 */
+        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
+        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
+        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        movq		[edx], mm0		;	/* write answer out */
+        movq		[8 + edx], mm2		;	/* write answer out */
+	        ;	/* Increment pointers */
+        add		edx, 16		;	
+        add		eax, PixelsPerLine		;	
+        add		ebx, ReconPixelsPerLine		;	
+        add		ecx, ReconPixelsPerLine		;	
+
+
+        /*  ITERATION 3 */	
+        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
+        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
+        movq		mm2, mm0		;	/* dup to prepare for up conversion */
+        movq		mm3, mm1		;	/* dup to prepare for up conversion */
+        movq		mm5, mm4		;	/* dup to prepare for up conversion */
+	        ;	/* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
+        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
+        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
+        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
+	        ;	/* average ReconPtr1 and ReconPtr2 */
+        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
+        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
+        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        movq		[edx], mm0		;	/* write answer out */
+        movq		[8 + edx], mm2		;	/* write answer out */
+	        ;	/* Increment pointers */
+        add		edx, 16		;	
+        add		eax, PixelsPerLine		;	
+        add		ebx, ReconPixelsPerLine		;	
+        add		ecx, ReconPixelsPerLine		;	
+
+
+        /*  ITERATION 4 */	
+        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
+        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
+        movq		mm2, mm0		;	/* dup to prepare for up conversion */
+        movq		mm3, mm1		;	/* dup to prepare for up conversion */
+        movq		mm5, mm4		;	/* dup to prepare for up conversion */
+	        ;	/* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
+        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
+        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
+        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
+	        ;	/* average ReconPtr1 and ReconPtr2 */
+        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
+        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
+        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        movq		[edx], mm0		;	/* write answer out */
+        movq		[8 + edx], mm2		;	/* write answer out */
+	        ;	/* Increment pointers */
+        add		edx, 16		;	
+        add		eax, PixelsPerLine		;	
+        add		ebx, ReconPixelsPerLine		;	
+        add		ecx, ReconPixelsPerLine		;	
+
+
+        /*  ITERATION 5 */	
+        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
+        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
+        movq		mm2, mm0		;	/* dup to prepare for up conversion */
+        movq		mm3, mm1		;	/* dup to prepare for up conversion */
+        movq		mm5, mm4		;	/* dup to prepare for up conversion */
+	        ;	/* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
+        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
+        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
+        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
+	        ;	/* average ReconPtr1 and ReconPtr2 */
+        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
+        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
+        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        movq		[edx], mm0		;	/* write answer out */
+        movq		[8 + edx], mm2		;	/* write answer out */
+	        ;	/* Increment pointers */
+        add		edx, 16		;	
+        add		eax, PixelsPerLine		;	
+        add		ebx, ReconPixelsPerLine		;	
+        add		ecx, ReconPixelsPerLine		;	
+
+
+        /*  ITERATION 6 */	
+        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
+        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
+        movq		mm2, mm0		;	/* dup to prepare for up conversion */
+        movq		mm3, mm1		;	/* dup to prepare for up conversion */
+        movq		mm5, mm4		;	/* dup to prepare for up conversion */
+	        ;	/* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
+        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
+        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
+        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
+	        ;	/* average ReconPtr1 and ReconPtr2 */
+        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
+        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
+        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        movq		[edx], mm0		;	/* write answer out */
+        movq		[8 + edx], mm2		;	/* write answer out */
+	        ;	/* Increment pointers */
+        add		edx, 16		;	
+        add		eax, PixelsPerLine		;	
+        add		ebx, ReconPixelsPerLine		;	
+        add		ecx, ReconPixelsPerLine		;	
+
+
+        /*  ITERATION 7 */	
+        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
+        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
+        movq		mm2, mm0		;	/* dup to prepare for up conversion */
+        movq		mm3, mm1		;	/* dup to prepare for up conversion */
+        movq		mm5, mm4		;	/* dup to prepare for up conversion */
+	        ;	/* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
+        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
+        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
+        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
+	        ;	/* average ReconPtr1 and ReconPtr2 */
+        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
+        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
+        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        movq		[edx], mm0		;	/* write answer out */
+        movq		[8 + edx], mm2		;	/* write answer out */
+	        ;	/* Increment pointers */
+        add		edx, 16		;	
+        add		eax, PixelsPerLine		;	
+        add		ebx, ReconPixelsPerLine		;	
+        add		ecx, ReconPixelsPerLine		;	
+
+
+        /*  ITERATION 8 */	
+        movq		mm0, [eax]		;	/* mm0 = FiltPtr */
+        movq		mm1, [ebx]		;	/* mm1 = ReconPtr1 */
+        movq		mm4, [ecx]		;	/* mm1 = ReconPtr2 */
+        movq		mm2, mm0		;	/* dup to prepare for up conversion */
+        movq		mm3, mm1		;	/* dup to prepare for up conversion */
+        movq		mm5, mm4		;	/* dup to prepare for up conversion */
+	        ;	/* convert from UINT8 to INT16 */
+        punpcklbw		mm0, mm7		;	/* mm0 = INT16(FiltPtr) */
+        punpcklbw		mm1, mm7		;	/* mm1 = INT16(ReconPtr1) */
+        punpcklbw		mm4, mm7		;	/* mm1 = INT16(ReconPtr2) */
+        punpckhbw		mm2, mm7		;	/* mm2 = INT16(FiltPtr) */
+        punpckhbw		mm3, mm7		;	/* mm3 = INT16(ReconPtr1) */
+        punpckhbw		mm5, mm7		;	/* mm3 = INT16(ReconPtr2) */
+	        ;	/* average ReconPtr1 and ReconPtr2 */
+        paddw		mm1, mm4		;	/* mm1 = ReconPtr1 + ReconPtr2 */
+        paddw		mm3, mm5		;	/* mm3 = ReconPtr1 + ReconPtr2 */
+        psrlw		mm1, 1		;	/* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+        psrlw		mm3, 1		;	/* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+        psubw		mm0, mm1		;	/* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        psubw		mm2, mm3		;	/* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+        movq		[edx], mm0		;	/* write answer out */
+        movq		[8 + edx], mm2		;	/* write answer out */
+	        ;	/* Increment pointers */
+        add		edx, 16		;	
+        add		eax, PixelsPerLine		;	
+        add		ebx, ReconPixelsPerLine		;	
+        add		ecx, ReconPixelsPerLine		;	
+
+    };
+
+
+
+
+ 
+#endif
+}
+
+static ogg_uint32_t row_sad8__sse2 (unsigned char *Src1, unsigned char *Src2)
+{
+
+#if 0
+  ogg_uint32_t SadValue;
+  ogg_uint32_t SadValue1;
+
+  SadValue    = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) + 
+	        DSP_OP_ABS_DIFF (Src1[1], Src2[1]) +
+	        DSP_OP_ABS_DIFF (Src1[2], Src2[2]) +
+	        DSP_OP_ABS_DIFF (Src1[3], Src2[3]);
+
+  SadValue1   = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) + 
+	        DSP_OP_ABS_DIFF (Src1[5], Src2[5]) +
+	        DSP_OP_ABS_DIFF (Src1[6], Src2[6]) +
+	        DSP_OP_ABS_DIFF (Src1[7], Src2[7]);
+
+  SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1;
+
+  return SadValue;
+
+#else
+  ogg_uint32_t MaxSad;
+
+  
+  __asm {
+    align       16
+    mov         ebx, Src1
+    mov         ecx, Src2
+
+
+    pxor		mm6, mm6		;	/* zero out mm6 for unpack */
+    pxor		mm7, mm7		;	/* zero out mm7 for unpack */
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+    movq		mm1, [ecx]		;	
+
+    movq		mm2, mm0		;	
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* ; unpack low four bytes to higher precision */
+    punpckhbw		mm1, mm7		;	/* ; unpack high four bytes to higher precision */
+
+    movq		mm2, mm0		;	
+    movq		mm3, mm1		;	
+    psrlq		mm2, 32		;	/* fold and add */
+    psrlq		mm3, 32		;	
+    paddw		mm0, mm2		;	
+    paddw		mm1, mm3		;	
+    movq		mm2, mm0		;	
+    movq		mm3, mm1		;	
+    psrlq		mm2, 16		;	
+    psrlq		mm3, 16		;	
+    paddw		mm0, mm2		;	
+    paddw		mm1, mm3		;	
+
+    psubusw		mm1, mm0		;	
+    paddw		mm1, mm0		;	/* mm1 = max(mm1, mm0) */
+    movd		eax, mm1		;
+
+    and         eax, 0xffff
+    mov         MaxSad, eax
+  };
+   return MaxSad;
+  
+  
+  
+ 
+
+#endif
+}
+
+
+
+
+static ogg_uint32_t col_sad8x8__sse2 (unsigned char *Src1, unsigned char *Src2,
+		                    ogg_uint32_t stride)
+{
+
+#if 0
+  ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0};
+  ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0};
+  ogg_uint32_t MaxSad = 0;
+  ogg_uint32_t i;
+
+  for ( i = 0; i < 4; i++ ){
+    SadValue[0] += abs(Src1[0] - Src2[0]);
+    SadValue[1] += abs(Src1[1] - Src2[1]);
+    SadValue[2] += abs(Src1[2] - Src2[2]);
+    SadValue[3] += abs(Src1[3] - Src2[3]);
+    SadValue[4] += abs(Src1[4] - Src2[4]);
+    SadValue[5] += abs(Src1[5] - Src2[5]);
+    SadValue[6] += abs(Src1[6] - Src2[6]);
+    SadValue[7] += abs(Src1[7] - Src2[7]);
+    
+    Src1 += stride;
+    Src2 += stride;
+  }
+
+  for ( i = 0; i < 4; i++ ){
+    SadValue2[0] += abs(Src1[0] - Src2[0]);
+    SadValue2[1] += abs(Src1[1] - Src2[1]);
+    SadValue2[2] += abs(Src1[2] - Src2[2]);
+    SadValue2[3] += abs(Src1[3] - Src2[3]);
+    SadValue2[4] += abs(Src1[4] - Src2[4]);
+    SadValue2[5] += abs(Src1[5] - Src2[5]);
+    SadValue2[6] += abs(Src1[6] - Src2[6]);
+    SadValue2[7] += abs(Src1[7] - Src2[7]);
+    
+    Src1 += stride;
+    Src2 += stride;
+  }
+    
+  for ( i = 0; i < 8; i++ ){
+    if ( SadValue[i] > MaxSad )
+      MaxSad = SadValue[i];
+    if ( SadValue2[i] > MaxSad )
+      MaxSad = SadValue2[i];
+  }
+    
+  return MaxSad;
+#else
+  ogg_uint32_t MaxSad;
+
+
+    __asm {
+        align       16
+        mov         ebx, Src1
+        mov         ecx, Src2
+
+        pxor		mm3, mm3		;	/* zero out mm3 for unpack */
+        pxor		mm4, mm4		;	/* mm4 low sum */
+        pxor		mm5, mm5		;	/* mm5 high sum */
+        pxor		mm6, mm6		;	/* mm6 low sum */
+        pxor		mm7, mm7		;	/* mm7 high sum */
+        mov		edi, 4		;	/* 4 rows */
+        label_1:				;	
+        movq		mm0, [ebx]		;	/* take 8 bytes */
+        movq		mm1, [ecx]		;	/* take 8 bytes */
+
+        movq		mm2, mm0		;	
+        psubusb		mm0, mm1		;	/* A - B */
+        psubusb		mm1, mm2		;	/* B - A */
+        por		mm0, mm1		;	/* and or gives abs difference */
+        movq		mm1, mm0		;	
+
+        punpcklbw		mm0, mm3		;	/* unpack to higher precision for accumulation */
+        paddw		mm4, mm0		;	/* accumulate difference... */
+        punpckhbw		mm1, mm3		;	/* unpack high four bytes to higher precision */
+        paddw		mm5, mm1		;	/* accumulate difference... */
+        add		ebx, stride		;	/* Inc pointer into the new data */
+        add		ecx, stride		;	/* Inc pointer into the new data */
+
+        dec		edi		;	
+        jnz		label_1		;	
+
+        mov		edi, 4		;	/* 4 rows */
+        label_2:				;	
+        movq		mm0, [ebx]		;	/* take 8 bytes */
+        movq		mm1, [ecx]		;	/* take 8 bytes */
+
+        movq		mm2, mm0		;	
+        psubusb		mm0, mm1		;	/* A - B */
+        psubusb		mm1, mm2		;	/* B - A */
+        por		mm0, mm1		;	/* and or gives abs difference */
+        movq		mm1, mm0		;	
+
+        punpcklbw		mm0, mm3		;	/* unpack to higher precision for accumulation */
+        paddw		mm6, mm0		;	/* accumulate difference... */
+        punpckhbw		mm1, mm3		;	/* unpack high four bytes to higher precision */
+        paddw		mm7, mm1		;	/* accumulate difference... */
+        add		ebx, stride		;	/* Inc pointer into the new data */
+        add		ecx, stride		;	/* Inc pointer into the new data */
+
+        dec		edi		;	
+        jnz		label_2		;	
+
+        psubusw		mm7, mm6		;	
+        paddw		mm7, mm6		;	/* mm7 = max(mm7, mm6) */
+        psubusw		mm5, mm4		;	
+        paddw		mm5, mm4		;	/* mm5 = max(mm5, mm4) */
+        psubusw		mm7, mm5		;	
+        paddw		mm7, mm5		;	/* mm7 = max(mm5, mm7) */
+        movq		mm6, mm7		;	
+        psrlq		mm6, 32		;	
+        psubusw		mm7, mm6		;	
+        paddw		mm7, mm6		;	/* mm7 = max(mm5, mm7) */
+        movq		mm6, mm7		;	
+        psrlq		mm6, 16		;	
+        psubusw		mm7, mm6		;	
+        paddw		mm7, mm6		;	/* mm7 = max(mm5, mm7) */
+        movd		eax, mm7		;	
+        and		    eax, 0xffff		;
+
+        mov         MaxSad, eax
+    };
+
+    return MaxSad;
+
+
+#endif
+}
+
+static ogg_uint32_t sad8x8__sse2 (unsigned char *ptr1, ogg_uint32_t stride1,
+		       	    unsigned char *ptr2, ogg_uint32_t stride2)
+{
+
+#if 0
+  ogg_uint32_t  i;
+  ogg_uint32_t  sad = 0;
+
+  for (i=8; i; i--) {
+    sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
+    sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
+    sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]);
+    sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]);
+    sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]);
+    sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]);
+    sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]);
+    sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]);
+
+    /* Step to next row of block. */
+    ptr1 += stride1;
+    ptr2 += stride2;
+  }
+
+  return sad;
+#else
+  ogg_uint32_t  DiffVal;
+
+  __asm {
+    align  16
+
+    mov         ebx, ptr1
+    mov         edx, ptr2
+
+    pxor		mm6, mm6		;	/* zero out mm6 for unpack */
+    pxor		mm7, mm7		;	/* mm7 contains the result */
+    
+    ; /* ITERATION 1 */
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+    movq		mm1, [edx]		;	
+    movq		mm2, mm0		;	
+
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
+    paddw		mm7, mm0		;	/* accumulate difference... */
+    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
+    add		ebx, stride1		;	/* Inc pointer into the new data */
+    paddw		mm7, mm1		;	/* accumulate difference... */
+    add		edx, stride2		;	/* Inc pointer into ref data */
+
+    ; /* ITERATION 2 */
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+    movq		mm1, [edx]		;	
+    movq		mm2, mm0		;	
+
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
+    paddw		mm7, mm0		;	/* accumulate difference... */
+    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
+    add		ebx, stride1		;	/* Inc pointer into the new data */
+    paddw		mm7, mm1		;	/* accumulate difference... */
+    add		edx, stride2		;	/* Inc pointer into ref data */
+
+
+    ; /* ITERATION 3 */
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+    movq		mm1, [edx]		;	
+    movq		mm2, mm0		;	
+
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
+    paddw		mm7, mm0		;	/* accumulate difference... */
+    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
+    add		ebx, stride1		;	/* Inc pointer into the new data */
+    paddw		mm7, mm1		;	/* accumulate difference... */
+    add		edx, stride2		;	/* Inc pointer into ref data */
+
+    ; /* ITERATION 4 */
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+    movq		mm1, [edx]		;	
+    movq		mm2, mm0		;	
+
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
+    paddw		mm7, mm0		;	/* accumulate difference... */
+    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
+    add		ebx, stride1		;	/* Inc pointer into the new data */
+    paddw		mm7, mm1		;	/* accumulate difference... */
+    add		edx, stride2		;	/* Inc pointer into ref data */
+
+
+    ; /* ITERATION 5 */
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+    movq		mm1, [edx]		;	
+    movq		mm2, mm0		;	
+
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
+    paddw		mm7, mm0		;	/* accumulate difference... */
+    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
+    add		ebx, stride1		;	/* Inc pointer into the new data */
+    paddw		mm7, mm1		;	/* accumulate difference... */
+    add		edx, stride2		;	/* Inc pointer into ref data */
+
+
+    ; /* ITERATION 6 */
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+    movq		mm1, [edx]		;	
+    movq		mm2, mm0		;	
+
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
+    paddw		mm7, mm0		;	/* accumulate difference... */
+    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
+    add		ebx, stride1		;	/* Inc pointer into the new data */
+    paddw		mm7, mm1		;	/* accumulate difference... */
+    add		edx, stride2		;	/* Inc pointer into ref data */
+
+
+    ; /* ITERATION 7 */
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+    movq		mm1, [edx]		;	
+    movq		mm2, mm0		;	
+
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
+    paddw		mm7, mm0		;	/* accumulate difference... */
+    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
+    add		ebx, stride1		;	/* Inc pointer into the new data */
+    paddw		mm7, mm1		;	/* accumulate difference... */
+    add		edx, stride2		;	/* Inc pointer into ref data */
+
+
+
+    ; /* ITERATION 8 */
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+    movq		mm1, [edx]		;	
+    movq		mm2, mm0		;	
+
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
+    paddw		mm7, mm0		;	/* accumulate difference... */
+    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
+    add		ebx, stride1		;	/* Inc pointer into the new data */
+    paddw		mm7, mm1		;	/* accumulate difference... */
+    add		edx, stride2		;	/* Inc pointer into ref data */
+
+
+
+    ; /* ------ */
+
+    movq		mm0, mm7		;	
+    psrlq		mm7, 32		;	
+    paddw		mm7, mm0		;	
+    movq		mm0, mm7		;	
+    psrlq		mm7, 16		;	
+    paddw		mm7, mm0		;	
+    movd		eax, mm7		;	
+    and		    eax, 0xffff		;	
+
+    mov         DiffVal, eax
+  };
+
+  return DiffVal;
+
+ 
+
+#endif
+}
+
+static ogg_uint32_t sad8x8_thres__sse2 (unsigned char *ptr1, ogg_uint32_t stride1,
+		       		  unsigned char *ptr2, ogg_uint32_t stride2, 
+			   	  ogg_uint32_t thres)
+{
+#if 0
+  ogg_uint32_t  i;
+  ogg_uint32_t  sad = 0;
+
+  for (i=8; i; i--) {
+    sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
+    sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
+    sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]);
+    sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]);
+    sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]);
+    sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]);
+    sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]);
+    sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]);
+
+    if (sad > thres )
+      break;
+
+    /* Step to next row of block. */
+    ptr1 += stride1;
+    ptr2 += stride2;
+  }
+
+  return sad;
+#else
+  return sad8x8__sse2 (ptr1, stride1, ptr2, stride2);
+#endif
+}
+
+
+static ogg_uint32_t sad8x8_xy2_thres__sse2 (unsigned char *SrcData, ogg_uint32_t SrcStride,
+		                      unsigned char *RefDataPtr1,
+			              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
+			              ogg_uint32_t thres)
+{
+#if 0
+  ogg_uint32_t  i;
+  ogg_uint32_t  sad = 0;
+
+  for (i=8; i; i--) {
+    sad += DSP_OP_ABS_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0]));
+    sad += DSP_OP_ABS_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1]));
+    sad += DSP_OP_ABS_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2]));
+    sad += DSP_OP_ABS_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3]));
+    sad += DSP_OP_ABS_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4]));
+    sad += DSP_OP_ABS_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5]));
+    sad += DSP_OP_ABS_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6]));
+    sad += DSP_OP_ABS_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7]));
+
+    if ( sad > thres )
+      break;
+
+    /* Step to next row of block. */
+    SrcData += SrcStride;
+    RefDataPtr1 += RefStride;
+    RefDataPtr2 += RefStride;
+  }
+
+  return sad;
+#else
+  ogg_uint32_t  DiffVal;
+
+  __asm {
+    align 16
+
+        mov     ebx, SrcData
+        mov     ecx, RefDataPtr1
+        mov     edx, RefDataPtr2
+
+
+    pcmpeqd		mm5, mm5		;	/* fefefefefefefefe in mm5 */
+    paddb		mm5, mm5		;	
+				    ;	
+    pxor		mm6, mm6		;	/* zero out mm6 for unpack */
+    pxor		mm7, mm7		;	/* mm7 contains the result */
+    mov		edi, 8		;	/* 8 rows */
+    loop_start:				;	
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+
+    movq		mm2, [ecx]		;	
+    movq		mm3, [edx]		;	/* take average of mm2 and mm3 */
+    movq		mm1, mm2		;	
+    pand		mm1, mm3		;	
+    pxor		mm3, mm2		;	
+    pand		mm3, mm5		;	
+    psrlq		mm3, 1		;	
+    paddb		mm1, mm3		;	
+
+    movq		mm2, mm0		;	
+
+    psubusb		mm0, mm1		;	/* A - B */
+    psubusb		mm1, mm2		;	/* B - A */
+    por		mm0, mm1		;	/* and or gives abs difference */
+    movq		mm1, mm0		;	
+
+    punpcklbw		mm0, mm6		;	/* unpack to higher precision for accumulation */
+    paddw		mm7, mm0		;	/* accumulate difference... */
+    punpckhbw		mm1, mm6		;	/* unpack high four bytes to higher precision */
+    add		ebx, SrcStride		;	/* Inc pointer into the new data */
+    paddw		mm7, mm1		;	/* accumulate difference... */
+    add		ecx, RefStride		;	/* Inc pointer into ref data */
+    add		edx, RefStride		;	/* Inc pointer into ref data */
+
+    dec		edi		;	
+    jnz		loop_start		;	
+
+    movq		mm0, mm7		;	
+    psrlq		mm7, 32		;	
+    paddw		mm7, mm0		;	
+    movq		mm0, mm7		;	
+    psrlq		mm7, 16		;	
+    paddw		mm7, mm0		;	
+    movd		eax, mm7		;	
+    and		eax, 0xffff		;	
+
+    mov DiffVal, eax
+  };
+
+  return DiffVal;
+
+ 
+
+#endif
+}
+
+static ogg_uint32_t intra8x8_err__sse2 (unsigned char *DataPtr, ogg_uint32_t Stride)
+{
+#if 0
+  ogg_uint32_t  i;
+  ogg_uint32_t  XSum=0;
+  ogg_uint32_t  XXSum=0;
+
+  for (i=8; i; i--) {
+     /* Examine alternate pixel locations. */
+     XSum += DataPtr[0];
+     XXSum += DataPtr[0]*DataPtr[0];
+     XSum += DataPtr[1];
+     XXSum += DataPtr[1]*DataPtr[1];
+     XSum += DataPtr[2];
+     XXSum += DataPtr[2]*DataPtr[2];
+     XSum += DataPtr[3];
+     XXSum += DataPtr[3]*DataPtr[3];
+     XSum += DataPtr[4];
+     XXSum += DataPtr[4]*DataPtr[4];
+     XSum += DataPtr[5];
+     XXSum += DataPtr[5]*DataPtr[5];
+     XSum += DataPtr[6];
+     XXSum += DataPtr[6]*DataPtr[6];
+     XSum += DataPtr[7];
+     XXSum += DataPtr[7]*DataPtr[7];
+
+     /* Step to next row of block. */
+     DataPtr += Stride;
+   }
+
+   /* Compute population variance as mis-match metric. */
+   return (( (XXSum<<6) - XSum*XSum ) );
+#else
+  ogg_uint32_t  XSum;
+  ogg_uint32_t  XXSum;
+
+  __asm {
+    align 16
+
+        mov     ecx, DataPtr
+
+    pxor		mm5, mm5		;	
+    pxor		mm6, mm6		;	
+    pxor		mm7, mm7		;	
+    mov		edi, 8		;	
+    loop_start:		
+    movq		mm0, [ecx]		;	/* take 8 bytes */
+    movq		mm2, mm0		;	
+
+    punpcklbw		mm0, mm6		;	
+    punpckhbw		mm2, mm6		;	
+
+    paddw		mm5, mm0		;	
+    paddw		mm5, mm2		;	
+
+    pmaddwd		mm0, mm0		;	
+    pmaddwd		mm2, mm2		;	
+				    ;	
+    paddd		mm7, mm0		;	
+    paddd		mm7, mm2		;	
+
+    add		ecx, Stride		;	/* Inc pointer into src data */
+
+    dec		edi		;	
+    jnz		loop_start		;	
+
+    movq		mm0, mm5		;	
+    psrlq		mm5, 32		;	
+    paddw		mm5, mm0		;	
+    movq		mm0, mm5		;	
+    psrlq		mm5, 16		;	
+    paddw		mm5, mm0		;	
+    movd		edi, mm5		;	
+    movsx		edi, di		;	
+    mov		eax, edi		;	
+
+    movq		mm0, mm7		;	
+    psrlq		mm7, 32		;	
+    paddd		mm7, mm0		;	
+    movd		ebx, mm7		;	
+
+        mov         XSum, eax
+        mov         XXSum, ebx;
+
+  };
+    /* Compute population variance as mis-match metric. */
+    return (( (XXSum<<6) - XSum*XSum ) );
+
+ 
+
+#endif
+}
+
+static ogg_uint32_t inter8x8_err__sse2 (unsigned char *SrcData, ogg_uint32_t SrcStride,
+		                 unsigned char *RefDataPtr, ogg_uint32_t RefStride)
+{
+
+#if 0
+  ogg_uint32_t  i;
+  ogg_uint32_t  XSum=0;
+  ogg_uint32_t  XXSum=0;
+  ogg_int32_t   DiffVal;
+
+  for (i=8; i; i--) {
+    DiffVal = DSP_OP_DIFF (SrcData[0], RefDataPtr[0]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF (SrcData[1], RefDataPtr[1]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF (SrcData[2], RefDataPtr[2]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF (SrcData[3], RefDataPtr[3]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+        
+    DiffVal = DSP_OP_DIFF (SrcData[4], RefDataPtr[4]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+        
+    DiffVal = DSP_OP_DIFF (SrcData[5], RefDataPtr[5]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+        
+    DiffVal = DSP_OP_DIFF (SrcData[6], RefDataPtr[6]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+        
+    DiffVal = DSP_OP_DIFF (SrcData[7], RefDataPtr[7]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+        
+    /* Step to next row of block. */
+    SrcData += SrcStride;
+    RefDataPtr += RefStride;
+  }
+
+  /* Compute and return population variance as mis-match metric. */
+  return (( (XXSum<<6) - XSum*XSum ));
+#else
+  ogg_uint32_t  XSum;
+  ogg_uint32_t  XXSum;
+
+
+  __asm {
+    align 16
+
+        mov     ecx, SrcData
+        mov     edx, RefDataPtr
+
+    pxor		mm5, mm5		;	
+    pxor		mm6, mm6		;	
+    pxor		mm7, mm7		;	
+    mov		edi, 8		;	
+    loop_start:				;	
+    movq		mm0, [ecx]		;	/* take 8 bytes */
+    movq		mm1, [edx]		;	
+    movq		mm2, mm0		;	
+    movq		mm3, mm1		;	
+
+    punpcklbw		mm0, mm6		;	
+    punpcklbw		mm1, mm6		;	
+    punpckhbw		mm2, mm6		;	
+    punpckhbw		mm3, mm6		;	
+
+    psubsw		mm0, mm1		;	
+    psubsw		mm2, mm3		;	
+
+    paddw		mm5, mm0		;	
+    paddw		mm5, mm2		;	
+
+    pmaddwd		mm0, mm0		;	
+    pmaddwd		mm2, mm2		;	
+				    ;	
+    paddd		mm7, mm0		;	
+    paddd		mm7, mm2		;	
+
+    add		ecx, SrcStride		;	/* Inc pointer into src data */
+    add		edx, RefStride		;	/* Inc pointer into ref data */
+
+    dec		edi		;	
+    jnz		loop_start		;	
+
+    movq		mm0, mm5		;	
+    psrlq		mm5, 32		;	
+    paddw		mm5, mm0		;	
+    movq		mm0, mm5		;	
+    psrlq		mm5, 16		;	
+    paddw		mm5, mm0		;	
+    movd		edi, mm5		;	
+    movsx		edi, di		;	
+    mov		eax, edi		;	
+
+    movq		mm0, mm7		;	
+    psrlq		mm7, 32		;	
+    paddd		mm7, mm0		;	
+    movd		ebx, mm7		;	
+
+        mov     XSum, eax
+        mov     XXSum, ebx
+
+  };
+
+  /* Compute and return population variance as mis-match metric. */
+  return (( (XXSum<<6) - XSum*XSum ));
+
+ 
+#endif
+}
+
+static ogg_uint32_t inter8x8_err_xy2__sse2 (unsigned char *SrcData, ogg_uint32_t SrcStride,
+		                     unsigned char *RefDataPtr1,
+				     unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
+{
+#if 0
+  ogg_uint32_t  i;
+  ogg_uint32_t  XSum=0;
+  ogg_uint32_t  XXSum=0;
+  ogg_int32_t   DiffVal;
+
+  for (i=8; i; i--) {
+    DiffVal = DSP_OP_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    /* Step to next row of block. */
+    SrcData += SrcStride;
+    RefDataPtr1 += RefStride;
+    RefDataPtr2 += RefStride;
+  }
+
+  /* Compute and return population variance as mis-match metric. */
+  return (( (XXSum<<6) - XSum*XSum ));
+#else
+  ogg_uint32_t XSum;
+  ogg_uint32_t XXSum;
+
+  __asm {
+    align 16
+
+        mov ebx, SrcData
+        mov ecx, RefDataPtr1
+        mov edx, RefDataPtr2
+
+    pcmpeqd		mm4, mm4		;	/* fefefefefefefefe in mm4 */
+    paddb		mm4, mm4		;	
+    pxor		mm5, mm5		;	
+    pxor		mm6, mm6		;	
+    pxor		mm7, mm7		;	
+    mov		edi, 8		;	
+    loop_start:				;	
+    movq		mm0, [ebx]		;	/* take 8 bytes */
+
+    movq		mm2, [ecx]		;	
+    movq		mm3, [edx]		;	/* take average of mm2 and mm3 */
+    movq		mm1, mm2		;	
+    pand		mm1, mm3		;	
+    pxor		mm3, mm2		;	
+    pand		mm3, mm4		;	
+    psrlq		mm3, 1		;	
+    paddb		mm1, mm3		;	
+
+    movq		mm2, mm0		;	
+    movq		mm3, mm1		;	
+
+    punpcklbw		mm0, mm6		;	
+    punpcklbw		mm1, mm6		;	
+    punpckhbw		mm2, mm6		;	
+    punpckhbw		mm3, mm6		;	
+
+    psubsw		mm0, mm1		;	
+    psubsw		mm2, mm3		;	
+
+    paddw		mm5, mm0		;	
+    paddw		mm5, mm2		;	
+
+    pmaddwd		mm0, mm0		;	
+    pmaddwd		mm2, mm2		;	
+				    ;	
+    paddd		mm7, mm0		;	
+    paddd		mm7, mm2		;	
+
+    add		ebx, SrcStride		;	/* Inc pointer into src data */
+    add		ecx, RefStride		;	/* Inc pointer into ref data */
+    add		edx, RefStride		;	/* Inc pointer into ref data */
+
+    dec		edi		;	
+    jnz		loop_start		;	
+
+    movq		mm0, mm5		;	
+    psrlq		mm5, 32		;	
+    paddw		mm5, mm0		;	
+    movq		mm0, mm5		;	
+    psrlq		mm5, 16		;	
+    paddw		mm5, mm0		;	
+    movd		edi, mm5		;	
+    movsx		edi, di		;	
+    mov         XSum, edi   ; /* movl		eax, edi		;	Modified for vc to resuse eax*/
+
+    movq		mm0, mm7		;	
+    psrlq		mm7, 32		;	
+    paddd		mm7, mm0		;	
+    movd        XXSum, mm7 ; /*movd		eax, mm7		; Modified for vc to reuse eax */
+  };
+
+    return (( (XXSum<<6) - XSum*XSum ));
+
+#endif
+}
+
+
+
+void dsp_sse2_init(DspFunctions *funcs)
+{
+  TH_DEBUG("enabling accelerated x86_32 mmx dsp functions.\n");
+  funcs->sub8x8 = sub8x8__sse2;
+  //funcs->sub8x8_128 = sub8x8_128__sse2;
+  //funcs->sub8x8avg2 = sub8x8avg2__sse2;
+  //funcs->row_sad8 = row_sad8__sse2;
+  //funcs->col_sad8x8 = col_sad8x8__sse2;
+  //funcs->sad8x8 = sad8x8__sse2;
+  //funcs->sad8x8_thres = sad8x8_thres__sse2;
+  //funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__sse2;
+  //funcs->intra8x8_err = intra8x8_err__sse2;
+  //funcs->inter8x8_err = inter8x8_err__sse2;
+  //funcs->inter8x8_err_xy2 = inter8x8_err_xy2__sse2;
+}
+

Modified: branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj
===================================================================
--- branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj	2006-06-02 17:45:25 UTC (rev 11499)
+++ branches/theora-playtime/win32/VS2005/libtheora/libtheora.vcproj	2006-06-03 10:08:41 UTC (rev 11500)
@@ -415,6 +415,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\..\lib\x86_32_vs\dsp_sse2.c"
+				>
+			</File>
+			<File
 				RelativePath="..\..\..\lib\encode.c"
 				>
 			</File>