[Theora-dev] Re: MMX/mmxext optimisations

Tue Aug 24 03:42:45 PDT 2004

Lovely.  I actually get up to about 4x speed improvement
on some encodes.

Here's a patch that is ported to SVN HEAD.

--Adam

-------------- next part --------------
Index: lib/reconstruct.c
===================================================================

--- lib/reconstruct.c	(revision 7621)
+++ lib/reconstruct.c	(working copy)
@@ -16,12 +16,28 @@
  ********************************************************************/
 
 #include "encoder_internal.h"
+#include "dsp.h"
+#include "cpu.h"
 
-void ReconIntra( PB_INSTANCE *pbi, unsigned char * ReconPtr,
-                 ogg_int16_t * ChangePtr, ogg_uint32_t LineStep ) {
+static void copy8x8__c (unsigned char *src,      /* copy an 8x8 byte block; NB the source comes first, the destination second */
+	                unsigned char *dest,
+	                unsigned int stride)     /* step between consecutive rows, applied to both src and dest */
+{
+  int j;
+  for ( j = 0; j < 8; j++ ){                     /* one iteration per row */
+    ((ogg_uint32_t*)dest)[0] = ((ogg_uint32_t*)src)[0];   /* each row is moved as two ogg_uint32_t words */
+    ((ogg_uint32_t*)dest)[1] = ((ogg_uint32_t*)src)[1];   /* NOTE(review): word access presumes rows are suitably aligned -- confirm for callers */
+    src+=stride;
+    dest+=stride;
+  }
+}
+
+static void recon_intra8x8__c (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
+		      ogg_uint32_t LineStep)
+{
   ogg_uint32_t i;
 
-  for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++ ){
+  for (i = 8; i; i--){
     /* Convert the data back to 8 bit unsigned */
     /* Saturate the output to unsigend 8 bit values */
     ReconPtr[0] = clamp255( ChangePtr[0] + 128 );
@@ -34,17 +50,16 @@
     ReconPtr[7] = clamp255( ChangePtr[7] + 128 );
 
     ReconPtr += LineStep;
-    ChangePtr += BLOCK_HEIGHT_WIDTH;
+    ChangePtr += 8;
   }
-
 }
 
-void ReconInter( PB_INSTANCE *pbi, unsigned char * ReconPtr,
-                 unsigned char * RefPtr, ogg_int16_t * ChangePtr,
-                 ogg_uint32_t LineStep ) {
+static void recon_inter8x8__c (unsigned char *ReconPtr, unsigned char *RefPtr,
+		      ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
+{
   ogg_uint32_t i;
 
-  for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++) {
+  for (i = 8; i; i--){
     ReconPtr[0] = clamp255(RefPtr[0] + ChangePtr[0]);
     ReconPtr[1] = clamp255(RefPtr[1] + ChangePtr[1]);
     ReconPtr[2] = clamp255(RefPtr[2] + ChangePtr[2]);
@@ -54,19 +69,19 @@
     ReconPtr[6] = clamp255(RefPtr[6] + ChangePtr[6]);
     ReconPtr[7] = clamp255(RefPtr[7] + ChangePtr[7]);
 
-    ChangePtr += BLOCK_HEIGHT_WIDTH;
+    ChangePtr += 8;
     ReconPtr += LineStep;
     RefPtr += LineStep;
   }
-
 }
 
-void ReconInterHalfPixel2( PB_INSTANCE *pbi, unsigned char * ReconPtr,
-                           unsigned char * RefPtr1, unsigned char * RefPtr2,
-                           ogg_int16_t * ChangePtr, ogg_uint32_t LineStep ) {
+static void recon_inter8x8_half__c (unsigned char *ReconPtr, unsigned char *RefPtr1,
+		           unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
+			   ogg_uint32_t LineStep)
+{
   ogg_uint32_t  i;
 
-  for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++ ){
+  for (i = 8; i; i--){
     ReconPtr[0] = clamp255((((int)RefPtr1[0] + (int)RefPtr2[0]) >> 1) + ChangePtr[0] );
     ReconPtr[1] = clamp255((((int)RefPtr1[1] + (int)RefPtr2[1]) >> 1) + ChangePtr[1] );
     ReconPtr[2] = clamp255((((int)RefPtr1[2] + (int)RefPtr2[2]) >> 1) + ChangePtr[2] );
@@ -76,10 +91,20 @@
     ReconPtr[6] = clamp255((((int)RefPtr1[6] + (int)RefPtr2[6]) >> 1) + ChangePtr[6] );
     ReconPtr[7] = clamp255((((int)RefPtr1[7] + (int)RefPtr2[7]) >> 1) + ChangePtr[7] );
 
-    ChangePtr += BLOCK_HEIGHT_WIDTH;
+    ChangePtr += 8;
     ReconPtr += LineStep;
     RefPtr1 += LineStep;
     RefPtr2 += LineStep;
   }
+}
 
+void dsp_recon_init (DspFunctions *funcs)               /* install the reconstruction routines into *funcs */
+{
+  funcs->copy8x8 = copy8x8__c;                          /* portable C versions are installed first ... */
+  funcs->recon_intra8x8 = recon_intra8x8__c;
+  funcs->recon_inter8x8 = recon_inter8x8__c;
+  funcs->recon_inter8x8_half = recon_inter8x8_half__c;
+  if (cpu_flags & CPU_X86_MMX) {                        /* ... then the MMX initialiser runs when MMX was detected */
+    dsp_i386_mmx_recon_init(funcs);                     /* pass the caller's table, not the global dsp_funcs; presumably overrides entries -- defined elsewhere */
+  }
 }
Index: lib/dct_encode.c
===================================================================
--- lib/dct_encode.c	(revision 7621)
+++ lib/dct_encode.c	(working copy)
@@ -17,110 +17,10 @@
 
 #include <stdlib.h>
 #include "encoder_internal.h"
+#include "dsp.h"
 
 static int ModeUsesMC[MAX_MODES] = { 0, 0, 1, 1, 1, 0, 1, 1 };
 
-static void Sub8 (unsigned char *FiltPtr, unsigned char *ReconPtr,
-                  ogg_int16_t *DctInputPtr, unsigned char *old_ptr1,
-                  unsigned char *new_ptr1, ogg_uint32_t PixelsPerLine,
-                  ogg_uint32_t ReconPixelsPerLine ) {
-  int i;
-
-  /* For each block row */
-  for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ){
-    DctInputPtr[0] = (ogg_int16_t)((int)(FiltPtr[0]) - ((int)ReconPtr[0]) );
-    DctInputPtr[1] = (ogg_int16_t)((int)(FiltPtr[1]) - ((int)ReconPtr[1]) );
-    DctInputPtr[2] = (ogg_int16_t)((int)(FiltPtr[2]) - ((int)ReconPtr[2]) );
-    DctInputPtr[3] = (ogg_int16_t)((int)(FiltPtr[3]) - ((int)ReconPtr[3]) );
-    DctInputPtr[4] = (ogg_int16_t)((int)(FiltPtr[4]) - ((int)ReconPtr[4]) );
-    DctInputPtr[5] = (ogg_int16_t)((int)(FiltPtr[5]) - ((int)ReconPtr[5]) );
-    DctInputPtr[6] = (ogg_int16_t)((int)(FiltPtr[6]) - ((int)ReconPtr[6]) );
-    DctInputPtr[7] = (ogg_int16_t)((int)(FiltPtr[7]) - ((int)ReconPtr[7]) );
-
-    /* Update the screen canvas in one step*/
-    ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0];
-    ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1];
-
-    /* Start next row */
-    new_ptr1 += PixelsPerLine;
-    old_ptr1 += PixelsPerLine;
-    FiltPtr += PixelsPerLine;
-    ReconPtr += ReconPixelsPerLine;
-    DctInputPtr += BLOCK_HEIGHT_WIDTH;
-  }
-}
-
-static void Sub8_128 (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
-                      unsigned char *old_ptr1, unsigned char *new_ptr1,
-                      ogg_uint32_t PixelsPerLine ) {
-  int i;
-  /* For each block row */
-  for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ){
-    /* INTRA mode so code raw image data */
-    /* We convert the data to 8 bit signed (by subtracting 128) as
-       this reduces the internal precision requirments in the DCT
-       transform. */
-    DctInputPtr[0] = (ogg_int16_t)((int)(FiltPtr[0]) - 128);
-    DctInputPtr[1] = (ogg_int16_t)((int)(FiltPtr[1]) - 128);
-    DctInputPtr[2] = (ogg_int16_t)((int)(FiltPtr[2]) - 128);
-    DctInputPtr[3] = (ogg_int16_t)((int)(FiltPtr[3]) - 128);
-    DctInputPtr[4] = (ogg_int16_t)((int)(FiltPtr[4]) - 128);
-    DctInputPtr[5] = (ogg_int16_t)((int)(FiltPtr[5]) - 128);
-    DctInputPtr[6] = (ogg_int16_t)((int)(FiltPtr[6]) - 128);
-    DctInputPtr[7] = (ogg_int16_t)((int)(FiltPtr[7]) - 128);
-
-    /* Update the screen canvas in one step */
-    ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0];
-    ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1];
-
-    /* Start next row */
-    new_ptr1 += PixelsPerLine;
-    old_ptr1 += PixelsPerLine;
-    FiltPtr += PixelsPerLine;
-    DctInputPtr += BLOCK_HEIGHT_WIDTH;
-  }
-}
-
-static void Sub8Av2 (unsigned char *FiltPtr, unsigned char *ReconPtr1,
-                     unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
-                     unsigned char *old_ptr1, unsigned char *new_ptr1,
-                     ogg_uint32_t PixelsPerLine,
-                     ogg_uint32_t ReconPixelsPerLine ) {
-  int i;
-
-  /* For each block row */
-  for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
-    DctInputPtr[0] = (ogg_int16_t)
-      ((int)(FiltPtr[0]) - (((int)ReconPtr1[0] + (int)ReconPtr2[0]) / 2) );
-    DctInputPtr[1] = (ogg_int16_t)
-      ((int)(FiltPtr[1]) - (((int)ReconPtr1[1] + (int)ReconPtr2[1]) / 2) );
-    DctInputPtr[2] = (ogg_int16_t)
-      ((int)(FiltPtr[2]) - (((int)ReconPtr1[2] + (int)ReconPtr2[2]) / 2) );
-    DctInputPtr[3] = (ogg_int16_t)
-      ((int)(FiltPtr[3]) - (((int)ReconPtr1[3] + (int)ReconPtr2[3]) / 2) );
-    DctInputPtr[4] = (ogg_int16_t)
-      ((int)(FiltPtr[4]) - (((int)ReconPtr1[4] + (int)ReconPtr2[4]) / 2) );
-    DctInputPtr[5] = (ogg_int16_t)
-      ((int)(FiltPtr[5]) - (((int)ReconPtr1[5] + (int)ReconPtr2[5]) / 2) );
-    DctInputPtr[6] = (ogg_int16_t)
-      ((int)(FiltPtr[6]) - (((int)ReconPtr1[6] + (int)ReconPtr2[6]) / 2) );
-    DctInputPtr[7] = (ogg_int16_t)
-      ((int)(FiltPtr[7]) - (((int)ReconPtr1[7] + (int)ReconPtr2[7]) / 2) );
-
-    /* Update the screen canvas in one step */
-    ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0];
-    ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1];
-
-    /* Start next row */
-    new_ptr1 += PixelsPerLine;
-    old_ptr1 += PixelsPerLine;
-    FiltPtr += PixelsPerLine;
-    ReconPtr1 += ReconPixelsPerLine;
-    ReconPtr2 += ReconPixelsPerLine;
-    DctInputPtr += BLOCK_HEIGHT_WIDTH;
-  }
-}
-
 static unsigned char TokenizeDctValue (ogg_int16_t DataValue,
                                        ogg_uint32_t * TokenListPtr ){
   unsigned char tokens_added = 0;
@@ -452,13 +352,15 @@
 
   /* Is the MV offset exactly pixel alligned */
   if ( AbsRefOffset == 0 ){
-    Sub8( FiltPtr, ReconPtr1, DctInputPtr, old_ptr1, new_ptr1,
-               PixelsPerLine, ReconPixelsPerLine );
+    dsp_static_sub8x8( FiltPtr, ReconPtr1, DctInputPtr,
+               PixelsPerLine, ReconPixelsPerLine);
+    dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine);
   } else {
     /* Fractional pixel MVs. */
     /* Note that we only use two pixel values even for the diagonal */
-    Sub8Av2(FiltPtr, ReconPtr1,ReconPtr2,DctInputPtr, old_ptr1,
-                 new_ptr1, PixelsPerLine, ReconPixelsPerLine );
+    dsp_static_sub8x8avg2(FiltPtr, ReconPtr1,ReconPtr2,DctInputPtr,
+                 PixelsPerLine, ReconPixelsPerLine);
+    dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine);
   }
 }
 
@@ -534,17 +436,18 @@
         pb.GoldenFrame[cpi->pb.recon_pixel_index_table[FragIndex]];
     }
 
-    Sub8( FiltPtr, ReconPtr1, DctInputPtr, old_ptr1, new_ptr1,
-               PixelsPerLine, ReconPixelsPerLine );
+    dsp_static_sub8x8( FiltPtr, ReconPtr1, DctInputPtr,
+               PixelsPerLine, ReconPixelsPerLine);
+    dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine);
   } else if ( cpi->pb.CodingMode==CODE_INTRA ) {
-    Sub8_128(FiltPtr, DctInputPtr, old_ptr1, new_ptr1, PixelsPerLine);
-
+    dsp_static_sub8x8_128(FiltPtr, DctInputPtr, PixelsPerLine);
+    dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine);
   }
 
   /* Proceed to encode the data into the encode buffer if the encoder
      is enabled. */
   /* Perform a 2D DCT transform on the data. */
-  fdct_short( cpi->DCTDataBuffer, cpi->DCT_codes );
+  dsp_static_fdct_short( cpi->DCTDataBuffer, cpi->DCT_codes );
 
   /* Quantize that transform data. */
   quantize ( &cpi->pb, cpi->DCT_codes, cpi->pb.QFragData[FragIndex] );
Index: lib/cpu.c
===================================================================
--- lib/cpu.c	(revision 0)
+++ lib/cpu.c	(revision 0)
@@ -0,0 +1,107 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function: run-time detection of x86 SIMD extensions (MMX, MMXEXT, SSE, SSE2, 3DNow!)
+  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
+
+ ********************************************************************/
+
+#include "cpu.h"
+
+ogg_uint32_t cpu_flags = 0;
+
+#if 1
+static ogg_uint32_t cpu_get_flags (void)   /* returns an OR of CPU_X86_* bits, or 0 when nothing usable is detected */
+{
+  ogg_uint32_t eax, ebx, ecx, edx;
+  ogg_uint32_t flags = 0;   /* must start at 0: unrecognised vendors fall through to the final return */
+
+#define cpuid(op,eax,ebx,ecx,edx)      \
+  asm volatile ("pushl %%ebx   \n\t"   \
+                "cpuid         \n\t"   \
+                "movl %%ebx,%1 \n\t"   \
+                "popl %%ebx"           \
+              : "=a" (eax),            \
+                "=S" (ebx),  /* pinned to %esi: with "=r" the compiler may pick %ebx, which the popl above overwrites */ \
+                "=c" (ecx),            \
+                "=d" (edx)             \
+              : "a" (op)               \
+              : "cc")
+
+  asm volatile ("pushfl              \n\t"   /* keep the caller's EFLAGS for the final popfl */
+                "pushfl              \n\t"
+                "popl %0             \n\t"   /* %0 = current EFLAGS */
+                "movl %0,%1          \n\t"   /* %1 = unmodified copy for the comparison below */
+                "xorl $0x200000,%0   \n\t"   /* flip bit 21 */
+                "pushl %0            \n\t"
+                "popfl               \n\t"   /* try to write the flipped value back */
+                "pushfl              \n\t"
+                "popl %0             \n\t"   /* %0 = EFLAGS as the CPU actually kept it */
+                "popfl"                      /* restore the caller's EFLAGS */
+              : "=r" (eax),
+                "=r" (ebx)
+              :
+              : "cc");
+
+  if (eax == ebx)             /* no cpuid */
+    return 0;
+
+  cpuid(0, eax, ebx, ecx, edx);   /* leaf 0: vendor string comes back in ebx, edx, ecx */
+
+  if (ebx == 0x756e6547 &&    /* "Genu" */
+      edx == 0x49656e69 &&    /* "ineI" */
+      ecx == 0x6c65746e) {    /* "ntel" */
+    /* intel */
+
+  inteltest:                  /* also entered from the AMD branch when no extended leaf exists */
+    cpuid(1, eax, ebx, ecx, edx);
+    if ((edx & 0x00800000) == 0)    /* bit 23 clear: no MMX, so report nothing at all */
+      return 0;
+    flags = CPU_X86_MMX;
+    if (edx & 0x02000000)           /* bit 25 */
+      flags |= CPU_X86_MMXEXT | CPU_X86_SSE;
+    if (edx & 0x04000000)           /* bit 26 */
+      flags |= CPU_X86_SSE2;
+    return flags;
+  } else if (ebx == 0x68747541 &&   /* "Auth" */
+             edx == 0x69746e65 &&   /* "enti" */
+             ecx == 0x444d4163) {   /* "cAMD" */
+    /* AMD */
+    cpuid(0x80000000, eax, ebx, ecx, edx);   /* eax = highest extended leaf supported */
+    if ((unsigned)eax < 0x80000001)          /* no extended feature leaf: use the standard bits instead */
+      goto inteltest;
+    cpuid(0x80000001, eax, ebx, ecx, edx);
+    if ((edx & 0x00800000) == 0)    /* bit 23 clear: no MMX, so report nothing at all */
+      return 0;
+    flags = CPU_X86_MMX;
+    if (edx & 0x80000000)           /* bit 31 */
+      flags |= CPU_X86_3DNOW;
+    if (edx & 0x00400000)           /* bit 22 */
+      flags |= CPU_X86_MMXEXT;
+    return flags;
+  }
+  else {
+    /* implement me */
+  }
+
+  return flags;               /* still 0 here: vendor was neither Intel nor AMD */
+}
+#else
+static ogg_uint32_t cpu_get_flags (void) {
+  return 0;
+}
+#endif
+
+void cpu_init (void)             /* run the CPU probe and store the result in the global cpu_flags */
+{
+  cpu_flags = cpu_get_flags();   /* cpu_flags is 0 until this has been called */
+}
Index: lib/cpu.h
===================================================================
--- lib/cpu.h	(revision 0)
+++ lib/cpu.h	(revision 0)
@@ -0,0 +1,28 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function: CPU feature flag bits and the global cpu_flags word
+  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
+
+ ********************************************************************/
+#ifndef THEORA_LIB_CPU_H         /* include guard: makes repeated inclusion harmless */
+#define THEORA_LIB_CPU_H
+#include "encoder_internal.h"
+extern ogg_uint32_t cpu_flags;   /* OR of the CPU_X86_* bits below */
+
+#define CPU_X86_MMX	(1<<0)
+#define CPU_X86_3DNOW	(1<<1)
+#define CPU_X86_MMXEXT	(1<<2)
+#define CPU_X86_SSE	(1<<3)
+#define CPU_X86_SSE2	(1<<4)
+void cpu_init (void);            /* proper prototype: takes no arguments */
+#endif /* THEORA_LIB_CPU_H */
Index: lib/i386/fdct_mmx.c
===================================================================
--- lib/i386/fdct_mmx.c	(revision 0)
+++ lib/i386/fdct_mmx.c	(revision 0)
@@ -0,0 +1,340 @@
+;//==========================================================================
+;//
+;//  THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
+;//  KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+;//  IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR
+;//  PURPOSE.
+;//
+;//  Copyright (c) 1999 - 2001  On2 Technologies Inc. All Rights Reserved.
+;//
+;//--------------------------------------------------------------------------
+
+#include <theora/theora.h>
+#include "dsp.h"
+
+static const __attribute__ ((aligned(8), used)) ogg_int64_t xC1S7 = 0x0fb15fb15fb15fb15LL; /* 4 x 0xfb15 = cos(1*pi/16) * 65536; `used` keeps the object although only asm strings name it */
+static const __attribute__ ((aligned(8), used)) ogg_int64_t xC2S6 = 0x0ec83ec83ec83ec83LL; /* 4 x 0xec83 = cos(2*pi/16) * 65536 */
+static const __attribute__ ((aligned(8), used)) ogg_int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL; /* 4 x 0xd4db = cos(3*pi/16) * 65536 */
+static const __attribute__ ((aligned(8), used)) ogg_int64_t xC4S4 = 0x0b505b505b505b505LL; /* 4 x 0xb505 = cos(4*pi/16) * 65536 */
+static const __attribute__ ((aligned(8), used)) ogg_int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL; /* 4 x 0x8e3a = cos(5*pi/16) * 65536 */
+static const __attribute__ ((aligned(8), used)) ogg_int64_t xC6S2 = 0x061f861f861f861f8LL; /* 4 x 0x61f8 = cos(6*pi/16) * 65536; below 0x8000, so positive as a signed word */
+static const __attribute__ ((aligned(8), used)) ogg_int64_t xC7S1 = 0x031f131f131f131f1LL; /* 4 x 0x31f1 = cos(7*pi/16) * 65536; below 0x8000, so positive as a signed word */
+
+#if defined(__MINGW32__) || defined(__CYGWIN__) || \
+    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
+# define M(a) "_" #a   /* symbol name as an asm string with a leading '_'; presumably these targets prefix C symbols that way */
+#else
+# define M(a) #a       /* symbol name as an asm string, unchanged; pasted into the asm text, e.g. "pmulhw " M(xC4S4) */
+#endif
+
+/***********************************************************************
+ *	File:			fdct_m.asm
+ *
+ *	Description:
+ *					This function perform 2-D Forward DCT on a 8x8 block
+ *					
+ *
+ *	Input:			Pointers to input source data buffer and destination 
+ *					buffer.
+ *
+ *	Note:			none
+ *
+ *	Special Notes:	We try to do the truncation right to match the result 
+ *					of the c version. 
+ *
+ ************************************************************************/
+
+/* execute stage 1 of forward DCT: asm text working on four 16-bit lanes at once; ip0..ip7 are memory operands that are read and then overwritten, temp is one 8-byte scratch operand, mm0-mm7 are all used */
+#define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp)                        \
+  "  movq      " #ip0 ", %%mm0      \n\t"                                     \
+  "  movq      " #ip1 ", %%mm1      \n\t"                                     \
+  "  movq      " #ip3 ", %%mm2      \n\t"                                     \
+  "  movq      " #ip5 ", %%mm3      \n\t"                                     \
+  "  movq        %%mm0, %%mm4       \n\t"                                     \
+  "  movq        %%mm1, %%mm5       \n\t"                                     \
+  "  movq        %%mm2, %%mm6       \n\t"                                     \
+  "  movq        %%mm3, %%mm7       \n\t"                                     \
+                                                                              \
+  "  paddsw    " #ip7 ", %%mm0      \n\t" /* mm0 = ip0 + ip7 = is07 */        \
+  "  paddsw    " #ip2 ", %%mm1      \n\t" /* mm1 = ip1 + ip2 = is12 */        \
+  "  paddsw    " #ip4 ", %%mm2      \n\t" /* mm2 = ip3 + ip4 = is34 */        \
+  "  paddsw    " #ip6 ", %%mm3      \n\t" /* mm3 = ip5 + ip6 = is56 */        \
+  "  psubsw    " #ip7 ", %%mm4      \n\t" /* mm4 = ip0 - ip7 = id07 */        \
+  "  psubsw    " #ip2 ", %%mm5      \n\t" /* mm5 = ip1 - ip2 = id12 */        \
+                                                                              \
+  "  psubsw      %%mm2, %%mm0       \n\t" /* mm0 = is07 - is34 */             \
+                                                                              \
+  "  paddsw      %%mm2, %%mm2       \n\t"                                     \
+                                                                              \
+  "  psubsw    " #ip4 ", %%mm6      \n\t" /* mm6 = ip3 - ip4 = id34 */        \
+                                                                              \
+  "  paddsw      %%mm0, %%mm2       \n\t" /* mm2 = is07 + is34 = is0734 */    \
+  "  psubsw      %%mm3, %%mm1       \n\t" /* mm1 = is12 - is56 */             \
+  "  movq        %%mm0," #temp "    \n\t" /* Save is07 - is34 to free mm0; */ \
+  "  paddsw      %%mm3, %%mm3       \n\t"                                     \
+  "  paddsw      %%mm1, %%mm3       \n\t" /* mm3 = is12 + is56 = is1256 */    \
+                                                                              \
+  "  psubsw    " #ip6 ", %%mm7      \n\t" /* mm7 = ip5 - ip6 = id56 */        \
+  /* ------------------------------------------------------------------- */   \
+  "  psubsw      %%mm7, %%mm5       \n\t" /* mm5 = id12 - id56 */             \
+  "  paddsw      %%mm7, %%mm7       \n\t"                                     \
+  "  paddsw      %%mm5, %%mm7       \n\t" /* mm7 = id12 + id56 */             \
+  /* ------------------------------------------------------------------- */   \
+  "  psubsw      %%mm3, %%mm2       \n\t" /* mm2 = is0734 - is1256 */         \
+  "  paddsw      %%mm3, %%mm3       \n\t"                                     \
+                                                                              \
+  "  movq        %%mm2, %%mm0       \n\t" /* make a copy */                   \
+  "  paddsw      %%mm2, %%mm3       \n\t" /* mm3 = is0734 + is1256 */         \
+                                                                              \
+  "  pmulhw   "M(xC4S4)", %%mm0     \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \
+  "  paddw       %%mm2, %%mm0       \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \
+  "  psrlw       $15, %%mm2         \n\t" /* mm2 = sign bit of each lane */   \
+  "  paddw       %%mm2, %%mm0       \n\t" /* Truncate mm0, now it is op[4] */ \
+                                                                              \
+  "  movq        %%mm3, %%mm2       \n\t"                                     \
+  "  movq        %%mm0," #ip4 "     \n\t" /* save ip4, now mm0,mm2 are free */ \
+                                                                              \
+  "  movq        %%mm3, %%mm0       \n\t"                                     \
+  "  pmulhw   "M(xC4S4)", %%mm3     \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
+                                                                              \
+  "  psrlw       $15, %%mm2         \n\t"                                     \
+  "  paddw       %%mm0, %%mm3       \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \
+  "  paddw       %%mm2, %%mm3       \n\t" /* Truncate mm3, now it is op[0] */ \
+                                                                              \
+  "  movq        %%mm3," #ip0 "     \n\t"                                     \
+  /* ------------------------------------------------------------------- */   \
+  "  movq      " #temp ", %%mm3     \n\t" /* mm3 = irot_input_y */            \
+  "  pmulhw   "M(xC2S6)", %%mm3     \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \
+                                                                              \
+  "  movq      " #temp ", %%mm2     \n\t"                                     \
+  "  movq        %%mm2, %%mm0       \n\t"                                     \
+                                                                              \
+  "  psrlw       $15, %%mm2         \n\t" /* mm2 = sign bit of each lane */   \
+  "  paddw       %%mm0, %%mm3       \n\t" /* mm3 = xC2S6 * irot_input_y */    \
+                                                                              \
+  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
+  "  movq        %%mm5, %%mm0       \n\t"                                     \
+                                                                              \
+  "  movq        %%mm5, %%mm2       \n\t"                                     \
+  "  pmulhw   "M(xC6S2)", %%mm0     \n\t" /* mm0 = xC6S2 * irot_input_x */    \
+                                                                              \
+  "  psrlw       $15, %%mm2         \n\t"                                     \
+  "  paddw       %%mm2, %%mm0       \n\t" /* Truncated */                     \
+                                                                              \
+  "  paddsw      %%mm0, %%mm3       \n\t" /* ip[2] */                         \
+  "  movq        %%mm3," #ip2 "     \n\t" /* Save ip2 */                      \
+                                                                              \
+  "  movq        %%mm5, %%mm0       \n\t"                                     \
+  "  movq        %%mm5, %%mm2       \n\t"                                     \
+                                                                              \
+  "  pmulhw   "M(xC2S6)", %%mm5     \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \
+  "  psrlw       $15, %%mm2         \n\t"                                     \
+                                                                              \
+  "  movq      " #temp ", %%mm3     \n\t"                                     \
+  "  paddw       %%mm0, %%mm5       \n\t" /* mm5 = xC2S6 * irot_input_x */    \
+                                                                              \
+  "  paddw       %%mm2, %%mm5       \n\t" /* Truncated */                     \
+  "  movq        %%mm3, %%mm2       \n\t"                                     \
+                                                                              \
+  "  pmulhw   "M(xC6S2)", %%mm3     \n\t" /* mm3 = xC6S2 * irot_input_y */    \
+  "  psrlw       $15, %%mm2         \n\t"                                     \
+                                                                              \
+  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
+  "  psubsw      %%mm5, %%mm3       \n\t"                                     \
+                                                                              \
+  "  movq        %%mm3," #ip6 "     \n\t"                                     \
+  /* ------------------------------------------------------------------- */   \
+  "  movq     "M(xC4S4)", %%mm0     \n\t"                                     \
+  "  movq        %%mm1, %%mm2       \n\t"                                     \
+  "  movq        %%mm1, %%mm3       \n\t"                                     \
+                                                                              \
+  "  pmulhw      %%mm0, %%mm1       \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
+  "  psrlw       $15, %%mm2         \n\t"				      \
+                                                                              \
+  "  paddw       %%mm3, %%mm1       \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) */ \
+  "  paddw       %%mm2, %%mm1       \n\t" /* Truncate mm1, now it is icommon_product1 */ \
+                                                                              \
+  "  movq        %%mm7, %%mm2       \n\t"                                     \
+  "  movq        %%mm7, %%mm3       \n\t"			              \
+                                                                              \
+  "  pmulhw      %%mm0, %%mm7       \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
+  "  psrlw       $15, %%mm2         \n\t"			              \
+                                                                              \
+  "  paddw       %%mm3, %%mm7       \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
+  "  paddw       %%mm2, %%mm7       \n\t" /* Truncate mm7, now it is icommon_product2 */ \
+  /* ------------------------------------------------------------------- */   \
+  "  pxor        %%mm0, %%mm0       \n\t" /* Clear mm0 */                     \
+  "  psubsw      %%mm6, %%mm0       \n\t" /* mm0 = - id34 */                  \
+                                                                              \
+  "  psubsw      %%mm7, %%mm0       \n\t" /* mm0 = - ( id34 + icommon_product2 ) */ \
+  "  paddsw      %%mm6, %%mm6       \n\t"                                     \
+  "  paddsw      %%mm0, %%mm6       \n\t" /* mm6 = id34 - icommon_product2 */ \
+                                                                              \
+  "  psubsw      %%mm1, %%mm4       \n\t" /* mm4 = id07 - icommon_product1 */ \
+  "  paddsw      %%mm1, %%mm1       \n\t"                                     \
+  "  paddsw      %%mm4, %%mm1       \n\t" /* mm1 = id07 + icommon_product1 */ \
+  /* ------------------------------------------------------------------- */   \
+  "  movq     "M(xC1S7)", %%mm7     \n\t"                                     \
+  "  movq        %%mm1, %%mm2       \n\t"                                     \
+                                                                              \
+  "  movq        %%mm1, %%mm3       \n\t"                                     \
+  "  pmulhw      %%mm7, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \
+                                                                              \
+  "  movq     "M(xC7S1)", %%mm7     \n\t"                                     \
+  "  psrlw       $15, %%mm2         \n\t"                                     \
+                                                                              \
+  "  paddw       %%mm3, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x */    \
+  "  paddw       %%mm2, %%mm1       \n\t" /* Truncated */                     \
+                                                                              \
+  "  pmulhw      %%mm7, %%mm3       \n\t" /* mm3 = xC7S1 * irot_input_x */    \
+  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
+                                                                              \
+  "  movq        %%mm0, %%mm5       \n\t"                                     \
+  "  movq        %%mm0, %%mm2       \n\t"                                     \
+                                                                              \
+  "  movq     "M(xC1S7)", %%mm7     \n\t"                                     \
+  "  pmulhw      %%mm7, %%mm0       \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \
+                                                                              \
+  "  movq     "M(xC7S1)", %%mm7     \n\t"                                     \
+  "  psrlw       $15, %%mm2         \n\t"                                     \
+                                                                              \
+  "  paddw       %%mm5, %%mm0       \n\t" /* mm0 = xC1S7 * irot_input_y */    \
+  "  paddw       %%mm2, %%mm0       \n\t" /* Truncated */                     \
+                                                                              \
+  "  pmulhw      %%mm7, %%mm5       \n\t" /* mm5 = xC7S1 * irot_input_y */    \
+  "  paddw       %%mm2, %%mm5       \n\t" /* Truncated */                     \
+                                                                              \
+  "  psubsw      %%mm5, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \
+  "  paddsw      %%mm0, %%mm3       \n\t" /* mm3 = xC7S1 * irot_input_x + xC1S7 * irot_input_y = ip7 */ \
+                                                                              \
+  "  movq        %%mm1," #ip1 "     \n\t"                                     \
+  "  movq        %%mm3," #ip7 "     \n\t"                                     \
+  /* ------------------------------------------------------------------- */   \
+  "  movq     "M(xC3S5)", %%mm0     \n\t"                                     \
+  "  movq     "M(xC5S3)", %%mm1     \n\t"                                     \
+                                                                              \
+  "  movq        %%mm6, %%mm5       \n\t"                                     \
+  "  movq        %%mm6, %%mm7       \n\t"                                     \
+                                                                              \
+  "  movq        %%mm4, %%mm2       \n\t"                                     \
+  "  movq        %%mm4, %%mm3       \n\t"                                     \
+                                                                              \
+  "  pmulhw      %%mm0, %%mm4       \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \
+  "  pmulhw      %%mm1, %%mm6       \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \
+                                                                              \
+  "  psrlw       $15, %%mm2         \n\t"                                     \
+  "  psrlw       $15, %%mm5         \n\t"                                     \
+                                                                              \
+  "  paddw       %%mm3, %%mm4       \n\t" /* mm4 = xC3S5 * irot_input_x */    \
+  "  paddw       %%mm7, %%mm6       \n\t" /* mm6 = xC5S3 * irot_input_y */    \
+                                                                              \
+  "  paddw       %%mm2, %%mm4       \n\t" /* Truncated */                     \
+  "  paddw       %%mm5, %%mm6       \n\t" /* Truncated */                     \
+                                                                              \
+  "  psubsw      %%mm6, %%mm4       \n\t" /* ip3 */                           \
+  "  movq        %%mm4," #ip3 "     \n\t"                                     \
+                                                                              \
+  "  movq        %%mm3, %%mm4       \n\t"                                     \
+  "  movq        %%mm7, %%mm6       \n\t"                                     \
+                                                                              \
+  "  pmulhw      %%mm1, %%mm3       \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \
+  "  pmulhw      %%mm0, %%mm7       \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \
+                                                                              \
+  "  paddw       %%mm2, %%mm4       \n\t"                                     \
+  "  paddw       %%mm5, %%mm6       \n\t"                                     \
+                                                                              \
+  "  paddw       %%mm4, %%mm3       \n\t" /* mm3 = xC5S3 * irot_input_x */    \
+  "  paddw       %%mm6, %%mm7       \n\t" /* mm7 = xC3S5 * irot_input_y */    \
+                                                                              \
+  "  paddw       %%mm7, %%mm3       \n\t" /* ip5 */                           \
+  "  movq        %%mm3," #ip5 "     \n\t" 
+/* Transpose_mmx: asm text reading eight 4-word rows a..h (ip0..ip7); writes op0..op3 = {d,c,b,a}[0..3] and op4..op7 = {h,g,f,e}[0..3]. */
+#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,                  \
+		      op0,op1,op2,op3,op4,op5,op6,op7)                  \
+  "  movq      " #ip0 ", %%mm0      \n\t" /* mm0 = a0 a1 a2 a3 */       \
+  "  movq      " #ip4 ", %%mm4      \n\t" /* mm4 = e0 e1 e2 e3 */       \
+  "  movq      " #ip1 ", %%mm1      \n\t" /* mm1 = b0 b1 b2 b3 */       \
+  "  movq      " #ip5 ", %%mm5      \n\t" /* mm5 = f0 f1 f2 f3 */       \
+  "  movq      " #ip2 ", %%mm2      \n\t" /* mm2 = c0 c1 c2 c3 */       \
+  "  movq      " #ip6 ", %%mm6      \n\t" /* mm6 = g0 g1 g2 g3 */       \
+  "  movq      " #ip3 ", %%mm3      \n\t" /* mm3 = d0 d1 d2 d3 */       \
+  "  movq        %%mm1," #op1 "     \n\t" /* park b in op1 (written before ip7 is read) */ \
+  "  movq      " #ip7 ", %%mm7      \n\t" /* mm7 = h0 h1 h2 h3 */       \
+   /* Transpose rows e..h; results go to op4..op7 (words listed high to low below) */ \
+  "  movq        %%mm4, %%mm1       \n\t" /* mm1 = e3 e2 e1 e0 */       \
+  "  punpcklwd   %%mm5, %%mm4       \n\t" /* mm4 = f1 e1 f0 e0 */       \
+  "  movq        %%mm0," #op0 "     \n\t" /* park a in op0 to free mm0 */ \
+  "  punpckhwd	 %%mm5, %%mm1       \n\t" /* mm1 = f3 e3 f2 e2 */       \
+  "  movq        %%mm6, %%mm0       \n\t" /* mm0 = g3 g2 g1 g0 */       \
+  "  punpcklwd	 %%mm7, %%mm6       \n\t" /* mm6 = h1 g1 h0 g0 */       \
+  "  movq        %%mm4, %%mm5       \n\t" /* mm5 = f1 e1 f0 e0 */       \
+  "  punpckldq   %%mm6, %%mm4       \n\t" /* mm4 = h0 g0 f0 e0 -> op4 */ \
+  "  punpckhdq   %%mm6, %%mm5       \n\t" /* mm5 = h1 g1 f1 e1 -> op5 */ \
+  "  movq        %%mm1, %%mm6       \n\t" /* mm6 = f3 e3 f2 e2 */       \
+  "  movq        %%mm4," #op4 "     \n\t"                               \
+  "  punpckhwd   %%mm7, %%mm0       \n\t" /* mm0 = h3 g3 h2 g2 */       \
+  "  movq        %%mm5," #op5 "     \n\t"                               \
+  "  punpckhdq   %%mm0, %%mm6       \n\t" /* mm6 = h3 g3 f3 e3 -> op7 */ \
+  "  movq      " #op0 ", %%mm4      \n\t" /* mm4 = a3 a2 a1 a0 (reloaded from op0) */ \
+  "  punpckldq   %%mm0, %%mm1       \n\t" /* mm1 = h2 g2 f2 e2 -> op6 */ \
+  "  movq      " #op1 ", %%mm5      \n\t" /* mm5 = b3 b2 b1 b0 (reloaded from op1) */ \
+  "  movq        %%mm4, %%mm0       \n\t" /* mm0 = a3 a2 a1 a0 */       \
+  "  movq        %%mm6," #op7 "     \n\t"                               \
+  "  punpcklwd   %%mm5, %%mm0       \n\t" /* mm0 = b1 a1 b0 a0 */       \
+  "  movq        %%mm1," #op6 "     \n\t"                               \
+  "  punpckhwd   %%mm5, %%mm4       \n\t" /* mm4 = b3 a3 b2 a2 */       \
+  "  movq        %%mm2, %%mm5       \n\t" /* mm5 = c3 c2 c1 c0 */       \
+  "  punpcklwd   %%mm3, %%mm2       \n\t" /* mm2 = d1 c1 d0 c0 */       \
+  "  movq        %%mm0, %%mm1       \n\t" /* mm1 = b1 a1 b0 a0 */       \
+  "  punpckldq   %%mm2, %%mm0       \n\t" /* mm0 = d0 c0 b0 a0 -> op0 */ \
+  "  punpckhdq   %%mm2, %%mm1       \n\t" /* mm1 = d1 c1 b1 a1 -> op1 */ \
+  "  movq        %%mm4, %%mm2       \n\t" /* mm2 = b3 a3 b2 a2 */       \
+  "  movq        %%mm0," #op0 "     \n\t"                               \
+  "  punpckhwd   %%mm3, %%mm5       \n\t" /* mm5 = d3 c3 d2 c2 */       \
+  "  movq        %%mm1," #op1 "     \n\t"                               \
+  "  punpckhdq   %%mm5, %%mm4       \n\t" /* mm4 = d3 c3 b3 a3 -> op3 */ \
+  "  punpckldq   %%mm5, %%mm2       \n\t" /* mm2 = d2 c2 b2 a2 -> op2 */ \
+  "  movq        %%mm4," #op3 "     \n\t"                               \
+  "  movq        %%mm2," #op2 "     \n\t"
+
+/* fdct_short__mmx: MMX fdct_short. Reads the 8x8 block of 16-bit values at InputData, writes OutputData (also reused in place). */
+static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData)
+{
+  ogg_int64_t __attribute__((aligned(8))) align_tmp[16];   /* 128 bytes of 8-byte-aligned scratch */
+  ogg_int16_t *const temp= (ogg_int16_t*)align_tmp;        /* cast with the ogg typedef; C99 int16_t is not guaranteed to be declared here */
+
+  __asm__ __volatile__ (
+    "  .balign 16                   \n\t"
+    /*
+     * Rows are 16 bytes apart; offsets +0 and +8 select a row's two 4-word halves.
+     * First pass: rows 0-3, then rows 4-7, are transposed from InputData into
+     * OutputData and Fdct_mmx is run on each result (temp is its last argument).
+     */
+    Transpose_mmx (  (%0), 16(%0), 32(%0), 48(%0),  8(%0), 24(%0), 40(%0), 56(%0),
+		     (%1), 16(%1), 32(%1), 48(%1),  8(%1), 24(%1), 40(%1), 56(%1))
+    Fdct_mmx      (  (%1), 16(%1), 32(%1), 48(%1),  8(%1), 24(%1), 40(%1), 56(%1), (%2))
+
+    Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
+		   64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
+    Fdct_mmx      (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
+    /* Second pass, in place on OutputData: left halves of rows 0-7, then right halves. */
+    Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
+		    0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
+    Fdct_mmx      ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
+
+    Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
+		    8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
+    Fdct_mmx      ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
+
+    "  emms                         \n\t" /* clear MMX state before returning to C code */
+    
+    : "+r" (InputData),
+      "+r" (OutputData)
+    : "r" (temp)
+    : "memory"
+  );
+}
+/* dsp_i386_mmx_fdct_init: point the fdct_short slot of the supplied function table at the MMX routine. */
+void dsp_i386_mmx_fdct_init(DspFunctions *funcs)
+{
+  (*funcs).fdct_short = &fdct_short__mmx;
+}
Index: lib/i386/dsp_mmx.c
===================================================================
--- lib/i386/dsp_mmx.c	(revision 0)
+++ lib/i386/dsp_mmx.c	(revision 0)
@@ -0,0 +1,642 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include "dsp.h"
+
+static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x0080008000800080LL; /* four 16-bit words of 128; 'used' keeps the object when only asm text names it */
+/* M(a): spelling of C symbol a inside asm text; the targets listed below get a leading underscore. */
+#if defined(__MINGW32__) || defined(__CYGWIN__) || \
+    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
+# define M(a) "_" #a
+#else
+# define M(a) #a
+#endif
+/* Scalar helpers on int-promoted operands: truncating average, difference, absolute difference. */
+#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
+#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
+#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
+/* sub8x8__mmx: DctInputPtr[8 rows x 8 words] = FiltPtr - ReconPtr per pixel; rows advance by PixelsPerLine / ReconPixelsPerLine bytes. No emms is issued here. */
+static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
+                  ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
+                  ogg_uint32_t ReconPixelsPerLine) 
+{
+  __asm__ __volatile__ (
+    "  .balign 16                   \n\t"
+
+    "  pxor        %%mm7, %%mm7     \n\t" /* mm7 = 0, supplies the high byte when widening */
+
+    ".rept 8                        \n\t" /* assembler-level repeat: one copy of the body per row */
+    "  movq        (%0), %%mm0      \n\t" /* mm0 = 8 bytes at FiltPtr */
+    "  movq        (%1), %%mm1      \n\t" /* mm1 = 8 bytes at ReconPtr */
+    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
+    "  movq        %%mm1, %%mm3     \n\t" /* dup to prepare for up conversion */
+    /* convert from UINT8 to INT16 */
+    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr), low 4 bytes */
+    "  punpcklbw   %%mm7, %%mm1     \n\t" /* mm1 = INT16(ReconPtr), low 4 bytes */
+    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr), high 4 bytes */
+    "  punpckhbw   %%mm7, %%mm3     \n\t" /* mm3 = INT16(ReconPtr), high 4 bytes */
+    /* start calculation */
+    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - ReconPtr (low 4) */
+    "  psubw       %%mm3, %%mm2     \n\t" /* mm2 = FiltPtr - ReconPtr (high 4) */
+    "  movq        %%mm0,  (%2)     \n\t" /* write answer out */
+    "  movq        %%mm2, 8(%2)     \n\t" /* write answer out */
+    /* Increment pointers */
+    "  add         $16, %2           \n\t" /* output rows are contiguous: 8 words = 16 bytes */
+    "  add         %3, %0           \n\t"
+    "  add         %4, %1           \n\t"
+    ".endr                          \n\t"
+
+     : "+r" (FiltPtr),
+       "+r" (ReconPtr),
+       "+r" (DctInputPtr)
+     : "m" (PixelsPerLine),
+       "m" (ReconPixelsPerLine) 
+     : "memory"
+  );
+}
+/* sub8x8_128__mmx: DctInputPtr[8 rows x 8 words] = FiltPtr pixel - 128; rows advance by PixelsPerLine bytes. No emms is issued here. */
+static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
+                      ogg_uint32_t PixelsPerLine) 
+{
+  __asm__ __volatile__ (
+    "  .balign 16                   \n\t"
+
+    "  pxor        %%mm7, %%mm7     \n\t" /* mm7 = 0, supplies the high byte when widening */
+    "  movq        %3, %%mm1        \n\t" /* mm1 = V128, passed as an operand so the compiler sees the reference */
+
+    ".rept 8                        \n\t" /* assembler-level repeat: one copy of the body per row */
+    "  movq        (%0), %%mm0      \n\t" /* mm0 = 8 bytes at FiltPtr */
+    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
+    /* convert from UINT8 to INT16 */
+    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr), low 4 bytes */
+    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr), high 4 bytes */
+    /* start calculation */
+    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - 128 */
+    "  psubw       %%mm1, %%mm2     \n\t" /* mm2 = FiltPtr - 128 */
+    "  movq        %%mm0,  (%1)     \n\t" /* write answer out */
+    "  movq        %%mm2, 8(%1)     \n\t" /* write answer out */
+    /* Increment pointers */
+    "  add         $16, %1           \n\t" /* output rows are contiguous: 8 words = 16 bytes */
+    "  add         %2, %0           \n\t"
+    ".endr                          \n\t"
+
+     : "+r" (FiltPtr),
+       "+r" (DctInputPtr)
+     : "m" (PixelsPerLine), "m" (V128)
+     : "memory"
+  );
+}
+/* sub8x8avg2__mmx: DctInputPtr[8 rows x 8 words] = FiltPtr - ((ReconPtr1 + ReconPtr2) >> 1) per pixel; both Recon pointers advance by ReconPixelsPerLine. No emms here. */
+static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
+                     unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
+                     ogg_uint32_t PixelsPerLine,
+                     ogg_uint32_t ReconPixelsPerLine) 
+{
+  __asm__ __volatile__ (
+    "  .balign 16                   \n\t"
+
+    "  pxor        %%mm7, %%mm7     \n\t" /* mm7 = 0, supplies the high byte when widening */
+
+    ".rept 8                        \n\t" /* assembler-level repeat: one copy of the body per row */
+    "  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */
+    "  movq        (%1), %%mm1      \n\t" /* mm1 = ReconPtr1 */
+    "  movq        (%2), %%mm4      \n\t" /* mm4 = ReconPtr2 */
+    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
+    "  movq        %%mm1, %%mm3     \n\t" /* dup to prepare for up conversion */
+    "  movq        %%mm4, %%mm5     \n\t" /* dup to prepare for up conversion */
+    /* convert from UINT8 to INT16 */
+    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr), low 4 bytes */
+    "  punpcklbw   %%mm7, %%mm1     \n\t" /* mm1 = INT16(ReconPtr1), low 4 bytes */
+    "  punpcklbw   %%mm7, %%mm4     \n\t" /* mm4 = INT16(ReconPtr2), low 4 bytes */
+    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr), high 4 bytes */
+    "  punpckhbw   %%mm7, %%mm3     \n\t" /* mm3 = INT16(ReconPtr1), high 4 bytes */
+    "  punpckhbw   %%mm7, %%mm5     \n\t" /* mm5 = INT16(ReconPtr2), high 4 bytes */
+    /* average ReconPtr1 and ReconPtr2 */
+    "  paddw       %%mm4, %%mm1     \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */
+    "  paddw       %%mm5, %%mm3     \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */
+    "  psrlw       $1, %%mm1        \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2, rounded down */
+    "  psrlw       $1, %%mm3        \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2, rounded down */
+    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+    "  psubw       %%mm3, %%mm2     \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+    "  movq        %%mm0,  (%3)     \n\t" /* write answer out */
+    "  movq        %%mm2, 8(%3)     \n\t" /* write answer out */
+    /* Increment pointers */
+    "  add         $16, %3           \n\t" /* output rows are contiguous: 8 words = 16 bytes */
+    "  add         %4, %0           \n\t"
+    "  add         %5, %1           \n\t"
+    "  add         %5, %2           \n\t"
+    ".endr                          \n\t"
+
+     : "+r" (FiltPtr),
+       "+r" (ReconPtr1),
+       "+r" (ReconPtr2),
+       "+r" (DctInputPtr)
+     : "m" (PixelsPerLine),
+       "m" (ReconPixelsPerLine) 
+     : "memory"
+  );
+}
+/* row_sad8__mmx: over 8 bytes at Src1/Src2, returns max(SAD of bytes 0-3, SAD of bytes 4-7). No emms is issued here. */
+static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
+{
+  ogg_uint32_t MaxSad;
+
+  __asm__ __volatile__ (
+    "  .balign 16                   \n\t"
+
+    "  pxor        %%mm6, %%mm6     \n\t"	/* zero out mm6 for unpack */
+    "  pxor        %%mm7, %%mm7     \n\t" 	/* zero out mm7 for unpack */
+    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
+    "  movq        (%2), %%mm1      \n\t"
+
+    "  movq        %%mm0, %%mm2     \n\t"
+    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B, saturated at 0 */
+    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A, saturated at 0 */
+    "  por         %%mm1, %%mm0     \n\t"      	/* one side is 0, so OR gives |A - B| per byte */
+
+    "  movq        %%mm0, %%mm1     \n\t"
+
+    "  punpcklbw   %%mm6, %%mm0     \n\t"       /* unpack low four bytes to words */
+    "  punpckhbw   %%mm7, %%mm1     \n\t"       /* unpack high four bytes to words */
+
+    "  movq        %%mm0, %%mm2     \n\t"
+    "  movq        %%mm1, %%mm3     \n\t"
+    "  psrlq       $32, %%mm2       \n\t"	/* fold and add: words 2,3 onto words 0,1 */
+    "  psrlq       $32, %%mm3       \n\t"
+    "  paddw       %%mm2, %%mm0     \n\t"
+    "  paddw       %%mm3, %%mm1     \n\t"
+    "  movq        %%mm0, %%mm2     \n\t"
+    "  movq        %%mm1, %%mm3     \n\t"
+    "  psrlq       $16, %%mm2       \n\t"	/* second fold: word 0 ends up holding the 4-word sum */
+    "  psrlq       $16, %%mm3       \n\t"
+    "  paddw       %%mm2, %%mm0     \n\t"
+    "  paddw       %%mm3, %%mm1     \n\t"
+
+    "  psubusw     %%mm0, %%mm1     \n\t"
+    "  paddw       %%mm0, %%mm1     \n\t" 	/* mm1 = max(mm1, mm0) */
+    "  movd        %%mm1, %0        \n\t"
+    "  andl        $0xffff, %0      \n\t"	/* keep only word 0, the result */
+
+     : "=m" (MaxSad),
+       "+r" (Src1), 
+       "+r" (Src2) 
+     :
+     : "memory"
+  );
+  return MaxSad;
+}
+/* col_sad8x8__mmx: per-column SADs are accumulated separately for rows 0-3 and rows 4-7; returns the largest of those 16 sums. No emms here. */
+static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
+		                    ogg_uint32_t stride)
+{
+  ogg_uint32_t MaxSad;
+
+  __asm__ __volatile__ (
+    "  .balign 16                   \n\t"
+
+    "  pxor        %%mm3, %%mm3     \n\t"	/* zero out mm3 for unpack */
+    "  pxor        %%mm4, %%mm4     \n\t"	/* mm4: rows 0-3, columns 0-3 */
+    "  pxor        %%mm5, %%mm5     \n\t" 	/* mm5: rows 0-3, columns 4-7 */
+    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6: rows 4-7, columns 0-3 */
+    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7: rows 4-7, columns 4-7 */
+    "  mov         $4, %%edi        \n\t"	/* 4 rows */
+    "1:                             \n\t"
+    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
+    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
+
+    "  movq        %%mm0, %%mm2     \n\t"
+    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B, saturated at 0 */
+    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A, saturated at 0 */
+    "  por         %%mm1, %%mm0     \n\t"      	/* one side is 0, so OR gives |A - B| per byte */
+    "  movq        %%mm0, %%mm1     \n\t"
+
+    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
+    "  paddw       %%mm0, %%mm4     \n\t"	/* accumulate difference... */
+    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
+    "  paddw       %%mm1, %%mm5     \n\t"	/* accumulate difference... */
+    "  add         %3, %1           \n\t"	/* advance Src1 by stride */
+    "  add         %3, %2           \n\t"	/* advance Src2 by stride */
+
+    "  dec         %%edi            \n\t"
+    "  jnz 1b                       \n\t"
+
+    "  mov         $4, %%edi        \n\t"	/* 4 rows */
+    "2:                             \n\t"
+    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
+    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
+
+    "  movq        %%mm0, %%mm2     \n\t"
+    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B, saturated at 0 */
+    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A, saturated at 0 */
+    "  por         %%mm1, %%mm0     \n\t"      	/* one side is 0, so OR gives |A - B| per byte */
+    "  movq        %%mm0, %%mm1     \n\t"
+
+    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
+    "  paddw       %%mm0, %%mm6     \n\t"	/* accumulate difference... */
+    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
+    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
+    "  add         %3, %1           \n\t"	/* advance Src1 by stride */
+    "  add         %3, %2           \n\t"	/* advance Src2 by stride */
+
+    "  dec         %%edi            \n\t"
+    "  jnz 2b                       \n\t"
+
+    "  psubusw     %%mm6, %%mm7     \n\t"
+    "  paddw       %%mm6, %%mm7     \n\t" 	/* mm7 = max(mm7, mm6) */
+    "  psubusw     %%mm4, %%mm5     \n\t" 	
+    "  paddw       %%mm4, %%mm5     \n\t" 	/* mm5 = max(mm5, mm4) */
+    "  psubusw     %%mm5, %%mm7     \n\t" 	
+    "  paddw       %%mm5, %%mm7     \n\t" 	/* mm7 = max(mm5, mm7) */
+    "  movq        %%mm7, %%mm6     \n\t"
+    "  psrlq       $32, %%mm6       \n\t"
+    "  psubusw     %%mm6, %%mm7     \n\t" 	
+    "  paddw       %%mm6, %%mm7     \n\t" 	/* words 0,1 = max with words 2,3 */
+    "  movq        %%mm7, %%mm6     \n\t"
+    "  psrlq       $16, %%mm6       \n\t"
+    "  psubusw     %%mm6, %%mm7     \n\t" 	
+    "  paddw       %%mm6, %%mm7     \n\t" 	/* word 0 = max over all four words */
+    "  movd        %%mm7, %0        \n\t"
+    "  andl        $0xffff, %0      \n\t"	/* keep only word 0, the result */
+
+     : "=r" (MaxSad),
+       "+r" (Src1), 
+       "+r" (Src2) 
+     : "r" (stride)
+     : "memory", "edi"
+  );
+
+  return MaxSad;
+}
+/* sad8x8__mmx: returns the sum of |ptr1 - ptr2| over an 8x8 block; rows advance by stride1 and stride2 bytes. No emms is issued here. */
+static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
+		       	    unsigned char *ptr2, ogg_uint32_t stride2)
+{
+  ogg_uint32_t  DiffVal;
+
+  __asm__ __volatile__ (
+    "  .balign 16                   \n\t"
+    "  pxor        %%mm6, %%mm6     \n\t"	/* zero out mm6 for unpack */
+    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 accumulates four 16-bit partial sums */
+    ".rept 8                         \n\t"
+    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
+    "  movq        (%2), %%mm1      \n\t"
+    "  movq        %%mm0, %%mm2     \n\t"
+
+    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B, saturated at 0 */
+    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A, saturated at 0 */
+    "  por         %%mm1, %%mm0     \n\t"      	/* one side is 0, so OR gives |A - B| per byte */
+    "  movq        %%mm0, %%mm1     \n\t"
+
+    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
+    "  paddw       %%mm0, %%mm7     \n\t"	/* accumulate difference... */
+    "  punpckhbw   %%mm6, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
+    "  add         %3, %1           \n\t"	/* advance ptr1 by stride1 */
+    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
+    "  add         %4, %2           \n\t"	/* advance ptr2 by stride2 */
+    ".endr                          \n\t"
+
+    "  movq        %%mm7, %%mm0     \n\t"
+    "  psrlq       $32, %%mm7       \n\t"	/* fold words 2,3 onto words 0,1 */
+    "  paddw       %%mm0, %%mm7     \n\t"
+    "  movq        %%mm7, %%mm0     \n\t"
+    "  psrlq       $16, %%mm7       \n\t"	/* second fold: word 0 holds the total */
+    "  paddw       %%mm0, %%mm7     \n\t"
+    "  movd        %%mm7, %0        \n\t"
+    "  andl        $0xffff, %0      \n\t"	/* keep only word 0, the result */
+
+     : "=m" (DiffVal),
+       "+r" (ptr1), 
+       "+r" (ptr2) 
+     : "r" (stride1),
+       "r" (stride2)
+     : "memory"
+  );
+
+  return DiffVal;
+}
+/* sad8x8_thres__mmx: thresholded-SAD entry; this version does no early exit and returns the full sad8x8__mmx result. */
+static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
+                                       unsigned char *ptr2, ogg_uint32_t stride2,
+                                       ogg_uint32_t thres)
+{
+  return (void) thres, sad8x8__mmx (ptr1, stride1, ptr2, stride2); /* thres is not consulted */
+}
+/* sad8x8_xy2_thres__mmx: SAD of SrcData against the per-byte rounded-down average of RefDataPtr1 and RefDataPtr2 over 8x8; thres is not read. No emms here. */
+static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
+		                      unsigned char *RefDataPtr1,
+			              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
+			              ogg_uint32_t thres)
+{
+  ogg_uint32_t  DiffVal;
+
+  __asm__ __volatile__ (
+    "  .balign 16                   \n\t"
+
+    "  pcmpeqd     %%mm5, %%mm5     \n\t"	/* all ones ... */
+    "  paddb       %%mm5, %%mm5     \n\t"	/* ... doubled per byte: mm5 = fefefefefefefefe */
+   
+    "  pxor        %%mm6, %%mm6     \n\t"	/* zero out mm6 for unpack */
+    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 accumulates four 16-bit partial sums */
+    "  mov         $8, %%edi        \n\t"	/* 8 rows */
+    "1:                             \n\t"
+    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
+
+    "  movq        (%2), %%mm2      \n\t"
+    "  movq        (%3), %%mm3      \n\t"	/* average of mm2 and mm3 = (a & b) + (((a ^ b) & 0xfe) >> 1) */
+    "  movq        %%mm2, %%mm1     \n\t"
+    "  pand        %%mm3, %%mm1     \n\t"
+    "  pxor        %%mm2, %%mm3     \n\t"
+    "  pand        %%mm5, %%mm3     \n\t"	/* clear bit 0 of each byte so the 64-bit shift cannot leak across bytes */
+    "  psrlq       $1, %%mm3        \n\t"
+    "  paddb       %%mm3, %%mm1     \n\t"	/* mm1 = averaged reference */
+
+    "  movq        %%mm0, %%mm2     \n\t"
+
+    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B, saturated at 0 */
+    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A, saturated at 0 */
+    "  por         %%mm1, %%mm0     \n\t"    	/* one side is 0, so OR gives |A - B| per byte */
+    "  movq        %%mm0, %%mm1     \n\t"
+
+    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
+    "  paddw       %%mm0, %%mm7     \n\t"	/* accumulate difference... */
+    "  punpckhbw   %%mm6, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
+    "  add         %4, %1           \n\t"	/* advance SrcData by SrcStride */
+    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
+    "  add         %5, %2           \n\t"	/* advance RefDataPtr1 by RefStride */
+    "  add         %5, %3           \n\t"	/* advance RefDataPtr2 by RefStride */
+
+    "  dec         %%edi            \n\t"
+    "  jnz 1b                       \n\t"
+
+    "  movq        %%mm7, %%mm0     \n\t"
+    "  psrlq       $32, %%mm7       \n\t"	/* fold words 2,3 onto words 0,1 */
+    "  paddw       %%mm0, %%mm7     \n\t"
+    "  movq        %%mm7, %%mm0     \n\t"
+    "  psrlq       $16, %%mm7       \n\t"	/* second fold: word 0 holds the total */
+    "  paddw       %%mm0, %%mm7     \n\t"
+    "  movd        %%mm7, %0        \n\t"
+    "  andl        $0xffff, %0      \n\t"	/* keep only word 0, the result */
+
+     : "=m" (DiffVal),
+       "+r" (SrcData), 
+       "+r" (RefDataPtr1), 
+       "+r" (RefDataPtr2) 
+     : "m" (SrcStride),
+       "m" (RefStride)
+     : "edi", "memory"
+  );
+
+  return DiffVal;
+}
+/* intra8x8_err__mmx: over the 8x8 bytes at DataPtr (row pitch Stride) returns 64*sum(x*x) - sum(x)*sum(x). No emms is issued here. */
+static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
+{
+  ogg_uint32_t  XSum;
+  ogg_uint32_t  XXSum;
+
+  __asm__ __volatile__ (
+    "  .balign 16                   \n\t"
+
+    "  pxor        %%mm5, %%mm5     \n\t"	/* mm5: four 16-bit partial sums of x */
+    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 = 0 for unpack */
+    "  pxor        %%mm7, %%mm7     \n\t"	/* mm7: two 32-bit partial sums of x*x */
+    "  mov         $8, %%edi        \n\t"	/* 8 rows */
+    "1:                             \n\t"
+    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
+    "  movq        %%mm0, %%mm2     \n\t"
+
+    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* low 4 bytes -> words */
+    "  punpckhbw   %%mm6, %%mm2     \n\t"	/* high 4 bytes -> words */
+
+    "  paddw       %%mm0, %%mm5     \n\t"
+    "  paddw       %%mm2, %%mm5     \n\t"
+
+    "  pmaddwd     %%mm0, %%mm0     \n\t"	/* squares, pairwise summed into dwords */
+    "  pmaddwd     %%mm2, %%mm2     \n\t"
+    
+    "  paddd       %%mm0, %%mm7     \n\t"
+    "  paddd       %%mm2, %%mm7     \n\t"
+
+    "  add         %3, %2           \n\t"	/* Inc pointer into src data */
+
+    "  dec         %%edi            \n\t"
+    "  jnz 1b                       \n\t"
+
+    "  movq        %%mm5, %%mm0     \n\t"
+    "  psrlq       $32, %%mm5       \n\t"	/* fold words 2,3 onto words 0,1 */
+    "  paddw       %%mm0, %%mm5     \n\t"
+    "  movq        %%mm5, %%mm0     \n\t"
+    "  psrlq       $16, %%mm5       \n\t"	/* second fold: word 0 holds sum(x) */
+    "  paddw       %%mm0, %%mm5     \n\t"
+    "  movd        %%mm5, %%edi     \n\t"
+    "  movsx       %%di, %%edi      \n\t"	/* sign-extend the 16-bit sum to 32 bits */
+    "  movl        %%edi, %0        \n\t"
+
+    "  movq        %%mm7, %%mm0     \n\t"
+    "  psrlq       $32, %%mm7       \n\t"	/* add the two dword partial sums */
+    "  paddd       %%mm0, %%mm7     \n\t"
+    "  movd        %%mm7, %1        \n\t"
+
+     : "=r" (XSum),
+       "=r" (XXSum),
+       "+r" (DataPtr) 
+     : "r" (Stride)
+     : "edi", "memory"
+  );
+
+  /* Compute population variance as mis-match metric. */
+  return (( (XXSum<<6) - XSum*XSum ) );
+}
+/* inter8x8_err__mmx: with d = SrcData - RefDataPtr per pixel over 8x8, returns 64*sum(d*d) - sum(d)*sum(d). No emms is issued here. */
+static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
+		                 unsigned char *RefDataPtr, ogg_uint32_t RefStride)
+{
+  ogg_uint32_t  XSum;
+  ogg_uint32_t  XXSum;
+
+  __asm__ __volatile__ (
+    "  .balign 16                   \n\t"
+
+    "  pxor        %%mm5, %%mm5     \n\t"	/* mm5: four 16-bit partial sums of d */
+    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 = 0 for unpack */
+    "  pxor        %%mm7, %%mm7     \n\t"	/* mm7: two 32-bit partial sums of d*d */
+    "  mov         $8, %%edi        \n\t"	/* 8 rows */
+    "1:                             \n\t"
+    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
+    "  movq        (%3), %%mm1      \n\t"
+    "  movq        %%mm0, %%mm2     \n\t"
+    "  movq        %%mm1, %%mm3     \n\t"
+
+    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* widen bytes to words */
+    "  punpcklbw   %%mm6, %%mm1     \n\t"
+    "  punpckhbw   %%mm6, %%mm2     \n\t"
+    "  punpckhbw   %%mm6, %%mm3     \n\t"
+
+    "  psubsw      %%mm1, %%mm0     \n\t"	/* d = src - ref, low 4 */
+    "  psubsw      %%mm3, %%mm2     \n\t"	/* d = src - ref, high 4 */
+
+    "  paddw       %%mm0, %%mm5     \n\t"
+    "  paddw       %%mm2, %%mm5     \n\t"
+
+    "  pmaddwd     %%mm0, %%mm0     \n\t"	/* squares, pairwise summed into dwords */
+    "  pmaddwd     %%mm2, %%mm2     \n\t"
+    
+    "  paddd       %%mm0, %%mm7     \n\t"
+    "  paddd       %%mm2, %%mm7     \n\t"
+
+    "  add         %4, %2           \n\t"	/* Inc pointer into src data */
+    "  add         %5, %3           \n\t"	/* Inc pointer into ref data */
+
+    "  dec         %%edi            \n\t"
+    "  jnz 1b                       \n\t"
+
+    "  movq        %%mm5, %%mm0     \n\t"
+    "  psrlq       $32, %%mm5       \n\t"	/* fold words 2,3 onto words 0,1 */
+    "  paddw       %%mm0, %%mm5     \n\t"
+    "  movq        %%mm5, %%mm0     \n\t"
+    "  psrlq       $16, %%mm5       \n\t"	/* second fold: word 0 holds sum(d) */
+    "  paddw       %%mm0, %%mm5     \n\t"
+    "  movd        %%mm5, %%edi     \n\t"
+    "  movsx       %%di, %%edi      \n\t"	/* sign-extend: the 16-bit sum may be negative */
+    "  movl        %%edi, %0        \n\t"
+
+    "  movq        %%mm7, %%mm0     \n\t"
+    "  psrlq       $32, %%mm7       \n\t"	/* add the two dword partial sums */
+    "  paddd       %%mm0, %%mm7     \n\t"
+    "  movd        %%mm7, %1        \n\t"
+
+     : "=m" (XSum),
+       "=m" (XXSum),
+       "+r" (SrcData), 
+       "+r" (RefDataPtr) 
+     : "m" (SrcStride),
+       "m" (RefStride)
+     : "edi", "memory"
+  );
+
+  /* Compute and return population variance as mis-match metric. */
+  return (( (XXSum<<6) - XSum*XSum ));
+}
+/* inter8x8_err_xy2__mmx: with d = SrcData - rounded-down average of the two references, returns 64*sum(d*d) - sum(d)*sum(d) over 8x8. No emms here. */
+static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
+		                     unsigned char *RefDataPtr1,
+				     unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
+{
+  ogg_uint32_t XSum;
+  ogg_uint32_t XXSum;
+
+  __asm__ __volatile__ (
+    "  .balign 16                   \n\t"
+
+    "  pcmpeqd     %%mm4, %%mm4     \n\t"	/* all ones ... */
+    "  paddb       %%mm4, %%mm4     \n\t"	/* ... doubled per byte: mm4 = fefefefefefefefe */
+    "  pxor        %%mm5, %%mm5     \n\t"	/* mm5: four 16-bit partial sums of d */
+    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 = 0 for unpack */
+    "  pxor        %%mm7, %%mm7     \n\t"	/* mm7: two 32-bit partial sums of d*d */
+    "  mov         $8, %%edi        \n\t"	/* 8 rows */
+    "1:                             \n\t"
+    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
+
+    "  movq        (%3), %%mm2      \n\t"
+    "  movq        (%4), %%mm3      \n\t"	/* average of mm2 and mm3 = (a & b) + (((a ^ b) & 0xfe) >> 1) */
+    "  movq        %%mm2, %%mm1     \n\t"
+    "  pand        %%mm3, %%mm1     \n\t"
+    "  pxor        %%mm2, %%mm3     \n\t"
+    "  pand        %%mm4, %%mm3     \n\t"	/* clear bit 0 of each byte so the 64-bit shift cannot leak across bytes */
+    "  psrlq       $1, %%mm3        \n\t"
+    "  paddb       %%mm3, %%mm1     \n\t"	/* mm1 = averaged reference */
+
+    "  movq        %%mm0, %%mm2     \n\t"
+    "  movq        %%mm1, %%mm3     \n\t"
+
+    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* widen bytes to words */
+    "  punpcklbw   %%mm6, %%mm1     \n\t"
+    "  punpckhbw   %%mm6, %%mm2     \n\t"
+    "  punpckhbw   %%mm6, %%mm3     \n\t"
+
+    "  psubsw      %%mm1, %%mm0     \n\t"	/* d = src - avg, low 4 */
+    "  psubsw      %%mm3, %%mm2     \n\t"	/* d = src - avg, high 4 */
+
+    "  paddw       %%mm0, %%mm5     \n\t"
+    "  paddw       %%mm2, %%mm5     \n\t"
+
+    "  pmaddwd     %%mm0, %%mm0     \n\t"	/* squares, pairwise summed into dwords */
+    "  pmaddwd     %%mm2, %%mm2     \n\t"
+    
+    "  paddd       %%mm0, %%mm7     \n\t"
+    "  paddd       %%mm2, %%mm7     \n\t"
+
+    "  add         %5, %2           \n\t"	/* Inc pointer into src data */
+    "  add         %6, %3           \n\t"	/* Inc pointer into ref data */
+    "  add         %6, %4           \n\t"	/* Inc pointer into ref data */
+
+    "  dec         %%edi            \n\t"
+    "  jnz 1b                       \n\t"
+
+    "  movq        %%mm5, %%mm0     \n\t"
+    "  psrlq       $32, %%mm5       \n\t"	/* fold words 2,3 onto words 0,1 */
+    "  paddw       %%mm0, %%mm5     \n\t"
+    "  movq        %%mm5, %%mm0     \n\t"
+    "  psrlq       $16, %%mm5       \n\t"	/* second fold: word 0 holds sum(d) */
+    "  paddw       %%mm0, %%mm5     \n\t"
+    "  movd        %%mm5, %%edi     \n\t"
+    "  movsx       %%di, %%edi      \n\t"	/* sign-extend: the 16-bit sum may be negative */
+    "  movl        %%edi, %0        \n\t"
+
+    "  movq        %%mm7, %%mm0     \n\t"
+    "  psrlq       $32, %%mm7       \n\t"	/* add the two dword partial sums */
+    "  paddd       %%mm0, %%mm7     \n\t"
+    "  movd        %%mm7, %1        \n\t"
+
+     : "=m" (XSum),
+       "=m" (XXSum),
+       "+r" (SrcData), 
+       "+r" (RefDataPtr1),
+       "+r" (RefDataPtr2) 
+     : "m" (SrcStride),
+       "m" (RefStride)
+     : "edi", "memory"
+  );
+
+  /* Compute and return population variance as mis-match metric. */
+  return (( (XXSum<<6) - XSum*XSum ));
+}
+/* restore_fpu: issue emms so the register file shared with MMX is usable for x87 floating point again. */
+static void restore_fpu (void)
+{
+  /* Single instruction, no operands and no declared clobbers. */
+  __asm__ __volatile__ ("  emms                         \n\t");
+  return;
+}
+/* dsp_i386_mmx_init: fill the supplied function table with the MMX routines defined in this file. */
+void dsp_i386_mmx_init(DspFunctions *funcs)
+{
+  funcs->sub8x8           = &sub8x8__mmx;
+  funcs->sub8x8_128       = &sub8x8_128__mmx;
+  funcs->sub8x8avg2       = &sub8x8avg2__mmx;
+  funcs->row_sad8         = &row_sad8__mmx;
+  funcs->col_sad8x8       = &col_sad8x8__mmx;
+  funcs->sad8x8           = &sad8x8__mmx;
+  funcs->sad8x8_thres     = &sad8x8_thres__mmx;
+  funcs->sad8x8_xy2_thres = &sad8x8_xy2_thres__mmx;
+  funcs->intra8x8_err     = &intra8x8_err__mmx;
+  funcs->inter8x8_err     = &inter8x8_err__mmx;
+  funcs->inter8x8_err_xy2 = &inter8x8_err_xy2__mmx;
+  funcs->restore_fpu      = &restore_fpu;
+}
+
Index: lib/i386/recon_mmx.c
===================================================================
--- lib/i386/recon_mmx.c	(revision 0)
+++ lib/i386/recon_mmx.c	(revision 0)
@@ -0,0 +1,185 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: reconstruct.c,v 1.6 2003/12/03 08:59:41 arc Exp $
+
+ ********************************************************************/
+
+#include "encoder_internal.h"
+
+static const __attribute__ ((aligned(8))) ogg_int64_t V128 = 0x8080808080808080LL; /* 0x80 in each of the 8 bytes */
+/* M(sym): string naming sym at assembler level -- "_sym" on the targets tested below, "sym" otherwise. */
+#if defined(__MINGW32__) || defined(__CYGWIN__) || \
+	    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
+# define M(a) "_" #a
+#else
+# define M(a) #a
+#endif
+
+/* Copy an 8x8 block of bytes from src to dest; both use the row pitch `stride`.
+ * Uses mm0-mm3 and leaves the MMX state live (no emms is issued here). */
+static void copy8x8__mmx (unsigned char *src, unsigned char *dest, unsigned int stride)
+{
+  __asm__ __volatile__ (
+    "  .balign 16                      \n\t"
+
+    "  lea         (%2, %2, 2), %%edi  \n\t" /* edi = 3 * stride */
+
+    "  movq        (%1), %%mm0         \n\t" /* load rows 0-3 */
+    "  movq        (%1, %2), %%mm1     \n\t"
+    "  movq        (%1, %2, 2), %%mm2  \n\t"
+    "  movq        (%1, %%edi), %%mm3  \n\t"
+
+    "  lea         (%1, %2, 4), %1     \n\t" /* src += 4 * stride */
+
+    "  movq        %%mm0, (%0)         \n\t" /* store rows 0-3 */
+    "  movq        %%mm1, (%0, %2)     \n\t"
+    "  movq        %%mm2, (%0, %2, 2)  \n\t"
+    "  movq        %%mm3, (%0, %%edi)  \n\t"
+
+    "  lea         (%0, %2, 4), %0     \n\t" /* dest += 4 * stride */
+
+    "  movq        (%1), %%mm0         \n\t" /* load rows 4-7 */
+    "  movq        (%1, %2), %%mm1     \n\t"
+    "  movq        (%1, %2, 2), %%mm2  \n\t"
+    "  movq        (%1, %%edi), %%mm3  \n\t"
+
+    "  movq        %%mm0, (%0)         \n\t" /* store rows 4-7 */
+    "  movq        %%mm1, (%0, %2)     \n\t"
+    "  movq        %%mm2, (%0, %2, 2)  \n\t"
+    "  movq        %%mm3, (%0, %%edi)  \n\t"
+      : "+a" (dest),
+        "+c" (src)	/* read-write: the asm advances src as well as dest */
+      : "d" (stride)
+      : "memory", "edi"
+  );
+}
+
+/* Intra reconstruction of one 8x8 block: each output byte is the ChangePtr value + 128,
+ * saturated to 0..255.  ChangePtr: 64 contiguous values; ReconPtr rows: LineStep bytes apart. */
+static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
+		      ogg_uint32_t LineStep)
+{
+  __asm__ __volatile__ (
+    "  .balign 16                      \n\t"
+    "  movq        %3, %%mm0           \n\t" /* Set mm0 to 0x8080808080808080 */
+    "  lea         128(%1), %%edi      \n\t" /* Endpoint in input buffer */
+    "1:                                \n\t" 
+    "  movq         (%1), %%mm2        \n\t" /* First four input values */
+    "  packsswb    8(%1), %%mm2        \n\t" /* pack with next(high) four values, signed-saturated to bytes */
+    "  por         %%mm0, %%mm0        \n\t" /* leaves mm0 unchanged; kept from the original sequence */
+    "  pxor        %%mm0, %%mm2        \n\t" /* Convert result to unsigned (same as add 128) */
+    "  lea         16(%1), %1          \n\t" /* Step source buffer */
+    "  cmp         %%edi, %1           \n\t" /* are we done */
+
+    "  movq        %%mm2, (%0)         \n\t" /* store results */
+
+    "  lea         (%0, %2), %0        \n\t" /* Step output buffer */
+    "  jc          1b                  \n\t" /* Loop while %1 < endpoint; movq/lea left the cmp flags intact */
+      : "+r" (ReconPtr),
+        "+r" (ChangePtr)	/* read-write: stepped by the loop */
+      : "r" (LineStep),
+        "m" (V128)		/* passed as an operand so the compiler sees that V128 is used */
+      : "memory", "edi"
+  );
+}
+
+/* Inter reconstruction of one 8x8 block: output byte = RefPtr byte + ChangePtr value, saturated
+ * to 0..255.  ChangePtr: 64 contiguous values; RefPtr and ReconPtr rows: LineStep bytes apart. */
+static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
+		      ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
+{
+  __asm__ __volatile__ (
+    "  .balign 16                      \n\t"
+    "  pxor        %%mm0, %%mm0        \n\t" /* mm0 = 0, used to widen bytes to words */
+    "  lea         128(%1), %%edi      \n\t" /* end of the 64 change values */
+
+    "1:                                \n\t"
+    "  movq        (%2), %%mm2         \n\t" /* (+3 misaligned) 8 reference pixels */
+    "  movq        (%1), %%mm4         \n\t" /* first 4 changes */
+    "  movq        %%mm2, %%mm3        \n\t"
+    "  movq        8(%1), %%mm5        \n\t" /* last 4 changes */
+    "  punpcklbw   %%mm0, %%mm2        \n\t" /* turn first 4 refs into positive 16-bit #s */
+    "  paddsw      %%mm4, %%mm2        \n\t" /* add in first 4 changes */
+    "  punpckhbw   %%mm0, %%mm3        \n\t" /* turn last 4 refs into positive 16-bit #s */
+    "  paddsw      %%mm5, %%mm3        \n\t" /* add in last 4 changes */
+    "  add         %3, %2              \n\t" /* next row of reference pixels */
+    "  packuswb    %%mm3, %%mm2        \n\t" /* pack result to unsigned 8-bit values */
+    "  lea         16(%1), %1          \n\t" /* next row of changes */
+    "  cmp         %%edi, %1           \n\t" /* are we done? */
+
+    "  movq        %%mm2, (%0)         \n\t" /* store result */
+
+    "  lea         (%0, %3), %0        \n\t" /* next row of output */
+    "  jc          1b                  \n\t" /* loop while %1 < end; movq/lea left the cmp flags intact */
+      : "+r" (ReconPtr),
+        "+r" (ChangePtr),	/* read-write: stepped by the loop */
+        "+r" (RefPtr)		/* read-write: stepped by the loop */
+      : "r" (LineStep)
+      : "memory", "edi"
+  );
+}
+
+/* Half-pixel inter reconstruction of one 8x8 block: output byte = ((RefPtr1 + RefPtr2) >> 1)
+ * + ChangePtr value, packed with unsigned saturation.  All byte rows are LineStep apart. */
+static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
+		           unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
+			   ogg_uint32_t LineStep)
+{
+  __asm__ __volatile__ (
+    "  .balign 16                      \n\t"
+    "  pxor        %%mm0, %%mm0        \n\t" /* mm0 = 0, used to widen bytes to words */
+    "  lea         128(%1), %%edi      \n\t" /* end of the 64 change values */
+
+    "1:                                \n\t"
+    "  movq        (%2), %%mm2         \n\t" /* (+3 misaligned) 8 reference pixels */
+    "  movq        (%3), %%mm4         \n\t" /* (+3 misaligned) 8 reference pixels */
+    "  movq        %%mm2, %%mm3        \n\t"
+    "  punpcklbw   %%mm0, %%mm2        \n\t" /* mm2 = start ref1 as positive 16-bit #s */
+    "  movq        %%mm4, %%mm5        \n\t"
+    "  movq        (%1), %%mm6         \n\t" /* first 4 changes */
+    "  punpckhbw   %%mm0, %%mm3        \n\t" /* mm3 = end ref1 as positive 16-bit #s */
+    "  movq        8(%1), %%mm7        \n\t" /* last 4 changes */
+    "  punpcklbw   %%mm0, %%mm4        \n\t" /* mm4 = start ref2 as positive 16-bit #s */
+    "  punpckhbw   %%mm0, %%mm5        \n\t" /* mm5 = end ref2 as positive 16-bit #s */
+    "  paddw       %%mm4, %%mm2        \n\t" /* mm2 = start (ref1 + ref2) */
+    "  paddw       %%mm5, %%mm3        \n\t" /* mm3 = end (ref1 + ref2) */
+    "  psrlw       $1, %%mm2           \n\t" /* mm2 = start (ref1 + ref2)/2 */
+    "  psrlw       $1, %%mm3           \n\t" /* mm3 = end (ref1 + ref2)/2 */
+    "  paddw       %%mm6, %%mm2        \n\t" /* add changes to start */
+    "  paddw       %%mm7, %%mm3        \n\t" /* add changes to end */
+    "  lea         16(%1), %1          \n\t" /* next row of changes */
+    "  packuswb    %%mm3, %%mm2        \n\t" /* pack start|end to unsigned 8-bit */
+    "  add         %4, %2              \n\t" /* next row of reference pixels */
+    "  add         %4, %3              \n\t" /* next row of reference pixels */
+    "  movq        %%mm2, (%0)         \n\t" /* store result */
+    "  add         %4, %0              \n\t" /* next row of output */
+    "  cmp         %%edi, %1           \n\t" /* are we done? (must follow the adds, which change flags) */
+    "  jc          1b                  \n\t"
+      : "+r" (ReconPtr),
+        "+r" (ChangePtr),	/* read-write: stepped by the loop */
+        "+r" (RefPtr1),		/* read-write: stepped by the loop */
+        "+r" (RefPtr2)		/* read-write: stepped by the loop */
+      : "m" (LineStep)
+      : "memory", "edi"
+  );
+}
+
+/* Install this file's MMX copy/reconstruction routines into the caller's DSP function table. */
+void dsp_i386_mmx_recon_init(DspFunctions *funcs) {
+  funcs->recon_inter8x8_half = recon_inter8x8_half__mmx;
+  funcs->recon_inter8x8      = recon_inter8x8__mmx;
+  funcs->recon_intra8x8      = recon_intra8x8__mmx;
+  funcs->copy8x8             = copy8x8__mmx;
+}
+
Index: lib/i386/dsp_mmxext.c
===================================================================
--- lib/i386/dsp_mmxext.c	(revision 0)
+++ lib/i386/dsp_mmxext.c	(revision 0)
@@ -0,0 +1,316 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include "dsp.h"
+
+static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
+		       	    unsigned char *ptr2, ogg_uint32_t stride2)
+{
+  ogg_uint32_t  DiffVal;
+  /* Returns the sum of absolute byte differences over 8 rows of 8 bytes (ptr1 vs ptr2). */
+  __asm__ __volatile__ (
+    "  .balign 16                   \n\t"
+    "  pxor %%mm7, %%mm7            \n\t" 	/* mm7 contains the result */
+    /* Rows 0..6: difference one row, then step each pointer by its own stride. */
+    ".rept 7                        \n\t"
+    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
+    "  movq (%2), %%mm1             \n\t"
+    "  psadbw %%mm1, %%mm0          \n\t"	/* mm0 = sum of |a-b| over the 8 byte pairs */
+    "  add %3, %1                   \n\t"	/* Inc pointer into the new data */
+    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
+    "  add %4, %2                   \n\t"	/* Inc pointer into ref data */
+    ".endr                          \n\t"
+    /* Row 7: same computation, but the pointers are not advanced again. */
+    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
+    "  movq (%2), %%mm1             \n\t"
+    "  psadbw %%mm1, %%mm0          \n\t"
+    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
+    "  movd %%mm7, %0               \n\t"	/* low 32 bits of mm7 -> DiffVal */
+
+     : "=r" (DiffVal),
+       "+r" (ptr1), 
+       "+r" (ptr2) 
+     : "r" (stride1),
+       "r" (stride2)
+     : "memory"
+  );
+
+  return DiffVal;
+}
+
+static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
+		       		  unsigned char *ptr2, ogg_uint32_t stride2, 
+			   	  ogg_uint32_t thres)
+{
+  ogg_uint32_t  DiffVal;
+  /* Same 8x8 SAD as sad8x8__mmxext; thres is not referenced, so all 8 rows are always summed. */
+  __asm__ __volatile__ (
+    "  .balign 16                   \n\t"
+    "  pxor %%mm7, %%mm7            \n\t" 	/* mm7 contains the result */
+
+    ".rept 8                        \n\t"
+    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
+    "  movq (%2), %%mm1             \n\t"
+    "  psadbw %%mm1, %%mm0          \n\t"	/* mm0 = sum of |a-b| over the 8 byte pairs */
+    "  add %3, %1                   \n\t"	/* Inc pointer into the new data */
+    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
+    "  add %4, %2                   \n\t"	/* Inc pointer into ref data */
+    ".endr                          \n\t"
+
+    "  movd %%mm7, %0               \n\t"	/* low 32 bits of mm7 -> DiffVal */
+
+     : "=r" (DiffVal),
+       "+r" (ptr1), 
+       "+r" (ptr2) 
+     : "r" (stride1),
+       "r" (stride2)
+     : "memory"
+  );
+
+  return DiffVal;
+}
+
+static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
+		                      unsigned char *RefDataPtr1,
+			              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
+			              ogg_uint32_t thres)
+{
+  ogg_uint32_t  DiffVal;
+  /* 8x8 SAD between SrcData and the byte-wise average of the two references; thres is not referenced. */
+  __asm__ __volatile__ (
+    "  .balign 16                   \n\t"
+    "  pxor %%mm7, %%mm7            \n\t" 	/* mm7 contains the result */
+    ".rept 8                        \n\t"
+    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
+    "  movq (%2), %%mm1             \n\t"
+    "  movq (%3), %%mm2             \n\t"
+    "  pavgb %%mm2, %%mm1           \n\t"	/* mm1 = (ref1 + ref2 + 1) >> 1 per byte (pavgb rounds up) */
+    "  psadbw %%mm1, %%mm0          \n\t"	/* mm0 = sum of |src - avg| over the 8 bytes */
+
+    "  add %4, %1                   \n\t"	/* Inc pointer into the new data */
+    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
+    "  add %5, %2                   \n\t"	/* Inc pointer into ref data */
+    "  add %5, %3                   \n\t"	/* Inc pointer into ref data */
+    ".endr                          \n\t"
+
+    "  movd %%mm7, %0               \n\t"	/* store low 32 bits of mm7 into DiffVal (memory operand) */
+     : "=m" (DiffVal),
+       "+r" (SrcData), 
+       "+r" (RefDataPtr1), 
+       "+r" (RefDataPtr2) 
+     : "m" (SrcStride),
+       "m" (RefStride)
+     : "memory"
+  );
+
+  return DiffVal;
+}
+		
+static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
+{
+  ogg_uint32_t MaxSad;
+  /* Returns the larger of two 4-byte SADs between Src1 and Src2: bytes 0..3 and bytes 4..7. */
+  __asm__ __volatile__ (
+    "  .balign 16                   \n\t"
+
+    "  movd        (%1), %%mm0      \n\t"	/* bytes 0..3 of Src1; upper half of mm0 is zeroed */
+    "  movd        (%2), %%mm1      \n\t"	/* bytes 0..3 of Src2 */
+    "  psadbw      %%mm0, %%mm1     \n\t"	/* mm1 = SAD of bytes 0..3 */
+    "  movd        4(%1), %%mm2     \n\t"	/* bytes 4..7 of Src1 */
+    "  movd        4(%2), %%mm3     \n\t"	/* bytes 4..7 of Src2 */
+    "  psadbw      %%mm2, %%mm3     \n\t"	/* mm3 = SAD of bytes 4..7 */
+
+    "  pmaxsw      %%mm1, %%mm3     \n\t"	/* keep the larger of the two sums */
+    "  movd        %%mm3, %0        \n\t"
+    "  andl        $0xffff, %0      \n\t"	/* keep only the low 16-bit word */
+
+     : "=m" (MaxSad),
+       "+r" (Src1), 
+       "+r" (Src2) 
+     :
+     : "memory"
+  );
+
+  return MaxSad;
+}
+
+static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
+		                    ogg_uint32_t stride)
+{
+  ogg_uint32_t MaxSad;
+  /* Per-column SADs are summed separately over rows 0-3 and rows 4-7; returns the largest of the 16 sums. */
+  __asm__ __volatile__ (
+    "  .balign 16                   \n\t"
+
+    "  pxor        %%mm3, %%mm3     \n\t"	/* zero out mm3 for unpack */
+    "  pxor        %%mm4, %%mm4     \n\t"	/* mm4 low sum (rows 0-3, columns 0-3) */
+    "  pxor        %%mm5, %%mm5     \n\t" 	/* mm5 high sum (rows 0-3, columns 4-7) */
+    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 low sum (rows 4-7, columns 0-3) */
+    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 high sum (rows 4-7, columns 4-7) */
+    "  mov         $4, %%edi        \n\t"	/* 4 rows */
+    "1:                             \n\t"
+    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
+    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
+
+    "  movq        %%mm0, %%mm2     \n\t"
+    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
+    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
+    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
+    "  movq        %%mm0, %%mm1     \n\t"
+
+    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
+    "  paddw       %%mm0, %%mm4     \n\t"	/* accumulate difference... */
+    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
+    "  paddw       %%mm1, %%mm5     \n\t"	/* accumulate difference... */
+    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
+    "  add         %3, %2           \n\t"	/* Inc second pointer by the same stride */
+
+    "  dec         %%edi            \n\t"
+    "  jnz 1b                       \n\t"
+    /* Second pass: rows 4-7, accumulated into mm6/mm7 instead of mm4/mm5. */
+    "  mov         $4, %%edi        \n\t"	/* 4 rows */
+    "2:                             \n\t"
+    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
+    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
+
+    "  movq        %%mm0, %%mm2     \n\t"
+    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
+    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
+    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
+    "  movq        %%mm0, %%mm1     \n\t"
+
+    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
+    "  paddw       %%mm0, %%mm6     \n\t"	/* accumulate difference... */
+    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
+    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
+    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
+    "  add         %3, %2           \n\t"	/* Inc second pointer by the same stride */
+
+    "  dec         %%edi            \n\t"
+    "  jnz 2b                       \n\t"
+    /* Reduce: word-wise max of the four accumulators, then max across the four words of mm7. */
+    "  pmaxsw      %%mm6, %%mm7     \n\t"
+    "  pmaxsw      %%mm4, %%mm5     \n\t"
+    "  pmaxsw      %%mm5, %%mm7     \n\t"
+    "  movq        %%mm7, %%mm6     \n\t"
+    "  psrlq       $32, %%mm6       \n\t"	/* bring words 2,3 down beside words 0,1 */
+    "  pmaxsw      %%mm6, %%mm7     \n\t"
+    "  movq        %%mm7, %%mm6     \n\t"
+    "  psrlq       $16, %%mm6       \n\t"	/* bring word 1 down beside word 0 */
+    "  pmaxsw      %%mm6, %%mm7     \n\t"
+    "  movd        %%mm7, %0        \n\t"
+    "  andl        $0xffff, %0      \n\t"	/* word 0 holds the overall maximum */
+
+     : "=r" (MaxSad),
+       "+r" (Src1), 
+       "+r" (Src2) 
+     : "r" (stride)
+     : "memory", "edi"
+  );
+
+  return MaxSad;
+}
+
+static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
+		                     unsigned char *RefDataPtr1,
+				     unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
+{
+  ogg_uint32_t XSum;
+  ogg_uint32_t XXSum;
+  /* With d = src - pavgb(ref1, ref2) per pixel over 8 rows of 8: XSum = sum(d), XXSum = sum(d*d). */
+  __asm__ __volatile__ (
+    "  .balign 16                   \n\t"
+
+    "  pxor        %%mm4, %%mm4     \n\t"	/* zero, for unpacking the averaged reference */
+    "  pxor        %%mm5, %%mm5     \n\t"	/* four 16-bit partial sums of d */
+    "  pxor        %%mm6, %%mm6     \n\t"	/* zero, for unpacking the source */
+    "  pxor        %%mm7, %%mm7     \n\t"	/* two 32-bit partial sums of d*d */
+    "  mov         $8, %%edi        \n\t"	/* 8 rows */
+    "1:                             \n\t"
+    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
+
+    "  movq        (%3), %%mm2      \n\t"
+    "  movq        (%4), %%mm1      \n\t"	/* take average of mm2 and mm1 */
+    "  pavgb       %%mm2, %%mm1     \n\t"	/* (ref1 + ref2 + 1) >> 1 per byte (pavgb rounds up) */
+
+    "  movq        %%mm0, %%mm2     \n\t"
+    "  movq        %%mm1, %%mm3     \n\t"
+
+    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* widen to 16 bits: src low 4 */
+    "  punpcklbw   %%mm4, %%mm1     \n\t"	/* avg low 4 */
+    "  punpckhbw   %%mm6, %%mm2     \n\t"	/* src high 4 */
+    "  punpckhbw   %%mm4, %%mm3     \n\t"	/* avg high 4 */
+
+    "  psubsw      %%mm1, %%mm0     \n\t"	/* d for pixels 0..3 */
+    "  psubsw      %%mm3, %%mm2     \n\t"	/* d for pixels 4..7 */
+
+    "  paddw       %%mm0, %%mm5     \n\t"
+    "  paddw       %%mm2, %%mm5     \n\t"
+
+    "  pmaddwd     %%mm0, %%mm0     \n\t"	/* d*d, adjacent pairs summed into dwords */
+    "  pmaddwd     %%mm2, %%mm2     \n\t"
+    
+    "  paddd       %%mm0, %%mm7     \n\t"
+    "  paddd       %%mm2, %%mm7     \n\t"
+
+    "  add         %5, %2           \n\t"	/* Inc pointer into src data */
+    "  add         %6, %3           \n\t"	/* Inc pointer into ref data */
+    "  add         %6, %4           \n\t"	/* Inc pointer into ref data */
+
+    "  dec         %%edi            \n\t"
+    "  jnz 1b                       \n\t"
+
+    "  movq        %%mm5, %%mm0     \n\t"	/* fold the four word sums of mm5 into word 0 */
+    "  psrlq       $32, %%mm5       \n\t"
+    "  paddw       %%mm0, %%mm5     \n\t"
+    "  movq        %%mm5, %%mm0     \n\t"
+    "  psrlq       $16, %%mm5       \n\t"
+    "  paddw       %%mm0, %%mm5     \n\t"
+    "  movd        %%mm5, %%edi     \n\t"
+    "  movsx       %%di, %%edi      \n\t"	/* sign-extend the 16-bit total to 32 bits */
+    "  movl        %%edi, %0        \n\t"
+
+    "  movq        %%mm7, %%mm0     \n\t"	/* fold the two dword sums of mm7 */
+    "  psrlq       $32, %%mm7       \n\t"
+    "  paddd       %%mm0, %%mm7     \n\t"
+    "  movd        %%mm7, %1        \n\t"
+
+     : "=m" (XSum),
+       "=m" (XXSum),
+       "+r" (SrcData), 
+       "+r" (RefDataPtr1),
+       "+r" (RefDataPtr2) 
+     : "m" (SrcStride),
+       "m" (RefStride)
+     : "edi", "memory"
+  );
+
+  /* Compute and return population variance as mis-match metric. */
+  return (( (XXSum<<6) - XSum*XSum ));
+}
+
+/* Install this file's __mmxext routines into the caller's DSP function table. */
+void dsp_i386_mmxext_init(DspFunctions *funcs) {
+  funcs->sad8x8           = sad8x8__mmxext;
+  funcs->sad8x8_thres     = sad8x8_thres__mmxext;
+  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;
+  funcs->row_sad8         = row_sad8__mmxext;
+  funcs->col_sad8x8       = col_sad8x8__mmxext;
+  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext;
+}
+
Index: lib/dct.c
===================================================================
--- lib/dct.c	(revision 7621)
+++ lib/dct.c	(working copy)
@@ -16,6 +16,7 @@
  ********************************************************************/
 
 #include "encoder_internal.h"
+#include "cpu.h"
 
 static ogg_int32_t xC1S7 = 64277;
 static ogg_int32_t xC2S6 = 60547;
@@ -28,7 +29,7 @@
 #define SIGNBITDUPPED(X) ((signed )(((X) & 0x80000000)) >> 31)
 #define DOROUND(X) ( (SIGNBITDUPPED(X) & (0xffff)) + (X) )
 
-void fdct_short ( ogg_int16_t * InputData, ogg_int16_t * OutputData ){
+static void fdct_short__c ( ogg_int16_t * InputData, ogg_int16_t * OutputData ){
   int loop;
 
   ogg_int32_t  is07, is12, is34, is56;
@@ -251,3 +252,12 @@
     op ++;
   }
 }
+
+/* Set funcs->fdct_short to the C version, then run the MMX fdct init on the same table if CPU_X86_MMX is set. */
+void dsp_dct_init (DspFunctions *funcs) {
+  funcs->fdct_short = fdct_short__c;
+  if (cpu_flags & CPU_X86_MMX) {
+    dsp_i386_mmx_fdct_init(funcs);	/* the caller's table, not the global dsp_funcs */
+  }
+}
+
Index: lib/mcomp.c
===================================================================
--- lib/mcomp.c	(revision 7621)
+++ lib/mcomp.c	(working copy)
@@ -17,6 +17,7 @@
 
 #include <stdlib.h>
 #include <stdio.h>
+#include "dsp.h"
 #include "encoder_internal.h"
 
 /* Initialises motion compentsation. */
@@ -100,164 +101,25 @@
                           unsigned char * RefDataPtr1,
                           unsigned char * RefDataPtr2,
                           ogg_uint32_t PixelsPerLine ) {
-  ogg_uint32_t  i;
-  ogg_int32_t   XSum=0;
-  ogg_int32_t   XXSum=0;
   ogg_int32_t   DiffVal;
-  ogg_int32_t   AbsRefOffset = abs((int)(RefDataPtr1 - RefDataPtr2));
+  ogg_int32_t   RefOffset = (int)(RefDataPtr1 - RefDataPtr2);
+  ogg_uint32_t  RefPixelsPerLine = PixelsPerLine + STRIDE_EXTRA;
 
   /* Mode of interpolation chosen based upon on the offset of the
      second reference pointer */
-  if ( AbsRefOffset == 0 ) {
-    for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
-      DiffVal = ((int)NewDataPtr[0]) - (int)RefDataPtr1[0];
-      XSum += DiffVal;
-
-      /* negative array indexes are strictly forbidden by ANSI C and C99 */
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[1]) - (int)RefDataPtr1[1];
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[2]) - (int)RefDataPtr1[2];
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[3]) - (int)RefDataPtr1[3];
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[4]) - (int)RefDataPtr1[4];
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[5]) - (int)RefDataPtr1[5];
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[6]) - (int)RefDataPtr1[6];
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[7]) - (int)RefDataPtr1[7];
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      /* Step to next row of block. */
-      NewDataPtr += PixelsPerLine;
-      RefDataPtr1 += STRIDE_EXTRA + PixelsPerLine;
-    }
-
+  if ( RefOffset == 0 ) {
+    DiffVal = dsp_static_inter8x8_err (NewDataPtr, PixelsPerLine,
+		          RefDataPtr1, RefPixelsPerLine);
   }else{
-
-    /* Simple two reference interpolation */
-    for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
-      DiffVal = ((int)NewDataPtr[0]) -
-        (((int)RefDataPtr1[0] + (int)RefDataPtr2[0]) / 2);
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[1]) -
-        (((int)RefDataPtr1[1] + (int)RefDataPtr2[1]) / 2);
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[2]) -
-        (((int)RefDataPtr1[2] + (int)RefDataPtr2[2]) / 2);
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[3]) -
-        (((int)RefDataPtr1[3] + (int)RefDataPtr2[3]) / 2);
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[4]) -
-        (((int)RefDataPtr1[4] + (int)RefDataPtr2[4]) / 2);
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[5]) -
-        (((int)RefDataPtr1[5] + (int)RefDataPtr2[5]) / 2);
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[6]) -
-        (((int)RefDataPtr1[6] + (int)RefDataPtr2[6]) / 2);
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[7]) -
-        (((int)RefDataPtr1[7] + (int)RefDataPtr2[7]) / 2);
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      /* Step to next row of block. */
-      NewDataPtr += PixelsPerLine;
-      RefDataPtr1 += STRIDE_EXTRA+PixelsPerLine;
-      RefDataPtr2 += STRIDE_EXTRA+PixelsPerLine;
-    }
+    DiffVal = dsp_static_inter8x8_err_xy2 (NewDataPtr, PixelsPerLine,
+		          RefDataPtr1, 
+		          RefDataPtr2, RefPixelsPerLine);
   }
 
   /* Compute and return population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ));
-}
-
-static ogg_uint32_t GetSumAbsDiffs  (unsigned char * NewDataPtr,
-                              unsigned char  * RefDataPtr,
-                              ogg_uint32_t PixelsPerLine,
-                              ogg_uint32_t ErrorSoFar) {
-  ogg_uint32_t  i;
-  ogg_uint32_t  DiffVal = ErrorSoFar;
-
-  /* Decide on standard or MMX implementation */
-  for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
-    DiffVal += abs( ((int)NewDataPtr[0]) - ((int)RefDataPtr[0]) );
-    DiffVal += abs( ((int)NewDataPtr[1]) - ((int)RefDataPtr[1]) );
-    DiffVal += abs( ((int)NewDataPtr[2]) - ((int)RefDataPtr[2]) );
-    DiffVal += abs( ((int)NewDataPtr[3]) - ((int)RefDataPtr[3]) );
-    DiffVal += abs( ((int)NewDataPtr[4]) - ((int)RefDataPtr[4]) );
-    DiffVal += abs( ((int)NewDataPtr[5]) - ((int)RefDataPtr[5]) );
-    DiffVal += abs( ((int)NewDataPtr[6]) - ((int)RefDataPtr[6]) );
-    DiffVal += abs( ((int)NewDataPtr[7]) - ((int)RefDataPtr[7]) );
-
-    /* Step to next row of block. */
-    NewDataPtr += PixelsPerLine;
-    RefDataPtr += STRIDE_EXTRA+PixelsPerLine;
-  }
-
   return DiffVal;
 }
 
-static ogg_uint32_t GetNextSumAbsDiffs (unsigned char * NewDataPtr,
-                                 unsigned char * RefDataPtr,
-                                 ogg_uint32_t PixelsPerLine,
-                                 ogg_uint32_t ErrorSoFar,
-                                 ogg_uint32_t BestSoFar ) {
-  ogg_uint32_t  i;
-  ogg_uint32_t  DiffVal = ErrorSoFar;
-
-  for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
-    DiffVal += abs( ((int)NewDataPtr[0]) - ((int)RefDataPtr[0]) );
-    DiffVal += abs( ((int)NewDataPtr[1]) - ((int)RefDataPtr[1]) );
-    DiffVal += abs( ((int)NewDataPtr[2]) - ((int)RefDataPtr[2]) );
-    DiffVal += abs( ((int)NewDataPtr[3]) - ((int)RefDataPtr[3]) );
-    DiffVal += abs( ((int)NewDataPtr[4]) - ((int)RefDataPtr[4]) );
-    DiffVal += abs( ((int)NewDataPtr[5]) - ((int)RefDataPtr[5]) );
-    DiffVal += abs( ((int)NewDataPtr[6]) - ((int)RefDataPtr[6]) );
-    DiffVal += abs( ((int)NewDataPtr[7]) - ((int)RefDataPtr[7]) );
-
-    if ( DiffVal > BestSoFar )break;
-
-    /* Step to next row of block. */
-    NewDataPtr += PixelsPerLine;
-    RefDataPtr += STRIDE_EXTRA+PixelsPerLine;
-  }
-
-  return DiffVal;
-}
-
 static ogg_uint32_t GetHalfPixelSumAbsDiffs (unsigned char * SrcData,
                                       unsigned char * RefDataPtr1,
                                       unsigned char * RefDataPtr2,
@@ -265,119 +127,61 @@
                                       ogg_uint32_t ErrorSoFar,
                                       ogg_uint32_t BestSoFar ) {
 
-  ogg_uint32_t  i;
   ogg_uint32_t  DiffVal = ErrorSoFar;
   ogg_int32_t   RefOffset = (int)(RefDataPtr1 - RefDataPtr2);
   ogg_uint32_t  RefPixelsPerLine = PixelsPerLine + STRIDE_EXTRA;
 
   if ( RefOffset == 0 ) {
     /* Simple case as for non 0.5 pixel */
-    DiffVal += GetSumAbsDiffs( SrcData, RefDataPtr1, PixelsPerLine,
-                               ErrorSoFar);
+    DiffVal += dsp_static_sad8x8 (SrcData, PixelsPerLine, 
+		               RefDataPtr1, RefPixelsPerLine);
   } else  {
-    for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
-      DiffVal += abs( ((int)SrcData[0]) - (((int)RefDataPtr1[0] +
-                                            (int)RefDataPtr2[0]) / 2) );
-      DiffVal += abs( ((int)SrcData[1]) - (((int)RefDataPtr1[1] +
-                                            (int)RefDataPtr2[1]) / 2) );
-      DiffVal += abs( ((int)SrcData[2]) - (((int)RefDataPtr1[2] +
-                                            (int)RefDataPtr2[2]) / 2) );
-      DiffVal += abs( ((int)SrcData[3]) - (((int)RefDataPtr1[3] +
-                                            (int)RefDataPtr2[3]) / 2) );
-      DiffVal += abs( ((int)SrcData[4]) - (((int)RefDataPtr1[4] +
-                                            (int)RefDataPtr2[4]) / 2) );
-      DiffVal += abs( ((int)SrcData[5]) - (((int)RefDataPtr1[5] +
-                                            (int)RefDataPtr2[5]) / 2) );
-      DiffVal += abs( ((int)SrcData[6]) - (((int)RefDataPtr1[6] +
-                                            (int)RefDataPtr2[6]) / 2) );
-      DiffVal += abs( ((int)SrcData[7]) - (((int)RefDataPtr1[7] +
-                                            (int)RefDataPtr2[7]) / 2) );
-
-      if ( DiffVal > BestSoFar ) break;
-
-      /* Step to next row of block. */
-      SrcData += PixelsPerLine;
-      RefDataPtr1 += RefPixelsPerLine;
-      RefDataPtr2 += RefPixelsPerLine;
-    }
+    DiffVal += dsp_static_sad8x8_xy2_thres (SrcData, PixelsPerLine, 
+		               RefDataPtr1, 
+		               RefDataPtr2, RefPixelsPerLine, BestSoFar);
   }
 
   return DiffVal;
 }
 
-static ogg_uint32_t GetIntraError (unsigned char * DataPtr,
-                            ogg_uint32_t PixelsPerLine ) {
-  ogg_uint32_t  i;
-  ogg_uint32_t  XSum=0;
-  ogg_uint32_t  XXSum=0;
-  unsigned char *DiffPtr;
-
-  /* Loop expanded out for speed. */
-  DiffPtr = DataPtr;
-
-  for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
-
-    /* Examine alternate pixel locations. */
-    XSum += DiffPtr[0];
-    XXSum += DiffPtr[0]*DiffPtr[0];
-    XSum += DiffPtr[1];
-    XXSum += DiffPtr[1]*DiffPtr[1];
-    XSum += DiffPtr[2];
-    XXSum += DiffPtr[2]*DiffPtr[2];
-    XSum += DiffPtr[3];
-    XXSum += DiffPtr[3]*DiffPtr[3];
-    XSum += DiffPtr[4];
-    XXSum += DiffPtr[4]*DiffPtr[4];
-    XSum += DiffPtr[5];
-    XXSum += DiffPtr[5]*DiffPtr[5];
-    XSum += DiffPtr[6];
-    XXSum += DiffPtr[6]*DiffPtr[6];
-    XSum += DiffPtr[7];
-    XXSum += DiffPtr[7]*DiffPtr[7];
-
-    /* Step to next row of block. */
-    DiffPtr += PixelsPerLine;
-  }
-
-  /* Compute population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ) );
-}
-
 ogg_uint32_t GetMBIntraError (CP_INSTANCE *cpi, ogg_uint32_t FragIndex,
                               ogg_uint32_t PixelsPerLine ) {
   ogg_uint32_t  LocalFragIndex = FragIndex;
   ogg_uint32_t  IntraError = 0;
 
+  dsp_static_save_fpu ();
+
   /* Add together the intra errors for those blocks in the macro block
      that are coded (Y only) */
   if ( cpi->pb.display_fragments[LocalFragIndex] )
     IntraError +=
-      GetIntraError(&cpi->
+      dsp_static_intra8x8_err (&cpi->
                     ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
-                    PixelsPerLine );
+                    PixelsPerLine);
 
-
   LocalFragIndex++;
   if ( cpi->pb.display_fragments[LocalFragIndex] )
     IntraError +=
-      GetIntraError(&cpi->
+      dsp_static_intra8x8_err (&cpi->
                     ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
-                    PixelsPerLine );
+                    PixelsPerLine);
 
   LocalFragIndex = FragIndex + cpi->pb.HFragments;
   if ( cpi->pb.display_fragments[LocalFragIndex] )
     IntraError +=
-      GetIntraError(&cpi->
+      dsp_static_intra8x8_err (&cpi->
                      ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
-                     PixelsPerLine );
+                    PixelsPerLine);
 
   LocalFragIndex++;
   if ( cpi->pb.display_fragments[LocalFragIndex] )
     IntraError +=
-      GetIntraError(&cpi->
+      dsp_static_intra8x8_err (&cpi->
                     ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
-                    PixelsPerLine );
+                    PixelsPerLine);
 
+  dsp_static_restore_fpu ();
+
   return IntraError;
 }
 
@@ -400,6 +204,8 @@
   unsigned char * SrcPtr1;
   unsigned char * RefPtr1;
 
+  dsp_static_save_fpu ();
+
   /* Work out pixel offset into source buffer. */
   PixelIndex = cpi->pb.pixel_index_table[LocalFragIndex];
 
@@ -462,6 +268,9 @@
     InterError += GetInterErr( SrcPtr1, RefPtr1,
                                  &RefPtr1[RefPtr2Offset], PixelsPerLine );
   }
+
+  dsp_static_restore_fpu ();
+
   return InterError;
 }
 
@@ -496,6 +305,8 @@
   unsigned char * RefDataPtr1;
   unsigned char * RefDataPtr2;
 
+  dsp_static_save_fpu ();
+
   /* Note which of the four blocks in the macro block are to be
      included in the search. */
   MBlockDispFrags[0] =
@@ -518,20 +329,20 @@
 
   /* Check the 0,0 candidate. */
   if ( MBlockDispFrags[0] ) {
-    Error = GetSumAbsDiffs( SrcPtr[0], RefPtr,
-                         PixelsPerLine, Error);
+    Error += dsp_static_sad8x8 (SrcPtr[0], PixelsPerLine, RefPtr,
+                         PixelsPerLine + STRIDE_EXTRA);
   }
   if ( MBlockDispFrags[1] ) {
-    Error = GetSumAbsDiffs( SrcPtr[1], RefPtr + 8,
-                         PixelsPerLine, Error);
+    Error += dsp_static_sad8x8 (SrcPtr[1], PixelsPerLine, RefPtr + 8,
+                         PixelsPerLine + STRIDE_EXTRA);
   }
   if ( MBlockDispFrags[2] ) {
-    Error = GetSumAbsDiffs( SrcPtr[2], RefPtr + RefRow2Offset,
-                         PixelsPerLine, Error);
+    Error += dsp_static_sad8x8 (SrcPtr[2], PixelsPerLine, RefPtr + RefRow2Offset,
+                         PixelsPerLine + STRIDE_EXTRA);
   }
   if ( MBlockDispFrags[3] ) {
-    Error = GetSumAbsDiffs( SrcPtr[3], RefPtr + RefRow2Offset + 8,
-                         PixelsPerLine, Error);
+    Error += dsp_static_sad8x8 (SrcPtr[3], PixelsPerLine, RefPtr + RefRow2Offset + 8,
+                         PixelsPerLine + STRIDE_EXTRA);
   }
 
   /* Set starting values to results of 0, 0 vector. */
@@ -554,24 +365,23 @@
 
       /* Get the score for the current offset */
       if ( MBlockDispFrags[0] ) {
-        Error = GetSumAbsDiffs( SrcPtr[0], CandidateBlockPtr,
-                             PixelsPerLine, Error);
+        Error += dsp_static_sad8x8 (SrcPtr[0], PixelsPerLine, CandidateBlockPtr,
+                             PixelsPerLine + STRIDE_EXTRA);
       }
 
       if ( MBlockDispFrags[1] && (Error < MinError) ) {
-        Error = GetNextSumAbsDiffs( SrcPtr[1], CandidateBlockPtr + 8,
-                                 PixelsPerLine, Error, MinError );
+        Error += dsp_static_sad8x8_thres (SrcPtr[1], PixelsPerLine, CandidateBlockPtr + 8,
+                             PixelsPerLine + STRIDE_EXTRA, MinError - Error);
       }
 
       if ( MBlockDispFrags[2] && (Error < MinError) ) {
-        Error = GetNextSumAbsDiffs( SrcPtr[2], CandidateBlockPtr + RefRow2Offset,
-                                 PixelsPerLine, Error, MinError );
+        Error += dsp_static_sad8x8_thres (SrcPtr[2], PixelsPerLine, CandidateBlockPtr + RefRow2Offset,
+                             PixelsPerLine + STRIDE_EXTRA, MinError - Error);
       }
 
       if ( MBlockDispFrags[3] && (Error < MinError) ) {
-        Error = GetNextSumAbsDiffs( SrcPtr[3],
-                                 CandidateBlockPtr + RefRow2Offset + 8,
-                                 PixelsPerLine, Error, MinError );
+        Error += dsp_static_sad8x8_thres (SrcPtr[3], PixelsPerLine, CandidateBlockPtr + RefRow2Offset + 8,
+                             PixelsPerLine + STRIDE_EXTRA, MinError - Error);
       }
 
       if ( Error < MinError ) {
@@ -652,6 +462,8 @@
   InterMVError = GetMBInterError( cpi, cpi->ConvDestBuffer, RefFramePtr,
                                   FragIndex, MV->x, MV->y, PixelsPerLine );
 
+  dsp_static_restore_fpu ();
+
   /* Return score of best matching block. */
   return InterMVError;
 }
@@ -684,6 +496,8 @@
   unsigned char * RefDataPtr1;
   unsigned char * RefDataPtr2;
 
+  dsp_static_save_fpu ();
+
   /* Note which of the four blocks in the macro block are to be
      included in the search. */
   MBlockDispFrags[0] = cpi->
@@ -717,20 +531,20 @@
 
       /* Summ errors for each block. */
       if ( MBlockDispFrags[0] ) {
-        Error = GetSumAbsDiffs( SrcPtr[0], CandidateBlockPtr,
-                             PixelsPerLine, Error);
+        Error += dsp_static_sad8x8 (SrcPtr[0], PixelsPerLine, CandidateBlockPtr,
+                             PixelsPerLine + STRIDE_EXTRA);
       }
       if ( MBlockDispFrags[1] ){
-        Error = GetSumAbsDiffs( SrcPtr[1], CandidateBlockPtr + 8,
-                             PixelsPerLine, Error);
+        Error += dsp_static_sad8x8 (SrcPtr[1], PixelsPerLine, CandidateBlockPtr + 8,
+                             PixelsPerLine + STRIDE_EXTRA);
       }
       if ( MBlockDispFrags[2] ){
-        Error = GetSumAbsDiffs( SrcPtr[2], CandidateBlockPtr + RefRow2Offset,
-                             PixelsPerLine, Error);
+        Error += dsp_static_sad8x8 (SrcPtr[2], PixelsPerLine, CandidateBlockPtr + RefRow2Offset,
+                             PixelsPerLine + STRIDE_EXTRA);
       }
       if ( MBlockDispFrags[3] ){
-        Error = GetSumAbsDiffs( SrcPtr[3], CandidateBlockPtr + RefRow2Offset + 8,
-                             PixelsPerLine, Error);
+        Error += dsp_static_sad8x8 (SrcPtr[3], PixelsPerLine, CandidateBlockPtr + RefRow2Offset + 8,
+                             PixelsPerLine + STRIDE_EXTRA);
       }
 
       /* Was this the best so far */
@@ -808,6 +622,8 @@
   InterMVError = GetMBInterError( cpi, cpi->ConvDestBuffer, RefFramePtr,
                                   FragIndex, MV->x, MV->y, PixelsPerLine );
 
+  dsp_static_restore_fpu ();
+
   /* Return score of best matching block. */
   return InterMVError;
 }
@@ -850,8 +666,8 @@
 
     for ( j = 0; j < (ogg_int32_t)MAX_MV_EXTENT; j++ ){
       /* Get the block error score. */
-      Error = GetSumAbsDiffs( SrcPtr, CandidateBlockPtr,
-                           PixelsPerLine, 0);
+      Error = dsp_static_sad8x8 (SrcPtr, PixelsPerLine, CandidateBlockPtr,
+                             PixelsPerLine + STRIDE_EXTRA);
 
       /* Was this the best so far */
       if ( Error < MinError ) {
@@ -911,6 +727,8 @@
                                         MOTION_VECTOR *MV ) {
   ogg_uint32_t  InterMVError;
 
+  dsp_static_save_fpu ();
+
   /* For the moment the 4MV mode is only deemd to be valid if all four
      Y blocks are to be updated */
   /* This May be adapted later. */
@@ -941,6 +759,8 @@
     InterMVError = HUGE_ERROR;
   }
 
+  dsp_static_restore_fpu ();
+
   /* Return score of best matching block. */
   return InterMVError;
 }
Index: lib/dct_decode.c
===================================================================
--- lib/dct_decode.c	(revision 7621)
+++ lib/dct_decode.c	(working copy)
@@ -18,6 +18,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "encoder_internal.h"
+#include "dsp.h"
 
 
 #define GOLDEN_FRAME_THRESH_Q   50
@@ -112,22 +113,6 @@
   SetupBoundingValueArray_Generic(pbi, FLimit);
 }
 
-void CopyBlock(unsigned char *src,
-               unsigned char *dest,
-               unsigned int srcstride){
-  unsigned char *s = src;
-  unsigned char *d = dest;
-  unsigned int stride = srcstride;
-
-  int j;
-  for ( j = 0; j < 8; j++ ){
-    ((ogg_uint32_t*)d)[0] = ((ogg_uint32_t*)s)[0];
-    ((ogg_uint32_t*)d)[1] = ((ogg_uint32_t*)s)[1];
-    s+=stride;
-    d+=stride;
-  }
-}
-
 static void ExpandKFBlock ( PB_INSTANCE *pbi, ogg_int32_t FragmentNumber ){
   ogg_uint32_t ReconPixelsPerLine;
   ogg_int32_t     ReconPixelIndex;
@@ -160,9 +145,8 @@
   ReconPixelIndex = pbi->recon_pixel_index_table[FragmentNumber];
 
   /* Get the pixel index for the first pixel in the fragment. */
-  ReconIntra( pbi, (unsigned char *)(&pbi->ThisFrameRecon[ReconPixelIndex]),
-              (ogg_int16_t *)pbi->ReconDataBuffer, ReconPixelsPerLine );
-
+  dsp_static_recon_intra8x8 ((unsigned char *)(&pbi->ThisFrameRecon[ReconPixelIndex]),
+                             (ogg_int16_t *)pbi->ReconDataBuffer, ReconPixelsPerLine);
 }
 
 static void ExpandBlock ( PB_INSTANCE *pbi, ogg_int32_t FragmentNumber ){
@@ -237,10 +221,9 @@
     /* Reconstruct the pixel data using the last frame reconstruction
        and change data when the motion vector is (0,0), the recon is
        based on the lastframe without loop filtering---- for testing */
-    ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
+    dsp_static_recon_inter8x8 (&pbi->ThisFrameRecon[ReconPixelIndex],
                 &pbi->LastFrameRecon[ReconPixelIndex],
-                pbi->ReconDataBuffer, ReconPixelsPerLine );
-
+                  pbi->ReconDataBuffer, ReconPixelsPerLine);
   }else if ( ModeUsesMC[pbi->CodingMode] ) {
     /* The mode uses a motion vector. */
     /* Get vector from list */
@@ -287,29 +270,30 @@
     if ( (int)(LastFrameRecPtr - LastFrameRecPtr2) == 0 ) {
       /* Reconstruct the pixel dats from the reference frame and change data
          (no half pixel in this case as the two references were the same. */
-      ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
+      dsp_static_recon_inter8x8 (
+		  &pbi->ThisFrameRecon[ReconPixelIndex],
                   LastFrameRecPtr, pbi->ReconDataBuffer,
-                  ReconPixelsPerLine );
+                  ReconPixelsPerLine);
     }else{
       /* Fractional pixel reconstruction. */
       /* Note that we only use two pixels per reconstruction even for
          the diagonal. */
-      ReconInterHalfPixel2( pbi,&pbi->ThisFrameRecon[ReconPixelIndex],
+      dsp_static_recon_inter8x8_half(&pbi->ThisFrameRecon[ReconPixelIndex],
                             LastFrameRecPtr, LastFrameRecPtr2,
-                            pbi->ReconDataBuffer, ReconPixelsPerLine );
+                            pbi->ReconDataBuffer, ReconPixelsPerLine);
     }
   } else if ( pbi->CodingMode == CODE_USING_GOLDEN ){
     /* Golden frame with motion vector */
     /* Reconstruct the pixel data using the golden frame
        reconstruction and change data */
-    ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
+    dsp_static_recon_inter8x8 (&pbi->ThisFrameRecon[ReconPixelIndex],
                 &pbi->GoldenFrame[ ReconPixelIndex ],
-                pbi->ReconDataBuffer, ReconPixelsPerLine );
+                  pbi->ReconDataBuffer, ReconPixelsPerLine);
   } else {
     /* Simple Intra coding */
     /* Get the pixel index for the first pixel in the fragment. */
-    ReconIntra( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
-                pbi->ReconDataBuffer, ReconPixelsPerLine );
+    dsp_static_recon_intra8x8 (&pbi->ThisFrameRecon[ReconPixelIndex],
+              pbi->ReconDataBuffer, ReconPixelsPerLine);
   }
 }
 
@@ -464,7 +448,7 @@
       SrcPtr = &SrcReconPtr[ PixelIndex ];
       DestPtr = &DestReconPtr[ PixelIndex ];
 
-      CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
+      dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep);
     }
   }
 
@@ -476,7 +460,7 @@
       SrcPtr = &SrcReconPtr[ PixelIndex ];
       DestPtr = &DestReconPtr[ PixelIndex ];
 
-      CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
+      dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep);
 
     }
   }
@@ -501,7 +485,7 @@
       SrcPtr = &SrcReconPtr[ PixelIndex ];
       DestPtr = &DestReconPtr[ PixelIndex ];
 
-      CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
+      dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep);
     }
   }
 
@@ -513,7 +497,7 @@
       SrcPtr = &SrcReconPtr[ PixelIndex ];
       DestPtr = &DestReconPtr[ PixelIndex ];
 
-      CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
+      dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep);
 
     }
   }
Index: lib/pp.c
===================================================================
--- lib/pp.c	(revision 7621)
+++ lib/pp.c	(working copy)
@@ -19,6 +19,7 @@
 #include <string.h>
 #include "encoder_internal.h"
 #include "pp.h"
+#include "dsp.h"
 
 #define MAX(a, b) ((a>b)?a:b)
 #define MIN(a, b) ((a<b)?a:b)
@@ -490,7 +491,7 @@
 
       } else {
 
-        CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
+        dsp_static_copy8x8(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
 
       }
 
@@ -529,7 +530,7 @@
         DeringBlockWeak(SrcPtr + 8 * col, DestPtr + 8 * col,
                         LineLength,Quality,QuantScale);
       }else{
-        CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
+        dsp_static_copy8x8(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
       }
 
       ++Block;
@@ -565,7 +566,7 @@
         DeringBlockWeak(SrcPtr + 8 * col, DestPtr + 8 * col,
                         LineLength,Quality,QuantScale);
       }else{
-        CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
+        dsp_static_copy8x8(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
       }
 
       ++Block;
Index: lib/encoder_internal.h
===================================================================
--- lib/encoder_internal.h	(revision 7621)
+++ lib/encoder_internal.h	(working copy)
@@ -24,6 +24,7 @@
 
 #include <theora/theora.h>
 #include "huffman.h"
+#include "dsp.h"
 
 #ifndef LIBOGG2
 #define theora_read(x,y,z) ( *z = oggpackB_read(x,y) )
@@ -689,23 +690,9 @@
                    ogg_int16_t *QuantMatrix,
                    ogg_int16_t * OutputData );
 
-extern void ReconIntra( PB_INSTANCE *pbi, unsigned char * ReconPtr,
-                        ogg_int16_t * ChangePtr, ogg_uint32_t LineStep );
+extern void dsp_recon_init (DspFunctions *funcs);
 
-extern void ReconInter( PB_INSTANCE *pbi, unsigned char * ReconPtr,
-                        unsigned char * RefPtr, ogg_int16_t * ChangePtr,
-                        ogg_uint32_t LineStep ) ;
-
-extern void ReconInterHalfPixel2( PB_INSTANCE *pbi, unsigned char * ReconPtr,
-                                  unsigned char * RefPtr1,
-                                  unsigned char * RefPtr2,
-                                  ogg_int16_t * ChangePtr,
-                                  ogg_uint32_t LineStep ) ;
-
 extern void SetupLoopFilter(PB_INSTANCE *pbi);
-extern void CopyBlock(unsigned char *src,
-                      unsigned char *dest,
-                      unsigned int srcstride);
 extern void LoopFilter(PB_INSTANCE *pbi);
 extern void ReconRefFrames (PB_INSTANCE *pbi);
 extern void ExpandToken( Q_LIST_ENTRY * ExpandedBlock,
Index: lib/scan.c
===================================================================
--- lib/scan.c	(revision 7621)
+++ lib/scan.c	(working copy)
@@ -19,9 +19,20 @@
 #include <math.h>
 #include <string.h>
 #include "encoder_internal.h"
+#include "dsp.h"
 
 #define MAX_SEARCH_LINE_LEN                   7
 
+#define SET8_0(ptr) do { /* store 8 bytes of 0x00 as ONE statement */ \
+  ((ogg_uint32_t *)(ptr))[0] = 0x00000000; \
+  ((ogg_uint32_t *)(ptr))[1] = 0x00000000; } while (0)
+#define SET8_1(ptr) do { /* store 8 bytes of 0x01 as ONE statement */ \
+  ((ogg_uint32_t *)(ptr))[0] = 0x01010101; \
+  ((ogg_uint32_t *)(ptr))[1] = 0x01010101; } while (0)
+#define SET8_8(ptr) do { /* store 8 bytes of 0x08 as ONE statement */ \
+  ((ogg_uint32_t *)(ptr))[0] = 0x08080808; \
+  ((ogg_uint32_t *)(ptr))[1] = 0x08080808; } while (0)
+
 static ogg_uint32_t LineLengthScores[ MAX_SEARCH_LINE_LEN + 1 ] = {
   0, 0, 0, 0, 2, 4, 12, 24
 };
@@ -384,69 +395,6 @@
   ppi->KFIndicator = ((ppi->KFIndicator*100)/((ppi->ScanYPlaneFragments*3)/4));
 }
 
-static ogg_uint32_t ScalarRowSAD( unsigned char * Src1,
-                                  unsigned char * Src2 ){
-  ogg_uint32_t SadValue;
-  ogg_uint32_t SadValue1;
-
-  SadValue    = abs( Src1[0] - Src2[0] ) + abs( Src1[1] - Src2[1] ) +
-    abs( Src1[2] - Src2[2] ) + abs( Src1[3] - Src2[3] );
-
-  SadValue1   = abs( Src1[4] - Src2[4] ) + abs( Src1[5] - Src2[5] ) +
-    abs( Src1[6] - Src2[6] ) + abs( Src1[7] - Src2[7] );
-
-  SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1;
-
-  return SadValue;
-}
-
-static ogg_uint32_t ScalarColSAD( PP_INSTANCE *ppi,
-                           unsigned char * Src1,
-                           unsigned char * Src2 ){
-  ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0};
-  ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0};
-  ogg_uint32_t MaxSad = 0;
-  ogg_uint32_t i;
-
-  for ( i = 0; i < 4; i++ ){
-    SadValue[0] += abs(Src1[0] - Src2[0]);
-    SadValue[1] += abs(Src1[1] - Src2[1]);
-    SadValue[2] += abs(Src1[2] - Src2[2]);
-    SadValue[3] += abs(Src1[3] - Src2[3]);
-    SadValue[4] += abs(Src1[4] - Src2[4]);
-    SadValue[5] += abs(Src1[5] - Src2[5]);
-    SadValue[6] += abs(Src1[6] - Src2[6]);
-    SadValue[7] += abs(Src1[7] - Src2[7]);
-
-    Src1 += ppi->PlaneStride;
-    Src2 += ppi->PlaneStride;
-  }
-
-  for ( i = 0; i < 4; i++ ){
-    SadValue2[0] += abs(Src1[0] - Src2[0]);
-    SadValue2[1] += abs(Src1[1] - Src2[1]);
-    SadValue2[2] += abs(Src1[2] - Src2[2]);
-    SadValue2[3] += abs(Src1[3] - Src2[3]);
-    SadValue2[4] += abs(Src1[4] - Src2[4]);
-    SadValue2[5] += abs(Src1[5] - Src2[5]);
-    SadValue2[6] += abs(Src1[6] - Src2[6]);
-    SadValue2[7] += abs(Src1[7] - Src2[7]);
-
-    Src1 += ppi->PlaneStride;
-    Src2 += ppi->PlaneStride;
-  }
-
-  for ( i = 0; i < 8; i++ ){
-    if ( SadValue[i] > MaxSad )
-      MaxSad = SadValue[i];
-    if ( SadValue2[i] > MaxSad )
-      MaxSad = SadValue2[i];
-  }
-
-  return MaxSad;
-}
-
-
 static int RowSadScan( PP_INSTANCE *ppi,
                        unsigned char * YuvPtr1,
                        unsigned char * YuvPtr2,
@@ -475,7 +423,7 @@
     for ( i = 0; i < ppi->PlaneHFragments; i ++ ){
       if ( *LocalDispFragPtr <= BLOCK_NOT_CODED ){
         /* Calculate the SAD score for the block row */
-        GrpSad = ScalarRowSAD(LocalYuvPtr1,LocalYuvPtr2);
+        GrpSad = dsp_static_row_sad8(LocalYuvPtr1,LocalYuvPtr2);
 
         /* Now test the group SAD score */
         if ( GrpSad > LocalGrpLowSadThresh ){
@@ -532,7 +480,7 @@
     /* Skip if block already marked to be coded. */
     if ( *LocalDispFragPtr <= BLOCK_NOT_CODED ){
       /* Calculate the SAD score for the block column */
-      MaxSad = ScalarColSAD( ppi, LocalYuvPtr1, LocalYuvPtr2 );
+      MaxSad = dsp_static_col_sad8x8(LocalYuvPtr1, LocalYuvPtr2, ppi->PlaneStride );
 
       /* Now test the group SAD score */
       if ( MaxSad > LocalGrpLowSadThresh ){
@@ -758,7 +706,7 @@
       if (*DispFragPtr == CANDIDATE_BLOCK){
 
         /* Clear down entries in changed locals array */
-        memset(ChLocalsPtr,0,8);
+        SET8_0(ChLocalsPtr);
 
         for ( j = 0; j < HFRAGPIXELS; j++ ){
           /* Take a local copy of the measured difference. */
@@ -777,10 +725,10 @@
       }else{
         /* If we are breaking out here mark all pixels as changed. */
         if ( *DispFragPtr > BLOCK_NOT_CODED ){
-          memset(bits_map_ptr,1,8);
-          memset(ChLocalsPtr,8,8);
+          SET8_1(bits_map_ptr);
+          SET8_8(ChLocalsPtr);
         }else{
-          memset(ChLocalsPtr,0,8);
+          SET8_0(ChLocalsPtr);
         }
       }
 
@@ -816,7 +764,7 @@
     /* Test for break out conditions to save time. */
     if (*DispFragPtr == CANDIDATE_BLOCK){
       /* Clear down entries in changed locals array */
-      memset(ChLocalsPtr,0,8);
+      SET8_0(ChLocalsPtr);
 
       for ( j = 0; j < HFRAGPIXELS; j++ ){
         /* Take a local copy of the measured difference. */
@@ -839,10 +787,10 @@
     }else{
       /* If we are breaking out here mark all pixels as changed. */
       if ( *DispFragPtr > BLOCK_NOT_CODED ){
-        memset(bits_map_ptr,1,8);
-        memset(ChLocalsPtr,8,8);
+        SET8_1(bits_map_ptr);
+        SET8_8(ChLocalsPtr);
       }else{
-        memset(ChLocalsPtr,0,8);
+        SET8_0(ChLocalsPtr);
       }
     }
 
@@ -876,7 +824,7 @@
       /* Test for break out conditions to save time. */
       if (*DispFragPtr == CANDIDATE_BLOCK){
         /* Clear down entries in changed locals array */
-        memset(ChLocalsPtr,0,8);
+        SET8_0(ChLocalsPtr);
         for ( j = 0; j < HFRAGPIXELS; j++ ){
           /* Take a local copy of the measured difference. */
           Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
@@ -899,10 +847,10 @@
       }else{
         /* If we are breaking out here mark all pixels as changed. */
         if ( *DispFragPtr > BLOCK_NOT_CODED ){
-          memset(bits_map_ptr,1,8);
-          memset(ChLocalsPtr,8,8);
+          SET8_1(bits_map_ptr);
+          SET8_8(ChLocalsPtr);
         }else{
-          memset(ChLocalsPtr,0,8);
+          SET8_0(ChLocalsPtr);
         }
       }
 
@@ -935,7 +883,7 @@
     /* Test for break out conditions to save time. */
     if (*DispFragPtr == CANDIDATE_BLOCK){
       /* Clear down entries in changed locals array */
-      memset(ChLocalsPtr,0,8);
+      SET8_0(ChLocalsPtr);
 
       for ( j = 0; j < HFRAGPIXELS; j++ ){
         /* Take a local copy of the measured difference. */
@@ -959,10 +907,10 @@
     }else{
       /* If we are breaking out here mark all pixels as changed.*/
       if ( *DispFragPtr > BLOCK_NOT_CODED ) {
-          memset(bits_map_ptr,1,8);
-          memset(ChLocalsPtr,8,8);
+          SET8_1(bits_map_ptr);
+          SET8_8(ChLocalsPtr);
         }else{
-          memset(ChLocalsPtr,0,8);
+          SET8_0(ChLocalsPtr);
         }
     }
     /* If we have a lot of changed pixels for this fragment on this
@@ -1071,7 +1019,7 @@
         }
       }else{
         if ( *DispFragPtr > BLOCK_NOT_CODED )
-          memset(ChLocalsPtr,0,8);
+          SET8_0(ChLocalsPtr);
 
         /* Step pointers */
         ChLocalsPtr += HFRAGPIXELS;
@@ -1133,7 +1081,7 @@
         }
       }else{
         if ( *DispFragPtr > BLOCK_NOT_CODED )
-          memset(ChLocalsPtr,0,8);
+          SET8_0(ChLocalsPtr);
 
         /* Step pointers */
         ChLocalsPtr += HFRAGPIXELS;
@@ -2126,10 +2074,12 @@
     /* Fast break out test for obvious yes and no cases in this row of
        blocks */
     if ( i < ppi->PlaneVFragments ){
+      dsp_static_save_fpu ();
       UpdatedOrCandidateBlocks =
         RowSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 );
-      if( ColSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 ) )
-        UpdatedOrCandidateBlocks = 1;
+      UpdatedOrCandidateBlocks |=
+        ColSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 );
+      dsp_static_restore_fpu ();
     }else{
       /* Make sure we still call other functions if RowSadScan() disabled */
       UpdatedOrCandidateBlocks = 1;
Index: lib/dsp.c
===================================================================
--- lib/dsp.c	(revision 0)
+++ lib/dsp.c	(revision 0)
@@ -0,0 +1,416 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include "cpu.h"
+#include "encoder_internal.h"
+
+#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
+#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
+#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
+
+DspFunctions dsp_funcs;
+
+static void sub8x8__c (unsigned char *FiltPtr, unsigned char *ReconPtr,
+                  ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
+                  ogg_uint32_t ReconPixelsPerLine) {
+  int i;
+
+  /* For each block row */
+  for (i=8; i; i--) {
+    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], ReconPtr[0]);
+    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], ReconPtr[1]);
+    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], ReconPtr[2]);
+    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], ReconPtr[3]);
+    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], ReconPtr[4]);
+    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], ReconPtr[5]);
+    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], ReconPtr[6]);
+    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], ReconPtr[7]);
+
+    /* Start next row */
+    FiltPtr += PixelsPerLine;
+    ReconPtr += ReconPixelsPerLine;
+    DctInputPtr += 8;
+  }
+}
+
+static void sub8x8_128__c (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
+                      ogg_uint32_t PixelsPerLine) {
+  int i;
+  /* For each block row */
+  for (i=8; i; i--) {
+    /* INTRA mode so code raw image data */
+    /* We convert the data to 8 bit signed (by subtracting 128) as
+       this reduces the internal precision requirements in the DCT
+       transform. */
+    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], 128);
+    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], 128);
+    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], 128);
+    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], 128);
+    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], 128);
+    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], 128);
+    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], 128);
+    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], 128);
+
+    /* Start next row */
+    FiltPtr += PixelsPerLine;
+    DctInputPtr += 8;
+  }
+}
+
+static void sub8x8avg2__c (unsigned char *FiltPtr, unsigned char *ReconPtr1,
+                     unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
+                     ogg_uint32_t PixelsPerLine,
+                     ogg_uint32_t ReconPixelsPerLine) 
+{
+  int i;
+
+  /* For each block row */
+  for (i=8; i; i--) {
+    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], DSP_OP_AVG (ReconPtr1[0], ReconPtr2[0]));
+    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], DSP_OP_AVG (ReconPtr1[1], ReconPtr2[1]));
+    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], DSP_OP_AVG (ReconPtr1[2], ReconPtr2[2]));
+    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], DSP_OP_AVG (ReconPtr1[3], ReconPtr2[3]));
+    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], DSP_OP_AVG (ReconPtr1[4], ReconPtr2[4]));
+    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], DSP_OP_AVG (ReconPtr1[5], ReconPtr2[5]));
+    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], DSP_OP_AVG (ReconPtr1[6], ReconPtr2[6]));
+    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], DSP_OP_AVG (ReconPtr1[7], ReconPtr2[7]));
+
+    /* Start next row */
+    FiltPtr += PixelsPerLine;
+    ReconPtr1 += ReconPixelsPerLine;
+    ReconPtr2 += ReconPixelsPerLine;
+    DctInputPtr += 8;
+  }
+}
+
+static ogg_uint32_t row_sad8__c (unsigned char *Src1, unsigned char *Src2)
+{
+  ogg_uint32_t SadValue;
+  ogg_uint32_t SadValue1;
+
+  SadValue    = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) + 
+	        DSP_OP_ABS_DIFF (Src1[1], Src2[1]) +
+	        DSP_OP_ABS_DIFF (Src1[2], Src2[2]) +
+	        DSP_OP_ABS_DIFF (Src1[3], Src2[3]);
+
+  SadValue1   = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) + 
+	        DSP_OP_ABS_DIFF (Src1[5], Src2[5]) +
+	        DSP_OP_ABS_DIFF (Src1[6], Src2[6]) +
+	        DSP_OP_ABS_DIFF (Src1[7], Src2[7]);
+
+  SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1;
+
+  return SadValue;
+}
+
+static ogg_uint32_t col_sad8x8__c (unsigned char *Src1, unsigned char *Src2,
+		                    ogg_uint32_t stride)
+{
+  ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0};
+  ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0};
+  ogg_uint32_t MaxSad = 0;
+  ogg_uint32_t i;
+
+  for ( i = 0; i < 4; i++ ){
+    SadValue[0] += abs(Src1[0] - Src2[0]);
+    SadValue[1] += abs(Src1[1] - Src2[1]);
+    SadValue[2] += abs(Src1[2] - Src2[2]);
+    SadValue[3] += abs(Src1[3] - Src2[3]);
+    SadValue[4] += abs(Src1[4] - Src2[4]);
+    SadValue[5] += abs(Src1[5] - Src2[5]);
+    SadValue[6] += abs(Src1[6] - Src2[6]);
+    SadValue[7] += abs(Src1[7] - Src2[7]);
+    
+    Src1 += stride;
+    Src2 += stride;
+  }
+
+  for ( i = 0; i < 4; i++ ){
+    SadValue2[0] += abs(Src1[0] - Src2[0]);
+    SadValue2[1] += abs(Src1[1] - Src2[1]);
+    SadValue2[2] += abs(Src1[2] - Src2[2]);
+    SadValue2[3] += abs(Src1[3] - Src2[3]);
+    SadValue2[4] += abs(Src1[4] - Src2[4]);
+    SadValue2[5] += abs(Src1[5] - Src2[5]);
+    SadValue2[6] += abs(Src1[6] - Src2[6]);
+    SadValue2[7] += abs(Src1[7] - Src2[7]);
+    
+    Src1 += stride;
+    Src2 += stride;
+  }
+    
+  for ( i = 0; i < 8; i++ ){
+    if ( SadValue[i] > MaxSad )
+      MaxSad = SadValue[i];
+    if ( SadValue2[i] > MaxSad )
+      MaxSad = SadValue2[i];
+  }
+    
+  return MaxSad;
+}
+
+static ogg_uint32_t sad8x8__c (unsigned char *ptr1, ogg_uint32_t stride1,
+		       	    unsigned char *ptr2, ogg_uint32_t stride2)
+{
+  ogg_uint32_t  i;
+  ogg_uint32_t  sad = 0;
+
+  for (i=8; i; i--) {
+    sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
+    sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
+    sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]);
+    sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]);
+    sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]);
+    sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]);
+    sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]);
+    sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]);
+
+    /* Step to next row of block. */
+    ptr1 += stride1;
+    ptr2 += stride2;
+  }
+
+  return sad;
+}
+
+static ogg_uint32_t sad8x8_thres__c (unsigned char *ptr1, ogg_uint32_t stride1,
+		       		  unsigned char *ptr2, ogg_uint32_t stride2, 
+			   	  ogg_uint32_t thres)
+{
+  ogg_uint32_t  i;
+  ogg_uint32_t  sad = 0;
+
+  for (i=8; i; i--) {
+    sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
+    sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
+    sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]);
+    sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]);
+    sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]);
+    sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]);
+    sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]);
+    sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]);
+
+    if (sad > thres )
+      break;
+
+    /* Step to next row of block. */
+    ptr1 += stride1;
+    ptr2 += stride2;
+  }
+
+  return sad;
+}
+
+static ogg_uint32_t sad8x8_xy2_thres__c (unsigned char *SrcData, ogg_uint32_t SrcStride,
+		                      unsigned char *RefDataPtr1,
+			              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
+			              ogg_uint32_t thres)
+{
+  ogg_uint32_t  i;
+  ogg_uint32_t  sad = 0;
+
+  for (i=8; i; i--) {
+    sad += DSP_OP_ABS_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0]));
+    sad += DSP_OP_ABS_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1]));
+    sad += DSP_OP_ABS_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2]));
+    sad += DSP_OP_ABS_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3]));
+    sad += DSP_OP_ABS_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4]));
+    sad += DSP_OP_ABS_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5]));
+    sad += DSP_OP_ABS_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6]));
+    sad += DSP_OP_ABS_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7]));
+
+    if ( sad > thres )
+      break;
+
+    /* Step to next row of block. */
+    SrcData += SrcStride;
+    RefDataPtr1 += RefStride;
+    RefDataPtr2 += RefStride;
+  }
+
+  return sad;
+}
+
+static ogg_uint32_t intra8x8_err__c (unsigned char *DataPtr, ogg_uint32_t Stride)
+{
+  ogg_uint32_t  i;
+  ogg_uint32_t  XSum=0;
+  ogg_uint32_t  XXSum=0;
+
+  for (i=8; i; i--) {
+     /* Accumulate the sum and sum of squares of all 8 pixels in this row. */
+     XSum += DataPtr[0];
+     XXSum += DataPtr[0]*DataPtr[0];
+     XSum += DataPtr[1];
+     XXSum += DataPtr[1]*DataPtr[1];
+     XSum += DataPtr[2];
+     XXSum += DataPtr[2]*DataPtr[2];
+     XSum += DataPtr[3];
+     XXSum += DataPtr[3]*DataPtr[3];
+     XSum += DataPtr[4];
+     XXSum += DataPtr[4]*DataPtr[4];
+     XSum += DataPtr[5];
+     XXSum += DataPtr[5]*DataPtr[5];
+     XSum += DataPtr[6];
+     XXSum += DataPtr[6]*DataPtr[6];
+     XSum += DataPtr[7];
+     XXSum += DataPtr[7]*DataPtr[7];
+
+     /* Step to next row of block. */
+     DataPtr += Stride;
+   }
+
+   /* Compute population variance as mis-match metric. */
+   return (( (XXSum<<6) - XSum*XSum ) );
+}
+/* Inter mismatch metric: returns 64*sum(d*d) - sum(d)^2 where d = DSP_OP_DIFF(SrcData[k], RefDataPtr[k]) over the 8x8 block. */
+static ogg_uint32_t inter8x8_err__c (unsigned char *SrcData, ogg_uint32_t SrcStride,
+		                 unsigned char *RefDataPtr, ogg_uint32_t RefStride)
+{
+  ogg_uint32_t  i;
+  ogg_uint32_t  XSum=0;
+  ogg_uint32_t  XXSum=0;
+  ogg_int32_t   DiffVal;
+  /* DiffVal is signed while XSum is unsigned, so a negative running sum wraps; XSum*XSum is still correct modulo 2^32 (assuming ogg_uint32_t is 32 bits wide). */
+  for (i=8; i; i--) {
+    DiffVal = DSP_OP_DIFF (SrcData[0], RefDataPtr[0]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF (SrcData[1], RefDataPtr[1]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF (SrcData[2], RefDataPtr[2]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF (SrcData[3], RefDataPtr[3]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+        
+    DiffVal = DSP_OP_DIFF (SrcData[4], RefDataPtr[4]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+        
+    DiffVal = DSP_OP_DIFF (SrcData[5], RefDataPtr[5]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+        
+    DiffVal = DSP_OP_DIFF (SrcData[6], RefDataPtr[6]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+        
+    DiffVal = DSP_OP_DIFF (SrcData[7], RefDataPtr[7]);
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+        
+    /* Step to next row of block. */
+    SrcData += SrcStride;
+    RefDataPtr += RefStride;
+  }
+
+  /* Compute and return population variance of the differences, scaled by 64*64, as mis-match metric. */
+  return (( (XXSum<<6) - XSum*XSum ));
+}
+/* As inter8x8_err__c, but each source pixel is compared with DSP_OP_AVG of the matching pixels of two reference blocks that share RefStride. */
+static ogg_uint32_t inter8x8_err_xy2__c (unsigned char *SrcData, ogg_uint32_t SrcStride,
+		                     unsigned char *RefDataPtr1,
+				     unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
+{
+  ogg_uint32_t  i;
+  ogg_uint32_t  XSum=0;
+  ogg_uint32_t  XXSum=0;
+  ogg_int32_t   DiffVal;
+  /* DiffVal is signed while XSum is unsigned, so a negative running sum wraps; XSum*XSum is still correct modulo 2^32 (assuming ogg_uint32_t is 32 bits wide). */
+  for (i=8; i; i--) {
+    DiffVal = DSP_OP_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    DiffVal = DSP_OP_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7]));
+    XSum += DiffVal;
+    XXSum += DiffVal*DiffVal;
+
+    /* Step to next row of block. */
+    SrcData += SrcStride;
+    RefDataPtr1 += RefStride;
+    RefDataPtr2 += RefStride;
+  }
+
+  /* Compute and return population variance of the differences, scaled by 64*64, as mis-match metric. */
+  return (( (XXSum<<6) - XSum*XSum ));
+}
+/* Do-nothing callback; dsp_init installs it as both save_fpu and restore_fpu. */
+static void nop (void) { /* NOP */ }
+/* Point funcs at the __c implementations with no-op FPU hooks; copy8x8, recon_* and fdct_short are not assigned here. */
+void dsp_init(DspFunctions *funcs)
+{
+  funcs->save_fpu = nop;
+  funcs->restore_fpu = nop;
+  funcs->sub8x8 = sub8x8__c;
+  funcs->sub8x8_128 = sub8x8_128__c;
+  funcs->sub8x8avg2 = sub8x8avg2__c;
+  funcs->row_sad8 = row_sad8__c;
+  funcs->col_sad8x8 = col_sad8x8__c;
+  funcs->sad8x8 = sad8x8__c;
+  funcs->sad8x8_thres = sad8x8_thres__c;
+  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__c;
+  funcs->intra8x8_err = intra8x8_err__c;
+  funcs->inter8x8_err = inter8x8_err__c;
+  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__c;
+}
+/* Fill the global dsp_funcs table: generic initialisers first, then the MMX and MMXEXT initialisers when cpu_flags has the matching bit. */
+void dsp_static_init(void)
+{
+  cpu_init ();   /* presumably sets cpu_flags; defined outside this hunk -- confirm */
+  dsp_init (&dsp_funcs);   /* NOTE(review): the global table is rewritten on every call, with no locking visible here */
+  dsp_recon_init (&dsp_funcs);
+  dsp_dct_init (&dsp_funcs);
+  if (cpu_flags & CPU_X86_MMX) {
+    dsp_i386_mmx_init(&dsp_funcs);
+  }
+  if (cpu_flags & CPU_X86_MMXEXT) {   /* runs after the MMX initialiser, so it has the last word on any pointer both set */
+    dsp_i386_mmxext_init(&dsp_funcs);
+  }
+}
+
Index: lib/Makefile.am
===================================================================
--- lib/Makefile.am	(revision 7621)
+++ lib/Makefile.am	(working copy)
@@ -3,12 +3,13 @@
 lib_LTLIBRARIES = libtheora.la
 
 if THEORA_SUPPORT_ENCODE
-encoder_sources = dct_encode.c encode.c encoder_toplevel.c
+encoder_sources = dct_encode.c encode.c encoder_toplevel.c 
 else
 encoder_sources = encoder_disabled.c
 endif
 
 libtheora_la_SOURCES = \
+        cpu.c dsp.h dsp.c i386/dsp_mmx.c i386/dsp_mmxext.c i386/recon_mmx.c i386/fdct_mmx.c \
 	blockmap.c \
 	comment.c \
 	dct.c \
Index: lib/blockmap.c
===================================================================
--- lib/blockmap.c	(revision 7621)
+++ lib/blockmap.c	(working copy)
@@ -21,7 +21,7 @@
                             ogg_uint32_t FirstSB,
                             ogg_uint32_t FirstFrag, ogg_uint32_t HFrags,
                             ogg_uint32_t VFrags ){
-  ogg_uint32_t i, j;
+  ogg_uint32_t i, j = 0;
   ogg_uint32_t xpos;
   ogg_uint32_t ypos;
   ogg_uint32_t SBrow, SBcol;
Index: lib/encoder_toplevel.c
===================================================================
--- lib/encoder_toplevel.c	(revision 7621)
+++ lib/encoder_toplevel.c	(working copy)
@@ -777,6 +777,8 @@
 
   CP_INSTANCE *cpi;
 
+  dsp_static_init ();
+
   memset(th, 0, sizeof(*th));
   th->internal_encode=cpi=_ogg_calloc(1,sizeof(*cpi));
 
Index: lib/toplevel.c
===================================================================
--- lib/toplevel.c	(revision 7621)
+++ lib/toplevel.c	(working copy)
@@ -290,6 +290,8 @@
   PB_INSTANCE *pbi;
   codec_setup_info *ci;
 
+  dsp_static_init ();
+
   ci=(codec_setup_info *)c->codec_setup;
   th->internal_decode=pbi=_ogg_calloc(1,sizeof(*pbi));
 
Index: lib/dsp.h
===================================================================
--- lib/dsp.h	(revision 0)
+++ lib/dsp.h	(revision 0)
@@ -0,0 +1,154 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function: DSP function-pointer table and dispatch macros
+  last mod: $Id$
+
+ ********************************************************************/
+
+#ifndef DSP_H
+#define DSP_H
+
+#include <theora/theora.h>
+/* Table of function pointers for the 8x8 block primitives; callers reach the members through the dsp_* macros defined later in this header. */
+typedef struct
+{
+  void   (*save_fpu)            (void);
+  void   (*restore_fpu)         (void);
+
+  void   (*sub8x8)  		(unsigned char *FiltPtr, unsigned char *ReconPtr,
+	                   	 ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
+				 ogg_uint32_t ReconPixelsPerLine);
+
+  void   (*sub8x8_128) 		(unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
+			         ogg_uint32_t PixelsPerLine);
+
+  void   (*sub8x8avg2) 		(unsigned char *FiltPtr, unsigned char *ReconPtr1,
+		                 unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
+			         ogg_uint32_t PixelsPerLine,
+			         ogg_uint32_t ReconPixelsPerLine); 
+  /* copy8x8 through fdct_short are not assigned by dsp_init; presumably dsp_recon_init / dsp_dct_init fill them -- confirm. */
+  void   (*copy8x8)  		(unsigned char *src, unsigned char *dest, 
+		                 ogg_uint32_t stride);
+
+  void   (*recon_intra8x8)  	(unsigned char *ReconPtr, ogg_int16_t *ChangePtr, 
+		                 ogg_uint32_t LineStep);
+
+  void   (*recon_inter8x8)  	(unsigned char *ReconPtr, unsigned char *RefPtr, 
+		                 ogg_int16_t *ChangePtr, ogg_uint32_t LineStep);
+
+  void   (*recon_inter8x8_half)	(unsigned char *ReconPtr, unsigned char *RefPtr1, 
+		  		 unsigned char *RefPtr2, ogg_int16_t *ChangePtr, 
+				 ogg_uint32_t LineStep);
+
+  void   (*fdct_short)          (ogg_int16_t *InputData, ogg_int16_t *OutputData);
+  /* The remaining members all return an ogg_uint32_t and are assigned by dsp_init. */
+  ogg_uint32_t (*row_sad8)	(unsigned char *Src1, unsigned char *Src2);
+
+  ogg_uint32_t (*col_sad8x8)	(unsigned char *Src1, unsigned char *Src2,
+		  		 ogg_uint32_t stride);
+
+  ogg_uint32_t (*sad8x8)	(unsigned char *ptr1, ogg_uint32_t stride1,
+		        	 unsigned char *ptr2, ogg_uint32_t stride2);
+
+  ogg_uint32_t (*sad8x8_thres)	(unsigned char *ptr1, ogg_uint32_t stride1,
+		       		 unsigned char *ptr2, ogg_uint32_t stride2, 
+				 ogg_uint32_t thres);
+
+  ogg_uint32_t (*sad8x8_xy2_thres)(unsigned char *SrcData, ogg_uint32_t SrcStride,
+		                 unsigned char *RefDataPtr1,
+			         unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
+				 ogg_uint32_t thres);
+
+  ogg_uint32_t (*intra8x8_err)	(unsigned char *DataPtr, ogg_uint32_t Stride);
+
+  ogg_uint32_t (*inter8x8_err)	(unsigned char *SrcData, ogg_uint32_t SrcStride,
+		                 unsigned char *RefDataPtr, ogg_uint32_t RefStride);
+
+  ogg_uint32_t (*inter8x8_err_xy2)(unsigned char *SrcData, ogg_uint32_t SrcStride,
+		                 unsigned char *RefDataPtr1,
+			         unsigned char *RefDataPtr2, ogg_uint32_t RefStride);
+} DspFunctions;
+/* Global function table, filled in by dsp_static_init(); its definition lives outside this header. */
+extern DspFunctions dsp_funcs;
+
+extern void dsp_recon_init (DspFunctions *funcs);
+/* NOTE(review): dsp_dct_init and the dsp_i386_*_init functions called by dsp_static_init are not declared in this header. */
+void dsp_init(DspFunctions *funcs);
+void dsp_static_init(void);
+/* Dispatch macros: dsp_X(funcs,...) calls member X of a DspFunctions object (accessed with '.', so pass the struct, not a pointer); dsp_static_X(...) uses the global dsp_funcs. */
+#define dsp_save_fpu(funcs) (funcs.save_fpu ())
+#define dsp_static_save_fpu() dsp_save_fpu(dsp_funcs)
+
+#define dsp_restore_fpu(funcs) (funcs.restore_fpu ())
+#define dsp_static_restore_fpu() dsp_restore_fpu(dsp_funcs)
+
+#define dsp_sub8x8(funcs,a1,a2,a3,a4,a5) (funcs.sub8x8 (a1,a2,a3,a4,a5))
+#define dsp_static_sub8x8(a1,a2,a3,a4,a5) dsp_sub8x8(dsp_funcs,a1,a2,a3,a4,a5)
+
+#define dsp_sub8x8_128(funcs,a1,a2,a3) (funcs.sub8x8_128 (a1,a2,a3))
+#define dsp_static_sub8x8_128(a1,a2,a3) dsp_sub8x8_128(dsp_funcs,a1,a2,a3)
+
+#define dsp_sub8x8avg2(funcs,a1,a2,a3,a4,a5,a6) (funcs.sub8x8avg2 (a1,a2,a3,a4,a5,a6))
+#define dsp_static_sub8x8avg2(a1,a2,a3,a4,a5,a6) dsp_sub8x8avg2(dsp_funcs,a1,a2,a3,a4,a5,a6)
+
+#define dsp_copy8x8(funcs,ptr1,ptr2,str1) (funcs.copy8x8 (ptr1,ptr2,str1))
+#define dsp_static_copy8x8(ptr1,ptr2,str1) dsp_copy8x8(dsp_funcs,ptr1,ptr2,str1)
+
+#define dsp_recon_intra8x8(funcs,ptr1,ptr2,str1) (funcs.recon_intra8x8 (ptr1,ptr2,str1))
+#define dsp_static_recon_intra8x8(ptr1,ptr2,str1) dsp_recon_intra8x8(dsp_funcs,ptr1,ptr2,str1)
+
+#define dsp_recon_inter8x8(funcs,ptr1,ptr2,ptr3,str1) \
+	(funcs.recon_inter8x8 (ptr1,ptr2,ptr3,str1))
+#define dsp_static_recon_inter8x8(ptr1,ptr2,ptr3,str1) \
+	dsp_recon_inter8x8(dsp_funcs,ptr1,ptr2,ptr3,str1)
+
+#define dsp_recon_inter8x8_half(funcs,ptr1,ptr2,ptr3,ptr4,str1) \
+	(funcs.recon_inter8x8_half (ptr1,ptr2,ptr3,ptr4,str1))
+#define dsp_static_recon_inter8x8_half(ptr1,ptr2,ptr3,ptr4,str1) \
+	dsp_recon_inter8x8_half(dsp_funcs,ptr1,ptr2,ptr3,ptr4,str1)
+
+#define dsp_fdct_short(funcs,in,out) (funcs.fdct_short (in,out))
+#define dsp_static_fdct_short(in,out) dsp_fdct_short(dsp_funcs,in,out)
+
+#define dsp_row_sad8(funcs,ptr1,ptr2) (funcs.row_sad8 (ptr1,ptr2))
+#define dsp_static_row_sad8(ptr1,ptr2) dsp_row_sad8(dsp_funcs,ptr1,ptr2)
+
+#define dsp_col_sad8x8(funcs,ptr1,ptr2,str1) (funcs.col_sad8x8 (ptr1,ptr2,str1))
+#define dsp_static_col_sad8x8(ptr1,ptr2,str1) dsp_col_sad8x8(dsp_funcs,ptr1,ptr2,str1)
+
+#define dsp_sad8x8(funcs,ptr1,str1,ptr2,str2) (funcs.sad8x8 (ptr1,str1,ptr2,str2))
+#define dsp_static_sad8x8(ptr1,str1,ptr2,str2) dsp_sad8x8(dsp_funcs,ptr1,str1,ptr2,str2)
+
+#define dsp_sad8x8_thres(funcs,ptr1,str1,ptr2,str2,t) (funcs.sad8x8_thres (ptr1,str1,ptr2,str2,t))
+#define dsp_static_sad8x8_thres(ptr1,str1,ptr2,str2,t) dsp_sad8x8_thres(dsp_funcs,ptr1,str1,ptr2,str2,t)
+
+#define dsp_sad8x8_xy2_thres(funcs,ptr1,str1,ptr2,ptr3,str2,t) \
+	(funcs.sad8x8_xy2_thres (ptr1,str1,ptr2,ptr3,str2,t))
+#define dsp_static_sad8x8_xy2_thres(ptr1,str1,ptr2,ptr3,str2,t) \
+	dsp_sad8x8_xy2_thres(dsp_funcs,ptr1,str1,ptr2,ptr3,str2,t)
+
+#define dsp_intra8x8_err(funcs,ptr1,str1) (funcs.intra8x8_err (ptr1,str1))
+#define dsp_static_intra8x8_err(ptr1,str1) dsp_intra8x8_err(dsp_funcs,ptr1,str1)
+
+#define dsp_inter8x8_err(funcs,ptr1,str1,ptr2,str2) \
+	(funcs.inter8x8_err (ptr1,str1,ptr2,str2))
+#define dsp_static_inter8x8_err(ptr1,str1,ptr2,str2) \
+	dsp_inter8x8_err(dsp_funcs,ptr1,str1,ptr2,str2)
+
+#define dsp_inter8x8_err_xy2(funcs,ptr1,str1,ptr2,ptr3,str2) \
+	(funcs.inter8x8_err_xy2 (ptr1,str1,ptr2,ptr3,str2))
+#define dsp_static_inter8x8_err_xy2(ptr1,str1,ptr2,ptr3,str2) \
+	dsp_inter8x8_err_xy2(dsp_funcs,ptr1,str1,ptr2,ptr3,str2)
+
+
+#endif /* DSP_H */
Index: lib/encode.c
===================================================================
--- lib/encode.c	(revision 7621)
+++ lib/encode.c	(working copy)
@@ -531,8 +531,7 @@
 
 static ogg_uint32_t GetBlockReconErrorSlow( CP_INSTANCE *cpi,
                                      ogg_int32_t BlockIndex ) {
-  ogg_uint32_t  i;
-  ogg_uint32_t  ErrorVal = 0;
+  ogg_uint32_t  ErrorVal;
 
   unsigned char * SrcDataPtr =
     &cpi->ConvDestBuffer[cpi->pb.pixel_index_table[BlockIndex]];
@@ -550,21 +549,8 @@
     RecStride = cpi->pb.UVStride;
   }
 
+  ErrorVal = dsp_static_sad8x8 (SrcDataPtr, SrcStride, RecDataPtr, RecStride);
 
-  /* Decide on standard or MMX implementation */
-  for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
-    ErrorVal += abs( ((int)SrcDataPtr[0]) - ((int)RecDataPtr[0]) );
-    ErrorVal += abs( ((int)SrcDataPtr[1]) - ((int)RecDataPtr[1]) );
-    ErrorVal += abs( ((int)SrcDataPtr[2]) - ((int)RecDataPtr[2]) );
-    ErrorVal += abs( ((int)SrcDataPtr[3]) - ((int)RecDataPtr[3]) );
-    ErrorVal += abs( ((int)SrcDataPtr[4]) - ((int)RecDataPtr[4]) );
-    ErrorVal += abs( ((int)SrcDataPtr[5]) - ((int)RecDataPtr[5]) );
-    ErrorVal += abs( ((int)SrcDataPtr[6]) - ((int)RecDataPtr[6]) );
-    ErrorVal += abs( ((int)SrcDataPtr[7]) - ((int)RecDataPtr[7]) );
-    /* Step to next row of block. */
-    SrcDataPtr += SrcStride;
-    RecDataPtr += RecStride;
-  }
   return ErrorVal;
 }
 
@@ -933,9 +919,13 @@
     /* Zero Decoder EOB run count */
     cpi->pb.EOB_Run = 0;
 
+    dsp_static_save_fpu ();
+
     /* Encode any fragments coded using DCT. */
     coded_pixels += QuadCodeDisplayFragments (cpi);
 
+    dsp_static_restore_fpu ();
+
     return coded_pixels;
 
 }