[xiph-commits] r11427 - in trunk/theora: . examples lib lib/x86_32 lib/x86_64

giles at svn.xiph.org giles at svn.xiph.org
Fri May 26 11:51:14 PDT 2006


Author: giles
Date: 2006-05-26 11:51:09 -0700 (Fri, 26 May 2006)
New Revision: 11427

Added:
   trunk/theora/lib/cpu.c
   trunk/theora/lib/cpu.h
   trunk/theora/lib/dsp.c
   trunk/theora/lib/dsp.h
   trunk/theora/lib/x86_32/
   trunk/theora/lib/x86_32/dsp_mmx.c
   trunk/theora/lib/x86_32/dsp_mmxext.c
   trunk/theora/lib/x86_32/fdct_mmx.c
   trunk/theora/lib/x86_32/recon_mmx.c
   trunk/theora/lib/x86_64/
   trunk/theora/lib/x86_64/dsp_mmx.c
   trunk/theora/lib/x86_64/dsp_mmxext.c
   trunk/theora/lib/x86_64/fdct_mmx.c
   trunk/theora/lib/x86_64/recon_mmx.c
Removed:
   trunk/theora/lib/x86_32/dsp_mmx.c
   trunk/theora/lib/x86_32/dsp_mmxext.c
   trunk/theora/lib/x86_32/fdct_mmx.c
   trunk/theora/lib/x86_32/recon_mmx.c
   trunk/theora/lib/x86_64/dsp_mmx.c
   trunk/theora/lib/x86_64/dsp_mmxext.c
   trunk/theora/lib/x86_64/fdct_mmx.c
   trunk/theora/lib/x86_64/recon_mmx.c
Modified:
   trunk/theora/configure.ac
   trunk/theora/examples/Makefile.am
   trunk/theora/lib/Makefile.am
   trunk/theora/lib/codec_internal.h
   trunk/theora/lib/dct.c
   trunk/theora/lib/dct_decode.c
   trunk/theora/lib/dct_encode.c
   trunk/theora/lib/decode.c
   trunk/theora/lib/encode.c
   trunk/theora/lib/encoder_toplevel.c
   trunk/theora/lib/idct.c
   trunk/theora/lib/mcomp.c
   trunk/theora/lib/pp.c
   trunk/theora/lib/reconstruct.c
   trunk/theora/lib/scan.c
   trunk/theora/lib/toplevel.c
Log:
Merge theora-mmx branch work. We now use some SIMD assembly acceleration 
by default on x86 and x86_64 architectures.


Modified: trunk/theora/configure.ac
===================================================================
--- trunk/theora/configure.ac	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/configure.ac	2006-05-26 18:51:09 UTC (rev 11427)
@@ -95,13 +95,29 @@
 
         case $host in 
         *)
-                DEBUG="-g -W -D__NO_MATH_INLINES"
-                CFLAGS="-g -O2 -Wall"
-                PROFILE="-W -pg -g -O2 -fno-inline-functions";;
+                DEBUG="-g -Wall -D__NO_MATH_INLINES"
+                CFLAGS="-Wall -O3 -fforce-addr -fomit-frame-pointer -finline-functions -funroll-loops"
+                PROFILE="-Wall -pg -g -O3 -fno-inline-functions";;
         esac
 fi
 CFLAGS="$CFLAGS $cflags_save"
 
+cpu_optimization="no optimization for your platform, please send a patch"
+cpu_x86_64=no
+cpu_x86_32=no
+case $target_cpu in
+	i[[3456]]86)
+		cpu_x86_32=yes 
+		cpu_optimization="32bit x86"
+    	;;
+	x86_64)
+		cpu_x86_64=yes
+		cpu_optimization="64bit x86"
+	;;
+esac
+AM_CONDITIONAL([CPU_x86_64], [test x$cpu_x86_64 = xyes])
+AM_CONDITIONAL([CPU_x86_32], [test x$cpu_x86_32 = xyes])
+
 # Test whenever ld supports -version-script
 AC_PROG_LD
 AC_PROG_LD_GNU
@@ -288,8 +304,9 @@
 
   General configuration:
 
-    Encoding support: ............ ${ac_enable_encode}
-    Floating point support: ...... ${ac_enable_float}
+    Encoding support: ........... ${ac_enable_encode}
+    Floating point support: ..... ${ac_enable_float}
+    Assembly optimization: ...... ${cpu_optimization}
 
   Installation paths:
 

Modified: trunk/theora/examples/Makefile.am
===================================================================
--- trunk/theora/examples/Makefile.am	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/examples/Makefile.am	2006-05-26 18:51:09 UTC (rev 11427)
@@ -26,8 +26,8 @@
 encoder_example_DEPENDENCIES = $(GETOPT_OBJS)
 
 debug:
-	$(MAKE) all CFLAGS="@DEBUG@ $(CFLAGS)"
+	$(MAKE) all CFLAGS="@DEBUG@"
 
 profile:
-	$(MAKE) all CFLAGS="@PROFILE@ $(CFLAGS)"
+	$(MAKE) all CFLAGS="@PROFILE@"
 

Modified: trunk/theora/lib/Makefile.am
===================================================================
--- trunk/theora/lib/Makefile.am	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/Makefile.am	2006-05-26 18:51:09 UTC (rev 11427)
@@ -1,6 +1,14 @@
 INCLUDES = -I$(top_srcdir)/include
 
-EXTRA_DIST = Version_script.in
+EXTRA_DIST = Version_script.in \
+	x86_32/dsp_mmx.c \
+	x86_32/dsp_mmxext.c \
+	x86_32/recon_mmx.c \
+	x86_32/fdct_mmx.c \
+	x86_64/dsp_mmx.c \
+	x86_64/dsp_mmxext.c \
+	x86_64/recon_mmx.c \
+	x86_64/fdct_mmx.c
 
 lib_LTLIBRARIES = libtheora.la
 
@@ -10,6 +18,25 @@
 encoder_sources = dct_encode.c encode.c encoder_toplevel.c
 endif
 
+if CPU_x86_64
+arch_dir = x86_64
+arch_sources= \
+	$(arch_dir)/dsp_mmx.c \
+	$(arch_dir)/dsp_mmxext.c \
+	$(arch_dir)/recon_mmx.c \
+	$(arch_dir)/fdct_mmx.c
+else
+if CPU_x86_32
+arch_dir = x86_32
+arch_sources= \
+	$(arch_dir)/dsp_mmx.c \
+	$(arch_dir)/dsp_mmxext.c \
+	$(arch_dir)/recon_mmx.c \
+	$(arch_dir)/fdct_mmx.c
+endif
+endif
+
+
 libtheora_la_SOURCES = \
 	blockmap.c \
 	comment.c \
@@ -28,6 +55,9 @@
 	reconstruct.c \
 	scan.c \
 	toplevel.c \
+	cpu.c \
+	dsp.c \
+  $(arch_sources) \
 	$(encoder_sources)
 
 noinst_HEADERS = \
@@ -39,7 +69,9 @@
 	pp.h \
 	quant_lookup.h \
 	toplevel.h \
-	toplevel_lookup.h
+	toplevel_lookup.h \
+	cpu.h \
+	dsp.h
 
 libtheora_la_CFLAGS = $(OGG_CFLAGS)
 libtheora_la_LDFLAGS = -version-info @V_LIB_CURRENT@:@V_LIB_REVISION@:@V_LIB_AGE@ @SHLIB_VERSION_ARG@

Modified: trunk/theora/lib/codec_internal.h
===================================================================
--- trunk/theora/lib/codec_internal.h	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/codec_internal.h	2006-05-26 18:51:09 UTC (rev 11427)
@@ -24,6 +24,7 @@
 
 #include "theora/theora.h"
 #include "huffman.h"
+#include "dsp.h"
 
 #ifndef LIBOGG2
 #define theora_read(x,y,z) ( *z = oggpackB_read(x,y) )
@@ -227,6 +228,8 @@
   ogg_int32_t ChLocalsCircularBufferSize;
   ogg_int32_t PixelMapCircularBufferSize;
 
+  DspFunctions dsp;  /* Selected functions for this platform */
+
 } PP_INSTANCE;
 
 /** block coding modes */
@@ -492,6 +495,8 @@
 
   unsigned char *DataOutputInPtr;
 
+  DspFunctions   dsp;  /* Selected functions for this platform */
+
 } PB_INSTANCE;
 
 /* Encoder (Compressor) instance -- installed in a theora_state */
@@ -678,6 +683,8 @@
   int               packetflag;
   int               doneflag;
 
+  DspFunctions   dsp;  /* Selected functions for this platform */
+
 } CP_INSTANCE;
 
 #define clamp255(x) ((unsigned char)((((x)<0)-1) & ((x) | -((x)>255))))
@@ -687,7 +694,7 @@
                                      ogg_uint32_t * KFIndicator );
 
 extern void ClearPPInstance(PP_INSTANCE *ppi);
-extern void InitPPInstance(PP_INSTANCE *ppi);
+extern void InitPPInstance(PP_INSTANCE *ppi, DspFunctions *funcs);
 extern int GetFrameType(PB_INSTANCE *pbi);
 extern void InitPBInstance(PB_INSTANCE *pbi);
 extern void ClearPBInstance(PB_INSTANCE *pbi);

Copied: trunk/theora/lib/cpu.c (from rev 11426, branches/theora-mmx/lib/cpu.c)

Copied: trunk/theora/lib/cpu.h (from rev 11426, branches/theora-mmx/lib/cpu.h)

Modified: trunk/theora/lib/dct.c
===================================================================
--- trunk/theora/lib/dct.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/dct.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -16,6 +16,8 @@
  ********************************************************************/
 
 #include "codec_internal.h"
+#include "dsp.h"
+#include "cpu.h"
 
 static ogg_int32_t xC1S7 = 64277;
 static ogg_int32_t xC2S6 = 60547;
@@ -28,7 +30,7 @@
 #define SIGNBITDUPPED(X) ((signed )(((X) & 0x80000000)) >> 31)
 #define DOROUND(X) ( (SIGNBITDUPPED(X) & (0xffff)) + (X) )
 
-void fdct_short ( ogg_int16_t * InputData, ogg_int16_t * OutputData ){
+static void fdct_short__c ( ogg_int16_t * InputData, ogg_int16_t * OutputData ){
   int loop;
 
   ogg_int32_t  is07, is12, is34, is56;
@@ -251,3 +253,14 @@
     op ++;
   }
 }
+
+void dsp_dct_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
+{  /* Fill in funcs->fdct_short, the forward DCT entry of the dispatch table. */
+  funcs->fdct_short = fdct_short__c;  /* the C version is always installed first */
+#if (defined(__i386__) || defined(__x86_64__))
+  if (cpu_flags & CPU_X86_MMX) {  /* taken only when cpu_flags has the CPU_X86_MMX bit set */
+    dsp_mmx_fdct_init(funcs);  /* presumably replaces fdct_short with an MMX routine; defined outside this diff - confirm */
+  }
+#endif
+}
+

Modified: trunk/theora/lib/dct_decode.c
===================================================================
--- trunk/theora/lib/dct_decode.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/dct_decode.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -18,6 +18,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "codec_internal.h"
+#include "dsp.h"
 
 
 #define GOLDEN_FRAME_THRESH_Q   50
@@ -112,22 +113,6 @@
   SetupBoundingValueArray_Generic(pbi, FLimit);
 }
 
-void CopyBlock(unsigned char *src,
-               unsigned char *dest,
-               unsigned int srcstride){
-  unsigned char *s = src;
-  unsigned char *d = dest;
-  unsigned int stride = srcstride;
-
-  int j;
-  for ( j = 0; j < 8; j++ ){
-    ((ogg_uint32_t*)d)[0] = ((ogg_uint32_t*)s)[0];
-    ((ogg_uint32_t*)d)[1] = ((ogg_uint32_t*)s)[1];
-    s+=stride;
-    d+=stride;
-  }
-}
-
 static void ExpandKFBlock ( PB_INSTANCE *pbi, ogg_int32_t FragmentNumber ){
   ogg_uint32_t ReconPixelsPerLine;
   ogg_int32_t     ReconPixelIndex;
@@ -163,8 +148,8 @@
   ReconPixelIndex = pbi->recon_pixel_index_table[FragmentNumber];
 
   /* Get the pixel index for the first pixel in the fragment. */
-  ReconIntra( pbi, (unsigned char *)(&pbi->ThisFrameRecon[ReconPixelIndex]),
-              (ogg_int16_t *)pbi->ReconDataBuffer, ReconPixelsPerLine );
+  dsp_recon_intra8x8 (pbi->dsp, (unsigned char *)(&pbi->ThisFrameRecon[ReconPixelIndex]),
+              (ogg_int16_t *)pbi->ReconDataBuffer, ReconPixelsPerLine);
 
 }
 
@@ -248,10 +233,9 @@
     /* Reconstruct the pixel data using the last frame reconstruction
        and change data when the motion vector is (0,0), the recon is
        based on the lastframe without loop filtering---- for testing */
-    ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
+    dsp_recon_inter8x8 (pbi->dsp, &pbi->ThisFrameRecon[ReconPixelIndex],
                 &pbi->LastFrameRecon[ReconPixelIndex],
-                pbi->ReconDataBuffer, ReconPixelsPerLine );
-
+                  pbi->ReconDataBuffer, ReconPixelsPerLine);
   }else if ( ModeUsesMC[pbi->CodingMode] ) {
     /* The mode uses a motion vector. */
     /* Get vector from list */
@@ -298,29 +282,30 @@
     if ( (int)(LastFrameRecPtr - LastFrameRecPtr2) == 0 ) {
       /* Reconstruct the pixel dats from the reference frame and change data
          (no half pixel in this case as the two references were the same. */
-      ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
+      dsp_recon_inter8x8 (pbi->dsp,
+		  &pbi->ThisFrameRecon[ReconPixelIndex],
                   LastFrameRecPtr, pbi->ReconDataBuffer,
-                  ReconPixelsPerLine );
+                  ReconPixelsPerLine);
     }else{
       /* Fractional pixel reconstruction. */
       /* Note that we only use two pixels per reconstruction even for
          the diagonal. */
-      ReconInterHalfPixel2( pbi,&pbi->ThisFrameRecon[ReconPixelIndex],
+      dsp_recon_inter8x8_half(pbi->dsp, &pbi->ThisFrameRecon[ReconPixelIndex],
                             LastFrameRecPtr, LastFrameRecPtr2,
-                            pbi->ReconDataBuffer, ReconPixelsPerLine );
+                            pbi->ReconDataBuffer, ReconPixelsPerLine);
     }
   } else if ( pbi->CodingMode == CODE_USING_GOLDEN ){
     /* Golden frame with motion vector */
     /* Reconstruct the pixel data using the golden frame
        reconstruction and change data */
-    ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
+    dsp_recon_inter8x8 (pbi->dsp, &pbi->ThisFrameRecon[ReconPixelIndex],
                 &pbi->GoldenFrame[ ReconPixelIndex ],
-                pbi->ReconDataBuffer, ReconPixelsPerLine );
+                  pbi->ReconDataBuffer, ReconPixelsPerLine);
   } else {
     /* Simple Intra coding */
     /* Get the pixel index for the first pixel in the fragment. */
-    ReconIntra( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
-                pbi->ReconDataBuffer, ReconPixelsPerLine );
+    dsp_recon_intra8x8 (pbi->dsp, &pbi->ThisFrameRecon[ReconPixelIndex],
+              pbi->ReconDataBuffer, ReconPixelsPerLine);
   }
 }
 
@@ -475,7 +460,7 @@
       SrcPtr = &SrcReconPtr[ PixelIndex ];
       DestPtr = &DestReconPtr[ PixelIndex ];
 
-      CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
+      dsp_copy8x8 (pbi->dsp, SrcPtr, DestPtr, PlaneLineStep);
     }
   }
 
@@ -487,7 +472,7 @@
       SrcPtr = &SrcReconPtr[ PixelIndex ];
       DestPtr = &DestReconPtr[ PixelIndex ];
 
-      CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
+      dsp_copy8x8 (pbi->dsp, SrcPtr, DestPtr, PlaneLineStep);
 
     }
   }
@@ -512,7 +497,7 @@
       SrcPtr = &SrcReconPtr[ PixelIndex ];
       DestPtr = &DestReconPtr[ PixelIndex ];
 
-      CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
+      dsp_copy8x8 (pbi->dsp, SrcPtr, DestPtr, PlaneLineStep);
     }
   }
 
@@ -524,7 +509,7 @@
       SrcPtr = &SrcReconPtr[ PixelIndex ];
       DestPtr = &DestReconPtr[ PixelIndex ];
 
-      CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
+      dsp_copy8x8 (pbi->dsp, SrcPtr, DestPtr, PlaneLineStep);
 
     }
   }

Modified: trunk/theora/lib/dct_encode.c
===================================================================
--- trunk/theora/lib/dct_encode.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/dct_encode.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -17,110 +17,10 @@
 
 #include <stdlib.h>
 #include "codec_internal.h"
+#include "dsp.h"
 
 static int ModeUsesMC[MAX_MODES] = { 0, 0, 1, 1, 1, 0, 1, 1 };
 
-static void Sub8 (unsigned char *FiltPtr, unsigned char *ReconPtr,
-                  ogg_int16_t *DctInputPtr, unsigned char *old_ptr1,
-                  unsigned char *new_ptr1, ogg_uint32_t PixelsPerLine,
-                  ogg_uint32_t ReconPixelsPerLine ) {
-  int i;
-
-  /* For each block row */
-  for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ){
-    DctInputPtr[0] = (ogg_int16_t)((int)(FiltPtr[0]) - ((int)ReconPtr[0]) );
-    DctInputPtr[1] = (ogg_int16_t)((int)(FiltPtr[1]) - ((int)ReconPtr[1]) );
-    DctInputPtr[2] = (ogg_int16_t)((int)(FiltPtr[2]) - ((int)ReconPtr[2]) );
-    DctInputPtr[3] = (ogg_int16_t)((int)(FiltPtr[3]) - ((int)ReconPtr[3]) );
-    DctInputPtr[4] = (ogg_int16_t)((int)(FiltPtr[4]) - ((int)ReconPtr[4]) );
-    DctInputPtr[5] = (ogg_int16_t)((int)(FiltPtr[5]) - ((int)ReconPtr[5]) );
-    DctInputPtr[6] = (ogg_int16_t)((int)(FiltPtr[6]) - ((int)ReconPtr[6]) );
-    DctInputPtr[7] = (ogg_int16_t)((int)(FiltPtr[7]) - ((int)ReconPtr[7]) );
-
-    /* Update the screen canvas in one step*/
-    ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0];
-    ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1];
-
-    /* Start next row */
-    new_ptr1 += PixelsPerLine;
-    old_ptr1 += PixelsPerLine;
-    FiltPtr += PixelsPerLine;
-    ReconPtr += ReconPixelsPerLine;
-    DctInputPtr += BLOCK_HEIGHT_WIDTH;
-  }
-}
-
-static void Sub8_128 (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
-                      unsigned char *old_ptr1, unsigned char *new_ptr1,
-                      ogg_uint32_t PixelsPerLine ) {
-  int i;
-  /* For each block row */
-  for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ){
-    /* INTRA mode so code raw image data */
-    /* We convert the data to 8 bit signed (by subtracting 128) as
-       this reduces the internal precision requirments in the DCT
-       transform. */
-    DctInputPtr[0] = (ogg_int16_t)((int)(FiltPtr[0]) - 128);
-    DctInputPtr[1] = (ogg_int16_t)((int)(FiltPtr[1]) - 128);
-    DctInputPtr[2] = (ogg_int16_t)((int)(FiltPtr[2]) - 128);
-    DctInputPtr[3] = (ogg_int16_t)((int)(FiltPtr[3]) - 128);
-    DctInputPtr[4] = (ogg_int16_t)((int)(FiltPtr[4]) - 128);
-    DctInputPtr[5] = (ogg_int16_t)((int)(FiltPtr[5]) - 128);
-    DctInputPtr[6] = (ogg_int16_t)((int)(FiltPtr[6]) - 128);
-    DctInputPtr[7] = (ogg_int16_t)((int)(FiltPtr[7]) - 128);
-
-    /* Update the screen canvas in one step */
-    ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0];
-    ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1];
-
-    /* Start next row */
-    new_ptr1 += PixelsPerLine;
-    old_ptr1 += PixelsPerLine;
-    FiltPtr += PixelsPerLine;
-    DctInputPtr += BLOCK_HEIGHT_WIDTH;
-  }
-}
-
-static void Sub8Av2 (unsigned char *FiltPtr, unsigned char *ReconPtr1,
-                     unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
-                     unsigned char *old_ptr1, unsigned char *new_ptr1,
-                     ogg_uint32_t PixelsPerLine,
-                     ogg_uint32_t ReconPixelsPerLine ) {
-  int i;
-
-  /* For each block row */
-  for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
-    DctInputPtr[0] = (ogg_int16_t)
-      ((int)(FiltPtr[0]) - (((int)ReconPtr1[0] + (int)ReconPtr2[0]) / 2) );
-    DctInputPtr[1] = (ogg_int16_t)
-      ((int)(FiltPtr[1]) - (((int)ReconPtr1[1] + (int)ReconPtr2[1]) / 2) );
-    DctInputPtr[2] = (ogg_int16_t)
-      ((int)(FiltPtr[2]) - (((int)ReconPtr1[2] + (int)ReconPtr2[2]) / 2) );
-    DctInputPtr[3] = (ogg_int16_t)
-      ((int)(FiltPtr[3]) - (((int)ReconPtr1[3] + (int)ReconPtr2[3]) / 2) );
-    DctInputPtr[4] = (ogg_int16_t)
-      ((int)(FiltPtr[4]) - (((int)ReconPtr1[4] + (int)ReconPtr2[4]) / 2) );
-    DctInputPtr[5] = (ogg_int16_t)
-      ((int)(FiltPtr[5]) - (((int)ReconPtr1[5] + (int)ReconPtr2[5]) / 2) );
-    DctInputPtr[6] = (ogg_int16_t)
-      ((int)(FiltPtr[6]) - (((int)ReconPtr1[6] + (int)ReconPtr2[6]) / 2) );
-    DctInputPtr[7] = (ogg_int16_t)
-      ((int)(FiltPtr[7]) - (((int)ReconPtr1[7] + (int)ReconPtr2[7]) / 2) );
-
-    /* Update the screen canvas in one step */
-    ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0];
-    ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1];
-
-    /* Start next row */
-    new_ptr1 += PixelsPerLine;
-    old_ptr1 += PixelsPerLine;
-    FiltPtr += PixelsPerLine;
-    ReconPtr1 += ReconPixelsPerLine;
-    ReconPtr2 += ReconPixelsPerLine;
-    DctInputPtr += BLOCK_HEIGHT_WIDTH;
-  }
-}
-
 static unsigned char TokenizeDctValue (ogg_int16_t DataValue,
                                        ogg_uint32_t * TokenListPtr ){
   unsigned char tokens_added = 0;
@@ -452,13 +352,15 @@
 
   /* Is the MV offset exactly pixel alligned */
   if ( AbsRefOffset == 0 ){
-    Sub8( FiltPtr, ReconPtr1, DctInputPtr, old_ptr1, new_ptr1,
-               PixelsPerLine, ReconPixelsPerLine );
+    dsp_sub8x8(cpi->dsp, FiltPtr, ReconPtr1, DctInputPtr,
+               PixelsPerLine, ReconPixelsPerLine);
+    dsp_copy8x8 (cpi->dsp, new_ptr1, old_ptr1, PixelsPerLine);
   } else {
     /* Fractional pixel MVs. */
     /* Note that we only use two pixel values even for the diagonal */
-    Sub8Av2(FiltPtr, ReconPtr1,ReconPtr2,DctInputPtr, old_ptr1,
-                 new_ptr1, PixelsPerLine, ReconPixelsPerLine );
+    dsp_sub8x8avg2(cpi->dsp, FiltPtr, ReconPtr1,ReconPtr2,DctInputPtr,
+                 PixelsPerLine, ReconPixelsPerLine);
+    dsp_copy8x8 (cpi->dsp, new_ptr1, old_ptr1, PixelsPerLine);
   }
 }
 
@@ -534,17 +436,18 @@
         pb.GoldenFrame[cpi->pb.recon_pixel_index_table[FragIndex]];
     }
 
-    Sub8( FiltPtr, ReconPtr1, DctInputPtr, old_ptr1, new_ptr1,
-               PixelsPerLine, ReconPixelsPerLine );
+    dsp_sub8x8(cpi->dsp, FiltPtr, ReconPtr1, DctInputPtr,
+               PixelsPerLine, ReconPixelsPerLine);
+    dsp_copy8x8 (cpi->dsp, new_ptr1, old_ptr1, PixelsPerLine);
   } else if ( cpi->pb.CodingMode==CODE_INTRA ) {
-    Sub8_128(FiltPtr, DctInputPtr, old_ptr1, new_ptr1, PixelsPerLine);
-
+    dsp_sub8x8_128(cpi->dsp, FiltPtr, DctInputPtr, PixelsPerLine);
+    dsp_copy8x8 (cpi->dsp, new_ptr1, old_ptr1, PixelsPerLine);
   }
 
   /* Proceed to encode the data into the encode buffer if the encoder
      is enabled. */
   /* Perform a 2D DCT transform on the data. */
-  fdct_short( cpi->DCTDataBuffer, cpi->DCT_codes );
+  dsp_fdct_short(cpi->dsp, cpi->DCTDataBuffer, cpi->DCT_codes );
 
   /* Quantize that transform data. */
   quantize ( &cpi->pb, cpi->DCT_codes, cpi->pb.QFragData[FragIndex] );

Modified: trunk/theora/lib/decode.c
===================================================================
--- trunk/theora/lib/decode.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/decode.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -865,7 +865,9 @@
   if (pbi->DecoderErrorCode) return;
 
   /* Reconstruct and display the frame */
+  dsp_save_fpu (pbi->dsp);
   ReconRefFrames(pbi);
+  dsp_restore_fpu (pbi->dsp);
 
 }
 

Copied: trunk/theora/lib/dsp.c (from rev 11426, branches/theora-mmx/lib/dsp.c)

Copied: trunk/theora/lib/dsp.h (from rev 11426, branches/theora-mmx/lib/dsp.h)

Modified: trunk/theora/lib/encode.c
===================================================================
--- trunk/theora/lib/encode.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/encode.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -531,8 +531,7 @@
 
 static ogg_uint32_t GetBlockReconErrorSlow( CP_INSTANCE *cpi,
                                      ogg_int32_t BlockIndex ) {
-  ogg_uint32_t  i;
-  ogg_uint32_t  ErrorVal = 0;
+  ogg_uint32_t  ErrorVal;
 
   unsigned char * SrcDataPtr =
     &cpi->ConvDestBuffer[cpi->pb.pixel_index_table[BlockIndex]];
@@ -550,21 +549,8 @@
     RecStride = cpi->pb.UVStride;
   }
 
+  ErrorVal = dsp_sad8x8 (cpi->dsp, SrcDataPtr, SrcStride, RecDataPtr, RecStride);
 
-  /* Decide on standard or MMX implementation */
-  for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
-    ErrorVal += abs( ((int)SrcDataPtr[0]) - ((int)RecDataPtr[0]) );
-    ErrorVal += abs( ((int)SrcDataPtr[1]) - ((int)RecDataPtr[1]) );
-    ErrorVal += abs( ((int)SrcDataPtr[2]) - ((int)RecDataPtr[2]) );
-    ErrorVal += abs( ((int)SrcDataPtr[3]) - ((int)RecDataPtr[3]) );
-    ErrorVal += abs( ((int)SrcDataPtr[4]) - ((int)RecDataPtr[4]) );
-    ErrorVal += abs( ((int)SrcDataPtr[5]) - ((int)RecDataPtr[5]) );
-    ErrorVal += abs( ((int)SrcDataPtr[6]) - ((int)RecDataPtr[6]) );
-    ErrorVal += abs( ((int)SrcDataPtr[7]) - ((int)RecDataPtr[7]) );
-    /* Step to next row of block. */
-    SrcDataPtr += SrcStride;
-    RecDataPtr += RecStride;
-  }
   return ErrorVal;
 }
 
@@ -933,9 +919,13 @@
     /* Zero Decoder EOB run count */
     cpi->pb.EOB_Run = 0;
 
+    dsp_save_fpu (cpi->dsp);
+
     /* Encode any fragments coded using DCT. */
     coded_pixels += QuadCodeDisplayFragments (cpi);
 
+    dsp_restore_fpu (cpi->dsp);
+
     return coded_pixels;
 
 }

Modified: trunk/theora/lib/encoder_toplevel.c
===================================================================
--- trunk/theora/lib/encoder_toplevel.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/encoder_toplevel.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -23,6 +23,7 @@
 #include <string.h>
 #include "toplevel_lookup.h"
 #include "toplevel.h"
+#include "dsp.h"
 
 #define A_TABLE_SIZE        29
 #define DF_CANDIDATE_WINDOW 5
@@ -778,12 +779,15 @@
   if(c->pixelformat!=OC_PF_420)return OC_IMPL;
   th->internal_encode=cpi=_ogg_calloc(1,sizeof(*cpi));
 
+  dsp_static_init (&cpi->dsp);
+  memcpy (&cpi->pb.dsp, &cpi->dsp, sizeof(DspFunctions));
+
   c->version_major=VERSION_MAJOR;
   c->version_minor=VERSION_MINOR;
   c->version_subminor=VERSION_SUB;
 
   InitTmpBuffers(&cpi->pb);
-  InitPPInstance(&cpi->pp);
+  InitPPInstance(&cpi->pp, &cpi->dsp);
 
   /* Initialise Configuration structure to legal values */
   if(c->quality>63)c->quality=63;

Modified: trunk/theora/lib/idct.c
===================================================================
--- trunk/theora/lib/idct.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/idct.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -10,7 +10,7 @@
  *                                                                  *
  ********************************************************************
 
-  function:
+  function: C implementation of the Theora iDCT
   last mod: $Id: idct.c,v 1.6 2003/12/03 08:59:41 arc Exp $
 
  ********************************************************************/
@@ -20,6 +20,8 @@
 #include "quant_lookup.h"
 
 #define IdctAdjustBeforeShift 8
+
+/* cos(n*pi/16) or sin((8-n)*pi/16), scaled by 2^16 */
 #define xC1S7 64277
 #define xC2S6 60547
 #define xC3S5 54491
@@ -28,6 +30,85 @@
 #define xC6S2 25080
 #define xC7S1 12785
 
+/* 1D inverse DCT, spec version: reads InputData[0..7], writes OutputData[0..7] (16 bit signed). */
+static void idct_short__c ( ogg_int16_t * InputData, ogg_int16_t * OutputData ) {
+  ogg_int32_t t[8], r;
+  ogg_int16_t *y = InputData;
+  ogg_int16_t *x = OutputData;
+  /* t[] is scratch and starts uninitialised: every t[i] must be written before it is read. */
+  t[0] = y[0] + y[4];
+  t[0] &= 0xffff;  /* keeps the low 16 bits; NOTE(review): not sign-extended - confirm against spec */
+  t[0] = (xC4S4 * t[0]) >> 16;
+
+  t[1] = y[0] - y[4];
+  t[1] &= 0xffff;
+  t[1] = (xC4S4 * t[1]) >> 16;
+  /* Rotations take their operands from the input y[]: pairs (2,6), (1,7) and (5,3). */
+  t[2] = ((xC6S2 * y[2]) >> 16) - ((xC2S6 * y[6]) >> 16);
+  t[3] = ((xC2S6 * y[2]) >> 16) + ((xC6S2 * y[6]) >> 16);
+  t[4] = ((xC7S1 * y[1]) >> 16) - ((xC1S7 * y[7]) >> 16);
+  t[5] = ((xC3S5 * y[5]) >> 16) - ((xC5S3 * y[3]) >> 16);
+  t[6] = ((xC5S3 * y[5]) >> 16) + ((xC3S5 * y[3]) >> 16);
+  t[7] = ((xC1S7 * y[1]) >> 16) + ((xC7S1 * y[7]) >> 16);
+  /* Butterflies: r holds the sum while the difference is masked and scaled by xC4S4. */
+  r = t[4] + t[5];
+  t[5] = t[4] - t[5];
+  t[5] &= 0xffff;
+  t[5] = (xC4S4 * (-t[5])) >> 16;  /* NOTE(review): negation kept from the original - confirm against spec */
+  t[4] = r;
+
+  r = t[7] + t[6];
+  t[6] = t[7] - t[6];
+  t[6] &= 0xffff;
+  t[6] = (xC4S4 * t[6]) >> 16;
+  t[7] = r;
+  /* Plain sum/difference butterflies on (0,3), (1,2) and (6,5); no scaling here. */
+  r = t[0] + t[3];
+  t[3] = t[0] - t[3];
+  t[0] = r;
+
+  r = t[1] + t[2];
+  t[2] = t[1] - t[2];
+  t[1] = r;
+
+  r = t[6] + t[5];
+  t[5] = t[6] - t[5];
+  t[6] = r;
+  /* Outputs: x[k] = t[k] + t[7-k] for k = 0..3, each masked to 16 bits before the store. */
+  r = t[0] + t[7];
+  r &= 0xffff;
+  x[0] = r;
+
+  r = t[1] + t[6];
+  r &= 0xffff;
+  x[1] = r;
+
+  r = t[2] + t[5];
+  r &= 0xffff;
+  x[2] = r;
+
+  r = t[3] + t[4];
+  r &= 0xffff;
+  x[3] = r;
+  /* Mirrored half: x[7-k] = t[k] - t[7-k] for k = 3..0. */
+  r = t[3] - t[4];
+  r &= 0xffff;
+  x[4] = r;
+
+  r = t[2] - t[5];
+  r &= 0xffff;
+  x[5] = r;
+
+  r = t[1] - t[6];
+  r &= 0xffff;
+  x[6] = r;
+
+  r = t[0] - t[7];
+  r &= 0xffff;
+  x[7] = r;
+
+}
+
 static void dequant_slow( ogg_int16_t * dequant_coeffs,
                    ogg_int16_t * quantized_list,
                    ogg_int32_t * DCT_block) {
@@ -36,6 +117,8 @@
     DCT_block[dezigzag_index[i]] = quantized_list[i] * dequant_coeffs[i];
 }
 
+
+
 void IDctSlow(  Q_LIST_ENTRY * InputData,
                 ogg_int16_t *QuantMatrix,
                 ogg_int16_t * OutputData ) {

Modified: trunk/theora/lib/mcomp.c
===================================================================
--- trunk/theora/lib/mcomp.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/mcomp.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -17,6 +17,7 @@
 
 #include <stdlib.h>
 #include <stdio.h>
+#include "dsp.h"
 #include "codec_internal.h"
 
 /* Initialises motion compentsation. */
@@ -96,288 +97,92 @@
     cpi->MVPixelOffsetY[i] = (cpi->MVOffsetY[i]*LineStepY) + cpi->MVOffsetX[i];
 }
 
-static ogg_uint32_t GetInterErr (unsigned char * NewDataPtr,
+static ogg_uint32_t GetInterErr (CP_INSTANCE *cpi, unsigned char * NewDataPtr,
                           unsigned char * RefDataPtr1,
                           unsigned char * RefDataPtr2,
                           ogg_uint32_t PixelsPerLine ) {
-  ogg_uint32_t  i;
-  ogg_int32_t   XSum=0;
-  ogg_int32_t   XXSum=0;
   ogg_int32_t   DiffVal;
-  ogg_int32_t   AbsRefOffset = abs((int)(RefDataPtr1 - RefDataPtr2));
+  ogg_int32_t   RefOffset = (int)(RefDataPtr1 - RefDataPtr2);
+  ogg_uint32_t  RefPixelsPerLine = PixelsPerLine + STRIDE_EXTRA;
 
   /* Mode of interpolation chosen based upon on the offset of the
      second reference pointer */
-  if ( AbsRefOffset == 0 ) {
-    for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
-      DiffVal = ((int)NewDataPtr[0]) - (int)RefDataPtr1[0];
-      XSum += DiffVal;
-
-      /* negative array indexes are strictly forbidden by ANSI C and C99 */
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[1]) - (int)RefDataPtr1[1];
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[2]) - (int)RefDataPtr1[2];
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[3]) - (int)RefDataPtr1[3];
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[4]) - (int)RefDataPtr1[4];
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[5]) - (int)RefDataPtr1[5];
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[6]) - (int)RefDataPtr1[6];
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[7]) - (int)RefDataPtr1[7];
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      /* Step to next row of block. */
-      NewDataPtr += PixelsPerLine;
-      RefDataPtr1 += STRIDE_EXTRA + PixelsPerLine;
-    }
-
+  if ( RefOffset == 0 ) {
+    DiffVal = dsp_inter8x8_err (cpi->dsp, NewDataPtr, PixelsPerLine,
+		          RefDataPtr1, RefPixelsPerLine);
   }else{
-
-    /* Simple two reference interpolation */
-    for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
-      DiffVal = ((int)NewDataPtr[0]) -
-        (((int)RefDataPtr1[0] + (int)RefDataPtr2[0]) / 2);
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[1]) -
-        (((int)RefDataPtr1[1] + (int)RefDataPtr2[1]) / 2);
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[2]) -
-        (((int)RefDataPtr1[2] + (int)RefDataPtr2[2]) / 2);
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[3]) -
-        (((int)RefDataPtr1[3] + (int)RefDataPtr2[3]) / 2);
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[4]) -
-        (((int)RefDataPtr1[4] + (int)RefDataPtr2[4]) / 2);
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[5]) -
-        (((int)RefDataPtr1[5] + (int)RefDataPtr2[5]) / 2);
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[6]) -
-        (((int)RefDataPtr1[6] + (int)RefDataPtr2[6]) / 2);
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      DiffVal = ((int)NewDataPtr[7]) -
-        (((int)RefDataPtr1[7] + (int)RefDataPtr2[7]) / 2);
-      XSum += DiffVal;
-      XXSum += DiffVal*DiffVal;
-
-      /* Step to next row of block. */
-      NewDataPtr += PixelsPerLine;
-      RefDataPtr1 += STRIDE_EXTRA+PixelsPerLine;
-      RefDataPtr2 += STRIDE_EXTRA+PixelsPerLine;
-    }
+    DiffVal = dsp_inter8x8_err_xy2 (cpi->dsp, NewDataPtr, PixelsPerLine,
+		          RefDataPtr1, 
+		          RefDataPtr2, RefPixelsPerLine);
   }
 
   /* Compute and return population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ));
-}
-
-static ogg_uint32_t GetSumAbsDiffs  (unsigned char * NewDataPtr,
-                              unsigned char  * RefDataPtr,
-                              ogg_uint32_t PixelsPerLine,
-                              ogg_uint32_t ErrorSoFar) {
-  ogg_uint32_t  i;
-  ogg_uint32_t  DiffVal = ErrorSoFar;
-
-  /* Decide on standard or MMX implementation */
-  for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
-    DiffVal += abs( ((int)NewDataPtr[0]) - ((int)RefDataPtr[0]) );
-    DiffVal += abs( ((int)NewDataPtr[1]) - ((int)RefDataPtr[1]) );
-    DiffVal += abs( ((int)NewDataPtr[2]) - ((int)RefDataPtr[2]) );
-    DiffVal += abs( ((int)NewDataPtr[3]) - ((int)RefDataPtr[3]) );
-    DiffVal += abs( ((int)NewDataPtr[4]) - ((int)RefDataPtr[4]) );
-    DiffVal += abs( ((int)NewDataPtr[5]) - ((int)RefDataPtr[5]) );
-    DiffVal += abs( ((int)NewDataPtr[6]) - ((int)RefDataPtr[6]) );
-    DiffVal += abs( ((int)NewDataPtr[7]) - ((int)RefDataPtr[7]) );
-
-    /* Step to next row of block. */
-    NewDataPtr += PixelsPerLine;
-    RefDataPtr += STRIDE_EXTRA+PixelsPerLine;
-  }
-
   return DiffVal;
 }
 
-static ogg_uint32_t GetNextSumAbsDiffs (unsigned char * NewDataPtr,
-                                 unsigned char * RefDataPtr,
-                                 ogg_uint32_t PixelsPerLine,
-                                 ogg_uint32_t ErrorSoFar,
-                                 ogg_uint32_t BestSoFar ) {
-  ogg_uint32_t  i;
-  ogg_uint32_t  DiffVal = ErrorSoFar;
-
-  for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
-    DiffVal += abs( ((int)NewDataPtr[0]) - ((int)RefDataPtr[0]) );
-    DiffVal += abs( ((int)NewDataPtr[1]) - ((int)RefDataPtr[1]) );
-    DiffVal += abs( ((int)NewDataPtr[2]) - ((int)RefDataPtr[2]) );
-    DiffVal += abs( ((int)NewDataPtr[3]) - ((int)RefDataPtr[3]) );
-    DiffVal += abs( ((int)NewDataPtr[4]) - ((int)RefDataPtr[4]) );
-    DiffVal += abs( ((int)NewDataPtr[5]) - ((int)RefDataPtr[5]) );
-    DiffVal += abs( ((int)NewDataPtr[6]) - ((int)RefDataPtr[6]) );
-    DiffVal += abs( ((int)NewDataPtr[7]) - ((int)RefDataPtr[7]) );
-
-    if ( DiffVal > BestSoFar )break;
-
-    /* Step to next row of block. */
-    NewDataPtr += PixelsPerLine;
-    RefDataPtr += STRIDE_EXTRA+PixelsPerLine;
-  }
-
-  return DiffVal;
-}
-
-static ogg_uint32_t GetHalfPixelSumAbsDiffs (unsigned char * SrcData,
+static ogg_uint32_t GetHalfPixelSumAbsDiffs (CP_INSTANCE *cpi, /* cpi->dsp is handed to the dsp_* calls below */
+                                      unsigned char * SrcData,
                                       unsigned char * RefDataPtr1,
                                       unsigned char * RefDataPtr2,
                                       ogg_uint32_t PixelsPerLine,
                                       ogg_uint32_t ErrorSoFar,
                                       ogg_uint32_t BestSoFar ) {
 
-  ogg_uint32_t  i;
   ogg_uint32_t  DiffVal = ErrorSoFar;
   ogg_int32_t   RefOffset = (int)(RefDataPtr1 - RefDataPtr2);
   ogg_uint32_t  RefPixelsPerLine = PixelsPerLine + STRIDE_EXTRA;
 
   if ( RefOffset == 0 ) {
     /* Simple case as for non 0.5 pixel */
-    DiffVal += GetSumAbsDiffs( SrcData, RefDataPtr1, PixelsPerLine,
-                               0);
+    DiffVal += dsp_sad8x8 (cpi->dsp, SrcData, PixelsPerLine, 
+		               RefDataPtr1, RefPixelsPerLine); /* source rows are PixelsPerLine apart, reference rows RefPixelsPerLine apart */
   } else  {
-    for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
-      DiffVal += abs( ((int)SrcData[0]) - (((int)RefDataPtr1[0] +
-                                            (int)RefDataPtr2[0]) / 2) );
-      DiffVal += abs( ((int)SrcData[1]) - (((int)RefDataPtr1[1] +
-                                            (int)RefDataPtr2[1]) / 2) );
-      DiffVal += abs( ((int)SrcData[2]) - (((int)RefDataPtr1[2] +
-                                            (int)RefDataPtr2[2]) / 2) );
-      DiffVal += abs( ((int)SrcData[3]) - (((int)RefDataPtr1[3] +
-                                            (int)RefDataPtr2[3]) / 2) );
-      DiffVal += abs( ((int)SrcData[4]) - (((int)RefDataPtr1[4] +
-                                            (int)RefDataPtr2[4]) / 2) );
-      DiffVal += abs( ((int)SrcData[5]) - (((int)RefDataPtr1[5] +
-                                            (int)RefDataPtr2[5]) / 2) );
-      DiffVal += abs( ((int)SrcData[6]) - (((int)RefDataPtr1[6] +
-                                            (int)RefDataPtr2[6]) / 2) );
-      DiffVal += abs( ((int)SrcData[7]) - (((int)RefDataPtr1[7] +
-                                            (int)RefDataPtr2[7]) / 2) );
-
-      if ( DiffVal > BestSoFar ) break;
-
-      /* Step to next row of block. */
-      SrcData += PixelsPerLine;
-      RefDataPtr1 += RefPixelsPerLine;
-      RefDataPtr2 += RefPixelsPerLine;
-    }
+    DiffVal += dsp_sad8x8_xy2_thres (cpi->dsp, SrcData, PixelsPerLine, 
+		               RefDataPtr1, 
+		               RefDataPtr2, RefPixelsPerLine, BestSoFar); /* NOTE(review): the removed loop broke out once ErrorSoFar + running sum exceeded BestSoFar; here BestSoFar is passed as-is -- confirm how dsp_sad8x8_xy2_thres applies it */
   }
 
   return DiffVal;
 }
 
-static ogg_uint32_t GetIntraError (unsigned char * DataPtr,
-                            ogg_uint32_t PixelsPerLine ) {
-  ogg_uint32_t  i;
-  ogg_uint32_t  XSum=0;
-  ogg_uint32_t  XXSum=0;
-  unsigned char *DiffPtr;
-
-  /* Loop expanded out for speed. */
-  DiffPtr = DataPtr;
-
-  for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
-
-    /* Examine alternate pixel locations. */
-    XSum += DiffPtr[0];
-    XXSum += DiffPtr[0]*DiffPtr[0];
-    XSum += DiffPtr[1];
-    XXSum += DiffPtr[1]*DiffPtr[1];
-    XSum += DiffPtr[2];
-    XXSum += DiffPtr[2]*DiffPtr[2];
-    XSum += DiffPtr[3];
-    XXSum += DiffPtr[3]*DiffPtr[3];
-    XSum += DiffPtr[4];
-    XXSum += DiffPtr[4]*DiffPtr[4];
-    XSum += DiffPtr[5];
-    XXSum += DiffPtr[5]*DiffPtr[5];
-    XSum += DiffPtr[6];
-    XXSum += DiffPtr[6]*DiffPtr[6];
-    XSum += DiffPtr[7];
-    XXSum += DiffPtr[7]*DiffPtr[7];
-
-    /* Step to next row of block. */
-    DiffPtr += PixelsPerLine;
-  }
-
-  /* Compute population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ) );
-}
-
 ogg_uint32_t GetMBIntraError (CP_INSTANCE *cpi, ogg_uint32_t FragIndex,
                               ogg_uint32_t PixelsPerLine ) {
   ogg_uint32_t  LocalFragIndex = FragIndex;
   ogg_uint32_t  IntraError = 0;
 
+  dsp_save_fpu (cpi->dsp); /* paired with dsp_restore_fpu before the return; NOTE(review): presumably protects FPU state around SIMD code -- confirm in dsp.h */
+
   /* Add together the intra errors for those blocks in the macro block
      that are coded (Y only) */
   if ( cpi->pb.display_fragments[LocalFragIndex] )
     IntraError +=
-      GetIntraError(&cpi->
+      dsp_intra8x8_err (cpi->dsp, &cpi-> /* fragment FragIndex */
                     ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
-                    PixelsPerLine );
+                    PixelsPerLine);
 
-
   LocalFragIndex++;
   if ( cpi->pb.display_fragments[LocalFragIndex] )
     IntraError +=
-      GetIntraError(&cpi->
+      dsp_intra8x8_err (cpi->dsp, &cpi-> /* fragment FragIndex + 1 */
                     ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
-                    PixelsPerLine );
+                    PixelsPerLine);
 
   LocalFragIndex = FragIndex + cpi->pb.HFragments;
   if ( cpi->pb.display_fragments[LocalFragIndex] )
     IntraError +=
-      GetIntraError(&cpi->
+      dsp_intra8x8_err (cpi->dsp, &cpi-> /* fragment FragIndex + cpi->pb.HFragments */
                      ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
-                     PixelsPerLine );
+                    PixelsPerLine);
 
   LocalFragIndex++;
   if ( cpi->pb.display_fragments[LocalFragIndex] )
     IntraError +=
-      GetIntraError(&cpi->
+      dsp_intra8x8_err (cpi->dsp, &cpi-> /* fragment FragIndex + cpi->pb.HFragments + 1 */
                     ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
-                    PixelsPerLine );
+                    PixelsPerLine);
 
+  dsp_restore_fpu (cpi->dsp); /* matches the dsp_save_fpu call at function entry */
+
   return IntraError;
 }
 
@@ -400,6 +205,8 @@
   unsigned char * SrcPtr1;
   unsigned char * RefPtr1;
 
+  dsp_save_fpu (cpi->dsp);
+
   /* Work out pixel offset into source buffer. */
   PixelIndex = cpi->pb.pixel_index_table[LocalFragIndex];
 
@@ -428,7 +235,7 @@
   if ( cpi->pb.display_fragments[LocalFragIndex] ) {
     SrcPtr1 = &SrcPtr[PixelIndex];
     RefPtr1 = &RefPtr[RefPixelIndex + RefPixelOffset];
-    InterError += GetInterErr( SrcPtr1, RefPtr1,
+    InterError += GetInterErr(cpi, SrcPtr1, RefPtr1,
                                  &RefPtr1[RefPtr2Offset], PixelsPerLine );
   }
 
@@ -438,7 +245,7 @@
     RefPixelIndex = cpi->pb.recon_pixel_index_table[LocalFragIndex];
     SrcPtr1 = &SrcPtr[PixelIndex];
     RefPtr1 = &RefPtr[RefPixelIndex + RefPixelOffset];
-    InterError += GetInterErr( SrcPtr1, RefPtr1,
+    InterError += GetInterErr(cpi, SrcPtr1, RefPtr1,
                                  &RefPtr1[RefPtr2Offset], PixelsPerLine );
 
   }
@@ -449,7 +256,7 @@
     RefPixelIndex = cpi->pb.recon_pixel_index_table[LocalFragIndex];
     SrcPtr1 = &SrcPtr[PixelIndex];
     RefPtr1 = &RefPtr[RefPixelIndex + RefPixelOffset];
-    InterError += GetInterErr( SrcPtr1, RefPtr1,
+    InterError += GetInterErr(cpi, SrcPtr1, RefPtr1,
                                  &RefPtr1[RefPtr2Offset], PixelsPerLine );
   }
 
@@ -459,9 +266,12 @@
     RefPixelIndex = cpi->pb.recon_pixel_index_table[LocalFragIndex];
     SrcPtr1 = &SrcPtr[PixelIndex];
     RefPtr1 = &RefPtr[RefPixelIndex + RefPixelOffset];
-    InterError += GetInterErr( SrcPtr1, RefPtr1,
+    InterError += GetInterErr(cpi, SrcPtr1, RefPtr1,
                                  &RefPtr1[RefPtr2Offset], PixelsPerLine );
   }
+
+  dsp_restore_fpu (cpi->dsp);
+
   return InterError;
 }
 
@@ -496,6 +306,8 @@
   unsigned char * RefDataPtr1;
   unsigned char * RefDataPtr2;
 
+  dsp_save_fpu (cpi->dsp);
+
   /* Note which of the four blocks in the macro block are to be
      included in the search. */
   MBlockDispFrags[0] =
@@ -518,20 +330,20 @@
 
   /* Check the 0,0 candidate. */
   if ( MBlockDispFrags[0] ) {
-    Error = GetSumAbsDiffs( SrcPtr[0], RefPtr,
-                         PixelsPerLine, Error);
+    Error += dsp_sad8x8 (cpi->dsp, SrcPtr[0], PixelsPerLine, RefPtr,
+                         PixelsPerLine + STRIDE_EXTRA);
   }
   if ( MBlockDispFrags[1] ) {
-    Error = GetSumAbsDiffs( SrcPtr[1], RefPtr + 8,
-                         PixelsPerLine, Error);
+    Error += dsp_sad8x8 (cpi->dsp, SrcPtr[1], PixelsPerLine, RefPtr + 8,
+                         PixelsPerLine + STRIDE_EXTRA);
   }
   if ( MBlockDispFrags[2] ) {
-    Error = GetSumAbsDiffs( SrcPtr[2], RefPtr + RefRow2Offset,
-                         PixelsPerLine, Error);
+    Error += dsp_sad8x8 (cpi->dsp, SrcPtr[2], PixelsPerLine, RefPtr + RefRow2Offset,
+                         PixelsPerLine + STRIDE_EXTRA);
   }
   if ( MBlockDispFrags[3] ) {
-    Error = GetSumAbsDiffs( SrcPtr[3], RefPtr + RefRow2Offset + 8,
-                         PixelsPerLine, Error);
+    Error += dsp_sad8x8 (cpi->dsp, SrcPtr[3], PixelsPerLine, RefPtr + RefRow2Offset + 8,
+                         PixelsPerLine + STRIDE_EXTRA);
   }
 
   /* Set starting values to results of 0, 0 vector. */
@@ -554,24 +366,23 @@
 
       /* Get the score for the current offset */
       if ( MBlockDispFrags[0] ) {
-        Error = GetSumAbsDiffs( SrcPtr[0], CandidateBlockPtr,
-                             PixelsPerLine, Error);
+        Error += dsp_sad8x8 (cpi->dsp, SrcPtr[0], PixelsPerLine, CandidateBlockPtr,
+                             PixelsPerLine + STRIDE_EXTRA);
       }
 
       if ( MBlockDispFrags[1] && (Error < MinError) ) {
-        Error = GetNextSumAbsDiffs( SrcPtr[1], CandidateBlockPtr + 8,
-                                 PixelsPerLine, Error, MinError );
+        Error += dsp_sad8x8_thres (cpi->dsp, SrcPtr[1], PixelsPerLine, CandidateBlockPtr + 8,
+                             PixelsPerLine + STRIDE_EXTRA, MinError);
       }
 
       if ( MBlockDispFrags[2] && (Error < MinError) ) {
-        Error = GetNextSumAbsDiffs( SrcPtr[2], CandidateBlockPtr + RefRow2Offset,
-                                 PixelsPerLine, Error, MinError );
+        Error += dsp_sad8x8_thres (cpi->dsp, SrcPtr[2], PixelsPerLine, CandidateBlockPtr + RefRow2Offset,
+                             PixelsPerLine + STRIDE_EXTRA, MinError);
       }
 
       if ( MBlockDispFrags[3] && (Error < MinError) ) {
-        Error = GetNextSumAbsDiffs( SrcPtr[3],
-                                 CandidateBlockPtr + RefRow2Offset + 8,
-                                 PixelsPerLine, Error, MinError );
+        Error += dsp_sad8x8_thres (cpi->dsp, SrcPtr[3], PixelsPerLine, CandidateBlockPtr + RefRow2Offset + 8,
+                             PixelsPerLine + STRIDE_EXTRA, MinError);
       }
 
       if ( Error < MinError ) {
@@ -610,7 +421,7 @@
       RefDataPtr1 = BestBlockPtr;
       RefDataPtr2 = RefDataPtr1 + cpi->HalfPixelRef2Offset[i];
       HalfPixelError =
-        GetHalfPixelSumAbsDiffs( SrcPtr[0], RefDataPtr1, RefDataPtr2,
+        GetHalfPixelSumAbsDiffs(cpi, SrcPtr[0], RefDataPtr1, RefDataPtr2,
                          PixelsPerLine, HalfPixelError, BestHalfPixelError );
     }
 
@@ -618,7 +429,7 @@
       RefDataPtr1 = BestBlockPtr + 8;
       RefDataPtr2 = RefDataPtr1 + cpi->HalfPixelRef2Offset[i];
       HalfPixelError =
-        GetHalfPixelSumAbsDiffs( SrcPtr[1], RefDataPtr1, RefDataPtr2,
+        GetHalfPixelSumAbsDiffs(cpi, SrcPtr[1], RefDataPtr1, RefDataPtr2,
                          PixelsPerLine, HalfPixelError, BestHalfPixelError );
     }
 
@@ -626,7 +437,7 @@
       RefDataPtr1 = BestBlockPtr + RefRow2Offset;
       RefDataPtr2 = RefDataPtr1 + cpi->HalfPixelRef2Offset[i];
       HalfPixelError =
-        GetHalfPixelSumAbsDiffs( SrcPtr[2], RefDataPtr1, RefDataPtr2,
+        GetHalfPixelSumAbsDiffs(cpi, SrcPtr[2], RefDataPtr1, RefDataPtr2,
                          PixelsPerLine, HalfPixelError, BestHalfPixelError );
     }
 
@@ -634,7 +445,7 @@
       RefDataPtr1 = BestBlockPtr + RefRow2Offset + 8;
       RefDataPtr2 = RefDataPtr1 + cpi->HalfPixelRef2Offset[i];
       HalfPixelError =
-        GetHalfPixelSumAbsDiffs( SrcPtr[3], RefDataPtr1, RefDataPtr2,
+        GetHalfPixelSumAbsDiffs(cpi, SrcPtr[3], RefDataPtr1, RefDataPtr2,
                          PixelsPerLine, HalfPixelError, BestHalfPixelError );
     }
 
@@ -652,6 +463,8 @@
   InterMVError = GetMBInterError( cpi, cpi->ConvDestBuffer, RefFramePtr,
                                   FragIndex, MV->x, MV->y, PixelsPerLine );
 
+  dsp_restore_fpu (cpi->dsp);
+
   /* Return score of best matching block. */
   return InterMVError;
 }
@@ -684,6 +497,8 @@
   unsigned char * RefDataPtr1;
   unsigned char * RefDataPtr2;
 
+  dsp_save_fpu (cpi->dsp);
+
   /* Note which of the four blocks in the macro block are to be
      included in the search. */
   MBlockDispFrags[0] = cpi->
@@ -717,20 +532,20 @@
 
       /* Summ errors for each block. */
       if ( MBlockDispFrags[0] ) {
-        Error = GetSumAbsDiffs( SrcPtr[0], CandidateBlockPtr,
-                             PixelsPerLine, Error);
+        Error += dsp_sad8x8 (cpi->dsp, SrcPtr[0], PixelsPerLine, CandidateBlockPtr,
+                             PixelsPerLine + STRIDE_EXTRA);
       }
       if ( MBlockDispFrags[1] ){
-        Error = GetSumAbsDiffs( SrcPtr[1], CandidateBlockPtr + 8,
-                             PixelsPerLine, Error);
+        Error += dsp_sad8x8 (cpi->dsp, SrcPtr[1], PixelsPerLine, CandidateBlockPtr + 8,
+                             PixelsPerLine + STRIDE_EXTRA);
       }
       if ( MBlockDispFrags[2] ){
-        Error = GetSumAbsDiffs( SrcPtr[2], CandidateBlockPtr + RefRow2Offset,
-                             PixelsPerLine, Error);
+        Error += dsp_sad8x8 (cpi->dsp, SrcPtr[2], PixelsPerLine, CandidateBlockPtr + RefRow2Offset,
+                             PixelsPerLine + STRIDE_EXTRA);
       }
       if ( MBlockDispFrags[3] ){
-        Error = GetSumAbsDiffs( SrcPtr[3], CandidateBlockPtr + RefRow2Offset + 8,
-                             PixelsPerLine, Error);
+        Error += dsp_sad8x8 (cpi->dsp, SrcPtr[3], PixelsPerLine, CandidateBlockPtr + RefRow2Offset + 8,
+                             PixelsPerLine + STRIDE_EXTRA);
       }
 
       /* Was this the best so far */
@@ -766,7 +581,7 @@
       RefDataPtr1 = BestBlockPtr;
       RefDataPtr2 = RefDataPtr1 + cpi->HalfPixelRef2Offset[i];
       HalfPixelError =
-        GetHalfPixelSumAbsDiffs( SrcPtr[0], RefDataPtr1, RefDataPtr2,
+        GetHalfPixelSumAbsDiffs(cpi, SrcPtr[0], RefDataPtr1, RefDataPtr2,
                          PixelsPerLine, HalfPixelError, BestHalfPixelError );
     }
 
@@ -774,7 +589,7 @@
       RefDataPtr1 = BestBlockPtr + 8;
       RefDataPtr2 = RefDataPtr1 + cpi->HalfPixelRef2Offset[i];
       HalfPixelError =
-        GetHalfPixelSumAbsDiffs( SrcPtr[1], RefDataPtr1, RefDataPtr2,
+        GetHalfPixelSumAbsDiffs(cpi, SrcPtr[1], RefDataPtr1, RefDataPtr2,
                          PixelsPerLine, HalfPixelError, BestHalfPixelError );
     }
 
@@ -782,7 +597,7 @@
       RefDataPtr1 = BestBlockPtr + RefRow2Offset;
       RefDataPtr2 = RefDataPtr1 + cpi->HalfPixelRef2Offset[i];
       HalfPixelError =
-        GetHalfPixelSumAbsDiffs( SrcPtr[2], RefDataPtr1, RefDataPtr2,
+        GetHalfPixelSumAbsDiffs(cpi, SrcPtr[2], RefDataPtr1, RefDataPtr2,
                          PixelsPerLine, HalfPixelError, BestHalfPixelError );
     }
 
@@ -790,7 +605,7 @@
       RefDataPtr1 = BestBlockPtr + RefRow2Offset + 8;
       RefDataPtr2 = RefDataPtr1 + cpi->HalfPixelRef2Offset[i];
       HalfPixelError =
-        GetHalfPixelSumAbsDiffs( SrcPtr[3], RefDataPtr1, RefDataPtr2,
+        GetHalfPixelSumAbsDiffs(cpi, SrcPtr[3], RefDataPtr1, RefDataPtr2,
                          PixelsPerLine, HalfPixelError, BestHalfPixelError );
     }
 
@@ -808,6 +623,8 @@
   InterMVError = GetMBInterError( cpi, cpi->ConvDestBuffer, RefFramePtr,
                                   FragIndex, MV->x, MV->y, PixelsPerLine );
 
+  dsp_restore_fpu (cpi->dsp);
+
   /* Return score of best matching block. */
   return InterMVError;
 }
@@ -850,8 +667,8 @@
 
     for ( j = 0; j < (ogg_int32_t)MAX_MV_EXTENT; j++ ){
       /* Get the block error score. */
-      Error = GetSumAbsDiffs( SrcPtr, CandidateBlockPtr,
-                           PixelsPerLine, 0);
+      Error = dsp_sad8x8 (cpi->dsp, SrcPtr, PixelsPerLine, CandidateBlockPtr,
+                             PixelsPerLine + STRIDE_EXTRA);
 
       /* Was this the best so far */
       if ( Error < MinError ) {
@@ -881,7 +698,7 @@
   for ( i=0; i < 9; i++ ) {
     RefDataPtr2 = BestBlockPtr + cpi->HalfPixelRef2Offset[i];
     HalfPixelError =
-      GetHalfPixelSumAbsDiffs( SrcPtr, BestBlockPtr, RefDataPtr2,
+      GetHalfPixelSumAbsDiffs(cpi, SrcPtr, BestBlockPtr, RefDataPtr2,
                             PixelsPerLine, 0, BestHalfPixelError );
 
     if ( HalfPixelError < BestHalfPixelError ){
@@ -898,7 +715,7 @@
   RefDataPtr2 = BestBlockPtr + cpi->HalfPixelRef2Offset[BestHalfOffset];
 
   InterMVError =
-    GetInterErr( SrcPtr, BestBlockPtr, RefDataPtr2, PixelsPerLine );
+    GetInterErr(cpi, SrcPtr, BestBlockPtr, RefDataPtr2, PixelsPerLine );
 
   /* Return score of best matching block. */
   return InterMVError;
@@ -911,6 +728,8 @@
                                         MOTION_VECTOR *MV ) {
   ogg_uint32_t  InterMVError;
 
+  dsp_save_fpu (cpi->dsp);
+
   /* For the moment the 4MV mode is only deemed to be valid 
      if all four Y blocks are to be updated */
   /* This may be adapted later. */
@@ -941,6 +760,8 @@
     InterMVError = HUGE_ERROR;
   }
 
+  dsp_restore_fpu (cpi->dsp);
+
   /* Return score of best matching block. */
   return InterMVError;
 }

Modified: trunk/theora/lib/pp.c
===================================================================
--- trunk/theora/lib/pp.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/pp.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -19,6 +19,7 @@
 #include <string.h>
 #include "codec_internal.h"
 #include "pp.h"
+#include "dsp.h"
 
 #define MAX(a, b) ((a>b)?a:b)
 #define MIN(a, b) ((a<b)?a:b)
@@ -150,10 +151,12 @@
 }
 
 
-void InitPPInstance(PP_INSTANCE *ppi){
+void InitPPInstance(PP_INSTANCE *ppi, DspFunctions *funcs){
 
   memset(ppi,0,sizeof(*ppi));
 
+  memcpy(&ppi->dsp, funcs, sizeof(DspFunctions));
+
   /* Initializations */
   ppi->PrevFrameLimit = 3; /* Must not exceed MAX_PREV_FRAMES (Note
                               that this number includes the current
@@ -490,7 +493,7 @@
 
       } else {
 
-        CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
+        dsp_copy8x8(pbi->dsp, SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
 
       }
 
@@ -529,7 +532,7 @@
         DeringBlockWeak(SrcPtr + 8 * col, DestPtr + 8 * col,
                         LineLength,Quality,QuantScale);
       }else{
-        CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
+        dsp_copy8x8(pbi->dsp, SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
       }
 
       ++Block;
@@ -565,7 +568,7 @@
         DeringBlockWeak(SrcPtr + 8 * col, DestPtr + 8 * col,
                         LineLength,Quality,QuantScale);
       }else{
-        CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
+        dsp_copy8x8(pbi->dsp, SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
       }
 
       ++Block;

Modified: trunk/theora/lib/reconstruct.c
===================================================================
--- trunk/theora/lib/reconstruct.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/reconstruct.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -16,12 +16,28 @@
  ********************************************************************/
 
 #include "codec_internal.h"
+#include "dsp.h"
+#include "cpu.h"
 
-void ReconIntra( PB_INSTANCE *pbi, unsigned char * ReconPtr,
-                 ogg_int16_t * ChangePtr, ogg_uint32_t LineStep ) {
+static void copy8x8__c (unsigned char *src,
+	                unsigned char *dest,
+	                unsigned int stride)
+{ /* Copy an 8x8 block of bytes from src to dest; both pointers advance by stride per row. */
+  int i, j;
+  for ( j = 0; j < 8; j++ ){
+    /* Byte-wise copy: makes no 32-bit alignment or aliasing assumptions about src/dest. */
+    for ( i = 0; i < 8; i++ ) dest[i] = src[i];
+    src+=stride;
+    dest+=stride;
+  }
+}
+
+static void recon_intra8x8__c (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
+		      ogg_uint32_t LineStep)
+{
   ogg_uint32_t i;
 
-  for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++ ){
+  for (i = 8; i; i--){
     /* Convert the data back to 8 bit unsigned */
     /* Saturate the output to unsigend 8 bit values */
     ReconPtr[0] = clamp255( ChangePtr[0] + 128 );
@@ -34,17 +50,16 @@
     ReconPtr[7] = clamp255( ChangePtr[7] + 128 );
 
     ReconPtr += LineStep;
-    ChangePtr += BLOCK_HEIGHT_WIDTH;
+    ChangePtr += 8;
   }
-
 }
 
-void ReconInter( PB_INSTANCE *pbi, unsigned char * ReconPtr,
-                 unsigned char * RefPtr, ogg_int16_t * ChangePtr,
-                 ogg_uint32_t LineStep ) {
+static void recon_inter8x8__c (unsigned char *ReconPtr, unsigned char *RefPtr,
+		      ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
+{
   ogg_uint32_t i;
 
-  for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++) {
+  for (i = 8; i; i--){
     ReconPtr[0] = clamp255(RefPtr[0] + ChangePtr[0]);
     ReconPtr[1] = clamp255(RefPtr[1] + ChangePtr[1]);
     ReconPtr[2] = clamp255(RefPtr[2] + ChangePtr[2]);
@@ -54,19 +69,19 @@
     ReconPtr[6] = clamp255(RefPtr[6] + ChangePtr[6]);
     ReconPtr[7] = clamp255(RefPtr[7] + ChangePtr[7]);
 
-    ChangePtr += BLOCK_HEIGHT_WIDTH;
+    ChangePtr += 8;
     ReconPtr += LineStep;
     RefPtr += LineStep;
   }
-
 }
 
-void ReconInterHalfPixel2( PB_INSTANCE *pbi, unsigned char * ReconPtr,
-                           unsigned char * RefPtr1, unsigned char * RefPtr2,
-                           ogg_int16_t * ChangePtr, ogg_uint32_t LineStep ) {
+static void recon_inter8x8_half__c (unsigned char *ReconPtr, unsigned char *RefPtr1,
+		           unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
+			   ogg_uint32_t LineStep)
+{
   ogg_uint32_t  i;
 
-  for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++ ){
+  for (i = 8; i; i--){
     ReconPtr[0] = clamp255((((int)RefPtr1[0] + (int)RefPtr2[0]) >> 1) + ChangePtr[0] );
     ReconPtr[1] = clamp255((((int)RefPtr1[1] + (int)RefPtr2[1]) >> 1) + ChangePtr[1] );
     ReconPtr[2] = clamp255((((int)RefPtr1[2] + (int)RefPtr2[2]) >> 1) + ChangePtr[2] );
@@ -76,10 +91,22 @@
     ReconPtr[6] = clamp255((((int)RefPtr1[6] + (int)RefPtr2[6]) >> 1) + ChangePtr[6] );
     ReconPtr[7] = clamp255((((int)RefPtr1[7] + (int)RefPtr2[7]) >> 1) + ChangePtr[7] );
 
-    ChangePtr += BLOCK_HEIGHT_WIDTH;
+    ChangePtr += 8;
     ReconPtr += LineStep;
     RefPtr1 += LineStep;
     RefPtr2 += LineStep;
   }
+}
 
+void dsp_recon_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
+{ /* Fill the reconstruction entries of funcs; cpu_flags is only consulted on x86 builds. */
+  funcs->copy8x8 = copy8x8__c; /* start with the C implementations from this file */
+  funcs->recon_intra8x8 = recon_intra8x8__c;
+  funcs->recon_inter8x8 = recon_inter8x8__c;
+  funcs->recon_inter8x8_half = recon_inter8x8_half__c;
+#if (defined(__i386__) || defined(__x86_64__)) /* MMX hook compiled in for x86 targets only */
+  if (cpu_flags & CPU_X86_MMX) { /* caller reported MMX support */
+    dsp_mmx_recon_init(funcs); /* NOTE(review): presumably replaces entries with MMX versions -- defined elsewhere, confirm */
+  }
+#endif /* __i386__ || __x86_64__ */
 }

Modified: trunk/theora/lib/scan.c
===================================================================
--- trunk/theora/lib/scan.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/scan.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -19,9 +19,20 @@
 #include <math.h>
 #include <string.h>
 #include "codec_internal.h"
+#include "dsp.h"
 
 #define MAX_SEARCH_LINE_LEN                   7
 
+#define SET8_0(ptr) do { /* 8 bytes of 0x00; do/while(0) keeps both stores under an unbraced if */ \
+  ((ogg_uint32_t *)(ptr))[0] = 0x00000000; \
+  ((ogg_uint32_t *)(ptr))[1] = 0x00000000; } while (0)
+#define SET8_1(ptr) do { /* 8 bytes of 0x01; single statement, caller supplies the ';' */ \
+  ((ogg_uint32_t *)(ptr))[0] = 0x01010101; \
+  ((ogg_uint32_t *)(ptr))[1] = 0x01010101; } while (0)
+#define SET8_8(ptr) do { /* 8 bytes of 0x08; single statement, caller supplies the ';' */ \
+  ((ogg_uint32_t *)(ptr))[0] = 0x08080808; \
+  ((ogg_uint32_t *)(ptr))[1] = 0x08080808; } while (0)
+
 static ogg_uint32_t LineLengthScores[ MAX_SEARCH_LINE_LEN + 1 ] = {
   0, 0, 0, 0, 2, 4, 12, 24
 };
@@ -384,69 +395,6 @@
   ppi->KFIndicator = ((ppi->KFIndicator*100)/((ppi->ScanYPlaneFragments*3)/4));
 }
 
-static ogg_uint32_t ScalarRowSAD( unsigned char * Src1,
-                                  unsigned char * Src2 ){
-  ogg_uint32_t SadValue;
-  ogg_uint32_t SadValue1;
-
-  SadValue    = abs( Src1[0] - Src2[0] ) + abs( Src1[1] - Src2[1] ) +
-    abs( Src1[2] - Src2[2] ) + abs( Src1[3] - Src2[3] );
-
-  SadValue1   = abs( Src1[4] - Src2[4] ) + abs( Src1[5] - Src2[5] ) +
-    abs( Src1[6] - Src2[6] ) + abs( Src1[7] - Src2[7] );
-
-  SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1;
-
-  return SadValue;
-}
-
-static ogg_uint32_t ScalarColSAD( PP_INSTANCE *ppi,
-                           unsigned char * Src1,
-                           unsigned char * Src2 ){
-  ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0};
-  ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0};
-  ogg_uint32_t MaxSad = 0;
-  ogg_uint32_t i;
-
-  for ( i = 0; i < 4; i++ ){
-    SadValue[0] += abs(Src1[0] - Src2[0]);
-    SadValue[1] += abs(Src1[1] - Src2[1]);
-    SadValue[2] += abs(Src1[2] - Src2[2]);
-    SadValue[3] += abs(Src1[3] - Src2[3]);
-    SadValue[4] += abs(Src1[4] - Src2[4]);
-    SadValue[5] += abs(Src1[5] - Src2[5]);
-    SadValue[6] += abs(Src1[6] - Src2[6]);
-    SadValue[7] += abs(Src1[7] - Src2[7]);
-
-    Src1 += ppi->PlaneStride;
-    Src2 += ppi->PlaneStride;
-  }
-
-  for ( i = 0; i < 4; i++ ){
-    SadValue2[0] += abs(Src1[0] - Src2[0]);
-    SadValue2[1] += abs(Src1[1] - Src2[1]);
-    SadValue2[2] += abs(Src1[2] - Src2[2]);
-    SadValue2[3] += abs(Src1[3] - Src2[3]);
-    SadValue2[4] += abs(Src1[4] - Src2[4]);
-    SadValue2[5] += abs(Src1[5] - Src2[5]);
-    SadValue2[6] += abs(Src1[6] - Src2[6]);
-    SadValue2[7] += abs(Src1[7] - Src2[7]);
-
-    Src1 += ppi->PlaneStride;
-    Src2 += ppi->PlaneStride;
-  }
-
-  for ( i = 0; i < 8; i++ ){
-    if ( SadValue[i] > MaxSad )
-      MaxSad = SadValue[i];
-    if ( SadValue2[i] > MaxSad )
-      MaxSad = SadValue2[i];
-  }
-
-  return MaxSad;
-}
-
-
 static int RowSadScan( PP_INSTANCE *ppi,
                        unsigned char * YuvPtr1,
                        unsigned char * YuvPtr2,
@@ -475,7 +423,7 @@
     for ( i = 0; i < ppi->PlaneHFragments; i ++ ){
       if ( *LocalDispFragPtr <= BLOCK_NOT_CODED ){
         /* Calculate the SAD score for the block row */
-        GrpSad = ScalarRowSAD(LocalYuvPtr1,LocalYuvPtr2);
+        GrpSad = dsp_row_sad8(ppi->dsp, LocalYuvPtr1,LocalYuvPtr2);
 
         /* Now test the group SAD score */
         if ( GrpSad > LocalGrpLowSadThresh ){
@@ -532,7 +480,7 @@
     /* Skip if block already marked to be coded. */
     if ( *LocalDispFragPtr <= BLOCK_NOT_CODED ){
       /* Calculate the SAD score for the block column */
-      MaxSad = ScalarColSAD( ppi, LocalYuvPtr1, LocalYuvPtr2 );
+      MaxSad = dsp_col_sad8x8(ppi->dsp, LocalYuvPtr1, LocalYuvPtr2, ppi->PlaneStride );
 
       /* Now test the group SAD score */
       if ( MaxSad > LocalGrpLowSadThresh ){
@@ -758,7 +706,7 @@
       if (*DispFragPtr == CANDIDATE_BLOCK){
 
         /* Clear down entries in changed locals array */
-        memset(ChLocalsPtr,0,8);
+        SET8_0(ChLocalsPtr);
 
         for ( j = 0; j < HFRAGPIXELS; j++ ){
           /* Take a local copy of the measured difference. */
@@ -777,10 +725,10 @@
       }else{
         /* If we are breaking out here mark all pixels as changed. */
         if ( *DispFragPtr > BLOCK_NOT_CODED ){
-          memset(bits_map_ptr,1,8);
-          memset(ChLocalsPtr,8,8);
+          SET8_1(bits_map_ptr);
+          SET8_8(ChLocalsPtr);
         }else{
-          memset(ChLocalsPtr,0,8);
+          SET8_0(ChLocalsPtr);
         }
       }
 
@@ -816,7 +764,7 @@
     /* Test for break out conditions to save time. */
     if (*DispFragPtr == CANDIDATE_BLOCK){
       /* Clear down entries in changed locals array */
-      memset(ChLocalsPtr,0,8);
+      SET8_0(ChLocalsPtr);
 
       for ( j = 0; j < HFRAGPIXELS; j++ ){
         /* Take a local copy of the measured difference. */
@@ -839,10 +787,10 @@
     }else{
       /* If we are breaking out here mark all pixels as changed. */
       if ( *DispFragPtr > BLOCK_NOT_CODED ){
-        memset(bits_map_ptr,1,8);
-        memset(ChLocalsPtr,8,8);
+        SET8_1(bits_map_ptr);
+        SET8_8(ChLocalsPtr);
       }else{
-        memset(ChLocalsPtr,0,8);
+        SET8_0(ChLocalsPtr);
       }
     }
 
@@ -876,7 +824,7 @@
       /* Test for break out conditions to save time. */
       if (*DispFragPtr == CANDIDATE_BLOCK){
         /* Clear down entries in changed locals array */
-        memset(ChLocalsPtr,0,8);
+        SET8_0(ChLocalsPtr);
         for ( j = 0; j < HFRAGPIXELS; j++ ){
           /* Take a local copy of the measured difference. */
           Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
@@ -899,10 +847,10 @@
       }else{
         /* If we are breaking out here mark all pixels as changed. */
         if ( *DispFragPtr > BLOCK_NOT_CODED ){
-          memset(bits_map_ptr,1,8);
-          memset(ChLocalsPtr,8,8);
+          SET8_1(bits_map_ptr);
+          SET8_8(ChLocalsPtr);
         }else{
-          memset(ChLocalsPtr,0,8);
+          SET8_0(ChLocalsPtr);
         }
       }
 
@@ -935,7 +883,7 @@
     /* Test for break out conditions to save time. */
     if (*DispFragPtr == CANDIDATE_BLOCK){
       /* Clear down entries in changed locals array */
-      memset(ChLocalsPtr,0,8);
+      SET8_0(ChLocalsPtr);
 
       for ( j = 0; j < HFRAGPIXELS; j++ ){
         /* Take a local copy of the measured difference. */
@@ -959,10 +907,10 @@
     }else{
       /* If we are breaking out here mark all pixels as changed.*/
       if ( *DispFragPtr > BLOCK_NOT_CODED ) {
-          memset(bits_map_ptr,1,8);
-          memset(ChLocalsPtr,8,8);
+          SET8_1(bits_map_ptr);
+          SET8_8(ChLocalsPtr);
         }else{
-          memset(ChLocalsPtr,0,8);
+          SET8_0(ChLocalsPtr);
         }
     }
     /* If we have a lot of changed pixels for this fragment on this
@@ -1071,7 +1019,7 @@
         }
       }else{
         if ( *DispFragPtr > BLOCK_NOT_CODED )
-          memset(ChLocalsPtr,0,8);
+          SET8_0(ChLocalsPtr);
 
         /* Step pointers */
         ChLocalsPtr += HFRAGPIXELS;
@@ -1133,7 +1081,7 @@
         }
       }else{
         if ( *DispFragPtr > BLOCK_NOT_CODED )
-          memset(ChLocalsPtr,0,8);
+          SET8_0(ChLocalsPtr);
 
         /* Step pointers */
         ChLocalsPtr += HFRAGPIXELS;
@@ -2126,10 +2074,12 @@
     /* Fast break out test for obvious yes and no cases in this row of
        blocks */
     if ( i < ppi->PlaneVFragments ){
+      dsp_save_fpu (ppi->dsp);
       UpdatedOrCandidateBlocks =
         RowSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 );
-      if( ColSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 ) )
-        UpdatedOrCandidateBlocks = 1;
+      UpdatedOrCandidateBlocks |=
+        ColSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 );
+      dsp_restore_fpu (ppi->dsp);
     }else{
       /* Make sure we still call other functions if RowSadScan() disabled */
       UpdatedOrCandidateBlocks = 1;

Modified: trunk/theora/lib/toplevel.c
===================================================================
--- trunk/theora/lib/toplevel.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/toplevel.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -23,6 +23,7 @@
 #include <string.h>
 #include "theora/theora.h"
 #include "toplevel.h"
+#include "dsp.h"
 
 static int _ilog(unsigned int v){
   int ret=0;
@@ -309,6 +310,9 @@
   th->internal_encode=NULL;
 
   InitPBInstance(pbi);
+
+  dsp_static_init (&pbi->dsp);
+
   memcpy(&pbi->info,c,sizeof(*c));
   pbi->info.codec_setup=NULL;
   th->i=&pbi->info;

Copied: trunk/theora/lib/x86_32 (from rev 11426, branches/theora-mmx/lib/x86_32)

Deleted: trunk/theora/lib/x86_32/dsp_mmx.c
===================================================================
--- branches/theora-mmx/lib/x86_32/dsp_mmx.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/x86_32/dsp_mmx.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -1,644 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "dsp.h"
-
-static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x0080008000800080LL;
-
-#if defined(__MINGW32__) || defined(__CYGWIN__) || \
-    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
-# define M(a) "_" #a
-#else
-# define M(a) #a
-#endif
-
-#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
-#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
-#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
-
-static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
-                  ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
-                  ogg_uint32_t ReconPixelsPerLine) 
-{ /* 8x8 difference: for each of 8 rows, writes the 8 values FiltPtr[x] - ReconPtr[x] as 16-bit words to DctInputPtr */
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm7, %%mm7     \n\t" /* mm7 = 0, zero source for the byte -> word unpacks */
-
-    ".rept 8                        \n\t" /* assembler-unrolled: one 8-pixel row per repetition */
-    "  movq        (%0), %%mm0      \n\t" /* mm0 = 8 bytes at FiltPtr */
-    "  movq        (%1), %%mm1      \n\t" /* mm1 = 8 bytes at ReconPtr */
-    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
-    "  movq        %%mm1, %%mm3     \n\t" /* dup to prepare for up conversion */
-    /* convert from UINT8 to INT16 */
-    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr), pixels 0-3 */
-    "  punpcklbw   %%mm7, %%mm1     \n\t" /* mm1 = INT16(ReconPtr), pixels 0-3 */
-    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr), pixels 4-7 */
-    "  punpckhbw   %%mm7, %%mm3     \n\t" /* mm3 = INT16(ReconPtr), pixels 4-7 */
-    /* start calculation */
-    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - ReconPtr */
-    "  psubw       %%mm3, %%mm2     \n\t" /* mm2 = FiltPtr - ReconPtr */
-    "  movq        %%mm0,  (%2)     \n\t" /* write answer out */
-    "  movq        %%mm2, 8(%2)     \n\t" /* write answer out */
-    /* Increment pointers */
-    "  add         $16, %2           \n\t" /* DctInputPtr advances 16 bytes = 8 words */
-    "  add         %3, %0           \n\t" /* FiltPtr += PixelsPerLine */
-    "  add         %4, %1           \n\t" /* ReconPtr += ReconPixelsPerLine */
-    ".endr                          \n\t"
-
-     : "+r" (FiltPtr), /* %0 */
-       "+r" (ReconPtr), /* %1 */
-       "+r" (DctInputPtr) /* %2 */
-     : "m" (PixelsPerLine), /* %3 */
-       "m" (ReconPixelsPerLine) /* %4 */
-     : "memory"
-  );
-}
-
-static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
-                      ogg_uint32_t PixelsPerLine) 
-{ /* 8x8 block: for each of 8 rows, writes the 8 values FiltPtr[x] - 128 as 16-bit words to DctInputPtr */
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm7, %%mm7     \n\t" /* mm7 = 0, zero source for the byte -> word unpacks */
-    "  movq      "M(V128)", %%mm1   \n\t" /* mm1 = V128 = four 16-bit words of 0x0080 (128) */
-
-    ".rept 8                        \n\t" /* assembler-unrolled: one 8-pixel row per repetition */
-    "  movq        (%0), %%mm0      \n\t" /* mm0 = 8 bytes at FiltPtr */
-    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
-    /* convert from UINT8 to INT16 */
-    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr), pixels 0-3 */
-    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr), pixels 4-7 */
-    /* start calculation */
-    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - 128 */
-    "  psubw       %%mm1, %%mm2     \n\t" /* mm2 = FiltPtr - 128 */
-    "  movq        %%mm0,  (%1)     \n\t" /* write answer out */
-    "  movq        %%mm2, 8(%1)     \n\t" /* write answer out */
-    /* Increment pointers */
-    "  add         $16, %1           \n\t" /* DctInputPtr advances 16 bytes = 8 words */
-    "  add         %2, %0           \n\t" /* FiltPtr += PixelsPerLine */
-    ".endr                          \n\t"
-
-     : "+r" (FiltPtr), /* %0 */
-       "+r" (DctInputPtr) /* %1 */
-     : "m" (PixelsPerLine) /* %2 */
-     : "memory"
-  );
-}
-
-static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
-                     unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
-                     ogg_uint32_t PixelsPerLine,
-                     ogg_uint32_t ReconPixelsPerLine) 
-{ /* 8x8 block: writes FiltPtr[x] - ((ReconPtr1[x] + ReconPtr2[x]) >> 1) as 16-bit words to DctInputPtr */
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm7, %%mm7     \n\t" /* mm7 = 0, zero source for the byte -> word unpacks */
-
-    ".rept 8                        \n\t" /* assembler-unrolled: one 8-pixel row per repetition */
-    "  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */
-    "  movq        (%1), %%mm1      \n\t" /* mm1 = ReconPtr1 */
-    "  movq        (%2), %%mm4      \n\t" /* mm4 = ReconPtr2 */
-    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
-    "  movq        %%mm1, %%mm3     \n\t" /* dup to prepare for up conversion */
-    "  movq        %%mm4, %%mm5     \n\t" /* dup to prepare for up conversion */
-    /* convert from UINT8 to INT16 */
-    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr), pixels 0-3 */
-    "  punpcklbw   %%mm7, %%mm1     \n\t" /* mm1 = INT16(ReconPtr1), pixels 0-3 */
-    "  punpcklbw   %%mm7, %%mm4     \n\t" /* mm4 = INT16(ReconPtr2), pixels 0-3 */
-    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr), pixels 4-7 */
-    "  punpckhbw   %%mm7, %%mm3     \n\t" /* mm3 = INT16(ReconPtr1), pixels 4-7 */
-    "  punpckhbw   %%mm7, %%mm5     \n\t" /* mm5 = INT16(ReconPtr2), pixels 4-7 */
-    /* average ReconPtr1 and ReconPtr2 */
-    "  paddw       %%mm4, %%mm1     \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */
-    "  paddw       %%mm5, %%mm3     \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */
-    "  psrlw       $1, %%mm1        \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2, rounded down */
-    "  psrlw       $1, %%mm3        \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2, rounded down */
-    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-    "  psubw       %%mm3, %%mm2     \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-    "  movq        %%mm0,  (%3)     \n\t" /* write answer out */
-    "  movq        %%mm2, 8(%3)     \n\t" /* write answer out */
-    /* Increment pointers */
-    "  add         $16, %3           \n\t" /* DctInputPtr advances 16 bytes = 8 words */
-    "  add         %4, %0           \n\t" /* FiltPtr += PixelsPerLine */
-    "  add         %5, %1           \n\t" /* ReconPtr1 += ReconPixelsPerLine */
-    "  add         %5, %2           \n\t" /* ReconPtr2 += ReconPixelsPerLine */
-    ".endr                          \n\t"
-
-     : "+r" (FiltPtr), /* %0 */
-       "+r" (ReconPtr1), /* %1 */
-       "+r" (ReconPtr2), /* %2 */
-       "+r" (DctInputPtr) /* %3 */
-     : "m" (PixelsPerLine), /* %4 */
-       "m" (ReconPixelsPerLine) /* %5 */
-     : "memory"
-  );
-}
-
-static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
-{ /* One 8-byte row: returns the larger of the two 4-pixel SADs (bytes 0-3 and bytes 4-7) */
-  ogg_uint32_t MaxSad;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm6, %%mm6     \n\t"	/* zero out mm6 for unpack */
-    "  pxor        %%mm7, %%mm7     \n\t" 	/* zero out mm7 for unpack */
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%2), %%mm1      \n\t"
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B, saturating at 0 */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A, saturating at 0 */
-    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
-
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"       /* ; unpack low four bytes to higher precision */
-    "  punpckhbw   %%mm7, %%mm1     \n\t"       /* ; unpack high four bytes to higher precision */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  movq        %%mm1, %%mm3     \n\t"
-    "  psrlq       $32, %%mm2       \n\t"	/* fold and add */
-    "  psrlq       $32, %%mm3       \n\t"
-    "  paddw       %%mm2, %%mm0     \n\t"
-    "  paddw       %%mm3, %%mm1     \n\t"
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  movq        %%mm1, %%mm3     \n\t"
-    "  psrlq       $16, %%mm2       \n\t"
-    "  psrlq       $16, %%mm3       \n\t"
-    "  paddw       %%mm2, %%mm0     \n\t"	/* low word of mm0 = SAD of bytes 0-3 */
-    "  paddw       %%mm3, %%mm1     \n\t"	/* low word of mm1 = SAD of bytes 4-7 */
-
-    "  psubusw     %%mm0, %%mm1     \n\t"
-    "  paddw       %%mm0, %%mm1     \n\t" 	/* mm1 = max(mm1, mm0) */
-    "  movd        %%mm1, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t"	/* keep only the low word, which holds the result */
-
-     : "=m" (MaxSad), /* %0 */
-       "+r" (Src1), /* %1 */
-       "+r" (Src2) /* %2 */
-     :
-     : "memory"
-  );
-  return MaxSad;
-}
-
-static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
-		                    ogg_uint32_t stride)
-{ /* Per-column SADs over rows 0-3 and over rows 4-7 of an 8x8 block; returns the largest of those 16 sums */
-  ogg_uint32_t MaxSad;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm3, %%mm3     \n\t"	/* zero out mm3 for unpack */
-    "  pxor        %%mm4, %%mm4     \n\t"	/* mm4 low sum */
-    "  pxor        %%mm5, %%mm5     \n\t" 	/* mm5 high sum */
-    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 low sum */
-    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 high sum */
-    "  mov         $4, %%edi        \n\t"	/* 4 rows */
-    "1:                             \n\t"	/* rows 0-3: accumulate into mm4 (columns 0-3) and mm5 (columns 4-7) */
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
-    "  paddw       %%mm0, %%mm4     \n\t"	/* accumulate difference... */
-    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
-    "  paddw       %%mm1, %%mm5     \n\t"	/* accumulate difference... */
-    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
-    "  add         %3, %2           \n\t"	/* Inc pointer into the new data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  mov         $4, %%edi        \n\t"	/* 4 rows */
-    "2:                             \n\t"	/* rows 4-7: accumulate into mm6 (columns 0-3) and mm7 (columns 4-7) */
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
-    "  paddw       %%mm0, %%mm6     \n\t"	/* accumulate difference... */
-    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
-    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
-    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
-    "  add         %3, %2           \n\t"	/* Inc pointer into the new data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 2b                       \n\t"
-
-    "  psubusw     %%mm6, %%mm7     \n\t"
-    "  paddw       %%mm6, %%mm7     \n\t" 	/* mm7 = max(mm7, mm6) */
-    "  psubusw     %%mm4, %%mm5     \n\t" 	
-    "  paddw       %%mm4, %%mm5     \n\t" 	/* mm5 = max(mm5, mm4) */
-    "  psubusw     %%mm5, %%mm7     \n\t" 	
-    "  paddw       %%mm5, %%mm7     \n\t" 	/* mm7 = max(mm5, mm7) */
-    "  movq        %%mm7, %%mm6     \n\t"
-    "  psrlq       $32, %%mm6       \n\t"
-    "  psubusw     %%mm6, %%mm7     \n\t" 	
-    "  paddw       %%mm6, %%mm7     \n\t" 	/* mm7 = max(mm7, mm7 >> 32): fold upper two words onto lower two */
-    "  movq        %%mm7, %%mm6     \n\t"
-    "  psrlq       $16, %%mm6       \n\t"
-    "  psubusw     %%mm6, %%mm7     \n\t" 	
-    "  paddw       %%mm6, %%mm7     \n\t" 	/* mm7 = max(mm7, mm7 >> 16): low word = overall maximum */
-    "  movd        %%mm7, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t"	/* keep only the low word, which holds the result */
-
-     : "=r" (MaxSad), /* %0 */
-       "+r" (Src1), /* %1 */
-       "+r" (Src2) /* %2 */
-     : "r" (stride) /* %3 */
-     : "memory", "edi"
-  );
-
-  return MaxSad;
-}
-
-static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
-		       	    unsigned char *ptr2, ogg_uint32_t stride2)
-{ /* Returns the sum of absolute differences between two 8x8 byte blocks (rows stride1 / stride2 bytes apart) */
-  ogg_uint32_t  DiffVal;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-    "  pxor        %%mm6, %%mm6     \n\t"	/* zero out mm6 for unpack */
-    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 contains the result */
-    ".rept 8                         \n\t"	/* assembler-unrolled: one row per repetition */
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%2), %%mm1      \n\t"
-    "  movq        %%mm0, %%mm2     \n\t"
-
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
-    "  paddw       %%mm0, %%mm7     \n\t"	/* accumulate difference... */
-    "  punpckhbw   %%mm6, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
-    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
-    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
-    "  add         %4, %2           \n\t"	/* Inc pointer into ref data */
-    ".endr                          \n\t"
-
-    "  movq        %%mm7, %%mm0     \n\t"	/* horizontal add of the four word sums in mm7 */
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddw       %%mm0, %%mm7     \n\t"
-    "  movq        %%mm7, %%mm0     \n\t"
-    "  psrlq       $16, %%mm7       \n\t"
-    "  paddw       %%mm0, %%mm7     \n\t"	/* low word of mm7 = total SAD */
-    "  movd        %%mm7, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t"	/* keep only the low word, which holds the result */
-
-     : "=m" (DiffVal), /* %0 */
-       "+r" (ptr1), /* %1 */
-       "+r" (ptr2) /* %2 */
-     : "r" (stride1), /* %3 */
-       "r" (stride2) /* %4 */
-     : "memory"
-  );
-
-  return DiffVal;
-}
-
-static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
-		       		  unsigned char *ptr2, ogg_uint32_t stride2, 
-			   	  ogg_uint32_t thres)
-{ /* thres is ignored: this variant always returns the full 8x8 SAD from sad8x8__mmx, with no early exit */
-  return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
-}
-
-static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
-		                      unsigned char *RefDataPtr1,
-			              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
-			              ogg_uint32_t thres)
-{ /* 8x8 SAD of SrcData against the per-byte average (rounded down) of RefDataPtr1 and RefDataPtr2; thres is not used */
-  ogg_uint32_t  DiffVal;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pcmpeqd     %%mm5, %%mm5     \n\t"	/* all ones; the paddb below doubles each byte to give fefefefefefefefe in mm5 */
-    "  paddb       %%mm5, %%mm5     \n\t"
-   
-    "  pxor        %%mm6, %%mm6     \n\t"	/* zero out mm6 for unpack */
-    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 contains the result */
-    "  mov         $8, %%edi        \n\t"	/* 8 rows */
-    "1:                             \n\t"
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-
-    "  movq        (%2), %%mm2      \n\t"
-    "  movq        (%3), %%mm3      \n\t"	/* take average of mm2 and mm3 */
-    "  movq        %%mm2, %%mm1     \n\t"
-    "  pand        %%mm3, %%mm1     \n\t"	/* mm1 = a & b */
-    "  pxor        %%mm2, %%mm3     \n\t"	/* mm3 = a ^ b */
-    "  pand        %%mm5, %%mm3     \n\t"	/* clear bit 0 of each byte so the 64-bit shift cannot leak into the next byte */
-    "  psrlq       $1, %%mm3        \n\t"	/* mm3 = (a ^ b) >> 1 per byte */
-    "  paddb       %%mm3, %%mm1     \n\t"	/* mm1 = (a & b) + ((a ^ b) >> 1) = (a + b) >> 1 per byte */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"    	/* and or gives abs difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
-    "  paddw       %%mm0, %%mm7     \n\t"	/* accumulate difference... */
-    "  punpckhbw   %%mm6, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
-    "  add         %4, %1           \n\t"	/* Inc pointer into the new data */
-    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
-    "  add         %5, %2           \n\t"	/* Inc pointer into ref data */
-    "  add         %5, %3           \n\t"	/* Inc pointer into ref data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  movq        %%mm7, %%mm0     \n\t"	/* horizontal add of the four word sums in mm7 */
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddw       %%mm0, %%mm7     \n\t"
-    "  movq        %%mm7, %%mm0     \n\t"
-    "  psrlq       $16, %%mm7       \n\t"
-    "  paddw       %%mm0, %%mm7     \n\t"	/* low word of mm7 = total SAD */
-    "  movd        %%mm7, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t"	/* keep only the low word, which holds the result */
-
-     : "=m" (DiffVal), /* %0 */
-       "+r" (SrcData), /* %1 */
-       "+r" (RefDataPtr1), /* %2 */
-       "+r" (RefDataPtr2) /* %3 */
-     : "m" (SrcStride), /* %4 */
-       "m" (RefStride) /* %5 */
-     : "edi", "memory"
-  );
-
-  return DiffVal;
-}
-
-static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
-{ /* Over the 64 pixels of an 8x8 block: XSum = sum of pixels, XXSum = sum of squared pixels; returns 64*XXSum - XSum*XSum */
-  ogg_uint32_t  XSum;
-  ogg_uint32_t  XXSum;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm5, %%mm5     \n\t"	/* mm5 = running sums of pixels (four 16-bit words) */
-    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 = 0 for unpack */
-    "  pxor        %%mm7, %%mm7     \n\t"	/* mm7 = running sums of squares (two 32-bit dwords) */
-    "  mov         $8, %%edi        \n\t"	/* 8 rows */
-    "1:                             \n\t"
-    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        %%mm0, %%mm2     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* mm0 = pixels 0-3 as words */
-    "  punpckhbw   %%mm6, %%mm2     \n\t"	/* mm2 = pixels 4-7 as words */
-
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  paddw       %%mm2, %%mm5     \n\t"
-
-    "  pmaddwd     %%mm0, %%mm0     \n\t"	/* square each word, add adjacent pairs into dwords */
-    "  pmaddwd     %%mm2, %%mm2     \n\t"
-    
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  paddd       %%mm2, %%mm7     \n\t"
-
-    "  add         %3, %2           \n\t"	/* Inc pointer into src data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  movq        %%mm5, %%mm0     \n\t"	/* horizontal add of the four word sums in mm5 */
-    "  psrlq       $32, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $16, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movd        %%mm5, %%edi     \n\t"
-    "  movsx       %%di, %%edi      \n\t"	/* sign-extend the 16-bit total to 32 bits */
-    "  movl        %%edi, %0        \n\t"
-
-    "  movq        %%mm7, %%mm0     \n\t"	/* horizontal add of the two dword sums in mm7 */
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %1        \n\t"
-
-     : "=r" (XSum), /* %0 */
-       "=r" (XXSum), /* %1 */
-       "+r" (DataPtr) /* %2 */
-     : "r" (Stride) /* %3 */
-     : "edi", "memory"
-  );
-
-  /* Mis-match metric: 64*XXSum - XSum*XSum, i.e. the population variance of the 64 samples scaled by 64*64. */
-  return (( (XXSum<<6) - XSum*XSum ) );
-}
-
-static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
-		                 unsigned char *RefDataPtr, ogg_uint32_t RefStride)
-{ /* With d = SrcData[x] - RefDataPtr[x] over an 8x8 block: XSum = sum of d, XXSum = sum of d*d; returns 64*XXSum - XSum*XSum */
-  ogg_uint32_t  XSum;
-  ogg_uint32_t  XXSum;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm5, %%mm5     \n\t"	/* mm5 = running sums of differences (four 16-bit words) */
-    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 = 0 for unpack */
-    "  pxor        %%mm7, %%mm7     \n\t"	/* mm7 = running sums of squared differences (two 32-bit dwords) */
-    "  mov         $8, %%edi        \n\t"	/* 8 rows */
-    "1:                             \n\t"
-    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%3), %%mm1      \n\t"
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  movq        %%mm1, %%mm3     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* src pixels 0-3 as words */
-    "  punpcklbw   %%mm6, %%mm1     \n\t"	/* ref pixels 0-3 as words */
-    "  punpckhbw   %%mm6, %%mm2     \n\t"	/* src pixels 4-7 as words */
-    "  punpckhbw   %%mm6, %%mm3     \n\t"	/* ref pixels 4-7 as words */
-
-    "  psubsw      %%mm1, %%mm0     \n\t"	/* mm0 = src - ref, pixels 0-3 */
-    "  psubsw      %%mm3, %%mm2     \n\t"	/* mm2 = src - ref, pixels 4-7 */
-
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  paddw       %%mm2, %%mm5     \n\t"
-
-    "  pmaddwd     %%mm0, %%mm0     \n\t"	/* square each word, add adjacent pairs into dwords */
-    "  pmaddwd     %%mm2, %%mm2     \n\t"
-    
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  paddd       %%mm2, %%mm7     \n\t"
-
-    "  add         %4, %2           \n\t"	/* Inc pointer into src data */
-    "  add         %5, %3           \n\t"	/* Inc pointer into ref data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  movq        %%mm5, %%mm0     \n\t"	/* horizontal add of the four word sums in mm5 */
-    "  psrlq       $32, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $16, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movd        %%mm5, %%edi     \n\t"
-    "  movsx       %%di, %%edi      \n\t"	/* sign-extend the 16-bit total (it can be negative) to 32 bits */
-    "  movl        %%edi, %0        \n\t"
-
-    "  movq        %%mm7, %%mm0     \n\t"	/* horizontal add of the two dword sums in mm7 */
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %1        \n\t"
-
-     : "=m" (XSum), /* %0 */
-       "=m" (XXSum), /* %1 */
-       "+r" (SrcData), /* %2 */
-       "+r" (RefDataPtr) /* %3 */
-     : "m" (SrcStride), /* %4 */
-       "m" (RefStride) /* %5 */
-     : "edi", "memory"
-  );
-
-  /* Mis-match metric: 64*XXSum - XSum*XSum, i.e. the population variance of the 64 differences scaled by 64*64. */
-  return (( (XXSum<<6) - XSum*XSum ));
-}
-
-static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
-		                     unsigned char *RefDataPtr1,
-				     unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
-{ /* As inter8x8_err, but the reference is the per-byte average (rounded down) of RefDataPtr1 and RefDataPtr2 */
-  ogg_uint32_t XSum;
-  ogg_uint32_t XXSum;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pcmpeqd     %%mm4, %%mm4     \n\t"	/* all ones; the paddb below doubles each byte to give fefefefefefefefe in mm4 */
-    "  paddb       %%mm4, %%mm4     \n\t"
-    "  pxor        %%mm5, %%mm5     \n\t"	/* mm5 = running sums of differences (four 16-bit words) */
-    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 = 0 for unpack */
-    "  pxor        %%mm7, %%mm7     \n\t"	/* mm7 = running sums of squared differences (two 32-bit dwords) */
-    "  mov         $8, %%edi        \n\t"	/* 8 rows */
-    "1:                             \n\t"
-    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
-
-    "  movq        (%3), %%mm2      \n\t"
-    "  movq        (%4), %%mm3      \n\t"	/* take average of mm2 and mm3 */
-    "  movq        %%mm2, %%mm1     \n\t"
-    "  pand        %%mm3, %%mm1     \n\t"	/* mm1 = a & b */
-    "  pxor        %%mm2, %%mm3     \n\t"	/* mm3 = a ^ b */
-    "  pand        %%mm4, %%mm3     \n\t"	/* clear bit 0 of each byte so the 64-bit shift cannot leak into the next byte */
-    "  psrlq       $1, %%mm3        \n\t"	/* mm3 = (a ^ b) >> 1 per byte */
-    "  paddb       %%mm3, %%mm1     \n\t"	/* mm1 = (a & b) + ((a ^ b) >> 1) = (a + b) >> 1 per byte */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  movq        %%mm1, %%mm3     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* src pixels 0-3 as words */
-    "  punpcklbw   %%mm6, %%mm1     \n\t"	/* averaged ref pixels 0-3 as words */
-    "  punpckhbw   %%mm6, %%mm2     \n\t"	/* src pixels 4-7 as words */
-    "  punpckhbw   %%mm6, %%mm3     \n\t"	/* averaged ref pixels 4-7 as words */
-
-    "  psubsw      %%mm1, %%mm0     \n\t"	/* mm0 = src - ref, pixels 0-3 */
-    "  psubsw      %%mm3, %%mm2     \n\t"	/* mm2 = src - ref, pixels 4-7 */
-
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  paddw       %%mm2, %%mm5     \n\t"
-
-    "  pmaddwd     %%mm0, %%mm0     \n\t"	/* square each word, add adjacent pairs into dwords */
-    "  pmaddwd     %%mm2, %%mm2     \n\t"
-    
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  paddd       %%mm2, %%mm7     \n\t"
-
-    "  add         %5, %2           \n\t"	/* Inc pointer into src data */
-    "  add         %6, %3           \n\t"	/* Inc pointer into ref data */
-    "  add         %6, %4           \n\t"	/* Inc pointer into ref data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  movq        %%mm5, %%mm0     \n\t"	/* horizontal add of the four word sums in mm5 */
-    "  psrlq       $32, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $16, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movd        %%mm5, %%edi     \n\t"
-    "  movsx       %%di, %%edi      \n\t"	/* sign-extend the 16-bit total (it can be negative) to 32 bits */
-    "  movl        %%edi, %0        \n\t"
-
-    "  movq        %%mm7, %%mm0     \n\t"	/* horizontal add of the two dword sums in mm7 */
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %1        \n\t"
-
-     : "=m" (XSum), /* %0 */
-       "=m" (XXSum), /* %1 */
-       "+r" (SrcData), /* %2 */
-       "+r" (RefDataPtr1), /* %3 */
-       "+r" (RefDataPtr2) /* %4 */
-     : "m" (SrcStride), /* %5 */
-       "m" (RefStride) /* %6 */
-     : "edi", "memory"
-  );
-
-  /* Mis-match metric: 64*XXSum - XSum*XSum, i.e. the population variance of the 64 differences scaled by 64*64. */
-  return (( (XXSum<<6) - XSum*XSum ));
-}
-
-static void restore_fpu (void)
-{ /* Issues emms, which empties the MMX state so the x87 FPU registers can be used again after MMX code */
-  __asm__ __volatile__ (
-    "  emms                         \n\t"
-  );
-}
-
-void dsp_mmx_init(DspFunctions *funcs)
-{ /* Points the entries of *funcs listed below at this file's MMX implementations; other entries are left untouched */
-  fprintf(stderr, "enabling accelerated x86_32 mmx dsp functions.\n"); /* unconditional notice on stderr */
-  funcs->restore_fpu = restore_fpu;
-  funcs->sub8x8 = sub8x8__mmx;
-  funcs->sub8x8_128 = sub8x8_128__mmx;
-  funcs->sub8x8avg2 = sub8x8avg2__mmx;
-  funcs->row_sad8 = row_sad8__mmx;
-  funcs->col_sad8x8 = col_sad8x8__mmx;
-  funcs->sad8x8 = sad8x8__mmx;
-  funcs->sad8x8_thres = sad8x8_thres__mmx; /* this variant ignores its thres argument */
-  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx;
-  funcs->intra8x8_err = intra8x8_err__mmx;
-  funcs->inter8x8_err = inter8x8_err__mmx;
-  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
-}
-

Copied: trunk/theora/lib/x86_32/dsp_mmx.c (from rev 11426, branches/theora-mmx/lib/x86_32/dsp_mmx.c)

Deleted: trunk/theora/lib/x86_32/dsp_mmxext.c
===================================================================
--- branches/theora-mmx/lib/x86_32/dsp_mmxext.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/x86_32/dsp_mmxext.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -1,318 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "dsp.h"
-
-static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
-		       	    unsigned char *ptr2, ogg_uint32_t stride2)
-{ /* Returns the sum of absolute differences between two 8x8 byte blocks, using psadbw for each row */
-  ogg_uint32_t  DiffVal;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-    "  pxor %%mm7, %%mm7            \n\t" 	/* mm7 contains the result */
-
-    ".rept 7                        \n\t"	/* rows 0-6; the last row follows without the pointer increments */
-    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
-    "  movq (%2), %%mm1             \n\t"
-    "  psadbw %%mm1, %%mm0          \n\t"	/* mm0 = sum of the 8 absolute byte differences */
-    "  add %3, %1                   \n\t"	/* Inc pointer into the new data */
-    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
-    "  add %4, %2                   \n\t"	/* Inc pointer into ref data */
-    ".endr                          \n\t"
-
-    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
-    "  movq (%2), %%mm1             \n\t"
-    "  psadbw %%mm1, %%mm0          \n\t"
-    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
-    "  movd %%mm7, %0               \n\t"
-
-     : "=r" (DiffVal), /* %0 */
-       "+r" (ptr1), /* %1 */
-       "+r" (ptr2) /* %2 */
-     : "r" (stride1), /* %3 */
-       "r" (stride2) /* %4 */
-     : "memory"
-  );
-
-  return DiffVal;
-}
-
-static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
-		       		  unsigned char *ptr2, ogg_uint32_t stride2, 
-			   	  ogg_uint32_t thres)
-{ /* Full 8x8 sum of absolute differences via psadbw; thres is not used, so there is no early exit */
-  ogg_uint32_t  DiffVal;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-    "  pxor %%mm7, %%mm7            \n\t" 	/* mm7 contains the result */
-
-    ".rept 8                        \n\t"	/* assembler-unrolled: one row per repetition */
-    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
-    "  movq (%2), %%mm1             \n\t"
-    "  psadbw %%mm1, %%mm0          \n\t"	/* mm0 = sum of the 8 absolute byte differences */
-    "  add %3, %1                   \n\t"	/* Inc pointer into the new data */
-    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
-    "  add %4, %2                   \n\t"	/* Inc pointer into ref data */
-    ".endr                          \n\t"
-
-    "  movd %%mm7, %0               \n\t"
-
-     : "=r" (DiffVal), /* %0 */
-       "+r" (ptr1), /* %1 */
-       "+r" (ptr2) /* %2 */
-     : "r" (stride1), /* %3 */
-       "r" (stride2) /* %4 */
-     : "memory"
-  );
-
-  return DiffVal;
-}
-
-static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
-		                      unsigned char *RefDataPtr1,
-			              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
-			              ogg_uint32_t thres)
-{ /* 8x8 SAD of SrcData against the pavgb average of RefDataPtr1 and RefDataPtr2; thres is not used */
-  ogg_uint32_t  DiffVal;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-    "  pxor %%mm7, %%mm7            \n\t" 	/* mm7 contains the result */
-    ".rept 8                        \n\t"	/* assembler-unrolled: one row per repetition */
-    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
-    "  movq (%2), %%mm1             \n\t"
-    "  movq (%3), %%mm2             \n\t"
-    "  pavgb %%mm2, %%mm1           \n\t"	/* mm1 = per-byte average of the two references, rounded up: (a + b + 1) >> 1 */
-    "  psadbw %%mm1, %%mm0          \n\t"	/* mm0 = sum of the 8 absolute byte differences */
-
-    "  add %4, %1                   \n\t"	/* Inc pointer into the new data */
-    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
-    "  add %5, %2                   \n\t"	/* Inc pointer into ref data */
-    "  add %5, %3                   \n\t"	/* Inc pointer into ref data */
-    ".endr                          \n\t"
-
-    "  movd %%mm7, %0               \n\t"
-     : "=m" (DiffVal), /* %0 */
-       "+r" (SrcData), /* %1 */
-       "+r" (RefDataPtr1), /* %2 */
-       "+r" (RefDataPtr2) /* %3 */
-     : "m" (SrcStride), /* %4 */
-       "m" (RefStride) /* %5 */
-     : "memory"
-  );
-
-  return DiffVal;
-}
-		
-static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
-{ /* One 8-byte row: returns the larger of the two 4-pixel SADs (bytes 0-3 and bytes 4-7) */
-  ogg_uint32_t MaxSad;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  movd        (%1), %%mm0      \n\t"	/* bytes 0-3 of Src1, upper half of mm0 zeroed */
-    "  movd        (%2), %%mm1      \n\t"	/* bytes 0-3 of Src2 */
-    "  psadbw      %%mm0, %%mm1     \n\t"	/* mm1 = SAD of bytes 0-3 */
-    "  movd        4(%1), %%mm2     \n\t"	/* bytes 4-7 of Src1 */
-    "  movd        4(%2), %%mm3     \n\t"	/* bytes 4-7 of Src2 */
-    "  psadbw      %%mm2, %%mm3     \n\t"	/* mm3 = SAD of bytes 4-7 */
-
-    "  pmaxsw      %%mm1, %%mm3     \n\t"	/* mm3 = max(mm3, mm1) */
-    "  movd        %%mm3, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t"	/* keep only the low word, which holds the result */
-
-     : "=m" (MaxSad), /* %0 */
-       "+r" (Src1), /* %1 */
-       "+r" (Src2) /* %2 */
-     :
-     : "memory"
-  );
-
-  return MaxSad;
-}
-
-static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
-		                    ogg_uint32_t stride)
-{ /* Per-column SADs over rows 0-3 and over rows 4-7 of an 8x8 block; returns the largest of those 16 sums */
-  ogg_uint32_t MaxSad;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm3, %%mm3     \n\t"	/* zero out mm3 for unpack */
-    "  pxor        %%mm4, %%mm4     \n\t"	/* mm4 low sum */
-    "  pxor        %%mm5, %%mm5     \n\t" 	/* mm5 high sum */
-    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 low sum */
-    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 high sum */
-    "  mov         $4, %%edi        \n\t"	/* 4 rows */
-    "1:                             \n\t"	/* rows 0-3: accumulate into mm4 (columns 0-3) and mm5 (columns 4-7) */
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
-    "  paddw       %%mm0, %%mm4     \n\t"	/* accumulate difference... */
-    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
-    "  paddw       %%mm1, %%mm5     \n\t"	/* accumulate difference... */
-    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
-    "  add         %3, %2           \n\t"	/* Inc pointer into the new data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  mov         $4, %%edi        \n\t"	/* 4 rows */
-    "2:                             \n\t"	/* rows 4-7: accumulate into mm6 (columns 0-3) and mm7 (columns 4-7) */
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
-    "  paddw       %%mm0, %%mm6     \n\t"	/* accumulate difference... */
-    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
-    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
-    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
-    "  add         %3, %2           \n\t"	/* Inc pointer into the new data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 2b                       \n\t"
-
-    "  pmaxsw      %%mm6, %%mm7     \n\t"	/* mm7 = max(mm7, mm6), per word */
-    "  pmaxsw      %%mm4, %%mm5     \n\t"	/* mm5 = max(mm5, mm4), per word */
-    "  pmaxsw      %%mm5, %%mm7     \n\t"	/* mm7 = max(mm7, mm5), per word */
-    "  movq        %%mm7, %%mm6     \n\t"
-    "  psrlq       $32, %%mm6       \n\t"
-    "  pmaxsw      %%mm6, %%mm7     \n\t"	/* mm7 = max(mm7, mm7 >> 32): fold upper two words onto lower two */
-    "  movq        %%mm7, %%mm6     \n\t"
-    "  psrlq       $16, %%mm6       \n\t"
-    "  pmaxsw      %%mm6, %%mm7     \n\t"	/* mm7 = max(mm7, mm7 >> 16): low word = overall maximum */
-    "  movd        %%mm7, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t"	/* keep only the low word, which holds the result */
-
-     : "=r" (MaxSad), /* %0 */
-       "+r" (Src1), /* %1 */
-       "+r" (Src2) /* %2 */
-     : "r" (stride) /* %3 */
-     : "memory", "edi"
-  );
-
-  return MaxSad;
-}
-
-static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
-		                     unsigned char *RefDataPtr1,
-				     unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
-{
-  ogg_uint32_t XSum;	/* sum of the 64 differences (16-bit total, sign-extended by the asm) */
-  ogg_uint32_t XXSum;	/* sum of the 64 squared differences */
-  /* Compares an 8x8 source block with the pavgb average of two reference blocks; the return expression gives the metric. */
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-    /* mm4, mm6: zero (unpack operands); mm5: four 16-bit difference sums; mm7: two 32-bit sums of squares */
-    "  pxor        %%mm4, %%mm4     \n\t"
-    "  pxor        %%mm5, %%mm5     \n\t"
-    "  pxor        %%mm6, %%mm6     \n\t"
-    "  pxor        %%mm7, %%mm7     \n\t"
-    "  mov         $8, %%edi        \n\t"	/* 8 rows */
-    "1:                             \n\t"
-    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
-
-    "  movq        (%3), %%mm2      \n\t"	/* 8 bytes from RefDataPtr1 */
-    "  movq        (%4), %%mm1      \n\t"	/* 8 bytes from RefDataPtr2 */
-    "  pavgb       %%mm2, %%mm1     \n\t"	/* mm1 = byte-wise rounded average of the two references */
-
-    "  movq        %%mm0, %%mm2     \n\t"	/* copies for the high four bytes */
-    "  movq        %%mm1, %%mm3     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* source, low four bytes -> words */
-    "  punpcklbw   %%mm4, %%mm1     \n\t"	/* average, low four bytes -> words */
-    "  punpckhbw   %%mm6, %%mm2     \n\t"	/* source, high four bytes -> words */
-    "  punpckhbw   %%mm4, %%mm3     \n\t"	/* average, high four bytes -> words */
-
-    "  psubsw      %%mm1, %%mm0     \n\t"	/* mm0 = source - average (low four) */
-    "  psubsw      %%mm3, %%mm2     \n\t"	/* mm2 = source - average (high four) */
-
-    "  paddw       %%mm0, %%mm5     \n\t"	/* accumulate differences */
-    "  paddw       %%mm2, %%mm5     \n\t"
-
-    "  pmaddwd     %%mm0, %%mm0     \n\t"	/* squares, adjacent pairs added into two dwords */
-    "  pmaddwd     %%mm2, %%mm2     \n\t"
-    
-    "  paddd       %%mm0, %%mm7     \n\t"	/* accumulate squared differences */
-    "  paddd       %%mm2, %%mm7     \n\t"
-
-    "  add         %5, %2           \n\t"	/* Inc pointer into src data */
-    "  add         %6, %3           \n\t"	/* Inc pointer into ref data */
-    "  add         %6, %4           \n\t"	/* Inc pointer into ref data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  movq        %%mm5, %%mm0     \n\t"	/* fold the four word lanes of mm5 into lane 0 */
-    "  psrlq       $32, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $16, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movd        %%mm5, %%edi     \n\t"
-    "  movsx       %%di, %%edi      \n\t"	/* sign-extend the 16-bit total */
-    "  movl        %%edi, %0        \n\t"	/* XSum */
-
-    "  movq        %%mm7, %%mm0     \n\t"	/* add the two dword lanes of mm7 */
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %1        \n\t"	/* XXSum */
-
-     : "=m" (XSum),	/* %0 */
-       "=m" (XXSum),	/* %1 */
-       "+r" (SrcData), 	/* %2, advanced by the asm */
-       "+r" (RefDataPtr1),	/* %3, advanced by the asm */
-       "+r" (RefDataPtr2) 	/* %4, advanced by the asm */
-     : "m" (SrcStride),	/* %5 */
-       "m" (RefStride)	/* %6 */
-     : "edi", "memory"
-  );
-
-  /* Mismatch metric: 64*XXSum - XSum*XSum, i.e. the population variance of the 64 differences scaled by 64*64. */
-  return (( (XXSum<<6) - XSum*XSum ));
-}
-
-void dsp_mmxext_init(DspFunctions *funcs)
-{ /* Announce the MMXEXT path on stderr, then point six DspFunctions slots at this file's kernels. */
-  fprintf(stderr, "enabling accelerated x86_32 mmxext dsp functions.\n");
-  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext;
-  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;
-  funcs->sad8x8_thres = sad8x8_thres__mmxext;
-  funcs->sad8x8 = sad8x8__mmxext;
-  funcs->col_sad8x8 = col_sad8x8__mmxext;
-  funcs->row_sad8 = row_sad8__mmxext;
-}
-

Copied: trunk/theora/lib/x86_32/dsp_mmxext.c (from rev 11426, branches/theora-mmx/lib/x86_32/dsp_mmxext.c)

Deleted: trunk/theora/lib/x86_32/fdct_mmx.c
===================================================================
--- branches/theora-mmx/lib/x86_32/fdct_mmx.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/x86_32/fdct_mmx.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -1,342 +0,0 @@
-;//==========================================================================
-;//
-;//  THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
-;//  KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-;//  IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR
-;//  PURPOSE.
-;//
-;//  Copyright (c) 1999 - 2001  On2 Technologies Inc. All Rights Reserved.
-;//
-;//--------------------------------------------------------------------------
-
-#include <stdio.h>
-#include <theora/theora.h>
-#include "dsp.h"
-
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC1S7 = 0x0fb15fb15fb15fb15LL;	/* four 16-bit lanes of 0xfb15 = 64277 ~ 65536*cos(1*pi/16) */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC2S6 = 0x0ec83ec83ec83ec83LL;	/* 0xec83 = 60547 ~ 65536*cos(2*pi/16) */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL;	/* 0xd4db = 54491 ~ 65536*cos(3*pi/16) */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC4S4 = 0x0b505b505b505b505LL;	/* 0xb505 = 46341 ~ 65536*cos(4*pi/16) */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL;	/* 0x8e3a = 36410 ~ 65536*cos(5*pi/16) */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC6S2 = 0x061f861f861f861f8LL;	/* 0x61f8 = 25080 ~ 65536*cos(6*pi/16); below 0x8000 */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC7S1 = 0x031f131f131f131f1LL;	/* 0x31f1 = 12785 ~ 65536*cos(7*pi/16); below 0x8000 */
-/* M(sym) turns a C symbol into the string used inside the asm text; "_" is prepended on the targets tested below (presumably their symbol-name convention). */
-#if defined(__MINGW32__) || defined(__CYGWIN__) || \
-    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
-# define M(a) "_" #a
-#else
-# define M(a) #a
-#endif
-
-/***********************************************************************
- *	File:			fdct_m.asm
- *
- *	Description:
- *					This function performs a 2-D forward DCT on an 8x8 block
- *					
- *
- *	Input:			Pointers to input source data buffer and destination 
- *					buffer.
- *
- *	Note:			none
- *
- *	Special Notes:	We try to do the truncation right to match the result 
- *					of the c version. 
- *
- ************************************************************************/
-
-/* One forward-DCT pass over four 16-bit lanes at once: reads quadwords ip0..ip7 and overwrites each in place with its output term; temp is an 8-byte scratch slot. */
-#define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp)                        \
-  "  movq      " #ip0 ", %%mm0      \n\t"                                     \
-  "  movq      " #ip1 ", %%mm1      \n\t"                                     \
-  "  movq      " #ip3 ", %%mm2      \n\t"                                     \
-  "  movq      " #ip5 ", %%mm3      \n\t"                                     \
-  "  movq        %%mm0, %%mm4       \n\t"                                     \
-  "  movq        %%mm1, %%mm5       \n\t"                                     \
-  "  movq        %%mm2, %%mm6       \n\t"                                     \
-  "  movq        %%mm3, %%mm7       \n\t"                                     \
-  /* Butterflies: isNM = ipN + ipM, idNM = ipN - ipM */                        \
-  "  paddsw    " #ip7 ", %%mm0      \n\t" /* mm0 = ip0 + ip7 = is07 */        \
-  "  paddsw    " #ip2 ", %%mm1      \n\t" /* mm1 = ip1 + ip2 = is12 */        \
-  "  paddsw    " #ip4 ", %%mm2      \n\t" /* mm2 = ip3 + ip4 = is34 */        \
-  "  paddsw    " #ip6 ", %%mm3      \n\t" /* mm3 = ip5 + ip6 = is56 */        \
-  "  psubsw    " #ip7 ", %%mm4      \n\t" /* mm4 = ip0 - ip7 = id07 */        \
-  "  psubsw    " #ip2 ", %%mm5      \n\t" /* mm5 = ip1 - ip2 = id12 */        \
-                                                                              \
-  "  psubsw      %%mm2, %%mm0       \n\t" /* mm0 = is07 - is34 */             \
-                                                                              \
-  "  paddsw      %%mm2, %%mm2       \n\t" /* mm2 = 2 * is34 */                \
-                                                                              \
-  "  psubsw    " #ip4 ", %%mm6      \n\t" /* mm6 = ip3 - ip4 = id34 */        \
-                                                                              \
-  "  paddsw      %%mm0, %%mm2       \n\t" /* mm2 = is07 + is34 = is0734 */    \
-  "  psubsw      %%mm3, %%mm1       \n\t" /* mm1 = is12 - is56 */             \
-  "  movq        %%mm0," #temp "    \n\t" /* Save is07 - is34 to free mm0; */ \
-  "  paddsw      %%mm3, %%mm3       \n\t" /* mm3 = 2 * is56 */                \
-  "  paddsw      %%mm1, %%mm3       \n\t" /* mm3 = is12 + is56 = is1256 */    \
-                                                                              \
-  "  psubsw    " #ip6 ", %%mm7      \n\t" /* mm7 = ip5 - ip6 = id56 */        \
-  /* ------------------------------------------------------------------- */   \
-  "  psubsw      %%mm7, %%mm5       \n\t" /* mm5 = id12 - id56 */             \
-  "  paddsw      %%mm7, %%mm7       \n\t" /* mm7 = 2 * id56 */                \
-  "  paddsw      %%mm5, %%mm7       \n\t" /* mm7 = id12 + id56 */             \
-  /* ------------------------------------------------------------------- */   \
-  "  psubsw      %%mm3, %%mm2       \n\t" /* mm2 = is0734 - is1256 */         \
-  "  paddsw      %%mm3, %%mm3       \n\t" /* mm3 = 2 * is1256 */              \
-                                                                              \
-  "  movq        %%mm2, %%mm0       \n\t" /* make a copy */                   \
-  "  paddsw      %%mm2, %%mm3       \n\t" /* mm3 = is0734 + is1256 */         \
-  /* pmulhw reads a constant >= 0x8000 as negative, giving c*x - x: x is added back, then x's sign bit (psrlw $15) */ \
-  "  pmulhw   "M(xC4S4)", %%mm0     \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \
-  "  paddw       %%mm2, %%mm0       \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \
-  "  psrlw       $15, %%mm2         \n\t" /* mm2 = sign bit of ( is0734 - is1256 ) */ \
-  "  paddw       %%mm2, %%mm0       \n\t" /* Truncate mm0, now it is op[4] */ \
-                                                                              \
-  "  movq        %%mm3, %%mm2       \n\t" /* copy of ( is0734 + is1256 ) for its sign bit */ \
-  "  movq        %%mm0," #ip4 "     \n\t" /* store op[4] to ip4; mm0 is free again */ \
-                                                                              \
-  "  movq        %%mm3, %%mm0       \n\t"                                     \
-  "  pmulhw   "M(xC4S4)", %%mm3     \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
-                                                                              \
-  "  psrlw       $15, %%mm2         \n\t" /* mm2 = sign bit of ( is0734 + is1256 ) */ \
-  "  paddw       %%mm0, %%mm3       \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 )	 */ \
-  "  paddw       %%mm2, %%mm3       \n\t" /* Truncate mm3, now it is op[0] */ \
-                                                                              \
-  "  movq        %%mm3," #ip0 "     \n\t" /* store op[0] to ip0 */            \
-  /* ------------------------------------------------------------------- */   \
-  "  movq      " #temp ", %%mm3     \n\t" /* mm3 = irot_input_y */            \
-  "  pmulhw   "M(xC2S6)", %%mm3     \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \
-                                                                              \
-  "  movq      " #temp ", %%mm2     \n\t"                                     \
-  "  movq        %%mm2, %%mm0       \n\t"                                     \
-                                                                              \
-  "  psrlw       $15, %%mm2         \n\t" /* mm2 = sign bit of irot_input_y */ \
-  "  paddw       %%mm0, %%mm3       \n\t" /* mm3 = xC2S6 * irot_input_y */    \
-                                                                              \
-  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
-  "  movq        %%mm5, %%mm0       \n\t" /* mm5 (id12 - id56) is irot_input_x here */ \
-                                                                              \
-  "  movq        %%mm5, %%mm2       \n\t"                                     \
-  "  pmulhw   "M(xC6S2)", %%mm0     \n\t" /* mm0 = xC6S2 * irot_input_x */    \
-                                                                              \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-  "  paddw       %%mm2, %%mm0       \n\t" /* Truncated */                     \
-                                                                              \
-  "  paddsw      %%mm0, %%mm3       \n\t" /* ip[2] */                         \
-  "  movq        %%mm3," #ip2 "     \n\t" /* Save ip2 */                      \
-                                                                              \
-  "  movq        %%mm5, %%mm0       \n\t"                                     \
-  "  movq        %%mm5, %%mm2       \n\t"                                     \
-                                                                              \
-  "  pmulhw   "M(xC2S6)", %%mm5     \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  movq      " #temp ", %%mm3     \n\t"                                     \
-  "  paddw       %%mm0, %%mm5       \n\t" /* mm5 = xC2S6 * irot_input_x */    \
-                                                                              \
-  "  paddw       %%mm2, %%mm5       \n\t" /* Truncated */                     \
-  "  movq        %%mm3, %%mm2       \n\t"                                     \
-                                                                              \
-  "  pmulhw   "M(xC6S2)", %%mm3     \n\t" /* mm3 = xC6S2 * irot_input_y */    \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
-  "  psubsw      %%mm5, %%mm3       \n\t" /* mm3 = xC6S2 * irot_input_y - xC2S6 * irot_input_x */ \
-                                                                              \
-  "  movq        %%mm3," #ip6 "     \n\t" /* store to ip6 */                  \
-  /* ------------------------------------------------------------------- */   \
-  "  movq     "M(xC4S4)", %%mm0     \n\t"                                     \
-  "  movq        %%mm1, %%mm2       \n\t"                                     \
-  "  movq        %%mm1, %%mm3       \n\t"                                     \
-                                                                              \
-  "  pmulhw      %%mm0, %%mm1       \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
-  "  psrlw       $15, %%mm2         \n\t"				      \
-                                                                              \
-  "  paddw       %%mm3, %%mm1       \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) */ \
-  "  paddw       %%mm2, %%mm1       \n\t" /* Truncate mm1, now it is icommon_product1 */ \
-                                                                              \
-  "  movq        %%mm7, %%mm2       \n\t"                                     \
-  "  movq        %%mm7, %%mm3       \n\t"			              \
-                                                                              \
-  "  pmulhw      %%mm0, %%mm7       \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
-  "  psrlw       $15, %%mm2         \n\t"			              \
-                                                                              \
-  "  paddw       %%mm3, %%mm7       \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
-  "  paddw       %%mm2, %%mm7       \n\t" /* Truncate mm7, now it is icommon_product2 */ \
-  /* ------------------------------------------------------------------- */   \
-  "  pxor        %%mm0, %%mm0       \n\t" /* Clear mm0 */                     \
-  "  psubsw      %%mm6, %%mm0       \n\t" /* mm0 = - id34 */                  \
-                                                                              \
-  "  psubsw      %%mm7, %%mm0       \n\t" /* mm0 = - ( id34 + icommon_product2 ) */ \
-  "  paddsw      %%mm6, %%mm6       \n\t" /* mm6 = 2 * id34 */                \
-  "  paddsw      %%mm0, %%mm6       \n\t" /* mm6 = id34 - icommon_product2 */ \
-                                                                              \
-  "  psubsw      %%mm1, %%mm4       \n\t" /* mm4 = id07 - icommon_product1 */ \
-  "  paddsw      %%mm1, %%mm1       \n\t" /* mm1 = 2 * icommon_product1 */    \
-  "  paddsw      %%mm4, %%mm1       \n\t" /* mm1 = id07 + icommon_product1 */ \
-  /* ------------------------------------------------------------------- */   \
-  "  movq     "M(xC1S7)", %%mm7     \n\t"                                     \
-  "  movq        %%mm1, %%mm2       \n\t"                                     \
-                                                                              \
-  "  movq        %%mm1, %%mm3       \n\t"                                     \
-  "  pmulhw      %%mm7, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \
-                                                                              \
-  "  movq     "M(xC7S1)", %%mm7     \n\t"                                     \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm3, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x */    \
-  "  paddw       %%mm2, %%mm1       \n\t" /* Truncated */                     \
-                                                                              \
-  "  pmulhw      %%mm7, %%mm3       \n\t" /* mm3 = xC7S1 * irot_input_x */    \
-  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
-                                                                              \
-  "  movq        %%mm0, %%mm5       \n\t" /* mm0 = -( id34 + icommon_product2 ) is irot_input_y here */ \
-  "  movq        %%mm0, %%mm2       \n\t"                                     \
-                                                                              \
-  "  movq     "M(xC1S7)", %%mm7     \n\t"                                     \
-  "  pmulhw      %%mm7, %%mm0       \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \
-                                                                              \
-  "  movq     "M(xC7S1)", %%mm7     \n\t"                                     \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm5, %%mm0       \n\t" /* mm0 = xC1S7 * irot_input_y */    \
-  "  paddw       %%mm2, %%mm0       \n\t" /* Truncated */                     \
-                                                                              \
-  "  pmulhw      %%mm7, %%mm5       \n\t" /* mm5 = xC7S1 * irot_input_y */    \
-  "  paddw       %%mm2, %%mm5       \n\t" /* Truncated */                     \
-                                                                              \
-  "  psubsw      %%mm5, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \
-  "  paddsw      %%mm0, %%mm3       \n\t" /* mm3 = xC7S1 * irot_input_x + xC1S7 * irot_input_y = ip7 (irot_input_y already negated) */ \
-                                                                              \
-  "  movq        %%mm1," #ip1 "     \n\t"                                     \
-  "  movq        %%mm3," #ip7 "     \n\t"                                     \
-  /* ------------------------------------------------------------------- */   \
-  "  movq     "M(xC3S5)", %%mm0     \n\t"                                     \
-  "  movq     "M(xC5S3)", %%mm1     \n\t"                                     \
-                                                                              \
-  "  movq        %%mm6, %%mm5       \n\t" /* mm6 (id34 - icommon_product2) is irot_input_y here */ \
-  "  movq        %%mm6, %%mm7       \n\t"                                     \
-                                                                              \
-  "  movq        %%mm4, %%mm2       \n\t" /* mm4 (id07 - icommon_product1) is irot_input_x here */ \
-  "  movq        %%mm4, %%mm3       \n\t"                                     \
-                                                                              \
-  "  pmulhw      %%mm0, %%mm4       \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \
-  "  pmulhw      %%mm1, %%mm6       \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \
-                                                                              \
-  "  psrlw       $15, %%mm2         \n\t" /* mm2 = sign bit of irot_input_x */ \
-  "  psrlw       $15, %%mm5         \n\t" /* mm5 = sign bit of irot_input_y */ \
-                                                                              \
-  "  paddw       %%mm3, %%mm4       \n\t" /* mm4 = xC3S5 * irot_input_x */    \
-  "  paddw       %%mm7, %%mm6       \n\t" /* mm6 = xC5S3 * irot_input_y */    \
-                                                                              \
-  "  paddw       %%mm2, %%mm4       \n\t" /* Truncated */                     \
-  "  paddw       %%mm5, %%mm6       \n\t" /* Truncated */                     \
-                                                                              \
-  "  psubsw      %%mm6, %%mm4       \n\t" /* ip3 */                           \
-  "  movq        %%mm4," #ip3 "     \n\t"                                     \
-                                                                              \
-  "  movq        %%mm3, %%mm4       \n\t"                                     \
-  "  movq        %%mm7, %%mm6       \n\t"                                     \
-                                                                              \
-  "  pmulhw      %%mm1, %%mm3       \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \
-  "  pmulhw      %%mm0, %%mm7       \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \
-                                                                              \
-  "  paddw       %%mm2, %%mm4       \n\t" /* mm4 = irot_input_x + its sign bit */ \
-  "  paddw       %%mm5, %%mm6       \n\t" /* mm6 = irot_input_y + its sign bit */ \
-                                                                              \
-  "  paddw       %%mm4, %%mm3       \n\t" /* mm3 = xC5S3 * irot_input_x */    \
-  "  paddw       %%mm6, %%mm7       \n\t" /* mm7 = xC3S5 * irot_input_y */    \
-                                                                              \
-  "  paddw       %%mm7, %%mm3       \n\t" /* ip5 */                           \
-  "  movq        %%mm3," #ip5 "     \n\t" 
-
-#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,                  \
-		      op0,op1,op2,op3,op4,op5,op6,op7) /* two 4x4 word transposes: ip0-3 -> op0-3 and ip4-7 -> op4-7 */ \
-  "  movq      " #ip0 ", %%mm0      \n\t" /* mm0 = a0 a1 a2 a3 */       \
-  "  movq      " #ip4 ", %%mm4      \n\t" /* mm4 = e0 e1 e2 e3 */       \
-  "  movq      " #ip1 ", %%mm1      \n\t" /* mm1 = b0 b1 b2 b3 */       \
-  "  movq      " #ip5 ", %%mm5      \n\t" /* mm5 = f0 f1 f2 f3 */       \
-  "  movq      " #ip2 ", %%mm2      \n\t" /* mm2 = c0 c1 c2 c3 */       \
-  "  movq      " #ip6 ", %%mm6      \n\t" /* mm6 = g0 g1 g2 g3 */       \
-  "  movq      " #ip3 ", %%mm3      \n\t" /* mm3 = d0 d1 d2 d3 */       \
-  "  movq        %%mm1," #op1 "     \n\t" /* save  b0 b1 b2 b3 (op1 as scratch, written before ip7 is read) */ \
-  "  movq      " #ip7 ", %%mm7      \n\t" /* mm7 = h0 h1 h2 h3 */       \
-   /* Rows e-h (ip4..ip7) are transposed first; op0/op1 hold rows a/b meanwhile. Lanes below are listed high to low. */ \
-  "  movq        %%mm4, %%mm1       \n\t" /* mm1 = e3 e2 e1 e0 */       \
-  "  punpcklwd   %%mm5, %%mm4       \n\t" /* mm4 = f1 e1 f0 e0 */       \
-  "  movq        %%mm0," #op0 "     \n\t" /* save a3 a2 a1 a0  */       \
-  "  punpckhwd	 %%mm5, %%mm1       \n\t" /* mm1 = f3 e3 f2 e2 */       \
-  "  movq        %%mm6, %%mm0       \n\t" /* mm0 = g3 g2 g1 g0 */       \
-  "  punpcklwd	 %%mm7, %%mm6       \n\t" /* mm6 = h1 g1 h0 g0 */       \
-  "  movq        %%mm4, %%mm5       \n\t" /* mm5 = f1 e1 f0 e0 */       \
-  "  punpckldq   %%mm6, %%mm4       \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \
-  "  punpckhdq   %%mm6, %%mm5       \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \
-  "  movq        %%mm1, %%mm6       \n\t" /* mm6 = f3 e3 f2 e2 */       \
-  "  movq        %%mm4," #op4 "     \n\t"                               \
-  "  punpckhwd   %%mm7, %%mm0       \n\t" /* mm0 = h3 g3 h2 g2 */       \
-  "  movq        %%mm5," #op5 "     \n\t"                               \
-  "  punpckhdq   %%mm0, %%mm6       \n\t" /* mm6 = h3 g3 f3 e3 = MM7 */ \
-  "  movq      " #op0 ", %%mm4      \n\t" /* mm4 = a3 a2 a1 a0 */       \
-  "  punpckldq   %%mm0, %%mm1       \n\t" /* mm1 = h2 g2 f2 e2 = MM6 */ \
-  "  movq      " #op1 ", %%mm5      \n\t" /* mm5 = b3 b2 b1 b0 */       \
-  "  movq        %%mm4, %%mm0       \n\t" /* mm0 = a3 a2 a1 a0 */       \
-  "  movq        %%mm6," #op7 "     \n\t"                               \
-  "  punpcklwd   %%mm5, %%mm0       \n\t" /* mm0 = b1 a1 b0 a0 */       \
-  "  movq        %%mm1," #op6 "     \n\t"                               \
-  "  punpckhwd   %%mm5, %%mm4       \n\t" /* mm4 = b3 a3 b2 a2 */       \
-  "  movq        %%mm2, %%mm5       \n\t" /* mm5 = c3 c2 c1 c0 */       \
-  "  punpcklwd   %%mm3, %%mm2       \n\t" /* mm2 = d1 c1 d0 c0 */       \
-  "  movq        %%mm0, %%mm1       \n\t" /* mm1 = b1 a1 b0 a0 */       \
-  "  punpckldq   %%mm2, %%mm0       \n\t" /* mm0 = d0 c0 b0 a0 = MM0 */ \
-  "  punpckhdq   %%mm2, %%mm1       \n\t" /* mm1 = d1 c1 b1 a1 = MM1 */ \
-  "  movq        %%mm4, %%mm2       \n\t" /* mm2 = b3 a3 b2 a2 */       \
-  "  movq        %%mm0," #op0 "     \n\t"                               \
-  "  punpckhwd   %%mm3, %%mm5       \n\t" /* mm5 = d3 c3 d2 c2 */       \
-  "  movq        %%mm1," #op1 "     \n\t"                               \
-  "  punpckhdq   %%mm5, %%mm4       \n\t" /* mm4 = d3 c3 b3 a3 = MM3 */ \
-  "  punpckldq   %%mm5, %%mm2       \n\t" /* mm2 = d2 c2 b2 a2 = MM2 */ \
-  "  movq        %%mm4," #op3 "     \n\t"                               \
-  "  movq        %%mm2," #op2 "     \n\t"
-
-
-static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData)
-{
-  ogg_int64_t __attribute__((aligned(8))) align_tmp[16];	/* 8-byte-aligned scratch; the asm only touches its first quadword, via (%2) */
-  ogg_int16_t *const temp= (ogg_int16_t*)align_tmp;
-  /* Forward DCT of the 8x8 block of 16-bit values at InputData (rows 16 bytes apart) into OutputData; InputData is only read. */
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-    /*
-     * %0 = InputData, %1 = OutputData, %2 = temp.  Each Transpose_mmx/Fdct_mmx pair
-     * handles four 16-bit lanes: rows 0-3 and then rows 4-7 of the input are written
-     * to the output; after that byte offsets 0 and 8 of every output row, in place.
-     */
-    Transpose_mmx (  (%0), 16(%0), 32(%0), 48(%0),  8(%0), 24(%0), 40(%0), 56(%0),
-		     (%1), 16(%1), 32(%1), 48(%1),  8(%1), 24(%1), 40(%1), 56(%1))
-    Fdct_mmx      (  (%1), 16(%1), 32(%1), 48(%1),  8(%1), 24(%1), 40(%1), 56(%1), (%2))
-    /* rows 4-7 of the input */
-    Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
-		   64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
-    Fdct_mmx      (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
-    /* second pass, in place on the output: byte offset 0 of each of the 8 rows */
-    Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
-		    0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
-    Fdct_mmx      ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
-    /* second pass, in place on the output: byte offset 8 of each of the 8 rows */
-    Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
-		    8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
-    Fdct_mmx      ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
-
-    "  emms                         \n\t"	/* leave MMX state so x87 code can run afterwards */
-    
-    : "+r" (InputData),	/* %0 */
-      "+r" (OutputData)	/* %1 */
-    : "r" (temp)	/* %2; written through, covered by the "memory" clobber */
-    : "memory"
-  );
-}
-
-void dsp_mmx_fdct_init(DspFunctions *funcs)
-{ /* Point the fdct_short slot at the MMX kernel and report that on stderr. */
-  funcs->fdct_short = fdct_short__mmx;
-  fprintf(stderr, "enabling accelerated x86_32 mmx fdct function.\n");
-}

Copied: trunk/theora/lib/x86_32/fdct_mmx.c (from rev 11426, branches/theora-mmx/lib/x86_32/fdct_mmx.c)

Deleted: trunk/theora/lib/x86_32/recon_mmx.c
===================================================================
--- branches/theora-mmx/lib/x86_32/recon_mmx.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/x86_32/recon_mmx.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -1,187 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: reconstruct.c,v 1.6 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include <stdio.h>
-#include "codec_internal.h"
-/*
- * 0x80 in every byte lane.  recon_intra8x8__mmx loads it by symbol name from
- * its asm text and XORs packed signed bytes with it (same as adding 128).
- */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x8080808080808080LL;
-/*
- * M(a) turns a C identifier into the string used for it inside asm text:
- * "_a" on the targets listed below, plain "a" everywhere else.
- */
-#if defined(__MINGW32__) || defined(__CYGWIN__) || \
-	    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
-# define M(a) "_" #a
-#else
-# define M(a) #a
-#endif
-
-/*
- * Copy an 8x8 block of bytes from src to dest.
- *
- * Both buffers use the same row pitch 'stride'.  Rows are moved four at a
- * time through mm0..mm3; edi holds 3*stride for the fourth row of each group.
- * No emms is issued here.
- *
- * src is a read-write operand ("+c") because the asm advances it by four
- * rows; GCC requires input-only operands to be left unmodified.  Operand
- * numbering (%0 dest, %1 src, %2 stride) is unchanged.
- */
-static void copy8x8__mmx (unsigned char *src,
-	                unsigned char *dest,
-	                unsigned int stride)
-{
-  __asm__ __volatile__ (
-    "  .balign 16                      \n\t"
-
-    "  lea         (%2, %2, 2), %%edi  \n\t" /* edi = 3 * stride */
-
-    "  movq        (%1), %%mm0         \n\t" /* load rows 0..3 */
-    "  movq        (%1, %2), %%mm1     \n\t"
-    "  movq        (%1, %2, 2), %%mm2  \n\t"
-    "  movq        (%1, %%edi), %%mm3  \n\t"
-
-    "  lea         (%1, %2, 4), %1     \n\t" /* src += 4 * stride */
-
-    "  movq        %%mm0, (%0)         \n\t" /* store rows 0..3 */
-    "  movq        %%mm1, (%0, %2)     \n\t"
-    "  movq        %%mm2, (%0, %2, 2)  \n\t"
-    "  movq        %%mm3, (%0, %%edi)  \n\t"
-
-    "  lea         (%0, %2, 4), %0     \n\t" /* dest += 4 * stride */
-
-    "  movq        (%1), %%mm0         \n\t" /* load rows 4..7 */
-    "  movq        (%1, %2), %%mm1     \n\t"
-    "  movq        (%1, %2, 2), %%mm2  \n\t"
-    "  movq        (%1, %%edi), %%mm3  \n\t"
-
-    "  movq        %%mm0, (%0)         \n\t" /* store rows 4..7 */
-    "  movq        %%mm1, (%0, %2)     \n\t"
-    "  movq        %%mm2, (%0, %2, 2)  \n\t"
-    "  movq        %%mm3, (%0, %%edi)  \n\t"
-      : "+a" (dest),
-        "+c" (src)
-      : "d" (stride)
-      : "memory", "edi"
-  );
-}
-
-/*
- * Intra reconstruction of one 8x8 block from 16-bit residuals.
- *
- * For each of the eight rows, the eight 16-bit values at ChangePtr are packed
- * to signed bytes with saturation, XORed with 0x80 per byte (equivalent to
- * adding 128) and stored at ReconPtr, which then advances by LineStep.
- * No emms is issued here.
- *
- * ChangePtr is a read-write operand ("+r") because the loop steps it through
- * the 128-byte block; GCC requires input-only operands to be left unmodified.
- * Operand numbering (%0 ReconPtr, %1 ChangePtr, %2 LineStep) is unchanged.
- */
-static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
-		      ogg_uint32_t LineStep)
-{
-  __asm__ __volatile__ (
-    "  .balign 16                      \n\t"
-
-    "  movq     "M(V128)", %%mm0       \n\t" /* Set mm0 to 0x8080808080808080 */
-
-    "  lea         128(%1), %%edi      \n\t" /* Endpoint in input buffer */
-    "1:                                \n\t" 
-    "  movq         (%1), %%mm2        \n\t" /* First four input values */
-
-    "  packsswb    8(%1), %%mm2        \n\t" /* pack with next(high) four values */
-    "  por         %%mm0, %%mm0        \n\t" /* mm0 |= mm0: leaves mm0 unchanged */
-    "  pxor        %%mm0, %%mm2        \n\t" /* Convert result to unsigned (same as add 128) */
-    "  lea         16(%1), %1          \n\t" /* Step source buffer */
-    "  cmp         %%edi, %1           \n\t" /* are we done */
-
-    "  movq        %%mm2, (%0)         \n\t" /* store results (flags from cmp untouched) */
-
-    "  lea         (%0, %2), %0        \n\t" /* Step output buffer */
-    "  jc          1b                  \n\t" /* Loop back if we are not done */
-      : "+r" (ReconPtr),
-        "+r" (ChangePtr)
-      : "r" (LineStep)
-      : "memory", "edi"
-  );
-}
-
-/*
- * Inter reconstruction of one 8x8 block: reference pixels plus residuals.
- *
- * For each of the eight rows, eight reference bytes at RefPtr are widened to
- * 16 bits, the eight 16-bit values at ChangePtr are added with signed
- * saturation, and the sums are packed back to unsigned bytes with saturation
- * and stored at ReconPtr.  RefPtr and ReconPtr both advance by LineStep.
- * No emms is issued here.
- *
- * ChangePtr and RefPtr are read-write operands ("+r") because the loop
- * advances them; GCC requires input-only operands to be left unmodified.
- * Operand numbering (%0 ReconPtr, %1 ChangePtr, %2 RefPtr, %3 LineStep) is
- * unchanged.
- */
-static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
-		      ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
-{
-  __asm__ __volatile__ (
-    "  .balign 16                      \n\t"
-
-    "  pxor        %%mm0, %%mm0        \n\t" /* mm0 = 0, used for byte->word unpack */
-    "  lea         128(%1), %%edi      \n\t" /* edi = end of the residual block */
-
-    "1:                                \n\t"
-    "  movq        (%2), %%mm2         \n\t" /* (+3 misaligned) 8 reference pixels */
-
-    "  movq        (%1), %%mm4         \n\t" /* first 4 changes */
-    "  movq        %%mm2, %%mm3        \n\t"
-    "  movq        8(%1), %%mm5        \n\t" /* last 4 changes */
-    "  punpcklbw   %%mm0, %%mm2        \n\t" /* turn first 4 refs into positive 16-bit #s */
-    "  paddsw      %%mm4, %%mm2        \n\t" /* add in first 4 changes */
-    "  punpckhbw   %%mm0, %%mm3        \n\t" /* turn last 4 refs into positive 16-bit #s */
-    "  paddsw      %%mm5, %%mm3        \n\t" /* add in last 4 changes */
-    "  add         %3, %2              \n\t" /* next row of reference pixels */
-    "  packuswb    %%mm3, %%mm2        \n\t" /* pack result to unsigned 8-bit values */
-    "  lea         16(%1), %1          \n\t" /* next row of changes */
-    "  cmp         %%edi, %1            \n\t" /* are we done? */
-
-    "  movq        %%mm2, (%0)         \n\t" /* store result */
-
-    "  lea         (%0, %3), %0        \n\t" /* next row of output */
-    "  jc          1b                  \n\t"
-      : "+r" (ReconPtr),
-        "+r" (ChangePtr),
-        "+r" (RefPtr)
-      : "r" (LineStep)
-      : "memory", "edi"
-  );
-}
-
-/*
- * Inter reconstruction from the average of two references plus residuals.
- *
- * For each of the eight rows, eight bytes from RefPtr1 and RefPtr2 are
- * widened to 16 bits, summed and shifted right by one (truncating average),
- * the eight 16-bit values at ChangePtr are added, and the result is packed
- * to unsigned bytes with saturation and stored at ReconPtr.  ReconPtr and
- * both reference pointers advance by LineStep.  No emms is issued here.
- *
- * ChangePtr, RefPtr1 and RefPtr2 are read-write operands ("+r") because the
- * loop advances them; GCC requires input-only operands to be left unmodified.
- * Operand numbering (%0 ReconPtr, %1 ChangePtr, %2 RefPtr1, %3 RefPtr2,
- * %4 LineStep) is unchanged.
- */
-static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
-		           unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
-			   ogg_uint32_t LineStep)
-{
-  __asm__ __volatile__ (
-    "  .balign 16                      \n\t"
-
-    "  pxor        %%mm0, %%mm0        \n\t" /* mm0 = 0, used for byte->word unpack */
-    "  lea         128(%1), %%edi      \n\t" /* edi = end of the residual block */
-
-    "1:                                \n\t"
-    "  movq        (%2), %%mm2         \n\t" /* (+3 misaligned) 8 reference pixels */
-    "  movq        (%3), %%mm4         \n\t" /* (+3 misaligned) 8 reference pixels */
-
-    "  movq        %%mm2, %%mm3        \n\t"
-    "  punpcklbw   %%mm0, %%mm2        \n\t" /* mm2 = start ref1 as positive 16-bit #s */
-    "  movq        %%mm4, %%mm5        \n\t"
-    "  movq        (%1), %%mm6         \n\t" /* first 4 changes */
-    "  punpckhbw   %%mm0, %%mm3        \n\t" /* mm3 = end ref1 as positive 16-bit #s */
-    "  movq        8(%1), %%mm7        \n\t" /* last 4 changes */
-    "  punpcklbw   %%mm0, %%mm4        \n\t" /* mm4 = start ref2 as positive 16-bit #s */
-    "  punpckhbw   %%mm0, %%mm5        \n\t" /* mm5 = end ref2 as positive 16-bit #s */
-    "  paddw       %%mm4, %%mm2        \n\t" /* mm2 = start (ref1 + ref2) */
-    "  paddw       %%mm5, %%mm3        \n\t" /* mm3 = end (ref1 + ref2) */
-    "  psrlw       $1, %%mm2           \n\t" /* mm2 = start (ref1 + ref2)/2 */
-    "  psrlw       $1, %%mm3           \n\t" /* mm3 = end (ref1 + ref2)/2 */
-    "  paddw       %%mm6, %%mm2        \n\t" /* add changes to start */
-    "  paddw       %%mm7, %%mm3        \n\t" /* add changes to end */
-    "  lea         16(%1), %1          \n\t" /* next row of changes */
-    "  packuswb    %%mm3, %%mm2        \n\t" /* pack start|end to unsigned 8-bit */
-    "  add         %4, %2              \n\t" /* next row of reference pixels */
-    "  add         %4, %3              \n\t" /* next row of reference pixels */
-    "  movq        %%mm2, (%0)         \n\t" /* store result */
-    "  add         %4, %0              \n\t" /* next row of output */
-    "  cmp         %%edi, %1           \n\t" /* are we done? */
-    "  jc          1b                  \n\t"
-      : "+r" (ReconPtr),
-        "+r" (ChangePtr),
-        "+r" (RefPtr1),
-        "+r" (RefPtr2)
-      : "m" (LineStep)
-      : "memory", "edi"
-  );
-}
-
-/* Announce on stderr, then install the MMX copy/reconstruction routines. */
-void dsp_mmx_recon_init(DspFunctions *funcs)
-{
-  fputs("enabling accelerated x86_32 mmx recon functions.\n", stderr);
-
-  /* reconstruction kernels */
-  funcs->recon_inter8x8_half = recon_inter8x8_half__mmx;
-  funcs->recon_inter8x8 = recon_inter8x8__mmx;
-  funcs->recon_intra8x8 = recon_intra8x8__mmx;
-
-  /* plain block copy */
-  funcs->copy8x8 = copy8x8__mmx;
-}
-

Copied: trunk/theora/lib/x86_32/recon_mmx.c (from rev 11426, branches/theora-mmx/lib/x86_32/recon_mmx.c)

Copied: trunk/theora/lib/x86_64 (from rev 11426, branches/theora-mmx/lib/x86_64)

Deleted: trunk/theora/lib/x86_64/dsp_mmx.c
===================================================================
--- branches/theora-mmx/lib/x86_64/dsp_mmx.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/x86_64/dsp_mmx.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -1,298 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "dsp.h"
-/*
- * The value 128 in each of four 16-bit lanes; sub8x8_128__mmx subtracts it
- * from pixels after they have been widened to 16 bits.
- */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x0080008000800080LL;
-/* Scalar helpers; not referenced by the MMX routines that follow in this file. */
-#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
-#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
-#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
-/*
- * Compute the 8x8 difference block FiltPtr - ReconPtr as 16-bit values.
- *
- * For each of eight rows (unrolled with .rept), eight bytes from each source
- * are zero-extended to 16 bits and subtracted; the eight results (16 bytes)
- * are written to DctInputPtr.  FiltPtr advances by PixelsPerLine and ReconPtr
- * by ReconPixelsPerLine; both strides are widened to 64 bits for the pointer
- * adds.  No emms is issued here.
- */
-static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
-                  ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
-                  ogg_uint32_t ReconPixelsPerLine) 
-{
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm7, %%mm7     \n\t" /* mm7 = 0 for the unpacks */
-
-    ".rept 8                        \n\t"
-    "  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */
-    "  movq        (%1), %%mm1      \n\t" /* mm1 = ReconPtr */
-    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
-    "  movq        %%mm1, %%mm3     \n\t" /* dup to prepare for up conversion */
-    /* convert from UINT8 to INT16 */
-    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr) */
-    "  punpcklbw   %%mm7, %%mm1     \n\t" /* mm1 = INT16(ReconPtr) */
-    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr) */
-    "  punpckhbw   %%mm7, %%mm3     \n\t" /* mm3 = INT16(ReconPtr) */
-    /* start calculation */
-    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - ReconPtr */
-    "  psubw       %%mm3, %%mm2     \n\t" /* mm2 = FiltPtr - ReconPtr */
-    "  movq        %%mm0,  (%2)     \n\t" /* write answer out */
-    "  movq        %%mm2, 8(%2)     \n\t" /* write answer out */
-    /* Increment pointers */
-    "  add         $16, %2          \n\t"
-    "  add         %3, %0           \n\t"
-    "  add         %4, %1           \n\t"
-    ".endr                          \n\t"
-
-     : "+r" (FiltPtr),
-       "+r" (ReconPtr),
-       "+r" (DctInputPtr)
-     : "r" ((ogg_uint64_t)PixelsPerLine),
-       "r" ((ogg_uint64_t)ReconPixelsPerLine) 
-     : "memory"
-  );
-}
-/*
- * Compute the 8x8 block FiltPtr - 128 as 16-bit values.
- *
- * For each of eight rows (unrolled with .rept), eight bytes are zero-extended
- * to 16 bits, the constant V128 (128 per lane) is subtracted, and the eight
- * results (16 bytes) are written to DctInputPtr.  FiltPtr advances by
- * PixelsPerLine, held in the 64-bit local 'ppl'.  No emms is issued here.
- */
-static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
-                      ogg_uint32_t PixelsPerLine) 
-{
-  ogg_uint64_t ppl = PixelsPerLine;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm7, %%mm7     \n\t" /* mm7 = 0 for the unpacks */
-    "  movq        %[V128], %%mm1   \n\t" /* mm1 = 128 in each 16-bit lane */
-
-    ".rept 8                        \n\t"
-    "  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */
-    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
-    /* convert from UINT8 to INT16 */
-    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr) */
-    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr) */
-    /* start calculation */
-    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - 128 */
-    "  psubw       %%mm1, %%mm2     \n\t" /* mm2 = FiltPtr - 128 */
-    "  movq        %%mm0,  (%1)     \n\t" /* write answer out */
-    "  movq        %%mm2, 8(%1)     \n\t" /* write answer out */
-    /* Increment pointers */
-    "  add         $16, %1           \n\t"
-    "  add         %2, %0           \n\t"
-    ".endr                          \n\t"
-
-     : "+r" (FiltPtr),
-       "+r" (DctInputPtr)
-     : "r" (ppl), /* gcc bug? a cast won't work here, e.g. (ogg_uint64_t)PixelsPerLine */
-       [V128] "m" (V128)
-     : "memory"
-  );
-}
-/*
- * Compute the 8x8 block FiltPtr - (ReconPtr1 + ReconPtr2) / 2 as 16-bit values.
- *
- * For each of eight rows (unrolled with .rept), eight bytes from each source
- * are zero-extended to 16 bits; the two reference rows are summed and shifted
- * right by one (truncating average) and subtracted from the FiltPtr row.  The
- * eight results (16 bytes) are written to DctInputPtr.  FiltPtr advances by
- * PixelsPerLine; both reference pointers advance by ReconPixelsPerLine.
- * No emms is issued here.
- */
-static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
-                     unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
-                     ogg_uint32_t PixelsPerLine,
-                     ogg_uint32_t ReconPixelsPerLine) 
-{
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm7, %%mm7     \n\t" /* mm7 = 0 for the unpacks */
-
-    ".rept 8                        \n\t"
-    "  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */
-    "  movq        (%1), %%mm1      \n\t" /* mm1 = ReconPtr1 */
-    "  movq        (%2), %%mm4      \n\t" /* mm4 = ReconPtr2 */
-    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
-    "  movq        %%mm1, %%mm3     \n\t" /* dup to prepare for up conversion */
-    "  movq        %%mm4, %%mm5     \n\t" /* dup to prepare for up conversion */
-    /* convert from UINT8 to INT16 */
-    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr) */
-    "  punpcklbw   %%mm7, %%mm1     \n\t" /* mm1 = INT16(ReconPtr1) */
-    "  punpcklbw   %%mm7, %%mm4     \n\t" /* mm4 = INT16(ReconPtr2) */
-    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr) */
-    "  punpckhbw   %%mm7, %%mm3     \n\t" /* mm3 = INT16(ReconPtr1) */
-    "  punpckhbw   %%mm7, %%mm5     \n\t" /* mm5 = INT16(ReconPtr2) */
-    /* average ReconPtr1 and ReconPtr2 */
-    "  paddw       %%mm4, %%mm1     \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */
-    "  paddw       %%mm5, %%mm3     \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */
-    "  psrlw       $1, %%mm1        \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
-    "  psrlw       $1, %%mm3        \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
-    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-    "  psubw       %%mm3, %%mm2     \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-    "  movq        %%mm0,  (%3)     \n\t" /* write answer out */
-    "  movq        %%mm2, 8(%3)     \n\t" /* write answer out */
-    /* Increment pointers */
-    "  add         $16, %3           \n\t"
-    "  add         %4, %0           \n\t"
-    "  add         %5, %1           \n\t"
-    "  add         %5, %2           \n\t"
-    ".endr                          \n\t"
-
-     : "+r" (FiltPtr),
-       "+r" (ReconPtr1),
-       "+r" (ReconPtr2),
-       "+r" (DctInputPtr)
-     : "r" ((ogg_uint64_t)PixelsPerLine),
-       "r" ((ogg_uint64_t)ReconPixelsPerLine) 
-     : "memory"
-  );
-}
-/*
- * Variance-style error metric of one 8x8 block of pixels.
- *
- * Over eight rows of eight bytes (rows are Stride bytes apart) the asm
- * accumulates the sum of the pixels (XSum, in 16-bit lanes of mm5) and the
- * sum of their squares (XXSum, in 32-bit lanes of mm7), reduces each to a
- * scalar, and the function returns 64*XXSum - XSum*XSum converted to
- * ogg_uint32_t.  No emms is issued here.
- */
-static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
-{
-  ogg_uint64_t  XSum;
-  ogg_uint64_t  XXSum;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm5, %%mm5     \n\t" /* mm5 = running sums, 16-bit lanes */
-    "  pxor        %%mm6, %%mm6     \n\t" /* mm6 = 0 for the unpacks */
-    "  pxor        %%mm7, %%mm7     \n\t" /* mm7 = running sums of squares, 32-bit lanes */
-    "  mov         $8, %%rdi        \n\t" /* row counter */
-    "1:                             \n\t"
-    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        %%mm0, %%mm2     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t" /* low 4 pixels as 16-bit */
-    "  punpckhbw   %%mm6, %%mm2     \n\t" /* high 4 pixels as 16-bit */
-
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  paddw       %%mm2, %%mm5     \n\t"
-
-    "  pmaddwd     %%mm0, %%mm0     \n\t" /* squares, pairwise added into 32-bit lanes */
-    "  pmaddwd     %%mm2, %%mm2     \n\t"
-    
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  paddd       %%mm2, %%mm7     \n\t"
-
-    "  add         %3, %2           \n\t"	/* Inc pointer into src data */
-
-    "  dec         %%rdi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  movq        %%mm5, %%mm0     \n\t" /* fold the four 16-bit sums into lane 0 */
-    "  psrlq       $32, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $16, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movd        %%mm5, %%rdi     \n\t"
-    "  movsx       %%di, %%rdi      \n\t" /* keep only lane 0, sign-extended */
-    "  mov         %%rdi, %0        \n\t"
-
-    "  movq        %%mm7, %%mm0     \n\t" /* fold the two 32-bit sums into the low lane */
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %1        \n\t"
-
-     : "=r" (XSum),
-       "=r" (XXSum),
-       "+r" (DataPtr) 
-     : "r" ((ogg_uint64_t)Stride)
-     : "rdi", "memory"
-  );
-
-  /* Compute population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ) );
-}
-
-/*
- * Variance-style error metric of the difference between two 8x8 blocks.
- *
- * Over eight rows, the per-pixel difference SrcData - RefDataPtr is formed in
- * 16-bit lanes; the asm accumulates the sum of the differences (XSum) and the
- * sum of their squares (XXSum).  The function returns 64*XXSum - XSum*XSum
- * converted to ogg_uint32_t.  No emms is issued here.
- *
- * XXSum is a 32-bit object because it is a memory operand written with a
- * 32-bit movd; a 64-bit object would keep indeterminate upper bytes.  The
- * low 32 bits of the returned expression are the same either way.
- */
-static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
-		                 unsigned char *RefDataPtr, ogg_uint32_t RefStride)
-{
-  ogg_uint64_t  XSum;
-  ogg_uint32_t  XXSum;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm5, %%mm5     \n\t" /* mm5 = running sums, 16-bit lanes */
-    "  pxor        %%mm6, %%mm6     \n\t" /* mm6 = 0 for the unpacks */
-    "  pxor        %%mm7, %%mm7     \n\t" /* mm7 = running sums of squares, 32-bit lanes */
-    "  mov         $8, %%rdi        \n\t" /* row counter */
-    "1:                             \n\t"
-    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%3), %%mm1      \n\t"
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  movq        %%mm1, %%mm3     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"
-    "  punpcklbw   %%mm6, %%mm1     \n\t"
-    "  punpckhbw   %%mm6, %%mm2     \n\t"
-    "  punpckhbw   %%mm6, %%mm3     \n\t"
-
-    "  psubsw      %%mm1, %%mm0     \n\t" /* src - ref, low 4 pixels */
-    "  psubsw      %%mm3, %%mm2     \n\t" /* src - ref, high 4 pixels */
-
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  paddw       %%mm2, %%mm5     \n\t"
-
-    "  pmaddwd     %%mm0, %%mm0     \n\t" /* squares, pairwise added into 32-bit lanes */
-    "  pmaddwd     %%mm2, %%mm2     \n\t"
-    
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  paddd       %%mm2, %%mm7     \n\t"
-
-    "  add         %4, %2           \n\t"	/* Inc pointer into src data */
-    "  add         %5, %3           \n\t"	/* Inc pointer into ref data */
-
-    "  dec         %%rdi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  movq        %%mm5, %%mm0     \n\t" /* fold the four 16-bit sums into lane 0 */
-    "  psrlq       $32, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $16, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movd        %%mm5, %%rdi     \n\t"
-    "  movsx       %%di, %%rdi      \n\t" /* keep only lane 0, sign-extended */
-    "  mov         %%rdi, %0        \n\t" /* 64-bit store into XSum */
-
-    "  movq        %%mm7, %%mm0     \n\t" /* fold the two 32-bit sums into the low lane */
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %1        \n\t" /* 32-bit store into XXSum */
-
-     : "=m" (XSum),
-       "=m" (XXSum),
-       "+r" (SrcData), 
-       "+r" (RefDataPtr) 
-     : "r" ((ogg_uint64_t)SrcStride),
-       "r" ((ogg_uint64_t)RefStride)
-     : "rdi", "memory"
-  );
-
-  /* Compute and return population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ));
-}
-
-/* Issue a single emms; the other routines in this file do not issue one. */
-static void restore_fpu (void)
-{
-  __asm__ __volatile__ ("  emms                         \n\t");
-}
-
-/* Announce on stderr, then install the x86_64 MMX routines into *funcs. */
-void dsp_mmx_init(DspFunctions *funcs)
-{
-  fputs("setting accelerated x86_64 mmx dsp functions.\n", stderr);
-
-  /* error metrics */
-  funcs->inter8x8_err = inter8x8_err__mmx;
-  funcs->intra8x8_err = intra8x8_err__mmx;
-
-  /* block differences */
-  funcs->sub8x8avg2 = sub8x8avg2__mmx;
-  funcs->sub8x8_128 = sub8x8_128__mmx;
-  funcs->sub8x8 = sub8x8__mmx;
-
-  /* MMX state cleanup */
-  funcs->restore_fpu = restore_fpu;
-}
-

Copied: trunk/theora/lib/x86_64/dsp_mmx.c (from rev 11426, branches/theora-mmx/lib/x86_64/dsp_mmx.c)

Deleted: trunk/theora/lib/x86_64/dsp_mmxext.c
===================================================================
--- branches/theora-mmx/lib/x86_64/dsp_mmxext.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/x86_64/dsp_mmxext.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -1,317 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "dsp.h"
-/*
- * Sum of absolute differences between two 8x8 blocks of bytes.
- *
- * psadbw yields the SAD of one 8-byte row; seven rows are handled in the
- * .rept body (which also advances ptr1 by stride1 and ptr2 by stride2) and
- * the eighth row afterwards without a pointer advance.  The low 32 bits of
- * the accumulator are returned.  No emms is issued here.
- */
-static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
-                                    unsigned char *ptr2, ogg_uint32_t stride2)
-{
-  ogg_uint32_t  DiffVal;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-    "  pxor %%mm7, %%mm7            \n\t" 	/* mm7 contains the result */
-
-    ".rept 7                        \n\t"
-    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
-    "  movq (%2), %%mm1             \n\t"
-    "  psadbw %%mm1, %%mm0          \n\t"
-    "  add %3, %1                   \n\t"	/* Inc pointer into the new data */
-    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
-    "  add %4, %2                   \n\t"	/* Inc pointer into ref data */
-    ".endr                          \n\t"
-
-    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
-    "  movq (%2), %%mm1             \n\t"
-    "  psadbw %%mm1, %%mm0          \n\t"
-    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
-    "  movd %%mm7, %0               \n\t"
-
-     : "=r" (DiffVal),
-       "+r" (ptr1), 
-       "+r" (ptr2) 
-     : "r" ((ogg_uint64_t)stride1),
-       "r" ((ogg_uint64_t)stride2)
-     : "memory"
-  );
-
-  return DiffVal;
-}
-/*
- * Sum of absolute differences between two 8x8 blocks of bytes.
- *
- * All eight rows are always processed: the 'thres' parameter is accepted but
- * not referenced by the asm, so there is no early exit.  ptr1 advances by
- * stride1 and ptr2 by stride2 after every row.  No emms is issued here.
- */
-static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
-                                          unsigned char *ptr2, ogg_uint32_t stride2, 
-			   	  ogg_uint32_t thres)
-{
-  ogg_uint32_t  DiffVal;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-    "  pxor %%mm7, %%mm7            \n\t" 	/* mm7 contains the result */
-
-    ".rept 8                        \n\t"
-    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
-    "  movq (%2), %%mm1             \n\t"
-    "  psadbw %%mm1, %%mm0          \n\t"
-    "  add %3, %1                   \n\t"	/* Inc pointer into the new data */
-    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
-    "  add %4, %2                   \n\t"	/* Inc pointer into ref data */
-    ".endr                          \n\t"
-
-    "  movd %%mm7, %0               \n\t"
-
-     : "=r" (DiffVal),
-       "+r" (ptr1), 
-       "+r" (ptr2) 
-     : "r" ((ogg_uint64_t)stride1),
-       "r" ((ogg_uint64_t)stride2)
-     : "memory"
-  );
-
-  return DiffVal;
-}
-/*
- * Sum of absolute differences between an 8x8 block and the byte-wise
- * average (pavgb) of two reference blocks.
- *
- * All eight rows are always processed: the 'thres' parameter is accepted but
- * not referenced by the asm.  SrcData advances by SrcStride; both reference
- * pointers advance by RefStride.  The result is stored to DiffVal through a
- * memory operand.  No emms is issued here.
- */
-static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
-                                              unsigned char *RefDataPtr1,
-                                              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
-                                              ogg_uint32_t thres)
-{
-  ogg_uint32_t  DiffVal;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-    "  pxor %%mm7, %%mm7            \n\t" 	/* mm7 contains the result */
-    ".rept 8                        \n\t"
-    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
-    "  movq (%2), %%mm1             \n\t"
-    "  movq (%3), %%mm2             \n\t"
-    "  pavgb %%mm2, %%mm1           \n\t" /* mm1 = average of the two reference rows */
-    "  psadbw %%mm1, %%mm0          \n\t"
-
-    "  add %4, %1                   \n\t"	/* Inc pointer into the new data */
-    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
-    "  add %5, %2                   \n\t"	/* Inc pointer into ref data */
-    "  add %5, %3                   \n\t"	/* Inc pointer into ref data */
-    ".endr                          \n\t"
-
-    "  movd %%mm7, %0               \n\t"
-     : "=m" (DiffVal),
-       "+r" (SrcData), 
-       "+r" (RefDataPtr1), 
-       "+r" (RefDataPtr2) 
-     : "r" ((ogg_uint64_t)SrcStride),
-       "r" ((ogg_uint64_t)RefStride)
-     : "memory"
-  );
-
-  return DiffVal;
-}
-/*
- * Larger of two 4-byte SADs taken from one 8-byte row.
- *
- * Bytes 0..3 and bytes 4..7 of Src1/Src2 are loaded separately with movd,
- * each pair is reduced with psadbw, and the maximum of the two sums (pmaxsw)
- * is stored to MaxSad and masked to 16 bits.  No emms is issued here.
- */
-static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
-{
-  ogg_uint32_t MaxSad;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  movd        (%1), %%mm0      \n\t" /* bytes 0..3 of each source */
-    "  movd        (%2), %%mm1      \n\t"
-    "  psadbw      %%mm0, %%mm1     \n\t" /* mm1 = SAD of bytes 0..3 */
-    "  movd        4(%1), %%mm2     \n\t" /* bytes 4..7 of each source */
-    "  movd        4(%2), %%mm3     \n\t"
-    "  psadbw      %%mm2, %%mm3     \n\t" /* mm3 = SAD of bytes 4..7 */
-
-    "  pmaxsw      %%mm1, %%mm3     \n\t" /* keep the larger of the two */
-    "  movd        %%mm3, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t"
-
-     : "=m" (MaxSad),
-       "+r" (Src1), 
-       "+r" (Src2) 
-     :
-     : "memory"
-  );
-
-  return MaxSad;
-}
-/*
- * Largest per-column SAD over the two 4-row halves of an 8x8 block.
- *
- * The first loop accumulates, for rows 0..3, the absolute difference of each
- * of the eight columns into 16-bit lanes (mm4 = columns 0..3, mm5 = columns
- * 4..7); the second loop does the same for rows 4..7 into mm6/mm7.  The
- * maximum over all sixteen lane sums is then taken with pmaxsw and returned
- * masked to 16 bits.  Src1 and Src2 both advance by 'stride' per row.
- * No emms is issued here.
- */
-static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
-		                    ogg_uint32_t stride)
-{
-  ogg_uint32_t MaxSad;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm3, %%mm3     \n\t"	/* zero out mm3 for unpack */
-    "  pxor        %%mm4, %%mm4     \n\t"	/* mm4 low sum */
-    "  pxor        %%mm5, %%mm5     \n\t" 	/* mm5 high sum */
-    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 low sum */
-    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 high sum */
-    "  mov         $4, %%rdi        \n\t"	/* 4 rows */
-    "1:                             \n\t"
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
-    "  paddw       %%mm0, %%mm4     \n\t"	/* accumulate difference... */
-    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
-    "  paddw       %%mm1, %%mm5     \n\t"	/* accumulate difference... */
-    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
-    "  add         %3, %2           \n\t"	/* Inc pointer into the new data */
-
-    "  dec         %%rdi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  mov         $4, %%rdi        \n\t"	/* 4 rows */
-    "2:                             \n\t"
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
-    "  paddw       %%mm0, %%mm6     \n\t"	/* accumulate difference... */
-    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
-    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
-    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
-    "  add         %3, %2           \n\t"	/* Inc pointer into the new data */
-
-    "  dec         %%rdi            \n\t"
-    "  jnz 2b                       \n\t"
-
-    "  pmaxsw      %%mm6, %%mm7     \n\t" /* lane-wise max of the four accumulators */
-    "  pmaxsw      %%mm4, %%mm5     \n\t"
-    "  pmaxsw      %%mm5, %%mm7     \n\t"
-    "  movq        %%mm7, %%mm6     \n\t" /* then max across the four lanes of mm7 */
-    "  psrlq       $32, %%mm6       \n\t"
-    "  pmaxsw      %%mm6, %%mm7     \n\t"
-    "  movq        %%mm7, %%mm6     \n\t"
-    "  psrlq       $16, %%mm6       \n\t"
-    "  pmaxsw      %%mm6, %%mm7     \n\t"
-    "  movd        %%mm7, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t" /* only lane 0 holds the final maximum */
-
-     : "=r" (MaxSad),
-       "+r" (Src1), 
-       "+r" (Src2) 
-     : "r" ((ogg_uint64_t)stride)
-     : "memory", "rdi"
-  );
-
-  return MaxSad;
-}
-
-/*
- * Variance-style error metric of the difference between an 8x8 block and the
- * byte-wise average (pavgb) of two reference blocks.
- *
- * Over eight rows, the per-pixel difference is formed in 16-bit lanes; the
- * asm accumulates the sum of the differences (XSum) and the sum of their
- * squares (XXSum).  The function returns 64*XXSum - XSum*XSum as an
- * ogg_uint32_t.  SrcData advances by SrcStride; both reference pointers
- * advance by RefStride.  No emms is issued here.
- *
- * XSum and XXSum are 32-bit objects because they are memory operands written
- * with 32-bit stores (movl / movd); 64-bit objects would keep indeterminate
- * upper bytes.  The 32-bit return value is the same either way, since only
- * the low 32 bits of each term contribute to it.
- */
-static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
-                                              unsigned char *RefDataPtr1,
-                                              unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
-{
-  ogg_uint32_t XSum;
-  ogg_uint32_t XXSum;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm4, %%mm4     \n\t" /* mm4 = 0 for the reference unpacks */
-    "  pxor        %%mm5, %%mm5     \n\t" /* mm5 = running sums, 16-bit lanes */
-    "  pxor        %%mm6, %%mm6     \n\t" /* mm6 = 0 for the source unpacks */
-    "  pxor        %%mm7, %%mm7     \n\t" /* mm7 = running sums of squares, 32-bit lanes */
-    "  mov         $8, %%rdi        \n\t" /* row counter */
-    "1:                             \n\t"
-    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
-
-    "  movq        (%3), %%mm2      \n\t"
-    "  movq        (%4), %%mm1      \n\t"	/* take average of mm2 and mm1 */
-    "  pavgb       %%mm2, %%mm1     \n\t"
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  movq        %%mm1, %%mm3     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"
-    "  punpcklbw   %%mm4, %%mm1     \n\t"
-    "  punpckhbw   %%mm6, %%mm2     \n\t"
-    "  punpckhbw   %%mm4, %%mm3     \n\t"
-
-    "  psubsw      %%mm1, %%mm0     \n\t" /* src - avg(ref1, ref2), low 4 pixels */
-    "  psubsw      %%mm3, %%mm2     \n\t" /* src - avg(ref1, ref2), high 4 pixels */
-
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  paddw       %%mm2, %%mm5     \n\t"
-
-    "  pmaddwd     %%mm0, %%mm0     \n\t" /* squares, pairwise added into 32-bit lanes */
-    "  pmaddwd     %%mm2, %%mm2     \n\t"
-    
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  paddd       %%mm2, %%mm7     \n\t"
-
-    "  add         %5, %2           \n\t"	/* Inc pointer into src data */
-    "  add         %6, %3           \n\t"	/* Inc pointer into ref data */
-    "  add         %6, %4           \n\t"	/* Inc pointer into ref data */
-
-    "  dec         %%rdi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  movq        %%mm5, %%mm0     \n\t" /* fold the four 16-bit sums into lane 0 */
-    "  psrlq       $32, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $16, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movd        %%mm5, %%edi     \n\t"
-    "  movsx       %%di, %%edi      \n\t" /* keep only lane 0, sign-extended to 32 bits */
-    "  movl        %%edi, %0        \n\t" /* 32-bit store into XSum */
-
-    "  movq        %%mm7, %%mm0     \n\t" /* fold the two 32-bit sums into the low lane */
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %1        \n\t" /* 32-bit store into XXSum */
-
-     : "=m" (XSum),
-       "=m" (XXSum),
-       "+r" (SrcData), 
-       "+r" (RefDataPtr1),
-       "+r" (RefDataPtr2) 
-     : "r" ((ogg_uint64_t)SrcStride),
-       "r" ((ogg_uint64_t)RefStride)
-     : "rdi", "memory"
-  );
-
-  /* Compute and return population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ));
-}
-
-/*
- * Announce on stderr, then install the x86_64 MMXEXT routines into *funcs.
- * The message now reads "accelerated" (it was misspelled "accerated"),
- * matching the wording of the other init messages.
- */
-void dsp_mmxext_init(DspFunctions *funcs)
-{
-  fprintf(stderr, "enabling accelerated x86_64 mmxext dsp functions.\n");
-  funcs->row_sad8 = row_sad8__mmxext;
-  funcs->col_sad8x8 = col_sad8x8__mmxext;
-  funcs->sad8x8 = sad8x8__mmxext;
-  funcs->sad8x8_thres = sad8x8_thres__mmxext;
-  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;
-  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext;
-}

Copied: trunk/theora/lib/x86_64/dsp_mmxext.c (from rev 11426, branches/theora-mmx/lib/x86_64/dsp_mmxext.c)

Deleted: trunk/theora/lib/x86_64/fdct_mmx.c
===================================================================
--- branches/theora-mmx/lib/x86_64/fdct_mmx.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/x86_64/fdct_mmx.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -1,349 +0,0 @@
-;//==========================================================================
-;//
-;//  THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
-;//  KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-;//  IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR
-;//  PURPOSE.
-;//
-;//  Copyright (c) 1999 - 2001  On2 Technologies Inc. All Rights Reserved.
-;//
-;//--------------------------------------------------------------------------
-
-#include <stdio.h>
-#include <theora/theora.h>
-#include "dsp.h"
-
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC1S7 = 0x0fb15fb15fb15fb15LL; /* 0xfb15 = round(65536*cos(1*pi/16)) in each 16-bit lane */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC2S6 = 0x0ec83ec83ec83ec83LL; /* 0xec83 = round(65536*cos(2*pi/16)) in each 16-bit lane */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL; /* 0xd4db = round(65536*cos(3*pi/16)) in each 16-bit lane */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC4S4 = 0x0b505b505b505b505LL; /* 0xb505 = round(65536*cos(4*pi/16)) in each 16-bit lane */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL; /* 0x8e3a = round(65536*cos(5*pi/16)) in each 16-bit lane */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC6S2 = 0x061f861f861f861f8LL; /* 0x61f8 = round(65536*cos(6*pi/16)) in each 16-bit lane */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC7S1 = 0x031f131f131f131f1LL; /* 0x31f1 = round(65536*cos(7*pi/16)) in each 16-bit lane */
-
-#if defined(__MINGW32__) || defined(__CYGWIN__) || \
-    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
-# define M(a) "_" #a  /* stringize a with a leading underscore on these targets */
-#else
-# define M(a) #a      /* stringize a unchanged everywhere else */
-#endif
-
-/***********************************************************************
- *	File:			fdct_m.asm
- *
- *	Description:
- *					This function performs a 2-D Forward DCT on an 8x8 block
- *					
- *
- *	Input:			Pointers to input source data buffer and destination 
- *					buffer.
- *
- *	Note:			none
- *
- *	Special Notes:	We try to do the truncation right to match the result 
- *					of the c version. 
- *
- ************************************************************************/
-
-/* Forward-DCT pass over eight quadwords (four 16-bit lanes each): reads ip0..ip7, stores the results back to ip0..ip7, spills one quadword to temp. */
-#define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp)                        \
-  "  movq      " #ip0 ", %%mm0      \n\t"                                     \
-  "  movq      " #ip1 ", %%mm1      \n\t"                                     \
-  "  movq      " #ip3 ", %%mm2      \n\t"                                     \
-  "  movq      " #ip5 ", %%mm3      \n\t"                                     \
-  "  movq        %%mm0, %%mm4       \n\t"                                     \
-  "  movq        %%mm1, %%mm5       \n\t"                                     \
-  "  movq        %%mm2, %%mm6       \n\t"                                     \
-  "  movq        %%mm3, %%mm7       \n\t"                                     \
-  /* Butterflies: isNM = ipN + ipM, idNM = ipN - ipM (saturating word ops) */ \
-  "  paddsw    " #ip7 ", %%mm0      \n\t" /* mm0 = ip0 + ip7 = is07 */        \
-  "  paddsw    " #ip2 ", %%mm1      \n\t" /* mm1 = ip1 + ip2 = is12 */        \
-  "  paddsw    " #ip4 ", %%mm2      \n\t" /* mm2 = ip3 + ip4 = is34 */        \
-  "  paddsw    " #ip6 ", %%mm3      \n\t" /* mm3 = ip5 + ip6 = is56 */        \
-  "  psubsw    " #ip7 ", %%mm4      \n\t" /* mm4 = ip0 - ip7 = id07 */        \
-  "  psubsw    " #ip2 ", %%mm5      \n\t" /* mm5 = ip1 - ip2 = id12 */        \
-                                                                              \
-  "  psubsw      %%mm2, %%mm0       \n\t" /* mm0 = is07 - is34 */             \
-                                                                              \
-  "  paddsw      %%mm2, %%mm2       \n\t"                                     \
-                                                                              \
-  "  psubsw    " #ip4 ", %%mm6      \n\t" /* mm6 = ip3 - ip4 = id34 */        \
-                                                                              \
-  "  paddsw      %%mm0, %%mm2       \n\t" /* mm2 = is07 + is34 = is0734 */    \
-  "  psubsw      %%mm3, %%mm1       \n\t" /* mm1 = is12 - is56 */             \
-  "  movq        %%mm0," #temp "    \n\t" /* Save is07 - is34 to free mm0; */ \
-  "  paddsw      %%mm3, %%mm3       \n\t"                                     \
-  "  paddsw      %%mm1, %%mm3       \n\t" /* mm3 = is12 + is56 = is1256 */    \
-                                                                              \
-  "  psubsw    " #ip6 ", %%mm7      \n\t" /* mm7 = ip5 - ip6 = id56 */        \
-  /* ------------------------------------------------------------------- */   \
-  "  psubsw      %%mm7, %%mm5       \n\t" /* mm5 = id12 - id56 */             \
-  "  paddsw      %%mm7, %%mm7       \n\t"                                     \
-  "  paddsw      %%mm5, %%mm7       \n\t" /* mm7 = id12 + id56 */             \
-  /* ------------------------------------------------------------------- */   \
-  "  psubsw      %%mm3, %%mm2       \n\t" /* mm2 = is0734 - is1256 */         \
-  "  paddsw      %%mm3, %%mm3       \n\t"                                     \
-                                                                              \
-  "  movq        %%mm2, %%mm0       \n\t" /* make a copy */                   \
-  "  paddsw      %%mm2, %%mm3       \n\t" /* mm3 = is0734 + is1256 */         \
-  /* pmulhw is signed: constants >= 0x8000 give c*x - x, so x is added back; psrlw $15 supplies the sign-bit correction */ \
-  "  pmulhw      %[xC4S4], %%mm0    \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \
-  "  paddw       %%mm2, %%mm0       \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-  "  paddw       %%mm2, %%mm0       \n\t" /* Truncate mm0, now it is op[4] */ \
-                                                                              \
-  "  movq        %%mm3, %%mm2       \n\t"                                     \
-  "  movq        %%mm0," #ip4 "     \n\t" /* save ip4, now mm0,mm2 are free */ \
-                                                                              \
-  "  movq        %%mm3, %%mm0       \n\t"                                     \
-  "  pmulhw      %[xC4S4], %%mm3    \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
-                                                                              \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-  "  paddw       %%mm0, %%mm3       \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \
-  "  paddw       %%mm2, %%mm3       \n\t" /* Truncate mm3, now it is op[0] */ \
-                                                                              \
-  "  movq        %%mm3," #ip0 "     \n\t"                                     \
-  /* ------------------------------------------------------------------- */   \
-  "  movq      " #temp ", %%mm3     \n\t" /* mm3 = irot_input_y */            \
-  "  pmulhw      %[xC2S6], %%mm3     \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \
-                                                                              \
-  "  movq      " #temp ", %%mm2     \n\t"                                     \
-  "  movq        %%mm2, %%mm0       \n\t"                                     \
-                                                                              \
-  "  psrlw       $15, %%mm2         \n\t" /* mm2 = sign bit of irot_input_y */ \
-  "  paddw       %%mm0, %%mm3       \n\t" /* mm3 = xC2S6 * irot_input_y */    \
-                                                                              \
-  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
-  "  movq        %%mm5, %%mm0       \n\t"                                     \
-                                                                              \
-  "  movq        %%mm5, %%mm2       \n\t"                                     \
-  "  pmulhw      %[xC6S2], %%mm0    \n\t" /* mm0 = xC6S2 * irot_input_x */    \
-                                                                              \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-  "  paddw       %%mm2, %%mm0       \n\t" /* Truncated */                     \
-                                                                              \
-  "  paddsw      %%mm0, %%mm3       \n\t" /* ip[2] */                         \
-  "  movq        %%mm3," #ip2 "     \n\t" /* Save ip2 */                      \
-                                                                              \
-  "  movq        %%mm5, %%mm0       \n\t"                                     \
-  "  movq        %%mm5, %%mm2       \n\t"                                     \
-                                                                              \
-  "  pmulhw      %[xC2S6], %%mm5     \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  movq      " #temp ", %%mm3     \n\t"                                     \
-  "  paddw       %%mm0, %%mm5       \n\t" /* mm5 = xC2S6 * irot_input_x */    \
-                                                                              \
-  "  paddw       %%mm2, %%mm5       \n\t" /* Truncated */                     \
-  "  movq        %%mm3, %%mm2       \n\t"                                     \
-                                                                              \
-  "  pmulhw      %[xC6S2], %%mm3    \n\t" /* mm3 = xC6S2 * irot_input_y */    \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
-  "  psubsw      %%mm5, %%mm3       \n\t"                                     \
-                                                                              \
-  "  movq        %%mm3," #ip6 "     \n\t"                                     \
-  /* ------------------------------------------------------------------- */   \
-  "  movq        %[xC4S4], %%mm0    \n\t"                                     \
-  "  movq        %%mm1, %%mm2       \n\t"                                     \
-  "  movq        %%mm1, %%mm3       \n\t"                                     \
-                                                                              \
-  "  pmulhw      %%mm0, %%mm1       \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
-  "  psrlw       $15, %%mm2         \n\t"				      \
-                                                                              \
-  "  paddw       %%mm3, %%mm1       \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) */ \
-  "  paddw       %%mm2, %%mm1       \n\t" /* Truncate mm1, now it is icommon_product1 */ \
-                                                                              \
-  "  movq        %%mm7, %%mm2       \n\t"                                     \
-  "  movq        %%mm7, %%mm3       \n\t"			              \
-                                                                              \
-  "  pmulhw      %%mm0, %%mm7       \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
-  "  psrlw       $15, %%mm2         \n\t"			              \
-                                                                              \
-  "  paddw       %%mm3, %%mm7       \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
-  "  paddw       %%mm2, %%mm7       \n\t" /* Truncate mm7, now it is icommon_product2 */ \
-  /* ------------------------------------------------------------------- */   \
-  "  pxor        %%mm0, %%mm0       \n\t" /* Clear mm0 */                     \
-  "  psubsw      %%mm6, %%mm0       \n\t" /* mm0 = - id34 */                  \
-                                                                              \
-  "  psubsw      %%mm7, %%mm0       \n\t" /* mm0 = - ( id34 + idcommon_product2 ) */ \
-  "  paddsw      %%mm6, %%mm6       \n\t"                                     \
-  "  paddsw      %%mm0, %%mm6       \n\t" /* mm6 = id34 - icommon_product2 */ \
-                                                                              \
-  "  psubsw      %%mm1, %%mm4       \n\t" /* mm4 = id07 - icommon_product1 */ \
-  "  paddsw      %%mm1, %%mm1       \n\t"                                     \
-  "  paddsw      %%mm4, %%mm1       \n\t" /* mm1 = id07 + icommon_product1 */ \
-  /* ------------------------------------------------------------------- */   \
-  "  movq        %[xC1S7], %%mm7    \n\t"                                     \
-  "  movq        %%mm1, %%mm2       \n\t"                                     \
-                                                                              \
-  "  movq        %%mm1, %%mm3       \n\t"                                     \
-  "  pmulhw      %%mm7, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \
-                                                                              \
-  "  movq        %[xC7S1], %%mm7    \n\t"                                     \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm3, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x */    \
-  "  paddw       %%mm2, %%mm1       \n\t" /* Truncated */                     \
-                                                                              \
-  "  pmulhw      %%mm7, %%mm3       \n\t" /* mm3 = xC7S1 * irot_input_x */    \
-  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
-                                                                              \
-  "  movq        %%mm0, %%mm5       \n\t"                                     \
-  "  movq        %%mm0, %%mm2       \n\t"                                     \
-                                                                              \
-  "  movq        %[xC1S7], %%mm7    \n\t"                                     \
-  "  pmulhw      %%mm7, %%mm0       \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \
-                                                                              \
-  "  movq        %[xC7S1], %%mm7    \n\t"                                     \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm5, %%mm0       \n\t" /* mm0 = xC1S7 * irot_input_y */    \
-  "  paddw       %%mm2, %%mm0       \n\t" /* Truncated */                     \
-                                                                              \
-  "  pmulhw      %%mm7, %%mm5       \n\t" /* mm5 = xC7S1 * irot_input_y */    \
-  "  paddw       %%mm2, %%mm5       \n\t" /* Truncated */                     \
-                                                                              \
-  "  psubsw      %%mm5, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \
-  "  paddsw      %%mm0, %%mm3       \n\t" /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = ip7 */ \
-                                                                              \
-  "  movq        %%mm1," #ip1 "     \n\t"                                     \
-  "  movq        %%mm3," #ip7 "     \n\t"                                     \
-  /* ------------------------------------------------------------------- */   \
-  "  movq        %[xC3S5], %%mm0    \n\t"                                     \
-  "  movq        %[xC5S3], %%mm1    \n\t"                                     \
-                                                                              \
-  "  movq        %%mm6, %%mm5       \n\t"                                     \
-  "  movq        %%mm6, %%mm7       \n\t"                                     \
-                                                                              \
-  "  movq        %%mm4, %%mm2       \n\t"                                     \
-  "  movq        %%mm4, %%mm3       \n\t"                                     \
-                                                                              \
-  "  pmulhw      %%mm0, %%mm4       \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \
-  "  pmulhw      %%mm1, %%mm6       \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \
-                                                                              \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-  "  psrlw       $15, %%mm5         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm3, %%mm4       \n\t" /* mm4 = xC3S5 * irot_input_x */    \
-  "  paddw       %%mm7, %%mm6       \n\t" /* mm6 = xC5S3 * irot_input_y */    \
-                                                                              \
-  "  paddw       %%mm2, %%mm4       \n\t" /* Truncated */                     \
-  "  paddw       %%mm5, %%mm6       \n\t" /* Truncated */                     \
-                                                                              \
-  "  psubsw      %%mm6, %%mm4       \n\t" /* ip3 */                           \
-  "  movq        %%mm4," #ip3 "     \n\t"                                     \
-                                                                              \
-  "  movq        %%mm3, %%mm4       \n\t"                                     \
-  "  movq        %%mm7, %%mm6       \n\t"                                     \
-                                                                              \
-  "  pmulhw      %%mm1, %%mm3       \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \
-  "  pmulhw      %%mm0, %%mm7       \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \
-                                                                              \
-  "  paddw       %%mm2, %%mm4       \n\t"                                     \
-  "  paddw       %%mm5, %%mm6       \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm4, %%mm3       \n\t" /* mm3 = xC5S3 * irot_input_x */    \
-  "  paddw       %%mm6, %%mm7       \n\t" /* mm7 = xC3S5 * irot_input_y */    \
-                                                                              \
-  "  paddw       %%mm7, %%mm3       \n\t" /* ip5 */                           \
-  "  movq        %%mm3," #ip5 "     \n\t" 
-
-#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,                  \
-		      op0,op1,op2,op3,op4,op5,op6,op7) /* two 4x4 word transposes: ip0-3 -> op0-3, ip4-7 -> op4-7 */ \
-  "  movq      " #ip0 ", %%mm0      \n\t" /* mm0 = a0 a1 a2 a3 */       \
-  "  movq      " #ip4 ", %%mm4      \n\t" /* mm4 = e4 e5 e6 e7 */       \
-  "  movq      " #ip1 ", %%mm1      \n\t" /* mm1 = b0 b1 b2 b3 */       \
-  "  movq      " #ip5 ", %%mm5      \n\t" /* mm5 = f4 f5 f6 f7 */       \
-  "  movq      " #ip2 ", %%mm2      \n\t" /* mm2 = c0 c1 c2 c3 */       \
-  "  movq      " #ip6 ", %%mm6      \n\t" /* mm6 = g4 g5 g6 g7 */       \
-  "  movq      " #ip3 ", %%mm3      \n\t" /* mm3 = d0 d1 d2 d3 */       \
-  "  movq        %%mm1," #op1 "     \n\t" /* save  b0 b1 b2 b3 (op1 doubles as scratch) */ \
-  "  movq      " #ip7 ", %%mm7      \n\t" /* mm7 = h0 h1 h2 h3 */       \
-   /* Transpose 2x8 block */                                            \
-  "  movq        %%mm4, %%mm1       \n\t" /* mm1 = e3 e2 e1 e0 */       \
-  "  punpcklwd   %%mm5, %%mm4       \n\t" /* mm4 = f1 e1 f0 e0 */       \
-  "  movq        %%mm0," #op0 "     \n\t" /* save a3 a2 a1 a0 (op0 doubles as scratch) */ \
-  "  punpckhwd	 %%mm5, %%mm1       \n\t" /* mm1 = f3 e3 f2 e2 */       \
-  "  movq        %%mm6, %%mm0       \n\t" /* mm0 = g3 g2 g1 g0 */       \
-  "  punpcklwd	 %%mm7, %%mm6       \n\t" /* mm6 = h1 g1 h0 g0 */       \
-  "  movq        %%mm4, %%mm5       \n\t" /* mm5 = f1 e1 f0 e0 */       \
-  "  punpckldq   %%mm6, %%mm4       \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \
-  "  punpckhdq   %%mm6, %%mm5       \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \
-  "  movq        %%mm1, %%mm6       \n\t" /* mm6 = f3 e3 f2 e2 */       \
-  "  movq        %%mm4," #op4 "     \n\t"                               \
-  "  punpckhwd   %%mm7, %%mm0       \n\t" /* mm0 = h3 g3 h2 g2 */       \
-  "  movq        %%mm5," #op5 "     \n\t"                               \
-  "  punpckhdq   %%mm0, %%mm6       \n\t" /* mm6 = h3 g3 f3 e3 = MM7 */ \
-  "  movq      " #op0 ", %%mm4      \n\t" /* mm4 = a3 a2 a1 a0 */       \
-  "  punpckldq   %%mm0, %%mm1       \n\t" /* mm1 = h2 g2 f2 e2 = MM6 */ \
-  "  movq      " #op1 ", %%mm5      \n\t" /* mm5 = b3 b2 b1 b0 */       \
-  "  movq        %%mm4, %%mm0       \n\t" /* mm0 = a3 a2 a1 a0 */       \
-  "  movq        %%mm6," #op7 "     \n\t"                               \
-  "  punpcklwd   %%mm5, %%mm0       \n\t" /* mm0 = b1 a1 b0 a0 */       \
-  "  movq        %%mm1," #op6 "     \n\t"                               \
-  "  punpckhwd   %%mm5, %%mm4       \n\t" /* mm4 = b3 a3 b2 a2 */       \
-  "  movq        %%mm2, %%mm5       \n\t" /* mm5 = c3 c2 c1 c0 */       \
-  "  punpcklwd   %%mm3, %%mm2       \n\t" /* mm2 = d1 c1 d0 c0 */       \
-  "  movq        %%mm0, %%mm1       \n\t" /* mm1 = b1 a1 b0 a0 */       \
-  "  punpckldq   %%mm2, %%mm0       \n\t" /* mm0 = d0 c0 b0 a0 = MM0 */ \
-  "  punpckhdq   %%mm2, %%mm1       \n\t" /* mm1 = d1 c1 b1 a1 = MM1 */ \
-  "  movq        %%mm4, %%mm2       \n\t" /* mm2 = b3 a3 b2 a2 */       \
-  "  movq        %%mm0," #op0 "     \n\t"                               \
-  "  punpckhwd   %%mm3, %%mm5       \n\t" /* mm5 = d3 c3 d2 c2 */       \
-  "  movq        %%mm1," #op1 "     \n\t"                               \
-  "  punpckhdq   %%mm5, %%mm4       \n\t" /* mm4 = d3 c3 b3 a3 = MM3 */ \
-  "  punpckldq   %%mm5, %%mm2       \n\t" /* mm2 = d2 c2 b2 a2 = MM2 */ \
-  "  movq        %%mm4," #op3 "     \n\t"                               \
-  "  movq        %%mm2," #op2 "     \n\t"
-
-
-static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData)
-{
-  ogg_int64_t __attribute__((aligned(8))) align_tmp[16];
-  ogg_int16_t *const temp= (ogg_int16_t*)align_tmp;
-  /* temp (%2) is the 8-byte-aligned scratch quadword handed to Fdct_mmx; %0 = InputData, %1 = OutputData. */
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-    /*
-     * Input data is an 8x8 block.  To make processing of the data more efficient
-     * each pass first transposes a group of quadwords and then runs Fdct_mmx on it.
-     */
-    Transpose_mmx (  (%0), 16(%0), 32(%0), 48(%0),  8(%0), 24(%0), 40(%0), 56(%0),
-		     (%1), 16(%1), 32(%1), 48(%1),  8(%1), 24(%1), 40(%1), 56(%1))
-    Fdct_mmx      (  (%1), 16(%1), 32(%1), 48(%1),  8(%1), 24(%1), 40(%1), 56(%1), (%2))
-    /* Second pass: input byte offsets 64..120, transposed into the same offsets of OutputData. */
-    Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
-		   64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
-    Fdct_mmx      (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
-    /* Third pass: OutputData offsets 0..48 and 64..112, transposed in place and transformed. */
-    Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
-		    0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
-    Fdct_mmx      ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
-    /* Fourth pass: OutputData offsets 8..56 and 72..120, transposed in place and transformed. */
-    Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
-		    8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
-    Fdct_mmx      ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
-
-    "  emms                         \n\t" /* leave MMX state before returning to C code */
-    
-    : "+r" (InputData),
-      "+r" (OutputData)
-    : "r" (temp),
-      [xC1S7] "m" (xC1S7),      /* gcc 3.1+ allows named asm parameters */
-      [xC2S6] "m" (xC2S6),
-      [xC3S5] "m" (xC3S5),
-      [xC4S4] "m" (xC4S4),
-      [xC5S3] "m" (xC5S3),
-      [xC6S2] "m" (xC6S2),
-      [xC7S1] "m" (xC7S1)
-    : "memory"
-  );
-}
-
-void dsp_mmx_fdct_init(DspFunctions *funcs)
-{ /* Report on stderr, then select fdct_short__mmx as funcs->fdct_short. */
-  fprintf(stderr, "enabling accelerated x86_64 mmx fdct function.\n");
-  funcs->fdct_short = fdct_short__mmx;
-}

Copied: trunk/theora/lib/x86_64/fdct_mmx.c (from rev 11426, branches/theora-mmx/lib/x86_64/fdct_mmx.c)

Deleted: trunk/theora/lib/x86_64/recon_mmx.c
===================================================================
--- branches/theora-mmx/lib/x86_64/recon_mmx.c	2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/x86_64/recon_mmx.c	2006-05-26 18:51:09 UTC (rev 11427)
@@ -1,181 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: reconstruct.c,v 1.6 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include <stdio.h>
-#include "codec_internal.h"
-
-static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x8080808080808080LL; /* 0x80 in every byte; XOR mask used by recon_intra8x8__mmx to add 128 */
-
-static void copy8x8__mmx (unsigned char *src,
-                          unsigned char *dest,
-                          ogg_uint32_t stride)
-{
-  __asm__ __volatile__ (
-    "  .balign 16                      \n\t"
-    /* Copy 8 rows of 8 bytes from src (%1) to dest (%0); both use the same row pitch %2 = stride. */
-    "  lea         (%2, %2, 2), %%rdi  \n\t" /* rdi = 3 * stride */
-    /* Rows 0-3: load into mm0-mm3. */
-    "  movq        (%1), %%mm0         \n\t"
-    "  movq        (%1, %2), %%mm1     \n\t"
-    "  movq        (%1, %2, 2), %%mm2  \n\t"
-    "  movq        (%1, %%rdi), %%mm3  \n\t"
-
-    "  lea         (%1, %2, 4), %1     \n\t" /* src += 4 * stride (src is modified here) */
-
-    "  movq        %%mm0, (%0)         \n\t"
-    "  movq        %%mm1, (%0, %2)     \n\t"
-    "  movq        %%mm2, (%0, %2, 2)  \n\t"
-    "  movq        %%mm3, (%0, %%rdi)  \n\t"
-
-    "  lea         (%0, %2, 4), %0     \n\t" /* dest += 4 * stride */
-    /* Rows 4-7: same load/store pattern from the advanced pointers. */
-    "  movq        (%1), %%mm0         \n\t"
-    "  movq        (%1, %2), %%mm1     \n\t"
-    "  movq        (%1, %2, 2), %%mm2  \n\t"
-    "  movq        (%1, %%rdi), %%mm3  \n\t"
-    /* NOTE(review): no emms is issued here; presumably handled by the caller - confirm. */
-    "  movq        %%mm0, (%0)         \n\t"
-    "  movq        %%mm1, (%0, %2)     \n\t"
-    "  movq        %%mm2, (%0, %2, 2)  \n\t"
-    "  movq        %%mm3, (%0, %%rdi)  \n\t"
-      : "+a" (dest),
-        "+c" (src)   /* read-write: the asm advances it, so it must not be an input-only operand */
-      : "d" ((ogg_uint64_t)stride)
-      : "memory", "rdi"
-  );
-}
-
-static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
-                                 ogg_uint32_t LineStep)
-{
-  __asm__ __volatile__ (
-    "  .balign 16                      \n\t"
-    /* Per row: pack 8 signed words from ChangePtr (%1) to bytes, add 128, store at ReconPtr (%0). */
-    "  movq        %[V128], %%mm0      \n\t" /* Set mm0 to 0x8080808080808080 */
-    /* The loop runs until %1 reaches rdi, i.e. over 128 bytes of input in 16-byte steps. */
-    "  lea         128(%1), %%rdi      \n\t" /* Endpoint in input buffer */
-    "1:                                \n\t" 
-    "  movq         (%1), %%mm2        \n\t" /* First four input values */
-
-    "  packsswb    8(%1), %%mm2        \n\t" /* pack with next(high) four values */
-    "  por         %%mm0, %%mm0        \n\t" /* leaves mm0 unchanged */
-    "  pxor        %%mm0, %%mm2        \n\t" /* Convert result to unsigned (same as add 128) */
-    "  lea         16(%1), %1          \n\t" /* Step source buffer (modifies ChangePtr) */
-    "  cmp         %%rdi, %1           \n\t" /* are we done; movq/lea below keep the flags */
-
-    "  movq        %%mm2, (%0)         \n\t" /* store results */
-
-    "  lea         (%0, %2), %0        \n\t" /* Step output buffer */
-    "  jc          1b                  \n\t" /* Loop back if we are not done */
-      : "+r" (ReconPtr),
-        "+r" (ChangePtr) /* read-write: advanced inside the loop, so not a valid input-only operand */
-      : "r" ((ogg_uint64_t)LineStep),
-        [V128] "m" (V128)
-      : "memory", "rdi"
-  );
-}
-
-static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
-                                 ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
-{
-  __asm__ __volatile__ (
-    "  .balign 16                      \n\t"
-    /* Per row: ReconPtr (%0) = saturate_u8(RefPtr (%2) pixels + ChangePtr (%1) words); %3 = LineStep. */
-    "  pxor        %%mm0, %%mm0        \n\t" /* mm0 = 0, used to widen bytes to words */
-    "  lea         128(%1), %%rdi      \n\t" /* end of the change buffer: loop stops when %1 reaches it */
-
-    "1:                                \n\t"
-    "  movq        (%2), %%mm2         \n\t" /* (+3 misaligned) 8 reference pixels */
-
-    "  movq        (%1), %%mm4         \n\t" /* first 4 changes */
-    "  movq        %%mm2, %%mm3        \n\t"
-    "  movq        8(%1), %%mm5        \n\t" /* last 4 changes */
-    "  punpcklbw   %%mm0, %%mm2        \n\t" /* turn first 4 refs into positive 16-bit #s */
-    "  paddsw      %%mm4, %%mm2        \n\t" /* add in first 4 changes */
-    "  punpckhbw   %%mm0, %%mm3        \n\t" /* turn last 4 refs into positive 16-bit #s */
-    "  paddsw      %%mm5, %%mm3        \n\t" /* add in last 4 changes */
-    "  add         %3, %2              \n\t" /* next row of reference pixels (modifies RefPtr) */
-    "  packuswb    %%mm3, %%mm2        \n\t" /* pack result to unsigned 8-bit values */
-    "  lea         16(%1), %1          \n\t" /* next row of changes (modifies ChangePtr) */
-    "  cmp         %%rdi, %1           \n\t" /* are we done? movq/lea below keep the flags */
-
-    "  movq        %%mm2, (%0)         \n\t" /* store result */
-
-    "  lea         (%0, %3), %0        \n\t" /* next row of output */
-    "  jc          1b                  \n\t"
-      : "+r" (ReconPtr),
-        "+r" (ChangePtr), /* read-write: both pointers below are advanced by the loop */
-        "+r" (RefPtr)
-      : "r" ((ogg_uint64_t)LineStep)
-      : "memory", "rdi"
-  );
-}
-
-static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
-                                      unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
-                                      ogg_uint32_t LineStep)
-{
-  __asm__ __volatile__ (
-    "  .balign 16                      \n\t"
-    /* Per row: ReconPtr (%0) = pack_u8(((RefPtr1 (%2) + RefPtr2 (%3)) >> 1) + ChangePtr (%1)); %4 = LineStep. */
-    "  pxor        %%mm0, %%mm0        \n\t" /* mm0 = 0, used to widen bytes to words */
-    "  lea         128(%1), %%rdi      \n\t" /* end of the change buffer: loop stops when %1 reaches it */
-
-    "1:                                \n\t"
-    "  movq        (%2), %%mm2         \n\t" /* (+3 misaligned) 8 reference pixels */
-    "  movq        (%3), %%mm4         \n\t" /* (+3 misaligned) 8 reference pixels */
-
-    "  movq        %%mm2, %%mm3        \n\t"
-    "  punpcklbw   %%mm0, %%mm2        \n\t" /* mm2 = start ref1 as positive 16-bit #s */
-    "  movq        %%mm4, %%mm5        \n\t"
-    "  movq        (%1), %%mm6         \n\t" /* first 4 changes */
-    "  punpckhbw   %%mm0, %%mm3        \n\t" /* mm3 = end ref1 as positive 16-bit #s */
-    "  movq        8(%1), %%mm7        \n\t" /* last 4 changes */
-    "  punpcklbw   %%mm0, %%mm4        \n\t" /* mm4 = start ref2 as positive 16-bit #s */
-    "  punpckhbw   %%mm0, %%mm5        \n\t" /* mm5 = end ref2 as positive 16-bit #s */
-    "  paddw       %%mm4, %%mm2        \n\t" /* mm2 = start (ref1 + ref2) */
-    "  paddw       %%mm5, %%mm3        \n\t" /* mm3 = end (ref1 + ref2) */
-    "  psrlw       $1, %%mm2           \n\t" /* mm2 = start (ref1 + ref2)/2 */
-    "  psrlw       $1, %%mm3           \n\t" /* mm3 = end (ref1 + ref2)/2 */
-    "  paddw       %%mm6, %%mm2        \n\t" /* add changes to start */
-    "  paddw       %%mm7, %%mm3        \n\t" /* add changes to end */
-    "  lea         16(%1), %1          \n\t" /* next row of changes (modifies ChangePtr) */
-    "  packuswb    %%mm3, %%mm2        \n\t" /* pack start|end to unsigned 8-bit */
-    "  add         %4, %2              \n\t" /* next row of reference pixels (modifies RefPtr1) */
-    "  add         %4, %3              \n\t" /* next row of reference pixels (modifies RefPtr2) */
-    "  movq        %%mm2, (%0)         \n\t" /* store result */
-    "  add         %4, %0              \n\t" /* next row of output */
-    "  cmp         %%rdi, %1           \n\t" /* are we done? */
-    "  jc          1b                  \n\t"
-      : "+r" (ReconPtr),
-        "+r" (ChangePtr), /* read-write: all three pointers below are advanced by the loop */
-        "+r" (RefPtr1),
-        "+r" (RefPtr2)
-      : "r" ((ogg_uint64_t)LineStep)
-      : "memory", "rdi"
-  );
-}
-
-void dsp_mmx_recon_init(DspFunctions *funcs)
-{ /* Report on stderr, then route the four reconstruction slots of *funcs to the *__mmx routines. */
-  fprintf(stderr, "enabling accelerated x86_64 mmx recon functions.\n");
-  funcs->recon_inter8x8_half = recon_inter8x8_half__mmx;
-  funcs->recon_inter8x8 = recon_inter8x8__mmx;
-  funcs->recon_intra8x8 = recon_intra8x8__mmx;
-  funcs->copy8x8 = copy8x8__mmx;
-}
-

Copied: trunk/theora/lib/x86_64/recon_mmx.c (from rev 11426, branches/theora-mmx/lib/x86_64/recon_mmx.c)



More information about the commits mailing list