[xiph-commits] r11338 - in branches/theora-mmx: . lib lib/x86_32

Wed May 3 16:31:11 PDT 2006

Author: j
Date: 2006-05-03 16:31:00 -0700 (Wed, 03 May 2006)
New Revision: 11338

Added:
   branches/theora-mmx/lib/x86_32/
   branches/theora-mmx/lib/x86_32/dsp_mmx.c
   branches/theora-mmx/lib/x86_32/dsp_mmxext.c
   branches/theora-mmx/lib/x86_32/fdct_mmx.c
   branches/theora-mmx/lib/x86_32/recon_mmx.c
Removed:
   branches/theora-mmx/lib/i386/
   branches/theora-mmx/lib/x86_32/dsp_mmx.c
   branches/theora-mmx/lib/x86_32/dsp_mmxext.c
   branches/theora-mmx/lib/x86_32/fdct_mmx.c
   branches/theora-mmx/lib/x86_32/recon_mmx.c
Modified:
   branches/theora-mmx/configure.ac
   branches/theora-mmx/lib/Makefile.am
   branches/theora-mmx/lib/cpu.c
   branches/theora-mmx/lib/dct.c
   branches/theora-mmx/lib/dsp.c
   branches/theora-mmx/lib/dsp.h
   branches/theora-mmx/lib/reconstruct.c
Log:
- move i386 -> x86_32
- make mmx optional (also compiles on ppc like trunk now)



Modified: branches/theora-mmx/configure.ac
===================================================================

--- branches/theora-mmx/configure.ac	2006-05-03 22:32:17 UTC (rev 11337)
+++ branches/theora-mmx/configure.ac	2006-05-03 23:31:00 UTC (rev 11338)
@@ -102,12 +102,21 @@
 fi
 CFLAGS="$CFLAGS $cflags_save"
 
+cpu_optimization="no optimization for your platform, please send a patch"
 cpu_x86_64=no
-case $target in
-	x86_64-*)
-		cpu_x86_64=yes ;;
+cpu_x86_32=no
+case $target_cpu in
+	i[[3456]]86)
+		cpu_x86_32=yes 
+    cpu_optimization="32bit x86"
+    ;;
+	x86_64)
+		cpu_x86_64=yes
+    cpu_optimization="64bit x86"
+    ;;
 esac
 AM_CONDITIONAL([CPU_x86_64], [test x$cpu_x86_64 = xyes])
+AM_CONDITIONAL([CPU_x86_32], [test x$cpu_x86_32 = xyes])
 
 # Test whenever ld supports -version-script
 AC_PROG_LD
@@ -295,8 +304,9 @@
 
   General configuration:
 
-    Encoding support: ............ ${ac_enable_encode}
-    Floating point support: ...... ${ac_enable_float}
+    Encoding support: ........... ${ac_enable_encode}
+    Floating point support: ..... ${ac_enable_float}
+    Assembly optimization: ...... ${cpu_optimization}
 
   Installation paths:
 

Modified: branches/theora-mmx/lib/Makefile.am
===================================================================
--- branches/theora-mmx/lib/Makefile.am	2006-05-03 22:32:17 UTC (rev 11337)
+++ branches/theora-mmx/lib/Makefile.am	2006-05-03 23:31:00 UTC (rev 11338)
@@ -1,10 +1,10 @@
 INCLUDES = -I$(top_srcdir)/include
 
 EXTRA_DIST = Version_script.in \
-	i386/dsp_mmx.c \
-	i386/dsp_mmxext.c \
-	i386/recon_mmx.c \
-	i386/fdct_mmx.c \
+	x86_32/dsp_mmx.c \
+	x86_32/dsp_mmxext.c \
+	x86_32/recon_mmx.c \
+	x86_32/fdct_mmx.c \
 	x86_64/dsp_mmx.c \
 	x86_64/dsp_mmxext.c \
 	x86_64/recon_mmx.c \
@@ -20,10 +20,23 @@
 
 if CPU_x86_64
 arch_dir = x86_64
+arch_sources= \
+	$(arch_dir)/dsp_mmx.c \
+	$(arch_dir)/dsp_mmxext.c \
+	$(arch_dir)/recon_mmx.c \
+	$(arch_dir)/fdct_mmx.c
 else
-arch_dir = i386
+if CPU_x86_32
+arch_dir = x86_32
+arch_sources= \
+	$(arch_dir)/dsp_mmx.c \
+	$(arch_dir)/dsp_mmxext.c \
+	$(arch_dir)/recon_mmx.c \
+	$(arch_dir)/fdct_mmx.c
 endif
+endif
 
+
 libtheora_la_SOURCES = \
 	blockmap.c \
 	comment.c \
@@ -44,10 +57,7 @@
 	toplevel.c \
 	cpu.c \
 	dsp.c \
-	$(arch_dir)/dsp_mmx.c \
-	$(arch_dir)/dsp_mmxext.c \
-	$(arch_dir)/recon_mmx.c \
-	$(arch_dir)/fdct_mmx.c \
+  $(arch_sources) \
 	$(encoder_sources)
 
 noinst_HEADERS = \

Modified: branches/theora-mmx/lib/cpu.c
===================================================================
--- branches/theora-mmx/lib/cpu.c	2006-05-03 22:32:17 UTC (rev 11337)
+++ branches/theora-mmx/lib/cpu.c	2006-05-03 23:31:00 UTC (rev 11338)
@@ -33,7 +33,7 @@
                 "=d" (*edx)          
               : "a" (op)            
               : "cc");
-#else
+#elif defined(__x86)
   asm volatile ("pushl %%ebx   \n\t"
                 "cpuid         \n\t"
                 "movl %%ebx,%1 \n\t"
@@ -53,11 +53,11 @@
   ogg_uint32_t eax, ebx, ecx, edx;
   ogg_uint32_t flags;
 
-# if defined(__x86_64__)
+#if defined(__x86_64__)
 
   /* no need to check, we have cpuid on x86_64 */
 
-#else /* assume i386 */
+#elif defined(__x86)
   asm volatile ("pushfl              \n\t"
                 "pushfl              \n\t"
                 "popl %0             \n\t"

Modified: branches/theora-mmx/lib/dct.c
===================================================================
--- branches/theora-mmx/lib/dct.c	2006-05-03 22:32:17 UTC (rev 11337)
+++ branches/theora-mmx/lib/dct.c	2006-05-03 23:31:00 UTC (rev 11338)
@@ -256,8 +256,10 @@
 void dsp_dct_init (DspFunctions *funcs)
 {
   funcs->fdct_short = fdct_short__c;
+#if defined(__x86)
   if (cpu_flags & CPU_X86_MMX) {
     dsp_mmx_fdct_init(&dsp_funcs);
   }
+#endif
 }
 

Modified: branches/theora-mmx/lib/dsp.c
===================================================================
--- branches/theora-mmx/lib/dsp.c	2006-05-03 22:32:17 UTC (rev 11337)
+++ branches/theora-mmx/lib/dsp.c	2006-05-03 23:31:00 UTC (rev 11338)
@@ -405,13 +405,16 @@
 {
   cpu_init ();
   dsp_init (&dsp_funcs);
+
   dsp_recon_init (&dsp_funcs);
   dsp_dct_init (&dsp_funcs);
+#if defined(__x86)
   if (cpu_flags & CPU_X86_MMX) {
     dsp_mmx_init(&dsp_funcs);
   }
   if (cpu_flags & CPU_X86_MMXEXT) {
     dsp_mmxext_init(&dsp_funcs);
   }
+#endif
 }
 

Modified: branches/theora-mmx/lib/dsp.h
===================================================================
--- branches/theora-mmx/lib/dsp.h	2006-05-03 22:32:17 UTC (rev 11337)
+++ branches/theora-mmx/lib/dsp.h	2006-05-03 23:31:00 UTC (rev 11338)
@@ -85,11 +85,13 @@
 extern void dsp_recon_init (DspFunctions *funcs);
 
 void dsp_init(DspFunctions *funcs);
-void dsp_mmx_init(DspFunctions *funcs);
-void dsp_mmxext_init(DspFunctions *funcs);
-void dsp_mmx_fdct_init(DspFunctions *funcs);
-void dsp_mmx_recon_init(DspFunctions *funcs);
 void dsp_static_init(void);
+#if defined(__x86)
+extern void dsp_mmx_init(DspFunctions *funcs);
+extern void dsp_mmxext_init(DspFunctions *funcs);
+extern void dsp_mmx_fdct_init(DspFunctions *funcs);
+extern void dsp_mmx_recon_init(DspFunctions *funcs);
+#endif
 
 #define dsp_save_fpu(funcs) (funcs.save_fpu ())
 #define dsp_static_save_fpu() dsp_save_fpu(dsp_funcs)

Modified: branches/theora-mmx/lib/reconstruct.c
===================================================================
--- branches/theora-mmx/lib/reconstruct.c	2006-05-03 22:32:17 UTC (rev 11337)
+++ branches/theora-mmx/lib/reconstruct.c	2006-05-03 23:31:00 UTC (rev 11338)
@@ -104,7 +104,9 @@
   funcs->recon_intra8x8 = recon_intra8x8__c;
   funcs->recon_inter8x8 = recon_inter8x8__c;
   funcs->recon_inter8x8_half = recon_inter8x8_half__c;
+#if defined(__x86)
   if (cpu_flags & CPU_X86_MMX) {
     dsp_mmx_recon_init(&dsp_funcs);
   }
+#endif
 }

Copied: branches/theora-mmx/lib/x86_32 (from rev 11336, branches/theora-mmx/lib/i386)

Deleted: branches/theora-mmx/lib/x86_32/dsp_mmx.c
===================================================================
--- branches/theora-mmx/lib/i386/dsp_mmx.c	2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/x86_32/dsp_mmx.c	2006-05-03 23:31:00 UTC (rev 11338)
@@ -1,642 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include <stdlib.h>
-#include "dsp.h"
-
-static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x0080008000800080LL;
-
-#if defined(__MINGW32__) || defined(__CYGWIN__) || \
-    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
-# define M(a) "_" #a
-#else
-# define M(a) #a
-#endif
-
-#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
-#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
-#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
-
-static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
-                  ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
-                  ogg_uint32_t ReconPixelsPerLine) 
-{
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm7, %%mm7     \n\t" 
-
-    ".rept 8                        \n\t"
-    "  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */
-    "  movq        (%1), %%mm1      \n\t" /* mm1 = ReconPtr */
-    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
-    "  movq        %%mm1, %%mm3     \n\t" /* dup to prepare for up conversion */
-    /* convert from UINT8 to INT16 */
-    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr) */
-    "  punpcklbw   %%mm7, %%mm1     \n\t" /* mm1 = INT16(ReconPtr) */
-    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr) */
-    "  punpckhbw   %%mm7, %%mm3     \n\t" /* mm3 = INT16(ReconPtr) */
-    /* start calculation */
-    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - ReconPtr */
-    "  psubw       %%mm3, %%mm2     \n\t" /* mm2 = FiltPtr - ReconPtr */
-    "  movq        %%mm0,  (%2)     \n\t" /* write answer out */
-    "  movq        %%mm2, 8(%2)     \n\t" /* write answer out */
-    /* Increment pointers */
-    "  add         $16, %2           \n\t"
-    "  add         %3, %0           \n\t"
-    "  add         %4, %1           \n\t"
-    ".endr                          \n\t"
-
-     : "+r" (FiltPtr),
-       "+r" (ReconPtr),
-       "+r" (DctInputPtr)
-     : "m" (PixelsPerLine),
-       "m" (ReconPixelsPerLine) 
-     : "memory"
-  );
-}
-
-static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
-                      ogg_uint32_t PixelsPerLine) 
-{
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm7, %%mm7     \n\t" 
-    "  movq      "M(V128)", %%mm1   \n\t"
-
-    ".rept 8                        \n\t"
-    "  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */
-    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
-    /* convert from UINT8 to INT16 */
-    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr) */
-    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr) */
-    /* start calculation */
-    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - 128 */
-    "  psubw       %%mm1, %%mm2     \n\t" /* mm2 = FiltPtr - 128 */
-    "  movq        %%mm0,  (%1)     \n\t" /* write answer out */
-    "  movq        %%mm2, 8(%1)     \n\t" /* write answer out */
-    /* Increment pointers */
-    "  add         $16, %1           \n\t"
-    "  add         %2, %0           \n\t"
-    ".endr                          \n\t"
-
-     : "+r" (FiltPtr),
-       "+r" (DctInputPtr)
-     : "m" (PixelsPerLine)
-     : "memory"
-  );
-}
-
-static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
-                     unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
-                     ogg_uint32_t PixelsPerLine,
-                     ogg_uint32_t ReconPixelsPerLine) 
-{
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm7, %%mm7     \n\t" 
-
-    ".rept 8                        \n\t"
-    "  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */
-    "  movq        (%1), %%mm1      \n\t" /* mm1 = ReconPtr1 */
-    "  movq        (%2), %%mm4      \n\t" /* mm1 = ReconPtr2 */
-    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
-    "  movq        %%mm1, %%mm3     \n\t" /* dup to prepare for up conversion */
-    "  movq        %%mm4, %%mm5     \n\t" /* dup to prepare for up conversion */
-    /* convert from UINT8 to INT16 */
-    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr) */
-    "  punpcklbw   %%mm7, %%mm1     \n\t" /* mm1 = INT16(ReconPtr1) */
-    "  punpcklbw   %%mm7, %%mm4     \n\t" /* mm1 = INT16(ReconPtr2) */
-    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr) */
-    "  punpckhbw   %%mm7, %%mm3     \n\t" /* mm3 = INT16(ReconPtr1) */
-    "  punpckhbw   %%mm7, %%mm5     \n\t" /* mm3 = INT16(ReconPtr2) */
-    /* average ReconPtr1 and ReconPtr2 */
-    "  paddw       %%mm4, %%mm1     \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */
-    "  paddw       %%mm5, %%mm3     \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */
-    "  psrlw       $1, %%mm1        \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
-    "  psrlw       $1, %%mm3        \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
-    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-    "  psubw       %%mm3, %%mm2     \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
-    "  movq        %%mm0,  (%3)     \n\t" /* write answer out */
-    "  movq        %%mm2, 8(%3)     \n\t" /* write answer out */
-    /* Increment pointers */
-    "  add         $16, %3           \n\t"
-    "  add         %4, %0           \n\t"
-    "  add         %5, %1           \n\t"
-    "  add         %5, %2           \n\t"
-    ".endr                          \n\t"
-
-     : "+r" (FiltPtr),
-       "+r" (ReconPtr1),
-       "+r" (ReconPtr2),
-       "+r" (DctInputPtr)
-     : "m" (PixelsPerLine),
-       "m" (ReconPixelsPerLine) 
-     : "memory"
-  );
-}
-
-static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
-{
-  ogg_uint32_t MaxSad;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm6, %%mm6     \n\t"	/* zero out mm6 for unpack */
-    "  pxor        %%mm7, %%mm7     \n\t" 	/* zero out mm7 for unpack */
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%2), %%mm1      \n\t"
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
-
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"       /* ; unpack low four bytes to higher precision */
-    "  punpckhbw   %%mm7, %%mm1     \n\t"       /* ; unpack high four bytes to higher precision */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  movq        %%mm1, %%mm3     \n\t"
-    "  psrlq       $32, %%mm2       \n\t"	/* fold and add */
-    "  psrlq       $32, %%mm3       \n\t"
-    "  paddw       %%mm2, %%mm0     \n\t"
-    "  paddw       %%mm3, %%mm1     \n\t"
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  movq        %%mm1, %%mm3     \n\t"
-    "  psrlq       $16, %%mm2       \n\t"
-    "  psrlq       $16, %%mm3       \n\t"
-    "  paddw       %%mm2, %%mm0     \n\t"
-    "  paddw       %%mm3, %%mm1     \n\t"
-
-    "  psubusw     %%mm0, %%mm1     \n\t"
-    "  paddw       %%mm0, %%mm1     \n\t" 	/* mm1 = max(mm1, mm0) */
-    "  movd        %%mm1, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t"
-
-     : "=m" (MaxSad),
-       "+r" (Src1), 
-       "+r" (Src2) 
-     :
-     : "memory"
-  );
-  return MaxSad;
-}
-
-static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
-		                    ogg_uint32_t stride)
-{
-  ogg_uint32_t MaxSad;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm3, %%mm3     \n\t"	/* zero out mm3 for unpack */
-    "  pxor        %%mm4, %%mm4     \n\t"	/* mm4 low sum */
-    "  pxor        %%mm5, %%mm5     \n\t" 	/* mm5 high sum */
-    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 low sum */
-    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 high sum */
-    "  mov         $4, %%edi        \n\t"	/* 4 rows */
-    "1:                             \n\t"
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
-    "  paddw       %%mm0, %%mm4     \n\t"	/* accumulate difference... */
-    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
-    "  paddw       %%mm1, %%mm5     \n\t"	/* accumulate difference... */
-    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
-    "  add         %3, %2           \n\t"	/* Inc pointer into the new data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  mov         $4, %%edi        \n\t"	/* 4 rows */
-    "2:                             \n\t"
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
-    "  paddw       %%mm0, %%mm6     \n\t"	/* accumulate difference... */
-    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
-    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
-    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
-    "  add         %3, %2           \n\t"	/* Inc pointer into the new data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 2b                       \n\t"
-
-    "  psubusw     %%mm6, %%mm7     \n\t"
-    "  paddw       %%mm6, %%mm7     \n\t" 	/* mm7 = max(mm7, mm6) */
-    "  psubusw     %%mm4, %%mm5     \n\t" 	
-    "  paddw       %%mm4, %%mm5     \n\t" 	/* mm5 = max(mm5, mm4) */
-    "  psubusw     %%mm5, %%mm7     \n\t" 	
-    "  paddw       %%mm5, %%mm7     \n\t" 	/* mm7 = max(mm5, mm7) */
-    "  movq        %%mm7, %%mm6     \n\t"
-    "  psrlq       $32, %%mm6       \n\t"
-    "  psubusw     %%mm6, %%mm7     \n\t" 	
-    "  paddw       %%mm6, %%mm7     \n\t" 	/* mm7 = max(mm5, mm7) */
-    "  movq        %%mm7, %%mm6     \n\t"
-    "  psrlq       $16, %%mm6       \n\t"
-    "  psubusw     %%mm6, %%mm7     \n\t" 	
-    "  paddw       %%mm6, %%mm7     \n\t" 	/* mm7 = max(mm5, mm7) */
-    "  movd        %%mm7, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t"
-
-     : "=r" (MaxSad),
-       "+r" (Src1), 
-       "+r" (Src2) 
-     : "r" (stride)
-     : "memory", "edi"
-  );
-
-  return MaxSad;
-}
-
-static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
-		       	    unsigned char *ptr2, ogg_uint32_t stride2)
-{
-  ogg_uint32_t  DiffVal;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-    "  pxor        %%mm6, %%mm6     \n\t"	/* zero out mm6 for unpack */
-    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 contains the result */
-    ".rept 8                         \n\t"
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%2), %%mm1      \n\t"
-    "  movq        %%mm0, %%mm2     \n\t"
-
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
-    "  paddw       %%mm0, %%mm7     \n\t"	/* accumulate difference... */
-    "  punpckhbw   %%mm6, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
-    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
-    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
-    "  add         %4, %2           \n\t"	/* Inc pointer into ref data */
-    ".endr                          \n\t"
-
-    "  movq        %%mm7, %%mm0     \n\t"
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddw       %%mm0, %%mm7     \n\t"
-    "  movq        %%mm7, %%mm0     \n\t"
-    "  psrlq       $16, %%mm7       \n\t"
-    "  paddw       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t"
-
-     : "=m" (DiffVal),
-       "+r" (ptr1), 
-       "+r" (ptr2) 
-     : "r" (stride1),
-       "r" (stride2)
-     : "memory"
-  );
-
-  return DiffVal;
-}
-
-static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
-		       		  unsigned char *ptr2, ogg_uint32_t stride2, 
-			   	  ogg_uint32_t thres)
-{
-  return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
-}
-
-static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
-		                      unsigned char *RefDataPtr1,
-			              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
-			              ogg_uint32_t thres)
-{
-  ogg_uint32_t  DiffVal;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pcmpeqd     %%mm5, %%mm5     \n\t"	/* fefefefefefefefe in mm5 */
-    "  paddb       %%mm5, %%mm5     \n\t"
-   
-    "  pxor        %%mm6, %%mm6     \n\t"	/* zero out mm6 for unpack */
-    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 contains the result */
-    "  mov         $8, %%edi        \n\t"	/* 8 rows */
-    "1:                             \n\t"
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-
-    "  movq        (%2), %%mm2      \n\t"
-    "  movq        (%3), %%mm3      \n\t"	/* take average of mm2 and mm3 */
-    "  movq        %%mm2, %%mm1     \n\t"
-    "  pand        %%mm3, %%mm1     \n\t"
-    "  pxor        %%mm2, %%mm3     \n\t"
-    "  pand        %%mm5, %%mm3     \n\t"
-    "  psrlq       $1, %%mm3        \n\t"
-    "  paddb       %%mm3, %%mm1     \n\t"
-
-    "  movq        %%mm0, %%mm2     \n\t"
-
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"    	/* and or gives abs difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
-    "  paddw       %%mm0, %%mm7     \n\t"	/* accumulate difference... */
-    "  punpckhbw   %%mm6, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
-    "  add         %4, %1           \n\t"	/* Inc pointer into the new data */
-    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
-    "  add         %5, %2           \n\t"	/* Inc pointer into ref data */
-    "  add         %5, %3           \n\t"	/* Inc pointer into ref data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  movq        %%mm7, %%mm0     \n\t"
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddw       %%mm0, %%mm7     \n\t"
-    "  movq        %%mm7, %%mm0     \n\t"
-    "  psrlq       $16, %%mm7       \n\t"
-    "  paddw       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t"
-
-     : "=m" (DiffVal),
-       "+r" (SrcData), 
-       "+r" (RefDataPtr1), 
-       "+r" (RefDataPtr2) 
-     : "m" (SrcStride),
-       "m" (RefStride)
-     : "edi", "memory"
-  );
-
-  return DiffVal;
-}
-
-static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
-{
-  ogg_uint32_t  XSum;
-  ogg_uint32_t  XXSum;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm5, %%mm5     \n\t"
-    "  pxor        %%mm6, %%mm6     \n\t"
-    "  pxor        %%mm7, %%mm7     \n\t"
-    "  mov         $8, %%edi        \n\t"
-    "1:                             \n\t"
-    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        %%mm0, %%mm2     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"
-    "  punpckhbw   %%mm6, %%mm2     \n\t"
-
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  paddw       %%mm2, %%mm5     \n\t"
-
-    "  pmaddwd     %%mm0, %%mm0     \n\t"
-    "  pmaddwd     %%mm2, %%mm2     \n\t"
-    
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  paddd       %%mm2, %%mm7     \n\t"
-
-    "  add         %3, %2           \n\t"	/* Inc pointer into src data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $32, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $16, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movd        %%mm5, %%edi     \n\t"
-    "  movsx       %%di, %%edi      \n\t"
-    "  movl        %%edi, %0        \n\t"
-
-    "  movq        %%mm7, %%mm0     \n\t"
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %1        \n\t"
-
-     : "=r" (XSum),
-       "=r" (XXSum),
-       "+r" (DataPtr) 
-     : "r" (Stride)
-     : "edi", "memory"
-  );
-
-  /* Compute population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ) );
-}
-
-static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
-		                 unsigned char *RefDataPtr, ogg_uint32_t RefStride)
-{
-  ogg_uint32_t  XSum;
-  ogg_uint32_t  XXSum;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm5, %%mm5     \n\t"
-    "  pxor        %%mm6, %%mm6     \n\t"
-    "  pxor        %%mm7, %%mm7     \n\t"
-    "  mov         $8, %%edi        \n\t"
-    "1:                             \n\t"
-    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%3), %%mm1      \n\t"
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  movq        %%mm1, %%mm3     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"
-    "  punpcklbw   %%mm6, %%mm1     \n\t"
-    "  punpckhbw   %%mm6, %%mm2     \n\t"
-    "  punpckhbw   %%mm6, %%mm3     \n\t"
-
-    "  psubsw      %%mm1, %%mm0     \n\t"
-    "  psubsw      %%mm3, %%mm2     \n\t"
-
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  paddw       %%mm2, %%mm5     \n\t"
-
-    "  pmaddwd     %%mm0, %%mm0     \n\t"
-    "  pmaddwd     %%mm2, %%mm2     \n\t"
-    
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  paddd       %%mm2, %%mm7     \n\t"
-
-    "  add         %4, %2           \n\t"	/* Inc pointer into src data */
-    "  add         %5, %3           \n\t"	/* Inc pointer into ref data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $32, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $16, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movd        %%mm5, %%edi     \n\t"
-    "  movsx       %%di, %%edi      \n\t"
-    "  movl        %%edi, %0        \n\t"
-
-    "  movq        %%mm7, %%mm0     \n\t"
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %1        \n\t"
-
-     : "=m" (XSum),
-       "=m" (XXSum),
-       "+r" (SrcData), 
-       "+r" (RefDataPtr) 
-     : "m" (SrcStride),
-       "m" (RefStride)
-     : "edi", "memory"
-  );
-
-  /* Compute and return population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ));
-}
-
-static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
-		                     unsigned char *RefDataPtr1,
-				     unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
-{
-  ogg_uint32_t XSum;
-  ogg_uint32_t XXSum;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pcmpeqd     %%mm4, %%mm4     \n\t"	/* fefefefefefefefe in mm4 */
-    "  paddb       %%mm4, %%mm4     \n\t"
-    "  pxor        %%mm5, %%mm5     \n\t"
-    "  pxor        %%mm6, %%mm6     \n\t"
-    "  pxor        %%mm7, %%mm7     \n\t"
-    "  mov         $8, %%edi        \n\t"
-    "1:                             \n\t"
-    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
-
-    "  movq        (%3), %%mm2      \n\t"
-    "  movq        (%4), %%mm3      \n\t"	/* take average of mm2 and mm3 */
-    "  movq        %%mm2, %%mm1     \n\t"
-    "  pand        %%mm3, %%mm1     \n\t"
-    "  pxor        %%mm2, %%mm3     \n\t"
-    "  pand        %%mm4, %%mm3     \n\t"
-    "  psrlq       $1, %%mm3        \n\t"
-    "  paddb       %%mm3, %%mm1     \n\t"
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  movq        %%mm1, %%mm3     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"
-    "  punpcklbw   %%mm6, %%mm1     \n\t"
-    "  punpckhbw   %%mm6, %%mm2     \n\t"
-    "  punpckhbw   %%mm6, %%mm3     \n\t"
-
-    "  psubsw      %%mm1, %%mm0     \n\t"
-    "  psubsw      %%mm3, %%mm2     \n\t"
-
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  paddw       %%mm2, %%mm5     \n\t"
-
-    "  pmaddwd     %%mm0, %%mm0     \n\t"
-    "  pmaddwd     %%mm2, %%mm2     \n\t"
-    
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  paddd       %%mm2, %%mm7     \n\t"
-
-    "  add         %5, %2           \n\t"	/* Inc pointer into src data */
-    "  add         %6, %3           \n\t"	/* Inc pointer into ref data */
-    "  add         %6, %4           \n\t"	/* Inc pointer into ref data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $32, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $16, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movd        %%mm5, %%edi     \n\t"
-    "  movsx       %%di, %%edi      \n\t"
-    "  movl        %%edi, %0        \n\t"
-
-    "  movq        %%mm7, %%mm0     \n\t"
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %1        \n\t"
-
-     : "=m" (XSum),
-       "=m" (XXSum),
-       "+r" (SrcData), 
-       "+r" (RefDataPtr1),
-       "+r" (RefDataPtr2) 
-     : "m" (SrcStride),
-       "m" (RefStride)
-     : "edi", "memory"
-  );
-
-  /* Compute and return population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ));
-}
-
-static void restore_fpu (void)
-{
-  __asm__ __volatile__ (
-    "  emms                         \n\t"
-  );
-}
-
-void dsp_i386_mmx_init(DspFunctions *funcs)
-{
-  funcs->restore_fpu = restore_fpu;
-  funcs->sub8x8 = sub8x8__mmx;
-  funcs->sub8x8_128 = sub8x8_128__mmx;
-  funcs->sub8x8avg2 = sub8x8avg2__mmx;
-  funcs->row_sad8 = row_sad8__mmx;
-  funcs->col_sad8x8 = col_sad8x8__mmx;
-  funcs->sad8x8 = sad8x8__mmx;
-  funcs->sad8x8_thres = sad8x8_thres__mmx;
-  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx;
-  funcs->intra8x8_err = intra8x8_err__mmx;
-  funcs->inter8x8_err = inter8x8_err__mmx;
-  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
-}
-

Copied: branches/theora-mmx/lib/x86_32/dsp_mmx.c (from rev 11337, branches/theora-mmx/lib/i386/dsp_mmx.c)

Deleted: branches/theora-mmx/lib/x86_32/dsp_mmxext.c
===================================================================
--- branches/theora-mmx/lib/i386/dsp_mmxext.c	2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/x86_32/dsp_mmxext.c	2006-05-03 23:31:00 UTC (rev 11338)
@@ -1,316 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include <stdlib.h>
-#include "dsp.h"
-
-static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
-		       	    unsigned char *ptr2, ogg_uint32_t stride2)
-{ /* Returns the sum of absolute byte differences (SAD) of two 8x8 blocks. */
-  ogg_uint32_t  DiffVal;
-  /* Rows of ptr1 / ptr2 are stride1 / stride2 bytes apart; .rept unrolls the row loop. */
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-    "  pxor %%mm7, %%mm7            \n\t" 	/* mm7 = 0, running SAD total */
-    /* Rows 0-6: accumulate one row, then advance both pointers. */
-    ".rept 7                        \n\t"
-    "  movq (%1), %%mm0             \n\t"	/* 8 bytes of the ptr1 row */
-    "  movq (%2), %%mm1             \n\t"	/* 8 bytes of the ptr2 row */
-    "  psadbw %%mm1, %%mm0          \n\t"	/* mm0 = sum of |a-b| over the 8 bytes */
-    "  add %3, %1                   \n\t"	/* ptr1 += stride1 */
-    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate the row SAD */
-    "  add %4, %2                   \n\t"	/* ptr2 += stride2 */
-    ".endr                          \n\t"
-    /* Row 7: same computation, but the pointers are not advanced afterwards. */
-    "  movq (%1), %%mm0             \n\t"	/* 8 bytes of the ptr1 row */
-    "  movq (%2), %%mm1             \n\t"	/* 8 bytes of the ptr2 row */
-    "  psadbw %%mm1, %%mm0          \n\t"	/* mm0 = row SAD */
-    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate the row SAD */
-    "  movd %%mm7, %0               \n\t"	/* low 32 bits of mm7 -> DiffVal */
-
-     : "=r" (DiffVal),	/* %0 */
-       "+r" (ptr1), 	/* %1 */
-       "+r" (ptr2) 	/* %2 */
-     : "r" (stride1),	/* %3 */
-       "r" (stride2)	/* %4 */
-     : "memory"
-  );
-
-  return DiffVal;
-}
-
-static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
-		       		  unsigned char *ptr2, ogg_uint32_t stride2, 
-			   	  ogg_uint32_t thres)
-{ /* Returns the full 8x8 sum of absolute byte differences; thres is never referenced in this body. */
-  ogg_uint32_t  DiffVal;
-  /* Rows of ptr1 / ptr2 are stride1 / stride2 bytes apart. */
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-    "  pxor %%mm7, %%mm7            \n\t" 	/* mm7 = 0, running SAD total */
-    /* All 8 rows, unrolled by the assembler; both pointers advance after every row. */
-    ".rept 8                        \n\t"
-    "  movq (%1), %%mm0             \n\t"	/* 8 bytes of the ptr1 row */
-    "  movq (%2), %%mm1             \n\t"	/* 8 bytes of the ptr2 row */
-    "  psadbw %%mm1, %%mm0          \n\t"	/* mm0 = sum of |a-b| over the 8 bytes */
-    "  add %3, %1                   \n\t"	/* ptr1 += stride1 */
-    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate the row SAD */
-    "  add %4, %2                   \n\t"	/* ptr2 += stride2 */
-    ".endr                          \n\t"
-
-    "  movd %%mm7, %0               \n\t"	/* low 32 bits of mm7 -> DiffVal */
-
-     : "=r" (DiffVal),	/* %0 */
-       "+r" (ptr1), 	/* %1 */
-       "+r" (ptr2) 	/* %2 */
-     : "r" (stride1),	/* %3 */
-       "r" (stride2)	/* %4 */
-     : "memory"
-  );
-
-  return DiffVal;
-}
-
-static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
-		                      unsigned char *RefDataPtr1,
-			              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
-			              ogg_uint32_t thres)
-{ /* 8x8 SAD between SrcData and the per-byte average of the two reference blocks. */
-  ogg_uint32_t  DiffVal;
-  /* thres is never referenced in this body; the strides and DiffVal are memory operands. */
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-    "  pxor %%mm7, %%mm7            \n\t" 	/* mm7 = 0, running SAD total */
-    ".rept 8                        \n\t"	/* 8 rows, unrolled by the assembler */
-    "  movq (%1), %%mm0             \n\t"	/* 8 bytes of the source row */
-    "  movq (%2), %%mm1             \n\t"	/* 8 bytes of the RefDataPtr1 row */
-    "  movq (%3), %%mm2             \n\t"	/* 8 bytes of the RefDataPtr2 row */
-    "  pavgb %%mm2, %%mm1           \n\t"	/* mm1 = per-byte average of the two refs (pavgb rounds up) */
-    "  psadbw %%mm1, %%mm0          \n\t"	/* mm0 = sum of |src - avg| over the 8 bytes */
-
-    "  add %4, %1                   \n\t"	/* SrcData += SrcStride */
-    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate the row SAD */
-    "  add %5, %2                   \n\t"	/* RefDataPtr1 += RefStride */
-    "  add %5, %3                   \n\t"	/* RefDataPtr2 += RefStride */
-    ".endr                          \n\t"
-
-    "  movd %%mm7, %0               \n\t"	/* low 32 bits of mm7 -> DiffVal */
-     : "=m" (DiffVal),	/* %0 */
-       "+r" (SrcData), 	/* %1 */
-       "+r" (RefDataPtr1), 	/* %2 */
-       "+r" (RefDataPtr2) 	/* %3 */
-     : "m" (SrcStride),	/* %4 */
-       "m" (RefStride)	/* %5 */
-     : "memory"
-  );
-
-  return DiffVal;
-}
-		
-static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
-{ /* Returns the larger of two 4-byte SADs: bytes 0-3 and bytes 4-7 of Src1 vs Src2. */
-  ogg_uint32_t MaxSad;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-    /* movd loads only 4 bytes, so each psadbw below sums four absolute differences. */
-    "  movd        (%1), %%mm0      \n\t"	/* Src1[0..3] */
-    "  movd        (%2), %%mm1      \n\t"	/* Src2[0..3] */
-    "  psadbw      %%mm0, %%mm1     \n\t"	/* mm1 = SAD of bytes 0-3 */
-    "  movd        4(%1), %%mm2     \n\t"	/* Src1[4..7] */
-    "  movd        4(%2), %%mm3     \n\t"	/* Src2[4..7] */
-    "  psadbw      %%mm2, %%mm3     \n\t"	/* mm3 = SAD of bytes 4-7 */
-
-    "  pmaxsw      %%mm1, %%mm3     \n\t"	/* mm3 = word-wise max of the two SADs */
-    "  movd        %%mm3, %0        \n\t"	/* store the low 32 bits to MaxSad */
-    "  andl        $0xffff, %0      \n\t"	/* keep only the low 16-bit word */
-
-     : "=m" (MaxSad),	/* %0 */
-       "+r" (Src1), 	/* %1 */
-       "+r" (Src2) 	/* %2 */
-     :
-     : "memory"
-  );
-
-  return MaxSad;
-}
-
-static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
-		                    ogg_uint32_t stride)
-{ /* Returns the largest per-column SAD, taken over rows 0-3 and rows 4-7 separately. */
-  ogg_uint32_t MaxSad;
-  /* Both blocks use the same stride. edi is the row counter and is listed as clobbered. */
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm3, %%mm3     \n\t"	/* zero out mm3 for unpack */
-    "  pxor        %%mm4, %%mm4     \n\t"	/* mm4: rows 0-3, columns 0-3 (four word sums) */
-    "  pxor        %%mm5, %%mm5     \n\t" 	/* mm5: rows 0-3, columns 4-7 */
-    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6: rows 4-7, columns 0-3 */
-    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7: rows 4-7, columns 4-7 */
-    "  mov         $4, %%edi        \n\t"	/* 4 rows */
-    "1:                             \n\t"
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes of Src1 */
-    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes of Src2 */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B, saturating at 0 */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A, saturating at 0 */
-    "  por         %%mm1, %%mm0     \n\t"      	/* OR of the two gives the absolute difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack low four bytes to words for accumulation */
-    "  paddw       %%mm0, %%mm4     \n\t"	/* accumulate columns 0-3 */
-    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to words */
-    "  paddw       %%mm1, %%mm5     \n\t"	/* accumulate columns 4-7 */
-    "  add         %3, %1           \n\t"	/* Src1 += stride */
-    "  add         %3, %2           \n\t"	/* Src2 += stride */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 1b                       \n\t"
-    /* Second half: identical loop body, accumulating into mm6 / mm7 instead. */
-    "  mov         $4, %%edi        \n\t"	/* 4 rows */
-    "2:                             \n\t"
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes of Src1 */
-    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes of Src2 */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B, saturating at 0 */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A, saturating at 0 */
-    "  por         %%mm1, %%mm0     \n\t"      	/* OR of the two gives the absolute difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack low four bytes to words for accumulation */
-    "  paddw       %%mm0, %%mm6     \n\t"	/* accumulate columns 0-3 */
-    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to words */
-    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate columns 4-7 */
-    "  add         %3, %1           \n\t"	/* Src1 += stride */
-    "  add         %3, %2           \n\t"	/* Src2 += stride */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 2b                       \n\t"
-    /* Reduce the 16 word sums in mm4..mm7 to a single maximum in the low word of mm7. */
-    "  pmaxsw      %%mm6, %%mm7     \n\t"	/* lane-wise max of the rows 4-7 sums */
-    "  pmaxsw      %%mm4, %%mm5     \n\t"	/* lane-wise max of the rows 0-3 sums */
-    "  pmaxsw      %%mm5, %%mm7     \n\t"	/* lane-wise max of all four registers */
-    "  movq        %%mm7, %%mm6     \n\t"
-    "  psrlq       $32, %%mm6       \n\t"	/* bring words 2,3 down to 0,1 */
-    "  pmaxsw      %%mm6, %%mm7     \n\t"
-    "  movq        %%mm7, %%mm6     \n\t"
-    "  psrlq       $16, %%mm6       \n\t"	/* bring word 1 down to 0 */
-    "  pmaxsw      %%mm6, %%mm7     \n\t"	/* word 0 = overall max */
-    "  movd        %%mm7, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t"	/* discard everything above word 0 */
-
-     : "=r" (MaxSad),	/* %0 */
-       "+r" (Src1), 	/* %1 */
-       "+r" (Src2) 	/* %2 */
-     : "r" (stride)	/* %3 */
-     : "memory", "edi"
-  );
-
-  return MaxSad;
-}
-
-static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
-		                     unsigned char *RefDataPtr1,
-				     unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
-{ /* Error metric of an 8x8 source block against the per-byte average of two reference blocks. */
-  ogg_uint32_t XSum;	/* sum of the 64 differences (written sign-extended from 16 bits) */
-  ogg_uint32_t XXSum;	/* sum of the 64 squared differences */
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm4, %%mm4     \n\t"	/* zero, used to unpack the reference average */
-    "  pxor        %%mm5, %%mm5     \n\t"	/* mm5 = four word accumulators for the differences */
-    "  pxor        %%mm6, %%mm6     \n\t"	/* zero, used to unpack the source */
-    "  pxor        %%mm7, %%mm7     \n\t"	/* mm7 = two dword accumulators for the squares */
-    "  mov         $8, %%edi        \n\t"	/* 8 rows */
-    "1:                             \n\t"
-    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
-
-    "  movq        (%3), %%mm2      \n\t"	/* 8 bytes of the RefDataPtr1 row */
-    "  movq        (%4), %%mm1      \n\t"	/* 8 bytes of the RefDataPtr2 row */
-    "  pavgb       %%mm2, %%mm1     \n\t"	/* take average of mm2 and mm1 (pavgb rounds up) */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  movq        %%mm1, %%mm3     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* source bytes 0-3 as words */
-    "  punpcklbw   %%mm4, %%mm1     \n\t"	/* average bytes 0-3 as words */
-    "  punpckhbw   %%mm6, %%mm2     \n\t"	/* source bytes 4-7 as words */
-    "  punpckhbw   %%mm4, %%mm3     \n\t"	/* average bytes 4-7 as words */
-
-    "  psubsw      %%mm1, %%mm0     \n\t"	/* differences, bytes 0-3 */
-    "  psubsw      %%mm3, %%mm2     \n\t"	/* differences, bytes 4-7 */
-
-    "  paddw       %%mm0, %%mm5     \n\t"	/* accumulate the differences */
-    "  paddw       %%mm2, %%mm5     \n\t"
-
-    "  pmaddwd     %%mm0, %%mm0     \n\t"	/* square the differences, add adjacent pairs into dwords */
-    "  pmaddwd     %%mm2, %%mm2     \n\t"
-    
-    "  paddd       %%mm0, %%mm7     \n\t"	/* accumulate the squares */
-    "  paddd       %%mm2, %%mm7     \n\t"
-
-    "  add         %5, %2           \n\t"	/* Inc pointer into src data */
-    "  add         %6, %3           \n\t"	/* Inc pointer into ref data */
-    "  add         %6, %4           \n\t"	/* Inc pointer into ref data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 1b                       \n\t"
-    /* Horizontal add of the four word lanes of mm5 into its low word. */
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $32, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $16, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movd        %%mm5, %%edi     \n\t"
-    "  movsx       %%di, %%edi      \n\t"	/* sign-extend the 16-bit total to 32 bits */
-    "  movl        %%edi, %0        \n\t"	/* -> XSum */
-    /* Add the two dword lanes of mm7 to get the total of the squares. */
-    "  movq        %%mm7, %%mm0     \n\t"
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %1        \n\t"	/* -> XXSum */
-
-     : "=m" (XSum),	/* %0 */
-       "=m" (XXSum),	/* %1 */
-       "+r" (SrcData), 	/* %2 */
-       "+r" (RefDataPtr1),	/* %3 */
-       "+r" (RefDataPtr2) 	/* %4 */
-     : "m" (SrcStride),	/* %5 */
-       "m" (RefStride)	/* %6 */
-     : "edi", "memory"
-  );
-
-  /* Compute and return population variance as mis-match metric: 64*XXSum - XSum^2, with no division. */
-  return (( (XXSum<<6) - XSum*XSum ));
-}
-
-void dsp_i386_mmxext_init(DspFunctions *funcs)
-{ /* Installs the six __mmxext routines of this file; no other entry of *funcs is written. */
-  funcs->row_sad8 = row_sad8__mmxext;
-  funcs->col_sad8x8 = col_sad8x8__mmxext;
-  funcs->sad8x8 = sad8x8__mmxext;
-  funcs->sad8x8_thres = sad8x8_thres__mmxext;
-  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;
-  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext;
-}
-

Copied: branches/theora-mmx/lib/x86_32/dsp_mmxext.c (from rev 11337, branches/theora-mmx/lib/i386/dsp_mmxext.c)

Deleted: branches/theora-mmx/lib/x86_32/fdct_mmx.c
===================================================================
--- branches/theora-mmx/lib/i386/fdct_mmx.c	2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/x86_32/fdct_mmx.c	2006-05-03 23:31:00 UTC (rev 11338)
@@ -1,340 +0,0 @@
-;//==========================================================================
-;//
-;//  THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
-;//  KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-;//  IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR
-;//  PURPOSE.
-;//
-;//  Copyright (c) 1999 - 2001  On2 Technologies Inc. All Rights Reserved.
-;//
-;//--------------------------------------------------------------------------
-
-#include <theora/theora.h>
-#include "dsp.h"
-
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC1S7 = 0x0fb15fb15fb15fb15LL; /* each constant repeats one 16-bit factor in all four word lanes */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC2S6 = 0x0ec83ec83ec83ec83LL; /* they are referenced only by symbol name, through M(), inside the asm text */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC4S4 = 0x0b505b505b505b505LL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC6S2 = 0x061f861f861f861f8LL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC7S1 = 0x031f131f131f131f1LL; /* NOTE(review): values look like cos(k*pi/16) * 65536 for k = 1..7 - confirm */
-
-#if defined(__MINGW32__) || defined(__CYGWIN__) || \
-    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
-# define M(a) "_" #a /* M(sym) -> "_sym"; presumably matches the C symbol prefix on these targets - confirm */
-#else
-# define M(a) #a /* M(sym) -> "sym": the name is pasted into the asm text unchanged */
-#endif
-
-/***********************************************************************
- *	File:			fdct_m.asm
- *
- *	Description:
- *					This function performs a 2-D forward DCT on an 8x8 block
- *					
- *
- *	Input:			Pointers to input source data buffer and destination 
- *					buffer.
- *
- *	Note:			none
- *
- *	Special Notes:	We try to do the truncation right to match the result 
- *					of the c version. 
- *
- ************************************************************************/
-
-/* One forward DCT pass on four 16-bit lanes: reads ip0..ip7, writes the results back to ip0..ip7; temp is a quadword of scratch memory. */
-#define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp)                        \
-  "  movq      " #ip0 ", %%mm0      \n\t"                                     \
-  "  movq      " #ip1 ", %%mm1      \n\t"                                     \
-  "  movq      " #ip3 ", %%mm2      \n\t"                                     \
-  "  movq      " #ip5 ", %%mm3      \n\t"                                     \
-  "  movq        %%mm0, %%mm4       \n\t"                                     \
-  "  movq        %%mm1, %%mm5       \n\t"                                     \
-  "  movq        %%mm2, %%mm6       \n\t"                                     \
-  "  movq        %%mm3, %%mm7       \n\t"                                     \
-                                                                              \
-  "  paddsw    " #ip7 ", %%mm0      \n\t" /* mm0 = ip0 + ip7 = is07 */        \
-  "  paddsw    " #ip2 ", %%mm1      \n\t" /* mm1 = ip1 + ip2 = is12 */        \
-  "  paddsw    " #ip4 ", %%mm2      \n\t" /* mm2 = ip3 + ip4 = is34 */        \
-  "  paddsw    " #ip6 ", %%mm3      \n\t" /* mm3 = ip5 + ip6 = is56 */        \
-  "  psubsw    " #ip7 ", %%mm4      \n\t" /* mm4 = ip0 - ip7 = id07 */        \
-  "  psubsw    " #ip2 ", %%mm5      \n\t" /* mm5 = ip1 - ip2 = id12 */        \
-                                                                              \
-  "  psubsw      %%mm2, %%mm0       \n\t" /* mm0 = is07 - is34 */             \
-                                                                              \
-  "  paddsw      %%mm2, %%mm2       \n\t"                                     \
-                                                                              \
-  "  psubsw    " #ip4 ", %%mm6      \n\t" /* mm6 = ip3 - ip4 = id34 */        \
-                                                                              \
-  "  paddsw      %%mm0, %%mm2       \n\t" /* mm2 = is07 + is34 = is0734 */    \
-  "  psubsw      %%mm3, %%mm1       \n\t" /* mm1 = is12 - is56 */             \
-  "  movq        %%mm0," #temp "    \n\t" /* Save is07 - is34 to free mm0; */ \
-  "  paddsw      %%mm3, %%mm3       \n\t"                                     \
-  "  paddsw      %%mm1, %%mm3       \n\t" /* mm3 = is12 + is56 = is1256 */    \
-                                                                              \
-  "  psubsw    " #ip6 ", %%mm7      \n\t" /* mm7 = ip5 - ip6 = id56 */        \
-  /* ------------------------------------------------------------------- */   \
-  "  psubsw      %%mm7, %%mm5       \n\t" /* mm5 = id12 - id56 */             \
-  "  paddsw      %%mm7, %%mm7       \n\t"                                     \
-  "  paddsw      %%mm5, %%mm7       \n\t" /* mm7 = id12 + id56 */             \
-  /* ------------------------------------------------------------------- */   \
-  "  psubsw      %%mm3, %%mm2       \n\t" /* mm2 = is0734 - is1256 */         \
-  "  paddsw      %%mm3, %%mm3       \n\t"                                     \
-                                                                              \
-  "  movq        %%mm2, %%mm0       \n\t" /* make a copy */                   \
-  "  paddsw      %%mm2, %%mm3       \n\t" /* mm3 = is0734 + is1256 */         \
-  /* Factors >= 0x8000 are negative to pmulhw, giving c*x - x; x is added back afterwards. */ \
-  "  pmulhw   "M(xC4S4)", %%mm0     \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \
-  "  paddw       %%mm2, %%mm0       \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \
-  "  psrlw       $15, %%mm2         \n\t" /* mm2 = 1 in lanes that were negative, else 0 */ \
-  "  paddw       %%mm2, %%mm0       \n\t" /* Truncate mm0, now it is op[4] */ \
-                                                                              \
-  "  movq        %%mm3, %%mm2       \n\t"                                     \
-  "  movq        %%mm0," #ip4 "     \n\t" /* save ip4, now mm0,mm2 are free */ \
-                                                                              \
-  "  movq        %%mm3, %%mm0       \n\t"                                     \
-  "  pmulhw   "M(xC4S4)", %%mm3     \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
-                                                                              \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-  "  paddw       %%mm0, %%mm3       \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 )	 */ \
-  "  paddw       %%mm2, %%mm3       \n\t" /* Truncate mm3, now it is op[0] */ \
-                                                                              \
-  "  movq        %%mm3," #ip0 "     \n\t"                                     \
-  /* ------------------------------------------------------------------- */   \
-  "  movq      " #temp ", %%mm3     \n\t" /* mm3 = irot_input_y */            \
-  "  pmulhw   "M(xC2S6)", %%mm3     \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \
-                                                                              \
-  "  movq      " #temp ", %%mm2     \n\t"                                     \
-  "  movq        %%mm2, %%mm0       \n\t"                                     \
-                                                                              \
-  "  psrlw       $15, %%mm2         \n\t" /* mm2 = 1 in lanes that were negative, else 0 */ \
-  "  paddw       %%mm0, %%mm3       \n\t" /* mm3 = xC2S6 * irot_input_y */    \
-                                                                              \
-  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
-  "  movq        %%mm5, %%mm0       \n\t"                                     \
-                                                                              \
-  "  movq        %%mm5, %%mm2       \n\t"                                     \
-  "  pmulhw   "M(xC6S2)", %%mm0     \n\t" /* mm0 = xC6S2 * irot_input_x */    \
-                                                                              \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-  "  paddw       %%mm2, %%mm0       \n\t" /* Truncated */                     \
-                                                                              \
-  "  paddsw      %%mm0, %%mm3       \n\t" /* ip[2] */                         \
-  "  movq        %%mm3," #ip2 "     \n\t" /* Save ip2 */                      \
-                                                                              \
-  "  movq        %%mm5, %%mm0       \n\t"                                     \
-  "  movq        %%mm5, %%mm2       \n\t"                                     \
-                                                                              \
-  "  pmulhw   "M(xC2S6)", %%mm5     \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  movq      " #temp ", %%mm3     \n\t"                                     \
-  "  paddw       %%mm0, %%mm5       \n\t" /* mm5 = xC2S6 * irot_input_x */    \
-                                                                              \
-  "  paddw       %%mm2, %%mm5       \n\t" /* Truncated */                     \
-  "  movq        %%mm3, %%mm2       \n\t"                                     \
-                                                                              \
-  "  pmulhw   "M(xC6S2)", %%mm3     \n\t" /* mm3 = xC6S2 * irot_input_y */    \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
-  "  psubsw      %%mm5, %%mm3       \n\t"                                     \
-                                                                              \
-  "  movq        %%mm3," #ip6 "     \n\t"                                     \
-  /* ------------------------------------------------------------------- */   \
-  "  movq     "M(xC4S4)", %%mm0     \n\t"                                     \
-  "  movq        %%mm1, %%mm2       \n\t"                                     \
-  "  movq        %%mm1, %%mm3       \n\t"                                     \
-                                                                              \
-  "  pmulhw      %%mm0, %%mm1       \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm3, %%mm1       \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) */ \
-  "  paddw       %%mm2, %%mm1       \n\t" /* Truncate mm1, now it is icommon_product1 */ \
-                                                                              \
-  "  movq        %%mm7, %%mm2       \n\t"                                     \
-  "  movq        %%mm7, %%mm3       \n\t"                                     \
-                                                                              \
-  "  pmulhw      %%mm0, %%mm7       \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm3, %%mm7       \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
-  "  paddw       %%mm2, %%mm7       \n\t" /* Truncate mm7, now it is icommon_product2 */ \
-  /* ------------------------------------------------------------------- */   \
-  "  pxor        %%mm0, %%mm0       \n\t" /* Clear mm0 */                     \
-  "  psubsw      %%mm6, %%mm0       \n\t" /* mm0 = - id34 */                  \
-                                                                              \
-  "  psubsw      %%mm7, %%mm0       \n\t" /* mm0 = - ( id34 + idcommon_product2 ) */ \
-  "  paddsw      %%mm6, %%mm6       \n\t"                                     \
-  "  paddsw      %%mm0, %%mm6       \n\t" /* mm6 = id34 - icommon_product2 */ \
-                                                                              \
-  "  psubsw      %%mm1, %%mm4       \n\t" /* mm4 = id07 - icommon_product1 */ \
-  "  paddsw      %%mm1, %%mm1       \n\t"                                     \
-  "  paddsw      %%mm4, %%mm1       \n\t" /* mm1 = id07 + icommon_product1 */ \
-  /* ------------------------------------------------------------------- */   \
-  "  movq     "M(xC1S7)", %%mm7     \n\t"                                     \
-  "  movq        %%mm1, %%mm2       \n\t"                                     \
-                                                                              \
-  "  movq        %%mm1, %%mm3       \n\t"                                     \
-  "  pmulhw      %%mm7, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \
-                                                                              \
-  "  movq     "M(xC7S1)", %%mm7     \n\t"                                     \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm3, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x */    \
-  "  paddw       %%mm2, %%mm1       \n\t" /* Truncated */                     \
-                                                                              \
-  "  pmulhw      %%mm7, %%mm3       \n\t" /* mm3 = xC7S1 * irot_input_x */    \
-  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
-                                                                              \
-  "  movq        %%mm0, %%mm5       \n\t"                                     \
-  "  movq        %%mm0, %%mm2       \n\t"                                     \
-                                                                              \
-  "  movq     "M(xC1S7)", %%mm7     \n\t"                                     \
-  "  pmulhw      %%mm7, %%mm0       \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \
-                                                                              \
-  "  movq     "M(xC7S1)", %%mm7     \n\t"                                     \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm5, %%mm0       \n\t" /* mm0 = xC1S7 * irot_input_y */    \
-  "  paddw       %%mm2, %%mm0       \n\t" /* Truncated */                     \
-                                                                              \
-  "  pmulhw      %%mm7, %%mm5       \n\t" /* mm5 = xC7S1 * irot_input_y */    \
-  "  paddw       %%mm2, %%mm5       \n\t" /* Truncated */                     \
-                                                                              \
-  "  psubsw      %%mm5, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \
-  "  paddsw      %%mm0, %%mm3       \n\t" /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = ip7 */ \
-                                                                              \
-  "  movq        %%mm1," #ip1 "     \n\t"                                     \
-  "  movq        %%mm3," #ip7 "     \n\t"                                     \
-  /* ------------------------------------------------------------------- */   \
-  "  movq     "M(xC3S5)", %%mm0     \n\t"                                     \
-  "  movq     "M(xC5S3)", %%mm1     \n\t"                                     \
-                                                                              \
-  "  movq        %%mm6, %%mm5       \n\t"                                     \
-  "  movq        %%mm6, %%mm7       \n\t"                                     \
-                                                                              \
-  "  movq        %%mm4, %%mm2       \n\t"                                     \
-  "  movq        %%mm4, %%mm3       \n\t"                                     \
-                                                                              \
-  "  pmulhw      %%mm0, %%mm4       \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \
-  "  pmulhw      %%mm1, %%mm6       \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \
-                                                                              \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-  "  psrlw       $15, %%mm5         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm3, %%mm4       \n\t" /* mm4 = xC3S5 * irot_input_x */    \
-  "  paddw       %%mm7, %%mm6       \n\t" /* mm6 = xC5S3 * irot_input_y */    \
-                                                                              \
-  "  paddw       %%mm2, %%mm4       \n\t" /* Truncated */                     \
-  "  paddw       %%mm5, %%mm6       \n\t" /* Truncated */                     \
-                                                                              \
-  "  psubsw      %%mm6, %%mm4       \n\t" /* ip3 */                           \
-  "  movq        %%mm4," #ip3 "     \n\t"                                     \
-                                                                              \
-  "  movq        %%mm3, %%mm4       \n\t"                                     \
-  "  movq        %%mm7, %%mm6       \n\t"                                     \
-                                                                              \
-  "  pmulhw      %%mm1, %%mm3       \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \
-  "  pmulhw      %%mm0, %%mm7       \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \
-                                                                              \
-  "  paddw       %%mm2, %%mm4       \n\t"                                     \
-  "  paddw       %%mm5, %%mm6       \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm4, %%mm3       \n\t" /* mm3 = xC5S3 * irot_input_x */    \
-  "  paddw       %%mm6, %%mm7       \n\t" /* mm7 = xC3S5 * irot_input_y */    \
-                                                                              \
-  "  paddw       %%mm7, %%mm3       \n\t" /* ip5 */                           \
-  "  movq        %%mm3," #ip5 "     \n\t" 
-
-#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,                  \
-		      op0,op1,op2,op3,op4,op5,op6,op7)                  \
-  "  movq      " #ip0 ", %%mm0      \n\t" /* mm0 = a0 a1 a2 a3 */       \
-  "  movq      " #ip4 ", %%mm4      \n\t" /* mm4 = e0 e1 e2 e3 */       \
-  "  movq      " #ip1 ", %%mm1      \n\t" /* mm1 = b0 b1 b2 b3 */       \
-  "  movq      " #ip5 ", %%mm5      \n\t" /* mm5 = f0 f1 f2 f3 */       \
-  "  movq      " #ip2 ", %%mm2      \n\t" /* mm2 = c0 c1 c2 c3 */       \
-  "  movq      " #ip6 ", %%mm6      \n\t" /* mm6 = g0 g1 g2 g3 */       \
-  "  movq      " #ip3 ", %%mm3      \n\t" /* mm3 = d0 d1 d2 d3 */       \
-  "  movq        %%mm1," #op1 "     \n\t" /* save  b0 b1 b2 b3 */       \
-  "  movq      " #ip7 ", %%mm7      \n\t" /* mm7 = h0 h1 h2 h3 */       \
-   /* Two 4x4 word transposes: ip4-7 -> op4-7, then ip0-3 -> op0-3; op0/op1 also serve as spill slots. */ \
-  "  movq        %%mm4, %%mm1       \n\t" /* mm1 = e3 e2 e1 e0 */       \
-  "  punpcklwd   %%mm5, %%mm4       \n\t" /* mm4 = f1 e1 f0 e0 */       \
-  "  movq        %%mm0," #op0 "     \n\t" /* save a3 a2 a1 a0  */       \
-  "  punpckhwd	 %%mm5, %%mm1       \n\t" /* mm1 = f3 e3 f2 e2 */       \
-  "  movq        %%mm6, %%mm0       \n\t" /* mm0 = g3 g2 g1 g0 */       \
-  "  punpcklwd	 %%mm7, %%mm6       \n\t" /* mm6 = h1 g1 h0 g0 */       \
-  "  movq        %%mm4, %%mm5       \n\t" /* mm5 = f1 e1 f0 e0 */       \
-  "  punpckldq   %%mm6, %%mm4       \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \
-  "  punpckhdq   %%mm6, %%mm5       \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \
-  "  movq        %%mm1, %%mm6       \n\t" /* mm6 = f3 e3 f2 e2 */       \
-  "  movq        %%mm4," #op4 "     \n\t"                               \
-  "  punpckhwd   %%mm7, %%mm0       \n\t" /* mm0 = h3 g3 h2 g2 */       \
-  "  movq        %%mm5," #op5 "     \n\t"                               \
-  "  punpckhdq   %%mm0, %%mm6       \n\t" /* mm6 = h3 g3 f3 e3 = MM7 */ \
-  "  movq      " #op0 ", %%mm4      \n\t" /* mm4 = a3 a2 a1 a0 */       \
-  "  punpckldq   %%mm0, %%mm1       \n\t" /* mm1 = h2 g2 f2 e2 = MM6 */ \
-  "  movq      " #op1 ", %%mm5      \n\t" /* mm5 = b3 b2 b1 b0 */       \
-  "  movq        %%mm4, %%mm0       \n\t" /* mm0 = a3 a2 a1 a0 */       \
-  "  movq        %%mm6," #op7 "     \n\t"                               \
-  "  punpcklwd   %%mm5, %%mm0       \n\t" /* mm0 = b1 a1 b0 a0 */       \
-  "  movq        %%mm1," #op6 "     \n\t"                               \
-  "  punpckhwd   %%mm5, %%mm4       \n\t" /* mm4 = b3 a3 b2 a2 */       \
-  "  movq        %%mm2, %%mm5       \n\t" /* mm5 = c3 c2 c1 c0 */       \
-  "  punpcklwd   %%mm3, %%mm2       \n\t" /* mm2 = d1 c1 d0 c0 */       \
-  "  movq        %%mm0, %%mm1       \n\t" /* mm1 = b1 a1 b0 a0 */       \
-  "  punpckldq   %%mm2, %%mm0       \n\t" /* mm0 = d0 c0 b0 a0 = MM0 */ \
-  "  punpckhdq   %%mm2, %%mm1       \n\t" /* mm1 = d1 c1 b1 a1 = MM1 */ \
-  "  movq        %%mm4, %%mm2       \n\t" /* mm2 = b3 a3 b2 a2 */       \
-  "  movq        %%mm0," #op0 "     \n\t"                               \
-  "  punpckhwd   %%mm3, %%mm5       \n\t" /* mm5 = d3 c3 d2 c2 */       \
-  "  movq        %%mm1," #op1 "     \n\t"                               \
-  "  punpckhdq   %%mm5, %%mm4       \n\t" /* mm4 = d3 c3 b3 a3 = MM3 */ \
-  "  punpckldq   %%mm5, %%mm2       \n\t" /* mm2 = d2 c2 b2 a2 = MM2 */ \
-  "  movq        %%mm4," #op3 "     \n\t"                               \
-  "  movq        %%mm2," #op2 "     \n\t"
-
-
-static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData)
-{ /* 2-D forward DCT of an 8x8 block of 16-bit values: reads InputData, writes OutputData. */
-  ogg_int64_t __attribute__((aligned(8))) align_tmp[16];
-  ogg_int16_t *const temp= (ogg_int16_t*)align_tmp;
-  /* Fdct_mmx is always handed (%2) as scratch, i.e. only the first quadword of align_tmp. */
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-    /*
-     * Input data is an 8x8 block with 16-byte rows. Passes 1-2 transpose rows
-     * 0-3, then rows 4-7, of InputData (%0) into OutputData (%1) and run Fdct_mmx on it.
-     */
-    Transpose_mmx (  (%0), 16(%0), 32(%0), 48(%0),  8(%0), 24(%0), 40(%0), 56(%0),
-		     (%1), 16(%1), 32(%1), 48(%1),  8(%1), 24(%1), 40(%1), 56(%1))
-    Fdct_mmx      (  (%1), 16(%1), 32(%1), 48(%1),  8(%1), 24(%1), 40(%1), 56(%1), (%2))
-
-    Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
-		   64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
-    Fdct_mmx      (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
-    /* Passes 3-4: transpose OutputData in place (first 8 bytes of every row, then the last 8) and run Fdct_mmx again. */
-    Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
-		    0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
-    Fdct_mmx      ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
-
-    Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
-		    8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
-    Fdct_mmx      ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
-
-    "  emms                         \n\t"	/* empty the MMX state before returning */
-    
-    : "+r" (InputData),	/* %0 */
-      "+r" (OutputData)	/* %1 */
-    : "r" (temp)	/* %2 */
-    : "memory"
-  );
-}
-
-void dsp_i386_mmx_fdct_init(DspFunctions *funcs)
-{ /* Sets only funcs->fdct_short, to the MMX forward DCT defined in this file. */
-  funcs->fdct_short = fdct_short__mmx;
-}

Copied: branches/theora-mmx/lib/x86_32/fdct_mmx.c (from rev 11337, branches/theora-mmx/lib/i386/fdct_mmx.c)

Deleted: branches/theora-mmx/lib/x86_32/recon_mmx.c
===================================================================
--- branches/theora-mmx/lib/i386/recon_mmx.c	2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/x86_32/recon_mmx.c	2006-05-03 23:31:00 UTC (rev 11338)
@@ -1,185 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: reconstruct.c,v 1.6 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include "codec_internal.h"
-
-/*
- * 0x80 in each of the eight bytes.  recon_intra8x8__mmx loads it into mm0 and
- * XORs it into packed signed bytes to bias them by 128.  It is referenced
- * only by name from inside an inline-asm string (via M(V128)), which is
- * presumably why it carries the "used" attribute.
- */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x8080808080808080LL;
-
-/*
- * M(a) turns the identifier `a` into the string spliced into inline-asm text.
- * On the targets listed below a leading underscore is prepended --
- * presumably because those toolchains prefix C symbol names with '_'.
- */
-#if defined(__MINGW32__) || defined(__CYGWIN__) || \
-	    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
-# define M(a) "_" #a
-#else
-# define M(a) #a
-#endif
-
-/*
- * Copy an 8x8 block of bytes from src to dest using 64-bit MMX moves.
- *
- * src    - first row of the source block
- * dest   - first row of the destination block
- * stride - distance in bytes between consecutive rows; the same value is
- *          applied to both src and dest
- *
- * No emms is executed here, so MMX state is still live on return.
- */
-static void copy8x8__mmx (unsigned char *src,
-                          unsigned char *dest,
-                          unsigned int stride)
-{
-  __asm__ __volatile__ (
-    "  .balign 16                      \n\t"
-
-    "  lea         (%2, %2, 2), %%edi  \n\t" /* edi = 3 * stride */
-
-    /* Load rows 0..3. */
-    "  movq        (%1), %%mm0         \n\t"
-    "  movq        (%1, %2), %%mm1     \n\t"
-    "  movq        (%1, %2, 2), %%mm2  \n\t"
-    "  movq        (%1, %%edi), %%mm3  \n\t"
-
-    "  lea         (%1, %2, 4), %1     \n\t" /* src += 4 * stride */
-
-    /* Store rows 0..3. */
-    "  movq        %%mm0, (%0)         \n\t"
-    "  movq        %%mm1, (%0, %2)     \n\t"
-    "  movq        %%mm2, (%0, %2, 2)  \n\t"
-    "  movq        %%mm3, (%0, %%edi)  \n\t"
-
-    "  lea         (%0, %2, 4), %0     \n\t" /* dest += 4 * stride */
-
-    /* Load rows 4..7. */
-    "  movq        (%1), %%mm0         \n\t"
-    "  movq        (%1, %2), %%mm1     \n\t"
-    "  movq        (%1, %2, 2), %%mm2  \n\t"
-    "  movq        (%1, %%edi), %%mm3  \n\t"
-
-    /* Store rows 4..7. */
-    "  movq        %%mm0, (%0)         \n\t"
-    "  movq        %%mm1, (%0, %2)     \n\t"
-    "  movq        %%mm2, (%0, %2, 2)  \n\t"
-    "  movq        %%mm3, (%0, %%edi)  \n\t"
-      /*
-       * Both pointers are advanced by the asm, so both must be read-write
-       * operands; an input-only operand may not be modified.  Operand
-       * numbering (%0 = dest, %1 = src, %2 = stride) is unchanged.
-       */
-      : "+a" (dest),
-        "+c" (src)
-      : "d" (stride)
-      : "memory", "edi", "mm0", "mm1", "mm2", "mm3"
-  );
-}
-
-/*
- * Reconstruct an intra-coded 8x8 block: each 16-bit value is saturated to a
- * signed byte, biased by 128 and stored as one output pixel.
- *
- * ReconPtr  - first row of the output block (8 bytes written per row)
- * ChangePtr - 64 consecutive 16-bit input values (128 bytes are read)
- * LineStep  - distance in bytes between consecutive output rows
- *
- * No emms is executed here, so MMX state is still live on return.
- */
-static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
-                                 ogg_uint32_t LineStep)
-{
-  __asm__ __volatile__ (
-    "  .balign 16                      \n\t"
-
-    "  movq     "M(V128)", %%mm0       \n\t" /* Set mm0 to 0x8080808080808080 */
-
-    "  lea         128(%1), %%edi      \n\t" /* Endpoint in input buffer */
-    "1:                                \n\t"
-    "  movq         (%1), %%mm2        \n\t" /* First four input values */
-
-    "  packsswb    8(%1), %%mm2        \n\t" /* pack with next(high) four values */
-    "  por         %%mm0, %%mm0        \n\t" /* mm0 | mm0: leaves mm0 unchanged */
-    "  pxor        %%mm0, %%mm2        \n\t" /* Convert result to unsigned (same as add 128) */
-    "  lea         16(%1), %1          \n\t" /* Step source buffer */
-    "  cmp         %%edi, %1           \n\t" /* are we done */
-
-    "  movq        %%mm2, (%0)         \n\t" /* store results */
-
-    "  lea         (%0, %2), %0        \n\t" /* Step output buffer (lea keeps the cmp flags) */
-    "  jc          1b                  \n\t" /* Loop back if we are not done */
-      /*
-       * ChangePtr is stepped by the loop, so it has to be a read-write
-       * operand; an input-only operand may not be modified.  Operand
-       * numbering (%0 = ReconPtr, %1 = ChangePtr, %2 = LineStep) is unchanged.
-       */
-      : "+r" (ReconPtr),
-        "+r" (ChangePtr)
-      : "r" (LineStep)
-      : "memory", "cc", "edi", "mm0", "mm2"
-  );
-}
-
-/*
- * Reconstruct an inter-coded 8x8 block: output = reference pixel + change,
- * added with signed saturation and then packed to unsigned bytes.
- *
- * ReconPtr  - first row of the output block (8 bytes written per row)
- * RefPtr    - first row of the reference block (8 bytes read per row)
- * ChangePtr - 64 consecutive 16-bit change values (128 bytes are read)
- * LineStep  - distance in bytes between consecutive rows; applied to both
- *             ReconPtr and RefPtr
- *
- * No emms is executed here, so MMX state is still live on return.
- */
-static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
-                                 ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
-{
-  __asm__ __volatile__ (
-    "  .balign 16                      \n\t"
-
-    "  pxor        %%mm0, %%mm0        \n\t" /* mm0 = 0, used to zero-extend bytes */
-    "  lea         128(%1), %%edi      \n\t" /* end of the change buffer */
-
-    "1:                                \n\t"
-    "  movq        (%2), %%mm2         \n\t" /* (+3 misaligned) 8 reference pixels */
-
-    "  movq        (%1), %%mm4         \n\t" /* first 4 changes */
-    "  movq        %%mm2, %%mm3        \n\t"
-    "  movq        8(%1), %%mm5        \n\t" /* last 4 changes */
-    "  punpcklbw   %%mm0, %%mm2        \n\t" /* turn first 4 refs into positive 16-bit #s */
-    "  paddsw      %%mm4, %%mm2        \n\t" /* add in first 4 changes */
-    "  punpckhbw   %%mm0, %%mm3        \n\t" /* turn last 4 refs into positive 16-bit #s */
-    "  paddsw      %%mm5, %%mm3        \n\t" /* add in last 4 changes */
-    "  add         %3, %2              \n\t" /* next row of reference pixels */
-    "  packuswb    %%mm3, %%mm2        \n\t" /* pack result to unsigned 8-bit values */
-    "  lea         16(%1), %1          \n\t" /* next row of changes */
-    "  cmp         %%edi, %1           \n\t" /* are we done? */
-
-    "  movq        %%mm2, (%0)         \n\t" /* store result */
-
-    "  lea         (%0, %3), %0        \n\t" /* next row of output (lea keeps the cmp flags) */
-    "  jc          1b                  \n\t"
-      /*
-       * ChangePtr and RefPtr are stepped by the loop, so they have to be
-       * read-write operands; an input-only operand may not be modified.
-       * Operand numbering (%0 = ReconPtr, %1 = ChangePtr, %2 = RefPtr,
-       * %3 = LineStep) is unchanged.
-       */
-      : "+r" (ReconPtr),
-        "+r" (ChangePtr),
-        "+r" (RefPtr)
-      : "r" (LineStep)
-      : "memory", "cc", "edi", "mm0", "mm2", "mm3", "mm4", "mm5"
-  );
-}
-
-/*
- * Reconstruct an inter-coded 8x8 block from two references:
- * output = ((ref1 + ref2) >> 1) + change, packed to unsigned bytes.
- *
- * ReconPtr  - first row of the output block (8 bytes written per row)
- * RefPtr1   - first row of the first reference block
- * RefPtr2   - first row of the second reference block
- * ChangePtr - 64 consecutive 16-bit change values (128 bytes are read)
- * LineStep  - distance in bytes between consecutive rows; applied to
- *             ReconPtr, RefPtr1 and RefPtr2
- *
- * No emms is executed here, so MMX state is still live on return.
- */
-static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
-                                      unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
-                                      ogg_uint32_t LineStep)
-{
-  __asm__ __volatile__ (
-    "  .balign 16                      \n\t"
-
-    "  pxor        %%mm0, %%mm0        \n\t" /* mm0 = 0, used to zero-extend bytes */
-    "  lea         128(%1), %%edi      \n\t" /* end of the change buffer */
-
-    "1:                                \n\t"
-    "  movq        (%2), %%mm2         \n\t" /* (+3 misaligned) 8 reference pixels */
-    "  movq        (%3), %%mm4         \n\t" /* (+3 misaligned) 8 reference pixels */
-
-    "  movq        %%mm2, %%mm3        \n\t"
-    "  punpcklbw   %%mm0, %%mm2        \n\t" /* mm2 = start ref1 as positive 16-bit #s */
-    "  movq        %%mm4, %%mm5        \n\t"
-    "  movq        (%1), %%mm6         \n\t" /* first 4 changes */
-    "  punpckhbw   %%mm0, %%mm3        \n\t" /* mm3 = end ref1 as positive 16-bit #s */
-    "  movq        8(%1), %%mm7        \n\t" /* last 4 changes */
-    "  punpcklbw   %%mm0, %%mm4        \n\t" /* mm4 = start ref2 as positive 16-bit #s */
-    "  punpckhbw   %%mm0, %%mm5        \n\t" /* mm5 = end ref2 as positive 16-bit #s */
-    "  paddw       %%mm4, %%mm2        \n\t" /* mm2 = start (ref1 + ref2) */
-    "  paddw       %%mm5, %%mm3        \n\t" /* mm3 = end (ref1 + ref2) */
-    "  psrlw       $1, %%mm2           \n\t" /* mm2 = start (ref1 + ref2)/2 */
-    "  psrlw       $1, %%mm3           \n\t" /* mm3 = end (ref1 + ref2)/2 */
-    /*
-     * Saturating add, as in recon_inter8x8__mmx: a wrapping paddw would turn
-     * a large positive sum negative, which packuswb then clamps to 0 instead
-     * of 255.
-     */
-    "  paddsw      %%mm6, %%mm2        \n\t" /* add changes to start */
-    "  paddsw      %%mm7, %%mm3        \n\t" /* add changes to end */
-    "  lea         16(%1), %1          \n\t" /* next row of changes */
-    "  packuswb    %%mm3, %%mm2        \n\t" /* pack start|end to unsigned 8-bit */
-    "  add         %4, %2              \n\t" /* next row of reference pixels */
-    "  add         %4, %3              \n\t" /* next row of reference pixels */
-    "  movq        %%mm2, (%0)         \n\t" /* store result */
-    "  add         %4, %0              \n\t" /* next row of output */
-    "  cmp         %%edi, %1           \n\t" /* are we done? */
-    "  jc          1b                  \n\t"
-      /*
-       * ChangePtr, RefPtr1 and RefPtr2 are stepped by the loop, so they have
-       * to be read-write operands; an input-only operand may not be modified.
-       * Operand numbering (%0 = ReconPtr, %1 = ChangePtr, %2 = RefPtr1,
-       * %3 = RefPtr2, %4 = LineStep) is unchanged.
-       */
-      : "+r" (ReconPtr),
-        "+r" (ChangePtr),
-        "+r" (RefPtr1),
-        "+r" (RefPtr2)
-      : "m" (LineStep)
-      : "memory", "cc", "edi",
-        "mm0", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"
-  );
-}
-
-/*
- * Install the MMX block-copy and reconstruction routines into a DSP
- * function table.
- *
- * funcs - table to patch; the copy8x8, recon_intra8x8, recon_inter8x8 and
- *         recon_inter8x8_half entries are overwritten.
- */
-void dsp_i386_mmx_recon_init(DspFunctions *funcs)
-{
-  funcs->copy8x8             = &copy8x8__mmx;
-  funcs->recon_intra8x8      = &recon_intra8x8__mmx;
-  funcs->recon_inter8x8      = &recon_inter8x8__mmx;
-  funcs->recon_inter8x8_half = &recon_inter8x8_half__mmx;
-}
-

Copied: branches/theora-mmx/lib/x86_32/recon_mmx.c (from rev 11337, branches/theora-mmx/lib/i386/recon_mmx.c)