[xiph-commits] r11337 - in branches/theora-mmx: . lib lib/i386 lib/x86_64

Wed May 3 15:32:28 PDT 2006

Author: j
Date: 2006-05-03 15:32:17 -0700 (Wed, 03 May 2006)
New Revision: 11337

Added:
   branches/theora-mmx/lib/x86_64/
Modified:
   branches/theora-mmx/configure.ac
   branches/theora-mmx/lib/Makefile.am
   branches/theora-mmx/lib/cpu.c
   branches/theora-mmx/lib/dct.c
   branches/theora-mmx/lib/dsp.c
   branches/theora-mmx/lib/dsp.h
   branches/theora-mmx/lib/i386/dsp_mmx.c
   branches/theora-mmx/lib/i386/dsp_mmxext.c
   branches/theora-mmx/lib/i386/fdct_mmx.c
   branches/theora-mmx/lib/i386/recon_mmx.c
   branches/theora-mmx/lib/reconstruct.c
   branches/theora-mmx/lib/x86_64/dsp_mmx.c
   branches/theora-mmx/lib/x86_64/dsp_mmxext.c
   branches/theora-mmx/lib/x86_64/recon_mmx.c
Log:
support x86_64 processors
patch by Dan Lenski.

[ does not build as shared lib right now ]



Modified: branches/theora-mmx/configure.ac
===================================================================

--- branches/theora-mmx/configure.ac	2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/configure.ac	2006-05-03 22:32:17 UTC (rev 11337)
@@ -4,7 +4,7 @@
 dnl Initialization and Versioning
 dnl ------------------------------------------------
 
-AC_INIT(libtheora-mmx,[1.0alpha5])
+AC_INIT(libtheora-mmx,[1.0alpha6-svn])
 
 AC_CANONICAL_HOST
 AC_CANONICAL_TARGET
@@ -95,13 +95,20 @@
 
         case $host in 
         *)
-                DEBUG="-g -W -D__NO_MATH_INLINES"
+                DEBUG="-g -Wall -D__NO_MATH_INLINES"
                 CFLAGS="-Wall -O3 -fforce-addr -fomit-frame-pointer -finline-functions -funroll-loops"
-                PROFILE="-W -pg -g -O3 -fno-inline-functions";;
+                PROFILE="-Wall -pg -g -O3 -fno-inline-functions";;
         esac
 fi
 CFLAGS="$CFLAGS $cflags_save"
 
+cpu_x86_64=no
+case $target in
+	x86_64-*)
+		cpu_x86_64=yes ;;
+esac
+AM_CONDITIONAL([CPU_x86_64], [test x$cpu_x86_64 = xyes])
+
 # Test whenever ld supports -version-script
 AC_PROG_LD
 AC_PROG_LD_GNU

Modified: branches/theora-mmx/lib/Makefile.am
===================================================================
--- branches/theora-mmx/lib/Makefile.am	2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/Makefile.am	2006-05-03 22:32:17 UTC (rev 11337)
@@ -1,6 +1,14 @@
 INCLUDES = -I$(top_srcdir)/include
 
-EXTRA_DIST = Version_script.in
+EXTRA_DIST = Version_script.in \
+	i386/dsp_mmx.c \
+	i386/dsp_mmxext.c \
+	i386/recon_mmx.c \
+	i386/fdct_mmx.c \
+	x86_64/dsp_mmx.c \
+	x86_64/dsp_mmxext.c \
+	x86_64/recon_mmx.c \
+	x86_64/fdct_mmx.c
 
 lib_LTLIBRARIES = libtheora.la
 
@@ -10,6 +18,12 @@
 encoder_sources = dct_encode.c encode.c encoder_toplevel.c
 endif
 
+if CPU_x86_64
+arch_dir = x86_64
+else
+arch_dir = i386
+endif
+
 libtheora_la_SOURCES = \
 	blockmap.c \
 	comment.c \
@@ -30,10 +44,10 @@
 	toplevel.c \
 	cpu.c \
 	dsp.c \
-	i386/dsp_mmx.c \
-	i386/dsp_mmxext.c \
- 	i386/recon_mmx.c \
-	i386/fdct_mmx.c \
+	$(arch_dir)/dsp_mmx.c \
+	$(arch_dir)/dsp_mmxext.c \
+	$(arch_dir)/recon_mmx.c \
+	$(arch_dir)/fdct_mmx.c \
 	$(encoder_sources)
 
 noinst_HEADERS = \

Modified: branches/theora-mmx/lib/cpu.c
===================================================================
--- branches/theora-mmx/lib/cpu.c	2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/cpu.c	2006-05-03 22:32:17 UTC (rev 11337)
@@ -19,24 +19,45 @@
 
 ogg_uint32_t cpu_flags = 0;
 
+/* Run CPUID for leaf `op`; EAX/EBX/ECX/EDX are stored through the pointers.
+ *
+ * rbx/ebx is swapped with a compiler-chosen scratch register around CPUID
+ * instead of being pushed/popped: a push inside inline asm can overwrite the
+ * x86_64 red zone, and if the "=r" output were allocated to rbx itself the
+ * pop would throw the CPUID result away.  The xchg form is correct whichever
+ * register is chosen for operand 1 and needs no stack access; "=&r" marks
+ * that operand early-clobber since it is written before the asm finishes. */
+void
+cpuid(ogg_int32_t op, ogg_uint32_t *eax, ogg_uint32_t *ebx, ogg_uint32_t *ecx, ogg_uint32_t *edx)
+{
+#if defined(__x86_64__)
+  asm volatile ("xchgq %%rbx,%q1 \n\t"   /* %q1: 64-bit name of the scratch reg */
+                "cpuid           \n\t"
+                "xchgq %%rbx,%q1"
+              : "=a" (*eax), "=&r" (*ebx), "=c" (*ecx), "=d" (*edx)
+              : "a" (op)
+              : "cc");
+#else /* assume i386 */
+  asm volatile ("xchgl %%ebx,%1 \n\t"
+                "cpuid          \n\t"
+                "xchgl %%ebx,%1"
+              : "=a" (*eax), "=&r" (*ebx), "=c" (*ecx), "=d" (*edx)
+              : "a" (op)
+              : "cc");
+#endif
+}
+
 #if 1
 static ogg_uint32_t cpu_get_flags (void)
 {
   ogg_uint32_t eax, ebx, ecx, edx;
   ogg_uint32_t flags;
 
-#define cpuid(op,eax,ebx,ecx,edx)      \
-  asm volatile ("pushl %%ebx   \n\t"   \
-                "cpuid         \n\t"   \
-                "movl %%ebx,%1 \n\t"   \
-                "popl %%ebx"           \
-              : "=a" (eax),            \
-                "=r" (ebx),            \
-                "=c" (ecx),            \
-                "=d" (edx)             \
-              : "a" (op)               \
-              : "cc")
+# if defined(__x86_64__)
 
+  /* no need to check, we have cpuid on x86_64 */
+
+#else /* assume i386 */
   asm volatile ("pushfl              \n\t"
                 "pushfl              \n\t"
                 "popl %0             \n\t"
@@ -51,11 +72,12 @@
                 "=r" (ebx)
               :
               : "cc");
-         
+
   if (eax == ebx)             /* no cpuid */
     return 0;
+#endif
 
-  cpuid(0, eax, ebx, ecx, edx);
+  cpuid(0, &eax, &ebx, &ecx, &edx);
 
   if (ebx == 0x756e6547 &&
       edx == 0x49656e69 &&
@@ -63,7 +85,7 @@
     /* intel */
 
   inteltest:
-    cpuid(1, eax, ebx, ecx, edx);
+    cpuid(1, &eax, &ebx, &ecx, &edx);
     if ((edx & 0x00800000) == 0)
       return 0;
     flags = CPU_X86_MMX;
@@ -76,10 +98,10 @@
              edx == 0x69746e65 &&
              ecx == 0x444d4163) {
     /* AMD */
-    cpuid(0x80000000, eax, ebx, ecx, edx);
+    cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
     if ((unsigned)eax < 0x80000001)
       goto inteltest;
-    cpuid(0x80000001, eax, ebx, ecx, edx);
+    cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
     if ((edx & 0x00800000) == 0)
       return 0;
     flags = CPU_X86_MMX;

Modified: branches/theora-mmx/lib/dct.c
===================================================================
--- branches/theora-mmx/lib/dct.c	2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/dct.c	2006-05-03 22:32:17 UTC (rev 11337)
@@ -257,7 +257,7 @@
 {
   funcs->fdct_short = fdct_short__c;
   if (cpu_flags & CPU_X86_MMX) {
-    dsp_i386_mmx_fdct_init(&dsp_funcs);
+    dsp_mmx_fdct_init(&dsp_funcs);
   }
 }
 

Modified: branches/theora-mmx/lib/dsp.c
===================================================================
--- branches/theora-mmx/lib/dsp.c	2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/dsp.c	2006-05-03 22:32:17 UTC (rev 11337)
@@ -408,10 +408,10 @@
   dsp_recon_init (&dsp_funcs);
   dsp_dct_init (&dsp_funcs);
   if (cpu_flags & CPU_X86_MMX) {
-    dsp_i386_mmx_init(&dsp_funcs);
+    dsp_mmx_init(&dsp_funcs);
   }
   if (cpu_flags & CPU_X86_MMXEXT) {
-    dsp_i386_mmxext_init(&dsp_funcs);
+    dsp_mmxext_init(&dsp_funcs);
   }
 }
 

Modified: branches/theora-mmx/lib/dsp.h
===================================================================
--- branches/theora-mmx/lib/dsp.h	2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/dsp.h	2006-05-03 22:32:17 UTC (rev 11337)
@@ -19,6 +19,7 @@
 #define DSP_H
 
 #include <theora/theora.h>
+typedef unsigned long int ogg_uint64_t;
 
 typedef struct
 {
@@ -84,10 +85,10 @@
 extern void dsp_recon_init (DspFunctions *funcs);
 
 void dsp_init(DspFunctions *funcs);
-void dsp_i386_mmx_init(DspFunctions *funcs);
-void dsp_i386_mmxext_init(DspFunctions *funcs);
-void dsp_i386_mmx_fdct_init(DspFunctions *funcs);
-void dsp_i386_mmx_recon_init(DspFunctions *funcs);
+void dsp_mmx_init(DspFunctions *funcs);
+void dsp_mmxext_init(DspFunctions *funcs);
+void dsp_mmx_fdct_init(DspFunctions *funcs);
+void dsp_mmx_recon_init(DspFunctions *funcs);
 void dsp_static_init(void);
 
 #define dsp_save_fpu(funcs) (funcs.save_fpu ())

Modified: branches/theora-mmx/lib/i386/dsp_mmx.c
===================================================================
--- branches/theora-mmx/lib/i386/dsp_mmx.c	2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/i386/dsp_mmx.c	2006-05-03 22:32:17 UTC (rev 11337)
@@ -624,7 +624,7 @@
   );
 }
 
-void dsp_i386_mmx_init(DspFunctions *funcs)
+void dsp_mmx_init(DspFunctions *funcs)
 {
   funcs->restore_fpu = restore_fpu;
   funcs->sub8x8 = sub8x8__mmx;

Modified: branches/theora-mmx/lib/i386/dsp_mmxext.c
===================================================================
--- branches/theora-mmx/lib/i386/dsp_mmxext.c	2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/i386/dsp_mmxext.c	2006-05-03 22:32:17 UTC (rev 11337)
@@ -304,7 +304,7 @@
   return (( (XXSum<<6) - XSum*XSum ));
 }
 
-void dsp_i386_mmxext_init(DspFunctions *funcs)
+void dsp_mmxext_init(DspFunctions *funcs)
 {
   funcs->row_sad8 = row_sad8__mmxext;
   funcs->col_sad8x8 = col_sad8x8__mmxext;

Modified: branches/theora-mmx/lib/i386/fdct_mmx.c
===================================================================
--- branches/theora-mmx/lib/i386/fdct_mmx.c	2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/i386/fdct_mmx.c	2006-05-03 22:32:17 UTC (rev 11337)
@@ -334,7 +334,7 @@
   );
 }
 
-void dsp_i386_mmx_fdct_init(DspFunctions *funcs)
+void dsp_mmx_fdct_init(DspFunctions *funcs)
 {
   funcs->fdct_short = fdct_short__mmx;
 }

Modified: branches/theora-mmx/lib/i386/recon_mmx.c
===================================================================
--- branches/theora-mmx/lib/i386/recon_mmx.c	2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/i386/recon_mmx.c	2006-05-03 22:32:17 UTC (rev 11337)
@@ -175,7 +175,7 @@
   );
 }
 
-void dsp_i386_mmx_recon_init(DspFunctions *funcs)
+void dsp_mmx_recon_init(DspFunctions *funcs)
 {
   funcs->copy8x8 = copy8x8__mmx;
   funcs->recon_intra8x8 = recon_intra8x8__mmx;

Modified: branches/theora-mmx/lib/reconstruct.c
===================================================================
--- branches/theora-mmx/lib/reconstruct.c	2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/reconstruct.c	2006-05-03 22:32:17 UTC (rev 11337)
@@ -105,6 +105,6 @@
   funcs->recon_inter8x8 = recon_inter8x8__c;
   funcs->recon_inter8x8_half = recon_inter8x8_half__c;
   if (cpu_flags & CPU_X86_MMX) {
-    dsp_i386_mmx_recon_init(&dsp_funcs);
+    dsp_mmx_recon_init(&dsp_funcs);
   }
 }

Copied: branches/theora-mmx/lib/x86_64 (from rev 11336, branches/theora-mmx/lib/i386)

Modified: branches/theora-mmx/lib/x86_64/dsp_mmx.c
===================================================================
--- branches/theora-mmx/lib/i386/dsp_mmx.c	2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/x86_64/dsp_mmx.c	2006-05-03 22:32:17 UTC (rev 11337)
@@ -33,7 +33,7 @@
 
 static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
                   ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
-                  ogg_uint32_t ReconPixelsPerLine) 
+                  ogg_uint64_t ReconPixelsPerLine) 
 {
   __asm__ __volatile__ (
     "  .balign 16                   \n\t"
@@ -105,7 +105,7 @@
 static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
                      unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
                      ogg_uint32_t PixelsPerLine,
-                     ogg_uint32_t ReconPixelsPerLine) 
+                     ogg_uint64_t ReconPixelsPerLine) 
 {
   __asm__ __volatile__ (
     "  .balign 16                   \n\t"
@@ -152,270 +152,18 @@
   );
 }
 
-static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
+static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint64_t Stride)
 {
-  ogg_uint32_t MaxSad;
+  ogg_uint64_t  XSum;
+  ogg_uint64_t  XXSum;
 
   __asm__ __volatile__ (
     "  .balign 16                   \n\t"
 
-    "  pxor        %%mm6, %%mm6     \n\t"	/* zero out mm6 for unpack */
-    "  pxor        %%mm7, %%mm7     \n\t" 	/* zero out mm7 for unpack */
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%2), %%mm1      \n\t"
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
-
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"       /* ; unpack low four bytes to higher precision */
-    "  punpckhbw   %%mm7, %%mm1     \n\t"       /* ; unpack high four bytes to higher precision */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  movq        %%mm1, %%mm3     \n\t"
-    "  psrlq       $32, %%mm2       \n\t"	/* fold and add */
-    "  psrlq       $32, %%mm3       \n\t"
-    "  paddw       %%mm2, %%mm0     \n\t"
-    "  paddw       %%mm3, %%mm1     \n\t"
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  movq        %%mm1, %%mm3     \n\t"
-    "  psrlq       $16, %%mm2       \n\t"
-    "  psrlq       $16, %%mm3       \n\t"
-    "  paddw       %%mm2, %%mm0     \n\t"
-    "  paddw       %%mm3, %%mm1     \n\t"
-
-    "  psubusw     %%mm0, %%mm1     \n\t"
-    "  paddw       %%mm0, %%mm1     \n\t" 	/* mm1 = max(mm1, mm0) */
-    "  movd        %%mm1, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t"
-
-     : "=m" (MaxSad),
-       "+r" (Src1), 
-       "+r" (Src2) 
-     :
-     : "memory"
-  );
-  return MaxSad;
-}
-
-static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
-		                    ogg_uint32_t stride)
-{
-  ogg_uint32_t MaxSad;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pxor        %%mm3, %%mm3     \n\t"	/* zero out mm3 for unpack */
-    "  pxor        %%mm4, %%mm4     \n\t"	/* mm4 low sum */
-    "  pxor        %%mm5, %%mm5     \n\t" 	/* mm5 high sum */
-    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 low sum */
-    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 high sum */
-    "  mov         $4, %%edi        \n\t"	/* 4 rows */
-    "1:                             \n\t"
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
-    "  paddw       %%mm0, %%mm4     \n\t"	/* accumulate difference... */
-    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
-    "  paddw       %%mm1, %%mm5     \n\t"	/* accumulate difference... */
-    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
-    "  add         %3, %2           \n\t"	/* Inc pointer into the new data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  mov         $4, %%edi        \n\t"	/* 4 rows */
-    "2:                             \n\t"
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
-    "  paddw       %%mm0, %%mm6     \n\t"	/* accumulate difference... */
-    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
-    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
-    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
-    "  add         %3, %2           \n\t"	/* Inc pointer into the new data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 2b                       \n\t"
-
-    "  psubusw     %%mm6, %%mm7     \n\t"
-    "  paddw       %%mm6, %%mm7     \n\t" 	/* mm7 = max(mm7, mm6) */
-    "  psubusw     %%mm4, %%mm5     \n\t" 	
-    "  paddw       %%mm4, %%mm5     \n\t" 	/* mm5 = max(mm5, mm4) */
-    "  psubusw     %%mm5, %%mm7     \n\t" 	
-    "  paddw       %%mm5, %%mm7     \n\t" 	/* mm7 = max(mm5, mm7) */
-    "  movq        %%mm7, %%mm6     \n\t"
-    "  psrlq       $32, %%mm6       \n\t"
-    "  psubusw     %%mm6, %%mm7     \n\t" 	
-    "  paddw       %%mm6, %%mm7     \n\t" 	/* mm7 = max(mm5, mm7) */
-    "  movq        %%mm7, %%mm6     \n\t"
-    "  psrlq       $16, %%mm6       \n\t"
-    "  psubusw     %%mm6, %%mm7     \n\t" 	
-    "  paddw       %%mm6, %%mm7     \n\t" 	/* mm7 = max(mm5, mm7) */
-    "  movd        %%mm7, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t"
-
-     : "=r" (MaxSad),
-       "+r" (Src1), 
-       "+r" (Src2) 
-     : "r" (stride)
-     : "memory", "edi"
-  );
-
-  return MaxSad;
-}
-
-static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
-		       	    unsigned char *ptr2, ogg_uint32_t stride2)
-{
-  ogg_uint32_t  DiffVal;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-    "  pxor        %%mm6, %%mm6     \n\t"	/* zero out mm6 for unpack */
-    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 contains the result */
-    ".rept 8                         \n\t"
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-    "  movq        (%2), %%mm1      \n\t"
-    "  movq        %%mm0, %%mm2     \n\t"
-
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
-    "  paddw       %%mm0, %%mm7     \n\t"	/* accumulate difference... */
-    "  punpckhbw   %%mm6, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
-    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
-    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
-    "  add         %4, %2           \n\t"	/* Inc pointer into ref data */
-    ".endr                          \n\t"
-
-    "  movq        %%mm7, %%mm0     \n\t"
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddw       %%mm0, %%mm7     \n\t"
-    "  movq        %%mm7, %%mm0     \n\t"
-    "  psrlq       $16, %%mm7       \n\t"
-    "  paddw       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t"
-
-     : "=m" (DiffVal),
-       "+r" (ptr1), 
-       "+r" (ptr2) 
-     : "r" (stride1),
-       "r" (stride2)
-     : "memory"
-  );
-
-  return DiffVal;
-}
-
-static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
-		       		  unsigned char *ptr2, ogg_uint32_t stride2, 
-			   	  ogg_uint32_t thres)
-{
-  return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
-}
-
-static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
-		                      unsigned char *RefDataPtr1,
-			              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
-			              ogg_uint32_t thres)
-{
-  ogg_uint32_t  DiffVal;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pcmpeqd     %%mm5, %%mm5     \n\t"	/* fefefefefefefefe in mm5 */
-    "  paddb       %%mm5, %%mm5     \n\t"
-   
-    "  pxor        %%mm6, %%mm6     \n\t"	/* zero out mm6 for unpack */
-    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 contains the result */
-    "  mov         $8, %%edi        \n\t"	/* 8 rows */
-    "1:                             \n\t"
-    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
-
-    "  movq        (%2), %%mm2      \n\t"
-    "  movq        (%3), %%mm3      \n\t"	/* take average of mm2 and mm3 */
-    "  movq        %%mm2, %%mm1     \n\t"
-    "  pand        %%mm3, %%mm1     \n\t"
-    "  pxor        %%mm2, %%mm3     \n\t"
-    "  pand        %%mm5, %%mm3     \n\t"
-    "  psrlq       $1, %%mm3        \n\t"
-    "  paddb       %%mm3, %%mm1     \n\t"
-
-    "  movq        %%mm0, %%mm2     \n\t"
-
-    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
-    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
-    "  por         %%mm1, %%mm0     \n\t"    	/* and or gives abs difference */
-    "  movq        %%mm0, %%mm1     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
-    "  paddw       %%mm0, %%mm7     \n\t"	/* accumulate difference... */
-    "  punpckhbw   %%mm6, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
-    "  add         %4, %1           \n\t"	/* Inc pointer into the new data */
-    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
-    "  add         %5, %2           \n\t"	/* Inc pointer into ref data */
-    "  add         %5, %3           \n\t"	/* Inc pointer into ref data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  movq        %%mm7, %%mm0     \n\t"
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddw       %%mm0, %%mm7     \n\t"
-    "  movq        %%mm7, %%mm0     \n\t"
-    "  psrlq       $16, %%mm7       \n\t"
-    "  paddw       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %0        \n\t"
-    "  andl        $0xffff, %0      \n\t"
-
-     : "=m" (DiffVal),
-       "+r" (SrcData), 
-       "+r" (RefDataPtr1), 
-       "+r" (RefDataPtr2) 
-     : "m" (SrcStride),
-       "m" (RefStride)
-     : "edi", "memory"
-  );
-
-  return DiffVal;
-}
-
-static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
-{
-  ogg_uint32_t  XSum;
-  ogg_uint32_t  XXSum;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
     "  pxor        %%mm5, %%mm5     \n\t"
     "  pxor        %%mm6, %%mm6     \n\t"
     "  pxor        %%mm7, %%mm7     \n\t"
-    "  mov         $8, %%edi        \n\t"
+    "  mov         $8, %%rdi        \n\t"
     "1:                             \n\t"
     "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
     "  movq        %%mm0, %%mm2     \n\t"
@@ -434,7 +182,7 @@
 
     "  add         %3, %2           \n\t"	/* Inc pointer into src data */
 
-    "  dec         %%edi            \n\t"
+    "  dec         %%rdi            \n\t"
     "  jnz 1b                       \n\t"
 
     "  movq        %%mm5, %%mm0     \n\t"
@@ -443,9 +191,9 @@
     "  movq        %%mm5, %%mm0     \n\t"
     "  psrlq       $16, %%mm5       \n\t"
     "  paddw       %%mm0, %%mm5     \n\t"
-    "  movd        %%mm5, %%edi     \n\t"
-    "  movsx       %%di, %%edi      \n\t"
-    "  movl        %%edi, %0        \n\t"
+    "  movd        %%mm5, %%rdi     \n\t"
+    "  movsx       %%di, %%rdi      \n\t"
+    "  mov        %%rdi, %0        \n\t"
 
     "  movq        %%mm7, %%mm0     \n\t"
     "  psrlq       $32, %%mm7       \n\t"
@@ -456,7 +204,7 @@
        "=r" (XXSum),
        "+r" (DataPtr) 
      : "r" (Stride)
-     : "edi", "memory"
+     : "rdi", "memory"
   );
 
   /* Compute population variance as mis-match metric. */
@@ -475,7 +223,7 @@
     "  pxor        %%mm5, %%mm5     \n\t"
     "  pxor        %%mm6, %%mm6     \n\t"
     "  pxor        %%mm7, %%mm7     \n\t"
-    "  mov         $8, %%edi        \n\t"
+    "  mov         $8, %%rdi        \n\t"
     "1:                             \n\t"
     "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
     "  movq        (%3), %%mm1      \n\t"
@@ -502,7 +250,7 @@
     "  add         %4, %2           \n\t"	/* Inc pointer into src data */
     "  add         %5, %3           \n\t"	/* Inc pointer into ref data */
 
-    "  dec         %%edi            \n\t"
+    "  dec         %%rdi            \n\t"
     "  jnz 1b                       \n\t"
 
     "  movq        %%mm5, %%mm0     \n\t"
@@ -511,9 +259,9 @@
     "  movq        %%mm5, %%mm0     \n\t"
     "  psrlq       $16, %%mm5       \n\t"
     "  paddw       %%mm0, %%mm5     \n\t"
-    "  movd        %%mm5, %%edi     \n\t"
-    "  movsx       %%di, %%edi      \n\t"
-    "  movl        %%edi, %0        \n\t"
+    "  movd        %%mm5, %%rdi     \n\t"
+    "  movsx       %%di, %%rdi      \n\t"
+    "  mov        %%rdi, %0        \n\t"
 
     "  movq        %%mm7, %%mm0     \n\t"
     "  psrlq       $32, %%mm7       \n\t"
@@ -526,97 +274,13 @@
        "+r" (RefDataPtr) 
      : "m" (SrcStride),
        "m" (RefStride)
-     : "edi", "memory"
+     : "rdi", "memory"
   );
 
   /* Compute and return population variance as mis-match metric. */
   return (( (XXSum<<6) - XSum*XSum ));
 }
 
-static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
-		                     unsigned char *RefDataPtr1,
-				     unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
-{
-  ogg_uint32_t XSum;
-  ogg_uint32_t XXSum;
-
-  __asm__ __volatile__ (
-    "  .balign 16                   \n\t"
-
-    "  pcmpeqd     %%mm4, %%mm4     \n\t"	/* fefefefefefefefe in mm4 */
-    "  paddb       %%mm4, %%mm4     \n\t"
-    "  pxor        %%mm5, %%mm5     \n\t"
-    "  pxor        %%mm6, %%mm6     \n\t"
-    "  pxor        %%mm7, %%mm7     \n\t"
-    "  mov         $8, %%edi        \n\t"
-    "1:                             \n\t"
-    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
-
-    "  movq        (%3), %%mm2      \n\t"
-    "  movq        (%4), %%mm3      \n\t"	/* take average of mm2 and mm3 */
-    "  movq        %%mm2, %%mm1     \n\t"
-    "  pand        %%mm3, %%mm1     \n\t"
-    "  pxor        %%mm2, %%mm3     \n\t"
-    "  pand        %%mm4, %%mm3     \n\t"
-    "  psrlq       $1, %%mm3        \n\t"
-    "  paddb       %%mm3, %%mm1     \n\t"
-
-    "  movq        %%mm0, %%mm2     \n\t"
-    "  movq        %%mm1, %%mm3     \n\t"
-
-    "  punpcklbw   %%mm6, %%mm0     \n\t"
-    "  punpcklbw   %%mm6, %%mm1     \n\t"
-    "  punpckhbw   %%mm6, %%mm2     \n\t"
-    "  punpckhbw   %%mm6, %%mm3     \n\t"
-
-    "  psubsw      %%mm1, %%mm0     \n\t"
-    "  psubsw      %%mm3, %%mm2     \n\t"
-
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  paddw       %%mm2, %%mm5     \n\t"
-
-    "  pmaddwd     %%mm0, %%mm0     \n\t"
-    "  pmaddwd     %%mm2, %%mm2     \n\t"
-    
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  paddd       %%mm2, %%mm7     \n\t"
-
-    "  add         %5, %2           \n\t"	/* Inc pointer into src data */
-    "  add         %6, %3           \n\t"	/* Inc pointer into ref data */
-    "  add         %6, %4           \n\t"	/* Inc pointer into ref data */
-
-    "  dec         %%edi            \n\t"
-    "  jnz 1b                       \n\t"
-
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $32, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movq        %%mm5, %%mm0     \n\t"
-    "  psrlq       $16, %%mm5       \n\t"
-    "  paddw       %%mm0, %%mm5     \n\t"
-    "  movd        %%mm5, %%edi     \n\t"
-    "  movsx       %%di, %%edi      \n\t"
-    "  movl        %%edi, %0        \n\t"
-
-    "  movq        %%mm7, %%mm0     \n\t"
-    "  psrlq       $32, %%mm7       \n\t"
-    "  paddd       %%mm0, %%mm7     \n\t"
-    "  movd        %%mm7, %1        \n\t"
-
-     : "=m" (XSum),
-       "=m" (XXSum),
-       "+r" (SrcData), 
-       "+r" (RefDataPtr1),
-       "+r" (RefDataPtr2) 
-     : "m" (SrcStride),
-       "m" (RefStride)
-     : "edi", "memory"
-  );
-
-  /* Compute and return population variance as mis-match metric. */
-  return (( (XXSum<<6) - XSum*XSum ));
-}
-
 static void restore_fpu (void)
 {
   __asm__ __volatile__ (
@@ -630,13 +294,7 @@
   funcs->sub8x8 = sub8x8__mmx;
   funcs->sub8x8_128 = sub8x8_128__mmx;
   funcs->sub8x8avg2 = sub8x8avg2__mmx;
-  funcs->row_sad8 = row_sad8__mmx;
-  funcs->col_sad8x8 = col_sad8x8__mmx;
-  funcs->sad8x8 = sad8x8__mmx;
-  funcs->sad8x8_thres = sad8x8_thres__mmx;
-  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx;
   funcs->intra8x8_err = intra8x8_err__mmx;
   funcs->inter8x8_err = inter8x8_err__mmx;
-  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
 }
 

Modified: branches/theora-mmx/lib/x86_64/dsp_mmxext.c
===================================================================
--- branches/theora-mmx/lib/i386/dsp_mmxext.c	2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/x86_64/dsp_mmxext.c	2006-05-03 22:32:17 UTC (rev 11337)
@@ -18,8 +18,8 @@
 #include <stdlib.h>
 #include "dsp.h"
 
-static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
-		       	    unsigned char *ptr2, ogg_uint32_t stride2)
+static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint64_t stride1,
+                                    unsigned char *ptr2, ogg_uint64_t stride2)
 {
   ogg_uint32_t  DiffVal;
 
@@ -53,9 +53,9 @@
   return DiffVal;
 }
 
-static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
-		       		  unsigned char *ptr2, ogg_uint32_t stride2, 
-			   	  ogg_uint32_t thres)
+static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint64_t stride1,
+                                          unsigned char *ptr2, ogg_uint64_t stride2, 
+			   	  ogg_uint64_t thres)
 {
   ogg_uint32_t  DiffVal;
 
@@ -85,10 +85,10 @@
   return DiffVal;
 }
 
-static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
-		                      unsigned char *RefDataPtr1,
-			              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
-			              ogg_uint32_t thres)
+static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint64_t SrcStride,
+                                              unsigned char *RefDataPtr1,
+                                              unsigned char *RefDataPtr2, ogg_uint64_t RefStride,
+                                              ogg_uint64_t thres)
 {
   ogg_uint32_t  DiffVal;
 
@@ -150,7 +150,7 @@
 }
 
 static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
-		                    ogg_uint32_t stride)
+		                    ogg_uint64_t stride)
 {
   ogg_uint32_t MaxSad;
 
@@ -162,7 +162,7 @@
     "  pxor        %%mm5, %%mm5     \n\t" 	/* mm5 high sum */
     "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 low sum */
     "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 high sum */
-    "  mov         $4, %%edi        \n\t"	/* 4 rows */
+    "  mov         $4, %%rdi        \n\t"	/* 4 rows */
     "1:                             \n\t"
     "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
     "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
@@ -180,10 +180,10 @@
     "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
     "  add         %3, %2           \n\t"	/* Inc pointer into the new data */
 
-    "  dec         %%edi            \n\t"
+    "  dec         %%rdi            \n\t"
     "  jnz 1b                       \n\t"
 
-    "  mov         $4, %%edi        \n\t"	/* 4 rows */
+    "  mov         $4, %%rdi        \n\t"	/* 4 rows */
     "2:                             \n\t"
     "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
     "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
@@ -201,7 +201,7 @@
     "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
     "  add         %3, %2           \n\t"	/* Inc pointer into the new data */
 
-    "  dec         %%edi            \n\t"
+    "  dec         %%rdi            \n\t"
     "  jnz 2b                       \n\t"
 
     "  pmaxsw      %%mm6, %%mm7     \n\t"
@@ -220,18 +220,18 @@
        "+r" (Src1), 
        "+r" (Src2) 
      : "r" (stride)
-     : "memory", "edi"
+     : "memory", "rdi"
   );
 
   return MaxSad;
 }
 
-static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
-		                     unsigned char *RefDataPtr1,
-				     unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
+static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint64_t SrcStride,
+                                              unsigned char *RefDataPtr1,
+                                              unsigned char *RefDataPtr2, ogg_uint64_t RefStride)
 {
-  ogg_uint32_t XSum;
-  ogg_uint32_t XXSum;
+  ogg_uint64_t XSum;
+  ogg_uint64_t XXSum;
 
   __asm__ __volatile__ (
     "  .balign 16                   \n\t"
@@ -240,7 +240,7 @@
     "  pxor        %%mm5, %%mm5     \n\t"
     "  pxor        %%mm6, %%mm6     \n\t"
     "  pxor        %%mm7, %%mm7     \n\t"
-    "  mov         $8, %%edi        \n\t"
+    "  mov         $8, %%rdi        \n\t"
     "1:                             \n\t"
     "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
 
@@ -272,7 +272,7 @@
     "  add         %6, %3           \n\t"	/* Inc pointer into ref data */
     "  add         %6, %4           \n\t"	/* Inc pointer into ref data */
 
-    "  dec         %%edi            \n\t"
+    "  dec         %%rdi            \n\t"
     "  jnz 1b                       \n\t"
 
     "  movq        %%mm5, %%mm0     \n\t"
@@ -297,7 +297,7 @@
        "+r" (RefDataPtr2) 
      : "m" (SrcStride),
        "m" (RefStride)
-     : "edi", "memory"
+     : "rdi", "memory"
   );
 
   /* Compute and return population variance as mis-match metric. */
@@ -313,4 +313,3 @@
   funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;
   funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext;
 }
-

Modified: branches/theora-mmx/lib/x86_64/recon_mmx.c
===================================================================
--- branches/theora-mmx/lib/i386/recon_mmx.c	2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/x86_64/recon_mmx.c	2006-05-03 22:32:17 UTC (rev 11337)
@@ -27,53 +27,53 @@
 #endif
 
 static void copy8x8__mmx (unsigned char *src,
-	                unsigned char *dest,
-	                unsigned int stride)
+                          unsigned char *dest,
+                          ogg_uint64_t stride)
 {
   __asm__ __volatile__ (
     "  .balign 16                      \n\t"
 
-    "  lea         (%2, %2, 2), %%edi  \n\t"
+    "  lea         (%2, %2, 2), %%rdi  \n\t"
 
     "  movq        (%1), %%mm0         \n\t"
     "  movq        (%1, %2), %%mm1     \n\t"
     "  movq        (%1, %2, 2), %%mm2  \n\t"
-    "  movq        (%1, %%edi), %%mm3  \n\t"
+    "  movq        (%1, %%rdi), %%mm3  \n\t"
 
     "  lea         (%1, %2, 4), %1     \n\t" 
 
     "  movq        %%mm0, (%0)         \n\t"
     "  movq        %%mm1, (%0, %2)     \n\t"
     "  movq        %%mm2, (%0, %2, 2)  \n\t"
-    "  movq        %%mm3, (%0, %%edi)  \n\t"
+    "  movq        %%mm3, (%0, %%rdi)  \n\t"
 
     "  lea         (%0, %2, 4), %0     \n\t" 
 
     "  movq        (%1), %%mm0         \n\t"
     "  movq        (%1, %2), %%mm1     \n\t"
     "  movq        (%1, %2, 2), %%mm2  \n\t"
-    "  movq        (%1, %%edi), %%mm3  \n\t"
+    "  movq        (%1, %%rdi), %%mm3  \n\t"
 
     "  movq        %%mm0, (%0)         \n\t"
     "  movq        %%mm1, (%0, %2)     \n\t"
     "  movq        %%mm2, (%0, %2, 2)  \n\t"
-    "  movq        %%mm3, (%0, %%edi)  \n\t"
+    "  movq        %%mm3, (%0, %%rdi)  \n\t"
       : "+a" (dest)
       : "c" (src),
         "d" (stride)
-      : "memory", "edi"
+      : "memory", "rdi"
   );
 }
 
 static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
-		      ogg_uint32_t LineStep)
+                                 ogg_uint64_t LineStep)
 {
   __asm__ __volatile__ (
     "  .balign 16                      \n\t"
 
     "  movq     "M(V128)", %%mm0       \n\t" /* Set mm0 to 0x8080808080808080 */
 
-    "  lea         128(%1), %%edi      \n\t" /* Endpoint in input buffer */
+    "  lea         128(%1), %%rdi      \n\t" /* Endpoint in input buffer */
     "1:                                \n\t" 
     "  movq         (%1), %%mm2        \n\t" /* First four input values */
 
@@ -81,7 +81,7 @@
     "  por         %%mm0, %%mm0        \n\t" 
     "  pxor        %%mm0, %%mm2        \n\t" /* Convert result to unsigned (same as add 128) */
     "  lea         16(%1), %1          \n\t" /* Step source buffer */
-    "  cmp         %%edi, %1           \n\t" /* are we done */
+    "  cmp         %%rdi, %1           \n\t" /* are we done */
 
     "  movq        %%mm2, (%0)         \n\t" /* store results */
 
@@ -90,18 +90,18 @@
       : "+r" (ReconPtr)
       : "r" (ChangePtr),
         "r" (LineStep)
-      : "memory", "edi"
+      : "memory", "rdi"
   );
 }
 
 static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
-		      ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
+                                 ogg_int16_t *ChangePtr, ogg_uint64_t LineStep)
 {
   __asm__ __volatile__ (
     "  .balign 16                      \n\t"
 
     "  pxor        %%mm0, %%mm0        \n\t"
-    "  lea         128(%1), %%edi      \n\t"
+    "  lea         128(%1), %%rdi      \n\t"
 
     "1:                                \n\t"
     "  movq        (%2), %%mm2         \n\t" /* (+3 misaligned) 8 reference pixels */
@@ -116,7 +116,7 @@
     "  add         %3, %2              \n\t" /* next row of reference pixels */
     "  packuswb    %%mm3, %%mm2        \n\t" /* pack result to unsigned 8-bit values */
     "  lea         16(%1), %1          \n\t" /* next row of changes */
-    "  cmp         %%edi, %1            \n\t" /* are we done? */
+    "  cmp         %%rdi, %1           \n\t" /* are we done? */
 
     "  movq        %%mm2, (%0)         \n\t" /* store result */
 
@@ -126,19 +126,19 @@
       : "r" (ChangePtr),
         "r" (RefPtr),
         "r" (LineStep)
-      : "memory", "edi"
+      : "memory", "rdi"
   );
 }
 
 static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
-		           unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
-			   ogg_uint32_t LineStep)
+                                      unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
+                                      ogg_uint64_t LineStep)
 {
   __asm__ __volatile__ (
     "  .balign 16                      \n\t"
 
     "  pxor        %%mm0, %%mm0        \n\t"
-    "  lea         128(%1), %%edi      \n\t"
+    "  lea         128(%1), %%rdi      \n\t"
 
     "1:                                \n\t"
     "  movq        (%2), %%mm2         \n\t" /* (+3 misaligned) 8 reference pixels */
@@ -164,14 +164,14 @@
     "  add         %4, %3              \n\t" /* next row of reference pixels */
     "  movq        %%mm2, (%0)         \n\t" /* store result */
     "  add         %4, %0              \n\t" /* next row of output */
-    "  cmp         %%edi, %1           \n\t" /* are we done? */
+    "  cmp         %%rdi, %1           \n\t" /* are we done? */
     "  jc          1b                  \n\t"
       : "+r" (ReconPtr)
       : "r" (ChangePtr),
         "r" (RefPtr1),
         "r" (RefPtr2),
         "m" (LineStep)
-      : "memory", "edi"
+      : "memory", "rdi"
   );
 }