[xiph-commits] r11337 - in branches/theora-mmx: . lib lib/i386 lib/x86_64
j at svn.xiph.org
j at svn.xiph.org
Wed May 3 15:32:28 PDT 2006
Author: j
Date: 2006-05-03 15:32:17 -0700 (Wed, 03 May 2006)
New Revision: 11337
Added:
branches/theora-mmx/lib/x86_64/
Modified:
branches/theora-mmx/configure.ac
branches/theora-mmx/lib/Makefile.am
branches/theora-mmx/lib/cpu.c
branches/theora-mmx/lib/dct.c
branches/theora-mmx/lib/dsp.c
branches/theora-mmx/lib/dsp.h
branches/theora-mmx/lib/i386/dsp_mmx.c
branches/theora-mmx/lib/i386/dsp_mmxext.c
branches/theora-mmx/lib/i386/fdct_mmx.c
branches/theora-mmx/lib/i386/recon_mmx.c
branches/theora-mmx/lib/reconstruct.c
branches/theora-mmx/lib/x86_64/dsp_mmx.c
branches/theora-mmx/lib/x86_64/dsp_mmxext.c
branches/theora-mmx/lib/x86_64/recon_mmx.c
Log:
support x86_64 processors
patch by Dan Lenski.
[ does not build as shared lib right now ]
Modified: branches/theora-mmx/configure.ac
===================================================================
--- branches/theora-mmx/configure.ac 2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/configure.ac 2006-05-03 22:32:17 UTC (rev 11337)
@@ -4,7 +4,7 @@
dnl Initialization and Versioning
dnl ------------------------------------------------
-AC_INIT(libtheora-mmx,[1.0alpha5])
+AC_INIT(libtheora-mmx,[1.0alpha6-svn])
AC_CANONICAL_HOST
AC_CANONICAL_TARGET
@@ -95,13 +95,20 @@
case $host in
*)
- DEBUG="-g -W -D__NO_MATH_INLINES"
+ DEBUG="-g -Wall -D__NO_MATH_INLINES"
CFLAGS="-Wall -O3 -fforce-addr -fomit-frame-pointer -finline-functions -funroll-loops"
- PROFILE="-W -pg -g -O3 -fno-inline-functions";;
+ PROFILE="-Wall -pg -g -O3 -fno-inline-functions";;
esac
fi
CFLAGS="$CFLAGS $cflags_save"
+cpu_x86_64=no
+case $target in
+ x86_64-*)
+ cpu_x86_64=yes ;;
+esac
+AM_CONDITIONAL([CPU_x86_64], [test x$cpu_x86_64 = xyes])
+
# Test whenever ld supports -version-script
AC_PROG_LD
AC_PROG_LD_GNU
Modified: branches/theora-mmx/lib/Makefile.am
===================================================================
--- branches/theora-mmx/lib/Makefile.am 2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/Makefile.am 2006-05-03 22:32:17 UTC (rev 11337)
@@ -1,6 +1,14 @@
INCLUDES = -I$(top_srcdir)/include
-EXTRA_DIST = Version_script.in
+EXTRA_DIST = Version_script.in \
+ i386/dsp_mmx.c \
+ i386/dsp_mmxext.c \
+ i386/recon_mmx.c \
+ i386/fdct_mmx.c \
+ x86_64/dsp_mmx.c \
+ x86_64/dsp_mmxext.c \
+ x86_64/recon_mmx.c \
+ x86_64/fdct_mmx.c
lib_LTLIBRARIES = libtheora.la
@@ -10,6 +18,12 @@
encoder_sources = dct_encode.c encode.c encoder_toplevel.c
endif
+if CPU_x86_64
+arch_dir = x86_64
+else
+arch_dir = i386
+endif
+
libtheora_la_SOURCES = \
blockmap.c \
comment.c \
@@ -30,10 +44,10 @@
toplevel.c \
cpu.c \
dsp.c \
- i386/dsp_mmx.c \
- i386/dsp_mmxext.c \
- i386/recon_mmx.c \
- i386/fdct_mmx.c \
+ $(arch_dir)/dsp_mmx.c \
+ $(arch_dir)/dsp_mmxext.c \
+ $(arch_dir)/recon_mmx.c \
+ $(arch_dir)/fdct_mmx.c \
$(encoder_sources)
noinst_HEADERS = \
Modified: branches/theora-mmx/lib/cpu.c
===================================================================
--- branches/theora-mmx/lib/cpu.c 2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/cpu.c 2006-05-03 22:32:17 UTC (rev 11337)
@@ -19,24 +19,45 @@
ogg_uint32_t cpu_flags = 0;
+void
+cpuid(ogg_int32_t op, ogg_uint32_t *eax, ogg_uint32_t *ebx, ogg_uint32_t *ecx, ogg_uint32_t *edx)
+{
+#if defined(__x86_64__)
+ asm volatile ("pushq %%rbx \n\t"
+ "cpuid \n\t"
+ "movl %%ebx,%1 \n\t"
+ "popq %%rbx"
+ : "=a" (*eax),
+ "=r" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "a" (op)
+ : "cc");
+#else
+ asm volatile ("pushl %%ebx \n\t"
+ "cpuid \n\t"
+ "movl %%ebx,%1 \n\t"
+ "popl %%ebx"
+ : "=a" (*eax),
+ "=r" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "a" (op)
+ : "cc");
+#endif
+}
+
#if 1
static ogg_uint32_t cpu_get_flags (void)
{
ogg_uint32_t eax, ebx, ecx, edx;
ogg_uint32_t flags;
-#define cpuid(op,eax,ebx,ecx,edx) \
- asm volatile ("pushl %%ebx \n\t" \
- "cpuid \n\t" \
- "movl %%ebx,%1 \n\t" \
- "popl %%ebx" \
- : "=a" (eax), \
- "=r" (ebx), \
- "=c" (ecx), \
- "=d" (edx) \
- : "a" (op) \
- : "cc")
+# if defined(__x86_64__)
+ /* no need to check, we have cpuid on x86_64 */
+
+#else /* assume i386 */
asm volatile ("pushfl \n\t"
"pushfl \n\t"
"popl %0 \n\t"
@@ -51,11 +72,12 @@
"=r" (ebx)
:
: "cc");
-
+
if (eax == ebx) /* no cpuid */
return 0;
+#endif
- cpuid(0, eax, ebx, ecx, edx);
+ cpuid(0, &eax, &ebx, &ecx, &edx);
if (ebx == 0x756e6547 &&
edx == 0x49656e69 &&
@@ -63,7 +85,7 @@
/* intel */
inteltest:
- cpuid(1, eax, ebx, ecx, edx);
+ cpuid(1, &eax, &ebx, &ecx, &edx);
if ((edx & 0x00800000) == 0)
return 0;
flags = CPU_X86_MMX;
@@ -76,10 +98,10 @@
edx == 0x69746e65 &&
ecx == 0x444d4163) {
/* AMD */
- cpuid(0x80000000, eax, ebx, ecx, edx);
+ cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
if ((unsigned)eax < 0x80000001)
goto inteltest;
- cpuid(0x80000001, eax, ebx, ecx, edx);
+ cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
if ((edx & 0x00800000) == 0)
return 0;
flags = CPU_X86_MMX;
Modified: branches/theora-mmx/lib/dct.c
===================================================================
--- branches/theora-mmx/lib/dct.c 2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/dct.c 2006-05-03 22:32:17 UTC (rev 11337)
@@ -257,7 +257,7 @@
{
funcs->fdct_short = fdct_short__c;
if (cpu_flags & CPU_X86_MMX) {
- dsp_i386_mmx_fdct_init(&dsp_funcs);
+ dsp_mmx_fdct_init(&dsp_funcs);
}
}
Modified: branches/theora-mmx/lib/dsp.c
===================================================================
--- branches/theora-mmx/lib/dsp.c 2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/dsp.c 2006-05-03 22:32:17 UTC (rev 11337)
@@ -408,10 +408,10 @@
dsp_recon_init (&dsp_funcs);
dsp_dct_init (&dsp_funcs);
if (cpu_flags & CPU_X86_MMX) {
- dsp_i386_mmx_init(&dsp_funcs);
+ dsp_mmx_init(&dsp_funcs);
}
if (cpu_flags & CPU_X86_MMXEXT) {
- dsp_i386_mmxext_init(&dsp_funcs);
+ dsp_mmxext_init(&dsp_funcs);
}
}
Modified: branches/theora-mmx/lib/dsp.h
===================================================================
--- branches/theora-mmx/lib/dsp.h 2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/dsp.h 2006-05-03 22:32:17 UTC (rev 11337)
@@ -19,6 +19,7 @@
#define DSP_H
#include <theora/theora.h>
+typedef unsigned long int ogg_uint64_t;
typedef struct
{
@@ -84,10 +85,10 @@
extern void dsp_recon_init (DspFunctions *funcs);
void dsp_init(DspFunctions *funcs);
-void dsp_i386_mmx_init(DspFunctions *funcs);
-void dsp_i386_mmxext_init(DspFunctions *funcs);
-void dsp_i386_mmx_fdct_init(DspFunctions *funcs);
-void dsp_i386_mmx_recon_init(DspFunctions *funcs);
+void dsp_mmx_init(DspFunctions *funcs);
+void dsp_mmxext_init(DspFunctions *funcs);
+void dsp_mmx_fdct_init(DspFunctions *funcs);
+void dsp_mmx_recon_init(DspFunctions *funcs);
void dsp_static_init(void);
#define dsp_save_fpu(funcs) (funcs.save_fpu ())
Modified: branches/theora-mmx/lib/i386/dsp_mmx.c
===================================================================
--- branches/theora-mmx/lib/i386/dsp_mmx.c 2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/i386/dsp_mmx.c 2006-05-03 22:32:17 UTC (rev 11337)
@@ -624,7 +624,7 @@
);
}
-void dsp_i386_mmx_init(DspFunctions *funcs)
+void dsp_mmx_init(DspFunctions *funcs)
{
funcs->restore_fpu = restore_fpu;
funcs->sub8x8 = sub8x8__mmx;
Modified: branches/theora-mmx/lib/i386/dsp_mmxext.c
===================================================================
--- branches/theora-mmx/lib/i386/dsp_mmxext.c 2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/i386/dsp_mmxext.c 2006-05-03 22:32:17 UTC (rev 11337)
@@ -304,7 +304,7 @@
return (( (XXSum<<6) - XSum*XSum ));
}
-void dsp_i386_mmxext_init(DspFunctions *funcs)
+void dsp_mmxext_init(DspFunctions *funcs)
{
funcs->row_sad8 = row_sad8__mmxext;
funcs->col_sad8x8 = col_sad8x8__mmxext;
Modified: branches/theora-mmx/lib/i386/fdct_mmx.c
===================================================================
--- branches/theora-mmx/lib/i386/fdct_mmx.c 2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/i386/fdct_mmx.c 2006-05-03 22:32:17 UTC (rev 11337)
@@ -334,7 +334,7 @@
);
}
-void dsp_i386_mmx_fdct_init(DspFunctions *funcs)
+void dsp_mmx_fdct_init(DspFunctions *funcs)
{
funcs->fdct_short = fdct_short__mmx;
}
Modified: branches/theora-mmx/lib/i386/recon_mmx.c
===================================================================
--- branches/theora-mmx/lib/i386/recon_mmx.c 2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/i386/recon_mmx.c 2006-05-03 22:32:17 UTC (rev 11337)
@@ -175,7 +175,7 @@
);
}
-void dsp_i386_mmx_recon_init(DspFunctions *funcs)
+void dsp_mmx_recon_init(DspFunctions *funcs)
{
funcs->copy8x8 = copy8x8__mmx;
funcs->recon_intra8x8 = recon_intra8x8__mmx;
Modified: branches/theora-mmx/lib/reconstruct.c
===================================================================
--- branches/theora-mmx/lib/reconstruct.c 2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/reconstruct.c 2006-05-03 22:32:17 UTC (rev 11337)
@@ -105,6 +105,6 @@
funcs->recon_inter8x8 = recon_inter8x8__c;
funcs->recon_inter8x8_half = recon_inter8x8_half__c;
if (cpu_flags & CPU_X86_MMX) {
- dsp_i386_mmx_recon_init(&dsp_funcs);
+ dsp_mmx_recon_init(&dsp_funcs);
}
}
Copied: branches/theora-mmx/lib/x86_64 (from rev 11336, branches/theora-mmx/lib/i386)
Modified: branches/theora-mmx/lib/x86_64/dsp_mmx.c
===================================================================
--- branches/theora-mmx/lib/i386/dsp_mmx.c 2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/x86_64/dsp_mmx.c 2006-05-03 22:32:17 UTC (rev 11337)
@@ -33,7 +33,7 @@
static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
+ ogg_uint64_t ReconPixelsPerLine)
{
__asm__ __volatile__ (
" .balign 16 \n\t"
@@ -105,7 +105,7 @@
static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
+ ogg_uint64_t ReconPixelsPerLine)
{
__asm__ __volatile__ (
" .balign 16 \n\t"
@@ -152,270 +152,18 @@
);
}
-static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
+static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint64_t Stride)
{
- ogg_uint32_t MaxSad;
+ ogg_uint64_t XSum;
+ ogg_uint64_t XXSum;
__asm__ __volatile__ (
" .balign 16 \n\t"
- " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
- " pxor %%mm7, %%mm7 \n\t" /* zero out mm7 for unpack */
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
-
- " movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
-
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t" /* ; unpack low four bytes to higher precision */
- " punpckhbw %%mm7, %%mm1 \n\t" /* ; unpack high four bytes to higher precision */
-
- " movq %%mm0, %%mm2 \n\t"
- " movq %%mm1, %%mm3 \n\t"
- " psrlq $32, %%mm2 \n\t" /* fold and add */
- " psrlq $32, %%mm3 \n\t"
- " paddw %%mm2, %%mm0 \n\t"
- " paddw %%mm3, %%mm1 \n\t"
- " movq %%mm0, %%mm2 \n\t"
- " movq %%mm1, %%mm3 \n\t"
- " psrlq $16, %%mm2 \n\t"
- " psrlq $16, %%mm3 \n\t"
- " paddw %%mm2, %%mm0 \n\t"
- " paddw %%mm3, %%mm1 \n\t"
-
- " psubusw %%mm0, %%mm1 \n\t"
- " paddw %%mm0, %%mm1 \n\t" /* mm1 = max(mm1, mm0) */
- " movd %%mm1, %0 \n\t"
- " andl $0xffff, %0 \n\t"
-
- : "=m" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
- :
- : "memory"
- );
- return MaxSad;
-}
-
-static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
- ogg_uint32_t stride)
-{
- ogg_uint32_t MaxSad;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
- " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
- " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
- " mov $4, %%edi \n\t" /* 4 rows */
- "1: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
-
- " movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
-
- " dec %%edi \n\t"
- " jnz 1b \n\t"
-
- " mov $4, %%edi \n\t" /* 4 rows */
- "2: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
-
- " movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
-
- " dec %%edi \n\t"
- " jnz 2b \n\t"
-
- " psubusw %%mm6, %%mm7 \n\t"
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm7, mm6) */
- " psubusw %%mm4, %%mm5 \n\t"
- " paddw %%mm4, %%mm5 \n\t" /* mm5 = max(mm5, mm4) */
- " psubusw %%mm5, %%mm7 \n\t"
- " paddw %%mm5, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
- " movq %%mm7, %%mm6 \n\t"
- " psrlq $32, %%mm6 \n\t"
- " psubusw %%mm6, %%mm7 \n\t"
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
- " movq %%mm7, %%mm6 \n\t"
- " psrlq $16, %%mm6 \n\t"
- " psubusw %%mm6, %%mm7 \n\t"
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
- " movd %%mm7, %0 \n\t"
- " andl $0xffff, %0 \n\t"
-
- : "=r" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
- : "r" (stride)
- : "memory", "edi"
- );
-
- return MaxSad;
-}
-
-static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2)
-{
- ogg_uint32_t DiffVal;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
- " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- ".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " movq %%mm0, %%mm2 \n\t"
-
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
-
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $32, %%mm7 \n\t"
- " paddw %%mm0, %%mm7 \n\t"
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $16, %%mm7 \n\t"
- " paddw %%mm0, %%mm7 \n\t"
- " movd %%mm7, %0 \n\t"
- " andl $0xffff, %0 \n\t"
-
- : "=m" (DiffVal),
- "+r" (ptr1),
- "+r" (ptr2)
- : "r" (stride1),
- "r" (stride2)
- : "memory"
- );
-
- return DiffVal;
-}
-
-static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
- ogg_uint32_t thres)
-{
- return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
-}
-
-static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
- ogg_uint32_t thres)
-{
- ogg_uint32_t DiffVal;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pcmpeqd %%mm5, %%mm5 \n\t" /* fefefefefefefefe in mm5 */
- " paddb %%mm5, %%mm5 \n\t"
-
- " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- " mov $8, %%edi \n\t" /* 8 rows */
- "1: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
-
- " movq (%2), %%mm2 \n\t"
- " movq (%3), %%mm3 \n\t" /* take average of mm2 and mm3 */
- " movq %%mm2, %%mm1 \n\t"
- " pand %%mm3, %%mm1 \n\t"
- " pxor %%mm2, %%mm3 \n\t"
- " pand %%mm5, %%mm3 \n\t"
- " psrlq $1, %%mm3 \n\t"
- " paddb %%mm3, %%mm1 \n\t"
-
- " movq %%mm0, %%mm2 \n\t"
-
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " add %4, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %5, %2 \n\t" /* Inc pointer into ref data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
-
- " dec %%edi \n\t"
- " jnz 1b \n\t"
-
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $32, %%mm7 \n\t"
- " paddw %%mm0, %%mm7 \n\t"
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $16, %%mm7 \n\t"
- " paddw %%mm0, %%mm7 \n\t"
- " movd %%mm7, %0 \n\t"
- " andl $0xffff, %0 \n\t"
-
- : "=m" (DiffVal),
- "+r" (SrcData),
- "+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
- : "m" (SrcStride),
- "m" (RefStride)
- : "edi", "memory"
- );
-
- return DiffVal;
-}
-
-static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
-{
- ogg_uint32_t XSum;
- ogg_uint32_t XXSum;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
" pxor %%mm5, %%mm5 \n\t"
" pxor %%mm6, %%mm6 \n\t"
" pxor %%mm7, %%mm7 \n\t"
- " mov $8, %%edi \n\t"
+ " mov $8, %%rdi \n\t"
"1: \n\t"
" movq (%2), %%mm0 \n\t" /* take 8 bytes */
" movq %%mm0, %%mm2 \n\t"
@@ -434,7 +182,7 @@
" add %3, %2 \n\t" /* Inc pointer into src data */
- " dec %%edi \n\t"
+ " dec %%rdi \n\t"
" jnz 1b \n\t"
" movq %%mm5, %%mm0 \n\t"
@@ -443,9 +191,9 @@
" movq %%mm5, %%mm0 \n\t"
" psrlq $16, %%mm5 \n\t"
" paddw %%mm0, %%mm5 \n\t"
- " movd %%mm5, %%edi \n\t"
- " movsx %%di, %%edi \n\t"
- " movl %%edi, %0 \n\t"
+ " movd %%mm5, %%rdi \n\t"
+ " movsx %%di, %%rdi \n\t"
+ " mov %%rdi, %0 \n\t"
" movq %%mm7, %%mm0 \n\t"
" psrlq $32, %%mm7 \n\t"
@@ -456,7 +204,7 @@
"=r" (XXSum),
"+r" (DataPtr)
: "r" (Stride)
- : "edi", "memory"
+ : "rdi", "memory"
);
/* Compute population variance as mis-match metric. */
@@ -475,7 +223,7 @@
" pxor %%mm5, %%mm5 \n\t"
" pxor %%mm6, %%mm6 \n\t"
" pxor %%mm7, %%mm7 \n\t"
- " mov $8, %%edi \n\t"
+ " mov $8, %%rdi \n\t"
"1: \n\t"
" movq (%2), %%mm0 \n\t" /* take 8 bytes */
" movq (%3), %%mm1 \n\t"
@@ -502,7 +250,7 @@
" add %4, %2 \n\t" /* Inc pointer into src data */
" add %5, %3 \n\t" /* Inc pointer into ref data */
- " dec %%edi \n\t"
+ " dec %%rdi \n\t"
" jnz 1b \n\t"
" movq %%mm5, %%mm0 \n\t"
@@ -511,9 +259,9 @@
" movq %%mm5, %%mm0 \n\t"
" psrlq $16, %%mm5 \n\t"
" paddw %%mm0, %%mm5 \n\t"
- " movd %%mm5, %%edi \n\t"
- " movsx %%di, %%edi \n\t"
- " movl %%edi, %0 \n\t"
+ " movd %%mm5, %%rdi \n\t"
+ " movsx %%di, %%rdi \n\t"
+ " mov %%rdi, %0 \n\t"
" movq %%mm7, %%mm0 \n\t"
" psrlq $32, %%mm7 \n\t"
@@ -526,97 +274,13 @@
"+r" (RefDataPtr)
: "m" (SrcStride),
"m" (RefStride)
- : "edi", "memory"
+ : "rdi", "memory"
);
/* Compute and return population variance as mis-match metric. */
return (( (XXSum<<6) - XSum*XSum ));
}
-static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
-{
- ogg_uint32_t XSum;
- ogg_uint32_t XXSum;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4 */
- " paddb %%mm4, %%mm4 \n\t"
- " pxor %%mm5, %%mm5 \n\t"
- " pxor %%mm6, %%mm6 \n\t"
- " pxor %%mm7, %%mm7 \n\t"
- " mov $8, %%edi \n\t"
- "1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
-
- " movq (%3), %%mm2 \n\t"
- " movq (%4), %%mm3 \n\t" /* take average of mm2 and mm3 */
- " movq %%mm2, %%mm1 \n\t"
- " pand %%mm3, %%mm1 \n\t"
- " pxor %%mm2, %%mm3 \n\t"
- " pand %%mm4, %%mm3 \n\t"
- " psrlq $1, %%mm3 \n\t"
- " paddb %%mm3, %%mm1 \n\t"
-
- " movq %%mm0, %%mm2 \n\t"
- " movq %%mm1, %%mm3 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t"
- " punpcklbw %%mm6, %%mm1 \n\t"
- " punpckhbw %%mm6, %%mm2 \n\t"
- " punpckhbw %%mm6, %%mm3 \n\t"
-
- " psubsw %%mm1, %%mm0 \n\t"
- " psubsw %%mm3, %%mm2 \n\t"
-
- " paddw %%mm0, %%mm5 \n\t"
- " paddw %%mm2, %%mm5 \n\t"
-
- " pmaddwd %%mm0, %%mm0 \n\t"
- " pmaddwd %%mm2, %%mm2 \n\t"
-
- " paddd %%mm0, %%mm7 \n\t"
- " paddd %%mm2, %%mm7 \n\t"
-
- " add %5, %2 \n\t" /* Inc pointer into src data */
- " add %6, %3 \n\t" /* Inc pointer into ref data */
- " add %6, %4 \n\t" /* Inc pointer into ref data */
-
- " dec %%edi \n\t"
- " jnz 1b \n\t"
-
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $32, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $16, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movd %%mm5, %%edi \n\t"
- " movsx %%di, %%edi \n\t"
- " movl %%edi, %0 \n\t"
-
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $32, %%mm7 \n\t"
- " paddd %%mm0, %%mm7 \n\t"
- " movd %%mm7, %1 \n\t"
-
- : "=m" (XSum),
- "=m" (XXSum),
- "+r" (SrcData),
- "+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
- : "m" (SrcStride),
- "m" (RefStride)
- : "edi", "memory"
- );
-
- /* Compute and return population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ));
-}
-
static void restore_fpu (void)
{
__asm__ __volatile__ (
@@ -630,13 +294,7 @@
funcs->sub8x8 = sub8x8__mmx;
funcs->sub8x8_128 = sub8x8_128__mmx;
funcs->sub8x8avg2 = sub8x8avg2__mmx;
- funcs->row_sad8 = row_sad8__mmx;
- funcs->col_sad8x8 = col_sad8x8__mmx;
- funcs->sad8x8 = sad8x8__mmx;
- funcs->sad8x8_thres = sad8x8_thres__mmx;
- funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx;
funcs->intra8x8_err = intra8x8_err__mmx;
funcs->inter8x8_err = inter8x8_err__mmx;
- funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
}
Modified: branches/theora-mmx/lib/x86_64/dsp_mmxext.c
===================================================================
--- branches/theora-mmx/lib/i386/dsp_mmxext.c 2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/x86_64/dsp_mmxext.c 2006-05-03 22:32:17 UTC (rev 11337)
@@ -18,8 +18,8 @@
#include <stdlib.h>
#include "dsp.h"
-static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2)
+static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint64_t stride1,
+ unsigned char *ptr2, ogg_uint64_t stride2)
{
ogg_uint32_t DiffVal;
@@ -53,9 +53,9 @@
return DiffVal;
}
-static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
- ogg_uint32_t thres)
+static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint64_t stride1,
+ unsigned char *ptr2, ogg_uint64_t stride2,
+ ogg_uint64_t thres)
{
ogg_uint32_t DiffVal;
@@ -85,10 +85,10 @@
return DiffVal;
}
-static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
- ogg_uint32_t thres)
+static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint64_t SrcStride,
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint64_t RefStride,
+ ogg_uint64_t thres)
{
ogg_uint32_t DiffVal;
@@ -150,7 +150,7 @@
}
static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
- ogg_uint32_t stride)
+ ogg_uint64_t stride)
{
ogg_uint32_t MaxSad;
@@ -162,7 +162,7 @@
" pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
" pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
" pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
- " mov $4, %%edi \n\t" /* 4 rows */
+ " mov $4, %%rdi \n\t" /* 4 rows */
"1: \n\t"
" movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm1 \n\t" /* take 8 bytes */
@@ -180,10 +180,10 @@
" add %3, %1 \n\t" /* Inc pointer into the new data */
" add %3, %2 \n\t" /* Inc pointer into the new data */
- " dec %%edi \n\t"
+ " dec %%rdi \n\t"
" jnz 1b \n\t"
- " mov $4, %%edi \n\t" /* 4 rows */
+ " mov $4, %%rdi \n\t" /* 4 rows */
"2: \n\t"
" movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm1 \n\t" /* take 8 bytes */
@@ -201,7 +201,7 @@
" add %3, %1 \n\t" /* Inc pointer into the new data */
" add %3, %2 \n\t" /* Inc pointer into the new data */
- " dec %%edi \n\t"
+ " dec %%rdi \n\t"
" jnz 2b \n\t"
" pmaxsw %%mm6, %%mm7 \n\t"
@@ -220,18 +220,18 @@
"+r" (Src1),
"+r" (Src2)
: "r" (stride)
- : "memory", "edi"
+ : "memory", "rdi"
);
return MaxSad;
}
-static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
+static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint64_t SrcStride,
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint64_t RefStride)
{
- ogg_uint32_t XSum;
- ogg_uint32_t XXSum;
+ ogg_uint64_t XSum;
+ ogg_uint64_t XXSum;
__asm__ __volatile__ (
" .balign 16 \n\t"
@@ -240,7 +240,7 @@
" pxor %%mm5, %%mm5 \n\t"
" pxor %%mm6, %%mm6 \n\t"
" pxor %%mm7, %%mm7 \n\t"
- " mov $8, %%edi \n\t"
+ " mov $8, %%rdi \n\t"
"1: \n\t"
" movq (%2), %%mm0 \n\t" /* take 8 bytes */
@@ -272,7 +272,7 @@
" add %6, %3 \n\t" /* Inc pointer into ref data */
" add %6, %4 \n\t" /* Inc pointer into ref data */
- " dec %%edi \n\t"
+ " dec %%rdi \n\t"
" jnz 1b \n\t"
" movq %%mm5, %%mm0 \n\t"
@@ -297,7 +297,7 @@
"+r" (RefDataPtr2)
: "m" (SrcStride),
"m" (RefStride)
- : "edi", "memory"
+ : "rdi", "memory"
);
/* Compute and return population variance as mis-match metric. */
@@ -313,4 +313,3 @@
funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;
funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext;
}
-
Modified: branches/theora-mmx/lib/x86_64/recon_mmx.c
===================================================================
--- branches/theora-mmx/lib/i386/recon_mmx.c 2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/x86_64/recon_mmx.c 2006-05-03 22:32:17 UTC (rev 11337)
@@ -27,53 +27,53 @@
#endif
static void copy8x8__mmx (unsigned char *src,
- unsigned char *dest,
- unsigned int stride)
+ unsigned char *dest,
+ ogg_uint64_t stride)
{
__asm__ __volatile__ (
" .balign 16 \n\t"
- " lea (%2, %2, 2), %%edi \n\t"
+ " lea (%2, %2, 2), %%rdi \n\t"
" movq (%1), %%mm0 \n\t"
" movq (%1, %2), %%mm1 \n\t"
" movq (%1, %2, 2), %%mm2 \n\t"
- " movq (%1, %%edi), %%mm3 \n\t"
+ " movq (%1, %%rdi), %%mm3 \n\t"
" lea (%1, %2, 4), %1 \n\t"
" movq %%mm0, (%0) \n\t"
" movq %%mm1, (%0, %2) \n\t"
" movq %%mm2, (%0, %2, 2) \n\t"
- " movq %%mm3, (%0, %%edi) \n\t"
+ " movq %%mm3, (%0, %%rdi) \n\t"
" lea (%0, %2, 4), %0 \n\t"
" movq (%1), %%mm0 \n\t"
" movq (%1, %2), %%mm1 \n\t"
" movq (%1, %2, 2), %%mm2 \n\t"
- " movq (%1, %%edi), %%mm3 \n\t"
+ " movq (%1, %%rdi), %%mm3 \n\t"
" movq %%mm0, (%0) \n\t"
" movq %%mm1, (%0, %2) \n\t"
" movq %%mm2, (%0, %2, 2) \n\t"
- " movq %%mm3, (%0, %%edi) \n\t"
+ " movq %%mm3, (%0, %%rdi) \n\t"
: "+a" (dest)
: "c" (src),
"d" (stride)
- : "memory", "edi"
+ : "memory", "rdi"
);
}
static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
- ogg_uint32_t LineStep)
+ ogg_uint64_t LineStep)
{
__asm__ __volatile__ (
" .balign 16 \n\t"
" movq "M(V128)", %%mm0 \n\t" /* Set mm0 to 0x8080808080808080 */
- " lea 128(%1), %%edi \n\t" /* Endpoint in input buffer */
+ " lea 128(%1), %%rdi \n\t" /* Endpoint in input buffer */
"1: \n\t"
" movq (%1), %%mm2 \n\t" /* First four input values */
@@ -81,7 +81,7 @@
" por %%mm0, %%mm0 \n\t"
" pxor %%mm0, %%mm2 \n\t" /* Convert result to unsigned (same as add 128) */
" lea 16(%1), %1 \n\t" /* Step source buffer */
- " cmp %%edi, %1 \n\t" /* are we done */
+ " cmp %%rdi, %1 \n\t" /* are we done */
" movq %%mm2, (%0) \n\t" /* store results */
@@ -90,18 +90,18 @@
: "+r" (ReconPtr)
: "r" (ChangePtr),
"r" (LineStep)
- : "memory", "edi"
+ : "memory", "rdi"
);
}
static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
- ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
+ ogg_int16_t *ChangePtr, ogg_uint64_t LineStep)
{
__asm__ __volatile__ (
" .balign 16 \n\t"
" pxor %%mm0, %%mm0 \n\t"
- " lea 128(%1), %%edi \n\t"
+ " lea 128(%1), %%rdi \n\t"
"1: \n\t"
" movq (%2), %%mm2 \n\t" /* (+3 misaligned) 8 reference pixels */
@@ -116,7 +116,7 @@
" add %3, %2 \n\t" /* next row of reference pixels */
" packuswb %%mm3, %%mm2 \n\t" /* pack result to unsigned 8-bit values */
" lea 16(%1), %1 \n\t" /* next row of changes */
- " cmp %%edi, %1 \n\t" /* are we done? */
+ " cmp %%rdi, %1 \n\t" /* are we done? */
" movq %%mm2, (%0) \n\t" /* store result */
@@ -126,19 +126,19 @@
: "r" (ChangePtr),
"r" (RefPtr),
"r" (LineStep)
- : "memory", "edi"
+ : "memory", "rdi"
);
}
static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
- unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
- ogg_uint32_t LineStep)
+ unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
+ ogg_uint64_t LineStep)
{
__asm__ __volatile__ (
" .balign 16 \n\t"
" pxor %%mm0, %%mm0 \n\t"
- " lea 128(%1), %%edi \n\t"
+ " lea 128(%1), %%rdi \n\t"
"1: \n\t"
" movq (%2), %%mm2 \n\t" /* (+3 misaligned) 8 reference pixels */
@@ -164,14 +164,14 @@
" add %4, %3 \n\t" /* next row of reference pixels */
" movq %%mm2, (%0) \n\t" /* store result */
" add %4, %0 \n\t" /* next row of output */
- " cmp %%edi, %1 \n\t" /* are we done? */
+ " cmp %%rdi, %1 \n\t" /* are we done? */
" jc 1b \n\t"
: "+r" (ReconPtr)
: "r" (ChangePtr),
"r" (RefPtr1),
"r" (RefPtr2),
"m" (LineStep)
- : "memory", "edi"
+ : "memory", "rdi"
);
}
More information about the commits mailing list