[xiph-commits] r11338 - in branches/theora-mmx: . lib lib/x86_32
j at svn.xiph.org
j at svn.xiph.org
Wed May 3 16:31:11 PDT 2006
Author: j
Date: 2006-05-03 16:31:00 -0700 (Wed, 03 May 2006)
New Revision: 11338
Added:
branches/theora-mmx/lib/x86_32/
branches/theora-mmx/lib/x86_32/dsp_mmx.c
branches/theora-mmx/lib/x86_32/dsp_mmxext.c
branches/theora-mmx/lib/x86_32/fdct_mmx.c
branches/theora-mmx/lib/x86_32/recon_mmx.c
Removed:
branches/theora-mmx/lib/i386/
branches/theora-mmx/lib/x86_32/dsp_mmx.c
branches/theora-mmx/lib/x86_32/dsp_mmxext.c
branches/theora-mmx/lib/x86_32/fdct_mmx.c
branches/theora-mmx/lib/x86_32/recon_mmx.c
Modified:
branches/theora-mmx/configure.ac
branches/theora-mmx/lib/Makefile.am
branches/theora-mmx/lib/cpu.c
branches/theora-mmx/lib/dct.c
branches/theora-mmx/lib/dsp.c
branches/theora-mmx/lib/dsp.h
branches/theora-mmx/lib/reconstruct.c
Log:
- move i386 -> x86_32
- make mmx optional (also compiles on ppc now, like trunk)
Modified: branches/theora-mmx/configure.ac
===================================================================
--- branches/theora-mmx/configure.ac 2006-05-03 22:32:17 UTC (rev 11337)
+++ branches/theora-mmx/configure.ac 2006-05-03 23:31:00 UTC (rev 11338)
@@ -102,12 +102,21 @@
fi
CFLAGS="$CFLAGS $cflags_save"
+cpu_optimization="no optimization for your platform, please send a patch"
cpu_x86_64=no
-case $target in
- x86_64-*)
- cpu_x86_64=yes ;;
+cpu_x86_32=no
+case $target_cpu in
+ i[[3456]]86)
+ cpu_x86_32=yes
+ cpu_optimization="32bit x86"
+ ;;
+ x86_64)
+ cpu_x86_64=yes
+ cpu_optimization="64bit x86"
+ ;;
esac
AM_CONDITIONAL([CPU_x86_64], [test x$cpu_x86_64 = xyes])
+AM_CONDITIONAL([CPU_x86_32], [test x$cpu_x86_32 = xyes])
# Test whenever ld supports -version-script
AC_PROG_LD
@@ -295,8 +304,9 @@
General configuration:
- Encoding support: ............ ${ac_enable_encode}
- Floating point support: ...... ${ac_enable_float}
+ Encoding support: ........... ${ac_enable_encode}
+ Floating point support: ..... ${ac_enable_float}
+ Assembly optimization: ...... ${cpu_optimization}
Installation paths:
Modified: branches/theora-mmx/lib/Makefile.am
===================================================================
--- branches/theora-mmx/lib/Makefile.am 2006-05-03 22:32:17 UTC (rev 11337)
+++ branches/theora-mmx/lib/Makefile.am 2006-05-03 23:31:00 UTC (rev 11338)
@@ -1,10 +1,10 @@
INCLUDES = -I$(top_srcdir)/include
EXTRA_DIST = Version_script.in \
- i386/dsp_mmx.c \
- i386/dsp_mmxext.c \
- i386/recon_mmx.c \
- i386/fdct_mmx.c \
+ x86_32/dsp_mmx.c \
+ x86_32/dsp_mmxext.c \
+ x86_32/recon_mmx.c \
+ x86_32/fdct_mmx.c \
x86_64/dsp_mmx.c \
x86_64/dsp_mmxext.c \
x86_64/recon_mmx.c \
@@ -20,10 +20,23 @@
if CPU_x86_64
arch_dir = x86_64
+arch_sources= \
+ $(arch_dir)/dsp_mmx.c \
+ $(arch_dir)/dsp_mmxext.c \
+ $(arch_dir)/recon_mmx.c \
+ $(arch_dir)/fdct_mmx.c
else
-arch_dir = i386
+if CPU_x86_32
+arch_dir = x86_32
+arch_sources= \
+ $(arch_dir)/dsp_mmx.c \
+ $(arch_dir)/dsp_mmxext.c \
+ $(arch_dir)/recon_mmx.c \
+ $(arch_dir)/fdct_mmx.c
endif
+endif
+
libtheora_la_SOURCES = \
blockmap.c \
comment.c \
@@ -44,10 +57,7 @@
toplevel.c \
cpu.c \
dsp.c \
- $(arch_dir)/dsp_mmx.c \
- $(arch_dir)/dsp_mmxext.c \
- $(arch_dir)/recon_mmx.c \
- $(arch_dir)/fdct_mmx.c \
+ $(arch_sources) \
$(encoder_sources)
noinst_HEADERS = \
Modified: branches/theora-mmx/lib/cpu.c
===================================================================
--- branches/theora-mmx/lib/cpu.c 2006-05-03 22:32:17 UTC (rev 11337)
+++ branches/theora-mmx/lib/cpu.c 2006-05-03 23:31:00 UTC (rev 11338)
@@ -33,7 +33,7 @@
"=d" (*edx)
: "a" (op)
: "cc");
-#else
+#elif defined(__x86)
asm volatile ("pushl %%ebx \n\t"
"cpuid \n\t"
"movl %%ebx,%1 \n\t"
@@ -53,11 +53,11 @@
ogg_uint32_t eax, ebx, ecx, edx;
ogg_uint32_t flags;
-# if defined(__x86_64__)
+#if defined(__x86_64__)
/* no need to check, we have cpuid on x86_64 */
-#else /* assume i386 */
+#elif defined(__x86)
asm volatile ("pushfl \n\t"
"pushfl \n\t"
"popl %0 \n\t"
Modified: branches/theora-mmx/lib/dct.c
===================================================================
--- branches/theora-mmx/lib/dct.c 2006-05-03 22:32:17 UTC (rev 11337)
+++ branches/theora-mmx/lib/dct.c 2006-05-03 23:31:00 UTC (rev 11338)
@@ -256,8 +256,10 @@
void dsp_dct_init (DspFunctions *funcs)
{
funcs->fdct_short = fdct_short__c;
+#if defined(__x86)
if (cpu_flags & CPU_X86_MMX) {
dsp_mmx_fdct_init(&dsp_funcs);
}
+#endif
}
Modified: branches/theora-mmx/lib/dsp.c
===================================================================
--- branches/theora-mmx/lib/dsp.c 2006-05-03 22:32:17 UTC (rev 11337)
+++ branches/theora-mmx/lib/dsp.c 2006-05-03 23:31:00 UTC (rev 11338)
@@ -405,13 +405,16 @@
{
cpu_init ();
dsp_init (&dsp_funcs);
+
dsp_recon_init (&dsp_funcs);
dsp_dct_init (&dsp_funcs);
+#if defined(__x86)
if (cpu_flags & CPU_X86_MMX) {
dsp_mmx_init(&dsp_funcs);
}
if (cpu_flags & CPU_X86_MMXEXT) {
dsp_mmxext_init(&dsp_funcs);
}
+#endif
}
Modified: branches/theora-mmx/lib/dsp.h
===================================================================
--- branches/theora-mmx/lib/dsp.h 2006-05-03 22:32:17 UTC (rev 11337)
+++ branches/theora-mmx/lib/dsp.h 2006-05-03 23:31:00 UTC (rev 11338)
@@ -85,11 +85,13 @@
extern void dsp_recon_init (DspFunctions *funcs);
void dsp_init(DspFunctions *funcs);
-void dsp_mmx_init(DspFunctions *funcs);
-void dsp_mmxext_init(DspFunctions *funcs);
-void dsp_mmx_fdct_init(DspFunctions *funcs);
-void dsp_mmx_recon_init(DspFunctions *funcs);
void dsp_static_init(void);
+#if defined(__x86)
+extern void dsp_mmx_init(DspFunctions *funcs);
+extern void dsp_mmxext_init(DspFunctions *funcs);
+extern void dsp_mmx_fdct_init(DspFunctions *funcs);
+extern void dsp_mmx_recon_init(DspFunctions *funcs);
+#endif
#define dsp_save_fpu(funcs) (funcs.save_fpu ())
#define dsp_static_save_fpu() dsp_save_fpu(dsp_funcs)
Modified: branches/theora-mmx/lib/reconstruct.c
===================================================================
--- branches/theora-mmx/lib/reconstruct.c 2006-05-03 22:32:17 UTC (rev 11337)
+++ branches/theora-mmx/lib/reconstruct.c 2006-05-03 23:31:00 UTC (rev 11338)
@@ -104,7 +104,9 @@
funcs->recon_intra8x8 = recon_intra8x8__c;
funcs->recon_inter8x8 = recon_inter8x8__c;
funcs->recon_inter8x8_half = recon_inter8x8_half__c;
+#if defined(__x86)
if (cpu_flags & CPU_X86_MMX) {
dsp_mmx_recon_init(&dsp_funcs);
}
+#endif
}
Copied: branches/theora-mmx/lib/x86_32 (from rev 11336, branches/theora-mmx/lib/i386)
Deleted: branches/theora-mmx/lib/x86_32/dsp_mmx.c
===================================================================
--- branches/theora-mmx/lib/i386/dsp_mmx.c 2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/x86_32/dsp_mmx.c 2006-05-03 23:31:00 UTC (rev 11338)
@@ -1,642 +0,0 @@
-/********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
- * *
- ********************************************************************
-
- function:
- last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include <stdlib.h>
-#include "dsp.h"
-
-static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x0080008000800080LL;
-
-#if defined(__MINGW32__) || defined(__CYGWIN__) || \
- defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
-# define M(a) "_" #a
-#else
-# define M(a) #a
-#endif
-
-#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
-#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
-#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
-
-static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
- ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
-{
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm7, %%mm7 \n\t"
-
- ".rept 8 \n\t"
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */
- /* start calculation */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */
- " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */
- " movq %%mm0, (%2) \n\t" /* write answer out */
- " movq %%mm2, 8(%2) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %2 \n\t"
- " add %3, %0 \n\t"
- " add %4, %1 \n\t"
- ".endr \n\t"
-
- : "+r" (FiltPtr),
- "+r" (ReconPtr),
- "+r" (DctInputPtr)
- : "m" (PixelsPerLine),
- "m" (ReconPixelsPerLine)
- : "memory"
- );
-}
-
-static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
- ogg_uint32_t PixelsPerLine)
-{
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm7, %%mm7 \n\t"
- " movq "M(V128)", %%mm1 \n\t"
-
- ".rept 8 \n\t"
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */
- " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */
- " movq %%mm0, (%1) \n\t" /* write answer out */
- " movq %%mm2, 8(%1) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %1 \n\t"
- " add %2, %0 \n\t"
- ".endr \n\t"
-
- : "+r" (FiltPtr),
- "+r" (DctInputPtr)
- : "m" (PixelsPerLine)
- : "memory"
- );
-}
-
-static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
- unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
- ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
-{
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm7, %%mm7 \n\t"
-
- ".rept 8 \n\t"
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr1 */
- " movq (%2), %%mm4 \n\t" /* mm1 = ReconPtr2 */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
- " movq %%mm4, %%mm5 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr1) */
- " punpcklbw %%mm7, %%mm4 \n\t" /* mm1 = INT16(ReconPtr2) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr1) */
- " punpckhbw %%mm7, %%mm5 \n\t" /* mm3 = INT16(ReconPtr2) */
- /* average ReconPtr1 and ReconPtr2 */
- " paddw %%mm4, %%mm1 \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */
- " paddw %%mm5, %%mm3 \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */
- " psrlw $1, %%mm1 \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- " psrlw $1, %%mm3 \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- " movq %%mm0, (%3) \n\t" /* write answer out */
- " movq %%mm2, 8(%3) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %3 \n\t"
- " add %4, %0 \n\t"
- " add %5, %1 \n\t"
- " add %5, %2 \n\t"
- ".endr \n\t"
-
- : "+r" (FiltPtr),
- "+r" (ReconPtr1),
- "+r" (ReconPtr2),
- "+r" (DctInputPtr)
- : "m" (PixelsPerLine),
- "m" (ReconPixelsPerLine)
- : "memory"
- );
-}
-
-static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
-{
- ogg_uint32_t MaxSad;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
- " pxor %%mm7, %%mm7 \n\t" /* zero out mm7 for unpack */
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
-
- " movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
-
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t" /* ; unpack low four bytes to higher precision */
- " punpckhbw %%mm7, %%mm1 \n\t" /* ; unpack high four bytes to higher precision */
-
- " movq %%mm0, %%mm2 \n\t"
- " movq %%mm1, %%mm3 \n\t"
- " psrlq $32, %%mm2 \n\t" /* fold and add */
- " psrlq $32, %%mm3 \n\t"
- " paddw %%mm2, %%mm0 \n\t"
- " paddw %%mm3, %%mm1 \n\t"
- " movq %%mm0, %%mm2 \n\t"
- " movq %%mm1, %%mm3 \n\t"
- " psrlq $16, %%mm2 \n\t"
- " psrlq $16, %%mm3 \n\t"
- " paddw %%mm2, %%mm0 \n\t"
- " paddw %%mm3, %%mm1 \n\t"
-
- " psubusw %%mm0, %%mm1 \n\t"
- " paddw %%mm0, %%mm1 \n\t" /* mm1 = max(mm1, mm0) */
- " movd %%mm1, %0 \n\t"
- " andl $0xffff, %0 \n\t"
-
- : "=m" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
- :
- : "memory"
- );
- return MaxSad;
-}
-
-static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
- ogg_uint32_t stride)
-{
- ogg_uint32_t MaxSad;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
- " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
- " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
- " mov $4, %%edi \n\t" /* 4 rows */
- "1: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
-
- " movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
-
- " dec %%edi \n\t"
- " jnz 1b \n\t"
-
- " mov $4, %%edi \n\t" /* 4 rows */
- "2: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
-
- " movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
-
- " dec %%edi \n\t"
- " jnz 2b \n\t"
-
- " psubusw %%mm6, %%mm7 \n\t"
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm7, mm6) */
- " psubusw %%mm4, %%mm5 \n\t"
- " paddw %%mm4, %%mm5 \n\t" /* mm5 = max(mm5, mm4) */
- " psubusw %%mm5, %%mm7 \n\t"
- " paddw %%mm5, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
- " movq %%mm7, %%mm6 \n\t"
- " psrlq $32, %%mm6 \n\t"
- " psubusw %%mm6, %%mm7 \n\t"
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
- " movq %%mm7, %%mm6 \n\t"
- " psrlq $16, %%mm6 \n\t"
- " psubusw %%mm6, %%mm7 \n\t"
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
- " movd %%mm7, %0 \n\t"
- " andl $0xffff, %0 \n\t"
-
- : "=r" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
- : "r" (stride)
- : "memory", "edi"
- );
-
- return MaxSad;
-}
-
-static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2)
-{
- ogg_uint32_t DiffVal;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
- " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- ".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " movq %%mm0, %%mm2 \n\t"
-
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
-
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $32, %%mm7 \n\t"
- " paddw %%mm0, %%mm7 \n\t"
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $16, %%mm7 \n\t"
- " paddw %%mm0, %%mm7 \n\t"
- " movd %%mm7, %0 \n\t"
- " andl $0xffff, %0 \n\t"
-
- : "=m" (DiffVal),
- "+r" (ptr1),
- "+r" (ptr2)
- : "r" (stride1),
- "r" (stride2)
- : "memory"
- );
-
- return DiffVal;
-}
-
-static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
- ogg_uint32_t thres)
-{
- return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
-}
-
-static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
- ogg_uint32_t thres)
-{
- ogg_uint32_t DiffVal;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pcmpeqd %%mm5, %%mm5 \n\t" /* fefefefefefefefe in mm5 */
- " paddb %%mm5, %%mm5 \n\t"
-
- " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- " mov $8, %%edi \n\t" /* 8 rows */
- "1: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
-
- " movq (%2), %%mm2 \n\t"
- " movq (%3), %%mm3 \n\t" /* take average of mm2 and mm3 */
- " movq %%mm2, %%mm1 \n\t"
- " pand %%mm3, %%mm1 \n\t"
- " pxor %%mm2, %%mm3 \n\t"
- " pand %%mm5, %%mm3 \n\t"
- " psrlq $1, %%mm3 \n\t"
- " paddb %%mm3, %%mm1 \n\t"
-
- " movq %%mm0, %%mm2 \n\t"
-
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " add %4, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %5, %2 \n\t" /* Inc pointer into ref data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
-
- " dec %%edi \n\t"
- " jnz 1b \n\t"
-
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $32, %%mm7 \n\t"
- " paddw %%mm0, %%mm7 \n\t"
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $16, %%mm7 \n\t"
- " paddw %%mm0, %%mm7 \n\t"
- " movd %%mm7, %0 \n\t"
- " andl $0xffff, %0 \n\t"
-
- : "=m" (DiffVal),
- "+r" (SrcData),
- "+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
- : "m" (SrcStride),
- "m" (RefStride)
- : "edi", "memory"
- );
-
- return DiffVal;
-}
-
-static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
-{
- ogg_uint32_t XSum;
- ogg_uint32_t XXSum;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm5, %%mm5 \n\t"
- " pxor %%mm6, %%mm6 \n\t"
- " pxor %%mm7, %%mm7 \n\t"
- " mov $8, %%edi \n\t"
- "1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
- " movq %%mm0, %%mm2 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t"
- " punpckhbw %%mm6, %%mm2 \n\t"
-
- " paddw %%mm0, %%mm5 \n\t"
- " paddw %%mm2, %%mm5 \n\t"
-
- " pmaddwd %%mm0, %%mm0 \n\t"
- " pmaddwd %%mm2, %%mm2 \n\t"
-
- " paddd %%mm0, %%mm7 \n\t"
- " paddd %%mm2, %%mm7 \n\t"
-
- " add %3, %2 \n\t" /* Inc pointer into src data */
-
- " dec %%edi \n\t"
- " jnz 1b \n\t"
-
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $32, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $16, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movd %%mm5, %%edi \n\t"
- " movsx %%di, %%edi \n\t"
- " movl %%edi, %0 \n\t"
-
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $32, %%mm7 \n\t"
- " paddd %%mm0, %%mm7 \n\t"
- " movd %%mm7, %1 \n\t"
-
- : "=r" (XSum),
- "=r" (XXSum),
- "+r" (DataPtr)
- : "r" (Stride)
- : "edi", "memory"
- );
-
- /* Compute population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ) );
-}
-
-static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr, ogg_uint32_t RefStride)
-{
- ogg_uint32_t XSum;
- ogg_uint32_t XXSum;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm5, %%mm5 \n\t"
- " pxor %%mm6, %%mm6 \n\t"
- " pxor %%mm7, %%mm7 \n\t"
- " mov $8, %%edi \n\t"
- "1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
- " movq (%3), %%mm1 \n\t"
- " movq %%mm0, %%mm2 \n\t"
- " movq %%mm1, %%mm3 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t"
- " punpcklbw %%mm6, %%mm1 \n\t"
- " punpckhbw %%mm6, %%mm2 \n\t"
- " punpckhbw %%mm6, %%mm3 \n\t"
-
- " psubsw %%mm1, %%mm0 \n\t"
- " psubsw %%mm3, %%mm2 \n\t"
-
- " paddw %%mm0, %%mm5 \n\t"
- " paddw %%mm2, %%mm5 \n\t"
-
- " pmaddwd %%mm0, %%mm0 \n\t"
- " pmaddwd %%mm2, %%mm2 \n\t"
-
- " paddd %%mm0, %%mm7 \n\t"
- " paddd %%mm2, %%mm7 \n\t"
-
- " add %4, %2 \n\t" /* Inc pointer into src data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
-
- " dec %%edi \n\t"
- " jnz 1b \n\t"
-
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $32, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $16, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movd %%mm5, %%edi \n\t"
- " movsx %%di, %%edi \n\t"
- " movl %%edi, %0 \n\t"
-
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $32, %%mm7 \n\t"
- " paddd %%mm0, %%mm7 \n\t"
- " movd %%mm7, %1 \n\t"
-
- : "=m" (XSum),
- "=m" (XXSum),
- "+r" (SrcData),
- "+r" (RefDataPtr)
- : "m" (SrcStride),
- "m" (RefStride)
- : "edi", "memory"
- );
-
- /* Compute and return population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ));
-}
-
-static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
-{
- ogg_uint32_t XSum;
- ogg_uint32_t XXSum;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4 */
- " paddb %%mm4, %%mm4 \n\t"
- " pxor %%mm5, %%mm5 \n\t"
- " pxor %%mm6, %%mm6 \n\t"
- " pxor %%mm7, %%mm7 \n\t"
- " mov $8, %%edi \n\t"
- "1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
-
- " movq (%3), %%mm2 \n\t"
- " movq (%4), %%mm3 \n\t" /* take average of mm2 and mm3 */
- " movq %%mm2, %%mm1 \n\t"
- " pand %%mm3, %%mm1 \n\t"
- " pxor %%mm2, %%mm3 \n\t"
- " pand %%mm4, %%mm3 \n\t"
- " psrlq $1, %%mm3 \n\t"
- " paddb %%mm3, %%mm1 \n\t"
-
- " movq %%mm0, %%mm2 \n\t"
- " movq %%mm1, %%mm3 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t"
- " punpcklbw %%mm6, %%mm1 \n\t"
- " punpckhbw %%mm6, %%mm2 \n\t"
- " punpckhbw %%mm6, %%mm3 \n\t"
-
- " psubsw %%mm1, %%mm0 \n\t"
- " psubsw %%mm3, %%mm2 \n\t"
-
- " paddw %%mm0, %%mm5 \n\t"
- " paddw %%mm2, %%mm5 \n\t"
-
- " pmaddwd %%mm0, %%mm0 \n\t"
- " pmaddwd %%mm2, %%mm2 \n\t"
-
- " paddd %%mm0, %%mm7 \n\t"
- " paddd %%mm2, %%mm7 \n\t"
-
- " add %5, %2 \n\t" /* Inc pointer into src data */
- " add %6, %3 \n\t" /* Inc pointer into ref data */
- " add %6, %4 \n\t" /* Inc pointer into ref data */
-
- " dec %%edi \n\t"
- " jnz 1b \n\t"
-
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $32, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $16, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movd %%mm5, %%edi \n\t"
- " movsx %%di, %%edi \n\t"
- " movl %%edi, %0 \n\t"
-
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $32, %%mm7 \n\t"
- " paddd %%mm0, %%mm7 \n\t"
- " movd %%mm7, %1 \n\t"
-
- : "=m" (XSum),
- "=m" (XXSum),
- "+r" (SrcData),
- "+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
- : "m" (SrcStride),
- "m" (RefStride)
- : "edi", "memory"
- );
-
- /* Compute and return population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ));
-}
-
-static void restore_fpu (void)
-{
- __asm__ __volatile__ (
- " emms \n\t"
- );
-}
-
-void dsp_i386_mmx_init(DspFunctions *funcs)
-{
- funcs->restore_fpu = restore_fpu;
- funcs->sub8x8 = sub8x8__mmx;
- funcs->sub8x8_128 = sub8x8_128__mmx;
- funcs->sub8x8avg2 = sub8x8avg2__mmx;
- funcs->row_sad8 = row_sad8__mmx;
- funcs->col_sad8x8 = col_sad8x8__mmx;
- funcs->sad8x8 = sad8x8__mmx;
- funcs->sad8x8_thres = sad8x8_thres__mmx;
- funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx;
- funcs->intra8x8_err = intra8x8_err__mmx;
- funcs->inter8x8_err = inter8x8_err__mmx;
- funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
-}
-
Copied: branches/theora-mmx/lib/x86_32/dsp_mmx.c (from rev 11337, branches/theora-mmx/lib/i386/dsp_mmx.c)
Deleted: branches/theora-mmx/lib/x86_32/dsp_mmxext.c
===================================================================
--- branches/theora-mmx/lib/i386/dsp_mmxext.c 2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/x86_32/dsp_mmxext.c 2006-05-03 23:31:00 UTC (rev 11338)
@@ -1,316 +0,0 @@
-/********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
- * *
- ********************************************************************
-
- function:
- last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include <stdlib.h>
-#include "dsp.h"
-
-static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2)
-{
- ogg_uint32_t DiffVal;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
-
- ".rept 7 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
-
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " movd %%mm7, %0 \n\t"
-
- : "=r" (DiffVal),
- "+r" (ptr1),
- "+r" (ptr2)
- : "r" (stride1),
- "r" (stride2)
- : "memory"
- );
-
- return DiffVal;
-}
-
-static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
- ogg_uint32_t thres)
-{
- ogg_uint32_t DiffVal;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
-
- ".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
-
- " movd %%mm7, %0 \n\t"
-
- : "=r" (DiffVal),
- "+r" (ptr1),
- "+r" (ptr2)
- : "r" (stride1),
- "r" (stride2)
- : "memory"
- );
-
- return DiffVal;
-}
-
-static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
- ogg_uint32_t thres)
-{
- ogg_uint32_t DiffVal;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- ".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " movq (%3), %%mm2 \n\t"
- " pavgb %%mm2, %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
-
- " add %4, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %5, %2 \n\t" /* Inc pointer into ref data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
-
- " movd %%mm7, %0 \n\t"
- : "=m" (DiffVal),
- "+r" (SrcData),
- "+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
- : "m" (SrcStride),
- "m" (RefStride)
- : "memory"
- );
-
- return DiffVal;
-}
-
-static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
-{
- ogg_uint32_t MaxSad;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " movd (%1), %%mm0 \n\t"
- " movd (%2), %%mm1 \n\t"
- " psadbw %%mm0, %%mm1 \n\t"
- " movd 4(%1), %%mm2 \n\t"
- " movd 4(%2), %%mm3 \n\t"
- " psadbw %%mm2, %%mm3 \n\t"
-
- " pmaxsw %%mm1, %%mm3 \n\t"
- " movd %%mm3, %0 \n\t"
- " andl $0xffff, %0 \n\t"
-
- : "=m" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
- :
- : "memory"
- );
-
- return MaxSad;
-}
-
-static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
- ogg_uint32_t stride)
-{
- ogg_uint32_t MaxSad;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
- " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
- " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
- " mov $4, %%edi \n\t" /* 4 rows */
- "1: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
-
- " movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
-
- " dec %%edi \n\t"
- " jnz 1b \n\t"
-
- " mov $4, %%edi \n\t" /* 4 rows */
- "2: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
-
- " movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
-
- " dec %%edi \n\t"
- " jnz 2b \n\t"
-
- " pmaxsw %%mm6, %%mm7 \n\t"
- " pmaxsw %%mm4, %%mm5 \n\t"
- " pmaxsw %%mm5, %%mm7 \n\t"
- " movq %%mm7, %%mm6 \n\t"
- " psrlq $32, %%mm6 \n\t"
- " pmaxsw %%mm6, %%mm7 \n\t"
- " movq %%mm7, %%mm6 \n\t"
- " psrlq $16, %%mm6 \n\t"
- " pmaxsw %%mm6, %%mm7 \n\t"
- " movd %%mm7, %0 \n\t"
- " andl $0xffff, %0 \n\t"
-
- : "=r" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
- : "r" (stride)
- : "memory", "edi"
- );
-
- return MaxSad;
-}
-
-static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
-{
- ogg_uint32_t XSum;
- ogg_uint32_t XXSum;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm4, %%mm4 \n\t"
- " pxor %%mm5, %%mm5 \n\t"
- " pxor %%mm6, %%mm6 \n\t"
- " pxor %%mm7, %%mm7 \n\t"
- " mov $8, %%edi \n\t"
- "1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
-
- " movq (%3), %%mm2 \n\t"
- " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */
- " pavgb %%mm2, %%mm1 \n\t"
-
- " movq %%mm0, %%mm2 \n\t"
- " movq %%mm1, %%mm3 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t"
- " punpcklbw %%mm4, %%mm1 \n\t"
- " punpckhbw %%mm6, %%mm2 \n\t"
- " punpckhbw %%mm4, %%mm3 \n\t"
-
- " psubsw %%mm1, %%mm0 \n\t"
- " psubsw %%mm3, %%mm2 \n\t"
-
- " paddw %%mm0, %%mm5 \n\t"
- " paddw %%mm2, %%mm5 \n\t"
-
- " pmaddwd %%mm0, %%mm0 \n\t"
- " pmaddwd %%mm2, %%mm2 \n\t"
-
- " paddd %%mm0, %%mm7 \n\t"
- " paddd %%mm2, %%mm7 \n\t"
-
- " add %5, %2 \n\t" /* Inc pointer into src data */
- " add %6, %3 \n\t" /* Inc pointer into ref data */
- " add %6, %4 \n\t" /* Inc pointer into ref data */
-
- " dec %%edi \n\t"
- " jnz 1b \n\t"
-
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $32, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $16, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movd %%mm5, %%edi \n\t"
- " movsx %%di, %%edi \n\t"
- " movl %%edi, %0 \n\t"
-
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $32, %%mm7 \n\t"
- " paddd %%mm0, %%mm7 \n\t"
- " movd %%mm7, %1 \n\t"
-
- : "=m" (XSum),
- "=m" (XXSum),
- "+r" (SrcData),
- "+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
- : "m" (SrcStride),
- "m" (RefStride)
- : "edi", "memory"
- );
-
- /* Compute and return population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ));
-}
-
-void dsp_i386_mmxext_init(DspFunctions *funcs)
-{
- funcs->row_sad8 = row_sad8__mmxext;
- funcs->col_sad8x8 = col_sad8x8__mmxext;
- funcs->sad8x8 = sad8x8__mmxext;
- funcs->sad8x8_thres = sad8x8_thres__mmxext;
- funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;
- funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext;
-}
-
Copied: branches/theora-mmx/lib/x86_32/dsp_mmxext.c (from rev 11337, branches/theora-mmx/lib/i386/dsp_mmxext.c)
Deleted: branches/theora-mmx/lib/x86_32/fdct_mmx.c
===================================================================
--- branches/theora-mmx/lib/i386/fdct_mmx.c 2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/x86_32/fdct_mmx.c 2006-05-03 23:31:00 UTC (rev 11338)
@@ -1,340 +0,0 @@
-;//==========================================================================
-;//
-;// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
-;// KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-;// IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR
-;// PURPOSE.
-;//
-;// Copyright (c) 1999 - 2001 On2 Technologies Inc. All Rights Reserved.
-;//
-;//--------------------------------------------------------------------------
-
-#include <theora/theora.h>
-#include "dsp.h"
-/*
- * Multiplier constants for the forward DCT macros below. Each 64-bit value
- * repeats a single 16-bit word in all four lanes. The word is approximately
- * 65536 * cos(k * pi / 16) for the k in the name "xCkS(8-k)", for example
- * xC4S4: 0xb505 = 46341 ~ 65536 * 0.70711. Words with the top bit set
- * (xC1S7 .. xC5S3) are negative when read as signed 16-bit values, which is
- * why the pmulhw results that use them get the operand added back.
- * The constants are referenced by symbol name from the asm text (see M()),
- * presumably the reason for the `used` attribute.
- */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC1S7 = 0x0fb15fb15fb15fb15LL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC2S6 = 0x0ec83ec83ec83ec83LL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC4S4 = 0x0b505b505b505b505LL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC6S2 = 0x061f861f861f861f8LL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC7S1 = 0x031f131f131f131f1LL;
-/*
- * M(a) stringifies the identifier `a` for use as a symbol name inside the
- * asm templates. On MinGW, Cygwin, OS/2 and non-ELF OpenBSD the string is
- * prefixed with an underscore; everywhere else it is the bare name.
- */
-#if defined(__MINGW32__) || defined(__CYGWIN__) || \
- defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
-# define M(a) "_" #a
-#else
-# define M(a) #a
-#endif
-
-/***********************************************************************
- * File: fdct_m.asm
- *
- * Description:
- * This function performs a 2-D forward DCT on an 8x8 block
- *
- *
- * Input: Pointers to input source data buffer and destination
- * buffer.
- *
- * Note: none
- *
- * Special Notes: We try to do the truncation right to match the result
- * of the c version.
- *
- ************************************************************************/
-
-/*
- * Execute stage 1 of forward DCT.
- *
- * Expands to the asm text for one 1-D pass over eight memory operands
- * ip0..ip7, each accessed with movq (four 16-bit words). The operands are
- * read as input and overwritten in place with the results. `temp` is an
- * 8-byte scratch memory operand. All of mm0-mm7 are used.
- *
- * Notes on the comments below:
- * - "c * x - x" after a pmulhw reflects that the constant's top bit is set,
- * so the signed multiply yields the product minus x; x is added back
- * with the following paddw.
- * - Each "psrlw $15" isolates the sign bit of a value; adding it afterwards
- * is the "Truncated" correction (the file header says this is done to
- * match the C version).
- */
-#define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp) \
- " movq " #ip0 ", %%mm0 \n\t" /* mm0 = ip0 */ \
- " movq " #ip1 ", %%mm1 \n\t" /* mm1 = ip1 */ \
- " movq " #ip3 ", %%mm2 \n\t" /* mm2 = ip3 */ \
- " movq " #ip5 ", %%mm3 \n\t" /* mm3 = ip5 */ \
- " movq %%mm0, %%mm4 \n\t" /* mm4 = ip0 */ \
- " movq %%mm1, %%mm5 \n\t" /* mm5 = ip1 */ \
- " movq %%mm2, %%mm6 \n\t" /* mm6 = ip3 */ \
- " movq %%mm3, %%mm7 \n\t" /* mm7 = ip5 */ \
- \
- " paddsw " #ip7 ", %%mm0 \n\t" /* mm0 = ip0 + ip7 = is07 */ \
- " paddsw " #ip2 ", %%mm1 \n\t" /* mm1 = ip1 + ip2 = is12 */ \
- " paddsw " #ip4 ", %%mm2 \n\t" /* mm2 = ip3 + ip4 = is34 */ \
- " paddsw " #ip6 ", %%mm3 \n\t" /* mm3 = ip5 + ip6 = is56 */ \
- " psubsw " #ip7 ", %%mm4 \n\t" /* mm4 = ip0 - ip7 = id07 */ \
- " psubsw " #ip2 ", %%mm5 \n\t" /* mm5 = ip1 - ip2 = id12 */ \
- \
- " psubsw %%mm2, %%mm0 \n\t" /* mm0 = is07 - is34 */ \
- \
- " paddsw %%mm2, %%mm2 \n\t" /* mm2 = 2 * is34 */ \
- \
- " psubsw " #ip4 ", %%mm6 \n\t" /* mm6 = ip3 - ip4 = id34 */ \
- \
- " paddsw %%mm0, %%mm2 \n\t" /* mm2 = is07 + is34 = is0734 */ \
- " psubsw %%mm3, %%mm1 \n\t" /* mm1 = is12 - is56 */ \
- " movq %%mm0," #temp " \n\t" /* Save is07 - is34 to free mm0; */ \
- " paddsw %%mm3, %%mm3 \n\t" /* mm3 = 2 * is56 */ \
- " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + is56 = is1256 */ \
- \
- " psubsw " #ip6 ", %%mm7 \n\t" /* mm7 = ip5 - ip6 = id56 */ \
- /* ------------------------------------------------------------------- */ \
- " psubsw %%mm7, %%mm5 \n\t" /* mm5 = id12 - id56 */ \
- " paddsw %%mm7, %%mm7 \n\t" /* mm7 = 2 * id56 */ \
- " paddsw %%mm5, %%mm7 \n\t" /* mm7 = id12 + id56 */ \
- /* ------------------------------------------------------------------- */ \
- " psubsw %%mm3, %%mm2 \n\t" /* mm2 = is0734 - is1256 */ \
- " paddsw %%mm3, %%mm3 \n\t" /* mm3 = 2 * is1256 */ \
- \
- " movq %%mm2, %%mm0 \n\t" /* make a copy */ \
- " paddsw %%mm2, %%mm3 \n\t" /* mm3 = is0734 + is1256 */ \
- \
- " pmulhw "M(xC4S4)", %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \
- " paddw %%mm2, %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \
- " psrlw $15, %%mm2 \n\t" /* mm2 = sign bit of ( is0734 - is1256 ) */ \
- " paddw %%mm2, %%mm0 \n\t" /* Truncate mm0, now it is op[4] */ \
- \
- " movq %%mm3, %%mm2 \n\t" \
- " movq %%mm0," #ip4 " \n\t" /* save ip4, now mm0,mm2 are free */ \
- \
- " movq %%mm3, %%mm0 \n\t" \
- " pmulhw "M(xC4S4)", %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
- \
- " psrlw $15, %%mm2 \n\t" /* mm2 = sign bit of ( is0734 + is1256 ) */ \
- " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \
- " paddw %%mm2, %%mm3 \n\t" /* Truncate mm3, now it is op[0] */ \
- \
- " movq %%mm3," #ip0 " \n\t" \
- /* ------------------------------------------------------------------- */ \
- " movq " #temp ", %%mm3 \n\t" /* mm3 = irot_input_y */ \
- " pmulhw "M(xC2S6)", %%mm3 \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \
- \
- " movq " #temp ", %%mm2 \n\t" \
- " movq %%mm2, %%mm0 \n\t" \
- \
- " psrlw $15, %%mm2 \n\t" /* mm2 = sign bit of irot_input_y */ \
- " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC2S6 * irot_input_y */ \
- \
- " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
- " movq %%mm5, %%mm0 \n\t" \
- \
- " movq %%mm5, %%mm2 \n\t" \
- " pmulhw "M(xC6S2)", %%mm0 \n\t" /* mm0 = xC6S2 * irot_input_x */ \
- \
- " psrlw $15, %%mm2 \n\t" \
- " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \
- \
- " paddsw %%mm0, %%mm3 \n\t" /* ip[2] */ \
- " movq %%mm3," #ip2 " \n\t" /* Save ip2 */ \
- \
- " movq %%mm5, %%mm0 \n\t" \
- " movq %%mm5, %%mm2 \n\t" \
- \
- " pmulhw "M(xC2S6)", %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \
- " psrlw $15, %%mm2 \n\t" \
- \
- " movq " #temp ", %%mm3 \n\t" \
- " paddw %%mm0, %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x */ \
- \
- " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \
- " movq %%mm3, %%mm2 \n\t" \
- \
- " pmulhw "M(xC6S2)", %%mm3 \n\t" /* mm3 = xC6S2 * irot_input_y */ \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
- " psubsw %%mm5, %%mm3 \n\t" /* ip[6] */ \
- \
- " movq %%mm3," #ip6 " \n\t" \
- /* ------------------------------------------------------------------- */ \
- " movq "M(xC4S4)", %%mm0 \n\t" \
- " movq %%mm1, %%mm2 \n\t" \
- " movq %%mm1, %%mm3 \n\t" \
- \
- " pmulhw %%mm0, %%mm1 \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) */ \
- " paddw %%mm2, %%mm1 \n\t" /* Truncate mm1, now it is icommon_product1 */ \
- \
- " movq %%mm7, %%mm2 \n\t" \
- " movq %%mm7, %%mm3 \n\t" \
- \
- " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm3, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
- " paddw %%mm2, %%mm7 \n\t" /* Truncate mm7, now it is icommon_product2 */ \
- /* ------------------------------------------------------------------- */ \
- " pxor %%mm0, %%mm0 \n\t" /* Clear mm0 */ \
- " psubsw %%mm6, %%mm0 \n\t" /* mm0 = - id34 */ \
- \
- " psubsw %%mm7, %%mm0 \n\t" /* mm0 = - ( id34 + icommon_product2 ) */ \
- " paddsw %%mm6, %%mm6 \n\t" /* mm6 = 2 * id34 */ \
- " paddsw %%mm0, %%mm6 \n\t" /* mm6 = id34 - icommon_product2 */ \
- \
- " psubsw %%mm1, %%mm4 \n\t" /* mm4 = id07 - icommon_product1 */ \
- " paddsw %%mm1, %%mm1 \n\t" /* mm1 = 2 * icommon_product1 */ \
- " paddsw %%mm4, %%mm1 \n\t" /* mm1 = id07 + icommon_product1 */ \
- /* ------------------------------------------------------------------- */ \
- " movq "M(xC1S7)", %%mm7 \n\t" \
- " movq %%mm1, %%mm2 \n\t" \
- \
- " movq %%mm1, %%mm3 \n\t" \
- " pmulhw %%mm7, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \
- \
- " movq "M(xC7S1)", %%mm7 \n\t" \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x */ \
- " paddw %%mm2, %%mm1 \n\t" /* Truncated */ \
- \
- " pmulhw %%mm7, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x */ \
- " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
- \
- " movq %%mm0, %%mm5 \n\t" \
- " movq %%mm0, %%mm2 \n\t" \
- \
- " movq "M(xC1S7)", %%mm7 \n\t" \
- " pmulhw %%mm7, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \
- \
- " movq "M(xC7S1)", %%mm7 \n\t" \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm5, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y */ \
- " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \
- \
- " pmulhw %%mm7, %%mm5 \n\t" /* mm5 = xC7S1 * irot_input_y */ \
- " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \
- \
- " psubsw %%mm5, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \
- " paddsw %%mm0, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x + xC1S7 * irot_input_y = ip7 */ \
- \
- " movq %%mm1," #ip1 " \n\t" \
- " movq %%mm3," #ip7 " \n\t" \
- /* ------------------------------------------------------------------- */ \
- " movq "M(xC3S5)", %%mm0 \n\t" \
- " movq "M(xC5S3)", %%mm1 \n\t" \
- \
- " movq %%mm6, %%mm5 \n\t" \
- " movq %%mm6, %%mm7 \n\t" \
- \
- " movq %%mm4, %%mm2 \n\t" \
- " movq %%mm4, %%mm3 \n\t" \
- \
- " pmulhw %%mm0, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \
- " pmulhw %%mm1, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \
- \
- " psrlw $15, %%mm2 \n\t" /* mm2 = sign bit of irot_input_x */ \
- " psrlw $15, %%mm5 \n\t" /* mm5 = sign bit of irot_input_y */ \
- \
- " paddw %%mm3, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x */ \
- " paddw %%mm7, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y */ \
- \
- " paddw %%mm2, %%mm4 \n\t" /* Truncated */ \
- " paddw %%mm5, %%mm6 \n\t" /* Truncated */ \
- \
- " psubsw %%mm6, %%mm4 \n\t" /* ip3 */ \
- " movq %%mm4," #ip3 " \n\t" \
- \
- " movq %%mm3, %%mm4 \n\t" \
- " movq %%mm7, %%mm6 \n\t" \
- \
- " pmulhw %%mm1, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \
- " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \
- \
- " paddw %%mm2, %%mm4 \n\t" /* mm4 = irot_input_x + its sign bit */ \
- " paddw %%mm5, %%mm6 \n\t" /* mm6 = irot_input_y + its sign bit */ \
- \
- " paddw %%mm4, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x */ \
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y */ \
- \
- " paddw %%mm7, %%mm3 \n\t" /* ip5 */ \
- " movq %%mm3," #ip5 " \n\t"
-/*
- * Expands to the asm text that transposes 16-bit words between eight input
- * memory operands ip0..ip7 and eight output memory operands op0..op7, each
- * accessed with movq (four words).
- *
- * Going by the per-line comments, rows a..d (ip0..ip3) are transposed into
- * op0..op3 and rows e..h (ip4..ip7) into op4..op7, i.e. two 4x4 word
- * transposes. The lane labels in those comments are not fully consistent
- * in ordering and indexing.
- *
- * op0 and op1 double as spill slots: row b is parked in op1 and row a in
- * op0 while e..h are processed, then both are reloaded. op1 is stored
- * before ip7 is loaded, so op1 must not alias ip7.
- * All of mm0-mm7 are used.
- */
-#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7, \
- op0,op1,op2,op3,op4,op5,op6,op7) \
- " movq " #ip0 ", %%mm0 \n\t" /* mm0 = a0 a1 a2 a3 */ \
- " movq " #ip4 ", %%mm4 \n\t" /* mm4 = e4 e5 e6 e7 */ \
- " movq " #ip1 ", %%mm1 \n\t" /* mm1 = b0 b1 b2 b3 */ \
- " movq " #ip5 ", %%mm5 \n\t" /* mm5 = f4 f5 f6 f7 */ \
- " movq " #ip2 ", %%mm2 \n\t" /* mm2 = c0 c1 c2 c3 */ \
- " movq " #ip6 ", %%mm6 \n\t" /* mm6 = g4 g5 g6 g7 */ \
- " movq " #ip3 ", %%mm3 \n\t" /* mm3 = d0 d1 d2 d3 */ \
- " movq %%mm1," #op1 " \n\t" /* save b0 b1 b2 b3 */ \
- " movq " #ip7 ", %%mm7 \n\t" /* mm7 = h0 h1 h2 h3 */ \
- /* Transpose 2x8 block */ \
- " movq %%mm4, %%mm1 \n\t" /* mm1 = e3 e2 e1 e0 */ \
- " punpcklwd %%mm5, %%mm4 \n\t" /* mm4 = f1 e1 f0 e0 */ \
- " movq %%mm0," #op0 " \n\t" /* save a3 a2 a1 a0 */ \
- " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \
- " movq %%mm6, %%mm0 \n\t" /* mm0 = g3 g2 g1 g0 */ \
- " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \
- " movq %%mm4, %%mm5 \n\t" /* mm5 = f1 e1 f0 e0 */ \
- " punpckldq %%mm6, %%mm4 \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \
- " punpckhdq %%mm6, %%mm5 \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \
- " movq %%mm1, %%mm6 \n\t" /* mm6 = f3 e3 f2 e2 */ \
- " movq %%mm4," #op4 " \n\t" \
- " punpckhwd %%mm7, %%mm0 \n\t" /* mm0 = h3 g3 h2 g2 */ \
- " movq %%mm5," #op5 " \n\t" \
- " punpckhdq %%mm0, %%mm6 \n\t" /* mm6 = h3 g3 f3 e3 = MM7 */ \
- " movq " #op0 ", %%mm4 \n\t" /* mm4 = a3 a2 a1 a0 */ \
- " punpckldq %%mm0, %%mm1 \n\t" /* mm1 = h2 g2 f2 e2 = MM6 */ \
- " movq " #op1 ", %%mm5 \n\t" /* mm5 = b3 b2 b1 b0 */ \
- " movq %%mm4, %%mm0 \n\t" /* mm0 = a3 a2 a1 a0 */ \
- " movq %%mm6," #op7 " \n\t" \
- " punpcklwd %%mm5, %%mm0 \n\t" /* mm0 = b1 a1 b0 a0 */ \
- " movq %%mm1," #op6 " \n\t" \
- " punpckhwd %%mm5, %%mm4 \n\t" /* mm4 = b3 a3 b2 a2 */ \
- " movq %%mm2, %%mm5 \n\t" /* mm5 = c3 c2 c1 c0 */ \
- " punpcklwd %%mm3, %%mm2 \n\t" /* mm2 = d1 c1 d0 c0 */ \
- " movq %%mm0, %%mm1 \n\t" /* mm1 = b1 a1 b0 a0 */ \
- " punpckldq %%mm2, %%mm0 \n\t" /* mm0 = d0 c0 b0 a0 = MM0 */ \
- " punpckhdq %%mm2, %%mm1 \n\t" /* mm1 = d1 c1 b1 a1 = MM1 */ \
- " movq %%mm4, %%mm2 \n\t" /* mm2 = b3 a3 b2 a2 */ \
- " movq %%mm0," #op0 " \n\t" \
- " punpckhwd %%mm3, %%mm5 \n\t" /* mm5 = d3 c3 d2 c2 */ \
- " movq %%mm1," #op1 " \n\t" \
- " punpckhdq %%mm5, %%mm4 \n\t" /* mm4 = d3 c3 b3 a3 = MM3 */ \
- " punpckldq %%mm5, %%mm2 \n\t" /* mm2 = d2 c2 b2 a2 = MM2 */ \
- " movq %%mm4," #op3 " \n\t" \
- " movq %%mm2," #op2 " \n\t"
-
-/*
- * MMX forward DCT of one 8x8 block of 16-bit values.
- *
- * InputData: 64 values; the asm reads byte offsets 0..127 from it.
- * OutputData: 64 values; byte offsets 0..127 are written, and it also
- * serves as the working buffer between passes.
- *
- * Four Transpose_mmx + Fdct_mmx pairs are executed: the first two move data
- * from InputData into OutputData, the last two work on OutputData in place.
- * emms is issued at the end.
- */
-static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData)
-{
- ogg_int64_t __attribute__((aligned(8))) align_tmp[16]; /* 8-byte aligned scratch; Fdct_mmx only touches the first 8 bytes via (%2) */
- ogg_int16_t *const temp= (ogg_int16_t*)align_tmp;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
- /*
- * Input data is an 8x8 block. To make processing of the data more efficient
- * we will transpose the block of data to two 4x8 blocks???
- */
- Transpose_mmx ( (%0), 16(%0), 32(%0), 48(%0), 8(%0), 24(%0), 40(%0), 56(%0),
- (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1))
- Fdct_mmx ( (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1), (%2))
-
- Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
- 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
- Fdct_mmx (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
-
- Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
- 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
- Fdct_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
-
- Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
- 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
- Fdct_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
-
- " emms \n\t" /* leave MMX state so the FPU is usable again */
-
- : "+r" (InputData),
- "+r" (OutputData)
- : "r" (temp)
- : "memory"
- );
-}
-
-/*
- * Route the fdct_short entry of *funcs to the MMX implementation in this
- * file. No other field is written.
- */
-void dsp_i386_mmx_fdct_init(DspFunctions *funcs)
-{
- (*funcs).fdct_short = &fdct_short__mmx;
-}
Copied: branches/theora-mmx/lib/x86_32/fdct_mmx.c (from rev 11337, branches/theora-mmx/lib/i386/fdct_mmx.c)
Deleted: branches/theora-mmx/lib/x86_32/recon_mmx.c
===================================================================
--- branches/theora-mmx/lib/i386/recon_mmx.c 2006-05-03 21:23:11 UTC (rev 11336)
+++ branches/theora-mmx/lib/x86_32/recon_mmx.c 2006-05-03 23:31:00 UTC (rev 11338)
@@ -1,185 +0,0 @@
-/********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
- * *
- ********************************************************************
-
- function:
- last mod: $Id: reconstruct.c,v 1.6 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include "codec_internal.h"
-
-static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x8080808080808080LL;
-
-#if defined(__MINGW32__) || defined(__CYGWIN__) || \
- defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
-# define M(a) "_" #a
-#else
-# define M(a) #a
-#endif
-
-/*
- * Copy eight rows of 8 bytes from src to dest. Both pointers advance by
- * `stride` bytes per row. The rows are moved in two groups of four.
- *
- * src: first byte of the source block
- * dest: first byte of the destination block
- * stride: distance in bytes between consecutive rows (same for both)
- *
- * No emms is issued here, so the MMX state is left in use on return.
- * The asm is 32-bit x86 only (edi is used as a scratch index register).
- */
-static void copy8x8__mmx (unsigned char *src,
- unsigned char *dest,
- unsigned int stride)
-{
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " lea (%2, %2, 2), %%edi \n\t" /* edi = 3 * stride */
-
- " movq (%1), %%mm0 \n\t" /* rows 0..3 of src */
- " movq (%1, %2), %%mm1 \n\t"
- " movq (%1, %2, 2), %%mm2 \n\t"
- " movq (%1, %%edi), %%mm3 \n\t"
-
- " lea (%1, %2, 4), %1 \n\t" /* src += 4 * stride */
-
- " movq %%mm0, (%0) \n\t" /* rows 0..3 of dest */
- " movq %%mm1, (%0, %2) \n\t"
- " movq %%mm2, (%0, %2, 2) \n\t"
- " movq %%mm3, (%0, %%edi) \n\t"
-
- " lea (%0, %2, 4), %0 \n\t" /* dest += 4 * stride */
-
- " movq (%1), %%mm0 \n\t" /* rows 4..7 of src */
- " movq (%1, %2), %%mm1 \n\t"
- " movq (%1, %2, 2), %%mm2 \n\t"
- " movq (%1, %%edi), %%mm3 \n\t"
-
- " movq %%mm0, (%0) \n\t" /* rows 4..7 of dest */
- " movq %%mm1, (%0, %2) \n\t"
- " movq %%mm2, (%0, %2, 2) \n\t"
- " movq %%mm3, (%0, %%edi) \n\t"
- /* src is advanced by the asm, so it is declared read-write ("+c"); an
- * input-only operand must not be modified. Operand numbers are unchanged. */
- : "+a" (dest),
- "+c" (src)
- : "d" (stride)
- : "memory", "edi"
- );
-}
-
-/*
- * Reconstruct an intra 8x8 block from 16-bit residuals.
- *
- * For each of 8 rows: load eight 16-bit values from ChangePtr, pack them to
- * signed bytes with saturation, flip the top bit of every byte (same as
- * adding 128) and store the 8 bytes at ReconPtr.
- *
- * ReconPtr: output; advanced by LineStep bytes per row
- * ChangePtr: 64 16-bit inputs (128 bytes), read sequentially
- * LineStep: output row pitch in bytes
- *
- * No emms is issued here. 32-bit x86 only (edi holds the end pointer).
- */
-static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
- ogg_uint32_t LineStep)
-{
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " movq "M(V128)", %%mm0 \n\t" /* Set mm0 to 0x8080808080808080 */
-
- " lea 128(%1), %%edi \n\t" /* Endpoint in input buffer */
- "1: \n\t"
- " movq (%1), %%mm2 \n\t" /* First four input values */
-
- " packsswb 8(%1), %%mm2 \n\t" /* pack with next(high) four values */
- " por %%mm0, %%mm0 \n\t" /* leaves mm0 unchanged (x | x == x) */
- " pxor %%mm0, %%mm2 \n\t" /* Convert result to unsigned (same as add 128) */
- " lea 16(%1), %1 \n\t" /* Step source buffer */
- " cmp %%edi, %1 \n\t" /* are we done */
-
- " movq %%mm2, (%0) \n\t" /* store results */
-
- " lea (%0, %2), %0 \n\t" /* Step output buffer */
- " jc 1b \n\t" /* Loop back if we are not done (movq/lea keep the cmp flags) */
- /* ChangePtr is stepped inside the loop, so it is declared read-write;
- * an input-only operand must not be modified. Operand numbers are unchanged. */
- : "+r" (ReconPtr),
- "+r" (ChangePtr)
- : "r" (LineStep)
- : "memory", "edi"
- );
-}
-
-/*
- * Reconstruct an inter 8x8 block: reference pixels plus 16-bit residuals.
- *
- * For each of 8 rows: widen 8 reference bytes to 16 bits, add the eight
- * residuals with signed saturation, pack back to unsigned bytes with
- * saturation and store at ReconPtr.
- *
- * ReconPtr: output; advanced by LineStep bytes per row
- * RefPtr: reference pixels; advanced by LineStep bytes per row
- * ChangePtr: 64 16-bit residuals (128 bytes), read sequentially
- * LineStep: row pitch in bytes, shared by RefPtr and ReconPtr
- *
- * No emms is issued here. 32-bit x86 only (edi holds the end pointer).
- */
-static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
- ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
-{
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm0, %%mm0 \n\t" /* mm0 = 0, used to zero-extend bytes */
- " lea 128(%1), %%edi \n\t" /* end of the residual buffer */
-
- "1: \n\t"
- " movq (%2), %%mm2 \n\t" /* (+3 misaligned) 8 reference pixels */
-
- " movq (%1), %%mm4 \n\t" /* first 4 changes */
- " movq %%mm2, %%mm3 \n\t"
- " movq 8(%1), %%mm5 \n\t" /* last 4 changes */
- " punpcklbw %%mm0, %%mm2 \n\t" /* turn first 4 refs into positive 16-bit #s */
- " paddsw %%mm4, %%mm2 \n\t" /* add in first 4 changes */
- " punpckhbw %%mm0, %%mm3 \n\t" /* turn last 4 refs into positive 16-bit #s */
- " paddsw %%mm5, %%mm3 \n\t" /* add in last 4 changes */
- " add %3, %2 \n\t" /* next row of reference pixels */
- " packuswb %%mm3, %%mm2 \n\t" /* pack result to unsigned 8-bit values */
- " lea 16(%1), %1 \n\t" /* next row of changes */
- " cmp %%edi, %1 \n\t" /* are we done? */
-
- " movq %%mm2, (%0) \n\t" /* store result */
-
- " lea (%0, %3), %0 \n\t" /* next row of output */
- " jc 1b \n\t" /* movq/lea keep the flags set by cmp */
- /* ChangePtr and RefPtr are stepped inside the loop, so both are declared
- * read-write; input-only operands must not be modified. Operand numbers
- * are unchanged. */
- : "+r" (ReconPtr),
- "+r" (ChangePtr),
- "+r" (RefPtr)
- : "r" (LineStep)
- : "memory", "edi"
- );
-}
-
-/*
- * Reconstruct an inter 8x8 block from the average of two references plus
- * 16-bit residuals.
- *
- * For each of 8 rows: widen 8 bytes from each reference to 16 bits, form
- * (ref1 + ref2) >> 1, add the residuals (plain paddw, no saturation at this
- * step), pack to unsigned bytes with saturation and store at ReconPtr.
- *
- * ReconPtr: output; advanced by LineStep bytes per row
- * RefPtr1: first reference; advanced by LineStep bytes per row
- * RefPtr2: second reference; advanced by LineStep bytes per row
- * ChangePtr: 64 16-bit residuals (128 bytes), read sequentially
- * LineStep: row pitch in bytes, shared by all three pixel pointers
- *
- * No emms is issued here. 32-bit x86 only (edi holds the end pointer).
- */
-static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
- unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
- ogg_uint32_t LineStep)
-{
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm0, %%mm0 \n\t" /* mm0 = 0, used to zero-extend bytes */
- " lea 128(%1), %%edi \n\t" /* end of the residual buffer */
-
- "1: \n\t"
- " movq (%2), %%mm2 \n\t" /* (+3 misaligned) 8 reference pixels */
- " movq (%3), %%mm4 \n\t" /* (+3 misaligned) 8 reference pixels */
-
- " movq %%mm2, %%mm3 \n\t"
- " punpcklbw %%mm0, %%mm2 \n\t" /* mm2 = start ref1 as positive 16-bit #s */
- " movq %%mm4, %%mm5 \n\t"
- " movq (%1), %%mm6 \n\t" /* first 4 changes */
- " punpckhbw %%mm0, %%mm3 \n\t" /* mm3 = end ref1 as positive 16-bit #s */
- " movq 8(%1), %%mm7 \n\t" /* last 4 changes */
- " punpcklbw %%mm0, %%mm4 \n\t" /* mm4 = start ref2 as positive 16-bit #s */
- " punpckhbw %%mm0, %%mm5 \n\t" /* mm5 = end ref2 as positive 16-bit #s */
- " paddw %%mm4, %%mm2 \n\t" /* mm2 = start (ref1 + ref2) */
- " paddw %%mm5, %%mm3 \n\t" /* mm3 = end (ref1 + ref2) */
- " psrlw $1, %%mm2 \n\t" /* mm2 = start (ref1 + ref2)/2 */
- " psrlw $1, %%mm3 \n\t" /* mm3 = end (ref1 + ref2)/2 */
- " paddw %%mm6, %%mm2 \n\t" /* add changes to start */
- " paddw %%mm7, %%mm3 \n\t" /* add changes to end */
- " lea 16(%1), %1 \n\t" /* next row of changes */
- " packuswb %%mm3, %%mm2 \n\t" /* pack start|end to unsigned 8-bit */
- " add %4, %2 \n\t" /* next row of reference pixels */
- " add %4, %3 \n\t" /* next row of reference pixels */
- " movq %%mm2, (%0) \n\t" /* store result */
- " add %4, %0 \n\t" /* next row of output */
- " cmp %%edi, %1 \n\t" /* are we done? */
- " jc 1b \n\t"
- /* ChangePtr, RefPtr1 and RefPtr2 are stepped inside the loop, so they are
- * declared read-write; input-only operands must not be modified. Operand
- * numbers are unchanged. */
- : "+r" (ReconPtr),
- "+r" (ChangePtr),
- "+r" (RefPtr1),
- "+r" (RefPtr2)
- : "m" (LineStep)
- : "memory", "edi"
- );
-}
-
-/*
- * Route four entries of *funcs to the MMX routines defined in this file:
- * the three recon_* entries and copy8x8. No other field is written.
- */
-void dsp_i386_mmx_recon_init(DspFunctions *funcs)
-{
- /* reconstruction entries */
- funcs->recon_inter8x8_half = &recon_inter8x8_half__mmx;
- funcs->recon_inter8x8 = &recon_inter8x8__mmx;
- funcs->recon_intra8x8 = &recon_intra8x8__mmx;
-
- /* plain block copy */
- funcs->copy8x8 = &copy8x8__mmx;
-}
-
Copied: branches/theora-mmx/lib/x86_32/recon_mmx.c (from rev 11337, branches/theora-mmx/lib/i386/recon_mmx.c)
More information about the commits
mailing list