[xiph-commits] r12433 - in trunk/theora: . lib/x86_32
j at svn.xiph.org
Mon Feb 5 15:22:59 PST 2007
Author: j
Date: 2007-02-05 15:22:46 -0800 (Mon, 05 Feb 2007)
New Revision: 12433
Modified:
trunk/theora/configure.ac
trunk/theora/lib/x86_32/dsp_mmx.c
trunk/theora/lib/x86_32/dsp_mmxext.c
trunk/theora/lib/x86_32/fdct_mmx.c
trunk/theora/lib/x86_32/recon_mmx.c
Log:
Rework the MMX code to compile on Mac OS X (Intel).
It is known to compile with Xcode 2.4.1.
- .balign 16 -> .p2align 4
- gas on OS X does not support the assembler macros .rept / .endr;
  they are replaced with #defines.
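
For context, every file below is converted with the same pattern. Here is a
minimal, hypothetical sketch of that pattern on a made-up routine
(copy8_rows and its asm body are illustrative only, not code from this
commit): the ".rept 8" / ".endr" repetition, which Apple's assembler does
not understand, becomes a C preprocessor macro pasted eight times, and
".balign 16" becomes ".p2align 4", which requests the same 16-byte (2^4)
alignment.

/* Illustrative only -- not theora code.  Requires an x86 target with MMX. */

/* Before (gas-only): the loop body sat inside ".rept 8" / ".endr" and the
 * code was aligned with ".balign 16"; Apple's as rejects both. */

/* After: the loop body is a C preprocessor macro expanded eight times, so
 * no assembler macro support is needed. */
#define COPY_ROW \
        "  movq (%0), %%mm0  \n\t"  /* load 8 bytes from the source row  */ \
        "  movq %%mm0, (%1)  \n\t"  /* store them to the destination row */ \
        "  add %2, %0        \n\t"  /* advance source by one stride      */ \
        "  add %2, %1        \n\t"  /* advance destination by one stride */

static void copy8_rows (unsigned char *src, unsigned char *dst,
                        unsigned long stride)
{
  __asm__ __volatile__ (
    "  .p2align 4        \n\t"      /* same alignment as ".balign 16" */
    COPY_ROW
    COPY_ROW
    COPY_ROW
    COPY_ROW
    COPY_ROW
    COPY_ROW
    COPY_ROW
    COPY_ROW
    : "+r" (src),
      "+r" (dst)
    : "r" (stride)
    : "mm0", "memory"
  );
}

A real MMX routine also needs emms executed before any subsequent
floating-point code; that housekeeping is omitted from this sketch.
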
Modified: trunk/theora/configure.ac
===================================================================
--- trunk/theora/configure.ac 2007-02-05 21:55:44 UTC (rev 12432)
+++ trunk/theora/configure.ac 2007-02-05 23:22:46 UTC (rev 12433)
@@ -112,11 +112,9 @@
cpu_optimization="no optimization for your platform, please send a patch"
case $target_cpu in
i[[3456]]86)
- if test ! "x$target_vendor" = "xapple"; then
- cpu_x86_32=yes
- cpu_optimization="32 bit x86"
- AC_DEFINE([USE_ASM], [], [make use of asm optimization])
- fi
+ cpu_x86_32=yes
+ cpu_optimization="32 bit x86"
+ AC_DEFINE([USE_ASM], [], [make use of asm optimization])
;;
x86_64)
cpu_x86_64=yes
Modified: trunk/theora/lib/x86_32/dsp_mmx.c
===================================================================
--- trunk/theora/lib/x86_32/dsp_mmx.c 2007-02-05 21:55:44 UTC (rev 12432)
+++ trunk/theora/lib/x86_32/dsp_mmx.c 2007-02-05 23:22:46 UTC (rev 12433)
@@ -26,36 +26,42 @@
#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
+#define SUB_LOOP \
+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ \
+ " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */ \
+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */\
+ " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */\
+ /* convert from UINT8 to INT16 */ \
+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ \
+ " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */ \
+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ \
+ " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */ \
+ /* start calculation */ \
+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */ \
+ " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */ \
+ " movq %%mm0, (%2) \n\t" /* write answer out */ \
+ " movq %%mm2, 8(%2) \n\t" /* write answer out */ \
+ /* Increment pointers */ \
+ " add $16, %2 \n\t" \
+ " add %3, %0 \n\t" \
+ " add %4, %1 \n\t"
+
static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
ogg_uint32_t ReconPixelsPerLine)
{
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm7, %%mm7 \n\t"
-
- ".rept 8 \n\t"
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */
- /* start calculation */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */
- " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */
- " movq %%mm0, (%2) \n\t" /* write answer out */
- " movq %%mm2, 8(%2) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %2 \n\t"
- " add %3, %0 \n\t"
- " add %4, %1 \n\t"
- ".endr \n\t"
-
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
: "+r" (FiltPtr),
"+r" (ReconPtr),
"+r" (DctInputPtr)
@@ -65,31 +71,38 @@
);
}
+#define SUB_128_LOOP \
+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ \
+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */\
+ /* convert from UINT8 to INT16 */ \
+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ \
+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ \
+ /* start calculation */ \
+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */ \
+ " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */ \
+ " movq %%mm0, (%1) \n\t" /* write answer out */ \
+ " movq %%mm2, 8(%1) \n\t" /* write answer out */ \
+ /* Increment pointers */ \
+ " add $16, %1 \n\t" \
+ " add %2, %0 \n\t"
+
+
static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
ogg_uint32_t PixelsPerLine)
{
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm7, %%mm7 \n\t"
" movq %[V128], %%mm1 \n\t"
-
- ".rept 8 \n\t"
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */
- " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */
- " movq %%mm0, (%1) \n\t" /* write answer out */
- " movq %%mm2, 8(%1) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %1 \n\t"
- " add %2, %0 \n\t"
- ".endr \n\t"
-
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
: "+r" (FiltPtr),
"+r" (DctInputPtr)
: "m" (PixelsPerLine),
@@ -98,46 +111,53 @@
);
}
+#define SUB_AVG2_LOOP \
+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ \
+ " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr1 */ \
+ " movq (%2), %%mm4 \n\t" /* mm1 = ReconPtr2 */ \
+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */\
+ " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */\
+ " movq %%mm4, %%mm5 \n\t" /* dup to prepare for up conversion */\
+ /* convert from UINT8 to INT16 */ \
+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ \
+ " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr1) */ \
+ " punpcklbw %%mm7, %%mm4 \n\t" /* mm1 = INT16(ReconPtr2) */ \
+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ \
+ " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr1) */ \
+ " punpckhbw %%mm7, %%mm5 \n\t" /* mm3 = INT16(ReconPtr2) */ \
+ /* average ReconPtr1 and ReconPtr2 */ \
+ " paddw %%mm4, %%mm1 \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */ \
+ " paddw %%mm5, %%mm3 \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */ \
+ " psrlw $1, %%mm1 \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */ \
+ " psrlw $1, %%mm3 \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */ \
+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ \
+ " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ \
+ " movq %%mm0, (%3) \n\t" /* write answer out */ \
+ " movq %%mm2, 8(%3) \n\t" /* write answer out */ \
+ /* Increment pointers */ \
+ " add $16, %3 \n\t" \
+ " add %4, %0 \n\t" \
+ " add %5, %1 \n\t" \
+ " add %5, %2 \n\t"
+
+
static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
ogg_uint32_t PixelsPerLine,
ogg_uint32_t ReconPixelsPerLine)
{
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm7, %%mm7 \n\t"
-
- ".rept 8 \n\t"
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr1 */
- " movq (%2), %%mm4 \n\t" /* mm1 = ReconPtr2 */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
- " movq %%mm4, %%mm5 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr1) */
- " punpcklbw %%mm7, %%mm4 \n\t" /* mm1 = INT16(ReconPtr2) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr1) */
- " punpckhbw %%mm7, %%mm5 \n\t" /* mm3 = INT16(ReconPtr2) */
- /* average ReconPtr1 and ReconPtr2 */
- " paddw %%mm4, %%mm1 \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */
- " paddw %%mm5, %%mm3 \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */
- " psrlw $1, %%mm1 \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- " psrlw $1, %%mm3 \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- " movq %%mm0, (%3) \n\t" /* write answer out */
- " movq %%mm2, 8(%3) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %3 \n\t"
- " add %4, %0 \n\t"
- " add %5, %1 \n\t"
- " add %5, %2 \n\t"
- ".endr \n\t"
-
+ SUB_AVG2_LOOP
+ SUB_AVG2_LOOP
+ SUB_AVG2_LOOP
+ SUB_AVG2_LOOP
+ SUB_AVG2_LOOP
+ SUB_AVG2_LOOP
+ SUB_AVG2_LOOP
+ SUB_AVG2_LOOP
: "+r" (FiltPtr),
"+r" (ReconPtr1),
"+r" (ReconPtr2),
@@ -153,7 +173,7 @@
ogg_uint32_t MaxSad;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
" pxor %%mm7, %%mm7 \n\t" /* zero out mm7 for unpack */
@@ -203,7 +223,7 @@
ogg_uint32_t MaxSad;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
" pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
@@ -279,33 +299,38 @@
return MaxSad;
}
+#define SAD_LOOP \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%2), %%mm1 \n\t" \
+ " movq %%mm0, %%mm2 \n\t" \
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */ \
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */ \
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ \
+ " movq %%mm0, %%mm1 \n\t" \
+ " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */ \
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
+
static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
unsigned char *ptr2, ogg_uint32_t stride2)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
" pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- ".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " movq %%mm0, %%mm2 \n\t"
-
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
-
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
" movq %%mm7, %%mm0 \n\t"
" psrlq $32, %%mm7 \n\t"
" paddw %%mm0, %%mm7 \n\t"
@@ -341,7 +366,7 @@
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pcmpeqd %%mm5, %%mm5 \n\t" /* fefefefefefefefe in mm5 */
" paddb %%mm5, %%mm5 \n\t"
@@ -406,7 +431,7 @@
ogg_uint32_t XXSum;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm5, %%mm5 \n\t"
" pxor %%mm6, %%mm6 \n\t"
@@ -466,7 +491,7 @@
ogg_uint32_t XXSum;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm5, %%mm5 \n\t"
" pxor %%mm6, %%mm6 \n\t"
@@ -537,7 +562,7 @@
ogg_uint32_t XXSum;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4 */
" paddb %%mm4, %%mm4 \n\t"
Modified: trunk/theora/lib/x86_32/dsp_mmxext.c
===================================================================
--- trunk/theora/lib/x86_32/dsp_mmxext.c 2007-02-05 21:55:44 UTC (rev 12432)
+++ trunk/theora/lib/x86_32/dsp_mmxext.c 2007-02-05 23:22:46 UTC (rev 12433)
@@ -20,28 +20,36 @@
#include "codec_internal.h"
#include "dsp.h"
+#define SAD_MMXEXT_LOOP \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%2), %%mm1 \n\t" \
+ " psadbw %%mm1, %%mm0 \n\t" \
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
+
+
static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
unsigned char *ptr2, ogg_uint32_t stride2)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+
+ SAD_MMXEXT_LOOP
+ SAD_MMXEXT_LOOP
+ SAD_MMXEXT_LOOP
+ SAD_MMXEXT_LOOP
+ SAD_MMXEXT_LOOP
+ SAD_MMXEXT_LOOP
+ SAD_MMXEXT_LOOP
- ".rept 7 \n\t"
" movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm1 \n\t"
" psadbw %%mm1, %%mm0 \n\t"
- " add %3, %1 \n\t" /* Inc pointer into the new data */
" paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
-
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
" movd %%mm7, %0 \n\t"
: "=r" (DiffVal),
@@ -55,6 +63,15 @@
return DiffVal;
}
+#define SAD_TRES_LOOP \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%2), %%mm1 \n\t" \
+ " psadbw %%mm1, %%mm0 \n\t" \
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
+
+
static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
unsigned char *ptr2, ogg_uint32_t stride2,
ogg_uint32_t thres)
@@ -62,17 +79,17 @@
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- ".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
+ SAD_TRES_LOOP
+ SAD_TRES_LOOP
+ SAD_TRES_LOOP
+ SAD_TRES_LOOP
+ SAD_TRES_LOOP
+ SAD_TRES_LOOP
+ SAD_TRES_LOOP
+ SAD_TRES_LOOP
" movd %%mm7, %0 \n\t"
@@ -87,6 +104,19 @@
return DiffVal;
}
+#define SAD_XY2_TRES \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%2), %%mm1 \n\t" \
+ " movq (%3), %%mm2 \n\t" \
+ " pavgb %%mm2, %%mm1 \n\t" \
+ " psadbw %%mm1, %%mm0 \n\t" \
+ \
+ " add %4, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %5, %2 \n\t" /* Inc pointer into ref data */ \
+ " add %5, %3 \n\t" /* Inc pointer into ref data */
+
+
static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
unsigned char *RefDataPtr1,
unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
@@ -95,21 +125,17 @@
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- ".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " movq (%3), %%mm2 \n\t"
- " pavgb %%mm2, %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
+ SAD_XY2_TRES
+ SAD_XY2_TRES
+ SAD_XY2_TRES
+ SAD_XY2_TRES
+ SAD_XY2_TRES
+ SAD_XY2_TRES
+ SAD_XY2_TRES
+ SAD_XY2_TRES
- " add %4, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %5, %2 \n\t" /* Inc pointer into ref data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
-
" movd %%mm7, %0 \n\t"
: "=m" (DiffVal),
"+r" (SrcData),
@@ -128,7 +154,7 @@
ogg_uint32_t MaxSad;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" movd (%1), %%mm0 \n\t"
" movd (%2), %%mm1 \n\t"
@@ -157,7 +183,7 @@
ogg_uint32_t MaxSad;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
" pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
@@ -236,7 +262,7 @@
ogg_uint32_t XXSum;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm4, %%mm4 \n\t"
" pxor %%mm5, %%mm5 \n\t"
Modified: trunk/theora/lib/x86_32/fdct_mmx.c
===================================================================
--- trunk/theora/lib/x86_32/fdct_mmx.c 2007-02-05 21:55:44 UTC (rev 12432)
+++ trunk/theora/lib/x86_32/fdct_mmx.c 2007-02-05 23:22:46 UTC (rev 12433)
@@ -288,7 +288,7 @@
ogg_int16_t *const temp= (ogg_int16_t*)align_tmp;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
/*
* Input data is an 8x8 block. To make processing of the data more efficent
* we will transpose the block of data to two 4x8 blocks???
Modified: trunk/theora/lib/x86_32/recon_mmx.c
===================================================================
--- trunk/theora/lib/x86_32/recon_mmx.c 2007-02-05 21:55:44 UTC (rev 12432)
+++ trunk/theora/lib/x86_32/recon_mmx.c 2007-02-05 23:22:46 UTC (rev 12433)
@@ -24,7 +24,7 @@
unsigned int stride)
{
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" lea (%2, %2, 2), %%edi \n\t"
@@ -62,7 +62,7 @@
ogg_uint32_t LineStep)
{
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" movq %[V128], %%mm0 \n\t" /* Set mm0 to 0x8080808080808080 */
@@ -92,7 +92,7 @@
ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
{
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm0, %%mm0 \n\t"
" lea 128(%1), %%edi \n\t"
@@ -129,7 +129,7 @@
ogg_uint32_t LineStep)
{
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm0, %%mm0 \n\t"
" lea 128(%1), %%edi \n\t"