[xiph-commits] r12433 - in trunk/theora: . lib/x86_32
j at svn.xiph.org
Mon Feb 5 15:22:59 PST 2007
Author: j
Date: 2007-02-05 15:22:46 -0800 (Mon, 05 Feb 2007)
New Revision: 12433
Modified:
trunk/theora/configure.ac
trunk/theora/lib/x86_32/dsp_mmx.c
trunk/theora/lib/x86_32/dsp_mmxext.c
trunk/theora/lib/x86_32/fdct_mmx.c
trunk/theora/lib/x86_32/recon_mmx.c
Log:
Rework the MMX code to compile on Mac OS X (Intel).
It is known to compile with Xcode 2.4.1.
- .balign 16 -> .p2align 4
- gas on OS X does not support the assembler macros .rept / .endr;
  they are replaced with #defines.
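
For context, every file below is converted with the same pattern. Here is a
minimal, hypothetical sketch of that pattern on a made-up routine
(copy8_rows and its asm body are illustrative only, not code from this
commit): the ".rept 8" / ".endr" repetition, which Apple's assembler does
not understand, becomes a C preprocessor macro pasted eight times, and
".balign 16" becomes ".p2align 4", which requests the same 16-byte (2^4)
alignment.

/* Illustrative only -- not theora code.  Requires an x86 target with MMX. */

/* Before (gas-only): the loop body sat inside ".rept 8" / ".endr" and the
 * code was aligned with ".balign 16"; Apple's as rejects both. */

/* After: the loop body is a C preprocessor macro expanded eight times, so
 * no assembler macro support is needed. */
#define COPY_ROW \
        "  movq (%0), %%mm0  \n\t"  /* load 8 bytes from the source row  */ \
        "  movq %%mm0, (%1)  \n\t"  /* store them to the destination row */ \
        "  add %2, %0        \n\t"  /* advance source by one stride      */ \
        "  add %2, %1        \n\t"  /* advance destination by one stride */

static void copy8_rows (unsigned char *src, unsigned char *dst,
                        unsigned long stride)
{
  __asm__ __volatile__ (
    "  .p2align 4        \n\t"      /* same alignment as ".balign 16" */
    COPY_ROW
    COPY_ROW
    COPY_ROW
    COPY_ROW
    COPY_ROW
    COPY_ROW
    COPY_ROW
    COPY_ROW
    : "+r" (src),
      "+r" (dst)
    : "r" (stride)
    : "mm0", "memory"
  );
}

A real MMX routine also needs emms executed before any subsequent
floating-point code; that housekeeping is omitted from this sketch.
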
Modified: trunk/theora/configure.ac
===================================================================
--- trunk/theora/configure.ac 2007-02-05 21:55:44 UTC (rev 12432)
+++ trunk/theora/configure.ac 2007-02-05 23:22:46 UTC (rev 12433)
@@ -112,11 +112,9 @@
cpu_optimization="no optimization for your platform, please send a patch"
case $target_cpu in
i[[3456]]86)
- if test ! "x$target_vendor" = "xapple"; then
- cpu_x86_32=yes
- cpu_optimization="32 bit x86"
- AC_DEFINE([USE_ASM], [], [make use of asm optimization])
- fi
+ cpu_x86_32=yes
+ cpu_optimization="32 bit x86"
+ AC_DEFINE([USE_ASM], [], [make use of asm optimization])
;;
x86_64)
cpu_x86_64=yes
Modified: trunk/theora/lib/x86_32/dsp_mmx.c
===================================================================
--- trunk/theora/lib/x86_32/dsp_mmx.c 2007-02-05 21:55:44 UTC (rev 12432)
+++ trunk/theora/lib/x86_32/dsp_mmx.c 2007-02-05 23:22:46 UTC (rev 12433)
@@ -26,36 +26,42 @@
#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
+#define SUB_LOOP \
+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ \
+ " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */ \
+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */\
+ " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */\
+ /* convert from UINT8 to INT16 */ \
+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ \
+ " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */ \
+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ \
+ " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */ \
+ /* start calculation */ \
+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */ \
+ " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */ \
+ " movq %%mm0, (%2) \n\t" /* write answer out */ \
+ " movq %%mm2, 8(%2) \n\t" /* write answer out */ \
+ /* Increment pointers */ \
+ " add $16, %2 \n\t" \
+ " add %3, %0 \n\t" \
+ " add %4, %1 \n\t"
+
static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
ogg_uint32_t ReconPixelsPerLine)
{
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm7, %%mm7 \n\t"
-
- ".rept 8 \n\t"
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */
- /* start calculation */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */
- " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */
- " movq %%mm0, (%2) \n\t" /* write answer out */
- " movq %%mm2, 8(%2) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %2 \n\t"
- " add %3, %0 \n\t"
- " add %4, %1 \n\t"
- ".endr \n\t"
-
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
: "+r" (FiltPtr),
"+r" (ReconPtr),
"+r" (DctInputPtr)
@@ -65,31 +71,38 @@
);
}
+#define SUB_128_LOOP \
+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ \
+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */\
+ /* convert from UINT8 to INT16 */ \
+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ \
+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ \
+ /* start calculation */ \
+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */ \
+ " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */ \
+ " movq %%mm0, (%1) \n\t" /* write answer out */ \
+ " movq %%mm2, 8(%1) \n\t" /* write answer out */ \
+ /* Increment pointers */ \
+ " add $16, %1 \n\t" \
+ " add %2, %0 \n\t"
+
+
static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
ogg_uint32_t PixelsPerLine)
{
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm7, %%mm7 \n\t"
" movq %[V128], %%mm1 \n\t"
-
- ".rept 8 \n\t"
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */
- " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */
- " movq %%mm0, (%1) \n\t" /* write answer out */
- " movq %%mm2, 8(%1) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %1 \n\t"
- " add %2, %0 \n\t"
- ".endr \n\t"
-
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
: "+r" (FiltPtr),
"+r" (DctInputPtr)
: "m" (PixelsPerLine),
@@ -98,46 +111,53 @@
);
}
+#define SUB_AVG2_LOOP \
+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ \
+ " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr1 */ \
+ " movq (%2), %%mm4 \n\t" /* mm1 = ReconPtr2 */ \
+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */\
+ " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */\
+ " movq %%mm4, %%mm5 \n\t" /* dup to prepare for up conversion */\
+ /* convert from UINT8 to INT16 */ \
+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ \
+ " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr1) */ \
+ " punpcklbw %%mm7, %%mm4 \n\t" /* mm1 = INT16(ReconPtr2) */ \
+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ \
+ " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr1) */ \
+ " punpckhbw %%mm7, %%mm5 \n\t" /* mm3 = INT16(ReconPtr2) */ \
+ /* average ReconPtr1 and ReconPtr2 */ \
+ " paddw %%mm4, %%mm1 \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */ \
+ " paddw %%mm5, %%mm3 \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */ \
+ " psrlw $1, %%mm1 \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */ \
+ " psrlw $1, %%mm3 \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */ \
+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ \
+ " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ \
+ " movq %%mm0, (%3) \n\t" /* write answer out */ \
+ " movq %%mm2, 8(%3) \n\t" /* write answer out */ \
+ /* Increment pointers */ \
+ " add $16, %3 \n\t" \
+ " add %4, %0 \n\t" \
+ " add %5, %1 \n\t" \
+ " add %5, %2 \n\t"
+
+
static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
ogg_uint32_t PixelsPerLine,
ogg_uint32_t ReconPixelsPerLine)
{
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm7, %%mm7 \n\t"
-
- ".rept 8 \n\t"
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr1 */
- " movq (%2), %%mm4 \n\t" /* mm1 = ReconPtr2 */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
- " movq %%mm4, %%mm5 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr1) */
- " punpcklbw %%mm7, %%mm4 \n\t" /* mm1 = INT16(ReconPtr2) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr1) */
- " punpckhbw %%mm7, %%mm5 \n\t" /* mm3 = INT16(ReconPtr2) */
- /* average ReconPtr1 and ReconPtr2 */
- " paddw %%mm4, %%mm1 \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */
- " paddw %%mm5, %%mm3 \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */
- " psrlw $1, %%mm1 \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- " psrlw $1, %%mm3 \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- " movq %%mm0, (%3) \n\t" /* write answer out */
- " movq %%mm2, 8(%3) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %3 \n\t"
- " add %4, %0 \n\t"
- " add %5, %1 \n\t"
- " add %5, %2 \n\t"
- ".endr \n\t"
-
+ SUB_AVG2_LOOP
+ SUB_AVG2_LOOP
+ SUB_AVG2_LOOP
+ SUB_AVG2_LOOP
+ SUB_AVG2_LOOP
+ SUB_AVG2_LOOP
+ SUB_AVG2_LOOP
+ SUB_AVG2_LOOP
: "+r" (FiltPtr),
"+r" (ReconPtr1),
"+r" (ReconPtr2),
@@ -153,7 +173,7 @@
ogg_uint32_t MaxSad;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
" pxor %%mm7, %%mm7 \n\t" /* zero out mm7 for unpack */
@@ -203,7 +223,7 @@
ogg_uint32_t MaxSad;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
" pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
@@ -279,33 +299,38 @@
return MaxSad;
}
+#define SAD_LOOP \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%2), %%mm1 \n\t" \
+ " movq %%mm0, %%mm2 \n\t" \
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */ \
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */ \
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ \
+ " movq %%mm0, %%mm1 \n\t" \
+ " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */ \
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
+
static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
unsigned char *ptr2, ogg_uint32_t stride2)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
" pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- ".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " movq %%mm0, %%mm2 \n\t"
-
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
-
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
" movq %%mm7, %%mm0 \n\t"
" psrlq $32, %%mm7 \n\t"
" paddw %%mm0, %%mm7 \n\t"
@@ -341,7 +366,7 @@
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pcmpeqd %%mm5, %%mm5 \n\t" /* fefefefefefefefe in mm5 */
" paddb %%mm5, %%mm5 \n\t"
@@ -406,7 +431,7 @@
ogg_uint32_t XXSum;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm5, %%mm5 \n\t"
" pxor %%mm6, %%mm6 \n\t"
@@ -466,7 +491,7 @@
ogg_uint32_t XXSum;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm5, %%mm5 \n\t"
" pxor %%mm6, %%mm6 \n\t"
@@ -537,7 +562,7 @@
ogg_uint32_t XXSum;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4 */
" paddb %%mm4, %%mm4 \n\t"
Modified: trunk/theora/lib/x86_32/dsp_mmxext.c
===================================================================
--- trunk/theora/lib/x86_32/dsp_mmxext.c 2007-02-05 21:55:44 UTC (rev 12432)
+++ trunk/theora/lib/x86_32/dsp_mmxext.c 2007-02-05 23:22:46 UTC (rev 12433)
@@ -20,28 +20,36 @@
#include "codec_internal.h"
#include "dsp.h"
+#define SAD_MMXEXT_LOOP \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%2), %%mm1 \n\t" \
+ " psadbw %%mm1, %%mm0 \n\t" \
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
+
+
static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
unsigned char *ptr2, ogg_uint32_t stride2)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+
+ SAD_MMXEXT_LOOP
+ SAD_MMXEXT_LOOP
+ SAD_MMXEXT_LOOP
+ SAD_MMXEXT_LOOP
+ SAD_MMXEXT_LOOP
+ SAD_MMXEXT_LOOP
+ SAD_MMXEXT_LOOP
- ".rept 7 \n\t"
" movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm1 \n\t"
" psadbw %%mm1, %%mm0 \n\t"
- " add %3, %1 \n\t" /* Inc pointer into the new data */
" paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
-
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
" movd %%mm7, %0 \n\t"
: "=r" (DiffVal),
@@ -55,6 +63,15 @@
return DiffVal;
}
+#define SAD_TRES_LOOP \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%2), %%mm1 \n\t" \
+ " psadbw %%mm1, %%mm0 \n\t" \
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
+
+
static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
unsigned char *ptr2, ogg_uint32_t stride2,
ogg_uint32_t thres)
@@ -62,17 +79,17 @@
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- ".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
+ SAD_TRES_LOOP
+ SAD_TRES_LOOP
+ SAD_TRES_LOOP
+ SAD_TRES_LOOP
+ SAD_TRES_LOOP
+ SAD_TRES_LOOP
+ SAD_TRES_LOOP
+ SAD_TRES_LOOP
" movd %%mm7, %0 \n\t"
@@ -87,6 +104,19 @@
return DiffVal;
}
+#define SAD_XY2_TRES \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%2), %%mm1 \n\t" \
+ " movq (%3), %%mm2 \n\t" \
+ " pavgb %%mm2, %%mm1 \n\t" \
+ " psadbw %%mm1, %%mm0 \n\t" \
+ \
+ " add %4, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %5, %2 \n\t" /* Inc pointer into ref data */ \
+ " add %5, %3 \n\t" /* Inc pointer into ref data */
+
+
static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
unsigned char *RefDataPtr1,
unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
@@ -95,21 +125,17 @@
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- ".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " movq (%3), %%mm2 \n\t"
- " pavgb %%mm2, %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
+ SAD_XY2_TRES
+ SAD_XY2_TRES
+ SAD_XY2_TRES
+ SAD_XY2_TRES
+ SAD_XY2_TRES
+ SAD_XY2_TRES
+ SAD_XY2_TRES
+ SAD_XY2_TRES
- " add %4, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %5, %2 \n\t" /* Inc pointer into ref data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
-
" movd %%mm7, %0 \n\t"
: "=m" (DiffVal),
"+r" (SrcData),
@@ -128,7 +154,7 @@
ogg_uint32_t MaxSad;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" movd (%1), %%mm0 \n\t"
" movd (%2), %%mm1 \n\t"
@@ -157,7 +183,7 @@
ogg_uint32_t MaxSad;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
" pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
@@ -236,7 +262,7 @@
ogg_uint32_t XXSum;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm4, %%mm4 \n\t"
" pxor %%mm5, %%mm5 \n\t"
Modified: trunk/theora/lib/x86_32/fdct_mmx.c
===================================================================
--- trunk/theora/lib/x86_32/fdct_mmx.c 2007-02-05 21:55:44 UTC (rev 12432)
+++ trunk/theora/lib/x86_32/fdct_mmx.c 2007-02-05 23:22:46 UTC (rev 12433)
@@ -288,7 +288,7 @@
ogg_int16_t *const temp= (ogg_int16_t*)align_tmp;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
/*
* Input data is an 8x8 block. To make processing of the data more efficent
* we will transpose the block of data to two 4x8 blocks???
Modified: trunk/theora/lib/x86_32/recon_mmx.c
===================================================================
--- trunk/theora/lib/x86_32/recon_mmx.c 2007-02-05 21:55:44 UTC (rev 12432)
+++ trunk/theora/lib/x86_32/recon_mmx.c 2007-02-05 23:22:46 UTC (rev 12433)
@@ -24,7 +24,7 @@
unsigned int stride)
{
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" lea (%2, %2, 2), %%edi \n\t"
@@ -62,7 +62,7 @@
ogg_uint32_t LineStep)
{
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" movq %[V128], %%mm0 \n\t" /* Set mm0 to 0x8080808080808080 */
@@ -92,7 +92,7 @@
ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
{
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm0, %%mm0 \n\t"
" lea 128(%1), %%edi \n\t"
@@ -129,7 +129,7 @@
ogg_uint32_t LineStep)
{
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm0, %%mm0 \n\t"
" lea 128(%1), %%edi \n\t"