[xiph-commits] r15838 - branches/theora-thusnelda/lib/enc/x86
giles at svn.xiph.org
giles at svn.xiph.org
Tue Mar 24 14:08:34 PDT 2009
Author: giles
Date: 2009-03-24 14:08:34 -0700 (Tue, 24 Mar 2009)
New Revision: 15838
Modified:
branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c
branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c
branches/theora-thusnelda/lib/enc/x86/dsp_sse2.c
branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c
Log:
Replace additional .balign calls. More painfully, replace .rept/.endr
with C preprocessor macros. This pseudo-op also isn't supported by
Apple's fork of the gnu assembler.
With the previous commit, this reproduces trunk commit 12433.
Modified: branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c 2009-03-24 20:27:32 UTC (rev 15837)
+++ branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c 2009-03-24 21:08:34 UTC (rev 15838)
@@ -28,6 +28,26 @@
#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
+#define SUB_LOOP \
+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ \
+ " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */ \
+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */ \
+ " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */ \
+ /* convert from UINT8 to INT16 */ \
+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ \
+ " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */ \
+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ \
+ " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */ \
+ /* start calculation */ \
+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */ \
+ " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */ \
+ " movq %%mm0, (%2) \n\t" /* write answer out */ \
+ " movq %%mm2, 8(%2) \n\t" /* write answer out */ \
+ /* Increment pointers */ \
+ " add $16, %2 \n\t" \
+ " add %3, %0 \n\t" \
+ " add %3, %1 \n\t"
+
static void sub8x8__mmx (const unsigned char *FiltPtr, const unsigned char *ReconPtr,
ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine)
{
@@ -36,27 +56,16 @@
" pxor %%mm7, %%mm7 \n\t"
- ".rept 8 \n\t"
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */
- /* start calculation */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */
- " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */
- " movq %%mm0, (%2) \n\t" /* write answer out */
- " movq %%mm2, 8(%2) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %2 \n\t"
- " add %3, %0 \n\t"
- " add %3, %1 \n\t"
- ".endr \n\t"
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+
: "+r" (FiltPtr),
"+r" (ReconPtr),
"+r" (DctInputPtr)
@@ -66,6 +75,21 @@
);
}
+#define SUB_128_LOOP \
+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ \
+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */ \
+ /* convert from UINT8 to INT16 */ \
+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ \
+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ \
+ /* start calculation */ \
+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */ \
+ " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */ \
+ " movq %%mm0, (%1) \n\t" /* write answer out */ \
+ " movq %%mm2, 8(%1) \n\t" /* write answer out */ \
+ /* Increment pointers */ \
+ " add $16, %1 \n\t" \
+ " add %2, %0 \n\t"
+
static void sub8x8_128__mmx (const unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
ogg_uint32_t PixelsPerLine)
{
@@ -76,21 +100,14 @@
" pxor %%mm7, %%mm7 \n\t"
" movq %[V128], %%mm1 \n\t"
- ".rept 8 \n\t"
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */
- " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */
- " movq %%mm0, (%1) \n\t" /* write answer out */
- " movq %%mm2, 8(%1) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %1 \n\t"
- " add %2, %0 \n\t"
- ".endr \n\t"
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
: "+r" (FiltPtr),
"+r" (DctInputPtr)
Modified: branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c 2009-03-24 20:27:32 UTC (rev 15837)
+++ branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c 2009-03-24 21:08:34 UTC (rev 15838)
@@ -22,23 +22,30 @@
#if defined(USE_ASM)
+#define SAD_LOOP \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%2), %%mm1 \n\t" \
+ " psadbw %%mm1, %%mm0 \n\t" \
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %3, %2 \n\t" /* Inc pointer into ref data */
+
static ogg_uint32_t sad8x8__mmxext (const unsigned char *ptr1, const unsigned char *ptr2,
ogg_uint32_t stride)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- ".rept 7 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %3, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
" movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm1 \n\t"
@@ -56,23 +63,31 @@
return DiffVal;
}
+#define SAD_THRES_LOOP \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%2), %%mm1 \n\t" \
+ " psadbw %%mm1, %%mm0 \n\t" \
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %3, %2 \n\t" /* Inc pointer into ref data */
+
static ogg_uint32_t sad8x8_thres__mmxext (const unsigned char *ptr1, const unsigned char *ptr2,
ogg_uint32_t stride, ogg_uint32_t thres)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- ".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %3, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
+ SAD_THRES_LOOP
+ SAD_THRES_LOOP
+ SAD_THRES_LOOP
+ SAD_THRES_LOOP
+ SAD_THRES_LOOP
+ SAD_THRES_LOOP
+ SAD_THRES_LOOP
+ SAD_THRES_LOOP
" movd %%mm7, %0 \n\t"
@@ -86,6 +101,18 @@
return DiffVal;
}
+#define SAD_XY2_THRES_LOOP \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%2), %%mm1 \n\t" \
+ " movq (%3), %%mm2 \n\t" \
+ " pavgb %%mm2, %%mm1 \n\t" \
+ " psadbw %%mm1, %%mm0 \n\t" \
+ \
+ " add %4, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */ \
+ " add %4, %3 \n\t" /* Inc pointer into ref data */
+
static ogg_uint32_t sad8x8_xy2_thres__mmxext (const unsigned char *SrcData, const unsigned char *RefDataPtr1,
const unsigned char *RefDataPtr2, ogg_uint32_t Stride,
ogg_uint32_t thres)
@@ -93,20 +120,17 @@
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- ".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " movq (%3), %%mm2 \n\t"
- " pavgb %%mm2, %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
- " add %4, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
- " add %4, %3 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
+ SAD_XY2_THRES_LOOP
+ SAD_XY2_THRES_LOOP
+ SAD_XY2_THRES_LOOP
+ SAD_XY2_THRES_LOOP
+ SAD_XY2_THRES_LOOP
+ SAD_XY2_THRES_LOOP
+ SAD_XY2_THRES_LOOP
+ SAD_XY2_THRES_LOOP
" movd %%mm7, %0 \n\t"
: "=m" (DiffVal),
Modified: branches/theora-thusnelda/lib/enc/x86/dsp_sse2.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/dsp_sse2.c 2009-03-24 20:27:32 UTC (rev 15837)
+++ branches/theora-thusnelda/lib/enc/x86/dsp_sse2.c 2009-03-24 21:08:34 UTC (rev 15838)
@@ -26,7 +26,7 @@
int ret,tmp,tmp2;
__asm__ (
- ".balign 16 \n"
+ ".p2align 4 \n"
"movd %[in],%%xmm0\n"
"punpcklwd %%xmm0,%%xmm0\n"
"punpcklwd %%xmm0,%%xmm0\n"
Modified: branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c 2009-03-24 20:27:32 UTC (rev 15837)
+++ branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c 2009-03-24 21:08:34 UTC (rev 15838)
@@ -289,7 +289,7 @@
ogg_int16_t __attribute__((aligned(8))) temp[8*8];
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
/*
* Input data is an 8x8 block. To make processing of the data more efficent
* we will transpose the block of data to two 4x8 blocks???
More information about the commits
mailing list