[xiph-commits] r15838 - branches/theora-thusnelda/lib/enc/x86
giles at svn.xiph.org
giles at svn.xiph.org
Tue Mar 24 14:08:34 PDT 2009
Author: giles
Date: 2009-03-24 14:08:34 -0700 (Tue, 24 Mar 2009)
New Revision: 15838
Modified:
branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c
branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c
branches/theora-thusnelda/lib/enc/x86/dsp_sse2.c
branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c
Log:
Replace additional .balign calls. More painfully, replace .rept/.endr
with C preprocessor macros. This pseudo-op also isn't supported by
Apple's fork of the gnu assembler.
With the previous commit, this reproduces trunk commit 12433.
Modified: branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c 2009-03-24 20:27:32 UTC (rev 15837)
+++ branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c 2009-03-24 21:08:34 UTC (rev 15838)
@@ -28,6 +28,26 @@
#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
+#define SUB_LOOP \
+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ \
+ " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */ \
+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */ \
+ " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */ \
+ /* convert from UINT8 to INT16 */ \
+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ \
+ " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */ \
+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ \
+ " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */ \
+ /* start calculation */ \
+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */ \
+ " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */ \
+ " movq %%mm0, (%2) \n\t" /* write answer out */ \
+ " movq %%mm2, 8(%2) \n\t" /* write answer out */ \
+ /* Increment pointers */ \
+ " add $16, %2 \n\t" \
+ " add %3, %0 \n\t" \
+ " add %3, %1 \n\t"
+
static void sub8x8__mmx (const unsigned char *FiltPtr, const unsigned char *ReconPtr,
ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine)
{
@@ -36,27 +56,16 @@
" pxor %%mm7, %%mm7 \n\t"
- ".rept 8 \n\t"
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */
- /* start calculation */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */
- " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */
- " movq %%mm0, (%2) \n\t" /* write answer out */
- " movq %%mm2, 8(%2) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %2 \n\t"
- " add %3, %0 \n\t"
- " add %3, %1 \n\t"
- ".endr \n\t"
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+ SUB_LOOP
+
: "+r" (FiltPtr),
"+r" (ReconPtr),
"+r" (DctInputPtr)
@@ -66,6 +75,21 @@
);
}
+#define SUB_128_LOOP \
+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ \
+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */ \
+ /* convert from UINT8 to INT16 */ \
+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ \
+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ \
+ /* start calculation */ \
+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */ \
+ " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */ \
+ " movq %%mm0, (%1) \n\t" /* write answer out */ \
+ " movq %%mm2, 8(%1) \n\t" /* write answer out */ \
+ /* Increment pointers */ \
+ " add $16, %1 \n\t" \
+ " add %2, %0 \n\t"
+
static void sub8x8_128__mmx (const unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
ogg_uint32_t PixelsPerLine)
{
@@ -76,21 +100,14 @@
" pxor %%mm7, %%mm7 \n\t"
" movq %[V128], %%mm1 \n\t"
- ".rept 8 \n\t"
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */
- " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */
- " movq %%mm0, (%1) \n\t" /* write answer out */
- " movq %%mm2, 8(%1) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %1 \n\t"
- " add %2, %0 \n\t"
- ".endr \n\t"
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
+ SUB_128_LOOP
: "+r" (FiltPtr),
"+r" (DctInputPtr)
Modified: branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c 2009-03-24 20:27:32 UTC (rev 15837)
+++ branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c 2009-03-24 21:08:34 UTC (rev 15838)
@@ -22,23 +22,30 @@
#if defined(USE_ASM)
+#define SAD_LOOP \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%2), %%mm1 \n\t" \
+ " psadbw %%mm1, %%mm0 \n\t" \
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %3, %2 \n\t" /* Inc pointer into ref data */
+
static ogg_uint32_t sad8x8__mmxext (const unsigned char *ptr1, const unsigned char *ptr2,
ogg_uint32_t stride)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- ".rept 7 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %3, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
+ SAD_LOOP
" movq (%1), %%mm0 \n\t" /* take 8 bytes */
" movq (%2), %%mm1 \n\t"
@@ -56,23 +63,31 @@
return DiffVal;
}
+#define SAD_THRES_LOOP \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%2), %%mm1 \n\t" \
+ " psadbw %%mm1, %%mm0 \n\t" \
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %3, %2 \n\t" /* Inc pointer into ref data */
+
static ogg_uint32_t sad8x8_thres__mmxext (const unsigned char *ptr1, const unsigned char *ptr2,
ogg_uint32_t stride, ogg_uint32_t thres)
{
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- ".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %3, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
+ SAD_THRES_LOOP
+ SAD_THRES_LOOP
+ SAD_THRES_LOOP
+ SAD_THRES_LOOP
+ SAD_THRES_LOOP
+ SAD_THRES_LOOP
+ SAD_THRES_LOOP
+ SAD_THRES_LOOP
" movd %%mm7, %0 \n\t"
@@ -86,6 +101,18 @@
return DiffVal;
}
+#define SAD_XY2_THRES_LOOP \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%2), %%mm1 \n\t" \
+ " movq (%3), %%mm2 \n\t" \
+ " pavgb %%mm2, %%mm1 \n\t" \
+ " psadbw %%mm1, %%mm0 \n\t" \
+ \
+ " add %4, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */ \
+ " add %4, %3 \n\t" /* Inc pointer into ref data */
+
static ogg_uint32_t sad8x8_xy2_thres__mmxext (const unsigned char *SrcData, const unsigned char *RefDataPtr1,
const unsigned char *RefDataPtr2, ogg_uint32_t Stride,
ogg_uint32_t thres)
@@ -93,20 +120,17 @@
ogg_uint32_t DiffVal;
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
" pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- ".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " movq (%3), %%mm2 \n\t"
- " pavgb %%mm2, %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
- " add %4, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
- " add %4, %3 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
+ SAD_XY2_THRES_LOOP
+ SAD_XY2_THRES_LOOP
+ SAD_XY2_THRES_LOOP
+ SAD_XY2_THRES_LOOP
+ SAD_XY2_THRES_LOOP
+ SAD_XY2_THRES_LOOP
+ SAD_XY2_THRES_LOOP
+ SAD_XY2_THRES_LOOP
" movd %%mm7, %0 \n\t"
: "=m" (DiffVal),
Modified: branches/theora-thusnelda/lib/enc/x86/dsp_sse2.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/dsp_sse2.c 2009-03-24 20:27:32 UTC (rev 15837)
+++ branches/theora-thusnelda/lib/enc/x86/dsp_sse2.c 2009-03-24 21:08:34 UTC (rev 15838)
@@ -26,7 +26,7 @@
int ret,tmp,tmp2;
__asm__ (
- ".balign 16 \n"
+ ".p2align 4 \n"
"movd %[in],%%xmm0\n"
"punpcklwd %%xmm0,%%xmm0\n"
"punpcklwd %%xmm0,%%xmm0\n"
Modified: branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c 2009-03-24 20:27:32 UTC (rev 15837)
+++ branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c 2009-03-24 21:08:34 UTC (rev 15838)
@@ -289,7 +289,7 @@
ogg_int16_t __attribute__((aligned(8))) temp[8*8];
__asm__ __volatile__ (
- " .balign 16 \n\t"
+ " .p2align 4 \n\t"
/*
* Input data is an 8x8 block. To make processing of the data more efficent
* we will transpose the block of data to two 4x8 blocks???
More information about the commits
mailing list