[xiph-commits] r9592 - experimental/derf/theora-exp/lib/x86
tterribe at svn.xiph.org
tterribe at svn.xiph.org
Tue Jul 19 19:23:32 PDT 2005
Author: tterribe
Date: 2005-07-19 19:23:30 -0700 (Tue, 19 Jul 2005)
New Revision: 9592
Modified:
experimental/derf/theora-exp/lib/x86/cpu.c
experimental/derf/theora-exp/lib/x86/mmxfrag.c
experimental/derf/theora-exp/lib/x86/mmxstate.c
Log:
Port of MMX code to x86-64 (contributed by Rudolf Marek).
Modified: experimental/derf/theora-exp/lib/x86/cpu.c
===================================================================
--- experimental/derf/theora-exp/lib/x86/cpu.c 2005-07-20 02:17:59 UTC (rev 9591)
+++ experimental/derf/theora-exp/lib/x86/cpu.c 2005-07-20 02:23:30 UTC (rev 9592)
@@ -20,8 +20,27 @@
ogg_uint32_t ecx;
ogg_uint32_t edx;
ogg_uint32_t flags;
+
+#if (defined(__amd64__) || defined(__x86_64__))
+
#define cpuid(op,eax,ebx,ecx,edx) \
__asm__ __volatile__( \
+ "push %%rbx \n\t" \
+ "cpuid \n\t" \
+ "movl %%ebx,%1 \n\t" \
+ "pop %%rbx" \
+ :"=a" (eax), \
+ "=r" (ebx), \
+ "=c" (ecx), \
+ "=d" (edx) \
+ :"a" (op) \
+ :"cc" \
+ )
+
+#else
+
+#define cpuid(op,eax,ebx,ecx,edx) \
+ __asm__ __volatile__( \
"pushl %%ebx \n\t" \
"cpuid \n\t" \
"movl %%ebx,%1 \n\t" \
@@ -51,6 +70,7 @@
);
/*No cpuid.*/
if(eax==ebx)return 0;
+#endif
cpuid(0,eax,ebx,ecx,edx);
if(ebx==0x756e6547&&edx==0x49656e69&&ecx==0x6c65746e){
/*Intel:*/
Modified: experimental/derf/theora-exp/lib/x86/mmxfrag.c
===================================================================
--- experimental/derf/theora-exp/lib/x86/mmxfrag.c 2005-07-20 02:17:59 UTC (rev 9591)
+++ experimental/derf/theora-exp/lib/x86/mmxfrag.c 2005-07-20 02:23:30 UTC (rev 9592)
@@ -20,7 +20,7 @@
__asm__ __volatile__(
" mov $0x7, %%ecx \n\t" /* 8x loop */
" .balign 16 \n\t"
- "1: movq (V128), %%mm0 \n\t" /* Set mm0 to 0x0080008000800080 */
+ "1:movq %3, %%mm0 \n\t" /* Set mm0 to 0x0080008000800080 */
" movq (%1), %%mm2 \n\t" /* First four input values */
" movq %%mm0, %%mm1 \n\t" /* Set mm1 == mm0 */
" movq 8(%1), %%mm3 \n\t" /* Next four input values */
@@ -34,8 +34,9 @@
" jns 1b \n\t" /* loop */
:"+r" (_dst)
:"r" (_residue),
- "r" (_dst_ystride)
- :"memory", "ecx"
+ "r" ((long)_dst_ystride),
+ "m" (V128)
+ :"memory", "ecx", "cc"
);
}
@@ -61,16 +62,61 @@
" jns 1b \n\t" /* loop */
:"+r" (_dst)
:"r" (_residue),
- "r" (_dst_ystride),
- "r" (_src_ystride),
+ "r" ((long)_dst_ystride),
+ "r" ((long)_src_ystride),
"r" (_src)
- :"memory", "eax"
+ :"memory", "eax", "cc"
);
}
+#if (defined(__amd64__) || defined(__x86_64__))
+
void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
int _src2_ystride,const ogg_int16_t *_residue){
+
+ __asm__ __volatile__(
+ " movl $0x7, %%eax \n\t" /* 8x loop */
+ " pxor %%mm0, %%mm0 \n\t" /* zero mm0 */
+ " movq (%4), %%mm2 \n\t" /* load mm2 with _src1 */
+ " .balign 16 \n\t"
+ "1:movq (%6), %%mm4 \n\t" /* packed SRC2 */
+ " movq %%mm2, %%mm3 \n\t" /* copy to mm3 */
+ " movq %%mm4, %%mm5 \n\t" /* copy packed src2 to mm5 */
+ " punpcklbw %%mm0, %%mm2 \n\t" /* expand low part of src1 to mm2 */
+ " punpcklbw %%mm0, %%mm4 \n\t" /* low part expand of src2 to mm4 */
+ " lea (%4,%3), %4 \n\t" /* _src1+_src1_ystride */
+ " punpckhbw %%mm0, %%mm3 \n\t" /* expand high part of src1 to mm3 */
+ " punpckhbw %%mm0, %%mm5 \n\t" /* high part expand of src2 to mm5 */
+ " paddsw %%mm2, %%mm4 \n\t" /* add low parts of src1 and src2 */
+ " paddsw %%mm3, %%mm5 \n\t" /* add high parts of src1 and src2 */
+ " lea (%6,%5), %6 \n\t" /* _src2+_src2_ystride */
+ " movq (%4), %%mm2 \n\t" /* load mm2 with _src1 */
+ " psrlw $1, %%mm4 \n\t" /* logical shift right by 1: halve the low sums */
+ " psrlw $1, %%mm5 \n\t" /* shift logical 1 to right */
+ " paddsw (%1), %%mm4 \n\t" /* add first four residue values to low parts */
+ " paddsw 8(%1), %%mm5 \n\t" /* add next four residue values to high parts */
+ " packuswb %%mm5, %%mm4 \n\t" /* pack saturate high to low */
+ " lea 0x10(%1), %1 \n\t" /* _residue+16 */
+ " movq %%mm4, (%0) \n\t" /* write to dst */
+ " decl %%eax \n\t"
+ " lea (%0,%2), %0 \n\t" /* _dst+_dst_ystride */
+ " jns 1b\n\t"
+ :"+r" (_dst) /* 0 */
+ :"r" (_residue), /* 1 */
+ "r" ((long)_dst_ystride), /* 2 */
+ "r" ((long)_src1_ystride), /* 3 */
+ "r" (_src1), /* 4 */
+ "r" ((long)_src2_ystride), /* 5 */
+ "r" (_src2) /* 6 */
+ : "memory", "cc", "eax"
+ );
+}
+#else
+
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue){
int i;
__asm__ __volatile__(
" movl $0x7, %7 \n\t" /* 8x loop */
@@ -109,10 +155,12 @@
"m" (_src2_ystride), /* 5 */
"r" (_src2), /* 6 */
"m" (i)
- :"memory", "eax"
+ :"memory", "eax", "cc"
);
}
+#endif
+
void oc_restore_fpu_mmx(void){
__asm__ __volatile__(
" emms \n\t" /* pack with next(high) four values */
Modified: experimental/derf/theora-exp/lib/x86/mmxstate.c
===================================================================
--- experimental/derf/theora-exp/lib/x86/mmxstate.c 2005-07-20 02:17:59 UTC (rev 9591)
+++ experimental/derf/theora-exp/lib/x86/mmxstate.c 2005-07-20 02:23:30 UTC (rev 9592)
@@ -39,7 +39,38 @@
frag=_state->frags+*fragi;
dst=frag->buffer[dst_framei];
src=frag->buffer[src_framei];
+
+#if (defined(__amd64__) || defined(__x86_64__))
__asm__ __volatile__(
+ " lea (%3, %3, 2), %%rsi \n\t" /* rsi=src_ystride*3 */
+ " movq (%1), %%mm0 \n\t" /* src */
+ " lea (%2, %2, 2), %%rdi \n\t" /* rdi=dst_ystride*3 */
+ " movq (%1, %3), %%mm1 \n\t" /* src+1x stride */
+ " movq (%1, %3, 2), %%mm2 \n\t" /* src+2x stride */
+ " movq (%1, %%rsi), %%mm3 \n\t" /* src+3x stride */
+ " movq %%mm0, (%0) \n\t" /* dst */
+ " movq %%mm1, (%0, %2)\n\t" /* dst+dst_stride */
+ " lea (%1,%3,4), %1 \n\t" /* pointer to next 4 */
+ " movq %%mm2, (%0, %2, 2) \n\t" /*dst+2x dst_stride */
+ " movq %%mm3, (%0, %%rdi) \n\t" /* 3x */
+ " lea (%0,%2,4), %0 \n\t" /* pointer to next 4 */
+ " movq (%1), %%mm0 \n\t" /* src */
+ " movq (%1, %3), %%mm1 \n\t" /* src+1x stride */
+ " movq (%1, %3, 2), %%mm2 \n\t" /* src+2x stride */
+ " movq (%1, %%rsi), %%mm3 \n\t" /* src+3x stride */
+ " movq %%mm0, (%0) \n\t" /* dst */
+ " movq %%mm1, (%0, %2)\n\t" /* dst+dst_stride */
+ " movq %%mm2, (%0, %2, 2) \n\t" /* dst+2x dst_stride */
+ " movq %%mm3, (%0, %%rdi) \n\t" /* 3x */
+ :"+r" (dst) /* 0 */
+ :"r" (src), /* 1 */
+ "r" ((long)dst_ystride), /* 2 */
+ "r" ((long)src_ystride) /* 3 */
+ :"memory", "rsi","rdi"
+ );
+ }
+#else
+ __asm__ __volatile__(
" lea (%3, %3, 2), %%esi \n\t" /* esi=src_stride*3 */
" movq (%1), %%mm0 \n\t" /* src */
" lea (%2, %2, 2), %%edi \n\t" /* edi=dst_stride*3 */
@@ -67,6 +98,7 @@
:"memory", "esi","edi"
);
}
+#endif
/*This needs to be removed when decode specific functions are implemented:*/
__asm__ __volatile__("emms\n\t");
}
More information about the commits
mailing list