[Theora-dev] [PATCH] promised MMX patches rc1
Rudolf Marek
r.marek at sh.cvut.cz
Tue Jul 19 06:05:24 PDT 2005
> i would like to try it
I now have a 64-bit userspace account, so I fixed it.
I would like to have it tested. Could you please try it? ./dump_video produces files with the same MD5SUMs, so I think it works.
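In case it helps review: the point of the separate 64-bit cpuid macro is that pushl/popl are not valid instructions in 64-bit mode, so the 64-bit path saves %rbx with the full-width push/pop and copies the result out of %ebx before the restore (CPUID clobbers %ebx, which the compiler may be relying on, e.g. as the PIC register). The other changes mostly widen the stride operands to pointer size so the lea address arithmetic uses full 64-bit registers. Below is a minimal standalone sketch of the same cpuid idea; it is not taken from the patch, it parks %rbx in a scratch register with xchg instead of push/pop, and the names (cpuid_leaf, scratch) are just for illustration.

#include <stdio.h>

/* Illustrative only: run CPUID while leaving %rbx unchanged for the
 * caller (the compiler may be using it, e.g. as the PIC register). */
static void cpuid_leaf(unsigned op,unsigned *eax,unsigned *ebx,
 unsigned *ecx,unsigned *edx){
  unsigned long scratch;
  __asm__ __volatile__(
    "xchgq %%rbx, %1 \n\t"   /* park %rbx in a scratch register */
    "cpuid           \n\t"
    "xchgq %%rbx, %1 \n\t"   /* restore %rbx, keep its CPUID value */
    :"=a" (*eax),
     "=&r" (scratch),
     "=c" (*ecx),
     "=d" (*edx)
    :"a" (op)
    :"cc"
  );
  *ebx=(unsigned)scratch;
}

int main(void){
  unsigned eax,ebx,ecx,edx;
  cpuid_leaf(0,&eax,&ebx,&ecx,&edx);  /* leaf 0: max leaf + vendor string */
  printf("vendor: %.4s%.4s%.4s\n",
   (char *)&ebx,(char *)&edx,(char *)&ecx);
  return 0;
}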
The patch is attached.
You need to apply it with -p2 (the diff paths start with svn/theora-exp/ and integrate/theora-exp/); sorry about that.
Derf, if you like it, please check it into SVN.
I hope to find some time to come up with an MMX IDCT patch soon. (I just need to integrate it into the current SVN tree; the patch itself works.)
I would like to thank j^ for testing this patch.
Regards
Rudolf
-------------- next part --------------
diff -Naur svn/theora-exp/lib/x86/cpu.c integrate/theora-exp/lib/x86/cpu.c
--- svn/theora-exp/lib/x86/cpu.c 2005-07-19 14:11:13.975046750 +0200
+++ integrate/theora-exp/lib/x86/cpu.c 2005-07-19 14:49:33.690770000 +0200
@@ -20,6 +20,25 @@
ogg_uint32_t ecx;
ogg_uint32_t edx;
ogg_uint32_t flags;
+
+#if (defined(__amd64__) || defined(__x86_64__))
+
+#define cpuid(op,eax,ebx,ecx,edx) \
+ __asm__ __volatile__( \
+ "push %%rbx \n\t" \
+ "cpuid \n\t" \
+ "movl %%ebx,%1 \n\t" \
+ "pop %%rbx" \
+ :"=a" (eax), \
+ "=r" (ebx), \
+ "=c" (ecx), \
+ "=d" (edx) \
+ :"a" (op) \
+ :"cc" \
+ )
+
+#else
+
#define cpuid(op,eax,ebx,ecx,edx) \
__asm__ __volatile__( \
"pushl %%ebx \n\t" \
@@ -33,6 +52,7 @@
:"a" (op) \
:"cc" \
)
+
__asm__ __volatile__(
"pushfl \n\t"
"pushfl \n\t"
@@ -51,6 +71,7 @@
);
/*No cpuid.*/
if(eax==ebx)return 0;
+#endif
cpuid(0,eax,ebx,ecx,edx);
if(ebx==0x756e6547&&edx==0x49656e69&&ecx==0x6c65746e){
/*Intel:*/
diff -Naur svn/theora-exp/lib/x86/mmxfrag.c integrate/theora-exp/lib/x86/mmxfrag.c
--- svn/theora-exp/lib/x86/mmxfrag.c 2005-07-19 14:11:13.971046500 +0200
+++ integrate/theora-exp/lib/x86/mmxfrag.c 2005-07-19 14:55:05.863529500 +0200
@@ -20,7 +20,7 @@
__asm__ __volatile__(
" mov $0x7, %%ecx \n\t" /* 8x loop */
" .balign 16 \n\t"
- "1: movq (V128), %%mm0 \n\t" /* Set mm0 to 0x0080008000800080 */
+ "1: movq %3, %%mm0 \n\t" /* Set mm0 to 0x0080008000800080 */
" movq (%1), %%mm2 \n\t" /* First four input values */
" movq %%mm0, %%mm1 \n\t" /* Set mm1 == mm0 */
" movq 8(%1), %%mm3 \n\t" /* Next four input values */
@@ -34,8 +34,9 @@
" jns 1b \n\t" /* loop */
:"+r" (_dst)
:"r" (_residue),
- "r" (_dst_ystride)
- :"memory", "ecx"
+ "r" ((long) _dst_ystride),
+ "m" (V128)
+ :"memory", "ecx", "cc"
);
}
@@ -61,13 +62,58 @@
" jns 1b \n\t" /* loop */
:"+r" (_dst)
:"r" (_residue),
- "r" (_dst_ystride),
- "r" (_src_ystride),
+ "r" ((long) _dst_ystride),
+ "r" ((long) _src_ystride),
"r" (_src)
- :"memory", "eax"
+ :"memory", "eax", "cc"
);
}
+#if (defined(__amd64__) || defined(__x86_64__))
+
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue){
+
+ __asm__ __volatile__(
+ " movl $0x7, %%eax \n\t" /* 8x loop */
+ " pxor %%mm0, %%mm0 \n\t" /* zero mm0 */
+ " movq (%4), %%mm2 \n\t" /* load mm2 with _src1 */
+ " .balign 16 \n\t"
+ "1: movq (%6), %%mm4 \n\t" /* packed SRC2 */
+ " movq %%mm2, %%mm3 \n\t" /* copy to mm3 */
+ " movq %%mm4, %%mm5 \n\t" /* copy packed src2 to mm5 */
+ " punpcklbw %%mm0, %%mm2 \n\t" /* expand low part of src1 to mm2 */
+ " punpcklbw %%mm0, %%mm4 \n\t" /* low part expand of src2 to mm4 */
+ " lea (%4,%3), %4 \n\t" /* _src1+_src1_ystride */
+ " punpckhbw %%mm0, %%mm3 \n\t" /* expand high part of src1 to mm3 */
+ " punpckhbw %%mm0, %%mm5 \n\t" /* high part expand of src2 to mm5 */
+ " paddsw %%mm2, %%mm4 \n\t" /* add low parts of src1 and src2 */
+ " paddsw %%mm3, %%mm5 \n\t" /* add high parts of src1 and src2 */
+ " lea (%6,%5), %6 \n\t" /* _src2+_src2_ystride */
+ " movq (%4), %%mm2 \n\t" /* load mm2 with _src1 */
+ " psrlw $1, %%mm4 \n\t" /* shift logical 1 to right o 2 dolu */
+ " psrlw $1, %%mm5 \n\t" /* shift logical 1 to right */
+ " paddsw (%1), %%mm4 \n\t" /* add low parts wwith low parts */
+ " paddsw 8(%1), %%mm5 \n\t" /* add highparts with high */
+ " packuswb %%mm5, %%mm4 \n\t" /* pack saturate high to low */
+ " lea 0x10(%1), %1 \n\t" /* _residuo+16 */
+ " movq %%mm4, (%0) \n\t" /* write to src */
+ " decl %%eax \n\t"
+ " lea (%0,%2), %0 \n\t" /* _dst+_dst_ystride */
+ " jns 1b\n\t"
+ :"+r" (_dst) /* 0 */
+ :"r" (_residue), /* 1 */
+ "r" ((long) _dst_ystride), /* 2 */
+ "r" ((long) _src1_ystride), /* 3 */
+ "r" (_src1), /* 4 */
+ "r" ((long) _src2_ystride), /* 5 */
+ "r" (_src2) /* 6 */
+ : "memory", "cc", "eax"
+ );
+}
+#else
+
void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
int _src2_ystride,const ogg_int16_t *_residue){
@@ -109,10 +155,12 @@
"m" (_src2_ystride), /* 5 */
"r" (_src2), /* 6 */
"m" (i)
- :"memory", "eax"
+ : "memory", "eax", "cc"
);
}
+#endif
+
void oc_restore_fpu_mmx(void){
__asm__ __volatile__(
" emms \n\t" /* pack with next(high) four values */
diff -Naur svn/theora-exp/lib/x86/mmxstate.c integrate/theora-exp/lib/x86/mmxstate.c
--- svn/theora-exp/lib/x86/mmxstate.c 2005-07-19 14:11:13.971046500 +0200
+++ integrate/theora-exp/lib/x86/mmxstate.c 2005-07-19 14:57:31.636639750 +0200
@@ -39,6 +39,37 @@
frag=_state->frags+*fragi;
dst=frag->buffer[dst_framei];
src=frag->buffer[src_framei];
+
+#if (defined(__amd64__) || defined(__x86_64__))
+ __asm__ __volatile__(
+ " lea (%3, %3, 2), %%rsi \n\t" /* esi=src_stride*3 */
+ " movq (%1), %%mm0 \n\t" /* src */
+ " lea (%2, %2, 2), %%rdi \n\t" /* edi=dst_stride*3 */
+ " movq (%1, %3), %%mm1 \n\t" /* src+1x stride */
+ " movq (%1, %3, 2), %%mm2 \n\t" /* src+2x stride */
+ " movq (%1, %%rsi), %%mm3 \n\t" /* src+3x stride */
+ " movq %%mm0, (%0) \n\t" /* dst */
+ " movq %%mm1, (%0, %2)\n\t" /* dst+dst_stride */
+ " lea (%1,%3,4), %1 \n\t" /* pointer to next 4 */
+ " movq %%mm2, (%0, %2, 2) \n\t" /*dst+2x dst_stride */
+ " movq %%mm3, (%0, %%rdi) \n\t" /* 3x */
+ " lea (%0,%2,4), %0 \n\t" /* pointer to next 4 */
+ " movq (%1), %%mm0 \n\t" /* src */
+ " movq (%1, %3), %%mm1 \n\t" /* src+1x stride */
+ " movq (%1, %3, 2), %%mm2 \n\t" /* src+2x stride */
+ " movq (%1, %%rsi), %%mm3 \n\t" /* src+3x stride */
+ " movq %%mm0, (%0) \n\t" /* dst */
+ " movq %%mm1, (%0, %2)\n\t" /* dst+dst_stride */
+ " movq %%mm2, (%0, %2, 2) \n\t" /* dst+2x dst_stride */
+ " movq %%mm3, (%0, %%rdi) \n\t" /* 3x */
+ :"+r" (dst) /* 0 */
+ :"r" (src), /* 1 */
+ "r" ((long) dst_ystride), /* 2 */
+ "r" ((long) src_ystride) /* 3 */
+ :"memory", "rsi","rdi"
+ );
+ }
+#else
__asm__ __volatile__(
" lea (%3, %3, 2), %%esi \n\t" /* esi=src_stride*3 */
" movq (%1), %%mm0 \n\t" /* src */
@@ -67,6 +98,7 @@
:"memory", "esi","edi"
);
}
+#endif
/*This needs to be removed when decode specific functions are implemented:*/
__asm__ __volatile__("emms\n\t");
}