[Theora-dev] MMX IDCT for theora-exp
Rudolf Marek
r.marek at sh.cvut.cz
Wed Jul 20 13:31:41 PDT 2005
Hello,
I optimized it a bit, from:
39182 6.9383 10146 2.6140 dump oc_state_frag_recon_mmx
to:
92486 4.3839 24848 1.5635 dump oc_state_frag_recon_mmx
This patch applies on top of the previous one.
I won't be available until Sunday, so I will reply to suggestions later.
Regards
Rudolf
-------------- next part --------------
diff -Naur b/lib/x86/mmxstate.c c/lib/x86/mmxstate.c
--- b/lib/x86/mmxstate.c 2005-07-20 16:48:32.718713000 +0200
+++ c/lib/x86/mmxstate.c 2005-07-20 22:22:08.161600000 +0200
@@ -36,7 +36,7 @@
void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
- ogg_int16_t __attribute__((aligned(8),used)) res_buf[64];
+ ogg_int16_t __attribute__((aligned(8))) res_buf[64];
int dst_framei;
int dst_ystride;
int zzi;
@@ -73,30 +73,36 @@
Who knows.*/
p=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
-#if (defined(__amd64__) || defined(__x86_64__))
/* for(ci=0;ci<64;ci++)res_buf[ci]=p; */
- __asm__ __volatile__(
- "mov %%rdi,%%rdx\n" /* I cant tell the GCC that EDI value is clobbered */
- "cld\n"
- "rep\n"
- "stosq\n"
- "mov %%rdx,%%rdi\n"
- :
- : "D" (res_buf), "a" (p), "c" (16)
- : "memory", "cc", "rdx"
- );
-#else
- __asm__ __volatile__(
- "mov %%edi,%%edx\n" /* I cant tell the GCC that EDI value is clobbered */
- "cld\n"
- "rep\n"
- "stosw\n"
- "mov %%edx,%%edi\n" /* I cant tell the GCC that EDI value is clobbered */
- :
- : "D" (res_buf), "a" (p), "c" (64)
- : "memory", "%edx", "cc"
- );
-#endif
+ /* this could be also with MMX 2 */
+ __asm__ __volatile__(
+ "movzwl %1,%%eax\n"
+ "movd %%eax,%%mm0\n" /* XXXX XXXX 0000 AAAA */
+ "movq %%mm0,%%mm1\n" /* XXXX XXXX 0000 AAAA */
+ "pslld $16,%%mm1\n" /* XXXX XXXX AAAA 0000 */
+ "por %%mm0,%%mm1\n" /* XXXX XXXX AAAA AAAA */
+ "movq %%mm1,%%mm0\n" /* XXXX XXXX AAAA AAAA */
+ "psllq $32, %%mm1\n" /* AAAA AAAA 0000 0000 */
+ "por %%mm1,%%mm0\n" /* AAAA AAAA AAAA AAAA */
+ "movq %%mm0,(%0)\n"
+ "movq %%mm0,8(%0)\n"
+ "movq %%mm0,16(%0)\n"
+ "movq %%mm0,24(%0)\n"
+ "movq %%mm0,32(%0)\n"
+ "movq %%mm0,40(%0)\n"
+ "movq %%mm0,48(%0)\n"
+ "movq %%mm0,56(%0)\n"
+ "movq %%mm0,64(%0)\n"
+ "movq %%mm0,72(%0)\n"
+ "movq %%mm0,80(%0)\n"
+ "movq %%mm0,88(%0)\n"
+ "movq %%mm0,96(%0)\n"
+ "movq %%mm0,104(%0)\n"
+ "movq %%mm0,112(%0)\n"
+ "movq %%mm0,120(%0)\n"
+ :
+ : "r" (res_buf), "r" (p)
+ : "memory" );
}
else{
@@ -104,42 +110,42 @@
the iDCT.*/
/* First zero the buffer */
-
-#if (defined(__amd64__) || defined(__x86_64__))
- __asm__ __volatile__(
- "mov %%rdi,%%rdx\n" /* I cant tell the GCC that EDI value is clobbered */
- "xor %%rax,%%rax\n"
- "cld\n"
- "rep\n"
- "stosq\n"
- "mov %%rdx,%%rdi\n"
- :
- : "D" (res_buf), "c" (16)
- : "memory", "cc", "rdx"
- );
+ /* on K7 etc this could be replaced with movntq and sfence */
-#else
- __asm__ __volatile__(
- "mov %%edi,%%edx\n" /* I cant tell the GCC that EDI value is clobbered */
- "xor %%eax,%%eax\n"
- "cld\n"
- "rep\n"
- "stosw\n"
- "mov %%edx,%%edi\n" /* I cant tell the GCC that EDI value is clobbered */
- :
- : "D" (res_buf), "c" (64)
- : "memory", "%edx", "cc"
+ __asm__ __volatile__(
+ "pxor %%mm0,%%mm0\n"
+ "movq %%mm0,(%0)\n"
+ "movq %%mm0,8(%0)\n"
+ "movq %%mm0,16(%0)\n"
+ "movq %%mm0,24(%0)\n"
+ "movq %%mm0,32(%0)\n"
+ "movq %%mm0,40(%0)\n"
+ "movq %%mm0,48(%0)\n"
+ "movq %%mm0,56(%0)\n"
+ "movq %%mm0,64(%0)\n"
+ "movq %%mm0,72(%0)\n"
+ "movq %%mm0,80(%0)\n"
+ "movq %%mm0,88(%0)\n"
+ "movq %%mm0,96(%0)\n"
+ "movq %%mm0,104(%0)\n"
+ "movq %%mm0,112(%0)\n"
+ "movq %%mm0,120(%0)\n"
+ :
+ : "r" (res_buf)
+ : "memory"
);
-#endif
res_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
+ /* this is in plan to be rewritten in MMX */
+
for(zzi=1;zzi<_ncoefs;zzi++){
int ci;
ci=OC_FZIG_ZAG[zzi];
res_buf[OC_FZIG_ZAGMMX[zzi]]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[ci]*_ac_iquant[ci]);
}
+
if(_last_zzi<10){
oc_idct8x8_10_mmx(res_buf);
}
More information about the Theora-dev
mailing list