[Theora-dev] MMX IDCT for theora-exp

Rudolf Marek r.marek at sh.cvut.cz
Wed Jul 20 13:31:41 PDT 2005


Hello,

I optimized a bit from:
39182     6.9383  10146     2.6140  dump                     oc_state_frag_recon_mmx
to:
92486     4.3839  24848     1.5635  dump                     oc_state_frag_recon_mmx

This patch applies on top of the previous one.
I won't be available until Sunday, so I will reply to suggestions later.

Regards

Rudolf
-------------- next part --------------
diff -Naur b/lib/x86/mmxstate.c c/lib/x86/mmxstate.c
--- b/lib/x86/mmxstate.c	2005-07-20 16:48:32.718713000 +0200
+++ c/lib/x86/mmxstate.c	2005-07-20 22:22:08.161600000 +0200
@@ -36,7 +36,7 @@
 void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
-  ogg_int16_t  __attribute__((aligned(8),used)) res_buf[64];
+  ogg_int16_t  __attribute__((aligned(8))) res_buf[64];
   int dst_framei;
   int dst_ystride;
   int zzi;
@@ -73,30 +73,36 @@
       Who knows.*/
     p=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
 
-#if (defined(__amd64__) ||  defined(__x86_64__))
 	/* for(ci=0;ci<64;ci++)res_buf[ci]=p; */
-     __asm__ __volatile__(
-	"mov %%rdi,%%rdx\n"  /* I cant tell the GCC that EDI value is clobbered */
-	"cld\n"
-	"rep\n"
-	"stosq\n"
-	"mov %%rdx,%%rdi\n" 
-    : 
-    : "D" (res_buf), "a" (p), "c" (16)
-    : "memory", "cc", "rdx"
-    );
-#else
-       __asm__ __volatile__(
-	"mov %%edi,%%edx\n"  /* I cant tell the GCC that EDI value is clobbered */
-	"cld\n"
-	"rep\n"
-	"stosw\n"
-	 "mov %%edx,%%edi\n"  /* I cant tell the GCC that EDI value is clobbered */
-    : 
-    : "D" (res_buf), "a" (p), "c" (64)
-    : "memory", "%edx", "cc"
-    );
-#endif
+	/* this could also be done with MMX 2 */
+	    __asm__ __volatile__(
+	    "movzwl %1,%%eax\n"
+	    "movd %%eax,%%mm0\n"   	/* XXXX XXXX 0000 AAAA */
+	    "movq %%mm0,%%mm1\n"	/* XXXX XXXX 0000 AAAA */
+	    "pslld $16,%%mm1\n"		/* XXXX XXXX AAAA 0000 */
+	    "por %%mm0,%%mm1\n"		/* XXXX XXXX AAAA AAAA */
+	    "movq %%mm1,%%mm0\n"	/* XXXX XXXX AAAA AAAA */
+	    "psllq $32, %%mm1\n"	/* AAAA AAAA 0000 0000 */
+	    "por %%mm1,%%mm0\n"		/* AAAA AAAA AAAA AAAA */
+	    "movq %%mm0,(%0)\n"
+	    "movq %%mm0,8(%0)\n"
+	    "movq %%mm0,16(%0)\n"
+	    "movq %%mm0,24(%0)\n"
+	    "movq %%mm0,32(%0)\n"
+	    "movq %%mm0,40(%0)\n"
+	    "movq %%mm0,48(%0)\n"
+	    "movq %%mm0,56(%0)\n"
+	    "movq %%mm0,64(%0)\n"
+	    "movq %%mm0,72(%0)\n"
+	    "movq %%mm0,80(%0)\n"
+	    "movq %%mm0,88(%0)\n"
+	    "movq %%mm0,96(%0)\n"
+	    "movq %%mm0,104(%0)\n"
+	    "movq %%mm0,112(%0)\n"
+	    "movq %%mm0,120(%0)\n"
+	    :
+	    : "r" (res_buf), "r" (p)
+	    : "memory" );
   }
   else{
     
@@ -104,42 +110,42 @@
        the iDCT.*/
 
     /* First zero the buffer */
-    
-#if (defined(__amd64__) ||  defined(__x86_64__))
 
-	    __asm__ __volatile__(
-		"mov %%rdi,%%rdx\n"  /* I cant tell the GCC that EDI value is clobbered */
-		"xor %%rax,%%rax\n"
-		"cld\n"
-		"rep\n"
-		"stosq\n"
-		"mov %%rdx,%%rdi\n" 
-	    : 
-	    : "D" (res_buf), "c" (16)
-	    : "memory", "cc", "rdx"
-	    );
+	/* on K7 etc this could be replaced with movntq and sfence */
 
-#else    
-	       __asm__ __volatile__(
-		"mov %%edi,%%edx\n"  /* I cant tell the GCC that EDI value is clobbered */
-		"xor %%eax,%%eax\n"
-		"cld\n"
-		"rep\n"
-		"stosw\n"
-		"mov %%edx,%%edi\n"  /* I cant tell the GCC that EDI value is clobbered */
-	    : 
-	    : "D" (res_buf), "c" (64)
-	    : "memory", "%edx", "cc"
+	    __asm__ __volatile__(
+	    "pxor %%mm0,%%mm0\n"
+	    "movq %%mm0,(%0)\n"
+	    "movq %%mm0,8(%0)\n"
+	    "movq %%mm0,16(%0)\n"
+	    "movq %%mm0,24(%0)\n"
+	    "movq %%mm0,32(%0)\n"
+	    "movq %%mm0,40(%0)\n"
+	    "movq %%mm0,48(%0)\n"
+	    "movq %%mm0,56(%0)\n"
+	    "movq %%mm0,64(%0)\n"
+	    "movq %%mm0,72(%0)\n"
+	    "movq %%mm0,80(%0)\n"
+	    "movq %%mm0,88(%0)\n"
+	    "movq %%mm0,96(%0)\n"
+	    "movq %%mm0,104(%0)\n"
+	    "movq %%mm0,112(%0)\n"
+	    "movq %%mm0,120(%0)\n"
+	    :
+	    : "r" (res_buf)
+	    : "memory"
 	    );
 
-#endif
 	res_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
   
+  	/* this is planned to be rewritten in MMX */
+
   	for(zzi=1;zzi<_ncoefs;zzi++){
       	int ci;
       	ci=OC_FZIG_ZAG[zzi];
       	res_buf[OC_FZIG_ZAGMMX[zzi]]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[ci]*_ac_iquant[ci]);
     	}
+	
     if(_last_zzi<10){
       oc_idct8x8_10_mmx(res_buf);
     }


More information about the Theora-dev mailing list