[xiph-commits] r16258 - in branches/theora-gumboot/lib/dec: . x86

gumboot at svn.xiph.org gumboot at svn.xiph.org
Fri Jul 10 20:15:55 PDT 2009


Author: gumboot
Date: 2009-07-10 20:15:54 -0700 (Fri, 10 Jul 2009)
New Revision: 16258

Modified:
   branches/theora-gumboot/lib/dec/decode.c
   branches/theora-gumboot/lib/dec/x86/mmxidct.c
   branches/theora-gumboot/lib/dec/x86/mmxstate.c
Log:
Quick-and-dirty rolling of the zig-zag reordering and dequantisation into the unpacking code (broken on everything except non-VC x86 with MMX).  Seems to be 25% faster for me now.


Modified: branches/theora-gumboot/lib/dec/decode.c
===================================================================
--- branches/theora-gumboot/lib/dec/decode.c	2009-07-11 01:23:16 UTC (rev 16257)
+++ branches/theora-gumboot/lib/dec/decode.c	2009-07-11 03:15:54 UTC (rev 16258)
@@ -218,7 +218,6 @@
  OC_DCT_CW_PACK(0, 5, -1, 0),
  OC_DCT_CW_FINISH,           
  OC_DCT_CW_FINISH,           
- OC_DCT_CW_FINISH,           
 };
 #undef OC_DCT_CW_PACK
 
@@ -1308,6 +1307,29 @@
    (fragy_end-fragy0)*(ptrdiff_t)nhfrags-ncoded_fragis;
 }
 
+#if 1
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+   each quadrant of the destination.*/
+static const unsigned char OC_FZIG_ZAG_MMX[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3,32,11,18,25, 4,12,
+   5,26,19,40,33,34,41,48,
+  27, 6,13,20,28,21,14, 7,
+  56,49,42,35,43,50,57,36,
+  15,22,29,30,23,44,37,58,
+  51,59,38,45,52,31,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+};
+#endif
+
 /*Reconstructs all coded fragments in a single MCU (one or two super block
    rows).
   This requires that each coded fragment have a proper macro block mode and
@@ -1338,11 +1360,40 @@
   for(fragii=0;fragii<ncoded_fragis;fragii++){
     /*This array is made twice as large as necessary so that an invalid zero
        run cannot cause a buffer overflow.*/
-    ogg_int16_t dct_coeffs[128];
+    OC_ALIGN8(ogg_int16_t dct_coeffs[128]);
     ptrdiff_t   fragi;
     int         last_zzi;
     int         zzi;
     fragi=coded_fragis[fragii];
+#if 1
+    ogg_uint16_t const*ac_quant;
+    /*First zero the buffer.*/
+    /*On K7, etc., this could be replaced with movntq and sfence.*/
+    __asm__ __volatile__(
+      "pxor %%mm0,%%mm0\n\t"
+      "movq %%mm0,(%[y])\n\t"
+      "movq %%mm0,8(%[y])\n\t"
+      "movq %%mm0,16(%[y])\n\t"
+      "movq %%mm0,24(%[y])\n\t"
+      "movq %%mm0,32(%[y])\n\t"
+      "movq %%mm0,40(%[y])\n\t"
+      "movq %%mm0,48(%[y])\n\t"
+      "movq %%mm0,56(%[y])\n\t"
+      "movq %%mm0,64(%[y])\n\t"
+      "movq %%mm0,72(%[y])\n\t"
+      "movq %%mm0,80(%[y])\n\t"
+      "movq %%mm0,88(%[y])\n\t"
+      "movq %%mm0,96(%[y])\n\t"
+      "movq %%mm0,104(%[y])\n\t"
+      "movq %%mm0,112(%[y])\n\t"
+      "movq %%mm0,120(%[y])\n\t"
+      :
+      :[y]"r"(dct_coeffs)
+      :"memory"
+    );
+    qti=frags[fragi].mb_mode!=OC_MODE_INTRA;
+    ac_quant=_pipe->dequant[_pli][frags[fragi].qii][qti];
+#endif
     /*Decode the AC coefficients.*/
     for(zzi=0;zzi<64;){
       int token;
@@ -1377,8 +1428,13 @@
         coeff=(cw>>OC_DCT_CW_MAG_SHIFT);
         eob_runs[zzi]=eob;
         ti[zzi]=lti;
+#if 0
         while(--rlen>=0)dct_coeffs[zzi++]=0;
         dct_coeffs[zzi]=coeff;
+#else
+        zzi+=rlen;
+        dct_coeffs[OC_FZIG_ZAG_MMX[zzi]]=(ogg_int16_t)(coeff*(int)ac_quant[zzi]);
+#endif
         zzi+=(eob==0);
       }
     }

Modified: branches/theora-gumboot/lib/dec/x86/mmxidct.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86/mmxidct.c	2009-07-11 01:23:16 UTC (rev 16257)
+++ branches/theora-gumboot/lib/dec/x86/mmxidct.c	2009-07-11 03:15:54 UTC (rev 16258)
@@ -613,6 +613,7 @@
   }
   else{
     int zzi;
+#if 0
     /*First zero the buffer.*/
     /*On K7, etc., this could be replaced with movntq and sfence.*/
     __asm__ __volatile__(
@@ -637,11 +638,14 @@
       :[y]"r"(_y)
       :"memory"
     );
+#endif
     /*Dequantize the coefficients.*/
     _y[0]=(ogg_int16_t)(_x[0]*(int)_dc_quant);
+#if 0
     for(zzi=1;zzi<_ncoefs;zzi++){
       _y[OC_FZIG_ZAG_MMX[zzi]]=(ogg_int16_t)(_x[zzi]*(int)_ac_quant[zzi]);
     }
+#endif
     /*Then perform the iDCT.*/
     if(_last_zzi<10)oc_idct8x8_10(_y);
     else oc_idct8x8_slow(_y);

Modified: branches/theora-gumboot/lib/dec/x86/mmxstate.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86/mmxstate.c	2009-07-11 01:23:16 UTC (rev 16257)
+++ branches/theora-gumboot/lib/dec/x86/mmxstate.c	2009-07-11 03:15:54 UTC (rev 16258)
@@ -27,7 +27,11 @@
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]){
+#if 0
   OC_ALIGN8(ogg_int16_t    res_buf[64]);
+#else
+  ogg_int16_t *res_buf = _dct_coeffs;
+#endif
   unsigned char          *dst;
   ptrdiff_t               frag_buf_off;
   int                     ystride;



More information about the commits mailing list