[xiph-commits] r16258 - in branches/theora-gumboot/lib/dec: . x86
gumboot at svn.xiph.org
gumboot at svn.xiph.org
Fri Jul 10 20:15:55 PDT 2009
Author: gumboot
Date: 2009-07-10 20:15:54 -0700 (Fri, 10 Jul 2009)
New Revision: 16258
Modified:
branches/theora-gumboot/lib/dec/decode.c
branches/theora-gumboot/lib/dec/x86/mmxidct.c
branches/theora-gumboot/lib/dec/x86/mmxstate.c
Log:
Quick-and-dirty rolling of the zig-zag and dequantisation steps into the un-packing code (currently broken on everything except non-VC x86 builds with MMX). It now seems to be about 25% faster for me.
Modified: branches/theora-gumboot/lib/dec/decode.c
===================================================================
--- branches/theora-gumboot/lib/dec/decode.c 2009-07-11 01:23:16 UTC (rev 16257)
+++ branches/theora-gumboot/lib/dec/decode.c 2009-07-11 03:15:54 UTC (rev 16258)
@@ -218,7 +218,6 @@
OC_DCT_CW_PACK(0, 5, -1, 0),
OC_DCT_CW_FINISH,
OC_DCT_CW_FINISH,
- OC_DCT_CW_FINISH,
};
#undef OC_DCT_CW_PACK
@@ -1308,6 +1307,29 @@
(fragy_end-fragy0)*(ptrdiff_t)nhfrags-ncoded_fragis;
}
+#if 1
+/*Lookup table from a zig-zag index to a position in the coefficient buffer.
+ This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+ each quadrant of the destination.
+ The first 64 entries are a permutation of 0...63.
+ The remaining 64 entries all map to position 64; presumably this sends an
+ out-of-range zig-zag index (e.g., one produced by an invalid zero run) into
+ the padding half of a 128-entry coefficient buffer rather than onto a real
+ coefficient.
+ NOTE(review): confirm this intent against the code that indexes the table.*/
+static const unsigned char OC_FZIG_ZAG_MMX[128]={
+ 0, 8, 1, 2, 9,16,24,17,
+ 10, 3,32,11,18,25, 4,12,
+ 5,26,19,40,33,34,41,48,
+ 27, 6,13,20,28,21,14, 7,
+ 56,49,42,35,43,50,57,36,
+ 15,22,29,30,23,44,37,58,
+ 51,59,38,45,52,31,60,53,
+ 46,39,47,54,61,62,55,63,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+};
+#endif
+
/*Reconstructs all coded fragments in a single MCU (one or two super block
rows).
This requires that each coded fragment have a proper macro block mode and
@@ -1338,11 +1360,40 @@
for(fragii=0;fragii<ncoded_fragis;fragii++){
/*This array is made twice as large as necessary so that an invalid zero
run cannot cause a buffer overflow.*/
- ogg_int16_t dct_coeffs[128];
+ OC_ALIGN8(ogg_int16_t dct_coeffs[128]);
ptrdiff_t fragi;
int last_zzi;
int zzi;
fragi=coded_fragis[fragii];
+#if 1
+ ogg_uint16_t const*ac_quant;
+ /*First zero the buffer.*/
+ /*On K7, etc., this could be replaced with movntq and sfence.*/
+ __asm__ __volatile__(
+ "pxor %%mm0,%%mm0\n\t"
+ "movq %%mm0,(%[y])\n\t"
+ "movq %%mm0,8(%[y])\n\t"
+ "movq %%mm0,16(%[y])\n\t"
+ "movq %%mm0,24(%[y])\n\t"
+ "movq %%mm0,32(%[y])\n\t"
+ "movq %%mm0,40(%[y])\n\t"
+ "movq %%mm0,48(%[y])\n\t"
+ "movq %%mm0,56(%[y])\n\t"
+ "movq %%mm0,64(%[y])\n\t"
+ "movq %%mm0,72(%[y])\n\t"
+ "movq %%mm0,80(%[y])\n\t"
+ "movq %%mm0,88(%[y])\n\t"
+ "movq %%mm0,96(%[y])\n\t"
+ "movq %%mm0,104(%[y])\n\t"
+ "movq %%mm0,112(%[y])\n\t"
+ "movq %%mm0,120(%[y])\n\t"
+ :
+ :[y]"r"(dct_coeffs)
+ :"memory"
+ );
+ qti=frags[fragi].mb_mode!=OC_MODE_INTRA;
+ ac_quant=_pipe->dequant[_pli][frags[fragi].qii][qti];
+#endif
/*Decode the AC coefficients.*/
for(zzi=0;zzi<64;){
int token;
@@ -1377,8 +1428,13 @@
coeff=(cw>>OC_DCT_CW_MAG_SHIFT);
eob_runs[zzi]=eob;
ti[zzi]=lti;
+#if 0
while(--rlen>=0)dct_coeffs[zzi++]=0;
dct_coeffs[zzi]=coeff;
+#else
+ zzi+=rlen;
+ dct_coeffs[OC_FZIG_ZAG_MMX[zzi]]=(ogg_int16_t)(coeff*(int)ac_quant[zzi]);
+#endif
zzi+=(eob==0);
}
}
Modified: branches/theora-gumboot/lib/dec/x86/mmxidct.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86/mmxidct.c 2009-07-11 01:23:16 UTC (rev 16257)
+++ branches/theora-gumboot/lib/dec/x86/mmxidct.c 2009-07-11 03:15:54 UTC (rev 16258)
@@ -613,6 +613,7 @@
}
else{
int zzi;
+#if 0
/*First zero the buffer.*/
/*On K7, etc., this could be replaced with movntq and sfence.*/
__asm__ __volatile__(
@@ -637,11 +638,14 @@
:[y]"r"(_y)
:"memory"
);
+#endif
/*Dequantize the coefficients.*/
_y[0]=(ogg_int16_t)(_x[0]*(int)_dc_quant);
+#if 0
for(zzi=1;zzi<_ncoefs;zzi++){
_y[OC_FZIG_ZAG_MMX[zzi]]=(ogg_int16_t)(_x[zzi]*(int)_ac_quant[zzi]);
}
+#endif
/*Then perform the iDCT.*/
if(_last_zzi<10)oc_idct8x8_10(_y);
else oc_idct8x8_slow(_y);
Modified: branches/theora-gumboot/lib/dec/x86/mmxstate.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86/mmxstate.c 2009-07-11 01:23:16 UTC (rev 16257)
+++ branches/theora-gumboot/lib/dec/x86/mmxstate.c 2009-07-11 03:15:54 UTC (rev 16258)
@@ -27,7 +27,11 @@
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]){
+#if 0
OC_ALIGN8(ogg_int16_t res_buf[64]);
+#else
+ ogg_int16_t *res_buf = _dct_coeffs;
+#endif
unsigned char *dst;
ptrdiff_t frag_buf_off;
int ystride;
More information about the commits mailing list