[xiph-commits] r16260 - in branches/theora-gumboot/lib: dec/x86 enc
gumboot at svn.xiph.org
gumboot at svn.xiph.org
Sat Jul 11 13:19:36 PDT 2009
Author: gumboot
Date: 2009-07-11 13:19:36 -0700 (Sat, 11 Jul 2009)
New Revision: 16260
Modified:
branches/theora-gumboot/lib/dec/x86/mmxidct.c
branches/theora-gumboot/lib/enc/analyze.c
branches/theora-gumboot/lib/enc/tokenize.c
Log:
Make the encoder work, but still be dirty and broken on everything except non-VC x86 with MMX. The encoder now seems to be slightly slower, but at least it works.
Also create a zig-zag-ordered, unquantised DCT coefficient buffer for oc_enc_tokenize_ac() so that it doesn't have to mess about with double indirection through the zig-zag table on every coefficient lookup. This did help a little, but it didn't completely compensate for whatever is slowing things down.
Modified: branches/theora-gumboot/lib/dec/x86/mmxidct.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86/mmxidct.c 2009-07-11 10:03:10 UTC (rev 16259)
+++ branches/theora-gumboot/lib/dec/x86/mmxidct.c 2009-07-11 20:19:36 UTC (rev 16260)
@@ -581,7 +581,11 @@
ogg_uint16_t p;
/*We round this dequant product (and not any of the others) because there's
no iDCT rounding.*/
+#if 0
p=(ogg_int16_t)(_x[0]*(ogg_int32_t)_dc_quant+15>>5);
+#else
+ p=(ogg_int16_t)(_y[0]*(ogg_int32_t)_dc_quant+15>>5);
+#endif
/*Fill _y with p.*/
__asm__ __volatile__(
/*mm0=0000 0000 0000 AAAA*/
@@ -640,11 +644,13 @@
);
#endif
/*Dequantize the coefficients.*/
+#if 0
_y[0]=(ogg_int16_t)(_x[0]*(int)_dc_quant);
-#if 0
for(zzi=1;zzi<_ncoefs;zzi++){
_y[OC_FZIG_ZAG_MMX[zzi]]=(ogg_int16_t)(_x[zzi]*(int)_ac_quant[zzi]);
}
+#else
+ _y[0]=(ogg_int16_t)(_y[0]*(int)_dc_quant);
#endif
/*Then perform the iDCT.*/
if(_last_zzi<10)oc_idct8x8_10(_y);
Modified: branches/theora-gumboot/lib/enc/analyze.c
===================================================================
--- branches/theora-gumboot/lib/enc/analyze.c 2009-07-11 10:03:10 UTC (rev 16259)
+++ branches/theora-gumboot/lib/enc/analyze.c 2009-07-11 20:19:36 UTC (rev 16260)
@@ -516,6 +516,7 @@
oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,int _overhead_bits,
oc_rd_metric *_mo,oc_token_checkpoint **_stack){
OC_ALIGN16(ogg_int16_t buffer[64]);
+ OC_ALIGN16(ogg_int16_t zzbuffer[64]);
OC_ALIGN16(ogg_int16_t data[64]);
const ogg_uint16_t *dequant;
const oc_iquant *enquant;
@@ -651,6 +652,7 @@
int d;
v=buffer[OC_FZIG_ZAG[zzi]];
d=dequant[zzi];
+ zzbuffer[zzi]=v;
val=v<<1;
v=abs(val);
if(v>=d){
@@ -670,36 +672,43 @@
}
/*Tokenize.*/
checkpoint=*_stack;
- ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,buffer,nonzero+1,
+ ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,zzbuffer,nonzero+1,
_stack,mb_mode==OC_MODE_INTRA?3:0);
/*Reconstruct.
TODO: nonzero may need to be adjusted after tokenization.*/
+#if 0
oc_dequant_idct8x8(&_enc->state,buffer,data,
nonzero+1,nonzero+1,dequant[0],(ogg_uint16_t *)dequant);
- if(mb_mode==OC_MODE_INTRA)oc_enc_frag_recon_intra(_enc,dst,ystride,buffer);
+#else
+// memcpy(buffer, data, sizeof(buffer));
+ int dc=data[0];
+ oc_dequant_idct8x8(&_enc->state,data,data,
+ nonzero+1,nonzero+1,dequant[0],(ogg_uint16_t *)dequant);
+#endif
+ if(mb_mode==OC_MODE_INTRA)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
else{
oc_enc_frag_recon_inter(_enc,dst,
- nmv_offs==1?ref+mv_offs[0]:dst,ystride,buffer);
+ nmv_offs==1?ref+mv_offs[0]:dst,ystride,data);
}
#if !defined(OC_COLLECT_METRICS)
if(frame_type!=OC_INTRA_FRAME)
#endif
{
/*In retrospect, should we have skipped this block?*/
- oc_enc_frag_sub(_enc,buffer,src,dst,ystride);
+ oc_enc_frag_sub(_enc,data,src,dst,ystride);
coded_ssd=coded_dc=0;
if(borderi<0){
for(pi=0;pi<64;pi++){
- coded_ssd+=buffer[pi]*buffer[pi];
- coded_dc+=buffer[pi];
+ coded_ssd+=data[pi]*data[pi];
+ coded_dc+=data[pi];
}
}
else{
ogg_int64_t mask;
mask=_enc->state.borders[borderi].mask;
for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
- coded_ssd+=buffer[pi]*buffer[pi];
- coded_dc+=buffer[pi];
+ coded_ssd+=data[pi]*data[pi];
+ coded_dc+=data[pi];
}
}
/*Scale to match DCT domain.*/
@@ -736,7 +745,11 @@
_mo->ac_bits+=ac_bits;
}
}
+#if 0
frags[_fragi].dc=data[0];
+#else
+ frags[_fragi].dc=dc;
+#endif
frags[_fragi].coded=1;
return 1;
}
Modified: branches/theora-gumboot/lib/enc/tokenize.c
===================================================================
--- branches/theora-gumboot/lib/enc/tokenize.c 2009-07-11 10:03:10 UTC (rev 16259)
+++ branches/theora-gumboot/lib/enc/tokenize.c 2009-07-11 20:19:36 UTC (rev 16260)
@@ -208,6 +208,29 @@
int qc;
};
+#if 1
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+ each quadrant of the destination.*/
+static const unsigned char OC_FZIG_ZAG_MMX[128]={
+ 0, 8, 1, 2, 9,16,24,17,
+ 10, 3,32,11,18,25, 4,12,
+ 5,26,19,40,33,34,41,48,
+ 27, 6,13,20,28,21,14, 7,
+ 56,49,42,35,43,50,57,36,
+ 15,22,29,30,23,44,37,58,
+ 51,59,38,45,52,31,60,53,
+ 46,39,47,54,61,62,55,63,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+};
+#endif
+
int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
int _zzi,oc_token_checkpoint **_stack,int _acmin){
@@ -255,7 +278,7 @@
qc=_qdct[zzi];
s=-(qc<0);
qc=qc+s^s;
- c=_dct[OC_FZIG_ZAG[zzi]];
+ c=_dct[zzi];
if(qc<=1){
ogg_uint32_t sum_d2;
int nzeros;
@@ -324,7 +347,7 @@
token=OC_DCT_RUN_CAT2A+cat;
bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
val=2+((val+val_s^val_s)>2);
- e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj]*val;
+ e=(_dct[zzj]+val_s^val_s)-_dequant[zzj]*val;
d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
cost=d2+lambda*bits+tokens[zzk][tk].cost;
if(cost<=best_cost){
@@ -347,7 +370,7 @@
token=OC_DCT_RUN_CAT1B+cat;
eb=(-val_s<<cat+2)+nzeros-6-(cat<<2);
}
- e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj];
+ e=(_dct[zzj]+val_s^val_s)-_dequant[zzj];
d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
cost=d2+lambda*bits+tokens[zzk][tk].cost;
@@ -626,6 +649,32 @@
}
/*Emit the tokens from the best path through the trellis.*/
stack=*_stack;
+#if 1
+ int dc=_qdct[0];
+ __asm__ __volatile__(
+ "pxor %%mm0,%%mm0\n\t"
+ "movq %%mm0,(%[y])\n\t"
+ "movq %%mm0,8(%[y])\n\t"
+ "movq %%mm0,16(%[y])\n\t"
+ "movq %%mm0,24(%[y])\n\t"
+ "movq %%mm0,32(%[y])\n\t"
+ "movq %%mm0,40(%[y])\n\t"
+ "movq %%mm0,48(%[y])\n\t"
+ "movq %%mm0,56(%[y])\n\t"
+ "movq %%mm0,64(%[y])\n\t"
+ "movq %%mm0,72(%[y])\n\t"
+ "movq %%mm0,80(%[y])\n\t"
+ "movq %%mm0,88(%[y])\n\t"
+ "movq %%mm0,96(%[y])\n\t"
+ "movq %%mm0,104(%[y])\n\t"
+ "movq %%mm0,112(%[y])\n\t"
+ "movq %%mm0,120(%[y])\n\t"
+ :
+ :[y]"r"(_qdct)
+ :"memory"
+ );
+ _qdct[0]=dc;
+#endif
zzi=1;
ti=best_flags>>1&1;
bits=tokens[zzi][ti].bits;
@@ -642,7 +691,11 @@
/*We don't include the actual EOB cost for this block in the return value.
It will be paid for by the fragment that terminates the EOB run.*/
bits-=tokens[zzi][ti].bits;
+#if 0
for(;zzi<_zzi;zzi++)_qdct[zzi]=0;
+#else
+ zzi=_zzi;
+#endif
break;
}
/*Emit pending EOB run if any.*/
@@ -654,8 +707,12 @@
next=tokens[zzi][ti].next;
qc=tokens[zzi][ti].qc;
zzj=(next>>1)-1&63;
+#if 0
for(;zzi<zzj;zzi++)_qdct[zzi]=0;
_qdct[zzj]=qc;
+#else
+ _qdct[OC_FZIG_ZAG_MMX[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
+#endif
zzi=next>>1;
ti=next&1;
}
More information about the commits mailing list