[xiph-commits] r17307 - in experimental/derf/theora-ptalarbvorm/lib: . x86
tterribe at svn.xiph.org
tterribe at svn.xiph.org
Sat Jun 26 23:02:15 PDT 2010
Author: tterribe
Date: 2010-06-26 23:02:15 -0700 (Sat, 26 Jun 2010)
New Revision: 17307
Modified:
experimental/derf/theora-ptalarbvorm/lib/analyze.c
experimental/derf/theora-ptalarbvorm/lib/encint.h
experimental/derf/theora-ptalarbvorm/lib/encode.c
experimental/derf/theora-ptalarbvorm/lib/enquant.c
experimental/derf/theora-ptalarbvorm/lib/quant.c
experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c
experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.h
experimental/derf/theora-ptalarbvorm/lib/x86/x86enquant.c
Log:
More cleanup/refactoring of the quantization code.
The post-quantization clamps have been entirely removed; they should be
unnecessary given the range of the input and the minimum quantizers allowed.
This knocks 10-11 cycles off oc_enc_quantize_sse2.
The quantizer tables also now have the proper DC coefficient injected once per
frame, instead of for every block.
This adds a bunch of extra initialization code to deal with the per-arch
quantization matrix format, but simplifies the code that executes for every
coded block (oc_enc_quantize_sse2 alone drops by almost 100 bytes).
Together with some additional code-size reductions this knocks another 4-5
cycles off oc_enc_quantize_sse2.
The total encode time, despite saving at least 14-15 cycles/block, remains
almost entirely unchanged.
Modified: experimental/derf/theora-ptalarbvorm/lib/analyze.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/analyze.c 2010-06-26 18:21:33 UTC (rev 17306)
+++ experimental/derf/theora-ptalarbvorm/lib/analyze.c 2010-06-27 06:02:15 UTC (rev 17307)
@@ -560,10 +560,6 @@
int bounding_values[256];
oc_fr_state fr[3];
oc_qii_state qs[3];
- /*Condensed dequantization tables.*/
- const ogg_uint16_t *dequant[3][3][2];
- /*Condensed quantization tables.*/
- const oc_iquant *enquant[3][3][2];
/*Skip SSD storage for the current MCU in each plane.*/
unsigned *skip_ssd[3];
/*Coded/uncoded fragment lists for each plane for the current MCU.*/
@@ -597,7 +593,9 @@
int hdec;
int vdec;
int pli;
+ int nqis;
int qii;
+ int qi0;
int qti;
/*Initialize the per-plane coded block flag trackers.
These are used for bit-estimation purposes only; the real flag bits span
@@ -626,16 +624,25 @@
memset(_pipe->ncoded_fragis,0,sizeof(_pipe->ncoded_fragis));
memset(_pipe->nuncoded_fragis,0,sizeof(_pipe->nuncoded_fragis));
/*Set up condensed quantizer tables.*/
+ qi0=_enc->state.qis[0];
+ nqis=_enc->state.nqis;
for(pli=0;pli<3;pli++){
- for(qii=0;qii<_enc->state.nqis;qii++){
+ for(qii=0;qii<nqis;qii++){
int qi;
qi=_enc->state.qis[qii];
for(qti=0;qti<2;qti++){
- _pipe->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
- _pipe->enquant[pli][qii][qti]=_enc->enquant_tables[qi][pli][qti];
+ /*Set the DC coefficient in the dequantization table.*/
+ _enc->state.dequant_tables[qi][pli][qti][0]=
+ _enc->dequant_dc[qi0][pli][qti];
+ _enc->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
+ /*Copy over the quantization table.*/
+ memcpy(_enc->enquant[pli][qii][qti],_enc->enquant_tables[qi][pli][qti],
+ _enc->opt_data.enquant_table_size);
}
}
}
+ /*Fix up the DC coefficients in the quantization tables.*/
+ oc_enc_enquant_table_fixup(_enc,_enc->enquant,nqis);
/*Initialize the tokenization state.*/
for(pli=0;pli<3;pli++){
_pipe->ndct_tokens1[pli]=0;
@@ -737,7 +744,8 @@
OC_ALIGN16(ogg_int16_t dct[64]);
OC_ALIGN16(ogg_int16_t data[64]);
oc_qii_state qs;
- ogg_uint16_t dc_dequant;
+ const ogg_uint16_t *dequant;
+ ogg_uint16_t dequant_dc;
ptrdiff_t frag_offs;
int ystride;
const unsigned char *src;
@@ -828,18 +836,16 @@
oc_enc_fdct8x8(_enc,dct,data);
/*Quantize:*/
qti=mb_mode!=OC_MODE_INTRA;
- dc_dequant=_pipe->dequant[_pli][0][qti][0];
- nonzero=oc_enc_quantize(_enc,data,dct,
- dc_dequant,_pipe->dequant[_pli][qii][qti],
- _pipe->enquant[_pli][0][qti],_pipe->enquant[_pli][qii][qti]);
+ dequant=_enc->dequant[_pli][qii][qti];
+ nonzero=oc_enc_quantize(_enc,data,dct,dequant,_enc->enquant[_pli][qii][qti]);
dc=data[0];
/*Tokenize.*/
checkpoint=*_stack;
- ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,
- _pipe->dequant[_pli][qii][qti],dct,nonzero+1,_stack,
- OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
+ ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
+ _stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
/*Reconstruct.
TODO: nonzero may need to be adjusted after tokenization.*/
+ dequant_dc=dequant[0];
if(nonzero==0){
ogg_int16_t p;
int ci;
@@ -847,7 +853,7 @@
int qi12;
/*We round this dequant product (and not any of the others) because there's
no iDCT rounding.*/
- p=(ogg_int16_t)(dc*(ogg_int32_t)dc_dequant+15>>5);
+ p=(ogg_int16_t)(dc*(ogg_int32_t)dequant_dc+15>>5);
/*LOOP VECTORIZES.*/
for(ci=0;ci<64;ci++)data[ci]=p;
/*We didn't code any AC coefficients, so don't change the quantizer.*/
@@ -857,7 +863,7 @@
else if(qi01>=0)qii=0;
}
else{
- data[0]=dc*dc_dequant;
+ data[0]=dc*dequant_dc;
oc_idct8x8(&_enc->state,data,nonzero+1);
}
oc_qii_state_advance(&qs,_pipe->qs+_pli,qii);
Modified: experimental/derf/theora-ptalarbvorm/lib/encint.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/encint.h 2010-06-26 18:21:33 UTC (rev 17306)
+++ experimental/derf/theora-ptalarbvorm/lib/encint.h 2010-06-27 06:02:15 UTC (rev 17307)
@@ -146,9 +146,9 @@
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
void (*enquant_table_init)(void *_enquant,
const ogg_uint16_t _dequant[64]);
+ void (*enquant_table_fixup)(void *_enquant[3][3][2],int _nqis);
int (*quantize)(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
- ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
- const void *_dc_enquant,const void *_ac_enquant);
+ const ogg_uint16_t _dequant[64],const void *_enquant);
void (*frag_recon_intra)(unsigned char *_dst,int _ystride,
const ogg_int16_t _residue[64]);
void (*frag_recon_inter)(unsigned char *_dst,
@@ -450,7 +450,15 @@
th_huff_code huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
/*The quantization parameters in use.*/
th_quant_info qinfo;
- oc_iquant *enquant_tables[64][3][2];
+ /*The original DC coefficients saved off from the dequantization tables.*/
+ ogg_uint16_t dequant_dc[64][3][2];
+ /*Condensed dequantization tables.*/
+ const ogg_uint16_t *dequant[3][3][2];
+ /*Condensed quantization tables.*/
+ void *enquant[3][3][2];
+ /*The full set of quantization tables.*/
+ void *enquant_tables[64][3][2];
+ /*Storage for the quantization tables.*/
unsigned char *enquant_table_data;
/*An "average" quantizer for each frame type (INTRA or INTER) and qi value.
This is used to paramterize the rate control decisions.
@@ -560,12 +568,13 @@
ogg_int64_t _mask);
void oc_enc_frag_copy2(const oc_enc_ctx *_enc,unsigned char *_dst,
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
-void oc_enc_enquant_table_init(const oc_enc_ctx *_enc,void *_enquant,
- const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_init(const oc_enc_ctx *_enc,
+ void *_enquant,const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_fixup(const oc_enc_ctx *_enc,
+ void *_enquant[3][3][2],int _nqis);
int oc_enc_quantize(const oc_enc_ctx *_enc,
ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
- ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
- const void *_dc_enquant,const void *_ac_enquant);
+ const ogg_uint16_t _dequant[64],const void *_enquant);
void oc_enc_frag_recon_intra(const oc_enc_ctx *_enc,
unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]);
void oc_enc_frag_recon_inter(const oc_enc_ctx *_enc,unsigned char *_dst,
@@ -601,9 +610,9 @@
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
void oc_enc_enquant_table_init_c(void *_enquant,
const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_fixup_c(void *_enquant[3][3][2],int _nqis);
int oc_enc_quantize_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
- ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
- const void *_dc_enquant,const void *_ac_enquant);
+ const ogg_uint16_t _dequant[64],const void *_enquant);
void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/encode.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/encode.c 2010-06-26 18:21:33 UTC (rev 17306)
+++ experimental/derf/theora-ptalarbvorm/lib/encode.c 2010-06-27 06:02:15 UTC (rev 17307)
@@ -950,6 +950,7 @@
_enc->opt_data.enquant_table_size=64*sizeof(oc_iquant);
_enc->opt_data.enquant_table_alignment=16;
_enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_c;
+ _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_c;
_enc->opt_vtable.quantize=oc_enc_quantize_c;
_enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
_enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
@@ -1054,61 +1055,41 @@
return 0;
}
-void oc_enc_enquant_table_init(const oc_enc_ctx *_enc,void *_enquant,
- const ogg_uint16_t _dequant[64]){
- (*_enc->opt_vtable.enquant_table_init)(_enquant,_dequant);
-}
-
static void oc_enc_enquant_tables_init(oc_enc_ctx *_enc,
const th_quant_info *_qinfo){
unsigned char *etd;
size_t ets;
int align;
+ int qii;
int qi;
int pli;
int qti;
+ for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+ _enc->state.dequant_tables[qi][pli][qti]=
+ _enc->state.dequant_table_data[qi][pli][qti];
+ }
+ /*Initialize the dequantization tables.*/
+ oc_dequant_tables_init(_enc->state.dequant_tables,NULL,_qinfo);
+ /*And save off the DC values.*/
+ for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+ _enc->dequant_dc[qi][pli][qti]=_enc->state.dequant_tables[qi][pli][qti][0];
+ }
+ /*Set up storage for the quantization tables.*/
etd=_enc->enquant_table_data;
ets=_enc->opt_data.enquant_table_size;
align=-(etd-(unsigned char *)0)&_enc->opt_data.enquant_table_alignment-1;
etd+=align;
+ /*Set up the main tables.*/
for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
- _enc->state.dequant_tables[qi][pli][qti]=
- _enc->state.dequant_table_data[qi][pli][qti];
- _enc->enquant_tables[qi][pli][qti]=etd+((qi*3+pli)*2+qti)*ets;
+ _enc->enquant_tables[qi][pli][qti]=etd;
+ oc_enc_enquant_table_init(_enc,etd,
+ _enc->state.dequant_tables[qi][pli][qti]);
+ etd+=ets;
}
- /*Initialize the dequantization tables first.*/
- oc_dequant_tables_init(_enc->state.dequant_tables,NULL,_qinfo);
- /*Derive the quantization tables directly from the dequantization tables.*/
- for(qi=0;qi<64;qi++)for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
- int plj;
- int qtj;
- int dupe;
- dupe=0;
- for(qtj=0;qtj<=qti;qtj++){
- for(plj=0;plj<(qtj<qti?3:pli);plj++){
- if(_enc->state.dequant_tables[qi][pli][qti]==
- _enc->state.dequant_tables[qi][plj][qtj]){
- dupe=1;
- break;
- }
- }
- if(dupe)break;
- }
- if(dupe){
- _enc->enquant_tables[qi][pli][qti]=_enc->enquant_tables[qi][plj][qtj];
- }
- /*In the original VP3.2 code, the rounding offset and the size of the
- dead zone around 0 were controlled by a "sharpness" parameter.
- We now R-D optimize the tokens for each block after quantization,
- so the rounding offset should always be 1/2, and an explicit dead
- zone is unnecessary.
- Hence, all of that VP3.2 code is gone from here, and the remaining
- floating point code has been implemented as equivalent integer
- code with exact precision.*/
- else{
- oc_enc_enquant_table_init(_enc,_enc->enquant_tables[qi][pli][qti],
- _enc->state.dequant_tables[qi][pli][qti]);
- }
+ /*Set up storage for the local copies we modify for each frame.*/
+ for(pli=0;pli<3;pli++)for(qii=0;qii<3;qii++)for(qti=0;qti<2;qti++){
+ _enc->enquant[pli][qii][qti]=etd;
+ etd+=ets;
}
}
@@ -1190,7 +1171,7 @@
oc_enc_vtable_init_c(_enc);
#endif
_enc->enquant_table_data=(unsigned char *)_ogg_malloc(
- 64*3*2*_enc->opt_data.enquant_table_size
+ (64+3)*3*2*_enc->opt_data.enquant_table_size
+_enc->opt_data.enquant_table_alignment-1);
_enc->keyframe_frequency_force=1<<_enc->state.info.keyframe_granule_shift;
_enc->state.qis[0]=_enc->state.info.quality;
Modified: experimental/derf/theora-ptalarbvorm/lib/enquant.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/enquant.c 2010-06-26 18:21:33 UTC (rev 17306)
+++ experimental/derf/theora-ptalarbvorm/lib/enquant.c 2010-06-27 06:02:15 UTC (rev 17307)
@@ -129,49 +129,62 @@
_this->l=l;
}
+void oc_enc_enquant_table_init(const oc_enc_ctx *_enc,void *_enquant,
+ const ogg_uint16_t _dequant[64]){
+ (*_enc->opt_vtable.enquant_table_init)(_enquant,_dequant);
+}
+
void oc_enc_enquant_table_init_c(void *_enquant,
const ogg_uint16_t _dequant[64]){
oc_iquant *enquant;
int zzi;
+ /*In the original VP3.2 code, the rounding offset and the size of the
+ dead zone around 0 were controlled by a "sharpness" parameter.
+ We now R-D optimize the tokens for each block after quantization,
+ so the rounding offset should always be 1/2, and an explicit dead
+ zone is unnecessary.
+ Hence, all of that VP3.2 code is gone from here, and the remaining
+ floating point code has been implemented as equivalent integer
+ code with exact precision.*/
enquant=(oc_iquant *)_enquant;
for(zzi=0;zzi<64;zzi++)oc_iquant_init(enquant+zzi,_dequant[zzi]);
}
+void oc_enc_enquant_table_fixup(const oc_enc_ctx *_enc,
+ void *_enquant[3][3][2],int _nqis){
+ (*_enc->opt_vtable.enquant_table_fixup)(_enquant,_nqis);
+}
+
+void oc_enc_enquant_table_fixup_c(void *_enquant[3][3][2],int _nqis){
+ int pli;
+ int qii;
+ int qti;
+ for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){
+ *((oc_iquant *)_enquant[pli][qii][qti])=
+ *((oc_iquant *)_enquant[pli][0][qti]);
+ }
+}
+
int oc_enc_quantize(const oc_enc_ctx *_enc,
ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
- ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
- const void *_dc_enquant,const void *_ac_enquant){
- return (*_enc->opt_vtable.quantize)(_qdct,_dct,
- _dc_dequant,_ac_dequant,_dc_enquant,_ac_enquant);
+ const ogg_uint16_t _dequant[64],const void *_enquant){
+ return (*_enc->opt_vtable.quantize)(_qdct,_dct,_dequant,_enquant);
}
int oc_enc_quantize_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
- ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
- const void *_dc_enquant,const void *_ac_enquant){
+ const ogg_uint16_t _dequant[64],const void *_enquant){
const oc_iquant *enquant;
int nonzero;
int zzi;
- int v;
int val;
int d;
int s;
- /*Quantize the DC coefficient:*/
- enquant=(const oc_iquant *)_dc_enquant;
- v=_dct[0];
- val=v<<1;
- s=OC_SIGNMASK(val);
- val+=_dc_dequant+s^s;
- val=((enquant[0].m*(ogg_int32_t)val>>16)+val>>enquant[0].l)-s;
- _qdct[0]=(ogg_int16_t)OC_CLAMPI(-580,val,580);
- nonzero=0;
- /*Quantize the AC coefficients:*/
- enquant=(const oc_iquant *)_ac_enquant;
- for(zzi=1;zzi<64;zzi++){
- v=_dct[OC_FZIG_ZAG[zzi]];
- d=_ac_dequant[zzi];
- val=v<<1;
- v=abs(val);
- if(v>=d){
+ enquant=(const oc_iquant *)_enquant;
+ for(zzi=0;zzi<64;zzi++){
+ val=_dct[OC_FZIG_ZAG[zzi]];
+ d=_dequant[zzi];
+ val=val<<1;
+ if(abs(val)>=d){
s=OC_SIGNMASK(val);
/*The bias added here rounds ties away from zero, since token
optimization can only decrease the magnitude of the quantized
@@ -180,7 +193,7 @@
/*Note the arithmetic right shift is not guaranteed by ANSI C.
Hopefully no one still uses ones-complement architectures.*/
val=((enquant[zzi].m*(ogg_int32_t)val>>16)+val>>enquant[zzi].l)-s;
- _qdct[zzi]=(ogg_int16_t)OC_CLAMPI(-580,val,580);
+ _qdct[zzi]=(ogg_int16_t)val;
nonzero=zzi;
}
else _qdct[zzi]=0;
Modified: experimental/derf/theora-ptalarbvorm/lib/quant.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/quant.c 2010-06-26 18:21:33 UTC (rev 17306)
+++ experimental/derf/theora-ptalarbvorm/lib/quant.c 2010-06-27 06:02:15 UTC (rev 17307)
@@ -21,6 +21,14 @@
#include "quant.h"
#include "decint.h"
+/*The maximum output of the DCT with +/- 255 inputs is +/- 8157.
+ These minimum quantizers ensure the result after quantization (and after
+ prediction for DC) will be no more than +/- 510.
+ The tokenization system can handle values up to +/- 580, so there is no need
+ to do any coefficient clamping.
+ I would rather have allowed smaller quantizers and had to clamp, but these
+ minimums were required when constructing the original VP3 matrices and have
+ been formalized in the spec.*/
static const unsigned OC_DC_QUANT_MIN[2]={4<<2,8<<2};
static const unsigned OC_AC_QUANT_MIN[2]={2<<2,4<<2};
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c 2010-06-26 18:21:33 UTC (rev 17306)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c 2010-06-27 06:02:15 UTC (rev 17307)
@@ -52,6 +52,7 @@
_enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
_enc->opt_data.enquant_table_alignment=16;
_enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_x86;
+ _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_x86;
_enc->opt_vtable.quantize=oc_enc_quantize_sse2;
}
}
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.h 2010-06-26 18:21:33 UTC (rev 17306)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.h 2010-06-27 06:02:15 UTC (rev 17307)
@@ -51,9 +51,9 @@
const unsigned char *_src,int _ystride);
void oc_enc_enquant_table_init_x86(void *_enquant,
const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis);
int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
- ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
- const void *_dc_enquant,const void *_ac_enquant);
+ const ogg_uint16_t _dequant[64],const void *_enquant);
void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
const unsigned char *_src1,const unsigned char *_src2,int _src_ystride);
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86enquant.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86enquant.c 2010-06-26 18:21:33 UTC (rev 17306)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86enquant.c 2010-06-27 06:02:15 UTC (rev 17307)
@@ -21,13 +21,6 @@
-/*The maximum quantized coefficient value.*/
-static const ogg_uint16_t __attribute__((aligned(16))) OC_COEFF_MAX_SSE2[8]={
- 580,580,580,580,580,580,580,580
-};
-
-
-
/*The default enquant table is not quite suitable for SIMD purposes.
First, the m and l parameters need to be separated so that an entire row full
of m's or l's can be loaded at a time.
@@ -46,16 +39,28 @@
oc_iquant_init(&q,_dequant[zzi]);
m[zzi]=q.m;
/*q.l must be at least 2 for this to work; fortunately, once all the scale
- factors are baked in, the minimum quantizer is much larger.*/
+ factors are baked in, the minimum quantizer is much larger than that.*/
l[zzi]=1<<16-q.l;
}
}
+void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis){
+ int pli;
+ int qii;
+ int qti;
+ for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){
+ ((ogg_int16_t *)_enquant[pli][qii][qti])[0]=
+ ((ogg_int16_t *)_enquant[pli][0][qti])[0];
+ ((ogg_int16_t *)_enquant[pli][qii][qti])[64]=
+ ((ogg_int16_t *)_enquant[pli][0][qti])[64];
+ }
+}
+
/*Convert DCT coefficients in %[dct] from natural order into zig-zag scan order
and store them in %[qdct].
- The index of each output element in the original 64-element array should be
- the following 8x8 array (the letters indicate the order we compute each
- 4-tuple below):
+ The index of each output element in the original 64-element array should wind
+ up in the following 8x8 matrix (the letters indicate the order we compute
+ each 4-tuple below):
A 0 1 8 16 9 2 3 10 B
C 17 24 32 25 18 11 4 5 D
E 12 19 26 33 40 48 41 34 I
@@ -65,7 +70,7 @@
P 58 59 52 45 38 31 39 46 L
N 53 60 61 54 47 55 62 63 O
The order of the coefficients within each tuple is reversed in the comments
- below to reflect the usual MSB to LSB ordering.*/
+ below to reflect the usual MSB to LSB notation.*/
#define OC_ZIG_ZAG_MMXEXT \
"movq 0x00(%[dct]),%%mm0\n\t" /*mm0=03 02 01 00*/ \
"movq 0x08(%[dct]),%%mm1\n\t" /*mm1=07 06 05 04*/ \
@@ -158,33 +163,22 @@
"movq %%mm7,0x60(%[qdct])\n\t" \
int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
- ogg_uint16_t _dc_dequant,const ogg_uint16_t _ac_dequant[64],
- const void *_dc_enquant,const void *_ac_enquant){
+ const ogg_uint16_t _dequant[64],const void *_enquant){
ptrdiff_t r;
- /*Load the first rows of the quantizer data and inject the DC terms.
- We do this early to reduce general-purpose register pressure and because
- pinsrw has a very long latency.*/
__asm__ __volatile__(
- "movdqa 0x00(%[dq]),%%xmm2\n\t"
- "movdqa 0x00(%[q]),%%xmm4\n\t"
- "movdqa 0x80(%[q]),%%xmm5\n\t"
- "pinsrw $0,%k[dc_dq],%%xmm2\n\t"
- "pinsrw $0,0x00(%[dc_q]),%%xmm4\n\t"
- "pinsrw $0,0x80(%[dc_q]),%%xmm5\n\t"
- :[dq]"+r"(_ac_dequant),[dc_q]"+r"(_dc_enquant),[q]"+r"(_ac_enquant)
- :[dc_dq]"r"(_dc_dequant)
- );
- __asm__ __volatile__(
/*Put the input in zig-zag order.*/
OC_ZIG_ZAG_MMXEXT
- /*Loading the first two rows of data and the second dequant row.*/
- "movdqa 0x00(%[qdct]),%%xmm0\n\t"
- "movdqa 0x10(%[qdct]),%%xmm1\n\t"
- "movdqa 0x10(%[dq]),%%xmm3\n\t"
- "mov $-0x60,%[r]\n\t"
+ "xor %[r],%[r]\n\t"
/*Loop through two rows at a time.*/
".p2align 4\n\t"
"0:\n\t"
+ /*Load the next two rows of the data and the quant matrices.*/
+ "movdqa 0x00(%[qdct],%[r]),%%xmm0\n\t"
+ "movdqa 0x10(%[qdct],%[r]),%%xmm1\n\t"
+ "movdqa 0x00(%[dq],%[r]),%%xmm2\n\t"
+ "movdqa 0x10(%[dq],%[r]),%%xmm3\n\t"
+ "movdqa 0x00(%[q],%[r]),%%xmm4\n\t"
+ "movdqa 0x10(%[q],%[r]),%%xmm5\n\t"
/*Double the input and propagate its sign to the rounding factor.
Using SSSE3's psignw would help here, but we need the mask later anyway.*/
"movdqa %%xmm0,%%xmm6\n\t"
@@ -199,64 +193,42 @@
"pxor %%xmm1,%%xmm3\n\t"
/*Add the rounding factor and perform the first multiply.*/
"paddw %%xmm2,%%xmm6\n\t"
- "movdqa 0x70(%[q],%[r]),%%xmm2\n\t"
"paddw %%xmm3,%%xmm7\n\t"
- "movdqa 0xF0(%[q],%[r]),%%xmm3\n\t"
"pmulhw %%xmm6,%%xmm4\n\t"
- "pmulhw %%xmm7,%%xmm2\n\t"
+ "pmulhw %%xmm7,%%xmm5\n\t"
+ "movdqa 0x80(%[q],%[r]),%%xmm2\n\t"
+ "movdqa 0x90(%[q],%[r]),%%xmm3\n\t"
"paddw %%xmm4,%%xmm6\n\t"
- "paddw %%xmm2,%%xmm7\n\t"
+ "paddw %%xmm5,%%xmm7\n\t"
/*Emulate an element-wise right-shift via a second multiply.*/
- "pmulhw %%xmm5,%%xmm6\n\t"
+ "pmulhw %%xmm2,%%xmm6\n\t"
"pmulhw %%xmm3,%%xmm7\n\t"
- /*Load the bounds for the clamp operation.
- It would be nice to keep these around across iterations, but there aren't
- enough registers, and it's not like we're doing anything else while
- waiting for the multiplies to finish.*/
- "movdqa %[c],%%xmm2\n\t"
- "pxor %%xmm3,%%xmm3\n\t"
"add $32,%[r]\n\t"
- "psubw %%xmm2,%%xmm3\n\t"
+ "cmp $96,%[r]\n\t"
/*Correct for the sign.*/
"psubw %%xmm0,%%xmm6\n\t"
"psubw %%xmm1,%%xmm7\n\t"
- /*Clamp into the valid range.*/
- "pminsw %%xmm2,%%xmm6\n\t"
- "pminsw %%xmm2,%%xmm7\n\t"
- "pmaxsw %%xmm3,%%xmm6\n\t"
- "pmaxsw %%xmm3,%%xmm7\n\t"
/*Save the result.*/
- "movdqa %%xmm6,0x40(%[qdct],%[r])\n\t"
- "movdqa %%xmm7,0x50(%[qdct],%[r])\n\t"
- "jg 1f\n\t"
- /*Start loading the data for the next iteration.*/
- "movdqa 0x60(%[qdct],%[r]),%%xmm0\n\t"
- "movdqa 0x70(%[qdct],%[r]),%%xmm1\n\t"
- "movdqa 0x60(%[dq],%[r]),%%xmm2\n\t"
- "movdqa 0x70(%[dq],%[r]),%%xmm3\n\t"
- "movdqa 0x60(%[q],%[r]),%%xmm4\n\t"
- "movdqa 0xE0(%[q],%[r]),%%xmm5\n\t"
- "jmp 0b\n\t"
- ".p2align 4\n\t"
- "1:\n\t"
+ "movdqa %%xmm6,-0x20(%[qdct],%[r])\n\t"
+ "movdqa %%xmm7,-0x10(%[qdct],%[r])\n\t"
+ "jle 0b\n\t"
/*Now find the location of the last non-zero value.*/
"movdqa 0x50(%[qdct]),%%xmm5\n\t"
"movdqa 0x40(%[qdct]),%%xmm4\n\t"
"packsswb %%xmm7,%%xmm6\n\t"
- "pxor %%xmm0,%%xmm0\n\t"
- "mov $0xFFFFFFFF,%[dq]\n\t"
"packsswb %%xmm5,%%xmm4\n\t"
+ "pxor %%xmm0,%%xmm0\n\t"
+ "mov $-1,%k[dq]\n\t"
"pcmpeqb %%xmm0,%%xmm6\n\t"
"pcmpeqb %%xmm0,%%xmm4\n\t"
- "pmovmskb %%xmm6,%[q]\n\t"
- "pmovmskb %%xmm4,%[r]\n\t"
- "shl $16,%[q]\n\t"
- "or %[r],%[q]\n\t"
+ "pmovmskb %%xmm6,%k[q]\n\t"
+ "pmovmskb %%xmm4,%k[r]\n\t"
+ "shl $16,%k[q]\n\t"
+ "or %k[r],%k[q]\n\t"
"mov $32,%[r]\n\t"
- /*We have to use xor here instead of not in order to set the flags.
- This also makes it easy to flip just the lower 32 bits on x86-64.*/
- "xor %[dq],%[q]\n\t"
- "jnz 2f\n\t"
+ /*We have to use xor here instead of not in order to set the flags.*/
+ "xor %k[dq],%k[q]\n\t"
+ "jnz 1f\n\t"
"movdqa 0x30(%[qdct]),%%xmm7\n\t"
"movdqa 0x20(%[qdct]),%%xmm6\n\t"
"movdqa 0x10(%[qdct]),%%xmm5\n\t"
@@ -265,19 +237,18 @@
"packsswb %%xmm5,%%xmm4\n\t"
"pcmpeqb %%xmm0,%%xmm6\n\t"
"pcmpeqb %%xmm0,%%xmm4\n\t"
- "pmovmskb %%xmm6,%[q]\n\t"
- "pmovmskb %%xmm4,%[r]\n\t"
- "shl $16,%[q]\n\t"
- "or %[r],%[q]\n\t"
+ "pmovmskb %%xmm6,%k[q]\n\t"
+ "pmovmskb %%xmm4,%k[r]\n\t"
+ "shl $16,%k[q]\n\t"
+ "or %k[r],%k[q]\n\t"
"xor %[r],%[r]\n\t"
- "xor %[dq],%[q]\n\t"
- "or $1,%[q]\n\t"
- "2:\n\t"
- "bsr %[q],%[q]\n\t"
- "add %[q],%[r]\n\t"
- :[r]"=&a"(r),[q]"+r"(_ac_enquant)
- :[dct]"r"(_dct),[qdct]"r"(_qdct),[dq]"r"(_ac_dequant),
- [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_uint16_t,OC_COEFF_MAX_SSE2,8))
+ "not %k[q]\n\t"
+ "or $1,%k[q]\n\t"
+ "1:\n\t"
+ "bsr %k[q],%k[q]\n\t"
+ "add %k[q],%k[r]\n\t"
+ :[r]"=&a"(r),[q]"+r"(_enquant),[dq]"+r"(_dequant)
+ :[dct]"r"(_dct),[qdct]"r"(_qdct)
:"cc","memory"
);
return (int)r;
More information about the commits
mailing list