[xiph-commits] r16130 - in branches/theora-thusnelda/lib: dec dec/x86 enc
tterribe at svn.xiph.org
tterribe at svn.xiph.org
Sun Jun 14 11:50:23 PDT 2009
Author: tterribe
Date: 2009-06-14 11:50:22 -0700 (Sun, 14 Jun 2009)
New Revision: 16130
Modified:
branches/theora-thusnelda/lib/dec/decode.c
branches/theora-thusnelda/lib/dec/state.c
branches/theora-thusnelda/lib/dec/x86/mmxstate.c
branches/theora-thusnelda/lib/enc/analyze.c
branches/theora-thusnelda/lib/enc/encint.h
branches/theora-thusnelda/lib/enc/encode.c
branches/theora-thusnelda/lib/enc/mcenc.c
branches/theora-thusnelda/lib/enc/tokenize.c
Log:
Pipeline encode so that MB mode decision, transform, quantization,
tokenization, reconstruction, loop filtering, and boundary extension are all
performed on a couple of super block rows before moving on to subsequent rows.
This means we only have to load the frame data into cache once, and gives a
3.1% speed improvement on x86-32, and a 1% improvement on x86-64 (measured for
a single 1080p file at a single rate).
Modified: branches/theora-thusnelda/lib/dec/decode.c
===================================================================
--- branches/theora-thusnelda/lib/dec/decode.c 2009-06-14 18:45:42 UTC (rev 16129)
+++ branches/theora-thusnelda/lib/dec/decode.c 2009-06-14 18:50:22 UTC (rev 16130)
@@ -1308,7 +1308,7 @@
const ptrdiff_t *uncoded_fragis[3];
ptrdiff_t ncoded_fragis[3];
ptrdiff_t nuncoded_fragis[3];
- const ogg_uint16_t *qtables[3][3][2];
+ const ogg_uint16_t *dequant[3][3][2];
int fragy0[3];
int fragy_end[3];
int pred_last[3][3];
@@ -1355,7 +1355,7 @@
for(pli=0;pli<3;pli++){
for(qii=0;qii<_dec->state.nqis;qii++){
for(qti=0;qti<2;qti++){
- _pipe->qtables[pli][qii][qti]=
+ _pipe->dequant[pli][qii][qti]=
_dec->state.dequant_tables[_dec->state.qis[qii]][pli][qti];
}
}
@@ -1445,7 +1445,7 @@
ti=_pipe->ti[_pli];
ebi=_pipe->ebi[_pli];
eob_runs=_pipe->eob_runs[_pli];
- for(qti=0;qti<2;qti++)dc_quant[qti]=_pipe->qtables[_pli][0][qti][0];
+ for(qti=0;qti<2;qti++)dc_quant[qti]=_pipe->dequant[_pli][0][qti][0];
for(fragii=0;fragii<ncoded_fragis;fragii++){
/*This array is made twice as large as necessary so that an invalid zero
run cannot cause a buffer overflow.*/
@@ -1483,7 +1483,7 @@
/*last_zzi is always initialized.
If your compiler thinks otherwise, it is dumb.*/
oc_state_frag_recon(&_dec->state,fragi,_pli,dct_coeffs,last_zzi,zzi,
- dc_quant[qti],_pipe->qtables[_pli][frags[fragi].qii][qti]);
+ dc_quant[qti],_pipe->dequant[_pli][frags[fragi].qii][qti]);
}
_pipe->coded_fragis[_pli]+=ncoded_fragis;
/*Right now the reconstructed MCU has only the coded blocks in it.*/
@@ -2028,7 +2028,7 @@
oc_ycbcr_buffer_flip(stripe_buf,_dec->pp_frame_buf);
notstart=0;
notdone=1;
- for(stripe_fragy=notstart=0;notdone;stripe_fragy+=pipe.mcu_nvfrags){
+ for(stripe_fragy=0;notdone;stripe_fragy+=pipe.mcu_nvfrags){
int avail_fragy0;
int avail_fragy_end;
avail_fragy0=avail_fragy_end=_dec->state.fplanes[0].nvfrags;
Modified: branches/theora-thusnelda/lib/dec/state.c
===================================================================
--- branches/theora-thusnelda/lib/dec/state.c 2009-06-14 18:45:42 UTC (rev 16129)
+++ branches/theora-thusnelda/lib/dec/state.c 2009-06-14 18:50:22 UTC (rev 16130)
@@ -1004,11 +1004,6 @@
_bv+=127;
fplane=_state->fplanes+_pli;
nhfrags=fplane->nhfrags;
- /*The following loops are constructed somewhat non-intuitively on purpose.
- The main idea is: if a block boundary has at least one coded fragment on
- it, the filter is applied to it.
- However, the order that the filters are applied in matters, and VP3 chose
- the somewhat strange ordering used below.*/
fragi_top=fplane->froffset;
fragi_bot=fragi_top+fplane->nfrags;
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
@@ -1017,6 +1012,11 @@
frags=_state->frags;
frag_buf_offs=_state->frag_buf_offs;
ref_frame_data=_state->ref_frame_data[_refi];
+ /*The following loops are constructed somewhat non-intuitively on purpose.
+ The main idea is: if a block boundary has at least one coded fragment on
+ it, the filter is applied to it.
+ However, the order that the filters are applied in matters, and VP3 chose
+ the somewhat strange ordering used below.*/
while(fragi0<fragi0_end){
ptrdiff_t fragi;
ptrdiff_t fragi_end;
Modified: branches/theora-thusnelda/lib/dec/x86/mmxstate.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/mmxstate.c 2009-06-14 18:45:42 UTC (rev 16129)
+++ branches/theora-thusnelda/lib/dec/x86/mmxstate.c 2009-06-14 18:50:22 UTC (rev 16130)
@@ -97,7 +97,6 @@
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
unsigned char OC_ALIGN8 ll[8];
- const th_img_plane *iplane;
const oc_fragment_plane *fplane;
const oc_fragment *frags;
const ptrdiff_t *frag_buf_offs;
@@ -109,22 +108,21 @@
int ystride;
int nhfrags;
memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
- iplane=_state->ref_frame_bufs[_refi]+_pli;
fplane=_state->fplanes+_pli;
nhfrags=fplane->nhfrags;
- /*The following loops are constructed somewhat non-intuitively on purpose.
- The main idea is: if a block boundary has at least one coded fragment on
- it, the filter is applied to it.
- However, the order that the filters are applied in matters, and VP3 chose
- the somewhat strange ordering used below.*/
fragi_top=fplane->froffset;
fragi_bot=fragi_top+fplane->nfrags;
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
- ystride=iplane->stride;
+ ystride=_state->ref_ystride[_pli];
frags=_state->frags;
frag_buf_offs=_state->frag_buf_offs;
ref_frame_data=_state->ref_frame_data[_refi];
+ /*The following loops are constructed somewhat non-intuitively on purpose.
+ The main idea is: if a block boundary has at least one coded fragment on
+ it, the filter is applied to it.
+ However, the order that the filters are applied in matters, and VP3 chose
+ the somewhat strange ordering used below.*/
while(fragi0<fragi0_end){
ptrdiff_t fragi;
ptrdiff_t fragi_end;
Modified: branches/theora-thusnelda/lib/enc/analyze.c
===================================================================
--- branches/theora-thusnelda/lib/enc/analyze.c 2009-06-14 18:45:42 UTC (rev 16129)
+++ branches/theora-thusnelda/lib/enc/analyze.c 2009-06-14 18:50:22 UTC (rev 16130)
@@ -21,63 +21,13 @@
-typedef struct oc_plane_state oc_plane_state;
-typedef struct oc_frag_state oc_frag_state;
-typedef struct oc_mode_choice oc_mode_choice;
-typedef struct oc_rd_metric oc_rd_metric;
+typedef struct oc_fr_state oc_fr_state;
+typedef struct oc_enc_pipeline_state oc_enc_pipeline_state;
+typedef struct oc_rd_metric oc_rd_metric;
+typedef struct oc_mode_choice oc_mode_choice;
-/*Temporary encoder state for a single color plane.*/
-struct oc_plane_state{
- /*Condensed dequantization tables.*/
- const ogg_uint16_t *dequant[3][2];
- /*Condensed quantization tables.*/
- const oc_iquant *enquant[3][2];
- /*Plane index.*/
- int pli;
-};
-
-
-
-/*State to track coded block flags and their bit cost.*/
-struct oc_frag_state{
- unsigned sb_partial_count:16;
- unsigned sb_full_count:16;
- unsigned b_count:8;
- unsigned b_pend:8;
- signed int sb_partial_last:2;
- signed int sb_full_last:2;
- signed int b_last:2;
- unsigned sb_partial:1;
- unsigned sb_coded:1;
- unsigned sb_partial_break:1;
- unsigned sb_full_break:1;
- ptrdiff_t bits;
-};
-
-
-
-/*Cost information about a MB mode.*/
-struct oc_mode_choice{
- unsigned cost;
- unsigned ssd;
- unsigned rate;
- unsigned overhead;
-};
-
-
-
-/*Cost information about the coded blocks in a MB.*/
-struct oc_rd_metric{
- int uncoded_ac_ssd;
- int coded_ac_ssd;
- int ac_bits;
- int dc_flag;
-};
-
-
-
/*There are 8 possible schemes used to encode macro block modes.
Schemes 0-6 use a maximally-skewed Huffman code to code each of the modes.
The same set of Huffman codes is used for each of these 7 schemes, but the
@@ -244,36 +194,41 @@
-static void oc_plane_state_plane_setup(oc_enc_ctx *_enc,oc_plane_state *_ps,
- int _pli){
- int qii;
- int qti;
- _ps->pli=_pli;
- for(qii=0;qii<_enc->state.nqis;qii++){
- int qi;
- qi=_enc->state.qis[qii];
- for(qti=0;qti<2;qti++){
- _ps->dequant[qii][qti]=_enc->state.dequant_tables[qi][_pli][qti];
- _ps->enquant[qii][qti]=_enc->enquant_tables[qi][_pli][qti];
- }
- }
-}
-
-
-
+/*The number of bits required to encode a super block run.
+ _run_count: The desired run count; must be positive and less than 4130.*/
static int oc_sb_run_bits(int _run_count){
int i;
for(i=0;_run_count>=OC_SB_RUN_VAL_MIN[i+1];i++);
return OC_SB_RUN_CODE_NBITS[i];
}
+/*The number of bits required to encode a block run.
+ _run_count: The desired run count; must be positive and less than 30.*/
static int oc_block_run_bits(int _run_count){
return OC_BLOCK_RUN_CODE_NBITS[_run_count-1];
}
-static void oc_fr_state_init(oc_frag_state *_fr){
+/*State to track coded block flags and their bit cost.*/
+struct oc_fr_state{
+ unsigned sb_partial_count:16;
+ unsigned sb_full_count:16;
+ unsigned b_count:8;
+ unsigned b_pend:8;
+ signed int sb_partial_last:2;
+ signed int sb_full_last:2;
+ signed int b_last:2;
+ unsigned sb_partial:1;
+ unsigned sb_coded:1;
+ unsigned sb_partial_break:1;
+ unsigned sb_full_break:1;
+ ptrdiff_t bits;
+};
+
+
+
+static void oc_fr_state_init(oc_fr_state *_fr){
_fr->sb_partial_last=-1;
_fr->sb_partial_count=0;
_fr->sb_partial_break=0;
@@ -289,7 +244,7 @@
}
-static void oc_fr_skip_block(oc_frag_state *_fr){
+static void oc_fr_skip_block(oc_fr_state *_fr){
if(_fr->sb_coded){
if(!_fr->sb_partial){
/*The super block was previously fully coded.*/
@@ -321,7 +276,7 @@
_fr->sb_partial=1;
}
-static void oc_fr_code_block(oc_frag_state *_fr){
+static void oc_fr_code_block(oc_fr_state *_fr){
if(_fr->sb_partial){
if(!_fr->sb_coded){
/*The super block was previously completely uncoded...*/
@@ -353,7 +308,7 @@
_fr->sb_coded=1;
}
-static void oc_fr_finish_sb(oc_frag_state *_fr){
+static void oc_fr_finish_sb(oc_fr_state *_fr){
/*Update the partial flag.*/
int partial;
partial=_fr->sb_partial&_fr->sb_coded;
@@ -399,7 +354,7 @@
_fr->sb_coded=0;
}
-static void oc_fr_flush(oc_frag_state *_fr){
+static void oc_fr_flush(oc_fr_state *_fr){
/*Flush any pending partial run.*/
if(_fr->sb_partial_break)_fr->bits++;
if(_fr->sb_partial_count)_fr->bits+=oc_sb_run_bits(_fr->sb_partial_count);
@@ -410,9 +365,9 @@
if(_fr->b_count)_fr->bits+=oc_block_run_bits(_fr->b_count);
}
-static int oc_fr_cost1(const oc_frag_state *_fr){
- oc_frag_state tmp;
- int bits;
+static int oc_fr_cost1(const oc_fr_state *_fr){
+ oc_fr_state tmp;
+ int bits;
*&tmp=*_fr;
oc_fr_skip_block(&tmp);
bits=tmp.bits;
@@ -421,8 +376,8 @@
return tmp.bits-bits;
}
-static int oc_fr_cost4(const oc_frag_state *_pre,const oc_frag_state *_post){
- oc_frag_state tmp;
+static int oc_fr_cost4(const oc_fr_state *_pre,const oc_fr_state *_post){
+ oc_fr_state tmp;
*&tmp=*_pre;
oc_fr_skip_block(&tmp);
oc_fr_skip_block(&tmp);
@@ -433,23 +388,132 @@
-static void oc_enc_frag_uncode(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi){
- const unsigned char *src;
- unsigned char *dst;
- ptrdiff_t frag_offs;
- int ystride;
- frag_offs=_enc->state.frag_buf_offs[_fragi];
- ystride=_enc->state.ref_ystride[_pli];
- src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
- dst=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_SELF]];
- oc_frag_copy(&_enc->state,dst+frag_offs,src+frag_offs,ystride);
- _enc->state.frags[_fragi].coded=0;
- /*We do NOT update frags[_fragi].mb_mode or frag_mvs[_fragi], since they are
- not subsequently referenced by uncoded fragments.*/
+/*Temporary encoder state for the analysis pipeline.*/
+struct oc_enc_pipeline_state{
+ int bounding_values[256];
+ oc_fr_state fr[3];
+ /*Condensed dequantization tables.*/
+ const ogg_uint16_t *dequant[3][3][2];
+ /*Condensed quantization tables.*/
+ const oc_iquant *enquant[3][3][2];
+ /*Coded/uncoded fragment lists for each plane for the current MCU.*/
+ ptrdiff_t *coded_fragis[3];
+ ptrdiff_t *uncoded_fragis[3];
+ ptrdiff_t ncoded_fragis[3];
+ ptrdiff_t nuncoded_fragis[3];
+ /*The starting row for the current MCU in each plane.*/
+ int fragy0[3];
+ /*The ending row for the current MCU in each plane.*/
+ int fragy_end[3];
+ /*The number of tokens for zzi=1 for each color plane.*/
+ int ndct_tokens1[3];
+ /*The outstanding eob_run count for zzi=1 for each color plane.*/
+ int eob_run1[3];
+ /*The number of vertical super blocks in an MCU.*/
+ int mcu_nvsbs;
+ /*Whether or not the loop filter is enabled.*/
+ int loop_filter;
+};
+
+
+static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){
+ ptrdiff_t *coded_fragis;
+ int pli;
+ int qii;
+ int qti;
+ /*Initialize the per-plane coded block flag trackers.
+ These are used for bit-estimation purposes only; the real flag bits span
+ all three planes, so we can't compute them in parallel.*/
+ for(pli=0;pli<3;pli++)oc_fr_state_init(_pipe->fr+pli);
+ /*Set up per-plane pointers to the coded and uncoded fragments lists.
+ Unlike the decoder, each planes' coded and uncoded fragment list is kept
+ separate during the analysis stage; we only make the coded list for all
+ three planes contiguous right before the final packet is output
+ (destroying the uncoded lists, which are no longer needed).*/
+ coded_fragis=_enc->state.coded_fragis;
+ for(pli=0;pli<3;pli++){
+ _pipe->coded_fragis[pli]=coded_fragis;
+ coded_fragis+=_enc->state.fplanes[pli].nfrags;
+ _pipe->uncoded_fragis[pli]=coded_fragis;
+ }
+ memset(_pipe->ncoded_fragis,0,sizeof(_pipe->ncoded_fragis));
+ memset(_pipe->nuncoded_fragis,0,sizeof(_pipe->nuncoded_fragis));
+ /*Set up condensed quantizer tables.*/
+ for(pli=0;pli<3;pli++){
+ for(qii=0;qii<_enc->state.nqis;qii++){
+ int qi;
+ qi=_enc->state.qis[qii];
+ for(qti=0;qti<2;qti++){
+ _pipe->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
+ _pipe->enquant[pli][qii][qti]=_enc->enquant_tables[qi][pli][qti];
+ }
+ }
+ }
+ /*Initialize the tokenization state.*/
+ for(pli=0;pli<3;pli++){
+ _pipe->ndct_tokens1[pli]=0;
+ _pipe->eob_run1[pli]=0;
+ }
+ /*If chroma is sub-sampled in the vertical direction, we have to encode two
+ super block rows of Y' for each super block row of Cb and Cr.*/
+ _pipe->mcu_nvsbs=1<<!(_enc->state.info.pixel_fmt&2);
+ /*Initialize the bounding value array for the loop filter.*/
+ _pipe->loop_filter=!oc_state_loop_filter_init(&_enc->state,
+ _pipe->bounding_values);
}
+static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,int _pli,int _sdelay,int _edelay){
+ int refi;
+ /*Copy over all the uncoded fragments from this plane and advance the uncoded
+ fragment list.*/
+ _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
+ oc_state_frag_copy_list(&_enc->state,_pipe->uncoded_fragis[_pli],
+ _pipe->nuncoded_fragis[_pli],OC_FRAME_SELF,OC_FRAME_PREV,_pli);
+ _pipe->nuncoded_fragis[_pli]=0;
+ /*Perform DC prediction.*/
+ oc_enc_pred_dc_frag_rows(_enc,_pli,
+ _pipe->fragy0[_pli],_pipe->fragy_end[_pli]);
+ /*Finish DC tokenization.*/
+ oc_enc_tokenize_dc_frag_list(_enc,_pli,
+ _pipe->coded_fragis[_pli],_pipe->ncoded_fragis[_pli],
+ _pipe->ndct_tokens1[_pli],_pipe->eob_run1[_pli]);
+ _pipe->ndct_tokens1[_pli]=_enc->ndct_tokens[_pli][1];
+ _pipe->eob_run1[_pli]=_enc->eob_run[_pli][1];
+ /*And advance the coded fragment list.*/
+ _enc->state.ncoded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
+ _pipe->coded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
+ _pipe->ncoded_fragis[_pli]=0;
+ /*Apply the loop filter if necessary.*/
+ refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
+ if(_pipe->loop_filter){
+ oc_state_loop_filter_frag_rows(&_enc->state,_pipe->bounding_values,
+ refi,_pli,_pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
+ }
+ else _sdelay=_edelay=0;
+ /*To fill borders, we have an additional two pixel delay, since a fragment
+ in the next row could filter its top edge, using two pixels from a
+ fragment in this row.
+ But there's no reason to delay a full fragment between the two.*/
+ oc_state_borders_fill_rows(&_enc->state,refi,_pli,
+ (_pipe->fragy0[_pli]-_sdelay<<3)-(_sdelay<<1),
+ (_pipe->fragy_end[_pli]-_edelay<<3)-(_edelay<<1));
+}
+
+
+
+/*Cost information about the coded blocks in a MB.*/
+struct oc_rd_metric{
+ int uncoded_ac_ssd;
+ int coded_ac_ssd;
+ int ac_bits;
+ int dc_flag;
+};
+
+
+
static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
- oc_plane_state *_ps,ptrdiff_t _fragi,int _overhead_bits,
+ oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,int _overhead_bits,
oc_rd_metric *_mo,oc_token_checkpoint **_stack){
ogg_int16_t buffer[64]OC_ALIGN16;
ogg_int16_t data[64]OC_ALIGN16;
@@ -478,7 +542,7 @@
int zzi;
frags=_enc->state.frags;
frag_offs=_enc->state.frag_buf_offs[_fragi];
- ystride=_enc->state.ref_ystride[_ps->pli];
+ ystride=_enc->state.ref_ystride[_pli];
mb_mode=frags[_fragi].mb_mode;
src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs;
ref=_enc->state.ref_frame_data[
@@ -507,7 +571,7 @@
default:{
const oc_mv *frag_mvs;
frag_mvs=(const oc_mv *)_enc->state.frag_mvs;
- nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,_ps->pli,
+ nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,_pli,
frag_mvs[_fragi][0],frag_mvs[_fragi][1]);
if(nmv_offs>1){
oc_enc_frag_copy2(_enc,dst,
@@ -577,8 +641,9 @@
/*Transform:*/
oc_enc_fdct8x8(_enc,buffer,data);
/*Quantize:*/
- dequant=_ps->dequant[0][mb_mode!=OC_MODE_INTRA];
- enquant=_ps->enquant[0][mb_mode!=OC_MODE_INTRA];
+ /*TODO: Block-level quantizers.*/
+ dequant=_pipe->dequant[_pli][0][mb_mode!=OC_MODE_INTRA];
+ enquant=_pipe->enquant[_pli][0][mb_mode!=OC_MODE_INTRA];
nonzero=0;
for(zzi=0;zzi<64;zzi++){
int v;
@@ -603,12 +668,10 @@
}
else data[zzi]=0;
}
- frags[_fragi].dc=data[0];
- frags[_fragi].coded=1;
/*Tokenize.*/
checkpoint=*_stack;
ac_bits=oc_enc_tokenize_ac(_enc,_fragi,data,dequant,buffer,
- _ps->pli,_stack,mb_mode==OC_MODE_INTRA?3:0);
+ _pli,_stack,mb_mode==OC_MODE_INTRA?3:0);
/*Reconstruct.
TODO: nonzero may need to be adjusted after tokenization.*/
oc_dequant_idct8x8(&_enc->state,buffer,data,
@@ -617,7 +680,7 @@
else{
oc_enc_frag_recon_inter(_enc,dst,
nmv_offs==1?ref+mv_offs[0]:dst,ystride,buffer);
- }
+ }
#if !defined(OC_COLLECT_METRICS)
if(frame_type!=OC_INTRA_FRAME)
#endif
@@ -663,8 +726,8 @@
/*Hm, not worth it; roll back.*/
oc_enc_tokenlog_rollback(_enc,checkpoint,(*_stack)-checkpoint);
*_stack=checkpoint;
- oc_enc_frag_uncode(_enc,_ps->pli,_fragi);
_mo->coded_ac_ssd+=uncoded_ssd;
+ frags[_fragi].coded=0;
return 0;
}
else{
@@ -673,12 +736,14 @@
_mo->ac_bits+=ac_bits;
}
}
+ frags[_fragi].dc=data[0];
+ frags[_fragi].coded=1;
return 1;
}
/* mode_overhead is scaled by << OC_BIT_SCALE */
static int oc_enc_mb_transform_quantize_luma(oc_enc_ctx *_enc,
- oc_plane_state *_ps,int _mbi,int _mode_overhead,oc_frag_state *_fr){
+ oc_enc_pipeline_state *_pipe,int _mbi,int _mode_overhead){
/*Worst case token stack usage for 4 fragments.*/
oc_token_checkpoint stack[64*4];
oc_token_checkpoint *stackptr;
@@ -687,18 +752,22 @@
oc_fragment *frags;
ptrdiff_t *coded_fragis;
ptrdiff_t ncoded_fragis;
+ ptrdiff_t *uncoded_fragis;
+ ptrdiff_t nuncoded_fragis;
oc_rd_metric mo;
- oc_frag_state fr_checkpoint;
+ oc_fr_state fr_checkpoint;
int mb_mode;
int ncoded;
ptrdiff_t fragi;
int bi;
- *&fr_checkpoint=*_fr;
+ *&fr_checkpoint=*(_pipe->fr+0);
sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
mb_modes=_enc->state.mb_modes;
frags=_enc->state.frags;
- coded_fragis=_enc->state.coded_fragis;
- ncoded_fragis=_enc->state.ncoded_fragis[0];
+ coded_fragis=_pipe->coded_fragis[0];
+ ncoded_fragis=_pipe->ncoded_fragis[0];
+ uncoded_fragis=_pipe->uncoded_fragis[0];
+ nuncoded_fragis=_pipe->nuncoded_fragis[0];
mb_mode=mb_modes[_mbi];
ncoded=0;
stackptr=stack;
@@ -707,12 +776,15 @@
fragi=sb_maps[_mbi>>2][_mbi&3][bi];
frags[fragi].mb_mode=mb_mode;
if(oc_enc_block_transform_quantize(_enc,
- _ps,fragi,oc_fr_cost1(_fr),&mo,&stackptr)){
- oc_fr_code_block(_fr);
+ _pipe,0,fragi,oc_fr_cost1(_pipe->fr+0),&mo,&stackptr)){
+ oc_fr_code_block(_pipe->fr+0);
coded_fragis[ncoded_fragis++]=fragi;
ncoded++;
}
- else oc_fr_skip_block(_fr);
+ else{
+ *(uncoded_fragis-++nuncoded_fragis)=fragi;
+ oc_fr_skip_block(_pipe->fr+0);
+ }
}
if(_enc->state.frame_type!=OC_INTRA_FRAME){
if(ncoded>0&&!mo.dc_flag){
@@ -720,27 +792,27 @@
/*Some individual blocks were worth coding.
See if that's still true when accounting for mode and MV overhead.*/
cost=mo.coded_ac_ssd+_enc->lambda*(mo.ac_bits
- +oc_fr_cost4(&fr_checkpoint,_fr)+(_mode_overhead>>OC_BIT_SCALE));
+ +oc_fr_cost4(&fr_checkpoint,_pipe->fr+0)+(_mode_overhead>>OC_BIT_SCALE));
if(mo.uncoded_ac_ssd<=cost){
/*Taking macroblock overhead into account, it is not worth coding this
MB.*/
oc_enc_tokenlog_rollback(_enc,stack,stackptr-stack);
- *_fr=*&fr_checkpoint;
+ *(_pipe->fr+0)=*&fr_checkpoint;
for(bi=0;bi<4;bi++){
fragi=sb_maps[_mbi>>2][_mbi&3][bi];
- if(frags[fragi].coded)oc_enc_frag_uncode(_enc,0,fragi);
- oc_fr_skip_block(_fr);
+ if(frags[fragi].coded){
+ *(uncoded_fragis-++nuncoded_fragis)=fragi;
+ frags[fragi].coded=0;
+ }
+ oc_fr_skip_block(_pipe->fr+0);
}
ncoded_fragis-=ncoded;
ncoded=0;
}
}
- if(ncoded==0){
- /*No luma blocks coded, mode is forced.*/
- mb_modes[_mbi]=OC_MODE_INTER_NOMV;
- return 0;
- }
- /*Assume that a 1mv with a single coded block is always cheaper than a 4mv
+ /*If no luma blocks coded, the mode is forced.*/
+ if(ncoded==0)mb_modes[_mbi]=OC_MODE_INTER_NOMV;
+ /*Assume that a 1MV with a single coded block is always cheaper than a 4MV
with a single coded block.
This may not be strictly true: a 4MV computes chroma MVs using (0,0) for
skipped blocks, while a 1MV does not.*/
@@ -748,21 +820,26 @@
mb_modes[_mbi]=OC_MODE_INTER_MV;
}
}
- _enc->state.ncoded_fragis[0]=ncoded_fragis;
+ _pipe->ncoded_fragis[0]=ncoded_fragis;
+ _pipe->nuncoded_fragis[0]=nuncoded_fragis;
return ncoded;
}
static void oc_enc_sb_transform_quantize_chroma(oc_enc_ctx *_enc,
- oc_plane_state *_ps,int _sbi_start,int _sbi_end,oc_frag_state *_fr){
+ oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
const oc_sb_map *sb_maps;
oc_sb_flags *sb_flags;
ptrdiff_t *coded_fragis;
ptrdiff_t ncoded_fragis;
+ ptrdiff_t *uncoded_fragis;
+ ptrdiff_t nuncoded_fragis;
int sbi;
sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
sb_flags=_enc->state.sb_flags;
- coded_fragis=_enc->state.coded_fragis+_enc->state.fplanes[_ps->pli].froffset;
- ncoded_fragis=_enc->state.ncoded_fragis[_ps->pli];
+ coded_fragis=_pipe->coded_fragis[_pli];
+ ncoded_fragis=_pipe->ncoded_fragis[_pli];
+ uncoded_fragis=_pipe->uncoded_fragis[_pli];
+ nuncoded_fragis=_pipe->nuncoded_fragis[_pli];
for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
/*Worst case token stack usage for 1 fragment.*/
oc_token_checkpoint stack[64];
@@ -777,18 +854,22 @@
oc_token_checkpoint *stackptr;
stackptr=stack;
if(oc_enc_block_transform_quantize(_enc,
- _ps,fragi,oc_fr_cost1(_fr),&mo,&stackptr)){
+ _pipe,_pli,fragi,oc_fr_cost1(_pipe->fr+_pli),&mo,&stackptr)){
coded_fragis[ncoded_fragis++]=fragi;
- oc_fr_code_block(_fr);
+ oc_fr_code_block(_pipe->fr+_pli);
}
- else oc_fr_skip_block(_fr);
+ else{
+ *(uncoded_fragis-++nuncoded_fragis)=fragi;
+ oc_fr_skip_block(_pipe->fr+_pli);
+ }
}
}
- oc_fr_finish_sb(_fr);
- sb_flags[sbi].coded_fully=_fr->sb_full_last;
- sb_flags[sbi].coded_partially=_fr->sb_partial_last;
+ oc_fr_finish_sb(_pipe->fr+_pli);
+ sb_flags[sbi].coded_fully=_pipe->fr[_pli].sb_full_last;
+ sb_flags[sbi].coded_partially=_pipe->fr[_pli].sb_partial_last;
}
- _enc->state.ncoded_fragis[_ps->pli]=ncoded_fragis;
+ _pipe->ncoded_fragis[_pli]=ncoded_fragis;
+ _pipe->nuncoded_fragis[_pli]=nuncoded_fragis;
}
/*Mode decision is done by exhaustively examining all potential choices.
@@ -828,6 +909,16 @@
year=2003
}*/
+/*Cost information about a MB mode.*/
+struct oc_mode_choice{
+ unsigned cost;
+ unsigned ssd;
+ unsigned rate;
+ unsigned overhead;
+};
+
+
+
static void oc_mode_dct_cost_accum(oc_mode_choice *_modec,
int _qi,int _pli,int _qti,int _satd){
unsigned rmse;
@@ -852,7 +943,7 @@
}
static void oc_mode_set_cost(oc_mode_choice *_modec,int _lambda){
- _modec->cost=_modec->ssd+(_modec->rate+_modec->overhead)*_lambda;
+ _modec->cost=_modec->ssd+(_modec->rate+_modec->overhead)*_lambda;
}
static void oc_cost_intra(oc_enc_ctx *_enc,oc_mode_choice *_modec,
@@ -1078,8 +1169,7 @@
int oc_enc_analyze(oc_enc_ctx *_enc,int _frame_type,int _recode){
oc_set_chroma_mvs_func set_chroma_mvs;
oc_mcenc_ctx mcenc;
- oc_plane_state ps;
- oc_frag_state fr;
+ oc_enc_pipeline_state pipe;
oc_mv last_mv;
oc_mv prior_mv;
ogg_int64_t interbits;
@@ -1097,16 +1187,20 @@
oc_fragment *frags;
oc_mv *frag_mvs;
int qi;
+ unsigned stripe_sby;
+ int notstart;
+ int notdone;
+ int vdec;
unsigned sbi;
unsigned sbi_end;
+ int refi;
int pli;
set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt];
_enc->state.frame_type=_frame_type;
- if(!_recode)oc_mcenc_start(_enc,&mcenc);
- oc_fr_state_init(&fr);
+ if(!_recode&&_enc->state.curframe_num>0)oc_mcenc_start(_enc,&mcenc);
oc_mode_scheme_chooser_reset(&_enc->chooser);
oc_enc_tokenize_start(_enc);
- oc_plane_state_plane_setup(_enc,&ps,0);
+ oc_enc_pipeline_init(_enc,&pipe);
_enc->mv_bits[0]=_enc->mv_bits[1]=0;
interbits=intrabits=0;
last_mv[0]=last_mv[1]=prior_mv[0]=prior_mv[1]=0;
@@ -1129,253 +1223,285 @@
frags=_enc->state.frags;
frag_mvs=_enc->state.frag_mvs;
sbi_end=_enc->state.fplanes[0].nsbs;
- for(sbi=0;sbi<sbi_end;sbi++){
- int quadi;
- /*Mode addressing is through Y plane, always 4 MB per SB.*/
- for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
- unsigned mbi;
- int mb_mode;
- int dx;
- int dy;
- int mapii;
- int mapi;
- int bi;
- ptrdiff_t fragi;
- mbi=sbi<<2|quadi;
- if(!_recode&&_enc->state.curframe_num>0){
- /*Motion estimation:
- We always do a basic 1MV search for all macroblocks, coded or not,
- keyframe or not.*/
- /*Move the motion vector predictors back a frame.*/
- memmove(embs[mbi].analysis_mv+1,
- embs[mbi].analysis_mv,2*sizeof(embs[mbi].analysis_mv[0]));
- /*Search the last frame.*/
- oc_mcenc_search(_enc,&mcenc,mbi,OC_FRAME_PREV);
- /*Search the golden frame.*/
- oc_mcenc_search(_enc,&mcenc,mbi,OC_FRAME_GOLD);
- }
- dx=dy=0;
- if(_enc->state.frame_type==OC_INTRA_FRAME){
- mb_modes[mbi]=mb_mode=OC_MODE_INTRA;
- oc_enc_mb_transform_quantize_luma(_enc,&ps,mbi,0,&fr);
- }
- else{
- oc_mode_choice modes[8];
- int mb_mv_bits_0;
- int mb_gmv_bits_0;
- int mb_4mv_bits_0;
- int mb_4mv_bits_1;
- int inter_mv_pref;
- /*Find the block choice with the lowest estimated coding cost.
- If a Cb or Cr block is coded but no Y' block from a macro block then
- the mode MUST be OC_MODE_INTER_NOMV.
- This is the default state to which the mode data structure is
- initialised in encoder and decoder at the start of each frame.*/
- /*Block coding cost is estimated from correlated SATD metrics.*/
- /*At this point, all blocks that are in frame are still marked coded.*/
- if(!_recode){
- memcpy(embs[mbi].unref_mv,
- embs[mbi].analysis_mv[0],sizeof(embs[mbi].unref_mv));
- embs[mbi].refined=0;
+ vdec=!(_enc->state.info.pixel_fmt&2);
+ notstart=0;
+ notdone=1;
+ for(stripe_sby=0;notdone;stripe_sby+=pipe.mcu_nvsbs){
+ const oc_fragment_plane *fplane;
+ int sby_end;
+ fplane=_enc->state.fplanes+0;
+ sby_end=fplane->nvsbs;
+ notdone=stripe_sby+pipe.mcu_nvsbs<sby_end;
+ if(notdone)sby_end=stripe_sby+pipe.mcu_nvsbs;
+ sbi=stripe_sby*fplane->nhsbs;
+ sbi_end=sby_end*fplane->nhsbs;
+ pipe.fragy0[0]=stripe_sby<<2;
+ pipe.fragy_end[0]=sby_end<<2;
+ for(;sbi<sbi_end;sbi++){
+ int quadi;
+ /*Mode addressing is through Y plane, always 4 MB per SB.*/
+ for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+ unsigned mbi;
+ int mb_mode;
+ int dx;
+ int dy;
+ int mapii;
+ int mapi;
+ int bi;
+ ptrdiff_t fragi;
+ mbi=sbi<<2|quadi;
+ if(!_recode&&_enc->state.curframe_num>0){
+ /*Motion estimation:
+ We always do a basic 1MV search for all macroblocks, coded or not,
+ keyframe or not.*/
+ /*Move the motion vector predictors back a frame.*/
+ memmove(embs[mbi].analysis_mv+1,
+ embs[mbi].analysis_mv,2*sizeof(embs[mbi].analysis_mv[0]));
+ /*Search the last frame.*/
+ oc_mcenc_search(_enc,&mcenc,mbi,OC_FRAME_PREV);
+ /*Search the golden frame.*/
+ oc_mcenc_search(_enc,&mcenc,mbi,OC_FRAME_GOLD);
}
- oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,OC_MODE_INTER_NOMV,qi);
- oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,qi);
- intrabits+=modes[OC_MODE_INTRA].rate;
- mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
- OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],qi);
- oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
- OC_MODE_INTER_MV_LAST,last_mv,qi);
- oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
- OC_MODE_INTER_MV_LAST2,prior_mv,qi);
- oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
- OC_MODE_GOLDEN_NOMV,qi);
- mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
- OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],qi);
- mb_4mv_bits_0=oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
- embs[mbi].block_mv,qi);
- mb_4mv_bits_1=48;
- /*The explicit MV modes (2,6,7) have not yet gone through halfpel
- refinement.
- We choose the explicit MV mode that's already furthest ahead on bits
- and refine only that one.
- We have to be careful to remember which ones we've refined so that
- we don't refine it again if we re-encode this frame.*/
- inter_mv_pref=_enc->lambda*3<<OC_BIT_SCALE;
- if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&&
- modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
- if(!(embs[mbi].refined&0x80)){
- oc_mcenc_refine4mv(_enc,mbi);
- embs[mbi].refined|=0x80;
- }
- mb_4mv_bits_0=oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
- embs[mbi].ref_mv,qi);
+ dx=dy=0;
+ if(_enc->state.frame_type==OC_INTRA_FRAME){
+ mb_modes[mbi]=mb_mode=OC_MODE_INTRA;
+ oc_enc_mb_transform_quantize_luma(_enc,&pipe,mbi,0);
}
- else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
- modes[OC_MODE_INTER_MV].cost){
- if(!(embs[mbi].refined&0x40)){
- oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_GOLD);
- embs[mbi].refined|=0x40;
+ else{
+ oc_mode_choice modes[8];
+ int mb_mv_bits_0;
+ int mb_gmv_bits_0;
+ int mb_4mv_bits_0;
+ int mb_4mv_bits_1;
+ int inter_mv_pref;
+ /*Find the block choice with the lowest estimated coding cost.
+ If a Cb or Cr block is coded but no Y' block from a macro block then
+ the mode MUST be OC_MODE_INTER_NOMV.
+ This is the default state to which the mode data structure is
+ initialised in encoder and decoder at the start of each frame.*/
+ /*Block coding cost is estimated from correlated SATD metrics.*/
+ /*At this point, all blocks that are in frame are still marked coded.*/
+ if(!_recode){
+ memcpy(embs[mbi].unref_mv,
+ embs[mbi].analysis_mv[0],sizeof(embs[mbi].unref_mv));
+ embs[mbi].refined=0;
}
+ oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,OC_MODE_INTER_NOMV,qi);
+ oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,qi);
+ intrabits+=modes[OC_MODE_INTRA].rate;
+ mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
+ OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],qi);
+ oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
+ OC_MODE_INTER_MV_LAST,last_mv,qi);
+ oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
+ OC_MODE_INTER_MV_LAST2,prior_mv,qi);
+ oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
+ OC_MODE_GOLDEN_NOMV,qi);
mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
- OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],qi);
- }
- if(!(embs[mbi].refined&0x04)){
- oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
- embs[mbi].refined|=0x04;
- }
- mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
- OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],qi);
- /*Finally, pick the mode with the cheapest estimated bit cost.*/
- mb_mode=0;
- if(modes[1].cost<modes[0].cost)mb_mode=1;
- if(modes[3].cost<modes[mb_mode].cost)mb_mode=3;
- if(modes[4].cost<modes[mb_mode].cost)mb_mode=4;
- if(modes[5].cost<modes[mb_mode].cost)mb_mode=5;
- if(modes[6].cost<modes[mb_mode].cost)mb_mode=6;
- if(modes[7].cost<modes[mb_mode].cost)mb_mode=7;
- /*We prefer OC_MODE_INTER_MV, but not over LAST and LAST2.*/
- if(mb_mode==OC_MODE_INTER_MV_LAST||mb_mode==OC_MODE_INTER_MV_LAST2){
- inter_mv_pref=0;
- }
- if(modes[2].cost<modes[mb_mode].cost+inter_mv_pref)mb_mode=2;
- mb_modes[mbi]=mb_mode;
- /*Propagate the MVs to the luma blocks.*/
- if(mb_mode!=OC_MODE_INTER_MV_FOUR){
- switch(mb_mode){
- case OC_MODE_INTER_MV:{
- dx=embs[mbi].analysis_mv[0][OC_FRAME_PREV][0];
- dy=embs[mbi].analysis_mv[0][OC_FRAME_PREV][1];
- }break;
- case OC_MODE_INTER_MV_LAST:{
- dx=last_mv[0];
- dy=last_mv[1];
- }break;
- case OC_MODE_INTER_MV_LAST2:{
- dx=prior_mv[0];
- dy=prior_mv[1];
- }break;
- case OC_MODE_GOLDEN_MV:{
- dx=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][0];
- dy=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][1];
- }break;
+ OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],qi);
+ mb_4mv_bits_0=oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
+ embs[mbi].block_mv,qi);
+ mb_4mv_bits_1=48;
+ /*The explicit MV modes (2,6,7) have not yet gone through halfpel
+ refinement.
+ We choose the explicit MV mode that's already furthest ahead on bits
+ and refine only that one.
+ We have to be careful to remember which ones we've refined so that
+ we don't refine it again if we re-encode this frame.*/
+ inter_mv_pref=_enc->lambda*3<<OC_BIT_SCALE;
+ if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&&
+ modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
+ if(!(embs[mbi].refined&0x80)){
+ oc_mcenc_refine4mv(_enc,mbi);
+ embs[mbi].refined|=0x80;
+ }
+ mb_4mv_bits_0=oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
+ embs[mbi].ref_mv,qi);
}
- for(bi=0;bi<4;bi++){
- fragi=mb_maps[mbi][0][bi];
- frag_mvs[fragi][0]=(signed char)dx;
- frag_mvs[fragi][1]=(signed char)dy;
+ else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
+ modes[OC_MODE_INTER_MV].cost){
+ if(!(embs[mbi].refined&0x40)){
+ oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_GOLD);
+ embs[mbi].refined|=0x40;
+ }
+ mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
+ OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],qi);
}
- }
- if(oc_enc_mb_transform_quantize_luma(_enc,&ps,mbi,
- modes[mb_mode].overhead,&fr)>0){
- int orig_mb_mode;
- orig_mb_mode=mb_mode;
- mb_mode=mb_modes[mbi];
- switch(mb_mode){
- case OC_MODE_INTER_MV:{
- memcpy(prior_mv,last_mv,sizeof(prior_mv));
- /*If we're backing out from 4MV, find the MV we're actually
- using.*/
- if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){
- for(bi=0;;bi++){
+ if(!(embs[mbi].refined&0x04)){
+ oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
+ embs[mbi].refined|=0x04;
+ }
+ mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
+ OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],qi);
+ /*Finally, pick the mode with the cheapest estimated bit cost.*/
+ mb_mode=0;
+ if(modes[1].cost<modes[0].cost)mb_mode=1;
+ if(modes[3].cost<modes[mb_mode].cost)mb_mode=3;
+ if(modes[4].cost<modes[mb_mode].cost)mb_mode=4;
+ if(modes[5].cost<modes[mb_mode].cost)mb_mode=5;
+ if(modes[6].cost<modes[mb_mode].cost)mb_mode=6;
+ if(modes[7].cost<modes[mb_mode].cost)mb_mode=7;
+ /*We prefer OC_MODE_INTER_MV, but not over LAST and LAST2.*/
+ if(mb_mode==OC_MODE_INTER_MV_LAST||mb_mode==OC_MODE_INTER_MV_LAST2){
+ inter_mv_pref=0;
+ }
+ if(modes[2].cost<modes[mb_mode].cost+inter_mv_pref)mb_mode=2;
+ mb_modes[mbi]=mb_mode;
+ /*Propagate the MVs to the luma blocks.*/
+ if(mb_mode!=OC_MODE_INTER_MV_FOUR){
+ switch(mb_mode){
+ case OC_MODE_INTER_MV:{
+ dx=embs[mbi].analysis_mv[0][OC_FRAME_PREV][0];
+ dy=embs[mbi].analysis_mv[0][OC_FRAME_PREV][1];
+ }break;
+ case OC_MODE_INTER_MV_LAST:{
+ dx=last_mv[0];
+ dy=last_mv[1];
+ }break;
+ case OC_MODE_INTER_MV_LAST2:{
+ dx=prior_mv[0];
+ dy=prior_mv[1];
+ }break;
+ case OC_MODE_GOLDEN_MV:{
+ dx=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][0];
+ dy=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][1];
+ }break;
+ }
+ for(bi=0;bi<4;bi++){
+ fragi=mb_maps[mbi][0][bi];
+ frag_mvs[fragi][0]=(signed char)dx;
+ frag_mvs[fragi][1]=(signed char)dy;
+ }
+ }
+ if(oc_enc_mb_transform_quantize_luma(_enc,&pipe,mbi,
+ modes[mb_mode].overhead)>0){
+ int orig_mb_mode;
+ orig_mb_mode=mb_mode;
+ mb_mode=mb_modes[mbi];
+ switch(mb_mode){
+ case OC_MODE_INTER_MV:{
+ memcpy(prior_mv,last_mv,sizeof(prior_mv));
+ /*If we're backing out from 4MV, find the MV we're actually
+ using.*/
+ if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){
+ for(bi=0;;bi++){
+ fragi=mb_maps[mbi][0][bi];
+ if(frags[fragi].coded){
+ memcpy(last_mv,frag_mvs[fragi],sizeof(last_mv));
+ dx=frag_mvs[fragi][0];
+ dy=frag_mvs[fragi][1];
+ break;
+ }
+ }
+ mb_mv_bits_0=OC_MV_BITS[0][dx+31]+OC_MV_BITS[0][dy+31];
+ }
+ /*Otherwise we used the original analysis MV.*/
+ else{
+ memcpy(last_mv,
+ embs[mbi].analysis_mv[0][OC_FRAME_PREV],sizeof(last_mv));
+ }
+ _enc->mv_bits[0]+=mb_mv_bits_0;
+ _enc->mv_bits[1]+=12;
+ }break;
+ case OC_MODE_INTER_MV_LAST2:{
+ oc_mv tmp_mv;
+ memcpy(tmp_mv,prior_mv,sizeof(tmp_mv));
+ memcpy(prior_mv,last_mv,sizeof(prior_mv));
+ memcpy(last_mv,tmp_mv,sizeof(last_mv));
+ }break;
+ case OC_MODE_GOLDEN_MV:{
+ _enc->mv_bits[0]+=mb_gmv_bits_0;
+ _enc->mv_bits[1]+=12;
+ }break;
+ case OC_MODE_INTER_MV_FOUR:{
+ oc_mv lbmvs[4];
+ oc_mv cbmvs[4];
+ memcpy(prior_mv,last_mv,sizeof(prior_mv));
+ for(bi=0;bi<4;bi++){
fragi=mb_maps[mbi][0][bi];
if(frags[fragi].coded){
memcpy(last_mv,frag_mvs[fragi],sizeof(last_mv));
- dx=frag_mvs[fragi][0];
- dy=frag_mvs[fragi][1];
- break;
+ memcpy(lbmvs[bi],frag_mvs[fragi],sizeof(lbmvs[bi]));
+ _enc->mv_bits[0]+=OC_MV_BITS[0][frag_mvs[fragi][0]+31]
+ +OC_MV_BITS[0][frag_mvs[fragi][1]+31];
+ _enc->mv_bits[1]+=12;
}
+ /*Replace the block MVs for not-coded blocks with (0,0).*/
+ else memset(lbmvs[bi],0,sizeof(lbmvs[bi]));
}
- mb_mv_bits_0=OC_MV_BITS[0][dx+31]+OC_MV_BITS[0][dy+31];
- }
- /*Otherwise we used the original analysis MV.*/
- else{
- memcpy(last_mv,
- embs[mbi].analysis_mv[0][OC_FRAME_PREV],sizeof(last_mv));
- }
- _enc->mv_bits[0]+=mb_mv_bits_0;
- _enc->mv_bits[1]+=12;
- }break;
- case OC_MODE_INTER_MV_LAST2:{
- oc_mv tmp_mv;
- memcpy(tmp_mv,prior_mv,sizeof(tmp_mv));
- memcpy(prior_mv,last_mv,sizeof(prior_mv));
- memcpy(last_mv,tmp_mv,sizeof(last_mv));
- }break;
- case OC_MODE_GOLDEN_MV:{
- _enc->mv_bits[0]+=mb_gmv_bits_0;
- _enc->mv_bits[1]+=12;
- }break;
- case OC_MODE_INTER_MV_FOUR:{
- oc_mv lbmvs[4];
- oc_mv cbmvs[4];
- memcpy(prior_mv,last_mv,sizeof(prior_mv));
- for(bi=0;bi<4;bi++){
- fragi=mb_maps[mbi][0][bi];
- if(frags[fragi].coded){
- memcpy(last_mv,frag_mvs[fragi],sizeof(last_mv));
- memcpy(lbmvs[bi],frag_mvs[fragi],sizeof(lbmvs[bi]));
- _enc->mv_bits[0]+=OC_MV_BITS[0][frag_mvs[fragi][0]+31]
- +OC_MV_BITS[0][frag_mvs[fragi][1]+31];
- _enc->mv_bits[1]+=12;
+ (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
+ for(mapii=4;mapii<nmap_idxs;mapii++){
+ mapi=map_idxs[mapii];
+ pli=mapi>>2;
+ bi=mapi&3;
+ fragi=mb_maps[mbi][pli][bi];
+ frags[fragi].mb_mode=mb_mode;
+ memcpy(frag_mvs[fragi],cbmvs[bi],sizeof(frag_mvs[fragi]));
}
- /*Replace the block MVs for not-coded blocks with (0,0).*/
- else memset(lbmvs[bi],0,sizeof(lbmvs[bi]));
- }
- (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
- for(mapii=4;mapii<nmap_idxs;mapii++){
- mapi=map_idxs[mapii];
- pli=mapi>>2;
- bi=mapi&3;
- fragi=mb_maps[mbi][pli][bi];
- frags[fragi].mb_mode=mb_mode;
- memcpy(frag_mvs[fragi],cbmvs[bi],sizeof(frag_mvs[fragi]));
- }
- }break;
+ }break;
+ }
+ coded_mbis[ncoded_mbis++]=mbi;
+ oc_mode_scheme_chooser_update(&_enc->chooser,mb_mode);
+ interbits+=modes[mb_mode].rate+modes[mb_mode].overhead;
}
- coded_mbis[ncoded_mbis++]=mbi;
- oc_mode_scheme_chooser_update(&_enc->chooser,mb_mode);
- interbits+=modes[mb_mode].rate+modes[mb_mode].overhead;
+ else{
+ *(uncoded_mbis-++nuncoded_mbis)=mbi;
+ mb_mode=OC_MODE_INTER_NOMV;
+ dx=dy=0;
+ }
}
- else{
- *(uncoded_mbis-++nuncoded_mbis)=mbi;
- mb_mode=OC_MODE_INTER_NOMV;
- dx=dy=0;
+ /*Propagate final MB mode and MVs to the chroma blocks.
+ This has already been done for 4MV mode, since it requires individual
+ block motion vectors.*/
+ if(mb_mode!=OC_MODE_INTER_MV_FOUR){
+ for(mapii=4;mapii<nmap_idxs;mapii++){
+ mapi=map_idxs[mapii];
+ pli=mapi>>2;
+ bi=mapi&3;
+ fragi=mb_maps[mbi][pli][bi];
+ frags[fragi].mb_mode=mb_mode;
+ frag_mvs[fragi][0]=(signed char)dx;
+ frag_mvs[fragi][1]=(signed char)dy;
+ }
}
}
- /*Propagate final MB mode and MVs to the chroma blocks.
- This has already been done for 4MV mode, since it requires individual
- block motion vectors.*/
- if(mb_mode!=OC_MODE_INTER_MV_FOUR){
- for(mapii=4;mapii<nmap_idxs;mapii++){
- mapi=map_idxs[mapii];
- pli=mapi>>2;
- bi=mapi&3;
- fragi=mb_maps[mbi][pli][bi];
- frags[fragi].mb_mode=mb_mode;
- frag_mvs[fragi][0]=(signed char)dx;
- frag_mvs[fragi][1]=(signed char)dy;
- }
+ oc_fr_finish_sb(pipe.fr+0);
+ sb_flags[sbi].coded_fully=pipe.fr[0].sb_full_last;
+ sb_flags[sbi].coded_partially=pipe.fr[0].sb_partial_last;
+ }
+ oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,0,notstart,notdone);
+ /*Code chroma planes.*/
+ for(pli=1;pli<3;pli++){
+ fplane=_enc->state.fplanes+pli;
+ sbi=fplane->sboffset+(stripe_sby>>vdec)*fplane->nhsbs;
+ pipe.fragy0[pli]=stripe_sby<<2-vdec;
+ if(notdone){
+ sbi_end=sbi+(sby_end-stripe_sby>>vdec)*fplane->nhsbs;
+ pipe.fragy_end[pli]=sby_end<<2-vdec;
}
+ else{
+ sbi_end=fplane->sboffset+fplane->nsbs;
+ pipe.fragy_end[pli]=fplane->nvfrags;
+ }
+ oc_enc_sb_transform_quantize_chroma(_enc,&pipe,pli,sbi,sbi_end);
+ oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,pli,notstart,notdone);
}
- oc_fr_finish_sb(&fr);
- sb_flags[sbi].coded_fully=fr.sb_full_last;
- sb_flags[sbi].coded_partially=fr.sb_partial_last;
+ notstart=1;
}
- /*Code Cb plane.*/
- oc_plane_state_plane_setup(_enc,&ps,1);
- sbi=sbi_end;
- sbi_end=sbi+_enc->state.fplanes[1].nsbs;
- oc_enc_sb_transform_quantize_chroma(_enc,&ps,sbi,sbi_end,&fr);
- /*Code Cr plane.*/
- oc_plane_state_plane_setup(_enc,&ps,2);
- sbi=sbi_end;
- sbi_end=sbi+_enc->state.fplanes[2].nsbs;
- oc_enc_sb_transform_quantize_chroma(_enc,&ps,sbi,sbi_end,&fr);
+ /*Finish filling in the reference frame borders.*/
+ refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
+ for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
+ /*Finish adding flagging overhead costs to inter bit counts to determine if
+ we should have coded a key frame instead.*/
if(_enc->state.frame_type!=OC_INTRA_FRAME){
if(interbits>intrabits)return 1;
- /*Finish adding flagging overhead costs to inter bit counts.*/
- oc_fr_flush(&fr);
- interbits+=fr.bits<<OC_BIT_SCALE;
+ /*Technically the chroma plane counts are over-estimations, because they
+ don't account for continuing runs from the luma planes, but the
+ inaccuracy is small.*/
+ for(pli=0;pli<3;pli++){
+ oc_fr_flush(pipe.fr+pli);
+ interbits+=pipe.fr[pli].bits<<OC_BIT_SCALE;
+ }
interbits+=OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
interbits+=
_enc->chooser.scheme_bits[_enc->chooser.scheme_list[0]]<<OC_BIT_SCALE;
Modified: branches/theora-thusnelda/lib/enc/encint.h
===================================================================
--- branches/theora-thusnelda/lib/enc/encint.h 2009-06-14 18:45:42 UTC (rev 16129)
+++ branches/theora-thusnelda/lib/enc/encint.h 2009-06-14 18:50:22 UTC (rev 16130)
@@ -335,6 +335,10 @@
oc_token_checkpoint **_stack,int _acmin);
void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
const oc_token_checkpoint *_stack,int _n);
+void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,int _pli,int _y0,int _yend);
+void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
+ const ptrdiff_t *_coded_fragis,ptrdiff_t _ncoded_fragis,
+ int _prev_ndct_tokens1,int _prev_eob_run1);
void oc_enc_tokenize_finish(oc_enc_ctx *_enc);
Modified: branches/theora-thusnelda/lib/enc/encode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encode.c 2009-06-14 18:45:42 UTC (rev 16129)
+++ branches/theora-thusnelda/lib/enc/encode.c 2009-06-14 18:50:22 UTC (rev 16130)
@@ -946,6 +946,15 @@
oc_enc_calc_lambda(_enc,OC_INTRA_FRAME);
oc_enc_analyze(_enc,OC_INTRA_FRAME,_recode);
oc_enc_frame_pack(_enc);
+ /*On the first frame, the previous call was an initial dry-run to prime
+ feed-forward statistics.*/
+ if(!_recode&&_enc->state.curframe_num==0){
+ if(_enc->state.info.target_bitrate>0){
+ oc_enc_update_rc_state(_enc,oggpackB_bytes(&_enc->opb)<<3,
+ OC_INTRA_FRAME,_enc->state.qis[0],1);
+ }
+ oc_enc_compress_keyframe(_enc,1);
+ }
}
static void oc_enc_compress_frame(oc_enc_ctx *_enc,int _recode){
@@ -1261,40 +1270,15 @@
_enc->state.ref_frame_idx[OC_FRAME_SELF]=refi;
_enc->state.curframe_num+=_enc->prev_dup_count+1;
/*Step 4: Compress the frame.*/
- /*Don't allow the generation of invalid files that overflow the
- keyframe_granule_shift.*/
+ /*Start with a keyframe, and don't allow the generation of invalid files that
+ overflow the keyframe_granule_shift.*/
if(_enc->state.curframe_num==0||
_enc->state.curframe_num-_enc->state.keyframe_num+_enc->dup_count>=
_enc->keyframe_frequency_force){
oc_enc_compress_keyframe(_enc,0);
- /*On the first frame, the previous call was an initial dry-run to prime
- feed-forward statistics.*/
- if(_enc->state.curframe_num==0){
- if(_enc->state.info.target_bitrate>0){
- oc_enc_update_rc_state(_enc,oggpackB_bytes(&_enc->opb)<<3,
- OC_INTRA_FRAME,_enc->state.qis[0],1);
- }
- oc_enc_compress_keyframe(_enc,1);
- }
}
/*Compress the frame.*/
else oc_enc_compress_frame(_enc,0);
- /*Step 5: Finish reconstruction.
- TODO: Move this inline with compression process.*/
- {
- int bv[256];
- int loop_filter;
- loop_filter=!oc_state_loop_filter_init(&_enc->state,bv);
- for(pli=0;pli<3;pli++){
- if(loop_filter){
- oc_state_loop_filter_frag_rows(&_enc->state,bv,refi,pli,
- 0,_enc->state.fplanes[pli].nvfrags);
- }
- oc_state_borders_fill_rows(&_enc->state,refi,pli,
- 0,_enc->state.ref_frame_bufs[refi][pli].height);
- oc_state_borders_fill_caps(&_enc->state,refi,pli);
- }
- }
oc_restore_fpu(&_enc->state);
/*Update state variables.*/
_enc->packet_state=OC_PACKET_READY;
Modified: branches/theora-thusnelda/lib/enc/mcenc.c
===================================================================
--- branches/theora-thusnelda/lib/enc/mcenc.c 2009-06-14 18:45:42 UTC (rev 16129)
+++ branches/theora-thusnelda/lib/enc/mcenc.c 2009-06-14 18:50:22 UTC (rev 16130)
@@ -264,22 +264,22 @@
may cause increased degredation in many blocks to come.
We could artificially reduce lambda to compensate, but it's faster to just
disable it entirely, and use D (the distortion) as the sole criterion.*/
- const ptrdiff_t *frag_buf_offs;
- const ptrdiff_t *fragis;
- const unsigned char *src;
- const unsigned char *ref;
- int ystride;
- oc_mb_enc_info *embs;
- ogg_int32_t hit_cache[31];
- ogg_int32_t hitbit;
- unsigned best_block_err[4];
- unsigned block_err[4];
- unsigned best_err;
- int best_vec[2];
- int best_block_vec[4][2];
- int candx;
- int candy;
- int bi;
+ const ptrdiff_t *frag_buf_offs;
+ const ptrdiff_t *fragis;
+ const unsigned char *src;
+ const unsigned char *ref;
+ int ystride;
+ oc_mb_enc_info *embs;
+ ogg_int32_t hit_cache[31];
+ ogg_int32_t hitbit;
+ unsigned best_block_err[4];
+ unsigned block_err[4];
+ unsigned best_err;
+ int best_vec[2];
+ int best_block_vec[4][2];
+ int candx;
+ int candy;
+ int bi;
embs=_enc->mb_info;
/*Find some candidate motion vectors.*/
oc_mcenc_find_candidates(_enc,_mcenc,_mbi,_frame);
@@ -701,14 +701,14 @@
}
void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi){
- oc_mb_enc_info *embs;
- const ptrdiff_t *frag_buf_offs;
- const ptrdiff_t *fragis;
- const unsigned char *src;
- const unsigned char *ref;
- int offset_y[9];
- int ystride;
- int bi;
+ oc_mb_enc_info *embs;
+ const ptrdiff_t *frag_buf_offs;
+ const ptrdiff_t *fragis;
+ const unsigned char *src;
+ const unsigned char *ref;
+ int offset_y[9];
+ int ystride;
+ int bi;
ystride=_enc->state.ref_ystride[0];
frag_buf_offs=_enc->state.frag_buf_offs;
fragis=_enc->state.mb_maps[_mbi][0];
Modified: branches/theora-thusnelda/lib/enc/tokenize.c
===================================================================
--- branches/theora-thusnelda/lib/enc/tokenize.c 2009-06-14 18:45:42 UTC (rev 16129)
+++ branches/theora-thusnelda/lib/enc/tokenize.c 2009-06-14 18:50:22 UTC (rev 16130)
@@ -392,7 +392,7 @@
return total_bits;
}
-static void oc_enc_pred_dc_rows(oc_enc_ctx *_enc,int _pli,int _y0,int _yend){
+void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,int _pli,int _y0,int _yend){
const oc_fragment_plane *fplane;
const oc_fragment *frags;
ogg_int16_t *frag_dc;
@@ -420,239 +420,242 @@
}
}
-static void oc_enc_tokenize_dc(oc_enc_ctx *_enc){
+void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
+ const ptrdiff_t *_coded_fragis,ptrdiff_t _ncoded_fragis,
+ int _prev_ndct_tokens1,int _prev_eob_run1){
const ogg_int16_t *frag_dc;
- const ptrdiff_t *coded_fragis;
- ptrdiff_t ncoded_fragis;
ptrdiff_t fragii;
- int pli;
+ unsigned char *dct_tokens0;
+ unsigned char *dct_tokens1;
+ ogg_uint16_t *extra_bits0;
+ ogg_uint16_t *extra_bits1;
+ ptrdiff_t ti0;
+ ptrdiff_t ti1r;
+ ptrdiff_t ti1w;
+ int eob_run0;
+ int eob_run1;
+ int neobs1;
+ int token;
+ int eb;
+ /*eb1 and token1 are always initialized before use; if your compiler thinks
+ otherwise, it is dumb.*/
+ int token1;
+ int eb1;
+ /*Return immediately if there are no coded fragments; otherwise we'd flush
+ any trailing EOB run into the AC 1 list and never read it back out.*/
+ if(_ncoded_fragis<=0)return;
frag_dc=_enc->frag_dc;
- coded_fragis=_enc->state.coded_fragis;
- ncoded_fragis=fragii=0;
- for(pli=0;pli<3;pli++){
- unsigned char *dct_tokens0;
- unsigned char *dct_tokens1;
- ogg_uint16_t *extra_bits0;
- ogg_uint16_t *extra_bits1;
- ptrdiff_t ti0;
- ptrdiff_t ti1r;
- ptrdiff_t ti1w;
- int eob_run0;
- int eob_run1;
- int neobs1;
- int token;
- int eb;
- int token1;
- int eb1;
- /*TODO: Move this inline with reconstruction.*/
- oc_enc_pred_dc_rows(_enc,pli,0,_enc->state.fplanes[pli].nvfrags);
- dct_tokens0=_enc->dct_tokens[pli][0];
- dct_tokens1=_enc->dct_tokens[pli][1];
- extra_bits0=_enc->extra_bits[pli][0];
- extra_bits1=_enc->extra_bits[pli][1];
- ncoded_fragis+=_enc->state.ncoded_fragis[pli];
- ti0=ti1w=ti1r=0;
- eob_run0=eob_run1=neobs1=0;
- for(;fragii<ncoded_fragis;fragii++){
- int val;
- /*All tokens in the 1st AC coefficient stack are regenerated as the DC
- coefficients are produced.
- This can be done in-place; stack 1 cannot get larger.*/
- if(!neobs1){
- /*There's no active EOB run in stack 1; read the next token.*/
- token1=dct_tokens1[ti1r];
- eb1=extra_bits1[ti1r];
- ti1r++;
- if(token1<OC_NDCT_EOB_TOKEN_MAX){
- neobs1=oc_decode_eob_token(token1,eb1);
- /*It's an EOB run; add it to the current (inactive) one.
- Because we may have moved entries to stack 0, we may have an
- opportunity to merge two EOB runs in stack 1.*/
- eob_run1+=neobs1;
- }
+ dct_tokens0=_enc->dct_tokens[_pli][0];
+ dct_tokens1=_enc->dct_tokens[_pli][1];
+ extra_bits0=_enc->extra_bits[_pli][0];
+ extra_bits1=_enc->extra_bits[_pli][1];
+ ti0=_enc->ndct_tokens[_pli][0];
+ ti1w=ti1r=_prev_ndct_tokens1;
+ eob_run0=_enc->eob_run[_pli][0];
+ /*Flush any trailing EOB run for the 1st AC coefficient.
+ This is needed to allow us to track tokens to the end of the list.*/
+ eob_run1=_enc->eob_run[_pli][1];
+ if(eob_run1>0)oc_enc_eob_log(_enc,_pli,1,eob_run1);
+ /*If there was an active EOB run at the start of the 1st AC stack, read it
+ in and decode it.*/
+ if(_prev_eob_run1>0){
+ token1=dct_tokens1[ti1r];
+ eb1=extra_bits1[ti1r];
+ ti1r++;
+ eob_run1=oc_decode_eob_token(token1,eb1);
+ /*Consume the portion of the run that came before these fragments.*/
+ neobs1=eob_run1-_prev_eob_run1;
+ }
+ else eob_run1=neobs1=0;
+ for(fragii=0;fragii<_ncoded_fragis;fragii++){
+ int val;
+ /*All tokens in the 1st AC coefficient stack are regenerated as the DC
+ coefficients are produced.
+ This can be done in-place; stack 1 cannot get larger.*/
+ if(!neobs1){
+ /*There's no active EOB run in stack 1; read the next token.*/
+ token1=dct_tokens1[ti1r];
+ eb1=extra_bits1[ti1r];
+ ti1r++;
+ if(token1<OC_NDCT_EOB_TOKEN_MAX){
+ neobs1=oc_decode_eob_token(token1,eb1);
+ /*It's an EOB run; add it to the current (inactive) one.
+ Because we may have moved entries to stack 0, we may have an
+ opportunity to merge two EOB runs in stack 1.*/
+ eob_run1+=neobs1;
}
- val=frag_dc[coded_fragis[fragii]];
- if(val){
- /*There was a non-zero DC value, so there's no alteration to stack 1
- for this fragment; just code the stack 0 token.*/
- /*Flush any pending EOB run.*/
- if(eob_run0>0){
+ }
+ val=frag_dc[_coded_fragis[fragii]];
+ if(val){
+ /*There was a non-zero DC value, so there's no alteration to stack 1
+ for this fragment; just code the stack 0 token.*/
+ /*Flush any pending EOB run.*/
+ if(eob_run0>0){
+ token=oc_make_eob_token_full(eob_run0,&eb);
+ dct_tokens0[ti0]=(unsigned char)token;
+ extra_bits0[ti0]=(ogg_uint16_t)eb;
+ ti0++;
+ eob_run0=0;
+ }
+ token=oc_make_dct_token_full(0,0,val,&eb);
+ dct_tokens0[ti0]=(unsigned char)token;
+ extra_bits0[ti0]=(ogg_uint16_t)eb;
+ ti0++;
+ }
+ else{
+ /*Zero DC value; that means the entry in stack 1 might need to be coded
+ from stack 0.
+ This requires a stack 1 fixup.*/
+ if(neobs1>0){
+ /*We're in the middle of an active EOB run in stack 1.
+ Move it to stack 0.*/
+ if(++eob_run0>=4095){
token=oc_make_eob_token_full(eob_run0,&eb);
dct_tokens0[ti0]=(unsigned char)token;
extra_bits0[ti0]=(ogg_uint16_t)eb;
ti0++;
eob_run0=0;
}
- token=oc_make_dct_token_full(0,0,val,&eb);
- dct_tokens0[ti0]=(unsigned char)token;
- extra_bits0[ti0]=(ogg_uint16_t)eb;
- ti0++;
+ eob_run1--;
}
else{
- /*Zero DC value; that means the entry in stack 1 might need to be coded
- from stack 0.
- This requires a stack 1 fixup.*/
- if(neobs1){
- /*We're in the middle of an active EOB run in stack 1.
- Move it to stack 0.*/
- if(++eob_run0>=4095){
- token=oc_make_eob_token_full(eob_run0,&eb);
- dct_tokens0[ti0]=(unsigned char)token;
- extra_bits0[ti0]=(ogg_uint16_t)eb;
- ti0++;
- eob_run0=0;
- }
- eob_run1--;
+ /*No active EOB run in stack 1, so we can't extend one in stack 0.
+ Flush it if we've got it.*/
+ if(eob_run0>0){
+ token=oc_make_eob_token_full(eob_run0,&eb);
+ dct_tokens0[ti0]=(unsigned char)token;
+ extra_bits0[ti0]=(ogg_uint16_t)eb;
+ ti0++;
+ eob_run0=0;
}
- else{
- /*No active EOB run in stack 1, so we can't extend one in stack 0.
- Flush it if we've got it.*/
- if(eob_run0>0){
- token=oc_make_eob_token_full(eob_run0,&eb);
- dct_tokens0[ti0]=(unsigned char)token;
- extra_bits0[ti0]=(ogg_uint16_t)eb;
- ti0++;
- eob_run0=0;
- }
- /*Stack 1 token is one of: a pure zero run token, a single
- coefficient token, or a zero run/coefficient combo token.
- A zero run token is expanded and moved to token stack 0, and the
- stack 1 entry dropped.
- A single coefficient value may be transformed into combo token that
- is moved to stack 0, or if it cannot be combined, it is left alone
- and a single length-1 zero run is emitted in stack 0.
- A combo token is extended and moved to stack 0.
- During AC coding, we restrict the run lengths on combo tokens for
- stack 1 to guarantee we can extend them.*/
- switch(token1){
- case OC_DCT_SHORT_ZRL_TOKEN:{
- if(eb1<7){
- dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
- extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
- ti0++;
- /*Don't write the AC coefficient back out.*/
- continue;
- }
- /*Fall through.*/
- }
- case OC_DCT_ZRL_TOKEN:{
- dct_tokens0[ti0]=OC_DCT_ZRL_TOKEN;
+ /*Stack 1 token is one of: a pure zero run token, a single
+ coefficient token, or a zero run/coefficient combo token.
+ A zero run token is expanded and moved to token stack 0, and the
+ stack 1 entry dropped.
+ A single coefficient value may be transformed into combo token that
+ is moved to stack 0, or if it cannot be combined, it is left alone
+ and a single length-1 zero run is emitted in stack 0.
+ A combo token is extended and moved to stack 0.
+ During AC coding, we restrict the run lengths on combo tokens for
+ stack 1 to guarantee we can extend them.*/
+ switch(token1){
+ case OC_DCT_SHORT_ZRL_TOKEN:{
+ if(eb1<7){
+ dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
ti0++;
/*Don't write the AC coefficient back out.*/
- }continue;
- case OC_ONE_TOKEN:
- case OC_MINUS_ONE_TOKEN:{
- dct_tokens0[ti0]=OC_DCT_RUN_CAT1A;
- extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_ONE_TOKEN);
- ti0++;
- /*Don't write the AC coefficient back out.*/
- }continue;
- case OC_TWO_TOKEN:
- case OC_MINUS_TWO_TOKEN:{
- dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
- extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_TWO_TOKEN<<1);
- ti0++;
- /*Don't write the AC coefficient back out.*/
- }continue;
- case OC_DCT_VAL_CAT2:{
- dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
- extra_bits0[ti0]=(ogg_uint16_t)((eb1<<1)+1);
- ti0++;
- /*Don't write the AC coefficient back out.*/
- }continue;
- case OC_DCT_RUN_CAT1A:
- case OC_DCT_RUN_CAT1A+1:
- case OC_DCT_RUN_CAT1A+2:
- case OC_DCT_RUN_CAT1A+3:{
- dct_tokens0[ti0]=(unsigned char)(token1+1);
- extra_bits0[ti0]=(ogg_uint16_t)eb1;
- ti0++;
- /*Don't write the AC coefficient back out.*/
- }continue;
- case OC_DCT_RUN_CAT1A+4:{
+ continue;
+ }
+ /*Fall through.*/
+ }
+ case OC_DCT_ZRL_TOKEN:{
+ dct_tokens0[ti0]=OC_DCT_ZRL_TOKEN;
+ extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
+ ti0++;
+ /*Don't write the AC coefficient back out.*/
+ }continue;
+ case OC_ONE_TOKEN:
+ case OC_MINUS_ONE_TOKEN:{
+ dct_tokens0[ti0]=OC_DCT_RUN_CAT1A;
+ extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_ONE_TOKEN);
+ ti0++;
+ /*Don't write the AC coefficient back out.*/
+ }continue;
+ case OC_TWO_TOKEN:
+ case OC_MINUS_TWO_TOKEN:{
+ dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
+ extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_TWO_TOKEN<<1);
+ ti0++;
+ /*Don't write the AC coefficient back out.*/
+ }continue;
+ case OC_DCT_VAL_CAT2:{
+ dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
+ extra_bits0[ti0]=(ogg_uint16_t)((eb1<<1)+1);
+ ti0++;
+ /*Don't write the AC coefficient back out.*/
+ }continue;
+ case OC_DCT_RUN_CAT1A:
+ case OC_DCT_RUN_CAT1A+1:
+ case OC_DCT_RUN_CAT1A+2:
+ case OC_DCT_RUN_CAT1A+3:{
+ dct_tokens0[ti0]=(unsigned char)(token1+1);
+ extra_bits0[ti0]=(ogg_uint16_t)eb1;
+ ti0++;
+ /*Don't write the AC coefficient back out.*/
+ }continue;
+ case OC_DCT_RUN_CAT1A+4:{
+ dct_tokens0[ti0]=OC_DCT_RUN_CAT1B;
+ extra_bits0[ti0]=(ogg_uint16_t)(eb1<<2);
+ ti0++;
+ /*Don't write the AC coefficient back out.*/
+ }continue;
+ case OC_DCT_RUN_CAT1B:{
+ if((eb1&3)<3){
dct_tokens0[ti0]=OC_DCT_RUN_CAT1B;
- extra_bits0[ti0]=(ogg_uint16_t)(eb1<<2);
- ti0++;
- /*Don't write the AC coefficient back out.*/
- }continue;
- case OC_DCT_RUN_CAT1B:{
- if((eb1&3)<3){
- dct_tokens0[ti0]=OC_DCT_RUN_CAT1B;
- extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
- ti0++;
- /*Don't write the AC coefficient back out.*/
- continue;
- }
- eb1=((eb1&4)<<1)-1;
- /*Fall through.*/
- }
- case OC_DCT_RUN_CAT1C:{
- dct_tokens0[ti0]=OC_DCT_RUN_CAT1C;
extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
ti0++;
/*Don't write the AC coefficient back out.*/
- }continue;
- case OC_DCT_RUN_CAT2A:{
- eb1=(eb1<<1)-1;
- /*Fall through.*/
+ continue;
}
- case OC_DCT_RUN_CAT2B:{
- dct_tokens0[ti0]=OC_DCT_RUN_CAT2B;
- extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
- ti0++;
- /*Don't write the AC coefficient back out.*/
- }continue;
+ eb1=((eb1&4)<<1)-1;
+ /*Fall through.*/
}
- /*We can't merge tokens, write a short zero run and keep going.*/
- dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
- extra_bits0[ti0]=0;
- ti0++;
+ case OC_DCT_RUN_CAT1C:{
+ dct_tokens0[ti0]=OC_DCT_RUN_CAT1C;
+ extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
+ ti0++;
+ /*Don't write the AC coefficient back out.*/
+ }continue;
+ case OC_DCT_RUN_CAT2A:{
+ eb1=(eb1<<1)-1;
+ /*Fall through.*/
+ }
+ case OC_DCT_RUN_CAT2B:{
+ dct_tokens0[ti0]=OC_DCT_RUN_CAT2B;
+ extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
+ ti0++;
+ /*Don't write the AC coefficient back out.*/
+ }continue;
}
+ /*We can't merge tokens, write a short zero run and keep going.*/
+ dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
+ extra_bits0[ti0]=0;
+ ti0++;
}
- if(!neobs1){
- /*Flush any (inactive) EOB run.*/
- if(eob_run1>0){
- token=oc_make_eob_token_full(eob_run1,&eb);
- dct_tokens1[ti1w]=(unsigned char)token;
- extra_bits1[ti1w]=(ogg_uint16_t)eb;
- ti1w++;
- eob_run1=0;
- }
- /*There's no active EOB run, so log the current token.*/
- dct_tokens1[ti1w]=(unsigned char)token1;
- extra_bits1[ti1w]=(ogg_uint16_t)eb1;
+ }
+ if(!neobs1){
+ /*Flush any (inactive) EOB run.*/
+ if(eob_run1>0){
+ token=oc_make_eob_token_full(eob_run1,&eb);
+ dct_tokens1[ti1w]=(unsigned char)token;
+ extra_bits1[ti1w]=(ogg_uint16_t)eb;
ti1w++;
+ eob_run1=0;
}
- else{
- /*Otherwise consume one EOB from the current run.*/
- neobs1--;
- /*If we have more than 4095 EOBs outstanding in stack1, flush the run.*/
- if(eob_run1-neobs1>=4095){
- token=oc_make_eob_token_full(4095,&eb);
- dct_tokens1[ti1w]=(unsigned char)token;
- extra_bits1[ti1w]=(ogg_uint16_t)eb;
- ti1w++;
- eob_run1-=4095;
- }
+ /*There's no active EOB run, so log the current token.*/
+ dct_tokens1[ti1w]=(unsigned char)token1;
+ extra_bits1[ti1w]=(ogg_uint16_t)eb1;
+ ti1w++;
+ }
+ else{
+ /*Otherwise consume one EOB from the current run.*/
+ neobs1--;
+ /*If we have more than 4095 EOBs outstanding in stack1, flush the run.*/
+ if(eob_run1-neobs1>=4095){
+ token=oc_make_eob_token_full(4095,&eb);
+ dct_tokens1[ti1w]=(unsigned char)token;
+ extra_bits1[ti1w]=(ogg_uint16_t)eb;
+ ti1w++;
+ eob_run1-=4095;
}
}
- /*Flush the trailing EOB runs.*/
- if(eob_run0>0){
- token=oc_make_eob_token_full(eob_run0,&eb);
- dct_tokens0[ti0]=(unsigned char)token;
- extra_bits0[ti0]=(ogg_uint16_t)eb;
- ti0++;
- }
- if(eob_run1>0){
- token=oc_make_eob_token_full(eob_run1,&eb);
- dct_tokens1[ti1w]=(unsigned char)token;
- extra_bits1[ti1w]=(ogg_uint16_t)eb;
- ti1w++;
- }
- _enc->ndct_tokens[pli][0]=ti0;
- _enc->ndct_tokens[pli][1]=ti1w;
}
+ /*Save the current state.*/
+ _enc->ndct_tokens[_pli][0]=ti0;
+ _enc->ndct_tokens[_pli][1]=ti1w;
+ _enc->eob_run[_pli][0]=eob_run0;
+ _enc->eob_run[_pli][1]=eob_run1;
}
/*DC prediction, post-facto DC tokenization (has to be completed after DC
@@ -660,16 +663,12 @@
void oc_enc_tokenize_finish(oc_enc_ctx *_enc){
int pli;
int zzi;
- /*Emit final EOB runs for the AC coefficients.
- This must be done before we tokenize the DC coefficients, so we can
- properly track the 1st AC coefficient to the end of the list.*/
- for(pli=0;pli<3;pli++)for(zzi=1;zzi<64;zzi++){
+ /*Emit final EOB runs.*/
+ for(pli=0;pli<3;pli++)for(zzi=0;zzi<64;zzi++){
int eob_run;
eob_run=_enc->eob_run[pli][zzi];
if(eob_run>0)oc_enc_eob_log(_enc,pli,zzi,eob_run);
}
- /*Fill in the DC token list and fix-up the 1st AC coefficient.*/
- oc_enc_tokenize_dc(_enc);
/*Merge the final EOB run of one token list with the start of the next, if
possible.*/
for(zzi=0;zzi<64;zzi++)for(pli=0;pli<3;pli++){
@@ -681,6 +680,8 @@
int new_eb;
int zzj;
int plj;
+ /*ti is always initialized before use; if your compiler thinks otherwise,
+ it is dumb.*/
ptrdiff_t ti;
int run_count;
/*Make sure this coefficient has tokens at all.*/
@@ -704,7 +705,6 @@
/*Ensure its last token was an EOB run.*/
if(old_tok1>=OC_NDCT_EOB_TOKEN_MAX)continue;
/*Pull off the associated extra bits, if any, and decode the runs.*/
- /*ti is always initialized; if your compiler thinks otherwise, it is dumb.*/
old_eb1=_enc->extra_bits[plj][zzj][ti];
old_eb2=_enc->extra_bits[pli][zzi][0];
run_count=oc_decode_eob_token(old_tok1,old_eb1)
More information about the commits mailing list