[xiph-commits] r17328 - experimental/derf/theora-ptalarbvorm/lib

gmaxwell at svn.xiph.org
Thu Jul 8 11:03:11 PDT 2010


Author: gmaxwell
Date: 2010-07-08 11:03:11 -0700 (Thu, 08 Jul 2010)
New Revision: 17328

Modified:
   experimental/derf/theora-ptalarbvorm/lib/analyze.c
   experimental/derf/theora-ptalarbvorm/lib/encint.h
   experimental/derf/theora-ptalarbvorm/lib/rate.c
   experimental/derf/theora-ptalarbvorm/lib/tokenize.c
Log:
Rename speed level 2 to 3 and add a new speed level 2.  Both levels use a new, simple, fast tokenizer.  The new speed level 2 is intended to provide quality similar to speed level 1 but is currently about 33% faster, through the use of the new tokenizer and by disabling 3qi and 4mv.  Speed level 3 also uses the new tokenizer and is about 33% faster than the prior speed level 2.
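
Usage note (not part of the patch): applications reach these levels through the existing th_encode_ctl() speed-level requests.  A minimal sketch, assuming an already-initialized th_enc_ctx, which clamps to the advertised maximum before asking for the new fast-analysis level:

    #include <theora/theoraenc.h>

    /*Illustrative only: request the new fast-analysis speed level (2) when the
       encoder advertises it.  After this commit, 2 maps to
       OC_SP_LEVEL_FAST_ANALYSIS and 3 to the old "no MC" level.*/
    static void set_fast_analysis(th_enc_ctx *_enc){
      int speed_max;
      int speed;
      th_encode_ctl(_enc,TH_ENCCTL_GET_SPLEVEL_MAX,&speed_max,sizeof(speed_max));
      speed=speed_max<2?speed_max:2;
      th_encode_ctl(_enc,TH_ENCCTL_SET_SPLEVEL,&speed,sizeof(speed));
    }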

Modified: experimental/derf/theora-ptalarbvorm/lib/analyze.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/analyze.c	2010-07-07 22:38:00 UTC (rev 17327)
+++ experimental/derf/theora-ptalarbvorm/lib/analyze.c	2010-07-08 18:03:11 UTC (rev 17328)
@@ -761,9 +761,11 @@
   int                     nmv_offs;
   int                     ac_bits;
   int                     borderi;
+  int                     nqis;
   int                     qti;
   int                     qii;
   int                     dc;
+  nqis=_enc->state.nqis;
   frags=_enc->state.frags;
   frag_offs=_enc->state.frag_buf_offs[_fragi];
   ystride=_enc->state.ref_ystride[_pli];
@@ -841,8 +843,14 @@
   dc=data[0];
   /*Tokenize.*/
   checkpoint=*_stack;
-  ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
-   _stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
+  if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+    ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
+     _stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
+  }
+  else{
+    ac_bits=oc_enc_tokenize_ac_fast(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
+     _stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
+  }
   /*Reconstruct.
     TODO: nonzero may need to be adjusted after tokenization.*/
   dequant_dc=dequant[0];
@@ -866,8 +874,10 @@
     data[0]=dc*dequant_dc;
     oc_idct8x8(&_enc->state,data,nonzero+1);
   }
-  oc_qii_state_advance(&qs,_pipe->qs+_pli,qii);
-  ac_bits+=qs.bits-_pipe->qs[_pli].bits;
+  if(nqis>1){
+    oc_qii_state_advance(&qs,_pipe->qs+_pli,qii);
+    ac_bits+=qs.bits-_pipe->qs[_pli].bits;
+  }
   if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
   else{
     oc_enc_frag_recon_inter(_enc,dst,
@@ -923,7 +933,10 @@
     _mo->ac_bits+=ac_bits;
     oc_fr_code_block(_fr);
   }
-  *(_pipe->qs+_pli)=*&qs;
+  /*GCC 4.4.4 generates a warning here because it can't tell that
+     the init code in the nqis check above will run anytime this
+     line runs.*/
+  if(nqis>1)*(_pipe->qs+_pli)=*&qs;
   frags[_fragi].dc=dc;
   frags[_fragi].coded=1;
   return 1;
@@ -1267,6 +1280,23 @@
   return luma;
 }
 
+static unsigned oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi,
+ unsigned _activity[4], unsigned _intra_satd[12]){
+  int bi;
+  for(bi=0;bi<4;bi++){
+    unsigned act;
+    act=(11*_intra_satd[bi]>>8)*_intra_satd[bi];
+    if(act<8<<12){
+      /*The region is flat.*/
+      act=OC_MINI(act,5<<12);
+    }
+    _activity[bi]=act;
+  }
+  /*TODO: Once frag_intra_satd returns the signed DC value instead
+     of the absolute value, this should pass it through.*/
+  return 1;
+}
+
 /*Compute the masking scales for the blocks in a macro block.
   All masking is computed from the luma blocks.
   We derive scaling factors for the chroma blocks from these, and use the same
@@ -1374,6 +1404,46 @@
   return activity_sum;
 }
 
+static void oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
+ unsigned _frag_satd[12]){
+  const unsigned char   *src;
+  const ptrdiff_t       *frag_buf_offs;
+  const ptrdiff_t       *sb_map;
+  const oc_mb_map_plane *mb_map;
+  const unsigned char   *map_idxs;
+  int                    map_nidxs;
+  int                    mapii;
+  int                    mapi;
+  int                    ystride;
+  int                    pli;
+  int                    bi;
+  ptrdiff_t              fragi;
+  ptrdiff_t              frag_offs;
+  unsigned               dc;
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
+  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
+  ystride=_enc->state.ref_ystride[0];
+  for(bi=0;bi<4;bi++){
+    fragi=sb_map[bi];
+    frag_offs=frag_buf_offs[fragi];
+    _frag_satd[bi]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+  }
+  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
+  ystride=_enc->state.ref_ystride[1];
+  for(mapii=4;mapii<map_nidxs;mapii++){
+    mapi=map_idxs[mapii];
+    pli=mapi>>2;
+    bi=mapi&3;
+    fragi=mb_map[pli][bi];
+    frag_offs=frag_buf_offs[fragi];
+    _frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+  }
+}
+
 /*Select luma block-level quantizers for a MB in an INTRA frame.*/
 static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
  const oc_qii_state *_qs,unsigned _mbi,const unsigned _rd_scale[4]){
@@ -1612,6 +1682,7 @@
   oc_sb_flags            *sb_flags;
   signed char            *mb_modes;
   const oc_mb_map        *mb_maps;
+  const oc_sb_map        *sb_maps;
   oc_fragment            *frags;
   unsigned                stripe_sby;
   unsigned                mcu_nvsbs;
@@ -1639,6 +1710,7 @@
   sb_flags=_enc->state.sb_flags;
   mb_modes=_enc->state.mb_modes;
   mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
   frags=_enc->state.frags;
   notstart=0;
   notdone=1;
@@ -1665,7 +1737,15 @@
         ptrdiff_t fragi;
         mbi=sbi<<2|quadi;
         /*Activity masking.*/
-        luma=oc_mb_activity(_enc,mbi,activity);
+        if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+          luma=oc_mb_activity(_enc,mbi,activity);
+        }
+        else{
+          unsigned intra_satd[12];
+          oc_mb_intra_satd(_enc,mbi,intra_satd);
+          luma=oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
+          for(bi=0;bi<4;bi++)frags[sb_maps[mbi>>2][mbi&3][bi]].qii=0;
+        }
         activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
          chroma_rd_scale,activity,activity_avg,luma,luma_avg);
         luma_sum+=luma;
@@ -1676,7 +1756,8 @@
          _enc->sp_level<OC_SP_LEVEL_NOMC&&_enc->keyframe_frequency_force>1){
           oc_mcenc_search(_enc,mbi);
         }
-        oc_analyze_intra_mb_luma(_enc,pipe.qs+0,mbi,rd_scale);
+        if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS)
+          oc_analyze_intra_mb_luma(_enc,pipe.qs+0,mbi,rd_scale);
         mb_modes[mbi]=OC_MODE_INTRA;
         oc_enc_mb_transform_quantize_intra_luma(_enc,&pipe,
          mbi,rd_scale,rd_iscale);
@@ -1789,10 +1870,13 @@
     satd=_frag_satd[bi];
     *(ft+0)=*&fr;
     oc_fr_code_block(ft+0);
-    oc_qii_state_advance(qt+0,&qs,0);
     cur_overhead=ft[0].bits-fr.bits;
     best_rate=oc_dct_cost2(_enc,&best_ssd,0,0,_qti,satd)
-     +(cur_overhead+qt[0].bits-qs.bits<<OC_BIT_SCALE);
+     +(cur_overhead<<OC_BIT_SCALE);
+    if(nqis>1){
+      oc_qii_state_advance(qt+0,&qs,0);
+      best_rate+=qt[0].bits-qs.bits<<OC_BIT_SCALE;
+    }
     best_ssd=OC_RD_SCALE(best_ssd,_rd_scale[bi]);
     best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
     best_fri=0;
@@ -1910,7 +1994,7 @@
   _modec->rate=rate;
 }
 
-static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe,
+static unsigned oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe,
  unsigned _mbi,const unsigned _rd_scale[4],unsigned _ssd[12]){
   const unsigned char   *src;
   const unsigned char   *ref;
@@ -1923,6 +2007,7 @@
   oc_mv                 *mvs;
   int                    map_nidxs;
   unsigned               uncoded_ssd;
+  unsigned               total_ssd;
   int                    mapii;
   int                    mapi;
   int                    pli;
@@ -1937,6 +2022,7 @@
   frag_buf_offs=_enc->state.frag_buf_offs;
   sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
   mvs=_enc->mb_info[_mbi].block_mv;
+  total_ssd=0;
   for(bi=0;bi<4;bi++){
     fragi=sb_map[bi];
     borderi=frags[fragi].borderi;
@@ -1956,6 +2042,7 @@
        hard limit.*/
     if(mvs[bi][0]!=0||mvs[bi][1]!=0)uncoded_ssd*=2;
     _pipe->skip_ssd[0][fragi-_pipe->froffset[0]]=_ssd[bi]=uncoded_ssd;
+    total_ssd+=uncoded_ssd>>4;
   }
   mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
   map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
@@ -1986,50 +2073,13 @@
          a hard limit*/
       if(mvs[OC_FRAME_PREV][0]!=0||mvs[OC_FRAME_PREV][1]!=0)uncoded_ssd*=2;
       _pipe->skip_ssd[pli][fragi-_pipe->froffset[pli]]=_ssd[mapii]=uncoded_ssd;
+      total_ssd+=uncoded_ssd>>4;
     }
     map_nidxs=(map_nidxs-4<<1)+4;
   }
+  return total_ssd;
 }
 
-static void oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
- unsigned _frag_satd[12]){
-  const unsigned char   *src;
-  const ptrdiff_t       *frag_buf_offs;
-  const ptrdiff_t       *sb_map;
-  const oc_mb_map_plane *mb_map;
-  const unsigned char   *map_idxs;
-  int                    map_nidxs;
-  int                    mapii;
-  int                    mapi;
-  int                    ystride;
-  int                    pli;
-  int                    bi;
-  ptrdiff_t              fragi;
-  ptrdiff_t              frag_offs;
-  unsigned               dc;
-  frag_buf_offs=_enc->state.frag_buf_offs;
-  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
-  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
-  ystride=_enc->state.ref_ystride[0];
-  for(bi=0;bi<4;bi++){
-    fragi=sb_map[bi];
-    frag_offs=frag_buf_offs[fragi];
-    _frag_satd[bi]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
-  }
-  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
-  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
-  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
-  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
-  ystride=_enc->state.ref_ystride[1];
-  for(mapii=4;mapii<map_nidxs;mapii++){
-    mapi=map_idxs[mapii];
-    pli=mapi>>2;
-    bi=mapi&3;
-    fragi=mb_map[pli][bi];
-    frag_offs=frag_buf_offs[fragi];
-    _frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
-  }
-}
 
 static void oc_cost_intra(oc_enc_ctx *_enc,oc_mode_choice *_modec,
  unsigned _mbi,const oc_fr_state *_fr,const oc_qii_state *_qs,
@@ -2297,6 +2347,8 @@
   unsigned                sbi_end;
   int                     refi;
   int                     pli;
+  int                     sp_level;
+  sp_level=_enc->sp_level;
   set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt];
   _enc->state.frame_type=OC_INTER_FRAME;
   oc_mode_scheme_chooser_reset(&_enc->chooser);
@@ -2350,6 +2402,7 @@
         unsigned       skip_ssd[12];
         unsigned       intra_satd[12];
         unsigned       luma;
+        unsigned       uncoded_ssd;
         int            mb_mv_bits_0;
         int            mb_gmv_bits_0;
         int            inter_mv_pref;
@@ -2362,15 +2415,21 @@
         int            bi;
         ptrdiff_t      fragi;
         mbi=sbi<<2|quadi;
+        oc_mb_intra_satd(_enc,mbi,intra_satd);
         /*Activity masking.*/
-        luma=oc_mb_activity(_enc,mbi,activity);
+        if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+          luma=oc_mb_activity(_enc,mbi,activity);
+        }
+        else{
+          luma=oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
+        }
         luma_sum+=luma;
         activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
          chroma_rd_scale,activity,activity_avg,luma,luma_avg);
         /*Motion estimation:
           We always do a basic 1MV search for all macroblocks, coded or not,
            keyframe or not.*/
-        if(!_recode&&_enc->sp_level<OC_SP_LEVEL_NOMC)oc_mcenc_search(_enc,mbi);
+        if(!_recode&&sp_level<OC_SP_LEVEL_NOMC)oc_mcenc_search(_enc,mbi);
         dx=dy=0;
         /*Find the block choice with the lowest estimated coding cost.
           If a Cb or Cr block is coded but no Y' block from a macro block then
@@ -2384,7 +2443,6 @@
            embs[mbi].analysis_mv[0],sizeof(embs[mbi].unref_mv));
           embs[mbi].refined=0;
         }
-        oc_mb_intra_satd(_enc,mbi,intra_satd);
         /*Estimate the cost of coding this MB in a keyframe.*/
         if(_allow_keyframe){
           oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
@@ -2396,10 +2454,10 @@
           }
         }
         /*Estimate the cost in a delta frame for various modes.*/
-        oc_skip_cost(_enc,&pipe,mbi,rd_scale,skip_ssd);
-        oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
-         OC_MODE_INTER_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
-        if(_enc->sp_level<OC_SP_LEVEL_NOMC){
+        uncoded_ssd=oc_skip_cost(_enc,&pipe,mbi,rd_scale,skip_ssd);
+        if(sp_level<OC_SP_LEVEL_NOMC){
+          oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
+           OC_MODE_INTER_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
           oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
            pipe.fr+0,pipe.qs+0,intra_satd,skip_ssd,rd_scale);
           mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
@@ -2409,8 +2467,6 @@
            OC_MODE_INTER_MV_LAST,last_mv,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
           oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
            OC_MODE_INTER_MV_LAST2,prior_mv,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
-          oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
-           embs[mbi].block_mv,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
           oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
            OC_MODE_GOLDEN_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
           mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
@@ -2423,8 +2479,15 @@
             We have to be careful to remember which ones we've refined so that
              we don't refine it again if we re-encode this frame.*/
           inter_mv_pref=_enc->lambda*3;
+          if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
+            oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
+             embs[mbi].block_mv,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+          }
+          else{
+            modes[OC_MODE_INTER_MV_FOUR].cost=UINT_MAX;
+          }
           if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&&
-           modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
+             modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
             if(!(embs[mbi].refined&0x80)){
               oc_mcenc_refine4mv(_enc,mbi);
               embs[mbi].refined|=0x80;
@@ -2477,7 +2540,11 @@
             mb_mode=OC_MODE_INTER_MV;
           }
         }
-        else{
+        else {
+          oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
+           OC_MODE_INTER_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
+           pipe.fr+0,pipe.qs+0,intra_satd,skip_ssd,rd_scale);
           oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
            OC_MODE_GOLDEN_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
           mb_mode=OC_MODE_INTER_NOMV;

Modified: experimental/derf/theora-ptalarbvorm/lib/encint.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/encint.h	2010-07-07 22:38:00 UTC (rev 17327)
+++ experimental/derf/theora-ptalarbvorm/lib/encint.h	2010-07-08 18:03:11 UTC (rev 17328)
@@ -49,13 +49,15 @@
 #define OC_PACKET_READY (1)
 
 /*All features enabled.*/
-#define OC_SP_LEVEL_SLOW       (0)
+#define OC_SP_LEVEL_SLOW          (0)
 /*Enable early skip.*/
-#define OC_SP_LEVEL_EARLY_SKIP (1)
+#define OC_SP_LEVEL_EARLY_SKIP    (1)
+/*Use analysis shortcuts, single quantizer, and faster tokenization.*/
+#define OC_SP_LEVEL_FAST_ANALYSIS (2)
 /*Disable motion compensation.*/
-#define OC_SP_LEVEL_NOMC       (2)
+#define OC_SP_LEVEL_NOMC          (3)
 /*Maximum valid speed level.*/
-#define OC_SP_LEVEL_MAX        (2)
+#define OC_SP_LEVEL_MAX           (3)
 
 
 /*The number of extra bits of precision at which to store rate metrics.*/
@@ -522,6 +524,9 @@
 int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
  ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
  int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin);
+int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
+ ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin);
 void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
  const oc_token_checkpoint *_stack,int _n);
 void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,

Modified: experimental/derf/theora-ptalarbvorm/lib/rate.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/rate.c	2010-07-07 22:38:00 UTC (rev 17327)
+++ experimental/derf/theora-ptalarbvorm/lib/rate.c	2010-07-08 18:03:11 UTC (rev 17328)
@@ -190,7 +190,8 @@
     This may need to be revised if the R-D cost estimation or qii flag
      optimization strategies change.*/
   nqis=1;
-  if(lq<(OC_Q57(56)>>3)&&!_enc->vp3_compatible){
+  if(lq<(OC_Q57(56)>>3)&&!_enc->vp3_compatible&&
+   _enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
     qi1=oc_enc_find_qi_for_target(_enc,_qti,OC_MAXI(qi-1,0),0,
      lq+(OC_Q57(7)+5)/10);
     if(qi1!=qi)_enc->state.qis[nqis++]=qi1;

Modified: experimental/derf/theora-ptalarbvorm/lib/tokenize.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/tokenize.c	2010-07-07 22:38:00 UTC (rev 17327)
+++ experimental/derf/theora-ptalarbvorm/lib/tokenize.c	2010-07-08 18:03:11 UTC (rev 17328)
@@ -80,7 +80,7 @@
   21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,
   21,21,21,21,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,
   19,19,19,19,19,19,19,19,18,18,18,18,17,17,16,15,14,13,12,10,
-  -1,
+   7,
    9,11,13,14,15,16,17,17,18,18,18,18,19,19,19,19,19,19,19,19,
   20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,21,21,21,21,
   21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,
@@ -171,7 +171,7 @@
     25,  24,  23,  22,  21,  20,  19,  18,  17,  16,
     15,  14,  13,  12,  11,  10,   9,   8,   7,   6,
      5,   4,   3,   2,   1,   1,   1,   1,   0,   0,
-    -1,
+     0,
      0,   0,   0,   0,   0,   0,   0,   1,   0,   1,
      2,   3,   0,   1,   2,   3,   4,   5,   6,   7,
      0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
@@ -746,6 +746,240 @@
   return bits;
 }
 
+/*Simplistic R/D tokenizer.
+  This could be made more accurate by using more sophisticated
+   rate predictions for zeros.
+  It could be made faster by switching from R/D decisions to static
+   lambda-derived rounding biases.*/
+int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
+ ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin){
+  OC_ALIGN16(ogg_int16_t  coef[64]);
+  const unsigned char *dct_fzig_zag;
+  ogg_uint16_t        *eob_run;
+  oc_token_checkpoint *stack;
+  int                  huffi;
+  int                  zzi;
+  int                  zzj;
+  int                  zzk;
+  int                  total_bits;
+  int                  zr[4];
+  stack=*_stack;
+  total_bits=0;
+  /*The apparent bit-cost of coding a zero from observing the trellis
+     quantizer is pre-combined with lambda.
+    Four predictive cases are considered: the last optimized value is zero
+     (+2) or non-zero, and the non-optimized value is zero (+1) or non-zero.*/
+  zr[0]=3*_lambda>>1;
+  zr[1]=_lambda;
+  zr[2]=4*_lambda;
+  zr[3]=7*_lambda>>1;
+  eob_run=_enc->eob_run[_pli];
+  dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
+  huffi=_enc->huff_idxs[_enc->state.frame_type][1][_pli+1>>1];
+  memcpy(coef,_qdct,_zzi*sizeof(*coef));
+  for(zzj=zzi=1;zzj<_zzi&&!coef[zzj];zzj++);
+  while(zzj<_zzi){
+    int v;
+    int d0;
+    int d1;
+    int sign;
+    int k;
+    int eob;
+    int dq0;
+    int dq1;
+    int dd0;
+    int dd1;
+    int next_zero;
+    int eob_bits;
+    int dct_fzig_zzj;
+    dct_fzig_zzj=dct_fzig_zag[zzj];
+    v=_dct[OC_FZIG_ZAG[zzj]];
+    d0=coef[zzj];
+    eob=eob_run[zzi];
+    for(zzk=zzj+1;zzk<_zzi&&!coef[zzk];zzk++);
+    next_zero=zzk-zzj+62>>6;
+    dq0=d0*_dequant[zzj];
+    dd0=dq0-v;
+    dd0*=dd0;
+    sign=-(d0<0);
+    k=d0+sign^sign;
+    d1=(k-(zzj>_acmin))+sign^sign;
+    dq1=d1*_dequant[zzj];
+    dd1=dq1-v;
+    dd1*=dd1;
+    /*The cost of ending an eob run is included when the alternative is to
+      extend this eob run.
+      A per qi/zzi weight would probably be useful.
+      Including it in the overall tokenization cost was not helpful.
+      The same is true at the far end of the zero run plus token case.*/
+    if(eob>0&&d1==0&&zzk==_zzi){
+      eob_bits=oc_token_bits(_enc,huffi,zzi,OC_DCT_EOB1_TOKEN);
+    }
+    else eob_bits=0;
+    if(zzj==zzi){
+      /*No active zero run.*/
+      int best_token;
+      int best_eb;
+      int token;
+      int best_bits;
+      int bits;
+      int cost;
+      best_token=*(OC_DCT_VALUE_TOKEN_PTR+d0);
+      best_bits=oc_token_bits(_enc,huffi,zzi,best_token);
+      if(d1!=0){
+        token=*(OC_DCT_VALUE_TOKEN_PTR+d1);
+        bits=oc_token_bits(_enc,huffi,zzi,token);
+        cost=dd1+(bits+eob_bits)*_lambda;
+      }
+      else{
+        token=bits=0;
+        cost=dd1+zr[next_zero];
+      }
+      if((dd0+(best_bits+eob_bits)*_lambda)>cost){
+        _qdct[dct_fzig_zzj]=dq1;
+        if(d1==0){
+          zzj=zzk;
+          continue;
+        }
+        best_bits=bits;
+        best_token=token;
+        best_eb=*(OC_DCT_VALUE_EB_PTR+d1);
+      }
+      else{
+        best_eb=*(OC_DCT_VALUE_EB_PTR+d0);
+        _qdct[dct_fzig_zzj]=dq0;
+      }
+      oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
+      if(eob>0){
+        oc_enc_eob_log(_enc,_pli,zzi,eob);
+        eob_run[zzi]=0;
+      }
+      oc_enc_token_log(_enc,_pli,zzi,best_token,best_eb);
+      total_bits+=best_bits;
+    }
+    else{
+      int d;
+      int dc_reserve;
+      int best_token;
+      int best_eb;
+      int best_bits;
+      int best_cost;
+      int best_bits1;
+      int best_token1;
+      int best_eb1;
+      int zr_bits;
+      int eob2;
+      int eob_bits2;
+      int bits;
+      int token;
+      int nzeros;
+      nzeros=zzj-zzi;
+      dc_reserve=zzi+62>>6;
+      /*A zero run, followed by the value alone.*/
+      best_token=best_token1=OC_DCT_SHORT_ZRL_TOKEN+(nzeros+55>>6);
+      best_eb=best_eb1=nzeros-1;
+      eob2=eob_run[zzj];
+      if(eob2>0){
+        eob_bits2=oc_token_bits(_enc,huffi,zzj,OC_DCT_EOB1_TOKEN);
+      }
+      else eob_bits2=0;
+      zr_bits=oc_token_bits(_enc,huffi,zzi,best_token)+eob_bits2;
+      best_bits=zr_bits
+       +oc_token_bits(_enc,huffi,zzj,*(OC_DCT_VALUE_TOKEN_PTR+d0));
+      d=d0;
+
+      best_bits1=0;
+      if(d1!=0){
+        best_bits1=zr_bits
+         +oc_token_bits(_enc,huffi,zzj,*(OC_DCT_VALUE_TOKEN_PTR+d1));
+      }
+      if(nzeros<17+dc_reserve){
+        if(k<=2){
+          /*+/- 1 combo token.*/
+          token=OC_DCT_RUN_CAT1_TOKEN[nzeros-1];
+          bits=oc_token_bits(_enc,huffi,zzi,token);
+          if(k==2&&bits<=best_bits1){
+            best_bits1=bits;
+            best_token1=token;
+            best_eb1=OC_DCT_RUN_CAT1_EB[nzeros-1][-sign];
+          }
+          if(k==1&&bits<=best_bits){
+            best_bits=bits;
+            best_token=token;
+            best_eb=OC_DCT_RUN_CAT1_EB[nzeros-1][-sign];
+          }
+        }
+        if(nzeros<3+dc_reserve&&2<=k&&k<=4){
+          /*+/- 2/3 combo token.*/
+          token=OC_DCT_RUN_CAT2A+(nzeros>>1);
+          bits=oc_token_bits(_enc,huffi,zzi,token);
+          if(k==4&&bits<=best_bits1){
+            best_bits1=bits;
+            best_token1=token;
+            best_eb1=OC_DCT_RUN_CAT2_EB[nzeros-1][-sign][1];
+          }
+          if(k!=4&&bits<=best_bits){
+            best_bits=bits;
+            best_token=token;
+            best_eb=OC_DCT_RUN_CAT2_EB[nzeros-1][-sign][k-2];
+          }
+        }
+      }
+      best_cost=dd0+(best_bits+eob_bits)*_lambda;
+      if(d1==0&&(dd1+zr[2+next_zero])<=best_cost){
+        _qdct[dct_fzig_zzj]=0;
+        zzj=zzk;
+        continue;
+      }
+      if(d1!=0&&dd1+(best_bits1+eob_bits)*_lambda<best_cost){
+        best_bits=best_bits1;
+        best_token=best_token1;
+        best_eb=best_eb1;
+        d=d1;
+        _qdct[dct_fzig_zzj]=dq1;
+      }
+      else _qdct[dct_fzig_zzj]=dq0;
+      oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
+      if(eob){
+        oc_enc_eob_log(_enc,_pli,zzi,eob);
+        eob_run[zzi]=0;
+      }
+      oc_enc_token_log(_enc,_pli,zzi,best_token,best_eb);
+      /*If a zero run won vs. the combo token we still need to code this value.*/
+      if(best_token<=OC_DCT_ZRL_TOKEN){
+        oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzj);
+        if(eob2){
+          oc_enc_eob_log(_enc,_pli,zzj,eob2);
+          /*The cost of any EOB run we disrupted is ignored because doing so
+             improved PSNR/SSIM by a small amount.*/
+          best_bits-=eob_bits2;
+          eob_run[zzj]=0;
+        }
+        oc_enc_token_log(_enc,_pli,zzj,*(OC_DCT_VALUE_TOKEN_PTR+d),*(OC_DCT_VALUE_EB_PTR+d));
+      }
+      total_bits+=best_bits;
+    }
+    zzi=zzj+1;
+    zzj=zzk;
+  }
+  /*Code an EOB run to complete this block.
+    The cost of the EOB run is not included in the total as explained in
+     a comment in the trellis tokenizer above.*/
+  if(zzi<64){
+    int eob;
+    eob=eob_run[zzi]+1;
+    oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
+    if(eob>=4095){
+      oc_enc_token_log(_enc,_pli,zzi,OC_DCT_REPEAT_RUN3_TOKEN,eob);
+      eob=0;
+    }
+    eob_run[zzi]=eob;
+  }
+  *_stack=stack;
+  return total_bits;
+}
+
 void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
  int _pli,int _fragy0,int _frag_yend){
   const oc_fragment_plane *fplane;
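
For reference, the per-coefficient choice made by oc_enc_tokenize_ac_fast() reduces to a two-way R/D comparison: keep the quantized value, or shrink its magnitude by one, whichever minimizes distortion plus lambda times rate.  The sketch below is illustrative only; token_bits() is a hypothetical stand-in for oc_token_bits(), and the real code also folds in the zero-run and EOB-run terms shown in the patch.

    /*Illustrative sketch of the fast tokenizer's core decision.
      token_bits() is hypothetical; the patch uses oc_token_bits() and also
       accounts for zero runs and EOB runs.*/
    extern int token_bits(int _val);

    static int choose_quantized_value(int _v,int _d0,int _dequant,int _lambda){
      int sign;
      int k;
      int d1;
      int dd0;
      int dd1;
      sign=-(_d0<0);
      /*|_d0| and the same-signed value with magnitude reduced by one.
        If d1 reaches zero, the real code folds the coefficient into a zero run.*/
      k=_d0+sign^sign;
      d1=k-1+sign^sign;
      dd0=_d0*_dequant-_v;
      dd1=d1*_dequant-_v;
      dd0*=dd0;
      dd1*=dd1;
      return dd0+token_bits(_d0)*_lambda<=dd1+token_bits(d1)*_lambda?_d0:d1;
    }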


