[xiph-commits] r17563 - in trunk/theora/lib: . arm x86 x86_vc

tterribe at svn.xiph.org tterribe at svn.xiph.org
Mon Oct 25 10:40:54 PDT 2010


Author: tterribe
Date: 2010-10-25 10:40:54 -0700 (Mon, 25 Oct 2010)
New Revision: 17563

Modified:
   trunk/theora/lib/analyze.c
   trunk/theora/lib/arm/armstate.c
   trunk/theora/lib/decode.c
   trunk/theora/lib/encode.c
   trunk/theora/lib/mcenc.c
   trunk/theora/lib/state.c
   trunk/theora/lib/state.h
   trunk/theora/lib/x86/mmxstate.c
   trunk/theora/lib/x86_vc/mmxstate.c
Log:
Ensure frame rows are 16-byte aligned.

We don't actually use this for anything yet, but it may help calling
 applications (e.g., doing software YUV2RGB conversion).
Also, change ref_frame_data to point directly to the desired reference frame,
 rather than require a lookup through ref_frame_idx first.
This saves an indirection and gives a 0.7% speed-up at 720p on a Cortex A8.
It should have an even bigger benefit on C64x, though it wasn't benchmarked
 there.


Modified: trunk/theora/lib/analyze.c
===================================================================
--- trunk/theora/lib/analyze.c	2010-10-24 04:50:46 UTC (rev 17562)
+++ trunk/theora/lib/analyze.c	2010-10-25 17:40:54 UTC (rev 17563)
@@ -610,14 +610,13 @@
 
 static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc,
  oc_enc_pipeline_state *_pipe,int _pli,int _sdelay,int _edelay){
-  int refi;
   /*Copy over all the uncoded fragments from this plane and advance the uncoded
      fragment list.*/
   if(_pipe->nuncoded_fragis[_pli]>0){
     _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
     oc_frag_copy_list(&_enc->state,
-     _enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_SELF]],
-     _enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]],
+     _enc->state.ref_frame_data[OC_FRAME_SELF],
+     _enc->state.ref_frame_data[OC_FRAME_PREV],
      _enc->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
      _pipe->nuncoded_fragis[_pli],_enc->state.frag_buf_offs);
     _pipe->nuncoded_fragis[_pli]=0;
@@ -636,17 +635,18 @@
   _pipe->coded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
   _pipe->ncoded_fragis[_pli]=0;
   /*Apply the loop filter if necessary.*/
-  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
   if(_pipe->loop_filter){
-    oc_state_loop_filter_frag_rows(&_enc->state,_pipe->bounding_values,
-     refi,_pli,_pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
+    oc_state_loop_filter_frag_rows(&_enc->state,
+     _pipe->bounding_values,OC_FRAME_SELF,_pli,
+     _pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
   }
   else _sdelay=_edelay=0;
   /*To fill borders, we have an additional two pixel delay, since a fragment
      in the next row could filter its top edge, using two pixels from a
      fragment in this row.
     But there's no reason to delay a full fragment between the two.*/
-  oc_state_borders_fill_rows(&_enc->state,refi,_pli,
+  oc_state_borders_fill_rows(&_enc->state,
+   _enc->state.ref_frame_idx[OC_FRAME_SELF],_pli,
    (_pipe->fragy0[_pli]-_sdelay<<3)-(_sdelay<<1),
    (_pipe->fragy_end[_pli]-_edelay<<3)-(_edelay<<1));
 }
@@ -696,8 +696,7 @@
   frags=_enc->state.frags;
   frag_offs=_enc->state.frag_buf_offs[_fragi];
   ystride=_enc->state.ref_ystride[_pli];
-  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]]
-   +frag_offs;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs;
   borderi=frags[_fragi].borderi;
   qii=frags[_fragi].qii;
   data=_enc->pipe.dct_data;
@@ -718,9 +717,8 @@
   }
   refi=frags[_fragi].refi;
   mb_mode=frags[_fragi].mb_mode;
-  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[refi]]+frag_offs;
-  dst=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_SELF]]
-   +frag_offs;
+  ref=_enc->state.ref_frame_data[refi]+frag_offs;
+  dst=_enc->state.ref_frame_data[OC_FRAME_SELF]+frag_offs;
   /*Motion compensation:*/
   switch(mb_mode){
     case OC_MODE_INTRA:{
@@ -1146,7 +1144,7 @@
   int                    bi;
   frag_buf_offs=_enc->state.frag_buf_offs;
   sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
-  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
   ystride=_enc->state.ref_ystride[0];
   luma=0;
   for(bi=0;bi<4;bi++){
@@ -1363,7 +1361,7 @@
   unsigned               dc;
   frag_buf_offs=_enc->state.frag_buf_offs;
   sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
-  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
   ystride=_enc->state.ref_ystride[0];
   for(bi=0;bi<4;bi++){
     fragi=sb_map[bi];
@@ -1412,7 +1410,7 @@
   int                  bi;
   frag_buf_offs=_enc->state.frag_buf_offs;
   sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
-  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
   ystride=_enc->state.ref_ystride[0];
   fragi=sb_maps[_mbi>>2][_mbi&3][0];
   frag_offs=frag_buf_offs[fragi];
@@ -1501,7 +1499,7 @@
   int                  lambda;
   int                  ystride;
   int                  nqis;
-  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
   ystride=_enc->state.ref_ystride[_pli];
   frag_offs=_enc->state.frag_buf_offs[_fragi];
   satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
@@ -1956,8 +1954,8 @@
   ptrdiff_t              fragi;
   ptrdiff_t              frag_offs;
   int                    borderi;
-  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
-  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
   ystride=_enc->state.ref_ystride[0];
   frags=_enc->state.frags;
   frag_buf_offs=_enc->state.frag_buf_offs;
@@ -2051,9 +2049,8 @@
   ptrdiff_t              fragi;
   ptrdiff_t              frag_offs;
   unsigned               dc;
-  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
-  ref=_enc->state.ref_frame_data[
-   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(_mb_mode)]];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[OC_FRAME_FOR_MODE(_mb_mode)];
   ystride=_enc->state.ref_ystride[0];
   frag_buf_offs=_enc->state.frag_buf_offs;
   sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
@@ -2163,8 +2160,8 @@
   int                    bits1;
   unsigned               satd;
   unsigned               dc;
-  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
-  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
   ystride=_enc->state.ref_ystride[0];
   frag_buf_offs=_enc->state.frag_buf_offs;
   frag_mvs=_enc->state.frag_mvs;

Modified: trunk/theora/lib/arm/armstate.c
===================================================================
--- trunk/theora/lib/arm/armstate.c	2010-10-24 04:50:46 UTC (rev 17562)
+++ trunk/theora/lib/arm/armstate.c	2010-10-25 17:40:54 UTC (rev 17563)
@@ -119,12 +119,12 @@
   frag_buf_off=_state->frag_buf_offs[_fragi];
   refi=_state->frags[_fragi].refi;
   ystride=_state->ref_ystride[_pli];
-  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
   if(refi==OC_FRAME_SELF)oc_frag_recon_intra_arm(dst,ystride,_dct_coeffs+64);
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
-    ref=_state->ref_frame_data[_state->ref_frame_idx[refi]]+frag_buf_off;
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi])>1){
       oc_frag_recon_inter2_arm(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
@@ -159,12 +159,12 @@
   frag_buf_off=_state->frag_buf_offs[_fragi];
   refi=_state->frags[_fragi].refi;
   ystride=_state->ref_ystride[_pli];
-  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
   if(refi==OC_FRAME_SELF)oc_frag_recon_intra_v6(dst,ystride,_dct_coeffs+64);
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
-    ref=_state->ref_frame_data[_state->ref_frame_idx[refi]]+frag_buf_off;
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi])>1){
       oc_frag_recon_inter2_v6(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
@@ -199,12 +199,12 @@
   frag_buf_off=_state->frag_buf_offs[_fragi];
   refi=_state->frags[_fragi].refi;
   ystride=_state->ref_ystride[_pli];
-  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
   if(refi==OC_FRAME_SELF)oc_frag_recon_intra_neon(dst,ystride,_dct_coeffs+64);
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
-    ref=_state->ref_frame_data[_state->ref_frame_idx[refi]]+frag_buf_off;
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi])>1){
       oc_frag_recon_inter2_neon(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,

Modified: trunk/theora/lib/decode.c
===================================================================
--- trunk/theora/lib/decode.c	2010-10-24 04:50:46 UTC (rev 17562)
+++ trunk/theora/lib/decode.c	2010-10-25 17:40:54 UTC (rev 17563)
@@ -1597,8 +1597,8 @@
   if(_pipe->nuncoded_fragis[_pli]>0){
     _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
     oc_frag_copy_list(&_dec->state,
-     _dec->state.ref_frame_data[_dec->state.ref_frame_idx[OC_FRAME_SELF]],
-     _dec->state.ref_frame_data[_dec->state.ref_frame_idx[OC_FRAME_PREV]],
+     _dec->state.ref_frame_data[OC_FRAME_SELF],
+     _dec->state.ref_frame_data[OC_FRAME_PREV],
      _dec->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
      _pipe->nuncoded_fragis[_pli],_dec->state.frag_buf_offs);
   }
@@ -2053,26 +2053,33 @@
    buffers (i.e., decoding did not start on a key frame).
   We initialize them to a solid gray here.*/
 static void oc_dec_init_dummy_frame(th_dec_ctx *_dec){
-  th_info *info;
-  size_t   yplane_sz;
-  size_t   cplane_sz;
-  int      yhstride;
-  int      yheight;
-  int      chstride;
-  int      cheight;
+  th_info   *info;
+  size_t     yplane_sz;
+  size_t     cplane_sz;
+  ptrdiff_t  yoffset;
+  int        yhstride;
+  int        yheight;
+  int        chstride;
+  int        cheight;
   _dec->state.ref_frame_idx[OC_FRAME_GOLD]=0;
   _dec->state.ref_frame_idx[OC_FRAME_PREV]=0;
   _dec->state.ref_frame_idx[OC_FRAME_SELF]=0;
+  _dec->state.ref_frame_data[OC_FRAME_GOLD]=
+   _dec->state.ref_frame_data[OC_FRAME_PREV]=
+   _dec->state.ref_frame_data[OC_FRAME_SELF]=
+   _dec->state.ref_frame_bufs[0][0].data;
   memcpy(_dec->pp_frame_buf,_dec->state.ref_frame_bufs[0],
    sizeof(_dec->pp_frame_buf[0])*3);
   info=&_dec->state.info;
-  yhstride=info->frame_width+2*OC_UMV_PADDING;
+  yhstride=abs(_dec->state.ref_ystride[0]);
   yheight=info->frame_height+2*OC_UMV_PADDING;
-  chstride=yhstride>>!(info->pixel_fmt&1);
+  chstride=abs(_dec->state.ref_ystride[1]);
   cheight=yheight>>!(info->pixel_fmt&2);
-  yplane_sz=yhstride*(size_t)yheight;
+  yplane_sz=yhstride*(size_t)yheight+16;
   cplane_sz=chstride*(size_t)cheight;
-  memset(_dec->state.ref_frame_data[0],0x80,yplane_sz+2*cplane_sz);
+  yoffset=_dec->state.ref_ystride[0]*(yheight-1)-
+   (OC_UMV_PADDING+OC_UMV_PADDING*(ptrdiff_t)yhstride);
+  memset(_dec->state.ref_frame_data[0]-yoffset,0x80,yplane_sz+2*cplane_sz);
 }
 
 int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op,
@@ -2119,6 +2126,8 @@
     for(refi=0;refi==_dec->state.ref_frame_idx[OC_FRAME_GOLD]||
      refi==_dec->state.ref_frame_idx[OC_FRAME_PREV];refi++);
     _dec->state.ref_frame_idx[OC_FRAME_SELF]=refi;
+    _dec->state.ref_frame_data[OC_FRAME_SELF]=
+     _dec->state.ref_frame_bufs[refi][0].data;
 #if defined(HAVE_CAIRO)
     _dec->telemetry_frame_bytes=_op->bytes;
 #endif
@@ -2207,7 +2216,7 @@
           sdelay+=notstart;
           edelay+=notdone;
           oc_state_loop_filter_frag_rows(&_dec->state,
-           _dec->pipe.bounding_values,refi,pli,
+           _dec->pipe.bounding_values,OC_FRAME_SELF,pli,
            _dec->pipe.fragy0[pli]-sdelay,_dec->pipe.fragy_end[pli]-edelay);
         }
         /*To fill the borders, we have an additional two pixel delay, since a
@@ -2272,11 +2281,16 @@
       _dec->state.ref_frame_idx[OC_FRAME_GOLD]=
        _dec->state.ref_frame_idx[OC_FRAME_PREV]=
        _dec->state.ref_frame_idx[OC_FRAME_SELF];
+      _dec->state.ref_frame_data[OC_FRAME_GOLD]=
+       _dec->state.ref_frame_data[OC_FRAME_PREV]=
+       _dec->state.ref_frame_data[OC_FRAME_SELF];
     }
     else{
       /*Otherwise, just replace the previous reference frame.*/
       _dec->state.ref_frame_idx[OC_FRAME_PREV]=
        _dec->state.ref_frame_idx[OC_FRAME_SELF];
+      _dec->state.ref_frame_data[OC_FRAME_PREV]=
+       _dec->state.ref_frame_data[OC_FRAME_SELF];
     }
     /*Restore the FPU before dump_frame, since that _does_ use the FPU (for PNG
        gamma values, if nothing else).*/

Modified: trunk/theora/lib/encode.c
===================================================================
--- trunk/theora/lib/encode.c	2010-10-24 04:50:46 UTC (rev 17562)
+++ trunk/theora/lib/encode.c	2010-10-25 17:40:54 UTC (rev 17563)
@@ -1255,6 +1255,8 @@
   /*Use the previous frame's reconstruction.*/
   _enc->state.ref_frame_idx[OC_FRAME_SELF]=
    _enc->state.ref_frame_idx[OC_FRAME_PREV];
+  _enc->state.ref_frame_data[OC_FRAME_SELF]=
+   _enc->state.ref_frame_data[OC_FRAME_PREV];
   /*Flag motion vector analysis about the frame drop.*/
   _enc->prevframe_dropped=1;
   /*Zero the packet.*/
@@ -1690,27 +1692,37 @@
   if(_enc->state.ref_frame_idx[OC_FRAME_SELF]>=0){
     _enc->state.ref_frame_idx[OC_FRAME_PREV]=
      _enc->state.ref_frame_idx[OC_FRAME_SELF];
+    _enc->state.ref_frame_data[OC_FRAME_PREV]=
+     _enc->state.ref_frame_data[OC_FRAME_SELF];
     if(_enc->state.frame_type==OC_INTRA_FRAME){
       /*The new frame becomes both the previous and gold reference frames.*/
       _enc->state.keyframe_num=_enc->state.curframe_num;
       _enc->state.ref_frame_idx[OC_FRAME_GOLD]=
        _enc->state.ref_frame_idx[OC_FRAME_SELF];
+      _enc->state.ref_frame_data[OC_FRAME_GOLD]=
+       _enc->state.ref_frame_data[OC_FRAME_SELF];
     }
   }
   if(_enc->state.ref_frame_idx[OC_FRAME_IO]>=0&&_enc->prevframe_dropped==0){
     _enc->state.ref_frame_idx[OC_FRAME_PREV_ORIG]=
      _enc->state.ref_frame_idx[OC_FRAME_IO];
+    _enc->state.ref_frame_data[OC_FRAME_PREV_ORIG]=
+     _enc->state.ref_frame_data[OC_FRAME_IO];
     if(_enc->state.frame_type==OC_INTRA_FRAME){
       /*The new input frame becomes both the previous and gold
          original-reference frames.*/
       _enc->state.ref_frame_idx[OC_FRAME_GOLD_ORIG]=
        _enc->state.ref_frame_idx[OC_FRAME_IO];
+      _enc->state.ref_frame_data[OC_FRAME_GOLD_ORIG]=
+       _enc->state.ref_frame_data[OC_FRAME_IO];
     }
   }
   /*Select a free buffer to use for the incoming frame*/
   for(refi=3;refi==_enc->state.ref_frame_idx[OC_FRAME_GOLD_ORIG]||
    refi==_enc->state.ref_frame_idx[OC_FRAME_PREV_ORIG];refi++);
   _enc->state.ref_frame_idx[OC_FRAME_IO]=refi;
+  _enc->state.ref_frame_data[OC_FRAME_IO]=
+   _enc->state.ref_frame_bufs[refi][0].data;
   /*Step 3: Copy the input to our internal buffer.
     This lets us add padding, so we don't have to worry about dereferencing
      possibly invalid addresses, and allows us to use the same strides and
@@ -1729,6 +1741,8 @@
   for(refi=0;refi==_enc->state.ref_frame_idx[OC_FRAME_GOLD]||
    refi==_enc->state.ref_frame_idx[OC_FRAME_PREV];refi++);
   _enc->state.ref_frame_idx[OC_FRAME_SELF]=refi;
+  _enc->state.ref_frame_data[OC_FRAME_SELF]=
+   _enc->state.ref_frame_bufs[refi][0].data;
   _enc->state.curframe_num+=_enc->prev_dup_count+1;
   /*Step 4: Compress the frame.*/
   /*Start with a keyframe, and don't allow the generation of invalid files that

Modified: trunk/theora/lib/mcenc.c
===================================================================
--- trunk/theora/lib/mcenc.c	2010-10-24 04:50:46 UTC (rev 17562)
+++ trunk/theora/lib/mcenc.c	2010-10-25 17:40:54 UTC (rev 17563)
@@ -308,9 +308,9 @@
   hit_cache[candy+15]|=(ogg_int32_t)1<<candx+15;
   frag_buf_offs=_enc->state.frag_buf_offs;
   fragis=_enc->state.mb_maps[_mbi][0];
-  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
-  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame_full]];
-  satd_ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[_frame_full];
+  satd_ref=_enc->state.ref_frame_data[_frame];
   ystride=_enc->state.ref_ystride[0];
   /*TODO: customize error function for speed/(quality+size) tradeoff.*/
   best_err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
@@ -557,8 +557,8 @@
   int                  best_site;
   int                  sitei;
   int                  err;
-  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
-  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_framei]];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[_framei];
   frag_buf_offs=_enc->state.frag_buf_offs;
   fragis=_enc->state.mb_maps[_mbi][0];
   ystride=_enc->state.ref_ystride[0];
@@ -612,8 +612,8 @@
   int                  best_site;
   int                  sitei;
   int                  err;
-  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
-  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[_frame];
   frag_buf_offs=_enc->state.frag_buf_offs;
   fragis=_enc->state.mb_maps[_mbi][0];
   ystride=_enc->state.ref_ystride[0];
@@ -763,8 +763,8 @@
   ystride=_enc->state.ref_ystride[0];
   frag_buf_offs=_enc->state.frag_buf_offs;
   fragis=_enc->state.mb_maps[_mbi][0];
-  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
-  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
   offset_y[0]=offset_y[1]=offset_y[2]=-ystride;
   offset_y[3]=offset_y[5]=0;
   offset_y[6]=offset_y[7]=offset_y[8]=ystride;

Modified: trunk/theora/lib/state.c
===================================================================
--- trunk/theora/lib/state.c	2010-10-24 04:50:46 UTC (rev 17562)
+++ trunk/theora/lib/state.c	2010-10-25 17:40:54 UTC (rev 17563)
@@ -548,6 +548,7 @@
   int            yheight;
   int            chstride;
   int            cheight;
+  ptrdiff_t      align;
   ptrdiff_t      yoffset;
   ptrdiff_t      coffset;
   ptrdiff_t     *frag_buf_offs;
@@ -563,21 +564,26 @@
   vdec=!(info->pixel_fmt&2);
   yhstride=info->frame_width+2*OC_UMV_PADDING;
   yheight=info->frame_height+2*OC_UMV_PADDING;
-  chstride=yhstride>>hdec;
+  /*Require 16-byte aligned rows in the chroma planes.*/
+  chstride=(yhstride>>hdec)+15&~15;
   cheight=yheight>>vdec;
   yplane_sz=yhstride*(size_t)yheight;
   cplane_sz=chstride*(size_t)cheight;
   yoffset=OC_UMV_PADDING+OC_UMV_PADDING*(ptrdiff_t)yhstride;
   coffset=(OC_UMV_PADDING>>hdec)+(OC_UMV_PADDING>>vdec)*(ptrdiff_t)chstride;
-  ref_frame_sz=yplane_sz+2*cplane_sz;
+  /*Although we guarantee the rows of the chroma planes are a multiple of 16
+     bytes, the initial padding on the first row may only be 8 bytes.
+    Compute the offset needed to align the actual image data to 16 bytes.*/
+  align=-coffset&15;
+  ref_frame_sz=yplane_sz+2*cplane_sz+16;
   ref_frame_data_sz=_nrefs*ref_frame_sz;
   /*Check for overflow.
     The same caveats apply as for oc_state_frarray_init().*/
-  if(yplane_sz/yhstride!=yheight||2*cplane_sz<cplane_sz||
+  if(yplane_sz/yhstride!=yheight||2*cplane_sz+16<cplane_sz||
    ref_frame_sz<yplane_sz||ref_frame_data_sz/_nrefs!=ref_frame_sz){
     return TH_EIMPL;
   }
-  ref_frame_data=_ogg_malloc(ref_frame_data_sz);
+  ref_frame_data=oc_aligned_malloc(ref_frame_data_sz,16);
   frag_buf_offs=_state->frag_buf_offs=
    _ogg_malloc(_state->nfrags*sizeof(*frag_buf_offs));
   if(ref_frame_data==NULL||frag_buf_offs==NULL){
@@ -599,15 +605,15 @@
     memcpy(_state->ref_frame_bufs[rfi],_state->ref_frame_bufs[0],
      sizeof(_state->ref_frame_bufs[0]));
   }
+  _state->ref_frame_handle=ref_frame_data;
   /*Set up the data pointers for the image buffers.*/
   for(rfi=0;rfi<_nrefs;rfi++){
-    _state->ref_frame_data[rfi]=ref_frame_data;
     _state->ref_frame_bufs[rfi][0].data=ref_frame_data+yoffset;
-    ref_frame_data+=yplane_sz;
+    ref_frame_data+=yplane_sz+align;
     _state->ref_frame_bufs[rfi][1].data=ref_frame_data+coffset;
     ref_frame_data+=cplane_sz;
     _state->ref_frame_bufs[rfi][2].data=ref_frame_data+coffset;
-    ref_frame_data+=cplane_sz;
+    ref_frame_data+=cplane_sz+(16-align);
     /*Flip the buffer upside down.
       This allows us to decode Theora's bottom-up frames in their natural
        order, yet return a top-down buffer with a positive stride to the user.*/
@@ -617,7 +623,7 @@
   _state->ref_ystride[0]=-yhstride;
   _state->ref_ystride[1]=_state->ref_ystride[2]=-chstride;
   /*Initialize the fragment buffer offsets.*/
-  ref_frame_data=_state->ref_frame_data[0];
+  ref_frame_data=_state->ref_frame_bufs[0][0].data;
   fragi=0;
   for(pli=0;pli<3;pli++){
     th_img_plane      *iplane;
@@ -643,19 +649,25 @@
       vpix+=stride<<3;
     }
   }
-  /*Initialize the reference frame indices.*/
+  /*Initialize the reference frame pointers and indices.*/
   _state->ref_frame_idx[OC_FRAME_GOLD]=
    _state->ref_frame_idx[OC_FRAME_PREV]=
    _state->ref_frame_idx[OC_FRAME_GOLD_ORIG]=
    _state->ref_frame_idx[OC_FRAME_PREV_ORIG]=
    _state->ref_frame_idx[OC_FRAME_SELF]=
    _state->ref_frame_idx[OC_FRAME_IO]=-1;
+  _state->ref_frame_data[OC_FRAME_GOLD]=
+   _state->ref_frame_data[OC_FRAME_PREV]=
+   _state->ref_frame_data[OC_FRAME_GOLD_ORIG]=
+   _state->ref_frame_data[OC_FRAME_PREV_ORIG]=
+   _state->ref_frame_data[OC_FRAME_SELF]=
+   _state->ref_frame_data[OC_FRAME_IO]=NULL;
   return 0;
 }
 
 static void oc_state_ref_bufs_clear(oc_theora_state *_state){
   _ogg_free(_state->frag_buf_offs);
-  _ogg_free(_state->ref_frame_data[0]);
+  oc_aligned_free(_state->ref_frame_handle);
 }
 
 
@@ -963,12 +975,12 @@
   frag_buf_off=_state->frag_buf_offs[_fragi];
   refi=_state->frags[_fragi].refi;
   ystride=_state->ref_ystride[_pli];
-  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
   if(refi==OC_FRAME_SELF)oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs+64);
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
-    ref=_state->ref_frame_data[_state->ref_frame_idx[refi]]+frag_buf_off;
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi])>1){
       oc_frag_recon_inter2(_state,

Modified: trunk/theora/lib/state.h
===================================================================
--- trunk/theora/lib/state.h	2010-10-24 04:50:46 UTC (rev 17562)
+++ trunk/theora/lib/state.h	2010-10-25 17:40:54 UTC (rev 17563)
@@ -427,12 +427,16 @@
   ptrdiff_t           ncoded_fragis[3];
   /*The total number of coded fragments.*/
   ptrdiff_t           ntotal_coded_fragis;
+  /*The actual buffers used for the reference frames.*/
+  th_ycbcr_buffer     ref_frame_bufs[6];
   /*The index of the buffers being used for each OC_FRAME_* reference frame.*/
   int                 ref_frame_idx[6];
-  /*The actual buffers used for the reference frames.*/
-  th_ycbcr_buffer     ref_frame_bufs[6];
-  /*The storage for the reference frame buffers.*/
+  /*The storage for the reference frame buffers.
+    This is just ref_frame_bufs[ref_frame_idx[i]][0].data, but is cached here
+     for faster look-up.*/
   unsigned char      *ref_frame_data[6];
+  /*The handle used to allocate the reference frame buffers.*/
+  unsigned char      *ref_frame_handle;
   /*The strides for each plane in the reference frames.*/
   int                 ref_ystride[3];
   /*The number of unique border patterns.*/

Modified: trunk/theora/lib/x86/mmxstate.c
===================================================================
--- trunk/theora/lib/x86/mmxstate.c	2010-10-24 04:50:46 UTC (rev 17562)
+++ trunk/theora/lib/x86/mmxstate.c	2010-10-25 17:40:54 UTC (rev 17563)
@@ -69,12 +69,12 @@
   frag_buf_off=_state->frag_buf_offs[_fragi];
   refi=_state->frags[_fragi].refi;
   ystride=_state->ref_ystride[_pli];
-  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
   if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
-    ref=_state->ref_frame_data[_state->ref_frame_idx[refi]]+frag_buf_off;
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi])>1){
       oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,

Modified: trunk/theora/lib/x86_vc/mmxstate.c
===================================================================
--- trunk/theora/lib/x86_vc/mmxstate.c	2010-10-24 04:50:46 UTC (rev 17562)
+++ trunk/theora/lib/x86_vc/mmxstate.c	2010-10-25 17:40:54 UTC (rev 17563)
@@ -80,12 +80,12 @@
   frag_buf_off=_state->frag_buf_offs[_fragi];
   refi=_state->frags[_fragi].refi;
   ystride=_state->ref_ystride[_pli];
-  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
   if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
-    ref=_state->ref_frame_data[_state->ref_frame_idx[refi]]+frag_buf_off;
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi])>1){
       oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,



More information about the commits mailing list