[xiph-commits] r17274 - in branches/theora-gumboot/lib: . x86

gumboot at svn.xiph.org gumboot at svn.xiph.org
Fri Jun 4 18:58:19 PDT 2010


Author: gumboot
Date: 2010-06-04 18:58:19 -0700 (Fri, 04 Jun 2010)
New Revision: 17274

Modified:
   branches/theora-gumboot/lib/decode.c
   branches/theora-gumboot/lib/x86/mmxstate.c
Log:
Get GCC's derranged handling of dct_coeffs out of the way using inline assembly.
Put in the code I thought should make things faster.
Fix a bug that would have mis-handled an all-zero intra block.


Modified: branches/theora-gumboot/lib/decode.c
===================================================================
--- branches/theora-gumboot/lib/decode.c	2010-06-04 20:30:38 UTC (rev 17273)
+++ branches/theora-gumboot/lib/decode.c	2010-06-05 01:58:19 UTC (rev 17274)
@@ -1528,8 +1528,12 @@
    (fragy_end-fragy0)*(ptrdiff_t)nhfrags-ncoded_fragis;
 }
 
-static int oc_dec_get_dct_coeffs(ogg_int16_t dct_coeffs[65],
- oc_dec_ctx *_dec,oc_dec_pipeline_state *_pipe,int _pli, const oc_fragment *_fragp){
+static void oc_dec_get_dct_coeffs(oc_dec_ctx *_dec,oc_dec_pipeline_state *_pipe,
+ int _pli, const oc_fragment *_fragp,ptrdiff_t _frag_buf_off){
+  /*This array is made one element larger because the zig-zag index array
+     uses the final element as a dumping ground for out-of-range indices
+     to protect us from buffer overflow.*/
+  OC_ALIGN16(ogg_int16_t dct_coeffs[64+1]);
   unsigned char       *dct_tokens;
   const unsigned char *dct_fzig_zag;
   ptrdiff_t           *ti;
@@ -1543,7 +1547,24 @@
   ti=_pipe->ti[_pli];
   eob_runs=_pipe->eob_runs[_pli];
 
+#if 0
   for(zzi=0;zzi<64;zzi++)dct_coeffs[zzi]=0;
+#else
+      __asm__ __volatile__(
+        "pxor %%xmm0,%%xmm0\n\t"
+        "movdqa %%xmm0,(%[y])\n\t"
+        "movdqa %%xmm0,16(%[y])\n\t"
+        "movdqa %%xmm0,32(%[y])\n\t"
+        "movdqa %%xmm0,48(%[y])\n\t"
+        "movdqa %%xmm0,64(%[y])\n\t"
+        "movdqa %%xmm0,80(%[y])\n\t"
+        "movdqa %%xmm0,96(%[y])\n\t"
+        "movdqa %%xmm0,112(%[y])\n\t"
+        :
+        :[y]"r"(dct_coeffs)
+        :"memory"
+      );
+#endif
   qti=_fragp->mb_mode!=OC_MODE_INTRA;
   ac_quant=_pipe->dequant[_pli][_fragp->qii][qti];
   /*Decode the AC coefficients.*/
@@ -1590,7 +1611,7 @@
   dct_coeffs[0]=(ogg_int16_t)_fragp->dc;
   /*last_zzi is always initialized.
     If your compiler thinks otherwise, it is dumb.*/
-  return last_zzi;
+  oc_state_frag_residual(&_dec->state,_frag_buf_off,_pli,dct_coeffs,last_zzi,_pipe->dequant[_pli][0][qti][0],_fragp->mb_mode);
 }
 
 /*Reconstructs all coded fragments in a single MCU (one or two super block
@@ -1613,14 +1634,16 @@
   oc_fragment             *frags;
   ogg_uint16_t            *sb_masks;
   int                      nhmbs;
-  int                      mbi,
+  int                      mbsi,
                            mb_stepx,
                            mb_stepy;
   int                      sbi,
                            sb_end,
                            sb_newline;
   int                      pixel_fmt;
-  int                      mbo[4];
+  ptrdiff_t                mbo[4];
+  ptrdiff_t                fbo[4];
+  ptrdiff_t                fbsi;
 
   sb_masks = _dec->state.sb_masks;
   frags=_dec->state.frags;
@@ -1639,18 +1662,25 @@
   mbo[1]=mbo[0]+nhmbs*mb_stepy;
   mbo[2]=mbo[1]+mb_stepx;
   mbo[3]=mbo[0]+mb_stepx;
+  fbo[0]=_dec->state.frag_buf_offs[_dec->state.sb_maps[_dec->state.fplanes[_pli].sboffset][0][0]];
+  fbo[1]=_dec->state.frag_buf_offs[_dec->state.sb_maps[_dec->state.fplanes[_pli].sboffset][0][4]];
+  fbo[2]=_dec->state.frag_buf_offs[_dec->state.sb_maps[_dec->state.fplanes[_pli].sboffset][0][8]];
+  fbo[3]=_dec->state.frag_buf_offs[_dec->state.sb_maps[_dec->state.fplanes[_pli].sboffset][0][12+2]];
 
-  mbi=(_pipe->fragy0[_pli]>>2-mb_stepy)*nhmbs;
+  mbsi=(_pipe->fragy0[_pli]>>2-mb_stepy)*nhmbs;
+  fbsi=_pipe->fragy0[_pli]*8*_dec->state.ref_ystride[_pli];
 
-  for ( ; sbi < sb_end; sbi++,mbi+=1<<mb_stepx)
+  for ( ; sbi < sb_end; sbi++,mbsi+=1<<mb_stepx,fbsi+=32)
   {
     ptrdiff_t *fragip;
     ogg_uint16_t bmask;
     int quadi;
 
     if(sbi>=sb_newline){
-      mbi+=(nhmbs<<mb_stepy)-nhmbs;
+      mbsi+=(nhmbs<<mb_stepy)-nhmbs;
       sb_newline+=_dec->state.fplanes[_pli].nhsbs;
+      fbsi-=32*_dec->state.fplanes[_pli].nhsbs;
+      fbsi+=32*_dec->state.ref_ystride[_pli];
     }
 
     bmask = sb_masks[sbi];
@@ -1660,27 +1690,23 @@
 
     for (quadi = 0; quadi < 4; quadi++, bmask >>= 4, fragip += 4)
     {
-      /*This array is made one element larger because the zig-zag index array
-         uses the final element as a dumping ground for out-of-range indices
-         to protect us from buffer overflow.*/
-      OC_ALIGN8(ogg_int16_t dct_coeffs[4][64 + 8]);
       int bi;
-      int mask;
-      int mb_mode;
-      ogg_uint16_t dc_quant;
-      oc_mv *mb_mvs;
-      int frag_buf_off;
-      oc_mv cmv[4];
 
       if ((bmask&15)==0) continue;
 
-      mask = bitraster[quadi][bmask&15];
-
       if (_dec->state.frame_type!=OC_INTRA_FRAME){
-        mb_mode = _dec->state.raster_mb_modes[mbi+mbo[quadi]];
-        mb_mvs = _dec->state.raster_mb_mvs[mbi+mbo[quadi]];
-        frag_buf_off = _dec->state.frag_buf_offs[fragip[quadi==3?2:0]];
+        int mb_mode;
+        ptrdiff_t frag_buf_off;
+        int mask=bitraster[quadi][bmask&15];
+        int mbi=mbsi+mbo[quadi];
+        oc_mv *mb_mvs;
+        oc_mv cmv[4];
 
+        frag_buf_off = fbsi+fbo[quadi];
+
+        mb_mode = _dec->state.raster_mb_modes[mbi];
+        mb_mvs = _dec->state.raster_mb_mvs[mbi];
+
         switch (pixel_fmt){
         case TH_PF_444:
           if (mb_mode==OC_MODE_INTER_MV_FOUR)
@@ -1706,8 +1732,8 @@
               oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,mask&5,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
           }
 
-          mb_mode = _dec->state.raster_mb_modes[mbi+mbo[quadi]+1];
-          mb_mvs = _dec->state.raster_mb_mvs[mbi+mbo[quadi]+1];
+          mb_mode = _dec->state.raster_mb_modes[mbi+1];
+          mb_mvs = _dec->state.raster_mb_mvs[mbi+1];
 
           if (mask&10){
             if (mb_mode==OC_MODE_INTER_MV_FOUR){
@@ -1736,8 +1762,8 @@
             else if (mb_mode!=OC_MODE_INTRA)
               oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,1,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
 
-          mb_mode = _dec->state.raster_mb_modes[mbi+mbo[quadi]+1];
-          mb_mvs = _dec->state.raster_mb_mvs[mbi+mbo[quadi]+1];
+          mb_mode = _dec->state.raster_mb_modes[mbi+1];
+          mb_mvs = _dec->state.raster_mb_mvs[mbi+1];
 
           if (mask&2)
             if (mb_mode==OC_MODE_INTER_MV_FOUR){
@@ -1748,8 +1774,9 @@
             else if (mb_mode!=OC_MODE_INTRA)
               oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,2,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
 
-          mb_mode = _dec->state.raster_mb_modes[mbi+mbo[quadi]+nhmbs];
-          mb_mvs = _dec->state.raster_mb_mvs[mbi+mbo[quadi]+nhmbs];
+          mbi+=nhmbs;
+          mb_mode = _dec->state.raster_mb_modes[mbi];
+          mb_mvs = _dec->state.raster_mb_mvs[mbi];
 
           /* TODO: code the reference frame index and the motion vector into a
            * single word and then compare left and right copies -- if they're the
@@ -1764,8 +1791,8 @@
             else if (mb_mode!=OC_MODE_INTRA)
               oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,4,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
 
-          mb_mode = _dec->state.raster_mb_modes[mbi+mbo[quadi]+nhmbs+1];
-          mb_mvs = _dec->state.raster_mb_mvs[mbi+mbo[quadi]+nhmbs+1];
+          mb_mode = _dec->state.raster_mb_modes[mbi+1];
+          mb_mvs = _dec->state.raster_mb_mvs[mbi+1];
 
           if (mask&8)
             if (mb_mode==OC_MODE_INTER_MV_FOUR){
@@ -1782,15 +1809,12 @@
       for (bi = 0; bi < 4; bi++)
       {
         ptrdiff_t fragi;
-        int last_zzi;
+        int frag_buf_off;
         if ((bmask & (1 << bi)) == 0) continue;
         fragi = fragip[bi];
-        mb_mode=frags[fragi].mb_mode;
-        dc_quant = _pipe->dequant[_pli][0][mb_mode!=OC_MODE_INTRA][0];
         frag_buf_off = _dec->state.frag_buf_offs[fragi];
 
-        last_zzi = oc_dec_get_dct_coeffs(dct_coeffs[bi], _dec, _pipe, _pli, frags + fragi);
-        oc_state_frag_residual(&_dec->state,frag_buf_off,_pli,dct_coeffs[bi],last_zzi,dc_quant,mb_mode);
+        oc_dec_get_dct_coeffs(_dec, _pipe, _pli, frags + fragi,frag_buf_off);
       }
     }
   }

Modified: branches/theora-gumboot/lib/x86/mmxstate.c
===================================================================
--- branches/theora-gumboot/lib/x86/mmxstate.c	2010-06-04 20:30:38 UTC (rev 17273)
+++ branches/theora-gumboot/lib/x86/mmxstate.c	2010-06-05 01:58:19 UTC (rev 17274)
@@ -272,7 +272,7 @@
   if(_last_zzi<2){
     /*Note that this value must be unsigned, to keep the __asm__ block from
        sign-extending it when it puts it in a register.*/
-    if (_dct_coeffs[0]){
+    if (_mb_mode==OC_MODE_INTRA||_dct_coeffs[0]){
       ogg_uint16_t p;
       /*We round this dequant product (and not any of the others) because there's
          no iDCT rounding.*/



More information about the commits mailing list