[xiph-commits] r17273 - in branches/theora-gumboot/lib: . x86

gumboot at svn.xiph.org gumboot at svn.xiph.org
Fri Jun 4 13:30:39 PDT 2010


Author: gumboot
Date: 2010-06-04 13:30:38 -0700 (Fri, 04 Jun 2010)
New Revision: 17273

Modified:
   branches/theora-gumboot/lib/decode.c
   branches/theora-gumboot/lib/x86/mmxfrag.h
   branches/theora-gumboot/lib/x86/mmxstate.c
Log:
Optimise bits and pieces.  Add 16x8 fragment copy/predict.
Also capture what appears to be a local performance maximum reached while removing cruft from oc_dec_frags_recon_mcu_plane().  Some obvious clean-ups remain (quite apart from the blindingly obvious one of not calling a quad function for individual fragments), but applying them makes the code slower.


Modified: branches/theora-gumboot/lib/decode.c
===================================================================
--- branches/theora-gumboot/lib/decode.c	2010-06-04 13:43:43 UTC (rev 17272)
+++ branches/theora-gumboot/lib/decode.c	2010-06-04 20:30:38 UTC (rev 17273)
@@ -1543,8 +1543,6 @@
   ti=_pipe->ti[_pli];
   eob_runs=_pipe->eob_runs[_pli];
 
-  assert(_fragp == _dec->state.frags + *_pipe->coded_fragis[_pli]++); /*XXX:DEBUG*/
-
   for(zzi=0;zzi<64;zzi++)dct_coeffs[zzi]=0;
   qti=_fragp->mb_mode!=OC_MODE_INTRA;
   ac_quant=_pipe->dequant[_pli][_fragp->qii][qti];
@@ -1674,14 +1672,12 @@
       int frag_buf_off;
       oc_mv cmv[4];
 
-      if ((bmask & 15) == 0)
-        continue;
+      if ((bmask&15)==0) continue;
 
       mask = bitraster[quadi][bmask&15];
 
       if (_dec->state.frame_type!=OC_INTRA_FRAME){
         mb_mode = _dec->state.raster_mb_modes[mbi+mbo[quadi]];
-        dc_quant = _pipe->dequant[_pli][0][mb_mode!=OC_MODE_INTRA][0];
         mb_mvs = _dec->state.raster_mb_mvs[mbi+mbo[quadi]];
         frag_buf_off = _dec->state.frag_buf_offs[fragip[quadi==3?2:0]];
 
@@ -1689,7 +1685,7 @@
         case TH_PF_444:
           if (mb_mode==OC_MODE_INTER_MV_FOUR)
             oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,mask,mb_mvs);
-          else
+          else if (mb_mode!=OC_MODE_INTRA)
             oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,mask,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
           break;
 
@@ -1706,12 +1702,11 @@
               cmv[2][1]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[2][1]+mb_mvs[3][1],1,1);
               oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,mask&5,cmv);
             }
-            else
+            else if (mb_mode!=OC_MODE_INTRA)
               oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,mask&5,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
           }
 
           mb_mode = _dec->state.raster_mb_modes[mbi+mbo[quadi]+1];
-          dc_quant = _pipe->dequant[_pli][0][mb_mode!=OC_MODE_INTRA][0];
           mb_mvs = _dec->state.raster_mb_mvs[mbi+mbo[quadi]+1];
 
           if (mask&10){
@@ -1722,7 +1717,7 @@
               cmv[3][1]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[2][1]+mb_mvs[3][1],1,1);
               oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,mask&10,cmv);
             }
-            else
+            else if (mb_mode!=OC_MODE_INTRA)
               oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,mask&10,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
           }
           break;
@@ -1736,26 +1731,24 @@
             if (mb_mode==OC_MODE_INTER_MV_FOUR){
               cmv[0][0]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[0][0]+mb_mvs[1][0]+mb_mvs[2][0]+mb_mvs[3][0],2,2);
               cmv[0][1]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[0][1]+mb_mvs[1][1]+mb_mvs[2][1]+mb_mvs[3][1],2,2);
-              oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,mask&1,cmv);
+              oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,1,cmv);
             }
-            else
-              oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,mask&1,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
+            else if (mb_mode!=OC_MODE_INTRA)
+              oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,1,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
 
           mb_mode = _dec->state.raster_mb_modes[mbi+mbo[quadi]+1];
-          dc_quant = _pipe->dequant[_pli][0][mb_mode!=OC_MODE_INTRA][0];
           mb_mvs = _dec->state.raster_mb_mvs[mbi+mbo[quadi]+1];
 
           if (mask&2)
             if (mb_mode==OC_MODE_INTER_MV_FOUR){
               cmv[1][0]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[0][0]+mb_mvs[1][0]+mb_mvs[2][0]+mb_mvs[3][0],2,2);
               cmv[1][1]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[0][1]+mb_mvs[1][1]+mb_mvs[2][1]+mb_mvs[3][1],2,2);
-              oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,mask&2,cmv);
+              oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,2,cmv);
             }
-            else
-              oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,mask&2,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
+            else if (mb_mode!=OC_MODE_INTRA)
+              oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,2,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
 
           mb_mode = _dec->state.raster_mb_modes[mbi+mbo[quadi]+nhmbs];
-          dc_quant = _pipe->dequant[_pli][0][mb_mode!=OC_MODE_INTRA][0];
           mb_mvs = _dec->state.raster_mb_mvs[mbi+mbo[quadi]+nhmbs];
 
           /* TODO: code the reference frame index and the motion vector into a
@@ -1766,23 +1759,22 @@
             if (mb_mode==OC_MODE_INTER_MV_FOUR){
               cmv[2][0]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[0][0]+mb_mvs[1][0]+mb_mvs[2][0]+mb_mvs[3][0],2,2);
               cmv[2][1]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[0][1]+mb_mvs[1][1]+mb_mvs[2][1]+mb_mvs[3][1],2,2);
-              oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,mask&4,cmv);
+              oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,4,cmv);
             }
-            else
-              oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,mask&4,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
+            else if (mb_mode!=OC_MODE_INTRA)
+              oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,4,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
 
           mb_mode = _dec->state.raster_mb_modes[mbi+mbo[quadi]+nhmbs+1];
-          dc_quant = _pipe->dequant[_pli][0][mb_mode!=OC_MODE_INTRA][0];
           mb_mvs = _dec->state.raster_mb_mvs[mbi+mbo[quadi]+nhmbs+1];
 
           if (mask&8)
             if (mb_mode==OC_MODE_INTER_MV_FOUR){
               cmv[3][0]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[0][0]+mb_mvs[1][0]+mb_mvs[2][0]+mb_mvs[3][0],2,2);
               cmv[3][1]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[0][1]+mb_mvs[1][1]+mb_mvs[2][1]+mb_mvs[3][1],2,2);
-              oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,mask&8,cmv);
+              oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,8,cmv);
             }
-            else
-              oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,mask&8,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
+            else if (mb_mode!=OC_MODE_INTRA)
+              oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,8,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
           break;
         }
       }

Modified: branches/theora-gumboot/lib/x86/mmxfrag.h
===================================================================
--- branches/theora-gumboot/lib/x86/mmxfrag.h	2010-06-04 13:43:43 UTC (rev 17272)
+++ branches/theora-gumboot/lib/x86/mmxfrag.h	2010-06-04 20:30:38 UTC (rev 17273)
@@ -9,11 +9,11 @@
    between rows.*/
 #define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
   do{ \
-    const unsigned char *src; \
-    unsigned char       *dst; \
+    const unsigned char *cpysrc; \
+    unsigned char       *cpydst; \
     ptrdiff_t            ystride3; \
-    src=(_src); \
-    dst=(_dst); \
+    cpysrc=(_src); \
+    cpydst=(_dst); \
     __asm__ __volatile__( \
       /*src+0*ystride*/ \
       "movq (%[src]),%%mm0\n\t" \
@@ -53,12 +53,67 @@
       "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
       /*dst+3*ystride*/ \
       "movq %%mm3,(%[dst],%[ystride3])\n\t" \
-      :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
+      :[dst]"+r"(cpydst),[src]"+r"(cpysrc),[ystride3]"=&r"(ystride3) \
       :[ystride]"r"((ptrdiff_t)(_ystride)) \
       :"memory" \
     ); \
   } \
   while(0)
 
+/*Copies a 16x8 block of pixels (8 rows of 16 bytes) from _src to _dst, assuming _ystride bytes between rows.
+   Loads use movdqu and stores use movdqa, so every _dst row must be 16-byte aligned; _src need not be.*/
+#define OC_FRAGX2_COPY_SSE2(_dst,_src,_ystride) \
+  do{ \
+    const unsigned char *cpysrc; \
+    unsigned char       *cpydst; \
+    ptrdiff_t            ystride3; \
+    cpysrc=(_src); \
+    cpydst=(_dst); \
+    __asm__ __volatile__( \
+      /*Load row 0: src+0*ystride*/ \
+      "movdqu (%[src]),%%xmm0\n\t" \
+      /*Load row 1: src+1*ystride*/ \
+      "movdqu (%[src],%[ystride]),%%xmm1\n\t" \
+      /*ystride3=ystride*3 (x86 addressing has no *3 scale, so keep it in a register)*/ \
+      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+      /*Load row 2: src+2*ystride*/ \
+      "movdqu (%[src],%[ystride],2),%%xmm2\n\t" \
+      /*Load row 3: src+3*ystride*/ \
+      "movdqu (%[src],%[ystride3]),%%xmm3\n\t" \
+      /*Store row 0: dst+0*ystride*/ \
+      "movdqa %%xmm0,(%[dst])\n\t" \
+      /*Store row 1: dst+1*ystride*/ \
+      "movdqa %%xmm1,(%[dst],%[ystride])\n\t" \
+      /*Advance src by 4 rows; rows 0-3 are already held in xmm0-xmm3.*/ \
+      "lea (%[src],%[ystride],4),%[src]\n\t" \
+      /*Store row 2: dst+2*ystride*/ \
+      "movdqa %%xmm2,(%[dst],%[ystride],2)\n\t" \
+      /*Store row 3: dst+3*ystride*/ \
+      "movdqa %%xmm3,(%[dst],%[ystride3])\n\t" \
+      /*Advance dst by 4 rows now that rows 0-3 have been written.*/ \
+      "lea (%[dst],%[ystride],4),%[dst]\n\t" \
+      /*Load row 4: (advanced) src+0*ystride*/ \
+      "movdqu (%[src]),%%xmm0\n\t" \
+      /*Load row 5: (advanced) src+1*ystride*/ \
+      "movdqu (%[src],%[ystride]),%%xmm1\n\t" \
+      /*Load row 6: (advanced) src+2*ystride*/ \
+      "movdqu (%[src],%[ystride],2),%%xmm2\n\t" \
+      /*Load row 7: (advanced) src+3*ystride*/ \
+      "movdqu (%[src],%[ystride3]),%%xmm3\n\t" \
+      /*Store row 4: (advanced) dst+0*ystride*/ \
+      "movdqa %%xmm0,(%[dst])\n\t" \
+      /*Store row 5: (advanced) dst+1*ystride*/ \
+      "movdqa %%xmm1,(%[dst],%[ystride])\n\t" \
+      /*Store row 6: (advanced) dst+2*ystride*/ \
+      "movdqa %%xmm2,(%[dst],%[ystride],2)\n\t" \
+      /*Store row 7: (advanced) dst+3*ystride*/ \
+      "movdqa %%xmm3,(%[dst],%[ystride3])\n\t" \
+      :[dst]"+r"(cpydst),[src]"+r"(cpysrc),[ystride3]"=&r"(ystride3) \
+      :[ystride]"r"((ptrdiff_t)(_ystride)) \
+      :"memory" \
+    ); \
+  } \
+  while(0)
+
 # endif
 #endif

Modified: branches/theora-gumboot/lib/x86/mmxstate.c
===================================================================
--- branches/theora-gumboot/lib/x86/mmxstate.c	2010-06-04 13:43:43 UTC (rev 17272)
+++ branches/theora-gumboot/lib/x86/mmxstate.c	2010-06-04 20:30:38 UTC (rev 17273)
@@ -149,6 +149,8 @@
 
 void oc_state_quad_predict_mmx(const oc_theora_state *_state,ptrdiff_t _frag_buf_off,
  int _pli,int _mask,int _ref_frame, oc_mv _mv){
+  const unsigned char *ref;
+  int                  mvoffsets[2];
   unsigned char *dst;
   int            ystride;
   int            nhfrags;
@@ -158,69 +160,61 @@
   dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+_frag_buf_off;
 
   /*Fill in the target buffer.*/
-  if(_ref_frame!=OC_FRAME_SELF){
-    const unsigned char *ref;
-    int                  mvoffsets[2];
-    ref=
-     _state->ref_frame_data[_state->ref_frame_idx[_ref_frame]]
-     +_frag_buf_off;
-    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,_mv[0],_mv[1])>1){
-      switch(_mask&3){
-      case 3:
-        oc_int_fragx2_copy2_sse2(dst,ystride,ref+mvoffsets[0],ref+mvoffsets[1],ystride);
-        break;
-      case 1:
-        oc_frag_recon_inter2_mmx(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],
-         ystride,zeroes);
-        break;
-      case 2:
-        oc_frag_recon_inter2_mmx(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],
-         ystride,zeroes);
-      }
-      dst+=ystride*8;
-      ref+=ystride*8;
-      switch(_mask>>2){
-      case 3:
-        oc_int_fragx2_copy2_sse2(dst,ystride,ref+mvoffsets[0],ref+mvoffsets[1],ystride);
-        break;
-      case 1:
-        oc_frag_recon_inter2_mmx(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],
-         ystride,zeroes);
-        break;
-      case 2:
-        oc_frag_recon_inter2_mmx(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],
-         ystride,zeroes);
-      }
+  ref=_state->ref_frame_data[_state->ref_frame_idx[_ref_frame]]+_frag_buf_off;
+  if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,_mv[0],_mv[1])>1){
+    switch(_mask&3){
+    case 3:
+      oc_int_fragx2_copy2_sse2(dst,ystride,ref+mvoffsets[0],ref+mvoffsets[1],ystride);
+      break;
+    case 1:
+      oc_frag_recon_inter2_mmx(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],
+       ystride,zeroes);
+      break;
+    case 2:
+      oc_frag_recon_inter2_mmx(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],
+       ystride,zeroes);
     }
-    else{
-      switch(_mask&3){
-      case 3:
-        oc_frag_recon_inter_mmx(dst+0,ref+0+mvoffsets[0],ystride,zeroes);
-        oc_frag_recon_inter_mmx(dst+8,ref+8+mvoffsets[0],ystride,zeroes);
-        break;
-      case 1:
-        oc_frag_recon_inter_mmx(dst+0,ref+0+mvoffsets[0],ystride,zeroes);
-        break;
-      case 2:
-        oc_frag_recon_inter_mmx(dst+8,ref+8+mvoffsets[0],ystride,zeroes);
-        break;
-      }
-      dst+=ystride*8;
-      ref+=ystride*8;
-      switch(_mask>>2){
-      case 3:
-        oc_frag_recon_inter_mmx(dst+0,ref+0+mvoffsets[0],ystride,zeroes);
-        oc_frag_recon_inter_mmx(dst+8,ref+8+mvoffsets[0],ystride,zeroes);
-        break;
-      case 1:
-        oc_frag_recon_inter_mmx(dst+0,ref+0+mvoffsets[0],ystride,zeroes);
-        break;
-      case 2:
-        oc_frag_recon_inter_mmx(dst+8,ref+8+mvoffsets[0],ystride,zeroes);
-        break;
-      };
+    dst+=ystride*8;
+    ref+=ystride*8;
+    switch(_mask>>2){
+    case 3:
+      oc_int_fragx2_copy2_sse2(dst,ystride,ref+mvoffsets[0],ref+mvoffsets[1],ystride);
+      break;
+    case 1:
+      oc_frag_recon_inter2_mmx(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],
+       ystride,zeroes);
+      break;
+    case 2:
+      oc_frag_recon_inter2_mmx(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],
+       ystride,zeroes);
     }
   }
+  else{
+    switch(_mask&3){
+    case 3:
+      OC_FRAGX2_COPY_SSE2(dst,ref+mvoffsets[0],ystride);
+      break;
+    case 1:
+      OC_FRAG_COPY_MMX(dst+0,ref+0+mvoffsets[0],ystride);
+      break;
+    case 2:
+      OC_FRAG_COPY_MMX(dst+8,ref+8+mvoffsets[0],ystride);
+      break;
+    }
+    dst+=ystride*8;
+    ref+=ystride*8;
+    switch(_mask>>2){
+    case 3:
+      OC_FRAGX2_COPY_SSE2(dst,ref+mvoffsets[0],ystride);
+      break;
+    case 1:
+      OC_FRAG_COPY_MMX(dst+0,ref+0+mvoffsets[0],ystride);
+      break;
+    case 2:
+      OC_FRAG_COPY_MMX(dst+8,ref+8+mvoffsets[0],ystride);
+      break;
+    };
+  }
 }
 
 void oc_state_4mv_predict_mmx(const oc_theora_state *_state,ptrdiff_t _frag_buf_off,
@@ -242,14 +236,14 @@
       oc_frag_recon_inter2_mmx(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],
        ystride,zeroes);
     }
-    else oc_frag_recon_inter_mmx(dst+0,ref+0+mvoffsets[0],ystride,zeroes);
+    else OC_FRAG_COPY_MMX(dst+0,ref+0+mvoffsets[0],ystride);
   }
   if (_mask & 2){
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,_mvs[1][0],_mvs[1][1])>1){
       oc_frag_recon_inter2_mmx(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],
        ystride,zeroes);
     }
-    else oc_frag_recon_inter_mmx(dst+8,ref+8+mvoffsets[0],ystride,zeroes);
+    else OC_FRAG_COPY_MMX(dst+8,ref+8+mvoffsets[0],ystride);
   }
   dst+=ystride*8;
   ref+=ystride*8;
@@ -258,14 +252,14 @@
       oc_frag_recon_inter2_mmx(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],
        ystride,zeroes);
     }
-    else oc_frag_recon_inter_mmx(dst+0,ref+0+mvoffsets[0],ystride,zeroes);
+    else OC_FRAG_COPY_MMX(dst+0,ref+0+mvoffsets[0],ystride);
   }
   if (_mask & 8){
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,_mvs[3][0],_mvs[3][1])>1){
       oc_frag_recon_inter2_mmx(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],
        ystride,zeroes);
     }
-    else oc_frag_recon_inter_mmx(dst+8,ref+8+mvoffsets[0],ystride,zeroes);
+    else OC_FRAG_COPY_MMX(dst+8,ref+8+mvoffsets[0],ystride);
   }
 }
 



More information about the commits mailing list