[xiph-commits] r17273 - in branches/theora-gumboot/lib: . x86
gumboot at svn.xiph.org
gumboot at svn.xiph.org
Fri Jun 4 13:30:39 PDT 2010
Author: gumboot
Date: 2010-06-04 13:30:38 -0700 (Fri, 04 Jun 2010)
New Revision: 17273
Modified:
branches/theora-gumboot/lib/decode.c
branches/theora-gumboot/lib/x86/mmxfrag.h
branches/theora-gumboot/lib/x86/mmxstate.c
Log:
Optimise bits and pieces. Add 16x8 fragment copy/predict.
Also capture this apparent local maximum in performance, reached while removing cruft from oc_dec_frags_recon_mcu_plane(). There are still obvious things left to clean up (quite apart from the blindingly obvious work of not calling a quad function for individual fragments), but when they are cleaned up the code slows down.
Modified: branches/theora-gumboot/lib/decode.c
===================================================================
--- branches/theora-gumboot/lib/decode.c 2010-06-04 13:43:43 UTC (rev 17272)
+++ branches/theora-gumboot/lib/decode.c 2010-06-04 20:30:38 UTC (rev 17273)
@@ -1543,8 +1543,6 @@
ti=_pipe->ti[_pli];
eob_runs=_pipe->eob_runs[_pli];
- assert(_fragp == _dec->state.frags + *_pipe->coded_fragis[_pli]++); /*XXX:DEBUG*/
-
for(zzi=0;zzi<64;zzi++)dct_coeffs[zzi]=0;
qti=_fragp->mb_mode!=OC_MODE_INTRA;
ac_quant=_pipe->dequant[_pli][_fragp->qii][qti];
@@ -1674,14 +1672,12 @@
int frag_buf_off;
oc_mv cmv[4];
- if ((bmask & 15) == 0)
- continue;
+ if ((bmask&15)==0) continue;
mask = bitraster[quadi][bmask&15];
if (_dec->state.frame_type!=OC_INTRA_FRAME){
mb_mode = _dec->state.raster_mb_modes[mbi+mbo[quadi]];
- dc_quant = _pipe->dequant[_pli][0][mb_mode!=OC_MODE_INTRA][0];
mb_mvs = _dec->state.raster_mb_mvs[mbi+mbo[quadi]];
frag_buf_off = _dec->state.frag_buf_offs[fragip[quadi==3?2:0]];
@@ -1689,7 +1685,7 @@
case TH_PF_444:
if (mb_mode==OC_MODE_INTER_MV_FOUR)
oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,mask,mb_mvs);
- else
+ else if (mb_mode!=OC_MODE_INTRA)
oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,mask,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
break;
@@ -1706,12 +1702,11 @@
cmv[2][1]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[2][1]+mb_mvs[3][1],1,1);
oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,mask&5,cmv);
}
- else
+ else if (mb_mode!=OC_MODE_INTRA)
oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,mask&5,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
}
mb_mode = _dec->state.raster_mb_modes[mbi+mbo[quadi]+1];
- dc_quant = _pipe->dequant[_pli][0][mb_mode!=OC_MODE_INTRA][0];
mb_mvs = _dec->state.raster_mb_mvs[mbi+mbo[quadi]+1];
if (mask&10){
@@ -1722,7 +1717,7 @@
cmv[3][1]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[2][1]+mb_mvs[3][1],1,1);
oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,mask&10,cmv);
}
- else
+ else if (mb_mode!=OC_MODE_INTRA)
oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,mask&10,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
}
break;
@@ -1736,26 +1731,24 @@
if (mb_mode==OC_MODE_INTER_MV_FOUR){
cmv[0][0]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[0][0]+mb_mvs[1][0]+mb_mvs[2][0]+mb_mvs[3][0],2,2);
cmv[0][1]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[0][1]+mb_mvs[1][1]+mb_mvs[2][1]+mb_mvs[3][1],2,2);
- oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,mask&1,cmv);
+ oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,1,cmv);
}
- else
- oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,mask&1,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
+ else if (mb_mode!=OC_MODE_INTRA)
+ oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,1,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
mb_mode = _dec->state.raster_mb_modes[mbi+mbo[quadi]+1];
- dc_quant = _pipe->dequant[_pli][0][mb_mode!=OC_MODE_INTRA][0];
mb_mvs = _dec->state.raster_mb_mvs[mbi+mbo[quadi]+1];
if (mask&2)
if (mb_mode==OC_MODE_INTER_MV_FOUR){
cmv[1][0]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[0][0]+mb_mvs[1][0]+mb_mvs[2][0]+mb_mvs[3][0],2,2);
cmv[1][1]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[0][1]+mb_mvs[1][1]+mb_mvs[2][1]+mb_mvs[3][1],2,2);
- oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,mask&2,cmv);
+ oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,2,cmv);
}
- else
- oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,mask&2,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
+ else if (mb_mode!=OC_MODE_INTRA)
+ oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,2,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
mb_mode = _dec->state.raster_mb_modes[mbi+mbo[quadi]+nhmbs];
- dc_quant = _pipe->dequant[_pli][0][mb_mode!=OC_MODE_INTRA][0];
mb_mvs = _dec->state.raster_mb_mvs[mbi+mbo[quadi]+nhmbs];
/* TODO: code the reference frame index and the motion vector into a
@@ -1766,23 +1759,22 @@
if (mb_mode==OC_MODE_INTER_MV_FOUR){
cmv[2][0]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[0][0]+mb_mvs[1][0]+mb_mvs[2][0]+mb_mvs[3][0],2,2);
cmv[2][1]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[0][1]+mb_mvs[1][1]+mb_mvs[2][1]+mb_mvs[3][1],2,2);
- oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,mask&4,cmv);
+ oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,4,cmv);
}
- else
- oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,mask&4,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
+ else if (mb_mode!=OC_MODE_INTRA)
+ oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,4,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
mb_mode = _dec->state.raster_mb_modes[mbi+mbo[quadi]+nhmbs+1];
- dc_quant = _pipe->dequant[_pli][0][mb_mode!=OC_MODE_INTRA][0];
mb_mvs = _dec->state.raster_mb_mvs[mbi+mbo[quadi]+nhmbs+1];
if (mask&8)
if (mb_mode==OC_MODE_INTER_MV_FOUR){
cmv[3][0]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[0][0]+mb_mvs[1][0]+mb_mvs[2][0]+mb_mvs[3][0],2,2);
cmv[3][1]=(signed char)OC_DIV_ROUND_POW2(mb_mvs[0][1]+mb_mvs[1][1]+mb_mvs[2][1]+mb_mvs[3][1],2,2);
- oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,mask&8,cmv);
+ oc_state_4mv_predict(&_dec->state,frag_buf_off,_pli,8,cmv);
}
- else
- oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,mask&8,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
+ else if (mb_mode!=OC_MODE_INTRA)
+ oc_state_quad_predict(&_dec->state,frag_buf_off,_pli,8,OC_FRAME_FOR_MODE(mb_mode),mb_mvs[0]);
break;
}
}
Modified: branches/theora-gumboot/lib/x86/mmxfrag.h
===================================================================
--- branches/theora-gumboot/lib/x86/mmxfrag.h 2010-06-04 13:43:43 UTC (rev 17272)
+++ branches/theora-gumboot/lib/x86/mmxfrag.h 2010-06-04 20:30:38 UTC (rev 17273)
@@ -9,11 +9,11 @@
between rows.*/
#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
do{ \
- const unsigned char *src; \
- unsigned char *dst; \
+ const unsigned char *cpysrc; \
+ unsigned char *cpydst; \
ptrdiff_t ystride3; \
- src=(_src); \
- dst=(_dst); \
+ cpysrc=(_src); \
+ cpydst=(_dst); \
__asm__ __volatile__( \
/*src+0*ystride*/ \
"movq (%[src]),%%mm0\n\t" \
@@ -53,12 +53,67 @@
"movq %%mm2,(%[dst],%[ystride],2)\n\t" \
/*dst+3*ystride*/ \
"movq %%mm3,(%[dst],%[ystride3])\n\t" \
- :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
+ :[dst]"+r"(cpydst),[src]"+r"(cpysrc),[ystride3]"=&r"(ystride3) \
:[ystride]"r"((ptrdiff_t)(_ystride)) \
:"memory" \
); \
} \
while(0)
+/*Copies a 16x8 block of pixels from _src to _dst, assuming _ystride bytes
+ between rows.*/
+#define OC_FRAGX2_COPY_SSE2(_dst,_src,_ystride) \
+ do{ \
+ const unsigned char *cpysrc; \
+ unsigned char *cpydst; \
+ ptrdiff_t ystride3; \
+ cpysrc=(_src); \
+ cpydst=(_dst); \
+ __asm__ __volatile__( \
+ /*src+0*ystride*/ \
+ "movdqu (%[src]),%%xmm0\n\t" \
+ /*src+1*ystride*/ \
+ "movdqu (%[src],%[ystride]),%%xmm1\n\t" \
+ /*ystride3=ystride*3*/ \
+ "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+ /*src+2*ystride*/ \
+ "movdqu (%[src],%[ystride],2),%%xmm2\n\t" \
+ /*src+3*ystride*/ \
+ "movdqu (%[src],%[ystride3]),%%xmm3\n\t" \
+ /*dst+0*ystride*/ \
+ "movdqa %%xmm0,(%[dst])\n\t" \
+ /*dst+1*ystride*/ \
+ "movdqa %%xmm1,(%[dst],%[ystride])\n\t" \
+ /*Pointer to next 4.*/ \
+ "lea (%[src],%[ystride],4),%[src]\n\t" \
+ /*dst+2*ystride*/ \
+ "movdqa %%xmm2,(%[dst],%[ystride],2)\n\t" \
+ /*dst+3*ystride*/ \
+ "movdqa %%xmm3,(%[dst],%[ystride3])\n\t" \
+ /*Pointer to next 4.*/ \
+ "lea (%[dst],%[ystride],4),%[dst]\n\t" \
+ /*src+0*ystride*/ \
+ "movdqu (%[src]),%%xmm0\n\t" \
+ /*src+1*ystride*/ \
+ "movdqu (%[src],%[ystride]),%%xmm1\n\t" \
+ /*src+2*ystride*/ \
+ "movdqu (%[src],%[ystride],2),%%xmm2\n\t" \
+ /*src+3*ystride*/ \
+ "movdqu (%[src],%[ystride3]),%%xmm3\n\t" \
+ /*dst+0*ystride*/ \
+ "movdqa %%xmm0,(%[dst])\n\t" \
+ /*dst+1*ystride*/ \
+ "movdqa %%xmm1,(%[dst],%[ystride])\n\t" \
+ /*dst+2*ystride*/ \
+ "movdqa %%xmm2,(%[dst],%[ystride],2)\n\t" \
+ /*dst+3*ystride*/ \
+ "movdqa %%xmm3,(%[dst],%[ystride3])\n\t" \
+ :[dst]"+r"(cpydst),[src]"+r"(cpysrc),[ystride3]"=&r"(ystride3) \
+ :[ystride]"r"((ptrdiff_t)(_ystride)) \
+ :"memory" \
+ ); \
+ } \
+ while(0)
+
# endif
#endif
Modified: branches/theora-gumboot/lib/x86/mmxstate.c
===================================================================
--- branches/theora-gumboot/lib/x86/mmxstate.c 2010-06-04 13:43:43 UTC (rev 17272)
+++ branches/theora-gumboot/lib/x86/mmxstate.c 2010-06-04 20:30:38 UTC (rev 17273)
@@ -149,6 +149,8 @@
void oc_state_quad_predict_mmx(const oc_theora_state *_state,ptrdiff_t _frag_buf_off,
int _pli,int _mask,int _ref_frame, oc_mv _mv){
+ const unsigned char *ref;
+ int mvoffsets[2];
unsigned char *dst;
int ystride;
int nhfrags;
@@ -158,69 +160,61 @@
dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+_frag_buf_off;
/*Fill in the target buffer.*/
- if(_ref_frame!=OC_FRAME_SELF){
- const unsigned char *ref;
- int mvoffsets[2];
- ref=
- _state->ref_frame_data[_state->ref_frame_idx[_ref_frame]]
- +_frag_buf_off;
- if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,_mv[0],_mv[1])>1){
- switch(_mask&3){
- case 3:
- oc_int_fragx2_copy2_sse2(dst,ystride,ref+mvoffsets[0],ref+mvoffsets[1],ystride);
- break;
- case 1:
- oc_frag_recon_inter2_mmx(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],
- ystride,zeroes);
- break;
- case 2:
- oc_frag_recon_inter2_mmx(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],
- ystride,zeroes);
- }
- dst+=ystride*8;
- ref+=ystride*8;
- switch(_mask>>2){
- case 3:
- oc_int_fragx2_copy2_sse2(dst,ystride,ref+mvoffsets[0],ref+mvoffsets[1],ystride);
- break;
- case 1:
- oc_frag_recon_inter2_mmx(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],
- ystride,zeroes);
- break;
- case 2:
- oc_frag_recon_inter2_mmx(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],
- ystride,zeroes);
- }
+ ref=_state->ref_frame_data[_state->ref_frame_idx[_ref_frame]]+_frag_buf_off;
+ if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,_mv[0],_mv[1])>1){
+ switch(_mask&3){
+ case 3:
+ oc_int_fragx2_copy2_sse2(dst,ystride,ref+mvoffsets[0],ref+mvoffsets[1],ystride);
+ break;
+ case 1:
+ oc_frag_recon_inter2_mmx(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],
+ ystride,zeroes);
+ break;
+ case 2:
+ oc_frag_recon_inter2_mmx(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],
+ ystride,zeroes);
}
- else{
- switch(_mask&3){
- case 3:
- oc_frag_recon_inter_mmx(dst+0,ref+0+mvoffsets[0],ystride,zeroes);
- oc_frag_recon_inter_mmx(dst+8,ref+8+mvoffsets[0],ystride,zeroes);
- break;
- case 1:
- oc_frag_recon_inter_mmx(dst+0,ref+0+mvoffsets[0],ystride,zeroes);
- break;
- case 2:
- oc_frag_recon_inter_mmx(dst+8,ref+8+mvoffsets[0],ystride,zeroes);
- break;
- }
- dst+=ystride*8;
- ref+=ystride*8;
- switch(_mask>>2){
- case 3:
- oc_frag_recon_inter_mmx(dst+0,ref+0+mvoffsets[0],ystride,zeroes);
- oc_frag_recon_inter_mmx(dst+8,ref+8+mvoffsets[0],ystride,zeroes);
- break;
- case 1:
- oc_frag_recon_inter_mmx(dst+0,ref+0+mvoffsets[0],ystride,zeroes);
- break;
- case 2:
- oc_frag_recon_inter_mmx(dst+8,ref+8+mvoffsets[0],ystride,zeroes);
- break;
- };
+ dst+=ystride*8;
+ ref+=ystride*8;
+ switch(_mask>>2){
+ case 3:
+ oc_int_fragx2_copy2_sse2(dst,ystride,ref+mvoffsets[0],ref+mvoffsets[1],ystride);
+ break;
+ case 1:
+ oc_frag_recon_inter2_mmx(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],
+ ystride,zeroes);
+ break;
+ case 2:
+ oc_frag_recon_inter2_mmx(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],
+ ystride,zeroes);
}
}
+ else{
+ switch(_mask&3){
+ case 3:
+ OC_FRAGX2_COPY_SSE2(dst,ref+mvoffsets[0],ystride);
+ break;
+ case 1:
+ OC_FRAG_COPY_MMX(dst+0,ref+0+mvoffsets[0],ystride);
+ break;
+ case 2:
+ OC_FRAG_COPY_MMX(dst+8,ref+8+mvoffsets[0],ystride);
+ break;
+ }
+ dst+=ystride*8;
+ ref+=ystride*8;
+ switch(_mask>>2){
+ case 3:
+ OC_FRAGX2_COPY_SSE2(dst,ref+mvoffsets[0],ystride);
+ break;
+ case 1:
+ OC_FRAG_COPY_MMX(dst+0,ref+0+mvoffsets[0],ystride);
+ break;
+ case 2:
+ OC_FRAG_COPY_MMX(dst+8,ref+8+mvoffsets[0],ystride);
+ break;
+ };
+ }
}
void oc_state_4mv_predict_mmx(const oc_theora_state *_state,ptrdiff_t _frag_buf_off,
@@ -242,14 +236,14 @@
oc_frag_recon_inter2_mmx(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],
ystride,zeroes);
}
- else oc_frag_recon_inter_mmx(dst+0,ref+0+mvoffsets[0],ystride,zeroes);
+ else OC_FRAG_COPY_MMX(dst+0,ref+0+mvoffsets[0],ystride);
}
if (_mask & 2){
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,_mvs[1][0],_mvs[1][1])>1){
oc_frag_recon_inter2_mmx(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],
ystride,zeroes);
}
- else oc_frag_recon_inter_mmx(dst+8,ref+8+mvoffsets[0],ystride,zeroes);
+ else OC_FRAG_COPY_MMX(dst+8,ref+8+mvoffsets[0],ystride);
}
dst+=ystride*8;
ref+=ystride*8;
@@ -258,14 +252,14 @@
oc_frag_recon_inter2_mmx(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],
ystride,zeroes);
}
- else oc_frag_recon_inter_mmx(dst+0,ref+0+mvoffsets[0],ystride,zeroes);
+ else OC_FRAG_COPY_MMX(dst+0,ref+0+mvoffsets[0],ystride);
}
if (_mask & 8){
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,_mvs[3][0],_mvs[3][1])>1){
oc_frag_recon_inter2_mmx(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],
ystride,zeroes);
}
- else oc_frag_recon_inter_mmx(dst+8,ref+8+mvoffsets[0],ystride,zeroes);
+ else OC_FRAG_COPY_MMX(dst+8,ref+8+mvoffsets[0],ystride);
}
}
More information about the commits
mailing list