[xiph-commits] r17755 - in trunk/theora/lib: . x86 x86_vc
tterribe at svn.xiph.org
Mon Dec 13 19:07:41 PST 2010
Author: tterribe
Date: 2010-12-13 19:07:41 -0800 (Mon, 13 Dec 2010)
New Revision: 17755
Modified:
trunk/theora/lib/analyze.c
trunk/theora/lib/encfrag.c
trunk/theora/lib/encint.h
trunk/theora/lib/mcenc.c
trunk/theora/lib/x86/mmxencfrag.c
trunk/theora/lib/x86/sse2encfrag.c
trunk/theora/lib/x86/x86enc.h
trunk/theora/lib/x86_vc/mmxencfrag.c
Log:
Make the SATD functions return the signed DC value instead of abs(dc).
Right now we still immediately compute abs(dc), but we will need the signed
value to do real DC costing.
This commit should not change the output of the encoder.
Modified: trunk/theora/lib/analyze.c
===================================================================
--- trunk/theora/lib/analyze.c 2010-12-14 01:53:19 UTC (rev 17754)
+++ trunk/theora/lib/analyze.c 2010-12-14 03:07:41 UTC (rev 17755)
@@ -750,16 +750,15 @@
#if defined(OC_COLLECT_METRICS)
{
unsigned satd;
- unsigned dc;
switch(nmv_offs){
case 0:satd=oc_enc_frag_intra_satd(_enc,&dc,src,ystride);break;
case 1:{
satd=oc_enc_frag_satd(_enc,&dc,src,ref+mv_offs[0],ystride);
- satd+=dc;
+ satd+=abs(dc);
}break;
default:{
satd=oc_enc_frag_satd(_enc,&dc,src,dst,ystride);
- satd+=dc;
+ satd+=abs(dc);
}break;
}
_enc->frag_satd[_fragi]=satd;
@@ -1139,14 +1138,14 @@
static unsigned oc_mb_activity(oc_enc_ctx *_enc,unsigned _mbi,
unsigned _activity[4]){
- const unsigned char *src;
- const ptrdiff_t *frag_buf_offs;
- const ptrdiff_t *sb_map;
- unsigned luma;
- int ystride;
- ptrdiff_t frag_offs;
- ptrdiff_t fragi;
- int bi;
+ const unsigned char *src;
+ const ptrdiff_t *frag_buf_offs;
+ const ptrdiff_t *sb_map;
+ unsigned luma;
+ int ystride;
+ ptrdiff_t frag_offs;
+ ptrdiff_t fragi;
+ int bi;
frag_buf_offs=_enc->state.frag_buf_offs;
sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
src=_enc->state.ref_frame_data[OC_FRAME_IO];
@@ -1224,7 +1223,7 @@
return luma;
}
-static unsigned oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi,
+static void oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi,
unsigned _activity[4],const unsigned _intra_satd[12]){
int bi;
for(bi=0;bi<4;bi++){
@@ -1236,9 +1235,6 @@
}
_activity[bi]=act;
}
- /*TODO: Once frag_intra_satd returns the signed DC value instead
- of the absolute value, this should pass it through.*/
- return 1;
}
/*Compute the masking scales for the blocks in a macro block.
@@ -1348,7 +1344,7 @@
return activity_sum;
}
-static void oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
+static int oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
unsigned _frag_satd[12]){
const unsigned char *src;
const ptrdiff_t *frag_buf_offs;
@@ -1363,15 +1359,18 @@
int bi;
ptrdiff_t fragi;
ptrdiff_t frag_offs;
- unsigned dc;
+ unsigned luma;
+ int dc;
frag_buf_offs=_enc->state.frag_buf_offs;
sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
src=_enc->state.ref_frame_data[OC_FRAME_IO];
ystride=_enc->state.ref_ystride[0];
+ luma=0;
for(bi=0;bi<4;bi++){
fragi=sb_map[bi];
frag_offs=frag_buf_offs[fragi];
_frag_satd[bi]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+ luma+=dc;
}
mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
@@ -1386,6 +1385,7 @@
frag_offs=frag_buf_offs[fragi];
_frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
}
+ return luma;
}
/*Select luma block-level quantizers for a MB in an INTRA frame.*/
@@ -1403,7 +1403,7 @@
unsigned rate[4][3];
int prev[3][3];
unsigned satd;
- unsigned dc;
+ int dc;
unsigned best_cost;
unsigned best_ssd;
unsigned best_rate;
@@ -1497,7 +1497,7 @@
oc_qii_state qt[3];
unsigned cost[3];
unsigned satd;
- unsigned dc;
+ int dc;
unsigned best_cost;
int best_qii;
int qii;
@@ -1682,8 +1682,8 @@
}
else{
unsigned intra_satd[12];
- oc_mb_intra_satd(_enc,mbi,intra_satd);
- luma=oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
+ luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
+ oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
for(bi=0;bi<4;bi++)frags[sb_maps[mbi>>2][mbi&3][bi]].qii=0;
}
activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
@@ -2051,7 +2051,7 @@
int bi;
ptrdiff_t fragi;
ptrdiff_t frag_offs;
- unsigned dc;
+ int dc;
src=_enc->state.ref_frame_data[OC_FRAME_IO];
ref=_enc->state.ref_frame_data[OC_FRAME_FOR_MODE(_mb_mode)];
ystride=_enc->state.ref_ystride[0];
@@ -2064,7 +2064,7 @@
frag_offs=frag_buf_offs[fragi];
frag_satd[bi]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
- frag_satd[bi]+=dc;
+ frag_satd[bi]+=abs(dc);
}
}
else{
@@ -2073,7 +2073,7 @@
frag_offs=frag_buf_offs[fragi];
frag_satd[bi]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
ref+frag_offs+mv_offs[0],ystride);
- frag_satd[bi]+=dc;
+ frag_satd[bi]+=abs(dc);
}
}
mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
@@ -2090,7 +2090,7 @@
frag_offs=frag_buf_offs[fragi];
frag_satd[mapii]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
- frag_satd[mapii]+=dc;
+ frag_satd[mapii]+=abs(dc);
}
}
else{
@@ -2102,7 +2102,7 @@
frag_offs=frag_buf_offs[fragi];
frag_satd[mapii]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
ref+frag_offs+mv_offs[0],ystride);
- frag_satd[mapii]+=dc;
+ frag_satd[mapii]+=abs(dc);
}
}
oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,_rd_scale,1);
@@ -2162,7 +2162,7 @@
int bits0;
int bits1;
unsigned satd;
- unsigned dc;
+ int dc;
src=_enc->state.ref_frame_data[OC_FRAME_IO];
ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
ystride=_enc->state.ref_ystride[0];
@@ -2184,7 +2184,7 @@
satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
ref+frag_offs+mv_offs[0],ystride);
}
- frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd+dc;
+ frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd+abs(dc);
}
oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,
_enc->vp3_compatible?OC_NOSKIP:_skip_ssd,_rd_scale,1);
@@ -2222,7 +2222,7 @@
satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
ref+frag_offs+mv_offs[0],ystride);
}
- frag_satd[mapii]=satd+dc;
+ frag_satd[mapii]=satd+abs(dc);
}
oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
frag_satd,_skip_ssd,_rd_scale[4],1);
@@ -2335,14 +2335,12 @@
int bi;
ptrdiff_t fragi;
mbi=sbi<<2|quadi;
- oc_mb_intra_satd(_enc,mbi,intra_satd);
+ luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
/*Activity masking.*/
if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
- luma=oc_mb_activity(_enc,mbi,activity);
+ oc_mb_activity(_enc,mbi,activity);
}
- else{
- luma=oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
- }
+ else oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
luma_sum+=luma;
activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
chroma_rd_scale,activity,activity_avg,luma,luma_avg);
Modified: trunk/theora/lib/encfrag.c
===================================================================
--- trunk/theora/lib/encfrag.c 2010-12-14 01:53:19 UTC (rev 17754)
+++ trunk/theora/lib/encfrag.c 2010-12-14 03:07:41 UTC (rev 17755)
@@ -241,19 +241,19 @@
}
}
-unsigned oc_hadamard_sad(unsigned *_dc,const ogg_int16_t _buf[64]){
- unsigned sad;
- unsigned dc;
- int t0;
- int t1;
- int t2;
- int t3;
- int t4;
- int t5;
- int t6;
- int t7;
- int r;
- int i;
+unsigned oc_hadamard_sad(int *_dc,const ogg_int16_t _buf[64]){
+ unsigned sad;
+ int dc;
+ int t0;
+ int t1;
+ int t2;
+ int t3;
+ int t4;
+ int t5;
+ int t6;
+ int t7;
+ int r;
+ int i;
sad=dc=0;
for(i=0;i<8;i++){
/*Hadamard stage 1:*/
@@ -279,7 +279,7 @@
t5+=t7;
t7=r-t7;
/*Hadamard stage 3:*/
- r=abs(t0+t1);
+ r=abs(t0+t1)&-(i>0);
r+=abs(t0-t1);
r+=abs(t2+t3);
r+=abs(t2-t3);
@@ -289,26 +289,26 @@
r+=abs(t6-t7);
sad+=r;
}
- dc=abs(_buf[0]+_buf[1]+_buf[2]+_buf[3]+_buf[4]+_buf[5]+_buf[6]+_buf[7]);
+ dc=_buf[0]+_buf[1]+_buf[2]+_buf[3]+_buf[4]+_buf[5]+_buf[6]+_buf[7];
*_dc=dc;
- return sad-dc;
+ return sad;
}
-unsigned oc_enc_frag_satd_c(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd_c(int *_dc,const unsigned char *_src,
const unsigned char *_ref,int _ystride){
ogg_int16_t buf[64];
oc_diff_hadamard(buf,_src,_ref,_ystride);
return oc_hadamard_sad(_dc,buf);
}
-unsigned oc_enc_frag_satd2_c(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd2_c(int *_dc,const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
ogg_int16_t buf[64];
oc_diff_hadamard2(buf,_src,_ref1,_ref2,_ystride);
return oc_hadamard_sad(_dc,buf);
}
-unsigned oc_enc_frag_intra_satd_c(unsigned *_dc,
+unsigned oc_enc_frag_intra_satd_c(int *_dc,
const unsigned char *_src,int _ystride){
ogg_int16_t buf[64];
oc_intra_hadamard(buf,_src,_ystride);
Modified: trunk/theora/lib/encint.h
===================================================================
--- trunk/theora/lib/encint.h 2010-12-14 01:53:19 UTC (rev 17754)
+++ trunk/theora/lib/encint.h 2010-12-14 03:07:41 UTC (rev 17755)
@@ -292,12 +292,11 @@
unsigned (*frag_sad2_thresh)(const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
unsigned _thresh);
- unsigned (*frag_satd)(unsigned *_dc,const unsigned char *_src,
+ unsigned (*frag_satd)(int *_dc,const unsigned char *_src,
const unsigned char *_ref,int _ystride);
- unsigned (*frag_satd2)(unsigned *_dc,const unsigned char *_src,
+ unsigned (*frag_satd2)(int *_dc,const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
- unsigned (*frag_intra_satd)(unsigned *_dc,const unsigned char *_src,
- int _ystride);
+ unsigned (*frag_intra_satd)(int *_dc,const unsigned char *_src,int _ystride);
unsigned (*frag_ssd)(const unsigned char *_src,
const unsigned char *_ref,int _ystride);
unsigned (*frag_border_ssd)(const unsigned char *_src,
@@ -808,12 +807,12 @@
unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
unsigned _thresh);
-unsigned oc_enc_frag_satd_c(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd_c(int *_dc,const unsigned char *_src,
const unsigned char *_ref,int _ystride);
-unsigned oc_enc_frag_satd2_c(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd2_c(int *_dc,const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
-unsigned oc_enc_frag_intra_satd_c(unsigned *_dc,const unsigned char *_src,
- int _ystride);
+unsigned oc_enc_frag_intra_satd_c(int *_dc,
+ const unsigned char *_src,int _ystride);
unsigned oc_enc_frag_ssd_c(const unsigned char *_src,
const unsigned char *_ref,int _ystride);
unsigned oc_enc_frag_border_ssd_c(const unsigned char *_src,
Modified: trunk/theora/lib/mcenc.c
===================================================================
--- trunk/theora/lib/mcenc.c 2010-12-14 01:53:19 UTC (rev 17754)
+++ trunk/theora/lib/mcenc.c 2010-12-14 03:07:41 UTC (rev 17755)
@@ -187,7 +187,7 @@
int _mvoffset0,int _mvoffset1,const unsigned char *_src,
const unsigned char *_ref,int _ystride,unsigned _best_err){
unsigned err;
- unsigned dc;
+ int dc;
int bi;
err=0;
for(bi=0;bi<4;bi++){
@@ -195,7 +195,7 @@
frag_offs=_frag_buf_offs[_fragis[bi]];
err+=oc_enc_frag_satd2(_enc,&dc,_src+frag_offs,
_ref+frag_offs+_mvoffset0,_ref+frag_offs+_mvoffset1,_ystride);
- err+=dc;
+ err+=abs(dc);
}
return err;
}
@@ -231,11 +231,11 @@
err=0;
for(bi=0;bi<4;bi++){
ptrdiff_t frag_offs;
- unsigned dc;
+ int dc;
frag_offs=_frag_buf_offs[_fragis[bi]];
err+=oc_enc_frag_satd(_enc,&dc,
_src+frag_offs,_ref+frag_offs+mvoffset,_ystride);
- err+=dc;
+ err+=abs(dc);
}
return err;
}
@@ -244,10 +244,10 @@
ptrdiff_t _frag_offs,int _dx,int _dy,
const unsigned char *_src,const unsigned char *_ref,int _ystride){
unsigned err;
- unsigned dc;
+ int dc;
err=oc_enc_frag_satd(_enc,&dc,
_src+_frag_offs,_ref+_frag_offs+_dx+_dy*_ystride,_ystride);
- return err+dc;
+ return err+abs(dc);
}
/*Perform a motion vector search for this macro block against a single
@@ -718,7 +718,7 @@
best_site=4;
for(sitei=0;sitei<8;sitei++){
unsigned err;
- unsigned dc;
+ int dc;
int site;
int xmask;
int ymask;
@@ -740,7 +740,7 @@
mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask);
err=oc_enc_frag_satd2(_enc,&dc,_src,
_ref+mvoffset0,_ref+mvoffset1,_ystride);
- err+=dc;
+ err+=abs(dc);
if(err<_best_err){
_best_err=err;
best_site=site;
Modified: trunk/theora/lib/x86/mmxencfrag.c
===================================================================
--- trunk/theora/lib/x86/mmxencfrag.c 2010-12-14 01:53:19 UTC (rev 17754)
+++ trunk/theora/lib/x86/mmxencfrag.c 2010-12-14 03:07:41 UTC (rev 17755)
@@ -449,13 +449,13 @@
mm6 = d2 c2 b2 a2 \
mm7 = d3 c3 b3 a3*/ \
-static unsigned oc_int_frag_satd_mmxext(unsigned *_dc,
+static unsigned oc_int_frag_satd_mmxext(int *_dc,
const unsigned char *_src,int _src_ystride,
const unsigned char *_ref,int _ref_ystride){
OC_ALIGN8(ogg_int16_t buf[64]);
unsigned ret;
unsigned ret2;
- unsigned dc;
+ int dc;
__asm__ __volatile__(
OC_LOAD_SUB_8x4(0x00)
OC_HADAMARD_8x4
@@ -500,22 +500,22 @@
"movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
"movd %%mm4,%[ret2]\n\t"
"movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
- /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
- added to them, and a factor of two removed; correct the final sum here.*/
"movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
"movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
"pmaddwd %%mm7,%%mm0\n\t"
- /*Compute abs(dc).*/
- "movsx %w[dc],%[ret]\n\t"
+ /*Subtract abs(dc) from 2*ret2.*/
+ "movsx %w[dc],%[dc]\n\t"
"cdq\n\t"
- "add %[ret2],%[ret2]\n\t"
- "add %[dc],%[ret]\n\t"
+ "lea (%[ret],%[ret2],2),%[ret2]\n\t"
"movq %%mm0,%%mm4\n\t"
"punpckhdq %%mm0,%%mm0\n\t"
- "xor %[ret],%[dc]\n\t"
+ "xor %[dc],%[ret]\n\t"
"paddd %%mm0,%%mm4\n\t"
- "sub %[dc],%[ret2]\n\t"
+ /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
+ added to them, a factor of two removed, and the DC value included;
+ correct the final sum here.*/
+ "sub %[ret],%[ret2]\n\t"
"movd %%mm4,%[ret]\n\t"
"lea -64(%[ret2],%[ret],2),%[ret]\n\t"
/*Although it looks like we're using 8 registers here, gcc can alias %[ret]
@@ -525,7 +525,7 @@
constraints, otherewise if gcc can prove they're equal it will allocate
them to the same register (which is bad); _src and _ref face a similar
problem, though those are never actually the same.*/
- :[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=d"(dc),
+ :[ret]"=d"(ret),[ret2]"=r"(ret2),[dc]"=a"(dc),
[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
:[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
[ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
@@ -537,7 +537,7 @@
return ret;
}
-unsigned oc_enc_frag_satd_mmxext(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
const unsigned char *_ref,int _ystride){
return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
}
@@ -660,19 +660,19 @@
);
}
-unsigned oc_enc_frag_satd2_mmxext(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
OC_ALIGN8(unsigned char ref[64]);
oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
}
-unsigned oc_enc_frag_intra_satd_mmxext(unsigned *_dc,
+unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
const unsigned char *_src,int _ystride){
OC_ALIGN8(ogg_int16_t buf[64]);
unsigned ret;
unsigned ret2;
- unsigned dc;
+ int dc;
__asm__ __volatile__(
OC_LOAD_8x4(0x00)
OC_HADAMARD_8x4
Modified: trunk/theora/lib/x86/sse2encfrag.c
===================================================================
--- trunk/theora/lib/x86/sse2encfrag.c 2010-12-14 01:53:19 UTC (rev 17754)
+++ trunk/theora/lib/x86/sse2encfrag.c 2010-12-14 03:07:41 UTC (rev 17755)
@@ -382,12 +382,13 @@
OC_HADAMARD_AB_8x8 \
OC_HADAMARD_C_ABS_ACCUM_8x8
-static unsigned oc_int_frag_satd_sse2(unsigned *_dc,
+static unsigned oc_int_frag_satd_sse2(int *_dc,
const unsigned char *_src,int _src_ystride,
const unsigned char *_ref,int _ref_ystride){
OC_ALIGN16(ogg_int16_t buf[16]);
unsigned ret;
- unsigned dc;
+ unsigned ret2;
+ int dc;
__asm__ __volatile__(
OC_LOAD_SUB_8x8
OC_HADAMARD_8x8
@@ -403,22 +404,22 @@
for the factor of two we dropped + 3 for the vertical accumulation).
Now we finally have to promote things to dwords.
We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
- latency of pmaddwd by computing abs(dc) here.*/
+ latency of pmaddwd by starting to compute abs(dc) here.*/
"pmaddwd %%xmm7,%%xmm0\n\t"
- "movsx %w[dc],%[ret]\n\t"
+ "movsx %w[dc],%[dc]\n\t"
"cdq\n\t"
"movdqa %%xmm0,%%xmm1\n\t"
"punpckhqdq %%xmm0,%%xmm0\n\t"
- "add %[dc],%[ret]\n\t"
"paddd %%xmm1,%%xmm0\n\t"
- "pshufd $1,%%xmm0,%%xmm1\n\t"
- "xor %[ret],%[dc]\n\t"
+ "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
"paddd %%xmm1,%%xmm0\n\t"
"movd %%xmm0,%[ret]\n\t"
/*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
- added to them, and a factor of two removed; correct the final sum here.*/
- "lea -64(%[ret],%[ret]),%[ret]\n\t"
- "sub %[dc],%[ret]\n\t"
+ added to them, a factor of two removed, and the DC value included;
+ correct the final sum here.*/
+ "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
+ "xor %[dc],%[ret2]\n\t"
+ "sub %[ret2],%[ret]\n\t"
/*Although it looks like we're using 7 registers here, gcc can alias %[ret]
and %[dc] with some of the inputs, since for once we don't write to
them until after we're done using everything but %[buf].*/
@@ -428,7 +429,8 @@
problem.
All four are destructively modified, but if we list them as output
constraints, gcc can't alias them with other outputs.*/
- :[ret]"=a"(ret),[dc]"=d"(dc),[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16))
+ :[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),
+ [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
:[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
[ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
/*We have to use neg, so we actually clobber the condition codes for once
@@ -439,23 +441,23 @@
return ret;
}
-unsigned oc_enc_frag_satd_sse2(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
const unsigned char *_ref,int _ystride){
return oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
}
-unsigned oc_enc_frag_satd2_sse2(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
OC_ALIGN8(unsigned char ref[64]);
oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
}
-unsigned oc_enc_frag_intra_satd_sse2(unsigned *_dc,
+unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
const unsigned char *_src,int _ystride){
OC_ALIGN16(ogg_int16_t buf[16]);
unsigned ret;
- unsigned dc;
+ int dc;
__asm__ __volatile__(
OC_LOAD_8x8
OC_HADAMARD_8x8
@@ -477,7 +479,7 @@
"movdqa %%xmm0,%%xmm1\n\t"
"punpckhqdq %%xmm0,%%xmm0\n\t"
"paddd %%xmm1,%%xmm0\n\t"
- "pshufd $1,%%xmm0,%%xmm1\n\t"
+ "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
"paddd %%xmm1,%%xmm0\n\t"
"movd %%xmm0,%[ret]\n\t"
"lea -64(%[ret],%[ret]),%[ret]\n\t"
@@ -485,7 +487,8 @@
/*Although it looks like we're using 7 registers here, gcc can alias %[ret]
and %[dc] with some of the inputs, since for once we don't write to
them until after we're done using everything but %[buf].*/
- :[ret]"=a"(ret),[dc]"=r"(dc),[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16))
+ :[ret]"=a"(ret),[dc]"=r"(dc),
+ [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
:[src]"r"(_src),[src4]"r"(_src+4*_ystride),
[ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
/*We have to use sub, so we actually clobber the condition codes for once.*/
Modified: trunk/theora/lib/x86/x86enc.h
===================================================================
--- trunk/theora/lib/x86/x86enc.h 2010-12-14 01:53:19 UTC (rev 17754)
+++ trunk/theora/lib/x86/x86enc.h 2010-12-14 03:07:41 UTC (rev 17755)
@@ -80,17 +80,17 @@
unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
unsigned _thresh);
-unsigned oc_enc_frag_satd_mmxext(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
const unsigned char *_ref,int _ystride);
-unsigned oc_enc_frag_satd_sse2(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
const unsigned char *_ref,int _ystride);
-unsigned oc_enc_frag_satd2_mmxext(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
-unsigned oc_enc_frag_satd2_sse2(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
-unsigned oc_enc_frag_intra_satd_mmxext(unsigned *_dc,
+unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
const unsigned char *_src,int _ystride);
-unsigned oc_enc_frag_intra_satd_sse2(unsigned *_dc,
+unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
const unsigned char *_src,int _ystride);
unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
const unsigned char *_ref,int _ystride);
Modified: trunk/theora/lib/x86_vc/mmxencfrag.c
===================================================================
--- trunk/theora/lib/x86_vc/mmxencfrag.c 2010-12-14 01:53:19 UTC (rev 17754)
+++ trunk/theora/lib/x86_vc/mmxencfrag.c 2010-12-14 03:07:41 UTC (rev 17755)
@@ -468,14 +468,14 @@
mm7 = d3 c3 b3 a3*/ \
}
-static unsigned oc_int_frag_satd_mmxext(unsigned *_dc,
+static unsigned oc_int_frag_satd_mmxext(int *_dc,
const unsigned char *_src,int _src_ystride,
const unsigned char *_ref,int _ref_ystride){
OC_ALIGN8(ogg_int16_t buf[64]);
ogg_int16_t *bufp;
- unsigned ret1;
+ unsigned ret;
unsigned ret2;
- unsigned dc;
+ int dc;
bufp=buf;
__asm{
#define SRC esi
@@ -483,10 +483,10 @@
#define SRC_YSTRIDE ecx
#define REF_YSTRIDE edx
#define BUF edi
-#define RET eax
+#define RET edx
#define RET2 ecx
-#define DC edx
-#define DC_WORD dx
+#define DC eax
+#define DC_WORD ax
mov SRC,_src
mov SRC_YSTRIDE,_src_ystride
mov REF,_ref
@@ -535,26 +535,25 @@
movq mm3,[0x70+BUF]
movd RET2,mm4
movq mm7,[0x78+BUF]
- /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
- added to them, and a factor of two removed; correct the final sum here.*/
- lea RET,[RET+RET-32]
movq mm0,[0x40+BUF]
movq mm4,[0x48+BUF]
OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
pmaddwd mm0,mm7
- /*Compute abs(dc).*/
- movsx RET,DC_WORD
+ /*Subtract abs(dc) from 2*ret2.*/
+ movsx DC,DC_WORD
cdq
- add RET2,RET2
- add RET,DC
+ lea RET2,[RET+RET2*2]
movq mm4,mm0
punpckhdq mm0,mm0
- xor DC,RET
+ xor RET,DC
paddd mm4,mm0
- sub RET2,DC
+ /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
+ added to them, a factor of two removed, and the DC value included;
+ correct the final sum here.*/
+ sub RET2,RET
movd RET,mm4
lea RET,[RET2+RET*2-64]
- mov ret1,RET
+ mov ret,RET
mov dc,DC
#undef SRC
#undef REF
@@ -567,10 +566,10 @@
#undef DC_WORD
}
*_dc=dc;
- return ret1;
+ return ret;
}
-unsigned oc_enc_frag_satd_mmxext(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
const unsigned char *_ref,int _ystride){
return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
}
@@ -705,20 +704,20 @@
}
}
-unsigned oc_enc_frag_satd2_mmxext(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
OC_ALIGN8(unsigned char ref[64]);
oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
}
-unsigned oc_enc_frag_intra_satd_mmxext(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,const unsigned char *_src,
int _ystride){
OC_ALIGN8(ogg_int16_t buf[64]);
ogg_int16_t *bufp;
unsigned ret1;
unsigned ret2;
- unsigned dc;
+ int dc;
bufp=buf;
__asm{
#define SRC eax
@@ -788,7 +787,7 @@
because the input to the INTRA transform was not a difference).*/
movzx DC,DC_WORD
add RET,RET
- sub RET, DC
+ sub RET,DC
movq mm4,mm0
punpckhdq mm0,mm0
paddd mm4,mm0
More information about the commits mailing list