[xiph-commits] r17755 - in trunk/theora/lib: . x86 x86_vc

tterribe at svn.xiph.org tterribe at svn.xiph.org
Mon Dec 13 19:07:41 PST 2010


Author: tterribe
Date: 2010-12-13 19:07:41 -0800 (Mon, 13 Dec 2010)
New Revision: 17755

Modified:
   trunk/theora/lib/analyze.c
   trunk/theora/lib/encfrag.c
   trunk/theora/lib/encint.h
   trunk/theora/lib/mcenc.c
   trunk/theora/lib/x86/mmxencfrag.c
   trunk/theora/lib/x86/sse2encfrag.c
   trunk/theora/lib/x86/x86enc.h
   trunk/theora/lib/x86_vc/mmxencfrag.c
Log:
Make the SATD functions return the signed DC value instead of abs(dc).

Right now we still immediately compute abs(dc), but we will need the signed
 value to do real DC costing.
This commit should not change the output of the encoder.


Modified: trunk/theora/lib/analyze.c
===================================================================
--- trunk/theora/lib/analyze.c	2010-12-14 01:53:19 UTC (rev 17754)
+++ trunk/theora/lib/analyze.c	2010-12-14 03:07:41 UTC (rev 17755)
@@ -750,16 +750,15 @@
 #if defined(OC_COLLECT_METRICS)
   {
     unsigned satd;
-    unsigned dc;
     switch(nmv_offs){
       case 0:satd=oc_enc_frag_intra_satd(_enc,&dc,src,ystride);break;
       case 1:{
         satd=oc_enc_frag_satd(_enc,&dc,src,ref+mv_offs[0],ystride);
-        satd+=dc;
+        satd+=abs(dc);
       }break;
       default:{
         satd=oc_enc_frag_satd(_enc,&dc,src,dst,ystride);
-        satd+=dc;
+        satd+=abs(dc);
       }break;
     }
     _enc->frag_satd[_fragi]=satd;
@@ -1139,14 +1138,14 @@
 
 static unsigned oc_mb_activity(oc_enc_ctx *_enc,unsigned _mbi,
  unsigned _activity[4]){
-  const unsigned char   *src;
-  const ptrdiff_t       *frag_buf_offs;
-  const ptrdiff_t       *sb_map;
-  unsigned               luma;
-  int                    ystride;
-  ptrdiff_t              frag_offs;
-  ptrdiff_t              fragi;
-  int                    bi;
+  const unsigned char *src;
+  const ptrdiff_t     *frag_buf_offs;
+  const ptrdiff_t     *sb_map;
+  unsigned             luma;
+  int                  ystride;
+  ptrdiff_t            frag_offs;
+  ptrdiff_t            fragi;
+  int                  bi;
   frag_buf_offs=_enc->state.frag_buf_offs;
   sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
   src=_enc->state.ref_frame_data[OC_FRAME_IO];
@@ -1224,7 +1223,7 @@
   return luma;
 }
 
-static unsigned oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi,
+static void oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi,
  unsigned _activity[4],const unsigned _intra_satd[12]){
   int bi;
   for(bi=0;bi<4;bi++){
@@ -1236,9 +1235,6 @@
     }
     _activity[bi]=act;
   }
-  /*TODO: Once frag_intra_satd returns the signed DC value instead
-     of the absolute value, this should pass it through.*/
-  return 1;
 }
 
 /*Compute the masking scales for the blocks in a macro block.
@@ -1348,7 +1344,7 @@
   return activity_sum;
 }
 
-static void oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
+static int oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
  unsigned _frag_satd[12]){
   const unsigned char   *src;
   const ptrdiff_t       *frag_buf_offs;
@@ -1363,15 +1359,18 @@
   int                    bi;
   ptrdiff_t              fragi;
   ptrdiff_t              frag_offs;
-  unsigned               dc;
+  unsigned               luma;
+  int                    dc;
   frag_buf_offs=_enc->state.frag_buf_offs;
   sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
   src=_enc->state.ref_frame_data[OC_FRAME_IO];
   ystride=_enc->state.ref_ystride[0];
+  luma=0;
   for(bi=0;bi<4;bi++){
     fragi=sb_map[bi];
     frag_offs=frag_buf_offs[fragi];
     _frag_satd[bi]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
+    luma+=dc;
   }
   mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
   map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
@@ -1386,6 +1385,7 @@
     frag_offs=frag_buf_offs[fragi];
     _frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
   }
+  return luma;
 }
 
 /*Select luma block-level quantizers for a MB in an INTRA frame.*/
@@ -1403,7 +1403,7 @@
   unsigned             rate[4][3];
   int                  prev[3][3];
   unsigned             satd;
-  unsigned             dc;
+  int                  dc;
   unsigned             best_cost;
   unsigned             best_ssd;
   unsigned             best_rate;
@@ -1497,7 +1497,7 @@
   oc_qii_state         qt[3];
   unsigned             cost[3];
   unsigned             satd;
-  unsigned             dc;
+  int                  dc;
   unsigned             best_cost;
   int                  best_qii;
   int                  qii;
@@ -1682,8 +1682,8 @@
         }
         else{
           unsigned intra_satd[12];
-          oc_mb_intra_satd(_enc,mbi,intra_satd);
-          luma=oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
+          luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
+          oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
           for(bi=0;bi<4;bi++)frags[sb_maps[mbi>>2][mbi&3][bi]].qii=0;
         }
         activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
@@ -2051,7 +2051,7 @@
   int                    bi;
   ptrdiff_t              fragi;
   ptrdiff_t              frag_offs;
-  unsigned               dc;
+  int                    dc;
   src=_enc->state.ref_frame_data[OC_FRAME_IO];
   ref=_enc->state.ref_frame_data[OC_FRAME_FOR_MODE(_mb_mode)];
   ystride=_enc->state.ref_ystride[0];
@@ -2064,7 +2064,7 @@
       frag_offs=frag_buf_offs[fragi];
       frag_satd[bi]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
        ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
-      frag_satd[bi]+=dc;
+      frag_satd[bi]+=abs(dc);
     }
   }
   else{
@@ -2073,7 +2073,7 @@
       frag_offs=frag_buf_offs[fragi];
       frag_satd[bi]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
        ref+frag_offs+mv_offs[0],ystride);
-      frag_satd[bi]+=dc;
+      frag_satd[bi]+=abs(dc);
     }
   }
   mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
@@ -2090,7 +2090,7 @@
       frag_offs=frag_buf_offs[fragi];
       frag_satd[mapii]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
        ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
-      frag_satd[mapii]+=dc;
+      frag_satd[mapii]+=abs(dc);
     }
   }
   else{
@@ -2102,7 +2102,7 @@
       frag_offs=frag_buf_offs[fragi];
       frag_satd[mapii]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
        ref+frag_offs+mv_offs[0],ystride);
-      frag_satd[mapii]+=dc;
+      frag_satd[mapii]+=abs(dc);
     }
   }
   oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,_rd_scale,1);
@@ -2162,7 +2162,7 @@
   int                    bits0;
   int                    bits1;
   unsigned               satd;
-  unsigned               dc;
+  int                    dc;
   src=_enc->state.ref_frame_data[OC_FRAME_IO];
   ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
   ystride=_enc->state.ref_ystride[0];
@@ -2184,7 +2184,7 @@
       satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
        ref+frag_offs+mv_offs[0],ystride);
     }
-    frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd+dc;
+    frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd+abs(dc);
   }
   oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,
    _enc->vp3_compatible?OC_NOSKIP:_skip_ssd,_rd_scale,1);
@@ -2222,7 +2222,7 @@
       satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
        ref+frag_offs+mv_offs[0],ystride);
     }
-    frag_satd[mapii]=satd+dc;
+    frag_satd[mapii]=satd+abs(dc);
   }
   oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
    frag_satd,_skip_ssd,_rd_scale[4],1);
@@ -2335,14 +2335,12 @@
         int            bi;
         ptrdiff_t      fragi;
         mbi=sbi<<2|quadi;
-        oc_mb_intra_satd(_enc,mbi,intra_satd);
+        luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
         /*Activity masking.*/
         if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
-          luma=oc_mb_activity(_enc,mbi,activity);
+          oc_mb_activity(_enc,mbi,activity);
         }
-        else{
-          luma=oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
-        }
+        else oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
         luma_sum+=luma;
         activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
          chroma_rd_scale,activity,activity_avg,luma,luma_avg);

Modified: trunk/theora/lib/encfrag.c
===================================================================
--- trunk/theora/lib/encfrag.c	2010-12-14 01:53:19 UTC (rev 17754)
+++ trunk/theora/lib/encfrag.c	2010-12-14 03:07:41 UTC (rev 17755)
@@ -241,19 +241,19 @@
   }
 }
 
-unsigned oc_hadamard_sad(unsigned *_dc,const ogg_int16_t _buf[64]){
-  unsigned    sad;
-  unsigned    dc;
-  int         t0;
-  int         t1;
-  int         t2;
-  int         t3;
-  int         t4;
-  int         t5;
-  int         t6;
-  int         t7;
-  int         r;
-  int         i;
+unsigned oc_hadamard_sad(int *_dc,const ogg_int16_t _buf[64]){
+  unsigned sad;
+  int      dc;
+  int      t0;
+  int      t1;
+  int      t2;
+  int      t3;
+  int      t4;
+  int      t5;
+  int      t6;
+  int      t7;
+  int      r;
+  int      i;
   sad=dc=0;
   for(i=0;i<8;i++){
     /*Hadamard stage 1:*/
@@ -279,7 +279,7 @@
     t5+=t7;
     t7=r-t7;
     /*Hadamard stage 3:*/
-    r=abs(t0+t1);
+    r=abs(t0+t1)&-(i>0);
     r+=abs(t0-t1);
     r+=abs(t2+t3);
     r+=abs(t2-t3);
@@ -289,26 +289,26 @@
     r+=abs(t6-t7);
     sad+=r;
   }
-  dc=abs(_buf[0]+_buf[1]+_buf[2]+_buf[3]+_buf[4]+_buf[5]+_buf[6]+_buf[7]);
+  dc=_buf[0]+_buf[1]+_buf[2]+_buf[3]+_buf[4]+_buf[5]+_buf[6]+_buf[7];
   *_dc=dc;
-  return sad-dc;
+  return sad;
 }
 
-unsigned oc_enc_frag_satd_c(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd_c(int *_dc,const unsigned char *_src,
  const unsigned char *_ref,int _ystride){
   ogg_int16_t buf[64];
   oc_diff_hadamard(buf,_src,_ref,_ystride);
   return oc_hadamard_sad(_dc,buf);
 }
 
-unsigned oc_enc_frag_satd2_c(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd2_c(int *_dc,const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
   ogg_int16_t buf[64];
   oc_diff_hadamard2(buf,_src,_ref1,_ref2,_ystride);
   return oc_hadamard_sad(_dc,buf);
 }
 
-unsigned oc_enc_frag_intra_satd_c(unsigned *_dc,
+unsigned oc_enc_frag_intra_satd_c(int *_dc,
  const unsigned char *_src,int _ystride){
   ogg_int16_t buf[64];
   oc_intra_hadamard(buf,_src,_ystride);

Modified: trunk/theora/lib/encint.h
===================================================================
--- trunk/theora/lib/encint.h	2010-12-14 01:53:19 UTC (rev 17754)
+++ trunk/theora/lib/encint.h	2010-12-14 03:07:41 UTC (rev 17755)
@@ -292,12 +292,11 @@
   unsigned (*frag_sad2_thresh)(const unsigned char *_src,
    const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
    unsigned _thresh);
-  unsigned (*frag_satd)(unsigned *_dc,const unsigned char *_src,
+  unsigned (*frag_satd)(int *_dc,const unsigned char *_src,
    const unsigned char *_ref,int _ystride);
-  unsigned (*frag_satd2)(unsigned *_dc,const unsigned char *_src,
+  unsigned (*frag_satd2)(int *_dc,const unsigned char *_src,
    const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
-  unsigned (*frag_intra_satd)(unsigned *_dc,const unsigned char *_src,
-   int _ystride);
+  unsigned (*frag_intra_satd)(int *_dc,const unsigned char *_src,int _ystride);
   unsigned (*frag_ssd)(const unsigned char *_src,
    const unsigned char *_ref,int _ystride);
   unsigned (*frag_border_ssd)(const unsigned char *_src,
@@ -808,12 +807,12 @@
 unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
  unsigned _thresh);
-unsigned oc_enc_frag_satd_c(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd_c(int *_dc,const unsigned char *_src,
  const unsigned char *_ref,int _ystride);
-unsigned oc_enc_frag_satd2_c(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd2_c(int *_dc,const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
-unsigned oc_enc_frag_intra_satd_c(unsigned *_dc,const unsigned char *_src,
- int _ystride);
+unsigned oc_enc_frag_intra_satd_c(int *_dc,
+ const unsigned char *_src,int _ystride);
 unsigned oc_enc_frag_ssd_c(const unsigned char *_src,
  const unsigned char *_ref,int _ystride);
 unsigned oc_enc_frag_border_ssd_c(const unsigned char *_src,

Modified: trunk/theora/lib/mcenc.c
===================================================================
--- trunk/theora/lib/mcenc.c	2010-12-14 01:53:19 UTC (rev 17754)
+++ trunk/theora/lib/mcenc.c	2010-12-14 03:07:41 UTC (rev 17755)
@@ -187,7 +187,7 @@
  int _mvoffset0,int _mvoffset1,const unsigned char *_src,
  const unsigned char *_ref,int _ystride,unsigned _best_err){
   unsigned err;
-  unsigned dc;
+  int      dc;
   int      bi;
   err=0;
   for(bi=0;bi<4;bi++){
@@ -195,7 +195,7 @@
     frag_offs=_frag_buf_offs[_fragis[bi]];
     err+=oc_enc_frag_satd2(_enc,&dc,_src+frag_offs,
      _ref+frag_offs+_mvoffset0,_ref+frag_offs+_mvoffset1,_ystride);
-    err+=dc;
+    err+=abs(dc);
   }
   return err;
 }
@@ -231,11 +231,11 @@
   err=0;
   for(bi=0;bi<4;bi++){
     ptrdiff_t frag_offs;
-    unsigned  dc;
+    int       dc;
     frag_offs=_frag_buf_offs[_fragis[bi]];
     err+=oc_enc_frag_satd(_enc,&dc,
      _src+frag_offs,_ref+frag_offs+mvoffset,_ystride);
-    err+=dc;
+    err+=abs(dc);
   }
   return err;
 }
@@ -244,10 +244,10 @@
  ptrdiff_t _frag_offs,int _dx,int _dy,
  const unsigned char *_src,const unsigned char *_ref,int _ystride){
   unsigned err;
-  unsigned dc;
+  int      dc;
   err=oc_enc_frag_satd(_enc,&dc,
    _src+_frag_offs,_ref+_frag_offs+_dx+_dy*_ystride,_ystride);
-  return err+dc;
+  return err+abs(dc);
 }
 
 /*Perform a motion vector search for this macro block against a single
@@ -718,7 +718,7 @@
   best_site=4;
   for(sitei=0;sitei<8;sitei++){
     unsigned err;
-    unsigned dc;
+    int      dc;
     int      site;
     int      xmask;
     int      ymask;
@@ -740,7 +740,7 @@
     mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask);
     err=oc_enc_frag_satd2(_enc,&dc,_src,
      _ref+mvoffset0,_ref+mvoffset1,_ystride);
-    err+=dc;
+    err+=abs(dc);
     if(err<_best_err){
       _best_err=err;
       best_site=site;

Modified: trunk/theora/lib/x86/mmxencfrag.c
===================================================================
--- trunk/theora/lib/x86/mmxencfrag.c	2010-12-14 01:53:19 UTC (rev 17754)
+++ trunk/theora/lib/x86/mmxencfrag.c	2010-12-14 03:07:41 UTC (rev 17755)
@@ -449,13 +449,13 @@
    mm6 = d2 c2 b2 a2 \
    mm7 = d3 c3 b3 a3*/ \
 
-static unsigned oc_int_frag_satd_mmxext(unsigned *_dc,
+static unsigned oc_int_frag_satd_mmxext(int *_dc,
  const unsigned char *_src,int _src_ystride,
  const unsigned char *_ref,int _ref_ystride){
   OC_ALIGN8(ogg_int16_t buf[64]);
   unsigned ret;
   unsigned ret2;
-  unsigned dc;
+  int      dc;
   __asm__ __volatile__(
     OC_LOAD_SUB_8x4(0x00)
     OC_HADAMARD_8x4
@@ -500,22 +500,22 @@
     "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
     "movd %%mm4,%[ret2]\n\t"
     "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
-    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
-       added to them, and a factor of two removed; correct the final sum here.*/
     "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
     "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
     OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
     "pmaddwd %%mm7,%%mm0\n\t"
-    /*Compute abs(dc).*/
-    "movsx %w[dc],%[ret]\n\t"
+    /*Subtract abs(dc) from 2*ret2.*/
+    "movsx %w[dc],%[dc]\n\t"
     "cdq\n\t"
-    "add %[ret2],%[ret2]\n\t"
-    "add %[dc],%[ret]\n\t"
+    "lea (%[ret],%[ret2],2),%[ret2]\n\t"
     "movq %%mm0,%%mm4\n\t"
     "punpckhdq %%mm0,%%mm0\n\t"
-    "xor %[ret],%[dc]\n\t"
+    "xor %[dc],%[ret]\n\t"
     "paddd %%mm0,%%mm4\n\t"
-    "sub %[dc],%[ret2]\n\t"
+    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
+       added to them, a factor of two removed, and the DC value included;
+       correct the final sum here.*/
+    "sub %[ret],%[ret2]\n\t"
     "movd %%mm4,%[ret]\n\t"
     "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
     /*Although it looks like we're using 8 registers here, gcc can alias %[ret]
@@ -525,7 +525,7 @@
        constraints, otherewise if gcc can prove they're equal it will allocate
        them to the same register (which is bad); _src and _ref face a similar
        problem, though those are never actually the same.*/
-    :[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=d"(dc),
+    :[ret]"=d"(ret),[ret2]"=r"(ret2),[dc]"=a"(dc),
      [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
     :[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
      [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
@@ -537,7 +537,7 @@
   return ret;
 }
 
-unsigned oc_enc_frag_satd_mmxext(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
  const unsigned char *_ref,int _ystride){
   return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
 }
@@ -660,19 +660,19 @@
   );
 }
 
-unsigned oc_enc_frag_satd2_mmxext(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
   OC_ALIGN8(unsigned char ref[64]);
   oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
   return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
 }
 
-unsigned oc_enc_frag_intra_satd_mmxext(unsigned *_dc,
+unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
  const unsigned char *_src,int _ystride){
   OC_ALIGN8(ogg_int16_t buf[64]);
   unsigned ret;
   unsigned ret2;
-  unsigned dc;
+  int      dc;
   __asm__ __volatile__(
     OC_LOAD_8x4(0x00)
     OC_HADAMARD_8x4

Modified: trunk/theora/lib/x86/sse2encfrag.c
===================================================================
--- trunk/theora/lib/x86/sse2encfrag.c	2010-12-14 01:53:19 UTC (rev 17754)
+++ trunk/theora/lib/x86/sse2encfrag.c	2010-12-14 03:07:41 UTC (rev 17755)
@@ -382,12 +382,13 @@
  OC_HADAMARD_AB_8x8 \
  OC_HADAMARD_C_ABS_ACCUM_8x8
 
-static unsigned oc_int_frag_satd_sse2(unsigned *_dc,
+static unsigned oc_int_frag_satd_sse2(int *_dc,
  const unsigned char *_src,int _src_ystride,
  const unsigned char *_ref,int _ref_ystride){
   OC_ALIGN16(ogg_int16_t buf[16]);
   unsigned ret;
-  unsigned dc;
+  unsigned ret2;
+  int      dc;
   __asm__ __volatile__(
     OC_LOAD_SUB_8x8
     OC_HADAMARD_8x8
@@ -403,22 +404,22 @@
        for the factor of two we dropped + 3 for the vertical accumulation).
       Now we finally have to promote things to dwords.
       We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
-       latency of pmaddwd by computing abs(dc) here.*/
+       latency of pmaddwd by starting to compute abs(dc) here.*/
     "pmaddwd %%xmm7,%%xmm0\n\t"
-    "movsx %w[dc],%[ret]\n\t"
+    "movsx %w[dc],%[dc]\n\t"
     "cdq\n\t"
     "movdqa %%xmm0,%%xmm1\n\t"
     "punpckhqdq %%xmm0,%%xmm0\n\t"
-    "add %[dc],%[ret]\n\t"
     "paddd %%xmm1,%%xmm0\n\t"
-    "pshufd $1,%%xmm0,%%xmm1\n\t"
-    "xor %[ret],%[dc]\n\t"
+    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
     "paddd %%xmm1,%%xmm0\n\t"
     "movd %%xmm0,%[ret]\n\t"
     /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
-       added to them, and a factor of two removed; correct the final sum here.*/
-    "lea -64(%[ret],%[ret]),%[ret]\n\t"
-    "sub %[dc],%[ret]\n\t"
+       added to them, a factor of two removed, and the DC value included;
+       correct the final sum here.*/
+    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
+    "xor %[dc],%[ret2]\n\t"
+    "sub %[ret2],%[ret]\n\t"
     /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
        and %[dc] with some of the inputs, since for once we don't write to
        them until after we're done using everything but %[buf].*/
@@ -428,7 +429,8 @@
        problem.
       All four are destructively modified, but if we list them as output
        constraints, gcc can't alias them with other outputs.*/
-    :[ret]"=a"(ret),[dc]"=d"(dc),[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16))
+    :[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),
+     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
     :[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
      [ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
     /*We have to use neg, so we actually clobber the condition codes for once
@@ -439,23 +441,23 @@
   return ret;
 }
 
-unsigned oc_enc_frag_satd_sse2(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
  const unsigned char *_ref,int _ystride){
   return oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
 }
 
-unsigned oc_enc_frag_satd2_sse2(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
   OC_ALIGN8(unsigned char ref[64]);
   oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
   return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
 }
 
-unsigned oc_enc_frag_intra_satd_sse2(unsigned *_dc,
+unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
  const unsigned char *_src,int _ystride){
   OC_ALIGN16(ogg_int16_t buf[16]);
   unsigned ret;
-  unsigned dc;
+  int      dc;
   __asm__ __volatile__(
     OC_LOAD_8x8
     OC_HADAMARD_8x8
@@ -477,7 +479,7 @@
     "movdqa %%xmm0,%%xmm1\n\t"
     "punpckhqdq %%xmm0,%%xmm0\n\t"
     "paddd %%xmm1,%%xmm0\n\t"
-    "pshufd $1,%%xmm0,%%xmm1\n\t"
+    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
     "paddd %%xmm1,%%xmm0\n\t"
     "movd %%xmm0,%[ret]\n\t"
     "lea -64(%[ret],%[ret]),%[ret]\n\t"
@@ -485,7 +487,8 @@
     /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
        and %[dc] with some of the inputs, since for once we don't write to
        them until after we're done using everything but %[buf].*/
-    :[ret]"=a"(ret),[dc]"=r"(dc),[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16))
+    :[ret]"=a"(ret),[dc]"=r"(dc),
+     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
     :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
      [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
     /*We have to use sub, so we actually clobber the condition codes for once.*/

Modified: trunk/theora/lib/x86/x86enc.h
===================================================================
--- trunk/theora/lib/x86/x86enc.h	2010-12-14 01:53:19 UTC (rev 17754)
+++ trunk/theora/lib/x86/x86enc.h	2010-12-14 03:07:41 UTC (rev 17755)
@@ -80,17 +80,17 @@
 unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
  unsigned _thresh);
-unsigned oc_enc_frag_satd_mmxext(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
  const unsigned char *_ref,int _ystride);
-unsigned oc_enc_frag_satd_sse2(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
  const unsigned char *_ref,int _ystride);
-unsigned oc_enc_frag_satd2_mmxext(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
-unsigned oc_enc_frag_satd2_sse2(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
-unsigned oc_enc_frag_intra_satd_mmxext(unsigned *_dc,
+unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
  const unsigned char *_src,int _ystride);
-unsigned oc_enc_frag_intra_satd_sse2(unsigned *_dc,
+unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
  const unsigned char *_src,int _ystride);
 unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
  const unsigned char *_ref,int _ystride);

Modified: trunk/theora/lib/x86_vc/mmxencfrag.c
===================================================================
--- trunk/theora/lib/x86_vc/mmxencfrag.c	2010-12-14 01:53:19 UTC (rev 17754)
+++ trunk/theora/lib/x86_vc/mmxencfrag.c	2010-12-14 03:07:41 UTC (rev 17755)
@@ -468,14 +468,14 @@
     mm7 = d3 c3 b3 a3*/ \
 }
 
-static unsigned oc_int_frag_satd_mmxext(unsigned *_dc,
+static unsigned oc_int_frag_satd_mmxext(int *_dc,
  const unsigned char *_src,int _src_ystride,
  const unsigned char *_ref,int _ref_ystride){
   OC_ALIGN8(ogg_int16_t buf[64]);
   ogg_int16_t *bufp;
-  unsigned     ret1;
+  unsigned     ret;
   unsigned     ret2;
-  unsigned     dc;
+  int          dc;
   bufp=buf;
   __asm{
 #define SRC esi
@@ -483,10 +483,10 @@
 #define SRC_YSTRIDE ecx
 #define REF_YSTRIDE edx
 #define BUF edi
-#define RET eax
+#define RET edx
 #define RET2 ecx
-#define DC edx
-#define DC_WORD dx
+#define DC eax
+#define DC_WORD ax
     mov SRC,_src
     mov SRC_YSTRIDE,_src_ystride
     mov REF,_ref
@@ -535,26 +535,25 @@
     movq mm3,[0x70+BUF]
     movd RET2,mm4
     movq mm7,[0x78+BUF]
-    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
-       added to them, and a factor of two removed; correct the final sum here.*/
-    lea RET,[RET+RET-32]
     movq mm0,[0x40+BUF]
     movq mm4,[0x48+BUF]
     OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
     pmaddwd mm0,mm7
-    /*Compute abs(dc).*/
-    movsx RET,DC_WORD
+    /*Subtract abs(dc) from 2*ret2.*/
+    movsx DC,DC_WORD
     cdq
-    add RET2,RET2
-    add RET,DC
+    lea RET2,[RET+RET2*2]
     movq mm4,mm0
     punpckhdq mm0,mm0
-    xor DC,RET
+    xor RET,DC
     paddd mm4,mm0
-    sub RET2,DC
+    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
+       added to them, a factor of two removed, and the DC value included;
+       correct the final sum here.*/
+    sub RET2,RET
     movd RET,mm4
     lea RET,[RET2+RET*2-64]
-    mov ret1,RET
+    mov ret,RET
     mov dc,DC
 #undef SRC
 #undef REF
@@ -567,10 +566,10 @@
 #undef DC_WORD
   }
   *_dc=dc;
-  return ret1;
+  return ret;
 }
 
-unsigned oc_enc_frag_satd_mmxext(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
  const unsigned char *_ref,int _ystride){
   return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
 }
@@ -705,20 +704,20 @@
   }
 }
 
-unsigned oc_enc_frag_satd2_mmxext(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
   OC_ALIGN8(unsigned char ref[64]);
   oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
   return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
 }
 
-unsigned oc_enc_frag_intra_satd_mmxext(unsigned *_dc,const unsigned char *_src,
+unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,const unsigned char *_src,
  int _ystride){
   OC_ALIGN8(ogg_int16_t buf[64]);
   ogg_int16_t *bufp;
   unsigned     ret1;
   unsigned     ret2;
-  unsigned     dc;
+  int          dc;
   bufp=buf;
   __asm{
 #define SRC eax
@@ -788,7 +787,7 @@
     because the input to the INTRA transform was not a difference).*/
     movzx DC,DC_WORD
     add RET,RET
-    sub RET, DC
+    sub RET,DC
     movq mm4,mm0
     punpckhdq mm0,mm0
     paddd mm4,mm0



More information about the commits mailing list