[xiph-commits] r16260 - in branches/theora-gumboot/lib: dec/x86 enc

gumboot at svn.xiph.org gumboot at svn.xiph.org
Sat Jul 11 13:19:36 PDT 2009


Author: gumboot
Date: 2009-07-11 13:19:36 -0700 (Sat, 11 Jul 2009)
New Revision: 16260

Modified:
   branches/theora-gumboot/lib/dec/x86/mmxidct.c
   branches/theora-gumboot/lib/enc/analyze.c
   branches/theora-gumboot/lib/enc/tokenize.c
Log:
Make the encoder work, although it is still dirty and broken on everything except non-VC x86 builds with MMX.  The encoder now seems to be slightly slower, but at least it works.

Also create a zig-zag-ordered, unquantised DCT coefficient buffer (zzbuffer) for oc_enc_tokenize_ac(), so that it no longer has to mess about with double indirection through the OC_FZIG_ZAG index table to read each coefficient.  This did help a little, but it didn't completely compensate for whatever is slowing things down.



Modified: branches/theora-gumboot/lib/dec/x86/mmxidct.c
===================================================================
--- branches/theora-gumboot/lib/dec/x86/mmxidct.c	2009-07-11 10:03:10 UTC (rev 16259)
+++ branches/theora-gumboot/lib/dec/x86/mmxidct.c	2009-07-11 20:19:36 UTC (rev 16260)
@@ -581,7 +581,11 @@
     ogg_uint16_t p;
     /*We round this dequant product (and not any of the others) because there's
        no iDCT rounding.*/
+#if 0
     p=(ogg_int16_t)(_x[0]*(ogg_int32_t)_dc_quant+15>>5);
+#else
+    p=(ogg_int16_t)(_y[0]*(ogg_int32_t)_dc_quant+15>>5);
+#endif
     /*Fill _y with p.*/
     __asm__ __volatile__(
       /*mm0=0000 0000 0000 AAAA*/
@@ -640,11 +644,13 @@
     );
 #endif
     /*Dequantize the coefficients.*/
+#if 0
     _y[0]=(ogg_int16_t)(_x[0]*(int)_dc_quant);
-#if 0
     for(zzi=1;zzi<_ncoefs;zzi++){
       _y[OC_FZIG_ZAG_MMX[zzi]]=(ogg_int16_t)(_x[zzi]*(int)_ac_quant[zzi]);
     }
+#else
+    _y[0]=(ogg_int16_t)(_y[0]*(int)_dc_quant);
 #endif
     /*Then perform the iDCT.*/
     if(_last_zzi<10)oc_idct8x8_10(_y);

Modified: branches/theora-gumboot/lib/enc/analyze.c
===================================================================
--- branches/theora-gumboot/lib/enc/analyze.c	2009-07-11 10:03:10 UTC (rev 16259)
+++ branches/theora-gumboot/lib/enc/analyze.c	2009-07-11 20:19:36 UTC (rev 16260)
@@ -516,6 +516,7 @@
  oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,int _overhead_bits,
  oc_rd_metric *_mo,oc_token_checkpoint **_stack){
   OC_ALIGN16(ogg_int16_t buffer[64]);
+  OC_ALIGN16(ogg_int16_t zzbuffer[64]);
   OC_ALIGN16(ogg_int16_t data[64]);
   const ogg_uint16_t  *dequant;
   const oc_iquant     *enquant;
@@ -651,6 +652,7 @@
     int d;
     v=buffer[OC_FZIG_ZAG[zzi]];
     d=dequant[zzi];
+    zzbuffer[zzi]=v;
     val=v<<1;
     v=abs(val);
     if(v>=d){
@@ -670,36 +672,43 @@
   }
   /*Tokenize.*/
   checkpoint=*_stack;
-  ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,buffer,nonzero+1,
+  ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,zzbuffer,nonzero+1,
    _stack,mb_mode==OC_MODE_INTRA?3:0);
   /*Reconstruct.
     TODO: nonzero may need to be adjusted after tokenization.*/
+#if 0
   oc_dequant_idct8x8(&_enc->state,buffer,data,
    nonzero+1,nonzero+1,dequant[0],(ogg_uint16_t *)dequant);
-  if(mb_mode==OC_MODE_INTRA)oc_enc_frag_recon_intra(_enc,dst,ystride,buffer);
+#else
+//  memcpy(buffer, data, sizeof(buffer));
+  int dc=data[0];
+  oc_dequant_idct8x8(&_enc->state,data,data,
+   nonzero+1,nonzero+1,dequant[0],(ogg_uint16_t *)dequant);
+#endif
+  if(mb_mode==OC_MODE_INTRA)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
   else{
     oc_enc_frag_recon_inter(_enc,dst,
-     nmv_offs==1?ref+mv_offs[0]:dst,ystride,buffer);
+     nmv_offs==1?ref+mv_offs[0]:dst,ystride,data);
   }
 #if !defined(OC_COLLECT_METRICS)
   if(frame_type!=OC_INTRA_FRAME)
 #endif
   {
     /*In retrospect, should we have skipped this block?*/
-    oc_enc_frag_sub(_enc,buffer,src,dst,ystride);
+    oc_enc_frag_sub(_enc,data,src,dst,ystride);
     coded_ssd=coded_dc=0;
     if(borderi<0){
       for(pi=0;pi<64;pi++){
-        coded_ssd+=buffer[pi]*buffer[pi];
-        coded_dc+=buffer[pi];
+        coded_ssd+=data[pi]*data[pi];
+        coded_dc+=data[pi];
       }
     }
     else{
       ogg_int64_t mask;
       mask=_enc->state.borders[borderi].mask;
       for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
-        coded_ssd+=buffer[pi]*buffer[pi];
-        coded_dc+=buffer[pi];
+        coded_ssd+=data[pi]*data[pi];
+        coded_dc+=data[pi];
       }
     }
     /*Scale to match DCT domain.*/
@@ -736,7 +745,11 @@
       _mo->ac_bits+=ac_bits;
     }
   }
+#if 0
   frags[_fragi].dc=data[0];
+#else
+  frags[_fragi].dc=dc;
+#endif
   frags[_fragi].coded=1;
   return 1;
 }

Modified: branches/theora-gumboot/lib/enc/tokenize.c
===================================================================
--- branches/theora-gumboot/lib/enc/tokenize.c	2009-07-11 10:03:10 UTC (rev 16259)
+++ branches/theora-gumboot/lib/enc/tokenize.c	2009-07-11 20:19:36 UTC (rev 16260)
@@ -208,6 +208,29 @@
   int           qc;
 };
 
+#if 1
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+   each quadrant of the destination.*/
+static const unsigned char OC_FZIG_ZAG_MMX[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3,32,11,18,25, 4,12,
+   5,26,19,40,33,34,41,48,
+  27, 6,13,20,28,21,14, 7,
+  56,49,42,35,43,50,57,36,
+  15,22,29,30,23,44,37,58,
+  51,59,38,45,52,31,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+};
+#endif
+
 int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
  ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
  int _zzi,oc_token_checkpoint **_stack,int _acmin){
@@ -255,7 +278,7 @@
     qc=_qdct[zzi];
     s=-(qc<0);
     qc=qc+s^s;
-    c=_dct[OC_FZIG_ZAG[zzi]];
+    c=_dct[zzi];
     if(qc<=1){
       ogg_uint32_t sum_d2;
       int          nzeros;
@@ -324,7 +347,7 @@
               token=OC_DCT_RUN_CAT2A+cat;
               bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
               val=2+((val+val_s^val_s)>2);
-              e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj]*val;
+              e=(_dct[zzj]+val_s^val_s)-_dequant[zzj]*val;
               d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
               cost=d2+lambda*bits+tokens[zzk][tk].cost;
               if(cost<=best_cost){
@@ -347,7 +370,7 @@
                 token=OC_DCT_RUN_CAT1B+cat;
                 eb=(-val_s<<cat+2)+nzeros-6-(cat<<2);
               }
-              e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj];
+              e=(_dct[zzj]+val_s^val_s)-_dequant[zzj];
               d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
               bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
               cost=d2+lambda*bits+tokens[zzk][tk].cost;
@@ -626,6 +649,32 @@
   }
   /*Emit the tokens from the best path through the trellis.*/
   stack=*_stack;
+#if 1
+  int dc=_qdct[0];
+  __asm__ __volatile__(
+    "pxor %%mm0,%%mm0\n\t"
+    "movq %%mm0,(%[y])\n\t"
+    "movq %%mm0,8(%[y])\n\t"
+    "movq %%mm0,16(%[y])\n\t"
+    "movq %%mm0,24(%[y])\n\t"
+    "movq %%mm0,32(%[y])\n\t"
+    "movq %%mm0,40(%[y])\n\t"
+    "movq %%mm0,48(%[y])\n\t"
+    "movq %%mm0,56(%[y])\n\t"
+    "movq %%mm0,64(%[y])\n\t"
+    "movq %%mm0,72(%[y])\n\t"
+    "movq %%mm0,80(%[y])\n\t"
+    "movq %%mm0,88(%[y])\n\t"
+    "movq %%mm0,96(%[y])\n\t"
+    "movq %%mm0,104(%[y])\n\t"
+    "movq %%mm0,112(%[y])\n\t"
+    "movq %%mm0,120(%[y])\n\t"
+    :
+    :[y]"r"(_qdct)
+    :"memory"
+  );
+  _qdct[0]=dc;
+#endif
   zzi=1;
   ti=best_flags>>1&1;
   bits=tokens[zzi][ti].bits;
@@ -642,7 +691,11 @@
       /*We don't include the actual EOB cost for this block in the return value.
         It will be paid for by the fragment that terminates the EOB run.*/
       bits-=tokens[zzi][ti].bits;
+#if 0
       for(;zzi<_zzi;zzi++)_qdct[zzi]=0;
+#else
+      zzi=_zzi;
+#endif
       break;
     }
     /*Emit pending EOB run if any.*/
@@ -654,8 +707,12 @@
     next=tokens[zzi][ti].next;
     qc=tokens[zzi][ti].qc;
     zzj=(next>>1)-1&63;
+#if 0
     for(;zzi<zzj;zzi++)_qdct[zzi]=0;
     _qdct[zzj]=qc;
+#else
+    _qdct[OC_FZIG_ZAG_MMX[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
+#endif
     zzi=next>>1;
     ti=next&1;
   }



More information about the commits mailing list