[xiph-commits] r15939 - in branches/theora-thusnelda/lib: . enc

Sat Apr 18 00:43:26 PDT 2009

Author: tterribe
Date: 2009-04-18 00:43:25 -0700 (Sat, 18 Apr 2009)
New Revision: 15939

Modified:
   branches/theora-thusnelda/lib/Makefile.am
   branches/theora-thusnelda/lib/enc/codec_internal.h
   branches/theora-thusnelda/lib/enc/dct.c
   branches/theora-thusnelda/lib/enc/dsp.h
   branches/theora-thusnelda/lib/enc/encapiwrapper.c
Log:
New fDCT, now with less leakage.
Only x86-64 asm for now, because it was easier; MMX will be added soon.
The rest of the encoder now needs to be re-tuned.


Modified: branches/theora-thusnelda/lib/Makefile.am
===================================================================

--- branches/theora-thusnelda/lib/Makefile.am	2009-04-16 18:53:10 UTC (rev 15938)
+++ branches/theora-thusnelda/lib/Makefile.am	2009-04-18 07:43:25 UTC (rev 15939)
@@ -8,6 +8,7 @@
         enc/x86/dsp_mmxext.c \
         enc/x86/recon_mmx.c \
         enc/x86/fdct_mmx.c \
+        enc/x86/fdct_sse2.c \
         enc/x86/idct_mmx.c \
         enc/x86_32_vs/dsp_mmx.c \
         enc/x86_32_vs/fdct_mmx.c \
@@ -48,7 +49,8 @@
 	enc/x86/dsp_mmxext.c \
 	enc/x86/recon_mmx.c \
 	enc/x86/idct_mmx.c \
-	enc/x86/fdct_mmx.c
+	enc/x86/fdct_mmx.c \
+	enc/x86/fdct_sse2.c
 
 if CPU_x86_64
 encoder_arch_sources = $(encoder_x86_sources)

Modified: branches/theora-thusnelda/lib/enc/codec_internal.h
===================================================================
--- branches/theora-thusnelda/lib/enc/codec_internal.h	2009-04-16 18:53:10 UTC (rev 15938)
+++ branches/theora-thusnelda/lib/enc/codec_internal.h	2009-04-18 07:43:25 UTC (rev 15939)
@@ -25,6 +25,7 @@
 //#define COLLECT_METRICS 
 
 #include "theora/theora.h"
+#include "../internal.h"
 #include "encoder_huffman.h"
 #include "../dec/ocintrin.h"
 typedef struct CP_INSTANCE CP_INSTANCE;

Modified: branches/theora-thusnelda/lib/enc/dct.c
===================================================================
--- branches/theora-thusnelda/lib/enc/dct.c	2009-04-16 18:53:10 UTC (rev 15938)
+++ branches/theora-thusnelda/lib/enc/dct.c	2009-04-18 07:43:25 UTC (rev 15939)
@@ -19,250 +19,162 @@
 #include "dsp.h"
 #include "../cpu.h"
 
-static ogg_int32_t xC1S7 = 64277;
-static ogg_int32_t xC2S6 = 60547;
-static ogg_int32_t xC3S5 = 54491;
-static ogg_int32_t xC4S4 = 46341;
-static ogg_int32_t xC5S3 = 36410;
-static ogg_int32_t xC6S2 = 25080;
-static ogg_int32_t xC7S1 = 12785;
 
-#define SIGNBITDUPPED(X) ((signed )(((X) & 0x80000000)) >> 31)
-#define DOROUND(X) ( (SIGNBITDUPPED(X) & (0xffff)) + (X) )
 
-static void fdct_short__c ( ogg_int16_t * InputData, ogg_int16_t * OutputData ){
-  int loop;
+#define OC_C1S7 (64277)
+#define OC_C2S6 (60547)
+#define OC_C3S5 (54491)
+#define OC_C4S4 (46341)
+#define OC_C5S3 (36410)
+#define OC_C6S2 (25080)
+#define OC_C7S1 (12785)
 
-  ogg_int32_t  is07, is12, is34, is56;
-  ogg_int32_t  is0734, is1256;
-  ogg_int32_t  id07, id12, id34, id56;
 
-  ogg_int32_t  irot_input_x, irot_input_y;
-  ogg_int32_t  icommon_product1;   /* Re-used product  (c4s4 * (s12 - s56)). */
-  ogg_int32_t  icommon_product2;   /* Re-used product  (c4s4 * (d12 + d56)). */
 
-  ogg_int32_t  temp1, temp2;         /* intermediate variable for computation */
+/*Performs a forward 8 point Type-II DCT transform.
+  The output is scaled by a factor of 2 from the orthonormal version of the
+   transform.
+  _y: The buffer to store the result in.
+      Data will be placed in every 8th entry (e.g., in a column of an 8x8
+       block).
+  _x: The input coefficients.
+      The first 8 entries are used (e.g., from a row of an 8x8 block).*/
+static void oc_fdct8(const ogg_int16_t _x[8],ogg_int16_t *_y){
+  int t0;
+  int t1;
+  int t2;
+  int t3;
+  int t4;
+  int t5;
+  int t6;
+  int t7;
+  int r;
+  int s;
+  int u;
+  int v;
+  /*Stage 1:*/
+  /*0-7 butterfly.*/
+  t0=_x[0]+(int)_x[7];
+  t7=_x[0]-(int)_x[7];
+  /*1-6 butterfly.*/
+  t1=_x[1]+(int)_x[6];
+  t6=_x[1]-(int)_x[6];
+  /*2-5 butterfly.*/
+  t2=_x[2]+(int)_x[5];
+  t5=_x[2]-(int)_x[5];
+  /*3-4 butterfly.*/
+  t3=_x[3]+(int)_x[4];
+  t4=_x[3]-(int)_x[4];
+  /*Stage 2:*/
+  /*0-3 butterfly.*/
+  r=t0+t3;
+  t3=t0-t3;
+  t0=r;
+  /*1-2 butterfly.*/
+  r=t1+t2;
+  t2=t1-t2;
+  t1=r;
+  /*6-5 butterfly.*/
+  r=t6+t5;
+  t5=t6-t5;
+  t6=r;
+  /*Stages 3 and 4 are where all the approximation occurs.
+    These are chosen to be as close to an exact inverse of the approximations
+     made in the iDCT as possible, while still using mostly 16-bit arithmetic.
+    We use some 16x16->32 signed MACs, but those still commonly execute in 1
+     cycle on a 16-bit DSP.
+    For example, s=(27146*t5+0x4000>>16)+t5+(t5!=0) is an exact inverse of
+     t5=(OC_C4S4*s>>16).
+    That is, applying the latter to the output of the former will recover t5
+     exactly (over the valid input range of t5, -23171...23169).
+    We increase the rounding bias to 0xB500 in this particular case so that
+     errors inverting the subsequent butterfly are not one-sided (i.e., the
+     mean error is very close to zero).
+    The (t5!=0) term could be replaced simply by 1, but we want to send 0 to 0.
+    The fDCT of an all-zeros block will still not be zero, because of the
+     biases we added at the very beginning of the process, but it will be close
+     enough that it is guaranteed to round to zero.*/
+  /*Stage 3:*/
+  /*4-5 butterfly.*/
+  s=(27146*t5+0xB500>>16)+t5+(t5!=0)>>1;
+  r=t4+s;
+  t5=t4-s;
+  t4=r;
+  /*7-6 butterfly.*/
+  s=(27146*t6+0xB500>>16)+t6+(t6!=0)>>1;
+  r=t7+s;
+  t6=t7-s;
+  t7=r;
+  /*Stage 4:*/
+  /*0-1 butterfly.*/
+  r=(27146*t0+0x4000>>16)+t0+(t0!=0);
+  s=(27146*t1+0xB500>>16)+t1+(t1!=0);
+  u=r+s>>1;
+  v=r-u;
+  _y[0<<3]=u;
+  _y[4<<3]=v;
+  /*3-2 rotation by 6pi/16*/
+  u=(OC_C6S2*t2+OC_C2S6*t3+0x6CB7>>16)+(t3!=0);
+  s=(OC_C6S2*u>>16)-t2;
+  v=(s*21600+0x2800>>18)+s+(s!=0);
+  _y[2<<3]=u;
+  _y[6<<3]=v;
+  /*6-5 rotation by 3pi/16*/
+  u=(OC_C5S3*t6+OC_C3S5*t5+0x0E3D>>16)+(t5!=0);
+  s=t6-(OC_C5S3*u>>16);
+  v=(s*26568+0x3400>>17)+s+(s!=0);
+  _y[5<<3]=u;
+  _y[3<<3]=v;
+  /*7-4 rotation by 7pi/16*/
+  u=(OC_C7S1*t4+OC_C1S7*t7+0x7B1B>>16)+(t7!=0);
+  s=(OC_C7S1*u>>16)-t4;
+  v=(s*20539+0x3000>>20)+s+(s!=0);
+  _y[1<<3]=u;
+  _y[7<<3]=v;
+}
 
-  ogg_int32_t  InterData[64];
-  ogg_int32_t *ip = InterData;
-  ogg_int16_t * op = OutputData;
-  for (loop = 0; loop < 8; loop++){
-    /* Pre calculate some common sums and differences. */
-    is07 = InputData[0] + InputData[7];
-    is12 = InputData[1] + InputData[2];
-    is34 = InputData[3] + InputData[4];
-    is56 = InputData[5] + InputData[6];
-
-    id07 = InputData[0] - InputData[7];
-    id12 = InputData[1] - InputData[2];
-    id34 = InputData[3] - InputData[4];
-    id56 = InputData[5] - InputData[6];
-
-    is0734 = is07 + is34;
-    is1256 = is12 + is56;
-
-    /* Pre-Calculate some common product terms. */
-    icommon_product1 = xC4S4*(is12 - is56);
-    icommon_product1 = DOROUND(icommon_product1);
-    icommon_product1>>=16;
-
-    icommon_product2 = xC4S4*(id12 + id56);
-    icommon_product2 = DOROUND(icommon_product2);
-    icommon_product2>>=16;
-
-
-    ip[0] = (xC4S4*(is0734 + is1256));
-    ip[0] = DOROUND(ip[0]);
-    ip[0] >>= 16;
-
-    ip[4] = (xC4S4*(is0734 - is1256));
-    ip[4] = DOROUND(ip[4]);
-    ip[4] >>= 16;
-
-    /* Define inputs to rotation for outputs 2 and 6 */
-    irot_input_x = id12 - id56;
-    irot_input_y = is07 - is34;
-
-    /* Apply rotation for outputs 2 and 6.  */
-    temp1=xC6S2*irot_input_x;
-    temp1=DOROUND(temp1);
-    temp1>>=16;
-    temp2=xC2S6*irot_input_y;
-    temp2=DOROUND(temp2);
-    temp2>>=16;
-    ip[2] = temp1 + temp2;
-
-    temp1=xC6S2*irot_input_y;
-    temp1=DOROUND(temp1);
-    temp1>>=16;
-    temp2=xC2S6*irot_input_x ;
-    temp2=DOROUND(temp2);
-    temp2>>=16;
-    ip[6] = temp1 -temp2 ;
-
-    /* Define inputs to rotation for outputs 1 and 7  */
-    irot_input_x = icommon_product1 + id07;
-    irot_input_y = -( id34 + icommon_product2 );
-
-    /* Apply rotation for outputs 1 and 7.  */
-
-    temp1=xC1S7*irot_input_x;
-    temp1=DOROUND(temp1);
-    temp1>>=16;
-    temp2=xC7S1*irot_input_y;
-    temp2=DOROUND(temp2);
-    temp2>>=16;
-    ip[1] = temp1 - temp2;
-
-    temp1=xC7S1*irot_input_x;
-    temp1=DOROUND(temp1);
-    temp1>>=16;
-    temp2=xC1S7*irot_input_y ;
-    temp2=DOROUND(temp2);
-    temp2>>=16;
-    ip[7] = temp1 + temp2 ;
-
-    /* Define inputs to rotation for outputs 3 and 5 */
-    irot_input_x = id07 - icommon_product1;
-    irot_input_y = id34 - icommon_product2;
-
-    /* Apply rotation for outputs 3 and 5. */
-    temp1=xC3S5*irot_input_x;
-    temp1=DOROUND(temp1);
-    temp1>>=16;
-    temp2=xC5S3*irot_input_y ;
-    temp2=DOROUND(temp2);
-    temp2>>=16;
-    ip[3] = temp1 - temp2 ;
-
-    temp1=xC5S3*irot_input_x;
-    temp1=DOROUND(temp1);
-    temp1>>=16;
-    temp2=xC3S5*irot_input_y;
-    temp2=DOROUND(temp2);
-    temp2>>=16;
-    ip[5] = temp1 + temp2;
-
-    /* Increment data pointer for next row. */
-    InputData += 8 ;
-    ip += 8; /* advance pointer to next row */
-
-  }
-
-
-  /* Performed DCT on rows, now transform the columns */
-  ip = InterData;
-  for (loop = 0; loop < 8; loop++){
-    /* Pre calculate some common sums and differences.  */
-    is07 = ip[0 * 8] + ip[7 * 8];
-    is12 = ip[1 * 8] + ip[2 * 8];
-    is34 = ip[3 * 8] + ip[4 * 8];
-    is56 = ip[5 * 8] + ip[6 * 8];
-
-    id07 = ip[0 * 8] - ip[7 * 8];
-    id12 = ip[1 * 8] - ip[2 * 8];
-    id34 = ip[3 * 8] - ip[4 * 8];
-    id56 = ip[5 * 8] - ip[6 * 8];
-
-    is0734 = is07 + is34;
-    is1256 = is12 + is56;
-
-    /* Pre-Calculate some common product terms. */
-    icommon_product1 = xC4S4*(is12 - is56) ;
-    icommon_product2 = xC4S4*(id12 + id56) ;
-    icommon_product1 = DOROUND(icommon_product1);
-    icommon_product2 = DOROUND(icommon_product2);
-    icommon_product1>>=16;
-    icommon_product2>>=16;
-
-
-    temp1 = xC4S4*(is0734 + is1256) ;
-    temp2 = xC4S4*(is0734 - is1256) ;
-    temp1 = DOROUND(temp1);
-    temp2 = DOROUND(temp2);
-    temp1>>=16;
-    temp2>>=16;
-    op[0*8] = (ogg_int16_t) temp1;
-    op[4*8] = (ogg_int16_t) temp2;
-
-    /* Define inputs to rotation for outputs 2 and 6 */
-    irot_input_x = id12 - id56;
-    irot_input_y = is07 - is34;
-
-    /* Apply rotation for outputs 2 and 6.  */
-    temp1=xC6S2*irot_input_x;
-    temp1=DOROUND(temp1);
-    temp1>>=16;
-    temp2=xC2S6*irot_input_y;
-    temp2=DOROUND(temp2);
-    temp2>>=16;
-    op[2*8] = (ogg_int16_t) (temp1 + temp2);
-
-    temp1=xC6S2*irot_input_y;
-    temp1=DOROUND(temp1);
-    temp1>>=16;
-    temp2=xC2S6*irot_input_x ;
-    temp2=DOROUND(temp2);
-    temp2>>=16;
-    op[6*8] = (ogg_int16_t) (temp1 -temp2) ;
-
-    /* Define inputs to rotation for outputs 1 and 7 */
-    irot_input_x = icommon_product1 + id07;
-    irot_input_y = -( id34 + icommon_product2 );
-
-    /* Apply rotation for outputs 1 and 7. */
-    temp1=xC1S7*irot_input_x;
-    temp1=DOROUND(temp1);
-    temp1>>=16;
-    temp2=xC7S1*irot_input_y;
-    temp2=DOROUND(temp2);
-    temp2>>=16;
-    op[1*8] = (ogg_int16_t) (temp1 - temp2);
-
-    temp1=xC7S1*irot_input_x;
-    temp1=DOROUND(temp1);
-    temp1>>=16;
-    temp2=xC1S7*irot_input_y ;
-    temp2=DOROUND(temp2);
-    temp2>>=16;
-    op[7*8] = (ogg_int16_t) (temp1 + temp2);
-
-    /* Define inputs to rotation for outputs 3 and 5 */
-    irot_input_x = id07 - icommon_product1;
-    irot_input_y = id34 - icommon_product2;
-
-    /* Apply rotation for outputs 3 and 5. */
-    temp1=xC3S5*irot_input_x;
-    temp1=DOROUND(temp1);
-    temp1>>=16;
-    temp2=xC5S3*irot_input_y ;
-    temp2=DOROUND(temp2);
-    temp2>>=16;
-    op[3*8] = (ogg_int16_t) (temp1 - temp2) ;
-
-    temp1=xC5S3*irot_input_x;
-    temp1=DOROUND(temp1);
-    temp1>>=16;
-    temp2=xC3S5*irot_input_y;
-    temp2=DOROUND(temp2);
-    temp2>>=16;
-    op[5*8] = (ogg_int16_t) (temp1 + temp2);
-
-    /* Increment data pointer for next column.  */
-    ip ++;
-    op ++;
-  }
+/*Performs a forward 8x8 Type-II DCT transform.
+  The output is scaled by a factor of 4 relative to the orthonormal version
+   of the transform.
+  _y: The buffer to store the result in.
+      This may be the same as _x.
+  _x: The input coefficients. */
+static void oc_fdct8x8_c(const ogg_int16_t _x[64],ogg_int16_t _y[64]){
+  const ogg_int16_t *in;
+  ogg_int16_t       *end;
+  ogg_int16_t       *out;
+  ogg_int16_t        w[64];
+  int                i;
+  /*Transpose the input and add two extra bits of working precision to improve
+     accuracy; any more and we could overflow.*/
+  for(i=0;i<64;i++)w[i>>3|(i&7)<<3]=_x[i]<<2;
+  /*These biases correct for some systematic error that remains in the full
+     fDCT->iDCT round trip.*/
+  w[0]+=(w[0]!=0)+1;
+  w[1]--;
+  w[8]++;
+  /*Transform rows of w into columns of _y.*/
+  for(in=w,out=_y,end=out+8;out<end;in+=8,out++)oc_fdct8(in,out);
+  /*Transform rows of w into columns of y.*/
+  for(in=_y,out=w,end=out+8;out<end;in+=8,out++)oc_fdct8(in,out);
+  /*Round the result back to the external working precision (which is still
+     scaled by four relative to the orthonormal result).
+    TODO: We should just update the external working precision.*/
+  for(i=0;i<64;i++)_y[i>>3|(i&7)<<3]=w[i]+2>>2;
 }
 
-void dsp_dct_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
-{
-  funcs->fdct_short = fdct_short__c;
-  dsp_dct_decode_init(funcs, cpu_flags);
-  dsp_idct_init(funcs, cpu_flags);
+
+void dsp_dct_init(DspFunctions *_funcs,ogg_uint32_t _cpu_flags){
+  _funcs->fdct_short=oc_fdct8x8_c;
+  dsp_dct_decode_init(_funcs,_cpu_flags);
+  dsp_idct_init(_funcs,_cpu_flags);
 #if defined(USE_ASM)
-  if (cpu_flags & OC_CPU_X86_MMX) {
-    dsp_mmx_fdct_init(funcs);
+  /*TODO: Need to write an MMX version.*/
+  if(_cpu_flags&OC_CPU_X86_MMX){
+    dsp_mmx_fdct_init(_funcs);
   }
+  if(_cpu_flags&OC_CPU_X86_SSE2){
+    dsp_sse2_fdct_init(_funcs);
+  }
 #endif
 }
-

Modified: branches/theora-thusnelda/lib/enc/dsp.h
===================================================================
--- branches/theora-thusnelda/lib/enc/dsp.h	2009-04-16 18:53:10 UTC (rev 15938)
+++ branches/theora-thusnelda/lib/enc/dsp.h	2009-04-18 07:43:25 UTC (rev 15939)
@@ -18,11 +18,13 @@
 #ifndef DSP_H
 #define DSP_H
 
+typedef struct DspFunctions DspFunctions;
+
 #include "theora/theora.h"
+#include "codec_internal.h"
 #include "../cpu.h"
 
-typedef struct
-{
+struct DspFunctions{
   void   (*save_fpu)              (void);
   void   (*restore_fpu)           (void);
 
@@ -72,7 +74,7 @@
   void (*IDct10)                  (const ogg_int16_t *InputData, 
 				   const ogg_int16_t *QuantMatrix, 
 				   ogg_int16_t *OutputData);
-} DspFunctions;
+};
 
 extern void dsp_dct_init(DspFunctions *funcs, ogg_uint32_t cpu_flags);
 extern void dsp_recon_init (DspFunctions *funcs, ogg_uint32_t cpu_flags);
@@ -88,6 +90,9 @@
 extern void dsp_mmx_recon_init(DspFunctions *funcs);
 extern void dsp_mmx_dct_decode_init(DspFunctions *funcs);
 extern void dsp_mmx_idct_init(DspFunctions *funcs);
+# if defined(__amd64__)||defined(__x86_64__)
+extern void dsp_sse2_fdct_init(DspFunctions *funcs);
+# endif
 #endif
 
 #define dsp_save_fpu(funcs) (funcs.save_fpu ())

Modified: branches/theora-thusnelda/lib/enc/encapiwrapper.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encapiwrapper.c	2009-04-16 18:53:10 UTC (rev 15938)
+++ branches/theora-thusnelda/lib/enc/encapiwrapper.c	2009-04-18 07:43:25 UTC (rev 15939)
@@ -2,6 +2,7 @@
 #include "theora/theoraenc.h"
 #include "theora/theora.h"
 #include "codec_internal.h"
+#include "mathops.h"
 #include "../dec/ocintrin.h"
 
 /*Wrapper to translate the new API into the old API.
@@ -862,14 +863,8 @@
   _ci->quick_p=1;
 }
 
-static int _ilog(unsigned _v){
-  int ret;
-  for(ret=0;_v;ret++)_v>>=1;
-  return ret;
-}
 
 
-
 struct th_enc_ctx{
   /*This is required at the start of the struct for the common functions to
      work.*/
@@ -920,7 +915,8 @@
       enc->info.target_bitrate=ci.target_bitrate;
       enc->info.fps_numerator=ci.fps_numerator;
       enc->info.fps_denominator=ci.fps_denominator;
-      enc->info.keyframe_granule_shift=_ilog(ci.keyframe_frequency_force-1);
+      enc->info.keyframe_granule_shift=
+       OC_ILOG_32(ci.keyframe_frequency_force-1);
     }
   }
   return enc;