[xiph-commits] r15939 - in branches/theora-thusnelda/lib: . enc
tterribe at svn.xiph.org
tterribe at svn.xiph.org
Sat Apr 18 00:43:26 PDT 2009
Author: tterribe
Date: 2009-04-18 00:43:25 -0700 (Sat, 18 Apr 2009)
New Revision: 15939
Modified:
branches/theora-thusnelda/lib/Makefile.am
branches/theora-thusnelda/lib/enc/codec_internal.h
branches/theora-thusnelda/lib/enc/dct.c
branches/theora-thusnelda/lib/enc/dsp.h
branches/theora-thusnelda/lib/enc/encapiwrapper.c
Log:
New fDCT, now with less leakage.
Only x86-64 asm for now, because it was easier; MMX will be added soon.
The rest of the encoder now needs to be re-tuned.
Modified: branches/theora-thusnelda/lib/Makefile.am
===================================================================
--- branches/theora-thusnelda/lib/Makefile.am 2009-04-16 18:53:10 UTC (rev 15938)
+++ branches/theora-thusnelda/lib/Makefile.am 2009-04-18 07:43:25 UTC (rev 15939)
@@ -8,6 +8,7 @@
enc/x86/dsp_mmxext.c \
enc/x86/recon_mmx.c \
enc/x86/fdct_mmx.c \
+ enc/x86/fdct_sse2.c \
enc/x86/idct_mmx.c \
enc/x86_32_vs/dsp_mmx.c \
enc/x86_32_vs/fdct_mmx.c \
@@ -48,7 +49,8 @@
enc/x86/dsp_mmxext.c \
enc/x86/recon_mmx.c \
enc/x86/idct_mmx.c \
- enc/x86/fdct_mmx.c
+ enc/x86/fdct_mmx.c \
+ enc/x86/fdct_sse2.c
if CPU_x86_64
encoder_arch_sources = $(encoder_x86_sources)
Modified: branches/theora-thusnelda/lib/enc/codec_internal.h
===================================================================
--- branches/theora-thusnelda/lib/enc/codec_internal.h 2009-04-16 18:53:10 UTC (rev 15938)
+++ branches/theora-thusnelda/lib/enc/codec_internal.h 2009-04-18 07:43:25 UTC (rev 15939)
@@ -25,6 +25,7 @@
//#define COLLECT_METRICS
#include "theora/theora.h"
+#include "../internal.h"
#include "encoder_huffman.h"
#include "../dec/ocintrin.h"
typedef struct CP_INSTANCE CP_INSTANCE;
Modified: branches/theora-thusnelda/lib/enc/dct.c
===================================================================
--- branches/theora-thusnelda/lib/enc/dct.c 2009-04-16 18:53:10 UTC (rev 15938)
+++ branches/theora-thusnelda/lib/enc/dct.c 2009-04-18 07:43:25 UTC (rev 15939)
@@ -19,250 +19,162 @@
#include "dsp.h"
#include "../cpu.h"
-static ogg_int32_t xC1S7 = 64277;
-static ogg_int32_t xC2S6 = 60547;
-static ogg_int32_t xC3S5 = 54491;
-static ogg_int32_t xC4S4 = 46341;
-static ogg_int32_t xC5S3 = 36410;
-static ogg_int32_t xC6S2 = 25080;
-static ogg_int32_t xC7S1 = 12785;
-#define SIGNBITDUPPED(X) ((signed )(((X) & 0x80000000)) >> 31)
-#define DOROUND(X) ( (SIGNBITDUPPED(X) & (0xffff)) + (X) )
-static void fdct_short__c ( ogg_int16_t * InputData, ogg_int16_t * OutputData ){
- int loop;
+#define OC_C1S7 (64277)
+#define OC_C2S6 (60547)
+#define OC_C3S5 (54491)
+#define OC_C4S4 (46341)
+#define OC_C5S3 (36410)
+#define OC_C6S2 (25080)
+#define OC_C7S1 (12785)
- ogg_int32_t is07, is12, is34, is56;
- ogg_int32_t is0734, is1256;
- ogg_int32_t id07, id12, id34, id56;
- ogg_int32_t irot_input_x, irot_input_y;
- ogg_int32_t icommon_product1; /* Re-used product (c4s4 * (s12 - s56)). */
- ogg_int32_t icommon_product2; /* Re-used product (c4s4 * (d12 + d56)). */
- ogg_int32_t temp1, temp2; /* intermediate variable for computation */
+/*Performs a forward 8 point Type-II DCT transform.
+ The output is scaled by a factor of 2 from the orthonormal version of the
+ transform.
+ _y: The buffer to store the result in.
+ Data will be placed in every 8th entry (e.g., in a column of an 8x8
+ block).
+ _x: The input coefficients.
+ The first 8 entries are used (e.g., from a row of an 8x8 block).*/
+static void oc_fdct8(const ogg_int16_t _x[8],ogg_int16_t *_y){
+ int t0;
+ int t1;
+ int t2;
+ int t3;
+ int t4;
+ int t5;
+ int t6;
+ int t7;
+ int r;
+ int s;
+ int u;
+ int v;
+ /*Stage 1:*/
+ /*0-7 butterfly.*/
+ t0=_x[0]+(int)_x[7];
+ t7=_x[0]-(int)_x[7];
+ /*1-6 butterfly.*/
+ t1=_x[1]+(int)_x[6];
+ t6=_x[1]-(int)_x[6];
+ /*2-5 butterfly.*/
+ t2=_x[2]+(int)_x[5];
+ t5=_x[2]-(int)_x[5];
+ /*3-4 butterfly.*/
+ t3=_x[3]+(int)_x[4];
+ t4=_x[3]-(int)_x[4];
+ /*Stage 2:*/
+ /*0-3 butterfly.*/
+ r=t0+t3;
+ t3=t0-t3;
+ t0=r;
+ /*1-2 butterfly.*/
+ r=t1+t2;
+ t2=t1-t2;
+ t1=r;
+ /*6-5 butterfly.*/
+ r=t6+t5;
+ t5=t6-t5;
+ t6=r;
+ /*Stages 3 and 4 are where all the approximation occurs.
+ These are chosen to be as close to an exact inverse of the approximations
+ made in the iDCT as possible, while still using mostly 16-bit arithmetic.
+ We use some 16x16->32 signed MACs, but those still commonly execute in 1
+ cycle on a 16-bit DSP.
+ For example, s=(27146*t5+0x4000>>16)+t5+(t5!=0) is an exact inverse of
+ t5=(OC_C4S4*s>>16).
+ That is, applying the latter to the output of the former will recover t5
+ exactly (over the valid input range of t5, -23171...23169).
+ We increase the rounding bias to 0xB500 in this particular case so that
+ errors inverting the subsequent butterfly are not one-sided (e.g., the
+ mean error is very close to zero).
+ The (t5!=0) term could be replaced simply by 1, but we want to send 0 to 0.
+ The fDCT of an all-zeros block will still not be zero, because of the
+ biases we added at the very beginning of the process, but it will be close
+ enough that it is guaranteed to round to zero.*/
+ /*Stage 3:*/
+ /*4-5 butterfly.*/
+ s=(27146*t5+0xB500>>16)+t5+(t5!=0)>>1;
+ r=t4+s;
+ t5=t4-s;
+ t4=r;
+ /*7-6 butterfly.*/
+ s=(27146*t6+0xB500>>16)+t6+(t6!=0)>>1;
+ r=t7+s;
+ t6=t7-s;
+ t7=r;
+ /*Stage 4:*/
+ /*0-1 butterfly.*/
+ r=(27146*t0+0x4000>>16)+t0+(t0!=0);
+ s=(27146*t1+0xB500>>16)+t1+(t1!=0);
+ u=r+s>>1;
+ v=r-u;
+ _y[0<<3]=u;
+ _y[4<<3]=v;
+ /*3-2 rotation by 6pi/16*/
+ u=(OC_C6S2*t2+OC_C2S6*t3+0x6CB7>>16)+(t3!=0);
+ s=(OC_C6S2*u>>16)-t2;
+ v=(s*21600+0x2800>>18)+s+(s!=0);
+ _y[2<<3]=u;
+ _y[6<<3]=v;
+ /*6-5 rotation by 3pi/16*/
+ u=(OC_C5S3*t6+OC_C3S5*t5+0x0E3D>>16)+(t5!=0);
+ s=t6-(OC_C5S3*u>>16);
+ v=(s*26568+0x3400>>17)+s+(s!=0);
+ _y[5<<3]=u;
+ _y[3<<3]=v;
+ /*7-4 rotation by 7pi/16*/
+ u=(OC_C7S1*t4+OC_C1S7*t7+0x7B1B>>16)+(t7!=0);
+ s=(OC_C7S1*u>>16)-t4;
+ v=(s*20539+0x3000>>20)+s+(s!=0);
+ _y[1<<3]=u;
+ _y[7<<3]=v;
+}
- ogg_int32_t InterData[64];
- ogg_int32_t *ip = InterData;
- ogg_int16_t * op = OutputData;
- for (loop = 0; loop < 8; loop++){
- /* Pre calculate some common sums and differences. */
- is07 = InputData[0] + InputData[7];
- is12 = InputData[1] + InputData[2];
- is34 = InputData[3] + InputData[4];
- is56 = InputData[5] + InputData[6];
-
- id07 = InputData[0] - InputData[7];
- id12 = InputData[1] - InputData[2];
- id34 = InputData[3] - InputData[4];
- id56 = InputData[5] - InputData[6];
-
- is0734 = is07 + is34;
- is1256 = is12 + is56;
-
- /* Pre-Calculate some common product terms. */
- icommon_product1 = xC4S4*(is12 - is56);
- icommon_product1 = DOROUND(icommon_product1);
- icommon_product1>>=16;
-
- icommon_product2 = xC4S4*(id12 + id56);
- icommon_product2 = DOROUND(icommon_product2);
- icommon_product2>>=16;
-
-
- ip[0] = (xC4S4*(is0734 + is1256));
- ip[0] = DOROUND(ip[0]);
- ip[0] >>= 16;
-
- ip[4] = (xC4S4*(is0734 - is1256));
- ip[4] = DOROUND(ip[4]);
- ip[4] >>= 16;
-
- /* Define inputs to rotation for outputs 2 and 6 */
- irot_input_x = id12 - id56;
- irot_input_y = is07 - is34;
-
- /* Apply rotation for outputs 2 and 6. */
- temp1=xC6S2*irot_input_x;
- temp1=DOROUND(temp1);
- temp1>>=16;
- temp2=xC2S6*irot_input_y;
- temp2=DOROUND(temp2);
- temp2>>=16;
- ip[2] = temp1 + temp2;
-
- temp1=xC6S2*irot_input_y;
- temp1=DOROUND(temp1);
- temp1>>=16;
- temp2=xC2S6*irot_input_x ;
- temp2=DOROUND(temp2);
- temp2>>=16;
- ip[6] = temp1 -temp2 ;
-
- /* Define inputs to rotation for outputs 1 and 7 */
- irot_input_x = icommon_product1 + id07;
- irot_input_y = -( id34 + icommon_product2 );
-
- /* Apply rotation for outputs 1 and 7. */
-
- temp1=xC1S7*irot_input_x;
- temp1=DOROUND(temp1);
- temp1>>=16;
- temp2=xC7S1*irot_input_y;
- temp2=DOROUND(temp2);
- temp2>>=16;
- ip[1] = temp1 - temp2;
-
- temp1=xC7S1*irot_input_x;
- temp1=DOROUND(temp1);
- temp1>>=16;
- temp2=xC1S7*irot_input_y ;
- temp2=DOROUND(temp2);
- temp2>>=16;
- ip[7] = temp1 + temp2 ;
-
- /* Define inputs to rotation for outputs 3 and 5 */
- irot_input_x = id07 - icommon_product1;
- irot_input_y = id34 - icommon_product2;
-
- /* Apply rotation for outputs 3 and 5. */
- temp1=xC3S5*irot_input_x;
- temp1=DOROUND(temp1);
- temp1>>=16;
- temp2=xC5S3*irot_input_y ;
- temp2=DOROUND(temp2);
- temp2>>=16;
- ip[3] = temp1 - temp2 ;
-
- temp1=xC5S3*irot_input_x;
- temp1=DOROUND(temp1);
- temp1>>=16;
- temp2=xC3S5*irot_input_y;
- temp2=DOROUND(temp2);
- temp2>>=16;
- ip[5] = temp1 + temp2;
-
- /* Increment data pointer for next row. */
- InputData += 8 ;
- ip += 8; /* advance pointer to next row */
-
- }
-
-
- /* Performed DCT on rows, now transform the columns */
- ip = InterData;
- for (loop = 0; loop < 8; loop++){
- /* Pre calculate some common sums and differences. */
- is07 = ip[0 * 8] + ip[7 * 8];
- is12 = ip[1 * 8] + ip[2 * 8];
- is34 = ip[3 * 8] + ip[4 * 8];
- is56 = ip[5 * 8] + ip[6 * 8];
-
- id07 = ip[0 * 8] - ip[7 * 8];
- id12 = ip[1 * 8] - ip[2 * 8];
- id34 = ip[3 * 8] - ip[4 * 8];
- id56 = ip[5 * 8] - ip[6 * 8];
-
- is0734 = is07 + is34;
- is1256 = is12 + is56;
-
- /* Pre-Calculate some common product terms. */
- icommon_product1 = xC4S4*(is12 - is56) ;
- icommon_product2 = xC4S4*(id12 + id56) ;
- icommon_product1 = DOROUND(icommon_product1);
- icommon_product2 = DOROUND(icommon_product2);
- icommon_product1>>=16;
- icommon_product2>>=16;
-
-
- temp1 = xC4S4*(is0734 + is1256) ;
- temp2 = xC4S4*(is0734 - is1256) ;
- temp1 = DOROUND(temp1);
- temp2 = DOROUND(temp2);
- temp1>>=16;
- temp2>>=16;
- op[0*8] = (ogg_int16_t) temp1;
- op[4*8] = (ogg_int16_t) temp2;
-
- /* Define inputs to rotation for outputs 2 and 6 */
- irot_input_x = id12 - id56;
- irot_input_y = is07 - is34;
-
- /* Apply rotation for outputs 2 and 6. */
- temp1=xC6S2*irot_input_x;
- temp1=DOROUND(temp1);
- temp1>>=16;
- temp2=xC2S6*irot_input_y;
- temp2=DOROUND(temp2);
- temp2>>=16;
- op[2*8] = (ogg_int16_t) (temp1 + temp2);
-
- temp1=xC6S2*irot_input_y;
- temp1=DOROUND(temp1);
- temp1>>=16;
- temp2=xC2S6*irot_input_x ;
- temp2=DOROUND(temp2);
- temp2>>=16;
- op[6*8] = (ogg_int16_t) (temp1 -temp2) ;
-
- /* Define inputs to rotation for outputs 1 and 7 */
- irot_input_x = icommon_product1 + id07;
- irot_input_y = -( id34 + icommon_product2 );
-
- /* Apply rotation for outputs 1 and 7. */
- temp1=xC1S7*irot_input_x;
- temp1=DOROUND(temp1);
- temp1>>=16;
- temp2=xC7S1*irot_input_y;
- temp2=DOROUND(temp2);
- temp2>>=16;
- op[1*8] = (ogg_int16_t) (temp1 - temp2);
-
- temp1=xC7S1*irot_input_x;
- temp1=DOROUND(temp1);
- temp1>>=16;
- temp2=xC1S7*irot_input_y ;
- temp2=DOROUND(temp2);
- temp2>>=16;
- op[7*8] = (ogg_int16_t) (temp1 + temp2);
-
- /* Define inputs to rotation for outputs 3 and 5 */
- irot_input_x = id07 - icommon_product1;
- irot_input_y = id34 - icommon_product2;
-
- /* Apply rotation for outputs 3 and 5. */
- temp1=xC3S5*irot_input_x;
- temp1=DOROUND(temp1);
- temp1>>=16;
- temp2=xC5S3*irot_input_y ;
- temp2=DOROUND(temp2);
- temp2>>=16;
- op[3*8] = (ogg_int16_t) (temp1 - temp2) ;
-
- temp1=xC5S3*irot_input_x;
- temp1=DOROUND(temp1);
- temp1>>=16;
- temp2=xC3S5*irot_input_y;
- temp2=DOROUND(temp2);
- temp2>>=16;
- op[5*8] = (ogg_int16_t) (temp1 + temp2);
-
- /* Increment data pointer for next column. */
- ip ++;
- op ++;
- }
+/*Performs a forward 8x8 Type-II DCT transform.
+ The output is scaled by a factor of 4 relative to the orthonormal version
+ of the transform.
+ _y: The buffer to store the result in.
+ This may be the same as _x.
+ _x: The input coefficients. */
+static void oc_fdct8x8_c(const ogg_int16_t _x[64],ogg_int16_t _y[64]){
+ const ogg_int16_t *in;
+ ogg_int16_t *end;
+ ogg_int16_t *out;
+ ogg_int16_t w[64];
+ int i;
+ /*Add two extra bits of working precision to improve accuracy; any more and
+ we could overflow.*/
+ for(i=0;i<64;i++)w[i>>3|(i&7)<<3]=_x[i]<<2;
+ /*These biases correct for some systematic error that remains in the full
+ fDCT->iDCT round trip.*/
+ w[0]+=(w[0]!=0)+1;
+ w[1]--;
+ w[8]++;
+ /*Transform rows of x into columns of w.*/
+ for(in=w,out=_y,end=out+8;out<end;in+=8,out++)oc_fdct8(in,out);
+ /*Transform rows of w into columns of y.*/
+ for(in=_y,out=w,end=out+8;out<end;in+=8,out++)oc_fdct8(in,out);
+ /*Round the result back to the external working precision (which is still
+ scaled by four relative to the orthonormal result).
+ TODO: We should just update the external working precision.*/
+ for(i=0;i<64;i++)_y[i>>3|(i&7)<<3]=w[i]+2>>2;
}
-void dsp_dct_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
-{
- funcs->fdct_short = fdct_short__c;
- dsp_dct_decode_init(funcs, cpu_flags);
- dsp_idct_init(funcs, cpu_flags);
+
+void dsp_dct_init(DspFunctions *_funcs,ogg_uint32_t _cpu_flags){
+ _funcs->fdct_short=oc_fdct8x8_c;
+ dsp_dct_decode_init(_funcs,_cpu_flags);
+ dsp_idct_init(_funcs,_cpu_flags);
#if defined(USE_ASM)
- if (cpu_flags & OC_CPU_X86_MMX) {
- dsp_mmx_fdct_init(funcs);
+ /*TODO: Need to write an MMX version.*/
+ if(_cpu_flags&OC_CPU_X86_MMX){
+ dsp_mmx_fdct_init(_funcs);
}
+ if(_cpu_flags&OC_CPU_X86_SSE2){
+ dsp_sse2_fdct_init(_funcs);
+ }
#endif
}
-
Modified: branches/theora-thusnelda/lib/enc/dsp.h
===================================================================
--- branches/theora-thusnelda/lib/enc/dsp.h 2009-04-16 18:53:10 UTC (rev 15938)
+++ branches/theora-thusnelda/lib/enc/dsp.h 2009-04-18 07:43:25 UTC (rev 15939)
@@ -18,11 +18,13 @@
#ifndef DSP_H
#define DSP_H
+typedef struct DspFunctions DspFunctions;
+
#include "theora/theora.h"
+#include "codec_internal.h"
#include "../cpu.h"
-typedef struct
-{
+struct DspFunctions{
void (*save_fpu) (void);
void (*restore_fpu) (void);
@@ -72,7 +74,7 @@
void (*IDct10) (const ogg_int16_t *InputData,
const ogg_int16_t *QuantMatrix,
ogg_int16_t *OutputData);
-} DspFunctions;
+};
extern void dsp_dct_init(DspFunctions *funcs, ogg_uint32_t cpu_flags);
extern void dsp_recon_init (DspFunctions *funcs, ogg_uint32_t cpu_flags);
@@ -88,6 +90,9 @@
extern void dsp_mmx_recon_init(DspFunctions *funcs);
extern void dsp_mmx_dct_decode_init(DspFunctions *funcs);
extern void dsp_mmx_idct_init(DspFunctions *funcs);
+# if defined(__amd64__)||defined(__x86_64__)
+extern void dsp_sse2_fdct_init(DspFunctions *funcs);
+# endif
#endif
#define dsp_save_fpu(funcs) (funcs.save_fpu ())
Modified: branches/theora-thusnelda/lib/enc/encapiwrapper.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encapiwrapper.c 2009-04-16 18:53:10 UTC (rev 15938)
+++ branches/theora-thusnelda/lib/enc/encapiwrapper.c 2009-04-18 07:43:25 UTC (rev 15939)
@@ -2,6 +2,7 @@
#include "theora/theoraenc.h"
#include "theora/theora.h"
#include "codec_internal.h"
+#include "mathops.h"
#include "../dec/ocintrin.h"
/*Wrapper to translate the new API into the old API.
@@ -862,14 +863,8 @@
_ci->quick_p=1;
}
-static int _ilog(unsigned _v){
- int ret;
- for(ret=0;_v;ret++)_v>>=1;
- return ret;
-}
-
struct th_enc_ctx{
/*This is required at the start of the struct for the common functions to
work.*/
@@ -920,7 +915,8 @@
enc->info.target_bitrate=ci.target_bitrate;
enc->info.fps_numerator=ci.fps_numerator;
enc->info.fps_denominator=ci.fps_denominator;
- enc->info.keyframe_granule_shift=_ilog(ci.keyframe_frequency_force-1);
+ enc->info.keyframe_granule_shift=
+ OC_ILOG_32(ci.keyframe_frequency_force-1);
}
}
return enc;
More information about the commits mailing list