[xiph-commits] r10677 - experimental/derf/theora-exp/lib
experimental/derf/theora-exp/lib/x86 trunk/theora/doc/spec
trunk/theora/lib
tterribe at svn.xiph.org
tterribe at svn.xiph.org
Fri Dec 23 11:07:50 PST 2005
Author: tterribe
Date: 2005-12-23 11:07:46 -0800 (Fri, 23 Dec 2005)
New Revision: 10677
Modified:
experimental/derf/theora-exp/lib/idct.c
experimental/derf/theora-exp/lib/x86/mmxidct.c
trunk/theora/doc/spec/spec.tex
trunk/theora/lib/idct.c
Log:
Change the iDCT specification to be more friendly towards 16-bit
implementations (e.g., MMX).
The only thing that needed to change was how overflow was handled, which was
already not consistent between the C code and various SIMD implementations
used in VP3.
What changed:
1) Truncation to 16-bits before the C4S4 multiplies in the C code.
2) Conversion of saturated arithmetic to unsaturated in the MMX code.
This one is arguable, as saturated arithmetic theoretically provides a
better SNR when overflow does occur.
However, overflow is only possible with _severe_ quantization distortions
that line up in just the right way, and these changes do not affect any of
my test clips.
However, avoiding saturation means that operations are associative,
commutative, and distributive, and thus gives much greater opportunities
for re-ordering and rescheduling them.
In particular, it allows for the transformation from the original DCT
factorization used in VP3 to the more well-known Chen factorization
described in the spec while maintaining bit-exact equivalence in the
output.
These re-ordering and scheduling possibilities could allow for performance
optimization on some platforms, and unsaturated arithmetic in general is
much simpler to implement with dedicated hardware.
Not to mention that saturated arithmetic is dog-slow in C by comparison.
Thus, I've elected to go the unsaturated route.
Modified: experimental/derf/theora-exp/lib/idct.c
===================================================================
--- experimental/derf/theora-exp/lib/idct.c 2005-12-23 07:17:56 UTC (rev 10676)
+++ experimental/derf/theora-exp/lib/idct.c 2005-12-23 19:07:46 UTC (rev 10677)
@@ -16,8 +16,8 @@
ogg_int32_t r;
/*Stage 1:*/
/*0-1 butterfly.*/
- t[0]=OC_C4S4*(_x[0]+(ogg_int32_t)_x[4])>>16;
- t[1]=OC_C4S4*(_x[0]-(ogg_int32_t)_x[4])>>16;
+ t[0]=OC_C4S4*(ogg_int16_t)(_x[0]+_x[4])>>16;
+ t[1]=OC_C4S4*(ogg_int16_t)(_x[0]-_x[4])>>16;
/*2-3 rotation by 6pi/16.*/
t[2]=(OC_C6S2*_x[2]>>16)-(OC_C2S6*_x[6]>>16);
t[3]=(OC_C2S6*_x[2]>>16)+(OC_C6S2*_x[6]>>16);
@@ -30,11 +30,11 @@
/*Stage 2:*/
/*4-5 butterfly.*/
r=t[4]+t[5];
- t[5]=OC_C4S4*(t[4]-t[5])>>16;
+ t[5]=OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16;
t[4]=r;
/*7-6 butterfly.*/
r=t[7]+t[6];
- t[6]=OC_C4S4*(t[7]-t[6])>>16;
+ t[6]=OC_C4S4*(ogg_int16_t)(t[7]-t[6])>>16;
t[7]=r;
/*Stage 3:*/
/*0-3 butterfly.*/
@@ -86,10 +86,10 @@
t[7]=OC_C1S7*_x[1]>>16;
/*Stage 2:*/
r=t[4]+t[5];
- t[5]=OC_C4S4*(t[4]-t[5])>>16;
+ t[5]=OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16;
t[4]=r;
r=t[7]+t[6];
- t[6]=OC_C4S4*(t[7]-t[6])>>16;
+ t[6]=OC_C4S4*(ogg_int16_t)(t[7]-t[6])>>16;
t[7]=r;
/*Stage 3:*/
t[1]=t[0]+t[2];
Modified: experimental/derf/theora-exp/lib/x86/mmxidct.c
===================================================================
--- experimental/derf/theora-exp/lib/x86/mmxidct.c 2005-12-23 07:17:56 UTC (rev 10676)
+++ experimental/derf/theora-exp/lib/x86/mmxidct.c 2005-12-23 19:07:46 UTC (rev 10677)
@@ -64,10 +64,10 @@
" paddw %mm5, %mm7\n\t" \
" movq %mm0, %mm5\n\t" \
" pmulhw %mm3, %mm0\n\t" \
- " paddsw %mm7, %mm4\n\t" \
+ " paddw %mm7, %mm4\n\t" \
" pmulhw %mm1, %mm5\n\t" \
" movq "OC_C(7)", %mm7\n\t" \
- " psubsw %mm2, %mm6\n\t" \
+ " psubw %mm2, %mm6\n\t" \
" paddw %mm3, %mm0\n\t" \
" pmulhw %mm7, %mm3\n\t" \
" movq "OC_I(2)", %mm2\n\t" \
@@ -75,50 +75,50 @@
" paddw %mm1, %mm5\n\t" \
" movq %mm2, %mm1\n\t" \
" pmulhw "OC_C(2)", %mm2\n\t" \
- " psubsw %mm5, %mm3\n\t" \
+ " psubw %mm5, %mm3\n\t" \
" movq "OC_J(6)", %mm5\n\t" \
- " paddsw %mm7, %mm0\n\t" \
+ " paddw %mm7, %mm0\n\t" \
" movq %mm5, %mm7\n\t" \
- " psubsw %mm4, %mm0\n\t" \
+ " psubw %mm4, %mm0\n\t" \
" pmulhw "OC_C(2)", %mm5\n\t" \
" paddw %mm1, %mm2\n\t" \
" pmulhw "OC_C(6)", %mm1\n\t" \
- " paddsw %mm4, %mm4\n\t" \
- " paddsw %mm0, %mm4\n\t" \
- " psubsw %mm6, %mm3\n\t" \
+ " paddw %mm4, %mm4\n\t" \
+ " paddw %mm0, %mm4\n\t" \
+ " psubw %mm6, %mm3\n\t" \
" paddw %mm7, %mm5\n\t" \
- " paddsw %mm6, %mm6\n\t" \
+ " paddw %mm6, %mm6\n\t" \
" pmulhw "OC_C(6)", %mm7\n\t" \
- " paddsw %mm3, %mm6\n\t" \
+ " paddw %mm3, %mm6\n\t" \
" movq %mm4,"OC_I(1)"\n\t" \
- " psubsw %mm5, %mm1\n\t" \
+ " psubw %mm5, %mm1\n\t" \
" movq "OC_C(4)", %mm4\n\t" \
" movq %mm3, %mm5\n\t" \
" pmulhw %mm4, %mm3\n\t" \
- " paddsw %mm2, %mm7\n\t" \
+ " paddw %mm2, %mm7\n\t" \
" movq %mm6,"OC_I(2)"\n\t" \
" movq %mm0, %mm2\n\t" \
" movq "OC_I(0)", %mm6\n\t" \
" pmulhw %mm4, %mm0\n\t" \
" paddw %mm3, %mm5\n\t" \
" movq "OC_J(4)", %mm3\n\t" \
- " psubsw %mm1, %mm5\n\t" \
+ " psubw %mm1, %mm5\n\t" \
" paddw %mm0, %mm2\n\t" \
- " psubsw %mm3, %mm6\n\t" \
+ " psubw %mm3, %mm6\n\t" \
" movq %mm6, %mm0\n\t" \
" pmulhw %mm4, %mm6\n\t" \
- " paddsw %mm3, %mm3\n\t" \
- " paddsw %mm1, %mm1\n\t" \
- " paddsw %mm0, %mm3\n\t" \
- " paddsw %mm5, %mm1\n\t" \
+ " paddw %mm3, %mm3\n\t" \
+ " paddw %mm1, %mm1\n\t" \
+ " paddw %mm0, %mm3\n\t" \
+ " paddw %mm5, %mm1\n\t" \
" pmulhw %mm3, %mm4\n\t" \
- " paddsw %mm0, %mm6\n\t" \
- " psubsw %mm2, %mm6\n\t" \
- " paddsw %mm2, %mm2\n\t" \
+ " paddw %mm0, %mm6\n\t" \
+ " psubw %mm2, %mm6\n\t" \
+ " paddw %mm2, %mm2\n\t" \
" movq "OC_I(1)", %mm0\n\t" \
- " paddsw %mm6, %mm2\n\t" \
+ " paddw %mm6, %mm2\n\t" \
" paddw %mm3, %mm4\n\t" \
- " psubsw %mm1, %mm2\n\t" \
+ " psubw %mm1, %mm2\n\t" \
"#end OC_IDCT_BEGIN\n\t"
/*38+8=46 cycles.*/
@@ -126,21 +126,21 @@
" #OC_ROW_IDCT\n" \
OC_IDCT_BEGIN \
" movq "OC_I(2)", %mm3\n\t" /* r3 = D. */ \
- " psubsw %mm7, %mm4\n\t" /* r4 = E. = E - G */ \
- " paddsw %mm1, %mm1\n\t" /* r1 = H. + H. */ \
- " paddsw %mm7, %mm7\n\t" /* r7 = G + G */ \
- " paddsw %mm2, %mm1\n\t" /* r1 = R1 = A.. + H. */ \
- " paddsw %mm4, %mm7\n\t" /* r7 = G. = E + G */ \
- " psubsw %mm3, %mm4\n\t" /* r4 = R4 = E. - D. */ \
- " paddsw %mm3, %mm3\n\t" \
- " psubsw %mm5, %mm6\n\t" /* r6 = R6 = F. - B.. */ \
- " paddsw %mm5, %mm5\n\t" \
- " paddsw %mm4, %mm3\n\t" /* r3 = R3 = E. + D. */ \
- " paddsw %mm6, %mm5\n\t" /* r5 = R5 = F. + B.. */ \
- " psubsw %mm0, %mm7\n\t" /* r7 = R7 = G. - C. */ \
- " paddsw %mm0, %mm0\n\t" \
+ " psubw %mm7, %mm4\n\t" /* r4 = E. = E - G */ \
+ " paddw %mm1, %mm1\n\t" /* r1 = H. + H. */ \
+ " paddw %mm7, %mm7\n\t" /* r7 = G + G */ \
+ " paddw %mm2, %mm1\n\t" /* r1 = R1 = A.. + H. */ \
+ " paddw %mm4, %mm7\n\t" /* r7 = G. = E + G */ \
+ " psubw %mm3, %mm4\n\t" /* r4 = R4 = E. - D. */ \
+ " paddw %mm3, %mm3\n\t" \
+ " psubw %mm5, %mm6\n\t" /* r6 = R6 = F. - B.. */ \
+ " paddw %mm5, %mm5\n\t" \
+ " paddw %mm4, %mm3\n\t" /* r3 = R3 = E. + D. */ \
+ " paddw %mm6, %mm5\n\t" /* r5 = R5 = F. + B.. */ \
+ " psubw %mm0, %mm7\n\t" /* r7 = R7 = G. - C. */ \
+ " paddw %mm0, %mm0\n\t" \
" movq %mm1,"OC_I(1)"\n\t" /* save R1 */ \
- " paddsw %mm7, %mm0\n\t" /* r0 = R0 = G. + C. */ \
+ " paddw %mm7, %mm0\n\t" /* r0 = R0 = G. + C. */ \
"#end OC_ROW_IDCT\n\t" \
)
@@ -216,35 +216,35 @@
#define OC_COLUMN_IDCT __asm__ __volatile__( \
" #OC_COLUMN_IDCT\n" \
OC_IDCT_BEGIN \
- " paddsw "OC_8", %mm2\n\t" \
- " paddsw %mm1, %mm1\n\t" /* r1 = H. + H. */ \
- " paddsw %mm2, %mm1\n\t" /* r1 = R1 = A.. + H. */ \
+ " paddw "OC_8", %mm2\n\t" \
+ " paddw %mm1, %mm1\n\t" /* r1 = H. + H. */ \
+ " paddw %mm2, %mm1\n\t" /* r1 = R1 = A.. + H. */ \
" psraw $4, %mm2\n\t" /* r2 = NR2 */ \
- " psubsw %mm7, %mm4\n\t" /* r4 = E. = E - G */ \
+ " psubw %mm7, %mm4\n\t" /* r4 = E. = E - G */ \
" psraw $4, %mm1\n\t" /* r1 = NR1 */ \
" movq "OC_I(2)", %mm3\n\t" /* r3 = D. */ \
- " paddsw %mm7, %mm7\n\t" /* r7 = G + G */ \
+ " paddw %mm7, %mm7\n\t" /* r7 = G + G */ \
" movq %mm2,"OC_I(2)"\n\t" /* store NR2 at I2 */ \
- " paddsw %mm4, %mm7\n\t" /* r7 = G. = E + G */ \
+ " paddw %mm4, %mm7\n\t" /* r7 = G. = E + G */ \
" movq %mm1,"OC_I(1)"\n\t" /* store NR1 at I1 */ \
- " psubsw %mm3, %mm4\n\t" /* r4 = R4 = E. - D. */ \
- " paddsw "OC_8", %mm4\n\t" \
- " paddsw %mm3, %mm3\n\t" /* r3 = D. + D. */ \
- " paddsw %mm4, %mm3\n\t" /* r3 = R3 = E. + D. */ \
+ " psubw %mm3, %mm4\n\t" /* r4 = R4 = E. - D. */ \
+ " paddw "OC_8", %mm4\n\t" \
+ " paddw %mm3, %mm3\n\t" /* r3 = D. + D. */ \
+ " paddw %mm4, %mm3\n\t" /* r3 = R3 = E. + D. */ \
" psraw $4, %mm4\n\t" /* r4 = NR4 */ \
- " psubsw %mm5, %mm6\n\t" /* r6 = R6 = F. - B.. */ \
+ " psubw %mm5, %mm6\n\t" /* r6 = R6 = F. - B.. */ \
" psraw $4, %mm3\n\t" /* r3 = NR3 */ \
- " paddsw "OC_8", %mm6\n\t" \
- " paddsw %mm5, %mm5\n\t" /* r5 = B.. + B.. */ \
- " paddsw %mm6, %mm5\n\t" /* r5 = R5 = F. + B.. */ \
+ " paddw "OC_8", %mm6\n\t" \
+ " paddw %mm5, %mm5\n\t" /* r5 = B.. + B.. */ \
+ " paddw %mm6, %mm5\n\t" /* r5 = R5 = F. + B.. */ \
" psraw $4, %mm6\n\t" /* r6 = NR6 */ \
" movq %mm4,"OC_J(4)"\n\t" /* store NR4 at J4 */ \
" psraw $4, %mm5\n\t" /* r5 = NR5 */ \
" movq %mm3,"OC_I(3)"\n\t" /* store NR3 at I3 */ \
- " psubsw %mm0, %mm7\n\t" /* r7 = R7 = G. - C. */ \
- " paddsw "OC_8", %mm7\n\t" \
- " paddsw %mm0, %mm0\n\t" /* r0 = C. + C. */ \
- " paddsw %mm7, %mm0\n\t" /* r0 = R0 = G. + C. */ \
+ " psubw %mm0, %mm7\n\t" /* r7 = R7 = G. - C. */ \
+ " paddw "OC_8", %mm7\n\t" \
+ " paddw %mm0, %mm0\n\t" /* r0 = C. + C. */ \
+ " paddw %mm7, %mm0\n\t" /* r0 = R0 = G. + C. */ \
" psraw $4, %mm7\n\t" /* r7 = NR7 */ \
" movq %mm6,"OC_J(6)"\n\t" /* store NR6 at J6 */ \
" psraw $4, %mm0\n\t" /* r0 = NR0 */ \
@@ -331,19 +331,19 @@
" movq %mm5, %mm1\n\t" \
" paddw %mm3, %mm0\n\t" \
" pmulhw "OC_C(7)", %mm3\n\t" \
- " psubsw %mm2, %mm6\n\t" \
+ " psubw %mm2, %mm6\n\t" \
" pmulhw "OC_C(2)", %mm5\n\t" \
- " psubsw %mm4, %mm0\n\t" \
+ " psubw %mm4, %mm0\n\t" \
" movq "OC_I(2)", %mm7\n\t" \
- " paddsw %mm4, %mm4\n\t" \
+ " paddw %mm4, %mm4\n\t" \
" paddw %mm5, %mm7\n\t" \
- " paddsw %mm0, %mm4\n\t" \
+ " paddw %mm0, %mm4\n\t" \
" pmulhw "OC_C(6)", %mm1\n\t" \
- " psubsw %mm6, %mm3\n\t" \
+ " psubw %mm6, %mm3\n\t" \
" movq %mm4,"OC_I(1)"\n\t" \
- " paddsw %mm6, %mm6\n\t" \
+ " paddw %mm6, %mm6\n\t" \
" movq "OC_C(4)", %mm4\n\t" \
- " paddsw %mm3, %mm6\n\t" \
+ " paddw %mm3, %mm6\n\t" \
" movq %mm3, %mm5\n\t" \
" pmulhw %mm4, %mm3\n\t" \
" movq %mm6,"OC_I(2)"\n\t" \
@@ -352,17 +352,17 @@
" pmulhw %mm4, %mm0\n\t" \
" paddw %mm3, %mm5\n\t" \
" paddw %mm0, %mm2\n\t" \
- " psubsw %mm1, %mm5\n\t" \
+ " psubw %mm1, %mm5\n\t" \
" pmulhw %mm4, %mm6\n\t" \
" paddw "OC_I(0)", %mm6\n\t" \
- " paddsw %mm1, %mm1\n\t" \
+ " paddw %mm1, %mm1\n\t" \
" movq %mm6, %mm4\n\t" \
- " paddsw %mm5, %mm1\n\t" \
- " psubsw %mm2, %mm6\n\t" \
- " paddsw %mm2, %mm2\n\t" \
+ " paddw %mm5, %mm1\n\t" \
+ " psubw %mm2, %mm6\n\t" \
+ " paddw %mm2, %mm2\n\t" \
" movq "OC_I(1)", %mm0\n\t" \
- " paddsw %mm6, %mm2\n\t" \
- " psubsw %mm1, %mm2\n\t" \
+ " paddw %mm6, %mm2\n\t" \
+ " psubw %mm1, %mm2\n\t" \
" nop\n\t" \
" #end OC_IDCT_BEGIN_10\n\t"
@@ -371,21 +371,21 @@
" #OC_ROW_IDCT_10\n\t" \
OC_IDCT_BEGIN_10 \
" movq "OC_I(2)", %mm3\n\t" /* r3 = D. */ \
- " psubsw %mm7, %mm4\n\t" /* r4 = E. = E - G */ \
- " paddsw %mm1, %mm1\n\t" /* r1 = H. + H. */ \
- " paddsw %mm7, %mm7\n\t" /* r7 = G + G */ \
- " paddsw %mm2, %mm1\n\t" /* r1 = R1 = A.. + H. */ \
- " paddsw %mm4, %mm7\n\t" /* r7 = G. = E + G */ \
- " psubsw %mm3, %mm4\n\t" /* r4 = R4 = E. - D. */ \
- " paddsw %mm3, %mm3\n\t" \
- " psubsw %mm5, %mm6\n\t" /* r6 = R6 = F. - B.. */ \
- " paddsw %mm5, %mm5\n\t" \
- " paddsw %mm4, %mm3\n\t" /* r3 = R3 = E. + D. */ \
- " paddsw %mm6, %mm5\n\t" /* r5 = R5 = F. + B.. */ \
- " psubsw %mm0, %mm7\n\t" /* r7 = R7 = G. - C. */ \
- " paddsw %mm0, %mm0\n\t" \
+ " psubw %mm7, %mm4\n\t" /* r4 = E. = E - G */ \
+ " paddw %mm1, %mm1\n\t" /* r1 = H. + H. */ \
+ " paddw %mm7, %mm7\n\t" /* r7 = G + G */ \
+ " paddw %mm2, %mm1\n\t" /* r1 = R1 = A.. + H. */ \
+ " paddw %mm4, %mm7\n\t" /* r7 = G. = E + G */ \
+ " psubw %mm3, %mm4\n\t" /* r4 = R4 = E. - D. */ \
+ " paddw %mm3, %mm3\n\t" \
+ " psubw %mm5, %mm6\n\t" /* r6 = R6 = F. - B.. */ \
+ " paddw %mm5, %mm5\n\t" \
+ " paddw %mm4, %mm3\n\t" /* r3 = R3 = E. + D. */ \
+ " paddw %mm6, %mm5\n\t" /* r5 = R5 = F. + B.. */ \
+ " psubw %mm0, %mm7\n\t" /* r7 = R7 = G. - C. */ \
+ " paddw %mm0, %mm0\n\t" \
" movq %mm1,"OC_I(1)"\n\t" /* save R1 */ \
- " paddsw %mm7, %mm0\n\t" /* r0 = R0 = G. + C. */ \
+ " paddw %mm7, %mm0\n\t" /* r0 = R0 = G. + C. */ \
"#end OC_ROW_IDCT_10\n\t" \
)
@@ -393,35 +393,35 @@
#define OC_COLUMN_IDCT_10 __asm__ __volatile__( \
" #OC_COLUMN_IDCT_10\n\t" \
OC_IDCT_BEGIN_10 \
- " paddsw "OC_8", %mm2\n\t" \
- " paddsw %mm1, %mm1\n\t" /* r1 = H. + H. */ \
- " paddsw %mm2, %mm1\n\t" /* r1 = R1 = A.. + H. */ \
+ " paddw "OC_8", %mm2\n\t" \
+ " paddw %mm1, %mm1\n\t" /* r1 = H. + H. */ \
+ " paddw %mm2, %mm1\n\t" /* r1 = R1 = A.. + H. */ \
" psraw $4, %mm2\n\t" /* r2 = NR2 */ \
- " psubsw %mm7, %mm4\n\t" /* r4 = E. = E - G */ \
+ " psubw %mm7, %mm4\n\t" /* r4 = E. = E - G */ \
" psraw $4, %mm1\n\t" /* r1 = NR1 */ \
" movq "OC_I(2)", %mm3\n\t" /* r3 = D. */ \
- " paddsw %mm7, %mm7\n\t" /* r7 = G + G */ \
+ " paddw %mm7, %mm7\n\t" /* r7 = G + G */ \
" movq %mm2,"OC_I(2)"\n\t" /* store NR2 at I2 */ \
- " paddsw %mm4, %mm7\n\t" /* r7 = G. = E + G */ \
+ " paddw %mm4, %mm7\n\t" /* r7 = G. = E + G */ \
" movq %mm1,"OC_I(1)"\n\t" /* store NR1 at I1 */ \
- " psubsw %mm3, %mm4\n\t" /* r4 = R4 = E. - D. */ \
- " paddsw "OC_8", %mm4\n\t" \
- " paddsw %mm3, %mm3\n\t" /* r3 = D. + D. */ \
- " paddsw %mm4, %mm3\n\t" /* r3 = R3 = E. + D. */ \
+ " psubw %mm3, %mm4\n\t" /* r4 = R4 = E. - D. */ \
+ " paddw "OC_8", %mm4\n\t" \
+ " paddw %mm3, %mm3\n\t" /* r3 = D. + D. */ \
+ " paddw %mm4, %mm3\n\t" /* r3 = R3 = E. + D. */ \
" psraw $4, %mm4\n\t" /* r4 = NR4 */ \
- " psubsw %mm5, %mm6\n\t" /* r6 = R6 = F. - B.. */ \
+ " psubw %mm5, %mm6\n\t" /* r6 = R6 = F. - B.. */ \
" psraw $4, %mm3\n\t" /* r3 = NR3 */ \
- " paddsw "OC_8", %mm6\n\t" \
- " paddsw %mm5, %mm5\n\t" /* r5 = B.. + B.. */ \
- " paddsw %mm6, %mm5\n\t" /* r5 = R5 = F. + B.. */ \
+ " paddw "OC_8", %mm6\n\t" \
+ " paddw %mm5, %mm5\n\t" /* r5 = B.. + B.. */ \
+ " paddw %mm6, %mm5\n\t" /* r5 = R5 = F. + B.. */ \
" psraw $4, %mm6\n\t" /* r6 = NR6 */ \
" movq %mm4,"OC_J(4)"\n\t" /* store NR4 at J4 */ \
" psraw $4, %mm5\n\t" /* r5 = NR5 */ \
" movq %mm3,"OC_I(3)"\n\t" /* store NR3 at I3 */ \
- " psubsw %mm0, %mm7\n\t" /* r7 = R7 = G. - C. */ \
- " paddsw "OC_8", %mm7\n\t" \
- " paddsw %mm0, %mm0\n\t" /* r0 = C. + C. */ \
- " paddsw %mm7, %mm0\n\t" /* r0 = R0 = G. + C. */ \
+ " psubw %mm0, %mm7\n\t" /* r7 = R7 = G. - C. */ \
+ " paddw "OC_8", %mm7\n\t" \
+ " paddw %mm0, %mm0\n\t" /* r0 = C. + C. */ \
+ " paddw %mm7, %mm0\n\t" /* r0 = R0 = G. + C. */ \
" psraw $4, %mm7\n\t" /* r7 = NR7 */ \
" movq %mm6,"OC_J(6)"\n\t" /* store NR6 at J6 */ \
" psraw $4, %mm0\n\t" /* r0 = NR0 */ \
Modified: trunk/theora/doc/spec/spec.tex
===================================================================
--- trunk/theora/doc/spec/spec.tex 2005-12-23 07:17:56 UTC (rev 10676)
+++ trunk/theora/doc/spec/spec.tex 2005-12-23 19:07:46 UTC (rev 10677)
@@ -6053,19 +6053,26 @@
discarding any higher-order bits in their two's complement representation.
The final output of each 1D transform is truncated to 16-bits in the same
manner.
-In practice, 32 bits is sufficient for every calculation except scaling by
- $C4$.
-Here we specify truncating to 16 bits after the right shift by 16, but this is
- equivalent to truncating the result of the multiply to 32 bits before the
- right shift.
+In practice, if the high word of a $16\times 16$ bit multiplication can be
+ obtained directly, 16 bits is sufficient for every calculation except scaling
+ by $C4$.
+Here we specify truncating to 16 bits before the multiplication to simplify
+ implementations using hardware or common SIMD instruction sets.
+Note that if 16-bit registers are used, overflow in the additions and
+ subtractions should be handled using \textit{unsaturated} arithmetic.
+That is, the high-order bits should be discarded and the low-order bits
+ retained, instead of clamping the result to the maximum or minimum value.
+This allows the maximum flexibility in re-ordering these instructions without
+ deviating from this specification.
+
The 1D transform can only overflow if input coefficients larger than $\pm 6201$
are present.
However, the result of applying the 2D forward transform on pixel values in the
range $-255\ldots 255$ can be as large as $\pm 8157$ due to the scale factor
of four that is applied, and quantization errors could make this even larger.
-Therefore, the coefficients cannot simply be clamped into a valid range, as
- they could still overflow just the 1D inverse transform by itself.
+Therefore, the coefficients cannot simply be clamped into a valid range before
+ the transform.
\subsubsection{The 1D Inverse DCT}
\label{sub:1d-idct}
@@ -6142,6 +6149,8 @@
Operations on a single signal path through the graph cannot be reordered, but
operations on different paths may be, or may be executed in parallel.
+Different graphs may be obtainable using the associative, commutative, and
+ distributive properties of unsaturated arithmetic.
The column of numbers on the left represents an initial permutation of the
input DCT coefficients.
The column on the right represents the unpermuted output.
@@ -6167,18 +6176,21 @@
\begin{enumerate}
\item
-Assign $\locvar{T}[0]$ the value
- $\locvar{C4}*(\bitvar{Y}[0]+\bitvar{Y}[4])>>16$.
+Assign $\locvar{T}[0]$ the value $\bitvar{Y}[0]+\bitvar{Y}[4]$.
\item
Truncate $\locvar{T}[0]$ to a 16-bit representation by dropping any
higher-order bits.
\item
-Assign $\locvar{T}[1]$ the value
- $\locvar{C4}*(\bitvar{Y}[0]-\bitvar{Y}[4])>>16$.
+Assign $\locvar{T}[0]$ the value
+ $\locvar{C4}*\locvar{T}[0]>>16$.
\item
+Assign $\locvar{T}[1]$ the value $\bitvar{Y}[0]-\bitvar{Y}[4]$.
+\item
Truncate $\locvar{T}[1]$ to a 16-bit representation by dropping any
higher-order bits.
\item
+Assign $\locvar{T}[1]$ the value $\locvar{C4}*\locvar{T}[1]>>16$.
+\item
Assign $\locvar{T}[2]$ the value $(\locvar{C6}*\bitvar{Y}[2]>>16)-
(\locvar{S6}*\bitvar{Y}[6]>>16)$.
\item
@@ -6199,22 +6211,24 @@
\item
Assign \locvar{R} the value $\locvar{T}[4]+\locvar{T}[5]$.
\item
-Assign $\locvar{T}[5]$ the value
- $\locvar{C4}*(\locvar{T}[4]-\locvar{T}[5])>>16$.
+Assign $\locvar{T}[5]$ the value $\locvar{T}[4]-\locvar{T}[5]$.
\item
Truncate $\locvar{T}[5]$ to a 16-bit representation by dropping any
higher-order bits.
\item
+Assign $\locvar{T}[5]$ the value $\locvar{C4}*\locvar{T}[5]>>16$.
+\item
Assign $\locvar{T}[4]$ the value $\locvar{R}$.
\item
Assign \locvar{R} the value $\locvar{T}[7]+\locvar{T}[6]$.
\item
-Assign $\locvar{T}[6]$ the value
- $\locvar{C4}*(\locvar{T}[7]-\locvar{T}[6])>>16$.
+Assign $\locvar{T}[6]$ the value $\locvar{T}[7]-\locvar{T}[6]$.
\item
Truncate $\locvar{T}[6]$ to a 16-bit representation by dropping any
higher-order bits.
\item
+Assign $\locvar{T}[6]$ the value $\locvar{C4}*\locvar{T}[6]>>16$.
+\item
Assign $\locvar{T}[7]$ the value $\locvar{R}$.
\item
Assign \locvar{R} the value $\locvar{T}[0]+\locvar{T}[3]$.
@@ -6329,15 +6343,15 @@
\locvar{\ci} & Integer & 3 & No & The column index. \\
\locvar{\ri} & Integer & 3 & No & The row index. \\
\locvar{Y} & \multicolumn{1}{p{40pt}}{Integer Array} &
- 16 & Yes & An 8-element array of 1-D iDCT input
+ 16 & Yes & An 8-element array of 1D iDCT input
values. \\
\locvar{X} & \multicolumn{1}{p{40pt}}{Integer Array} &
- 16 & Yes & An 8-element array of 1-D iDCT output
+ 16 & Yes & An 8-element array of 1D iDCT output
values. \\
\bottomrule\end{tabularx}
\medskip
-This procedure applies the 1-D inverse DCT transform 16 times to a block of
+This procedure applies the 1D inverse DCT transform 16 times to a block of
dequantized coefficients: once for each of the 8 rows, and once for each of
the 8 columns of the result.
Note that the coordinate system used for the columns is the same right-handed
@@ -6358,7 +6372,7 @@
$\bitvar{DQC}[\locvar{\ri}*8+\locvar{\ci}]$.
\end{enumerate}
\item
-Compute \locvar{X}, the 1-D inverse DCT of \locvar{Y} using the procedure
+Compute \locvar{X}, the 1D inverse DCT of \locvar{Y} using the procedure
described in Section~\ref{sub:1d-idct}.
\item
For each value of $\locvar{\ci}$ from 0 to 7:
@@ -6379,7 +6393,7 @@
$\bitvar{RES}[\locvar{\ri}][\locvar{\ci}]$.
\end{enumerate}
\item
-Compute \locvar{X}, the 1-D inverse DCT of \locvar{Y} using the procedure
+Compute \locvar{X}, the 1D inverse DCT of \locvar{Y} using the procedure
described in Section~\ref{sub:1d-idct}.
\item
For each value of \locvar{\ri} from 0 to 7:
@@ -6464,7 +6478,7 @@
This can be implemented quickly by adding an offset of $\hex{FFFF}$ if the
number is negative, and then shifting as before.
This slightly increases the computational complexity of the transform.
-Unlike the inverse DCT, 16 bit registers and a $16\times16\rightarrow32$ bit
+Unlike the inverse DCT, 16-bit registers and a $16\times16\rightarrow32$ bit
multiply are sufficient to avoid any overflow, so long as the input is in the
range $-6270\ldots 6270$, which is larger than required.
Modified: trunk/theora/lib/idct.c
===================================================================
--- trunk/theora/lib/idct.c 2005-12-23 07:17:56 UTC (rev 10676)
+++ trunk/theora/lib/idct.c 2005-12-23 19:07:46 UTC (rev 10677)
@@ -79,11 +79,11 @@
t2 >>= 16;
_D = t1 - t2;
- t1 = (xC4S4 * (_A - _C));
+ t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
t1 >>= 16;
_Ad = t1;
- t1 = (xC4S4 * (_B - _D));
+ t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
t1 >>= 16;
_Bd = t1;
@@ -91,11 +91,11 @@
_Cd = _A + _C;
_Dd = _B + _D;
- t1 = (xC4S4 * (ip[0] + ip[4]));
+ t1 = (xC4S4 * (ogg_int16_t)(ip[0] + ip[4]));
t1 >>= 16;
_E = t1;
- t1 = (xC4S4 * (ip[0] - ip[4]));
+ t1 = (xC4S4 * (ogg_int16_t)(ip[0] - ip[4]));
t1 >>= 16;
_F = t1;
@@ -170,11 +170,11 @@
t2 >>= 16;
_D = t1 - t2;
- t1 = (xC4S4 * (_A - _C));
+ t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
t1 >>= 16;
_Ad = t1;
- t1 = (xC4S4 * (_B - _D));
+ t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
t1 >>= 16;
_Bd = t1;
@@ -182,11 +182,11 @@
_Cd = _A + _C;
_Dd = _B + _D;
- t1 = (xC4S4 * (ip[0*8] + ip[4*8]));
+ t1 = (xC4S4 * (ogg_int16_t)(ip[0*8] + ip[4*8]));
t1 >>= 16;
_E = t1;
- t1 = (xC4S4 * (ip[0*8] - ip[4*8]));
+ t1 = (xC4S4 * (ogg_int16_t)(ip[0*8] - ip[4*8]));
t1 >>= 16;
_F = t1;
@@ -301,11 +301,11 @@
_D = -t2;
- t1 = (xC4S4 * (_A - _C));
+ t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
t1 >>= 16;
_Ad = t1;
- t1 = (xC4S4 * (_B - _D));
+ t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
t1 >>= 16;
_Bd = t1;
@@ -378,11 +378,11 @@
_D = - t2;
- t1 = (xC4S4 * (_A - _C));
+ t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
t1 >>= 16;
_Ad = t1;
- t1 = (xC4S4 * (_B - _D));
+ t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
t1 >>= 16;
_Bd = t1;
More information about the commits
mailing list