[xiph-commits] r10677 - experimental/derf/theora-exp/lib experimental/derf/theora-exp/lib/x86 trunk/theora/doc/spec trunk/theora/lib

Fri Dec 23 11:07:50 PST 2005

Author: tterribe
Date: 2005-12-23 11:07:46 -0800 (Fri, 23 Dec 2005)
New Revision: 10677

Modified:
   experimental/derf/theora-exp/lib/idct.c
   experimental/derf/theora-exp/lib/x86/mmxidct.c
   trunk/theora/doc/spec/spec.tex
   trunk/theora/lib/idct.c
Log:
Change the iDCT specification to be more friendly towards 16-bit
 implementations (e.g., MMX).
The only thing that needed to change was how overflow was handled, which was
 already not consistent between the C code and various SIMD implementations
 used in VP3.

What changed:
1) Truncation to 16 bits before the C4S4 multiplies in the C code.
2) Conversion of saturated arithmetic to unsaturated in the MMX code.
   This one is arguable, as saturated arithmetic theoretically provides a
    better SNR when overflow does occur.
   However, overflow is only possible with _severe_ quantization distortions
    that line up in just the right way, and these changes do not affect any of
    my test clips.
   However, avoiding saturation means that operations are associative,
    commutative, and distributive, and thus gives much greater opportunities
    for re-ordering and rescheduling them.
   In particular, it allows for the transformation from the original DCT
    factorization used in VP3 to the more well-known Chen factorization
    described in the spec while maintaining bit-exact equivalence in the
    output.
   These re-ordering and scheduling possibilities could allow for performance
    optimization on some platforms, and unsaturated arithmetic in general is
    much simpler to implement with dedicated hardware.
   Not to mention that saturated arithmetic is dog-slow in C by comparison.
   Thus, I've elected to go the unsaturated route.



Modified: experimental/derf/theora-exp/lib/idct.c
===================================================================

--- experimental/derf/theora-exp/lib/idct.c	2005-12-23 07:17:56 UTC (rev 10676)
+++ experimental/derf/theora-exp/lib/idct.c	2005-12-23 19:07:46 UTC (rev 10677)
@@ -16,8 +16,8 @@
   ogg_int32_t r;
   /*Stage 1:*/
   /*0-1 butterfly.*/
-  t[0]=OC_C4S4*(_x[0]+(ogg_int32_t)_x[4])>>16;
-  t[1]=OC_C4S4*(_x[0]-(ogg_int32_t)_x[4])>>16;
+  t[0]=OC_C4S4*(ogg_int16_t)(_x[0]+_x[4])>>16;
+  t[1]=OC_C4S4*(ogg_int16_t)(_x[0]-_x[4])>>16;
   /*2-3 rotation by 6pi/16.*/
   t[2]=(OC_C6S2*_x[2]>>16)-(OC_C2S6*_x[6]>>16);
   t[3]=(OC_C2S6*_x[2]>>16)+(OC_C6S2*_x[6]>>16);
@@ -30,11 +30,11 @@
   /*Stage 2:*/
   /*4-5 butterfly.*/
   r=t[4]+t[5];
-  t[5]=OC_C4S4*(t[4]-t[5])>>16;
+  t[5]=OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16;
   t[4]=r;
   /*7-6 butterfly.*/
   r=t[7]+t[6];
-  t[6]=OC_C4S4*(t[7]-t[6])>>16;
+  t[6]=OC_C4S4*(ogg_int16_t)(t[7]-t[6])>>16;
   t[7]=r;
   /*Stage 3:*/
   /*0-3 butterfly.*/
@@ -86,10 +86,10 @@
   t[7]=OC_C1S7*_x[1]>>16;
   /*Stage 2:*/
   r=t[4]+t[5];
-  t[5]=OC_C4S4*(t[4]-t[5])>>16;
+  t[5]=OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16;
   t[4]=r;
   r=t[7]+t[6];
-  t[6]=OC_C4S4*(t[7]-t[6])>>16;
+  t[6]=OC_C4S4*(ogg_int16_t)(t[7]-t[6])>>16;
   t[7]=r;
   /*Stage 3:*/
   t[1]=t[0]+t[2];

Modified: experimental/derf/theora-exp/lib/x86/mmxidct.c
===================================================================
--- experimental/derf/theora-exp/lib/x86/mmxidct.c	2005-12-23 07:17:56 UTC (rev 10676)
+++ experimental/derf/theora-exp/lib/x86/mmxidct.c	2005-12-23 19:07:46 UTC (rev 10677)
@@ -64,10 +64,10 @@
  "  paddw       %mm5,     %mm7\n\t" \
  "  movq        %mm0,     %mm5\n\t" \
  "  pmulhw      %mm3,     %mm0\n\t" \
- "  paddsw      %mm7,     %mm4\n\t" \
+ "  paddw       %mm7,     %mm4\n\t" \
  "  pmulhw      %mm1,     %mm5\n\t" \
  "  movq   "OC_C(7)",     %mm7\n\t" \
- "  psubsw      %mm2,     %mm6\n\t" \
+ "  psubw       %mm2,     %mm6\n\t" \
  "  paddw       %mm3,     %mm0\n\t" \
  "  pmulhw      %mm7,     %mm3\n\t" \
  "  movq   "OC_I(2)",     %mm2\n\t" \
@@ -75,50 +75,50 @@
  "  paddw       %mm1,     %mm5\n\t" \
  "  movq        %mm2,     %mm1\n\t" \
  "  pmulhw "OC_C(2)",     %mm2\n\t" \
- "  psubsw      %mm5,     %mm3\n\t" \
+ "  psubw       %mm5,     %mm3\n\t" \
  "  movq   "OC_J(6)",     %mm5\n\t" \
- "  paddsw      %mm7,     %mm0\n\t" \
+ "  paddw       %mm7,     %mm0\n\t" \
  "  movq        %mm5,     %mm7\n\t" \
- "  psubsw      %mm4,     %mm0\n\t" \
+ "  psubw       %mm4,     %mm0\n\t" \
  "  pmulhw "OC_C(2)",     %mm5\n\t" \
  "  paddw       %mm1,     %mm2\n\t" \
  "  pmulhw "OC_C(6)",     %mm1\n\t" \
- "  paddsw      %mm4,     %mm4\n\t" \
- "  paddsw      %mm0,     %mm4\n\t" \
- "  psubsw      %mm6,     %mm3\n\t" \
+ "  paddw       %mm4,     %mm4\n\t" \
+ "  paddw       %mm0,     %mm4\n\t" \
+ "  psubw       %mm6,     %mm3\n\t" \
  "  paddw       %mm7,     %mm5\n\t" \
- "  paddsw      %mm6,     %mm6\n\t" \
+ "  paddw       %mm6,     %mm6\n\t" \
  "  pmulhw "OC_C(6)",     %mm7\n\t" \
- "  paddsw      %mm3,     %mm6\n\t" \
+ "  paddw       %mm3,     %mm6\n\t" \
  "  movq        %mm4,"OC_I(1)"\n\t" \
- "  psubsw      %mm5,     %mm1\n\t" \
+ "  psubw       %mm5,     %mm1\n\t" \
  "  movq   "OC_C(4)",     %mm4\n\t" \
  "  movq        %mm3,     %mm5\n\t" \
  "  pmulhw      %mm4,     %mm3\n\t" \
- "  paddsw      %mm2,     %mm7\n\t" \
+ "  paddw       %mm2,     %mm7\n\t" \
  "  movq        %mm6,"OC_I(2)"\n\t" \
  "  movq        %mm0,     %mm2\n\t" \
  "  movq   "OC_I(0)",     %mm6\n\t" \
  "  pmulhw      %mm4,     %mm0\n\t" \
  "  paddw       %mm3,     %mm5\n\t" \
  "  movq   "OC_J(4)",     %mm3\n\t" \
- "  psubsw      %mm1,     %mm5\n\t" \
+ "  psubw       %mm1,     %mm5\n\t" \
  "  paddw       %mm0,     %mm2\n\t" \
- "  psubsw      %mm3,     %mm6\n\t" \
+ "  psubw       %mm3,     %mm6\n\t" \
  "  movq        %mm6,     %mm0\n\t" \
  "  pmulhw      %mm4,     %mm6\n\t" \
- "  paddsw      %mm3,     %mm3\n\t" \
- "  paddsw      %mm1,     %mm1\n\t" \
- "  paddsw      %mm0,     %mm3\n\t" \
- "  paddsw      %mm5,     %mm1\n\t" \
+ "  paddw       %mm3,     %mm3\n\t" \
+ "  paddw       %mm1,     %mm1\n\t" \
+ "  paddw       %mm0,     %mm3\n\t" \
+ "  paddw       %mm5,     %mm1\n\t" \
  "  pmulhw      %mm3,     %mm4\n\t" \
- "  paddsw      %mm0,     %mm6\n\t" \
- "  psubsw      %mm2,     %mm6\n\t" \
- "  paddsw      %mm2,     %mm2\n\t" \
+ "  paddw       %mm0,     %mm6\n\t" \
+ "  psubw       %mm2,     %mm6\n\t" \
+ "  paddw       %mm2,     %mm2\n\t" \
  "  movq   "OC_I(1)",     %mm0\n\t" \
- "  paddsw      %mm6,     %mm2\n\t" \
+ "  paddw       %mm6,     %mm2\n\t" \
  "  paddw       %mm3,     %mm4\n\t" \
- "  psubsw      %mm1,     %mm2\n\t" \
+ "  psubw       %mm1,     %mm2\n\t" \
  "#end OC_IDCT_BEGIN\n\t"
 
 /*38+8=46 cycles.*/
@@ -126,21 +126,21 @@
  "  #OC_ROW_IDCT\n" \
  OC_IDCT_BEGIN \
  "  movq   "OC_I(2)",     %mm3\n\t"  /* r3 = D. */ \
- "  psubsw      %mm7,     %mm4\n\t"  /* r4 = E. = E - G */ \
- "  paddsw      %mm1,     %mm1\n\t"  /* r1 = H. + H. */ \
- "  paddsw      %mm7,     %mm7\n\t"  /* r7 = G + G */ \
- "  paddsw      %mm2,     %mm1\n\t"  /* r1 = R1 = A.. + H. */ \
- "  paddsw      %mm4,     %mm7\n\t"  /* r7 = G. = E + G */ \
- "  psubsw      %mm3,     %mm4\n\t"  /* r4 = R4 = E. - D. */ \
- "  paddsw      %mm3,     %mm3\n\t" \
- "  psubsw      %mm5,     %mm6\n\t"  /* r6 = R6 = F. - B.. */ \
- "  paddsw      %mm5,     %mm5\n\t" \
- "  paddsw      %mm4,     %mm3\n\t"  /* r3 = R3 = E. + D. */ \
- "  paddsw      %mm6,     %mm5\n\t"  /* r5 = R5 = F. + B.. */ \
- "  psubsw      %mm0,     %mm7\n\t"  /* r7 = R7 = G. - C. */ \
- "  paddsw      %mm0,     %mm0\n\t" \
+ "  psubw       %mm7,     %mm4\n\t"  /* r4 = E. = E - G */ \
+ "  paddw       %mm1,     %mm1\n\t"  /* r1 = H. + H. */ \
+ "  paddw       %mm7,     %mm7\n\t"  /* r7 = G + G */ \
+ "  paddw       %mm2,     %mm1\n\t"  /* r1 = R1 = A.. + H. */ \
+ "  paddw       %mm4,     %mm7\n\t"  /* r7 = G. = E + G */ \
+ "  psubw       %mm3,     %mm4\n\t"  /* r4 = R4 = E. - D. */ \
+ "  paddw       %mm3,     %mm3\n\t" \
+ "  psubw       %mm5,     %mm6\n\t"  /* r6 = R6 = F. - B.. */ \
+ "  paddw       %mm5,     %mm5\n\t" \
+ "  paddw       %mm4,     %mm3\n\t"  /* r3 = R3 = E. + D. */ \
+ "  paddw       %mm6,     %mm5\n\t"  /* r5 = R5 = F. + B.. */ \
+ "  psubw       %mm0,     %mm7\n\t"  /* r7 = R7 = G. - C. */ \
+ "  paddw       %mm0,     %mm0\n\t" \
  "  movq        %mm1,"OC_I(1)"\n\t"  /* save R1 */ \
- "  paddsw      %mm7,     %mm0\n\t" /* r0 = R0 = G. + C. */ \
+ "  paddw       %mm7,     %mm0\n\t" /* r0 = R0 = G. + C. */ \
  "#end OC_ROW_IDCT\n\t" \
 )
 
@@ -216,35 +216,35 @@
 #define OC_COLUMN_IDCT __asm__ __volatile__( \
  "  #OC_COLUMN_IDCT\n" \
  OC_IDCT_BEGIN \
- "  paddsw    "OC_8",     %mm2\n\t" \
- "  paddsw      %mm1,     %mm1\n\t"  /* r1 = H. + H. */ \
- "  paddsw      %mm2,     %mm1\n\t"  /* r1 = R1 = A.. + H. */ \
+ "  paddw     "OC_8",     %mm2\n\t" \
+ "  paddw       %mm1,     %mm1\n\t"  /* r1 = H. + H. */ \
+ "  paddw       %mm2,     %mm1\n\t"  /* r1 = R1 = A.. + H. */ \
  "  psraw         $4,     %mm2\n\t"  /* r2 = NR2 */ \
- "  psubsw      %mm7,     %mm4\n\t"  /* r4 = E. = E - G */ \
+ "  psubw       %mm7,     %mm4\n\t"  /* r4 = E. = E - G */ \
  "  psraw         $4,     %mm1\n\t"  /* r1 = NR1 */ \
  "  movq   "OC_I(2)",     %mm3\n\t"  /* r3 = D. */ \
- "  paddsw      %mm7,     %mm7\n\t"  /* r7 = G + G */ \
+ "  paddw       %mm7,     %mm7\n\t"  /* r7 = G + G */ \
  "  movq        %mm2,"OC_I(2)"\n\t"  /* store NR2 at I2 */ \
- "  paddsw      %mm4,     %mm7\n\t"  /* r7 = G. = E + G */ \
+ "  paddw       %mm4,     %mm7\n\t"  /* r7 = G. = E + G */ \
  "  movq        %mm1,"OC_I(1)"\n\t"  /* store NR1 at I1 */ \
- "  psubsw      %mm3,     %mm4\n\t"  /* r4 = R4 = E. - D. */ \
- "  paddsw    "OC_8",     %mm4\n\t" \
- "  paddsw      %mm3,     %mm3\n\t"  /* r3 = D. + D. */ \
- "  paddsw      %mm4,     %mm3\n\t"  /* r3 = R3 = E. + D. */ \
+ "  psubw       %mm3,     %mm4\n\t"  /* r4 = R4 = E. - D. */ \
+ "  paddw     "OC_8",     %mm4\n\t" \
+ "  paddw       %mm3,     %mm3\n\t"  /* r3 = D. + D. */ \
+ "  paddw       %mm4,     %mm3\n\t"  /* r3 = R3 = E. + D. */ \
  "  psraw         $4,     %mm4\n\t"  /* r4 = NR4 */ \
- "  psubsw      %mm5,     %mm6\n\t"  /* r6 = R6 = F. - B.. */ \
+ "  psubw       %mm5,     %mm6\n\t"  /* r6 = R6 = F. - B.. */ \
  "  psraw         $4,     %mm3\n\t"  /* r3 = NR3 */ \
- "  paddsw    "OC_8",     %mm6\n\t" \
- "  paddsw      %mm5,     %mm5\n\t"  /* r5 = B.. + B.. */ \
- "  paddsw      %mm6,     %mm5\n\t"  /* r5 = R5 = F. + B.. */ \
+ "  paddw     "OC_8",     %mm6\n\t" \
+ "  paddw       %mm5,     %mm5\n\t"  /* r5 = B.. + B.. */ \
+ "  paddw       %mm6,     %mm5\n\t"  /* r5 = R5 = F. + B.. */ \
  "  psraw         $4,     %mm6\n\t"  /* r6 = NR6 */ \
  "  movq        %mm4,"OC_J(4)"\n\t"  /* store NR4 at J4 */ \
  "  psraw         $4,     %mm5\n\t"  /* r5 = NR5 */ \
  "  movq        %mm3,"OC_I(3)"\n\t"  /* store NR3 at I3 */ \
- "  psubsw      %mm0,     %mm7\n\t"  /* r7 = R7 = G. - C. */ \
- "  paddsw    "OC_8",     %mm7\n\t" \
- "  paddsw      %mm0,     %mm0\n\t"  /* r0 = C. + C. */ \
- "  paddsw      %mm7,     %mm0\n\t"  /* r0 = R0 = G. + C. */ \
+ "  psubw       %mm0,     %mm7\n\t"  /* r7 = R7 = G. - C. */ \
+ "  paddw     "OC_8",     %mm7\n\t" \
+ "  paddw       %mm0,     %mm0\n\t"  /* r0 = C. + C. */ \
+ "  paddw       %mm7,     %mm0\n\t"  /* r0 = R0 = G. + C. */ \
  "  psraw         $4,     %mm7\n\t"  /* r7 = NR7 */ \
  "  movq        %mm6,"OC_J(6)"\n\t"  /* store NR6 at J6 */ \
  "  psraw         $4,     %mm0\n\t"  /* r0 = NR0 */ \
@@ -331,19 +331,19 @@
  "  movq        %mm5,     %mm1\n\t" \
  "  paddw       %mm3,     %mm0\n\t" \
  "  pmulhw "OC_C(7)",     %mm3\n\t" \
- "  psubsw      %mm2,     %mm6\n\t" \
+ "  psubw       %mm2,     %mm6\n\t" \
  "  pmulhw "OC_C(2)",     %mm5\n\t" \
- "  psubsw      %mm4,     %mm0\n\t" \
+ "  psubw       %mm4,     %mm0\n\t" \
  "  movq   "OC_I(2)",     %mm7\n\t" \
- "  paddsw      %mm4,     %mm4\n\t" \
+ "  paddw       %mm4,     %mm4\n\t" \
  "  paddw       %mm5,     %mm7\n\t" \
- "  paddsw      %mm0,     %mm4\n\t" \
+ "  paddw       %mm0,     %mm4\n\t" \
  "  pmulhw "OC_C(6)",     %mm1\n\t" \
- "  psubsw      %mm6,     %mm3\n\t" \
+ "  psubw       %mm6,     %mm3\n\t" \
  "  movq        %mm4,"OC_I(1)"\n\t" \
- "  paddsw      %mm6,     %mm6\n\t" \
+ "  paddw       %mm6,     %mm6\n\t" \
  "  movq   "OC_C(4)",     %mm4\n\t" \
- "  paddsw      %mm3,     %mm6\n\t" \
+ "  paddw       %mm3,     %mm6\n\t" \
  "  movq        %mm3,     %mm5\n\t" \
  "  pmulhw      %mm4,     %mm3\n\t" \
  "  movq        %mm6,"OC_I(2)"\n\t" \
@@ -352,17 +352,17 @@
  "  pmulhw      %mm4,     %mm0\n\t" \
  "  paddw       %mm3,     %mm5\n\t" \
  "  paddw       %mm0,     %mm2\n\t" \
- "  psubsw      %mm1,     %mm5\n\t" \
+ "  psubw       %mm1,     %mm5\n\t" \
  "  pmulhw      %mm4,     %mm6\n\t" \
  "  paddw  "OC_I(0)",     %mm6\n\t" \
- "  paddsw      %mm1,     %mm1\n\t" \
+ "  paddw       %mm1,     %mm1\n\t" \
  "  movq        %mm6,     %mm4\n\t" \
- "  paddsw      %mm5,     %mm1\n\t" \
- "  psubsw      %mm2,     %mm6\n\t" \
- "  paddsw      %mm2,     %mm2\n\t" \
+ "  paddw       %mm5,     %mm1\n\t" \
+ "  psubw       %mm2,     %mm6\n\t" \
+ "  paddw       %mm2,     %mm2\n\t" \
  "  movq   "OC_I(1)",     %mm0\n\t" \
- "  paddsw      %mm6,     %mm2\n\t" \
- "  psubsw      %mm1,     %mm2\n\t" \
+ "  paddw       %mm6,     %mm2\n\t" \
+ "  psubw       %mm1,     %mm2\n\t" \
  "  nop\n\t" \
  "  #end OC_IDCT_BEGIN_10\n\t"
 
@@ -371,21 +371,21 @@
  "  #OC_ROW_IDCT_10\n\t" \
  OC_IDCT_BEGIN_10 \
  "  movq    "OC_I(2)",     %mm3\n\t" /* r3 = D. */ \
- "  psubsw       %mm7,     %mm4\n\t" /* r4 = E. = E - G */ \
- "  paddsw       %mm1,     %mm1\n\t" /* r1 = H. + H. */ \
- "  paddsw       %mm7,     %mm7\n\t" /* r7 = G + G */ \
- "  paddsw       %mm2,     %mm1\n\t" /* r1 = R1 = A.. + H. */ \
- "  paddsw       %mm4,     %mm7\n\t" /* r7 = G. = E + G */ \
- "  psubsw       %mm3,     %mm4\n\t" /* r4 = R4 = E. - D. */ \
- "  paddsw       %mm3,     %mm3\n\t" \
- "  psubsw       %mm5,     %mm6\n\t" /* r6 = R6 = F. - B.. */ \
- "  paddsw       %mm5,     %mm5\n\t" \
- "  paddsw       %mm4,     %mm3\n\t" /* r3 = R3 = E. + D. */ \
- "  paddsw       %mm6,     %mm5\n\t" /* r5 = R5 = F. + B.. */ \
- "  psubsw       %mm0,     %mm7\n\t" /* r7 = R7 = G. - C. */ \
- "  paddsw       %mm0,     %mm0\n\t" \
+ "  psubw        %mm7,     %mm4\n\t" /* r4 = E. = E - G */ \
+ "  paddw        %mm1,     %mm1\n\t" /* r1 = H. + H. */ \
+ "  paddw        %mm7,     %mm7\n\t" /* r7 = G + G */ \
+ "  paddw        %mm2,     %mm1\n\t" /* r1 = R1 = A.. + H. */ \
+ "  paddw        %mm4,     %mm7\n\t" /* r7 = G. = E + G */ \
+ "  psubw        %mm3,     %mm4\n\t" /* r4 = R4 = E. - D. */ \
+ "  paddw        %mm3,     %mm3\n\t" \
+ "  psubw        %mm5,     %mm6\n\t" /* r6 = R6 = F. - B.. */ \
+ "  paddw        %mm5,     %mm5\n\t" \
+ "  paddw        %mm4,     %mm3\n\t" /* r3 = R3 = E. + D. */ \
+ "  paddw        %mm6,     %mm5\n\t" /* r5 = R5 = F. + B.. */ \
+ "  psubw        %mm0,     %mm7\n\t" /* r7 = R7 = G. - C. */ \
+ "  paddw        %mm0,     %mm0\n\t" \
  "  movq         %mm1,"OC_I(1)"\n\t" /* save R1 */ \
- "  paddsw       %mm7,     %mm0\n\t" /* r0 = R0 = G. + C. */ \
+ "  paddw        %mm7,     %mm0\n\t" /* r0 = R0 = G. + C. */ \
  "#end OC_ROW_IDCT_10\n\t" \
 )
 
@@ -393,35 +393,35 @@
 #define OC_COLUMN_IDCT_10 __asm__ __volatile__( \
  "  #OC_COLUMN_IDCT_10\n\t" \
  OC_IDCT_BEGIN_10 \
- "  paddsw    "OC_8",     %mm2\n\t" \
- "  paddsw      %mm1,     %mm1\n\t" /* r1 = H. + H. */ \
- "  paddsw      %mm2,     %mm1\n\t" /* r1 = R1 = A.. + H. */ \
+ "  paddw     "OC_8",     %mm2\n\t" \
+ "  paddw       %mm1,     %mm1\n\t" /* r1 = H. + H. */ \
+ "  paddw       %mm2,     %mm1\n\t" /* r1 = R1 = A.. + H. */ \
  "  psraw         $4,     %mm2\n\t" /* r2 = NR2 */ \
- "  psubsw      %mm7,     %mm4\n\t" /* r4 = E. = E - G */ \
+ "  psubw       %mm7,     %mm4\n\t" /* r4 = E. = E - G */ \
  "  psraw         $4,     %mm1\n\t" /* r1 = NR1 */ \
  "  movq   "OC_I(2)",     %mm3\n\t" /* r3 = D. */ \
- "  paddsw      %mm7,     %mm7\n\t" /* r7 = G + G */ \
+ "  paddw       %mm7,     %mm7\n\t" /* r7 = G + G */ \
  "  movq        %mm2,"OC_I(2)"\n\t" /* store NR2 at I2 */ \
- "  paddsw      %mm4,     %mm7\n\t" /* r7 = G. = E + G */ \
+ "  paddw       %mm4,     %mm7\n\t" /* r7 = G. = E + G */ \
  "  movq        %mm1,"OC_I(1)"\n\t" /* store NR1 at I1 */ \
- "  psubsw      %mm3,     %mm4\n\t" /* r4 = R4 = E. - D. */ \
- "  paddsw    "OC_8",     %mm4\n\t" \
- "  paddsw      %mm3,     %mm3\n\t" /* r3 = D. + D. */ \
- "  paddsw      %mm4,     %mm3\n\t" /* r3 = R3 = E. + D. */ \
+ "  psubw       %mm3,     %mm4\n\t" /* r4 = R4 = E. - D. */ \
+ "  paddw     "OC_8",     %mm4\n\t" \
+ "  paddw       %mm3,     %mm3\n\t" /* r3 = D. + D. */ \
+ "  paddw       %mm4,     %mm3\n\t" /* r3 = R3 = E. + D. */ \
  "  psraw         $4,     %mm4\n\t" /* r4 = NR4 */ \
- "  psubsw      %mm5,     %mm6\n\t" /* r6 = R6 = F. - B.. */ \
+ "  psubw       %mm5,     %mm6\n\t" /* r6 = R6 = F. - B.. */ \
  "  psraw         $4,     %mm3\n\t" /* r3 = NR3 */ \
- "  paddsw    "OC_8",     %mm6\n\t" \
- "  paddsw      %mm5,     %mm5\n\t" /* r5 = B.. + B.. */ \
- "  paddsw      %mm6,     %mm5\n\t" /* r5 = R5 = F. + B.. */ \
+ "  paddw     "OC_8",     %mm6\n\t" \
+ "  paddw       %mm5,     %mm5\n\t" /* r5 = B.. + B.. */ \
+ "  paddw       %mm6,     %mm5\n\t" /* r5 = R5 = F. + B.. */ \
  "  psraw         $4,     %mm6\n\t" /* r6 = NR6 */ \
  "  movq        %mm4,"OC_J(4)"\n\t" /* store NR4 at J4 */ \
  "  psraw         $4,     %mm5\n\t" /* r5 = NR5 */ \
  "  movq        %mm3,"OC_I(3)"\n\t" /* store NR3 at I3 */ \
- "  psubsw      %mm0,     %mm7\n\t" /* r7 = R7 = G. - C. */ \
- "  paddsw    "OC_8",     %mm7\n\t" \
- "  paddsw      %mm0,     %mm0\n\t" /* r0 = C. + C. */ \
- "  paddsw      %mm7,     %mm0\n\t" /* r0 = R0 = G. + C. */ \
+ "  psubw       %mm0,     %mm7\n\t" /* r7 = R7 = G. - C. */ \
+ "  paddw     "OC_8",     %mm7\n\t" \
+ "  paddw       %mm0,     %mm0\n\t" /* r0 = C. + C. */ \
+ "  paddw       %mm7,     %mm0\n\t" /* r0 = R0 = G. + C. */ \
  "  psraw         $4,     %mm7\n\t" /* r7 = NR7 */ \
  "  movq        %mm6,"OC_J(6)"\n\t" /* store NR6 at J6 */ \
  "  psraw         $4,     %mm0\n\t" /* r0 = NR0 */ \

Modified: trunk/theora/doc/spec/spec.tex
===================================================================
--- trunk/theora/doc/spec/spec.tex	2005-12-23 07:17:56 UTC (rev 10676)
+++ trunk/theora/doc/spec/spec.tex	2005-12-23 19:07:46 UTC (rev 10677)
@@ -6053,19 +6053,26 @@
  discarding any higher-order bits in their two's complement representation.
 The final output of each 1D transform is truncated to 16-bits in the same
  manner.
-In practice, 32 bits is sufficient for every calculation except scaling by
- $C4$.
-Here we specify truncating to 16 bits after the right shift by 16, but this is
- equivalent to truncating the result of the multiply to 32 bits before the
- right shift.
+In practice, if the high word of a $16\times 16$ bit multiplication can be
+ obtained directly, 16 bits is sufficient for every calculation except scaling
+ by $C4$.
+Here we specify truncating to 16 bits before the multiplication to simplify
+ implementations using hardware or common SIMD instruction sets.
 
+Note that if 16-bit registers are used, overflow in the additions and
+ subtractions should be handled using \textit{unsaturated} arithmetic.
+That is, the high-order bits should be discarded and the low-order bits
+ retained, instead of clamping the result to the maximum or minimum value.
+This allows the maximum flexibility in re-ordering these instructions without
+ deviating from this specification.
+
 The 1D transform can only overflow if input coefficients larger than $\pm 6201$
  are present.
 However, the result of applying the 2D forward transform on pixel values in the
  range $-255\ldots 255$ can be as large as $\pm 8157$ due to the scale factor
  of four that is applied, and quantization errors could make this even larger.
-Therefore, the coefficients cannot simply be clamped into a valid range, as
- they could still overflow just the 1D inverse transform by itself.
+Therefore, the coefficients cannot simply be clamped into a valid range before
+ the transform.
 
 \subsubsection{The 1D Inverse DCT}
 \label{sub:1d-idct}
@@ -6142,6 +6149,8 @@
 
 Operations on a single signal path through the graph cannot be reordered, but
  operations on different paths may be, or may be executed in parallel.
+Different graphs may be obtainable using the associative, commutative, and
+ distributive properties of unsaturated arithmetic.
 The column of numbers on the left represents an initial permutation of the
  input DCT coefficients.
 The column on the right represents the unpermuted output.
@@ -6167,18 +6176,21 @@
 
 \begin{enumerate}
 \item
-Assign $\locvar{T}[0]$ the value
- $\locvar{C4}*(\bitvar{Y}[0]+\bitvar{Y}[4])>>16$.
+Assign $\locvar{T}[0]$ the value $\bitvar{Y}[0]+\bitvar{Y}[4]$.
 \item
 Truncate $\locvar{T}[0]$ to a 16-bit representation by dropping any
  higher-order bits.
 \item
-Assign $\locvar{T}[1]$ the value
- $\locvar{C4}*(\bitvar{Y}[0]-\bitvar{Y}[4])>>16$.
+Assign $\locvar{T}[0]$ the value
+ $\locvar{C4}*\locvar{T}[0]>>16$.
 \item
+Assign $\locvar{T}[1]$ the value $\bitvar{Y}[0]-\bitvar{Y}[4]$.
+\item
 Truncate $\locvar{T}[1]$ to a 16-bit representation by dropping any
  higher-order bits.
 \item
+Assign $\locvar{T}[1]$ the value $\locvar{C4}*\locvar{T}[1]>>16$.
+\item
 Assign $\locvar{T}[2]$ the value $(\locvar{C6}*\bitvar{Y}[2]>>16)-
  (\locvar{S6}*\bitvar{Y}[6]>>16)$.
 \item
@@ -6199,22 +6211,24 @@
 \item
 Assign \locvar{R} the value $\locvar{T}[4]+\locvar{T}[5]$.
 \item
-Assign $\locvar{T}[5]$ the value
- $\locvar{C4}*(\locvar{T}[4]-\locvar{T}[5])>>16$.
+Assign $\locvar{T}[5]$ the value $\locvar{T}[4]-\locvar{T}[5]$.
 \item
 Truncate $\locvar{T}[5]$ to a 16-bit representation by dropping any
  higher-order bits.
 \item
+Assign $\locvar{T}[5]$ the value $\locvar{C4}*\locvar{T}[5]>>16$.
+\item
 Assign $\locvar{T}[4]$ the value $\locvar{R}$.
 \item
 Assign \locvar{R} the value $\locvar{T}[7]+\locvar{T}[6]$.
 \item
-Assign $\locvar{T}[6]$ the value
- $\locvar{C4}*(\locvar{T}[7]-\locvar{T}[6])>>16$.
+Assign $\locvar{T}[6]$ the value $\locvar{T}[7]-\locvar{T}[6]$.
 \item
 Truncate $\locvar{T}[6]$ to a 16-bit representation by dropping any
  higher-order bits.
 \item
+Assign $\locvar{T}[6]$ the value $\locvar{C4}*\locvar{T}[6]>>16$.
+\item
 Assign $\locvar{T}[7]$ the value $\locvar{R}$.
 \item
 Assign \locvar{R} the value $\locvar{T}[0]+\locvar{T}[3]$.
@@ -6329,15 +6343,15 @@
 \locvar{\ci}     & Integer &  3 & No  & The column index. \\
 \locvar{\ri}     & Integer &  3 & No  & The row index. \\
 \locvar{Y}       & \multicolumn{1}{p{40pt}}{Integer Array} &
-                             16 & Yes & An 8-element array of 1-D iDCT input
+                             16 & Yes & An 8-element array of 1D iDCT input
  values. \\
 \locvar{X}       & \multicolumn{1}{p{40pt}}{Integer Array} &
-                             16 & Yes & An 8-element array of 1-D iDCT output
+                             16 & Yes & An 8-element array of 1D iDCT output
  values. \\
 \bottomrule\end{tabularx}
 \medskip
 
-This procedure applies the 1-D inverse DCT transform 16 times to a block of
+This procedure applies the 1D inverse DCT transform 16 times to a block of
  dequantized coefficients: once for each of the 8 rows, and once for each of
  the 8 columns of the result.
 Note that the coordinate system used for the columns is the same right-handed
@@ -6358,7 +6372,7 @@
  $\bitvar{DQC}[\locvar{\ri}*8+\locvar{\ci}]$.
 \end{enumerate}
 \item
-Compute \locvar{X}, the 1-D inverse DCT of \locvar{Y} using the procedure
+Compute \locvar{X}, the 1D inverse DCT of \locvar{Y} using the procedure
  described in Section~\ref{sub:1d-idct}.
 \item
 For each value of $\locvar{\ci}$ from 0 to 7:
@@ -6379,7 +6393,7 @@
  $\bitvar{RES}[\locvar{\ri}][\locvar{\ci}]$.
 \end{enumerate}
 \item
-Compute \locvar{X}, the 1-D inverse DCT of \locvar{Y} using the procedure
+Compute \locvar{X}, the 1D inverse DCT of \locvar{Y} using the procedure
  described in Section~\ref{sub:1d-idct}.
 \item
 For each value of \locvar{\ri} from 0 to 7:
@@ -6464,7 +6478,7 @@
 This can be implemented quickly by adding an offset of $\hex{FFFF}$ if the
  number is negative, and then shifting as before.
 This slightly increases the computational complexity of the transform.
-Unlike the inverse DCT, 16 bit registers and a $16\times16\rightarrow32$ bit
+Unlike the inverse DCT, 16-bit registers and a $16\times16\rightarrow32$ bit
  multiply are sufficient to avoid any overflow, so long as the input is in the
  range $-6270\ldots 6270$, which is larger than required.
 

Modified: trunk/theora/lib/idct.c
===================================================================
--- trunk/theora/lib/idct.c	2005-12-23 07:17:56 UTC (rev 10676)
+++ trunk/theora/lib/idct.c	2005-12-23 19:07:46 UTC (rev 10677)
@@ -79,11 +79,11 @@
       t2 >>= 16;
       _D = t1 - t2;
 
-      t1 = (xC4S4 * (_A - _C));
+      t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
       t1 >>= 16;
       _Ad = t1;
 
-      t1 = (xC4S4 * (_B - _D));
+      t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
       t1 >>= 16;
       _Bd = t1;
 
@@ -91,11 +91,11 @@
       _Cd = _A + _C;
       _Dd = _B + _D;
 
-      t1 = (xC4S4 * (ip[0] + ip[4]));
+      t1 = (xC4S4 * (ogg_int16_t)(ip[0] + ip[4]));
       t1 >>= 16;
       _E = t1;
 
-      t1 = (xC4S4 * (ip[0] - ip[4]));
+      t1 = (xC4S4 * (ogg_int16_t)(ip[0] - ip[4]));
       t1 >>= 16;
       _F = t1;
 
@@ -170,11 +170,11 @@
       t2 >>= 16;
       _D = t1 - t2;
 
-      t1 = (xC4S4 * (_A - _C));
+      t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
       t1 >>= 16;
       _Ad = t1;
 
-      t1 = (xC4S4 * (_B - _D));
+      t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
       t1 >>= 16;
       _Bd = t1;
 
@@ -182,11 +182,11 @@
       _Cd = _A + _C;
       _Dd = _B + _D;
 
-      t1 = (xC4S4 * (ip[0*8] + ip[4*8]));
+      t1 = (xC4S4 * (ogg_int16_t)(ip[0*8] + ip[4*8]));
       t1 >>= 16;
       _E = t1;
 
-      t1 = (xC4S4 * (ip[0*8] - ip[4*8]));
+      t1 = (xC4S4 * (ogg_int16_t)(ip[0*8] - ip[4*8]));
       t1 >>= 16;
       _F = t1;
 
@@ -301,11 +301,11 @@
       _D = -t2;
 
 
-      t1 = (xC4S4 * (_A - _C));
+      t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
       t1 >>= 16;
       _Ad = t1;
 
-      t1 = (xC4S4 * (_B - _D));
+      t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
       t1 >>= 16;
       _Bd = t1;
 
@@ -378,11 +378,11 @@
       _D = - t2;
 
 
-      t1 = (xC4S4 * (_A - _C));
+      t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
       t1 >>= 16;
       _Ad = t1;
 
-      t1 = (xC4S4 * (_B - _D));
+      t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
       t1 >>= 16;
       _Bd = t1;