[xiph-commits] r17728 - in trunk/theora/lib: . arm c64x x86 x86_vc

Tue Dec 7 02:28:07 PST 2010

Author: tterribe
Date: 2010-12-07 02:28:07 -0800 (Tue, 07 Dec 2010)
New Revision: 17728

Added:
   trunk/theora/lib/x86/x86zigzag.h
   trunk/theora/lib/x86_vc/x86zigzag.h
Modified:
   trunk/theora/lib/Makefile.am
   trunk/theora/lib/analyze.c
   trunk/theora/lib/arm/armidct.s
   trunk/theora/lib/c64x/c64xidct.c
   trunk/theora/lib/encint.h
   trunk/theora/lib/enquant.c
   trunk/theora/lib/fdct.c
   trunk/theora/lib/idct.c
   trunk/theora/lib/tokenize.c
   trunk/theora/lib/x86/mmxfdct.c
   trunk/theora/lib/x86/mmxidct.c
   trunk/theora/lib/x86/sse2fdct.c
   trunk/theora/lib/x86/sse2idct.c
   trunk/theora/lib/x86/x86enc.c
   trunk/theora/lib/x86/x86enc.h
   trunk/theora/lib/x86/x86enquant.c
   trunk/theora/lib/x86_vc/mmxfdct.c
   trunk/theora/lib/x86_vc/mmxidct.c
Log:
Move zig-zagging from quantization into the fDCT.

This removes one of the transposes from the fDCT, and avoids several zig-zag
 lookups during tokenization.
This change also makes the encoder iDCT clear its input buffer, as the
 decoder's already does, so that the buffer can be re-used for the next block
 without the need for a memcpy or memset in the tokenizer.
This gives a 1.3% speed-up at the default speed-level (1), and a 3.1% speed-up
 at speed-level 2 (for 480p, on x86-64).


Modified: trunk/theora/lib/Makefile.am
===================================================================

--- trunk/theora/lib/Makefile.am	2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/Makefile.am	2010-12-07 10:28:07 UTC (rev 17728)
@@ -13,7 +13,14 @@
 	arm/arm2gnu.pl \
 	c64x/c64xint.h \
 	c64x/c64xdec.h \
+	x86/mmxfrag.c \
+	x86/mmxidct.c \
+	x86/mmxloop.h \
+	x86/mmxstate.c \
+	x86/sse2idct.c \
 	x86/x86cpu.c \
+	x86/x86int.h \
+	x86/x86state.c \
 	x86/mmxencfrag.c \
 	x86/mmxfdct.c \
 	x86/sse2encfrag.c \
@@ -22,13 +29,7 @@
 	x86/x86enc.c \
 	x86/x86enc.h \
 	x86/x86enquant.c \
-	x86/mmxfrag.c \
-	x86/mmxidct.c \
-	x86/mmxloop.h \
-	x86/mmxstate.c \
-	x86/sse2idct.c \
-	x86/x86int.h \
-	x86/x86state.c \
+	x86/x86zigzag.h \
 	x86_vc
 
 lib_LTLIBRARIES = libtheoradec.la libtheoraenc.la libtheora.la
@@ -215,11 +216,12 @@
 	arm/armcpu.h \
 	c64x/c64xdec.h \
 	c64x/c64xint.h \
-	x86/x86cpu.h \
 	x86/mmxloop.h \
 	x86/sse2trans.h \
+	x86/x86cpu.h \
 	x86/x86enc.h \
-	x86/x86int.h
+	x86/x86int.h \
+	x86/x86zigzag.h
 
 libtheoradec_la_SOURCES = \
 	$(decoder_sources) \

Modified: trunk/theora/lib/analyze.c
===================================================================
--- trunk/theora/lib/analyze.c	2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/analyze.c	2010-12-07 10:28:07 UTC (rev 17728)
@@ -667,8 +667,9 @@
  oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,
  unsigned _rd_scale,unsigned _rd_iscale,oc_rd_metric *_mo,
  oc_fr_state *_fr,oc_token_checkpoint **_stack){
+  ogg_int16_t            *data;
   ogg_int16_t            *dct;
-  ogg_int16_t            *data;
+  ogg_int16_t            *idct;
   oc_qii_state            qs;
   const ogg_uint16_t     *dequant;
   ogg_uint16_t            dequant_dc;
@@ -701,6 +702,7 @@
   qii=frags[_fragi].qii;
   data=_enc->pipe.dct_data;
   dct=data+64;
+  idct=data+128;
   if(qii&~3){
 #if !defined(OC_COLLECT_METRICS)
     if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){
@@ -771,12 +773,12 @@
   /*Tokenize.*/
   checkpoint=*_stack;
   if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
-    ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
-     _stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
+    ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,idct,data,dequant,dct,
+     nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
   }
   else{
-    ac_bits=oc_enc_tokenize_ac_fast(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
-     _stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
+    ac_bits=oc_enc_tokenize_ac_fast(_enc,_pli,_fragi,idct,data,dequant,dct,
+     nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
   }
   /*Reconstruct.
     TODO: nonzero may need to be adjusted after tokenization.*/
@@ -798,8 +800,9 @@
     else if(qi01>=0)qii=0;
   }
   else{
-    data[0]=dc*dequant_dc;
-    oc_idct8x8(&_enc->state,data,data,nonzero+1);
+    idct[0]=dc*dequant_dc;
+    /*Note: This clears idct[] back to zero for the next block.*/
+    oc_idct8x8(&_enc->state,data,idct,nonzero+1);
   }
   frags[_fragi].qii=qii;
   if(nqis>1){

Modified: trunk/theora/lib/arm/armidct.s
===================================================================
--- trunk/theora/lib/arm/armidct.s	2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/arm/armidct.s	2010-12-07 10:28:07 UTC (rev 17728)
@@ -64,11 +64,8 @@
 	BL	idct8core_arm
 	BL	idct8core_arm
 	LDR	r0, [r13], #4	; Write to the final destination.
-	; Clear input data for next block (decoder only).
 	SUB	r2, r1, #8*16
-	CMP	r0, r2
-	MOV	r1, r13		; And read from temp storage.
-	BEQ	oc_idct8x8_slow_arm_cols
+	; Clear input data for next block.
 	MOV	r4, #0
 	MOV	r5, #0
 	MOV	r6, #0
@@ -81,7 +78,7 @@
 	STMIA	r2!,{r4,r5,r6,r7}
 	STMIA	r2!,{r4,r5,r6,r7}
 	STMIA	r2!,{r4,r5,r6,r7}
-oc_idct8x8_slow_arm_cols
+	MOV	r1, r13		; And read from temp storage.
 ; Column transforms
 	BL	idct8core_down_arm
 	BL	idct8core_down_arm
@@ -105,18 +102,15 @@
 	BL	idct3core_arm
 	BL	idct2core_arm
 	BL	idct1core_arm
-	; Clear input data for next block (decoder only).
-	SUB	r0, r1, #4*16
-	CMP	r0, r2
-	MOV	r1, r13		; Read from temp storage.
-	BEQ	oc_idct8x8_10_arm_cols
+	; Clear input data for next block.
 	MOV	r4, #0
-	STR	r4, [r0]
-	STR	r4, [r0,#4]
-	STR	r4, [r0,#16]
-	STR	r4, [r0,#20]
-	STR	r4, [r0,#32]
-	STR	r4, [r0,#48]
+	STR	r4, [r1,#-4*16]!
+	STR	r4, [r1,#4]
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#20]
+	STR	r4, [r1,#32]
+	STR	r4, [r1,#48]
+	MOV	r1, r13		; Read from temp storage.
 	MOV	r0, r2		; Write to the final destination
 oc_idct8x8_10_arm_cols
 ; Column transforms
@@ -141,18 +135,14 @@
 	BL	idct3core_arm
 	BL	idct2core_arm
 	BL	idct1core_arm
-	; Clear input data for next block (decoder only).
-	SUB	r0, r1, #3*16
-	CMP	r0, r2
-	MOV	r1, r13		; Read from temp storage.
-	BEQ	oc_idct8x8_6_arm_cols
+	; Clear input data for next block.
 	MOV	r4, #0
-	STR	r4, [r0]
-	STR	r4, [r0,#4]
-	STR	r4, [r0,#16]
-	STR	r4, [r0,#32]
+	STR	r4, [r1,#-3*16]!
+	STR	r4, [r1,#4]
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	MOV	r1, r13		; Read from temp storage.
 	MOV	r0, r2		; Write to the final destination
-oc_idct8x8_6_arm_cols
 ; Column transforms
 	BL	idct3core_down_arm
 	BL	idct3core_down_arm
@@ -174,14 +164,12 @@
 	MOV	r0, r13		; Write to temp storage.
 	BL	idct2core_arm
 	BL	idct1core_arm
-	; Clear input data for next block (decoder only).
-	SUB	r0, r1, #2*16
-	CMP	r0, r2
+	; Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-2*16]!
+	STR	r4, [r1,#16]
 	MOV	r1, r13		; Read from temp storage.
-	MOVNE	r4, #0
-	STRNE	r4, [r0]
-	STRNE	r4, [r0,#16]
-	MOVNE	r0, r2		; Write to the final destination
+	MOV	r0, r2		; Write to the final destination
 ; Column transforms
 	BL	idct2core_down_arm
 	BL	idct2core_down_arm
@@ -799,30 +787,26 @@
 	BL	idct8_8core_v6
 	BL	idct8_8core_v6
 	LDR	r0, [r13], #4	; Write to the final destination.
-	; Clear input data for next block (decoder only).
-	SUB	r2, r1, #8*16
-	CMP	r0, r2
-	MOV	r1, r13		; And read from temp storage.
-	BEQ	oc_idct8x8_slow_v6_cols
+	; Clear input data for next block.
 	MOV	r4, #0
 	MOV	r5, #0
-	STRD	r4, [r2], #8
-	STRD	r4, [r2], #8
-	STRD	r4, [r2], #8
-	STRD	r4, [r2], #8
-	STRD	r4, [r2], #8
-	STRD	r4, [r2], #8
-	STRD	r4, [r2], #8
-	STRD	r4, [r2], #8
-	STRD	r4, [r2], #8
-	STRD	r4, [r2], #8
-	STRD	r4, [r2], #8
-	STRD	r4, [r2], #8
-	STRD	r4, [r2], #8
-	STRD	r4, [r2], #8
-	STRD	r4, [r2], #8
-	STRD	r4, [r2], #8
-oc_idct8x8_slow_v6_cols
+	STRD	r4, [r1,#-8*16]!
+	STRD	r4, [r1,#8]
+	STRD	r4, [r1,#16]
+	STRD	r4, [r1,#24]
+	STRD	r4, [r1,#32]
+	STRD	r4, [r1,#40]
+	STRD	r4, [r1,#48]
+	STRD	r4, [r1,#56]
+	STRD	r4, [r1,#64]
+	STRD	r4, [r1,#72]
+	STRD	r4, [r1,#80]
+	STRD	r4, [r1,#88]
+	STRD	r4, [r1,#96]
+	STRD	r4, [r1,#104]
+	STRD	r4, [r1,#112]
+	STRD	r4, [r1,#120]
+	MOV	r1, r13		; And read from temp storage.
 ; Column transforms
 	BL	idct8_8core_down_v6
 	BL	idct8_8core_down_v6
@@ -843,20 +827,16 @@
 	BL	idct4_3core_v6
 	BL	idct2_1core_v6
 	LDR	r0, [r13], #4	; Write to the final destination.
-	; Clear input data for next block (decoder only).
-	SUB	r2, r1, #4*16
-	CMP	r0, r2
-	AND	r1, r13,#4	; Align the stack.
-	BEQ	oc_idct8x8_10_v6_cols
+	; Clear input data for next block.
 	MOV	r4, #0
 	MOV	r5, #0
-	STRD	r4, [r2]
-	STRD	r4, [r2,#16]
-	STR	r4, [r2,#32]
-	STR	r4, [r2,#48]
-oc_idct8x8_10_v6_cols
-; Column transforms
+	STRD	r4, [r1,#-4*16]!
+	STRD	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	STR	r4, [r1,#48]
+	AND	r1, r13,#4	; Align the stack.
 	ADD	r1, r1, r13	; And read from temp storage.
+; Column transforms
 	BL	idct4_4core_down_v6
 	BL	idct4_4core_down_v6
 	BL	idct4_4core_down_v6
@@ -872,14 +852,12 @@
 	MOV	r8, r0
 	MOV	r0, r13		; Write to temp storage.
 	BL	idct2_1core_v6
-	; Clear input data for next block (decoder only).
-	SUB	r0, r1, #2*16
-	CMP	r0, r8
+	; Clear input data for next block.
+	MOV	r4, #0
+	STR	r4, [r1,#-2*16]!
+	STR	r4, [r1,#16]
 	MOV	r1, r13		; Read from temp storage.
-	MOVNE	r4, #0
-	STRNE	r4, [r0]
-	STRNE	r4, [r0,#16]
-	MOVNE	r0, r8		; Write to the final destination.
+	MOV	r0, r8		; Write to the final destination.
 ; Column transforms
 	BL	idct2_2core_down_v6
 	BL	idct2_2core_down_v6
@@ -1035,20 +1013,16 @@
 	ADD	r0, r0, r13	; Write to temp storage.
 	BL	idct3_2core_v6
 	BL	idct1core_v6
-	; Clear input data for next block (decoder only).
-	SUB	r0, r1, #3*16
-	CMP	r0, r8
-	AND	r1, r13,#4	; Align the stack.
-	BEQ	oc_idct8x8_6_v6_cols
+	; Clear input data for next block.
 	MOV	r4, #0
 	MOV	r5, #0
-	STRD	r4, [r0]
-	STR	r4, [r0,#16]
-	STR	r4, [r0,#32]
+	STRD	r4, [r1,#-3*16]!
+	STR	r4, [r1,#16]
+	STR	r4, [r1,#32]
+	AND	r1, r13,#4	; Align the stack.
 	MOV	r0, r8		; Write to the final destination.
-oc_idct8x8_6_v6_cols
-; Column transforms
 	ADD	r1, r1, r13	; And read from temp storage.
+; Column transforms
 	BL	idct3_3core_down_v6
 	BL	idct3_3core_down_v6
 	BL	idct3_3core_down_v6
@@ -1590,7 +1564,6 @@
 	VSWP		D23,D30
 	; Column transforms
 	BL	oc_idct8x8_stage123_neon
-	CMP	r0,r1
 	; We have to put the return address back in the LR, or the branch
 	;  predictor will not recognize the function return and mis-predict the
 	;  entire call stack.
@@ -1604,7 +1577,6 @@
 	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
 	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
 	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
-	BEQ		oc_idct8x8_slow_neon_noclear
 	VMOV.I8		Q2,#0
 	VPOP		{D8-D15}
 	VMOV.I8		Q3,#0
@@ -1622,19 +1594,6 @@
 	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
 	VSTMIA		r0, {D16-D31}
 	MOV	PC, r14
-
-oc_idct8x8_slow_neon_noclear
-	VPOP		{D8-D15}
-	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
-	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
-	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
-	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
-	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
-	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
-	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
-	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
-	VSTMIA		r0, {D16-D31}
-	MOV	PC, r14
 	ENDP
 
 oc_idct8x8_stage123_neon PROC
@@ -1865,7 +1824,6 @@
 	VADD.S16	Q10,Q1, Q2	; Q10= t[1]'=t[0]+t[2]
 	VSUB.S16	Q2, Q1, Q2	; Q2 = t[2]'=t[0]-t[2]
 ; Stage 4
-	CMP	r0, r1
 	VADD.S16	Q8, Q11,Q15	; Q8  = y[0]=t[0]'+t[7]'
 	VADD.S16	Q9, Q10,Q14	; Q9  = y[1]=t[1]'+t[6]''
 	VSUB.S16	Q15,Q11,Q15	; Q15 = y[7]=t[0]'-t[7]'
@@ -1874,7 +1832,6 @@
 	VADD.S16	Q11,Q3, Q12	; Q11 = y[3]=t[3]'+t[4]'
 	VSUB.S16	Q12,Q3, Q12	; Q12 = y[4]=t[3]'-t[4]'
 	VSUB.S16	Q13,Q2, Q13	; Q13 = y[5]=t[2]'-t[5]''
-	BEQ	oc_idct8x8_10_neon_noclear
 	VMOV.I8		D2, #0
 	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
 	VST1.64		{D2}, [r1 at 64], r12
@@ -1890,18 +1847,6 @@
 	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
 	VSTMIA		r0, {D16-D31}
 	MOV	PC, r14
-
-oc_idct8x8_10_neon_noclear
-	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
-	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
-	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
-	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
-	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
-	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
-	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
-	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
-	VSTMIA		r0, {D16-D31}
-	MOV	PC, r14
 	ENDP
  ]
 

Modified: trunk/theora/lib/c64x/c64xidct.c
===================================================================
--- trunk/theora/lib/c64x/c64xidct.c	2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/c64x/c64xidct.c	2010-12-07 10:28:07 UTC (rev 17728)
@@ -319,12 +319,10 @@
   /*Transform rows of x into columns of w.*/
   for(i=0;i<8;i+=2){
     OC_IDCT8x2_LOAD8(_x+i*8);
-    if(_x!=_y){
-      _amem8(_x+i*8)=0LL;
-      _amem8(_x+i*8+4)=0LL;
-      _amem8(_x+i*8+8)=0LL;
-      _amem8(_x+i*8+12)=0LL;
-    }
+    _amem8(_x+i*8)=0LL;
+    _amem8(_x+i*8+4)=0LL;
+    _amem8(_x+i*8+8)=0LL;
+    _amem8(_x+i*8+12)=0LL;
     OC_IDCT8x2();
     OC_IDCT8x2_STORET(w+i);
   }
@@ -357,12 +355,10 @@
   OC_IDCT8x2_4();
   OC_IDCT8x2_STORET(w);
   OC_IDCT8x2_LOAD2(_x+16);
-  if(_x!=_y){
-    _amem8(_x)=0LL;
-    _amem8(_x+8)=0LL;
-    _amem4(_x+16)=0;
-    _amem4(_x+24)=0;
-  }
+  _amem8(_x)=0LL;
+  _amem8(_x+8)=0LL;
+  _amem4(_x+16)=0;
+  _amem4(_x+24)=0;
   OC_IDCT8x2_2();
   OC_IDCT8x2_STORET(w+2);
   /*Transform rows of w into columns of y.*/
@@ -398,10 +394,8 @@
     OC_IDCT8x2_2();
     OC_IDCT8x2_STORE(w+i*8);
   }
-  if(_x!=_y){
-    _amem4(_x)=0;
-    _amem4(_x+8)=0;
-  }
+  _amem4(_x)=0;
+  _amem4(_x+8)=0;
   /*Transform columns of w into columns of y.*/
   for(i=0;i<8;i+=2){
     OC_IDCT8x2_LOAD2T(w+i);

Modified: trunk/theora/lib/encint.h
===================================================================
--- trunk/theora/lib/encint.h	2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/encint.h	2010-12-07 10:28:07 UTC (rev 17728)
@@ -444,7 +444,7 @@
     This is kept off the stack because a) gcc can't align things on the stack
      reliably on ARM, and b) it avoids (unintentional) data hazards between
      ARM and NEON code.*/
-  OC_ALIGN16(ogg_int16_t dct_data[128]);
+  OC_ALIGN16(ogg_int16_t dct_data[64*3]);
   OC_ALIGN16(signed char bounding_values[256]);
   oc_fr_state         fr[3];
   oc_qii_state        qs[3];
@@ -765,10 +765,12 @@
 
 void oc_enc_tokenize_start(oc_enc_ctx *_enc);
 int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
- ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ ogg_int16_t *_qdct_out,const ogg_int16_t *_qdct_in,
+ const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
  int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin);
 int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
- ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ ogg_int16_t *_qdct_out,const ogg_int16_t *_qdct_in,
+ const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
  int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin);
 void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
  const oc_token_checkpoint *_stack,int _n);

Modified: trunk/theora/lib/enquant.c
===================================================================
--- trunk/theora/lib/enquant.c	2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/enquant.c	2010-12-07 10:28:07 UTC (rev 17728)
@@ -229,7 +229,7 @@
   enquant=(const oc_iquant *)_enquant;
   nonzero=0;
   for(zzi=0;zzi<64;zzi++){
-    val=_dct[OC_FZIG_ZAG[zzi]];
+    val=_dct[zzi];
     d=_dequant[zzi];
     val=val<<1;
     if(abs(val)>=d){

Modified: trunk/theora/lib/fdct.c
===================================================================
--- trunk/theora/lib/fdct.c	2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/fdct.c	2010-12-07 10:28:07 UTC (rev 17728)
@@ -147,7 +147,7 @@
   /*Round the result back to the external working precision (which is still
      scaled by four relative to the orthogonal result).
     TODO: We should just update the external working precision.*/
-  for(i=0;i<64;i++)_y[i]=w[i]+2>>2;
+  for(i=0;i<64;i++)_y[i]=w[OC_FZIG_ZAG[i]]+2>>2;
 }
 
 

Modified: trunk/theora/lib/idct.c
===================================================================
--- trunk/theora/lib/idct.c	2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/idct.c	2010-12-07 10:28:07 UTC (rev 17728)
@@ -241,8 +241,8 @@
   for(i=0;i<8;i++)idct8_2(_y+i,w+i*8);
   /*Adjust for the scale factor.*/
   for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
-  /*Clear input data for next block (decoder only).*/
-  if(_x!=_y)_x[0]=_x[1]=_x[8]=0;
+  /*Clear input data for next block.*/
+  _x[0]=_x[1]=_x[8]=0;
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.
@@ -272,8 +272,8 @@
   for(i=0;i<8;i++)idct8_4(_y+i,w+i*8);
   /*Adjust for the scale factor.*/
   for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
-  /*Clear input data for next block (decoder only).*/
-  if(_x!=_y)_x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0;
+  /*Clear input data for next block.*/
+  _x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0;
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.
@@ -291,7 +291,8 @@
   for(i=0;i<8;i++)idct8(_y+i,w+i*8);
   /*Adjust for the scale factor.*/
   for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
-  if(_x!=_y)for(i=0;i<64;i++)_x[i]=0;
+  /*Clear input data for next block.*/
+  for(i=0;i<64;i++)_x[i]=0;
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.

Modified: trunk/theora/lib/tokenize.c
===================================================================
--- trunk/theora/lib/tokenize.c	2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/tokenize.c	2010-12-07 10:28:07 UTC (rev 17728)
@@ -454,9 +454,10 @@
 
 /*Tokenizes the AC coefficients, possibly adjusting the quantization, and then
    dequantizes and de-zig-zags the result.
-  The DC coefficient is not preserved; it should be restored by the caller.*/
+  The AC coefficients of _idct must be pre-initialized to zero.*/
 int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
- ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ ogg_int16_t *_idct,const ogg_int16_t *_qdct,
+ const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
  int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin){
   oc_token_checkpoint *stack;
   ogg_int64_t          zflags;
@@ -501,7 +502,7 @@
     qc=_qdct[zzi];
     s=-(qc<0);
     qc_m=qc+s^s;
-    c=_dct[OC_FZIG_ZAG[zzi]];
+    c=_dct[zzi];
     /*The hard case: try a zero run.*/
     if(qc_m<=1){
       ogg_uint32_t sum_d2;
@@ -565,7 +566,7 @@
               /*Try a +/- 1 combo token.*/
               token=OC_DCT_RUN_CAT1_TOKEN[nzeros-1];
               eb=OC_DCT_RUN_CAT1_EB[nzeros-1][-val_s];
-              e=_dct[OC_FZIG_ZAG[zzj]]-(_dequant[zzj]+val_s^val_s);
+              e=_dct[zzj]-(_dequant[zzj]+val_s^val_s);
               d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
               bits=oc_token_bits(_enc,huffi,zzi,token);
               cost=d2+_lambda*bits+tokens[zzk][tk].cost;
@@ -585,7 +586,7 @@
               bits=oc_token_bits(_enc,huffi,zzi,token);
               val=2+(val>2);
               sval=val+val_s^val_s;
-              e=_dct[OC_FZIG_ZAG[zzj]]-_dequant[zzj]*sval;
+              e=_dct[zzj]-_dequant[zzj]*sval;
               d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
               cost=d2+_lambda*bits+tokens[zzk][tk].cost;
               if(cost<=best_cost){
@@ -701,9 +702,6 @@
   }
   /*Emit the tokens from the best path through the trellis.*/
   stack=*_stack;
-  /*We blow away the first entry here so that things vectorize better.
-    The DC coefficient is not actually stored in the array yet.*/
-  for(zzi=0;zzi<64;zzi++)_qdct[zzi]=0;
   dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
   zzi=1;
   ti=best_flags>>1&1;
@@ -737,7 +735,7 @@
     zzj=(next>>1)-1&63;
     /*TODO: It may be worth saving the dequantized coefficient in the trellis
        above; we had to compute it to measure the error anyway.*/
-    _qdct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
+    _idct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
     zzi=next>>1;
     ti=next&1;
   }
@@ -747,16 +745,15 @@
 }
 
 /*Simplistic R/D tokenizer.
+  The AC coefficients of _idct must be pre-initialized to zero.
   This could be made more accurate by using more sophisticated
    rate predictions for zeros.
   It could be made faster by switching from R/D decisions to static
    lambda-derived rounding biases.*/
 int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
- ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ ogg_int16_t *_idct,const ogg_int16_t *_qdct,
+ const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
  int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin){
-  /*Note that gcc will not always respect this alignment.
-    In this case it doesn't matter terribly much.*/
-  OC_ALIGN16(ogg_int16_t  coef[64]);
   const unsigned char *dct_fzig_zag;
   ogg_uint16_t        *eob_run;
   oc_token_checkpoint *stack;
@@ -779,9 +776,7 @@
   eob_run=_enc->eob_run[_pli];
   dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
   huffi=_enc->huff_idxs[_enc->state.frame_type][1][_pli+1>>1];
-  memcpy(coef,_qdct,_zzi*sizeof(*coef));
-  for(zzj=0;zzj<64;zzj++)_qdct[zzj]=0;
-  for(zzj=zzi=1;zzj<_zzi&&!coef[zzj];zzj++);
+  for(zzj=zzi=1;zzj<_zzi&&!_qdct[zzj];zzj++);
   while(zzj<_zzi){
     int v;
     int d0;
@@ -797,10 +792,10 @@
     int eob_bits;
     int dct_fzig_zzj;
     dct_fzig_zzj=dct_fzig_zag[zzj];
-    v=_dct[OC_FZIG_ZAG[zzj]];
-    d0=coef[zzj];
+    v=_dct[zzj];
+    d0=_qdct[zzj];
     eob=eob_run[zzi];
-    for(zzk=zzj+1;zzk<_zzi&&!coef[zzk];zzk++);
+    for(zzk=zzj+1;zzk<_zzi&&!_qdct[zzk];zzk++);
     next_zero=zzk-zzj+62>>6;
     dq0=d0*_dequant[zzj];
     dd0=dq0-v;
@@ -840,7 +835,7 @@
         cost=dd1+zr[next_zero];
       }
       if((dd0+(best_bits+eob_bits)*_lambda)>cost){
-        _qdct[dct_fzig_zzj]=dq1;
+        _idct[dct_fzig_zzj]=dq1;
         if(d1==0){
           zzj=zzk;
           continue;
@@ -851,7 +846,7 @@
       }
       else{
         best_eb=*(OC_DCT_VALUE_EB_PTR+d0);
-        _qdct[dct_fzig_zzj]=dq0;
+        _idct[dct_fzig_zzj]=dq0;
       }
       oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
       if(eob>0){
@@ -927,7 +922,6 @@
       }
       best_cost=dd0+(best_bits+eob_bits)*_lambda;
       if(d1==0&&(dd1+zr[2+next_zero])<=best_cost){
-        _qdct[dct_fzig_zzj]=0;
         zzj=zzk;
         continue;
       }
@@ -936,9 +930,9 @@
         best_token=best_token1;
         best_eb=best_eb1;
         d=d1;
-        _qdct[dct_fzig_zzj]=dq1;
+        _idct[dct_fzig_zzj]=dq1;
       }
-      else _qdct[dct_fzig_zzj]=dq0;
+      else _idct[dct_fzig_zzj]=dq0;
       oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
       if(eob){
         oc_enc_eob_log(_enc,_pli,zzi,eob);

Modified: trunk/theora/lib/x86/mmxfdct.c
===================================================================
--- trunk/theora/lib/x86/mmxfdct.c	2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/x86/mmxfdct.c	2010-12-07 10:28:07 UTC (rev 17728)
@@ -12,6 +12,7 @@
 /*MMX fDCT implementation for x86_32*/
 /*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
 #include "x86enc.h"
+#include "x86zigzag.h"
 
 #if defined(OC_X86_ASM)
 
@@ -462,8 +463,9 @@
    mm7 = d3 c3 b3 a3*/ \
 
 /*MMX implementation of the fDCT.*/
-void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
-  ptrdiff_t a;
+void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  ogg_int16_t buf[64] __attribute__((aligned(8)));
+  ptrdiff_t   a;
   __asm__ __volatile__(
     /*Add two extra bits of working precision to improve accuracy; any more and
        we could overflow.*/
@@ -586,78 +588,89 @@
     "movq 0x30(%[y]),%%mm3\n\t"
     OC_FDCT_STAGE1_8x4
     OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
-    OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
-    /*mm0={-2}x4*/
-    "pcmpeqw %%mm0,%%mm0\n\t"
-    "paddw %%mm0,%%mm0\n\t"
-    /*Round the results.*/
-    "psubw %%mm0,%%mm1\n\t"
-    "psubw %%mm0,%%mm2\n\t"
-    "psraw $2,%%mm1\n\t"
-    "psubw %%mm0,%%mm3\n\t"
-    "movq %%mm1,0x18(%[y])\n\t"
-    "psraw $2,%%mm2\n\t"
-    "psubw %%mm0,%%mm4\n\t"
-    "movq 0x08(%[y]),%%mm1\n\t"
-    "psraw $2,%%mm3\n\t"
-    "psubw %%mm0,%%mm5\n\t"
+    /*mm2={-2}x4*/
+    "pcmpeqw %%mm2,%%mm2\n\t"
+    "paddw %%mm2,%%mm2\n\t"
+    /*Round and store the results (no transpose).*/
+    "movq 0x10(%[y]),%%mm7\n\t"
+    "psubw %%mm2,%%mm4\n\t"
+    "psubw %%mm2,%%mm6\n\t"
     "psraw $2,%%mm4\n\t"
-    "psubw %%mm0,%%mm6\n\t"
+    "psubw %%mm2,%%mm0\n\t"
+    "movq %%mm4,0x00(%[buf])\n\t"
+    "movq 0x30(%[y]),%%mm4\n\t"
+    "psraw $2,%%mm6\n\t"
+    "psubw %%mm2,%%mm5\n\t"
+    "movq %%mm6,0x20(%[buf])\n\t"
+    "psraw $2,%%mm0\n\t"
+    "psubw %%mm2,%%mm3\n\t"
+    "movq %%mm0,0x40(%[buf])\n\t"
     "psraw $2,%%mm5\n\t"
-    "psubw %%mm0,%%mm7\n\t"
-    "psraw $2,%%mm6\n\t"
-    "psubw %%mm0,%%mm1\n\t"
+    "psubw %%mm2,%%mm1\n\t"
+    "movq %%mm5,0x50(%[buf])\n\t"
+    "psraw $2,%%mm3\n\t"
+    "psubw %%mm2,%%mm7\n\t"
+    "movq %%mm3,0x60(%[buf])\n\t"
+    "psraw $2,%%mm1\n\t"
+    "psubw %%mm2,%%mm4\n\t"
+    "movq %%mm1,0x70(%[buf])\n\t"
     "psraw $2,%%mm7\n\t"
+    "movq %%mm7,0x10(%[buf])\n\t"
+    "psraw $2,%%mm4\n\t"
+    "movq %%mm4,0x30(%[buf])\n\t"
+    /*Load the next block.*/
     "movq 0x40(%[y]),%%mm0\n\t"
-    "psraw $2,%%mm1\n\t"
-    "movq %%mm7,0x30(%[y])\n\t"
     "movq 0x78(%[y]),%%mm7\n\t"
-    "movq %%mm1,0x08(%[y])\n\t"
     "movq 0x50(%[y]),%%mm1\n\t"
-    "movq %%mm6,0x20(%[y])\n\t"
     "movq 0x68(%[y]),%%mm6\n\t"
-    "movq %%mm2,0x28(%[y])\n\t"
     "movq 0x60(%[y]),%%mm2\n\t"
-    "movq %%mm5,0x10(%[y])\n\t"
     "movq 0x58(%[y]),%%mm5\n\t"
-    "movq %%mm3,0x38(%[y])\n\t"
     "movq 0x70(%[y]),%%mm3\n\t"
-    "movq %%mm4,0x00(%[y])\n\t"
     "movq 0x48(%[y]),%%mm4\n\t"
     OC_FDCT_STAGE1_8x4
     OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
-    OC_TRANSPOSE8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
-    /*mm0={-2}x4*/
-    "pcmpeqw %%mm0,%%mm0\n\t"
-    "paddw %%mm0,%%mm0\n\t"
-    /*Round the results.*/
-    "psubw %%mm0,%%mm1\n\t"
-    "psubw %%mm0,%%mm2\n\t"
-    "psraw $2,%%mm1\n\t"
-    "psubw %%mm0,%%mm3\n\t"
-    "movq %%mm1,0x58(%[y])\n\t"
-    "psraw $2,%%mm2\n\t"
-    "psubw %%mm0,%%mm4\n\t"
-    "movq 0x48(%[y]),%%mm1\n\t"
-    "psraw $2,%%mm3\n\t"
-    "psubw %%mm0,%%mm5\n\t"
-    "movq %%mm2,0x68(%[y])\n\t"
+    /*mm2={-2}x4*/
+    "pcmpeqw %%mm2,%%mm2\n\t"
+    "paddw %%mm2,%%mm2\n\t"
+    /*Round and store the results (no transpose).*/
+    "movq 0x50(%[y]),%%mm7\n\t"
+    "psubw %%mm2,%%mm4\n\t"
+    "psubw %%mm2,%%mm6\n\t"
     "psraw $2,%%mm4\n\t"
-    "psubw %%mm0,%%mm6\n\t"
-    "movq %%mm3,0x78(%[y])\n\t"
+    "psubw %%mm2,%%mm0\n\t"
+    "movq %%mm4,0x08(%[buf])\n\t"
+    "movq 0x70(%[y]),%%mm4\n\t"
+    "psraw $2,%%mm6\n\t"
+    "psubw %%mm2,%%mm5\n\t"
+    "movq %%mm6,0x28(%[buf])\n\t"
+    "psraw $2,%%mm0\n\t"
+    "psubw %%mm2,%%mm3\n\t"
+    "movq %%mm0,0x48(%[buf])\n\t"
     "psraw $2,%%mm5\n\t"
-    "psubw %%mm0,%%mm7\n\t"
-    "movq %%mm4,0x40(%[y])\n\t"
-    "psraw $2,%%mm6\n\t"
-    "psubw %%mm0,%%mm1\n\t"
-    "movq %%mm5,0x50(%[y])\n\t"
+    "psubw %%mm2,%%mm1\n\t"
+    "movq %%mm5,0x58(%[buf])\n\t"
+    "psraw $2,%%mm3\n\t"
+    "psubw %%mm2,%%mm7\n\t"
+    "movq %%mm3,0x68(%[buf])\n\t"
+    "psraw $2,%%mm1\n\t"
+    "psubw %%mm2,%%mm4\n\t"
+    "movq %%mm1,0x78(%[buf])\n\t"
     "psraw $2,%%mm7\n\t"
-    "movq %%mm6,0x60(%[y])\n\t"
-    "psraw $2,%%mm1\n\t"
-    "movq %%mm7,0x70(%[y])\n\t"
-    "movq %%mm1,0x48(%[y])\n\t"
+    "movq %%mm7,0x18(%[buf])\n\t"
+    "psraw $2,%%mm4\n\t"
+    "movq %%mm4,0x38(%[buf])\n\t"
+    /*Final transpose and zig-zag.*/
+#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
+    "movq 0x"_row"0(%[buf]),"_reg"\n\t" \
+
+#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
+    "movq 0x"_row"8(%[buf]),"_reg"\n\t" \
+
+    OC_TRANSPOSE_ZIG_ZAG_MMXEXT
+#undef OC_ZZ_LOAD_ROW_LO
+#undef OC_ZZ_LOAD_ROW_HI
     :[a]"=&r"(a)
-    :[y]"r"(_y),[x]"r"(_x)
+    :[y]"r"(_y),[x]"r"(_x),[buf]"r"(buf)
     :"memory"
   );
 }

Modified: trunk/theora/lib/x86/mmxidct.c
===================================================================
--- trunk/theora/lib/x86/mmxidct.c	2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/x86/mmxidct.c	2010-12-07 10:28:07 UTC (rev 17728)
@@ -284,6 +284,7 @@
   "#end OC_COLUMN_IDCT\n\t" \
 
 static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  int i;
   /*This routine accepts an 8x8 matrix, but in partially transposed form.
     Every 4x4 block is transposed.*/
   __asm__ __volatile__(
@@ -313,18 +314,15 @@
     :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
      [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
   );
-  if(_x!=_y){
-    int i;
-    __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
-    for(i=0;i<4;i++){
-      __asm__ __volatile__(
-        "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
-        "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
-        "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
-        "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
-        :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
-      );
-    }
+  __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
+  for(i=0;i<4;i++){
+    __asm__ __volatile__(
+      "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
+      :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
+    );
   }
 }
 
@@ -514,16 +512,14 @@
     :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
      [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
   );
-  if(_x!=_y){
-    __asm__ __volatile__(
-      "pxor %%mm0,%%mm0\n\t"
-      "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
-      "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
-      "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
-      "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
-      :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
-    );
-  }
+  __asm__ __volatile__(
+    "pxor %%mm0,%%mm0\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+    :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
+  );
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.

Modified: trunk/theora/lib/x86/sse2fdct.c
===================================================================
--- trunk/theora/lib/x86/sse2fdct.c	2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/x86/sse2fdct.c	2010-12-07 10:28:07 UTC (rev 17728)
@@ -13,6 +13,7 @@
 /*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
 #include <stddef.h>
 #include "x86enc.h"
+#include "x86zigzag.h"
 #include "sse2trans.h"
 
 #if defined(OC_X86_64_ASM)
@@ -412,8 +413,6 @@
     /*Transform rows.*/
     OC_TRANSPOSE_8x8
     OC_FDCT_8x8
-    /*TODO: zig-zag ordering?*/
-    OC_TRANSPOSE_8x8
     /*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/
     "paddw %%xmm14,%%xmm14\n\t"
     "psubw %%xmm14,%%xmm0\n\t"
@@ -432,15 +431,19 @@
     "psubw %%xmm14,%%xmm7\n\t"
     "psraw $2,%%xmm6\n\t"
     "psraw $2,%%xmm7\n\t"
-    /*Store the result.*/
-    "movdqa %%xmm0,0x00(%[y])\n\t"
-    "movdqa %%xmm1,0x10(%[y])\n\t"
-    "movdqa %%xmm2,0x20(%[y])\n\t"
-    "movdqa %%xmm3,0x30(%[y])\n\t"
-    "movdqa %%xmm4,0x40(%[y])\n\t"
-    "movdqa %%xmm5,0x50(%[y])\n\t"
-    "movdqa %%xmm6,0x60(%[y])\n\t"
-    "movdqa %%xmm7,0x70(%[y])\n\t"
+    /*Transpose, zig-zag, and store the result.*/
+    /*We could probably do better using SSSE3's palignr, but re-using the
+       MMXEXT version will do for now.*/
+#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
+    "movdq2q %%xmm"_row","_reg"\n\t" \
+
+#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
+    "punpckhqdq %%xmm"_row",%%xmm"_row"\n\t" \
+    "movdq2q %%xmm"_row","_reg"\n\t" \
+
+    OC_TRANSPOSE_ZIG_ZAG_MMXEXT
+#undef OC_ZZ_LOAD_ROW_LO
+#undef OC_ZZ_LOAD_ROW_HI
     :[a]"=&r"(a)
     :[y]"r"(_y),[x]"r"(_x)
     :"memory"

Modified: trunk/theora/lib/x86/sse2idct.c
===================================================================
--- trunk/theora/lib/x86/sse2idct.c	2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/x86/sse2idct.c	2010-12-07 10:28:07 UTC (rev 17728)
@@ -208,6 +208,7 @@
 
 static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
   OC_ALIGN16(ogg_int16_t buf[16]);
+  int i;
   /*This routine accepts an 8x8 matrix pre-transposed.*/
   __asm__ __volatile__(
     /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
@@ -230,19 +231,16 @@
     :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
      [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
   );
-  if(_x!=_y){
-    int i;
-    __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::);
-    /*Clear input data for next block (decoder only).*/
-    for(i=0;i<2;i++){
-      __asm__ __volatile__(
-        "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
-        "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
-        "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
-        "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
-        :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
-      );
-    }
+  __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::);
+  /*Clear input data for next block.*/
+  for(i=0;i<2;i++){
+    __asm__ __volatile__(
+      "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+      "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+      "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+      "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+      :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
+    );
   }
 }
 
@@ -411,17 +409,15 @@
     :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
      [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
   );
-  if(_x!=_y){
-    /*Clear input data for next block (decoder only).*/
-    __asm__ __volatile__(
-      "pxor %%mm0,%%mm0\n\t"
-      "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
-      "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
-      "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
-      "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
-      :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
-    );
-  }
+  /*Clear input data for next block.*/
+  __asm__ __volatile__(
+    "pxor %%mm0,%%mm0\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+    :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
+  );
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.

Modified: trunk/theora/lib/x86/x86enc.c
===================================================================
--- trunk/theora/lib/x86/x86enc.c	2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/x86/x86enc.c	2010-12-07 10:28:07 UTC (rev 17728)
@@ -28,7 +28,6 @@
     _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
     _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
     _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
-    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx;
   }
   if(cpu_flags&OC_CPU_X86_MMXEXT){
     _enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
@@ -38,6 +37,7 @@
     _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_mmxext;
     _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
     _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
+    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmxext;
   }
   if(cpu_flags&OC_CPU_X86_SSE2){
 #  if defined(OC_X86_64_ASM)

Modified: trunk/theora/lib/x86/x86enc.h
===================================================================
--- trunk/theora/lib/x86/x86enc.h	2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/x86/x86enc.h	2010-12-07 10:28:07 UTC (rev 17728)
@@ -105,7 +105,7 @@
 void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis);
 int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
  const ogg_uint16_t _dequant[64],const void *_enquant);
-void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
 
 # if defined(OC_X86_64_ASM)
 void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);

Modified: trunk/theora/lib/x86/x86enquant.c
===================================================================
--- trunk/theora/lib/x86/x86enquant.c	2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/x86/x86enquant.c	2010-12-07 10:28:07 UTC (rev 17728)
@@ -56,125 +56,17 @@
   }
 }
 
-/*Convert DCT coefficients in %[dct] from natural order into zig-zag scan order
-   and store them in %[qdct].
-  The index of each output element in the original 64-element array should wind
-   up in the following 8x8 matrix (the letters indicate the order we compute
-   each 4-tuple below):
-    A  0  1  8 16   9  2  3 10 B
-    C 17 24 32 25  18 11  4  5 D
-    E 12 19 26 33  40 48 41 34 I
-    H 27 20 13  6   7 14 21 28 G
-    K 35 42 49 56  57 50 43 36 J
-    F 29 22 15 23  30 37 44 51 M
-    P 58 59 52 45  38 31 39 46 L
-    N 53 60 61 54  47 55 62 63 O
-  The order of the coefficients within each tuple is reversed in the comments
-   below to reflect the usual MSB to LSB notation.*/
-#define OC_ZIG_ZAG_MMXEXT \
-  "movq 0x00(%[dct]),%%mm0\n\t"  /*mm0=03 02 01 00*/ \
-  "movq 0x08(%[dct]),%%mm1\n\t"  /*mm1=07 06 05 04*/ \
-  "movq 0x10(%[dct]),%%mm2\n\t"  /*mm2=11 10 09 08*/ \
-  "movq 0x20(%[dct]),%%mm3\n\t"  /*mm3=19 18 17 16*/ \
-  "movq 0x30(%[dct]),%%mm4\n\t"  /*mm4=27 26 25 24*/ \
-  "movq 0x40(%[dct]),%%mm5\n\t"  /*mm5=35 34 33 32*/ \
-  "movq %%mm2,%%mm7\n\t"         /*mm7=11 10 09 08*/ \
-  "punpcklwd %%mm3,%%mm2\n\t"    /*mm2=17 09 16 08*/ \
-  "movq %%mm0,%%mm6\n\t"         /*mm6=03 02 01 00*/ \
-  "punpckldq %%mm2,%%mm0\n\t"    /*mm0=16 08 01 00 *A*/ \
-  "movq %%mm0,0x00(%[qdct])\n\t" \
-  "movq 0x18(%[dct]),%%mm0\n\t"  /*mm0=15 14 13 12*/ \
-  "punpckhdq %%mm6,%%mm6\n\t"    /*mm6=03 02 03 02*/ \
-  "psrlq $16,%%mm7\n\t"          /*mm7=.. 11 10 09*/ \
-  "punpckldq %%mm7,%%mm6\n\t"    /*mm6=10 09 03 02*/ \
-  "punpckhwd %%mm7,%%mm3\n\t"    /*mm3=.. 19 11 18*/ \
-  "pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
-  "movq %%mm6,0x08(%[qdct])\n\t" \
-  "psrlq $48,%%mm2\n\t"          /*mm2=.. .. .. 17*/ \
-  "movq %%mm1,%%mm6\n\t"         /*mm6=07 06 05 04*/ \
-  "punpcklwd %%mm5,%%mm2\n\t"    /*mm2=33 .. 32 17*/ \
-  "movq %%mm3,%%mm7\n\t"         /*mm7=.. 19 11 18*/ \
-  "punpckldq %%mm1,%%mm3\n\t"    /*mm3=05 04 11 18 *C*/ \
-  "por %%mm2,%%mm7\n\t"          /*mm7=33 19 ?? ??*/ \
-  "punpcklwd %%mm4,%%mm2\n\t"    /*mm2=25 32 24 17 *D**/ \
-  "movq %%mm2,0x10(%[qdct])\n\t" \
-  "movq %%mm3,0x18(%[qdct])\n\t" \
-  "movq 0x28(%[dct]),%%mm2\n\t"  /*mm2=23 22 21 20*/ \
-  "movq 0x38(%[dct]),%%mm1\n\t"  /*mm1=31 30 29 28*/ \
-  "pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
-  "punpckhdq %%mm7,%%mm7\n\t"    /*mm7=33 19 33 19*/ \
-  "punpckhwd %%mm3,%%mm6\n\t"    /*mm6=14 07 13 06*/ \
-  "punpckldq %%mm0,%%mm0\n\t"    /*mm0=13 12 13 12*/ \
-  "punpcklwd %%mm1,%%mm3\n\t"    /*mm3=29 15 28 12*/ \
-  "punpckhwd %%mm4,%%mm0\n\t"    /*mm0=27 13 26 12*/ \
-  "pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
-  "psrlq $48,%%mm4\n\t"          /*mm4=.. .. .. 27*/ \
-  "punpcklwd %%mm7,%%mm0\n\t"    /*mm0=33 26 19 12 *E*/ \
-  "punpcklwd %%mm1,%%mm4\n\t"    /*mm4=29 .. 28 27*/ \
-  "punpckhwd %%mm2,%%mm3\n\t"    /*mm3=23 15 22 29 *F*/ \
-  "movq %%mm0,0x20(%[qdct])\n\t" \
-  "movq %%mm3,0x50(%[qdct])\n\t" \
-  "movq 0x60(%[dct]),%%mm3\n\t"  /*mm3=51 50 49 48*/ \
-  "movq 0x70(%[dct]),%%mm7\n\t"  /*mm7=59 58 57 56*/ \
-  "movq 0x50(%[dct]),%%mm0\n\t"  /*mm0=43 42 41 40*/ \
-  "punpcklwd %%mm4,%%mm2\n\t"    /*mm2=28 21 27 20*/ \
-  "psrlq $32,%%mm5\n\t"          /*mm5=.. .. 35 34*/ \
-  "movq %%mm2,%%mm4\n\t"         /*mm4=28 21 27 20*/ \
-  "punpckldq %%mm6,%%mm2\n\t"    /*mm2=13 06 27 20*/ \
-  "punpckhdq %%mm4,%%mm6\n\t"    /*mm6=28 21 14 07 *G*/ \
-  "movq %%mm3,%%mm4\n\t"         /*mm4=51 50 49 48*/ \
-  "pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
-  "movq %%mm2,0x30(%[qdct])\n\t" \
-  "movq %%mm6,0x38(%[qdct])\n\t" \
-  "movq 0x48(%[dct]),%%mm2\n\t"  /*mm2=39 38 37 36*/ \
-  "punpcklwd %%mm5,%%mm4\n\t"    /*mm4=35 49 34 48*/ \
-  "movq 0x58(%[dct]),%%mm5\n\t"  /*mm5=47 46 45 44*/ \
-  "punpckldq %%mm7,%%mm6\n\t"    /*mm6=57 56 14 07*/ \
-  "psrlq $32,%%mm3\n\t"          /*mm3=.. .. 51 50*/ \
-  "punpckhwd %%mm0,%%mm6\n\t"    /*mm6=43 57 42 56*/ \
-  "punpcklwd %%mm4,%%mm0\n\t"    /*mm0=34 41 48 40 *I*/ \
-  "pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
-  "movq %%mm0,0x28(%[qdct])\n\t" \
-  "punpcklwd %%mm2,%%mm3\n\t"    /*mm3=37 51 36 50*/ \
-  "punpckhwd %%mm6,%%mm4\n\t"    /*mm4=42 35 56 49*/ \
-  "punpcklwd %%mm3,%%mm6\n\t"    /*mm6=36 43 50 57 *J*/ \
-  "pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
-  "movq %%mm4,0x40(%[qdct])\n\t" \
-  "movq %%mm6,0x48(%[qdct])\n\t" \
-  "movq 0x68(%[dct]),%%mm6\n\t"  /*mm6=55 54 53 52*/ \
-  "movq 0x78(%[dct]),%%mm0\n\t"  /*mm0=63 62 61 60*/ \
-  "psrlq $32,%%mm1\n\t"          /*mm1=.. .. 31 30*/ \
-  "pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
-  "pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
-  "punpcklwd %%mm5,%%mm1\n\t"    /*mm1=46 31 44 30*/ \
-  "pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
-  "punpckhwd %%mm1,%%mm2\n\t"    /*mm2=46 39 31 38 *L*/ \
-  "punpcklwd %%mm3,%%mm1\n\t"    /*mm1=51 44 37 30 *M*/ \
-  "movq %%mm2,0x68(%[qdct])\n\t" \
-  "movq %%mm1,0x58(%[qdct])\n\t" \
-  "punpckhwd %%mm6,%%mm5\n\t"    /*mm5=55 47 52 45*/ \
-  "punpckldq %%mm0,%%mm6\n\t"    /*mm6=61 60 54 53*/ \
-  "pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
-  "pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=53 60 61 54 *N*/ \
-  "punpckhdq %%mm0,%%mm5\n\t"    /*mm5=63 62 55 47 *O*/ \
-  "punpckhdq %%mm4,%%mm7\n\t"    /*mm7=45 52 59 58 *P*/ \
-  "movq %%mm6,0x70(%[qdct])\n\t" \
-  "movq %%mm5,0x78(%[qdct])\n\t" \
-  "movq %%mm7,0x60(%[qdct])\n\t" \
-
 int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
  const ogg_uint16_t _dequant[64],const void *_enquant){
   ptrdiff_t r;
   __asm__ __volatile__(
-    /*Put the input in zig-zag order.*/
-    OC_ZIG_ZAG_MMXEXT
     "xor %[r],%[r]\n\t"
     /*Loop through two rows at a time.*/
     ".p2align 4\n\t"
     "0:\n\t"
     /*Load the first two rows of the data and the quant matrices.*/
-    "movdqa 0x00(%[qdct],%[r]),%%xmm0\n\t"
-    "movdqa 0x10(%[qdct],%[r]),%%xmm1\n\t"
+    "movdqa 0x00(%[dct],%[r]),%%xmm0\n\t"
+    "movdqa 0x10(%[dct],%[r]),%%xmm1\n\t"
     "movdqa 0x00(%[dq],%[r]),%%xmm2\n\t"
     "movdqa 0x10(%[dq],%[r]),%%xmm3\n\t"
     "movdqa 0x00(%[q],%[r]),%%xmm4\n\t"

Added: trunk/theora/lib/x86/x86zigzag.h
===================================================================
--- trunk/theora/lib/x86/x86zigzag.h	                        (rev 0)
+++ trunk/theora/lib/x86/x86zigzag.h	2010-12-07 10:28:07 UTC (rev 17728)
@@ -0,0 +1,244 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_x86zigzag_H)
+# define _x86_x86zigzag_H (1)
+# include "x86enc.h"
+
+
+/*Converts DCT coefficients from transposed order into zig-zag scan order and
+   stores them in %[y].
+  All 64 coefficients are written: sixteen 8-byte stores cover byte offsets
+   0x00 through 0x78 of %[y].
+  Every MMX register (mm0 through mm7) is overwritten, and pshufw is used, so
+   the including code needs MMXEXT support (hence the name).
+  No emms is issued by this macro.
+  This relies on two macros to load the contents of each row:
+   OC_ZZ_LOAD_ROW_LO(row,reg) and OC_ZZ_LOAD_ROW_HI(row,reg), which load the
+   first four and second four entries of each row into the specified register,
+   respectively.
+  Each of the two loaders is invoked exactly once for each of the eight rows.
+  OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row
+   (because when the rows are already in SSE2 registers, loading the high half
+   destructively modifies the register).
+  The index of each output element in the original 64-element array should wind
+   up in the following 8x8 matrix (the letters indicate the order we compute
+   each 4-tuple below):
+    A  0  8  1  2   9 16 24 17 B
+    C 10  3  4 11  18 25 32 40 E
+    F 33 26 19 12   5  6 13 20 D
+    G 27 34 41 48  56 49 42 35 I
+    L 28 21 14  7  15 22 29 36 M
+    H 43 50 57 58  51 44 37 30 O
+    N 23 31 38 45  52 59 60 53 J
+    P 46 39 47 54  61 62 55 63 K
+  The order of the coefficients within each tuple is reversed in the comments
+   below to reflect the usual MSB to LSB notation.
+  In those comments, ".." marks a lane that was zeroed by a shift and "??"
+   marks a lane whose contents are not used.*/
+#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \
+  OC_ZZ_LOAD_ROW_LO("0","%%mm0") /*mm0=03 02 01 00*/ \
+  OC_ZZ_LOAD_ROW_LO("1","%%mm1") /*mm1=11 10 09 08*/ \
+  OC_ZZ_LOAD_ROW_LO("2","%%mm2") /*mm2=19 18 17 16*/ \
+  OC_ZZ_LOAD_ROW_LO("3","%%mm3") /*mm3=27 26 25 24*/ \
+  OC_ZZ_LOAD_ROW_HI("0","%%mm4") /*mm4=07 06 05 04*/ \
+  OC_ZZ_LOAD_ROW_HI("1","%%mm5") /*mm5=15 14 13 12*/ \
+  OC_ZZ_LOAD_ROW_HI("2","%%mm6") /*mm6=23 22 21 20*/ \
+  "movq %%mm0,%%mm7\n\t"         /*mm7=03 02 01 00*/ \
+  "punpckhdq %%mm1,%%mm0\n\t"    /*mm0=11 10 03 02*/ \
+  "pshufw $0x39,%%mm4,%%mm4\n\t" /*mm4=04 07 06 05*/ \
+  "punpcklwd %%mm0,%%mm1\n\t"    /*mm1=03 09 02 08*/ \
+  "pshufw $0x39,%%mm5,%%mm5\n\t" /*mm5=12 15 14 13*/ \
+  "punpcklwd %%mm1,%%mm7\n\t"    /*mm7=02 01 08 00 *A*/ \
+  "movq %%mm7,0x00(%[y])\n\t" \
+  "punpckhwd %%mm4,%%mm1\n\t"    /*mm1=04 03 07 09*/ \
+  "movq %%mm2,%%mm7\n\t"         /*mm7=19 18 17 16*/ \
+  "punpckhdq %%mm1,%%mm0\n\t"    /*mm0=04 03 11 10*/ \
+  "punpckhwd %%mm5,%%mm7\n\t"    /*mm7=12 19 15 18*/ \
+  "punpcklwd %%mm3,%%mm1\n\t"    /*mm1=25 07 24 09*/ \
+  "punpcklwd %%mm6,%%mm5\n\t"    /*mm5=21 14 20 13*/ \
+  "punpcklwd %%mm2,%%mm1\n\t"    /*mm1=17 24 16 09 *B*/ \
+  OC_ZZ_LOAD_ROW_LO("4","%%mm2") /*mm2=35 34 33 32*/ \
+  "movq %%mm1,0x08(%[y])\n\t" \
+  OC_ZZ_LOAD_ROW_LO("5","%%mm1") /*mm1=43 42 41 40*/ \
+  "pshufw $0x78,%%mm0,%%mm0\n\t" /*mm0=11 04 03 10 *C*/ \
+  "movq %%mm0,0x10(%[y])\n\t" \
+  "punpckhdq %%mm4,%%mm6\n\t"    /*mm6=?? 07 23 22*/ \
+  "punpckldq %%mm5,%%mm4\n\t"    /*mm4=20 13 06 05 *D*/ \
+  "movq %%mm4,0x28(%[y])\n\t" \
+  "psrlq $16,%%mm3\n\t"          /*mm3=.. 27 26 25*/ \
+  "pshufw $0x0E,%%mm2,%%mm0\n\t" /*mm0=?? ?? 35 34*/ \
+  "movq %%mm7,%%mm4\n\t"         /*mm4=12 19 15 18*/ \
+  "punpcklwd %%mm3,%%mm2\n\t"    /*mm2=26 33 25 32*/ \
+  "punpcklwd %%mm1,%%mm4\n\t"    /*mm4=41 15 40 18*/ \
+  "punpckhwd %%mm1,%%mm3\n\t"    /*mm3=43 .. 42 27*/ \
+  "punpckldq %%mm2,%%mm4\n\t"    /*mm4=25 32 40 18*/ \
+  "punpcklwd %%mm0,%%mm3\n\t"    /*mm3=35 42 34 27*/ \
+  OC_ZZ_LOAD_ROW_LO("6","%%mm0") /*mm0=51 50 49 48*/ \
+  "pshufw $0x6C,%%mm4,%%mm4\n\t" /*mm4=40 32 25 18 *E*/ \
+  "movq %%mm4,0x18(%[y])\n\t" \
+  OC_ZZ_LOAD_ROW_LO("7","%%mm4") /*mm4=59 58 57 56*/ \
+  "punpckhdq %%mm7,%%mm2\n\t"    /*mm2=12 19 26 33 *F*/ \
+  "movq %%mm2,0x20(%[y])\n\t" \
+  "pshufw $0xD0,%%mm1,%%mm1\n\t" /*mm1=43 41 ?? ??*/ \
+  "pshufw $0x87,%%mm0,%%mm0\n\t" /*mm0=50 48 49 51*/ \
+  "movq %%mm3,%%mm2\n\t"         /*mm2=35 42 34 27*/ \
+  "punpckhwd %%mm0,%%mm1\n\t"    /*mm1=50 43 48 41*/ \
+  "pshufw $0x93,%%mm4,%%mm4\n\t" /*mm4=58 57 56 59*/ \
+  "punpckldq %%mm1,%%mm3\n\t"    /*mm3=48 41 34 27 *G*/ \
+  "movq %%mm3,0x30(%[y])\n\t" \
+  "punpckhdq %%mm4,%%mm1\n\t"    /*mm1=58 57 50 43 *H*/ \
+  "movq %%mm1,0x50(%[y])\n\t" \
+  OC_ZZ_LOAD_ROW_HI("7","%%mm1") /*mm1=63 62 61 60*/ \
+  "punpcklwd %%mm0,%%mm4\n\t"    /*mm4=49 56 51 59*/ \
+  OC_ZZ_LOAD_ROW_HI("6","%%mm0") /*mm0=55 54 53 52*/ \
+  "psllq $16,%%mm6\n\t"          /*mm6=07 23 22 ..*/ \
+  "movq %%mm4,%%mm3\n\t"         /*mm3=49 56 51 59*/ \
+  "punpckhdq %%mm2,%%mm4\n\t"    /*mm4=35 42 49 56 *I*/ \
+  OC_ZZ_LOAD_ROW_HI("3","%%mm2") /*mm2=31 30 29 28*/ \
+  "movq %%mm4,0x38(%[y])\n\t" \
+  "punpcklwd %%mm1,%%mm3\n\t"    /*mm3=61 51 60 59*/ \
+  "punpcklwd %%mm6,%%mm7\n\t"    /*mm7=22 15 .. ??*/ \
+  "movq %%mm3,%%mm4\n\t"         /*mm4=61 51 60 59*/ \
+  "punpcklwd %%mm0,%%mm3\n\t"    /*mm3=53 60 52 59*/ \
+  "punpckhwd %%mm0,%%mm4\n\t"    /*mm4=55 61 54 51*/ \
+  OC_ZZ_LOAD_ROW_HI("4","%%mm0") /*mm0=39 38 37 36*/ \
+  "pshufw $0xE1,%%mm3,%%mm3\n\t" /*mm3=53 60 59 52 *J*/ \
+  "movq %%mm3,0x68(%[y])\n\t" \
+  "movq %%mm4,%%mm3\n\t"         /*mm3=?? ?? 54 51*/ \
+  "pshufw $0x39,%%mm2,%%mm2\n\t" /*mm2=28 31 30 29*/ \
+  "punpckhwd %%mm1,%%mm4\n\t"    /*mm4=63 55 62 61 *K*/ \
+  OC_ZZ_LOAD_ROW_HI("5","%%mm1") /*mm1=47 46 45 44*/ \
+  "movq %%mm4,0x78(%[y])\n\t" \
+  "punpckhwd %%mm2,%%mm6\n\t"    /*mm6=28 07 31 23*/ \
+  "punpcklwd %%mm0,%%mm2\n\t"    /*mm2=37 30 36 29*/ \
+  "punpckhdq %%mm6,%%mm5\n\t"    /*mm5=28 07 21 14*/ \
+  "pshufw $0x4B,%%mm2,%%mm2\n\t" /*mm2=36 29 30 37*/ \
+  "pshufw $0x87,%%mm5,%%mm5\n\t" /*mm5=07 14 21 28 *L*/ \
+  "movq %%mm5,0x40(%[y])\n\t" \
+  "punpckhdq %%mm2,%%mm7\n\t"    /*mm7=36 29 22 15 *M*/ \
+  "movq %%mm7,0x48(%[y])\n\t" \
+  "pshufw $0x9C,%%mm1,%%mm1\n\t" /*mm1=46 45 47 44*/ \
+  "punpckhwd %%mm1,%%mm0\n\t"    /*mm0=46 39 45 38*/ \
+  "punpcklwd %%mm1,%%mm3\n\t"    /*mm3=47 54 44 51*/ \
+  "punpckldq %%mm0,%%mm6\n\t"    /*mm6=45 38 31 23 *N*/ \
+  "movq %%mm6,0x60(%[y])\n\t" \
+  "punpckhdq %%mm3,%%mm0\n\t"    /*mm0=47 54 46 39*/ \
+  "punpckldq %%mm2,%%mm3\n\t"    /*mm3=30 37 44 51 *O*/ \
+  "movq %%mm3,0x58(%[y])\n\t" \
+  "pshufw $0xB1,%%mm0,%%mm0\n\t" /*mm0=54 47 39 46 *P*/ \
+  "movq %%mm0,0x70(%[y])\n\t" \
+
+/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan
+   order and stores them in %[qdct].
+  All 64 coefficients are read from %[dct] and written to %[qdct] as sixteen
+   8-byte loads and sixteen 8-byte stores (byte offsets 0x00 through 0x78).
+  Every MMX register (mm0 through mm7) is overwritten, and pshufw is used, so
+   the including code needs MMXEXT support (hence the name).
+  No emms is issued by this macro.
+  %[dct] and %[qdct] must not refer to the same buffer: tuple F is stored to
+   0x50(%[qdct]) before 0x50(%[dct]) has been loaded.
+  The index of each output element in the original 64-element array should wind
+   up in the following 8x8 matrix (the letters indicate the order we compute
+   each 4-tuple below):
+    A  0  1  8 16   9  2  3 10 B
+    C 17 24 32 25  18 11  4  5 D
+    E 12 19 26 33  40 48 41 34 I
+    H 27 20 13  6   7 14 21 28 G
+    K 35 42 49 56  57 50 43 36 J
+    F 29 22 15 23  30 37 44 51 M
+    P 58 59 52 45  38 31 39 46 L
+    N 53 60 61 54  47 55 62 63 O
+  The order of the coefficients within each tuple is reversed in the comments
+   below to reflect the usual MSB to LSB notation.
+  In those comments, ".." marks a lane that was zeroed by a shift and "??"
+   marks a lane whose contents are not used.*/
+#define OC_ZIG_ZAG_MMXEXT \
+  "movq 0x00(%[dct]),%%mm0\n\t"  /*mm0=03 02 01 00*/ \
+  "movq 0x08(%[dct]),%%mm1\n\t"  /*mm1=07 06 05 04*/ \
+  "movq 0x10(%[dct]),%%mm2\n\t"  /*mm2=11 10 09 08*/ \
+  "movq 0x20(%[dct]),%%mm3\n\t"  /*mm3=19 18 17 16*/ \
+  "movq 0x30(%[dct]),%%mm4\n\t"  /*mm4=27 26 25 24*/ \
+  "movq 0x40(%[dct]),%%mm5\n\t"  /*mm5=35 34 33 32*/ \
+  "movq %%mm2,%%mm7\n\t"         /*mm7=11 10 09 08*/ \
+  "punpcklwd %%mm3,%%mm2\n\t"    /*mm2=17 09 16 08*/ \
+  "movq %%mm0,%%mm6\n\t"         /*mm6=03 02 01 00*/ \
+  "punpckldq %%mm2,%%mm0\n\t"    /*mm0=16 08 01 00 *A*/ \
+  "movq %%mm0,0x00(%[qdct])\n\t" \
+  "movq 0x18(%[dct]),%%mm0\n\t"  /*mm0=15 14 13 12*/ \
+  "punpckhdq %%mm6,%%mm6\n\t"    /*mm6=03 02 03 02*/ \
+  "psrlq $16,%%mm7\n\t"          /*mm7=.. 11 10 09*/ \
+  "punpckldq %%mm7,%%mm6\n\t"    /*mm6=10 09 03 02*/ \
+  "punpckhwd %%mm7,%%mm3\n\t"    /*mm3=.. 19 11 18*/ \
+  "pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
+  "movq %%mm6,0x08(%[qdct])\n\t" \
+  "psrlq $48,%%mm2\n\t"          /*mm2=.. .. .. 17*/ \
+  "movq %%mm1,%%mm6\n\t"         /*mm6=07 06 05 04*/ \
+  "punpcklwd %%mm5,%%mm2\n\t"    /*mm2=33 .. 32 17*/ \
+  "movq %%mm3,%%mm7\n\t"         /*mm7=.. 19 11 18*/ \
+  "punpckldq %%mm1,%%mm3\n\t"    /*mm3=05 04 11 18 *C*/ \
+  "por %%mm2,%%mm7\n\t"          /*mm7=33 19 ?? ??*/ \
+  "punpcklwd %%mm4,%%mm2\n\t"    /*mm2=25 32 24 17 *D*/ \
+  "movq %%mm2,0x10(%[qdct])\n\t" \
+  "movq %%mm3,0x18(%[qdct])\n\t" \
+  "movq 0x28(%[dct]),%%mm2\n\t"  /*mm2=23 22 21 20*/ \
+  "movq 0x38(%[dct]),%%mm1\n\t"  /*mm1=31 30 29 28*/ \
+  "pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
+  "punpckhdq %%mm7,%%mm7\n\t"    /*mm7=33 19 33 19*/ \
+  "punpckhwd %%mm3,%%mm6\n\t"    /*mm6=14 07 13 06*/ \
+  "punpckldq %%mm0,%%mm0\n\t"    /*mm0=13 12 13 12*/ \
+  "punpcklwd %%mm1,%%mm3\n\t"    /*mm3=29 15 28 12*/ \
+  "punpckhwd %%mm4,%%mm0\n\t"    /*mm0=27 13 26 12*/ \
+  "pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
+  "psrlq $48,%%mm4\n\t"          /*mm4=.. .. .. 27*/ \
+  "punpcklwd %%mm7,%%mm0\n\t"    /*mm0=33 26 19 12 *E*/ \
+  "punpcklwd %%mm1,%%mm4\n\t"    /*mm4=29 .. 28 27*/ \
+  "punpckhwd %%mm2,%%mm3\n\t"    /*mm3=23 15 22 29 *F*/ \
+  "movq %%mm0,0x20(%[qdct])\n\t" \
+  "movq %%mm3,0x50(%[qdct])\n\t" \
+  "movq 0x60(%[dct]),%%mm3\n\t"  /*mm3=51 50 49 48*/ \
+  "movq 0x70(%[dct]),%%mm7\n\t"  /*mm7=59 58 57 56*/ \
+  "movq 0x50(%[dct]),%%mm0\n\t"  /*mm0=43 42 41 40*/ \
+  "punpcklwd %%mm4,%%mm2\n\t"    /*mm2=28 21 27 20*/ \
+  "psrlq $32,%%mm5\n\t"          /*mm5=.. .. 35 34*/ \
+  "movq %%mm2,%%mm4\n\t"         /*mm4=28 21 27 20*/ \
+  "punpckldq %%mm6,%%mm2\n\t"    /*mm2=13 06 27 20*/ \
+  "punpckhdq %%mm4,%%mm6\n\t"    /*mm6=28 21 14 07 *G*/ \
+  "movq %%mm3,%%mm4\n\t"         /*mm4=51 50 49 48*/ \
+  "pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
+  "movq %%mm2,0x30(%[qdct])\n\t" \
+  "movq %%mm6,0x38(%[qdct])\n\t" \
+  "movq 0x48(%[dct]),%%mm2\n\t"  /*mm2=39 38 37 36*/ \
+  "punpcklwd %%mm5,%%mm4\n\t"    /*mm4=35 49 34 48*/ \
+  "movq 0x58(%[dct]),%%mm5\n\t"  /*mm5=47 46 45 44*/ \
+  "punpckldq %%mm7,%%mm6\n\t"    /*mm6=57 56 14 07*/ \
+  "psrlq $32,%%mm3\n\t"          /*mm3=.. .. 51 50*/ \
+  "punpckhwd %%mm0,%%mm6\n\t"    /*mm6=43 57 42 56*/ \
+  "punpcklwd %%mm4,%%mm0\n\t"    /*mm0=34 41 48 40 *I*/ \
+  "pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
+  "movq %%mm0,0x28(%[qdct])\n\t" \
+  "punpcklwd %%mm2,%%mm3\n\t"    /*mm3=37 51 36 50*/ \
+  "punpckhwd %%mm6,%%mm4\n\t"    /*mm4=42 35 56 49*/ \
+  "punpcklwd %%mm3,%%mm6\n\t"    /*mm6=36 43 50 57 *J*/ \
+  "pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
+  "movq %%mm4,0x40(%[qdct])\n\t" \
+  "movq %%mm6,0x48(%[qdct])\n\t" \
+  "movq 0x68(%[dct]),%%mm6\n\t"  /*mm6=55 54 53 52*/ \
+  "movq 0x78(%[dct]),%%mm0\n\t"  /*mm0=63 62 61 60*/ \
+  "psrlq $32,%%mm1\n\t"          /*mm1=.. .. 31 30*/ \
+  "pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
+  "pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
+  "punpcklwd %%mm5,%%mm1\n\t"    /*mm1=46 31 44 30*/ \
+  "pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
+  "punpckhwd %%mm1,%%mm2\n\t"    /*mm2=46 39 31 38 *L*/ \
+  "punpcklwd %%mm3,%%mm1\n\t"    /*mm1=51 44 37 30 *M*/ \
+  "movq %%mm2,0x68(%[qdct])\n\t" \
+  "movq %%mm1,0x58(%[qdct])\n\t" \
+  "punpckhwd %%mm6,%%mm5\n\t"    /*mm5=55 47 52 45*/ \
+  "punpckldq %%mm0,%%mm6\n\t"    /*mm6=61 60 54 53*/ \
+  "pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
+  "pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=53 60 61 54 *N*/ \
+  "punpckhdq %%mm0,%%mm5\n\t"    /*mm5=63 62 55 47 *O*/ \
+  "punpckhdq %%mm4,%%mm7\n\t"    /*mm7=45 52 59 58 *P*/ \
+  "movq %%mm6,0x70(%[qdct])\n\t" \
+  "movq %%mm5,0x78(%[qdct])\n\t" \
+  "movq %%mm7,0x60(%[qdct])\n\t" \
+
+#endif

Modified: trunk/theora/lib/x86_vc/mmxfdct.c
===================================================================
--- trunk/theora/lib/x86_vc/mmxfdct.c	2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/x86_vc/mmxfdct.c	2010-12-07 10:28:07 UTC (rev 17728)
@@ -12,6 +12,7 @@
  /*MMX fDCT implementation for x86_32*/
 /*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
 #include "x86enc.h"
+#include "x86zigzag.h"
 
 #if defined(OC_X86_ASM)
 
@@ -463,11 +464,13 @@
 
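 /*MMX implementation of the fDCT.
   NOTE(review): the final transpose/zig-zag step is
    OC_TRANSPOSE_ZIG_ZAG_MMXEXT, so this presumably needs MMXEXT support
    despite the _mmx name; confirm against x86_vc/x86zigzag.h.*/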
 void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  __declspec (align(8)) ogg_int16_t buf[64];
   ptrdiff_t a;
   __asm{
+#define X edx
 #define Y eax
 #define A ecx
-#define X edx
+#define BUF esi
     /*Add two extra bits of working precision to improve accuracy; any more and
        we could overflow.*/
     /*We also add biases to correct for some systematic error that remains in
@@ -591,79 +594,90 @@
     movq mm3,[0x30+Y]
     OC_FDCT_STAGE1_8x4
     OC_FDCT8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
-    OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
     /*mm2={-2}x4*/
-    pcmpeqw mm0,mm0
-    paddw mm0,mm0
-    /*Round the results.*/
-    psubw mm1,mm0
-    psubw mm2,mm0
-    psraw mm1,2
-    psubw mm3,mm0
-    movq [0x18+Y],mm1
-    psraw mm2,2
-    psubw mm4,mm0
-    movq mm1,[0x08+Y]
-    psraw mm3,2
-    psubw mm5,mm0
+    pcmpeqw mm2,mm2
+    paddw mm2,mm2
+    /*Round and store the results (no transpose).*/
+    movq mm7,[Y+0x10]
+    psubw mm4,mm2
+    psubw mm6,mm2
     psraw mm4,2
-    psubw mm6,mm0
+    psubw mm0,mm2
+    movq [BUF+0x00],mm4
+    movq mm4,[Y+0x30]
+    psraw mm6,2
+    psubw mm5,mm2
+    movq [BUF+0x20],mm6
+    psraw mm0,2
+    psubw mm3,mm2
+    movq [BUF+0x40],mm0
     psraw mm5,2
-    psubw mm7,mm0
-    psraw mm6,2
-    psubw mm1,mm0
+    psubw mm1,mm2
+    movq [BUF+0x50],mm5
+    psraw mm3,2
+    psubw mm7,mm2
+    movq [BUF+0x60],mm3
+    psraw mm1,2
+    psubw mm4,mm2
+    movq [BUF+0x70],mm1
     psraw mm7,2
+    movq [BUF+0x10],mm7
+    psraw mm4,2
+    movq [BUF+0x30],mm4
+    /*Load the next block.*/
     movq mm0,[0x40+Y]
-    psraw mm1,2
-    movq [0x30+Y],mm7
     movq mm7,[0x78+Y]
-    movq [0x08+Y],mm1
     movq mm1,[0x50+Y]
-    movq [0x20+Y],mm6
     movq mm6,[0x68+Y]
-    movq [0x28+Y],mm2
     movq mm2,[0x60+Y]
-    movq [0x10+Y],mm5
     movq mm5,[0x58+Y]
-    movq [0x38+Y],mm3
     movq mm3,[0x70+Y]
-    movq [0x00+Y],mm4
     movq mm4,[0x48+Y]
     OC_FDCT_STAGE1_8x4
     OC_FDCT8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
-    OC_TRANSPOSE8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
     /*mm2={-2}x4*/
-    pcmpeqw mm0,mm0
-    paddw mm0,mm0
-    /*Round the results.*/
-    psubw mm1,mm0
-    psubw mm2,mm0
-    psraw mm1,2
-    psubw mm3,mm0
-    movq [0x58+Y],mm1
-    psraw mm2,2
-    psubw mm4,mm0
-    movq mm1,[0x48+Y]
-    psraw mm3,2
-    psubw mm5,mm0
-    movq [0x68+Y],mm2
+    pcmpeqw mm2,mm2
+    paddw mm2,mm2
+    /*Round and store the results (no transpose).*/
+    movq mm7,[Y+0x50]
+    psubw mm4,mm2
+    psubw mm6,mm2
     psraw mm4,2
-    psubw mm6,mm0
-    movq [0x78+Y],mm3
+    psubw mm0,mm2
+    movq [BUF+0x08],mm4
+    movq mm4,[Y+0x70]
+    psraw mm6,2
+    psubw mm5,mm2
+    movq [BUF+0x28],mm6
+    psraw mm0,2
+    psubw mm3,mm2
+    movq [BUF+0x48],mm0
     psraw mm5,2
-    psubw mm7,mm0
-    movq [0x40+Y],mm4
-    psraw mm6,2
-    psubw mm1,mm0
-    movq [0x50+Y],mm5
+    psubw mm1,mm2
+    movq [BUF+0x58],mm5
+    psraw mm3,2
+    psubw mm7,mm2
+    movq [BUF+0x68],mm3
+    psraw mm1,2
+    psubw mm4,mm2
+    movq [BUF+0x78],mm1
     psraw mm7,2
-    movq [0x60+Y],mm6
-    psraw mm1,2
-    movq [0x70+Y],mm7
-    movq [0x48+Y],mm1
+    movq [BUF+0x18],mm7
+    psraw mm4,2
+    movq [BUF+0x38],mm4
+#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
+    __asm movq _reg,[BUF+16*(_row)] \
+
+#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
+    __asm movq _reg,[BUF+16*(_row)+8] \
+
+    OC_TRANSPOSE_ZIG_ZAG_MMXEXT
+#undef OC_ZZ_LOAD_ROW_LO
+#undef OC_ZZ_LOAD_ROW_HI
+#undef X
 #undef Y
 #undef A
-#undef X
+#undef BUF
   }
 }
 

Modified: trunk/theora/lib/x86_vc/mmxidct.c
===================================================================
--- trunk/theora/lib/x86_vc/mmxidct.c	2010-12-07 10:27:43 UTC (rev 17727)
+++ trunk/theora/lib/x86_vc/mmxidct.c	2010-12-07 10:28:07 UTC (rev 17728)
@@ -339,22 +339,19 @@
 #undef  Y
 #undef  X
   }
-  if(_x!=_y){
-    int i;
-    __asm pxor mm0,mm0;
-    for(i=0;i<4;i++){
-      ogg_int16_t *x;
-      x=_x+16*i;
+  __asm pxor mm0,mm0;
+  for(i=0;i<4;i++){
+    ogg_int16_t *x;
+    x=_x+16*i;
 #define X ecx
-      __asm{
-        mov X,x
-        movq [X+0x00],mm0
-        movq [X+0x08],mm0
-        movq [X+0x10],mm0
-        movq [X+0x18],mm0
-      }
-#undef  X
+    __asm{
+      mov X,x
+      movq [X+0x00],mm0
+      movq [X+0x08],mm0
+      movq [X+0x10],mm0
+      movq [X+0x18],mm0
     }
+#undef  X
   }
 }
 
@@ -547,18 +544,16 @@
 #undef  Y
 #undef  X
   }
-  if(_x!=_y){
 #define X ecx
-    __asm{
-      pxor mm0,mm0;
-      mov X,_x
-      movq [X+0x00],mm0
-      movq [X+0x10],mm0
-      movq [X+0x20],mm0
-      movq [X+0x30],mm0
-    }
-#undef  X
+  __asm{
+    pxor mm0,mm0;
+    mov X,_x
+    movq [X+0x00],mm0
+    movq [X+0x10],mm0
+    movq [X+0x20],mm0
+    movq [X+0x30],mm0
   }
+#undef  X
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.

Added: trunk/theora/lib/x86_vc/x86zigzag.h
===================================================================
--- trunk/theora/lib/x86_vc/x86zigzag.h	                        (rev 0)
+++ trunk/theora/lib/x86_vc/x86zigzag.h	2010-12-07 10:28:07 UTC (rev 17728)
@@ -0,0 +1,244 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id$
+
+ ********************************************************************/
+
+#if !defined(_x86_vc_x86zigzag_H)
+# define _x86_vc_x86zigzag_H (1)
+# include "x86enc.h"
+
+
+/*Converts DCT coefficients from transposed order into zig-zag scan order and
+   stores them to [Y+0x00]...[Y+0x78]; mm0 through mm7 are all overwritten.
+  This relies on two macros to load the contents of each row:
+   OC_ZZ_LOAD_ROW_LO(row,reg) and OC_ZZ_LOAD_ROW_HI(row,reg), which load the
+   first four and second four entries of each row into the specified register,
+   respectively.
+  OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row
+   (because when the rows are already in SSE2 registers, loading the high half
+   destructively modifies the register).
+  The index of each output element in the original 64-element array should wind
+   up in the following 8x8 matrix (the letters indicate the order we compute
+   each 4-tuple below):
+    A  0  8  1  2   9 16 24 17 B
+    C 10  3  4 11  18 25 32 40 E
+    F 33 26 19 12   5  6 13 20 D
+    G 27 34 41 48  56 49 42 35 I
+    L 28 21 14  7  15 22 29 36 M
+    H 43 50 57 58  51 44 37 30 O
+    N 23 31 38 45  52 59 60 53 J
+    P 46 39 47 54  61 62 55 63 K
+  Comments below list each tuple MSB to LSB (reversed from the matrix); ".."
+   marks a word known to be zero and "??" a word whose value is never used.*/
+#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \
+  OC_ZZ_LOAD_ROW_LO(0,mm0)  /*mm0=03 02 01 00*/ \
+  OC_ZZ_LOAD_ROW_LO(1,mm1)  /*mm1=11 10 09 08*/ \
+  OC_ZZ_LOAD_ROW_LO(2,mm2)  /*mm2=19 18 17 16*/ \
+  OC_ZZ_LOAD_ROW_LO(3,mm3)  /*mm3=27 26 25 24*/ \
+  OC_ZZ_LOAD_ROW_HI(0,mm4)  /*mm4=07 06 05 04*/ \
+  OC_ZZ_LOAD_ROW_HI(1,mm5)  /*mm5=15 14 13 12*/ \
+  OC_ZZ_LOAD_ROW_HI(2,mm6)  /*mm6=23 22 21 20*/ \
+  __asm movq mm7,mm0        /*mm7=03 02 01 00*/ \
+  __asm punpckhdq mm0,mm1   /*mm0=11 10 03 02*/ \
+  __asm pshufw mm4,mm4,0x39 /*mm4=04 07 06 05*/ \
+  __asm punpcklwd mm1,mm0   /*mm1=03 09 02 08*/ \
+  __asm pshufw mm5,mm5,0x39 /*mm5=12 15 14 13*/ \
+  __asm punpcklwd mm7,mm1   /*mm7=02 01 08 00 *A*/ \
+  __asm movq [Y+0x00],mm7 \
+  __asm punpckhwd mm1,mm4   /*mm1=04 03 07 09*/ \
+  __asm movq mm7,mm2        /*mm7=19 18 17 16*/ \
+  __asm punpckhdq mm0,mm1   /*mm0=04 03 11 10*/ \
+  __asm punpckhwd mm7,mm5   /*mm7=12 19 15 18*/ \
+  __asm punpcklwd mm1,mm3   /*mm1=25 07 24 09*/ \
+  __asm punpcklwd mm5,mm6   /*mm5=21 14 20 13*/ \
+  __asm punpcklwd mm1,mm2   /*mm1=17 24 16 09 *B*/ \
+  OC_ZZ_LOAD_ROW_LO(4,mm2)  /*mm2=35 34 33 32*/ \
+  __asm movq [Y+0x08],mm1 \
+  OC_ZZ_LOAD_ROW_LO(5,mm1)  /*mm1=43 42 41 40*/ \
+  __asm pshufw mm0,mm0,0x78 /*mm0=11 04 03 10 *C*/ \
+  __asm movq [Y+0x10],mm0 \
+  __asm punpckhdq mm6,mm4   /*mm6=?? 07 23 22*/ \
+  __asm punpckldq mm4,mm5   /*mm4=20 13 06 05 *D*/ \
+  __asm movq [Y+0x28],mm4 \
+  __asm psrlq mm3,16        /*mm3=.. 27 26 25*/ \
+  __asm pshufw mm0,mm2,0x0E /*mm0=?? ?? 35 34*/ \
+  __asm movq mm4,mm7        /*mm4=12 19 15 18*/ \
+  __asm punpcklwd mm2,mm3   /*mm2=26 33 25 32*/ \
+  __asm punpcklwd mm4,mm1   /*mm4=41 15 40 18*/ \
+  __asm punpckhwd mm3,mm1   /*mm3=43 .. 42 27*/ \
+  __asm punpckldq mm4,mm2   /*mm4=25 32 40 18*/ \
+  __asm punpcklwd mm3,mm0   /*mm3=35 42 34 27*/ \
+  OC_ZZ_LOAD_ROW_LO(6,mm0)  /*mm0=51 50 49 48*/ \
+  __asm pshufw mm4,mm4,0x6C /*mm4=40 32 25 18 *E*/ \
+  __asm movq [Y+0x18],mm4 \
+  OC_ZZ_LOAD_ROW_LO(7,mm4)  /*mm4=59 58 57 56*/ \
+  __asm punpckhdq mm2,mm7   /*mm2=12 19 26 33 *F*/ \
+  __asm movq [Y+0x20],mm2 \
+  __asm pshufw mm1,mm1,0xD0 /*mm1=43 41 ?? ??*/ \
+  __asm pshufw mm0,mm0,0x87 /*mm0=50 48 49 51*/ \
+  __asm movq mm2,mm3        /*mm2=35 42 34 27*/ \
+  __asm punpckhwd mm1,mm0   /*mm1=50 43 48 41*/ \
+  __asm pshufw mm4,mm4,0x93 /*mm4=58 57 56 59*/ \
+  __asm punpckldq mm3,mm1   /*mm3=48 41 34 27 *G*/ \
+  __asm movq [Y+0x30],mm3 \
+  __asm punpckhdq mm1,mm4   /*mm1=58 57 50 43 *H*/ \
+  __asm movq [Y+0x50],mm1 \
+  OC_ZZ_LOAD_ROW_HI(7,mm1)  /*mm1=63 62 61 60*/ \
+  __asm punpcklwd mm4,mm0   /*mm4=49 56 51 59*/ \
+  OC_ZZ_LOAD_ROW_HI(6,mm0)  /*mm0=55 54 53 52*/ \
+  __asm psllq mm6,16        /*mm6=07 23 22 ..*/ \
+  __asm movq mm3,mm4        /*mm3=49 56 51 59*/ \
+  __asm punpckhdq mm4,mm2   /*mm4=35 42 49 56 *I*/ \
+  OC_ZZ_LOAD_ROW_HI(3,mm2)  /*mm2=31 30 29 28*/ \
+  __asm movq [Y+0x38],mm4 \
+  __asm punpcklwd mm3,mm1   /*mm3=61 51 60 59*/ \
+  __asm punpcklwd mm7,mm6   /*mm7=22 15 .. ??*/ \
+  __asm movq mm4,mm3        /*mm4=61 51 60 59*/ \
+  __asm punpcklwd mm3,mm0   /*mm3=53 60 52 59*/ \
+  __asm punpckhwd mm4,mm0   /*mm4=55 61 54 51*/ \
+  OC_ZZ_LOAD_ROW_HI(4,mm0)  /*mm0=39 38 37 36*/ \
+  __asm pshufw mm3,mm3,0xE1 /*mm3=53 60 59 52 *J*/ \
+  __asm movq [Y+0x68],mm3 \
+  __asm movq mm3,mm4        /*mm3=?? ?? 54 51*/ \
+  __asm pshufw mm2,mm2,0x39 /*mm2=28 31 30 29*/ \
+  __asm punpckhwd mm4,mm1   /*mm4=63 55 62 61 *K*/ \
+  OC_ZZ_LOAD_ROW_HI(5,mm1)  /*mm1=47 46 45 44*/ \
+  __asm movq [Y+0x78],mm4 \
+  __asm punpckhwd mm6,mm2   /*mm6=28 07 31 23*/ \
+  __asm punpcklwd mm2,mm0   /*mm2=37 30 36 29*/ \
+  __asm punpckhdq mm5,mm6   /*mm5=28 07 21 14*/ \
+  __asm pshufw mm2,mm2,0x4B /*mm2=36 29 30 37*/ \
+  __asm pshufw mm5,mm5,0x87 /*mm5=07 14 21 28 *L*/ \
+  __asm movq [Y+0x40],mm5 \
+  __asm punpckhdq mm7,mm2   /*mm7=36 29 22 15 *M*/ \
+  __asm movq [Y+0x48],mm7 \
+  __asm pshufw mm1,mm1,0x9C /*mm1=46 45 47 44*/ \
+  __asm punpckhwd mm0,mm1   /*mm0=46 39 45 38*/ \
+  __asm punpcklwd mm3,mm1   /*mm3=47 54 44 51*/ \
+  __asm punpckldq mm6,mm0   /*mm6=45 38 31 23 *N*/ \
+  __asm movq [Y+0x60],mm6 \
+  __asm punpckhdq mm0,mm3   /*mm0=47 54 46 39*/ \
+  __asm punpckldq mm3,mm2   /*mm3=30 37 44 51 *O*/ \
+  __asm movq [Y+0x58],mm3 \
+  __asm pshufw mm0,mm0,0xB1 /*mm0=54 47 39 46 *P*/ \
+  __asm movq [Y+0x70],mm0 \
+
+/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan
+   order and stores them in %[qdct] (GCC asm template; overwrites mm0-mm7).
+  The index of each output element in the original 64-element array should wind
+   up in the following 8x8 matrix (the letters indicate the order we compute
+   each 4-tuple below):
+    A  0  1  8 16   9  2  3 10 B
+    D 17 24 32 25  18 11  4  5 C
+    E 12 19 26 33  40 48 41 34 I
+    H 27 20 13  6   7 14 21 28 G
+    K 35 42 49 56  57 50 43 36 J
+    F 29 22 15 23  30 37 44 51 M
+    P 58 59 52 45  38 31 39 46 L
+    N 53 60 61 54  47 55 62 63 O
+  Comments below list each tuple MSB to LSB (reversed from the matrix); ".."
+   marks a word known to be zero and "??" a word whose value is never used.*/
+#define OC_ZIG_ZAG_MMXEXT \
+  "movq 0x00(%[dct]),%%mm0\n\t"  /*mm0=03 02 01 00*/ \
+  "movq 0x08(%[dct]),%%mm1\n\t"  /*mm1=07 06 05 04*/ \
+  "movq 0x10(%[dct]),%%mm2\n\t"  /*mm2=11 10 09 08*/ \
+  "movq 0x20(%[dct]),%%mm3\n\t"  /*mm3=19 18 17 16*/ \
+  "movq 0x30(%[dct]),%%mm4\n\t"  /*mm4=27 26 25 24*/ \
+  "movq 0x40(%[dct]),%%mm5\n\t"  /*mm5=35 34 33 32*/ \
+  "movq %%mm2,%%mm7\n\t"         /*mm7=11 10 09 08*/ \
+  "punpcklwd %%mm3,%%mm2\n\t"    /*mm2=17 09 16 08*/ \
+  "movq %%mm0,%%mm6\n\t"         /*mm6=03 02 01 00*/ \
+  "punpckldq %%mm2,%%mm0\n\t"    /*mm0=16 08 01 00 *A*/ \
+  "movq %%mm0,0x00(%[qdct])\n\t" \
+  "movq 0x18(%[dct]),%%mm0\n\t"  /*mm0=15 14 13 12*/ \
+  "punpckhdq %%mm6,%%mm6\n\t"    /*mm6=03 02 03 02*/ \
+  "psrlq $16,%%mm7\n\t"          /*mm7=.. 11 10 09*/ \
+  "punpckldq %%mm7,%%mm6\n\t"    /*mm6=10 09 03 02*/ \
+  "punpckhwd %%mm7,%%mm3\n\t"    /*mm3=.. 19 11 18*/ \
+  "pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
+  "movq %%mm6,0x08(%[qdct])\n\t" \
+  "psrlq $48,%%mm2\n\t"          /*mm2=.. .. .. 17*/ \
+  "movq %%mm1,%%mm6\n\t"         /*mm6=07 06 05 04*/ \
+  "punpcklwd %%mm5,%%mm2\n\t"    /*mm2=33 .. 32 17*/ \
+  "movq %%mm3,%%mm7\n\t"         /*mm7=.. 19 11 18*/ \
+  "punpckldq %%mm1,%%mm3\n\t"    /*mm3=05 04 11 18 *C*/ \
+  "por %%mm2,%%mm7\n\t"          /*mm7=33 19 ?? ??*/ \
+  "punpcklwd %%mm4,%%mm2\n\t"    /*mm2=25 32 24 17 *D*/ \
+  "movq %%mm2,0x10(%[qdct])\n\t" \
+  "movq %%mm3,0x18(%[qdct])\n\t" \
+  "movq 0x28(%[dct]),%%mm2\n\t"  /*mm2=23 22 21 20*/ \
+  "movq 0x38(%[dct]),%%mm1\n\t"  /*mm1=31 30 29 28*/ \
+  "pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
+  "punpckhdq %%mm7,%%mm7\n\t"    /*mm7=33 19 33 19*/ \
+  "punpckhwd %%mm3,%%mm6\n\t"    /*mm6=14 07 13 06*/ \
+  "punpckldq %%mm0,%%mm0\n\t"    /*mm0=13 12 13 12*/ \
+  "punpcklwd %%mm1,%%mm3\n\t"    /*mm3=29 15 28 12*/ \
+  "punpckhwd %%mm4,%%mm0\n\t"    /*mm0=27 13 26 12*/ \
+  "pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
+  "psrlq $48,%%mm4\n\t"          /*mm4=.. .. .. 27*/ \
+  "punpcklwd %%mm7,%%mm0\n\t"    /*mm0=33 26 19 12 *E*/ \
+  "punpcklwd %%mm1,%%mm4\n\t"    /*mm4=29 .. 28 27*/ \
+  "punpckhwd %%mm2,%%mm3\n\t"    /*mm3=23 15 22 29 *F*/ \
+  "movq %%mm0,0x20(%[qdct])\n\t" \
+  "movq %%mm3,0x50(%[qdct])\n\t" \
+  "movq 0x60(%[dct]),%%mm3\n\t"  /*mm3=51 50 49 48*/ \
+  "movq 0x70(%[dct]),%%mm7\n\t"  /*mm7=59 58 57 56*/ \
+  "movq 0x50(%[dct]),%%mm0\n\t"  /*mm0=43 42 41 40*/ \
+  "punpcklwd %%mm4,%%mm2\n\t"    /*mm2=28 21 27 20*/ \
+  "psrlq $32,%%mm5\n\t"          /*mm5=.. .. 35 34*/ \
+  "movq %%mm2,%%mm4\n\t"         /*mm4=28 21 27 20*/ \
+  "punpckldq %%mm6,%%mm2\n\t"    /*mm2=13 06 27 20*/ \
+  "punpckhdq %%mm4,%%mm6\n\t"    /*mm6=28 21 14 07 *G*/ \
+  "movq %%mm3,%%mm4\n\t"         /*mm4=51 50 49 48*/ \
+  "pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
+  "movq %%mm2,0x30(%[qdct])\n\t" \
+  "movq %%mm6,0x38(%[qdct])\n\t" \
+  "movq 0x48(%[dct]),%%mm2\n\t"  /*mm2=39 38 37 36*/ \
+  "punpcklwd %%mm5,%%mm4\n\t"    /*mm4=35 49 34 48*/ \
+  "movq 0x58(%[dct]),%%mm5\n\t"  /*mm5=47 46 45 44*/ \
+  "punpckldq %%mm7,%%mm6\n\t"    /*mm6=57 56 14 07*/ \
+  "psrlq $32,%%mm3\n\t"          /*mm3=.. .. 51 50*/ \
+  "punpckhwd %%mm0,%%mm6\n\t"    /*mm6=43 57 42 56*/ \
+  "punpcklwd %%mm4,%%mm0\n\t"    /*mm0=34 41 48 40 *I*/ \
+  "pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
+  "movq %%mm0,0x28(%[qdct])\n\t" \
+  "punpcklwd %%mm2,%%mm3\n\t"    /*mm3=37 51 36 50*/ \
+  "punpckhwd %%mm6,%%mm4\n\t"    /*mm4=42 35 56 49*/ \
+  "punpcklwd %%mm3,%%mm6\n\t"    /*mm6=36 43 50 57 *J*/ \
+  "pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
+  "movq %%mm4,0x40(%[qdct])\n\t" \
+  "movq %%mm6,0x48(%[qdct])\n\t" \
+  "movq 0x68(%[dct]),%%mm6\n\t"  /*mm6=55 54 53 52*/ \
+  "movq 0x78(%[dct]),%%mm0\n\t"  /*mm0=63 62 61 60*/ \
+  "psrlq $32,%%mm1\n\t"          /*mm1=.. .. 31 30*/ \
+  "pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
+  "pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
+  "punpcklwd %%mm5,%%mm1\n\t"    /*mm1=46 31 44 30*/ \
+  "pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
+  "punpckhwd %%mm1,%%mm2\n\t"    /*mm2=46 39 31 38 *L*/ \
+  "punpcklwd %%mm3,%%mm1\n\t"    /*mm1=51 44 37 30 *M*/ \
+  "movq %%mm2,0x68(%[qdct])\n\t" \
+  "movq %%mm1,0x58(%[qdct])\n\t" \
+  "punpckhwd %%mm6,%%mm5\n\t"    /*mm5=55 47 52 45*/ \
+  "punpckldq %%mm0,%%mm6\n\t"    /*mm6=61 60 54 53*/ \
+  "pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
+  "pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=54 61 60 53 *N*/ \
+  "punpckhdq %%mm0,%%mm5\n\t"    /*mm5=63 62 55 47 *O*/ \
+  "punpckhdq %%mm4,%%mm7\n\t"    /*mm7=45 52 59 58 *P*/ \
+  "movq %%mm6,0x70(%[qdct])\n\t" \
+  "movq %%mm5,0x78(%[qdct])\n\t" \
+  "movq %%mm7,0x60(%[qdct])\n\t" \
+
+#endif