[Theora-dev] MMX IDCT for theora-exp

Wed Jul 20 08:22:55 PDT 2005

Hello,

I'm attaching an MMX IDCT patch. I reused the IDCT from theora-a3-MMXd.zip.
It should work on 64-bit x86 platforms too.

Here are the most used functions when playing a video of jet aircraft (Gripen):
Ogg logical stream 310b2968 is Theora 720x480 29.97 fps video
Encoded frame content is 720x480 with 0x0 offset
I can play this video with about 200-300 dropped frames on an Athlon XP 1700+.
CPU load (with music playing) ranges from 50% to full, mostly around 60%.

CPU: Athlon, speed 1466.91 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask of 0x00 (No unit mask) count 400000
Counted DATA_CACHE_MISSES events (Data cache misses) with a unit mask of 0x00 (No unit mask) count 2000
samples  %        samples  %        image name               symbol name
124337   22.0173  91089    23.4683  dump                     theora_decode_packetin
83446    14.7764  114246   29.4345  libc-2.3.2.so            (no symbols)
74011    13.1057  33746     8.6944  dump                     oc_state_loop_filter_frag_rows
57706    10.2185  9204      2.3713  libogg.so.0.5.2          (no symbols)
39182     6.9383  10146     2.6140  dump                     oc_state_frag_recon_mmx
31095     5.5062  38650     9.9578  dump                     oc_frag_recon_inter2_mmx
24133     4.2734  12945     3.3352  dump                     oc_frag_pred_dc
22053     3.9051  11120     2.8650  dump                     oc_huff_token_decode
12497     2.2129  163       0.0420  dump                     oc_idct8x8_mmx
10376     1.8374  22113     5.6972  dump                     oc_frag_recon_inter_mmx
9553      1.6916  266       0.0685  dump                     oc_idct8x8_10_mmx
9489      1.6803  1123      0.2893  dump                     theora_look
9401      1.6647  3291      0.8479  dump                     oc_token_expand_run_cat1a
7733      1.3693  2563      0.6603  dump                     oc_token_expand_const
6488      1.1489  12456     3.2092  dump                     oc_state_frag_copy_mmx
5488      0.9718  3577      0.9216  dump                     oc_state_get_mv_offsets
5334      0.9445  2286      0.5890  dump                     oc_token_expand_run
3351      0.5934  1794      0.4622  dump                     anonymous symbol from section .plt
3158      0.5592  393       0.1013  dump                     oc_token_skip_val
2836      0.5022  786       0.2025  dump                     oc_token_skip_run_cat1a
2572      0.4554  982       0.2530  dump                     oc_token_expand_cat2
2087      0.3696  16        0.0041  dump                     oc_dct_token_skip
1884      0.3336  658       0.1695  dump                     oc_token_skip_eob
1788      0.3166  5600      1.4428  dump                     oc_frag_recon_intra_mmx
1542      0.2731  12        0.0031  dump                     oc_frag_recon_inter2
1514      0.2681  1578      0.4066  dump                     oc_vlc_mode_unpack
1441      0.2552  1055      0.2718  dump                     oc_state_frag_recon
1411      0.2499  382       0.0984  dump                     oc_token_skip_run
1275      0.2258  1598      0.4117  dump                     oc_dec_mb_modes_unpack
1008      0.1785  460       0.1185  dump                     oc_token_expand_cati
999       0.1769  1538      0.3963  dump                     oc_restore_fpu
884       0.1565  581       0.1497  dump                     oc_token_dec1val_const
614       0.1087  145       0.0374  dump                     oc_token_expand_zrl
543       0.0962  1        2.6e-04  dump                     oc_frag_recon_inter
509       0.0901  15        0.0039  dump                     oc_frag_recon_intra
470       0.0832  505       0.1301  dump                     oc_token_dec1val_cat2
445       0.0788  96        0.0247  dump                     oc_token_dec1val_zrl
344       0.0609  81        0.0209  dump                     oc_state_borders_fill_rows
324       0.0574  237       0.0611  dump                     oc_restore_fpu_mmx
260       0.0460  36        0.0093  dump                     main
218       0.0386  83        0.0214  dump                     stripe_decoded
213       0.0377  67        0.0173  dump                     oc_token_skip_zrl
212       0.0375  237       0.0611  dump                     oc_token_dec1val_cati
209       0.0370  57        0.0147  dump                     oc_sb_run_unpack
170       0.0301  11        0.0028  dump                     oc_vlc_mv_comp_unpack
30        0.0053  11        0.0028  dump                     oc_state_borders_fill_caps
26        0.0046  4         0.0010  dump                     oc_clc_mv_comp_unpack
12        0.0021  6         0.0015  dump                     oc_ycbcr_buffer_flip
11        0.0019  13        0.0033  dump                     theora_granule_time
10        0.0018  1        2.6e-04  dump                     oc_state_loop_filter_init
6         0.0011  100       0.0258  dump                     oc_state_frag_copy
6         0.0011  5         0.0013  dump                     oc_token_skip_eob6
5        8.9e-04  2        5.2e-04  dump                     oc_clc_mode_unpack
5        8.9e-04  5         0.0013  dump                     oc_state_init
3        5.3e-04  0              0  dump                     oc_set_chroma_mvs00
2        3.5e-04  0              0  dump                     oc_dequant_tables_init
2        3.5e-04  2        5.2e-04  ld-2.3.2.so              (no symbols)
1        1.8e-04  0              0  dump                     oc_huff_tree_mindepth
1        1.8e-04  0              0  dump                     oc_huff_tree_occupancy

As you can see, the loop filter needs some work. After the loop filter optimizations I will take another look at
oc_state_frag_recon_mmx and oc_frag_recon_inter2_mmx, and maybe I can add quantization in MMX too.

Derf, do you have any objections against the patch?
Thanks

regards

Rudolf
-------------- next part --------------
diff -Naur a/lib/idct.c b/lib/idct.c

--- a/lib/idct.c	2005-07-20 11:39:30.355887750 +0200
+++ b/lib/idct.c	2005-07-20 11:25:26.451147000 +0200
@@ -208,7 +208,7 @@
   _y: The buffer to store the result in.
       This may be the same as _x.
   _x: The input coefficients. */
-void oc_idct8x8(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+void oc_idct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
   const ogg_int16_t *in;
   ogg_int16_t       *end;
   ogg_int16_t       *out;
@@ -236,7 +236,7 @@
   _y: The buffer to store the result in.
       This may be the same as _x.
   _x: The input coefficients. */
-void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+void oc_idct8x8_10_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
   const ogg_int16_t *in;
   ogg_int16_t       *end;
   ogg_int16_t       *out;
diff -Naur a/lib/idct.h b/lib/idct.h
--- a/lib/idct.h	2005-07-20 11:39:30.383889500 +0200
+++ b/lib/idct.h	2005-07-20 11:25:36.019745000 +0200
@@ -3,7 +3,7 @@
 #if !defined(_idct_H)
 # define _idct_H (1)
 
-void oc_idct8x8(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
-void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_idct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_idct8x8_10_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
 
 #endif
diff -Naur a/lib/internal.h b/lib/internal.h
--- a/lib/internal.h	2005-07-20 11:39:30.355887750 +0200
+++ b/lib/internal.h	2005-07-20 11:46:53.083556500 +0200
@@ -239,6 +239,9 @@
   void (*state_frag_copy)(const oc_theora_state *_state,
    const int *_fragis,int _nfragis,int _dst_frame,int _src_frame,int _pli);
   void (*restore_fpu)(void);
+  void (*oc_state_frag_recon)(oc_theora_state *_state,const oc_fragment *_frag,                                               
+   int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,                                                             
+   ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
 }oc_base_opt_vtable;
 
 
@@ -385,9 +388,6 @@
 int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby);
 int oc_state_get_mv_offsets(oc_theora_state *_state,int *_offset0,
  int *_offset1,int _dx,int _dy,int _ystride,int _pli);
-void oc_state_frag_recon(oc_theora_state *_state,const oc_fragment *_frag,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
 
 int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv);
 void oc_state_loop_filter(oc_theora_state *_state,int _frame);
@@ -421,5 +421,8 @@
 void oc_state_frag_copy_c(const oc_theora_state *_state,const int *_fragis,
  int _nfragis,int _dst_frame,int _src_frame,int _pli);
 void oc_restore_fpu_c(void);
+void oc_state_frag_recon_c(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
 
 #endif
diff -Naur a/lib/Makefile.am b/lib/Makefile.am
--- a/lib/Makefile.am	2005-07-20 11:39:30.383889500 +0200
+++ b/lib/Makefile.am	2005-07-20 11:57:27.867228000 +0200
@@ -5,6 +5,7 @@
 
 EXTRA_DIST = \
 	x86/cpu.c \
+	x86/mmxidct.c \
 	x86/mmxfrag.c \
 	x86/mmxstate.c \
 	x86/x86state.c
@@ -12,6 +13,7 @@
 if OC_X86ASM
 X86ASM_FILES = \
 	x86/cpu.c \
+	x86/mmxidct.c \
 	x86/mmxfrag.c \
 	x86/mmxstate.c \
 	x86/x86state.c
diff -Naur a/lib/state.c b/lib/state.c
--- a/lib/state.c	2005-07-20 11:39:30.351887500 +0200
+++ b/lib/state.c	2005-07-20 11:56:57.753346000 +0200
@@ -508,6 +508,7 @@
 
 
 void oc_state_vtable_init_c(oc_theora_state *_state){
+  _state->opt_vtable.oc_state_frag_recon=oc_state_frag_recon_c;
   _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
   _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
   _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c;
@@ -787,6 +788,14 @@
 void oc_state_frag_recon(oc_theora_state *_state,const oc_fragment *_frag,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
+_state->opt_vtable.oc_state_frag_recon(_state,_frag,_pli,_dct_coeffs,
+					_last_zzi,_ncoefs,_dc_iquant,
+					_ac_iquant);
+}
+
+void oc_state_frag_recon_c(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
   ogg_int16_t dct_buf[64];
   ogg_int16_t res_buf[64];
   int dst_framei;
@@ -837,11 +846,11 @@
        the iDCT.*/
     if(_last_zzi<10){
       for(;zzi<10;zzi++)dct_buf[OC_FZIG_ZAG[zzi]]=0;
-      oc_idct8x8_10(res_buf,dct_buf);
+      oc_idct8x8_10_c(res_buf,dct_buf);
     }
     else{
       for(;zzi<64;zzi++)dct_buf[OC_FZIG_ZAG[zzi]]=0;
-      oc_idct8x8(res_buf,dct_buf);
+      oc_idct8x8_c(res_buf,dct_buf);
     }
   }
   /*Fill in the target buffer.*/
diff -Naur a/lib/x86/cpu.c b/lib/x86/cpu.c
--- a/lib/x86/cpu.c	2005-07-20 11:39:29.063807000 +0200
+++ b/lib/x86/cpu.c	2005-07-20 11:43:38.231379000 +0200
@@ -13,6 +13,7 @@
  ********************************************************************/
 
 #include "cpu.h"
+#include "x86int.h"
 
 ogg_uint32_t oc_cpu_flags_get(void){
   ogg_uint32_t eax;
@@ -95,5 +96,7 @@
     /*Implement me.*/
     flags=0;
   }
+  /* Hack: fill the IDCT constants table here. */
+  fillidctconstants_mmx();  
   return flags;
 }
diff -Naur a/lib/x86/mmxidct.c b/lib/x86/mmxidct.c
--- a/lib/x86/mmxidct.c	1970-01-01 01:00:00.000000000 +0100
+++ b/lib/x86/mmxidct.c	2005-07-20 16:49:45.187242000 +0200
@@ -0,0 +1,650 @@
+#include <ogg/ogg.h>
+#include "dct.h"
+#include "idct.h"
+
+#define ASM asm
+/* 4 masks come in order low word to high */
+#define MaskOffset 0		
+/* 7 cosines come in order pi/16 * (1 ... 7) */
+#define CosineOffset 32		
+#define EightOffset 88
+#define IdctAdjustBeforeShift 8
+
+
+ogg_uint16_t __attribute__((aligned(8),used))  idctconstants[(4+7+1) * 4];
+ogg_uint16_t idctcosTbl[ 7] = 
+{
+	64277, 60547, 54491, 46341, 36410, 25080, 12785
+};
+
+/* I'm leaving the original VP3 table in place even though I'm not using all
+   of it. It can be used for MMX quantization later - Ruik */ 
+
+/* Fills idctconstants: 4 lane masks, then the 7 cosines and the rounding
+   constant (each of those copied into all four words of a quadword). */
+void fillidctconstants_mmx(void)
+{
+	int i;
+	int lane;
+
+	/* Mask k has only word k set; every other mask word is 0. */
+	for (i = 0; i < 16; i++)
+		idctconstants[i] = (i >> 2) == (i & 3) ? 65535 : 0;
+
+	/* Cosine k (1..7) occupies words 4*(k+3) .. 4*(k+3)+3. */
+	for (i = 1; i <= 7; i++)
+	{
+		ogg_uint16_t *row = idctconstants + ((i + 3) << 2);
+
+		for (lane = 0; lane < 4; lane++)
+			row[lane] = idctcosTbl[i - 1];
+	}
+
+	for (lane = 0; lane < 4; lane++)
+		idctconstants[44 + lane] = IdctAdjustBeforeShift;
+}
+
+
+#define MtoSTR(s) #s
+
+#define Dump	"call MMX_dump\n"
+
+#define BeginIDCT "#BeginIDCT\n"\
+	\
+	"	movq	"	I(3)","r2"\n"  \
+	 \
+	"	movq	"	C(3)","r6"\n" \
+	"	movq	"	r2","r4"\n" \
+	"	movq	"	J(5)","r7"\n" \
+	"	pmulhw	"	r6","r4"\n"		\
+	"	movq	"	C(5)","r1"\n" \
+	"	pmulhw	"	r7","r6"\n"		\
+	"	movq	"	r1","r5"\n" \
+	"	pmulhw	"	r2","r1"\n"		\
+	"	movq	"	I(1)","r3"\n" \
+	"	pmulhw	"	r7","r5"\n"		\
+	"	movq	"	C(1)","r0"\n"	\
+	"	paddw	"	r2","r4"\n"		\
+	"	paddw	"	r7","r6"\n"		\
+	"	paddw	"	r1","r2"\n"		\
+	"	movq	"	J(7)","r1"\n" \
+	"	paddw	"	r5","r7"\n"		\
+	"	movq	"	r0","r5"\n"		\
+	"	pmulhw	"	r3","r0"\n"		\
+	"	paddsw	"	r7","r4"\n"		\
+	"	pmulhw	"	r1","r5"\n"		\
+	"	movq	"	C(7)","r7"\n" \
+	"	psubsw	"	r2","r6"\n"		\
+	"	paddw	"	r3","r0"\n"		\
+	"	pmulhw	"	r7","r3"\n"		\
+	"	movq	"	I(2)","r2"\n" \
+	"	pmulhw	"	r1","r7"\n"		\
+	"	paddw	"	r1","r5"\n"		\
+	"	movq	"	r2","r1"\n"		\
+	"	pmulhw	"	C(2)","r2"\n"	\
+	"	psubsw	"	r5","r3"\n"		\
+	"	movq	"	J(6)","r5"\n" \
+	"	paddsw	"	r7","r0"\n"		\
+	"	movq	"	r5","r7"\n"		\
+	"	psubsw	"	r4","r0"\n"		\
+	"	pmulhw	"	C(2)","r5"\n"	\
+	"	paddw	"	r1","r2"\n"		\
+	"	pmulhw	"	C(6)","r1"\n"	\
+	"	paddsw	"	r4","r4"\n"		\
+	"	paddsw	"	r0","r4"\n"		\
+	"	psubsw	"	r6","r3"\n"		\
+	"	paddw	"	r7","r5"\n"		\
+	"	paddsw	"	r6","r6"\n"		\
+	"	pmulhw	"	C(6)","r7"\n"	\
+	"	paddsw	"	r3","r6"\n"		\
+	"	movq	"	r4","I(1)"\n"	\
+	"	psubsw	"	r5","r1"\n"		\
+	"	movq	"	C(4)","r4"\n" \
+	"	movq	"	r3","r5"\n"		\
+	"	pmulhw	"	r4","r3"\n"		\
+	"	paddsw	"	r2","r7"\n"		\
+	"	movq	"	r6","I(2)"\n"	\
+	"	movq	"	r0","r2"\n"		\
+	"	movq	"	I(0)","r6"\n" \
+	"	pmulhw	"	r4","r0"\n"		\
+	"	paddw	"	r3","r5"\n"		\
+	"\n"\
+	"	movq	"	J(4)","r3"\n" \
+	"	psubsw	"	r1","r5"\n"		\
+	"	paddw	"	r0","r2"\n"		\
+	"	psubsw	"	r3","r6"\n"		\
+	"	movq	"	r6","r0"\n" \
+	"	pmulhw	"	r4","r6"\n"		\
+	"	paddsw	"	r3","r3"\n"		\
+	"	paddsw	"	r1","r1"\n"		\
+	"	paddsw	"	r0","r3"\n"		\
+	"	paddsw	"	r5","r1"\n"		\
+	"	pmulhw	"	r3","r4"\n"		\
+	"	paddsw	"	r0","r6"\n"		\
+	"	psubsw	"	r2","r6"\n"		\
+	"	paddsw	"	r2","r2"\n"		\
+	"	movq	"	I(1)","r0"\n"	\
+	"	paddsw	"	r6","r2"\n"		\
+	"	paddw	"	r3","r4"\n"		\
+	"	psubsw	"	r1","r2"\n"		\
+	"#end BeginIDCT\n"
+// end BeginIDCT macro (38 cycles).
+
+#define RowIDCT ASM("\n"\
+	"#RowIDCT\n"\
+	BeginIDCT \
+	"\n"\
+	"	movq	"I(2)","r3"\n"	/* r3 = D. */ \
+	"	psubsw	"r7","r4"\n"	/* r4 = E. = E - G */ \
+	"	paddsw	"r1","r1"\n"	/* r1 = H. + H. */ \
+	"	paddsw	"r7","r7"\n"	/* r7 = G + G */ \
+	"	paddsw	"r2","r1"\n"	/* r1 = R1 = A.. + H. */ \
+	"	paddsw	"r4","r7"\n"	/* r7 = G. = E + G */ \
+	"	psubsw	"r3","r4"\n"	/* r4 = R4 = E. - D. */ \
+	"	paddsw	"r3","r3"\n" \
+	"	psubsw	"r5","r6"\n"	/* r6 = R6 = F. - B.. */ \
+	"	paddsw	"r5","r5"\n" \
+	"	paddsw	"r4","r3"\n"	/* r3 = R3 = E. + D. */ \
+	"	paddsw	"r6","r5"\n"	/* r5 = R5 = F. + B.. */ \
+	"	psubsw	"r0","r7"\n"	/* r7 = R7 = G. - C. */ \
+	"	paddsw	"r0","r0"\n" \
+	"	movq	"r1","I(1)"\n"	/* save R1 */ \
+	"	paddsw	"r7","r0"\n"	/* r0 = R0 = G. + C. */ \
+	"#end RowIDCT"\
+);
+// end RowIDCT macro (8 + 38 = 46 cycles)
+
+
+/* Following macro does two 4x4 transposes in place.
+
+  At entry (we assume):
+
+	r0 = a3 a2 a1 a0
+	I(1) = b3 b2 b1 b0
+	r2 = c3 c2 c1 c0
+	r3 = d3 d2 d1 d0
+
+	r4 = e3 e2 e1 e0
+	r5 = f3 f2 f1 f0
+	r6 = g3 g2 g1 g0
+	r7 = h3 h2 h1 h0
+
+   At exit, we have:
+
+	I(0) = d0 c0 b0 a0
+	I(1) = d1 c1 b1 a1
+	I(2) = d2 c2 b2 a2
+	I(3) = d3 c3 b3 a3
+	
+	J(4) = h0 g0 f0 e0
+	J(5) = h1 g1 f1 e1
+	J(6) = h2 g2 f2 e2
+	J(7) = h3 g3 f3 e3
+
+   I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
+   J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
+
+   Since r1 is free at entry, we calculate the Js first. */
+
+
+
+#define Transpose ASM("\n#Transpose\n" \
+	\
+	"	movq		"r4","r1"\n" 			 \
+	"	punpcklwd	"r5","r4"\n"			 \
+	"	movq		"r0","I(0)"\n"		 \
+	"	punpckhwd	"r5","r1"\n"			 \
+	"	movq		"r6","r0"\n"			 \
+	"	punpcklwd	"r7","r6"\n"			 \
+	"	movq		"r4","r5"\n"			 \
+	"	punpckldq	"r6","r4"\n"			 \
+	"	punpckhdq	"r6","r5"\n"			 \
+	"	movq		"r1","r6"\n"			 \
+	"	movq		"r4","J(4)"\n" \
+	"	punpckhwd	"r7","r0"\n"			 \
+	"	movq		"r5","J(5)"\n" \
+	"	punpckhdq	"r0","r6"\n"			 \
+	"	movq		"I(0)","r4"\n"		 \
+	"	punpckldq	"r0","r1"\n"			 \
+	"	movq		"I(1)","r5"\n"		 \
+	"	movq		"r4","r0"\n"			 \
+	"	movq		"r6","J(7)"\n" \
+	"	punpcklwd	"r5","r0"\n"			 \
+	"	movq		"r1","J(6)"\n" \
+	"	punpckhwd	"r5","r4"\n"			 \
+	"	movq		"r2","r5"\n"			 \
+	"	punpcklwd	"r3","r2"\n"			 \
+	"	movq		"r0","r1"\n"			 \
+	"	punpckldq	"r2","r0"\n"			 \
+	"	punpckhdq	"r2","r1"\n"			 \
+	"	movq		"r4","r2"\n"			 \
+	"	movq		"r0","I(0)"\n" \
+	"	punpckhwd	"r3","r5"\n"			 \
+	"	movq		"r1","I(1)"\n" \
+	"	punpckhdq	"r5","r4"\n"			 \
+	"	punpckldq	"r5","r2"\n"			 \
+	 \
+	"	movq		"r4","I(3)"\n"  \
+	 \
+	"	movq		"r2","I(2)"\n"  \
+	"#end Transpose\n"\
+);
+// end Transpose macro (19 cycles).
+
+#define ColumnIDCT ASM("\n"\
+	"#ColumnIDCT\n"\
+	BeginIDCT \
+	"\n"\
+	"	paddsw	"Eight","r2"\n"	\
+	"	paddsw	"r1","r1"\n"		/* r1 = H. + H. */ \
+	"	paddsw	"r2","r1"\n"		/* r1 = R1 = A.. + H. */ \
+	"	psraw	""$4"","r2"\n"		/* r2 = NR2 */ \
+	"	psubsw	"r7","r4"\n"		/* r4 = E. = E - G */ \
+	"	psraw	""$4"","r1"\n"		/* r1 = NR1 */ \
+	"	movq	"I(2)","r3"\n"	/* r3 = D. */ \
+	"	paddsw	"r7","r7"\n"		/* r7 = G + G */ \
+	"	movq	"r2","I(2)"\n"	/* store NR2 at I2 */ \
+	"	paddsw	"r4","r7"\n"		/* r7 = G. = E + G */ \
+	"	movq	"r1","I(1)"\n"	/* store NR1 at I1 */ \
+	"	psubsw	"r3","r4"\n"		/* r4 = R4 = E. - D. */ \
+	"	paddsw	"Eight","r4"\n"	\
+	"	paddsw	"r3","r3"\n"		/* r3 = D. + D. */ \
+	"	paddsw	"r4","r3"\n"		/* r3 = R3 = E. + D. */ \
+	"	psraw	""$4"","r4"\n"		/* r4 = NR4 */ \
+	"	psubsw	"r5","r6"\n"		/* r6 = R6 = F. - B.. */ \
+	"	psraw	""$4"","r3"\n"		/* r3 = NR3 */ \
+	"	paddsw	"Eight","r6"\n"	\
+	"	paddsw	"r5","r5"\n"		/* r5 = B.. + B.. */ \
+	"	paddsw	"r6","r5"\n"		/* r5 = R5 = F. + B.. */ \
+	"	psraw	""$4"","r6"\n"		/* r6 = NR6 */ \
+	"	movq	"r4","J(4)"\n"	/* store NR4 at J4 */ \
+	"	psraw	""$4"","r5"\n"		/* r5 = NR5 */ \
+	"	movq	"r3","I(3)"\n"	/* store NR3 at I3 */ \
+	"	psubsw	"r0","r7"\n"		/* r7 = R7 = G. - C. */ \
+	"	paddsw	"Eight","r7"\n"	\
+	"	paddsw	"r0","r0"\n" 		/* r0 = C. + C. */ \
+	"	paddsw	"r7","r0"\n"		/* r0 = R0 = G. + C. */ \
+	"	psraw	""$4"","r7"\n"		/* r7 = NR7 */ \
+	"	movq	"r6","J(6)"\n"	/* store NR6 at J6 */ \
+	"	psraw	""$4"","r0"\n"		/* r0 = NR0 */ \
+	"	movq	"r5","J(5)"\n"	/* store NR5 at J5 */ \
+	"	movq	"r7","J(7)"\n"	/* store NR7 at J7 */ \
+	"	movq	"r0","I(0)"\n"	/* store NR0 at I0 */ \
+	"#end ColumnIDCT\n"\
+);
+// end ColumnIDCT macro (38 + 19 = 57 cycles)
+/* Debug helper targeted by the Dump macro: stores mm0..mm7 at (%edi)..56(%edi).
+   NOTE(review): uses %edi and a bare "ret" inside inline asm - confirm before relying on it, especially on x86-64. */
+void MMX_dump() 
+{
+	ASM 
+	("\
+		movq	%mm0,(%edi)\n\
+		movq	%mm1,8(%edi)\n\
+		movq	%mm2,16(%edi)\n\
+		movq	%mm3,24(%edi)\n\
+		movq	%mm4,32(%edi)\n\
+		movq	%mm5,40(%edi)\n\
+		movq	%mm6,48(%edi)\n\
+		movq	%mm7,56(%edi)\n\
+		ret"
+	);
+}
+
+void oc_idct8x8_mmx(ogg_int16_t _y[64]){
+/* In-place 8x8 iDCT on _y. The routine accepts the 8x8 matrix in transposed
+   form: every 4x4 submatrix is transposed.
+   NOTE(review): edx/ecx are set up by one asm statement and read by the
+   following ones; GCC does not promise they survive in between - confirm. */
+#	define r0	"%mm0"
+#	define r1	"%mm1"
+#	define r2	"%mm2"
+#	define r3	"%mm3"
+#	define r4	"%mm4"
+#	define r5	"%mm5"
+#	define r6	"%mm6"
+#	define r7	"%mm7"
+
+
+#	undef M
+
+#if (defined(__amd64__) ||  defined(__x86_64__))
+
+	__asm__ __volatile__ ("\n"
+	"#lea %1,%%rcx\n"
+	:
+	: "d" (_y),
+	  "c" (idctconstants)
+	
+	);
+
+#	define MIDM(M,I)	MtoSTR(M+I*8(%rcx))
+#	define M(I)		MIDM( MaskOffset , I )
+#	define MIDC(M,I)	MtoSTR(M+(I-1)*8(%rcx))
+#	define C(I)		MIDC( CosineOffset , I )
+#	define MIDEight(M)	MtoSTR(M(%rcx))
+#	define Eight		MIDEight(EightOffset)
+
+#	define I( K)	MtoSTR((K*16)(%rdx))
+#	define J( K)	MtoSTR(((K - 4) * 16)+8(%rdx))
+
+
+#else
+	__asm__ __volatile__ ("\n"
+	"movl $idctconstants,%%ecx\n"
+	:
+	: "d" (_y)
+	 : "ecx"
+	);
+
+#	define MIDM(M,I)	MtoSTR(M+I*8(%ecx))
+#	define M(I)		MIDM( MaskOffset , I )
+#	define MIDC(M,I)	MtoSTR(M+(I-1)*8(%ecx))
+#	define C(I)		MIDC( CosineOffset , I )
+#	define MIDEight(M)	MtoSTR(M(%ecx))
+#	define Eight		MIDEight(EightOffset)
+
+#	define I( K)	MtoSTR((K*16)(%edx))
+#	define J( K)	MtoSTR(((K - 4) * 16)+8(%edx))
+
+#endif
+
+	RowIDCT
+	Transpose
+
+#	undef I
+#	undef J
+
+#if (defined(__amd64__) ||  defined(__x86_64__))
+#	define I( K)	MtoSTR((K * 16)+64(%rdx))
+#	define J( K)	MtoSTR(((K - 4)*16)+72(%rdx))
+#else
+#	define I( K)	MtoSTR((K * 16)+64(%edx))
+#	define J( K)	MtoSTR(((K - 4)*16)+72(%edx))
+#endif 
+	RowIDCT
+	Transpose
+
+#	undef I
+#	undef J
+
+
+
+#if (defined(__amd64__) ||  defined(__x86_64__))
+#	define I( K)	MtoSTR((K * 16)(%rdx))
+#else
+#	define I( K)	MtoSTR((K * 16)(%edx))
+#endif
+
+#	define J( K)	I( K)
+
+	ColumnIDCT
+
+#	undef I
+#	undef J
+
+#if (defined(__amd64__) ||  defined(__x86_64__))
+#	define I( K)	MtoSTR((K * 16)+8(%rdx))
+#else
+#	define I( K)	MtoSTR((K * 16)+8(%edx))
+
+#endif
+
+#	define J( K)	I( K)
+
+	ColumnIDCT
+
+#	undef I
+#	undef J
+
+	ASM("\n"
+	"	emms\n"
+	);
+ 
+}
+
+
+
+#define BeginIDCT_10 "#BeginIDCT_10\n"\
+	"	movq	"I(3)","r2"\n"  \
+	"	nop\n" \
+\
+	"	movq	"C(3)","r6"\n" \
+	"	movq	"r2","r4"\n" \
+\
+	"	movq	"C(5)","r1"\n" \
+	"	pmulhw	"r6","r4"\n"		 \
+\
+	"	movq	"I(1)","r3"\n" \
+	"	pmulhw	"r2","r1"\n"		\
+\
+	"	movq	"C(1)","r0"\n"	\
+	"	paddw	"r2","r4"\n"		\
+\
+    "	pxor    "r6","r6"\n"     \
+	"	paddw	"r1","r2"\n"		\
+\
+	"	movq	"I(2)","r5"\n" \
+	"	pmulhw	"r3","r0"\n"		\
+\
+	"	movq	"r5","r1"\n" \
+	"	paddw	"r3","r0"\n"		\
+\
+	"	pmulhw	"C(7)","r3"\n"	\
+	"	psubsw	"r2","r6"\n"		\
+\
+	"	pmulhw	"C(2)","r5"\n"	\
+	"	psubsw	"r4","r0"\n"		\
+\
+    "	movq    "I(2)","r7"\n"\
+	"	paddsw	"r4","r4"\n"		\
+\
+	"	paddw	"r5","r7"\n"		\
+	"	paddsw	"r0","r4"\n"		\
+\
+	"	pmulhw	"C(6)","r1"\n"	\
+	"	psubsw	"r6","r3"\n"		\
+\
+	"	movq	"r4","I(1)"\n"	\
+	"	paddsw	"r6","r6"\n"		\
+\
+    "	movq	"C(4)","r4"\n" \
+	"	paddsw	"r3","r6"\n"		\
+\
+	"	movq	"r3","r5"\n"		\
+	"	pmulhw	"r4","r3"\n"		\
+\
+	"	movq	"r6","I(2)"\n"	\
+	"	movq	"r0","r2"\n"		\
+\
+	"	movq	"I(0)","r6"\n" \
+	"	pmulhw	"r4","r0"\n"		\
+\
+	"	paddw	"r3","r5"\n"		\
+	"	paddw	"r0","r2"\n"		\
+\
+	"	psubsw	"r1","r5"\n"		\
+	"	pmulhw	"r4","r6"\n"		\
+\
+    "	paddw   "I(0)","r6"\n"   \
+	"	paddsw	"r1","r1"\n"		\
+\
+	"	movq	"r6","r4"\n"     \
+	"	paddsw	"r5","r1"\n"		\
+\
+	"	psubsw	"r2","r6"\n"		\
+	"	paddsw	"r2","r2"\n"		\
+\
+	"	movq	"I(1)","r0"\n"	\
+	"	paddsw	"r6","r2"\n"		\
+\
+	"	psubsw	"r1","r2"\n"		\
+	"	nop\n" \
+	"#end BeginIDCT_10\n"
+// end BeginIDCT_10 macro (25 cycles).
+
+
+#define RowIDCT_10 ASM("\n"\
+	"#RowIDCT_10\n"\
+	BeginIDCT_10 \
+	"\n"\
+	"	movq	"I(2)","r3"\n"	/* r3 = D. */ \
+	"	psubsw	"r7","r4"\n"		/* r4 = E. = E - G */ \
+	"	paddsw	"r1","r1"\n"		/* r1 = H. + H. */ \
+	"	paddsw	"r7","r7"\n"		/* r7 = G + G */ \
+	"	paddsw	"r2","r1"\n"		/* r1 = R1 = A.. + H. */ \
+	"	paddsw	"r4","r7"\n"		/* r7 = G. = E + G */ \
+	"	psubsw	"r3","r4"\n"		/* r4 = R4 = E. - D. */ \
+	"	paddsw	"r3","r3"\n" \
+	"	psubsw	"r5","r6"\n"		/* r6 = R6 = F. - B.. */ \
+	"	paddsw	"r5","r5"\n" \
+	"	paddsw	"r4","r3"\n"		/* r3 = R3 = E. + D. */ \
+	"	paddsw	"r6","r5"\n"		/* r5 = R5 = F. + B.. */ \
+	"	psubsw	"r0","r7"\n"		/* r7 = R7 = G. - C. */ \
+	"	paddsw	"r0","r0"\n" \
+	"	movq	"r1","I(1)"\n"	/* save R1 */ \
+	"	paddsw	"r7","r0"\n"		/* r0 = R0 = G. + C. */ \
+	"#end RowIDCT_10\n"\
+);
+// end RowIDCT_10 macro (cycle note copied from RowIDCT: 8 + 38 = 46 cycles)
+
+
+#define ColumnIDCT_10 ASM("\n"\
+	"#ColumnIDCT_10\n"\
+	BeginIDCT_10 \
+	"\n"\
+	"	paddsw	"Eight","r2"\n"	\
+	"	paddsw	"r1","r1"\n"	/* r1 = H. + H. */ \
+	"	paddsw	"r2","r1"\n"	/* r1 = R1 = A.. + H. */ \
+	"	psraw	""$4"","r2"\n"		/* r2 = NR2 */ \
+	"	psubsw	"r7","r4"\n"	/* r4 = E. = E - G */ \
+	"	psraw	""$4"","r1"\n"		/* r1 = NR1 */ \
+	"	movq	"I(2)","r3"\n"	/* r3 = D. */ \
+	"	paddsw	"r7","r7"\n"	/* r7 = G + G */ \
+	"	movq	"r2","I(2)"\n"	/* store NR2 at I2 */ \
+	"	paddsw	"r4","r7"\n"	/* r7 = G. = E + G */ \
+	"	movq	"r1","I(1)"\n"	/* store NR1 at I1 */ \
+	"	psubsw	"r3","r4"\n"	/* r4 = R4 = E. - D. */ \
+	"	paddsw	"Eight","r4"\n"	\
+	"	paddsw	"r3","r3"\n"	/* r3 = D. + D. */ \
+	"	paddsw	"r4","r3"\n"	/* r3 = R3 = E. + D. */ \
+	"	psraw	""$4"","r4"\n"		/* r4 = NR4 */ \
+	"	psubsw	"r5","r6"\n"	/* r6 = R6 = F. - B.. */ \
+	"	psraw	""$4"","r3"\n"		/* r3 = NR3 */ \
+	"	paddsw	"Eight","r6"\n"	\
+	"	paddsw	"r5","r5"\n"	/* r5 = B.. + B.. */ \
+	"	paddsw	"r6","r5"\n"	/* r5 = R5 = F. + B.. */ \
+	"	psraw	""$4"","r6"\n"		/* r6 = NR6 */ \
+	"	movq	"r4","J(4)"\n"	/* store NR4 at J4 */ \
+	"	psraw	""$4"","r5"\n"		/* r5 = NR5 */ \
+	"	movq	"r3","I(3)"\n"	/* store NR3 at I3 */ \
+	"	psubsw	"r0","r7"\n"	/* r7 = R7 = G. - C. */ \
+	"	paddsw	"Eight","r7"\n"	\
+	"	paddsw	"r0","r0"\n" 	/* r0 = C. + C. */ \
+	"	paddsw	"r7","r0"\n"	/* r0 = R0 = G. + C. */ \
+	"	psraw	""$4"","r7"\n"		/* r7 = NR7 */ \
+	"	movq	"r6","J(6)"\n"	/* store NR6 at J6 */ \
+	"	psraw	""$4"","r0"\n"		/* r0 = NR0 */ \
+	"	movq	"r5","J(5)"\n"	/* store NR5 at J5 */ \
+	 \
+	"	movq	"r7","J(7)"\n"	/* store NR7 at J7 */ \
+	 \
+	"	movq	"r0","I(0)"\n"	/* store NR0 at I0 */ \
+	"#end ColumnIDCT_10\n"\
+);
+// end ColumnIDCT_10 macro (cycle note copied from ColumnIDCT: 38 + 19 = 57 cycles)
+
+
+/* iDCT for blocks with few coefficients: one RowIDCT_10 pass, then both
+   ColumnIDCT_10 passes, all in place on _y (the caller uses it for _last_zzi<10). */
+void oc_idct8x8_10_mmx(ogg_int16_t _y[64])
+{
+
+#	define r0	"%mm0"
+#	define r1	"%mm1"
+#	define r2	"%mm2"
+#	define r3	"%mm3"
+#	define r4	"%mm4"
+#	define r5	"%mm5"
+#	define r6	"%mm6"
+#	define r7	"%mm7"
+
+#	undef M
+
+#if (defined(__amd64__) ||  defined(__x86_64__))
+
+#	define M(I)		MIDM( MaskOffset , I )
+#	define MIDC(M,I)	MtoSTR(M+(I-1)*8(%rcx))
+#	define C(I)		MIDC( CosineOffset , I )
+#	define MIDEight(M)	MtoSTR(M(%rcx))
+#	define Eight		MIDEight(EightOffset)
+
+	__asm__ __volatile__ ("\n"
+	:
+	: "d" (_y),
+	  "c" (idctconstants)
+	  
+	);
+
+#	define I( K)	MtoSTR((K*16)(%rdx))
+#	define J( K)	MtoSTR(((K - 4) * 16)+8(%rdx))
+
+#else
+#	define M(I)		MIDM( MaskOffset , I )
+#	define MIDC(M,I)	MtoSTR(M+(I-1)*8(%ecx))
+#	define C(I)		MIDC( CosineOffset , I )
+#	define MIDEight(M)	MtoSTR(M(%ecx))
+#	define Eight		MIDEight(EightOffset)
+
+	__asm__ __volatile__ ("\n"
+	"mov	$idctconstants,%%ecx\n"
+	:
+	: "d" (_y)
+	 : "ecx"
+	);
+
+#	define I( K)	MtoSTR((K*16)(%edx))
+#	define J( K)	MtoSTR(((K - 4) * 16)+8(%edx))
+
+#endif
+
+
+//; Done w/dequant + descramble + partial transpose; now do the idct itself.
+
+
+	RowIDCT_10
+	Transpose
+
+#	undef I
+#	undef J
+
+#if (defined(__amd64__) ||  defined(__x86_64__))
+#	define I( K)	MtoSTR((K * 16)(%rdx))
+#else
+#	define I( K)	MtoSTR((K * 16)(%edx))
+#endif
+#	define J( K)	I( K)
+
+	ColumnIDCT_10
+
+#	undef I
+#	undef J
+
+#if (defined(__amd64__) ||  defined(__x86_64__))
+#	define I( K)	MtoSTR((K * 16)+8(%rdx))
+#else
+#	define I( K)	MtoSTR((K * 16)+8(%edx))
+#endif
+#	define J( K)	I( K)
+
+	ColumnIDCT_10
+
+#	undef I
+#	undef J
+
+	ASM("\n"
+	"	emms\n"
+	);
+}
+
+
diff -Naur a/lib/x86/mmxstate.c b/lib/x86/mmxstate.c
--- a/lib/x86/mmxstate.c	2005-07-20 11:39:29.059806750 +0200
+++ b/lib/x86/mmxstate.c	2005-07-20 16:48:32.718713000 +0200
@@ -10,7 +10,124 @@
  *                                                                  *
  ********************************************************************
  */
+#include <ogg/ogg.h>
 #include "x86int.h"
+#include "../internal.h" 
+
+static const __attribute__((aligned(8),used)) const int OC_FZIG_ZAGMMX[128]={
+0, 8, 1, 2, 9, 16, 24, 17,
+10, 3, 32, 11, 18, 25, 4, 12,
+5, 26, 19, 40, 33, 34, 41, 48,
+27, 6, 13, 20, 28, 21, 14, 7,
+56, 49, 42, 35, 43, 50, 57, 36,
+15, 22, 29, 30, 23, 44, 37, 58,
+51, 59, 38, 45, 52, 31, 60, 53,
+46, 39, 47, 54, 61, 62, 55, 63,
+64, 64, 64, 64, 64, 64, 64, 64,
+64, 64, 64, 64, 64, 64, 64, 64,
+64, 64, 64, 64, 64, 64, 64, 64,
+64, 64, 64, 64, 64, 64, 64, 64,
+64, 64, 64, 64, 64, 64, 64, 64,
+64, 64, 64, 64, 64, 64, 64, 64,
+64, 64, 64, 64, 64, 64, 64, 64,
+64, 64, 64, 64, 64, 64, 64, 64
+};
+
+/*MMX counterpart of oc_state_frag_recon_c (same parameters): dequantizes the
+   fragment's coefficients into res_buf, runs the MMX iDCT on it in place, and
+   passes the residue to the oc_frag_recon_* routines to fill the target.*/
+void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
+  ogg_int16_t  __attribute__((aligned(8))) res_buf[64];
+  int dst_framei;
+  int dst_ystride;
+  int zzi;
+  int ci;
+  /*_last_zzi is subtly different from an actual count of the number of
+     coefficients we decoded for this block.
+    It contains the value of zzi BEFORE the final token in the block was
+     decoded.
+    In most cases this is an EOB token (the continuation of an EOB run from a
+     previous block counts), and so this is the same as the coefficient count.
+    However, in the case that the last token was NOT an EOB token, but filled
+     the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+    Provided the last token was not a pure zero run, the minimum value it can
+     be is 46, and so that doesn't affect any of the cases in this routine.
+    However, if the last token WAS a pure zero run of length 63, then _last_zzi
+     will be 1 while the number of coefficients decoded is 64.
+    Thus, we will trigger the following special case, where the real
+     coefficient count would not.
+    Note also that a zero run of length 64 will give _last_zzi a value of 0,
+     but we still process the DC coefficient, which might have a non-zero value
+     due to DC prediction.
+    Although convoluted, this is arguably the correct behavior: it allows us to
+     dequantize fewer coefficients and use a smaller transform when the block
+     ends with a long zero run instead of a normal EOB token.
+    It could be smarter... multiple separate zero runs at the end of a block
+     will fool it, but an encoder that generates these really deserves what it
+     gets.
+    Needless to say we inherited this approach from VP3.*/
+
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_int16_t p;
+    /*Why is the iquant product rounded in this case and no others?
+      Who knows.*/
+    p=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
+    /*Fill all 64 residues with p.
+      A plain loop replaces the "rep stos" asm: the store has to be 16 bits
+       wide (stosq wrote 8 bytes of rax per step), and asm that advances
+       rdi/rcx must not declare them as input-only operands.*/
+    for(ci=0;ci<64;ci++)res_buf[ci]=p;
+  }
+  else{
+    /*Zero the buffer first, so every coefficient that is not dequantized
+       below is 0 going into the iDCT.*/
+    for(ci=0;ci<64;ci++)res_buf[ci]=0;
+    res_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
+    /*Dequantize directly into the coefficient order the MMX iDCT reads
+       (OC_FZIG_ZAGMMX rather than OC_FZIG_ZAG).*/
+    for(zzi=1;zzi<_ncoefs;zzi++){
+      ci=OC_FZIG_ZAG[zzi];
+      res_buf[OC_FZIG_ZAGMMX[zzi]]=
+       (ogg_int16_t)((ogg_int32_t)_dct_coeffs[ci]*_ac_iquant[ci]);
+    }
+    if(_last_zzi<10){
+      oc_idct8x8_10_mmx(res_buf);
+    }
+    else{
+      oc_idct8x8_mmx(res_buf);
+    }
+  }
+  /*Fill in the target buffer.*/
+  dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
+  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+  /*For now ystride values in all ref frames assumed to be equal.*/
+  if(_frag->mbmode==OC_MODE_INTRA){
+    oc_frag_recon_intra(_state,_frag->buffer[dst_framei],dst_ystride,res_buf);
+  }
+  else{
+    int ref_framei;
+    int ref_ystride;
+    int mvoffset0;
+    int mvoffset1;
+    ref_framei=_state->ref_frame_idx[OC_FRAME_FOR_MODE[_frag->mbmode]];
+    ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].ystride;
+    if(oc_state_get_mv_offsets(_state,&mvoffset0,&mvoffset1,_frag->mv[0],
+     _frag->mv[1],ref_ystride,_pli)>1){
+      oc_frag_recon_inter2(_state,_frag->buffer[dst_framei],dst_ystride,
+       _frag->buffer[ref_framei]+mvoffset0,ref_ystride,
+       _frag->buffer[ref_framei]+mvoffset1,ref_ystride,res_buf);
+    }
+    else{
+      oc_frag_recon_inter(_state,_frag->buffer[dst_framei],dst_ystride,
+       _frag->buffer[ref_framei]+mvoffset0,ref_ystride,res_buf);
+    }
+  }
+  oc_restore_fpu(_state);
+}
+
 
 /*Copies the fragments specified by the lists of fragment indices from one
    frame to another.
diff -Naur a/lib/x86/x86int.h b/lib/x86/x86int.h
--- a/lib/x86/x86int.h	2005-07-20 11:39:29.063807000 +0200
+++ b/lib/x86/x86int.h	2005-07-20 13:30:16.883269750 +0200
@@ -3,7 +3,6 @@
 # include "../internal.h"
 
 void oc_state_vtable_init_x86(oc_theora_state *_state);
-
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
  const ogg_int16_t *_residue);
 void oc_frag_recon_inter_mmx(unsigned char *_dst,int _dst_ystride,
@@ -14,5 +13,11 @@
 void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
  int _nfragis,int _dst_frame,int _src_frame,int _pli);
 void oc_restore_fpu_mmx(void);
+void oc_idct8x8_mmx(ogg_int16_t _y[64]);
+void oc_idct8x8_10_mmx(ogg_int16_t _y[64]);
+void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,                                               
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,                                                             
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
+void fillidctconstants_mmx(void);
 
 #endif
diff -Naur a/lib/x86/x86state.c b/lib/x86/x86state.c
--- a/lib/x86/x86state.c	2005-07-20 11:39:29.063807000 +0200
+++ b/lib/x86/x86state.c	2005-07-20 11:32:49.138813250 +0200
@@ -4,6 +4,7 @@
 void oc_state_vtable_init_x86(oc_theora_state *_state){
   _state->cpu_flags=oc_cpu_flags_get();  
   if(_state->cpu_flags&OC_CPU_X86_MMX){
+    _state->opt_vtable.oc_state_frag_recon=oc_state_frag_recon_mmx;
     _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
     _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
     _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;