[xiph-commits] r9609 - in experimental/derf/theora-exp: lib lib/x86 unix

tterribe at svn.xiph.org tterribe at svn.xiph.org
Sat Jul 23 19:34:00 PDT 2005


Author: tterribe
Date: 2005-07-23 19:33:55 -0700 (Sat, 23 Jul 2005)
New Revision: 9609

Added:
   experimental/derf/theora-exp/lib/x86/mmxidct.c
Modified:
   experimental/derf/theora-exp/lib/Makefile.am
   experimental/derf/theora-exp/lib/idct.c
   experimental/derf/theora-exp/lib/idct.h
   experimental/derf/theora-exp/lib/internal.h
   experimental/derf/theora-exp/lib/state.c
   experimental/derf/theora-exp/lib/x86/cpu.c
   experimental/derf/theora-exp/lib/x86/mmxstate.c
   experimental/derf/theora-exp/lib/x86/x86int.h
   experimental/derf/theora-exp/lib/x86/x86state.c
   experimental/derf/theora-exp/unix/Makefile
Log:
Latest MMX patches from Rudolf Marek.
This includes an MMX iDCT and MMX acceleration for other pieces of fragment
 reconstruction (DC-only case and zeroing the buffer).
The iDCT implementation originates from the old VP3 codebase (though it was
 cleaned up substantially).
Thus, the flow will not match the traditional Chen factorization (as described
 in the spec and used in the C version), but it should produce identical
 results.


Modified: experimental/derf/theora-exp/lib/Makefile.am
===================================================================
--- experimental/derf/theora-exp/lib/Makefile.am	2005-07-23 23:59:34 UTC (rev 9608)
+++ experimental/derf/theora-exp/lib/Makefile.am	2005-07-24 02:33:55 UTC (rev 9609)
@@ -5,6 +5,7 @@
 
 EXTRA_DIST = \
 	x86/cpu.c \
+	x86/mmxidct.c \
 	x86/mmxfrag.c \
 	x86/mmxstate.c \
 	x86/x86state.c
@@ -12,6 +13,7 @@
 if OC_X86ASM
 X86ASM_FILES = \
 	x86/cpu.c \
+	x86/mmxidct.c \
 	x86/mmxfrag.c \
 	x86/mmxstate.c \
 	x86/x86state.c

Modified: experimental/derf/theora-exp/lib/idct.c
===================================================================
--- experimental/derf/theora-exp/lib/idct.c	2005-07-23 23:59:34 UTC (rev 9608)
+++ experimental/derf/theora-exp/lib/idct.c	2005-07-24 02:33:55 UTC (rev 9609)
@@ -208,7 +208,7 @@
   _y: The buffer to store the result in.
       This may be the same as _x.
   _x: The input coefficients. */
-void oc_idct8x8(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+void oc_idct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
   const ogg_int16_t *in;
   ogg_int16_t       *end;
   ogg_int16_t       *out;
@@ -236,7 +236,7 @@
   _y: The buffer to store the result in.
       This may be the same as _x.
   _x: The input coefficients. */
-void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+void oc_idct8x8_10_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
   const ogg_int16_t *in;
   ogg_int16_t       *end;
   ogg_int16_t       *out;

Modified: experimental/derf/theora-exp/lib/idct.h
===================================================================
--- experimental/derf/theora-exp/lib/idct.h	2005-07-23 23:59:34 UTC (rev 9608)
+++ experimental/derf/theora-exp/lib/idct.h	2005-07-24 02:33:55 UTC (rev 9609)
@@ -3,7 +3,7 @@
 #if !defined(_idct_H)
 # define _idct_H (1)
 
-void oc_idct8x8(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
-void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_idct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_idct8x8_10_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
 
 #endif

Modified: experimental/derf/theora-exp/lib/internal.h
===================================================================
--- experimental/derf/theora-exp/lib/internal.h	2005-07-23 23:59:34 UTC (rev 9608)
+++ experimental/derf/theora-exp/lib/internal.h	2005-07-24 02:33:55 UTC (rev 9609)
@@ -238,6 +238,9 @@
    int _src2_ystride,const ogg_int16_t *_residue);
   void (*state_frag_copy)(const oc_theora_state *_state,
    const int *_fragis,int _nfragis,int _dst_frame,int _src_frame,int _pli);
+  void (*state_frag_recon)(oc_theora_state *_state,const oc_fragment *_frag,
+   int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+   ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
   void (*restore_fpu)(void);
 }oc_base_opt_vtable;
 
@@ -385,9 +388,6 @@
 int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby);
 int oc_state_get_mv_offsets(oc_theora_state *_state,int *_offset0,
  int *_offset1,int _dx,int _dy,int _ystride,int _pli);
-void oc_state_frag_recon(oc_theora_state *_state,const oc_fragment *_frag,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
 
 int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv);
 void oc_state_loop_filter(oc_theora_state *_state,int _frame);
@@ -408,6 +408,9 @@
  int _src2_ystride,const ogg_int16_t *_residue);
 void oc_state_frag_copy(const oc_theora_state *_state,const int *_fragis,
  int _nfragis,int _dst_frame,int _src_frame,int _pli);
+void oc_state_frag_recon(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
 void oc_restore_fpu(const oc_theora_state *_state);
 
 /*Default pure-C implementations.*/
@@ -420,6 +423,9 @@
  int _src2_ystride,const ogg_int16_t *_residue);
 void oc_state_frag_copy_c(const oc_theora_state *_state,const int *_fragis,
  int _nfragis,int _dst_frame,int _src_frame,int _pli);
+void oc_state_frag_recon_c(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
 void oc_restore_fpu_c(void);
 
 #endif

Modified: experimental/derf/theora-exp/lib/state.c
===================================================================
--- experimental/derf/theora-exp/lib/state.c	2005-07-23 23:59:34 UTC (rev 9608)
+++ experimental/derf/theora-exp/lib/state.c	2005-07-24 02:33:55 UTC (rev 9609)
@@ -512,6 +512,7 @@
   _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
   _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c;
   _state->opt_vtable.state_frag_copy=oc_state_frag_copy_c;
+  _state->opt_vtable.state_frag_recon=oc_state_frag_recon_c;
   _state->opt_vtable.restore_fpu=oc_restore_fpu_c;
 }
 
@@ -787,6 +788,13 @@
 void oc_state_frag_recon(oc_theora_state *_state,const oc_fragment *_frag,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
+  _state->opt_vtable.state_frag_recon(_state,_frag,_pli,_dct_coeffs,
+   _last_zzi,_ncoefs,_dc_iquant,_ac_iquant);
+}
+
+void oc_state_frag_recon_c(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
   ogg_int16_t dct_buf[64];
   ogg_int16_t res_buf[64];
   int dst_framei;
@@ -837,11 +845,11 @@
        the iDCT.*/
     if(_last_zzi<10){
       for(;zzi<10;zzi++)dct_buf[OC_FZIG_ZAG[zzi]]=0;
-      oc_idct8x8_10(res_buf,dct_buf);
+      oc_idct8x8_10_c(res_buf,dct_buf);
     }
     else{
       for(;zzi<64;zzi++)dct_buf[OC_FZIG_ZAG[zzi]]=0;
-      oc_idct8x8(res_buf,dct_buf);
+      oc_idct8x8_c(res_buf,dct_buf);
     }
   }
   /*Fill in the target buffer.*/

Modified: experimental/derf/theora-exp/lib/x86/cpu.c
===================================================================
--- experimental/derf/theora-exp/lib/x86/cpu.c	2005-07-23 23:59:34 UTC (rev 9608)
+++ experimental/derf/theora-exp/lib/x86/cpu.c	2005-07-24 02:33:55 UTC (rev 9609)
@@ -13,6 +13,7 @@
  ********************************************************************/
 
 #include "cpu.h"
+#include "x86int.h"
 
 ogg_uint32_t oc_cpu_flags_get(void){
   ogg_uint32_t eax;
@@ -20,49 +21,45 @@
   ogg_uint32_t ecx;
   ogg_uint32_t edx;
   ogg_uint32_t flags;
-
 #if (defined(__amd64__) || defined(__x86_64__))
-
-#define cpuid(op,eax,ebx,ecx,edx) \
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
   __asm__ __volatile__( \
-   "push %%rbx   \n\t" \
-   "cpuid         \n\t" \
-   "movl %%ebx,%1 \n\t" \
-   "pop %%rbx" \
-   :"=a" (eax), \
-    "=r" (ebx), \
-    "=c" (ecx), \
-    "=d" (edx) \
-   :"a" (op) \
+   "push %%rbx\n\t" \
+   "cpuid\n\t" \
+   "movl %%ebx,%1\n\t" \
+   "pop  %%rbx\n\t" \
+   :"=a" (_eax), \
+    "=r" (_ebx), \
+    "=c" (_ecx), \
+    "=d" (_edx) \
+   :"a" (_op) \
    :"cc" \
   )
-
 #else
-
-#define cpuid(op,eax,ebx,ecx,edx) \
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
   __asm__ __volatile__( \
-   "pushl %%ebx   \n\t" \
-   "cpuid         \n\t" \
-   "movl %%ebx,%1 \n\t" \
-   "popl %%ebx" \
-   :"=a" (eax), \
-    "=r" (ebx), \
-    "=c" (ecx), \
-    "=d" (edx) \
-   :"a" (op) \
+   "pushl %%ebx\n\t" \
+   "cpuid\n\t" \
+   "movl  %%ebx,%1\n\t" \
+   "popl  %%ebx\n\t" \
+   :"=a" (_eax), \
+    "=r" (_ebx), \
+    "=c" (_ecx), \
+    "=d" (_edx) \
+   :"a" (_op) \
    :"cc" \
   )
   __asm__ __volatile__(
-   "pushfl              \n\t"
-   "pushfl              \n\t"
-   "popl %0             \n\t"
-   "movl %0,%1          \n\t"
-   "xorl $0x200000,%0   \n\t"
-   "pushl %0            \n\t"
-   "popfl               \n\t"
-   "pushfl              \n\t"
-   "popl %0             \n\t"
-   "popfl"
+   "pushfl\n\t"
+   "pushfl\n\t"
+   "popl          %0\n\t"
+   "movl          %0,%1\n\t"
+   "xorl   $0x200000,%0\n\t"
+   "pushl         %0\n\t"
+   "popfl\n\t"
+   "pushfl\n\t"
+   "popl          %0\n\t"
+   "popfl\n\t"
    :"=r" (eax),
     "=r" (ebx)
    :
@@ -82,7 +79,7 @@
     if(edx&0x04000000)flags|=OC_CPU_X86_SSE2;
   }
   else if(ebx==0x68747541&&edx==0x69746e65&&ecx==0x444d4163){
-    /*AMD.*/
+    /*AMD:*/
     cpuid(0x80000000,eax,ebx,ecx,edx);
     if(eax<0x80000001)goto inteltest;
     cpuid(0x80000001,eax,ebx,ecx,edx);

Added: experimental/derf/theora-exp/lib/x86/mmxidct.c
===================================================================
--- experimental/derf/theora-exp/lib/x86/mmxidct.c	2005-07-23 23:59:34 UTC (rev 9608)
+++ experimental/derf/theora-exp/lib/x86/mmxidct.c	2005-07-24 02:33:55 UTC (rev 9609)
@@ -0,0 +1,469 @@
+#include <ogg/ogg.h>
+#include "../dct.h"
+#include "../idct.h"
+
+
+
+/*These are offsets into the table of constants below.*/
+/*4 masks, in order: low word to high.*/
+#define OC_MASK_OFFSET    (0)
+/*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
+#define OC_COSINE_OFFSET (32)
+/*A row of 8's.*/
+#define OC_EIGHT_OFFSET  (88)
+
+
+
+/*A table of constants used by the MMX routines.*/
+ogg_uint16_t __attribute__((aligned(8),used)) OC_IDCT_CONSTS[(4+7+1)*4]={
+  65535,    0,    0,    0,
+      0,65535,    0,    0,
+      0,    0,65535,    0,
+      0,    0,    0,65535,
+  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
+  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
+  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
+  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
+  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
+  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
+  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
+  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
+  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
+  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
+  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
+  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
+  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
+  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
+      8,    8,    8,    8
+};
+
+/*Converts the expression in the argument to a string.*/
+#define OC_M2STR(_s) #_s
+
+/*38 cycles*/
+#define OC_IDCT_BEGIN \
+ "  #OC_IDCT_BEGIN\n\t" \
+ "  movq   "OC_I(3)",     %mm2\n\t" \
+ "  movq   "OC_C(3)",     %mm6\n\t" \
+ "  movq        %mm2,     %mm4\n\t" \
+ "  movq   "OC_J(5)",     %mm7\n\t" \
+ "  pmulhw      %mm6,     %mm4\n\t" \
+ "  movq   "OC_C(5)",     %mm1\n\t" \
+ "  pmulhw      %mm7,     %mm6\n\t" \
+ "  movq        %mm1,     %mm5\n\t" \
+ "  pmulhw      %mm2,     %mm1\n\t" \
+ "  movq   "OC_I(1)",     %mm3\n\t" \
+ "  pmulhw      %mm7,     %mm5\n\t" \
+ "  movq   "OC_C(1)",     %mm0\n\t" \
+ "  paddw       %mm2,     %mm4\n\t" \
+ "  paddw       %mm7,     %mm6\n\t" \
+ "  paddw       %mm1,     %mm2\n\t" \
+ "  movq   "OC_J(7)",     %mm1\n\t" \
+ "  paddw       %mm5,     %mm7\n\t" \
+ "  movq        %mm0,     %mm5\n\t" \
+ "  pmulhw      %mm3,     %mm0\n\t" \
+ "  paddsw      %mm7,     %mm4\n\t" \
+ "  pmulhw      %mm1,     %mm5\n\t" \
+ "  movq   "OC_C(7)",     %mm7\n\t" \
+ "  psubsw      %mm2,     %mm6\n\t" \
+ "  paddw       %mm3,     %mm0\n\t" \
+ "  pmulhw      %mm7,     %mm3\n\t" \
+ "  movq   "OC_I(2)",     %mm2\n\t" \
+ "  pmulhw      %mm1,     %mm7\n\t" \
+ "  paddw       %mm1,     %mm5\n\t" \
+ "  movq        %mm2,     %mm1\n\t" \
+ "  pmulhw "OC_C(2)",     %mm2\n\t" \
+ "  psubsw      %mm5,     %mm3\n\t" \
+ "  movq   "OC_J(6)",     %mm5\n\t" \
+ "  paddsw      %mm7,     %mm0\n\t" \
+ "  movq        %mm5,     %mm7\n\t" \
+ "  psubsw      %mm4,     %mm0\n\t" \
+ "  pmulhw "OC_C(2)",     %mm5\n\t" \
+ "  paddw       %mm1,     %mm2\n\t" \
+ "  pmulhw "OC_C(6)",     %mm1\n\t" \
+ "  paddsw      %mm4,     %mm4\n\t" \
+ "  paddsw      %mm0,     %mm4\n\t" \
+ "  psubsw      %mm6,     %mm3\n\t" \
+ "  paddw       %mm7,     %mm5\n\t" \
+ "  paddsw      %mm6,     %mm6\n\t" \
+ "  pmulhw "OC_C(6)",     %mm7\n\t" \
+ "  paddsw      %mm3,     %mm6\n\t" \
+ "  movq        %mm4,"OC_I(1)"\n\t" \
+ "  psubsw      %mm5,     %mm1\n\t" \
+ "  movq   "OC_C(4)",     %mm4\n\t" \
+ "  movq        %mm3,     %mm5\n\t" \
+ "  pmulhw      %mm4,     %mm3\n\t" \
+ "  paddsw      %mm2,     %mm7\n\t" \
+ "  movq        %mm6,"OC_I(2)"\n\t" \
+ "  movq        %mm0,     %mm2\n\t" \
+ "  movq   "OC_I(0)",     %mm6\n\t" \
+ "  pmulhw      %mm4,     %mm0\n\t" \
+ "  paddw       %mm3,     %mm5\n\t" \
+ "  movq   "OC_J(4)",     %mm3\n\t" \
+ "  psubsw      %mm1,     %mm5\n\t" \
+ "  paddw       %mm0,     %mm2\n\t" \
+ "  psubsw      %mm3,     %mm6\n\t" \
+ "  movq        %mm6,     %mm0\n\t" \
+ "  pmulhw      %mm4,     %mm6\n\t" \
+ "  paddsw      %mm3,     %mm3\n\t" \
+ "  paddsw      %mm1,     %mm1\n\t" \
+ "  paddsw      %mm0,     %mm3\n\t" \
+ "  paddsw      %mm5,     %mm1\n\t" \
+ "  pmulhw      %mm3,     %mm4\n\t" \
+ "  paddsw      %mm0,     %mm6\n\t" \
+ "  psubsw      %mm2,     %mm6\n\t" \
+ "  paddsw      %mm2,     %mm2\n\t" \
+ "  movq   "OC_I(1)",     %mm0\n\t" \
+ "  paddsw      %mm6,     %mm2\n\t" \
+ "  paddw       %mm3,     %mm4\n\t" \
+ "  psubsw      %mm1,     %mm2\n\t" \
+ "#end OC_IDCT_BEGIN\n\t"
+
+/*38+8=46 cycles.*/
+#define OC_ROW_IDCT __asm__ __volatile__( \
+ "  #OC_ROW_IDCT\n" \
+ OC_IDCT_BEGIN \
+ "  movq   "OC_I(2)",     %mm3\n\t"  /* r3 = D. */ \
+ "  psubsw      %mm7,     %mm4\n\t"  /* r4 = E. = E - G */ \
+ "  paddsw      %mm1,     %mm1\n\t"  /* r1 = H. + H. */ \
+ "  paddsw      %mm7,     %mm7\n\t"  /* r7 = G + G */ \
+ "  paddsw      %mm2,     %mm1\n\t"  /* r1 = R1 = A.. + H. */ \
+ "  paddsw      %mm4,     %mm7\n\t"  /* r7 = G. = E + G */ \
+ "  psubsw      %mm3,     %mm4\n\t"  /* r4 = R4 = E. - D. */ \
+ "  paddsw      %mm3,     %mm3\n\t" \
+ "  psubsw      %mm5,     %mm6\n\t"  /* r6 = R6 = F. - B.. */ \
+ "  paddsw      %mm5,     %mm5\n\t" \
+ "  paddsw      %mm4,     %mm3\n\t"  /* r3 = R3 = E. + D. */ \
+ "  paddsw      %mm6,     %mm5\n\t"  /* r5 = R5 = F. + B.. */ \
+ "  psubsw      %mm0,     %mm7\n\t"  /* r7 = R7 = G. - C. */ \
+ "  paddsw      %mm0,     %mm0\n\t" \
+ "  movq        %mm1,"OC_I(1)"\n\t"  /* save R1 */ \
+ "  paddsw      %mm7,     %mm0\n\t" /* r0 = R0 = G. + C. */ \
+ "#end OC_ROW_IDCT\n\t" \
+)
+
+/* The following macro does two 4x4 transposes in place.
+   At entry, we assume:
+     r0 = a3 a2 a1 a0
+   I(1) = b3 b2 b1 b0
+     r2 = c3 c2 c1 c0
+     r3 = d3 d2 d1 d0
+
+     r4 = e3 e2 e1 e0
+     r5 = f3 f2 f1 f0
+     r6 = g3 g2 g1 g0
+     r7 = h3 h2 h1 h0
+
+   At exit, we have:
+   I(0) = d0 c0 b0 a0
+   I(1) = d1 c1 b1 a1
+   I(2) = d2 c2 b2 a2
+   I(3) = d3 c3 b3 a3
+
+   J(4) = h0 g0 f0 e0
+   J(5) = h1 g1 f1 e1
+   J(6) = h2 g2 f2 e2
+   J(7) = h3 g3 f3 e3
+
+   I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
+   J(4) J(5) J(6) J(7) is the transpose of r4   r5 r6 r7.
+
+   Since r1 is free at entry, we calculate the Js first.*/
+/*19 cycles.*/
+#define OC_TRANSPOSE __asm__ __volatile__( \
+ "  #OC_TRANSPOSE\n\t" \
+ "  movq           %mm4,     %mm1\n\t" \
+ "  punpcklwd      %mm5,     %mm4\n\t" \
+ "  movq           %mm0,"OC_I(0)"\n\t" \
+ "  punpckhwd      %mm5,     %mm1\n\t" \
+ "  movq           %mm6,     %mm0\n\t" \
+ "  punpcklwd      %mm7,     %mm6\n\t" \
+ "  movq           %mm4,     %mm5\n\t" \
+ "  punpckldq      %mm6,     %mm4\n\t" \
+ "  punpckhdq      %mm6,     %mm5\n\t" \
+ "  movq           %mm1,     %mm6\n\t" \
+ "  movq           %mm4,"OC_J(4)"\n\t" \
+ "  punpckhwd      %mm7,     %mm0\n\t" \
+ "  movq           %mm5,"OC_J(5)"\n\t" \
+ "  punpckhdq      %mm0,     %mm6\n\t" \
+ "  movq      "OC_I(0)",     %mm4\n\t" \
+ "  punpckldq      %mm0,     %mm1\n\t" \
+ "  movq      "OC_I(1)",     %mm5\n\t" \
+ "  movq           %mm4,     %mm0\n\t" \
+ "  movq           %mm6,"OC_J(7)"\n\t" \
+ "  punpcklwd      %mm5,     %mm0\n\t" \
+ "  movq           %mm1,"OC_J(6)"\n\t" \
+ "  punpckhwd      %mm5,     %mm4\n\t" \
+ "  movq           %mm2,     %mm5\n\t" \
+ "  punpcklwd      %mm3,     %mm2\n\t" \
+ "  movq           %mm0,     %mm1\n\t" \
+ "  punpckldq      %mm2,     %mm0\n\t" \
+ "  punpckhdq      %mm2,     %mm1\n\t" \
+ "  movq           %mm4,     %mm2\n\t" \
+ "  movq           %mm0,"OC_I(0)"\n\t" \
+ "  punpckhwd      %mm3,     %mm5\n\t" \
+ "  movq           %mm1,"OC_I(1)"\n\t" \
+ "  punpckhdq      %mm5,     %mm4\n\t" \
+ "  punpckldq      %mm5,     %mm2\n\t" \
+ "  movq           %mm4,"OC_I(3)"\n\t" \
+ "  movq           %mm2,"OC_I(2)"\n\t" \
+ "#end OC_TRANSPOSE\n\t" \
+)
+
+/*38+19=57 cycles.*/
+#define OC_COLUMN_IDCT __asm__ __volatile__( \
+ "  #OC_COLUMN_IDCT\n" \
+ OC_IDCT_BEGIN \
+ "  paddsw    "OC_8",     %mm2\n\t" \
+ "  paddsw      %mm1,     %mm1\n\t"  /* r1 = H. + H. */ \
+ "  paddsw      %mm2,     %mm1\n\t"  /* r1 = R1 = A.. + H. */ \
+ "  psraw         $4,     %mm2\n\t"  /* r2 = NR2 */ \
+ "  psubsw      %mm7,     %mm4\n\t"  /* r4 = E. = E - G */ \
+ "  psraw         $4,     %mm1\n\t"  /* r1 = NR1 */ \
+ "  movq   "OC_I(2)",     %mm3\n\t"  /* r3 = D. */ \
+ "  paddsw      %mm7,     %mm7\n\t"  /* r7 = G + G */ \
+ "  movq        %mm2,"OC_I(2)"\n\t"  /* store NR2 at I2 */ \
+ "  paddsw      %mm4,     %mm7\n\t"  /* r7 = G. = E + G */ \
+ "  movq        %mm1,"OC_I(1)"\n\t"  /* store NR1 at I1 */ \
+ "  psubsw      %mm3,     %mm4\n\t"  /* r4 = R4 = E. - D. */ \
+ "  paddsw    "OC_8",     %mm4\n\t" \
+ "  paddsw      %mm3,     %mm3\n\t"  /* r3 = D. + D. */ \
+ "  paddsw      %mm4,     %mm3\n\t"  /* r3 = R3 = E. + D. */ \
+ "  psraw         $4,     %mm4\n\t"  /* r4 = NR4 */ \
+ "  psubsw      %mm5,     %mm6\n\t"  /* r6 = R6 = F. - B.. */ \
+ "  psraw         $4,     %mm3\n\t"  /* r3 = NR3 */ \
+ "  paddsw    "OC_8",     %mm6\n\t" \
+ "  paddsw      %mm5,     %mm5\n\t"  /* r5 = B.. + B.. */ \
+ "  paddsw      %mm6,     %mm5\n\t"  /* r5 = R5 = F. + B.. */ \
+ "  psraw         $4,     %mm6\n\t"  /* r6 = NR6 */ \
+ "  movq        %mm4,"OC_J(4)"\n\t"  /* store NR4 at J4 */ \
+ "  psraw         $4,     %mm5\n\t"  /* r5 = NR5 */ \
+ "  movq        %mm3,"OC_I(3)"\n\t"  /* store NR3 at I3 */ \
+ "  psubsw      %mm0,     %mm7\n\t"  /* r7 = R7 = G. - C. */ \
+ "  paddsw    "OC_8",     %mm7\n\t" \
+ "  paddsw      %mm0,     %mm0\n\t"  /* r0 = C. + C. */ \
+ "  paddsw      %mm7,     %mm0\n\t"  /* r0 = R0 = G. + C. */ \
+ "  psraw         $4,     %mm7\n\t"  /* r7 = NR7 */ \
+ "  movq        %mm6,"OC_J(6)"\n\t"  /* store NR6 at J6 */ \
+ "  psraw         $4,     %mm0\n\t"  /* r0 = NR0 */ \
+ "  movq        %mm5,"OC_J(5)"\n\t"  /* store NR5 at J5 */ \
+ "  movq        %mm7,"OC_J(7)"\n\t"  /* store NR7 at J7 */ \
+ "  movq        %mm0,"OC_I(0)"\n\t"  /* store NR0 at I0 */ \
+ "  #end OC_COLUMN_IDCT\n\t" \
+)
+
+#if (defined(__amd64__) || defined(__x86_64__))
+# define OC_MID_REG "%rcx"
+# define OC_Y_REG   "%rdx"
+#else
+# define OC_MID_REG "%ecx"
+# define OC_Y_REG   "%edx"
+#endif
+#define OC_MID(_m,_i) OC_M2STR(_m+(_i)*8)"("OC_MID_REG")"
+#define OC_M(_i)      OC_MID(OC_MASK_OFFSET,_i)
+#define OC_C(_i)      OC_MID(OC_COSINE_OFFSET,_i-1)
+#define OC_8          OC_MID(OC_EIGHT_OFFSET,0)
+
+void oc_idct8x8_mmx(ogg_int16_t _y[64]){
+/*In-place 8x8 iDCT of _y, which must hold the coefficients in transposed
+   form: every 4x4 submatrix is transposed.*/
+#if (defined(__amd64__) || defined(__x86_64__))
+  __asm__ __volatile__( /*Empty template: just requests _y in rdx and*/
+   ""                   /* the constant table in rcx for the asm below.*/
+   :
+   :"d" (_y),
+    "c" (OC_IDCT_CONSTS)
+  );
+#else
+  __asm__ __volatile__( /*_y is requested in edx; the table address is*/
+   "  movl $OC_IDCT_CONSTS,%%ecx\n\t" /* loaded into ecx by hand.*/
+   :
+   :"d" (_y)
+   :"ecx"
+  );
+#endif
+#define OC_I(_k)      OC_M2STR((_k*16))"("OC_Y_REG")"
+#define OC_J(_k)      OC_M2STR(((_k-4)*16)+8)"("OC_Y_REG")"
+  OC_ROW_IDCT;      /*Row pass 1: bytes 0...63 of _y.*/
+  OC_TRANSPOSE;
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k)      OC_M2STR((_k*16)+64)"("OC_Y_REG")"
+#define OC_J(_k)      OC_M2STR(((_k-4)*16)+72)"("OC_Y_REG")"
+  OC_ROW_IDCT;      /*Row pass 2: bytes 64...127 of _y.*/
+  OC_TRANSPOSE;
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k)      OC_M2STR((_k*16))"("OC_Y_REG")"
+#define OC_J(_k)      OC_I(_k)
+  OC_COLUMN_IDCT;   /*Column pass on the first 8 bytes of each 16-byte row.*/
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k)      OC_M2STR((_k*16)+8)"("OC_Y_REG")"
+#define OC_J(_k)      OC_I(_k)
+  OC_COLUMN_IDCT;   /*Column pass on the last 8 bytes of each 16-byte row.*/
+#undef  OC_I
+#undef  OC_J
+  __asm__ __volatile__(
+   " emms\n\t"      /*Empty the MMX state before returning to C code.*/
+  );
+}
+
+/*25 cycles.*/
+#define OC_IDCT_BEGIN_10 \
+ "  #OC_IDCT_BEGIN_10\n\t" \
+ "  movq   "OC_I(3)",     %mm2\n\t" \
+ "  nop\n\t" \
+ "  movq   "OC_C(3)",     %mm6\n\t" \
+ "  movq        %mm2,     %mm4\n\t" \
+ "  movq   "OC_C(5)",     %mm1\n\t" \
+ "  pmulhw      %mm6,     %mm4\n\t" \
+ "  movq   "OC_I(1)",     %mm3\n\t" \
+ "  pmulhw      %mm2,     %mm1\n\t" \
+ "  movq   "OC_C(1)",     %mm0\n\t" \
+ "  paddw       %mm2,     %mm4\n\t" \
+ "  pxor        %mm6,     %mm6\n\t" \
+ "  paddw       %mm1,     %mm2\n\t" \
+ "  movq   "OC_I(2)",     %mm5\n\t" \
+ "  pmulhw      %mm3,     %mm0\n\t" \
+ "  movq        %mm5,     %mm1\n\t" \
+ "  paddw       %mm3,     %mm0\n\t" \
+ "  pmulhw "OC_C(7)",     %mm3\n\t" \
+ "  psubsw      %mm2,     %mm6\n\t" \
+ "  pmulhw "OC_C(2)",     %mm5\n\t" \
+ "  psubsw      %mm4,     %mm0\n\t" \
+ "  movq   "OC_I(2)",     %mm7\n\t" \
+ "  paddsw      %mm4,     %mm4\n\t" \
+ "  paddw       %mm5,     %mm7\n\t" \
+ "  paddsw      %mm0,     %mm4\n\t" \
+ "  pmulhw "OC_C(6)",     %mm1\n\t" \
+ "  psubsw      %mm6,     %mm3\n\t" \
+ "  movq        %mm4,"OC_I(1)"\n\t" \
+ "  paddsw      %mm6,     %mm6\n\t" \
+ "  movq   "OC_C(4)",     %mm4\n\t" \
+ "  paddsw      %mm3,     %mm6\n\t" \
+ "  movq        %mm3,     %mm5\n\t" \
+ "  pmulhw      %mm4,     %mm3\n\t" \
+ "  movq        %mm6,"OC_I(2)"\n\t" \
+ "  movq        %mm0,     %mm2\n\t" \
+ "  movq   "OC_I(0)",     %mm6\n\t" \
+ "  pmulhw      %mm4,     %mm0\n\t" \
+ "  paddw       %mm3,     %mm5\n\t" \
+ "  paddw       %mm0,     %mm2\n\t" \
+ "  psubsw      %mm1,     %mm5\n\t" \
+ "  pmulhw      %mm4,     %mm6\n\t" \
+ "  paddw  "OC_I(0)",     %mm6\n\t" \
+ "  paddsw      %mm1,     %mm1\n\t" \
+ "  movq        %mm6,     %mm4\n\t" \
+ "  paddsw      %mm5,     %mm1\n\t" \
+ "  psubsw      %mm2,     %mm6\n\t" \
+ "  paddsw      %mm2,     %mm2\n\t" \
+ "  movq   "OC_I(1)",     %mm0\n\t" \
+ "  paddsw      %mm6,     %mm2\n\t" \
+ "  psubsw      %mm1,     %mm2\n\t" \
+ "  nop\n\t" \
+ "  #end OC_IDCT_BEGIN_10\n\t"
+
+/*25+8=33 cycles.*/
+#define OC_ROW_IDCT_10 __asm__ __volatile__( \
+ "  #OC_ROW_IDCT_10\n\t" \
+ OC_IDCT_BEGIN_10 \
+ "  movq    "OC_I(2)",     %mm3\n\t" /* r3 = D. */ \
+ "  psubsw       %mm7,     %mm4\n\t" /* r4 = E. = E - G */ \
+ "  paddsw       %mm1,     %mm1\n\t" /* r1 = H. + H. */ \
+ "  paddsw       %mm7,     %mm7\n\t" /* r7 = G + G */ \
+ "  paddsw       %mm2,     %mm1\n\t" /* r1 = R1 = A.. + H. */ \
+ "  paddsw       %mm4,     %mm7\n\t" /* r7 = G. = E + G */ \
+ "  psubsw       %mm3,     %mm4\n\t" /* r4 = R4 = E. - D. */ \
+ "  paddsw       %mm3,     %mm3\n\t" \
+ "  psubsw       %mm5,     %mm6\n\t" /* r6 = R6 = F. - B.. */ \
+ "  paddsw       %mm5,     %mm5\n\t" \
+ "  paddsw       %mm4,     %mm3\n\t" /* r3 = R3 = E. + D. */ \
+ "  paddsw       %mm6,     %mm5\n\t" /* r5 = R5 = F. + B.. */ \
+ "  psubsw       %mm0,     %mm7\n\t" /* r7 = R7 = G. - C. */ \
+ "  paddsw       %mm0,     %mm0\n\t" \
+ "  movq         %mm1,"OC_I(1)"\n\t" /* save R1 */ \
+ "  paddsw       %mm7,     %mm0\n\t" /* r0 = R0 = G. + C. */ \
+ "#end OC_ROW_IDCT_10\n\t" \
+)
+
+/*25+19=44 cycles.*/
+#define OC_COLUMN_IDCT_10 __asm__ __volatile__( \
+ "  #OC_COLUMN_IDCT_10\n\t" \
+ OC_IDCT_BEGIN_10 \
+ "  paddsw    "OC_8",     %mm2\n\t" \
+ "  paddsw      %mm1,     %mm1\n\t" /* r1 = H. + H. */ \
+ "  paddsw      %mm2,     %mm1\n\t" /* r1 = R1 = A.. + H. */ \
+ "  psraw         $4,     %mm2\n\t" /* r2 = NR2 */ \
+ "  psubsw      %mm7,     %mm4\n\t" /* r4 = E. = E - G */ \
+ "  psraw         $4,     %mm1\n\t" /* r1 = NR1 */ \
+ "  movq   "OC_I(2)",     %mm3\n\t" /* r3 = D. */ \
+ "  paddsw      %mm7,     %mm7\n\t" /* r7 = G + G */ \
+ "  movq        %mm2,"OC_I(2)"\n\t" /* store NR2 at I2 */ \
+ "  paddsw      %mm4,     %mm7\n\t" /* r7 = G. = E + G */ \
+ "  movq        %mm1,"OC_I(1)"\n\t" /* store NR1 at I1 */ \
+ "  psubsw      %mm3,     %mm4\n\t" /* r4 = R4 = E. - D. */ \
+ "  paddsw    "OC_8",     %mm4\n\t" \
+ "  paddsw      %mm3,     %mm3\n\t" /* r3 = D. + D. */ \
+ "  paddsw      %mm4,     %mm3\n\t" /* r3 = R3 = E. + D. */ \
+ "  psraw         $4,     %mm4\n\t" /* r4 = NR4 */ \
+ "  psubsw      %mm5,     %mm6\n\t" /* r6 = R6 = F. - B.. */ \
+ "  psraw         $4,     %mm3\n\t" /* r3 = NR3 */ \
+ "  paddsw    "OC_8",     %mm6\n\t" \
+ "  paddsw      %mm5,     %mm5\n\t" /* r5 = B.. + B.. */ \
+ "  paddsw      %mm6,     %mm5\n\t" /* r5 = R5 = F. + B.. */ \
+ "  psraw         $4,     %mm6\n\t" /* r6 = NR6 */ \
+ "  movq        %mm4,"OC_J(4)"\n\t" /* store NR4 at J4 */ \
+ "  psraw         $4,     %mm5\n\t" /* r5 = NR5 */ \
+ "  movq        %mm3,"OC_I(3)"\n\t" /* store NR3 at I3 */ \
+ "  psubsw      %mm0,     %mm7\n\t" /* r7 = R7 = G. - C. */ \
+ "  paddsw    "OC_8",     %mm7\n\t" \
+ "  paddsw      %mm0,     %mm0\n\t" /* r0 = C. + C. */ \
+ "  paddsw      %mm7,     %mm0\n\t" /* r0 = R0 = G. + C. */ \
+ "  psraw         $4,     %mm7\n\t" /* r7 = NR7 */ \
+ "  movq        %mm6,"OC_J(6)"\n\t" /* store NR6 at J6 */ \
+ "  psraw         $4,     %mm0\n\t" /* r0 = NR0 */ \
+ "  movq        %mm5,"OC_J(5)"\n\t" /* store NR5 at J5 */ \
+ "  movq        %mm7,"OC_J(7)"\n\t" /* store NR7 at J7 */ \
+ "  movq        %mm0,"OC_I(0)"\n\t" /* store NR0 at I0 */ \
+ "  #end OC_COLUMN_IDCT_10\n\t" \
+)
+
+void oc_idct8x8_10_mmx(ogg_int16_t _y[64]){
+/*In-place 8x8 iDCT for blocks whose non-zero coefficients all lie within the
+   first 10 positions in zig-zag order.
+  _y uses the same layout as oc_idct8x8_mmx(): every 4x4 submatrix is
+   transposed.*/
+#if (defined(__amd64__) || defined(__x86_64__))
+  __asm__ __volatile__(
+   ""
+   :
+   :"d" (_y),"c" (OC_IDCT_CONSTS)
+  );
+#else
+  __asm__ __volatile__(
+   "  movl $OC_IDCT_CONSTS,%%ecx\n\t"
+   :
+   :"d" (_y)
+   :"ecx"
+  );
+#endif
+#define OC_I(_k) OC_M2STR((_k*16))"("OC_Y_REG")"
+#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"("OC_Y_REG")"
+  /*Dequantization is done by the caller; one row pass covers bytes 0...63.*/
+  OC_ROW_IDCT_10;
+  OC_TRANSPOSE;
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k) OC_M2STR((_k*16))"("OC_Y_REG")"
+#define OC_J(_k) OC_I(_k)
+  OC_COLUMN_IDCT_10;
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k) OC_M2STR((_k*16)+8)"("OC_Y_REG")"
+#define OC_J(_k) OC_I(_k)
+  OC_COLUMN_IDCT_10;
+#undef  OC_I
+#undef  OC_J
+  __asm__ __volatile__(" emms\n\t");
+}

Modified: experimental/derf/theora-exp/lib/x86/mmxstate.c
===================================================================
--- experimental/derf/theora-exp/lib/x86/mmxstate.c	2005-07-23 23:59:34 UTC (rev 9608)
+++ experimental/derf/theora-exp/lib/x86/mmxstate.c	2005-07-24 02:33:55 UTC (rev 9609)
@@ -11,7 +11,164 @@
  ********************************************************************
 */
 #include "x86int.h"
+#include "../internal.h"
 
+
+
+static const __attribute__((aligned(8),used)) int OC_FZIG_ZAGMMX[64]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3,32,11,18,25, 4,12,
+   5,26,19,40,33,34,41,48,
+  27, 6,13,20,28,21,14, 7,
+  56,49,42,35,43,50,57,36,
+  15,22,29,30,23,44,37,58,
+  51,59,38,45,52,31,60,53,
+  46,39,47,54,61,62,55,63
+};
+
+
+
+/*MMX version of fragment reconstruction: dequantizes the coefficients,
+   performs the iDCT into a local residue buffer, and then hands that buffer
+   to the intra/inter reconstruction routines.
+  The DC-only case fills the residue with a constant and skips the iDCT.*/
+void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
+  ogg_int16_t  __attribute__((aligned(8))) res_buf[64];
+  int dst_framei;
+  int dst_ystride;
+  int zzi;
+  int ci;
+  /*_last_zzi is subtly different from an actual count of the number of
+     coefficients we decoded for this block.
+    It contains the value of zzi BEFORE the final token in the block was
+     decoded.
+    In most cases this is an EOB token (the continuation of an EOB run from a
+     previous block counts), and so this is the same as the coefficient count.
+    However, in the case that the last token was NOT an EOB token, but filled
+     the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+    Provided the last token was not a pure zero run, the minimum value it can
+     be is 46, and so that doesn't affect any of the cases in this routine.
+    However, if the last token WAS a pure zero run of length 63, then _last_zzi
+     will be 1 while the number of coefficients decoded is 64.
+    Thus, we will trigger the following special case, where the real
+     coefficient count would not.
+    Note also that a zero run of length 64 will give _last_zzi a value of 0,
+     but we still process the DC coefficient, which might have a non-zero value
+     due to DC prediction.
+    Although convoluted, this is arguably the correct behavior: it allows us to
+     dequantize fewer coefficients and use a smaller transform when the block
+     ends with a long zero run instead of a normal EOB token.
+    It could be smarter... multiple separate zero runs at the end of a block
+     will fool it, but an encoder that generates these really deserves what it
+     gets.
+    Needless to say we inherited this approach from VP3.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_int16_t p;
+    /*Why is the iquant product rounded in this case and no others?
+      Who knows.*/
+    p=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
+    /*for(ci=0;ci<64;ci++)res_buf[ci]=p;*/
+    /*This could also be done with MMX 2.*/
+    /*%eax is used as scratch, so it must be listed as clobbered below.*/
+    __asm__ __volatile__(
+     "  movzwl    %1,   %%eax\n\t"
+     "  movd   %%eax,   %%mm0\n\t" /* XXXX XXXX 0000 AAAA */
+     "  movq   %%mm0,   %%mm1\n\t" /* XXXX XXXX 0000 AAAA */
+     "  pslld    $16,   %%mm1\n\t" /* XXXX XXXX AAAA 0000 */
+     "  por    %%mm0,   %%mm1\n\t" /* XXXX XXXX AAAA AAAA */
+     "  movq   %%mm1,   %%mm0\n\t" /* XXXX XXXX AAAA AAAA */
+     "  psllq    $32,   %%mm1\n\t" /* AAAA AAAA 0000 0000 */
+     "  por    %%mm1,   %%mm0\n\t" /* AAAA AAAA AAAA AAAA */
+     "  movq   %%mm0,    (%0)\n\t"
+     "  movq   %%mm0,   8(%0)\n\t"
+     "  movq   %%mm0,  16(%0)\n\t"
+     "  movq   %%mm0,  24(%0)\n\t"
+     "  movq   %%mm0,  32(%0)\n\t"
+     "  movq   %%mm0,  40(%0)\n\t"
+     "  movq   %%mm0,  48(%0)\n\t"
+     "  movq   %%mm0,  56(%0)\n\t"
+     "  movq   %%mm0,  64(%0)\n\t"
+     "  movq   %%mm0,  72(%0)\n\t"
+     "  movq   %%mm0,  80(%0)\n\t"
+     "  movq   %%mm0,  88(%0)\n\t"
+     "  movq   %%mm0,  96(%0)\n\t"
+     "  movq   %%mm0, 104(%0)\n\t"
+     "  movq   %%mm0, 112(%0)\n\t"
+     "  movq   %%mm0, 120(%0)\n\t"
+     :
+     :"r" (res_buf),
+      "r" (p)
+     :"memory","eax"
+    );
+  }
+  else{
+    /*Then, fill in the remainder of the coefficients with 0's, and perform
+       the iDCT.*/
+    /*First zero the buffer.*/
+    /*On K7, etc., this could be replaced with movntq and sfence.*/
+    __asm__ __volatile__(
+     "  pxor %%mm0,   %%mm0\n\t"
+     "  movq %%mm0,    (%0)\n\t"
+     "  movq %%mm0,   8(%0)\n\t"
+     "  movq %%mm0,  16(%0)\n\t"
+     "  movq %%mm0,  24(%0)\n\t"
+     "  movq %%mm0,  32(%0)\n\t"
+     "  movq %%mm0,  40(%0)\n\t"
+     "  movq %%mm0,  48(%0)\n\t"
+     "  movq %%mm0,  56(%0)\n\t"
+     "  movq %%mm0,  64(%0)\n\t"
+     "  movq %%mm0,  72(%0)\n\t"
+     "  movq %%mm0,  80(%0)\n\t"
+     "  movq %%mm0,  88(%0)\n\t"
+     "  movq %%mm0,  96(%0)\n\t"
+     "  movq %%mm0, 104(%0)\n\t"
+     "  movq %%mm0, 112(%0)\n\t"
+     "  movq %%mm0, 120(%0)\n\t"
+     :
+     :"r" (res_buf)
+     :"memory"
+    );
+    res_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
+    /*This is planned to be rewritten in MMX.*/
+    for(zzi=1;zzi<_ncoefs;zzi++){
+      ci=OC_FZIG_ZAG[zzi];
+      res_buf[OC_FZIG_ZAGMMX[zzi]]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[ci]*
+       _ac_iquant[ci]);
+    }
+    if(_last_zzi<10)oc_idct8x8_10_mmx(res_buf);
+    else oc_idct8x8_mmx(res_buf);
+  }
+  /*Fill in the target buffer.*/
+  dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
+  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+  /*For now ystride values in all ref frames assumed to be equal.*/
+  if(_frag->mbmode==OC_MODE_INTRA){
+    oc_frag_recon_intra(_state,_frag->buffer[dst_framei],dst_ystride,res_buf);
+  }
+  else{
+    int ref_framei;
+    int ref_ystride;
+    int mvoffset0;
+    int mvoffset1;
+    ref_framei=_state->ref_frame_idx[OC_FRAME_FOR_MODE[_frag->mbmode]];
+    ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].ystride;
+    if(oc_state_get_mv_offsets(_state,&mvoffset0,&mvoffset1,_frag->mv[0],
+     _frag->mv[1],ref_ystride,_pli)>1){
+      oc_frag_recon_inter2(_state,_frag->buffer[dst_framei],dst_ystride,
+       _frag->buffer[ref_framei]+mvoffset0,ref_ystride,
+       _frag->buffer[ref_framei]+mvoffset1,ref_ystride,res_buf);
+    }
+    else{
+      oc_frag_recon_inter(_state,_frag->buffer[dst_framei],dst_ystride,
+       _frag->buffer[ref_framei]+mvoffset0,ref_ystride,res_buf);
+    }
+  }
+  oc_restore_fpu(_state);
+}
+
 /*Copies the fragments specified by the lists of fragment indices from one
    frame to another.
   _fragis:    A pointer to a list of fragment indices.
@@ -39,7 +196,6 @@
     frag=_state->frags+*fragi;
     dst=frag->buffer[dst_framei];
     src=frag->buffer[src_framei];
-
 #if (defined(__amd64__) || defined(__x86_64__))
     __asm__ __volatile__(
      "  lea         (%3, %3, 2), %%rsi   \n\t"  /* esi=src_stride*3 */

Modified: experimental/derf/theora-exp/lib/x86/x86int.h
===================================================================
--- experimental/derf/theora-exp/lib/x86/x86int.h	2005-07-23 23:59:34 UTC (rev 9608)
+++ experimental/derf/theora-exp/lib/x86/x86int.h	2005-07-24 02:33:55 UTC (rev 9609)
@@ -13,6 +13,12 @@
  int _src2_ystride,const ogg_int16_t *_residue);
 void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
  int _nfragis,int _dst_frame,int _src_frame,int _pli);
+void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,                                               
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,                                                             
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
 void oc_restore_fpu_mmx(void);
+void oc_idct8x8_mmx(ogg_int16_t _y[64]);
+void oc_idct8x8_10_mmx(ogg_int16_t _y[64]);
+void oc_fill_idct_constants_mmx(void);
 
 #endif

Modified: experimental/derf/theora-exp/lib/x86/x86state.c
===================================================================
--- experimental/derf/theora-exp/lib/x86/x86state.c	2005-07-23 23:59:34 UTC (rev 9608)
+++ experimental/derf/theora-exp/lib/x86/x86state.c	2005-07-24 02:33:55 UTC (rev 9609)
@@ -8,6 +8,7 @@
     _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
     _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
     _state->opt_vtable.state_frag_copy=oc_state_frag_copy_mmx;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
     _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
   }
   else oc_state_vtable_init_c(_state);

Modified: experimental/derf/theora-exp/unix/Makefile
===================================================================
--- experimental/derf/theora-exp/unix/Makefile	2005-07-23 23:59:34 UTC (rev 9608)
+++ experimental/derf/theora-exp/unix/Makefile	2005-07-24 02:33:55 UTC (rev 9609)
@@ -45,6 +45,7 @@
 $(if $(findstring -DOC_X86ASM,${CFLAGS}), \
 x86/mmxstate.c \
 x86/x86state.c \
+x86/mmxidct.c \
 x86/mmxfrag.c \
 x86/cpu.c \
 )



More information about the commits mailing list