[Theora-dev] [PATCH] promised MMX patches rc1

R.Marek at sh.cvut.cz R.Marek at sh.cvut.cz
Wed Mar 23 01:31:53 PST 2005


Hello,

Here is my first speedup patch. It gives roughly a 10-11% speedup. No IDCT yet.
Please feel free to comment on my code or, even better, think about
improvements. :) I believe my routines are not so bad; maybe
one day they will be even faster.

What needs to be optimized is the loop filter function. I have
no idea right now how to do it. It does not leave much room for parallel
processing: it copies memory from a lot of different locations and stores
it in a strided manner, ugh.

The patch should apply to derf's tree.

My name is not mentioned anywhere in the patch. I don't know what your CREDITS policy is,
so I'd better ask. :)

Regards 

Rudolf

diff -Naur theora-exp/lib/fragment.c theora-rel/lib/fragment.c
--- theora-exp/lib/fragment.c	2005-03-23 08:54:44.163819664 +0100
+++ theora-rel/lib/fragment.c	2005-03-23 09:42:29.000000000 +0100
@@ -1,6 +1,11 @@
 #include "internal.h"
 
-void oc_frag_recon_intra(unsigned char *_dst,int _dst_ystride,
+inline void oc_frag_recon_intra(const oc_theora_state *_state,unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t *_residue){ /*Dispatches through _state->opt_vtable; the other arguments are passed on unchanged.*/
+_state->opt_vtable.oc_frag_recon_intra(_dst,_dst_ystride,_residue);
+}
+
+void oc_frag_recon_intra__c(unsigned char *_dst,int _dst_ystride,
  const ogg_int16_t *_residue){
   int i;
   for(i=0;i<8;i++){
@@ -10,7 +15,12 @@
   }
 }
 
-void oc_frag_recon_inter(unsigned char *_dst,int _dst_ystride,
+inline void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue){ /*Dispatches through _state->opt_vtable; the other arguments are passed on unchanged.*/
+_state->opt_vtable.oc_frag_recon_inter(_dst,_dst_ystride,_src,_src_ystride,_residue);
+}
+
+void oc_frag_recon_inter__c(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue){
   int i;
   for(i=0;i<8;i++){
@@ -21,7 +31,14 @@
   }
 }
 
-void oc_frag_recon_inter2(unsigned char *_dst,int _dst_ystride,
+inline void oc_frag_recon_inter2(const oc_theora_state *_state,unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue) { /*Dispatches through _state->opt_vtable; the other arguments are passed on unchanged.*/
+_state->opt_vtable.oc_frag_recon_inter2(_dst,_dst_ystride,_src1,_src1_ystride,
+_src2,_src2_ystride,_residue);
+}
+
+void oc_frag_recon_inter2__c(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
  int _src2_ystride,const ogg_int16_t *_residue){
   int i;
diff -Naur theora-exp/lib/internal.h theora-rel/lib/internal.h
--- theora-exp/lib/internal.h	2005-03-23 08:54:44.185816320 +0100
+++ theora-rel/lib/internal.h	2005-03-23 09:06:23.000000000 +0100
@@ -219,6 +219,21 @@
   int nsbs;
 }oc_fragment_plane;
 
+struct oc_theora_state;
+
+typedef struct {
+/* These functions are shared by the decoder and the encoder and have accelerated variants. */
+void (* oc_frag_recon_intra) (unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t *_residue);
+void (* oc_frag_recon_inter) (unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue);
+void (* oc_frag_recon_inter2) (unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue);
+void (* oc_state_frag_copy) (const struct oc_theora_state *_state, const int *_fragis, 
+ int _nfragis,int _dst_frame,int _src_frame,int _pli); /* NOTE(review): uses the struct tag declared just above; confirm it names the same type as the oc_theora_state typedef */
+void (* restore_fpu)(void); /* the MMX variant in this patch executes emms; the C variant is empty */
+} oc_base_opt_vtable;
 
 
 /*Common state information between the encoder and decoder.*/
@@ -291,10 +306,12 @@
   oc_quant_tables       dequant_table_data[2][3];
   /*Loop filter strength parameters.*/
   unsigned char         loop_filter_limits[64];
+  /*Table for accelerated common functions*/
+  oc_base_opt_vtable    opt_vtable;
+  ogg_uint32_t 		cpu_flags;
 }oc_theora_state;
 
 
-
 /*The function type used to fill in the chroma plane motion vectors for a
    macro block when 4 different motion vectors are specified in the luma
    plane.
@@ -344,13 +361,20 @@
 
 int oc_dct_token_skip(int _token,int _extra_bits);
 
+/* 
+
+Function converted to function pointers
+
 void oc_frag_recon_intra(unsigned char *_dst,int _dst_ystride,
  const ogg_int16_t *_residue);
-void oc_frag_recon_inter(unsigned char *_dst,int _dst_ystride,
+void oc_frag_recon_inter (unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue);
-void oc_frag_recon_inter2(unsigned char *_dst,int _dst_ystride,
+void oc_frag_recon_inter2 (unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
  int _src2_ystride,const ogg_int16_t *_residue);
+ */
+ 
+ 
 int oc_frag_pred_dc(const oc_fragment *_frag,
  const oc_fragment_plane *_fplane,int _x,int _y,int _pred_last[3]);
 
@@ -368,8 +392,10 @@
 void oc_state_frag_recon(oc_theora_state *_state,const oc_fragment *_frag,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
-void oc_state_frag_copy(const oc_theora_state *_state,const int *_fragis,
+
+/*void oc_state_frag_copy(const oc_theora_state *_state,const int *_fragis,
  int _nfragis,int _dst_frame,int _src_frame,int _pli);
+*/
 int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv);
 void oc_state_loop_filter(oc_theora_state *_state,int _frame);
 void oc_state_loop_filter_frag_rows(oc_theora_state *_state,int *_bv,
@@ -379,4 +405,45 @@
  const char *_suf);
 #endif
 
+void oc_frag_recon_intra__c(unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter__c(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2__c(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue);
+void oc_state_frag_copy__c(const oc_theora_state *_state,const int *_fragis,
+ int _nfragis,int _dst_frame,int _src_frame,int _pli);
+
+void oc_frag_recon_intra(const oc_theora_state *_state,unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2(const oc_theora_state *_state,unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue);
+void oc_state_frag_copy(const oc_theora_state *_state,const int *_fragis,
+ int _nfragis,int _dst_frame,int _src_frame,int _pli);
+void restore_fpu(const oc_theora_state *_state);
+
+#define X86CODE 1
+
+#ifdef X86CODE
+
+void oc_frag_recon_intra__mmx(unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter__mmx(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2__mmx(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue);
+void oc_state_frag_copy__mmx(const oc_theora_state *_state,const int *_fragis,
+ int _nfragis,int _dst_frame,int _src_frame,int _pli);
+void restore_fpu__mmx(void);
+void restore_fpu__c(void);
+
+#endif
+
+
 #endif
+
diff -Naur theora-exp/lib/i386/cpu.c theora-rel/lib/i386/cpu.c
--- theora-exp/lib/i386/cpu.c	1970-01-01 01:00:00.000000000 +0100
+++ theora-rel/lib/i386/cpu.c	2005-03-23 08:55:49.000000000 +0100
@@ -0,0 +1,98 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+ ********************************************************************/
+
+#include "cpu.h"
+
+#if 1
+/*Returns an OR of the CPU_X86_* flags supported by the running CPU.
+  Returns 0 when cpuid or MMX is unavailable, or the vendor is not recognized.*/
+ogg_uint32_t cpu_get_flags (void){
+  ogg_uint32_t eax, ebx, ecx, edx, flags;
+  /*cpuid overwrites %ebx, so it is saved and restored; its value goes to %esi ("=S") so the output can never be %ebx itself.*/
+#define cpuid(op,eax,ebx,ecx,edx)      \
+  asm volatile ("pushl %%ebx   \n\t"   \
+                "cpuid         \n\t"   \
+                "movl %%ebx,%1 \n\t"   \
+                "popl %%ebx"           \
+              : "=a" (eax),            \
+                "=S" (ebx),            \
+                "=c" (ecx),            \
+                "=d" (edx)             \
+              : "a" (op)               \
+              : "cc")
+  /*Try to toggle bit 21 (0x200000) of EFLAGS; if it reads back unchanged, cpuid is not available.*/
+  asm volatile ("pushfl              \n\t"
+                "pushfl              \n\t"
+                "popl %0             \n\t"
+                "movl %0,%1          \n\t"
+                "xorl $0x200000,%0   \n\t"
+                "pushl %0            \n\t"
+                "popfl               \n\t"
+                "pushfl              \n\t"
+                "popl %0             \n\t"
+                "popfl"
+              : "=r" (eax),
+                "=r" (ebx)
+              :
+              : "cc");
+
+  if (eax == ebx)             /* no cpuid */
+    return 0;
+
+  cpuid(0, eax, ebx, ecx, edx);
+
+  if (ebx == 0x756e6547 &&    /* "GenuineIntel" */
+      edx == 0x49656e69 &&
+      ecx == 0x6c65746e) {
+    /* intel */
+
+  inteltest:
+    cpuid(1, eax, ebx, ecx, edx);
+    if ((edx & 0x00800000) == 0)   /* bit 23: MMX */
+      return 0;
+    flags = CPU_X86_MMX;
+    if (edx & 0x02000000)          /* bit 25: SSE */
+      flags |= CPU_X86_MMXEXT | CPU_X86_SSE;
+    if (edx & 0x04000000)          /* bit 26: SSE2 */
+      flags |= CPU_X86_SSE2;
+    return flags;
+  } else if (ebx == 0x68747541 &&  /* "AuthenticAMD" */
+             edx == 0x69746e65 &&
+             ecx == 0x444d4163) {
+    /* AMD */
+    cpuid(0x80000000, eax, ebx, ecx, edx);
+    if ((unsigned)eax < 0x80000001)   /* no extended leaf: use the standard feature bits */
+      goto inteltest;
+    cpuid(0x80000001, eax, ebx, ecx, edx);
+    if ((edx & 0x00800000) == 0)      /* bit 23: MMX */
+      return 0;
+    flags = CPU_X86_MMX;
+    if (edx & 0x80000000)             /* bit 31: 3DNow! */
+      flags |= CPU_X86_3DNOW;
+    if (edx & 0x00400000)             /* bit 22: AMD MMX extensions */
+      flags |= CPU_X86_MMXEXT;
+    return flags;
+  }
+  else {
+    /* implement me: other vendors are not probed yet */
+  }
+
+  return 0; /* unknown vendor: report no extensions (flags is never assigned on this path) */
+}
+#else
+ogg_uint32_t cpu_get_flags (void) { /* Stub used when the "#if 1" above is switched off: reports no CPU extensions. */
+  return 0;
+}
+#endif
+
diff -Naur theora-exp/lib/i386/cpu.h theora-rel/lib/i386/cpu.h
--- theora-exp/lib/i386/cpu.h	1970-01-01 01:00:00.000000000 +0100
+++ theora-rel/lib/i386/cpu.h	2005-03-23 08:55:49.000000000 +0100
@@ -0,0 +1,21 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+*/
+#include "../internal.h"
+
+#define CPU_X86_MMX	(1<<0) /* MMX */
+#define CPU_X86_3DNOW	(1<<1) /* 3DNow! (only set on the AMD path of cpu_get_flags) */
+#define CPU_X86_MMXEXT	(1<<2) /* extended MMX; on Intel it is set together with SSE */
+#define CPU_X86_SSE	(1<<3) /* SSE */
+#define CPU_X86_SSE2	(1<<4) /* SSE2 */
+
+ogg_uint32_t cpu_get_flags (void); /* returns an OR of the CPU_X86_* bits above; 0 when cpuid or MMX is unavailable */
diff -Naur theora-exp/lib/i386/frag_mmx.c theora-rel/lib/i386/frag_mmx.c
--- theora-exp/lib/i386/frag_mmx.c	1970-01-01 01:00:00.000000000 +0100
+++ theora-rel/lib/i386/frag_mmx.c	2005-03-23 10:16:20.000000000 +0100
@@ -0,0 +1,122 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+*/
+
+#include "../internal.h"
+
+static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x0080008000800080LL;
+
+void restore_fpu__mmx(void) {
+ __asm__ __volatile__ (
+ "  emms    \n\t" /* empty the MMX state so that x87 floating-point code can be used again */
+);
+}
+
+/*MMX intra reconstruction of one 8x8 block: every residue value gets 128 added
+   (16-bit signed saturation) and is stored to _dst as a byte saturated to 0..255.*/
+void oc_frag_recon_intra__mmx(unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t *_residue) {
+  __asm__ __volatile__ (
+"  movl        $0x7, %%ecx   \n\t" /* 8 rows: counts 7..0, the loop ends when it goes negative */
+"  .balign 16                \n\t"
+"1:  movq        %3, %%mm0   \n\t" /* mm0 = 0x0080008000800080 (V128) */
+"  movq        (%1), %%mm2   \n\t" /* first four residue values */
+"  movq       %%mm0, %%mm1   \n\t" /* mm1 = mm0 */
+"  movq       8(%1), %%mm3   \n\t" /* next four residue values */
+"  decl       %%ecx          \n\t" /* sets the sign flag for jns; nothing below changes EFLAGS */
+"  paddsw     %%mm3, %%mm1   \n\t" /* high four + 128, saturated to 16 bits */
+"  lea     0x10(%1), %1      \n\t" /* _residue += 8 values (16 bytes) */
+"  paddsw     %%mm2, %%mm0   \n\t" /* low four + 128, saturated to 16 bits */
+"  packuswb   %%mm1, %%mm0   \n\t" /* pack all eight to unsigned bytes with saturation */
+"  movq       %%mm0, (%0)    \n\t" /* write the row */
+"  lea      (%0,%2), %0      \n\t" /* _dst += _dst_ystride */
+"  jns 1b                    \n\t" /* next row */
+  : "+r" (_dst), "+r" (_residue) /* both pointers are advanced by the asm, so both are in/out */
+  : "r" (_dst_ystride), "m" (V128) /* V128 is passed as an operand instead of by symbol name */
+  : "memory", "cc", "ecx" );
+}
+
+
+/*MMX inter reconstruction of one 8x8 block: each output byte is the source
+   pixel plus the matching residue value, saturated to 0..255.
+  The asm advances _dst, _residue and _src, so all three are declared as
+   in/out operands.*/
+void oc_frag_recon_inter__mmx(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue){
+  __asm__ __volatile__ (
+"  movl      $0x7, %%eax                 \n\t" /* 8 rows: counts 7..0 */
+"  pxor      %%mm0, %%mm0                \n\t" /* mm0 = 0 */
+"  .balign 16                            \n\t"
+"1: movq     (%[src]), %%mm2             \n\t" /* load 8 source pixels */
+"  movq      %%mm2, %%mm3                \n\t" /* copy mm2 to mm3 */
+"  punpckhbw %%mm0, %%mm2                \n\t" /* expand high 4 pixels to 16 bits */
+"  punpcklbw %%mm0, %%mm3                \n\t" /* expand low 4 pixels to 16 bits */
+"  paddsw    (%[res]), %%mm3             \n\t" /* low pixels + low residue */
+"  paddsw    8(%[res]), %%mm2            \n\t" /* high pixels + high residue */
+"  packuswb  %%mm2, %%mm3                \n\t" /* pack to bytes with unsigned saturation */
+"  lea       (%[src],%[sstride]), %[src] \n\t" /* _src += _src_ystride */
+"  lea       0x10(%[res]), %[res]        \n\t" /* _residue += 8 values (16 bytes) */
+"  movq      %%mm3, (%[dst])             \n\t" /* write the row */
+"  lea       (%[dst],%[dstride]), %[dst] \n\t" /* _dst += _dst_ystride */
+"  decl      %%eax                       \n\t" /* dec counter */
+"  jns       1b                          \n\t" /* next row */
+  : [dst] "+r" (_dst), [res] "+r" (_residue),
+    [src] "+r" (_src)
+  : [dstride] "r" (_dst_ystride), [sstride] "r" (_src_ystride)
+  : "memory", "cc", "eax" );
+}
+
+/*MMX reconstruction of one 8x8 block predicted from two sources: each output
+   byte is ((src1+src2)>>1) plus the matching residue value, saturated to 0..255.
+  Each _src1 row is loaded at the top of its own iteration, so no row beyond the
+   eighth is read.
+  The asm advances _dst, _residue, _src1 and _src2 (in/out operands).*/
+void oc_frag_recon_inter2__mmx(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue){
+  int i; /* row counter; kept in memory and written only by the asm */
+  __asm__ __volatile__ (
+"  movl      $0x7, %[cnt]                   \n\t" /* 8 rows: counts 7..0 */
+"  pxor      %%mm0, %%mm0                   \n\t" /* mm0 = 0 */
+"  .balign 16                               \n\t"
+"1: movq     (%[src1]), %%mm2               \n\t" /* 8 pixels of _src1 */
+"  movq      (%[src2]), %%mm4               \n\t" /* 8 pixels of _src2 */
+"  movq      %%mm2, %%mm3                   \n\t" /* copy src1 row */
+"  movq      %%mm4, %%mm5                   \n\t" /* copy src2 row */
+"  movl      %[s1stride], %%eax             \n\t"
+"  punpcklbw %%mm0, %%mm2                   \n\t" /* low 4 of src1 to 16 bits */
+"  punpcklbw %%mm0, %%mm4                   \n\t" /* low 4 of src2 to 16 bits */
+"  lea       (%[src1],%%eax), %[src1]       \n\t" /* _src1 += _src1_ystride */
+"  punpckhbw %%mm0, %%mm3                   \n\t" /* high 4 of src1 to 16 bits */
+"  punpckhbw %%mm0, %%mm5                   \n\t" /* high 4 of src2 to 16 bits */
+"  movl      %[s2stride], %%eax             \n\t"
+"  paddsw    %%mm2, %%mm4                   \n\t" /* low: src1 + src2 */
+"  paddsw    %%mm3, %%mm5                   \n\t" /* high: src1 + src2 */
+"  lea       (%[src2],%%eax), %[src2]       \n\t" /* _src2 += _src2_ystride */
+"  psrlw     $1, %%mm4                      \n\t" /* low: (src1 + src2) >> 1 */
+"  psrlw     $1, %%mm5                      \n\t" /* high: (src1 + src2) >> 1 */
+"  paddsw    (%[res]), %%mm4                \n\t" /* add low residue */
+"  paddsw    8(%[res]), %%mm5               \n\t" /* add high residue */
+"  packuswb  %%mm5, %%mm4                   \n\t" /* pack to bytes with unsigned saturation */
+"  lea       0x10(%[res]), %[res]           \n\t" /* _residue += 8 values (16 bytes) */
+"  movq      %%mm4, (%[dst])                \n\t" /* write the row to _dst */
+"  decl      %[cnt]                         \n\t" /* sets the sign flag; lea below leaves it alone */
+"  lea       (%[dst],%[dstride]), %[dst]    \n\t" /* _dst += _dst_ystride */
+"  jns       1b                             \n\t" /* next row */
+
+  : [dst] "+r" (_dst), [res] "+r" (_residue),
+    [src1] "+r" (_src1), [src2] "+r" (_src2),
+    [cnt] "=m" (i)
+  : [dstride] "r" (_dst_ystride),
+    [s1stride] "m" (_src1_ystride), [s2stride] "m" (_src2_ystride)
+  : "memory", "cc", "eax");
+}
diff -Naur theora-exp/lib/i386/state_mmx.c theora-rel/lib/i386/state_mmx.c
--- theora-exp/lib/i386/state_mmx.c	1970-01-01 01:00:00.000000000 +0100
+++ theora-rel/lib/i386/state_mmx.c	2005-03-23 10:15:18.000000000 +0100
@@ -0,0 +1,74 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+*/
+
+#include "../internal.h"
+/*Copies the fragments specified by the lists of fragment indices from one
+   frame to another, using MMX to move each fragment as 8 rows of 8 bytes.
+  _fragis:    A pointer to a list of fragment indices.
+  _nfragis:   The number of fragment indices to copy.
+  _dst_frame: The reference frame to copy to.
+  _src_frame: The reference frame to copy from.
+  _pli:       The color plane the fragments lie in.
+  MMX state is cleared with emms before returning.*/
+void oc_state_frag_copy__mmx(const oc_theora_state *_state,const int *_fragis,
+ int _nfragis,int _dst_frame,int _src_frame,int _pli){
+  const int *fragi;
+  const int *fragi_end;
+  int        dst_framei;
+  int        dst_ystride;
+  int        src_framei;
+  int        src_ystride;
+  dst_framei=_state->ref_frame_idx[_dst_frame];
+  src_framei=_state->ref_frame_idx[_src_frame];
+  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+  src_ystride=_state->ref_frame_bufs[src_framei][_pli].ystride;
+  fragi_end=_fragis+_nfragis;
+  for(fragi=_fragis;fragi<fragi_end;fragi++){
+    oc_fragment   *frag;
+    unsigned char *dst;
+    unsigned char *src;
+    frag=_state->frags+*fragi;
+    dst=frag->buffer[dst_framei];
+    src=frag->buffer[src_framei];
+    __asm__ __volatile__ (
+    "  lea         (%3, %3, 2), %%esi   \n\t"  /* esi = src_ystride*3 */
+    "  movq        (%1), %%mm0          \n\t"  /* src row 0 */
+    "  lea         (%2, %2, 2), %%edi   \n\t"  /* edi = dst_ystride*3 */
+    "  movq        (%1, %3), %%mm1      \n\t"  /* src row 1 */
+    "  movq        (%1, %3, 2), %%mm2   \n\t"  /* src row 2 */
+    "  movq        (%1, %%esi), %%mm3   \n\t"  /* src row 3 */
+    "  movq        %%mm0, (%0)          \n\t"  /* dst row 0 */
+    "  movq        %%mm1, (%0, %2)      \n\t"  /* dst row 1 */
+    "  lea         (%1, %3, 4), %1      \n\t"  /* src: advance to rows 4..7 */
+    "  movq        %%mm2, (%0, %2, 2)   \n\t"  /* dst row 2 */
+    "  movq        %%mm3, (%0, %%edi)   \n\t"  /* dst row 3 */
+    "  lea         (%0, %2, 4), %0      \n\t"  /* dst: advance to rows 4..7 */
+    "  movq        (%1), %%mm0          \n\t"  /* src row 4 */
+    "  movq        (%1, %3), %%mm1      \n\t"  /* src row 5 */
+    "  movq        (%1, %3, 2), %%mm2   \n\t"  /* src row 6 */
+    "  movq        (%1, %%esi), %%mm3   \n\t"  /* src row 7 */
+    "  movq        %%mm0, (%0)          \n\t"  /* dst row 4 */
+    "  movq        %%mm1, (%0, %2)      \n\t"  /* dst row 5 */
+    "  movq        %%mm2, (%0, %2, 2)   \n\t"  /* dst row 6 */
+    "  movq        %%mm3, (%0, %%edi)   \n\t"  /* dst row 7 */
+      : "+r" (dst), /* 0: advanced by the asm */
+        "+r" (src)  /* 1: also advanced by the asm, so it must be an in/out operand */
+      : "r" (dst_ystride), /* 2 */
+        "r" (src_ystride)  /* 3 */
+      : "memory", "esi","edi");
+  }
+
+  __asm__ __volatile__ ( "emms\n\t") ; /* this needs to be removed when decode specific functions */
+  				  /* will be implemented */
+}
+
diff -Naur theora-exp/lib/state.c theora-rel/lib/state.c
--- theora-exp/lib/state.c	2005-03-23 08:54:44.182816776 +0100
+++ theora-rel/lib/state.c	2005-03-23 09:42:36.000000000 +0100
@@ -2,13 +2,14 @@
 #include <string.h>
 #include "internal.h"
 #include "idct.h"
+
+#include "i386/cpu.h"
+
 #if defined(OC_DUMP_IMAGES)
 # include <stdio.h>
 # include "png.h"
 #endif
 
-
-
 /*Returns the fragment index of the top-left block in a macro block.
   This can be used to test whether or not the whole macro block is coded.
   _sb:    The super block.
@@ -497,9 +498,16 @@
   _ogg_free(_state->ref_frame_data);
 }
 
+/*Calls the restore_fpu hook stored in the state's opt_vtable.*/
+void restore_fpu(const oc_theora_state *_state) { _state->opt_vtable.restore_fpu();  }
+/*Plain C restore_fpu hook: does nothing.*/
+void restore_fpu__c(void) { }
+
 int oc_state_init(oc_theora_state *_state,const theora_info *_info){
   /*First validate the parameters.*/
   if(_info==NULL)return OC_FAULT;
+  
+  
   /*The width and height of the encoded frame must be multiples of 16.
     They must also, when divided by 16, fit into a 16-bit unsigned integer.
     The displayable frame offset coordinates must fit into an 8-bit unsigned
@@ -521,6 +529,28 @@
   }
   memset(_state,0,sizeof(*_state));
   memcpy(&_state->info,_info,sizeof(*_info));
+
+  /* Initialize accelerated functions */
+  
+#ifdef X86CODE
+  _state->cpu_flags=cpu_get_flags();
+  if (_state->cpu_flags & CPU_X86_MMX) { /* MMX reported at run time: use the MMX routines */
+     _state->opt_vtable.oc_frag_recon_intra = &oc_frag_recon_intra__mmx;
+     _state->opt_vtable.oc_frag_recon_inter = &oc_frag_recon_inter__mmx;
+     _state->opt_vtable.oc_frag_recon_inter2 = &oc_frag_recon_inter2__mmx;
+     _state->opt_vtable.oc_state_frag_copy = &oc_state_frag_copy__mmx;
+     _state->opt_vtable.restore_fpu = &restore_fpu__mmx;
+  }
+  else
+#endif
+  { /* portable C fallbacks */
+     _state->opt_vtable.oc_frag_recon_intra = &oc_frag_recon_intra__c;
+     _state->opt_vtable.oc_frag_recon_inter = &oc_frag_recon_inter__c;
+     _state->opt_vtable.oc_frag_recon_inter2 = &oc_frag_recon_inter2__c;
+     _state->opt_vtable.oc_state_frag_copy = &oc_state_frag_copy__c;
+     _state->opt_vtable.restore_fpu = &restore_fpu__c; /* the no-op hook, not the restore_fpu() dispatcher, which has a different signature and would call itself through this slot */
+  }
+
   /*Invert the sense of pic_y to match Theora's right-handed coordinate
      system.*/
   _state->info.pic_y=_info->frame_height-_info->pic_height-_info->pic_y;
@@ -820,7 +850,7 @@
   dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
   /*For now ystride values in all ref frames assumed to be equal.*/
   if(_frag->mbmode==OC_MODE_INTRA){
-    oc_frag_recon_intra(_frag->buffer[dst_framei],dst_ystride,res_buf);
+    oc_frag_recon_intra(_state,_frag->buffer[dst_framei],dst_ystride,res_buf);
   }
   else{
     int ref_framei;
@@ -831,15 +861,16 @@
     ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].ystride;
     if(oc_state_get_mv_offsets(_state,&mvoffset0,&mvoffset1,_frag->mv[0],
      _frag->mv[1],ref_ystride,_pli)>1){
-      oc_frag_recon_inter2(_frag->buffer[dst_framei],dst_ystride,
+      oc_frag_recon_inter2(_state,_frag->buffer[dst_framei],dst_ystride,
        _frag->buffer[ref_framei]+mvoffset0,ref_ystride,
        _frag->buffer[ref_framei]+mvoffset1,ref_ystride,res_buf);
     }
     else{
-      oc_frag_recon_inter(_frag->buffer[dst_framei],dst_ystride,
+      oc_frag_recon_inter(_state,_frag->buffer[dst_framei],dst_ystride,
        _frag->buffer[ref_framei]+mvoffset0,ref_ystride,res_buf);
     }
   }
+  restore_fpu(_state);
 }
 
 /*Copies the fragments specified by the lists of fragment indices from one
@@ -849,7 +880,13 @@
   _dst_frame: The reference frame to copy to.
   _src_frame: The reference frame to copy from.
   _pli:       The color plane the fragments lie in.*/
-void oc_state_frag_copy(const oc_theora_state *_state,const int *_fragis,
+
+inline void oc_state_frag_copy(const oc_theora_state *_state,const int *_fragis,
+ int _nfragis,int _dst_frame,int _src_frame,int _pli){ /*Dispatches through _state->opt_vtable; all arguments, including _state, are passed on unchanged.*/
+ _state->opt_vtable.oc_state_frag_copy(_state,_fragis,_nfragis,_dst_frame,_src_frame,_pli);
+ }
+
+void oc_state_frag_copy__c(const oc_theora_state *_state,const int *_fragis,
  int _nfragis,int _dst_frame,int _src_frame,int _pli){
   const int *fragi;
   const int *fragi_end;
diff -Naur theora-exp/unix/Makefile theora-rel/unix/Makefile
--- theora-exp/unix/Makefile	2005-03-23 08:54:38.299711144 +0100
+++ theora-rel/unix/Makefile	2005-03-23 08:55:49.000000000 +0100
@@ -22,7 +22,7 @@
 # You may get speed increases by including flags such as -O2 or -O3 or
 #  -ffast-math, or additional flags, depending on your system and compiler.
 # The -g flag will generally include debugging information.
-CFLAGS = -g
+CFLAGS = -g  -O4 -march=pentium2 -mcpu=pentium2 -pipe -ffast-math -fomit-frame-pointer
 # Libraries to link with, and the location of library files.
 # Add -lpng -lz if you want to enable OC_DUMP_IMAGES.
 LIBS = -logg -lvorbis -lvorbisenc `sdl-config --libs`
@@ -41,6 +41,9 @@
 internal.c \
 quant.c \
 state.c \
+i386/state_mmx.c \
+i386/frag_mmx.c \
+i386/cpu.c \
 
 LIBTHEORABASE_CHEADERS = \
 dct.h \
@@ -75,8 +78,7 @@
 psych.c \
 
 LIBTHEORAENC_CHEADERS =   \
-${LIBTHEORABASE_CHEADERS} \
-encint.h \
+${LIBTHEORABASE_CHEADERS} \encint.h \
 enquant.h \
 fdct.h \
 huffenc.h \
@@ -156,7 +158,7 @@
 ${PLAYER_EXAMPLE_TARGET}: ${PLAYER_EXAMPLE_OBJS} ${LIBTHEORABASE_TARGET} \
                            ${LIBTHEORADEC_TARGET}
 	mkdir -p ${TARGETBINDIR}
-	${CC} ${CFLAGS} -o $@ ${PLAYER_EXAMPLE_OBJS} ${LIBS} \
+	${CC} ${CFLAGS} -o $@ ${PLAYER_EXAMPLE_OBJS} ${LIBS} -lSDL -lpthread \
          ${LIBTHEORADEC_TARGET} ${LIBTHEORABASE_TARGET}
 
 # Remove all targets.


More information about the Theora-dev mailing list