[xiph-commits] r9129 - in experimental/derf/theora-exp: lib lib/x86 unix

tterribe at motherfish-iii.xiph.org tterribe at motherfish-iii.xiph.org
Fri Apr 8 06:35:09 PDT 2005


Author: tterribe
Date: 2005-04-08 06:35:05 -0700 (Fri, 08 Apr 2005)
New Revision: 9129

Added:
   experimental/derf/theora-exp/lib/x86/
   experimental/derf/theora-exp/lib/x86/cpu.c
   experimental/derf/theora-exp/lib/x86/cpu.h
   experimental/derf/theora-exp/lib/x86/mmxfrag.c
   experimental/derf/theora-exp/lib/x86/mmxstate.c
   experimental/derf/theora-exp/lib/x86/x86int.h
   experimental/derf/theora-exp/lib/x86/x86state.c
Modified:
   experimental/derf/theora-exp/lib/fragment.c
   experimental/derf/theora-exp/lib/internal.h
   experimental/derf/theora-exp/lib/state.c
   experimental/derf/theora-exp/unix/Makefile
Log:
The first series of MMX patches by Rudolf Marek.
-DOC_X86ASM needs to be added to CFLAGS to enable them at compile time.

This does not yet include optimizations for the iDCT or the loop filter.
Experimental testing shows approximately an 11% speed-up in the decoder.
We may want to keep this in a separate branch someday, but it's intentionally
 been kept fairly unobtrusive, so it should be easy to split off later.


Modified: experimental/derf/theora-exp/lib/fragment.c
===================================================================
--- experimental/derf/theora-exp/lib/fragment.c	2005-04-08 09:27:35 UTC (rev 9128)
+++ experimental/derf/theora-exp/lib/fragment.c	2005-04-08 13:35:05 UTC (rev 9129)
@@ -1,6 +1,11 @@
 #include "internal.h"
 
-void oc_frag_recon_intra(unsigned char *_dst,int _dst_ystride,
+void oc_frag_recon_intra(const oc_theora_state *_state,unsigned char *_dst,
+ int _dst_ystride,const ogg_int16_t *_residue){ /*Vtable dispatcher.*/
+  _state->opt_vtable.frag_recon_intra(_dst,_dst_ystride,_residue);
+}
+
+void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride,
  const ogg_int16_t *_residue){
   int i;
   for(i=0;i<8;i++){
@@ -10,7 +15,14 @@
   }
 }
 
-void oc_frag_recon_inter(unsigned char *_dst,int _dst_ystride,
+void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst,
+ int _dst_ystride,const unsigned char *_src,int _src_ystride,
+ const ogg_int16_t *_residue){ /*Runs the variant stored in _state.*/
+  _state->opt_vtable.frag_recon_inter(_dst,_dst_ystride,_src,_src_ystride,
+   _residue);
+}
+
+void oc_frag_recon_inter_c(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue){
   int i;
   for(i=0;i<8;i++){
@@ -21,7 +33,14 @@
   }
 }
 
-void oc_frag_recon_inter2(unsigned char *_dst,int _dst_ystride,
+void oc_frag_recon_inter2(const oc_theora_state *_state,unsigned char *_dst,
+ int _dst_ystride,const unsigned char *_src1,int _src1_ystride,
+ const unsigned char *_src2,int _src2_ystride,const ogg_int16_t *_residue){
+  _state->opt_vtable.frag_recon_inter2(_dst,_dst_ystride,_src1,_src1_ystride,
+   _src2,_src2_ystride,_residue); /*Runs the variant stored in _state.*/
+}
+
+void oc_frag_recon_inter2_c(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
  int _src2_ystride,const ogg_int16_t *_residue){
   int i;

Modified: experimental/derf/theora-exp/lib/internal.h
===================================================================
--- experimental/derf/theora-exp/lib/internal.h	2005-04-08 09:27:35 UTC (rev 9128)
+++ experimental/derf/theora-exp/lib/internal.h	2005-04-08 13:35:05 UTC (rev 9129)
@@ -95,7 +95,10 @@
 
 
 
+typedef struct oc_theora_state oc_theora_state;
 
+
+
 /*A map from a super block to fragment numbers.*/
 typedef int oc_sb_map[4][4];
 /*A map from a macro block to fragment numbers.*/
@@ -221,10 +224,30 @@
 
 
 
+/*The shared (encoder and decoder) functions that have accelerated variants.*/
+typedef struct{ /*One pointer per routine; filled by oc_state_vtable_init().*/
+  void (*frag_recon_intra)(unsigned char *_dst,int _dst_ystride,
+   const ogg_int16_t *_residue);
+  void (*frag_recon_inter)(unsigned char *_dst,int _dst_ystride,
+   const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue);
+  void (*frag_recon_inter2)(unsigned char *_dst,int _dst_ystride,
+   const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
+   int _src2_ystride,const ogg_int16_t *_residue);
+  void (*state_frag_copy)(const oc_theora_state *_state,
+   const int *_fragis,int _nfragis,int _dst_frame,int _src_frame,int _pli);
+  void (*restore_fpu)(void); /*Executes emms for MMX; empty for plain C.*/
+}oc_base_opt_vtable;
+
+
+
 /*Common state information between the encoder and decoder.*/
-typedef struct{
+struct oc_theora_state{
   /*The stream information.*/
   theora_info           info;
+  /*Table for shared accelerated functions.*/
+  oc_base_opt_vtable    opt_vtable;
+  /*CPU flags to detect the presence of extended instruction sets.*/
+  ogg_uint32_t          cpu_flags;
   /*The fragment plane descriptions.*/
   oc_fragment_plane     fplanes[3];
   /*The total number of fragments in a single frame.*/
@@ -291,7 +314,7 @@
   oc_quant_tables       dequant_table_data[2][3];
   /*Loop filter strength parameters.*/
   unsigned char         loop_filter_limits[64];
-}oc_theora_state;
+};
 
 
 
@@ -344,18 +367,12 @@
 
 int oc_dct_token_skip(int _token,int _extra_bits);
 
-void oc_frag_recon_intra(unsigned char *_dst,int _dst_ystride,
- const ogg_int16_t *_residue);
-void oc_frag_recon_inter(unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue);
-void oc_frag_recon_inter2(unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
- int _src2_ystride,const ogg_int16_t *_residue);
 int oc_frag_pred_dc(const oc_fragment *_frag,
  const oc_fragment_plane *_fplane,int _x,int _y,int _pred_last[3]);
 
 int oc_state_init(oc_theora_state *_state,const theora_info *_info);
 void oc_state_clear(oc_theora_state *_state);
+void oc_state_vtable_init_c(oc_theora_state *_state);
 void oc_state_borders_fill_rows(oc_theora_state *_state,int _refi,int _pli,
  int _y0,int _yend);
 void oc_state_borders_fill_caps(oc_theora_state *_state,int _refi,int _pli);
@@ -368,8 +385,7 @@
 void oc_state_frag_recon(oc_theora_state *_state,const oc_fragment *_frag,
  int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
-void oc_state_frag_copy(const oc_theora_state *_state,const int *_fragis,
- int _nfragis,int _dst_frame,int _src_frame,int _pli);
+
 int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv);
 void oc_state_loop_filter(oc_theora_state *_state,int _frame);
 void oc_state_loop_filter_frag_rows(oc_theora_state *_state,int *_bv,
@@ -379,4 +395,28 @@
  const char *_suf);
 #endif
 
+/*Shared accelerated functions.*/
+void oc_frag_recon_intra(const oc_theora_state *_state,unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2(const oc_theora_state *_state,unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue);
+void oc_state_frag_copy(const oc_theora_state *_state,const int *_fragis,
+ int _nfragis,int _dst_frame,int _src_frame,int _pli);
+void oc_restore_fpu(const oc_theora_state *_state);
+
+/*Default pure-C implementations.*/
+void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_c(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_c(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue);
+void oc_state_frag_copy_c(const oc_theora_state *_state,const int *_fragis,
+ int _nfragis,int _dst_frame,int _src_frame,int _pli);
+void oc_restore_fpu_c(void);
+
 #endif

Modified: experimental/derf/theora-exp/lib/state.c
===================================================================
--- experimental/derf/theora-exp/lib/state.c	2005-04-08 09:27:35 UTC (rev 9128)
+++ experimental/derf/theora-exp/lib/state.c	2005-04-08 13:35:05 UTC (rev 9129)
@@ -2,6 +2,9 @@
 #include <string.h>
 #include "internal.h"
 #include "idct.h"
+#if defined(OC_X86ASM)
+# include "x86/x86int.h"
+#endif
 #if defined(OC_DUMP_IMAGES)
 # include <stdio.h>
 # include "png.h"
@@ -9,6 +12,12 @@
 
 
 
+void oc_restore_fpu(const oc_theora_state *_state){
+  _state->opt_vtable.restore_fpu(); /*Runs the variant stored in _state.*/
+}
+
+void oc_restore_fpu_c(void){} /*Empty C variant of the restore_fpu entry.*/
+
 /*Returns the fragment index of the top-left block in a macro block.
   This can be used to test whether or not the whole macro block is coded.
   _sb:    The super block.
@@ -497,6 +506,25 @@
   _ogg_free(_state->ref_frame_data);
 }
 
+
+void oc_state_vtable_init_c(oc_theora_state *_state){ /*Install C variants.*/
+  _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
+  _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
+  _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c;
+  _state->opt_vtable.state_frag_copy=oc_state_frag_copy_c;
+  _state->opt_vtable.restore_fpu=oc_restore_fpu_c;
+}
+
+/*Initialize the accelerated function pointers (backend chosen at build time).*/
+void oc_state_vtable_init(oc_theora_state *_state){
+#if defined(OC_X86ASM)
+  oc_state_vtable_init_x86(_state); /*Picks MMX or C from the CPU flags.*/
+#else
+  oc_state_vtable_init_c(_state);
+#endif
+}
+
+
 int oc_state_init(oc_theora_state *_state,const theora_info *_info){
   /*First validate the parameters.*/
   if(_info==NULL)return OC_FAULT;
@@ -525,6 +553,7 @@
      system.*/
   _state->info.pic_y=_info->frame_height-_info->pic_height-_info->pic_y;
   _state->frame_type=OC_UNKWN_FRAME;
+  oc_state_vtable_init(_state);
   oc_state_frarray_init(_state);
   oc_state_ref_bufs_init(_state);
   /*If the keyframe_granule_shift is out of range, use the maximum allowable
@@ -820,7 +849,7 @@
   dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
   /*For now ystride values in all ref frames assumed to be equal.*/
   if(_frag->mbmode==OC_MODE_INTRA){
-    oc_frag_recon_intra(_frag->buffer[dst_framei],dst_ystride,res_buf);
+    oc_frag_recon_intra(_state,_frag->buffer[dst_framei],dst_ystride,res_buf);
   }
   else{
     int ref_framei;
@@ -831,15 +860,16 @@
     ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].ystride;
     if(oc_state_get_mv_offsets(_state,&mvoffset0,&mvoffset1,_frag->mv[0],
      _frag->mv[1],ref_ystride,_pli)>1){
-      oc_frag_recon_inter2(_frag->buffer[dst_framei],dst_ystride,
+      oc_frag_recon_inter2(_state,_frag->buffer[dst_framei],dst_ystride,
        _frag->buffer[ref_framei]+mvoffset0,ref_ystride,
        _frag->buffer[ref_framei]+mvoffset1,ref_ystride,res_buf);
     }
     else{
-      oc_frag_recon_inter(_frag->buffer[dst_framei],dst_ystride,
+      oc_frag_recon_inter(_state,_frag->buffer[dst_framei],dst_ystride,
        _frag->buffer[ref_framei]+mvoffset0,ref_ystride,res_buf);
     }
   }
+  oc_restore_fpu(_state);
 }
 
 /*Copies the fragments specified by the lists of fragment indices from one
@@ -851,6 +881,12 @@
   _pli:       The color plane the fragments lie in.*/
 void oc_state_frag_copy(const oc_theora_state *_state,const int *_fragis,
  int _nfragis,int _dst_frame,int _src_frame,int _pli){
+  _state->opt_vtable.state_frag_copy(_state,_fragis,_nfragis,_dst_frame,
+   _src_frame,_pli);
+}
+
+void oc_state_frag_copy_c(const oc_theora_state *_state,const int *_fragis,
+ int _nfragis,int _dst_frame,int _src_frame,int _pli){
   const int *fragi;
   const int *fragi_end;
   int        dst_framei;

Added: experimental/derf/theora-exp/lib/x86/cpu.c
===================================================================
--- experimental/derf/theora-exp/lib/x86/cpu.c	2005-04-08 09:27:35 UTC (rev 9128)
+++ experimental/derf/theora-exp/lib/x86/cpu.c	2005-04-08 13:35:05 UTC (rev 9129)
@@ -0,0 +1,79 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+ ********************************************************************/
+
+#include "cpu.h"
+
+/*Returns the OC_CPU_X86_* flags; 0 without cpuid, MMX, or a known vendor.*/
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t eax;
+  ogg_uint32_t ebx;
+  ogg_uint32_t ecx;
+  ogg_uint32_t edx;
+  ogg_uint32_t flags;
+#define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+  __asm__ __volatile__( \
+   "xchgl %%ebx,%1 \n\t" /*Park %ebx in %1; it may be the PIC register.*/ \
+   "cpuid          \n\t" \
+   "xchgl %%ebx,%1 \n\t" /*%1 gets the result and %ebx its old value.*/ \
+   :"=a" (_eax), \
+    "=r" (_ebx), \
+    "=c" (_ecx), \
+    "=d" (_edx) \
+   :"a" (_op) \
+   :"cc" \
+  )
+  __asm__ __volatile__(
+   "pushfl              \n\t" /*Saved copy, restored by the final popfl.*/
+   "pushfl              \n\t"
+   "popl %0             \n\t"
+   "movl %0,%1          \n\t" /*%1 keeps the original EFLAGS.*/
+   "xorl $0x200000,%0   \n\t" /*Flip the ID bit (bit 21).*/
+   "pushl %0            \n\t"
+   "popfl               \n\t"
+   "pushfl              \n\t"
+   "popl %0             \n\t" /*%0 is EFLAGS after the attempted flip.*/
+   "popfl"
+   :"=r" (eax),
+    "=r" (ebx)
+   :
+   :"cc"
+  );
+  /*The ID bit would not toggle, so there is no cpuid.*/
+  if(eax==ebx)return 0;
+  cpuid(0,eax,ebx,ecx,edx);
+  if(ebx==0x756e6547&&edx==0x49656e69&&ecx==0x6c65746e){
+    /*Intel ("GenuineIntel"): feature bits come from edx of leaf 1.*/
+inteltest:
+    cpuid(1,eax,ebx,ecx,edx);
+    if((edx&0x00800000)==0)return 0;
+    flags=OC_CPU_X86_MMX;
+    if(edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
+    if(edx&0x04000000)flags|=OC_CPU_X86_SSE2;
+  }
+  else if(ebx==0x68747541&&edx==0x69746e65&&ecx==0x444d4163){
+    /*AMD ("AuthenticAMD"). NOTE: the extended leaf path never sets SSE/SSE2.*/
+    cpuid(0x80000000,eax,ebx,ecx,edx);
+    if(eax<0x80000001)goto inteltest;
+    cpuid(0x80000001,eax,ebx,ecx,edx);
+    if((edx&0x00800000)==0)return 0;
+    flags=OC_CPU_X86_MMX;
+    if(edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
+    if(edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
+  }
+  else{
+    /*TODO: Other vendors are not detected yet.*/
+    flags=0;
+  }
+  return flags;
+}

Added: experimental/derf/theora-exp/lib/x86/cpu.h
===================================================================
--- experimental/derf/theora-exp/lib/x86/cpu.h	2005-04-08 09:27:35 UTC (rev 9128)
+++ experimental/derf/theora-exp/lib/x86/cpu.h	2005-04-08 13:35:05 UTC (rev 9129)
@@ -0,0 +1,25 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+*/
+#if !defined(_x86_cpu_H)
+# define _x86_cpu_H (1)
+#include "../internal.h"
+
+#define OC_CPU_X86_MMX    (1<<0)
+#define OC_CPU_X86_3DNOW  (1<<1)
+#define OC_CPU_X86_MMXEXT (1<<2)
+#define OC_CPU_X86_SSE    (1<<3)
+#define OC_CPU_X86_SSE2   (1<<4)
+
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif

Added: experimental/derf/theora-exp/lib/x86/mmxfrag.c
===================================================================
--- experimental/derf/theora-exp/lib/x86/mmxfrag.c	2005-04-08 09:27:35 UTC (rev 9128)
+++ experimental/derf/theora-exp/lib/x86/mmxfrag.c	2005-04-08 13:35:05 UTC (rev 9129)
@@ -0,0 +1,120 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+*/
+#include "x86int.h"
+
+static const __attribute__((aligned(8),used)) ogg_int64_t V128=
+ 0x0080008000800080LL;
+
+/*MMX intra reconstruction: _dst=clamp(_residue+128) over one 8x8 block.*/
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t *_residue){
+  __asm__ __volatile__(
+   "  mov          $0x7, %%ecx  \n\t" /* 8x loop */
+   "  .balign 16                \n\t"
+   "1:  movq     (V128), %%mm0  \n\t" /* Set mm0 to 0x0080008000800080 */
+   "  movq         (%1), %%mm2  \n\t" /* First four input values */
+   "  movq        %%mm0, %%mm1  \n\t" /* Set mm1 == mm0 */
+   "  movq        8(%1), %%mm3  \n\t" /* Next four input values */
+   "  decl      %%ecx           \n\t" /* dec counter; flags survive to jns */
+   "  paddsw      %%mm3, %%mm1  \n\t" /* add+128 and saturate to 16bit */
+   "  lea      0x10(%1), %1     \n\t" /* _residue+=16 bytes (next row) */
+   "  paddsw      %%mm2, %%mm0  \n\t" /* add+128 and saturate to 16bit */
+   "  packuswb    %%mm1, %%mm0  \n\t" /* pack to 8 bytes with unsigned saturation */
+   "  movq      %%mm0, (%0)     \n\t" /* writeback */
+   "  lea         (%0,%2), %0   \n\t" /* _dst+=_dst_ystride */
+   "  jns 1b                    \n\t" /* loop */
+   :"+r" (_dst),"+r" (_residue) /* both pointers are advanced by the loop */
+   :"r" (_dst_ystride)
+   :"memory", "ecx"
+  );
+}
+
+/*MMX inter reconstruction: _dst=clamp(_src+_residue) over one 8x8 block.*/
+void oc_frag_recon_inter_mmx(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue){
+  __asm__ __volatile__(
+   "  movl         $0x7,   %%eax   \n\t" /* 8x loop */
+   "  pxor         %%mm0,  %%mm0   \n\t" /* zero mm0  */
+   "  .balign 16                   \n\t"
+   "1: movq        (%2),   %%mm2   \n\t" /* load mm2 with _src */
+   "  movq         %%mm2,  %%mm3   \n\t" /* copy mm2 to mm3 */
+   "  punpckhbw    %%mm0,  %%mm2   \n\t" /* expand high part of _src to 16 bits */
+   "  punpcklbw    %%mm0,  %%mm3   \n\t" /* expand low part of _src to 16 bits */
+   "  paddsw       (%1),   %%mm3   \n\t" /* add low part with low part of residue */
+   "  paddsw       8(%1),  %%mm2   \n\t" /* high with high */
+   "  packuswb     %%mm2,  %%mm3   \n\t" /* pack and saturate to mm3 */
+   "  lea         (%2,%4), %2      \n\t" /* _src+=_src_ystride */
+   "  lea         0x10(%1), %1     \n\t" /* _residue+=16 bytes (next row) */
+   "  movq        %%mm3,   (%0)    \n\t" /* put mm3 to dest */
+   "  lea         (%0,%3),%0       \n\t" /* _dst+=_dst_ystride */
+   "  decl        %%eax            \n\t" /* dec counter */
+   "  jns         1b               \n\t" /* loop */
+   :"+r" (_dst), /* 0 */
+    "+r" (_residue), /* 1 */
+    "+r" (_src) /* 2: all three pointers are advanced by the loop */
+   :"r" (_dst_ystride), /* 3 */
+    "r" (_src_ystride) /* 4 */
+   :"memory", "eax"
+  );
+}
+
+/*MMX: _dst=clamp(((_src1+_src2)>>1)+_residue) over one 8x8 block.*/
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue){
+  int i;
+  __asm__ __volatile__(
+   "  movl         $0x7,   %4      \n\t" /* 8x loop */
+   "  pxor         %%mm0,  %%mm0   \n\t" /* zero mm0 */
+   "  movq         (%2),   %%mm2   \n\t" /* load mm2 with _src1 */
+   "  .balign 16                   \n\t"
+   "1: movq        (%3),   %%mm4   \n\t" /* packed _src2 */
+   "  movq         %%mm2,  %%mm3   \n\t" /* copy to mm3 */
+   "  movq         %%mm4,  %%mm5   \n\t" /* copy packed src2 to mm5 */
+   "  mov          %6,     %%eax   \n\t" /* eax=_src1_ystride */
+   "  punpcklbw    %%mm0,  %%mm2   \n\t" /* expand low part of src1 to mm2 */
+   "  punpcklbw    %%mm0,  %%mm4   \n\t" /* low part expand of src2 to mm4 */
+   "  lea          (%2,%%eax), %2  \n\t" /* _src1+=_src1_ystride */
+   "  punpckhbw    %%mm0,  %%mm3   \n\t" /* expand high part of src1 to mm3 */
+   "  punpckhbw    %%mm0,  %%mm5   \n\t" /* high part expand of src2 to mm5 */
+   "  mov          %7,     %%eax   \n\t" /* eax=_src2_ystride */
+   "  paddsw       %%mm2,  %%mm4   \n\t" /* add low parts of src1 and src2 */
+   "  paddsw       %%mm3,  %%mm5   \n\t" /* add high parts of src1 and src2 */
+   "  lea          (%3,%%eax), %3  \n\t" /* _src2+=_src2_ystride */
+   "  movq         (%2), %%mm2     \n\t" /* preload next _src1 row (on the last pass too) */
+   "  psrlw        $1,     %%mm4   \n\t" /* halve the low sums: (src1+src2)>>1 */
+   "  psrlw        $1,     %%mm5   \n\t" /* halve the high sums */
+   "  paddsw       (%1),   %%mm4   \n\t" /* add low parts with low residue */
+   "  paddsw       8(%1),  %%mm5   \n\t" /* add high parts with high residue */
+   "  packuswb     %%mm5,  %%mm4   \n\t" /* pack saturate high to low */
+   "  lea          0x10(%1), %1    \n\t" /* _residue+=16 bytes (next row) */
+   "  movq         %%mm4, (%0)     \n\t" /* write to dst */
+   "  decl         %4              \n\t" /* dec counter; flags survive to jns */
+   "  lea          (%0,%5), %0     \n\t" /* _dst+=_dst_ystride */
+   "  jns          1b\n\t"
+   :"+r" (_dst), /* 0 */
+    "+r" (_residue), /* 1 */
+    "+r" (_src1), /* 2 */
+    "+r" (_src2), /* 3 */
+    "=m" (i) /* 4: loop counter (lives in memory), written by the asm */
+   :"r" (_dst_ystride), /* 5 */
+    "m" (_src1_ystride),"m" (_src2_ystride) /* 6, 7 */
+   :"memory", "eax"
+  );
+}
+
+void oc_restore_fpu_mmx(void){ /*MMX variant of the restore_fpu vtable entry.*/
+  __asm__ __volatile__(
+   "  emms    \n\t" /* leave MMX mode so x87 floating point works again */
+  );
+}

Added: experimental/derf/theora-exp/lib/x86/mmxstate.c
===================================================================
--- experimental/derf/theora-exp/lib/x86/mmxstate.c	2005-04-08 09:27:35 UTC (rev 9128)
+++ experimental/derf/theora-exp/lib/x86/mmxstate.c	2005-04-08 13:35:05 UTC (rev 9129)
@@ -0,0 +1,72 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+*/
+#include "x86int.h"
+
+/*MMX version: copies the listed 8x8 fragments from one frame to another.
+  _state:     Supplies the frame indices, strides and fragment buffers.
+  _fragis:    A pointer to a list of fragment indices.
+  _nfragis:   The number of fragment indices to copy.
+  _dst_frame: The reference frame to copy to.
+  _src_frame: The reference frame to copy from.
+  _pli:       The color plane the fragments lie in.*/
+void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
+ int _nfragis,int _dst_frame,int _src_frame,int _pli){
+  const int *fragi;
+  const int *fragi_end;
+  int        dst_framei;
+  int        dst_ystride;
+  int        src_framei;
+  int        src_ystride;
+  dst_framei=_state->ref_frame_idx[_dst_frame];
+  src_framei=_state->ref_frame_idx[_src_frame];
+  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+  src_ystride=_state->ref_frame_bufs[src_framei][_pli].ystride;
+  fragi_end=_fragis+_nfragis;
+  for(fragi=_fragis;fragi<fragi_end;fragi++){
+    oc_fragment   *frag;
+    unsigned char *dst;
+    unsigned char *src;
+    frag=_state->frags+*fragi;
+    dst=frag->buffer[dst_framei];
+    src=frag->buffer[src_framei];
+    __asm__ __volatile__(
+     "  lea         (%3, %3, 2), %%esi   \n\t"  /* esi=src_stride*3 */
+     "  movq        (%1),        %%mm0   \n\t"  /* src */
+     "  lea         (%2, %2, 2), %%edi   \n\t"  /* edi=dst_stride*3 */
+     "  movq        (%1, %3),    %%mm1   \n\t"  /* src+1x stride */
+     "  movq        (%1, %3, 2), %%mm2   \n\t"  /* src+2x stride */
+     "  movq        (%1, %%esi), %%mm3   \n\t"  /* src+3x stride */
+     "  movq        %%mm0,       (%0)    \n\t"  /* dst */
+     "  movq        %%mm1,       (%0, %2)\n\t"  /* dst+dst_stride */
+     "  lea         (%1,%3,4),   %1      \n\t"  /* src: pointer to next 4 rows */
+     "  movq        %%mm2,       (%0, %2, 2)      \n\t"  /* dst+2x dst_stride */
+     "  movq        %%mm3,       (%0, %%edi)      \n\t"  /* dst+3x dst_stride */
+     "  lea         (%0,%2,4),   %0      \n\t"  /* dst: pointer to next 4 rows */
+     "  movq        (%1),        %%mm0   \n\t"  /* src */
+     "  movq        (%1, %3),    %%mm1   \n\t"  /* src+1x stride */
+     "  movq        (%1, %3, 2), %%mm2   \n\t"  /* src+2x stride */
+     "  movq        (%1, %%esi), %%mm3   \n\t"  /* src+3x stride */
+     "  movq        %%mm0,       (%0)    \n\t"  /* dst */
+     "  movq        %%mm1,       (%0, %2)\n\t"  /* dst+dst_stride */
+     "  movq        %%mm2,       (%0, %2, 2)     \n\t"  /* dst+2x dst_stride */
+     "  movq        %%mm3,       (%0, %%edi)     \n\t"  /* dst+3x dst_stride */
+     :"+r" (dst), /* 0 */
+      "+r" (src)  /* 1: advanced by the asm, so not input-only */
+     :"r" (dst_ystride), /* 2 */
+      "r" (src_ystride) /* 3 */
+     :"memory", "esi","edi"
+    );
+  }
+  /*This needs to be removed when decode specific functions are implemented:*/
+  __asm__ __volatile__("emms\n\t");
+}

Added: experimental/derf/theora-exp/lib/x86/x86int.h
===================================================================
--- experimental/derf/theora-exp/lib/x86/x86int.h	2005-04-08 09:27:35 UTC (rev 9128)
+++ experimental/derf/theora-exp/lib/x86/x86int.h	2005-04-08 13:35:05 UTC (rev 9129)
@@ -0,0 +1,18 @@
+#if !defined(_x86_x86int_H)
+# define _x86_x86int_H (1)
+# include "../internal.h"
+
+void oc_state_vtable_init_x86(oc_theora_state *_state);
+
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_mmx(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue);
+void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
+ int _nfragis,int _dst_frame,int _src_frame,int _pli);
+void oc_restore_fpu_mmx(void);
+
+#endif

Added: experimental/derf/theora-exp/lib/x86/x86state.c
===================================================================
--- experimental/derf/theora-exp/lib/x86/x86state.c	2005-04-08 09:27:35 UTC (rev 9128)
+++ experimental/derf/theora-exp/lib/x86/x86state.c	2005-04-08 13:35:05 UTC (rev 9129)
@@ -0,0 +1,14 @@
+#include "x86int.h"
+#include "cpu.h"
+
+void oc_state_vtable_init_x86(oc_theora_state *_state){
+  /*Start from the portable C table, then override it when MMX is present.*/
+  oc_state_vtable_init_c(_state);
+  _state->cpu_flags=oc_cpu_flags_get();
+  if(!(_state->cpu_flags&OC_CPU_X86_MMX))return;
+  _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+  _state->opt_vtable.state_frag_copy=oc_state_frag_copy_mmx;
+  _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
+  _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
+  _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
+}

Modified: experimental/derf/theora-exp/unix/Makefile
===================================================================
--- experimental/derf/theora-exp/unix/Makefile	2005-04-08 09:27:35 UTC (rev 9128)
+++ experimental/derf/theora-exp/unix/Makefile	2005-04-08 13:35:05 UTC (rev 9129)
@@ -22,9 +22,10 @@
 # You may get speed increases by including flags such as -O2 or -O3 or
 #  -ffast-math, or additional flags, depending on your system and compiler.
 # The -g flag will generally include debugging information.
+#CFLAGS = -O3 -fforce-addr -fomit-frame-pointer -DOC_X86ASM
 CFLAGS = -g
 # Libraries to link with, and the location of library files.
-# Add -lpng -lz if you want to enable OC_DUMP_IMAGES.
+# Add -lpng -lz if you want to use -DOC_DUMP_IMAGES.
 LIBS = -logg -lvorbis -lvorbisenc `sdl-config --libs`
 
 # ANYTHING BELOW THIS LINE PROBABLY DOES NOT NEED EDITING
@@ -41,6 +42,10 @@
 internal.c \
 quant.c \
 state.c \
+x86/mmxstate.c \
+x86/x86state.c \
+x86/mmxfrag.c \
+x86/cpu.c \
 
 LIBTHEORABASE_CHEADERS = \
 dct.h \



More information about the commits mailing list