[xiph-commits] r12874 - in trunk/theora-exp: include/theora lib lib/x86

tterribe at svn.xiph.org
Sun Apr 15 18:41:05 PDT 2007


Author: tterribe
Date: 2007-04-15 18:41:05 -0700 (Sun, 15 Apr 2007)
New Revision: 12874

Modified:
   trunk/theora-exp/include/theora/codec.h
   trunk/theora-exp/lib/decode.c
   trunk/theora-exp/lib/encmsc.c
   trunk/theora-exp/lib/internal.h
   trunk/theora-exp/lib/state.c
   trunk/theora-exp/lib/x86/mmxfrag.c
   trunk/theora-exp/lib/x86/mmxidct.c
   trunk/theora-exp/lib/x86/mmxstate.c
   trunk/theora-exp/lib/x86/x86int.h
   trunk/theora-exp/lib/x86/x86state.c
Log:
Backport of r12873 to theora-exp.
Also includes some other things that have been sitting in my tree for a while:
 formatting cleanup from the theora_->th_ migration, and an initialization
 warning fix in the mode scheme chooser.
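
For reference, the main functional change backported from r12873 is the new
calling convention for the loop-filter helpers: callers now pass the base of
the 512-entry bounding-values array (the filter routines apply the +256 bias
internally), and the per-row filter is dispatched through the opt_vtable so an
MMX implementation can be substituted at runtime.  A minimal sketch of a
caller under the new convention (the surrounding state setup and the refi
value are assumed, not part of this patch):

  /*Sketch only: mirrors oc_state_loop_filter() in the state.c hunk below.*/
  static void loop_filter_frame_sketch(oc_theora_state *_state,int _refi){
    int bounding_values[512];
    int pli;
    /*Pass the array base; the +256 offset now happens inside the filter.*/
    if(oc_state_loop_filter_init(_state,bounding_values))return;
    for(pli=0;pli<3;pli++){
      /*Dispatches through _state->opt_vtable.state_loop_filter_frag_rows.*/
      oc_state_loop_filter_frag_rows(_state,bounding_values,
       _refi,pli,0,_state->fplanes[pli].nvfrags);
    }
  }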


Modified: trunk/theora-exp/include/theora/codec.h
===================================================================
--- trunk/theora-exp/include/theora/codec.h	2007-04-16 01:32:17 UTC (rev 12873)
+++ trunk/theora-exp/include/theora/codec.h	2007-04-16 01:41:05 UTC (rev 12874)
@@ -186,26 +186,26 @@
   /**\name Theora version
    * Bitstream version information.*/
   /*@{*/
-  unsigned char      version_major;
-  unsigned char      version_minor;
-  unsigned char      version_subminor;
+  unsigned char  version_major;
+  unsigned char  version_minor;
+  unsigned char  version_subminor;
   /*@}*/
   /**The encoded frame width.
    * This must be a multiple of 16, and less than 1048576.*/
-  ogg_uint32_t       frame_width;
+  ogg_uint32_t   frame_width;
   /**The encoded frame height.
    * This must be a multiple of 16, and less than 1048576.*/
-  ogg_uint32_t       frame_height;
+  ogg_uint32_t   frame_height;
   /**The displayed picture width.
    * This must be no larger than width.*/
-  ogg_uint32_t       pic_width;
+  ogg_uint32_t   pic_width;
   /**The displayed picture height.
    * This must be no larger than height.*/
-  ogg_uint32_t       pic_height;
+  ogg_uint32_t   pic_height;
   /**The X offset of the displayed picture.
    * This must be no larger than #frame_width-#pic_width or 255, whichever is
    *  smaller.*/
-  ogg_uint32_t       pic_x;
+  ogg_uint32_t   pic_x;
   /**The Y offset of the displayed picture.
    * This must be no larger than #frame_height-#pic_height, and
    *  #frame_height-#pic_height-#pic_y must be no larger than 255.
@@ -213,13 +213,13 @@
    *  specified from the top of the image for consistency with the standard
    *  graphics left-handed coordinate system used throughout this API, while it
    *  is stored in the encoded stream as an offset from the bottom.*/
-  ogg_uint32_t       pic_y;
+  ogg_uint32_t   pic_y;
   /**\name Frame rate
    * The frame rate, as a fraction.
    * If either is 0, the frame rate is undefined.*/
   /*@{*/
-  ogg_uint32_t       fps_numerator;
-  ogg_uint32_t       fps_denominator;
+  ogg_uint32_t   fps_numerator;
+  ogg_uint32_t   fps_denominator;
   /*@}*/
   /**\name Aspect ratio
    * The aspect ratio of the pixels.
@@ -230,8 +230,8 @@
    *  aspect_numerator*pic_width/(aspect_denominator*pic_height).
    * \endcode */
   /*@{*/
-  ogg_uint32_t       aspect_numerator;
-  ogg_uint32_t       aspect_denominator;
+  ogg_uint32_t   aspect_numerator;
+  ogg_uint32_t   aspect_denominator;
   /*@}*/
   /**The color space.*/
   th_colorspace  colorspace;
@@ -243,7 +243,7 @@
   /*TODO: Current encoder does not support CBR mode, or anything like it.
     We also don't really know what nominal rate each quality level
      corresponds to yet.*/
-  int                target_bitrate;
+  int            target_bitrate;
   /**The target quality level.
      Valid values range from 0 to 63, inclusive, with higher values giving
       higher quality.
@@ -263,7 +263,7 @@
      too large for the current bitstream to be able to store.
     We'd have to redesign the token syntax to store these large coefficients,
      which would make transcoding complex.*/
-  int                quality;
+  int            quality;
   /**The amount to shift to extract the last keyframe number from the granule
    *  position.
    * This can be at most 31.
@@ -277,7 +277,7 @@
    *  during encoding (for example, to force the next frame to be a keyframe),
    *  but it cannot be set larger than the amount permitted by this field after
    *  the headers have been output.*/
-  int                keyframe_granule_shift;
+  int            keyframe_granule_shift;
 }th_info;
 
 /**The comment information.
@@ -324,10 +324,10 @@
 /**A set of \a qi ranges.*/
 typedef struct{
   /**The number of ranges in the set.*/
-  int                      nranges;
+  int                  nranges;
   /**The size of each of the #nranges ranges.
      These must sum to 63.*/
-  const int               *sizes;
+  const int           *sizes;
   /**#nranges <tt>+1</tt> base matrices.
      Matrices \a i and <tt>i+1</tt> form the endpoints of range \a i.*/
   const th_quant_base *base_matrices;
@@ -392,11 +392,11 @@
    This is not required by the decoder.*/
 typedef struct{
   /**The DC scaling factors.*/
-  ogg_uint16_t        dc_scale[64];
+  ogg_uint16_t    dc_scale[64];
   /**The AC scaling factors.*/
-  ogg_uint16_t        ac_scale[64];
+  ogg_uint16_t    ac_scale[64];
   /**The loop filter limit values.*/
-  unsigned char       loop_filter_limits[64];
+  unsigned char   loop_filter_limits[64];
   /**The \a qi ranges for each \a ci and \a pli.*/
   th_quant_ranges qi_ranges[2][3];
 }th_quant_info;

Modified: trunk/theora-exp/lib/decode.c
===================================================================
--- trunk/theora-exp/lib/decode.c	2007-04-16 01:32:17 UTC (rev 12873)
+++ trunk/theora-exp/lib/decode.c	2007-04-16 01:41:05 UTC (rev 12874)
@@ -2031,7 +2031,7 @@
         if(pipe.loop_filter){
           sdelay+=notstart;
           edelay+=notdone;
-          oc_state_loop_filter_frag_rows(&_dec->state,pipe.bounding_values+256,
+          oc_state_loop_filter_frag_rows(&_dec->state,pipe.bounding_values,
            refi,pli,pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
         }
         /*To fill the borders, we have an additional two pixel delay, since a

Modified: trunk/theora-exp/lib/encmsc.c
===================================================================
--- trunk/theora-exp/lib/encmsc.c	2007-04-16 01:32:17 UTC (rev 12873)
+++ trunk/theora-exp/lib/encmsc.c	2007-04-16 01:41:05 UTC (rev 12874)
@@ -104,7 +104,6 @@
   si=1;
   best_bits+=mode_bits;
   do{
-    scheme1=_chooser->scheme_list[si];
     /*For any scheme except 0, we can just use the bit cost of the mode's rank
        in that scheme.*/
     if(scheme1!=0){
@@ -125,10 +124,10 @@
       scheme_bits=_chooser->scheme_bits[0]+OC_MODE_CODESA[ri].nbits;
     }
     if(scheme_bits<best_bits)best_bits=scheme_bits;
-    si++;
+    if(++si>=8)break;
+    scheme1=_chooser->scheme_list[si];
   }
-  while(si<8&&_chooser->scheme_bits[_chooser->scheme_list[si]]-
-   _chooser->scheme_bits[scheme0]<=6);
+  while(_chooser->scheme_bits[scheme1]-_chooser->scheme_bits[scheme0]<=6);
   return best_bits-_chooser->scheme_bits[scheme0];
 }
 

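The encmsc.c hunk above reorders the scheme-search loop so that the next
scheme_list entry is only read after the bound check on si, and so that the
while condition tests a value that has always been assigned; that is the
"initialization warning fix" mentioned in the log.  A standalone sketch of the
same loop shape, with made-up data in place of the real chooser state:

  #include <stdio.h>

  /*Sketch only: hypothetical scheme data, not the actual chooser tables.*/
  static const int scheme_list[8]={0,1,2,3,4,5,6,7};
  static const int scheme_bits[8]={10,12,13,20,40,41,42,43};

  int main(void){
    int best_bits;
    int scheme0;
    int scheme1;
    int si;
    scheme0=scheme_list[0];
    best_bits=scheme_bits[scheme0];
    si=1;
    scheme1=scheme_list[si];
    do{
      if(scheme_bits[scheme1]<best_bits)best_bits=scheme_bits[scheme1];
      /*Advance and bound-check BEFORE re-reading the list, so the while
        condition below never indexes past the end and always tests an
        assigned value.*/
      if(++si>=8)break;
      scheme1=scheme_list[si];
    }
    while(scheme_bits[scheme1]-scheme_bits[scheme0]<=6);
    printf("best_bits=%d\n",best_bits);
    return 0;
  }
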
Modified: trunk/theora-exp/lib/internal.h
===================================================================
--- trunk/theora-exp/lib/internal.h	2007-04-16 01:32:17 UTC (rev 12873)
+++ trunk/theora-exp/lib/internal.h	2007-04-16 01:41:05 UTC (rev 12874)
@@ -241,6 +241,8 @@
   void (*state_frag_recon)(oc_theora_state *_state,const oc_fragment *_frag,
    int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
    ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
+  void (*state_loop_filter_frag_rows)(oc_theora_state *_state,int *_bv,
+   int _refi,int _pli,int _fragy0,int _fragy_end);
   void (*restore_fpu)(void);
 }oc_base_opt_vtable;
 
@@ -391,8 +393,6 @@
 
 int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv);
 void oc_state_loop_filter(oc_theora_state *_state,int _frame);
-void oc_state_loop_filter_frag_rows(oc_theora_state *_state,int *_bv,
- int _refi,int _pli,int _fragy0,int _fragy_end);
 #if defined(OC_DUMP_IMAGES)
 int oc_state_dump_frame(const oc_theora_state *_state,int _frame,
  const char *_suf);
@@ -411,6 +411,8 @@
 void oc_state_frag_recon(oc_theora_state *_state,const oc_fragment *_frag,
  int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
+void oc_state_loop_filter_frag_rows(oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end);
 void oc_restore_fpu(const oc_theora_state *_state);
 
 /*Default pure-C implementations.*/
@@ -426,6 +428,8 @@
 void oc_state_frag_recon_c(oc_theora_state *_state,const oc_fragment *_frag,
  int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
+void oc_state_loop_filter_frag_rows_c(oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end);
 void oc_restore_fpu_c(void);
 
 #endif

Modified: trunk/theora-exp/lib/state.c
===================================================================
--- trunk/theora-exp/lib/state.c	2007-04-16 01:32:17 UTC (rev 12873)
+++ trunk/theora-exp/lib/state.c	2007-04-16 01:41:05 UTC (rev 12874)
@@ -513,6 +513,8 @@
   _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c;
   _state->opt_vtable.state_frag_copy=oc_state_frag_copy_c;
   _state->opt_vtable.state_frag_recon=oc_state_frag_recon_c;
+  _state->opt_vtable.state_loop_filter_frag_rows=
+   oc_state_loop_filter_frag_rows_c;
   _state->opt_vtable.restore_fpu=oc_restore_fpu_c;
 }
 
@@ -950,15 +952,12 @@
 
 /*Initialize the bounding values array used by the loop filter.
   _bv: Storage for the array.
-       The total array size should be 512, but this pointer should point to the
-         256th entry, as that is more convenient for the filter functions.
   Return: 0 on success, or a non-zero value if no filtering need be applied.*/
 int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv){
   int flimit;
   int i;
   flimit=_state->loop_filter_limits[_state->qis[0]];
   if(flimit==0)return 1;
-  _bv-=256;
   memset(_bv,0,sizeof(_bv[0])*512);
   for(i=0;i<flimit;i++){
     _bv[256-i-flimit]=i-flimit;
@@ -979,7 +978,13 @@
   _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
 void oc_state_loop_filter_frag_rows(oc_theora_state *_state,int *_bv,
  int _refi,int _pli,int _fragy0,int _fragy_end){
-  th_img_plane  *iplane;
+  _state->opt_vtable.state_loop_filter_frag_rows(_state,_bv,_refi,_pli,
+   _fragy0,_fragy_end);
+}
+
+void oc_state_loop_filter_frag_rows_c(oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end){
+  th_img_plane      *iplane;
   oc_fragment_plane *fplane;
   oc_fragment       *frag_top;
   oc_fragment       *frag0;
@@ -987,6 +992,7 @@
   oc_fragment       *frag_end;
   oc_fragment       *frag0_end;
   oc_fragment       *frag_bot;
+  _bv+=256;
   iplane=_state->ref_frame_bufs[_refi]+_pli;
   fplane=_state->fplanes+_pli;
   /*The following loops are constructed somewhat non-intuitively on purpose.
@@ -1029,9 +1035,9 @@
   int framei;
   int pli;
   framei=_state->ref_frame_idx[_frame];
-  if(oc_state_loop_filter_init(_state,bounding_values+256))return;
+  if(oc_state_loop_filter_init(_state,bounding_values))return;
   for(pli=0;pli<3;pli++){
-    oc_state_loop_filter_frag_rows(_state,bounding_values+256,
+    oc_state_loop_filter_frag_rows(_state,bounding_values,
      framei,pli,0,_state->fplanes[pli].nvfrags);
   }
 }

Modified: trunk/theora-exp/lib/x86/mmxfrag.c
===================================================================
--- trunk/theora-exp/lib/x86/mmxfrag.c	2007-04-16 01:32:17 UTC (rev 12873)
+++ trunk/theora-exp/lib/x86/mmxfrag.c	2007-04-16 01:41:05 UTC (rev 12874)
@@ -16,158 +16,220 @@
 
 #if defined(OC_X86ASM)
 
-static const __attribute__((aligned(8),used)) ogg_int64_t V128=
+static const __attribute__((aligned(8),used)) ogg_int64_t OC_V128=
  0x0080008000800080LL;
 
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
  const ogg_int16_t *_residue){
-  __asm__ __volatile__(
-   "  mov          $0x7, %%ecx  \n\t" /* 8x loop */
-   "  .p2align 4                \n\t"
-   "1:movq           %3, %%mm0  \n\t" /* Set mm0 to 0x0080008000800080 */
-   "  movq         (%1), %%mm2  \n\t" /* First four input values */
-   "  movq        %%mm0, %%mm1  \n\t" /* Set mm1 == mm0 */
-   "  movq        8(%1), %%mm3  \n\t" /* Next four input values */
-   "  decl      %%ecx           \n\t" /* dec counter */
-   "  paddsw      %%mm3, %%mm1  \n\t" /* add+128 and saturate to 16bit */
-   "  lea      0x10(%1), %1     \n\t" /*_residuo+16 */
-   "  paddsw      %%mm2, %%mm0  \n\t" /* add+128 and saturate to 16bit   */
-   "  packuswb    %%mm1, %%mm0  \n\t" /* pack saturate with next(high) four values */
-   "  movq      %%mm0, (%0)     \n\t" /* writeback */
-   "  lea         (%0,%2), %0   \n\t" /*_dst+_dst_ystride */
-   "  jns 1b                    \n\t" /* loop */
-   :"+r" (_dst)
-   :"r" (_residue),
-    "r" ((long)_dst_ystride),
-    "m" (V128)
-   :"memory", "ecx", "cc"
-  );
+  int i;
+  for(i=8;i-->0;){
+    __asm__ __volatile__(
+      /*Set mm0 to 0x0080008000800080.*/
+      "movq %[OC_V128],%%mm0\n\t"
+      /*First four input values*/
+      "movq (%[residue]),%%mm2\n\t"
+      /*Set mm1=mm0.*/
+      "movq %%mm0,%%mm1\n\t"
+      /*Next four input values.*/
+      "movq 8(%[residue]),%%mm3\n\t"
+      /*Add 128 and saturate to 16 bits.*/
+      "paddsw %%mm3,%%mm1\n\t"
+      /*_residue+=16*/
+      "lea 0x10(%[residue]),%[residue]\n\t"
+      /*Add 128 and saturate to 16 bits.*/
+      "paddsw %%mm2,%%mm0\n\t"
+      /*Pack saturate with next(high) four values.*/
+      "packuswb %%mm1,%%mm0\n\t"
+      /*Writeback.*/
+      "movq %%mm0,(%[dst])\n\t"
+      /*_dst+=_dst_ystride*/
+      "lea  (%[dst],%[dst_ystride]),%[dst]\n\t"
+      :[dst]"+r"(_dst),[residue]"+r"(_residue)
+      :[dst_ystride]"r"((long)_dst_ystride),[OC_V128]"m"(OC_V128)
+      :"memory"
+    );
+  }
 }
 
 void oc_frag_recon_inter_mmx(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue){
   int i;
-  __asm__ __volatile__(
-   "  movl         $0x7,   %%eax   \n\t" /* 8x loop */
-   "  pxor         %%mm0,  %%mm0   \n\t" /* zero mm0  */
-   "  .p2align 4                   \n\t"
-   "1: movq        (%4),   %%mm2   \n\t" /* load mm2 with _src */
-   "  movq         %%mm2,  %%mm3   \n\t" /* copy mm2 to mm3 */
-   "  punpckhbw    %%mm0,  %%mm2   \n\t" /* expand high part of _src to 16 bits */
-   "  punpcklbw    %%mm0,  %%mm3   \n\t" /* expand low part of _src to 16 bits */
-   "  paddsw       (%1),   %%mm3   \n\t" /* add low part with low part of residue */
-   "  paddsw       8(%1),  %%mm2   \n\t" /* high with high */
-   "  packuswb     %%mm2,  %%mm3   \n\t" /* pack and saturate to mm3 */
-   "  lea         (%4,%3), %4      \n\t" /* _src+_src_ystride */
-   "  lea         0x10(%1), %1     \n\t" /* _residuo+16 */
-   "  movq        %%mm3,   (%0)    \n\t" /* put mm3 to dest */
-   "  lea         (%0,%2),%0       \n\t" /* _dst+_dst_ystride */
-   "  decl        %%eax            \n\t" /* dec counter */
-   "  jns         1b               \n\t" /* loop */
-   :"+r" (_dst)
-   :"r" (_residue), 
-    "r" ((long)_dst_ystride),
-    "r" ((long)_src_ystride),
-    "r" (_src)
-   :"memory", "eax", "cc"
-  );
+  /*Zero mm0.*/
+  __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
+  for(i=8;i-->0;){
+    __asm__ __volatile__(
+      /*Load mm2 with _src*/
+      "movq (%[src]),%%mm2\n\t"
+      /*Copy mm2 to mm3.*/
+      "movq %%mm2,%%mm3\n\t"
+      /*Expand high part of _src to 16 bits.*/
+      "punpckhbw %%mm0,%%mm2\n\t"
+      /*Expand low part of _src to 16 bits.*/
+      "punpcklbw %%mm0,%%mm3\n\t"
+      /*Add low part with low part of residue.*/
+      "paddsw (%[residue]),%%mm3\n\t"
+      /*High with high.*/
+      "paddsw 8(%[residue]),%%mm2\n\t"
+      /*Pack and saturate to mm3.*/
+      "packuswb %%mm2,%%mm3\n\t"
+      /*_src+=_src_ystride*/
+      "lea (%[src],%[src_ystride]),%[src]\n\t"
+      /*_residue+=16*/
+      "lea 0x10(%[residue]),%[residue]\n\t"
+      /*Put mm3 to dest.*/
+      "movq %%mm3,(%[dst])\n\t"
+      /*_dst+=_dst_ystride*/
+      "lea (%[dst],%[dst_ystride]),%[dst]\n\t"
+      :[dst]"+r"(_dst),[src]"+r"(_src),[residue]"+r"(_residue)
+      :[dst_ystride]"r"((long)_dst_ystride),
+       [src_ystride]"r"((long)_src_ystride)
+      :"memory"
+    );
+  }
 }
 
-#if (defined(__amd64__) ||  defined(__x86_64__))
+#if defined(__amd64__)||defined(__x86_64__)
 
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
  int _src2_ystride,const ogg_int16_t *_residue){
-
+  int i;
   __asm__ __volatile__(
-   "  movl         $0x7,   %%eax   \n\t" /* 8x loop */
-   "  pxor         %%mm0,  %%mm0   \n\t" /* zero mm0 */
-   "  movq         (%4),   %%mm2   \n\t" /* load mm2 with _src1 */
-   "  .p2align 4                   \n\t"
-   "1:movq         (%6),   %%mm4   \n\t" /* packed SRC2 */ 
-   "  movq         %%mm2,  %%mm3   \n\t" /* copy to mm3 */
-   "  movq         %%mm4,  %%mm5   \n\t" /* copy packed src2 to mm5 */
-   "  punpcklbw    %%mm0,  %%mm2   \n\t" /* expand low part of src1 to mm2 */
-   "  punpcklbw    %%mm0,  %%mm4   \n\t" /* low part expand of src2 to mm4 */
-   "  lea          (%4,%3), %4     \n\t" /*  _src1+_src1_ystride */
-   "  punpckhbw    %%mm0,  %%mm3   \n\t" /* expand high part of src1 to mm3 */
-   "  punpckhbw    %%mm0,  %%mm5   \n\t" /* high part expand of src2 to mm5 */
-   "  paddsw       %%mm2,  %%mm4   \n\t" /* add low parts of src1 and src2 */
-   "  paddsw       %%mm3,  %%mm5   \n\t" /* add high parts of src1 and src2 */
-   "  lea          (%6,%5), %6     \n\t" /* _src2+_src2_ystride */  
-   "  movq         (%4), %%mm2     \n\t" /* load mm2 with _src1 */
-   "  psrlw        $1,     %%mm4   \n\t" /* shift logical 1 to right o 2 dolu */
-   "  psrlw        $1,     %%mm5   \n\t" /* shift logical 1 to right */
-   "  paddsw       (%1),   %%mm4   \n\t" /* add low parts wwith low parts */
-   "  paddsw       8(%1),  %%mm5   \n\t" /* add highparts with high */
-   "  packuswb     %%mm5,  %%mm4   \n\t" /* pack saturate high to low */
-   "  lea          0x10(%1), %1    \n\t" /* _residuo+16 */
-   "  movq         %%mm4, (%0)     \n\t" /* write to src */
-   "  decl         %%eax           \n\t"
-   "  lea          (%0,%2), %0     \n\t" /* _dst+_dst_ystride */
-   "  jns          1b\n\t"
-   :"+r" (_dst) /* 0 */
-   :"r" (_residue), /* 1 */
-    "r" ((long)_dst_ystride), /* 2 */
-    "r" ((long)_src1_ystride), /* 3 */
-    "r" (_src1), /* 4 */
-    "r" ((long)_src2_ystride), /* 5 */
-    "r" (_src2) /* 6 */
-   : "memory", "cc", "eax"
+    /*Zero mm0.*/
+    "pxor %%mm0,%%mm0\n\t"
+    /*Load mm2 with _src1.*/
+    "movq (%[src1]),%%mm2\n\t"
+    :[src1]"+r"(_src1)
+    :
   );
+  for(i=8;i-->0;){
+    __asm__ __volatile__(
+     /*Packed _src2.*/ 
+     "movq (%[src2]),%%mm4\n\t"
+     /*Copy packed src1 to mm3.*/
+     "movq %%mm2,%%mm3\n\t"
+     /*Copy packed src2 to mm5.*/
+     "movq %%mm4,%%mm5\n\t"
+     /*Expand low part of src1 to mm2.*/
+     "punpcklbw %%mm0,%%mm2\n\t"
+     /*Expand low part of src2 to mm4.*/
+     "punpcklbw %%mm0,%%mm4\n\t"
+     /*_src1+=_src1_ystride*/
+     "lea (%[src1],%[src1_ystride]),%[src1]\n\t"
+     /*Expand high part of src1 to mm3.*/
+     "punpckhbw %%mm0,%%mm3\n\t"
+     /*Expand high part of src2 to mm5.*/
+     "punpckhbw %%mm0,%%mm5\n\t"
+     /*Add low parts of src1 and src2.*/
+     "paddsw %%mm2,%%mm4\n\t"
+     /*Add high parts of src1 and src2.*/
+     "paddsw %%mm3,%%mm5\n\t"
+     /*_src2+=_src2_ystride.*/
+     "lea (%[src2],%[src2_ystride]),%[src2]\n\t"
+     /*Load mm2 with _src1.*/
+     "movq (%[src1]),%%mm2\n\t"
+     /*Shift logical right by 1: average of src1 and src2 (low part).*/
+     "psrlw $1,%%mm4\n\t"
+     /*Shift logical right by 1: average of src1 and src2 (high part).*/
+     "psrlw $1,%%mm5\n\t"
+     /*Add the low part of the residue.*/
+     "paddsw (%[residue]),%%mm4\n\t"
+     /*Add the high part of the residue.*/
+     "paddsw 8(%[residue]),%%mm5\n\t"
+     /*Pack saturate high to low.*/
+     "packuswb %%mm5,%%mm4\n\t"
+     /*_residue+=16.*/
+     "lea 0x10(%[residue]),%[residue]\n\t"
+     /*Write to dst.*/
+     "movq %%mm4,(%[dst])\n\t"
+     /*_dst+=_dst_ystride*/
+     "lea (%[dst],%[dst_ystride]),%[dst]\n\t"
+     :[dst]"+r"(_dst),[residue]"+r"(_residue),
+      [src1]"+r"(_src1),[src2]"+r"(_src2)
+     :[dst_ystride]"r"((long)_dst_ystride),
+      [src1_ystride]"r"((long)_src1_ystride),
+      [src2_ystride]"r"((long)_src2_ystride)
+     :"memory"
+    );
+  }
 }
+
 #else
 
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
  int _src2_ystride,const ogg_int16_t *_residue){
-  int i;
+  long a;
+  int  i;
   __asm__ __volatile__(
-   "  movl         $0x7,   %7      \n\t" /* 8x loop */
-   "  pxor         %%mm0,  %%mm0   \n\t" /* zero mm0 */
-   "  movq         (%4),   %%mm2   \n\t" /* load mm2 with _src1 */
-   "  .p2align 4                   \n\t"
-   "1: movq        (%6),   %%mm4   \n\t" /* packed SRC2 */ 
-   "  movq         %%mm2,  %%mm3   \n\t" /* copy to mm3 */
-   "  movq         %%mm4,  %%mm5   \n\t" /* copy packed src2 to mm5 */
-   "  mov          %3,     %%eax   \n\t"
-   "  punpcklbw    %%mm0,  %%mm2   \n\t" /* expand low part of src1 to mm2 */
-   "  punpcklbw    %%mm0,  %%mm4   \n\t" /* low part expand of src2 to mm4 */
-   "  lea          (%4,%%eax), %4  \n\t" /*  _src1+_src1_ystride */
-   "  punpckhbw    %%mm0,  %%mm3   \n\t" /* expand high part of src1 to mm3 */
-   "  punpckhbw    %%mm0,  %%mm5   \n\t" /* high part expand of src2 to mm5 */
-   "  mov          %5,     %%eax   \n\t"
-   "  paddsw       %%mm2,  %%mm4   \n\t" /* add low parts of src1 and src2 */
-   "  paddsw       %%mm3,  %%mm5   \n\t" /* add high parts of src1 and src2 */
-   "  lea          (%6,%%eax), %6  \n\t" /* _src2+_src2_ystride */  
-   "  movq         (%4), %%mm2     \n\t" /* load mm2 with _src1 */
-   "  psrlw        $1,     %%mm4   \n\t" /* shift logical 1 to right o 2 dolu */
-   "  psrlw        $1,     %%mm5   \n\t" /* shift logical 1 to right */
-   "  paddsw       (%1),   %%mm4   \n\t" /* add low parts wwith low parts */
-   "  paddsw       8(%1),  %%mm5   \n\t" /* add highparts with high */
-   "  packuswb     %%mm5,  %%mm4   \n\t" /* pack saturate high to low */
-   "  lea          0x10(%1), %1    \n\t" /* _residuo+16 */
-   "  movq         %%mm4, (%0)     \n\t" /* write to src */
-   "  decl         %7              \n\t"
-   "  lea          (%0,%2), %0     \n\t" /* _dst+_dst_ystride */
-   "  jns          1b\n\t"
-   :"+r" (_dst) /* 0 */
-   :"r" (_residue), /* 1 */
-    "r" (_dst_ystride), /* 2 */
-    "m" (_src1_ystride), /* 3 */
-    "r" (_src1), /* 4 */
-    "m" (_src2_ystride), /* 5 */
-    "r" (_src2), /* 6 */
-    "m" (i)
-   :"memory", "eax", "cc"
+    /*Zero mm0.*/
+    "pxor %%mm0,%%mm0\n\t"
+    /*Load mm2 with _src1.*/
+    "movq (%[src1]),%%mm2\n\t"
+    :[src1]"+r"(_src1)
+    :
   );
+  for(i=8;i-->0;){
+    __asm__ __volatile__(
+     /*Packed _src2.*/ 
+     "movq (%[src2]),%%mm4\n\t"
+     /*Copy packed src1 to mm3.*/
+     "movq %%mm2,%%mm3\n\t"
+     /*Copy packed src2 to mm5.*/
+     "movq %%mm4,%%mm5\n\t"
+     /*eax=_src1_ystride*/
+     "mov %[src1_ystride],%[a]\n\t"
+     /*Expand low part of src1 to mm2.*/
+     "punpcklbw %%mm0,%%mm2\n\t"
+     /*Expand low part of src2 to mm4.*/
+     "punpcklbw %%mm0,%%mm4\n\t"
+     /*_src1+=_src1_ystride*/
+     "lea (%[src1],%[a]),%[src1]\n\t"
+     /*Expand high part of src1 to mm3.*/
+     "punpckhbw %%mm0,%%mm3\n\t"
+     /*Expand high part of src2 to mm5.*/
+     "punpckhbw %%mm0,%%mm5\n\t"
+     /*eax=_src2_ystride*/
+     "mov %[src2_ystride],%[a]\n\t"
+     /*Add low parts of src1 and src2.*/
+     "paddsw %%mm2,%%mm4\n\t"
+     /*Add high parts of src1 and src2.*/
+     "paddsw %%mm3,%%mm5\n\t"
+     /*_src2+=_src2_ystride.*/
+     "lea (%[src2],%[a]),%[src2]\n\t"
+     /*Load mm2 with _src1.*/
+     "movq (%[src1]),%%mm2\n\t"
+     /*Shift logical right by 1: average of src1 and src2 (low part).*/
+     "psrlw $1,%%mm4\n\t"
+     /*Shift logical right by 1: average of src1 and src2 (high part).*/
+     "psrlw $1,%%mm5\n\t"
+     /*Add the low part of the residue.*/
+     "paddsw (%[residue]),%%mm4\n\t"
+     /*Add the high part of the residue.*/
+     "paddsw 8(%[residue]),%%mm5\n\t"
+     /*eax=_dst_ystride.*/
+     "mov %[dst_ystride],%[a]\n\t"
+     /*Pack saturate high to low.*/
+     "packuswb %%mm5,%%mm4\n\t"
+     /*_residue+=16.*/
+     "lea 0x10(%[residue]),%[residue]\n\t"
+     /*Write to dst.*/
+     "movq %%mm4,(%[dst])\n\t"
+     /*_dst+=_dst_ystride*/
+     "lea (%[dst],%[a]),%[dst]\n\t"
+     :[a]"=&a"(a),[dst]"+r"(_dst),[residue]"+r"(_residue),
+      [src1]"+r"(_src1),[src2]"+r"(_src2)
+     :[dst_ystride]"m"((long)_dst_ystride),
+      [src1_ystride]"m"((long)_src1_ystride),
+      [src2_ystride]"m"((long)_src2_ystride)
+     :"memory"
+    );
+  }
 }
 
 #endif
 
 void oc_restore_fpu_mmx(void){
-  __asm__ __volatile__(
-   "  emms    \n\t" /* pack with next(high) four values */
-  );
+  __asm__ __volatile__("emms\n\t");
 }
 #endif
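
The rewritten MMX fragment-reconstruction routines above keep the same
per-pixel arithmetic as before; mainly the loop control and operand naming
changed.  As a reading aid, a scalar sketch of what the intra case computes
(plain C, not code from this tree): each residue value is biased by 128 and
clamped to 0..255, which is exactly the paddsw against OC_V128 followed by
packuswb, eight pixels per row.

  /*Sketch only: scalar equivalent of oc_frag_recon_intra_mmx() above.*/
  static void frag_recon_intra_sketch(unsigned char *_dst,int _dst_ystride,
   const short *_residue){
    int i;
    int j;
    for(i=0;i<8;i++){
      for(j=0;j<8;j++){
        /*paddsw with 0x0080 words adds the 128 bias; packuswb clamps the
          result to the 0..255 output range.*/
        int v;
        v=_residue[i*8+j]+128;
        if(v<0)v=0;
        else if(v>255)v=255;
        _dst[j]=(unsigned char)v;
      }
      _dst+=_dst_ystride;
    }
  }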

Modified: trunk/theora-exp/lib/x86/mmxidct.c
===================================================================
--- trunk/theora-exp/lib/x86/mmxidct.c	2007-04-16 01:32:17 UTC (rev 12873)
+++ trunk/theora-exp/lib/x86/mmxidct.c	2007-04-16 01:41:05 UTC (rev 12874)
@@ -19,7 +19,8 @@
 
 
 /*A table of constants used by the MMX routines.*/
-ogg_uint16_t __attribute__((aligned(8),used)) OC_IDCT_CONSTS[(4+7+1)*4]={
+static const ogg_uint16_t __attribute__((aligned(8),used))
+ OC_IDCT_CONSTS[(4+7+1)*4]={
   65535,    0,    0,    0,
       0,65535,    0,    0,
       0,    0,65535,    0,
@@ -41,416 +42,484 @@
       8,    8,    8,    8
 };
 
-/*Converts the expression in the argument to a sting.*/
+/*Converts the expression in the argument to a string.*/
 #define OC_M2STR(_s) #_s
 
 /*38 cycles*/
 #define OC_IDCT_BEGIN \
- "  #OC_IDCT_BEGIN\n\t" \
- "  movq   "OC_I(3)",     %mm2\n\t" \
- "  movq   "OC_C(3)",     %mm6\n\t" \
- "  movq        %mm2,     %mm4\n\t" \
- "  movq   "OC_J(5)",     %mm7\n\t" \
- "  pmulhw      %mm6,     %mm4\n\t" \
- "  movq   "OC_C(5)",     %mm1\n\t" \
- "  pmulhw      %mm7,     %mm6\n\t" \
- "  movq        %mm1,     %mm5\n\t" \
- "  pmulhw      %mm2,     %mm1\n\t" \
- "  movq   "OC_I(1)",     %mm3\n\t" \
- "  pmulhw      %mm7,     %mm5\n\t" \
- "  movq   "OC_C(1)",     %mm0\n\t" \
- "  paddw       %mm2,     %mm4\n\t" \
- "  paddw       %mm7,     %mm6\n\t" \
- "  paddw       %mm1,     %mm2\n\t" \
- "  movq   "OC_J(7)",     %mm1\n\t" \
- "  paddw       %mm5,     %mm7\n\t" \
- "  movq        %mm0,     %mm5\n\t" \
- "  pmulhw      %mm3,     %mm0\n\t" \
- "  paddw       %mm7,     %mm4\n\t" \
- "  pmulhw      %mm1,     %mm5\n\t" \
- "  movq   "OC_C(7)",     %mm7\n\t" \
- "  psubw       %mm2,     %mm6\n\t" \
- "  paddw       %mm3,     %mm0\n\t" \
- "  pmulhw      %mm7,     %mm3\n\t" \
- "  movq   "OC_I(2)",     %mm2\n\t" \
- "  pmulhw      %mm1,     %mm7\n\t" \
- "  paddw       %mm1,     %mm5\n\t" \
- "  movq        %mm2,     %mm1\n\t" \
- "  pmulhw "OC_C(2)",     %mm2\n\t" \
- "  psubw       %mm5,     %mm3\n\t" \
- "  movq   "OC_J(6)",     %mm5\n\t" \
- "  paddw       %mm7,     %mm0\n\t" \
- "  movq        %mm5,     %mm7\n\t" \
- "  psubw       %mm4,     %mm0\n\t" \
- "  pmulhw "OC_C(2)",     %mm5\n\t" \
- "  paddw       %mm1,     %mm2\n\t" \
- "  pmulhw "OC_C(6)",     %mm1\n\t" \
- "  paddw       %mm4,     %mm4\n\t" \
- "  paddw       %mm0,     %mm4\n\t" \
- "  psubw       %mm6,     %mm3\n\t" \
- "  paddw       %mm7,     %mm5\n\t" \
- "  paddw       %mm6,     %mm6\n\t" \
- "  pmulhw "OC_C(6)",     %mm7\n\t" \
- "  paddw       %mm3,     %mm6\n\t" \
- "  movq        %mm4,"OC_I(1)"\n\t" \
- "  psubw       %mm5,     %mm1\n\t" \
- "  movq   "OC_C(4)",     %mm4\n\t" \
- "  movq        %mm3,     %mm5\n\t" \
- "  pmulhw      %mm4,     %mm3\n\t" \
- "  paddw       %mm2,     %mm7\n\t" \
- "  movq        %mm6,"OC_I(2)"\n\t" \
- "  movq        %mm0,     %mm2\n\t" \
- "  movq   "OC_I(0)",     %mm6\n\t" \
- "  pmulhw      %mm4,     %mm0\n\t" \
- "  paddw       %mm3,     %mm5\n\t" \
- "  movq   "OC_J(4)",     %mm3\n\t" \
- "  psubw       %mm1,     %mm5\n\t" \
- "  paddw       %mm0,     %mm2\n\t" \
- "  psubw       %mm3,     %mm6\n\t" \
- "  movq        %mm6,     %mm0\n\t" \
- "  pmulhw      %mm4,     %mm6\n\t" \
- "  paddw       %mm3,     %mm3\n\t" \
- "  paddw       %mm1,     %mm1\n\t" \
- "  paddw       %mm0,     %mm3\n\t" \
- "  paddw       %mm5,     %mm1\n\t" \
- "  pmulhw      %mm3,     %mm4\n\t" \
- "  paddw       %mm0,     %mm6\n\t" \
- "  psubw       %mm2,     %mm6\n\t" \
- "  paddw       %mm2,     %mm2\n\t" \
- "  movq   "OC_I(1)",     %mm0\n\t" \
- "  paddw       %mm6,     %mm2\n\t" \
- "  paddw       %mm3,     %mm4\n\t" \
- "  psubw       %mm1,     %mm2\n\t" \
- "#end OC_IDCT_BEGIN\n\t"
+  "#OC_IDCT_BEGIN\n\t" \
+  "movq "OC_I(3)",%%mm2\n\t" \
+  "movq "OC_C(3)",%%mm6\n\t" \
+  "movq %%mm2,%%mm4\n\t" \
+  "movq "OC_J(5)",%%mm7\n\t" \
+  "pmulhw %%mm6,%%mm4\n\t" \
+  "movq "OC_C(5)",%%mm1\n\t" \
+  "pmulhw %%mm7,%%mm6\n\t" \
+  "movq %%mm1,%%mm5\n\t" \
+  "pmulhw %%mm2,%%mm1\n\t" \
+  "movq "OC_I(1)",%%mm3\n\t" \
+  "pmulhw %%mm7,%%mm5\n\t" \
+  "movq "OC_C(1)",%%mm0\n\t" \
+  "paddw %%mm2,%%mm4\n\t" \
+  "paddw %%mm7,%%mm6\n\t" \
+  "paddw %%mm1,%%mm2\n\t" \
+  "movq "OC_J(7)",%%mm1\n\t" \
+  "paddw %%mm5,%%mm7\n\t" \
+  "movq %%mm0,%%mm5\n\t" \
+  "pmulhw %%mm3,%%mm0\n\t" \
+  "paddw %%mm7,%%mm4\n\t" \
+  "pmulhw %%mm1,%%mm5\n\t" \
+  "movq "OC_C(7)",%%mm7\n\t" \
+  "psubw %%mm2,%%mm6\n\t" \
+  "paddw %%mm3,%%mm0\n\t" \
+  "pmulhw %%mm7,%%mm3\n\t" \
+  "movq "OC_I(2)",%%mm2\n\t" \
+  "pmulhw %%mm1,%%mm7\n\t" \
+  "paddw %%mm1,%%mm5\n\t" \
+  "movq %%mm2,%%mm1\n\t" \
+  "pmulhw "OC_C(2)",%%mm2\n\t" \
+  "psubw %%mm5,%%mm3\n\t" \
+  "movq "OC_J(6)",%%mm5\n\t" \
+  "paddw %%mm7,%%mm0\n\t" \
+  "movq %%mm5,%%mm7\n\t" \
+  "psubw %%mm4,%%mm0\n\t" \
+  "pmulhw "OC_C(2)",%%mm5\n\t" \
+  "paddw %%mm1,%%mm2\n\t" \
+  "pmulhw "OC_C(6)",%%mm1\n\t" \
+  "paddw %%mm4,%%mm4\n\t" \
+  "paddw %%mm0,%%mm4\n\t" \
+  "psubw %%mm6,%%mm3\n\t" \
+  "paddw %%mm7,%%mm5\n\t" \
+  "paddw %%mm6,%%mm6\n\t" \
+  "pmulhw "OC_C(6)",%%mm7\n\t" \
+  "paddw %%mm3,%%mm6\n\t" \
+  "movq %%mm4,"OC_I(1)"\n\t" \
+  "psubw %%mm5,%%mm1\n\t" \
+  "movq "OC_C(4)",%%mm4\n\t" \
+  "movq %%mm3,%%mm5\n\t" \
+  "pmulhw %%mm4,%%mm3\n\t" \
+  "paddw %%mm2,%%mm7\n\t" \
+  "movq %%mm6,"OC_I(2)"\n\t" \
+  "movq %%mm0,%%mm2\n\t" \
+  "movq "OC_I(0)",%%mm6\n\t" \
+  "pmulhw %%mm4,%%mm0\n\t" \
+  "paddw %%mm3,%%mm5\n\t" \
+  "movq "OC_J(4)",%%mm3\n\t" \
+  "psubw %%mm1,%%mm5\n\t" \
+  "paddw %%mm0,%%mm2\n\t" \
+  "psubw %%mm3,%%mm6\n\t" \
+  "movq %%mm6,%%mm0\n\t" \
+  "pmulhw %%mm4,%%mm6\n\t" \
+  "paddw %%mm3,%%mm3\n\t" \
+  "paddw %%mm1,%%mm1\n\t" \
+  "paddw %%mm0,%%mm3\n\t" \
+  "paddw %%mm5,%%mm1\n\t" \
+  "pmulhw %%mm3,%%mm4\n\t" \
+  "paddw %%mm0,%%mm6\n\t" \
+  "psubw %%mm2,%%mm6\n\t" \
+  "paddw %%mm2,%%mm2\n\t" \
+  "movq "OC_I(1)",%%mm0\n\t" \
+  "paddw %%mm6,%%mm2\n\t" \
+  "paddw %%mm3,%%mm4\n\t" \
+  "psubw %%mm1,%%mm2\n\t" \
+  "#end OC_IDCT_BEGIN\n\t" \
 
 /*38+8=46 cycles.*/
-#define OC_ROW_IDCT __asm__ __volatile__( \
- "  #OC_ROW_IDCT\n" \
- OC_IDCT_BEGIN \
- "  movq   "OC_I(2)",     %mm3\n\t"  /* r3 = D. */ \
- "  psubw       %mm7,     %mm4\n\t"  /* r4 = E. = E - G */ \
- "  paddw       %mm1,     %mm1\n\t"  /* r1 = H. + H. */ \
- "  paddw       %mm7,     %mm7\n\t"  /* r7 = G + G */ \
- "  paddw       %mm2,     %mm1\n\t"  /* r1 = R1 = A.. + H. */ \
- "  paddw       %mm4,     %mm7\n\t"  /* r7 = G. = E + G */ \
- "  psubw       %mm3,     %mm4\n\t"  /* r4 = R4 = E. - D. */ \
- "  paddw       %mm3,     %mm3\n\t" \
- "  psubw       %mm5,     %mm6\n\t"  /* r6 = R6 = F. - B.. */ \
- "  paddw       %mm5,     %mm5\n\t" \
- "  paddw       %mm4,     %mm3\n\t"  /* r3 = R3 = E. + D. */ \
- "  paddw       %mm6,     %mm5\n\t"  /* r5 = R5 = F. + B.. */ \
- "  psubw       %mm0,     %mm7\n\t"  /* r7 = R7 = G. - C. */ \
- "  paddw       %mm0,     %mm0\n\t" \
- "  movq        %mm1,"OC_I(1)"\n\t"  /* save R1 */ \
- "  paddw       %mm7,     %mm0\n\t" /* r0 = R0 = G. + C. */ \
- "#end OC_ROW_IDCT\n\t" \
-)
+#define OC_ROW_IDCT \
+  "#OC\n" \
+  OC_IDCT_BEGIN \
+  /*r3=D'*/ \
+  "movq "OC_I(2)",%%mm3\n\t" \
+  /*r4=E'=E-G*/ \
+  "psubw %%mm7,%%mm4\n\t" \
+  /*r1=H'+H'*/ \
+  "paddw %%mm1,%%mm1\n\t" \
+  /*r7=G+G*/ \
+  "paddw %%mm7,%%mm7\n\t" \
+  /*r1=R1=A''+H'*/ \
+  "paddw %%mm2,%%mm1\n\t" \
+  /*r7=G'=E+G*/ \
+  "paddw %%mm4,%%mm7\n\t" \
+  /*r4=R4=E'-D'*/ \
+  "psubw %%mm3,%%mm4\n\t" \
+  "paddw %%mm3,%%mm3\n\t" \
+  /*r6=R6=F'-B''*/ \
+  "psubw %%mm5,%%mm6\n\t" \
+  "paddw %%mm5,%%mm5\n\t" \
+  /*r3=R3=E'+D'*/ \
+  "paddw %%mm4,%%mm3\n\t" \
+  /*r5=R5=F'+B''*/ \
+  "paddw %%mm6,%%mm5\n\t" \
+  /*r7=R7=G'-C'*/ \
+  "psubw %%mm0,%%mm7\n\t" \
+  "paddw %%mm0,%%mm0\n\t" \
+  /*Save R1.*/ \
+  "movq %%mm1,"OC_I(1)"\n\t" \
+  /*r0=R0=G'+C'*/ \
+  "paddw %%mm7,%%mm0\n\t" \
+  "#end OC_ROW_IDCT\n\t" \
 
-/* The following macro does two 4x4 transposes in place.
-   At entry, we assume:
-     r0 = a3 a2 a1 a0
-   I(1) = b3 b2 b1 b0
-     r2 = c3 c2 c1 c0
-     r3 = d3 d2 d1 d0
+/*The following macro does two 4x4 transposes in place.
+  At entry, we assume:
+    r0 = a3 a2 a1 a0
+  I(1) = b3 b2 b1 b0
+    r2 = c3 c2 c1 c0
+    r3 = d3 d2 d1 d0
 
-     r4 = e3 e2 e1 e0
-     r5 = f3 f2 f1 f0
-     r6 = g3 g2 g1 g0
-     r7 = h3 h2 h1 h0
+    r4 = e3 e2 e1 e0
+    r5 = f3 f2 f1 f0
+    r6 = g3 g2 g1 g0
+    r7 = h3 h2 h1 h0
 
-   At exit, we have:
-   I(0) = d0 c0 b0 a0
-   I(1) = d1 c1 b1 a1
-   I(2) = d2 c2 b2 a2
-   I(3) = d3 c3 b3 a3
+  At exit, we have:
+  I(0) = d0 c0 b0 a0
+  I(1) = d1 c1 b1 a1
+  I(2) = d2 c2 b2 a2
+  I(3) = d3 c3 b3 a3
 
-   J(4) = h0 g0 f0 e0
-   J(5) = h1 g1 f1 e1
-   J(6) = h2 g2 f2 e2
-   J(7) = h3 g3 f3 e3
+  J(4) = h0 g0 f0 e0
+  J(5) = h1 g1 f1 e1
+  J(6) = h2 g2 f2 e2
+  J(7) = h3 g3 f3 e3
 
-   I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
-   J(4) J(5) J(6) J(7) is the transpose of r4   r5 r6 r7.
+  I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
+  J(4) J(5) J(6) J(7) is the transpose of r4   r5 r6 r7.
 
-   Since r1 is free at entry, we calculate the Js first.*/
+  Since r1 is free at entry, we calculate the Js first.*/
 /*19 cycles.*/
-#define OC_TRANSPOSE __asm__ __volatile__( \
- "  #OC_TRANSPOSE\n\t" \
- "  movq           %mm4,     %mm1\n\t" \
- "  punpcklwd      %mm5,     %mm4\n\t" \
- "  movq           %mm0,"OC_I(0)"\n\t" \
- "  punpckhwd      %mm5,     %mm1\n\t" \
- "  movq           %mm6,     %mm0\n\t" \
- "  punpcklwd      %mm7,     %mm6\n\t" \
- "  movq           %mm4,     %mm5\n\t" \
- "  punpckldq      %mm6,     %mm4\n\t" \
- "  punpckhdq      %mm6,     %mm5\n\t" \
- "  movq           %mm1,     %mm6\n\t" \
- "  movq           %mm4,"OC_J(4)"\n\t" \
- "  punpckhwd      %mm7,     %mm0\n\t" \
- "  movq           %mm5,"OC_J(5)"\n\t" \
- "  punpckhdq      %mm0,     %mm6\n\t" \
- "  movq      "OC_I(0)",     %mm4\n\t" \
- "  punpckldq      %mm0,     %mm1\n\t" \
- "  movq      "OC_I(1)",     %mm5\n\t" \
- "  movq           %mm4,     %mm0\n\t" \
- "  movq           %mm6,"OC_J(7)"\n\t" \
- "  punpcklwd      %mm5,     %mm0\n\t" \
- "  movq           %mm1,"OC_J(6)"\n\t" \
- "  punpckhwd      %mm5,     %mm4\n\t" \
- "  movq           %mm2,     %mm5\n\t" \
- "  punpcklwd      %mm3,     %mm2\n\t" \
- "  movq           %mm0,     %mm1\n\t" \
- "  punpckldq      %mm2,     %mm0\n\t" \
- "  punpckhdq      %mm2,     %mm1\n\t" \
- "  movq           %mm4,     %mm2\n\t" \
- "  movq           %mm0,"OC_I(0)"\n\t" \
- "  punpckhwd      %mm3,     %mm5\n\t" \
- "  movq           %mm1,"OC_I(1)"\n\t" \
- "  punpckhdq      %mm5,     %mm4\n\t" \
- "  punpckldq      %mm5,     %mm2\n\t" \
- "  movq           %mm4,"OC_I(3)"\n\t" \
- "  movq           %mm2,"OC_I(2)"\n\t" \
- "#end OC_TRANSPOSE\n\t" \
-)
+#define OC_TRANSPOSE \
+  "#OC_TRANSPOSE\n\t" \
+  "movq %%mm4,%%mm1\n\t" \
+  "punpcklwd %%mm5,%%mm4\n\t" \
+  "movq %%mm0,"OC_I(0)"\n\t" \
+  "punpckhwd %%mm5,%%mm1\n\t" \
+  "movq %%mm6,%%mm0\n\t" \
+  "punpcklwd %%mm7,%%mm6\n\t" \
+  "movq %%mm4,%%mm5\n\t" \
+  "punpckldq %%mm6,%%mm4\n\t" \
+  "punpckhdq %%mm6,%%mm5\n\t" \
+  "movq %%mm1,%%mm6\n\t" \
+  "movq %%mm4,"OC_J(4)"\n\t" \
+  "punpckhwd %%mm7,%%mm0\n\t" \
+  "movq %%mm5,"OC_J(5)"\n\t" \
+  "punpckhdq %%mm0,%%mm6\n\t" \
+  "movq "OC_I(0)",%%mm4\n\t" \
+  "punpckldq %%mm0,%%mm1\n\t" \
+  "movq "OC_I(1)",%%mm5\n\t" \
+  "movq %%mm4,%%mm0\n\t" \
+  "movq %%mm6,"OC_J(7)"\n\t" \
+  "punpcklwd %%mm5,%%mm0\n\t" \
+  "movq %%mm1,"OC_J(6)"\n\t" \
+  "punpckhwd %%mm5,%%mm4\n\t" \
+  "movq %%mm2,%%mm5\n\t" \
+  "punpcklwd %%mm3,%%mm2\n\t" \
+  "movq %%mm0,%%mm1\n\t" \
+  "punpckldq %%mm2,%%mm0\n\t" \
+  "punpckhdq %%mm2,%%mm1\n\t" \
+  "movq %%mm4,%%mm2\n\t" \
+  "movq %%mm0,"OC_I(0)"\n\t" \
+  "punpckhwd %%mm3,%%mm5\n\t" \
+  "movq %%mm1,"OC_I(1)"\n\t" \
+  "punpckhdq %%mm5,%%mm4\n\t" \
+  "punpckldq %%mm5,%%mm2\n\t" \
+  "movq %%mm4,"OC_I(3)"\n\t" \
+  "movq %%mm2,"OC_I(2)"\n\t" \
+  "#end OC_TRANSPOSE\n\t" \
 
 /*38+19=57 cycles.*/
-#define OC_COLUMN_IDCT __asm__ __volatile__( \
- "  #OC_COLUMN_IDCT\n" \
- OC_IDCT_BEGIN \
- "  paddw     "OC_8",     %mm2\n\t" \
- "  paddw       %mm1,     %mm1\n\t"  /* r1 = H. + H. */ \
- "  paddw       %mm2,     %mm1\n\t"  /* r1 = R1 = A.. + H. */ \
- "  psraw         $4,     %mm2\n\t"  /* r2 = NR2 */ \
- "  psubw       %mm7,     %mm4\n\t"  /* r4 = E. = E - G */ \
- "  psraw         $4,     %mm1\n\t"  /* r1 = NR1 */ \
- "  movq   "OC_I(2)",     %mm3\n\t"  /* r3 = D. */ \
- "  paddw       %mm7,     %mm7\n\t"  /* r7 = G + G */ \
- "  movq        %mm2,"OC_I(2)"\n\t"  /* store NR2 at I2 */ \
- "  paddw       %mm4,     %mm7\n\t"  /* r7 = G. = E + G */ \
- "  movq        %mm1,"OC_I(1)"\n\t"  /* store NR1 at I1 */ \
- "  psubw       %mm3,     %mm4\n\t"  /* r4 = R4 = E. - D. */ \
- "  paddw     "OC_8",     %mm4\n\t" \
- "  paddw       %mm3,     %mm3\n\t"  /* r3 = D. + D. */ \
- "  paddw       %mm4,     %mm3\n\t"  /* r3 = R3 = E. + D. */ \
- "  psraw         $4,     %mm4\n\t"  /* r4 = NR4 */ \
- "  psubw       %mm5,     %mm6\n\t"  /* r6 = R6 = F. - B.. */ \
- "  psraw         $4,     %mm3\n\t"  /* r3 = NR3 */ \
- "  paddw     "OC_8",     %mm6\n\t" \
- "  paddw       %mm5,     %mm5\n\t"  /* r5 = B.. + B.. */ \
- "  paddw       %mm6,     %mm5\n\t"  /* r5 = R5 = F. + B.. */ \
- "  psraw         $4,     %mm6\n\t"  /* r6 = NR6 */ \
- "  movq        %mm4,"OC_J(4)"\n\t"  /* store NR4 at J4 */ \
- "  psraw         $4,     %mm5\n\t"  /* r5 = NR5 */ \
- "  movq        %mm3,"OC_I(3)"\n\t"  /* store NR3 at I3 */ \
- "  psubw       %mm0,     %mm7\n\t"  /* r7 = R7 = G. - C. */ \
- "  paddw     "OC_8",     %mm7\n\t" \
- "  paddw       %mm0,     %mm0\n\t"  /* r0 = C. + C. */ \
- "  paddw       %mm7,     %mm0\n\t"  /* r0 = R0 = G. + C. */ \
- "  psraw         $4,     %mm7\n\t"  /* r7 = NR7 */ \
- "  movq        %mm6,"OC_J(6)"\n\t"  /* store NR6 at J6 */ \
- "  psraw         $4,     %mm0\n\t"  /* r0 = NR0 */ \
- "  movq        %mm5,"OC_J(5)"\n\t"  /* store NR5 at J5 */ \
- "  movq        %mm7,"OC_J(7)"\n\t"  /* store NR7 at J7 */ \
- "  movq        %mm0,"OC_I(0)"\n\t"  /* store NR0 at I0 */ \
- "  #end OC_COLUMN_IDCT\n\t" \
-)
+#define OC_COLUMN_IDCT \
+  "#OC_COLUMN_IDCT\n" \
+  OC_IDCT_BEGIN \
+  "paddw "OC_8",%%mm2\n\t" \
+  /*r1=H'+H'*/ \
+  "paddw %%mm1,%%mm1\n\t" \
+  /*r1=R1=A''+H'*/ \
+  "paddw %%mm2,%%mm1\n\t" \
+  /*r2=NR2*/ \
+  "psraw $4,%%mm2\n\t" \
+  /*r4=E'=E-G*/ \
+  "psubw %%mm7,%%mm4\n\t" \
+  /*r1=NR1*/ \
+  "psraw $4,%%mm1\n\t" \
+  /*r3=D'*/ \
+  "movq "OC_I(2)",%%mm3\n\t" \
+  /*r7=G+G*/ \
+  "paddw %%mm7,%%mm7\n\t" \
+  /*Store NR2 at I(2).*/ \
+  "movq %%mm2,"OC_I(2)"\n\t" \
+  /*r7=G'=E+G*/ \
+  "paddw %%mm4,%%mm7\n\t" \
+  /*Store NR1 at I(1).*/ \
+  "movq %%mm1,"OC_I(1)"\n\t" \
+  /*r4=R4=E'-D'*/ \
+  "psubw %%mm3,%%mm4\n\t" \
+  "paddw "OC_8",%%mm4\n\t" \
+  /*r3=D'+D'*/ \
+  "paddw %%mm3,%%mm3\n\t" \
+  /*r3=R3=E'+D'*/ \
+  "paddw %%mm4,%%mm3\n\t" \
+  /*r4=NR4*/ \
+  "psraw $4,%%mm4\n\t" \
+  /*r6=R6=F'-B''*/ \
+  "psubw %%mm5,%%mm6\n\t" \
+  /*r3=NR3*/ \
+  "psraw $4,%%mm3\n\t" \
+  "paddw "OC_8",%%mm6\n\t" \
+  /*r5=B''+B''*/ \
+  "paddw %%mm5,%%mm5\n\t" \
+  /*r5=R5=F'+B''*/ \
+  "paddw %%mm6,%%mm5\n\t" \
+  /*r6=NR6*/ \
+  "psraw $4,%%mm6\n\t" \
+  /*Store NR4 at J(4).*/ \
+  "movq %%mm4,"OC_J(4)"\n\t" \
+  /*r5=NR5*/ \
+  "psraw $4,%%mm5\n\t" \
+  /*Store NR3 at I(3).*/ \
+  "movq %%mm3,"OC_I(3)"\n\t" \
+  /*r7=R7=G'-C'*/ \
+  "psubw %%mm0,%%mm7\n\t" \
+  "paddw "OC_8",%%mm7\n\t" \
+  /*r0=C'+C'*/ \
+  "paddw %%mm0,%%mm0\n\t" \
+  /*r0=R0=G'+C'*/ \
+  "paddw %%mm7,%%mm0\n\t" \
+  /*r7=NR7*/ \
+  "psraw $4,%%mm7\n\t" \
+  /*Store NR6 at J(6).*/ \
+  "movq %%mm6,"OC_J(6)"\n\t" \
+  /*r0=NR0*/ \
+  "psraw $4,%%mm0\n\t" \
+  /*Store NR5 at J(5).*/ \
+  "movq %%mm5,"OC_J(5)"\n\t" \
+  /*Store NR7 at J(7).*/ \
+  "movq %%mm7,"OC_J(7)"\n\t" \
+  /*Store NR0 at I(0).*/ \
+  "movq %%mm0,"OC_I(0)"\n\t" \
+  "#end OC_COLUMN_IDCT\n\t" \
 
-#if (defined(__amd64__) || defined(__x86_64__))
-# define OC_MID_REG "%rcx"
-# define OC_Y_REG   "%rdx"
-#else
-# define OC_MID_REG "%ecx"
-# define OC_Y_REG   "%edx"
-#endif
-#define OC_MID(_m,_i) OC_M2STR(_m+(_i)*8)"("OC_MID_REG")"
+#define OC_MID(_m,_i) OC_M2STR(_m+(_i)*8)"(%[c])"
 #define OC_M(_i)      OC_MID(OC_MASK_OFFSET,_i)
 #define OC_C(_i)      OC_MID(OC_COSINE_OFFSET,_i-1)
 #define OC_8          OC_MID(OC_EIGHT_OFFSET,0)
 
 void oc_idct8x8_mmx(ogg_int16_t _y[64]){
-/*This routine accepts an 8x8 matrix, but in transposed form.
-  Every 4x4 submatrix is transposed.*/
+  /*This routine accepts an 8x8 matrix, but in transposed form.
+    Every 4x4 submatrix is transposed.*/
   __asm__ __volatile__(
-   ""
-   :
-   :"d" (_y),
-    "c" (OC_IDCT_CONSTS)
-  );
-#define OC_I(_k)      OC_M2STR((_k*16))"("OC_Y_REG")"
-#define OC_J(_k)      OC_M2STR(((_k-4)*16)+8)"("OC_Y_REG")"
-  OC_ROW_IDCT;
-  OC_TRANSPOSE;
+#define OC_I(_k)      OC_M2STR((_k*16))"(%[y])"
+#define OC_J(_k)      OC_M2STR(((_k-4)*16)+8)"(%[y])"
+    OC_ROW_IDCT
+    OC_TRANSPOSE
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      OC_M2STR((_k*16)+64)"("OC_Y_REG")"
-#define OC_J(_k)      OC_M2STR(((_k-4)*16)+72)"("OC_Y_REG")"
-  OC_ROW_IDCT;
-  OC_TRANSPOSE;
+#define OC_I(_k)      OC_M2STR((_k*16)+64)"(%[y])"
+#define OC_J(_k)      OC_M2STR(((_k-4)*16)+72)"(%[y])"
+    OC_ROW_IDCT
+    OC_TRANSPOSE
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      OC_M2STR((_k*16))"("OC_Y_REG")"
+#define OC_I(_k)      OC_M2STR((_k*16))"(%[y])"
 #define OC_J(_k)      OC_I(_k)
-  OC_COLUMN_IDCT;
+    OC_COLUMN_IDCT
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      OC_M2STR((_k*16)+8)"("OC_Y_REG")"
+#define OC_I(_k)      OC_M2STR((_k*16)+8)"(%[y])"
 #define OC_J(_k)      OC_I(_k)
-  OC_COLUMN_IDCT;
+    OC_COLUMN_IDCT
 #undef  OC_I
 #undef  OC_J
-  __asm__ __volatile__(
-   " emms\n\t"
+    "emms\n\t"
+    :
+    :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
   );
 }
 
 /*25 cycles.*/
 #define OC_IDCT_BEGIN_10 \
- "  #OC_IDCT_BEGIN_10\n\t" \
- "  movq   "OC_I(3)",     %mm2\n\t" \
- "  nop\n\t" \
- "  movq   "OC_C(3)",     %mm6\n\t" \
- "  movq        %mm2,     %mm4\n\t" \
- "  movq   "OC_C(5)",     %mm1\n\t" \
- "  pmulhw      %mm6,     %mm4\n\t" \
- "  movq   "OC_I(1)",     %mm3\n\t" \
- "  pmulhw      %mm2,     %mm1\n\t" \
- "  movq   "OC_C(1)",     %mm0\n\t" \
- "  paddw       %mm2,     %mm4\n\t" \
- "  pxor        %mm6,     %mm6\n\t" \
- "  paddw       %mm1,     %mm2\n\t" \
- "  movq   "OC_I(2)",     %mm5\n\t" \
- "  pmulhw      %mm3,     %mm0\n\t" \
- "  movq        %mm5,     %mm1\n\t" \
- "  paddw       %mm3,     %mm0\n\t" \
- "  pmulhw "OC_C(7)",     %mm3\n\t" \
- "  psubw       %mm2,     %mm6\n\t" \
- "  pmulhw "OC_C(2)",     %mm5\n\t" \
- "  psubw       %mm4,     %mm0\n\t" \
- "  movq   "OC_I(2)",     %mm7\n\t" \
- "  paddw       %mm4,     %mm4\n\t" \
- "  paddw       %mm5,     %mm7\n\t" \
- "  paddw       %mm0,     %mm4\n\t" \
- "  pmulhw "OC_C(6)",     %mm1\n\t" \
- "  psubw       %mm6,     %mm3\n\t" \
- "  movq        %mm4,"OC_I(1)"\n\t" \
- "  paddw       %mm6,     %mm6\n\t" \
- "  movq   "OC_C(4)",     %mm4\n\t" \
- "  paddw       %mm3,     %mm6\n\t" \
- "  movq        %mm3,     %mm5\n\t" \
- "  pmulhw      %mm4,     %mm3\n\t" \
- "  movq        %mm6,"OC_I(2)"\n\t" \
- "  movq        %mm0,     %mm2\n\t" \
- "  movq   "OC_I(0)",     %mm6\n\t" \
- "  pmulhw      %mm4,     %mm0\n\t" \
- "  paddw       %mm3,     %mm5\n\t" \
- "  paddw       %mm0,     %mm2\n\t" \
- "  psubw       %mm1,     %mm5\n\t" \
- "  pmulhw      %mm4,     %mm6\n\t" \
- "  paddw  "OC_I(0)",     %mm6\n\t" \
- "  paddw       %mm1,     %mm1\n\t" \
- "  movq        %mm6,     %mm4\n\t" \
- "  paddw       %mm5,     %mm1\n\t" \
- "  psubw       %mm2,     %mm6\n\t" \
- "  paddw       %mm2,     %mm2\n\t" \
- "  movq   "OC_I(1)",     %mm0\n\t" \
- "  paddw       %mm6,     %mm2\n\t" \
- "  psubw       %mm1,     %mm2\n\t" \
- "  nop\n\t" \
- "  #end OC_IDCT_BEGIN_10\n\t"
+ "#OC_IDCT_BEGIN_10\n\t" \
+ "movq "OC_I(3)",%%mm2\n\t" \
+ "nop\n\t" \
+ "movq "OC_C(3)",%%mm6\n\t" \
+ "movq %%mm2,%%mm4\n\t" \
+ "movq "OC_C(5)",%%mm1\n\t" \
+ "pmulhw %%mm6,%%mm4\n\t" \
+ "movq "OC_I(1)",%%mm3\n\t" \
+ "pmulhw %%mm2,%%mm1\n\t" \
+ "movq "OC_C(1)",%%mm0\n\t" \
+ "paddw %%mm2,%%mm4\n\t" \
+ "pxor %%mm6,%%mm6\n\t" \
+ "paddw %%mm1,%%mm2\n\t" \
+ "movq "OC_I(2)",%%mm5\n\t" \
+ "pmulhw %%mm3,%%mm0\n\t" \
+ "movq %%mm5,%%mm1\n\t" \
+ "paddw %%mm3,%%mm0\n\t" \
+ "pmulhw "OC_C(7)",%%mm3\n\t" \
+ "psubw %%mm2,%%mm6\n\t" \
+ "pmulhw "OC_C(2)",%%mm5\n\t" \
+ "psubw %%mm4,%%mm0\n\t" \
+ "movq "OC_I(2)",%%mm7\n\t" \
+ "paddw %%mm4,%%mm4\n\t" \
+ "paddw %%mm5,%%mm7\n\t" \
+ "paddw %%mm0,%%mm4\n\t" \
+ "pmulhw "OC_C(6)",%%mm1\n\t" \
+ "psubw %%mm6,%%mm3\n\t" \
+ "movq %%mm4,"OC_I(1)"\n\t" \
+ "paddw %%mm6,%%mm6\n\t" \
+ "movq "OC_C(4)",%%mm4\n\t" \
+ "paddw %%mm3,%%mm6\n\t" \
+ "movq %%mm3,%%mm5\n\t" \
+ "pmulhw %%mm4,%%mm3\n\t" \
+ "movq %%mm6,"OC_I(2)"\n\t" \
+ "movq %%mm0,%%mm2\n\t" \
+ "movq "OC_I(0)",%%mm6\n\t" \
+ "pmulhw %%mm4,%%mm0\n\t" \
+ "paddw %%mm3,%%mm5\n\t" \
+ "paddw %%mm0,%%mm2\n\t" \
+ "psubw %%mm1,%%mm5\n\t" \
+ "pmulhw %%mm4,%%mm6\n\t" \
+ "paddw "OC_I(0)",%%mm6\n\t" \
+ "paddw %%mm1,%%mm1\n\t" \
+ "movq %%mm6,%%mm4\n\t" \
+ "paddw %%mm5,%%mm1\n\t" \
+ "psubw %%mm2,%%mm6\n\t" \
+ "paddw %%mm2,%%mm2\n\t" \
+ "movq "OC_I(1)",%%mm0\n\t" \
+ "paddw %%mm6,%%mm2\n\t" \
+ "psubw %%mm1,%%mm2\n\t" \
+ "nop\n\t" \
+ "#end OC_IDCT_BEGIN_10\n\t" \
 
 /*25+8=33 cycles.*/
-#define OC_ROW_IDCT_10 __asm__ __volatile__( \
- "  #OC_ROW_IDCT_10\n\t" \
+#define OC_ROW_IDCT_10 \
+ "#OC_ROW_IDCT_10\n\t" \
  OC_IDCT_BEGIN_10 \
- "  movq    "OC_I(2)",     %mm3\n\t" /* r3 = D. */ \
- "  psubw        %mm7,     %mm4\n\t" /* r4 = E. = E - G */ \
- "  paddw        %mm1,     %mm1\n\t" /* r1 = H. + H. */ \
- "  paddw        %mm7,     %mm7\n\t" /* r7 = G + G */ \
- "  paddw        %mm2,     %mm1\n\t" /* r1 = R1 = A.. + H. */ \
- "  paddw        %mm4,     %mm7\n\t" /* r7 = G. = E + G */ \
- "  psubw        %mm3,     %mm4\n\t" /* r4 = R4 = E. - D. */ \
- "  paddw        %mm3,     %mm3\n\t" \
- "  psubw        %mm5,     %mm6\n\t" /* r6 = R6 = F. - B.. */ \
- "  paddw        %mm5,     %mm5\n\t" \
- "  paddw        %mm4,     %mm3\n\t" /* r3 = R3 = E. + D. */ \
- "  paddw        %mm6,     %mm5\n\t" /* r5 = R5 = F. + B.. */ \
- "  psubw        %mm0,     %mm7\n\t" /* r7 = R7 = G. - C. */ \
- "  paddw        %mm0,     %mm0\n\t" \
- "  movq         %mm1,"OC_I(1)"\n\t" /* save R1 */ \
- "  paddw        %mm7,     %mm0\n\t" /* r0 = R0 = G. + C. */ \
+ /*r3=D'*/ \
+ "movq "OC_I(2)",%%mm3\n\t" \
+ /*r4=E'=E-G*/ \
+ "psubw %%mm7,%%mm4\n\t" \
+ /*r1=H'+H'*/ \
+ "paddw %%mm1,%%mm1\n\t" \
+ /*r7=G+G*/ \
+ "paddw %%mm7,%%mm7\n\t" \
+ /*r1=R1=A''+H'*/ \
+ "paddw %%mm2,%%mm1\n\t" \
+ /*r7=G'=E+G*/ \
+ "paddw %%mm4,%%mm7\n\t" \
+ /*r4=R4=E'-D'*/ \
+ "psubw %%mm3,%%mm4\n\t" \
+ "paddw %%mm3,%%mm3\n\t" \
+ /*r6=R6=F'-B''*/ \
+ "psubw %%mm5,%%mm6\n\t" \
+ "paddw %%mm5,%%mm5\n\t" \
+ /*r3=R3=E'+D'*/ \
+ "paddw %%mm4,%%mm3\n\t" \
+ /*r5=R5=F'+B''*/ \
+ "paddw %%mm6,%%mm5\n\t" \
+ /*r7=R7=G'-C'*/ \
+ "psubw %%mm0,%%mm7\n\t" \
+ "paddw %%mm0,%%mm0\n\t" \
+ /*Save R1.*/ \
+ "movq %%mm1,"OC_I(1)"\n\t" \
+ /*r0=R0=G'+C'*/ \
+ "paddw %%mm7,%%mm0\n\t" \
  "#end OC_ROW_IDCT_10\n\t" \
-)
 
-/*25+19=44 cycles.*/
-#define OC_COLUMN_IDCT_10 __asm__ __volatile__( \
- "  #OC_COLUMN_IDCT_10\n\t" \
+/*25+19=44 cycles.*/
+#define OC_COLUMN_IDCT_10 \
+ "#OC_COLUMN_IDCT_10\n\t" \
  OC_IDCT_BEGIN_10 \
- "  paddw     "OC_8",     %mm2\n\t" \
- "  paddw       %mm1,     %mm1\n\t" /* r1 = H. + H. */ \
- "  paddw       %mm2,     %mm1\n\t" /* r1 = R1 = A.. + H. */ \
- "  psraw         $4,     %mm2\n\t" /* r2 = NR2 */ \
- "  psubw       %mm7,     %mm4\n\t" /* r4 = E. = E - G */ \
- "  psraw         $4,     %mm1\n\t" /* r1 = NR1 */ \
- "  movq   "OC_I(2)",     %mm3\n\t" /* r3 = D. */ \
- "  paddw       %mm7,     %mm7\n\t" /* r7 = G + G */ \
- "  movq        %mm2,"OC_I(2)"\n\t" /* store NR2 at I2 */ \
- "  paddw       %mm4,     %mm7\n\t" /* r7 = G. = E + G */ \
- "  movq        %mm1,"OC_I(1)"\n\t" /* store NR1 at I1 */ \
- "  psubw       %mm3,     %mm4\n\t" /* r4 = R4 = E. - D. */ \
- "  paddw     "OC_8",     %mm4\n\t" \
- "  paddw       %mm3,     %mm3\n\t" /* r3 = D. + D. */ \
- "  paddw       %mm4,     %mm3\n\t" /* r3 = R3 = E. + D. */ \
- "  psraw         $4,     %mm4\n\t" /* r4 = NR4 */ \
- "  psubw       %mm5,     %mm6\n\t" /* r6 = R6 = F. - B.. */ \
- "  psraw         $4,     %mm3\n\t" /* r3 = NR3 */ \
- "  paddw     "OC_8",     %mm6\n\t" \
- "  paddw       %mm5,     %mm5\n\t" /* r5 = B.. + B.. */ \
- "  paddw       %mm6,     %mm5\n\t" /* r5 = R5 = F. + B.. */ \
- "  psraw         $4,     %mm6\n\t" /* r6 = NR6 */ \
- "  movq        %mm4,"OC_J(4)"\n\t" /* store NR4 at J4 */ \
- "  psraw         $4,     %mm5\n\t" /* r5 = NR5 */ \
- "  movq        %mm3,"OC_I(3)"\n\t" /* store NR3 at I3 */ \
- "  psubw       %mm0,     %mm7\n\t" /* r7 = R7 = G. - C. */ \
- "  paddw     "OC_8",     %mm7\n\t" \
- "  paddw       %mm0,     %mm0\n\t" /* r0 = C. + C. */ \
- "  paddw       %mm7,     %mm0\n\t" /* r0 = R0 = G. + C. */ \
- "  psraw         $4,     %mm7\n\t" /* r7 = NR7 */ \
- "  movq        %mm6,"OC_J(6)"\n\t" /* store NR6 at J6 */ \
- "  psraw         $4,     %mm0\n\t" /* r0 = NR0 */ \
- "  movq        %mm5,"OC_J(5)"\n\t" /* store NR5 at J5 */ \
- "  movq        %mm7,"OC_J(7)"\n\t" /* store NR7 at J7 */ \
- "  movq        %mm0,"OC_I(0)"\n\t" /* store NR0 at I0 */ \
- "  #end OC_COLUMN_IDCT_10\n\t" \
-)
+ "paddw "OC_8",%%mm2\n\t" \
+ /*r1=H'+H'*/ \
+ "paddw %%mm1,%%mm1\n\t" \
+ /*r1=R1=A''+H'*/ \
+ "paddw %%mm2,%%mm1\n\t" \
+ /*r2=NR2*/ \
+ "psraw $4,%%mm2\n\t" \
+ /*r4=E'=E-G*/ \
+ "psubw %%mm7,%%mm4\n\t" \
+ /*r1=NR1*/ \
+ "psraw $4,%%mm1\n\t" \
+ /*r3=D'*/ \
+ "movq "OC_I(2)",%%mm3\n\t" \
+ /*r7=G+G*/ \
+ "paddw %%mm7,%%mm7\n\t" \
+ /*Store NR2 at I(2).*/ \
+ "movq %%mm2,"OC_I(2)"\n\t" \
+ /*r7=G'=E+G*/ \
+ "paddw %%mm4,%%mm7\n\t" \
+ /*Store NR1 at I(1).*/ \
+ "movq %%mm1,"OC_I(1)"\n\t" \
+ /*r4=R4=E'-D'*/ \
+ "psubw %%mm3,%%mm4\n\t" \
+ "paddw "OC_8",%%mm4\n\t" \
+ /*r3=D'+D'*/ \
+ "paddw %%mm3,%%mm3\n\t" \
+ /*r3=R3=E'+D'*/ \
+ "paddw %%mm4,%%mm3\n\t" \
+ /*r4=NR4*/ \
+ "psraw $4,%%mm4\n\t" \
+ /*r6=R6=F'-B''*/ \
+ "psubw %%mm5,%%mm6\n\t" \
+ /*r3=NR3*/ \
+ "psraw $4,%%mm3\n\t" \
+ "paddw "OC_8",%%mm6\n\t" \
+ /*r5=B''+B''*/ \
+ "paddw %%mm5,%%mm5\n\t" \
+ /*r5=R5=F'+B''*/ \
+ "paddw %%mm6,%%mm5\n\t" \
+ /*r6=NR6*/ \
+ "psraw $4,%%mm6\n\t" \
+ /*Store NR4 at J(4).*/ \
+ "movq %%mm4,"OC_J(4)"\n\t" \
+ /*r5=NR5*/ \
+ "psraw $4,%%mm5\n\t" \
+ /*Store NR3 at I(3).*/ \
+ "movq %%mm3,"OC_I(3)"\n\t" \
+ /*r7=R7=G'-C'*/ \
+ "psubw %%mm0,%%mm7\n\t" \
+ "paddw "OC_8",%%mm7\n\t" \
+ /*r0=C'+C'*/ \
+ "paddw %%mm0,%%mm0\n\t" \
+ /*r0=R0=G'+C'*/ \
+ "paddw %%mm7,%%mm0\n\t" \
+ /*r7=NR7*/ \
+ "psraw $4,%%mm7\n\t" \
+ /*Store NR6 at J(6).*/ \
+ "movq %%mm6,"OC_J(6)"\n\t" \
+ /*r0=NR0*/ \
+ "psraw $4,%%mm0\n\t" \
+ /*Store NR5 at J(5).*/ \
+ "movq %%mm5,"OC_J(5)"\n\t" \
+ /*Store NR7 at J(7).*/ \
+ "movq %%mm7,"OC_J(7)"\n\t" \
+ /*Store NR0 at I(0).*/ \
+ "movq %%mm0,"OC_I(0)"\n\t" \
+ "#end OC_COLUMN_IDCT_10\n\t" \
 
 void oc_idct8x8_10_mmx(ogg_int16_t _y[64]){
   __asm__ __volatile__(
-   ""
-   :
-   :"d" (_y),
-   "c" (OC_IDCT_CONSTS)
-  );
-#define OC_I(_k) OC_M2STR((_k*16))"("OC_Y_REG")"
-#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"("OC_Y_REG")"
-  /*Done with dequant, descramble, and partial transpose.
-    Now do the iDCT itself.*/
-  OC_ROW_IDCT_10;
-  OC_TRANSPOSE;
+#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
+#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])"
+    /*Done with dequant, descramble, and partial transpose.
+      Now do the iDCT itself.*/
+    OC_ROW_IDCT_10
+    OC_TRANSPOSE
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k) OC_M2STR((_k*16))"("OC_Y_REG")"
+#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
 #define OC_J(_k) OC_I(_k)
-  OC_COLUMN_IDCT_10;
+    OC_COLUMN_IDCT_10
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k) OC_M2STR((_k*16)+8)"("OC_Y_REG")"
+#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])"
 #define OC_J(_k) OC_I(_k)
-  OC_COLUMN_IDCT_10;
+    OC_COLUMN_IDCT_10
 #undef  OC_I
 #undef  OC_J
-  __asm__ __volatile__(
-   " emms\n\t"
+    "emms\n\t"
+    :
+    :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
   );
 }
 #endif
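
A pattern that recurs throughout this patch is the switch from positional
inline-asm operands (%0, %1, and register names hard-coded via OC_Y_REG and
OC_MID_REG) to symbolic names such as %[y] and %[c], which lets the compiler
choose registers on both x86-32 and x86-64.  A minimal standalone example of
the GCC named-operand syntax (unrelated to the Theora code itself):

  #include <stdio.h>

  int main(void){
    long x=40;
    long y=2;
  #if defined(__GNUC__)&&(defined(__i386__)||defined(__x86_64__))
    /*%[a] and %[b] refer to the named constraints below, so the asm text
      does not depend on operand order or on a particular register.*/
    __asm__("add %[b],%[a]\n\t":[a]"+r"(x):[b]"r"(y));
  #else
    x+=y;
  #endif
    printf("%ld\n",x);
    return 0;
  }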

Modified: trunk/theora-exp/lib/x86/mmxstate.c
===================================================================
--- trunk/theora-exp/lib/x86/mmxstate.c	2007-04-16 01:32:17 UTC (rev 12873)
+++ trunk/theora-exp/lib/x86/mmxstate.c	2007-04-16 01:41:05 UTC (rev 12874)
@@ -17,6 +17,10 @@
 
 #if defined(OC_X86ASM)
 
+static const __attribute__((aligned(8),used)) ogg_int64_t OC_V3=
+ 0x0003000300030003LL; 
+static const __attribute__((aligned(8),used)) ogg_int64_t OC_V4=
+ 0x0004000400040004LL; 
 
 static const __attribute__((aligned(8),used)) int OC_FZIG_ZAGMMX[64]={
    0, 8, 1, 2, 9,16,24,17,
@@ -38,7 +42,6 @@
   int dst_framei;
   int dst_ystride;
   int zzi;
-  int ci;
   /*_last_zzi is subtly different from an actual count of the number of
      coefficients we decoded for this block.
     It contains the value of zzi BEFORE the final token in the block was
@@ -65,41 +68,41 @@
     Needless to say we inherited this approach from VP3.*/
   /*Special case only having a DC component.*/
   if(_last_zzi<2){
-    ogg_int16_t p;
+    ogg_uint16_t p;
     /*Why is the iquant product rounded in this case and no others?
       Who knows.*/
     p=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
-    /*for(ci=0;ci<64;ci++)res_buf[ci]=p;*/
-    /*This could also be done with MMX 2.*/
+    /*Fill res_buf with p.*/
     __asm__ __volatile__(
-     "  movzwl    %1,   %%eax\n\t"
-     "  movd   %%eax,   %%mm0\n\t" /* XXXX XXXX 0000 AAAA */
-     "  movq   %%mm0,   %%mm1\n\t" /* XXXX XXXX 0000 AAAA */
-     "  pslld    $16,   %%mm1\n\t" /* XXXX XXXX AAAA 0000 */
-     "  por    %%mm0,   %%mm1\n\t" /* XXXX XXXX AAAA AAAA */
-     "  movq   %%mm1,   %%mm0\n\t" /* XXXX XXXX AAAA AAAA */
-     "  psllq    $32,   %%mm1\n\t" /* AAAA AAAA 0000 0000 */
-     "  por    %%mm1,   %%mm0\n\t" /* AAAA AAAA AAAA AAAA */
-     "  movq   %%mm0,    (%0)\n\t"
-     "  movq   %%mm0,   8(%0)\n\t"
-     "  movq   %%mm0,  16(%0)\n\t"
-     "  movq   %%mm0,  24(%0)\n\t"
-     "  movq   %%mm0,  32(%0)\n\t"
-     "  movq   %%mm0,  40(%0)\n\t"
-     "  movq   %%mm0,  48(%0)\n\t"
-     "  movq   %%mm0,  56(%0)\n\t"
-     "  movq   %%mm0,  64(%0)\n\t"
-     "  movq   %%mm0,  72(%0)\n\t"
-     "  movq   %%mm0,  80(%0)\n\t"
-     "  movq   %%mm0,  88(%0)\n\t"
-     "  movq   %%mm0,  96(%0)\n\t"
-     "  movq   %%mm0, 104(%0)\n\t"
-     "  movq   %%mm0, 112(%0)\n\t"
-     "  movq   %%mm0, 120(%0)\n\t"
-     :
-     :"r" (res_buf),
-      "r" (p)
-     :"memory"
+      /*mm0=0000 0000 0000 AAAA*/
+      "movd %[p],%%mm0\n\t"
+      /*mm1=0000 0000 0000 AAAA*/
+      "movd %[p],%%mm1\n\t"
+      /*mm0=0000 0000 AAAA 0000*/
+      "pslld $16,%%mm0\n\t"
+      /*mm0=0000 0000 AAAA AAAA*/
+      "por %%mm1,%%mm0\n\t"
+      /*mm0=AAAA AAAA AAAA AAAA*/
+      "punpcklwd %%mm0,%%mm0\n\t"
+      "movq %%mm0,(%[res_buf])\n\t"
+      "movq %%mm0,8(%[res_buf])\n\t"
+      "movq %%mm0,16(%[res_buf])\n\t"
+      "movq %%mm0,24(%[res_buf])\n\t"
+      "movq %%mm0,32(%[res_buf])\n\t"
+      "movq %%mm0,40(%[res_buf])\n\t"
+      "movq %%mm0,48(%[res_buf])\n\t"
+      "movq %%mm0,56(%[res_buf])\n\t"
+      "movq %%mm0,64(%[res_buf])\n\t"
+      "movq %%mm0,72(%[res_buf])\n\t"
+      "movq %%mm0,80(%[res_buf])\n\t"
+      "movq %%mm0,88(%[res_buf])\n\t"
+      "movq %%mm0,96(%[res_buf])\n\t"
+      "movq %%mm0,104(%[res_buf])\n\t"
+      "movq %%mm0,112(%[res_buf])\n\t"
+      "movq %%mm0,120(%[res_buf])\n\t"
+      :
+      :[res_buf]"r"(res_buf),[p]"r"((unsigned)p)
+      :"memory"
     );
   }
   else{
@@ -108,26 +111,26 @@
     /*First zero the buffer.*/
     /*On K7, etc., this could be replaced with movntq and sfence.*/
     __asm__ __volatile__(
-     "  pxor %%mm0,   %%mm0\n\t"
-     "  movq %%mm0,    (%0)\n\t"
-     "  movq %%mm0,   8(%0)\n\t"
-     "  movq %%mm0,  16(%0)\n\t"
-     "  movq %%mm0,  24(%0)\n\t"
-     "  movq %%mm0,  32(%0)\n\t"
-     "  movq %%mm0,  40(%0)\n\t"
-     "  movq %%mm0,  48(%0)\n\t"
-     "  movq %%mm0,  56(%0)\n\t"
-     "  movq %%mm0,  64(%0)\n\t"
-     "  movq %%mm0,  72(%0)\n\t"
-     "  movq %%mm0,  80(%0)\n\t"
-     "  movq %%mm0,  88(%0)\n\t"
-     "  movq %%mm0,  96(%0)\n\t"
-     "  movq %%mm0, 104(%0)\n\t"
-     "  movq %%mm0, 112(%0)\n\t"
-     "  movq %%mm0, 120(%0)\n\t"
-     :
-     :"r" (res_buf)
-     :"memory"
+      "pxor %%mm0,%%mm0\n\t"
+      "movq %%mm0,(%[res_buf])\n\t"
+      "movq %%mm0,8(%[res_buf])\n\t"
+      "movq %%mm0,16(%[res_buf])\n\t"
+      "movq %%mm0,24(%[res_buf])\n\t"
+      "movq %%mm0,32(%[res_buf])\n\t"
+      "movq %%mm0,40(%[res_buf])\n\t"
+      "movq %%mm0,48(%[res_buf])\n\t"
+      "movq %%mm0,56(%[res_buf])\n\t"
+      "movq %%mm0,64(%[res_buf])\n\t"
+      "movq %%mm0,72(%[res_buf])\n\t"
+      "movq %%mm0,80(%[res_buf])\n\t"
+      "movq %%mm0,88(%[res_buf])\n\t"
+      "movq %%mm0,96(%[res_buf])\n\t"
+      "movq %%mm0,104(%[res_buf])\n\t"
+      "movq %%mm0,112(%[res_buf])\n\t"
+      "movq %%mm0,120(%[res_buf])\n\t"
+      :
+      :[res_buf]"r"(res_buf)
+      :"memory"
     );
     res_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
     /*This is planned to be rewritten in MMX.*/
@@ -137,12 +140,8 @@
       res_buf[OC_FZIG_ZAGMMX[zzi]]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]*
        _ac_iquant[ci]);
     }
-    if(_last_zzi<10){
-      oc_idct8x8_10_mmx(res_buf);
-    }
-    else{
-      oc_idct8x8_mmx(res_buf);
-    }
+    if(_last_zzi<10)oc_idct8x8_10_mmx(res_buf);
+    else oc_idct8x8_mmx(res_buf);
   }
   /*Fill in the target buffer.*/
   dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
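For readers who do not want to decode the MMX above, here is a rough plain-C restatement of the residual setup performed by oc_state_frag_recon_mmx(). This is a sketch only: the helper name, the generic OC_FZIG_ZAG ordering, and the IDCT entry points are placeholders assumed for illustration; the real routine writes coefficients in OC_FZIG_ZAGMMX order and dispatches to the MMX IDCTs.

static void frag_recon_residual_sketch(ogg_int16_t _res_buf[64],
 const ogg_int16_t *_dct_coeffs,int _last_zzi,int _ncoefs,int _dc,
 ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
  int zzi;
  if(_last_zzi<2){
    /*DC-only case; note the rounding, matching the MMX path above.*/
    ogg_int16_t p;
    p=(ogg_int16_t)((ogg_int32_t)_dc*_dc_iquant+15>>5);
    for(zzi=0;zzi<64;zzi++)_res_buf[zzi]=p;
  }
  else{
    /*Zero the buffer, restore the (unrounded) DC, then dequantize the AC
       coefficients out of zig-zag order.*/
    for(zzi=0;zzi<64;zzi++)_res_buf[zzi]=0;
    _res_buf[0]=(ogg_int16_t)((ogg_int32_t)_dc*_dc_iquant);
    for(zzi=1;zzi<_ncoefs;zzi++){
      int ci;
      ci=OC_FZIG_ZAG[zzi];
      _res_buf[ci]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]*_ac_iquant[ci]);
    }
    /*Placeholder IDCT names; the MMX code calls oc_idct8x8_10_mmx() or
       oc_idct8x8_mmx() here.*/
    if(_last_zzi<10)idct8x8_10_sketch(_res_buf);
    else idct8x8_sketch(_res_buf);
  }
}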
@@ -184,9 +183,9 @@
   const int *fragi;
   const int *fragi_end;
   int        dst_framei;
-  int        dst_ystride;
+  long       dst_ystride;
   int        src_framei;
-  int        src_ystride;
+  long       src_ystride;
   dst_framei=_state->ref_frame_idx[_dst_frame];
   src_framei=_state->ref_frame_idx[_src_frame];
   dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
@@ -196,69 +195,401 @@
     oc_fragment   *frag;
     unsigned char *dst;
     unsigned char *src;
+    long           esi;
     frag=_state->frags+*fragi;
     dst=frag->buffer[dst_framei];
     src=frag->buffer[src_framei];
-#if (defined(__amd64__) || defined(__x86_64__))
     __asm__ __volatile__(
-     "  lea         (%3, %3, 2), %%rsi   \n\t"  /* esi=src_stride*3 */
-     "  movq        (%1),        %%mm0   \n\t"  /* src */
-     "  lea         (%2, %2, 2), %%rdi   \n\t"  /* edi=dst_stride*3 */
-     "  movq        (%1, %3),    %%mm1   \n\t"  /* src+1x stride */
-     "  movq        (%1, %3, 2), %%mm2   \n\t"  /* src+2x stride */
-     "  movq        (%1, %%rsi), %%mm3   \n\t"  /* src+3x stride */
-     "  movq        %%mm0,       (%0)    \n\t"  /* dst */
-     "  movq        %%mm1,       (%0, %2)\n\t"  /* dst+dst_stride */
-     "  lea         (%1,%3,4),   %1      \n\t"  /* pointer to next 4 */
-     "  movq        %%mm2,       (%0, %2, 2)      \n\t"  /*dst+2x dst_stride */
-     "  movq        %%mm3,       (%0, %%rdi)      \n\t"  /* 3x */
-     "  lea         (%0,%2,4),   %0      \n\t"  /* pointer to next 4 */
-     "  movq        (%1),        %%mm0   \n\t"  /* src */
-     "  movq        (%1, %3),    %%mm1   \n\t"  /* src+1x stride */
-     "  movq        (%1, %3, 2), %%mm2   \n\t"  /* src+2x stride */
-     "  movq        (%1, %%rsi), %%mm3   \n\t"  /* src+3x stride */
-     "  movq        %%mm0,       (%0)    \n\t"  /* dst */
-     "  movq        %%mm1,       (%0, %2)\n\t"  /* dst+dst_stride */
-     "  movq        %%mm2,       (%0, %2, 2)     \n\t"  /* dst+2x dst_stride */
-     "  movq        %%mm3,       (%0, %%rdi)     \n\t"  /* 3x */
-     :"+r" (dst) /* 0 */
-     :"r" (src),  /* 1 */
-      "r" ((long)dst_ystride), /* 2 */
-      "r" ((long)src_ystride) /* 3 */
-     :"memory", "rsi","rdi"
+      /*src+0*src_ystride*/
+      "movq (%[src]),%%mm0\n\t"
+      /*esi=src_ystride*3*/
+      "lea (%[src_ystride],%[src_ystride],2),%[s]\n\t"
+      /*src+1*src_ystride*/
+      "movq (%[src],%[src_ystride]),%%mm1\n\t"
+      /*src+2*src_ystride*/
+      "movq (%[src],%[src_ystride],2),%%mm2\n\t"
+      /*src+3*src_ystride*/
+      "movq (%[src],%[s]),%%mm3\n\t"
+      /*dst+0*dst_ystride*/
+      "movq %%mm0,(%[dst])\n\t"
+      /*esi=dst_ystride*3*/
+      "lea (%[dst_ystride],%[dst_ystride],2),%[s]\n\t"
+      /*dst+1*dst_ystride*/
+      "movq %%mm1,(%[dst],%[dst_ystride])\n\t"
+      /*Pointer to next 4.*/
+      "lea (%[src],%[src_ystride],4),%[src]\n\t"
+      /*dst+2*dst_ystride*/
+      "movq %%mm2,(%[dst],%[dst_ystride],2)\n\t"
+      /*dst+3*dst_ystride*/
+      "movq %%mm3,(%[dst],%[s])\n\t"
+      /*Pointer to next 4.*/
+      "lea (%[dst],%[dst_ystride],4),%[dst]\n\t"
+      /*src+0*src_ystride*/
+      "movq (%[src]),%%mm0\n\t"
+      /*esi=src_ystride*3*/
+      "lea (%[src_ystride],%[src_ystride],2),%[s]\n\t"
+      /*src+1*src_ystride*/
+      "movq (%[src],%[src_ystride]),%%mm1\n\t"
+      /*src+2*src_ystride*/
+      "movq (%[src],%[src_ystride],2),%%mm2\n\t"
+      /*src+3*src_ystride*/
+      "movq (%[src],%[s]),%%mm3\n\t"
+      /*dst+0*dst_ystride*/
+      "movq %%mm0,(%[dst])\n\t"
+      /*esi=dst_ystride*3*/
+      "lea (%[dst_ystride],%[dst_ystride],2),%[s]\n\t"
+      /*dst+1*dst_ystride*/
+      "movq %%mm1,(%[dst],%[dst_ystride])\n\t"
+      /*dst+2*dst_ystride*/
+      "movq %%mm2,(%[dst],%[dst_ystride],2)\n\t"
+      /*dst+3*dst_ystride*/
+      "movq %%mm3,(%[dst],%[s])\n\t"
+      :[s]"=&S"(esi)
+      :[dst]"r"(dst),[src]"r"(src),[dst_ystride]"r"(dst_ystride),
+       [src_ystride]"r"(src_ystride)
+      :"memory"
     );
   }
-#else
-    __asm__ __volatile__(
-     "  lea         (%3, %3, 2), %%esi   \n\t"  /* esi=src_stride*3 */
-     "  movq        (%1),        %%mm0   \n\t"  /* src */
-     "  lea         (%2, %2, 2), %%edi   \n\t"  /* edi=dst_stride*3 */
-     "  movq        (%1, %3),    %%mm1   \n\t"  /* src+1x stride */
-     "  movq        (%1, %3, 2), %%mm2   \n\t"  /* src+2x stride */
-     "  movq        (%1, %%esi), %%mm3   \n\t"  /* src+3x stride */
-     "  movq        %%mm0,       (%0)    \n\t"  /* dst */
-     "  movq        %%mm1,       (%0, %2)\n\t"  /* dst+dst_stride */
-     "  lea         (%1,%3,4),   %1      \n\t"  /* pointer to next 4 */
-     "  movq        %%mm2,       (%0, %2, 2)      \n\t"  /*dst+2x dst_stride */
-     "  movq        %%mm3,       (%0, %%edi)      \n\t"  /* 3x */
-     "  lea         (%0,%2,4),   %0      \n\t"  /* pointer to next 4 */
-     "  movq        (%1),        %%mm0   \n\t"  /* src */
-     "  movq        (%1, %3),    %%mm1   \n\t"  /* src+1x stride */
-     "  movq        (%1, %3, 2), %%mm2   \n\t"  /* src+2x stride */
-     "  movq        (%1, %%esi), %%mm3   \n\t"  /* src+3x stride */
-     "  movq        %%mm0,       (%0)    \n\t"  /* dst */
-     "  movq        %%mm1,       (%0, %2)\n\t"  /* dst+dst_stride */
-     "  movq        %%mm2,       (%0, %2, 2)     \n\t"  /* dst+2x dst_stride */
-     "  movq        %%mm3,       (%0, %%edi)     \n\t"  /* 3x */
-     :"+r" (dst) /* 0 */
-     :"r" (src),  /* 1 */
-      "r" (dst_ystride), /* 2 */
-      "r" (src_ystride) /* 3 */
-     :"memory", "esi","edi"
-    );
+  /*This needs to be removed when decode-specific functions are implemented:*/
+  __asm__ __volatile__("emms\n\t");
+}
+
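In plain C, the fragment-copy loop above amounts to copying one 8x8 block of bytes per fragment from the source reference frame to the destination; a minimal sketch follows (the helper name is illustrative only):

static void frag_copy_sketch(unsigned char *_dst,long _dst_ystride,
 const unsigned char *_src,long _src_ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    /*One 8-byte row per iteration; the MMX code moves it with a single movq.*/
    for(j=0;j<8;j++)_dst[j]=_src[j];
    _dst+=_dst_ystride;
    _src+=_src_ystride;
  }
}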
+static void loop_filter_v(unsigned char *_pix,int _ystride,int *_bv){
+  long esi;
+  long edi;
+  _pix-=_ystride*2;
+  __asm__ __volatile__(
+    /*mm0=0*/
+    "pxor %%mm0,%%mm0\n\t"
+    /*mm7=_pix[0...8]*/
+    "movq (%[pix]),%%mm7\n\t"
+    /*esi=_ystride*3*/
+    "lea (%[ystride],%[ystride],2),%[s]\n\t"
+    /*mm4=_pix[0...8+_ystride*3]*/
+    "movq (%[pix],%[s]),%%mm4\n\t"
+    /*mm6=_pix[0...8]*/
+    "movq %%mm7,%%mm6\n\t"
+    /*Expand unsigned _pix[0...3] to 16 bits.*/
+    "punpcklbw %%mm0,%%mm6\n\t"
+    "movq %%mm4,%%mm5\n\t"
+    /*Expand unsigned _pix[4...8] to 16 bits.*/
+    "punpckhbw %%mm0,%%mm7\n\t"
+    /*Expand other arrays too.*/
+    "punpcklbw %%mm0,%%mm4\n\t"
+    "punpckhbw %%mm0,%%mm5\n\t"
+    /*mm7:mm6=_pix[0...8]-_pix[0...8+_ystride*3]*/
+    "psubw %%mm4,%%mm6\n\t"
+    "psubw %%mm5,%%mm7\n\t"
+    /*mm5=mm4=_pix[0...8+_ystride]*/
+    "movq (%[pix],%[ystride]),%%mm4\n\t"
+    /*mm1=mm3=mm2=_pix[0...8+_ystride*2]*/
+    "movq (%[pix],%[ystride],2),%%mm2\n\t"
+    "movq %%mm4,%%mm5\n\t"
+    "movq %%mm2,%%mm3\n\t"
+    "movq %%mm2,%%mm1\n\t"
+    /*Expand these arrays.*/
+    "punpckhbw %%mm0,%%mm5\n\t"
+    "punpcklbw %%mm0,%%mm4\n\t" 
+    "punpckhbw %%mm0,%%mm3\n\t"
+    "punpcklbw %%mm0,%%mm2\n\t"
+    /*Preload...*/
+    "movq %[OC_V3],%%mm0\n\t"
+    /*mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
+    "psubw %%mm5,%%mm3\n\t" 
+    "psubw %%mm4,%%mm2\n\t" 
+    /*Scale by 3.*/
+    "pmullw %%mm0,%%mm3\n\t"
+    "pmullw %%mm0,%%mm2\n\t"
+    /*Preload...*/
+    "movq %[OC_V4],%%mm0\n\t"
+    /*f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
+       3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
+    "paddw %%mm7,%%mm3\n\t"
+    "paddw %%mm6,%%mm2\n\t"
+    /*Add 4.*/
+    "paddw %%mm0,%%mm3\n\t"
+    "paddw %%mm0,%%mm2\n\t"
+    /*"Divide" by 8.*/
+    "psraw $3,%%mm3\n\t"
+    "psraw $3,%%mm2\n\t"
+    /*Now perform mm7:mm6=_bv[(f+4>>3)]*/
+    /*First the low part:*/
+    /*pextrw requires MMX+/SSE.
+    "pextrw $0,%%mm2,%%esi\n\t"
+    "pextrw $1,%%mm2,%%edi\n\t"*/
+    /*We duplicate the value and pull it out of two registers in parallel;
+       perhaps we should not bother with just MMX, since any processor with
+       multiple MMX units will also have SSE, and should be using that
+       instead.*/
+    "movq %%mm2,%%mm0\n\t"
+    "psrlq $16,%%mm2\n\t"
+    "movd %%mm0,%%esi\n\t"
+    "movd %%mm2,%%edi\n\t"
+    "psrlq $32,%%mm0\n\t"
+    "movsx %%si,%[s]\n\t"
+    "psrlq $32,%%mm2\n\t"
+    "movsx %%di,%[d]\n\t"
+    /*pinsrw requires MMX+/SSE.
+    "pinsrw $0,(%[bv],%[s],4),%%mm6\n\t"
+    "pinsrw $1,(%[bv],%[d],4),%%mm6\n\t"
+    "pextrw $2,%%mm2,%%esi\n\t"
+    "pextrw $3,%%mm2,%%edi\n\t"*/
+    "movd (%[bv],%[s],4),%%mm6\n\t"
+    "movd %%mm0,%%esi\n\t"
+    "movd (%[bv],%[d],4),%%mm0\n\t"
+    "movd %%mm2,%%edi\n\t"
+    "movsx %%si,%[s]\n\t"
+    "movsx %%di,%[d]\n\t"
+    /*"pinsrw $2,(%[bv],%%esi,4),%%mm6\n\t"
+    "pinsrw $3,(%[bv],%%edi,4),%%mm6\n\t"*/
+    "movd (%[bv],%[s],4),%%mm2\n\t"
+    "pslld $16,%%mm2\n\t"
+    "por %%mm2,%%mm6\n\t"
+    "movd (%[bv],%[d],4),%%mm2\n\t"
+    "pslld $16,%%mm2\n\t"
+    "por %%mm2,%%mm0\n\t"
+    "punpcklwd %%mm0,%%mm6\n\t"
+    /*Do it again for the high part:*/
+    /*"pextrw $0,%%mm3,%%esi\n\t" 
+    "pextrw $1,%%mm3,%%edi\n\t"*/
+    "movq %%mm3,%%mm0\n\t"
+    "psrlq $16,%%mm3\n\t"
+    "movd %%mm0,%%esi\n\t"
+    "movd %%mm3,%%edi\n\t"
+    "psrlq $32,%%mm0\n\t"
+    "movsx %%si,%[s]\n\t"
+    "psrlq $32,%%mm3\n\t"
+    "movsx %%di,%[d]\n\t"
+    /*"pinsrw $0,(%[bv],%%esi,4),%%mm7\n\t"
+    "pinsrw $1,(%[bv],%%edi,4),%%mm7\n\t"
+    "pextrw $2,%%mm3,%%esi\n\t"
+    "pextrw $3,%%mm3,%%edi\n\t"*/
+    "movd (%[bv],%[s],4),%%mm7\n\t"
+    "movd %%mm0,%%esi\n\t"
+    "movd (%[bv],%[d],4),%%mm0\n\t"
+    "movd %%mm3,%%edi\n\t"
+    "movsx %%si,%[s]\n\t"
+    "movsx %%di,%[d]\n\t"
+    /*"pinsrw $2,(%[bv],%%esi,4),%%mm7\n\t"
+    "pinsrw $3, (%[bv],%%edi,4),%%mm7\n\t"*/
+    "movd (%[bv],%[s],4),%%mm2\n\t"
+    "movd (%[bv],%[d],4),%%mm3\n\t"
+    "pslld $16,%%mm2\n\t"
+    "pslld $16,%%mm3\n\t"
+    "por %%mm2,%%mm7\n\t"
+    "por %%mm3,%%mm0\n\t"
+    "punpcklwd %%mm0,%%mm7\n\t"
+    /*mm7:mm6 now contain the final values of f.*/
+    /*_pix[0...8+_ystride]+=f*/
+    "paddw %%mm6,%%mm4\n\t"
+    "paddw %%mm7,%%mm5\n\t"
+    /*Re-expand _pix[0...8+_ystride*2], since we didn't have enough registers
+       to keep the whole thing around.*/
+    "pxor %%mm0,%%mm0\n\t"
+    "movq %%mm1,%%mm2\n\t"
+    "punpcklbw %%mm0,%%mm1\n\t"
+    "punpckhbw %%mm0,%%mm2\n\t"
+    /*_pix[0...8+_ystride*2]-=f*/
+    "psubw %%mm6,%%mm1\n\t"
+    "psubw %%mm7,%%mm2\n\t"
+    /*Pack it back into 8 bits and write it back out.*/
+    "packuswb %%mm2,%%mm1\n\t"
+    "packuswb %%mm5,%%mm4\n\t"
+    "movq %%mm1,(%[pix],%[ystride],2)\n\t"
+    "movq %%mm4,(%[pix],%[ystride])\n\t"
+    :[s]"=&S"(esi),[d]"=&D"(edi)
+    :[pix]"r"(_pix),[ystride]"r"((long)_ystride),[bv]"r"(_bv),
+     [OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
+    :"memory"
+  );
+}
+
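What loop_filter_v() computes, stated in plain C as a sketch (helper names are illustrative; _bv is assumed to already be offset by 256 by the caller, as oc_state_loop_filter_frag_rows_mmx() below does, so that negative filter values index it safely):

static unsigned char clamp255(int _x){
  return (unsigned char)(_x<0?0:_x>255?255:_x);
}

static void loop_filter_v_sketch(unsigned char *_pix,int _ystride,
 const int *_bv){
  int bi;
  _pix-=_ystride*2;
  for(bi=0;bi<8;bi++){
    int f;
    /*f=_pix[0]-_pix[3*_ystride]+3*(_pix[2*_ystride]-_pix[_ystride]), plus 4,
       "divided" by 8, then clamped through the bounding-value table.*/
    f=_bv[_pix[0]-_pix[_ystride*3]+3*(_pix[_ystride*2]-_pix[_ystride])+4>>3];
    _pix[_ystride]=clamp255(_pix[_ystride]+f);
    _pix[_ystride*2]=clamp255(_pix[_ystride*2]-f);
    _pix++;
  }
}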
+/*This code implements the bulk of loop_filter_h().
+  The data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load
+   all four p0's into one register we must transpose the values across four
+   MMX registers.
+  Once one half is done, we repeat the process for the other half.
+  TODO: some instruction stalls can be avoided.*/
+static void loop_filter_h4(unsigned char *_pix,long _ystride,const int *_bv){
+  long esi;
+  long edi;
+  __asm__ __volatile__(
+    /*esi=_ystride*3*/
+    "lea (%[ystride],%[ystride],2),%[s]\n\t"
+    /*x x x x 3 2 1 0*/
+    "movd (%[pix]),%%mm0\n\t"
+    /*x x x x 7 6 5 4*/
+    "movd (%[pix],%[ystride]),%%mm1\n\t"
+    /*x x x x B A 9 8*/
+    "movd (%[pix],%[ystride],2),%%mm2\n\t"
+    /*x x x x F E D C*/
+    "movd (%[pix],%[s]),%%mm3\n\t"
+    /*mm0=7 3 6 2 5 1 4 0*/
+    "punpcklbw %%mm1,%%mm0\n\t"
+    /*mm2=F B E A D 9 C 8*/
+    "punpcklbw %%mm3,%%mm2\n\t"
+    /*mm1=7 3 6 2 5 1 4 0*/
+    "movq %%mm0,%%mm1\n\t"
+    /*mm1=D 9 5 1 C 8 4 0*/
+    "punpcklwd %%mm2,%%mm1\n\t"
+    /*mm0=F B 7 3 E A 6 2*/
+    "punpckhwd %%mm2,%%mm0\n\t"
+    "pxor %%mm7,%%mm7\n\t"
+    /*mm5=D 9 5 1 C 8 4 0*/
+    "movq %%mm1,%%mm5\n\t"
+    /*mm5=x D x 9 x 5 x 1==pix[1]*/
+    "punpckhbw %%mm7,%%mm5\n\t"
+    /*mm1=x C x 8 x 4 x 0==pix[0]*/
+    "punpcklbw %%mm7,%%mm1\n\t"
+    /*mm3=F B 7 3 E A 6 2*/
+    "movq %%mm0,%%mm3\n\t"
+    /*mm3=x F x B x 7 x 3==pix[3]*/
+    "punpckhbw %%mm7,%%mm3\n\t"
+    /*mm0=x E x A x 6 x 2==pix[2]*/
+    "punpcklbw %%mm7,%%mm0\n\t"
+    /*mm1=mm1-mm3==pix[0]-pix[3]*/
+    "psubw %%mm3,%%mm1\n\t"
+    /*Save a copy of pix[2] for later.*/
+    "movq %%mm0,%%mm4\n\t"
+    /*mm0=mm0-mm5==pix[2]-pix[1]*/
+    "psubw %%mm5,%%mm0\n\t"
+    /*Scale by 3.*/
+    "pmullw %[OC_V3],%%mm0\n\t"
+    /*f=mm1==_pix[0]-_pix[3]+ 3*(_pix[2]-_pix[1])*/
+    "paddw %%mm0,%%mm1\n\t"
+    /*Add 4.*/
+    "paddw %[OC_V4],%%mm1\n\t"
+    /*"Divide" by 8.*/
+    "psraw $3,%%mm1\n\t"
+    /*Now perform mm0=_bv[(f+4>>3)]*/
+    /*pextrw requires MMX+/SSE.
+    "pextrw $0,%%mm1,%%esi\n\t"
+    "pextrw $1,%%mm1,%%edi\n\t"*/
+    "movd %%mm1,%%esi\n\t"
+    "psrlq $16,%%mm1\n\t"
+    "movd %%mm1,%%edi\n\t"
+    "movsx %%si,%[s]\n\t"
+    "psrlq $16,%%mm1\n\t"
+    "movsx %%di,%[d]\n\t"
+    /*pinsrw requires MMX+/SSE.
+    "pinsrw $0,(%[bv],%%esi,4),%%mm0\n\t"
+    "pextrw $2,%%mm1,%%esi\n\t"
+    "pinsrw $1,(%[bv],%%edi,4),%%mm0\n\t"
+    "pextrw $3,%%mm1,%%edi\n\t"*/
+    "movd (%[bv],%[s],4),%%mm0\n\t"
+    "movd %%mm1,%%esi\n\t"
+    "movd (%[bv],%[d],4),%%mm2\n\t"
+    "psrlq $16,%%mm1\n\t"
+    "movsx %%si,%[s]\n\t"
+    "movd %%mm1,%%edi\n\t"
+    /*"pinsrw $2,(%[bv],%%esi,4),%%mm0\n\t"
+    "pinsrw $3,(%[bv],%%edi,4),%%mm0\n\t"*/
+    "movd (%[bv],%[s],4),%%mm3\n\t"
+    "movsx %%di,%[d]\n\t"
+    "pslld $16,%%mm3\n\t"
+    "movd (%[bv],%[d],4),%%mm6\n\t"
+    "por %%mm3,%%mm0\n\t"
+    "pslld $16,%%mm6\n\t"
+    "por %%mm6,%%mm2\n\t"
+    "punpcklwd %%mm2,%%mm0\n\t"
+    /*_pix[1]+=f;*/
+    "paddw %%mm0,%%mm5\n\t"
+    /*_pix[2]-=f;*/
+    "psubw %%mm0,%%mm4\n\t"
+    /*mm5=x x x x D 9 5 1*/
+    "packuswb %%mm7,%%mm5\n\t"
+    /*mm4=x x x x E A 6 2*/
+    "packuswb %%mm7,%%mm4\n\t"
+    /*mm5=E D A 9 6 5 2 1*/
+    "punpcklbw %%mm4,%%mm5\n\t"
+    /*esi=6 5 2 1*/
+    "movd %%mm5,%%esi\n\t"
+    "movw %%si,1(%[pix])\n\t"
+    /*Why is there such a big stall here?*/
+    "psrlq $32,%%mm5\n\t"
+    "shrl $16,%%esi\n\t"
+    "movw %%si,1(%[pix],%[ystride])\n\t"
+    /*esi=E D A 9*/
+    "movd %%mm5,%%esi\n\t"
+    "lea (%[ystride],%[ystride],2),%[d]\n\t"
+    "movw %%si,(%[pix],%[ystride])\n\t"
+    "shrl $16,%%esi\n\t"
+    "movw %%si,1(%[pix],%[d])\n\t"
+    :[s]"=&S"(esi),[d]"=&D"(edi),
+     [pix]"+r"(_pix),[ystride]"+r"(_ystride),[bv]"+r"(_bv)
+    :[OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
+    :"memory"
+  );
+}
+
+static void loop_filter_h(unsigned char *_pix,int _ystride,int *_bv){
+  _pix-=2;
+  loop_filter_h4(_pix,_ystride,_bv);
+  _pix+=_ystride*4;
+  loop_filter_h4(_pix,_ystride,_bv);
+}
+
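And the horizontal counterpart, equivalent to loop_filter_h() above, again as a plain-C sketch (names illustrative; clamp255() is the helper from the vertical sketch earlier, and _bv follows the same offset-by-256 convention). The function offsets _pix by -2 so the four pixels straddling the vertical block edge sit at _pix[0..3] on each row:

static void loop_filter_h_sketch(unsigned char *_pix,int _ystride,
 const int *_bv){
  int y;
  _pix-=2;
  for(y=0;y<8;y++){
    int f;
    f=_bv[_pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4>>3];
    _pix[1]=clamp255(_pix[1]+f);
    _pix[2]=clamp255(_pix[2]-f);
    _pix+=_ystride;
  }
}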
+/*We copy the whole function because the MMX routines will be inlined 4 times,
+   and we make just a single emms call at the end.
+  Originally the _bv pointer would also not have been offset by 256, to get
+   rid of a sign extension instruction, but it turns out this is still needed
+   on x86-64 to avoid a partial register stall, and is needed even on x86-32
+   once we eliminate the MMX+/SSE-specific pextrw/pinsrw instructions.*/
+
+/*Apply the loop filter to a given set of fragment rows in the given plane.
+  The filter may be run on the bottom edge, affecting pixels in the next row of
+   fragments, so this row also needs to be available.
+  _bv:        The bounding values array.
+  _refi:      The index of the frame buffer to filter.
+  _pli:       The color plane to filter.
+  _fragy0:    The Y coordinate of the first fragment row to filter.
+  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end){
+  th_img_plane  *iplane;
+  oc_fragment_plane *fplane;
+  oc_fragment       *frag_top;
+  oc_fragment       *frag0;
+  oc_fragment       *frag;
+  oc_fragment       *frag_end;
+  oc_fragment       *frag0_end;
+  oc_fragment       *frag_bot;
+  _bv+=256;
+  iplane=_state->ref_frame_bufs[_refi]+_pli;
+  fplane=_state->fplanes+_pli;
+  /*The following loops are constructed somewhat non-intuitively on purpose.
+    The main idea is: if a block boundary has at least one coded fragment on
+     it, the filter is applied to it.
+    However, the order that the filters are applied in matters, and VP3 chose
+     the somewhat strange ordering used below.*/
+  frag_top=_state->frags+fplane->froffset;
+  frag0=frag_top+_fragy0*fplane->nhfrags;
+  frag0_end=frag0+(_fragy_end-_fragy0)*fplane->nhfrags;
+  frag_bot=_state->frags+fplane->froffset+fplane->nfrags;
+  while(frag0<frag0_end){
+    frag=frag0;
+    frag_end=frag+fplane->nhfrags;
+    while(frag<frag_end){
+      if(frag->coded){
+        if(frag>frag0){
+          loop_filter_h(frag->buffer[_refi],iplane->ystride,_bv);
+        }
+        if(frag0>frag_top){
+          loop_filter_v(frag->buffer[_refi],iplane->ystride,_bv);
+        }
+        if(frag+1<frag_end&&!(frag+1)->coded){
+          loop_filter_h(frag->buffer[_refi]+8,iplane->ystride,_bv);
+        }
+        if(frag+fplane->nhfrags<frag_bot&&!(frag+fplane->nhfrags)->coded){
+          loop_filter_v((frag+fplane->nhfrags)->buffer[_refi],
+           iplane->ystride,_bv);
+        }
+      }
+      frag++;
+    }
+    frag0+=fplane->nhfrags;
   }
-#endif
   /*This needs to be removed when decode-specific functions are implemented:*/
   __asm__ __volatile__("emms\n\t");
 }
+
 #endif

Modified: trunk/theora-exp/lib/x86/x86int.h
===================================================================
--- trunk/theora-exp/lib/x86/x86int.h	2007-04-16 01:32:17 UTC (rev 12873)
+++ trunk/theora-exp/lib/x86/x86int.h	2007-04-16 01:41:05 UTC (rev 12874)
@@ -16,6 +16,8 @@
 void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
  int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
+void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end);
 void oc_restore_fpu_mmx(void);
 void oc_idct8x8_mmx(ogg_int16_t _y[64]);
 void oc_idct8x8_10_mmx(ogg_int16_t _y[64]);

Modified: trunk/theora-exp/lib/x86/x86state.c
===================================================================
--- trunk/theora-exp/lib/x86/x86state.c	2007-04-16 01:32:17 UTC (rev 12873)
+++ trunk/theora-exp/lib/x86/x86state.c	2007-04-16 01:41:05 UTC (rev 12874)
@@ -12,6 +12,8 @@
     _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
     _state->opt_vtable.state_frag_copy=oc_state_frag_copy_mmx;
     _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     oc_state_loop_filter_frag_rows_mmx;
     _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
   }
   else oc_state_vtable_init_c(_state);


