[xiph-commits] r15674 - branches/theora-thusnelda/lib/enc/x86

tterribe at svn.xiph.org tterribe at svn.xiph.org
Thu Feb 5 16:18:42 PST 2009


Author: tterribe
Date: 2009-02-05 16:18:42 -0800 (Thu, 05 Feb 2009)
New Revision: 15674

Modified:
   branches/theora-thusnelda/lib/enc/x86/recon_mmx.c
Log:
Fix encoder asm to actually compile on x86-32.


Modified: branches/theora-thusnelda/lib/enc/x86/recon_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/recon_mmx.c	2009-02-05 23:53:39 UTC (rev 15673)
+++ branches/theora-thusnelda/lib/enc/x86/recon_mmx.c	2009-02-06 00:18:42 UTC (rev 15674)
@@ -16,90 +16,104 @@
  ********************************************************************/
 
 #include "codec_internal.h"
+#include <stddef.h>
 
 #if defined(USE_ASM)
 
-static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x8080808080808080LL;
-
-static void copy8x8__mmx (const unsigned char *src,
-                          unsigned char *dest,
-                          ogg_uint32_t stride)
-{
-  __asm__ __volatile__ (
-    "  .balign 16                      \n\t"
-
-    "  lea         (%2, %2, 2), %%rdi  \n\t"
-
-    "  movq        (%1), %%mm0         \n\t"
-    "  movq        (%1, %2), %%mm1     \n\t"
-    "  movq        (%1, %2, 2), %%mm2  \n\t"
-    "  movq        (%1, %%rdi), %%mm3  \n\t"
-
-    "  lea         (%1, %2, 4), %1     \n\t" 
-
-    "  movq        %%mm0, (%0)         \n\t"
-    "  movq        %%mm1, (%0, %2)     \n\t"
-    "  movq        %%mm2, (%0, %2, 2)  \n\t"
-    "  movq        %%mm3, (%0, %%rdi)  \n\t"
-
-    "  lea         (%0, %2, 4), %0     \n\t" 
-
-    "  movq        (%1), %%mm0         \n\t"
-    "  movq        (%1, %2), %%mm1     \n\t"
-    "  movq        (%1, %2, 2), %%mm2  \n\t"
-    "  movq        (%1, %%rdi), %%mm3  \n\t"
-
-    "  movq        %%mm0, (%0)         \n\t"
-    "  movq        %%mm1, (%0, %2)     \n\t"
-    "  movq        %%mm2, (%0, %2, 2)  \n\t"
-    "  movq        %%mm3, (%0, %%rdi)  \n\t"
-      : "+a" (dest)
-      : "c" (src),
-        "d" ((unsigned long)stride)
-      : "memory", "rdi"
+/*Copies an 8x8 block of bytes from _src to _dst; both use row stride _ystride.
+  TODO: Basically oc_state_frag_copy_mmx() without its loop; share the code.*/
+static void oc_copy8x8_mmx(const unsigned char *_src,unsigned char *_dst,
+ ogg_uint32_t _ystride){
+  ptrdiff_t esi;
+  __asm__ __volatile__(
+    /*src+0*src_ystride*/
+    "movq (%[src]),%%mm0\n\t"
+    /*esi=src_ystride*3*/
+    "lea (%[ystride],%[ystride],2),%[s]\n\t"
+    /*src+1*src_ystride*/
+    "movq (%[src],%[ystride]),%%mm1\n\t"
+    /*src+2*src_ystride*/
+    "movq (%[src],%[ystride],2),%%mm2\n\t"
+    /*src+3*src_ystride*/
+    "movq (%[src],%[s]),%%mm3\n\t"
+    /*dst+0*dst_ystride*/
+    "movq %%mm0,(%[dst])\n\t"
+    /*dst+1*dst_ystride*/
+    "movq %%mm1,(%[dst],%[ystride])\n\t"
+    /*Advance src to the next 4 rows (src is written, so it is an in/out).*/
+    "lea (%[src],%[ystride],4),%[src]\n\t"
+    /*dst+2*dst_ystride*/
+    "movq %%mm2,(%[dst],%[ystride],2)\n\t"
+    /*dst+3*dst_ystride*/
+    "movq %%mm3,(%[dst],%[s])\n\t"
+    /*Advance dst to the next 4 rows (dst is written, so it is an in/out).*/
+    "lea (%[dst],%[ystride],4),%[dst]\n\t"
+    /*src+0*src_ystride*/
+    "movq (%[src]),%%mm0\n\t"
+    /*src+1*src_ystride*/
+    "movq (%[src],%[ystride]),%%mm1\n\t"
+    /*src+2*src_ystride*/
+    "movq (%[src],%[ystride],2),%%mm2\n\t"
+    /*src+3*src_ystride*/
+    "movq (%[src],%[s]),%%mm3\n\t"
+    /*dst+0*dst_ystride*/
+    "movq %%mm0,(%[dst])\n\t"
+    /*dst+1*dst_ystride*/
+    "movq %%mm1,(%[dst],%[ystride])\n\t"
+    /*dst+2*dst_ystride*/
+    "movq %%mm2,(%[dst],%[ystride],2)\n\t"
+    /*dst+3*dst_ystride*/
+    "movq %%mm3,(%[dst],%[s])\n\t"
+    :[s]"=&S"(esi),[dst]"+r"(_dst),[src]"+r"(_src)
+    :[ystride]"r"((ptrdiff_t)_ystride)
+    :"memory"
   );
 }
 
-static void recon8x8__mmx (unsigned char *ReconPtr, 
-			   const ogg_int16_t *ChangePtr, 
-			   ogg_uint32_t LineStep)
-{
-  __asm__ __volatile__ (
-    "  .balign 16                      \n\t"
-
-    "  pxor        %%mm0, %%mm0        \n\t"
-    "  lea         128(%1), %%rdi      \n\t"
-
-    "1:                                \n\t"
-    "  movq        (%0), %%mm2         \n\t" /* (+3 misaligned) 8 reference pixels */
-
-    "  movq        (%1), %%mm4         \n\t" /* first 4 changes */
-    "  movq        %%mm2, %%mm3        \n\t"
-    "  movq        8(%1), %%mm5        \n\t" /* last 4 changes */
-    "  punpcklbw   %%mm0, %%mm2        \n\t" /* turn first 4 refs into positive 16-bit #s */
-    "  paddsw      %%mm4, %%mm2        \n\t" /* add in first 4 changes */
-    "  punpckhbw   %%mm0, %%mm3        \n\t" /* turn last 4 refs into positive 16-bit #s */
-    "  paddsw      %%mm5, %%mm3        \n\t" /* add in last 4 changes */
-
-    "  packuswb    %%mm3, %%mm2        \n\t" /* pack result to unsigned 8-bit values */
-    "  lea         16(%1), %1          \n\t" /* next row of changes */
-    "  cmp         %%rdi, %1           \n\t" /* are we done? */
-
-    "  movq        %%mm2, (%0)         \n\t" /* store result */
-
-    "  lea         (%0, %2), %0        \n\t" /* next row of output */
-    "  jc          1b                  \n\t"
-      : "+r" (ReconPtr)
-      : "r" (ChangePtr),
-        "r" ((unsigned long)LineStep)
-      : "memory", "rdi"
-  );
+/*Adds an 8x8 block of 16-bit residue to _dst in place, saturating to 8 bits.
+  TODO: There isn't much penalty to just re-using
+   oc_frag_recon_inter_mmx() from the decoder here; we should do that.*/
+static void oc_recon8x8_mmx(unsigned char *_dst,const ogg_int16_t *_residue,
+ ogg_uint32_t _ystride){
+  int i;
+  /*Zero mm0; assumes no MMX code is emitted between these asm statements.*/
+  __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
+  for(i=8;i-->0;){
+    __asm__ __volatile__(
+      /*Load mm2 with 8 pixels from _dst.*/
+      "movq (%[dst]),%%mm2\n\t"
+      /*Load mm4 with low part of residue.*/
+      "movq (%[res]),%%mm4\n\t"
+      /*Load mm5 with high part of residue.*/
+      "movq 8(%[res]),%%mm5\n\t"
+      /*Copy mm2 to mm3.*/
+      "movq %%mm2,%%mm3\n\t"
+      /*Expand low part of _dst to 16 bits.*/
+      "punpcklbw %%mm0,%%mm2\n\t"
+      /*Expand high part of _dst to 16 bits.*/
+      "punpckhbw %%mm0,%%mm3\n\t"
+      /*Add low part with low part of residue.*/
+      "paddsw %%mm4,%%mm2\n\t"
+      /*High with high.*/
+      "paddsw %%mm5,%%mm3\n\t"
+      /*Pack and saturate to mm2.*/
+      "packuswb %%mm3,%%mm2\n\t"
+      /*_residue+=16*/
+      "lea 16(%[res]),%[res]\n\t"
+      /*Put mm2 to dest.*/
+      "movq %%mm2,(%[dst])\n\t"
+      /*_dst+=_dst_ystride*/
+      "lea (%[dst],%[ystride]),%[dst]\n\t"
+      :[dst]"+r"(_dst),[res]"+r"(_residue)
+      :[ystride]"r"((ptrdiff_t)_ystride)
+      :"memory"
+    );
+  }
 }
 
-void dsp_mmx_recon_init(DspFunctions *funcs)
-{
-  funcs->copy8x8 = copy8x8__mmx;
-  funcs->recon8x8 = recon8x8__mmx;
+void dsp_mmx_recon_init(DspFunctions *_funcs){ /*Install the MMX routines.*/
+  _funcs->copy8x8=oc_copy8x8_mmx;              /*8x8 block copy.*/
+  _funcs->recon8x8=oc_recon8x8_mmx;            /*8x8 add-residue recon.*/
 }
 
 #endif /* USE_ASM */



More information about the commits mailing list