[Theora-dev] [PATCH] promised MMX patches rc1

Rudolf Marek r.marek at sh.cvut.cz
Tue Jul 19 06:05:24 PDT 2005


> i would like to try it  

I now have a 64-bit userspace account, so I fixed it.
I would like to have it tested; could you please give it a try? ./dump_video produces files with the same MD5SUMs as before, so I think it works.
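The main 64-bit change is the cpuid code, so here is a minimal standalone sketch of the rbx-preserving idiom, in case it helps with review. This is just my illustration, not part of the patch below: the file name cpuid-test.c is made up, and I pin the EBX copy to %esi with an "=S" constraint instead of the plain "=r" scratch used in cpu.c.

/* cpuid-test.c -- minimal sketch, assumes gcc on an x86-64 box */
#include <stdio.h>
#include <string.h>

/* On the 64-bit ABI %rbx can be the PIC/GOT register, so save and restore
   it around cpuid and copy the EBX result out through another register. */
#define cpuid(op,eax,ebx,ecx,edx) \
  __asm__ __volatile__( \
   "push %%rbx       \n\t" \
   "cpuid            \n\t" \
   "movl %%ebx,%%esi \n\t" \
   "pop  %%rbx" \
   :"=a" (eax),"=S" (ebx),"=c" (ecx),"=d" (edx) \
   :"a" (op) \
   :"cc")

int main(void){
  unsigned eax,ebx,ecx,edx;
  char vendor[13];
  cpuid(0,eax,ebx,ecx,edx);   /* leaf 0: max leaf in EAX, vendor string */
  memcpy(vendor,&ebx,4);      /* vendor comes back in EBX, EDX, ECX */
  memcpy(vendor+4,&edx,4);
  memcpy(vendor+8,&ecx,4);
  vendor[12]='\0';
  printf("max cpuid leaf %u, vendor \"%s\"\n",eax,vendor);
  return 0;
}

Build it with plain gcc on a 64-bit machine; the vendor string it prints should match what cpu.c detects (GenuineIntel, AuthenticAMD, ...).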

The patch is attached.
You need to apply it with -p2 (i.e. run patch from inside the theora-exp directory); sorry about that.

Derf, if you like it, please check it into SVN.

I hope to find some time for the MMX IDCT patch soon. (I just need to integrate it into the current SVN tree; the patch itself works.)

I would like to thank j^ for testing this patch.

Regards

Rudolf
-------------- next part --------------
diff -Naur svn/theora-exp/lib/x86/cpu.c integrate/theora-exp/lib/x86/cpu.c
--- svn/theora-exp/lib/x86/cpu.c	2005-07-19 14:11:13.975046750 +0200
+++ integrate/theora-exp/lib/x86/cpu.c	2005-07-19 14:49:33.690770000 +0200
@@ -20,6 +20,25 @@
   ogg_uint32_t ecx;
   ogg_uint32_t edx;
   ogg_uint32_t flags;
+
+#if (defined(__amd64__) || defined(__x86_64__))
+
+#define cpuid(op,eax,ebx,ecx,edx) \
+  __asm__ __volatile__( \
+   "push %%rbx   \n\t" \
+   "cpuid         \n\t" \
+   "movl %%ebx,%1 \n\t" \
+   "pop %%rbx" \
+   :"=a" (eax), \
+    "=r" (ebx), \
+    "=c" (ecx), \
+    "=d" (edx) \
+   :"a" (op) \
+   :"cc" \
+  )
+
+#else
+
 #define cpuid(op,eax,ebx,ecx,edx) \
   __asm__ __volatile__( \
    "pushl %%ebx   \n\t" \
@@ -33,6 +52,7 @@
    :"a" (op) \
    :"cc" \
   )
+  
   __asm__ __volatile__(
    "pushfl              \n\t"
    "pushfl              \n\t"
@@ -51,6 +71,7 @@
   );
   /*No cpuid.*/
   if(eax==ebx)return 0;
+#endif
   cpuid(0,eax,ebx,ecx,edx);
   if(ebx==0x756e6547&&edx==0x49656e69&&ecx==0x6c65746e){
     /*Intel:*/
diff -Naur svn/theora-exp/lib/x86/mmxfrag.c integrate/theora-exp/lib/x86/mmxfrag.c
--- svn/theora-exp/lib/x86/mmxfrag.c	2005-07-19 14:11:13.971046500 +0200
+++ integrate/theora-exp/lib/x86/mmxfrag.c	2005-07-19 14:55:05.863529500 +0200
@@ -20,7 +20,7 @@
   __asm__ __volatile__(
    "  mov          $0x7, %%ecx  \n\t" /* 8x loop */
    "  .balign 16                \n\t"
-   "1:  movq     (V128), %%mm0  \n\t" /* Set mm0 to 0x0080008000800080 */
+   "1:  movq     %3, %%mm0  \n\t" /* Set mm0 to 0x0080008000800080 */
    "  movq         (%1), %%mm2  \n\t" /* First four input values */
    "  movq        %%mm0, %%mm1  \n\t" /* Set mm1 == mm0 */
    "  movq        8(%1), %%mm3  \n\t" /* Next four input values */
@@ -34,8 +34,9 @@
    "  jns 1b                    \n\t" /* loop */
    :"+r" (_dst)
    :"r" (_residue),
-    "r" (_dst_ystride)
-   :"memory", "ecx"
+    "r" ((long) _dst_ystride),
+    "m" (V128)
+   :"memory", "ecx", "cc"
   );
 }
 
@@ -61,13 +62,58 @@
    "  jns         1b               \n\t" /* loop */
    :"+r" (_dst)
    :"r" (_residue), 
-    "r" (_dst_ystride),
-    "r" (_src_ystride),
+    "r" ((long) _dst_ystride),
+    "r" ((long) _src_ystride),
     "r" (_src)
-   :"memory", "eax"
+   :"memory", "eax", "cc"
   );
 }
 
+#if (defined(__amd64__) ||  defined(__x86_64__))
+
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue){
+
+  __asm__ __volatile__(
+   "  movl         $0x7,   %%eax   \n\t" /* 8x loop */
+   "  pxor         %%mm0,  %%mm0   \n\t" /* zero mm0 */
+   "  movq         (%4),   %%mm2   \n\t" /* load mm2 with _src1 */
+   "  .balign 16                   \n\t"
+   "1: movq        (%6),   %%mm4   \n\t" /* packed SRC2 */ 
+   "  movq         %%mm2,  %%mm3   \n\t" /* copy to mm3 */
+   "  movq         %%mm4,  %%mm5   \n\t" /* copy packed src2 to mm5 */
+   "  punpcklbw    %%mm0,  %%mm2   \n\t" /* expand low part of src1 to mm2 */
+   "  punpcklbw    %%mm0,  %%mm4   \n\t" /* low part expand of src2 to mm4 */
+   "  lea          (%4,%3), %4     \n\t" /*  _src1+_src1_ystride */
+   "  punpckhbw    %%mm0,  %%mm3   \n\t" /* expand high part of src1 to mm3 */
+   "  punpckhbw    %%mm0,  %%mm5   \n\t" /* high part expand of src2 to mm5 */
+   "  paddsw       %%mm2,  %%mm4   \n\t" /* add low parts of src1 and src2 */
+   "  paddsw       %%mm3,  %%mm5   \n\t" /* add high parts of src1 and src2 */
+   "  lea          (%6,%5), %6     \n\t" /* _src2+_src2_ystride */  
+   "  movq         (%4), %%mm2     \n\t" /* load mm2 with _src1 */
+   "  psrlw        $1,     %%mm4   \n\t" /* shift logical 1 to right o 2 dolu */
+   "  psrlw        $1,     %%mm5   \n\t" /* shift logical 1 to right */
+   "  paddsw       (%1),   %%mm4   \n\t" /* add low parts wwith low parts */
+   "  paddsw       8(%1),  %%mm5   \n\t" /* add highparts with high */
+   "  packuswb     %%mm5,  %%mm4   \n\t" /* pack saturate high to low */
+   "  lea          0x10(%1), %1    \n\t" /* _residuo+16 */
+   "  movq         %%mm4, (%0)     \n\t" /* write to src */
+   "  decl         %%eax           \n\t"
+   "  lea          (%0,%2), %0     \n\t" /* _dst+_dst_ystride */
+   "  jns          1b\n\t"
+   :"+r" (_dst) /* 0 */
+   :"r" (_residue), /* 1 */
+    "r" ((long) _dst_ystride), /* 2 */
+    "r" ((long) _src1_ystride), /* 3 */
+    "r" (_src1), /* 4 */
+    "r" ((long) _src2_ystride), /* 5 */
+    "r" (_src2) /* 6 */
+   : "memory", "cc", "eax"
+  );
+}
+#else
+
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
  int _src2_ystride,const ogg_int16_t *_residue){
@@ -109,10 +155,12 @@
     "m" (_src2_ystride), /* 5 */
     "r" (_src2), /* 6 */
     "m" (i)
-   :"memory", "eax"
+   : "memory", "eax", "cc"
   );
 }
 
+#endif
+
 void oc_restore_fpu_mmx(void){
   __asm__ __volatile__(
    "  emms    \n\t" /* pack with next(high) four values */
diff -Naur svn/theora-exp/lib/x86/mmxstate.c integrate/theora-exp/lib/x86/mmxstate.c
--- svn/theora-exp/lib/x86/mmxstate.c	2005-07-19 14:11:13.971046500 +0200
+++ integrate/theora-exp/lib/x86/mmxstate.c	2005-07-19 14:57:31.636639750 +0200
@@ -39,6 +39,37 @@
     frag=_state->frags+*fragi;
     dst=frag->buffer[dst_framei];
     src=frag->buffer[src_framei];
+
+#if (defined(__amd64__) || defined(__x86_64__))
+    __asm__ __volatile__(
+     "  lea         (%3, %3, 2), %%rsi   \n\t"  /* esi=src_stride*3 */
+     "  movq        (%1),        %%mm0   \n\t"  /* src */
+     "  lea         (%2, %2, 2), %%rdi   \n\t"  /* edi=dst_stride*3 */
+     "  movq        (%1, %3),    %%mm1   \n\t"  /* src+1x stride */
+     "  movq        (%1, %3, 2), %%mm2   \n\t"  /* src+2x stride */
+     "  movq        (%1, %%rsi), %%mm3   \n\t"  /* src+3x stride */
+     "  movq        %%mm0,       (%0)    \n\t"  /* dst */
+     "  movq        %%mm1,       (%0, %2)\n\t"  /* dst+dst_stride */
+     "  lea         (%1,%3,4),   %1      \n\t"  /* pointer to next 4 */
+     "  movq        %%mm2,       (%0, %2, 2)      \n\t"  /*dst+2x dst_stride */
+     "  movq        %%mm3,       (%0, %%rdi)      \n\t"  /* 3x */
+     "  lea         (%0,%2,4),   %0      \n\t"  /* pointer to next 4 */
+     "  movq        (%1),        %%mm0   \n\t"  /* src */
+     "  movq        (%1, %3),    %%mm1   \n\t"  /* src+1x stride */
+     "  movq        (%1, %3, 2), %%mm2   \n\t"  /* src+2x stride */
+     "  movq        (%1, %%rsi), %%mm3   \n\t"  /* src+3x stride */
+     "  movq        %%mm0,       (%0)    \n\t"  /* dst */
+     "  movq        %%mm1,       (%0, %2)\n\t"  /* dst+dst_stride */
+     "  movq        %%mm2,       (%0, %2, 2)     \n\t"  /* dst+2x dst_stride */
+     "  movq        %%mm3,       (%0, %%rdi)     \n\t"  /* 3x */
+     :"+r" (dst) /* 0 */
+     :"r" (src),  /* 1 */
+      "r" ((long) dst_ystride), /* 2 */
+      "r" ((long) src_ystride) /* 3 */
+     :"memory", "rsi","rdi"
+    );
+  }
+#else
     __asm__ __volatile__(
      "  lea         (%3, %3, 2), %%esi   \n\t"  /* esi=src_stride*3 */
      "  movq        (%1),        %%mm0   \n\t"  /* src */
@@ -67,6 +98,7 @@
      :"memory", "esi","edi"
     );
   }
+#endif
   /*This needs to be removed when decode specific functions are implemented:*/
   __asm__ __volatile__("emms\n\t");
 }

