[xiph-commits] r17280 - branches/theora-gumboot/lib/x86

tterribe at svn.xiph.org tterribe at svn.xiph.org
Mon Jun 7 15:40:35 PDT 2010


Author: tterribe
Date: 2010-06-07 15:40:34 -0700 (Mon, 07 Jun 2010)
New Revision: 17280

Added:
   branches/theora-gumboot/lib/x86/mmxextfrag.h
   branches/theora-gumboot/lib/x86/sse2frag.h
Modified:
   branches/theora-gumboot/lib/x86/mmxstate.c
Log:
Add MMXEXT and SSE2 versions of the MC functions that don't add a residual.
These will need to be reorganized later, with proper detection, etc.


Copied: branches/theora-gumboot/lib/x86/mmxextfrag.h (from rev 17278, branches/theora-gumboot/lib/x86/mmxfrag.h)
===================================================================
--- branches/theora-gumboot/lib/x86/mmxextfrag.h	                        (rev 0)
+++ branches/theora-gumboot/lib/x86/mmxextfrag.h	2010-06-07 22:40:34 UTC (rev 17280)
@@ -0,0 +1,160 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxfrag.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#if !defined(_x86_mmxextfrag_H)
+# define _x86_mmxextfrag_H (1)
+# include <stddef.h>
+# include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+/*Averages two 8x8 blocks of pixels, _src1 and _src2, and stores the result in
+   _dst.
+  All three blocks are assumed to have _ystride bytes between rows.
+  PAVGB rounds its result up, so every source row is complemented (XORed with
+   an all-ones mask kept in mm7) before averaging and the average is
+   complemented again afterwards.
+  The net effect is an average that rounds down:
+   _dst[i]=(_src1[i]+_src2[i])>>1.
+  Each macro argument is evaluated exactly once.
+  The asm overwrites mm0...mm5 and mm7 without listing them as clobbers and
+   does not execute EMMS.
+  _dst:     The destination block; it is written through.
+  _src1:    The first source block.
+  _src2:    The second source block.
+  _ystride: The number of bytes between rows; converted to ptrdiff_t.*/
+#define OC_FRAG_COPY2_MMXEXT(_dst,_src1,_src2,_ystride) \
+  do{ \
+    const unsigned char *cpysrc1; \
+    const unsigned char *cpysrc2; \
+    unsigned char       *cpydst; \
+    ptrdiff_t            ystride3; \
+    cpysrc1=(_src1); \
+    cpysrc2=(_src2); \
+    cpydst=(_dst); \
+    __asm__ __volatile__( \
+      /*Load row 0 of both sources.*/ \
+      "movq (%[src1]),%%mm0\n\t" \
+      "movq (%[src2]),%%mm2\n\t" \
+      /*mm7={-1}x8: the mask used to complement bytes.*/ \
+      "pcmpeqb %%mm7,%%mm7\n\t" \
+      /*Load row 1 of both sources.*/ \
+      "movq (%[src1],%[ystride]),%%mm1\n\t" \
+      "movq (%[src2],%[ystride]),%%mm3\n\t" \
+      /*ystride3=3*ystride.*/ \
+      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+      /*Complement, then average, rows 0 and 1.*/ \
+      "pxor %%mm7,%%mm0\n\t" \
+      "pxor %%mm7,%%mm2\n\t" \
+      "pxor %%mm7,%%mm1\n\t" \
+      "pxor %%mm7,%%mm3\n\t" \
+      "pavgb %%mm2,%%mm0\n\t" \
+      "pavgb %%mm3,%%mm1\n\t" \
+      /*Load rows 2 and 3 of both sources.*/ \
+      "movq (%[src1],%[ystride],2),%%mm2\n\t" \
+      "movq (%[src2],%[ystride],2),%%mm4\n\t" \
+      "movq (%[src1],%[ystride3]),%%mm3\n\t" \
+      "movq (%[src2],%[ystride3]),%%mm5\n\t" \
+      /*Undo the complement on the averaged rows 0 and 1.*/ \
+      "pxor %%mm7,%%mm0\n\t" \
+      "pxor %%mm7,%%mm1\n\t" \
+      /*Advance both sources to row 4.*/ \
+      "lea (%[src1],%[ystride],4),%[src1]\n\t" \
+      "lea (%[src2],%[ystride],4),%[src2]\n\t" \
+      /*Complement, average, and un-complement rows 2 and 3.*/ \
+      "pxor %%mm7,%%mm2\n\t" \
+      "pxor %%mm7,%%mm4\n\t" \
+      "pxor %%mm7,%%mm3\n\t" \
+      "pxor %%mm7,%%mm5\n\t" \
+      "pavgb %%mm4,%%mm2\n\t" \
+      "pavgb %%mm5,%%mm3\n\t" \
+      "pxor %%mm7,%%mm2\n\t" \
+      "pxor %%mm7,%%mm3\n\t" \
+      /*Store rows 0 through 3.*/ \
+      "movq %%mm0,(%[dst])\n\t" \
+      "movq %%mm1,(%[dst],%[ystride])\n\t" \
+      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+      /*Load rows 4 and 5 of both sources.*/ \
+      "movq (%[src1]),%%mm0\n\t" \
+      "movq (%[src2]),%%mm2\n\t" \
+      "movq (%[src1],%[ystride]),%%mm1\n\t" \
+      "movq (%[src2],%[ystride]),%%mm3\n\t" \
+      /*Complement, then average, rows 4 and 5.*/ \
+      "pxor %%mm7,%%mm0\n\t" \
+      "pxor %%mm7,%%mm2\n\t" \
+      "pxor %%mm7,%%mm1\n\t" \
+      "pxor %%mm7,%%mm3\n\t" \
+      "pavgb %%mm2,%%mm0\n\t" \
+      "pavgb %%mm3,%%mm1\n\t" \
+      /*Load rows 6 and 7 of both sources.*/ \
+      "movq (%[src1],%[ystride],2),%%mm2\n\t" \
+      "movq (%[src2],%[ystride],2),%%mm4\n\t" \
+      "movq (%[src1],%[ystride3]),%%mm3\n\t" \
+      "movq (%[src2],%[ystride3]),%%mm5\n\t" \
+      /*Undo the complement on the averaged rows 4 and 5.*/ \
+      "pxor %%mm7,%%mm0\n\t" \
+      "pxor %%mm7,%%mm1\n\t" \
+      /*Advance the destination to row 4.*/ \
+      "lea (%[dst],%[ystride],4),%[dst]\n\t" \
+      /*Complement and average rows 6 and 7.*/ \
+      "pxor %%mm7,%%mm2\n\t" \
+      "pxor %%mm7,%%mm4\n\t" \
+      "pxor %%mm7,%%mm3\n\t" \
+      "pxor %%mm7,%%mm5\n\t" \
+      "pavgb %%mm4,%%mm2\n\t" \
+      "pavgb %%mm5,%%mm3\n\t" \
+      /*Store row 4 while un-complementing rows 6 and 7.*/ \
+      "movq %%mm0,(%[dst])\n\t" \
+      "pxor %%mm7,%%mm2\n\t" \
+      "pxor %%mm7,%%mm3\n\t" \
+      /*Store rows 5 through 7.*/ \
+      "movq %%mm1,(%[dst],%[ystride])\n\t" \
+      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+      /*ystride3 is written before all inputs are consumed, hence the \
+         early-clobber '&'.*/ \
+      :[dst]"+r"(cpydst),[src1]"+r"(cpysrc1),[src2]"+r"(cpysrc2), \
+       [ystride3]"=&r"(ystride3) \
+      :[ystride]"r"((ptrdiff_t)(_ystride)) \
+      :"memory" \
+    ); \
+  } \
+  while(0)
+
+/*Averages two 16x8 blocks of pixels (16 bytes wide, 8 rows tall), _src1 and
+   _src2, and stores the result in _dst.
+  All three blocks are assumed to have _ystride bytes between rows.
+  Two rows are processed per loop iteration, each as a left and a right group
+   of 8 bytes.
+  PAVGB rounds its result up, so every source row is complemented (XORed with
+   an all-ones mask kept in mm7) before averaging and the average is
+   complemented again afterwards.
+  The net effect is an average that rounds down:
+   _dst[i]=(_src1[i]+_src2[i])>>1.
+  The mask is rebuilt inside every asm statement: register contents are not
+   guaranteed to survive between separate asm statements.
+  Each macro argument is evaluated exactly once.
+  The asm overwrites mm0...mm5 and mm7 without listing them as clobbers and
+   does not execute EMMS.
+  _dst:     The destination block; it is written through.
+  _src1:    The first source block.
+  _src2:    The second source block.
+  _ystride: The number of bytes between rows; converted to ptrdiff_t.*/
+#define OC_FRAGX2_COPY2_MMXEXT(_dst,_src1,_src2,_ystride) \
+  do{ \
+    const unsigned char *cpysrc1; \
+    const unsigned char *cpysrc2; \
+    unsigned char       *cpydst; \
+    ptrdiff_t            cpyystride; \
+    int                  cpyrow; \
+    cpysrc1=(_src1); \
+    cpysrc2=(_src2); \
+    cpydst=(_dst); \
+    cpyystride=(ptrdiff_t)(_ystride); \
+    for(cpyrow=0;cpyrow<8;cpyrow+=2){ \
+      __asm__ __volatile__( \
+        /*mm7={-1}x8: the mask used to complement bytes.*/ \
+        "pcmpeqb %%mm7,%%mm7\n\t" \
+        /*Load both halves of the first row of each source.*/ \
+        "movq (%[src1]),%%mm0\n\t" \
+        "movq 8(%[src1]),%%mm1\n\t" \
+        "movq (%[src2]),%%mm2\n\t" \
+        "movq 8(%[src2]),%%mm3\n\t" \
+        /*Load both halves of the second row of _src1.*/ \
+        "movq (%[src1],%[ystride]),%%mm4\n\t" \
+        "movq 8(%[src1],%[ystride]),%%mm5\n\t" \
+        /*Complement the first row of both sources.*/ \
+        "pxor %%mm7,%%mm0\n\t" \
+        "pxor %%mm7,%%mm1\n\t" \
+        "pxor %%mm7,%%mm2\n\t" \
+        "pxor %%mm7,%%mm3\n\t" \
+        /*Advance _src1 by two rows.*/ \
+        "lea (%[src1],%[ystride],2),%[src1]\n\t" \
+        /*Average the first row.*/ \
+        "pavgb %%mm2,%%mm0\n\t" \
+        "pavgb %%mm3,%%mm1\n\t" \
+        /*Load both halves of the second row of _src2.*/ \
+        "movq (%[src2],%[ystride]),%%mm2\n\t" \
+        "movq 8(%[src2],%[ystride]),%%mm3\n\t" \
+        /*Undo the complement on the averaged first row.*/ \
+        "pxor %%mm7,%%mm0\n\t" \
+        "pxor %%mm7,%%mm1\n\t" \
+        /*Complement the second row of both sources.*/ \
+        "pxor %%mm7,%%mm4\n\t" \
+        "pxor %%mm7,%%mm5\n\t" \
+        "pxor %%mm7,%%mm2\n\t" \
+        "pxor %%mm7,%%mm3\n\t" \
+        /*Advance _src2 by two rows.*/ \
+        "lea (%[src2],%[ystride],2),%[src2]\n\t" \
+        /*Average the second row.*/ \
+        "pavgb %%mm4,%%mm2\n\t" \
+        "pavgb %%mm5,%%mm3\n\t" \
+        /*Store the first row while un-complementing the second.*/ \
+        "movq %%mm0,(%[dst])\n\t" \
+        "pxor %%mm7,%%mm2\n\t" \
+        "pxor %%mm7,%%mm3\n\t" \
+        "movq %%mm1,8(%[dst])\n\t" \
+        /*Store the second row.*/ \
+        "movq %%mm2,(%[dst],%[ystride])\n\t" \
+        "movq %%mm3,8(%[dst],%[ystride])\n\t" \
+        /*Advance _dst by two rows.*/ \
+        "lea (%[dst],%[ystride],2),%[dst]\n\t" \
+        :[dst]"+r"(cpydst),[src1]"+r"(cpysrc1),[src2]"+r"(cpysrc2) \
+        :[ystride]"r"(cpyystride) \
+        :"memory" \
+      ); \
+    } \
+  } \
+  while(0)
+
+# endif
+#endif

Modified: branches/theora-gumboot/lib/x86/mmxstate.c
===================================================================
--- branches/theora-gumboot/lib/x86/mmxstate.c	2010-06-07 21:55:14 UTC (rev 17279)
+++ branches/theora-gumboot/lib/x86/mmxstate.c	2010-06-07 22:40:34 UTC (rev 17280)
@@ -20,6 +20,9 @@
 #include <string.h>
 #include "x86int.h"
 #include "mmxfrag.h"
+/*TODO: These shouldn't be included in an mmx<foo>.c file.*/
+#include "mmxextfrag.h"
+#include "sse2frag.h"
 #include "mmxloop.h"
 
 #if defined(OC_X86_ASM)
@@ -94,59 +97,6 @@
   }
 }
 
-static void oc_int_fragx2_copy2_sse2(unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
-  int i;
-  for(i=0;i<2;i++){
-    __asm__ __volatile__(
-      /*Load the first 4 rows.*/
-      "movdqu (%[src1]),%%xmm0\n\t"
-      "movdqu (%[src2]),%%xmm1\n\t"
-      "movdqu (%[src1],%[src_ystride]),%%xmm2\n\t"
-      "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
-      "movdqu (%[src2],%[src_ystride]),%%xmm3\n\t"
-      "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
-      /*xmm7={-1}x16.*/
-      "pcmpeqb %%xmm7,%%xmm7\n\t"
-      "movdqu (%[src1]),%%xmm4\n\t"
-      "movdqu (%[src2]),%%xmm5\n\t"
-      "movdqu (%[src1],%[src_ystride]),%%xmm6\n\t"
-      /*Start averaging %%xmm0 and %%xmm1.*/
-      "pxor %%xmm7,%%xmm0\n\t"
-      "pxor %%xmm7,%%xmm1\n\t"
-      "pavgb %%xmm1,%%xmm0\n\t"
-      "movdqu (%[src2],%[src_ystride]),%%xmm1\n\t"
-      "pxor %%xmm7,%%xmm2\n\t"
-      "pxor %%xmm7,%%xmm3\n\t"
-      "pavgb %%xmm3,%%xmm2\n\t"
-      "pxor %%xmm7,%%xmm4\n\t"
-      "pxor %%xmm7,%%xmm5\n\t"
-      "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
-      "pxor %%xmm7,%%xmm0\n\t"
-      "pxor %%xmm7,%%xmm2\n\t"
-      "pavgb %%xmm5,%%xmm4\n\t"
-      "pxor %%xmm7,%%xmm6\n\t"
-      "pxor %%xmm7,%%xmm1\n\t"
-      "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
-      "pavgb %%xmm1,%%xmm6\n\t"
-      "pxor %%xmm7,%%xmm4\n\t"
-      "pxor %%xmm7,%%xmm6\n\t"
-      "movdqa %%xmm0,(%[dst])\n\t"
-      "movdqa %%xmm2,(%[dst],%[dst_ystride])\n\t"
-      "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
-      "movdqa %%xmm4,(%[dst])\n\t"
-      "movdqa %%xmm6,(%[dst],%[dst_ystride])\n\t"
-      "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
-      :[dst]"+r"(_dst),[src1]"+%r"(_src1),[src2]"+r"(_src2)
-      :[dst_ystride]"r"((ptrdiff_t)_dst_ystride),
-       [src_ystride]"r"((ptrdiff_t)_src_ystride)
-      :"memory"
-    );
-  }
-}
-
-static const ogg_int16_t zeroes[64]={0};
-
 void oc_state_quad_predict_mmx(const oc_theora_state *_state,ptrdiff_t _frag_buf_off,
  int _pli,int _mask,int _ref_frame, oc_mv _mv){
   const unsigned char *ref;
@@ -164,29 +114,25 @@
   if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,_mv[0],_mv[1])>1){
     switch(_mask&3){
     case 3:
-      oc_int_fragx2_copy2_sse2(dst,ystride,ref+mvoffsets[0],ref+mvoffsets[1],ystride);
+      OC_FRAGX2_COPY2_MMXEXT(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride);
       break;
     case 1:
-      oc_frag_recon_inter2_mmx(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],
-       ystride,zeroes);
+      OC_FRAG_COPY2_MMXEXT(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],ystride);
       break;
     case 2:
-      oc_frag_recon_inter2_mmx(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],
-       ystride,zeroes);
+      OC_FRAG_COPY2_MMXEXT(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],ystride);
     }
     dst+=ystride*8;
     ref+=ystride*8;
     switch(_mask>>2){
     case 3:
-      oc_int_fragx2_copy2_sse2(dst,ystride,ref+mvoffsets[0],ref+mvoffsets[1],ystride);
+      OC_FRAGX2_COPY2_MMXEXT(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride);
       break;
     case 1:
-      oc_frag_recon_inter2_mmx(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],
-       ystride,zeroes);
+      OC_FRAG_COPY2_MMXEXT(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],ystride);
       break;
     case 2:
-      oc_frag_recon_inter2_mmx(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],
-       ystride,zeroes);
+      OC_FRAG_COPY2_MMXEXT(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],ystride);
     }
   }
   else{
@@ -233,15 +179,13 @@
    +_frag_buf_off;
   if (_mask & 1){
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,_mvs[0][0],_mvs[0][1])>1){
-      oc_frag_recon_inter2_mmx(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],
-       ystride,zeroes);
+      OC_FRAG_COPY2_MMXEXT(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],ystride);
     }
     else OC_FRAG_COPY_MMX(dst+0,ref+0+mvoffsets[0],ystride);
   }
   if (_mask & 2){
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,_mvs[1][0],_mvs[1][1])>1){
-      oc_frag_recon_inter2_mmx(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],
-       ystride,zeroes);
+      OC_FRAG_COPY2_MMXEXT(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],ystride);
     }
     else OC_FRAG_COPY_MMX(dst+8,ref+8+mvoffsets[0],ystride);
   }
@@ -249,15 +193,13 @@
   ref+=ystride*8;
   if (_mask & 4){
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,_mvs[2][0],_mvs[2][1])>1){
-      oc_frag_recon_inter2_mmx(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],
-       ystride,zeroes);
+      OC_FRAG_COPY2_MMXEXT(dst+0,ref+0+mvoffsets[0],ref+0+mvoffsets[1],ystride);
     }
     else OC_FRAG_COPY_MMX(dst+0,ref+0+mvoffsets[0],ystride);
   }
   if (_mask & 8){
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,_mvs[3][0],_mvs[3][1])>1){
-      oc_frag_recon_inter2_mmx(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],
-       ystride,zeroes);
+      OC_FRAG_COPY2_MMXEXT(dst+8,ref+8+mvoffsets[0],ref+8+mvoffsets[1],ystride);
     }
     else OC_FRAG_COPY_MMX(dst+8,ref+8+mvoffsets[0],ystride);
   }

Copied: branches/theora-gumboot/lib/x86/sse2frag.h (from rev 17278, branches/theora-gumboot/lib/x86/mmxfrag.h)
===================================================================
--- branches/theora-gumboot/lib/x86/sse2frag.h	                        (rev 0)
+++ branches/theora-gumboot/lib/x86/sse2frag.h	2010-06-07 22:40:34 UTC (rev 17280)
@@ -0,0 +1,107 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxfrag.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#if !defined(_x86_sse2frag_H)
+# define _x86_sse2frag_H (1)
+# include <stddef.h>
+# include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+/*Averages two 16x8 blocks of pixels (16 bytes wide, 8 rows tall), _src1 and
+   _src2, and stores the result in _dst.
+  All three blocks are assumed to have _ystride bytes between rows.
+  The sources are read with MOVDQU and may be unaligned.
+  The destination is written with MOVDQA, so every destination row must be
+   16-byte aligned (i.e., both _dst and _ystride must be multiples of 16).
+  PAVGB rounds its result up, so every source row is complemented (XORed with
+   an all-ones mask kept in xmm7) before averaging and the average is
+   complemented again afterwards.
+  The net effect is an average that rounds down:
+   _dst[i]=(_src1[i]+_src2[i])>>1.
+  Each macro argument is evaluated exactly once.
+  NOTE(review): The asm overwrites xmm0...xmm5 and xmm7 without listing them
+   as clobbers; confirm this is safe wherever the compiler may keep live
+   values in SSE registers.
+  _dst:     The destination block; it is written through.
+  _src1:    The first source block.
+  _src2:    The second source block.
+  _ystride: The number of bytes between rows; converted to ptrdiff_t.*/
+# define OC_FRAGX2_COPY2_SSE2(_dst,_src1,_src2,_ystride) \
+  do{ \
+    const unsigned char *cpysrc1; \
+    const unsigned char *cpysrc2; \
+    unsigned char       *cpydst; \
+    ptrdiff_t            ystride3; \
+    cpysrc1=(_src1); \
+    cpysrc2=(_src2); \
+    cpydst=(_dst); \
+    __asm__ __volatile__( \
+      /*Load row 0 of both sources.*/ \
+      "movdqu (%[src1]),%%xmm0\n\t" \
+      "movdqu (%[src2]),%%xmm2\n\t" \
+      /*xmm7={-1}x16: the mask used to complement bytes.*/ \
+      "pcmpeqb %%xmm7,%%xmm7\n\t" \
+      /*Load row 1 of both sources.*/ \
+      "movdqu (%[src1],%[ystride]),%%xmm1\n\t" \
+      "movdqu (%[src2],%[ystride]),%%xmm3\n\t" \
+      /*ystride3=3*ystride.*/ \
+      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+      /*Complement, then average, rows 0 and 1.*/ \
+      "pxor %%xmm7,%%xmm0\n\t" \
+      "pxor %%xmm7,%%xmm2\n\t" \
+      "pxor %%xmm7,%%xmm1\n\t" \
+      "pxor %%xmm7,%%xmm3\n\t" \
+      "pavgb %%xmm2,%%xmm0\n\t" \
+      "pavgb %%xmm3,%%xmm1\n\t" \
+      /*Load rows 2 and 3 of both sources.*/ \
+      "movdqu (%[src1],%[ystride],2),%%xmm2\n\t" \
+      "movdqu (%[src2],%[ystride],2),%%xmm4\n\t" \
+      "movdqu (%[src1],%[ystride3]),%%xmm3\n\t" \
+      "movdqu (%[src2],%[ystride3]),%%xmm5\n\t" \
+      /*Undo the complement on the averaged rows 0 and 1.*/ \
+      "pxor %%xmm7,%%xmm0\n\t" \
+      "pxor %%xmm7,%%xmm1\n\t" \
+      /*Advance both sources to row 4.*/ \
+      "lea (%[src1],%[ystride],4),%[src1]\n\t" \
+      "lea (%[src2],%[ystride],4),%[src2]\n\t" \
+      /*Complement, average, and un-complement rows 2 and 3.*/ \
+      "pxor %%xmm7,%%xmm2\n\t" \
+      "pxor %%xmm7,%%xmm4\n\t" \
+      "pxor %%xmm7,%%xmm3\n\t" \
+      "pxor %%xmm7,%%xmm5\n\t" \
+      "pavgb %%xmm4,%%xmm2\n\t" \
+      "pavgb %%xmm5,%%xmm3\n\t" \
+      "pxor %%xmm7,%%xmm2\n\t" \
+      "pxor %%xmm7,%%xmm3\n\t" \
+      /*Store rows 0 through 3 (aligned stores).*/ \
+      "movdqa %%xmm0,(%[dst])\n\t" \
+      "movdqa %%xmm1,(%[dst],%[ystride])\n\t" \
+      "movdqa %%xmm2,(%[dst],%[ystride],2)\n\t" \
+      "movdqa %%xmm3,(%[dst],%[ystride3])\n\t" \
+      /*Load rows 4 and 5 of both sources.*/ \
+      "movdqu (%[src1]),%%xmm0\n\t" \
+      "movdqu (%[src2]),%%xmm2\n\t" \
+      "movdqu (%[src1],%[ystride]),%%xmm1\n\t" \
+      "movdqu (%[src2],%[ystride]),%%xmm3\n\t" \
+      /*Complement, then average, rows 4 and 5.*/ \
+      "pxor %%xmm7,%%xmm0\n\t" \
+      "pxor %%xmm7,%%xmm2\n\t" \
+      "pxor %%xmm7,%%xmm1\n\t" \
+      "pxor %%xmm7,%%xmm3\n\t" \
+      "pavgb %%xmm2,%%xmm0\n\t" \
+      "pavgb %%xmm3,%%xmm1\n\t" \
+      /*Load rows 6 and 7 of both sources.*/ \
+      "movdqu (%[src1],%[ystride],2),%%xmm2\n\t" \
+      "movdqu (%[src2],%[ystride],2),%%xmm4\n\t" \
+      "movdqu (%[src1],%[ystride3]),%%xmm3\n\t" \
+      "movdqu (%[src2],%[ystride3]),%%xmm5\n\t" \
+      /*Undo the complement on the averaged rows 4 and 5.*/ \
+      "pxor %%xmm7,%%xmm0\n\t" \
+      "pxor %%xmm7,%%xmm1\n\t" \
+      /*Advance the destination to row 4.*/ \
+      "lea (%[dst],%[ystride],4),%[dst]\n\t" \
+      /*Complement and average rows 6 and 7.*/ \
+      "pxor %%xmm7,%%xmm2\n\t" \
+      "pxor %%xmm7,%%xmm4\n\t" \
+      "pxor %%xmm7,%%xmm3\n\t" \
+      "pxor %%xmm7,%%xmm5\n\t" \
+      "pavgb %%xmm4,%%xmm2\n\t" \
+      "pavgb %%xmm5,%%xmm3\n\t" \
+      /*Store row 4 while un-complementing rows 6 and 7.*/ \
+      "movdqa %%xmm0,(%[dst])\n\t" \
+      "pxor %%xmm7,%%xmm2\n\t" \
+      "pxor %%xmm7,%%xmm3\n\t" \
+      /*Store rows 5 through 7.*/ \
+      "movdqa %%xmm1,(%[dst],%[ystride])\n\t" \
+      "movdqa %%xmm2,(%[dst],%[ystride],2)\n\t" \
+      "movdqa %%xmm3,(%[dst],%[ystride3])\n\t" \
+      /*ystride3 is written before all inputs are consumed, hence the \
+         early-clobber '&'.*/ \
+      :[dst]"+r"(cpydst),[src1]"+r"(cpysrc1),[src2]"+r"(cpysrc2), \
+       [ystride3]"=&r"(ystride3) \
+      :[ystride]"r"((ptrdiff_t)(_ystride)) \
+      :"memory" \
+    ); \
+  } \
+  while(0)
+
+# endif
+#endif



More information about the commits mailing list