[xiph-commits] r14152 - trunk/theora-exp/lib/x86

tterribe at svn.xiph.org tterribe at svn.xiph.org
Thu Nov 15 06:23:26 PST 2007


Author: tterribe
Date: 2007-11-15 06:23:26 -0800 (Thu, 15 Nov 2007)
New Revision: 14152

Modified:
   trunk/theora-exp/lib/x86/cpu.c
   trunk/theora-exp/lib/x86/mmxfrag.c
   trunk/theora-exp/lib/x86/mmxidct.c
   trunk/theora-exp/lib/x86/mmxstate.c
Log:
Minor asm updates to get rid of mem operands that can be easily generated in a
 register.


Modified: trunk/theora-exp/lib/x86/cpu.c
===================================================================
--- trunk/theora-exp/lib/x86/cpu.c	2007-11-15 14:21:36 UTC (rev 14151)
+++ trunk/theora-exp/lib/x86/cpu.c	2007-11-15 14:23:26 UTC (rev 14152)
@@ -23,7 +23,7 @@
   ogg_uint32_t ecx;
   ogg_uint32_t edx;
   ogg_uint32_t flags;
-#if (defined(__amd64__) || defined(__x86_64__))
+#if (defined(__amd64__)||defined(__x86_64__))
 # define cpuid(_op,_eax,_ebx,_ecx,_edx) \
   __asm__ __volatile__( \
    "push %%rbx\n\t" \

Modified: trunk/theora-exp/lib/x86/mmxfrag.c
===================================================================
--- trunk/theora-exp/lib/x86/mmxfrag.c	2007-11-15 14:21:36 UTC (rev 14151)
+++ trunk/theora-exp/lib/x86/mmxfrag.c	2007-11-15 14:23:26 UTC (rev 14152)
@@ -16,28 +16,29 @@
 
 #if defined(OC_X86ASM)
 
-static const __attribute__((aligned(8),used)) ogg_int64_t OC_V128=
- 0x0080008000800080LL;
-
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
  const ogg_int16_t *_residue){
   int i;
   for(i=8;i-->0;){
     __asm__ __volatile__(
-      /*Set mm0 to 0x0080008000800080.*/
-      "movq %[OC_V128],%%mm0\n\t"
+      /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
+      "pcmpeqw %%mm0,%%mm0\n\t"
       /*First four input values*/
       "movq (%[residue]),%%mm2\n\t"
-      /*Set mm1=mm0.*/
-      "movq %%mm0,%%mm1\n\t"
+      /*Set mm0 to 0x8000800080008000.*/
+      "psllw $15,%%mm0\n\t"
       /*Next four input values.*/
       "movq 8(%[residue]),%%mm3\n\t"
-      /*Add 128 and saturate to 16 bits.*/
-      "paddsw %%mm3,%%mm1\n\t"
+      /*Set mm0 to 0x0080008000800080.*/
+      "psrlw $8,%%mm0\n\t"
       /*_residue+=16*/
       "lea 0x10(%[residue]),%[residue]\n\t"
+      /*Set mm1=mm0.*/
+      "movq %%mm0,%%mm1\n\t"
       /*Add 128 and saturate to 16 bits.*/
       "paddsw %%mm2,%%mm0\n\t"
+      /*Add 128 and saturate to 16 bits.*/
+      "paddsw %%mm3,%%mm1\n\t"
       /*Pack saturate with next(high) four values.*/
       "packuswb %%mm1,%%mm0\n\t"
       /*Writeback.*/
@@ -45,7 +46,7 @@
       /*_dst+=_dst_ystride*/
       "lea  (%[dst],%[dst_ystride]),%[dst]\n\t"
       :[dst]"+r"(_dst),[residue]"+r"(_residue)
-      :[dst_ystride]"r"((long)_dst_ystride),[OC_V128]"m"(OC_V128)
+      :[dst_ystride]"r"((long)_dst_ystride)
       :"memory"
     );
   }

Modified: trunk/theora-exp/lib/x86/mmxidct.c
===================================================================
--- trunk/theora-exp/lib/x86/mmxidct.c	2007-11-15 14:21:36 UTC (rev 14151)
+++ trunk/theora-exp/lib/x86/mmxidct.c	2007-11-15 14:23:26 UTC (rev 14152)
@@ -9,22 +9,16 @@
 #if defined(OC_X86ASM)
 
 /*These are offsets into the table of constants below.*/
-/*4 masks, in order: low word to high.*/
-#define OC_MASK_OFFSET    (0)
 /*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
-#define OC_COSINE_OFFSET (32)
+#define OC_COSINE_OFFSET (0)
 /*A row of 8's.*/
-#define OC_EIGHT_OFFSET  (88)
+#define OC_EIGHT_OFFSET  (56)
 
 
 
 /*A table of constants used by the MMX routines.*/
 static const ogg_uint16_t __attribute__((aligned(8),used))
- OC_IDCT_CONSTS[(4+7+1)*4]={
-  65535,    0,    0,    0,
-      0,65535,    0,    0,
-      0,    0,65535,    0,
-      0,    0,    0,65535,
+ OC_IDCT_CONSTS[(7+1)*4]={
   (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
   (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
   (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
@@ -299,7 +293,6 @@
   "#end OC_COLUMN_IDCT\n\t" \
 
 #define OC_MID(_m,_i) OC_M2STR(_m+(_i)*8)"(%[c])"
-#define OC_M(_i)      OC_MID(OC_MASK_OFFSET,_i)
 #define OC_C(_i)      OC_MID(OC_COSINE_OFFSET,_i-1)
 #define OC_8          OC_MID(OC_EIGHT_OFFSET,0)
 

Modified: trunk/theora-exp/lib/x86/mmxstate.c
===================================================================
--- trunk/theora-exp/lib/x86/mmxstate.c	2007-11-15 14:21:36 UTC (rev 14151)
+++ trunk/theora-exp/lib/x86/mmxstate.c	2007-11-15 14:23:26 UTC (rev 14152)
@@ -17,11 +17,6 @@
 
 #if defined(OC_X86ASM)
 
-static const __attribute__((aligned(8),used)) ogg_int64_t OC_V3=
- 0x0003000300030003LL;
-static const __attribute__((aligned(8),used)) ogg_int64_t OC_V4=
- 0x0004000400040004LL;
-
 static const __attribute__((aligned(8),used)) int OC_FZIG_ZAGMMX[64]={
    0, 8, 1, 2, 9,16,24,17,
   10, 3,32,11,18,25, 4,12,
@@ -292,19 +287,21 @@
     "punpcklbw %%mm0,%%mm4\n\t"
     "punpckhbw %%mm0,%%mm3\n\t"
     "punpcklbw %%mm0,%%mm2\n\t"
-    /*Preload...*/
-    "movq %[OC_V3],%%mm0\n\t"
-    /*mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
+    /*mm0=3 3 3 3
+      mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
+    "pcmpeqw %%mm0,%%mm0\n\t"
     "psubw %%mm5,%%mm3\n\t"
+    "psrlw $14,%%mm0\n\t"
     "psubw %%mm4,%%mm2\n\t"
     /*Scale by 3.*/
     "pmullw %%mm0,%%mm3\n\t"
     "pmullw %%mm0,%%mm2\n\t"
-    /*Preload...*/
-    "movq %[OC_V4],%%mm0\n\t"
-    /*f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
+    /*mm0=4 4 4 4
+      f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
        3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
+    "psrlw $1,%%mm0\n\t"
     "paddw %%mm7,%%mm3\n\t"
+    "psllw $2,%%mm0\n\t"
     "paddw %%mm6,%%mm2\n\t"
     /*Add 4.*/
     "paddw %%mm0,%%mm3\n\t"
@@ -427,8 +424,7 @@
     "movq %%mm4,(%[pix],%[ystride])\n\t"
     "movq %%mm1,(%[pix],%[ystride],2)\n\t"
     :[s]"=&S"(esi)
-    :[pix]"r"(_pix),[ystride]"r"((long)_ystride),[ll]"r"(_ll),
-     [OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
+    :[pix]"r"(_pix),[ystride]"r"((long)_ystride),[ll]"r"(_ll)
     :"memory"
   );
 }
@@ -479,14 +475,20 @@
     "psubw %%mm3,%%mm1\n\t"
     /*Save a copy of pix[2] for later.*/
     "movq %%mm0,%%mm4\n\t"
-    /*mm0=mm0-mm5==pix[2]-pix[1]*/
+    /*mm2=3 3 3 3
+      mm0=mm0-mm5==pix[2]-pix[1]*/
+    "pcmpeqw %%mm2,%%mm2\n\t"
     "psubw %%mm5,%%mm0\n\t"
+    "psrlw $14,%%mm2\n\t"
     /*Scale by 3.*/
-    "pmullw %[OC_V3],%%mm0\n\t"
-    /*f=mm1==_pix[0]-_pix[3]+ 3*(_pix[2]-_pix[1])*/
+    "pmullw %%mm2,%%mm0\n\t"
+    /*mm2=4 4 4 4
+      f=mm1==_pix[0]-_pix[3]+ 3*(_pix[2]-_pix[1])*/
+    "psrlw $1,%%mm2\n\t"
     "paddw %%mm1,%%mm0\n\t"
+    "psllw $2,%%mm2\n\t"
     /*Add 4.*/
-    "paddw %[OC_V4],%%mm0\n\t"
+    "paddw %%mm2,%%mm0\n\t"
     /*"Divide" by 8, producing the residuals R_i.*/
     "psraw $3,%%mm0\n\t"
     /*Now compute lflim of mm0 cf. Section 7.10 of the spec.*/
@@ -565,7 +567,7 @@
     "movw %%di,1(%[pix],%[s])\n\t"
     :[s]"=&S"(esi),[d]"=&D"(edi),
      [pix]"+r"(_pix),[ystride]"+r"(_ystride),[ll]"+r"(_ll)
-    :[OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
+    :
     :"memory"
   );
 }
@@ -579,8 +581,8 @@
 
 /*We copy the whole function because the MMX routines will be inlined 4 times,
    and we can do just a single emms call at the end this way.
-  We also do not utilize the _bv lookup table, instead computing the values
-   that would lie in it on the fly.*/
+  We also do not use the _bv lookup table, instead computing the values that
+   would lie in it on the fly.*/
 
 /*Apply the loop filter to a given set of fragment rows in the given plane.
   The filter may be run on the bottom edge, affecting pixels in the next row of



More information about the commits mailing list