[xiph-commits] r14835 - trunk/theora/lib/dec/x86
tterribe at svn.xiph.org
tterribe at svn.xiph.org
Sun May 4 10:37:16 PDT 2008
Author: tterribe
Date: 2008-05-04 10:37:16 -0700 (Sun, 04 May 2008)
New Revision: 14835
Modified:
trunk/theora/lib/dec/x86/mmxfrag.c
trunk/theora/lib/dec/x86/mmxstate.c
Log:
Replace "long" with "ptrdiff_t" in the x86 asm, since win64 is apparently the
only platform in existence where sizeof(long) < sizeof(void *).
Yes, I knew it was technically possible, but I thought at least on x86 we could
assume such things did not happen; now I know better.
Modified: trunk/theora/lib/dec/x86/mmxfrag.c
===================================================================
--- trunk/theora/lib/dec/x86/mmxfrag.c 2008-05-04 14:03:21 UTC (rev 14834)
+++ trunk/theora/lib/dec/x86/mmxfrag.c 2008-05-04 17:37:16 UTC (rev 14835)
@@ -21,6 +21,7 @@
Note: Loops are unrolled for best performance.
The iteration each instruction belongs to is marked in the comments as #i.*/
#include "x86int.h"
+#include <stdlib.h>
#if defined(USE_ASM)
@@ -133,8 +134,8 @@
:[residue]"r"(_residue),
[dst]"r"(_dst),
[dst4]"r"(_dst+(_dst_ystride<<2)),
- [dst_ystride]"r"((long)_dst_ystride),
- [dst_ystride3]"r"((long)_dst_ystride*3)
+ [dst_ystride]"r"((ptrdiff_t)_dst_ystride),
+ [dst_ystride3]"r"((ptrdiff_t)_dst_ystride*3)
:"memory"
);
}
@@ -185,8 +186,8 @@
/*Advance dst.*/
"lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
:[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src)
- :[dst_ystride]"r"((long)_dst_ystride),
- [src_ystride]"r"((long)_src_ystride)
+ :[dst_ystride]"r"((ptrdiff_t)_dst_ystride),
+ [src_ystride]"r"((ptrdiff_t)_src_ystride)
:"memory"
);
}
@@ -278,7 +279,7 @@
"lea (%[dst],%[ystride],2),%[dst]\n\t"
:[dst]"+r"(_dst),[residue]"+r"(_residue),
[src1]"+r"(_src1),[src2]"+r"(_src2)
- :[ystride]"r"((long)_dst_ystride)
+ :[ystride]"r"((ptrdiff_t)_dst_ystride)
:"memory"
);
}
Modified: trunk/theora/lib/dec/x86/mmxstate.c
===================================================================
--- trunk/theora/lib/dec/x86/mmxstate.c 2008-05-04 14:03:21 UTC (rev 14834)
+++ trunk/theora/lib/dec/x86/mmxstate.c 2008-05-04 17:37:16 UTC (rev 14835)
@@ -19,6 +19,7 @@
Originally written by Rudolf Marek.*/
#include "x86int.h"
#include "../../internal.h"
+#include <stdlib.h>
#if defined(USE_ASM)
@@ -182,9 +183,9 @@
const int *fragi;
const int *fragi_end;
int dst_framei;
- long dst_ystride;
+ ptrdiff_t dst_ystride;
int src_framei;
- long src_ystride;
+ ptrdiff_t src_ystride;
dst_framei=_state->ref_frame_idx[_dst_frame];
src_framei=_state->ref_frame_idx[_src_frame];
dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].stride;
@@ -194,14 +195,14 @@
oc_fragment *frag;
unsigned char *dst;
unsigned char *src;
- long esi;
+ ptrdiff_t s;
frag=_state->frags+*fragi;
dst=frag->buffer[dst_framei];
src=frag->buffer[src_framei];
__asm__ __volatile__(
/*src+0*src_ystride*/
"movq (%[src]),%%mm0\n\t"
- /*esi=src_ystride*3*/
+ /*s=src_ystride*3*/
"lea (%[src_ystride],%[src_ystride],2),%[s]\n\t"
/*src+1*src_ystride*/
"movq (%[src],%[src_ystride]),%%mm1\n\t"
@@ -211,7 +212,7 @@
"movq (%[src],%[s]),%%mm3\n\t"
/*dst+0*dst_ystride*/
"movq %%mm0,(%[dst])\n\t"
- /*esi=dst_ystride*3*/
+ /*s=dst_ystride*3*/
"lea (%[dst_ystride],%[dst_ystride],2),%[s]\n\t"
/*dst+1*dst_ystride*/
"movq %%mm1,(%[dst],%[dst_ystride])\n\t"
@@ -225,7 +226,7 @@
"lea (%[dst],%[dst_ystride],4),%[dst]\n\t"
/*src+0*src_ystride*/
"movq (%[src]),%%mm0\n\t"
- /*esi=src_ystride*3*/
+ /*s=src_ystride*3*/
"lea (%[src_ystride],%[src_ystride],2),%[s]\n\t"
/*src+1*src_ystride*/
"movq (%[src],%[src_ystride]),%%mm1\n\t"
@@ -235,7 +236,7 @@
"movq (%[src],%[s]),%%mm3\n\t"
/*dst+0*dst_ystride*/
"movq %%mm0,(%[dst])\n\t"
- /*esi=dst_ystride*3*/
+ /*s=dst_ystride*3*/
"lea (%[dst_ystride],%[dst_ystride],2),%[s]\n\t"
/*dst+1*dst_ystride*/
"movq %%mm1,(%[dst],%[dst_ystride])\n\t"
@@ -243,7 +244,7 @@
"movq %%mm2,(%[dst],%[dst_ystride],2)\n\t"
/*dst+3*dst_ystride*/
"movq %%mm3,(%[dst],%[s])\n\t"
- :[s]"=&S"(esi)
+ :[s]"=&r"(s)
:[dst]"r"(dst),[src]"r"(src),[dst_ystride]"r"(dst_ystride),
[src_ystride]"r"(src_ystride)
:"memory"
@@ -255,12 +256,12 @@
static void loop_filter_v(unsigned char *_pix,int _ystride,
const ogg_int16_t *_ll){
- long esi;
+ ptrdiff_t s;
_pix-=_ystride*2;
__asm__ __volatile__(
/*mm0=0*/
"pxor %%mm0,%%mm0\n\t"
- /*esi=_ystride*3*/
+ /*s=_ystride*3*/
"lea (%[ystride],%[ystride],2),%[s]\n\t"
/*mm7=_pix[0...8]*/
"movq (%[pix]),%%mm7\n\t"
@@ -427,8 +428,8 @@
/*Write it back out.*/
"movq %%mm4,(%[pix],%[ystride])\n\t"
"movq %%mm1,(%[pix],%[ystride],2)\n\t"
- :[s]"=&S"(esi)
- :[pix]"r"(_pix),[ystride]"r"((long)_ystride),[ll]"r"(_ll)
+ :[s]"=&r"(s)
+ :[pix]"r"(_pix),[ystride]"r"((ptrdiff_t)_ystride),[ll]"r"(_ll)
:"memory"
);
}
@@ -437,14 +438,16 @@
Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
four p0's to one register we must transpose the values in four mmx regs.
When half is done we repeat this for the rest.*/
-static void loop_filter_h4(unsigned char *_pix,long _ystride,
+static void loop_filter_h4(unsigned char *_pix,ptrdiff_t _ystride,
const ogg_int16_t *_ll){
- long esi;
- long edi;
+ ptrdiff_t s;
+ /*d doesn't technically need to be 64-bit on x86-64, but making it so will
+ help avoid partial register stalls.*/
+ ptrdiff_t d;
__asm__ __volatile__(
/*x x x x 3 2 1 0*/
"movd (%[pix]),%%mm0\n\t"
- /*esi=_ystride*3*/
+ /*s=_ystride*3*/
"lea (%[ystride],%[ystride],2),%[s]\n\t"
/*x x x x 7 6 5 4*/
"movd (%[pix],%[ystride]),%%mm1\n\t"
@@ -557,19 +560,19 @@
"packuswb %%mm7,%%mm4\n\t"
/*mm5=E D A 9 6 5 2 1*/
"punpcklbw %%mm4,%%mm5\n\t"
- /*edi=6 5 2 1*/
- "movd %%mm5,%%edi\n\t"
- "movw %%di,1(%[pix])\n\t"
+ /*d=6 5 2 1*/
+ "movd %%mm5,%[d]\n\t"
+ "movw %w[d],1(%[pix])\n\t"
/*Why is there such a big stall here?*/
"psrlq $32,%%mm5\n\t"
- "shrl $16,%%edi\n\t"
- "movw %%di,1(%[pix],%[ystride])\n\t"
- /*edi=E D A 9*/
- "movd %%mm5,%%edi\n\t"
- "movw %%di,1(%[pix],%[ystride],2)\n\t"
- "shrl $16,%%edi\n\t"
- "movw %%di,1(%[pix],%[s])\n\t"
- :[s]"=&S"(esi),[d]"=&D"(edi),
+ "shr $16,%[d]\n\t"
+ "movw %w[d],1(%[pix],%[ystride])\n\t"
+ /*d=E D A 9*/
+ "movd %%mm5,%[d]\n\t"
+ "movw %w[d],1(%[pix],%[ystride],2)\n\t"
+ "shr $16,%[d]\n\t"
+ "movw %w[d],1(%[pix],%[s])\n\t"
+ :[s]"=&r"(s),[d]"=&r"(d),
[pix]"+r"(_pix),[ystride]"+r"(_ystride),[ll]"+r"(_ll)
:
:"memory"
More information about the commits mailing list