[xiph-commits] r16221 - in branches/theora-thusnelda/lib/enc: x86 x86_vc

Tue Jul 7 19:17:57 PDT 2009

Author: tterribe
Date: 2009-07-07 19:17:57 -0700 (Tue, 07 Jul 2009)
New Revision: 16221

Modified:
   branches/theora-thusnelda/lib/enc/x86/mmxfdct.c
   branches/theora-thusnelda/lib/enc/x86/sse2fdct.c
   branches/theora-thusnelda/lib/enc/x86_vc/mmxfdct.c
Log:
Better fix for r16212 (now also correctly handles overflow in the other
 direction, which was much more unlikely, but still possible).


Modified: branches/theora-thusnelda/lib/enc/x86/mmxfdct.c
===================================================================

--- branches/theora-thusnelda/lib/enc/x86/mmxfdct.c	2009-07-08 02:06:38 UTC (rev 16220)
+++ branches/theora-thusnelda/lib/enc/x86/mmxfdct.c	2009-07-08 02:17:57 UTC (rev 16221)
@@ -178,13 +178,16 @@
  "movq "_r5"(%[y]),%%mm1\n\t" \
  "paddw %%mm2,%%mm4\n\t" \
  /*mm2=t6'', mm0=_y[0]=u=r+s>>1 \
-   The naive implementation could cause overflow, so we use u=s+(r-s>>1).*/ \
- "mov $0x7FFF54DC,%[a]\n\t" \
- "psubw %%mm4,%%mm0\n\t" \
+   The naive implementation could cause overflow, so we use \
+    u=(r&s)+((r^s)>>1).*/ \
  "movq "_r3"(%[y]),%%mm2\n\t" \
+ "movq %%mm0,%%mm7\n\t" \
+ "pxor %%mm4,%%mm0\n\t" \
+ "pand %%mm4,%%mm7\n\t" \
  "psraw $1,%%mm0\n\t" \
+ "mov $0x7FFF54DC,%[a]\n\t" \
+ "paddw %%mm7,%%mm0\n\t" \
  "movd %[a],%%mm7\n\t" \
- "paddw %%mm4,%%mm0\n\t" \
  /*mm7={54491-0x7FFF,0x7FFF}x2 \
    mm4=_y[4]=v=r-u*/ \
  "psubw %%mm0,%%mm4\n\t" \

Modified: branches/theora-thusnelda/lib/enc/x86/sse2fdct.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/sse2fdct.c	2009-07-08 02:06:38 UTC (rev 16220)
+++ branches/theora-thusnelda/lib/enc/x86/sse2fdct.c	2009-07-08 02:17:57 UTC (rev 16221)
@@ -153,11 +153,14 @@
  "mov $0x7FFF6C84,%[a]\n\t" \
  "paddw %%xmm1,%%xmm4\n\t" \
  /*xmm0=_y[0]=u=r+s>>1 \
-   The naive implementation could cause overflow, so we use u=s+(r-s>>1).*/ \
- "psubw %%xmm4,%%xmm0\n\t" \
+   The naive implementation could cause overflow, so we use \
+    u=(r&s)+((r^s)>>1).*/ \
+ "movdqa %%xmm0,%%xmm6\n\t" \
+ "pxor %%xmm4,%%xmm0\n\t" \
+ "pand %%xmm4,%%xmm6\n\t" \
+ "psraw $1,%%xmm0\n\t" \
  "movd %[a],%%xmm13\n\t" \
- "psraw $1,%%xmm0\n\t" \
- "paddw %%xmm4,%%xmm0\n\t" \
+ "paddw %%xmm6,%%xmm0\n\t" \
  /*xmm4=_y[4]=v=r-u*/ \
  "pshufd $00,%%xmm13,%%xmm13\n\t" \
  "psubw %%xmm0,%%xmm4\n\t" \

Modified: branches/theora-thusnelda/lib/enc/x86_vc/mmxfdct.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86_vc/mmxfdct.c	2009-07-08 02:06:38 UTC (rev 16220)
+++ branches/theora-thusnelda/lib/enc/x86_vc/mmxfdct.c	2009-07-08 02:17:57 UTC (rev 16221)
@@ -177,13 +177,16 @@
   __asm  movq mm1,[Y+_r5] \
   __asm  paddw mm4,mm2 \
   /*mm2=t6'', mm0=_y[0]=u=r+s>>1 \
-    The naive implementation could cause overflow, so we use u=s+(r-s>>1).*/ \
-  __asm  mov A,0x7FFF54DC \
-  __asm  psubw mm0,mm4 \
+    The naive implementation could cause overflow, so we use \
+     u=(r&s)+((r^s)>>1).*/ \
   __asm  movq mm2,[Y+_r3] \
+  __asm  movq mm7,mm0 \
+  __asm  pxor mm0,mm4 \
+  __asm  pand mm7,mm4 \
   __asm  psraw mm0,1 \
+  __asm  mov A,0x7FFF54DC \
+  __asm  paddw mm0,mm7 \
   __asm  movd mm7,A \
-  __asm  paddw mm0,mm4 \
   /*mm7={54491-0x7FFF,0x7FFF}x2 \
     mm4=_y[4]=v=r-u*/ \
   __asm  psubw mm4,mm0 \