[xiph-commits] r16889 - in experimental/derf/theora-ptalarbvorm/lib: . x86 x86_vc
tterribe at svn.xiph.org
tterribe at svn.xiph.org
Wed Feb 10 08:48:00 PST 2010
Author: tterribe
Date: 2010-02-10 08:48:00 -0800 (Wed, 10 Feb 2010)
New Revision: 16889
Added:
experimental/derf/theora-ptalarbvorm/lib/x86/sse2encfrag.c
experimental/derf/theora-ptalarbvorm/lib/x86/sse2trans.h
Modified:
experimental/derf/theora-ptalarbvorm/lib/Makefile.am
experimental/derf/theora-ptalarbvorm/lib/x86/mmxencfrag.c
experimental/derf/theora-ptalarbvorm/lib/x86/sse2fdct.c
experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c
experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.h
experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxencfrag.c
Log:
Add an SSE2 implementation of SATD.
Currently this is only used on x86-64, where it gives about a 5% speed-up.
It is disabled on x86-32, because it is slower than the MMX version.
Modified: experimental/derf/theora-ptalarbvorm/lib/Makefile.am
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/Makefile.am 2010-02-10 16:00:47 UTC (rev 16888)
+++ experimental/derf/theora-ptalarbvorm/lib/Makefile.am 2010-02-10 16:48:00 UTC (rev 16889)
@@ -6,6 +6,7 @@
encoder_disabled.c \
x86/mmxencfrag.c \
x86/mmxfdct.c \
+ x86/sse2encfrag.c \
x86/sse2fdct.c \
x86/x86enc.c \
x86/x86enc.h \
@@ -30,7 +31,8 @@
encoder_uniq_x86_sources = \
x86/mmxencfrag.c \
x86/mmxfdct.c \
- x86/x86enc.c
+ x86/x86enc.c \
+ x86/sse2encfrag.c
encoder_uniq_x86_64_sources = \
x86/sse2fdct.c
@@ -126,7 +128,6 @@
huffenc.h \
mathops.h \
modedec.h \
- x86/x86enc.h \
apiwrapper.h \
bitpack.h \
dct.h \
@@ -136,9 +137,11 @@
huffman.h \
ocintrin.h \
quant.h \
+ x86/x86enc.h \
x86/mmxfrag.h \
x86/mmxloop.h \
- x86/x86int.h
+ x86/x86int.h \
+ x86/sse2trans.h
libtheoradec_la_SOURCES = \
$(decoder_sources) \
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/mmxencfrag.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/mmxencfrag.c 2010-02-10 16:00:47 UTC (rev 16888)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/mmxencfrag.c 2010-02-10 16:48:00 UTC (rev 16889)
@@ -87,7 +87,9 @@
The latter is exactly 1 too large when the low bit of two corresponding \
bytes is only set in one of them. \
Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
- correct the output of pavgb.*/ \
+ correct the output of pavgb. \
+ TODO: This should be rewritten to compute ~pavgb(~a,~b) instead, which \
+ schedules better; currently, however, this function is unused.*/ \
"movq %%mm0,%%mm6\n\t" \
"lea (%[ref1],%[ystride],2),%[ref1]\n\t" \
"pxor %%mm1,%%mm0\n\t" \
@@ -248,7 +250,7 @@
/*Performs the first two stages of an 8-point 1-D Hadamard transform.
The transform is performed in place, except that outputs 0-3 are swapped with
outputs 4-7.
- Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
+ Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
perform this stage in place with no temporary registers).*/
#define OC_HADAMARD_AB_8x4 \
"#OC_HADAMARD_AB_8x4\n\t" \
@@ -281,7 +283,7 @@
"psubw %%mm5,%%mm7\n\t" \
/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
- Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
+ Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
place with no temporary registers).*/
#define OC_HADAMARD_C_8x4 \
"#OC_HADAMARD_C_8x4\n\t" \
@@ -544,7 +546,7 @@
/*Our internal implementation of frag_copy2 takes an extra stride parameter so
we can share code with oc_enc_frag_satd2_mmxext().*/
-static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
+void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
__asm__ __volatile__(
/*Load the first 3 rows.*/
Added: experimental/derf/theora-ptalarbvorm/lib/x86/sse2encfrag.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/sse2encfrag.c (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/sse2encfrag.c 2010-02-10 16:48:00 UTC (rev 16889)
@@ -0,0 +1,356 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
+
+ ********************************************************************/
+#include <stddef.h>
+#include "x86enc.h"
+#include "sse2trans.h"
+
+#if defined(OC_X86_ASM)
+
+/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their
+ 16-bit difference in %%xmm0...%%xmm7.*/
+#define OC_LOAD_SUB_8x8 \
+ "#OC_LOAD_SUB_8x8\n\t" \
+ "movq (%[src]),%%xmm0\n\t" \
+ "movq (%[ref]),%%xmm4\n\t" \
+ "movq (%[src],%[src_ystride]),%%xmm1\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "movq (%[src]),%%xmm2\n\t" \
+ "movq (%[ref]),%%xmm7\n\t" \
+ "movq (%[src],%[src_ystride]),%%xmm3\n\t" \
+ "movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \
+ "punpcklbw %%xmm4,%%xmm0\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "punpcklbw %%xmm4,%%xmm4\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "psubw %%xmm4,%%xmm0\n\t" \
+ "movq (%[src]),%%xmm4\n\t" \
+ "movdqa %%xmm0,(%[buf])\n\t" \
+ "movq (%[ref]),%%xmm0\n\t" \
+ "punpcklbw %%xmm5,%%xmm1\n\t" \
+ "punpcklbw %%xmm5,%%xmm5\n\t" \
+ "psubw %%xmm5,%%xmm1\n\t" \
+ "movq (%[src],%[src_ystride]),%%xmm5\n\t" \
+ "punpcklbw %%xmm7,%%xmm2\n\t" \
+ "punpcklbw %%xmm7,%%xmm7\n\t" \
+ "psubw %%xmm7,%%xmm2\n\t" \
+ "movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \
+ "punpcklbw %%xmm6,%%xmm3\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "punpcklbw %%xmm6,%%xmm6\n\t" \
+ "psubw %%xmm6,%%xmm3\n\t" \
+ "movq (%[src]),%%xmm6\n\t" \
+ "punpcklbw %%xmm0,%%xmm4\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "punpcklbw %%xmm0,%%xmm0\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "psubw %%xmm0,%%xmm4\n\t" \
+ "movq (%[ref]),%%xmm0\n\t" \
+ "punpcklbw %%xmm7,%%xmm5\n\t" \
+ "neg %[src_ystride]\n\t" \
+ "punpcklbw %%xmm7,%%xmm7\n\t" \
+ "psubw %%xmm7,%%xmm5\n\t" \
+ "movq (%[src],%[src_ystride]),%%xmm7\n\t" \
+ "punpcklbw %%xmm0,%%xmm6\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "punpcklbw %%xmm0,%%xmm0\n\t" \
+ "neg %[ref_ystride]\n\t" \
+ "psubw %%xmm0,%%xmm6\n\t" \
+ "movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \
+ "lea (%[src],%[src_ystride],8),%[src]\n\t" \
+ "punpcklbw %%xmm0,%%xmm7\n\t" \
+ "neg %[src_ystride]\n\t" \
+ "punpcklbw %%xmm0,%%xmm0\n\t" \
+ "lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \
+ "psubw %%xmm0,%%xmm7\n\t" \
+ "neg %[ref_ystride]\n\t" \
+ "movdqa (%[buf]),%%xmm0\n\t" \
+
+/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7.*/
+#define OC_LOAD_8x8 \
+ "#OC_LOAD_8x8\n\t" \
+ "movq (%[src]),%%xmm0\n\t" \
+ "movq (%[src],%[ystride]),%%xmm1\n\t" \
+ "movq (%[src],%[ystride],2),%%xmm2\n\t" \
+ "pxor %%xmm7,%%xmm7\n\t" \
+ "movq (%[src],%[ystride3]),%%xmm3\n\t" \
+ "punpcklbw %%xmm7,%%xmm0\n\t" \
+ "movq (%[src4]),%%xmm4\n\t" \
+ "punpcklbw %%xmm7,%%xmm1\n\t" \
+ "movq (%[src4],%[ystride]),%%xmm5\n\t" \
+ "punpcklbw %%xmm7,%%xmm2\n\t" \
+ "movq (%[src4],%[ystride],2),%%xmm6\n\t" \
+ "punpcklbw %%xmm7,%%xmm3\n\t" \
+ "movq (%[src4],%[ystride3]),%%xmm7\n\t" \
+ "punpcklbw %%xmm4,%%xmm4\n\t" \
+ "punpcklbw %%xmm5,%%xmm5\n\t" \
+ "psrlw $8,%%xmm4\n\t" \
+ "psrlw $8,%%xmm5\n\t" \
+ "punpcklbw %%xmm6,%%xmm6\n\t" \
+ "punpcklbw %%xmm7,%%xmm7\n\t" \
+ "psrlw $8,%%xmm6\n\t" \
+ "psrlw $8,%%xmm7\n\t" \
+
+/*Performs the first two stages of an 8-point 1-D Hadamard transform in place.
+ Outputs 2, 3, 4, and 5 from the second stage are negated (which allows us to
+ perform this stage in place with no temporary registers).*/
+#define OC_HADAMARD_AB_8x8 \
+ "#OC_HADAMARD_AB_8x8\n\t" \
+ /*Stage A:*/ \
+ "paddw %%xmm5,%%xmm1\n\t" \
+ "paddw %%xmm6,%%xmm2\n\t" \
+ "paddw %%xmm5,%%xmm5\n\t" \
+ "paddw %%xmm6,%%xmm6\n\t" \
+ "psubw %%xmm1,%%xmm5\n\t" \
+ "psubw %%xmm2,%%xmm6\n\t" \
+ "paddw %%xmm7,%%xmm3\n\t" \
+ "paddw %%xmm4,%%xmm0\n\t" \
+ "paddw %%xmm7,%%xmm7\n\t" \
+ "paddw %%xmm4,%%xmm4\n\t" \
+ "psubw %%xmm3,%%xmm7\n\t" \
+ "psubw %%xmm0,%%xmm4\n\t" \
+ /*Stage B:*/ \
+ "paddw %%xmm2,%%xmm0\n\t" \
+ "paddw %%xmm3,%%xmm1\n\t" \
+ "paddw %%xmm6,%%xmm4\n\t" \
+ "paddw %%xmm7,%%xmm5\n\t" \
+ "paddw %%xmm2,%%xmm2\n\t" \
+ "paddw %%xmm3,%%xmm3\n\t" \
+ "paddw %%xmm6,%%xmm6\n\t" \
+ "paddw %%xmm7,%%xmm7\n\t" \
+ "psubw %%xmm0,%%xmm2\n\t" \
+ "psubw %%xmm1,%%xmm3\n\t" \
+ "psubw %%xmm4,%%xmm6\n\t" \
+ "psubw %%xmm5,%%xmm7\n\t" \
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
+ Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
+ place with no temporary registers).*/
+#define OC_HADAMARD_C_8x8 \
+ "#OC_HADAMARD_C_8x8\n\t" \
+ /*Stage C:*/ \
+ "paddw %%xmm1,%%xmm0\n\t" \
+ "paddw %%xmm3,%%xmm2\n\t" \
+ "paddw %%xmm5,%%xmm4\n\t" \
+ "paddw %%xmm7,%%xmm6\n\t" \
+ "paddw %%xmm1,%%xmm1\n\t" \
+ "paddw %%xmm3,%%xmm3\n\t" \
+ "paddw %%xmm5,%%xmm5\n\t" \
+ "paddw %%xmm7,%%xmm7\n\t" \
+ "psubw %%xmm0,%%xmm1\n\t" \
+ "psubw %%xmm2,%%xmm3\n\t" \
+ "psubw %%xmm4,%%xmm5\n\t" \
+ "psubw %%xmm6,%%xmm7\n\t" \
+
+/*Performs an 8-point 1-D Hadamard transform in place.
+ Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform
+ in place with no temporary registers).*/
+#define OC_HADAMARD_8x8 \
+ OC_HADAMARD_AB_8x8 \
+ OC_HADAMARD_C_8x8 \
+
+/*Performs the first part of the final stage of the Hadamard transform and
+ summing of absolute values.
+ At the end of this part, %%xmm1 will contain the DC coefficient of the
+ transform.*/
+#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
+ /*We use the fact that \
+ (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
+ to merge the final butterfly with the abs and the first stage of \
+ accumulation. \
+ Thus we can avoid using pabsw, which is not available until SSSE3. \
+ Emulating pabsw takes 3 instructions, so the straightforward SSE2 \
+ implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
+ registers). \
+ Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
+ This implementation is only 26 (+4 for spilling registers).*/ \
+ "#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \
+ "movdqa %%xmm7,0x10(%[buf])\n\t" \
+ "movdqa %%xmm6,(%[buf])\n\t" \
+ /*xmm7={0x7FFF}x8 \
+ xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \
+ "pcmpeqb %%xmm7,%%xmm7\n\t" \
+ "movdqa %%xmm4,%%xmm6\n\t" \
+ "psrlw $1,%%xmm7\n\t" \
+ "paddw %%xmm5,%%xmm6\n\t" \
+ "pmaxsw %%xmm5,%%xmm4\n\t" \
+ "paddsw %%xmm7,%%xmm6\n\t" \
+ "psubw %%xmm6,%%xmm4\n\t" \
+ /*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \
+ xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF*/ \
+ "movdqa %%xmm2,%%xmm6\n\t" \
+ "movdqa %%xmm0,%%xmm5\n\t" \
+ "pmaxsw %%xmm3,%%xmm2\n\t" \
+ "pmaxsw %%xmm1,%%xmm0\n\t" \
+ "paddw %%xmm3,%%xmm6\n\t" \
+ "paddw %%xmm5,%%xmm1\n\t" \
+ "movdqa 0x10(%[buf]),%%xmm3\n\t" \
+
+/*Performs the second part of the final stage of the Hadamard transform and
+ summing of absolute values.*/
+#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
+ "#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \
+ "paddsw %%xmm7,%%xmm6\n\t" \
+ "movdqa (%[buf]),%%xmm5\n\t" \
+ "paddsw %%xmm7,%%xmm1\n\t" \
+ "psubw %%xmm6,%%xmm2\n\t" \
+ "psubw %%xmm1,%%xmm0\n\t" \
+ /*xmm7={1}x8 (needed for the horizontal add that follows) \
+ xmm0+=xmm2+xmm4+max(abs(xmm1),abs(xmm3))-0x7FFF*/ \
+ "movdqa %%xmm3,%%xmm6\n\t" \
+ "pmaxsw %%xmm5,%%xmm3\n\t" \
+ "paddw %%xmm2,%%xmm0\n\t" \
+ "paddw %%xmm5,%%xmm6\n\t" \
+ "paddw %%xmm4,%%xmm0\n\t" \
+ "paddsw %%xmm7,%%xmm6\n\t" \
+ "paddw %%xmm3,%%xmm0\n\t" \
+ "psrlw $14,%%xmm7\n\t" \
+ "psubw %%xmm6,%%xmm0\n\t" \
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
+ absolute value of each component, and accumulates everything into xmm0.*/
+#define OC_HADAMARD_C_ABS_ACCUM_8x8 \
+ OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
+ OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
+
+/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
+ component, and accumulates everything into xmm0.
+ Note that xmm0 will have an extra 4 added to each column, and that after
+ removing this value, the remainder will be half the conventional value.*/
+#define OC_HADAMARD_ABS_ACCUM_8x8 \
+ OC_HADAMARD_AB_8x8 \
+ OC_HADAMARD_C_ABS_ACCUM_8x8
+
+static unsigned oc_int_frag_satd_sse2(unsigned *_dc,
+ const unsigned char *_src,int _src_ystride,
+ const unsigned char *_ref,int _ref_ystride){
+ OC_ALIGN16(ogg_int16_t buf[16]);
+ ogg_int16_t *bufp;
+ unsigned ret;
+ unsigned dc;
+ bufp=buf;
+ __asm__ __volatile__(
+ OC_LOAD_SUB_8x8
+ OC_HADAMARD_8x8
+ OC_TRANSPOSE_8x8
+ /*We split out the stages here so we can save the DC coefficient in the
+ middle.*/
+ OC_HADAMARD_AB_8x8
+ OC_HADAMARD_C_ABS_ACCUM_A_8x8
+ "movd %%xmm1,%[dc]\n\t"
+ OC_HADAMARD_C_ABS_ACCUM_B_8x8
+ /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+ difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+ for the factor of two we dropped + 3 for the vertical accumulation).
+ Now we finally have to promote things to dwords.
+ We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
+ latency of pmaddwd by computing abs(dc) here (cdq forces the eax/edx pair).*/
+ "pmaddwd %%xmm7,%%xmm0\n\t"
+ "movsx %w[dc],%[ret]\n\t"
+ "cdq\n\t"
+ "movdqa %%xmm0,%%xmm1\n\t"
+ "punpckhqdq %%xmm0,%%xmm0\n\t"
+ "add %[dc],%[ret]\n\t"
+ "paddd %%xmm1,%%xmm0\n\t"
+ "pshufd $1,%%xmm0,%%xmm1\n\t"
+ "xor %[ret],%[dc]\n\t"
+ "paddd %%xmm1,%%xmm0\n\t"
+ "movd %%xmm0,%[ret]\n\t"
+ /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
+ added to them, and a factor of two removed; correct the final sum here.*/
+ "lea -64(%[ret],%[ret]),%[ret]\n\t"
+ "sub %[dc],%[ret]\n\t"
+ /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
+ and %[dc] with some of the inputs, since for once we don't write to
+ them until after we're done using everything but %[buf] (which is also
+ listed as an output to ensure gcc _doesn't_ alias them against it).*/
+ /*Note that _src_ystride and _ref_ystride must be given non-overlapping
+ constraints, otherwise if gcc can prove they're equal it will allocate
+ them to the same register (which is bad); _src and _ref face a similar
+ problem, though those are never actually the same.*/
+ :[ret]"=a"(ret),[dc]"=d"(dc),[buf]"+r"(bufp)
+ :[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
+ [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
+ /*We have to use neg, so we actually clobber the condition codes for once
+ (not to mention sub, and add).*/
+ :"cc"
+ );
+ *_dc=dc;
+ return ret;
+}
+
+unsigned oc_enc_frag_satd_sse2(unsigned *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+ return oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
+}
+
+unsigned oc_enc_frag_satd2_sse2(unsigned *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
+ OC_ALIGN8(unsigned char ref[64]);
+ oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
+ return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
+}
+
+unsigned oc_enc_frag_intra_satd_sse2(unsigned *_dc,
+ const unsigned char *_src,int _ystride){
+ OC_ALIGN16(ogg_int16_t buf[16]);
+ ogg_int16_t *bufp;
+ unsigned ret;
+ unsigned dc;
+ bufp=buf;
+ __asm__ __volatile__(
+ OC_LOAD_8x8
+ OC_HADAMARD_8x8
+ OC_TRANSPOSE_8x8
+ /*We split out the stages here so we can save the DC coefficient in the
+ middle.*/
+ OC_HADAMARD_AB_8x8
+ OC_HADAMARD_C_ABS_ACCUM_A_8x8
+ "movd %%xmm1,%[dc]\n\t"
+ OC_HADAMARD_C_ABS_ACCUM_B_8x8
+ /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+ difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+ for the factor of two we dropped + 3 for the vertical accumulation).
+ Now we finally have to promote things to dwords.*/
+ "pmaddwd %%xmm7,%%xmm0\n\t"
+ /*The DC coefficient is never negative (the INTRA transform input was not a
+ difference), so zero-extending it is enough; no abs() is needed.*/
+ "movzx %w[dc],%[dc]\n\t"
+ "movdqa %%xmm0,%%xmm1\n\t"
+ "punpckhqdq %%xmm0,%%xmm0\n\t"
+ "paddd %%xmm1,%%xmm0\n\t"
+ "pshufd $1,%%xmm0,%%xmm1\n\t"
+ "paddd %%xmm1,%%xmm0\n\t"
+ "movd %%xmm0,%[ret]\n\t"
+ "lea -64(%[ret],%[ret]),%[ret]\n\t"
+ "sub %[dc],%[ret]\n\t"
+ /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
+ and %[dc] with some of the inputs, since for once we don't write to
+ them until after we're done using everything but %[buf] (which is also
+ listed as an output to ensure gcc _doesn't_ alias them against it).*/
+ :[ret]"=a"(ret),[dc]"=r"(dc),[buf]"+r"(bufp)
+ :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
+ [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
+ /*We have to use sub, so we actually clobber the condition codes for once.*/
+ :"cc"
+ );
+ *_dc=dc;
+ return ret;
+}
+
+#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/sse2fdct.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/sse2fdct.c 2010-02-10 16:00:47 UTC (rev 16888)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/sse2fdct.c 2010-02-10 16:48:00 UTC (rev 16889)
@@ -13,12 +13,13 @@
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
#include <stddef.h>
#include "x86enc.h"
+#include "sse2trans.h"
#if defined(OC_X86_64_ASM)
-# define OC_FDCT8x8 \
+# define OC_FDCT_8x8 \
/*Note: xmm15={0}x8 and xmm14={-1}x8.*/ \
- "#OC_FDCT8x8\n\t" \
+ "#OC_FDCT_8x8\n\t" \
/*Stage 1:*/ \
"movdqa %%xmm0,%%xmm11\n\t" \
"movdqa %%xmm1,%%xmm10\n\t" \
@@ -349,81 +350,6 @@
"psubw %%xmm14,%%xmm10\n\t" \
"paddw %%xmm10,%%xmm7\n\t " \
-# define OC_TRANSPOSE8x8 \
- "#OC_TRANSPOSE8x8\n\t" \
- "movdqa %%xmm4,%%xmm8\n\t" \
- /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
- "punpcklwd %%xmm5,%%xmm4\n\t" \
- /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
- "punpckhwd %%xmm5,%%xmm8\n\t" \
- /*xmm5 is free.*/ \
- "movdqa %%xmm0,%%xmm5\n\t" \
- /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
- "punpcklwd %%xmm1,%%xmm0\n\t" \
- /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
- "punpckhwd %%xmm1,%%xmm5\n\t" \
- /*xmm1 is free.*/ \
- "movdqa %%xmm6,%%xmm1\n\t" \
- /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
- "punpcklwd %%xmm7,%%xmm6\n\t" \
- /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
- "punpckhwd %%xmm7,%%xmm1\n\t" \
- /*xmm7 is free.*/ \
- "movdqa %%xmm2,%%xmm7\n\t" \
- /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
- "punpcklwd %%xmm3,%%xmm7\n\t" \
- /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
- "punpckhwd %%xmm3,%%xmm2\n\t" \
- /*xmm3 is free.*/ \
- "movdqa %%xmm0,%%xmm3\n\t" \
- /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
- "punpckldq %%xmm7,%%xmm0\n\t" \
- /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
- "punpckhdq %%xmm7,%%xmm3\n\t" \
- /*xmm7 is free.*/ \
- "movdqa %%xmm5,%%xmm7\n\t" \
- /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
- "punpckldq %%xmm2,%%xmm5\n\t" \
- /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
- "punpckhdq %%xmm2,%%xmm7\n\t" \
- /*xmm2 is free.*/ \
- "movdqa %%xmm4,%%xmm2\n\t" \
- /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
- "punpckldq %%xmm6,%%xmm2\n\t" \
- /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
- "punpckhdq %%xmm6,%%xmm4\n\t" \
- /*xmm6 is free.*/ \
- "movdqa %%xmm8,%%xmm6\n\t" \
- /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
- "punpckldq %%xmm1,%%xmm6\n\t" \
- /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
- "punpckhdq %%xmm1,%%xmm8\n\t" \
- /*xmm1 is free.*/ \
- "movdqa %%xmm0,%%xmm1\n\t" \
- /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
- "punpcklqdq %%xmm2,%%xmm0\n\t" \
- /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
- "punpckhqdq %%xmm2,%%xmm1\n\t" \
- /*xmm2 is free.*/ \
- "movdqa %%xmm3,%%xmm2\n\t" \
- /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
- "punpcklqdq %%xmm4,%%xmm2\n\t" \
- /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
- "punpckhqdq %%xmm4,%%xmm3\n\t" \
- /*xmm4 is free.*/ \
- "movdqa %%xmm5,%%xmm4\n\t" \
- /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
- "punpcklqdq %%xmm6,%%xmm4\n\t" \
- /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
- "punpckhqdq %%xmm6,%%xmm5\n\t" \
- /*xmm6 is free.*/ \
- "movdqa %%xmm7,%%xmm6\n\t" \
- /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
- "punpcklqdq %%xmm8,%%xmm6\n\t" \
- /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
- "punpckhqdq %%xmm8,%%xmm7\n\t" \
- /*xmm8 is free.*/ \
-
/*SSE2 implementation of the fDCT for x86-64 only.
Because of the 8 extra XMM registers on x86-64, this version can operate
without any temporary stack access at all.*/
@@ -482,12 +408,12 @@
/*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/
"psubw %%xmm9,%%xmm1\n\t"
/*Transform columns.*/
- OC_FDCT8x8
+ OC_FDCT_8x8
/*Transform rows.*/
- OC_TRANSPOSE8x8
- OC_FDCT8x8
+ OC_TRANSPOSE_8x8
+ OC_FDCT_8x8
/*TODO: zig-zag ordering?*/
- OC_TRANSPOSE8x8
+ OC_TRANSPOSE_8x8
/*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/
"paddw %%xmm14,%%xmm14\n\t"
"psubw %%xmm14,%%xmm0\n\t"
Added: experimental/derf/theora-ptalarbvorm/lib/x86/sse2trans.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/sse2trans.h (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/sse2trans.h 2010-02-10 16:48:00 UTC (rev 16889)
@@ -0,0 +1,201 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_sse2trans_H)
+# define _x86_sse2trans_H (1)
+# include "../encint.h"
+# include "x86enc.h"
+
+# if defined(OC_X86_64_ASM)
+/*On x86-64 we can transpose in-place without spilling registers.
+ By clever choices of the order to apply the butterflies and the order of
+ their outputs, we can take the rows in order and output the columns in order
+ without any extra operations and using just one temporary register.*/
+# define OC_TRANSPOSE_8x8 \
+ "#OC_TRANSPOSE_8x8\n\t" \
+ "movdqa %%xmm4,%%xmm8\n\t" \
+ /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
+ "punpckhwd %%xmm5,%%xmm8\n\t" \
+ /*xmm5 is free.*/ \
+ "movdqa %%xmm0,%%xmm5\n\t" \
+ /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm0\n\t" \
+ /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
+ "punpckhwd %%xmm1,%%xmm5\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm6,%%xmm1\n\t" \
+ /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm7,%%xmm6\n\t" \
+ /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
+ "punpckhwd %%xmm7,%%xmm1\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm2,%%xmm7\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "punpckhwd %%xmm3,%%xmm2\n\t" \
+ /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm3,%%xmm7\n\t" \
+ /*xmm3 is free.*/ \
+ "movdqa %%xmm0,%%xmm3\n\t" \
+ /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm7,%%xmm0\n\t" \
+ /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm7,%%xmm3\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm5,%%xmm7\n\t" \
+ /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
+ "punpckldq %%xmm2,%%xmm5\n\t" \
+ /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
+ "punpckhdq %%xmm2,%%xmm7\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm4,%%xmm2\n\t" \
+ /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm6,%%xmm4\n\t" \
+ /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm6,%%xmm2\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm8,%%xmm6\n\t" \
+ /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
+ "punpckldq %%xmm1,%%xmm6\n\t" \
+ /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "punpckhdq %%xmm1,%%xmm8\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm0,%%xmm1\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm2,%%xmm0\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm2,%%xmm1\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm3,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm4,%%xmm3\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm4,%%xmm2\n\t" \
+ /*xmm4 is free.*/ \
+ "movdqa %%xmm5,%%xmm4\n\t" \
+ /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
+ "punpckhqdq %%xmm6,%%xmm5\n\t" \
+ /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
+ "punpcklqdq %%xmm6,%%xmm4\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm7,%%xmm6\n\t" \
+ /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
+ "punpckhqdq %%xmm8,%%xmm7\n\t" \
+ /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
+ "punpcklqdq %%xmm8,%%xmm6\n\t" \
+ /*xmm8 is free.*/ \
+
+# else
+/*Otherwise, we need to spill some values to %[buf] temporarily.
+ Again, the butterflies are carefully arranged to get the columns to come out
+ in order, minimizing register spills and maximizing the delay between a load
+ and when the value loaded is actually used.*/
+# define OC_TRANSPOSE_8x8 \
+ "#OC_TRANSPOSE_8x8\n\t" \
+ /*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \
+ "movdqa %%xmm0,(%[buf])\n\t" \
+ /*xmm0 is free.*/ \
+ "movdqa %%xmm2,%%xmm0\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "punpckhwd %%xmm3,%%xmm2\n\t" \
+ /*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm3,%%xmm0\n\t" \
+ /*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \
+ "movdqa (%[buf]),%%xmm3\n\t" \
+ /*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "movdqa %%xmm2,0x10(%[buf])\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm6,%%xmm2\n\t" \
+ /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm7,%%xmm6\n\t" \
+ /*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
+ "punpckhwd %%xmm7,%%xmm2\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm4,%%xmm7\n\t" \
+ /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ /*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
+ "punpckhwd %%xmm5,%%xmm7\n\t" \
+ /*xmm5 is free.*/ \
+ "movdqa %%xmm3,%%xmm5\n\t" \
+ /*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm3\n\t" \
+ /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
+ "punpckhwd %%xmm1,%%xmm5\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm7,%%xmm1\n\t" \
+ /*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
+ "punpckldq %%xmm2,%%xmm7\n\t" \
+ /*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "punpckhdq %%xmm2,%%xmm1\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "movdqa 0x10(%[buf]),%%xmm2\n\t" \
+ /*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "movdqa %%xmm1,(%[buf])\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm3,%%xmm1\n\t" \
+ /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm0,%%xmm3\n\t" \
+ /*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm0,%%xmm1\n\t" \
+ /*xmm0 is free.*/ \
+ "movdqa %%xmm4,%%xmm0\n\t" \
+ /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm6,%%xmm4\n\t" \
+ /*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm6,%%xmm0\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm5,%%xmm6\n\t" \
+ /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
+ "punpckldq %%xmm2,%%xmm5\n\t" \
+ /*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
+ "punpckhdq %%xmm2,%%xmm6\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm1,%%xmm2\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm0,%%xmm1\n\t" \
+ /*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm0,%%xmm2\n\t" \
+ /*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "movdqa (%[buf]),%%xmm0\n\t" \
+ /*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "movdqa %%xmm2,0x10(%[buf])\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm3,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm4,%%xmm3\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm4,%%xmm2\n\t" \
+ /*xmm4 is free.*/ \
+ "movdqa %%xmm5,%%xmm4\n\t" \
+ /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
+ "punpckhqdq %%xmm7,%%xmm5\n\t" \
+ /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
+ "punpcklqdq %%xmm7,%%xmm4\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm6,%%xmm7\n\t" \
+ /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
+ "punpcklqdq %%xmm0,%%xmm6\n\t" \
+ /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
+ "punpckhqdq %%xmm0,%%xmm7\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "movdqa 0x10(%[buf]),%%xmm0\n\t" \
+
+# endif
+
+#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c 2010-02-10 16:00:47 UTC (rev 16888)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.c 2010-02-10 16:48:00 UTC (rev 16889)
@@ -42,7 +42,13 @@
}
if(cpu_flags&OC_CPU_X86_SSE2){
# if defined(OC_X86_64_ASM)
- /*_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;*/
+ _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;
+ /*These routines work on x86-32, but are actually slower than the MMX ones
+ on my Core Duo, which is probably the most advanced SSE2 engine any
+ 32-bit Intel chip had.*/
+ _enc->opt_vtable.frag_satd=oc_enc_frag_satd_sse2;
+ _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_sse2;
+ _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_sse2;
# endif
}
}
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.h 2010-02-10 16:00:47 UTC (rev 16888)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86enc.h 2010-02-10 16:48:00 UTC (rev 16889)
@@ -39,9 +39,19 @@
const unsigned char *_x,const unsigned char *_y,int _stride);
void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
const unsigned char *_x,int _stride);
+void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,const unsigned char *_src2,int _src_ystride);
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+unsigned oc_enc_frag_satd_sse2(unsigned *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_satd2_sse2(unsigned *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+unsigned oc_enc_frag_intra_satd_sse2(unsigned *_dc,
+ const unsigned char *_src,int _ystride);
+# if defined(OC_X86_64_ASM)
void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+# endif
#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxencfrag.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxencfrag.c 2010-02-10 16:00:47 UTC (rev 16888)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxencfrag.c 2010-02-10 16:48:00 UTC (rev 16889)
@@ -266,7 +266,7 @@
/*Performs the first two stages of an 8-point 1-D Hadamard transform.
The transform is performed in place, except that outputs 0-3 are swapped with
outputs 4-7.
- Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
+ Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
perform this stage in place with no temporary registers).*/
#define OC_HADAMARD_AB_8x4 __asm{ \
/*Stage A: \
@@ -299,7 +299,7 @@
}
/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
- Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
+ Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
place with no temporary registers).*/
#define OC_HADAMARD_C_8x4 __asm{ \
/*Stage C:*/ \
More information about the commits
mailing list