[xiph-commits] r14714 - in trunk/theora: lib lib/dec lib/dec/x86_vc lib/enc win32/VS2005/libtheora
giles at svn.xiph.org
giles at svn.xiph.org
Fri Apr 11 18:04:43 PDT 2008
Author: giles
Date: 2008-04-11 18:04:43 -0700 (Fri, 11 Apr 2008)
New Revision: 14714
Added:
trunk/theora/lib/dec/x86_vc/
trunk/theora/lib/dec/x86_vc/mmxfrag.c
trunk/theora/lib/dec/x86_vc/mmxidct.c
trunk/theora/lib/dec/x86_vc/mmxloopfilter.c
trunk/theora/lib/dec/x86_vc/mmxstate.c
trunk/theora/lib/dec/x86_vc/x86int.h
trunk/theora/lib/dec/x86_vc/x86state.c
Modified:
trunk/theora/lib/cpu.c
trunk/theora/lib/dec/state.c
trunk/theora/lib/enc/dct_decode.c
trunk/theora/lib/enc/encoder_idct.c
trunk/theora/lib/internal.h
trunk/theora/win32/VS2005/libtheora/libtheora.vcproj
Log:
Untested merge of Nils Pipenbrinck's translation of the inline assembly
to MSVC syntax.
Modified: trunk/theora/lib/cpu.c
===================================================================
--- trunk/theora/lib/cpu.c 2008-04-11 23:36:00 UTC (rev 14713)
+++ trunk/theora/lib/cpu.c 2008-04-12 01:04:43 UTC (rev 14714)
@@ -5,7 +5,7 @@
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008 *
* by the Xiph.Org Foundation http://www.xiph.org/ *
* *
********************************************************************
@@ -18,18 +18,42 @@
********************************************************************/
-#include "cpu.h"
+#if !defined(USE_ASM)
+ogg_uint32_t oc_cpu_flags_get(void){
+ return 0;
+}
-ogg_uint32_t oc_cpu_flags_get(void){
- ogg_uint32_t flags = 0;
-#if defined(USE_ASM)
- ogg_uint32_t eax;
- ogg_uint32_t ebx;
- ogg_uint32_t ecx;
- ogg_uint32_t edx;
-#if (defined(__amd64__) || defined(__x86_64__))
-# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+#else /* USE_ASM */
+
+# if defined(_MSC_VER)
+/* Visual C cpuid helper function. For VS2005 we could
+ as well use the __cpuid intrinsic, but that wouldn't work
+ for VS2003 users, so we do it in inline assembler */
+
+static void oc_cpuid_helper (ogg_uint32_t * CpuInfo, ogg_uint32_t op){
+ _asm {
+ mov eax, [op]
+ mov esi, CpuInfo
+ cpuid
+ mov [esi + 0], eax
+ mov [esi + 4], ebx
+ mov [esi + 8], ecx
+ mov [esi +12], edx
+ }
+}
+
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+ { \
+ ogg_uint32_t nfo[4]; \
+ oc_cpuid_helper (nfo, (_op)); \
+ (_eax) = nfo[0],(_ebx) = nfo[1]; \
+ (_ecx) = nfo[2],(_edx) = nfo[3]; \
+ }
+
+# elif (defined(__amd64__) || defined(__x86_64__))
+
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
__asm__ __volatile__( \
"push %%rbx\n\t" \
"cpuid\n\t" \
@@ -42,8 +66,9 @@
:"a" (_op) \
:"cc" \
)
-#else
-# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+# else /* x86_32, GCC */
+
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
__asm__ __volatile__( \
"pushl %%ebx\n\t" \
"cpuid\n\t" \
@@ -56,6 +81,18 @@
:"a" (_op) \
:"cc" \
)
+
+# endif /* arch switch */
+
+ogg_uint32_t oc_cpu_flags_get(void){
+ ogg_uint32_t flags = 0;
+ ogg_uint32_t eax;
+ ogg_uint32_t ebx;
+ ogg_uint32_t ecx;
+ ogg_uint32_t edx;
+
+# if !defined(_MSC_VER) && !defined(__amd64__) && !defined(__x86_64__)
+ /* check for cpuid */
__asm__ __volatile__(
"pushfl\n\t"
"pushfl\n\t"
@@ -74,7 +111,8 @@
);
/*No cpuid.*/
if(eax==ebx)return 0;
-#endif
+# endif /* GCC, x86_32 */
+
cpuid(0,eax,ebx,ecx,edx);
if(ebx==0x756e6547&&edx==0x49656e69&&ecx==0x6c65746e){
/*Intel:*/
@@ -102,8 +140,8 @@
/*Implement me.*/
flags=0;
}
-
-#ifdef DEBUG
+
+# ifdef DEBUG
if (flags) {
TH_DEBUG("vectorized instruction sets supported:");
if (flags & OC_CPU_X86_MMX) TH_DEBUG(" mmx");
@@ -114,9 +152,9 @@
if (flags & OC_CPU_X86_3DNOWEXT) TH_DEBUG(" 3dnowext");
TH_DEBUG("\n");
}
-#endif
-#endif
-
+# endif
+
return flags;
}
+#endif /* USE_ASM */
Modified: trunk/theora/lib/dec/state.c
===================================================================
--- trunk/theora/lib/dec/state.c 2008-04-11 23:36:00 UTC (rev 14713)
+++ trunk/theora/lib/dec/state.c 2008-04-12 01:04:43 UTC (rev 14714)
@@ -20,8 +20,12 @@
#include "../internal.h"
#include "idct.h"
#if defined(USE_ASM)
+#if defined(_MSC_VER)
+# include "x86_vc/x86int.h"
+#else
# include "x86/x86int.h"
#endif
+#endif
#if defined(OC_DUMP_IMAGES)
# include <stdio.h>
# include "png.h"
Added: trunk/theora/lib/dec/x86_vc/mmxfrag.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/mmxfrag.c (rev 0)
+++ trunk/theora/lib/dec/x86_vc/mmxfrag.c 2008-04-12 01:04:43 UTC (rev 14714)
@@ -0,0 +1,430 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id:
+
+ ********************************************************************/
+#include "../../internal.h"
+
+/* ------------------------------------------------------------------------
+ MMX reconstruction fragment routines for Visual Studio.
+ Tested with VS2005. Should compile for VS2003 and VC6 as well.
+
+ Initial implementation 2007 by Nils Pipenbrinck.
+ ---------------------------------------------------------------------*/
+
+#if defined(USE_ASM)
+
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t *_residue){
+ /* ---------------------------------------------------------------------
+ This function does the intra reconstruction step with 8 iterations
+ unrolled. The iteration for each instruction is noted by the #id in the
+ comments (in case you want to reconstruct it)
+ --------------------------------------------------------------------- */
+ _asm{
+ mov edi, [_residue] /* load residue ptr */
+ mov eax, 0x00800080 /* generate constant */
+ mov ebx, [_dst_ystride] /* load dst-stride */
+ mov edx, [_dst] /* load dest pointer */
+
+ /* unrolled loop begins here */
+
+ movd mm0, eax /* load constant */
+ movq mm1, [edi+ 8*0] /* #1 load low residue */
+ movq mm2, [edi+ 8*1] /* #1 load high residue */
+ punpckldq mm0, mm0 /* build constant */
+ movq mm3, [edi+ 8*2] /* #2 load low residue */
+ movq mm4, [edi+ 8*3] /* #2 load high residue */
+ movq mm5, [edi+ 8*4] /* #3 load low residue */
+ movq mm6, [edi+ 8*5] /* #3 load high residue */
+ paddsw mm1, mm0 /* #1 bias low residue */
+ paddsw mm2, mm0 /* #1 bias high residue */
+ packuswb mm1, mm2 /* #1 pack to byte */
+ paddsw mm3, mm0 /* #2 bias low residue */
+ paddsw mm4, mm0 /* #2 bias high residue */
+ packuswb mm3, mm4 /* #2 pack to byte */
+ paddsw mm5, mm0 /* #3 bias low residue */
+ paddsw mm6, mm0 /* #3 bias high residue */
+ packuswb mm5, mm6 /* #3 pack to byte */
+ movq [edx], mm1 /* #1 write row */
+ movq [edx + ebx], mm3 /* #2 write row */
+ movq [edx + ebx*2], mm5 /* #3 write row */
+ movq mm1, [edi+ 8*6] /* #4 load low residue */
+ lea ecx, [ebx + ebx*2] /* make dst_ystride * 3 */
+ movq mm2, [edi+ 8*7] /* #4 load high residue */
+ movq mm3, [edi+ 8*8] /* #5 load low residue */
+ lea esi, [ebx*4 + ebx] /* make dst_ystride * 5 */
+ movq mm4, [edi+ 8*9] /* #5 load high residue */
+ movq mm5, [edi+ 8*10] /* #6 load low residue */
+ lea eax, [ecx*2 + ebx] /* make dst_ystride * 7 */
+ movq mm6, [edi+ 8*11] /* #6 load high residue */
+ paddsw mm1, mm0 /* #4 bias low residue */
+ paddsw mm2, mm0 /* #4 bias high residue */
+ packuswb mm1, mm2 /* #4 pack to byte */
+ paddsw mm3, mm0 /* #5 bias low residue */
+ paddsw mm4, mm0 /* #5 bias high residue */
+ packuswb mm3, mm4 /* #5 pack to byte */
+ paddsw mm5, mm0 /* #6 bias low residue */
+ paddsw mm6, mm0 /* #6 bias high residue */
+ packuswb mm5, mm6 /* #6 pack to byte */
+ movq [edx + ecx], mm1 /* #4 write row */
+ movq [edx + ebx*4], mm3 /* #5 write row */
+ movq [edx + esi], mm5 /* #6 write row */
+ movq mm1, [edi+ 8*12] /* #7 load low residue */
+ movq mm2, [edi+ 8*13] /* #7 load high residue */
+ movq mm3, [edi+ 8*14] /* #8 load low residue */
+ movq mm4, [edi+ 8*15] /* #8 load high residue */
+ paddsw mm1, mm0 /* #7 bias low residue */
+ paddsw mm2, mm0 /* #7 bias high residue */
+ packuswb mm1, mm2 /* #7 pack to byte */
+ paddsw mm3, mm0 /* #8 bias low residue */
+ paddsw mm4, mm0 /* #8 bias high residue */
+ packuswb mm3, mm4 /* #8 pack to byte */
+ movq [edx + ecx*2], mm1 /* #7 write row */
+ movq [edx + eax], mm3 /* #8 write row */
+ }
+}
+
+
+
+void oc_frag_recon_inter_mmx (unsigned char *_dst, int _dst_ystride,
+ const unsigned char *_src, int _src_ystride, const ogg_int16_t *_residue){
+ /* ---------------------------------------------------------------------
+ This function does the inter reconstruction step with two iterations
+ running in parallel to hide some load-latencies and break the dependency
+ chains. The iteration for each instruction is noted by the #id in the
+ comments (in case you want to reconstruct it)
+ --------------------------------------------------------------------- */
+ _asm{
+ pxor mm0, mm0 /* generate constant 0 */
+ mov esi, [_src]
+ mov edi, [_residue]
+ mov eax, [_src_ystride]
+ mov edx, [_dst]
+ mov ebx, [_dst_ystride]
+ mov ecx, 4
+
+ align 16
+
+nextchunk:
+ movq mm3, [esi] /* #1 load source */
+ movq mm1, [edi+0] /* #1 load residue low */
+ movq mm2, [edi+8] /* #1 load residue high */
+ movq mm7, [esi+eax] /* #2 load source */
+ movq mm4, mm3 /* #1 get copy of src */
+ movq mm5, [edi+16] /* #2 load residue low */
+ punpckhbw mm4, mm0 /* #1 expand high source */
+ movq mm6, [edi+24] /* #2 load residue high */
+ punpcklbw mm3, mm0 /* #1 expand low source */
+ paddsw mm4, mm2 /* #1 add residue high */
+ movq mm2, mm7 /* #2 get copy of src */
+ paddsw mm3, mm1 /* #1 add residue low */
+ punpckhbw mm2, mm0 /* #2 expand high source */
+ packuswb mm3, mm4 /* #1 final row pixels */
+ punpcklbw mm7, mm0 /* #2 expand low source */
+ movq [edx], mm3 /* #1 write row */
+ paddsw mm2, mm6 /* #2 add residue high */
+ add edi, 32 /* residue += 4 */
+ paddsw mm7, mm5 /* #2 add residue low */
+ sub ecx, 1 /* update loop counter */
+ packuswb mm7, mm2 /* #2 final row */
+ lea esi, [esi+eax*2] /* src += stride * 2 */
+ movq [edx + ebx], mm7 /* #2 write row */
+ lea edx, [edx+ebx*2] /* dst += stride * 2 */
+ jne nextchunk
+ }
+}
+
+
+void oc_frag_recon_inter2_mmx(unsigned char *_dst, int _dst_ystride,
+ const unsigned char *_src1, int _src1_ystride, const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue){
+ /* ---------------------------------------------------------------------
+ This function does the inter2 reconstruction step. The building of the
+ average is done with a bit-twiddling trick to avoid excessive register
+ copy work during byte to word conversion.
+
+ average = (a & b) + (((a ^ b) & 0xfe) >> 1);
+
+ (shown for a single byte; it's done with 8 of them at a time)
+
+ Slightly faster than the obvious method using add and shift, but not
+ earthshaking improvement either.
+
+ If anyone comes up with a way that produces bit-identical outputs
+ using the pavgb instruction let me know and I'll do the 3dnow codepath.
+ --------------------------------------------------------------------- */
+ _asm{
+ mov eax, 0xfefefefe
+ mov esi, [_src1]
+ mov edi, [_src2]
+ movd mm1, eax
+ mov ebx, [_residue]
+ mov edx, [_dst]
+ mov eax, [_dst_ystride]
+ punpckldq mm1, mm1 /* replicate lsb32 */
+ mov ecx, 8 /* init loop counter */
+ pxor mm0, mm0 /* constant zero */
+ sub edx, eax /* dst -= dst_stride */
+
+ align 16
+
+nextrow:
+ movq mm2, [esi] /* load source1 */
+ movq mm3, [edi] /* load source2 */
+ movq mm5, [ebx + 0] /* load lower residue */
+ movq mm6, [ebx + 8] /* load higher residue */
+ add esi, _src1_ystride /* src1 += src1_stride */
+ add edi, _src2_ystride /* src2 += src2_stride */
+ movq mm4, mm2 /* get copy of source1 */
+ pand mm2, mm3 /* s1 & s2 (avg part) */
+ pxor mm3, mm4 /* s1 ^ s2 (avg part) */
+ add ebx, 16 /* residue++ */
+ pand mm3, mm1 /* mask out low bits */
+ psrlq mm3, 1 /* shift xor avg-part */
+ paddd mm3, mm2 /* build final average */
+ add edx, eax /* dst += dst_stride */
+ movq mm2, mm3 /* get copy of average */
+ punpckhbw mm3, mm0 /* average high */
+ punpcklbw mm2, mm0 /* average low */
+ paddsw mm3, mm6 /* high + residue */
+ paddsw mm2, mm5 /* low + residue */
+ sub ecx, 1 /* update loop counter */
+ packuswb mm2, mm3 /* pack and saturate */
+ movq [edx], mm2 /* write row */
+ jne nextrow
+ }
+}
+
+void oc_restore_fpu_mmx(void){
+ _asm { emms }
+}
+
+#endif
+
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id:
+
+ ********************************************************************/
+#include "../../internal.h"
+
+/* ------------------------------------------------------------------------
+ MMX reconstruction fragment routines for Visual Studio.
+ Tested with VS2005. Should compile for VS2003 and VC6 as well.
+
+ Initial implementation 2007 by Nils Pipenbrinck.
+ ---------------------------------------------------------------------*/
+
+#if defined(USE_ASM)
+
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t *_residue){
+ /* ---------------------------------------------------------------------
+ This function does the intra reconstruction step with 8 iterations
+ unrolled. The iteration for each instruction is noted by the #id in the
+ comments (in case you want to reconstruct it)
+ --------------------------------------------------------------------- */
+ _asm{
+ mov edi, [_residue] /* load residue ptr */
+ mov eax, 0x00800080 /* generate constant */
+ mov ebx, [_dst_ystride] /* load dst-stride */
+ mov edx, [_dst] /* load dest pointer */
+
+ /* unrolled loop begins here */
+
+ movd mm0, eax /* load constant */
+ movq mm1, [edi+ 8*0] /* #1 load low residue */
+ movq mm2, [edi+ 8*1] /* #1 load high residue */
+ punpckldq mm0, mm0 /* build constant */
+ movq mm3, [edi+ 8*2] /* #2 load low residue */
+ movq mm4, [edi+ 8*3] /* #2 load high residue */
+ movq mm5, [edi+ 8*4] /* #3 load low residue */
+ movq mm6, [edi+ 8*5] /* #3 load high residue */
+ paddsw mm1, mm0 /* #1 bias low residue */
+ paddsw mm2, mm0 /* #1 bias high residue */
+ packuswb mm1, mm2 /* #1 pack to byte */
+ paddsw mm3, mm0 /* #2 bias low residue */
+ paddsw mm4, mm0 /* #2 bias high residue */
+ packuswb mm3, mm4 /* #2 pack to byte */
+ paddsw mm5, mm0 /* #3 bias low residue */
+ paddsw mm6, mm0 /* #3 bias high residue */
+ packuswb mm5, mm6 /* #3 pack to byte */
+ movq [edx], mm1 /* #1 write row */
+ movq [edx + ebx], mm3 /* #2 write row */
+ movq [edx + ebx*2], mm5 /* #3 write row */
+ movq mm1, [edi+ 8*6] /* #4 load low residue */
+ lea ecx, [ebx + ebx*2] /* make dst_ystride * 3 */
+ movq mm2, [edi+ 8*7] /* #4 load high residue */
+ movq mm3, [edi+ 8*8] /* #5 load low residue */
+ lea esi, [ebx*4 + ebx] /* make dst_ystride * 5 */
+ movq mm4, [edi+ 8*9] /* #5 load high residue */
+ movq mm5, [edi+ 8*10] /* #6 load low residue */
+ lea eax, [ecx*2 + ebx] /* make dst_ystride * 7 */
+ movq mm6, [edi+ 8*11] /* #6 load high residue */
+ paddsw mm1, mm0 /* #4 bias low residue */
+ paddsw mm2, mm0 /* #4 bias high residue */
+ packuswb mm1, mm2 /* #4 pack to byte */
+ paddsw mm3, mm0 /* #5 bias low residue */
+ paddsw mm4, mm0 /* #5 bias high residue */
+ packuswb mm3, mm4 /* #5 pack to byte */
+ paddsw mm5, mm0 /* #6 bias low residue */
+ paddsw mm6, mm0 /* #6 bias high residue */
+ packuswb mm5, mm6 /* #6 pack to byte */
+ movq [edx + ecx], mm1 /* #4 write row */
+ movq [edx + ebx*4], mm3 /* #5 write row */
+ movq [edx + esi], mm5 /* #6 write row */
+ movq mm1, [edi+ 8*12] /* #7 load low residue */
+ movq mm2, [edi+ 8*13] /* #7 load high residue */
+ movq mm3, [edi+ 8*14] /* #8 load low residue */
+ movq mm4, [edi+ 8*15] /* #8 load high residue */
+ paddsw mm1, mm0 /* #7 bias low residue */
+ paddsw mm2, mm0 /* #7 bias high residue */
+ packuswb mm1, mm2 /* #7 pack to byte */
+ paddsw mm3, mm0 /* #8 bias low residue */
+ paddsw mm4, mm0 /* #8 bias high residue */
+ packuswb mm3, mm4 /* #8 pack to byte */
+ movq [edx + ecx*2], mm1 /* #7 write row */
+ movq [edx + eax], mm3 /* #8 write row */
+ }
+}
+
+
+
+void oc_frag_recon_inter_mmx (unsigned char *_dst, int _dst_ystride,
+ const unsigned char *_src, int _src_ystride, const ogg_int16_t *_residue){
+ /* ---------------------------------------------------------------------
+ This function does the inter reconstruction step with two iterations
+ running in parallel to hide some load-latencies and break the dependency
+ chains. The iteration for each instruction is noted by the #id in the
+ comments (in case you want to reconstruct it)
+ --------------------------------------------------------------------- */
+ _asm{
+ pxor mm0, mm0 /* generate constant 0 */
+ mov esi, [_src]
+ mov edi, [_residue]
+ mov eax, [_src_ystride]
+ mov edx, [_dst]
+ mov ebx, [_dst_ystride]
+ mov ecx, 4
+
+ align 16
+
+nextchunk:
+ movq mm3, [esi] /* #1 load source */
+ movq mm1, [edi+0] /* #1 load residue low */
+ movq mm2, [edi+8] /* #1 load residue high */
+ movq mm7, [esi+eax] /* #2 load source */
+ movq mm4, mm3 /* #1 get copy of src */
+ movq mm5, [edi+16] /* #2 load residue low */
+ punpckhbw mm4, mm0 /* #1 expand high source */
+ movq mm6, [edi+24] /* #2 load residue high */
+ punpcklbw mm3, mm0 /* #1 expand low source */
+ paddsw mm4, mm2 /* #1 add residue high */
+ movq mm2, mm7 /* #2 get copy of src */
+ paddsw mm3, mm1 /* #1 add residue low */
+ punpckhbw mm2, mm0 /* #2 expand high source */
+ packuswb mm3, mm4 /* #1 final row pixels */
+ punpcklbw mm7, mm0 /* #2 expand low source */
+ movq [edx], mm3 /* #1 write row */
+ paddsw mm2, mm6 /* #2 add residue high */
+ add edi, 32 /* residue += 4 */
+ paddsw mm7, mm5 /* #2 add residue low */
+ sub ecx, 1 /* update loop counter */
+ packuswb mm7, mm2 /* #2 final row */
+ lea esi, [esi+eax*2] /* src += stride * 2 */
+ movq [edx + ebx], mm7 /* #2 write row */
+ lea edx, [edx+ebx*2] /* dst += stride * 2 */
+ jne nextchunk
+ }
+}
+
+
+void oc_frag_recon_inter2_mmx(unsigned char *_dst, int _dst_ystride,
+ const unsigned char *_src1, int _src1_ystride, const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue){
+ /* ---------------------------------------------------------------------
+ This function does the inter2 reconstruction step. The building of the
+ average is done with a bit-twiddling trick to avoid excessive register
+ copy work during byte to word conversion.
+
+ average = (a & b) + (((a ^ b) & 0xfe) >> 1);
+
+ (shown for a single byte; it's done with 8 of them at a time)
+
+ Slightly faster than the obvious method using add and shift, but not
+ earthshaking improvement either.
+
+ If anyone comes up with a way that produces bit-identical outputs
+ using the pavgb instruction let me know and I'll do the 3dnow codepath.
+ --------------------------------------------------------------------- */
+ _asm{
+ mov eax, 0xfefefefe
+ mov esi, [_src1]
+ mov edi, [_src2]
+ movd mm1, eax
+ mov ebx, [_residue]
+ mov edx, [_dst]
+ mov eax, [_dst_ystride]
+ punpckldq mm1, mm1 /* replicate lsb32 */
+ mov ecx, 8 /* init loop counter */
+ pxor mm0, mm0 /* constant zero */
+ sub edx, eax /* dst -= dst_stride */
+
+ align 16
+
+nextrow:
+ movq mm2, [esi] /* load source1 */
+ movq mm3, [edi] /* load source2 */
+ movq mm5, [ebx + 0] /* load lower residue */
+ movq mm6, [ebx + 8] /* load higher residue */
+ add esi, _src1_ystride /* src1 += src1_stride */
+ add edi, _src2_ystride /* src2 += src2_stride */
+ movq mm4, mm2 /* get copy of source1 */
+ pand mm2, mm3 /* s1 & s2 (avg part) */
+ pxor mm3, mm4 /* s1 ^ s2 (avg part) */
+ add ebx, 16 /* residue++ */
+ pand mm3, mm1 /* mask out low bits */
+ psrlq mm3, 1 /* shift xor avg-part */
+ paddd mm3, mm2 /* build final average */
+ add edx, eax /* dst += dst_stride */
+ movq mm2, mm3 /* get copy of average */
+ punpckhbw mm3, mm0 /* average high */
+ punpcklbw mm2, mm0 /* average low */
+ paddsw mm3, mm6 /* high + residue */
+ paddsw mm2, mm5 /* low + residue */
+ sub ecx, 1 /* update loop counter */
+ packuswb mm2, mm3 /* pack and saturate */
+ movq [edx], mm2 /* write row */
+ jne nextrow
+ }
+}
+
+void oc_restore_fpu_mmx(void){
+ _asm { emms }
+}
+
+#endif
+
Property changes on: trunk/theora/lib/dec/x86_vc/mmxfrag.c
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Added: trunk/theora/lib/dec/x86_vc/mmxidct.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/mmxidct.c (rev 0)
+++ trunk/theora/lib/dec/x86_vc/mmxidct.c 2008-04-12 01:04:43 UTC (rev 14714)
@@ -0,0 +1,2014 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id:
+
+ ********************************************************************/
+
+/* -------------------------------------------------------------------
+ MMX based IDCT for the theora codec.
+
+ Originally written by Rudolf Marek, based on code from On2's VP3.
+ Converted to Visual Studio inline assembly by Nils Pipenbrinck.
+
+ ---------------------------------------------------------------------*/
+#if defined(USE_ASM)
+
+#include <ogg/ogg.h>
+#include "../dct.h"
+#include "../idct.h"
+#include "x86int.h"
+
+/*A table of constants used by the MMX routines.*/
+static const __declspec(align(16)) ogg_uint16_t
+ OC_IDCT_CONSTS[(7+1)*4]={
+ (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
+ (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
+ (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
+ (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
+ (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
+ (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
+ (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
+ (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
+ (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
+ (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
+ (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
+ (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
+ (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
+ (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
+ 8, 8, 8, 8
+};
+
+
+void oc_idct8x8_10_mmx(ogg_int16_t _y[64]){
+ _asm {
+ mov edx, [_y]
+ mov eax, offset OC_IDCT_CONSTS
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 18H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 38H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 28H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 08H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 20H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 10H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 08H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 18H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 10H]
+ movq mm0, mm4
+ movq [edx + 38H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 28H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 10H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 30H], mm4
+ movq [edx + 20H], mm2
+ movq mm2, [edx + 70H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 50H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 60H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 50H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 60H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 40H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 50H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 60H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 50H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx + 40H], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 48H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 58H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx + 40H]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 50H]
+ movq mm0, mm4
+ movq [edx + 78H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 68H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx + 40H], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 50H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 70H], mm4
+ movq [edx + 60H], mm2
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 50H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 70H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 60H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 40H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 20H]
+ paddw mm7, mm7
+ movq [edx + 20H], mm2
+ paddw mm7, mm4
+ movq [edx + 10H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 40H], mm4
+ psraw mm5, 4
+ movq [edx + 30H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 60H], mm6
+ psraw mm0, 4
+ movq [edx + 50H], mm5
+ movq [edx + 70H], mm7
+ movq [edx], mm0
+ movq mm2, [edx + 38H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 18H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 28H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 18H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 28H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 08H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 18H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 28H]
+ paddw mm7, mm7
+ movq [edx + 28H], mm2
+ paddw mm7, mm4
+ movq [edx + 18H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 48H], mm4
+ psraw mm5, 4
+ movq [edx + 38H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 68H], mm6
+ psraw mm0, 4
+ movq [edx + 58H], mm5
+ movq [edx + 78H], mm7
+ movq [edx + 08H], mm0
+ /* emms */
+ }
+}
+
+
+void oc_idct8x8_mmx(ogg_int16_t _y[64]){
+ _asm {
+ mov edx, [_y]
+ mov eax, offset OC_IDCT_CONSTS
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 18H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 38H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 28H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 08H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 20H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 10H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 08H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 18H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 10H]
+ movq mm0, mm4
+ movq [edx + 38H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 28H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 10H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 30H], mm4
+ movq [edx + 20H], mm2
+ movq mm2, [edx + 70H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 50H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 60H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 50H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 60H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 40H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 50H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 60H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 50H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx + 40H], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 48H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 58H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx + 40H]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 50H]
+ movq mm0, mm4
+ movq [edx + 78H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 68H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx + 40H], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 50H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 70H], mm4
+ movq [edx + 60H], mm2
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 50H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 70H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 60H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 40H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 20H]
+ paddw mm7, mm7
+ movq [edx + 20H], mm2
+ paddw mm7, mm4
+ movq [edx + 10H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 40H], mm4
+ psraw mm5, 4
+ movq [edx + 30H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 60H], mm6
+ psraw mm0, 4
+ movq [edx + 50H], mm5
+ movq [edx + 70H], mm7
+ movq [edx], mm0
+ movq mm2, [edx + 38H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 18H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 28H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 18H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 28H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 08H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 18H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 28H]
+ paddw mm7, mm7
+ movq [edx + 28H], mm2
+ paddw mm7, mm4
+ movq [edx + 18H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 48H], mm4
+ psraw mm5, 4
+ movq [edx + 38H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 68H], mm6
+ psraw mm0, 4
+ movq [edx + 58H], mm5
+ movq [edx + 78H], mm7
+ movq [edx + 08H], mm0
+ /* emms */
+ }
+}
+
+#endif
+
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+/* -------------------------------------------------------------------
+ MMX based IDCT for the theora codec.
+
+ Originally written by Rudolf Marek, based on code from On2's VP3.
+ Converted to Visual Studio inline assembly by Nils Pipenbrinck.
+
+ ---------------------------------------------------------------------*/
+#if defined(USE_ASM)
+
+#include <ogg/ogg.h>
+#include "../dct.h"
+#include "../idct.h"
+#include "x86int.h"
+
+/*A table of constants used by the MMX routines.*/
+static const __declspec(align(16)) ogg_uint16_t
+ OC_IDCT_CONSTS[(7+1)*4]={
+ (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
+ (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
+ (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
+ (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
+ (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
+ (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
+ (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
+ (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
+ (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
+ (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
+ (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
+ (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
+ (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
+ (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
+ 8, 8, 8, 8
+};
+
+
+void oc_idct8x8_10_mmx(ogg_int16_t _y[64]){
+ _asm {
+ mov edx, [_y]
+ mov eax, offset OC_IDCT_CONSTS
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 18H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 38H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 28H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 08H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 20H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 10H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 08H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 18H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 10H]
+ movq mm0, mm4
+ movq [edx + 38H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 28H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 10H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 30H], mm4
+ movq [edx + 20H], mm2
+ movq mm2, [edx + 70H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 50H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 60H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 50H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 60H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 40H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 50H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 60H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 50H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx + 40H], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 48H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 58H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx + 40H]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 50H]
+ movq mm0, mm4
+ movq [edx + 78H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 68H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx + 40H], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 50H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 70H], mm4
+ movq [edx + 60H], mm2
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 50H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 70H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 60H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 40H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 20H]
+ paddw mm7, mm7
+ movq [edx + 20H], mm2
+ paddw mm7, mm4
+ movq [edx + 10H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 40H], mm4
+ psraw mm5, 4
+ movq [edx + 30H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 60H], mm6
+ psraw mm0, 4
+ movq [edx + 50H], mm5
+ movq [edx + 70H], mm7
+ movq [edx], mm0
+ movq mm2, [edx + 38H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 18H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 28H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 18H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 28H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 08H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 18H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 28H]
+ paddw mm7, mm7
+ movq [edx + 28H], mm2
+ paddw mm7, mm4
+ movq [edx + 18H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 48H], mm4
+ psraw mm5, 4
+ movq [edx + 38H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 68H], mm6
+ psraw mm0, 4
+ movq [edx + 58H], mm5
+ movq [edx + 78H], mm7
+ movq [edx + 08H], mm0
+ /* emms */
+ }
+}
+
+
+void oc_idct8x8_mmx(ogg_int16_t _y[64]){
+ _asm {
+ mov edx, [_y]
+ mov eax, offset OC_IDCT_CONSTS
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 18H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 38H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 28H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 08H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 20H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 10H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 08H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 18H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 10H]
+ movq mm0, mm4
+ movq [edx + 38H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 28H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 10H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 30H], mm4
+ movq [edx + 20H], mm2
+ movq mm2, [edx + 70H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 50H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 60H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 50H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 60H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 40H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 50H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ movq mm3, [edx + 60H]
+ psubw mm4, mm7
+ paddw mm1, mm1
+ paddw mm7, mm7
+ paddw mm1, mm2
+ paddw mm7, mm4
+ psubw mm4, mm3
+ paddw mm3, mm3
+ psubw mm6, mm5
+ paddw mm5, mm5
+ paddw mm3, mm4
+ paddw mm5, mm6
+ psubw mm7, mm0
+ paddw mm0, mm0
+ movq [edx + 50H], mm1
+ paddw mm0, mm7
+ movq mm1, mm4
+ punpcklwd mm4, mm5
+ movq [edx + 40H], mm0
+ punpckhwd mm1, mm5
+ movq mm0, mm6
+ punpcklwd mm6, mm7
+ movq mm5, mm4
+ punpckldq mm4, mm6
+ punpckhdq mm5, mm6
+ movq mm6, mm1
+ movq [edx + 48H], mm4
+ punpckhwd mm0, mm7
+ movq [edx + 58H], mm5
+ punpckhdq mm6, mm0
+ movq mm4, [edx + 40H]
+ punpckldq mm1, mm0
+ movq mm5, [edx + 50H]
+ movq mm0, mm4
+ movq [edx + 78H], mm6
+ punpcklwd mm0, mm5
+ movq [edx + 68H], mm1
+ punpckhwd mm4, mm5
+ movq mm5, mm2
+ punpcklwd mm2, mm3
+ movq mm1, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm1, mm2
+ movq mm2, mm4
+ movq [edx + 40H], mm0
+ punpckhwd mm5, mm3
+ movq [edx + 50H], mm1
+ punpckhdq mm4, mm5
+ punpckldq mm2, mm5
+ movq [edx + 70H], mm4
+ movq [edx + 60H], mm2
+ movq mm2, [edx + 30H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 50H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 10H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 70H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 20H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 60H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 10H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 20H], mm6
+ movq mm2, mm0
+ movq mm6, [edx]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 40H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 10H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 20H]
+ paddw mm7, mm7
+ movq [edx + 20H], mm2
+ paddw mm7, mm4
+ movq [edx + 10H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 40H], mm4
+ psraw mm5, 4
+ movq [edx + 30H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 60H], mm6
+ psraw mm0, 4
+ movq [edx + 50H], mm5
+ movq [edx + 70H], mm7
+ movq [edx], mm0
+ movq mm2, [edx + 38H]
+ movq mm6, [eax + 10H]
+ movq mm4, mm2
+ movq mm7, [edx + 58H]
+ pmulhw mm4, mm6
+ movq mm1, [eax + 20H]
+ pmulhw mm6, mm7
+ movq mm5, mm1
+ pmulhw mm1, mm2
+ movq mm3, [edx + 18H]
+ pmulhw mm5, mm7
+ movq mm0, [eax]
+ paddw mm4, mm2
+ paddw mm6, mm7
+ paddw mm2, mm1
+ movq mm1, [edx + 78H]
+ paddw mm7, mm5
+ movq mm5, mm0
+ pmulhw mm0, mm3
+ paddw mm4, mm7
+ pmulhw mm5, mm1
+ movq mm7, [eax + 30H]
+ psubw mm6, mm2
+ paddw mm0, mm3
+ pmulhw mm3, mm7
+ movq mm2, [edx + 28H]
+ pmulhw mm7, mm1
+ paddw mm5, mm1
+ movq mm1, mm2
+ pmulhw mm2, [eax + 08H]
+ psubw mm3, mm5
+ movq mm5, [edx + 68H]
+ paddw mm0, mm7
+ movq mm7, mm5
+ psubw mm0, mm4
+ pmulhw mm5, [eax + 08H]
+ paddw mm2, mm1
+ pmulhw mm1, [eax + 28H]
+ paddw mm4, mm4
+ paddw mm4, mm0
+ psubw mm3, mm6
+ paddw mm5, mm7
+ paddw mm6, mm6
+ pmulhw mm7, [eax + 28H]
+ paddw mm6, mm3
+ movq [edx + 18H], mm4
+ psubw mm1, mm5
+ movq mm4, [eax + 18H]
+ movq mm5, mm3
+ pmulhw mm3, mm4
+ paddw mm7, mm2
+ movq [edx + 28H], mm6
+ movq mm2, mm0
+ movq mm6, [edx + 08H]
+ pmulhw mm0, mm4
+ paddw mm5, mm3
+ movq mm3, [edx + 48H]
+ psubw mm5, mm1
+ paddw mm2, mm0
+ psubw mm6, mm3
+ movq mm0, mm6
+ pmulhw mm6, mm4
+ paddw mm3, mm3
+ paddw mm1, mm1
+ paddw mm3, mm0
+ paddw mm1, mm5
+ pmulhw mm4, mm3
+ paddw mm6, mm0
+ psubw mm6, mm2
+ paddw mm2, mm2
+ movq mm0, [edx + 18H]
+ paddw mm2, mm6
+ paddw mm4, mm3
+ psubw mm2, mm1
+ paddw mm2, [eax + 38H]
+ paddw mm1, mm1
+ paddw mm1, mm2
+ psraw mm2, 4
+ psubw mm4, mm7
+ psraw mm1, 4
+ movq mm3, [edx + 28H]
+ paddw mm7, mm7
+ movq [edx + 28H], mm2
+ paddw mm7, mm4
+ movq [edx + 18H], mm1
+ psubw mm4, mm3
+ paddw mm4, [eax + 38H]
+ paddw mm3, mm3
+ paddw mm3, mm4
+ psraw mm4, 4
+ psubw mm6, mm5
+ psraw mm3, 4
+ paddw mm6, [eax + 38H]
+ paddw mm5, mm5
+ paddw mm5, mm6
+ psraw mm6, 4
+ movq [edx + 48H], mm4
+ psraw mm5, 4
+ movq [edx + 38H], mm3
+ psubw mm7, mm0
+ paddw mm7, [eax + 38H]
+ paddw mm0, mm0
+ paddw mm0, mm7
+ psraw mm7, 4
+ movq [edx + 68H], mm6
+ psraw mm0, 4
+ movq [edx + 58H], mm5
+ movq [edx + 78H], mm7
+ movq [edx + 08H], mm0
+ /* emms */
+ }
+}
+
+#endif
+
Property changes on: trunk/theora/lib/dec/x86_vc/mmxidct.c
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Added: trunk/theora/lib/dec/x86_vc/mmxloopfilter.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/mmxloopfilter.c (rev 0)
+++ trunk/theora/lib/dec/x86_vc/mmxloopfilter.c 2008-04-12 01:04:43 UTC (rev 14714)
@@ -0,0 +1,756 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id:
+
+ ********************************************************************/
+
+/* -------------------------------------------------------------------
+ MMX based loop filter for the theora codec.
+
+ Originally written by Rudolf Marek, based on code from On2's VP3.
+ Converted to Visual Studio inline assembly by Nils Pipenbrinck.
+
+ Note: I can't test these since my example files never get into the
+ loop filters, but the code has been converted semi-automatically from
+ the GCC sources, so it ought to work.
+ ---------------------------------------------------------------------*/
+#include "../../internal.h"
+#include "x86int.h"
+#include <mmintrin.h>
+
+#if defined(USE_ASM)
+
+
+
+static void loop_filter_v(unsigned char *_pix,int _ystride,
+ const ogg_int16_t *_ll){
+ _asm {
+ mov eax, [_pix]
+ mov edx, [_ystride]
+ mov ebx, [_ll]
+
+ /* _pix -= ystride */
+ sub eax, edx
+ /* mm0=0 */
+ pxor mm0, mm0
+ /* _pix -= ystride */
+ sub eax, edx
+ /* esi=_ystride*3 */
+ lea esi, [edx + edx*2]
+
+ /* mm7=_pix[0...8]*/
+ movq mm7, [eax]
+ /* mm4=_pix[0...8+_ystride*3]*/
+ movq mm4, [eax + esi]
+ /* mm6=_pix[0...8]*/
+ movq mm6, mm7
+ /* Expand unsigned _pix[0...3] to 16 bits.*/
+ punpcklbw mm6, mm0
+ movq mm5, mm4
+ /* Expand unsigned _pix[4...7] to 16 bits.*/
+ punpckhbw mm7, mm0
+ punpcklbw mm4, mm0
+ /* Expand other arrays too.*/
+ punpckhbw mm5, mm0
+ /*mm7:mm6=_p[0...7]-_p[0...7+_ystride*3]:*/
+ psubw mm6, mm4
+ psubw mm7, mm5
+ /*mm5=mm4=_pix[0...7+_ystride]*/
+ movq mm4, [eax + edx]
+ /*mm1=mm3=mm2=_pix[0...7+_ystride*2]*/
+ movq mm2, [eax + edx*2]
+ movq mm5, mm4
+ movq mm3, mm2
+ movq mm1, mm2
+ /*Expand these arrays.*/
+ punpckhbw mm5, mm0
+ punpcklbw mm4, mm0
+ punpckhbw mm3, mm0
+ punpcklbw mm2, mm0
+ pcmpeqw mm0, mm0
+ /*mm0=3 3 3 3
+ mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
+ psubw mm3, mm5
+ psrlw mm0, 14
+ psubw mm2, mm4
+ /*Scale by 3.*/
+ pmullw mm3, mm0
+ pmullw mm2, mm0
+ /*mm0=4 4 4 4
+ f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
+ 3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
+ psrlw mm0, 1
+ paddw mm3, mm7
+ psllw mm0, 2
+ paddw mm2, mm6
+ /*Add 4.*/
+ paddw mm3, mm0
+ paddw mm2, mm0
+ /*"Divide" by 8.*/
+ psraw mm3, 3
+ psraw mm2, 3
+ /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the spec.*/
+ /*Free up mm5.*/
+ packuswb mm4, mm5
+ /*mm0=L L L L*/
+ movq mm0, [ebx]
+ /*if(R_i<-2L||R_i>2L)R_i=0:*/
+ movq mm5, mm2
+ pxor mm6, mm6
+ movq mm7, mm0
+ psubw mm6, mm0
+ psllw mm7, 1
+ psllw mm6, 1
+ /*mm2==R_3 R_2 R_1 R_0*/
+ /*mm5==R_3 R_2 R_1 R_0*/
+ /*mm6==-2L -2L -2L -2L*/
+ /*mm7==2L 2L 2L 2L*/
+ pcmpgtw mm7, mm2
+ pcmpgtw mm5, mm6
+ pand mm2, mm7
+ movq mm7, mm0
+ pand mm2, mm5
+ psllw mm7, 1
+ movq mm5, mm3
+ /*mm3==R_7 R_6 R_5 R_4*/
+ /*mm5==R_7 R_6 R_5 R_4*/
+ /*mm6==-2L -2L -2L -2L*/
+ /*mm7==2L 2L 2L 2L*/
+ pcmpgtw mm7, mm3
+ pcmpgtw mm5, mm6
+ pand mm3, mm7
+ movq mm7, mm0
+ pand mm3, mm5
+ /*if(R_i<-L)R_i'=R_i+2L;
+ if(R_i>L)R_i'=R_i-2L;
+ if(R_i<-L||R_i>L)R_i=-R_i':*/
+ psraw mm6, 1
+ movq mm5, mm2
+ psllw mm7, 1
+ /*mm2==R_3 R_2 R_1 R_0*/
+ /*mm5==R_3 R_2 R_1 R_0*/
+ /*mm6==-L -L -L -L*/
+ /*mm0==L L L L*/
+ /*mm5=R_i>L?FF:00*/
+ pcmpgtw mm5, mm0
+ /*mm6=-L>R_i?FF:00*/
+ pcmpgtw mm6, mm2
+ /*mm7=R_i>L?2L:0*/
+ pand mm7, mm5
+ /*mm2=R_i>L?R_i-2L:R_i*/
+ psubw mm2, mm7
+ movq mm7, mm0
+ /*mm5=-L>R_i||R_i>L*/
+ por mm5, mm6
+ psllw mm7, 1
+ /*mm7=-L>R_i?2L:0*/
+ pand mm7, mm6
+ pxor mm6, mm6
+ /*mm2=-L>R_i?R_i+2L:R_i*/
+ paddw mm2, mm7
+ psubw mm6, mm0
+ /*mm5=-L>R_i||R_i>L?-R_i':0*/
+ pand mm5, mm2
+ movq mm7, mm0
+ /*mm2=-L>R_i||R_i>L?0:R_i*/
+ psubw mm2, mm5
+ psllw mm7, 1
+ /*mm2=-L>R_i||R_i>L?-R_i':R_i*/
+ psubw mm2, mm5
+ movq mm5, mm3
+ /*mm3==R_7 R_6 R_5 R_4*/
+ /*mm5==R_7 R_6 R_5 R_4*/
+ /*mm6==-L -L -L -L*/
+ /*mm0==L L L L*/
+ /*mm6=-L>R_i?FF:00*/
+ pcmpgtw mm6, mm3
+ /*mm5=R_i>L?FF:00*/
+ pcmpgtw mm5, mm0
+ /*mm7=R_i>L?2L:0*/
+ pand mm7, mm5
+ /*mm3=R_i>L?R_i-2L:R_i*/
+ psubw mm3, mm7
+ psllw mm0, 1
+ /*mm5=-L>R_i||R_i>L*/
+ por mm5, mm6
+ /*mm0=-L>R_i?2L:0*/
+ pand mm0, mm6
+ /*mm3=-L>R_i?R_i+2L:R_i*/
+ paddw mm3, mm0
+ /*mm5=-L>R_i||R_i>L?-R_i':0*/
+ pand mm5, mm3
+ /*mm3=-L>R_i||R_i>L?0:R_i*/
+ psubw mm3, mm5
+ /*mm3=-L>R_i||R_i>L?-R_i':R_i*/
+ psubw mm3, mm5
+ /*Unfortunately, there's no unsigned byte+signed byte with unsigned
+ saturation op code, so we have to promote things back to 16 bits.*/
+ pxor mm0, mm0
+ movq mm5, mm4
+ punpcklbw mm4, mm0
+ punpckhbw mm5, mm0
+ movq mm6, mm1
+ punpcklbw mm1, mm0
+ punpckhbw mm6, mm0
+ /*_pix[0...8+_ystride]+=R_i*/
+ paddw mm4, mm2
+ paddw mm5, mm3
+ /*_pix[0...8+_ystride*2]-=R_i*/
+ psubw mm1, mm2
+ psubw mm6, mm3
+ packuswb mm4, mm5
+ packuswb mm1, mm6
+ /*Write it back out.*/
+ movq [eax + edx], mm4
+ movq [eax + edx*2], mm1
+ }
+}
+
+/*This code implements the bulk of loop_filter_h().
+ Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
+ four p0's to one register we must transpose the values in four mmx regs.
+ When half is done we repeat this for the rest.*/
+static void loop_filter_h4(unsigned char *_pix,long _ystride,
+ const ogg_int16_t *_ll){
+ /* todo: merge the comments from the GCC sources */
+ _asm {
+ mov ecx, [_pix]
+ mov edx, [_ystride]
+ mov eax, [_ll]
+ /*esi=_ystride*3*/
+ lea esi, [edx + edx*2]
+
+ movd mm0, dword ptr [ecx]
+ movd mm1, dword ptr [ecx + edx]
+ movd mm2, dword ptr [ecx + edx*2]
+ movd mm3, dword ptr [ecx + esi]
+ punpcklbw mm0, mm1
+ punpcklbw mm2, mm3
+ movq mm1, mm0
+ punpckhwd mm0, mm2
+ punpcklwd mm1, mm2
+ pxor mm7, mm7
+ movq mm5, mm1
+ punpcklbw mm1, mm7
+ punpckhbw mm5, mm7
+ movq mm3, mm0
+ punpcklbw mm0, mm7
+ punpckhbw mm3, mm7
+ psubw mm1, mm3
+ movq mm4, mm0
+ pcmpeqw mm2, mm2
+ psubw mm0, mm5
+ psrlw mm2, 14
+ pmullw mm0, mm2
+ psrlw mm2, 1
+ paddw mm0, mm1
+ psllw mm2, 2
+ paddw mm0, mm2
+ psraw mm0, 3
+ movq mm6, qword ptr [eax]
+ movq mm1, mm0
+ pxor mm2, mm2
+ movq mm3, mm6
+ psubw mm2, mm6
+ psllw mm3, 1
+ psllw mm2, 1
+ pcmpgtw mm3, mm0
+ pcmpgtw mm1, mm2
+ pand mm0, mm3
+ pand mm0, mm1
+ psraw mm2, 1
+ movq mm1, mm0
+ movq mm3, mm6
+ pcmpgtw mm2, mm0
+ pcmpgtw mm1, mm6
+ psllw mm3, 1
+ psllw mm6, 1
+ pand mm3, mm1
+ pand mm6, mm2
+ psubw mm0, mm3
+ por mm1, mm2
+ paddw mm0, mm6
+ pand mm1, mm0
+ psubw mm0, mm1
+ psubw mm0, mm1
+ paddw mm5, mm0
+ psubw mm4, mm0
+ packuswb mm5, mm7
+ packuswb mm4, mm7
+ punpcklbw mm5, mm4
+ movd edi, mm5
+ mov word ptr [ecx + 01H], di
+ psrlq mm5, 32
+ shr edi, 16
+ mov word ptr [ecx + edx + 01H], di
+ movd edi, mm5
+ mov word ptr [ecx + edx*2 + 01H], di
+ shr edi, 16
+ mov word ptr [ecx + esi + 01H], di
+ }
+}
+
+static void loop_filter_h(unsigned char *_pix,int _ystride,
+ const ogg_int16_t *_ll){
+ _pix-=2;
+ loop_filter_h4(_pix,_ystride,_ll);
+ loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll);
+}
+
+
+/*We copy the whole function because the MMX routines will be inlined 4 times,
+ and we can do just a single emms call at the end this way.
+ We also do not use the _bv lookup table, instead computing the values that
+ would lie in it on the fly.*/
+
+/*Apply the loop filter to a given set of fragment rows in the given plane.
+ The filter may be run on the bottom edge, affecting pixels in the next row of
+ fragments, so this row also needs to be available.
+ _bv: The bounding values array.
+ _refi: The index of the frame buffer to filter.
+ _pli: The color plane to filter.
+ _fragy0: The Y coordinate of the first fragment row to filter.
+ _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end){
+ ogg_int16_t __declspec(align(8)) ll[4];
+ th_img_plane *iplane;
+ oc_fragment_plane *fplane;
+ oc_fragment *frag_top;
+ oc_fragment *frag0;
+ oc_fragment *frag;
+ oc_fragment *frag_end;
+ oc_fragment *frag0_end;
+ oc_fragment *frag_bot;
+ ll[0]=ll[1]=ll[2]=ll[3]=
+ (ogg_int16_t)_state->loop_filter_limits[_state->qis[0]];
+ iplane=_state->ref_frame_bufs[_refi]+_pli;
+ fplane=_state->fplanes+_pli;
+ /*The following loops are constructed somewhat non-intuitively on purpose.
+ The main idea is: if a block boundary has at least one coded fragment on
+ it, the filter is applied to it.
+ However, the order that the filters are applied in matters, and VP3 chose
+ the somewhat strange ordering used below.*/
+ frag_top=_state->frags+fplane->froffset;
+ frag0=frag_top+_fragy0*fplane->nhfrags;
+ frag0_end=frag0+(_fragy_end-_fragy0)*fplane->nhfrags;
+ frag_bot=_state->frags+fplane->froffset+fplane->nfrags;
+ while(frag0<frag0_end){
+ frag=frag0;
+ frag_end=frag+fplane->nhfrags;
+ while(frag<frag_end){
+ if(frag->coded){
+ if(frag>frag0){
+ loop_filter_h(frag->buffer[_refi],iplane->ystride,ll);
+ }
+ if(frag0>frag_top){
+ loop_filter_v(frag->buffer[_refi],iplane->ystride,ll);
+ }
+ if(frag+1<frag_end&&!(frag+1)->coded){
+ loop_filter_h(frag->buffer[_refi]+8,iplane->ystride,ll);
+ }
+ if(frag+fplane->nhfrags<frag_bot&&!(frag+fplane->nhfrags)->coded){
+ loop_filter_v((frag+fplane->nhfrags)->buffer[_refi],
+ iplane->ystride,ll);
+ }
+ }
+ frag++;
+ }
+ frag0+=fplane->nhfrags;
+ }
+
+ /*This needs to be removed when decode specific functions are implemented:*/
+ _mm_empty();
+}
+
+#endif
+
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id:
+
+ ********************************************************************/
+
+/* -------------------------------------------------------------------
+ MMX based loop filter for the theora codec.
+
+ Originally written by Rudolf Marek, based on code from On2's VP3.
+ Converted to Visual Studio inline assembly by Nils Pipenbrinck.
+
+ Note: I can't test these since my example files never get into the
+ loop filters, but the code has been converted semi-automatically from
+ the GCC sources, so it ought to work.
+ ---------------------------------------------------------------------*/
+#include "../../internal.h"
+#include "x86int.h"
+#include <mmintrin.h>
+
+#if defined(USE_ASM)
+
+
+
+static void loop_filter_v(unsigned char *_pix,int _ystride,
+ const ogg_int16_t *_ll){
+ _asm {
+ mov eax, [_pix]
+ mov edx, [_ystride]
+ mov ebx, [_ll]
+
+ /* _pix -= ystride */
+ sub eax, edx
+ /* mm0=0 */
+ pxor mm0, mm0
+ /* _pix -= ystride */
+ sub eax, edx
+ /* esi=_ystride*3 */
+ lea esi, [edx + edx*2]
+
+ /* mm7=_pix[0...8]*/
+ movq mm7, [eax]
+ /* mm4=_pix[0...8+_ystride*3]*/
+ movq mm4, [eax + esi]
+ /* mm6=_pix[0...8]*/
+ movq mm6, mm7
+ /* Expand unsigned _pix[0...3] to 16 bits.*/
+ punpcklbw mm6, mm0
+ movq mm5, mm4
+ /* Expand unsigned _pix[4...7] to 16 bits.*/
+ punpckhbw mm7, mm0
+ punpcklbw mm4, mm0
+ /* Expand other arrays too.*/
+ punpckhbw mm5, mm0
+ /*mm7:mm6=_p[0...7]-_p[0...7+_ystride*3]:*/
+ psubw mm6, mm4
+ psubw mm7, mm5
+ /*mm5=mm4=_pix[0...7+_ystride]*/
+ movq mm4, [eax + edx]
+ /*mm1=mm3=mm2=_pix[0...7+_ystride*2]*/
+ movq mm2, [eax + edx*2]
+ movq mm5, mm4
+ movq mm3, mm2
+ movq mm1, mm2
+ /*Expand these arrays.*/
+ punpckhbw mm5, mm0
+ punpcklbw mm4, mm0
+ punpckhbw mm3, mm0
+ punpcklbw mm2, mm0
+ pcmpeqw mm0, mm0
+ /*mm0=3 3 3 3
+ mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
+ psubw mm3, mm5
+ psrlw mm0, 14
+ psubw mm2, mm4
+ /*Scale by 3.*/
+ pmullw mm3, mm0
+ pmullw mm2, mm0
+ /*mm0=4 4 4 4
+ f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
+ 3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
+ psrlw mm0, 1
+ paddw mm3, mm7
+ psllw mm0, 2
+ paddw mm2, mm6
+ /*Add 4.*/
+ paddw mm3, mm0
+ paddw mm2, mm0
+ /*"Divide" by 8.*/
+ psraw mm3, 3
+ psraw mm2, 3
+ /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the spec.*/
+ /*Free up mm5.*/
+ packuswb mm4, mm5
+ /*mm0=L L L L*/
+ movq mm0, [ebx]
+ /*if(R_i<-2L||R_i>2L)R_i=0:*/
+ movq mm5, mm2
+ pxor mm6, mm6
+ movq mm7, mm0
+ psubw mm6, mm0
+ psllw mm7, 1
+ psllw mm6, 1
+ /*mm2==R_3 R_2 R_1 R_0*/
+ /*mm5==R_3 R_2 R_1 R_0*/
+ /*mm6==-2L -2L -2L -2L*/
+ /*mm7==2L 2L 2L 2L*/
+ pcmpgtw mm7, mm2
+ pcmpgtw mm5, mm6
+ pand mm2, mm7
+ movq mm7, mm0
+ pand mm2, mm5
+ psllw mm7, 1
+ movq mm5, mm3
+ /*mm3==R_7 R_6 R_5 R_4*/
+ /*mm5==R_7 R_6 R_5 R_4*/
+ /*mm6==-2L -2L -2L -2L*/
+ /*mm7==2L 2L 2L 2L*/
+ pcmpgtw mm7, mm3
+ pcmpgtw mm5, mm6
+ pand mm3, mm7
+ movq mm7, mm0
+ pand mm3, mm5
+ /*if(R_i<-L)R_i'=R_i+2L;
+ if(R_i>L)R_i'=R_i-2L;
+ if(R_i<-L||R_i>L)R_i=-R_i':*/
+ psraw mm6, 1
+ movq mm5, mm2
+ psllw mm7, 1
+ /*mm2==R_3 R_2 R_1 R_0*/
+ /*mm5==R_3 R_2 R_1 R_0*/
+ /*mm6==-L -L -L -L*/
+ /*mm0==L L L L*/
+ /*mm5=R_i>L?FF:00*/
+ pcmpgtw mm5, mm0
+ /*mm6=-L>R_i?FF:00*/
+ pcmpgtw mm6, mm2
+ /*mm7=R_i>L?2L:0*/
+ pand mm7, mm5
+ /*mm2=R_i>L?R_i-2L:R_i*/
+ psubw mm2, mm7
+ movq mm7, mm0
+ /*mm5=-L>R_i||R_i>L*/
+ por mm5, mm6
+ psllw mm7, 1
+ /*mm7=-L>R_i?2L:0*/
+ pand mm7, mm6
+ pxor mm6, mm6
+ /*mm2=-L>R_i?R_i+2L:R_i*/
+ paddw mm2, mm7
+ psubw mm6, mm0
+ /*mm5=-L>R_i||R_i>L?-R_i':0*/
+ pand mm5, mm2
+ movq mm7, mm0
+ /*mm2=-L>R_i||R_i>L?0:R_i*/
+ psubw mm2, mm5
+ psllw mm7, 1
+ /*mm2=-L>R_i||R_i>L?-R_i':R_i*/
+ psubw mm2, mm5
+ movq mm5, mm3
+ /*mm3==R_7 R_6 R_5 R_4*/
+ /*mm5==R_7 R_6 R_5 R_4*/
+ /*mm6==-L -L -L -L*/
+ /*mm0==L L L L*/
+ /*mm6=-L>R_i?FF:00*/
+ pcmpgtw mm6, mm3
+ /*mm5=R_i>L?FF:00*/
+ pcmpgtw mm5, mm0
+ /*mm7=R_i>L?2L:0*/
+ pand mm7, mm5
+ /*mm3=R_i>L?R_i-2L:R_i*/
+ psubw mm3, mm7
+ psllw mm0, 1
+ /*mm5=-L>R_i||R_i>L*/
+ por mm5, mm6
+ /*mm0=-L>R_i?2L:0*/
+ pand mm0, mm6
+ /*mm3=-L>R_i?R_i+2L:R_i*/
+ paddw mm3, mm0
+ /*mm5=-L>R_i||R_i>L?-R_i':0*/
+ pand mm5, mm3
+ /*mm3=-L>R_i||R_i>L?0:R_i*/
+ psubw mm3, mm5
+ /*mm3=-L>R_i||R_i>L?-R_i':R_i*/
+ psubw mm3, mm5
+ /*Unfortunately, there's no unsigned byte+signed byte with unsigned
+ saturation op code, so we have to promote things back to 16 bits.*/
+ pxor mm0, mm0
+ movq mm5, mm4
+ punpcklbw mm4, mm0
+ punpckhbw mm5, mm0
+ movq mm6, mm1
+ punpcklbw mm1, mm0
+ punpckhbw mm6, mm0
+ /*_pix[0...8+_ystride]+=R_i*/
+ paddw mm4, mm2
+ paddw mm5, mm3
+ /*_pix[0...8+_ystride*2]-=R_i*/
+ psubw mm1, mm2
+ psubw mm6, mm3
+ packuswb mm4, mm5
+ packuswb mm1, mm6
+ /*Write it back out.*/
+ movq [eax + edx], mm4
+ movq [eax + edx*2], mm1
+ }
+}
+
+/*This code implements the bulk of loop_filter_h().
+ Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
+ four p0's to one register we must transpose the values in four mmx regs.
+ When half is done we repeat this for the rest.*/
+static void loop_filter_h4(unsigned char *_pix,long _ystride,
+ const ogg_int16_t *_ll){
+ /* todo: merge the comments from the GCC sources */
+ _asm {
+ mov ecx, [_pix]
+ mov edx, [_ystride]
+ mov eax, [_ll]
+ /*esi=_ystride*3*/
+ lea esi, [edx + edx*2]
+
+ movd mm0, dword ptr [ecx]
+ movd mm1, dword ptr [ecx + edx]
+ movd mm2, dword ptr [ecx + edx*2]
+ movd mm3, dword ptr [ecx + esi]
+ punpcklbw mm0, mm1
+ punpcklbw mm2, mm3
+ movq mm1, mm0
+ punpckhwd mm0, mm2
+ punpcklwd mm1, mm2
+ pxor mm7, mm7
+ movq mm5, mm1
+ punpcklbw mm1, mm7
+ punpckhbw mm5, mm7
+ movq mm3, mm0
+ punpcklbw mm0, mm7
+ punpckhbw mm3, mm7
+ psubw mm1, mm3
+ movq mm4, mm0
+ pcmpeqw mm2, mm2
+ psubw mm0, mm5
+ psrlw mm2, 14
+ pmullw mm0, mm2
+ psrlw mm2, 1
+ paddw mm0, mm1
+ psllw mm2, 2
+ paddw mm0, mm2
+ psraw mm0, 3
+ movq mm6, qword ptr [eax]
+ movq mm1, mm0
+ pxor mm2, mm2
+ movq mm3, mm6
+ psubw mm2, mm6
+ psllw mm3, 1
+ psllw mm2, 1
+ pcmpgtw mm3, mm0
+ pcmpgtw mm1, mm2
+ pand mm0, mm3
+ pand mm0, mm1
+ psraw mm2, 1
+ movq mm1, mm0
+ movq mm3, mm6
+ pcmpgtw mm2, mm0
+ pcmpgtw mm1, mm6
+ psllw mm3, 1
+ psllw mm6, 1
+ pand mm3, mm1
+ pand mm6, mm2
+ psubw mm0, mm3
+ por mm1, mm2
+ paddw mm0, mm6
+ pand mm1, mm0
+ psubw mm0, mm1
+ psubw mm0, mm1
+ paddw mm5, mm0
+ psubw mm4, mm0
+ packuswb mm5, mm7
+ packuswb mm4, mm7
+ punpcklbw mm5, mm4
+ movd edi, mm5
+ mov word ptr [ecx + 01H], di
+ psrlq mm5, 32
+ shr edi, 16
+ mov word ptr [ecx + edx + 01H], di
+ movd edi, mm5
+ mov word ptr [ecx + edx*2 + 01H], di
+ shr edi, 16
+ mov word ptr [ecx + esi + 01H], di
+ }
+}
+
+static void loop_filter_h(unsigned char *_pix,int _ystride,
+ const ogg_int16_t *_ll){
+ _pix-=2;
+ loop_filter_h4(_pix,_ystride,_ll);
+ loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll);
+}
+
+
+/*We copy the whole function because the MMX routines will be inlined 4 times,
+ and we can do just a single emms call at the end this way.
+ We also do not use the _bv lookup table, instead computing the values that
+ would lie in it on the fly.*/
+
+/*Apply the loop filter to a given set of fragment rows in the given plane.
+ The filter may be run on the bottom edge, affecting pixels in the next row of
+ fragments, so this row also needs to be available.
+ _bv: The bounding values array.
+ _refi: The index of the frame buffer to filter.
+ _pli: The color plane to filter.
+ _fragy0: The Y coordinate of the first fragment row to filter.
+ _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end){
+ ogg_int16_t __declspec(align(8)) ll[4];
+ th_img_plane *iplane;
+ oc_fragment_plane *fplane;
+ oc_fragment *frag_top;
+ oc_fragment *frag0;
+ oc_fragment *frag;
+ oc_fragment *frag_end;
+ oc_fragment *frag0_end;
+ oc_fragment *frag_bot;
+ ll[0]=ll[1]=ll[2]=ll[3]=
+ (ogg_int16_t)_state->loop_filter_limits[_state->qis[0]];
+ iplane=_state->ref_frame_bufs[_refi]+_pli;
+ fplane=_state->fplanes+_pli;
+ /*The following loops are constructed somewhat non-intuitively on purpose.
+ The main idea is: if a block boundary has at least one coded fragment on
+ it, the filter is applied to it.
+ However, the order that the filters are applied in matters, and VP3 chose
+ the somewhat strange ordering used below.*/
+ frag_top=_state->frags+fplane->froffset;
+ frag0=frag_top+_fragy0*fplane->nhfrags;
+ frag0_end=frag0+(_fragy_end-_fragy0)*fplane->nhfrags;
+ frag_bot=_state->frags+fplane->froffset+fplane->nfrags;
+ while(frag0<frag0_end){
+ frag=frag0;
+ frag_end=frag+fplane->nhfrags;
+ while(frag<frag_end){
+ if(frag->coded){
+ if(frag>frag0){
+ loop_filter_h(frag->buffer[_refi],iplane->ystride,ll);
+ }
+ if(frag0>frag_top){
+ loop_filter_v(frag->buffer[_refi],iplane->ystride,ll);
+ }
+ if(frag+1<frag_end&&!(frag+1)->coded){
+ loop_filter_h(frag->buffer[_refi]+8,iplane->ystride,ll);
+ }
+ if(frag+fplane->nhfrags<frag_bot&&!(frag+fplane->nhfrags)->coded){
+ loop_filter_v((frag+fplane->nhfrags)->buffer[_refi],
+ iplane->ystride,ll);
+ }
+ }
+ frag++;
+ }
+ frag0+=fplane->nhfrags;
+ }
+
+ /*This needs to be removed when decode specific functions are implemented:*/
+ _mm_empty();
+}
+
+#endif
+
Property changes on: trunk/theora/lib/dec/x86_vc/mmxloopfilter.c
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Added: trunk/theora/lib/dec/x86_vc/mmxstate.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/mmxstate.c (rev 0)
+++ trunk/theora/lib/dec/x86_vc/mmxstate.c 2008-04-12 01:04:43 UTC (rev 14714)
@@ -0,0 +1,382 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id:
+
+ ********************************************************************/
+
+/* ------------------------------------------------------------------------
+ MMX acceleration of complete fragment reconstruction algorithm.
+ Originally written by Rudolf Marek.
+
+ Conversion to MSC intrinsics by Nils Pipenbrinck.
+ ---------------------------------------------------------------------*/
+#if defined(USE_ASM)
+
+#include "../../internal.h"
+#include "../idct.h"
+#include "x86int.h"
+#include <mmintrin.h>
+
+static const unsigned char OC_FZIG_ZAGMMX[64]=
+{
+ 0, 8, 1, 2, 9,16,24,17,
+ 10, 3,32,11,18,25, 4,12,
+ 5,26,19,40,33,34,41,48,
+ 27, 6,13,20,28,21,14, 7,
+ 56,49,42,35,43,50,57,36,
+ 15,22,29,30,23,44,37,58,
+ 51,59,38,45,52,31,60,53,
+ 46,39,47,54,61,62,55,63
+};
+
+/* Fill a block with value */
+static __inline void loc_fill_mmx_value (__m64 * _dst, __m64 _value){
+ __m64 t = _value;
+ _dst[0] = t; _dst[1] = t; _dst[2] = t; _dst[3] = t;
+ _dst[4] = t; _dst[5] = t; _dst[6] = t; _dst[7] = t;
+ _dst[8] = t; _dst[9] = t; _dst[10] = t; _dst[11] = t;
+ _dst[12] = t; _dst[13] = t; _dst[14] = t; _dst[15] = t;
+}
+
+/* copy a block of 8 byte elements using different strides */
+static __inline void loc_blockcopy_mmx (unsigned char * _dst, int _dst_ystride,
+ unsigned char * _src, int _src_ystride){
+ __m64 a,b,c,d,e,f,g,h;
+ a = *(__m64*)(_src + 0 * _src_ystride);
+ b = *(__m64*)(_src + 1 * _src_ystride);
+ c = *(__m64*)(_src + 2 * _src_ystride);
+ d = *(__m64*)(_src + 3 * _src_ystride);
+ e = *(__m64*)(_src + 4 * _src_ystride);
+ f = *(__m64*)(_src + 5 * _src_ystride);
+ g = *(__m64*)(_src + 6 * _src_ystride);
+ h = *(__m64*)(_src + 7 * _src_ystride);
+ *(__m64*)(_dst + 0 * _dst_ystride) = a;
+ *(__m64*)(_dst + 1 * _dst_ystride) = b;
+ *(__m64*)(_dst + 2 * _dst_ystride) = c;
+ *(__m64*)(_dst + 3 * _dst_ystride) = d;
+ *(__m64*)(_dst + 4 * _dst_ystride) = e;
+ *(__m64*)(_dst + 5 * _dst_ystride) = f;
+ *(__m64*)(_dst + 6 * _dst_ystride) = g;
+ *(__m64*)(_dst + 7 * _dst_ystride) = h;
+}
+
+void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
+ ogg_int16_t __declspec(align(16)) res_buf[64];
+ int dst_framei;
+ int dst_ystride;
+ int zzi;
+ /*_last_zzi is subtly different from an actual count of the number of
+ coefficients we decoded for this block.
+ It contains the value of zzi BEFORE the final token in the block was
+ decoded.
+ In most cases this is an EOB token (the continuation of an EOB run from a
+ previous block counts), and so this is the same as the coefficient count.
+ However, in the case that the last token was NOT an EOB token, but filled
+ the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+ Provided the last token was not a pure zero run, the minimum value it can
+ be is 46, and so that doesn't affect any of the cases in this routine.
+ However, if the last token WAS a pure zero run of length 63, then _last_zzi
+ will be 1 while the number of coefficients decoded is 64.
+ Thus, we will trigger the following special case, where the real
+ coefficient count would not.
+ Note also that a zero run of length 64 will give _last_zzi a value of 0,
+ but we still process the DC coefficient, which might have a non-zero value
+ due to DC prediction.
+ Although convoluted, this is arguably the correct behavior: it allows us to
+ dequantize fewer coefficients and use a smaller transform when the block
+ ends with a long zero run instead of a normal EOB token.
+ It could be smarter... multiple separate zero runs at the end of a block
+ will fool it, but an encoder that generates these really deserves what it
+ gets.
+ Needless to say we inherited this approach from VP3.*/
+ /*Special case only having a DC component.*/
+ if(_last_zzi<2){
+ __m64 p;
+ /*Why is the iquant product rounded in this case and no others? Who knows.*/
+ p = _m_from_int((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
+ /* broadcast 16 bits into all 4 mmx subregisters */
+ p = _m_punpcklwd (p,p);
+ p = _m_punpckldq (p,p);
+ loc_fill_mmx_value ((__m64 *)res_buf, p);
+ }
+ else{
+ /*Then, fill in the remainder of the coefficients with 0's, and perform
+ the iDCT.*/
+ /*First zero the buffer.*/
+ /*On K7, etc., this could be replaced with movntq and sfence.*/
+ loc_fill_mmx_value ((__m64 *)res_buf, _mm_setzero_si64());
+
+ res_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
+ /*This is planned to be rewritten in MMX.*/
+ for(zzi=1;zzi<_ncoefs;zzi++)
+ {
+ int ci;
+ ci=OC_FZIG_ZAG[zzi];
+ res_buf[OC_FZIG_ZAGMMX[zzi]]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]*
+ _ac_iquant[ci]);
+ }
+
+ if(_last_zzi<10){
+ oc_idct8x8_10_mmx(res_buf);
+ }
+ else {
+ oc_idct8x8_mmx(res_buf);
+ }
+ }
+ /*Fill in the target buffer.*/
+ dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
+ dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+ /*For now ystride values in all ref frames assumed to be equal.*/
+ if(_frag->mbmode==OC_MODE_INTRA){
+ oc_frag_recon_intra_mmx(_frag->buffer[dst_framei],dst_ystride,res_buf);
+ }
+ else{
+ int ref_framei;
+ int ref_ystride;
+ int mvoffset0;
+ int mvoffset1;
+ ref_framei=_state->ref_frame_idx[OC_FRAME_FOR_MODE[_frag->mbmode]];
+ ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].ystride;
+ if(oc_state_get_mv_offsets(_state,&mvoffset0,&mvoffset1,_frag->mv[0],
+ _frag->mv[1],ref_ystride,_pli)>1){
+ oc_frag_recon_inter2_mmx(_frag->buffer[dst_framei],dst_ystride,
+ _frag->buffer[ref_framei]+mvoffset0,ref_ystride,
+ _frag->buffer[ref_framei]+mvoffset1,ref_ystride,res_buf);
+ }
+ else{
+ oc_frag_recon_inter_mmx(_frag->buffer[dst_framei],dst_ystride,
+ _frag->buffer[ref_framei]+mvoffset0,ref_ystride,res_buf);
+ }
+ }
+
+ _mm_empty();
+}
+
+
+void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
+ int _nfragis,int _dst_frame,int _src_frame,int _pli){
+ const int *fragi;
+ const int *fragi_end;
+ int dst_framei;
+ int dst_ystride;
+ int src_framei;
+ int src_ystride;
+ dst_framei=_state->ref_frame_idx[_dst_frame];
+ src_framei=_state->ref_frame_idx[_src_frame];
+ dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+ src_ystride=_state->ref_frame_bufs[src_framei][_pli].ystride;
+ fragi_end=_fragis+_nfragis;
+ for(fragi=_fragis;fragi<fragi_end;fragi++){
+ oc_fragment *frag = _state->frags+*fragi;
+ loc_blockcopy_mmx (frag->buffer[dst_framei], dst_ystride,
+ frag->buffer[src_framei], src_ystride);
+ }
+ _m_empty();
+}
+
+#endif
+
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id:
+
+ ********************************************************************/
+
+/* ------------------------------------------------------------------------
+ MMX acceleration of complete fragment reconstruction algorithm.
+ Originally written by Rudolf Marek.
+
+ Conversion to MSC intrinsics by Nils Pipenbrinck.
+ ---------------------------------------------------------------------*/
+#if defined(USE_ASM)
+
+#include "../../internal.h"
+#include "../idct.h"
+#include "x86int.h"
+#include <mmintrin.h>
+
+static const unsigned char OC_FZIG_ZAGMMX[64]=
+{
+ 0, 8, 1, 2, 9,16,24,17,
+ 10, 3,32,11,18,25, 4,12,
+ 5,26,19,40,33,34,41,48,
+ 27, 6,13,20,28,21,14, 7,
+ 56,49,42,35,43,50,57,36,
+ 15,22,29,30,23,44,37,58,
+ 51,59,38,45,52,31,60,53,
+ 46,39,47,54,61,62,55,63
+};
+
+/* Fill a block with value */
+static __inline void loc_fill_mmx_value (__m64 * _dst, __m64 _value){
+ __m64 t = _value;
+ _dst[0] = t; _dst[1] = t; _dst[2] = t; _dst[3] = t;
+ _dst[4] = t; _dst[5] = t; _dst[6] = t; _dst[7] = t;
+ _dst[8] = t; _dst[9] = t; _dst[10] = t; _dst[11] = t;
+ _dst[12] = t; _dst[13] = t; _dst[14] = t; _dst[15] = t;
+}
+
+/* copy a block of 8 byte elements using different strides */
+static __inline void loc_blockcopy_mmx (unsigned char * _dst, int _dst_ystride,
+ unsigned char * _src, int _src_ystride){
+ __m64 a,b,c,d,e,f,g,h;
+ a = *(__m64*)(_src + 0 * _src_ystride);
+ b = *(__m64*)(_src + 1 * _src_ystride);
+ c = *(__m64*)(_src + 2 * _src_ystride);
+ d = *(__m64*)(_src + 3 * _src_ystride);
+ e = *(__m64*)(_src + 4 * _src_ystride);
+ f = *(__m64*)(_src + 5 * _src_ystride);
+ g = *(__m64*)(_src + 6 * _src_ystride);
+ h = *(__m64*)(_src + 7 * _src_ystride);
+ *(__m64*)(_dst + 0 * _dst_ystride) = a;
+ *(__m64*)(_dst + 1 * _dst_ystride) = b;
+ *(__m64*)(_dst + 2 * _dst_ystride) = c;
+ *(__m64*)(_dst + 3 * _dst_ystride) = d;
+ *(__m64*)(_dst + 4 * _dst_ystride) = e;
+ *(__m64*)(_dst + 5 * _dst_ystride) = f;
+ *(__m64*)(_dst + 6 * _dst_ystride) = g;
+ *(__m64*)(_dst + 7 * _dst_ystride) = h;
+}
+
+void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
+ ogg_int16_t __declspec(align(16)) res_buf[64];
+ int dst_framei;
+ int dst_ystride;
+ int zzi;
+ /*_last_zzi is subtly different from an actual count of the number of
+ coefficients we decoded for this block.
+ It contains the value of zzi BEFORE the final token in the block was
+ decoded.
+ In most cases this is an EOB token (the continuation of an EOB run from a
+ previous block counts), and so this is the same as the coefficient count.
+ However, in the case that the last token was NOT an EOB token, but filled
+ the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+ Provided the last token was not a pure zero run, the minimum value it can
+ be is 46, and so that doesn't affect any of the cases in this routine.
+ However, if the last token WAS a pure zero run of length 63, then _last_zzi
+ will be 1 while the number of coefficients decoded is 64.
+ Thus, we will trigger the following special case, where the real
+ coefficient count would not.
+ Note also that a zero run of length 64 will give _last_zzi a value of 0,
+ but we still process the DC coefficient, which might have a non-zero value
+ due to DC prediction.
+ Although convoluted, this is arguably the correct behavior: it allows us to
+ dequantize fewer coefficients and use a smaller transform when the block
+ ends with a long zero run instead of a normal EOB token.
+ It could be smarter... multiple separate zero runs at the end of a block
+ will fool it, but an encoder that generates these really deserves what it
+ gets.
+ Needless to say we inherited this approach from VP3.*/
+ /*Special case only having a DC component.*/
+ if(_last_zzi<2){
+ __m64 p;
+ /*Why is the iquant product rounded in this case and no others? Who knows.*/
+ p = _m_from_int((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
+ /* broadcast 16 bits into all 4 mmx subregisters */
+ p = _m_punpcklwd (p,p);
+ p = _m_punpckldq (p,p);
+ loc_fill_mmx_value ((__m64 *)res_buf, p);
+ }
+ else{
+ /*Then, fill in the remainder of the coefficients with 0's, and perform
+ the iDCT.*/
+ /*First zero the buffer.*/
+ /*On K7, etc., this could be replaced with movntq and sfence.*/
+ loc_fill_mmx_value ((__m64 *)res_buf, _mm_setzero_si64());
+
+ res_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
+ /*This is planned to be rewritten in MMX.*/
+ for(zzi=1;zzi<_ncoefs;zzi++)
+ {
+ int ci;
+ ci=OC_FZIG_ZAG[zzi];
+ res_buf[OC_FZIG_ZAGMMX[zzi]]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]*
+ _ac_iquant[ci]);
+ }
+
+ if(_last_zzi<10){
+ oc_idct8x8_10_mmx(res_buf);
+ }
+ else {
+ oc_idct8x8_mmx(res_buf);
+ }
+ }
+ /*Fill in the target buffer.*/
+ dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
+ dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+ /*For now ystride values in all ref frames assumed to be equal.*/
+ if(_frag->mbmode==OC_MODE_INTRA){
+ oc_frag_recon_intra_mmx(_frag->buffer[dst_framei],dst_ystride,res_buf);
+ }
+ else{
+ int ref_framei;
+ int ref_ystride;
+ int mvoffset0;
+ int mvoffset1;
+ ref_framei=_state->ref_frame_idx[OC_FRAME_FOR_MODE[_frag->mbmode]];
+ ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].ystride;
+ if(oc_state_get_mv_offsets(_state,&mvoffset0,&mvoffset1,_frag->mv[0],
+ _frag->mv[1],ref_ystride,_pli)>1){
+ oc_frag_recon_inter2_mmx(_frag->buffer[dst_framei],dst_ystride,
+ _frag->buffer[ref_framei]+mvoffset0,ref_ystride,
+ _frag->buffer[ref_framei]+mvoffset1,ref_ystride,res_buf);
+ }
+ else{
+ oc_frag_recon_inter_mmx(_frag->buffer[dst_framei],dst_ystride,
+ _frag->buffer[ref_framei]+mvoffset0,ref_ystride,res_buf);
+ }
+ }
+
+ _mm_empty();
+}
+
+
+void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
+ int _nfragis,int _dst_frame,int _src_frame,int _pli){
+ const int *fragi;
+ const int *fragi_end;
+ int dst_framei;
+ int dst_ystride;
+ int src_framei;
+ int src_ystride;
+ dst_framei=_state->ref_frame_idx[_dst_frame];
+ src_framei=_state->ref_frame_idx[_src_frame];
+ dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+ src_ystride=_state->ref_frame_bufs[src_framei][_pli].ystride;
+ fragi_end=_fragis+_nfragis;
+ for(fragi=_fragis;fragi<fragi_end;fragi++){
+ oc_fragment *frag = _state->frags+*fragi;
+ loc_blockcopy_mmx (frag->buffer[dst_framei], dst_ystride,
+ frag->buffer[src_framei], src_ystride);
+ }
+ _m_empty();
+}
+
+#endif
+
Property changes on: trunk/theora/lib/dec/x86_vc/mmxstate.c
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Added: trunk/theora/lib/dec/x86_vc/x86int.h
===================================================================
--- trunk/theora/lib/dec/x86_vc/x86int.h (rev 0)
+++ trunk/theora/lib/dec/x86_vc/x86int.h 2008-04-12 01:04:43 UTC (rev 14714)
@@ -0,0 +1,49 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#if !defined(_x86_x86int_vc_H)
+# define _x86_x86int_vc_H (1)
+# include "../../internal.h"
+
+void oc_state_vtable_init_x86(oc_theora_state *_state);
+
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t *_residue);
+
+void oc_frag_recon_inter_mmx(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue);
+
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
+ int _src2_ystride,const ogg_int16_t *_residue);
+
+void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
+ int _nfragis,int _dst_frame,int _src_frame,int _pli);
+
+void oc_restore_fpu_mmx(void);
+
+void oc_state_frag_recon_mmx(oc_theora_state *_state,const oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
+
+void oc_idct8x8_mmx(ogg_int16_t _y[64]);
+void oc_idct8x8_10_mmx(ogg_int16_t _y[64]);
+
+void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end);
+
+#endif
Property changes on: trunk/theora/lib/dec/x86_vc/x86int.h
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Added: trunk/theora/lib/dec/x86_vc/x86state.c
===================================================================
--- trunk/theora/lib/dec/x86_vc/x86state.c (rev 0)
+++ trunk/theora/lib/dec/x86_vc/x86state.c 2008-04-12 01:04:43 UTC (rev 14714)
@@ -0,0 +1,41 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#if defined(USE_ASM)
+
+#include "x86int.h"
+#include "../../cpu.h"
+
+void oc_state_vtable_init_x86(oc_theora_state *_state){
+ _state->cpu_flags=oc_cpu_flags_get();
+
+ /* fill with defaults */
+ oc_state_vtable_init_c(_state);
+
+ /* patch MMX functions */
+ if(_state->cpu_flags&OC_CPU_X86_MMX){
+ _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
+ _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
+ _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
+ _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+ _state->opt_vtable.state_frag_copy=oc_state_frag_copy_mmx;
+ _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
+ _state->opt_vtable.state_loop_filter_frag_rows=oc_state_loop_filter_frag_rows_mmx;
+ }
+}
+
+#endif
Property changes on: trunk/theora/lib/dec/x86_vc/x86state.c
___________________________________________________________________
Name: svn:keywords
+ Id
Name: svn:eol-style
+ native
Modified: trunk/theora/lib/enc/dct_decode.c
===================================================================
--- trunk/theora/lib/enc/dct_decode.c 2008-04-11 23:36:00 UTC (rev 14713)
+++ trunk/theora/lib/enc/dct_decode.c 2008-04-12 01:04:43 UTC (rev 14714)
@@ -1309,8 +1309,11 @@
funcs->FilterVert = FilterVert__c;
funcs->FilterHoriz = FilterHoriz__c;
#if defined(USE_ASM)
+ /* TODO: Port the dct for MSC one day. */
+#if !defined (_MSC_VER)
if (cpu_flags & OC_CPU_X86_MMX) {
dsp_mmx_dct_decode_init(funcs);
}
#endif
+#endif
}
Modified: trunk/theora/lib/enc/encoder_idct.c
===================================================================
--- trunk/theora/lib/enc/encoder_idct.c 2008-04-11 23:36:00 UTC (rev 14713)
+++ trunk/theora/lib/enc/encoder_idct.c 2008-04-12 01:04:43 UTC (rev 14714)
@@ -562,8 +562,11 @@
funcs->IDct10 = IDct10__c;
funcs->IDct3 = IDct10__c;
#if defined(USE_ASM)
+ /* TODO: make mmx encoder idct for MSC one day... */
+#if !defined (_MSC_VER)
if (cpu_flags & OC_CPU_X86_MMX) {
dsp_mmx_idct_init(funcs);
}
#endif
+#endif
}
Modified: trunk/theora/lib/internal.h
===================================================================
--- trunk/theora/lib/internal.h 2008-04-11 23:36:00 UTC (rev 14713)
+++ trunk/theora/lib/internal.h 2008-04-12 01:04:43 UTC (rev 14714)
@@ -39,7 +39,8 @@
/*Thank you Microsoft, I know the order of operations.*/
# if defined(_MSC_VER)
-# pragma warning(disable:4554)
+# pragma warning(disable:4554) /* order of operations */
+# pragma warning(disable:4799) /* disable missing EMMS warnings */
# endif
/*This library's version.*/
@@ -501,15 +502,4 @@
oc_state_granule_time_func granule_time;
};
-#if defined(_MSC_VER) && !defined(TH_REALLY_NO_ASSEMBLY)
-# error You are compiling theora without inline assembly.\
- This is probably not what you want. Instead, please either\
- (1) download the assembly .lib binaries or\
- (2) compile them yourself using MinGW, and make Visual Studio\
- link against them.\
- Please seriously consider this before defining TH_REALLY_NO_ASSEMBLY\
- to disable this message and compile without inline assembly.\
- Thank you!
#endif
-
-#endif
Modified: trunk/theora/win32/VS2005/libtheora/libtheora.vcproj
===================================================================
--- trunk/theora/win32/VS2005/libtheora/libtheora.vcproj 2008-04-11 23:36:00 UTC (rev 14713)
+++ trunk/theora/win32/VS2005/libtheora/libtheora.vcproj 2008-04-12 01:04:43 UTC (rev 14714)
@@ -42,7 +42,7 @@
Name="VCCLCompilerTool"
Optimization="0"
AdditionalIncludeDirectories="..\..\..\include;..\..\..\lib;..\..\..\lib\enc;..\..\..\..\libogg\include;"
- PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;"
+ PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;USE_ASM;DEBUG;"
MinimalRebuild="true"
BasicRuntimeChecks="3"
RuntimeLibrary="1"
@@ -129,7 +129,7 @@
EnableIntrinsicFunctions="true"
FavorSizeOrSpeed="1"
AdditionalIncludeDirectories="..\..\..\include;..\..\..\lib;..\..\..\lib\enc;..\..\..\..\libogg\include;"
- PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;"
+ PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;USE_ASM;"
StringPooling="true"
ExceptionHandling="0"
RuntimeLibrary="0"
@@ -221,7 +221,7 @@
EnableIntrinsicFunctions="true"
FavorSizeOrSpeed="1"
AdditionalIncludeDirectories="..\..\..\include;..\..\..\lib;..\..\..\lib\enc;..\..\..\..\libogg\include;"
- PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;"
+ PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;USE_ASM"
StringPooling="true"
ExceptionHandling="0"
RuntimeLibrary="0"
@@ -314,7 +314,7 @@
EnableIntrinsicFunctions="true"
FavorSizeOrSpeed="1"
AdditionalIncludeDirectories="..\..\..\include;..\..\..\lib;..\..\..\lib\enc;..\..\..\..\libogg\include;"
- PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;"
+ PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBTHEORA_EXPORTS;USE_ASM"
StringPooling="true"
ExceptionHandling="0"
RuntimeLibrary="0"
@@ -2287,25 +2287,29 @@
Name="x86"
>
<File
- RelativePath="..\..\..\lib\dec\x86\mmxfrag.c"
+ RelativePath="..\..\..\lib\dec\x86_vc\mmxfrag.c"
>
</File>
<File
- RelativePath="..\..\..\lib\dec\x86\mmxidct.c"
+ RelativePath="..\..\..\lib\dec\x86_vc\mmxloopfilter.c"
>
</File>
<File
- RelativePath="..\..\..\lib\dec\x86\mmxstate.c"
+ RelativePath="..\..\..\lib\dec\x86_vc\mmxidct.c"
>
</File>
<File
- RelativePath="..\..\..\lib\dec\x86\x86int.h"
+ RelativePath="..\..\..\lib\dec\x86_vc\mmxstate.c"
>
</File>
<File
- RelativePath="..\..\..\lib\dec\x86\x86state.c"
+ RelativePath="..\..\..\lib\dec\x86_vc\x86int.h"
>
</File>
+ <File
+ RelativePath="..\..\..\lib\dec\x86_vc\x86state.c"
+ >
+ </File>
</Filter>
</Filter>
</Filter>
More information about the commits
mailing list