[xiph-commits] r13926 - trunk/theora/lib/enc/x86_32_vs
sping at svn.xiph.org
sping at svn.xiph.org
Wed Oct 3 11:56:42 PDT 2007
Author: sping
Date: 2007-10-03 11:56:42 -0700 (Wed, 03 Oct 2007)
New Revision: 13926
Modified:
trunk/theora/lib/enc/x86_32_vs/dsp_mmx.c
Log:
Fix inconsistent line endings
Modified: trunk/theora/lib/enc/x86_32_vs/dsp_mmx.c
===================================================================
--- trunk/theora/lib/enc/x86_32_vs/dsp_mmx.c 2007-10-03 18:51:06 UTC (rev 13925)
+++ trunk/theora/lib/enc/x86_32_vs/dsp_mmx.c 2007-10-03 18:56:42 UTC (rev 13926)
@@ -1,42 +1,42 @@
-/********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
- * *
- ********************************************************************
-
- function:
- last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include <stdlib.h>
-
-#include "codec_internal.h"
-#include "dsp.h"
-
-#if 0
-//These are to let me selectively enable the C versions, these are needed
-#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
-#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
-#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
-#endif
-
-
-static const ogg_int64_t V128 = 0x0080008000800080LL;
-
-static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
- ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
-{
-
- //Make non-zero to use the C-version
-#if 0
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+
+#include "codec_internal.h"
+#include "dsp.h"
+
+#if 0
+//These are to let me selectively enable the C versions, these are needed
+#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
+#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
+#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
+#endif
+
+
+static const ogg_int64_t V128 = 0x0080008000800080LL;
+
+static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
+ ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
+ ogg_uint32_t ReconPixelsPerLine)
+{
+
+ //Make non-zero to use the C-version
+#if 0
int i;
/* For each block row */
@@ -54,200 +54,200 @@
FiltPtr += PixelsPerLine;
ReconPtr += ReconPixelsPerLine;
DctInputPtr += 8;
- }
-#else
- __asm {
- align 16
-
- pxor mm7, mm7
-
- mov eax, FiltPtr
- mov ebx, ReconPtr
- mov edx, DctInputPtr
-
- /* You can't use rept in inline masm and macro parsing seems screwed with inline asm*/
-
- /* ITERATION 1 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
- /* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
-
-
- /* ITERATION 2 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
- /* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
-
-
- /* ITERATION 3 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
- /* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
-
-
- /* ITERATION 4 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
- /* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
-
-
- /* ITERATION 5 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
- /* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
-
-
- /* ITERATION 6 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
- /* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
-
-
- /* ITERATION 7 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
- /* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
-
-
- /* ITERATION 8 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm1, [ebx] /* mm1 = ReconPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- movq mm3, mm1 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
- psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
- movq [edx], mm0 /* write answer out */
- movq [8 + edx], mm2 /* write answer out */
- /* Increment pointers */
- add edx, 16
- add eax, PixelsPerLine
- add ebx, ReconPixelsPerLine
-
-
-
-
-
- };
-
-#endif
-}
-
-static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
- ogg_uint32_t PixelsPerLine)
-{
-
-#if 0
+ }
+#else
+ __asm {
+ align 16
+
+ pxor mm7, mm7
+
+ mov eax, FiltPtr
+ mov ebx, ReconPtr
+ mov edx, DctInputPtr
+
+ /* You can't use rept in inline masm and macro parsing seems screwed with inline asm*/
+
+ /* ITERATION 1 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ /* start calculation */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
+ /* Increment pointers */
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
+
+
+ /* ITERATION 2 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ /* start calculation */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
+ /* Increment pointers */
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
+
+
+ /* ITERATION 3 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ /* start calculation */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
+ /* Increment pointers */
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
+
+
+ /* ITERATION 4 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ /* start calculation */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
+ /* Increment pointers */
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
+
+
+ /* ITERATION 5 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ /* start calculation */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
+ /* Increment pointers */
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
+
+
+ /* ITERATION 6 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ /* start calculation */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
+ /* Increment pointers */
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
+
+
+ /* ITERATION 7 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ /* start calculation */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
+ /* Increment pointers */
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
+
+
+ /* ITERATION 8 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm1, [ebx] /* mm1 = ReconPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ movq mm3, mm1 /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 /* mm1 = INT16(ReconPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 /* mm3 = INT16(ReconPtr) */
+ /* start calculation */
+ psubw mm0, mm1 /* mm0 = FiltPtr - ReconPtr */
+ psubw mm2, mm3 /* mm2 = FiltPtr - ReconPtr */
+ movq [edx], mm0 /* write answer out */
+ movq [8 + edx], mm2 /* write answer out */
+ /* Increment pointers */
+ add edx, 16
+ add eax, PixelsPerLine
+ add ebx, ReconPixelsPerLine
+
+
+
+
+
+ };
+
+#endif
+}
+
+static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
+ ogg_uint32_t PixelsPerLine)
+{
+
+#if 0
int i;
/* For each block row */
for (i=8; i; i--) {
@@ -267,161 +267,161 @@
/* Start next row */
FiltPtr += PixelsPerLine;
DctInputPtr += 8;
- }
-
-#else
- __asm {
- align 16
-
- pxor mm7, mm7
-
- mov eax, FiltPtr
- mov ebx, DctInputPtr
-
- movq mm1, V128
-
- /* ITERATION 1 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
- /* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
-
-
- /* ITERATION 2 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
- /* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
-
-
- /* ITERATION 3 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
- /* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
-
-
- /* ITERATION 4 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
- /* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
-
-
- /* ITERATION 5 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
- /* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
-
-
- /* ITERATION 6 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
- /* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
-
-
- /* ITERATION 7 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
- /* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
-
-
- /* ITERATION 8 */
- movq mm0, [eax] /* mm0 = FiltPtr */
- movq mm2, mm0 /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
- punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
- psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
- movq [ebx], mm0 /* write answer out */
- movq [8 + ebx], mm2 /* write answer out */
- /* Increment pointers */
- add ebx, 16
- add eax, PixelsPerLine
-
- };
-
-#endif
-}
-
-
-
-
-static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
- unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
- ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
-{
-
-#if 0
+ }
+
+#else
+ __asm {
+ align 16
+
+ pxor mm7, mm7
+
+ mov eax, FiltPtr
+ mov ebx, DctInputPtr
+
+ movq mm1, V128
+
+ /* ITERATION 1 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ /* start calculation */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
+ /* Increment pointers */
+ add ebx, 16
+ add eax, PixelsPerLine
+
+
+ /* ITERATION 2 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ /* start calculation */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
+ /* Increment pointers */
+ add ebx, 16
+ add eax, PixelsPerLine
+
+
+ /* ITERATION 3 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ /* start calculation */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
+ /* Increment pointers */
+ add ebx, 16
+ add eax, PixelsPerLine
+
+
+ /* ITERATION 4 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ /* start calculation */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
+ /* Increment pointers */
+ add ebx, 16
+ add eax, PixelsPerLine
+
+
+ /* ITERATION 5 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ /* start calculation */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
+ /* Increment pointers */
+ add ebx, 16
+ add eax, PixelsPerLine
+
+
+ /* ITERATION 6 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ /* start calculation */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
+ /* Increment pointers */
+ add ebx, 16
+ add eax, PixelsPerLine
+
+
+ /* ITERATION 7 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ /* start calculation */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
+ /* Increment pointers */
+ add ebx, 16
+ add eax, PixelsPerLine
+
+
+ /* ITERATION 8 */
+ movq mm0, [eax] /* mm0 = FiltPtr */
+ movq mm2, mm0 /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 /* mm0 = INT16(FiltPtr) */
+ punpckhbw mm2, mm7 /* mm2 = INT16(FiltPtr) */
+ /* start calculation */
+ psubw mm0, mm1 /* mm0 = FiltPtr - 128 */
+ psubw mm2, mm1 /* mm2 = FiltPtr - 128 */
+ movq [ebx], mm0 /* write answer out */
+ movq [8 + ebx], mm2 /* write answer out */
+ /* Increment pointers */
+ add ebx, 16
+ add eax, PixelsPerLine
+
+ };
+
+#endif
+}
+
+
+
+
+static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
+ unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
+ ogg_uint32_t PixelsPerLine,
+ ogg_uint32_t ReconPixelsPerLine)
+{
+
+#if 0
int i;
/* For each block row */
@@ -440,271 +440,271 @@
ReconPtr1 += ReconPixelsPerLine;
ReconPtr2 += ReconPixelsPerLine;
DctInputPtr += 8;
- }
-#else
-
- __asm {
- align 16
-
- pxor mm7, mm7
-
- mov eax, FiltPtr
- mov ebx, ReconPtr1
- mov ecx, ReconPtr2
- mov edx, DctInputPtr
-
- /* ITERATION 1 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
-
-
- /* ITERATION 2 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
-
-
- /* ITERATION 3 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
-
-
- /* ITERATION 4 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
-
-
- /* ITERATION 5 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
-
-
- /* ITERATION 6 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
-
-
- /* ITERATION 7 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
-
-
- /* ITERATION 8 */
- movq mm0, [eax] ; /* mm0 = FiltPtr */
- movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
- movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
- movq mm2, mm0 ; /* dup to prepare for up conversion */
- movq mm3, mm1 ; /* dup to prepare for up conversion */
- movq mm5, mm4 ; /* dup to prepare for up conversion */
- ; /* convert from UINT8 to INT16 */
- punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
- punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
- punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
- punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
- punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
- punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
- ; /* average ReconPtr1 and ReconPtr2 */
- paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
- paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
- psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- movq [edx], mm0 ; /* write answer out */
- movq [8 + edx], mm2 ; /* write answer out */
- ; /* Increment pointers */
- add edx, 16 ;
- add eax, PixelsPerLine ;
- add ebx, ReconPixelsPerLine ;
- add ecx, ReconPixelsPerLine ;
-
- };
-
-
-
-
-
-#endif
-}
-
-static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
-{
-
-#if 0
+ }
+#else
+
+ __asm {
+ align 16
+
+ pxor mm7, mm7
+
+ mov eax, FiltPtr
+ mov ebx, ReconPtr1
+ mov ecx, ReconPtr2
+ mov edx, DctInputPtr
+
+ /* ITERATION 1 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
+
+
+ /* ITERATION 2 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
+
+
+ /* ITERATION 3 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
+
+
+ /* ITERATION 4 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
+
+
+ /* ITERATION 5 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
+
+
+ /* ITERATION 6 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
+
+
+ /* ITERATION 7 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
+
+
+ /* ITERATION 8 */
+ movq mm0, [eax] ; /* mm0 = FiltPtr */
+ movq mm1, [ebx] ; /* mm1 = ReconPtr1 */
+ movq mm4, [ecx] ; /* mm1 = ReconPtr2 */
+ movq mm2, mm0 ; /* dup to prepare for up conversion */
+ movq mm3, mm1 ; /* dup to prepare for up conversion */
+ movq mm5, mm4 ; /* dup to prepare for up conversion */
+ ; /* convert from UINT8 to INT16 */
+ punpcklbw mm0, mm7 ; /* mm0 = INT16(FiltPtr) */
+ punpcklbw mm1, mm7 ; /* mm1 = INT16(ReconPtr1) */
+ punpcklbw mm4, mm7 ; /* mm1 = INT16(ReconPtr2) */
+ punpckhbw mm2, mm7 ; /* mm2 = INT16(FiltPtr) */
+ punpckhbw mm3, mm7 ; /* mm3 = INT16(ReconPtr1) */
+ punpckhbw mm5, mm7 ; /* mm3 = INT16(ReconPtr2) */
+ ; /* average ReconPtr1 and ReconPtr2 */
+ paddw mm1, mm4 ; /* mm1 = ReconPtr1 + ReconPtr2 */
+ paddw mm3, mm5 ; /* mm3 = ReconPtr1 + ReconPtr2 */
+ psrlw mm1, 1 ; /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ psrlw mm3, 1 ; /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ psubw mm0, mm1 ; /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ psubw mm2, mm3 ; /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ movq [edx], mm0 ; /* write answer out */
+ movq [8 + edx], mm2 ; /* write answer out */
+ ; /* Increment pointers */
+ add edx, 16 ;
+ add eax, PixelsPerLine ;
+ add ebx, ReconPixelsPerLine ;
+ add ecx, ReconPixelsPerLine ;
+
+ };
+
+
+
+
+
+#endif
+}
+
+static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
+{
+
+#if 0
ogg_uint32_t SadValue;
ogg_uint32_t SadValue1;
@@ -720,70 +720,70 @@
SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1;
- return SadValue;
-
-#else
- ogg_uint32_t MaxSad;
-
-
- __asm {
- align 16
- mov ebx, Src1
- mov ecx, Src2
-
-
- pxor mm6, mm6 ; /* zero out mm6 for unpack */
- pxor mm7, mm7 ; /* zero out mm7 for unpack */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [ecx] ;
-
- movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
-
- movq mm1, mm0 ;
-
- punpcklbw mm0, mm6 ; /* ; unpack low four bytes to higher precision */
- punpckhbw mm1, mm7 ; /* ; unpack high four bytes to higher precision */
-
- movq mm2, mm0 ;
- movq mm3, mm1 ;
- psrlq mm2, 32 ; /* fold and add */
- psrlq mm3, 32 ;
- paddw mm0, mm2 ;
- paddw mm1, mm3 ;
- movq mm2, mm0 ;
- movq mm3, mm1 ;
- psrlq mm2, 16 ;
- psrlq mm3, 16 ;
- paddw mm0, mm2 ;
- paddw mm1, mm3 ;
-
- psubusw mm1, mm0 ;
- paddw mm1, mm0 ; /* mm1 = max(mm1, mm0) */
- movd eax, mm1 ;
-
- and eax, 0xffff
- mov MaxSad, eax
- };
- return MaxSad;
-
-
-
-
-
-#endif
-}
-
-
-
-
-static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
- ogg_uint32_t stride)
-{
-
-#if 0
+ return SadValue;
+
+#else
+ ogg_uint32_t MaxSad;
+
+
+ __asm {
+ align 16
+ mov ebx, Src1
+ mov ecx, Src2
+
+
+ pxor mm6, mm6 ; /* zero out mm6 for unpack */
+ pxor mm7, mm7 ; /* zero out mm7 for unpack */
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [ecx] ;
+
+ movq mm2, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+
+ movq mm1, mm0 ;
+
+ punpcklbw mm0, mm6 ; /* ; unpack low four bytes to higher precision */
+ punpckhbw mm1, mm7 ; /* ; unpack high four bytes to higher precision */
+
+ movq mm2, mm0 ;
+ movq mm3, mm1 ;
+ psrlq mm2, 32 ; /* fold and add */
+ psrlq mm3, 32 ;
+ paddw mm0, mm2 ;
+ paddw mm1, mm3 ;
+ movq mm2, mm0 ;
+ movq mm3, mm1 ;
+ psrlq mm2, 16 ;
+ psrlq mm3, 16 ;
+ paddw mm0, mm2 ;
+ paddw mm1, mm3 ;
+
+ psubusw mm1, mm0 ;
+ paddw mm1, mm0 ; /* mm1 = max(mm1, mm0) */
+ movd eax, mm1 ;
+
+ and eax, 0xffff
+ mov MaxSad, eax
+ };
+ return MaxSad;
+
+
+
+
+
+#endif
+}
+
+
+
+
+static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
+ ogg_uint32_t stride)
+{
+
+#if 0
ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0};
ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0};
ogg_uint32_t MaxSad = 0;
@@ -824,94 +824,94 @@
MaxSad = SadValue2[i];
}
- return MaxSad;
-#else
- ogg_uint32_t MaxSad;
-
-
- __asm {
- align 16
- mov ebx, Src1
- mov ecx, Src2
-
- pxor mm3, mm3 ; /* zero out mm3 for unpack */
- pxor mm4, mm4 ; /* mm4 low sum */
- pxor mm5, mm5 ; /* mm5 high sum */
- pxor mm6, mm6 ; /* mm6 low sum */
- pxor mm7, mm7 ; /* mm7 high sum */
- mov edi, 4 ; /* 4 rows */
- label_1: ;
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [ecx] ; /* take 8 bytes */
-
- movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
-
- punpcklbw mm0, mm3 ; /* unpack to higher precision for accumulation */
- paddw mm4, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm3 ; /* unpack high four bytes to higher precision */
- paddw mm5, mm1 ; /* accumulate difference... */
- add ebx, stride ; /* Inc pointer into the new data */
- add ecx, stride ; /* Inc pointer into the new data */
-
- dec edi ;
- jnz label_1 ;
-
- mov edi, 4 ; /* 4 rows */
- label_2: ;
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [ecx] ; /* take 8 bytes */
-
- movq mm2, mm0 ;
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
-
- punpcklbw mm0, mm3 ; /* unpack to higher precision for accumulation */
- paddw mm6, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm3 ; /* unpack high four bytes to higher precision */
- paddw mm7, mm1 ; /* accumulate difference... */
- add ebx, stride ; /* Inc pointer into the new data */
- add ecx, stride ; /* Inc pointer into the new data */
-
- dec edi ;
- jnz label_2 ;
-
- psubusw mm7, mm6 ;
- paddw mm7, mm6 ; /* mm7 = max(mm7, mm6) */
- psubusw mm5, mm4 ;
- paddw mm5, mm4 ; /* mm5 = max(mm5, mm4) */
- psubusw mm7, mm5 ;
- paddw mm7, mm5 ; /* mm7 = max(mm5, mm7) */
- movq mm6, mm7 ;
- psrlq mm6, 32 ;
- psubusw mm7, mm6 ;
- paddw mm7, mm6 ; /* mm7 = max(mm5, mm7) */
- movq mm6, mm7 ;
- psrlq mm6, 16 ;
- psubusw mm7, mm6 ;
- paddw mm7, mm6 ; /* mm7 = max(mm5, mm7) */
- movd eax, mm7 ;
- and eax, 0xffff ;
-
- mov MaxSad, eax
- };
-
- return MaxSad;
-
-
-#endif
-}
-
-static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2)
-{
-
-#if 0
+ return MaxSad;
+#else
+ ogg_uint32_t MaxSad;
+
+
+ __asm {
+ align 16
+ mov ebx, Src1
+ mov ecx, Src2
+
+ pxor mm3, mm3 ; /* zero out mm3 for unpack */
+ pxor mm4, mm4 ; /* mm4 low sum */
+ pxor mm5, mm5 ; /* mm5 high sum */
+ pxor mm6, mm6 ; /* mm6 low sum */
+ pxor mm7, mm7 ; /* mm7 high sum */
+ mov edi, 4 ; /* 4 rows */
+ label_1: ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [ecx] ; /* take 8 bytes */
+
+ movq mm2, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
+
+ punpcklbw mm0, mm3 ; /* unpack to higher precision for accumulation */
+ paddw mm4, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm3 ; /* unpack high four bytes to higher precision */
+ paddw mm5, mm1 ; /* accumulate difference... */
+ add ebx, stride ; /* Inc pointer into the new data */
+ add ecx, stride ; /* Inc pointer into the new data */
+
+ dec edi ;
+ jnz label_1 ;
+
+ mov edi, 4 ; /* 4 rows */
+ label_2: ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [ecx] ; /* take 8 bytes */
+
+ movq mm2, mm0 ;
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
+
+ punpcklbw mm0, mm3 ; /* unpack to higher precision for accumulation */
+ paddw mm6, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm3 ; /* unpack high four bytes to higher precision */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add ebx, stride ; /* Inc pointer into the new data */
+ add ecx, stride ; /* Inc pointer into the new data */
+
+ dec edi ;
+ jnz label_2 ;
+
+ psubusw mm7, mm6 ;
+ paddw mm7, mm6 ; /* mm7 = max(mm7, mm6) */
+ psubusw mm5, mm4 ;
+ paddw mm5, mm4 ; /* mm5 = max(mm5, mm4) */
+ psubusw mm7, mm5 ;
+ paddw mm7, mm5 ; /* mm7 = max(mm5, mm7) */
+ movq mm6, mm7 ;
+ psrlq mm6, 32 ;
+ psubusw mm7, mm6 ;
+ paddw mm7, mm6 ; /* mm7 = max(mm5, mm7) */
+ movq mm6, mm7 ;
+ psrlq mm6, 16 ;
+ psubusw mm7, mm6 ;
+ paddw mm7, mm6 ; /* mm7 = max(mm5, mm7) */
+ movd eax, mm7 ;
+ and eax, 0xffff ;
+
+ mov MaxSad, eax
+ };
+
+ return MaxSad;
+
+
+#endif
+}
+
+static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
+ unsigned char *ptr2, ogg_uint32_t stride2)
+{
+
+#if 0
ogg_uint32_t i;
ogg_uint32_t sad = 0;
@@ -930,189 +930,189 @@
ptr2 += stride2;
}
- return sad;
-#else
- ogg_uint32_t DiffVal;
-
- __asm {
- align 16
-
- mov ebx, ptr1
- mov edx, ptr2
-
- pxor mm6, mm6 ; /* zero out mm6 for unpack */
- pxor mm7, mm7 ; /* mm7 contains the result */
-
- ; /* ITERATION 1 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
-
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
-
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
-
- ; /* ITERATION 2 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
-
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
-
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
-
-
- ; /* ITERATION 3 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
-
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
-
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
-
- ; /* ITERATION 4 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
-
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
-
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
-
-
- ; /* ITERATION 5 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
-
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
-
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
-
-
- ; /* ITERATION 6 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
-
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
-
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
-
-
- ; /* ITERATION 7 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
-
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
-
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
-
-
-
- ; /* ITERATION 8 */
- movq mm0, [ebx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
-
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
-
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, stride1 ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add edx, stride2 ; /* Inc pointer into ref data */
-
-
-
- ; /* ------ */
-
- movq mm0, mm7 ;
- psrlq mm7, 32 ;
- paddw mm7, mm0 ;
- movq mm0, mm7 ;
- psrlq mm7, 16 ;
- paddw mm7, mm0 ;
- movd eax, mm7 ;
- and eax, 0xffff ;
-
- mov DiffVal, eax
- };
-
- return DiffVal;
-
-
-
-#endif
-}
-
-static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
- ogg_uint32_t thres)
-{
-#if 0
+ return sad;
+#else
+ ogg_uint32_t DiffVal;
+
+ __asm {
+ align 16
+
+ mov ebx, ptr1
+ mov edx, ptr2
+
+ pxor mm6, mm6 ; /* zero out mm6 for unpack */
+ pxor mm7, mm7 ; /* mm7 contains the result */
+
+ ; /* ITERATION 1 */
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
+
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
+
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
+
+ ; /* ITERATION 2 */
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
+
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
+
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
+
+
+ ; /* ITERATION 3 */
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
+
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
+
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
+
+ ; /* ITERATION 4 */
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
+
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
+
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
+
+
+ ; /* ITERATION 5 */
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
+
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
+
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
+
+
+ ; /* ITERATION 6 */
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
+
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
+
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
+
+
+ ; /* ITERATION 7 */
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
+
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
+
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
+
+
+
+ ; /* ITERATION 8 */
+ movq mm0, [ebx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
+
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
+
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, stride1 ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add edx, stride2 ; /* Inc pointer into ref data */
+
+
+
+ ; /* ------ */
+
+ movq mm0, mm7 ;
+ psrlq mm7, 32 ;
+ paddw mm7, mm0 ;
+ movq mm0, mm7 ;
+ psrlq mm7, 16 ;
+ paddw mm7, mm0 ;
+ movd eax, mm7 ;
+ and eax, 0xffff ;
+
+ mov DiffVal, eax
+ };
+
+ return DiffVal;
+
+
+
+#endif
+}
+
+static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
+ unsigned char *ptr2, ogg_uint32_t stride2,
+ ogg_uint32_t thres)
+{
+#if 0
ogg_uint32_t i;
ogg_uint32_t sad = 0;
@@ -1134,19 +1134,19 @@
ptr2 += stride2;
}
- return sad;
-#else
- return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
-#endif
-}
-
-
-static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
- ogg_uint32_t thres)
-{
-#if 0
+ return sad;
+#else
+ return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
+#endif
+}
+
+
+static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
+ ogg_uint32_t thres)
+{
+#if 0
ogg_uint32_t i;
ogg_uint32_t sad = 0;
@@ -1169,76 +1169,76 @@
RefDataPtr2 += RefStride;
}
- return sad;
-#else
- ogg_uint32_t DiffVal;
-
- __asm {
- align 16
-
- mov ebx, SrcData
- mov ecx, RefDataPtr1
- mov edx, RefDataPtr2
-
-
- pcmpeqd mm5, mm5 ; /* fefefefefefefefe in mm5 */
- paddb mm5, mm5 ;
- ;
- pxor mm6, mm6 ; /* zero out mm6 for unpack */
- pxor mm7, mm7 ; /* mm7 contains the result */
- mov edi, 8 ; /* 8 rows */
- loop_start: ;
- movq mm0, [ebx] ; /* take 8 bytes */
-
- movq mm2, [ecx] ;
- movq mm3, [edx] ; /* take average of mm2 and mm3 */
- movq mm1, mm2 ;
- pand mm1, mm3 ;
- pxor mm3, mm2 ;
- pand mm3, mm5 ;
- psrlq mm3, 1 ;
- paddb mm1, mm3 ;
-
- movq mm2, mm0 ;
-
- psubusb mm0, mm1 ; /* A - B */
- psubusb mm1, mm2 ; /* B - A */
- por mm0, mm1 ; /* and or gives abs difference */
- movq mm1, mm0 ;
-
- punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
- paddw mm7, mm0 ; /* accumulate difference... */
- punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
- add ebx, SrcStride ; /* Inc pointer into the new data */
- paddw mm7, mm1 ; /* accumulate difference... */
- add ecx, RefStride ; /* Inc pointer into ref data */
- add edx, RefStride ; /* Inc pointer into ref data */
-
- dec edi ;
- jnz loop_start ;
-
- movq mm0, mm7 ;
- psrlq mm7, 32 ;
- paddw mm7, mm0 ;
- movq mm0, mm7 ;
- psrlq mm7, 16 ;
- paddw mm7, mm0 ;
- movd eax, mm7 ;
- and eax, 0xffff ;
-
- mov DiffVal, eax
- };
-
- return DiffVal;
-
-
-
-#endif
-}
-
-static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
-{
-#if 0
+ return sad;
+#else
+ ogg_uint32_t DiffVal;
+
+ __asm {
+ align 16
+
+ mov ebx, SrcData
+ mov ecx, RefDataPtr1
+ mov edx, RefDataPtr2
+
+
+ pcmpeqd mm5, mm5 ; /* fefefefefefefefe in mm5 */
+ paddb mm5, mm5 ;
+ ;
+ pxor mm6, mm6 ; /* zero out mm6 for unpack */
+ pxor mm7, mm7 ; /* mm7 contains the result */
+ mov edi, 8 ; /* 8 rows */
+ loop_start: ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+
+ movq mm2, [ecx] ;
+ movq mm3, [edx] ; /* take average of mm2 and mm3 */
+ movq mm1, mm2 ;
+ pand mm1, mm3 ;
+ pxor mm3, mm2 ;
+ pand mm3, mm5 ;
+ psrlq mm3, 1 ;
+ paddb mm1, mm3 ;
+
+ movq mm2, mm0 ;
+
+ psubusb mm0, mm1 ; /* A - B */
+ psubusb mm1, mm2 ; /* B - A */
+ por mm0, mm1 ; /* and or gives abs difference */
+ movq mm1, mm0 ;
+
+ punpcklbw mm0, mm6 ; /* unpack to higher precision for accumulation */
+ paddw mm7, mm0 ; /* accumulate difference... */
+ punpckhbw mm1, mm6 ; /* unpack high four bytes to higher precision */
+ add ebx, SrcStride ; /* Inc pointer into the new data */
+ paddw mm7, mm1 ; /* accumulate difference... */
+ add ecx, RefStride ; /* Inc pointer into ref data */
+ add edx, RefStride ; /* Inc pointer into ref data */
+
+ dec edi ;
+ jnz loop_start ;
+
+ movq mm0, mm7 ;
+ psrlq mm7, 32 ;
+ paddw mm7, mm0 ;
+ movq mm0, mm7 ;
+ psrlq mm7, 16 ;
+ paddw mm7, mm0 ;
+ movd eax, mm7 ;
+ and eax, 0xffff ;
+
+ mov DiffVal, eax
+ };
+
+ return DiffVal;
+
+
+
+#endif
+}
+
+static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
+{
+#if 0
ogg_uint32_t i;
ogg_uint32_t XSum=0;
ogg_uint32_t XXSum=0;
@@ -1267,73 +1267,73 @@
}
/* Compute population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ) );
-#else
- ogg_uint32_t XSum;
- ogg_uint32_t XXSum;
-
- __asm {
- align 16
-
- mov ecx, DataPtr
-
- pxor mm5, mm5 ;
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
- mov edi, 8 ;
- loop_start:
- movq mm0, [ecx] ; /* take 8 bytes */
- movq mm2, mm0 ;
-
- punpcklbw mm0, mm6 ;
- punpckhbw mm2, mm6 ;
-
- paddw mm5, mm0 ;
- paddw mm5, mm2 ;
-
- pmaddwd mm0, mm0 ;
- pmaddwd mm2, mm2 ;
- ;
- paddd mm7, mm0 ;
- paddd mm7, mm2 ;
-
- add ecx, Stride ; /* Inc pointer into src data */
-
- dec edi ;
- jnz loop_start ;
-
- movq mm0, mm5 ;
- psrlq mm5, 32 ;
- paddw mm5, mm0 ;
- movq mm0, mm5 ;
- psrlq mm5, 16 ;
- paddw mm5, mm0 ;
- movd edi, mm5 ;
- movsx edi, di ;
- mov eax, edi ;
-
- movq mm0, mm7 ;
- psrlq mm7, 32 ;
- paddd mm7, mm0 ;
- movd ebx, mm7 ;
-
- mov XSum, eax
- mov XXSum, ebx;
-
- };
- /* Compute population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ) );
-
-
-
-#endif
-}
-
-static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr, ogg_uint32_t RefStride)
-{
-
-#if 0
+ return (( (XXSum<<6) - XSum*XSum ) );
+#else
+ ogg_uint32_t XSum;
+ ogg_uint32_t XXSum;
+
+ __asm {
+ align 16
+
+ mov ecx, DataPtr
+
+ pxor mm5, mm5 ;
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+ mov edi, 8 ;
+ loop_start:
+ movq mm0, [ecx] ; /* take 8 bytes */
+ movq mm2, mm0 ;
+
+ punpcklbw mm0, mm6 ;
+ punpckhbw mm2, mm6 ;
+
+ paddw mm5, mm0 ;
+ paddw mm5, mm2 ;
+
+ pmaddwd mm0, mm0 ;
+ pmaddwd mm2, mm2 ;
+ ;
+ paddd mm7, mm0 ;
+ paddd mm7, mm2 ;
+
+ add ecx, Stride ; /* Inc pointer into src data */
+
+ dec edi ;
+ jnz loop_start ;
+
+ movq mm0, mm5 ;
+ psrlq mm5, 32 ;
+ paddw mm5, mm0 ;
+ movq mm0, mm5 ;
+ psrlq mm5, 16 ;
+ paddw mm5, mm0 ;
+ movd edi, mm5 ;
+ movsx edi, di ;
+ mov eax, edi ;
+
+ movq mm0, mm7 ;
+ psrlq mm7, 32 ;
+ paddd mm7, mm0 ;
+ movd ebx, mm7 ;
+
+ mov XSum, eax
+ mov XXSum, ebx;
+
+ };
+ /* Compute population variance as mis-match metric. */
+ return (( (XXSum<<6) - XSum*XSum ) );
+
+
+
+#endif
+}
+
+static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
+ unsigned char *RefDataPtr, ogg_uint32_t RefStride)
+{
+
+#if 0
ogg_uint32_t i;
ogg_uint32_t XSum=0;
ogg_uint32_t XXSum=0;
@@ -1378,83 +1378,83 @@
}
/* Compute and return population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ));
-#else
- ogg_uint32_t XSum;
- ogg_uint32_t XXSum;
-
-
- __asm {
- align 16
-
- mov ecx, SrcData
- mov edx, RefDataPtr
-
- pxor mm5, mm5 ;
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
- mov edi, 8 ;
- loop_start: ;
- movq mm0, [ecx] ; /* take 8 bytes */
- movq mm1, [edx] ;
- movq mm2, mm0 ;
- movq mm3, mm1 ;
-
- punpcklbw mm0, mm6 ;
- punpcklbw mm1, mm6 ;
- punpckhbw mm2, mm6 ;
- punpckhbw mm3, mm6 ;
-
- psubsw mm0, mm1 ;
- psubsw mm2, mm3 ;
-
- paddw mm5, mm0 ;
- paddw mm5, mm2 ;
-
- pmaddwd mm0, mm0 ;
- pmaddwd mm2, mm2 ;
- ;
- paddd mm7, mm0 ;
- paddd mm7, mm2 ;
-
- add ecx, SrcStride ; /* Inc pointer into src data */
- add edx, RefStride ; /* Inc pointer into ref data */
-
- dec edi ;
- jnz loop_start ;
-
- movq mm0, mm5 ;
- psrlq mm5, 32 ;
- paddw mm5, mm0 ;
- movq mm0, mm5 ;
- psrlq mm5, 16 ;
- paddw mm5, mm0 ;
- movd edi, mm5 ;
- movsx edi, di ;
- mov eax, edi ;
-
- movq mm0, mm7 ;
- psrlq mm7, 32 ;
- paddd mm7, mm0 ;
- movd ebx, mm7 ;
-
- mov XSum, eax
- mov XXSum, ebx
-
- };
-
- /* Compute and return population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ));
-
-
-#endif
-}
-
-static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
-{
-#if 0
+ return (( (XXSum<<6) - XSum*XSum ));
+#else
+ ogg_uint32_t XSum;
+ ogg_uint32_t XXSum;
+
+
+ __asm {
+ align 16
+
+ mov ecx, SrcData
+ mov edx, RefDataPtr
+
+ pxor mm5, mm5 ;
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+ mov edi, 8 ;
+ loop_start: ;
+ movq mm0, [ecx] ; /* take 8 bytes */
+ movq mm1, [edx] ;
+ movq mm2, mm0 ;
+ movq mm3, mm1 ;
+
+ punpcklbw mm0, mm6 ;
+ punpcklbw mm1, mm6 ;
+ punpckhbw mm2, mm6 ;
+ punpckhbw mm3, mm6 ;
+
+ psubsw mm0, mm1 ;
+ psubsw mm2, mm3 ;
+
+ paddw mm5, mm0 ;
+ paddw mm5, mm2 ;
+
+ pmaddwd mm0, mm0 ;
+ pmaddwd mm2, mm2 ;
+ ;
+ paddd mm7, mm0 ;
+ paddd mm7, mm2 ;
+
+ add ecx, SrcStride ; /* Inc pointer into src data */
+ add edx, RefStride ; /* Inc pointer into ref data */
+
+ dec edi ;
+ jnz loop_start ;
+
+ movq mm0, mm5 ;
+ psrlq mm5, 32 ;
+ paddw mm5, mm0 ;
+ movq mm0, mm5 ;
+ psrlq mm5, 16 ;
+ paddw mm5, mm0 ;
+ movd edi, mm5 ;
+ movsx edi, di ;
+ mov eax, edi ;
+
+ movq mm0, mm7 ;
+ psrlq mm7, 32 ;
+ paddd mm7, mm0 ;
+ movd ebx, mm7 ;
+
+ mov XSum, eax
+ mov XXSum, ebx
+
+ };
+
+ /* Compute and return population variance as mis-match metric. */
+ return (( (XXSum<<6) - XSum*XSum ));
+
+
+#endif
+}
+
+static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
+ unsigned char *RefDataPtr1,
+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
+{
+#if 0
ogg_uint32_t i;
ogg_uint32_t XSum=0;
ogg_uint32_t XXSum=0;
@@ -1500,107 +1500,107 @@
}
/* Compute and return population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ));
-#else
- ogg_uint32_t XSum;
- ogg_uint32_t XXSum;
-
- __asm {
- align 16
-
- mov ebx, SrcData
- mov ecx, RefDataPtr1
- mov edx, RefDataPtr2
-
- pcmpeqd mm4, mm4 ; /* fefefefefefefefe in mm4 */
- paddb mm4, mm4 ;
- pxor mm5, mm5 ;
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
- mov edi, 8 ;
- loop_start: ;
- movq mm0, [ebx] ; /* take 8 bytes */
-
- movq mm2, [ecx] ;
- movq mm3, [edx] ; /* take average of mm2 and mm3 */
- movq mm1, mm2 ;
- pand mm1, mm3 ;
- pxor mm3, mm2 ;
- pand mm3, mm4 ;
- psrlq mm3, 1 ;
- paddb mm1, mm3 ;
-
- movq mm2, mm0 ;
- movq mm3, mm1 ;
-
- punpcklbw mm0, mm6 ;
- punpcklbw mm1, mm6 ;
- punpckhbw mm2, mm6 ;
- punpckhbw mm3, mm6 ;
-
- psubsw mm0, mm1 ;
- psubsw mm2, mm3 ;
-
- paddw mm5, mm0 ;
- paddw mm5, mm2 ;
-
- pmaddwd mm0, mm0 ;
- pmaddwd mm2, mm2 ;
- ;
- paddd mm7, mm0 ;
- paddd mm7, mm2 ;
-
- add ebx, SrcStride ; /* Inc pointer into src data */
- add ecx, RefStride ; /* Inc pointer into ref data */
- add edx, RefStride ; /* Inc pointer into ref data */
-
- dec edi ;
- jnz loop_start ;
-
- movq mm0, mm5 ;
- psrlq mm5, 32 ;
- paddw mm5, mm0 ;
- movq mm0, mm5 ;
- psrlq mm5, 16 ;
- paddw mm5, mm0 ;
- movd edi, mm5 ;
- movsx edi, di ;
- mov XSum, edi ; /* movl eax, edi ; Modified for vc to resuse eax*/
-
- movq mm0, mm7 ;
- psrlq mm7, 32 ;
- paddd mm7, mm0 ;
- movd XXSum, mm7 ; /*movd eax, mm7 ; Modified for vc to reuse eax */
- };
-
- return (( (XXSum<<6) - XSum*XSum ));
-
-#endif
-}
-
-static void restore_fpu (void)
-{
-
- __asm {
- emms
- }
-
-}
-
-void dsp_mmx_init(DspFunctions *funcs)
-{
- TH_DEBUG("enabling accelerated x86_32 mmx dsp functions.\n");
- funcs->restore_fpu = restore_fpu;
- funcs->sub8x8 = sub8x8__mmx;
- funcs->sub8x8_128 = sub8x8_128__mmx;
- funcs->sub8x8avg2 = sub8x8avg2__mmx;
- funcs->row_sad8 = row_sad8__mmx;
- funcs->col_sad8x8 = col_sad8x8__mmx;
- funcs->sad8x8 = sad8x8__mmx;
- funcs->sad8x8_thres = sad8x8_thres__mmx;
- funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx;
- funcs->intra8x8_err = intra8x8_err__mmx;
- funcs->inter8x8_err = inter8x8_err__mmx;
- funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
-}
-
+ return (( (XXSum<<6) - XSum*XSum ));
+#else
+ ogg_uint32_t XSum;
+ ogg_uint32_t XXSum;
+
+ __asm {
+ align 16
+
+ mov ebx, SrcData
+ mov ecx, RefDataPtr1
+ mov edx, RefDataPtr2
+
+ pcmpeqd mm4, mm4 ; /* fefefefefefefefe in mm4 */
+ paddb mm4, mm4 ;
+ pxor mm5, mm5 ;
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+ mov edi, 8 ;
+ loop_start: ;
+ movq mm0, [ebx] ; /* take 8 bytes */
+
+ movq mm2, [ecx] ;
+ movq mm3, [edx] ; /* take average of mm2 and mm3 */
+ movq mm1, mm2 ;
+ pand mm1, mm3 ;
+ pxor mm3, mm2 ;
+ pand mm3, mm4 ;
+ psrlq mm3, 1 ;
+ paddb mm1, mm3 ;
+
+ movq mm2, mm0 ;
+ movq mm3, mm1 ;
+
+ punpcklbw mm0, mm6 ;
+ punpcklbw mm1, mm6 ;
+ punpckhbw mm2, mm6 ;
+ punpckhbw mm3, mm6 ;
+
+ psubsw mm0, mm1 ;
+ psubsw mm2, mm3 ;
+
+ paddw mm5, mm0 ;
+ paddw mm5, mm2 ;
+
+ pmaddwd mm0, mm0 ;
+ pmaddwd mm2, mm2 ;
+ ;
+ paddd mm7, mm0 ;
+ paddd mm7, mm2 ;
+
+ add ebx, SrcStride ; /* Inc pointer into src data */
+ add ecx, RefStride ; /* Inc pointer into ref data */
+ add edx, RefStride ; /* Inc pointer into ref data */
+
+ dec edi ;
+ jnz loop_start ;
+
+ movq mm0, mm5 ;
+ psrlq mm5, 32 ;
+ paddw mm5, mm0 ;
+ movq mm0, mm5 ;
+ psrlq mm5, 16 ;
+ paddw mm5, mm0 ;
+ movd edi, mm5 ;
+ movsx edi, di ;
+ mov XSum, edi ; /* movl eax, edi ; Modified for vc to resuse eax*/
+
+ movq mm0, mm7 ;
+ psrlq mm7, 32 ;
+ paddd mm7, mm0 ;
+ movd XXSum, mm7 ; /*movd eax, mm7 ; Modified for vc to reuse eax */
+ };
+
+ return (( (XXSum<<6) - XSum*XSum ));
+
+#endif
+}
+
+static void restore_fpu (void)
+{
+
+ __asm {
+ emms
+ }
+
+}
+
+void dsp_mmx_init(DspFunctions *funcs)
+{
+ TH_DEBUG("enabling accelerated x86_32 mmx dsp functions.\n");
+ funcs->restore_fpu = restore_fpu;
+ funcs->sub8x8 = sub8x8__mmx;
+ funcs->sub8x8_128 = sub8x8_128__mmx;
+ funcs->sub8x8avg2 = sub8x8avg2__mmx;
+ funcs->row_sad8 = row_sad8__mmx;
+ funcs->col_sad8x8 = col_sad8x8__mmx;
+ funcs->sad8x8 = sad8x8__mmx;
+ funcs->sad8x8_thres = sad8x8_thres__mmx;
+ funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx;
+ funcs->intra8x8_err = intra8x8_err__mmx;
+ funcs->inter8x8_err = inter8x8_err__mmx;
+ funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
+}
+
Property changes on: trunk/theora/lib/enc/x86_32_vs/dsp_mmx.c
___________________________________________________________________
Name: svn:eol-style
+ native
More information about the commits
mailing list