[xiph-commits] r11427 - in trunk/theora: . examples lib lib/x86_32 lib/x86_64
giles at svn.xiph.org
giles at svn.xiph.org
Fri May 26 11:51:14 PDT 2006
Author: giles
Date: 2006-05-26 11:51:09 -0700 (Fri, 26 May 2006)
New Revision: 11427
Added:
trunk/theora/lib/cpu.c
trunk/theora/lib/cpu.h
trunk/theora/lib/dsp.c
trunk/theora/lib/dsp.h
trunk/theora/lib/x86_32/
trunk/theora/lib/x86_32/dsp_mmx.c
trunk/theora/lib/x86_32/dsp_mmxext.c
trunk/theora/lib/x86_32/fdct_mmx.c
trunk/theora/lib/x86_32/recon_mmx.c
trunk/theora/lib/x86_64/
trunk/theora/lib/x86_64/dsp_mmx.c
trunk/theora/lib/x86_64/dsp_mmxext.c
trunk/theora/lib/x86_64/fdct_mmx.c
trunk/theora/lib/x86_64/recon_mmx.c
Removed:
trunk/theora/lib/x86_32/dsp_mmx.c
trunk/theora/lib/x86_32/dsp_mmxext.c
trunk/theora/lib/x86_32/fdct_mmx.c
trunk/theora/lib/x86_32/recon_mmx.c
trunk/theora/lib/x86_64/dsp_mmx.c
trunk/theora/lib/x86_64/dsp_mmxext.c
trunk/theora/lib/x86_64/fdct_mmx.c
trunk/theora/lib/x86_64/recon_mmx.c
Modified:
trunk/theora/configure.ac
trunk/theora/examples/Makefile.am
trunk/theora/lib/Makefile.am
trunk/theora/lib/codec_internal.h
trunk/theora/lib/dct.c
trunk/theora/lib/dct_decode.c
trunk/theora/lib/dct_encode.c
trunk/theora/lib/decode.c
trunk/theora/lib/encode.c
trunk/theora/lib/encoder_toplevel.c
trunk/theora/lib/idct.c
trunk/theora/lib/mcomp.c
trunk/theora/lib/pp.c
trunk/theora/lib/reconstruct.c
trunk/theora/lib/scan.c
trunk/theora/lib/toplevel.c
Log:
Merge theora-mmx branch work. We now use some SIMD assembly acceleration
by default on x86 and x86_64 architectures.
Modified: trunk/theora/configure.ac
===================================================================
--- trunk/theora/configure.ac 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/configure.ac 2006-05-26 18:51:09 UTC (rev 11427)
@@ -95,13 +95,29 @@
case $host in
*)
- DEBUG="-g -W -D__NO_MATH_INLINES"
- CFLAGS="-g -O2 -Wall"
- PROFILE="-W -pg -g -O2 -fno-inline-functions";;
+ DEBUG="-g -Wall -D__NO_MATH_INLINES"
+ CFLAGS="-Wall -O3 -fforce-addr -fomit-frame-pointer -finline-functions -funroll-loops"
+ PROFILE="-Wall -pg -g -O3 -fno-inline-functions";;
esac
fi
CFLAGS="$CFLAGS $cflags_save"
+cpu_optimization="no optimization for your platform, please send a patch"
+cpu_x86_64=no
+cpu_x86_32=no
+case $target_cpu in
+ i[[3456]]86)
+ cpu_x86_32=yes
+ cpu_optimization="32bit x86"
+ ;;
+ x86_64)
+ cpu_x86_64=yes
+ cpu_optimization="64bit x86"
+ ;;
+esac
+AM_CONDITIONAL([CPU_x86_64], [test x$cpu_x86_64 = xyes])
+AM_CONDITIONAL([CPU_x86_32], [test x$cpu_x86_32 = xyes])
+
# Test whenever ld supports -version-script
AC_PROG_LD
AC_PROG_LD_GNU
@@ -288,8 +304,9 @@
General configuration:
- Encoding support: ............ ${ac_enable_encode}
- Floating point support: ...... ${ac_enable_float}
+ Encoding support: ........... ${ac_enable_encode}
+ Floating point support: ..... ${ac_enable_float}
+ Assembly optimization: ...... ${cpu_optimization}
Installation paths:
Modified: trunk/theora/examples/Makefile.am
===================================================================
--- trunk/theora/examples/Makefile.am 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/examples/Makefile.am 2006-05-26 18:51:09 UTC (rev 11427)
@@ -26,8 +26,8 @@
encoder_example_DEPENDENCIES = $(GETOPT_OBJS)
debug:
- $(MAKE) all CFLAGS="@DEBUG@ $(CFLAGS)"
+ $(MAKE) all CFLAGS="@DEBUG@"
profile:
- $(MAKE) all CFLAGS="@PROFILE@ $(CFLAGS)"
+ $(MAKE) all CFLAGS="@PROFILE@"
Modified: trunk/theora/lib/Makefile.am
===================================================================
--- trunk/theora/lib/Makefile.am 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/Makefile.am 2006-05-26 18:51:09 UTC (rev 11427)
@@ -1,6 +1,14 @@
INCLUDES = -I$(top_srcdir)/include
-EXTRA_DIST = Version_script.in
+EXTRA_DIST = Version_script.in \
+ x86_32/dsp_mmx.c \
+ x86_32/dsp_mmxext.c \
+ x86_32/recon_mmx.c \
+ x86_32/fdct_mmx.c \
+ x86_64/dsp_mmx.c \
+ x86_64/dsp_mmxext.c \
+ x86_64/recon_mmx.c \
+ x86_64/fdct_mmx.c
lib_LTLIBRARIES = libtheora.la
@@ -10,6 +18,25 @@
encoder_sources = dct_encode.c encode.c encoder_toplevel.c
endif
+if CPU_x86_64
+arch_dir = x86_64
+arch_sources= \
+ $(arch_dir)/dsp_mmx.c \
+ $(arch_dir)/dsp_mmxext.c \
+ $(arch_dir)/recon_mmx.c \
+ $(arch_dir)/fdct_mmx.c
+else
+if CPU_x86_32
+arch_dir = x86_32
+arch_sources= \
+ $(arch_dir)/dsp_mmx.c \
+ $(arch_dir)/dsp_mmxext.c \
+ $(arch_dir)/recon_mmx.c \
+ $(arch_dir)/fdct_mmx.c
+endif
+endif
+
+
libtheora_la_SOURCES = \
blockmap.c \
comment.c \
@@ -28,6 +55,9 @@
reconstruct.c \
scan.c \
toplevel.c \
+ cpu.c \
+ dsp.c \
+ $(arch_sources) \
$(encoder_sources)
noinst_HEADERS = \
@@ -39,7 +69,9 @@
pp.h \
quant_lookup.h \
toplevel.h \
- toplevel_lookup.h
+ toplevel_lookup.h \
+ cpu.h \
+ dsp.h
libtheora_la_CFLAGS = $(OGG_CFLAGS)
libtheora_la_LDFLAGS = -version-info @V_LIB_CURRENT@:@V_LIB_REVISION@:@V_LIB_AGE@ @SHLIB_VERSION_ARG@
Modified: trunk/theora/lib/codec_internal.h
===================================================================
--- trunk/theora/lib/codec_internal.h 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/codec_internal.h 2006-05-26 18:51:09 UTC (rev 11427)
@@ -24,6 +24,7 @@
#include "theora/theora.h"
#include "huffman.h"
+#include "dsp.h"
#ifndef LIBOGG2
#define theora_read(x,y,z) ( *z = oggpackB_read(x,y) )
@@ -227,6 +228,8 @@
ogg_int32_t ChLocalsCircularBufferSize;
ogg_int32_t PixelMapCircularBufferSize;
+ DspFunctions dsp; /* Selected functions for this platform */
+
} PP_INSTANCE;
/** block coding modes */
@@ -492,6 +495,8 @@
unsigned char *DataOutputInPtr;
+ DspFunctions dsp; /* Selected functions for this platform */
+
} PB_INSTANCE;
/* Encoder (Compressor) instance -- installed in a theora_state */
@@ -678,6 +683,8 @@
int packetflag;
int doneflag;
+ DspFunctions dsp; /* Selected functions for this platform */
+
} CP_INSTANCE;
#define clamp255(x) ((unsigned char)((((x)<0)-1) & ((x) | -((x)>255))))
@@ -687,7 +694,7 @@
ogg_uint32_t * KFIndicator );
extern void ClearPPInstance(PP_INSTANCE *ppi);
-extern void InitPPInstance(PP_INSTANCE *ppi);
+extern void InitPPInstance(PP_INSTANCE *ppi, DspFunctions *funcs);
extern int GetFrameType(PB_INSTANCE *pbi);
extern void InitPBInstance(PB_INSTANCE *pbi);
extern void ClearPBInstance(PB_INSTANCE *pbi);
Copied: trunk/theora/lib/cpu.c (from rev 11426, branches/theora-mmx/lib/cpu.c)
Copied: trunk/theora/lib/cpu.h (from rev 11426, branches/theora-mmx/lib/cpu.h)
Modified: trunk/theora/lib/dct.c
===================================================================
--- trunk/theora/lib/dct.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/dct.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -16,6 +16,8 @@
********************************************************************/
#include "codec_internal.h"
+#include "dsp.h"
+#include "cpu.h"
static ogg_int32_t xC1S7 = 64277;
static ogg_int32_t xC2S6 = 60547;
@@ -28,7 +30,7 @@
#define SIGNBITDUPPED(X) ((signed )(((X) & 0x80000000)) >> 31)
#define DOROUND(X) ( (SIGNBITDUPPED(X) & (0xffff)) + (X) )
-void fdct_short ( ogg_int16_t * InputData, ogg_int16_t * OutputData ){
+static void fdct_short__c ( ogg_int16_t * InputData, ogg_int16_t * OutputData ){
int loop;
ogg_int32_t is07, is12, is34, is56;
@@ -251,3 +253,14 @@
op ++;
}
}
+
+void dsp_dct_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
+{
+ funcs->fdct_short = fdct_short__c;
+#if (defined(__i386__) || defined(__x86_64__))
+ if (cpu_flags & CPU_X86_MMX) {
+ dsp_mmx_fdct_init(funcs);
+ }
+#endif
+}
+
Modified: trunk/theora/lib/dct_decode.c
===================================================================
--- trunk/theora/lib/dct_decode.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/dct_decode.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -18,6 +18,7 @@
#include <stdlib.h>
#include <string.h>
#include "codec_internal.h"
+#include "dsp.h"
#define GOLDEN_FRAME_THRESH_Q 50
@@ -112,22 +113,6 @@
SetupBoundingValueArray_Generic(pbi, FLimit);
}
-void CopyBlock(unsigned char *src,
- unsigned char *dest,
- unsigned int srcstride){
- unsigned char *s = src;
- unsigned char *d = dest;
- unsigned int stride = srcstride;
-
- int j;
- for ( j = 0; j < 8; j++ ){
- ((ogg_uint32_t*)d)[0] = ((ogg_uint32_t*)s)[0];
- ((ogg_uint32_t*)d)[1] = ((ogg_uint32_t*)s)[1];
- s+=stride;
- d+=stride;
- }
-}
-
static void ExpandKFBlock ( PB_INSTANCE *pbi, ogg_int32_t FragmentNumber ){
ogg_uint32_t ReconPixelsPerLine;
ogg_int32_t ReconPixelIndex;
@@ -163,8 +148,8 @@
ReconPixelIndex = pbi->recon_pixel_index_table[FragmentNumber];
/* Get the pixel index for the first pixel in the fragment. */
- ReconIntra( pbi, (unsigned char *)(&pbi->ThisFrameRecon[ReconPixelIndex]),
- (ogg_int16_t *)pbi->ReconDataBuffer, ReconPixelsPerLine );
+ dsp_recon_intra8x8 (pbi->dsp, (unsigned char *)(&pbi->ThisFrameRecon[ReconPixelIndex]),
+ (ogg_int16_t *)pbi->ReconDataBuffer, ReconPixelsPerLine);
}
@@ -248,10 +233,9 @@
/* Reconstruct the pixel data using the last frame reconstruction
and change data when the motion vector is (0,0), the recon is
based on the lastframe without loop filtering---- for testing */
- ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
+ dsp_recon_inter8x8 (pbi->dsp, &pbi->ThisFrameRecon[ReconPixelIndex],
&pbi->LastFrameRecon[ReconPixelIndex],
- pbi->ReconDataBuffer, ReconPixelsPerLine );
-
+ pbi->ReconDataBuffer, ReconPixelsPerLine);
}else if ( ModeUsesMC[pbi->CodingMode] ) {
/* The mode uses a motion vector. */
/* Get vector from list */
@@ -298,29 +282,30 @@
if ( (int)(LastFrameRecPtr - LastFrameRecPtr2) == 0 ) {
/* Reconstruct the pixel dats from the reference frame and change data
(no half pixel in this case as the two references were the same. */
- ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
+ dsp_recon_inter8x8 (pbi->dsp,
+ &pbi->ThisFrameRecon[ReconPixelIndex],
LastFrameRecPtr, pbi->ReconDataBuffer,
- ReconPixelsPerLine );
+ ReconPixelsPerLine);
}else{
/* Fractional pixel reconstruction. */
/* Note that we only use two pixels per reconstruction even for
the diagonal. */
- ReconInterHalfPixel2( pbi,&pbi->ThisFrameRecon[ReconPixelIndex],
+ dsp_recon_inter8x8_half(pbi->dsp, &pbi->ThisFrameRecon[ReconPixelIndex],
LastFrameRecPtr, LastFrameRecPtr2,
- pbi->ReconDataBuffer, ReconPixelsPerLine );
+ pbi->ReconDataBuffer, ReconPixelsPerLine);
}
} else if ( pbi->CodingMode == CODE_USING_GOLDEN ){
/* Golden frame with motion vector */
/* Reconstruct the pixel data using the golden frame
reconstruction and change data */
- ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
+ dsp_recon_inter8x8 (pbi->dsp, &pbi->ThisFrameRecon[ReconPixelIndex],
&pbi->GoldenFrame[ ReconPixelIndex ],
- pbi->ReconDataBuffer, ReconPixelsPerLine );
+ pbi->ReconDataBuffer, ReconPixelsPerLine);
} else {
/* Simple Intra coding */
/* Get the pixel index for the first pixel in the fragment. */
- ReconIntra( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
- pbi->ReconDataBuffer, ReconPixelsPerLine );
+ dsp_recon_intra8x8 (pbi->dsp, &pbi->ThisFrameRecon[ReconPixelIndex],
+ pbi->ReconDataBuffer, ReconPixelsPerLine);
}
}
@@ -475,7 +460,7 @@
SrcPtr = &SrcReconPtr[ PixelIndex ];
DestPtr = &DestReconPtr[ PixelIndex ];
- CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
+ dsp_copy8x8 (pbi->dsp, SrcPtr, DestPtr, PlaneLineStep);
}
}
@@ -487,7 +472,7 @@
SrcPtr = &SrcReconPtr[ PixelIndex ];
DestPtr = &DestReconPtr[ PixelIndex ];
- CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
+ dsp_copy8x8 (pbi->dsp, SrcPtr, DestPtr, PlaneLineStep);
}
}
@@ -512,7 +497,7 @@
SrcPtr = &SrcReconPtr[ PixelIndex ];
DestPtr = &DestReconPtr[ PixelIndex ];
- CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
+ dsp_copy8x8 (pbi->dsp, SrcPtr, DestPtr, PlaneLineStep);
}
}
@@ -524,7 +509,7 @@
SrcPtr = &SrcReconPtr[ PixelIndex ];
DestPtr = &DestReconPtr[ PixelIndex ];
- CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
+ dsp_copy8x8 (pbi->dsp, SrcPtr, DestPtr, PlaneLineStep);
}
}
Modified: trunk/theora/lib/dct_encode.c
===================================================================
--- trunk/theora/lib/dct_encode.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/dct_encode.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -17,110 +17,10 @@
#include <stdlib.h>
#include "codec_internal.h"
+#include "dsp.h"
static int ModeUsesMC[MAX_MODES] = { 0, 0, 1, 1, 1, 0, 1, 1 };
-static void Sub8 (unsigned char *FiltPtr, unsigned char *ReconPtr,
- ogg_int16_t *DctInputPtr, unsigned char *old_ptr1,
- unsigned char *new_ptr1, ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine ) {
- int i;
-
- /* For each block row */
- for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ){
- DctInputPtr[0] = (ogg_int16_t)((int)(FiltPtr[0]) - ((int)ReconPtr[0]) );
- DctInputPtr[1] = (ogg_int16_t)((int)(FiltPtr[1]) - ((int)ReconPtr[1]) );
- DctInputPtr[2] = (ogg_int16_t)((int)(FiltPtr[2]) - ((int)ReconPtr[2]) );
- DctInputPtr[3] = (ogg_int16_t)((int)(FiltPtr[3]) - ((int)ReconPtr[3]) );
- DctInputPtr[4] = (ogg_int16_t)((int)(FiltPtr[4]) - ((int)ReconPtr[4]) );
- DctInputPtr[5] = (ogg_int16_t)((int)(FiltPtr[5]) - ((int)ReconPtr[5]) );
- DctInputPtr[6] = (ogg_int16_t)((int)(FiltPtr[6]) - ((int)ReconPtr[6]) );
- DctInputPtr[7] = (ogg_int16_t)((int)(FiltPtr[7]) - ((int)ReconPtr[7]) );
-
- /* Update the screen canvas in one step*/
- ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0];
- ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1];
-
- /* Start next row */
- new_ptr1 += PixelsPerLine;
- old_ptr1 += PixelsPerLine;
- FiltPtr += PixelsPerLine;
- ReconPtr += ReconPixelsPerLine;
- DctInputPtr += BLOCK_HEIGHT_WIDTH;
- }
-}
-
-static void Sub8_128 (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
- unsigned char *old_ptr1, unsigned char *new_ptr1,
- ogg_uint32_t PixelsPerLine ) {
- int i;
- /* For each block row */
- for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ){
- /* INTRA mode so code raw image data */
- /* We convert the data to 8 bit signed (by subtracting 128) as
- this reduces the internal precision requirments in the DCT
- transform. */
- DctInputPtr[0] = (ogg_int16_t)((int)(FiltPtr[0]) - 128);
- DctInputPtr[1] = (ogg_int16_t)((int)(FiltPtr[1]) - 128);
- DctInputPtr[2] = (ogg_int16_t)((int)(FiltPtr[2]) - 128);
- DctInputPtr[3] = (ogg_int16_t)((int)(FiltPtr[3]) - 128);
- DctInputPtr[4] = (ogg_int16_t)((int)(FiltPtr[4]) - 128);
- DctInputPtr[5] = (ogg_int16_t)((int)(FiltPtr[5]) - 128);
- DctInputPtr[6] = (ogg_int16_t)((int)(FiltPtr[6]) - 128);
- DctInputPtr[7] = (ogg_int16_t)((int)(FiltPtr[7]) - 128);
-
- /* Update the screen canvas in one step */
- ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0];
- ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1];
-
- /* Start next row */
- new_ptr1 += PixelsPerLine;
- old_ptr1 += PixelsPerLine;
- FiltPtr += PixelsPerLine;
- DctInputPtr += BLOCK_HEIGHT_WIDTH;
- }
-}
-
-static void Sub8Av2 (unsigned char *FiltPtr, unsigned char *ReconPtr1,
- unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
- unsigned char *old_ptr1, unsigned char *new_ptr1,
- ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine ) {
- int i;
-
- /* For each block row */
- for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
- DctInputPtr[0] = (ogg_int16_t)
- ((int)(FiltPtr[0]) - (((int)ReconPtr1[0] + (int)ReconPtr2[0]) / 2) );
- DctInputPtr[1] = (ogg_int16_t)
- ((int)(FiltPtr[1]) - (((int)ReconPtr1[1] + (int)ReconPtr2[1]) / 2) );
- DctInputPtr[2] = (ogg_int16_t)
- ((int)(FiltPtr[2]) - (((int)ReconPtr1[2] + (int)ReconPtr2[2]) / 2) );
- DctInputPtr[3] = (ogg_int16_t)
- ((int)(FiltPtr[3]) - (((int)ReconPtr1[3] + (int)ReconPtr2[3]) / 2) );
- DctInputPtr[4] = (ogg_int16_t)
- ((int)(FiltPtr[4]) - (((int)ReconPtr1[4] + (int)ReconPtr2[4]) / 2) );
- DctInputPtr[5] = (ogg_int16_t)
- ((int)(FiltPtr[5]) - (((int)ReconPtr1[5] + (int)ReconPtr2[5]) / 2) );
- DctInputPtr[6] = (ogg_int16_t)
- ((int)(FiltPtr[6]) - (((int)ReconPtr1[6] + (int)ReconPtr2[6]) / 2) );
- DctInputPtr[7] = (ogg_int16_t)
- ((int)(FiltPtr[7]) - (((int)ReconPtr1[7] + (int)ReconPtr2[7]) / 2) );
-
- /* Update the screen canvas in one step */
- ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0];
- ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1];
-
- /* Start next row */
- new_ptr1 += PixelsPerLine;
- old_ptr1 += PixelsPerLine;
- FiltPtr += PixelsPerLine;
- ReconPtr1 += ReconPixelsPerLine;
- ReconPtr2 += ReconPixelsPerLine;
- DctInputPtr += BLOCK_HEIGHT_WIDTH;
- }
-}
-
static unsigned char TokenizeDctValue (ogg_int16_t DataValue,
ogg_uint32_t * TokenListPtr ){
unsigned char tokens_added = 0;
@@ -452,13 +352,15 @@
/* Is the MV offset exactly pixel alligned */
if ( AbsRefOffset == 0 ){
- Sub8( FiltPtr, ReconPtr1, DctInputPtr, old_ptr1, new_ptr1,
- PixelsPerLine, ReconPixelsPerLine );
+ dsp_sub8x8(cpi->dsp, FiltPtr, ReconPtr1, DctInputPtr,
+ PixelsPerLine, ReconPixelsPerLine);
+ dsp_copy8x8 (cpi->dsp, new_ptr1, old_ptr1, PixelsPerLine);
} else {
/* Fractional pixel MVs. */
/* Note that we only use two pixel values even for the diagonal */
- Sub8Av2(FiltPtr, ReconPtr1,ReconPtr2,DctInputPtr, old_ptr1,
- new_ptr1, PixelsPerLine, ReconPixelsPerLine );
+ dsp_sub8x8avg2(cpi->dsp, FiltPtr, ReconPtr1,ReconPtr2,DctInputPtr,
+ PixelsPerLine, ReconPixelsPerLine);
+ dsp_copy8x8 (cpi->dsp, new_ptr1, old_ptr1, PixelsPerLine);
}
}
@@ -534,17 +436,18 @@
pb.GoldenFrame[cpi->pb.recon_pixel_index_table[FragIndex]];
}
- Sub8( FiltPtr, ReconPtr1, DctInputPtr, old_ptr1, new_ptr1,
- PixelsPerLine, ReconPixelsPerLine );
+ dsp_sub8x8(cpi->dsp, FiltPtr, ReconPtr1, DctInputPtr,
+ PixelsPerLine, ReconPixelsPerLine);
+ dsp_copy8x8 (cpi->dsp, new_ptr1, old_ptr1, PixelsPerLine);
} else if ( cpi->pb.CodingMode==CODE_INTRA ) {
- Sub8_128(FiltPtr, DctInputPtr, old_ptr1, new_ptr1, PixelsPerLine);
-
+ dsp_sub8x8_128(cpi->dsp, FiltPtr, DctInputPtr, PixelsPerLine);
+ dsp_copy8x8 (cpi->dsp, new_ptr1, old_ptr1, PixelsPerLine);
}
/* Proceed to encode the data into the encode buffer if the encoder
is enabled. */
/* Perform a 2D DCT transform on the data. */
- fdct_short( cpi->DCTDataBuffer, cpi->DCT_codes );
+ dsp_fdct_short(cpi->dsp, cpi->DCTDataBuffer, cpi->DCT_codes );
/* Quantize that transform data. */
quantize ( &cpi->pb, cpi->DCT_codes, cpi->pb.QFragData[FragIndex] );
Modified: trunk/theora/lib/decode.c
===================================================================
--- trunk/theora/lib/decode.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/decode.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -865,7 +865,9 @@
if (pbi->DecoderErrorCode) return;
/* Reconstruct and display the frame */
+ dsp_save_fpu (pbi->dsp);
ReconRefFrames(pbi);
+ dsp_restore_fpu (pbi->dsp);
}
Copied: trunk/theora/lib/dsp.c (from rev 11426, branches/theora-mmx/lib/dsp.c)
Copied: trunk/theora/lib/dsp.h (from rev 11426, branches/theora-mmx/lib/dsp.h)
Modified: trunk/theora/lib/encode.c
===================================================================
--- trunk/theora/lib/encode.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/encode.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -531,8 +531,7 @@
static ogg_uint32_t GetBlockReconErrorSlow( CP_INSTANCE *cpi,
ogg_int32_t BlockIndex ) {
- ogg_uint32_t i;
- ogg_uint32_t ErrorVal = 0;
+ ogg_uint32_t ErrorVal;
unsigned char * SrcDataPtr =
&cpi->ConvDestBuffer[cpi->pb.pixel_index_table[BlockIndex]];
@@ -550,21 +549,8 @@
RecStride = cpi->pb.UVStride;
}
+ ErrorVal = dsp_sad8x8 (cpi->dsp, SrcDataPtr, SrcStride, RecDataPtr, RecStride);
- /* Decide on standard or MMX implementation */
- for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
- ErrorVal += abs( ((int)SrcDataPtr[0]) - ((int)RecDataPtr[0]) );
- ErrorVal += abs( ((int)SrcDataPtr[1]) - ((int)RecDataPtr[1]) );
- ErrorVal += abs( ((int)SrcDataPtr[2]) - ((int)RecDataPtr[2]) );
- ErrorVal += abs( ((int)SrcDataPtr[3]) - ((int)RecDataPtr[3]) );
- ErrorVal += abs( ((int)SrcDataPtr[4]) - ((int)RecDataPtr[4]) );
- ErrorVal += abs( ((int)SrcDataPtr[5]) - ((int)RecDataPtr[5]) );
- ErrorVal += abs( ((int)SrcDataPtr[6]) - ((int)RecDataPtr[6]) );
- ErrorVal += abs( ((int)SrcDataPtr[7]) - ((int)RecDataPtr[7]) );
- /* Step to next row of block. */
- SrcDataPtr += SrcStride;
- RecDataPtr += RecStride;
- }
return ErrorVal;
}
@@ -933,9 +919,13 @@
/* Zero Decoder EOB run count */
cpi->pb.EOB_Run = 0;
+ dsp_save_fpu (cpi->dsp);
+
/* Encode any fragments coded using DCT. */
coded_pixels += QuadCodeDisplayFragments (cpi);
+ dsp_restore_fpu (cpi->dsp);
+
return coded_pixels;
}
Modified: trunk/theora/lib/encoder_toplevel.c
===================================================================
--- trunk/theora/lib/encoder_toplevel.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/encoder_toplevel.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -23,6 +23,7 @@
#include <string.h>
#include "toplevel_lookup.h"
#include "toplevel.h"
+#include "dsp.h"
#define A_TABLE_SIZE 29
#define DF_CANDIDATE_WINDOW 5
@@ -778,12 +779,15 @@
if(c->pixelformat!=OC_PF_420)return OC_IMPL;
th->internal_encode=cpi=_ogg_calloc(1,sizeof(*cpi));
+ dsp_static_init (&cpi->dsp);
+ memcpy (&cpi->pb.dsp, &cpi->dsp, sizeof(DspFunctions));
+
c->version_major=VERSION_MAJOR;
c->version_minor=VERSION_MINOR;
c->version_subminor=VERSION_SUB;
InitTmpBuffers(&cpi->pb);
- InitPPInstance(&cpi->pp);
+ InitPPInstance(&cpi->pp, &cpi->dsp);
/* Initialise Configuration structure to legal values */
if(c->quality>63)c->quality=63;
Modified: trunk/theora/lib/idct.c
===================================================================
--- trunk/theora/lib/idct.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/idct.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -10,7 +10,7 @@
* *
********************************************************************
- function:
+ function: C implementation of the Theora iDCT
last mod: $Id: idct.c,v 1.6 2003/12/03 08:59:41 arc Exp $
********************************************************************/
@@ -20,6 +20,8 @@
#include "quant_lookup.h"
#define IdctAdjustBeforeShift 8
+
+/* cos(n*pi/16) or sin((8-n)*pi/16) */
#define xC1S7 64277
#define xC2S6 60547
#define xC3S5 54491
@@ -28,6 +30,85 @@
#define xC6S2 25080
#define xC7S1 12785
+/* compute the 16 bit signed 1D inverse DCT - spec version */
+static void idct_short__c ( ogg_int16_t * InputData, ogg_int16_t * OutputData ) {
+ ogg_int32_t t[8], r;
+ ogg_int16_t *y = InputData;
+ ogg_int16_t *x = OutputData;
+
+ t[0] = y[0] + y[4];
+ t[0] &= 0xffff;
+ t[0] = (xC4S4 * t[0]) >> 16;
+
+ t[1] = y[0] - y[4];
+ t[1] &= 0xffff;
+ t[1] = (xC4S4 * t[1]) >> 16;
+
+ t[2] = ((xC6S2 * t[2]) >> 16) - ((xC2S6 * y[6]) >> 16);
+ t[3] = ((xC2S6 * t[2]) >> 16) + ((xC6S2 * y[6]) >> 16);
+ t[4] = ((xC7S1 * t[1]) >> 16) - ((xC1S7 * y[7]) >> 16);
+ t[5] = ((xC3S5 * t[5]) >> 16) - ((xC5S3 * y[3]) >> 16);
+ t[6] = ((xC5S3 * t[5]) >> 16) + ((xC3S5 * y[3]) >> 16);
+ t[7] = ((xC1S7 * t[1]) >> 16) + ((xC7S1 * y[7]) >> 16);
+
+ r = t[4] + t[5];
+ t[5] = t[4] - t[5];
+ t[5] &= 0xffff;
+ t[5] = (xC4S4 * (-t[5])) >> 16;
+ t[4] = r;
+
+ r = t[7] + t[6];
+ t[6] = t[7] - t[6];
+ t[6] &= 0xffff;
+ t[6] = (xC4S4 * t[6]) >> 16;
+ t[7] = r;
+
+ r = t[0] + t[3];
+ t[3] = t[0] - t[3];
+ t[0] = r;
+
+ r = t[1] + t[2];
+ t[2] = t[1] - t[2];
+ t[1] = r;
+
+ r = t[6] + t[5];
+ t[5] = t[6] - t[5];
+ t[6] = r;
+
+ r = t[0] + t[7];
+ r &= 0xffff;
+ x[0] = r;
+
+ r = t[1] + t[6];
+ r &= 0xffff;
+ x[1] = r;
+
+ r = t[2] + t[5];
+ r &= 0xffff;
+ x[2] = r;
+
+ r = t[3] + t[4];
+ r &= 0xffff;
+ x[3] = r;
+
+ r = t[3] - t[4];
+ r &= 0xffff;
+ x[4] = r;
+
+ r = t[2] - t[5];
+ r &= 0xffff;
+ x[5] = r;
+
+ r = t[1] - t[6];
+ r &= 0xffff;
+ x[6] = r;
+
+ r = t[0] - t[7];
+ r &= 0xffff;
+ x[7] = r;
+
+}
+
static void dequant_slow( ogg_int16_t * dequant_coeffs,
ogg_int16_t * quantized_list,
ogg_int32_t * DCT_block) {
@@ -36,6 +117,8 @@
DCT_block[dezigzag_index[i]] = quantized_list[i] * dequant_coeffs[i];
}
+
+
void IDctSlow( Q_LIST_ENTRY * InputData,
ogg_int16_t *QuantMatrix,
ogg_int16_t * OutputData ) {
Modified: trunk/theora/lib/mcomp.c
===================================================================
--- trunk/theora/lib/mcomp.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/mcomp.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -17,6 +17,7 @@
#include <stdlib.h>
#include <stdio.h>
+#include "dsp.h"
#include "codec_internal.h"
/* Initialises motion compentsation. */
@@ -96,288 +97,92 @@
cpi->MVPixelOffsetY[i] = (cpi->MVOffsetY[i]*LineStepY) + cpi->MVOffsetX[i];
}
-static ogg_uint32_t GetInterErr (unsigned char * NewDataPtr,
+static ogg_uint32_t GetInterErr (CP_INSTANCE *cpi, unsigned char * NewDataPtr,
unsigned char * RefDataPtr1,
unsigned char * RefDataPtr2,
ogg_uint32_t PixelsPerLine ) {
- ogg_uint32_t i;
- ogg_int32_t XSum=0;
- ogg_int32_t XXSum=0;
ogg_int32_t DiffVal;
- ogg_int32_t AbsRefOffset = abs((int)(RefDataPtr1 - RefDataPtr2));
+ ogg_int32_t RefOffset = (int)(RefDataPtr1 - RefDataPtr2);
+ ogg_uint32_t RefPixelsPerLine = PixelsPerLine + STRIDE_EXTRA;
/* Mode of interpolation chosen based upon on the offset of the
second reference pointer */
- if ( AbsRefOffset == 0 ) {
- for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
- DiffVal = ((int)NewDataPtr[0]) - (int)RefDataPtr1[0];
- XSum += DiffVal;
-
- /* negative array indexes are strictly forbidden by ANSI C and C99 */
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[1]) - (int)RefDataPtr1[1];
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[2]) - (int)RefDataPtr1[2];
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[3]) - (int)RefDataPtr1[3];
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[4]) - (int)RefDataPtr1[4];
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[5]) - (int)RefDataPtr1[5];
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[6]) - (int)RefDataPtr1[6];
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[7]) - (int)RefDataPtr1[7];
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- /* Step to next row of block. */
- NewDataPtr += PixelsPerLine;
- RefDataPtr1 += STRIDE_EXTRA + PixelsPerLine;
- }
-
+ if ( RefOffset == 0 ) {
+ DiffVal = dsp_inter8x8_err (cpi->dsp, NewDataPtr, PixelsPerLine,
+ RefDataPtr1, RefPixelsPerLine);
}else{
-
- /* Simple two reference interpolation */
- for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
- DiffVal = ((int)NewDataPtr[0]) -
- (((int)RefDataPtr1[0] + (int)RefDataPtr2[0]) / 2);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[1]) -
- (((int)RefDataPtr1[1] + (int)RefDataPtr2[1]) / 2);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[2]) -
- (((int)RefDataPtr1[2] + (int)RefDataPtr2[2]) / 2);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[3]) -
- (((int)RefDataPtr1[3] + (int)RefDataPtr2[3]) / 2);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[4]) -
- (((int)RefDataPtr1[4] + (int)RefDataPtr2[4]) / 2);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[5]) -
- (((int)RefDataPtr1[5] + (int)RefDataPtr2[5]) / 2);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[6]) -
- (((int)RefDataPtr1[6] + (int)RefDataPtr2[6]) / 2);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- DiffVal = ((int)NewDataPtr[7]) -
- (((int)RefDataPtr1[7] + (int)RefDataPtr2[7]) / 2);
- XSum += DiffVal;
- XXSum += DiffVal*DiffVal;
-
- /* Step to next row of block. */
- NewDataPtr += PixelsPerLine;
- RefDataPtr1 += STRIDE_EXTRA+PixelsPerLine;
- RefDataPtr2 += STRIDE_EXTRA+PixelsPerLine;
- }
+ DiffVal = dsp_inter8x8_err_xy2 (cpi->dsp, NewDataPtr, PixelsPerLine,
+ RefDataPtr1,
+ RefDataPtr2, RefPixelsPerLine);
}
/* Compute and return population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ));
-}
-
-static ogg_uint32_t GetSumAbsDiffs (unsigned char * NewDataPtr,
- unsigned char * RefDataPtr,
- ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ErrorSoFar) {
- ogg_uint32_t i;
- ogg_uint32_t DiffVal = ErrorSoFar;
-
- /* Decide on standard or MMX implementation */
- for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
- DiffVal += abs( ((int)NewDataPtr[0]) - ((int)RefDataPtr[0]) );
- DiffVal += abs( ((int)NewDataPtr[1]) - ((int)RefDataPtr[1]) );
- DiffVal += abs( ((int)NewDataPtr[2]) - ((int)RefDataPtr[2]) );
- DiffVal += abs( ((int)NewDataPtr[3]) - ((int)RefDataPtr[3]) );
- DiffVal += abs( ((int)NewDataPtr[4]) - ((int)RefDataPtr[4]) );
- DiffVal += abs( ((int)NewDataPtr[5]) - ((int)RefDataPtr[5]) );
- DiffVal += abs( ((int)NewDataPtr[6]) - ((int)RefDataPtr[6]) );
- DiffVal += abs( ((int)NewDataPtr[7]) - ((int)RefDataPtr[7]) );
-
- /* Step to next row of block. */
- NewDataPtr += PixelsPerLine;
- RefDataPtr += STRIDE_EXTRA+PixelsPerLine;
- }
-
return DiffVal;
}
-static ogg_uint32_t GetNextSumAbsDiffs (unsigned char * NewDataPtr,
- unsigned char * RefDataPtr,
- ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ErrorSoFar,
- ogg_uint32_t BestSoFar ) {
- ogg_uint32_t i;
- ogg_uint32_t DiffVal = ErrorSoFar;
-
- for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
- DiffVal += abs( ((int)NewDataPtr[0]) - ((int)RefDataPtr[0]) );
- DiffVal += abs( ((int)NewDataPtr[1]) - ((int)RefDataPtr[1]) );
- DiffVal += abs( ((int)NewDataPtr[2]) - ((int)RefDataPtr[2]) );
- DiffVal += abs( ((int)NewDataPtr[3]) - ((int)RefDataPtr[3]) );
- DiffVal += abs( ((int)NewDataPtr[4]) - ((int)RefDataPtr[4]) );
- DiffVal += abs( ((int)NewDataPtr[5]) - ((int)RefDataPtr[5]) );
- DiffVal += abs( ((int)NewDataPtr[6]) - ((int)RefDataPtr[6]) );
- DiffVal += abs( ((int)NewDataPtr[7]) - ((int)RefDataPtr[7]) );
-
- if ( DiffVal > BestSoFar )break;
-
- /* Step to next row of block. */
- NewDataPtr += PixelsPerLine;
- RefDataPtr += STRIDE_EXTRA+PixelsPerLine;
- }
-
- return DiffVal;
-}
-
-static ogg_uint32_t GetHalfPixelSumAbsDiffs (unsigned char * SrcData,
+static ogg_uint32_t GetHalfPixelSumAbsDiffs (CP_INSTANCE *cpi,
+ unsigned char * SrcData,
unsigned char * RefDataPtr1,
unsigned char * RefDataPtr2,
ogg_uint32_t PixelsPerLine,
ogg_uint32_t ErrorSoFar,
ogg_uint32_t BestSoFar ) {
- ogg_uint32_t i;
ogg_uint32_t DiffVal = ErrorSoFar;
ogg_int32_t RefOffset = (int)(RefDataPtr1 - RefDataPtr2);
ogg_uint32_t RefPixelsPerLine = PixelsPerLine + STRIDE_EXTRA;
if ( RefOffset == 0 ) {
/* Simple case as for non 0.5 pixel */
- DiffVal += GetSumAbsDiffs( SrcData, RefDataPtr1, PixelsPerLine,
- 0);
+ DiffVal += dsp_sad8x8 (cpi->dsp, SrcData, PixelsPerLine,
+ RefDataPtr1, RefPixelsPerLine);
} else {
- for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
- DiffVal += abs( ((int)SrcData[0]) - (((int)RefDataPtr1[0] +
- (int)RefDataPtr2[0]) / 2) );
- DiffVal += abs( ((int)SrcData[1]) - (((int)RefDataPtr1[1] +
- (int)RefDataPtr2[1]) / 2) );
- DiffVal += abs( ((int)SrcData[2]) - (((int)RefDataPtr1[2] +
- (int)RefDataPtr2[2]) / 2) );
- DiffVal += abs( ((int)SrcData[3]) - (((int)RefDataPtr1[3] +
- (int)RefDataPtr2[3]) / 2) );
- DiffVal += abs( ((int)SrcData[4]) - (((int)RefDataPtr1[4] +
- (int)RefDataPtr2[4]) / 2) );
- DiffVal += abs( ((int)SrcData[5]) - (((int)RefDataPtr1[5] +
- (int)RefDataPtr2[5]) / 2) );
- DiffVal += abs( ((int)SrcData[6]) - (((int)RefDataPtr1[6] +
- (int)RefDataPtr2[6]) / 2) );
- DiffVal += abs( ((int)SrcData[7]) - (((int)RefDataPtr1[7] +
- (int)RefDataPtr2[7]) / 2) );
-
- if ( DiffVal > BestSoFar ) break;
-
- /* Step to next row of block. */
- SrcData += PixelsPerLine;
- RefDataPtr1 += RefPixelsPerLine;
- RefDataPtr2 += RefPixelsPerLine;
- }
+ DiffVal += dsp_sad8x8_xy2_thres (cpi->dsp, SrcData, PixelsPerLine,
+ RefDataPtr1,
+ RefDataPtr2, RefPixelsPerLine, BestSoFar);
}
return DiffVal;
}
-static ogg_uint32_t GetIntraError (unsigned char * DataPtr,
- ogg_uint32_t PixelsPerLine ) {
- ogg_uint32_t i;
- ogg_uint32_t XSum=0;
- ogg_uint32_t XXSum=0;
- unsigned char *DiffPtr;
-
- /* Loop expanded out for speed. */
- DiffPtr = DataPtr;
-
- for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
-
- /* Examine alternate pixel locations. */
- XSum += DiffPtr[0];
- XXSum += DiffPtr[0]*DiffPtr[0];
- XSum += DiffPtr[1];
- XXSum += DiffPtr[1]*DiffPtr[1];
- XSum += DiffPtr[2];
- XXSum += DiffPtr[2]*DiffPtr[2];
- XSum += DiffPtr[3];
- XXSum += DiffPtr[3]*DiffPtr[3];
- XSum += DiffPtr[4];
- XXSum += DiffPtr[4]*DiffPtr[4];
- XSum += DiffPtr[5];
- XXSum += DiffPtr[5]*DiffPtr[5];
- XSum += DiffPtr[6];
- XXSum += DiffPtr[6]*DiffPtr[6];
- XSum += DiffPtr[7];
- XXSum += DiffPtr[7]*DiffPtr[7];
-
- /* Step to next row of block. */
- DiffPtr += PixelsPerLine;
- }
-
- /* Compute population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ) );
-}
-
ogg_uint32_t GetMBIntraError (CP_INSTANCE *cpi, ogg_uint32_t FragIndex,
ogg_uint32_t PixelsPerLine ) {
ogg_uint32_t LocalFragIndex = FragIndex;
ogg_uint32_t IntraError = 0;
+ dsp_save_fpu (cpi->dsp);
+
/* Add together the intra errors for those blocks in the macro block
that are coded (Y only) */
if ( cpi->pb.display_fragments[LocalFragIndex] )
IntraError +=
- GetIntraError(&cpi->
+ dsp_intra8x8_err (cpi->dsp, &cpi->
ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
- PixelsPerLine );
+ PixelsPerLine);
-
LocalFragIndex++;
if ( cpi->pb.display_fragments[LocalFragIndex] )
IntraError +=
- GetIntraError(&cpi->
+ dsp_intra8x8_err (cpi->dsp, &cpi->
ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
- PixelsPerLine );
+ PixelsPerLine);
LocalFragIndex = FragIndex + cpi->pb.HFragments;
if ( cpi->pb.display_fragments[LocalFragIndex] )
IntraError +=
- GetIntraError(&cpi->
+ dsp_intra8x8_err (cpi->dsp, &cpi->
ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
- PixelsPerLine );
+ PixelsPerLine);
LocalFragIndex++;
if ( cpi->pb.display_fragments[LocalFragIndex] )
IntraError +=
- GetIntraError(&cpi->
+ dsp_intra8x8_err (cpi->dsp, &cpi->
ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
- PixelsPerLine );
+ PixelsPerLine);
+ dsp_restore_fpu (cpi->dsp);
+
return IntraError;
}
@@ -400,6 +205,8 @@
unsigned char * SrcPtr1;
unsigned char * RefPtr1;
+ dsp_save_fpu (cpi->dsp);
+
/* Work out pixel offset into source buffer. */
PixelIndex = cpi->pb.pixel_index_table[LocalFragIndex];
@@ -428,7 +235,7 @@
if ( cpi->pb.display_fragments[LocalFragIndex] ) {
SrcPtr1 = &SrcPtr[PixelIndex];
RefPtr1 = &RefPtr[RefPixelIndex + RefPixelOffset];
- InterError += GetInterErr( SrcPtr1, RefPtr1,
+ InterError += GetInterErr(cpi, SrcPtr1, RefPtr1,
&RefPtr1[RefPtr2Offset], PixelsPerLine );
}
@@ -438,7 +245,7 @@
RefPixelIndex = cpi->pb.recon_pixel_index_table[LocalFragIndex];
SrcPtr1 = &SrcPtr[PixelIndex];
RefPtr1 = &RefPtr[RefPixelIndex + RefPixelOffset];
- InterError += GetInterErr( SrcPtr1, RefPtr1,
+ InterError += GetInterErr(cpi, SrcPtr1, RefPtr1,
&RefPtr1[RefPtr2Offset], PixelsPerLine );
}
@@ -449,7 +256,7 @@
RefPixelIndex = cpi->pb.recon_pixel_index_table[LocalFragIndex];
SrcPtr1 = &SrcPtr[PixelIndex];
RefPtr1 = &RefPtr[RefPixelIndex + RefPixelOffset];
- InterError += GetInterErr( SrcPtr1, RefPtr1,
+ InterError += GetInterErr(cpi, SrcPtr1, RefPtr1,
&RefPtr1[RefPtr2Offset], PixelsPerLine );
}
@@ -459,9 +266,12 @@
RefPixelIndex = cpi->pb.recon_pixel_index_table[LocalFragIndex];
SrcPtr1 = &SrcPtr[PixelIndex];
RefPtr1 = &RefPtr[RefPixelIndex + RefPixelOffset];
- InterError += GetInterErr( SrcPtr1, RefPtr1,
+ InterError += GetInterErr(cpi, SrcPtr1, RefPtr1,
&RefPtr1[RefPtr2Offset], PixelsPerLine );
}
+
+ dsp_restore_fpu (cpi->dsp);
+
return InterError;
}
@@ -496,6 +306,8 @@
unsigned char * RefDataPtr1;
unsigned char * RefDataPtr2;
+ dsp_save_fpu (cpi->dsp);
+
/* Note which of the four blocks in the macro block are to be
included in the search. */
MBlockDispFrags[0] =
@@ -518,20 +330,20 @@
/* Check the 0,0 candidate. */
if ( MBlockDispFrags[0] ) {
- Error = GetSumAbsDiffs( SrcPtr[0], RefPtr,
- PixelsPerLine, Error);
+ Error += dsp_sad8x8 (cpi->dsp, SrcPtr[0], PixelsPerLine, RefPtr,
+ PixelsPerLine + STRIDE_EXTRA);
}
if ( MBlockDispFrags[1] ) {
- Error = GetSumAbsDiffs( SrcPtr[1], RefPtr + 8,
- PixelsPerLine, Error);
+ Error += dsp_sad8x8 (cpi->dsp, SrcPtr[1], PixelsPerLine, RefPtr + 8,
+ PixelsPerLine + STRIDE_EXTRA);
}
if ( MBlockDispFrags[2] ) {
- Error = GetSumAbsDiffs( SrcPtr[2], RefPtr + RefRow2Offset,
- PixelsPerLine, Error);
+ Error += dsp_sad8x8 (cpi->dsp, SrcPtr[2], PixelsPerLine, RefPtr + RefRow2Offset,
+ PixelsPerLine + STRIDE_EXTRA);
}
if ( MBlockDispFrags[3] ) {
- Error = GetSumAbsDiffs( SrcPtr[3], RefPtr + RefRow2Offset + 8,
- PixelsPerLine, Error);
+ Error += dsp_sad8x8 (cpi->dsp, SrcPtr[3], PixelsPerLine, RefPtr + RefRow2Offset + 8,
+ PixelsPerLine + STRIDE_EXTRA);
}
/* Set starting values to results of 0, 0 vector. */
@@ -554,24 +366,23 @@
/* Get the score for the current offset */
if ( MBlockDispFrags[0] ) {
- Error = GetSumAbsDiffs( SrcPtr[0], CandidateBlockPtr,
- PixelsPerLine, Error);
+ Error += dsp_sad8x8 (cpi->dsp, SrcPtr[0], PixelsPerLine, CandidateBlockPtr,
+ PixelsPerLine + STRIDE_EXTRA);
}
if ( MBlockDispFrags[1] && (Error < MinError) ) {
- Error = GetNextSumAbsDiffs( SrcPtr[1], CandidateBlockPtr + 8,
- PixelsPerLine, Error, MinError );
+ Error += dsp_sad8x8_thres (cpi->dsp, SrcPtr[1], PixelsPerLine, CandidateBlockPtr + 8,
+ PixelsPerLine + STRIDE_EXTRA, MinError);
}
if ( MBlockDispFrags[2] && (Error < MinError) ) {
- Error = GetNextSumAbsDiffs( SrcPtr[2], CandidateBlockPtr + RefRow2Offset,
- PixelsPerLine, Error, MinError );
+ Error += dsp_sad8x8_thres (cpi->dsp, SrcPtr[2], PixelsPerLine, CandidateBlockPtr + RefRow2Offset,
+ PixelsPerLine + STRIDE_EXTRA, MinError);
}
if ( MBlockDispFrags[3] && (Error < MinError) ) {
- Error = GetNextSumAbsDiffs( SrcPtr[3],
- CandidateBlockPtr + RefRow2Offset + 8,
- PixelsPerLine, Error, MinError );
+ Error += dsp_sad8x8_thres (cpi->dsp, SrcPtr[3], PixelsPerLine, CandidateBlockPtr + RefRow2Offset + 8,
+ PixelsPerLine + STRIDE_EXTRA, MinError);
}
if ( Error < MinError ) {
@@ -610,7 +421,7 @@
RefDataPtr1 = BestBlockPtr;
RefDataPtr2 = RefDataPtr1 + cpi->HalfPixelRef2Offset[i];
HalfPixelError =
- GetHalfPixelSumAbsDiffs( SrcPtr[0], RefDataPtr1, RefDataPtr2,
+ GetHalfPixelSumAbsDiffs(cpi, SrcPtr[0], RefDataPtr1, RefDataPtr2,
PixelsPerLine, HalfPixelError, BestHalfPixelError );
}
@@ -618,7 +429,7 @@
RefDataPtr1 = BestBlockPtr + 8;
RefDataPtr2 = RefDataPtr1 + cpi->HalfPixelRef2Offset[i];
HalfPixelError =
- GetHalfPixelSumAbsDiffs( SrcPtr[1], RefDataPtr1, RefDataPtr2,
+ GetHalfPixelSumAbsDiffs(cpi, SrcPtr[1], RefDataPtr1, RefDataPtr2,
PixelsPerLine, HalfPixelError, BestHalfPixelError );
}
@@ -626,7 +437,7 @@
RefDataPtr1 = BestBlockPtr + RefRow2Offset;
RefDataPtr2 = RefDataPtr1 + cpi->HalfPixelRef2Offset[i];
HalfPixelError =
- GetHalfPixelSumAbsDiffs( SrcPtr[2], RefDataPtr1, RefDataPtr2,
+ GetHalfPixelSumAbsDiffs(cpi, SrcPtr[2], RefDataPtr1, RefDataPtr2,
PixelsPerLine, HalfPixelError, BestHalfPixelError );
}
@@ -634,7 +445,7 @@
RefDataPtr1 = BestBlockPtr + RefRow2Offset + 8;
RefDataPtr2 = RefDataPtr1 + cpi->HalfPixelRef2Offset[i];
HalfPixelError =
- GetHalfPixelSumAbsDiffs( SrcPtr[3], RefDataPtr1, RefDataPtr2,
+ GetHalfPixelSumAbsDiffs(cpi, SrcPtr[3], RefDataPtr1, RefDataPtr2,
PixelsPerLine, HalfPixelError, BestHalfPixelError );
}
@@ -652,6 +463,8 @@
InterMVError = GetMBInterError( cpi, cpi->ConvDestBuffer, RefFramePtr,
FragIndex, MV->x, MV->y, PixelsPerLine );
+ dsp_restore_fpu (cpi->dsp);
+
/* Return score of best matching block. */
return InterMVError;
}
@@ -684,6 +497,8 @@
unsigned char * RefDataPtr1;
unsigned char * RefDataPtr2;
+ dsp_save_fpu (cpi->dsp);
+
/* Note which of the four blocks in the macro block are to be
included in the search. */
MBlockDispFrags[0] = cpi->
@@ -717,20 +532,20 @@
/* Summ errors for each block. */
if ( MBlockDispFrags[0] ) {
- Error = GetSumAbsDiffs( SrcPtr[0], CandidateBlockPtr,
- PixelsPerLine, Error);
+ Error += dsp_sad8x8 (cpi->dsp, SrcPtr[0], PixelsPerLine, CandidateBlockPtr,
+ PixelsPerLine + STRIDE_EXTRA);
}
if ( MBlockDispFrags[1] ){
- Error = GetSumAbsDiffs( SrcPtr[1], CandidateBlockPtr + 8,
- PixelsPerLine, Error);
+ Error += dsp_sad8x8 (cpi->dsp, SrcPtr[1], PixelsPerLine, CandidateBlockPtr + 8,
+ PixelsPerLine + STRIDE_EXTRA);
}
if ( MBlockDispFrags[2] ){
- Error = GetSumAbsDiffs( SrcPtr[2], CandidateBlockPtr + RefRow2Offset,
- PixelsPerLine, Error);
+ Error += dsp_sad8x8 (cpi->dsp, SrcPtr[2], PixelsPerLine, CandidateBlockPtr + RefRow2Offset,
+ PixelsPerLine + STRIDE_EXTRA);
}
if ( MBlockDispFrags[3] ){
- Error = GetSumAbsDiffs( SrcPtr[3], CandidateBlockPtr + RefRow2Offset + 8,
- PixelsPerLine, Error);
+ Error += dsp_sad8x8 (cpi->dsp, SrcPtr[3], PixelsPerLine, CandidateBlockPtr + RefRow2Offset + 8,
+ PixelsPerLine + STRIDE_EXTRA);
}
/* Was this the best so far */
@@ -766,7 +581,7 @@
RefDataPtr1 = BestBlockPtr;
RefDataPtr2 = RefDataPtr1 + cpi->HalfPixelRef2Offset[i];
HalfPixelError =
- GetHalfPixelSumAbsDiffs( SrcPtr[0], RefDataPtr1, RefDataPtr2,
+ GetHalfPixelSumAbsDiffs(cpi, SrcPtr[0], RefDataPtr1, RefDataPtr2,
PixelsPerLine, HalfPixelError, BestHalfPixelError );
}
@@ -774,7 +589,7 @@
RefDataPtr1 = BestBlockPtr + 8;
RefDataPtr2 = RefDataPtr1 + cpi->HalfPixelRef2Offset[i];
HalfPixelError =
- GetHalfPixelSumAbsDiffs( SrcPtr[1], RefDataPtr1, RefDataPtr2,
+ GetHalfPixelSumAbsDiffs(cpi, SrcPtr[1], RefDataPtr1, RefDataPtr2,
PixelsPerLine, HalfPixelError, BestHalfPixelError );
}
@@ -782,7 +597,7 @@
RefDataPtr1 = BestBlockPtr + RefRow2Offset;
RefDataPtr2 = RefDataPtr1 + cpi->HalfPixelRef2Offset[i];
HalfPixelError =
- GetHalfPixelSumAbsDiffs( SrcPtr[2], RefDataPtr1, RefDataPtr2,
+ GetHalfPixelSumAbsDiffs(cpi, SrcPtr[2], RefDataPtr1, RefDataPtr2,
PixelsPerLine, HalfPixelError, BestHalfPixelError );
}
@@ -790,7 +605,7 @@
RefDataPtr1 = BestBlockPtr + RefRow2Offset + 8;
RefDataPtr2 = RefDataPtr1 + cpi->HalfPixelRef2Offset[i];
HalfPixelError =
- GetHalfPixelSumAbsDiffs( SrcPtr[3], RefDataPtr1, RefDataPtr2,
+ GetHalfPixelSumAbsDiffs(cpi, SrcPtr[3], RefDataPtr1, RefDataPtr2,
PixelsPerLine, HalfPixelError, BestHalfPixelError );
}
@@ -808,6 +623,8 @@
InterMVError = GetMBInterError( cpi, cpi->ConvDestBuffer, RefFramePtr,
FragIndex, MV->x, MV->y, PixelsPerLine );
+ dsp_restore_fpu (cpi->dsp);
+
/* Return score of best matching block. */
return InterMVError;
}
@@ -850,8 +667,8 @@
for ( j = 0; j < (ogg_int32_t)MAX_MV_EXTENT; j++ ){
/* Get the block error score. */
- Error = GetSumAbsDiffs( SrcPtr, CandidateBlockPtr,
- PixelsPerLine, 0);
+ Error = dsp_sad8x8 (cpi->dsp, SrcPtr, PixelsPerLine, CandidateBlockPtr,
+ PixelsPerLine + STRIDE_EXTRA);
/* Was this the best so far */
if ( Error < MinError ) {
@@ -881,7 +698,7 @@
for ( i=0; i < 9; i++ ) {
RefDataPtr2 = BestBlockPtr + cpi->HalfPixelRef2Offset[i];
HalfPixelError =
- GetHalfPixelSumAbsDiffs( SrcPtr, BestBlockPtr, RefDataPtr2,
+ GetHalfPixelSumAbsDiffs(cpi, SrcPtr, BestBlockPtr, RefDataPtr2,
PixelsPerLine, 0, BestHalfPixelError );
if ( HalfPixelError < BestHalfPixelError ){
@@ -898,7 +715,7 @@
RefDataPtr2 = BestBlockPtr + cpi->HalfPixelRef2Offset[BestHalfOffset];
InterMVError =
- GetInterErr( SrcPtr, BestBlockPtr, RefDataPtr2, PixelsPerLine );
+ GetInterErr(cpi, SrcPtr, BestBlockPtr, RefDataPtr2, PixelsPerLine );
/* Return score of best matching block. */
return InterMVError;
@@ -911,6 +728,8 @@
MOTION_VECTOR *MV ) {
ogg_uint32_t InterMVError;
+ dsp_save_fpu (cpi->dsp);
+
/* For the moment the 4MV mode is only deemed to be valid
if all four Y blocks are to be updated */
/* This may be adapted later. */
@@ -941,6 +760,8 @@
InterMVError = HUGE_ERROR;
}
+ dsp_restore_fpu (cpi->dsp);
+
/* Return score of best matching block. */
return InterMVError;
}
Modified: trunk/theora/lib/pp.c
===================================================================
--- trunk/theora/lib/pp.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/pp.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -19,6 +19,7 @@
#include <string.h>
#include "codec_internal.h"
#include "pp.h"
+#include "dsp.h"
#define MAX(a, b) ((a>b)?a:b)
#define MIN(a, b) ((a<b)?a:b)
@@ -150,10 +151,12 @@
}
-void InitPPInstance(PP_INSTANCE *ppi){
+void InitPPInstance(PP_INSTANCE *ppi, DspFunctions *funcs){
memset(ppi,0,sizeof(*ppi));
+ memcpy(&ppi->dsp, funcs, sizeof(DspFunctions));
+
/* Initializations */
ppi->PrevFrameLimit = 3; /* Must not exceed MAX_PREV_FRAMES (Note
that this number includes the current
@@ -490,7 +493,7 @@
} else {
- CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
+ dsp_copy8x8(pbi->dsp, SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
}
@@ -529,7 +532,7 @@
DeringBlockWeak(SrcPtr + 8 * col, DestPtr + 8 * col,
LineLength,Quality,QuantScale);
}else{
- CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
+ dsp_copy8x8(pbi->dsp, SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
}
++Block;
@@ -565,7 +568,7 @@
DeringBlockWeak(SrcPtr + 8 * col, DestPtr + 8 * col,
LineLength,Quality,QuantScale);
}else{
- CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
+ dsp_copy8x8(pbi->dsp, SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
}
++Block;
Modified: trunk/theora/lib/reconstruct.c
===================================================================
--- trunk/theora/lib/reconstruct.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/reconstruct.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -16,12 +16,28 @@
********************************************************************/
#include "codec_internal.h"
+#include "dsp.h"
+#include "cpu.h"
-void ReconIntra( PB_INSTANCE *pbi, unsigned char * ReconPtr,
- ogg_int16_t * ChangePtr, ogg_uint32_t LineStep ) {
+static void copy8x8__c (unsigned char *src,
+ unsigned char *dest,
+ unsigned int stride)
+{
+ int j;
+ for ( j = 0; j < 8; j++ ){
+ ((ogg_uint32_t*)dest)[0] = ((ogg_uint32_t*)src)[0];
+ ((ogg_uint32_t*)dest)[1] = ((ogg_uint32_t*)src)[1];
+ src+=stride;
+ dest+=stride;
+ }
+}
+
+static void recon_intra8x8__c (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
+ ogg_uint32_t LineStep)
+{
ogg_uint32_t i;
- for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++ ){
+ for (i = 8; i; i--){
/* Convert the data back to 8 bit unsigned */
/* Saturate the output to unsigend 8 bit values */
ReconPtr[0] = clamp255( ChangePtr[0] + 128 );
@@ -34,17 +50,16 @@
ReconPtr[7] = clamp255( ChangePtr[7] + 128 );
ReconPtr += LineStep;
- ChangePtr += BLOCK_HEIGHT_WIDTH;
+ ChangePtr += 8;
}
-
}
-void ReconInter( PB_INSTANCE *pbi, unsigned char * ReconPtr,
- unsigned char * RefPtr, ogg_int16_t * ChangePtr,
- ogg_uint32_t LineStep ) {
+static void recon_inter8x8__c (unsigned char *ReconPtr, unsigned char *RefPtr,
+ ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
+{
ogg_uint32_t i;
- for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++) {
+ for (i = 8; i; i--){
ReconPtr[0] = clamp255(RefPtr[0] + ChangePtr[0]);
ReconPtr[1] = clamp255(RefPtr[1] + ChangePtr[1]);
ReconPtr[2] = clamp255(RefPtr[2] + ChangePtr[2]);
@@ -54,19 +69,19 @@
ReconPtr[6] = clamp255(RefPtr[6] + ChangePtr[6]);
ReconPtr[7] = clamp255(RefPtr[7] + ChangePtr[7]);
- ChangePtr += BLOCK_HEIGHT_WIDTH;
+ ChangePtr += 8;
ReconPtr += LineStep;
RefPtr += LineStep;
}
-
}
-void ReconInterHalfPixel2( PB_INSTANCE *pbi, unsigned char * ReconPtr,
- unsigned char * RefPtr1, unsigned char * RefPtr2,
- ogg_int16_t * ChangePtr, ogg_uint32_t LineStep ) {
+static void recon_inter8x8_half__c (unsigned char *ReconPtr, unsigned char *RefPtr1,
+ unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
+ ogg_uint32_t LineStep)
+{
ogg_uint32_t i;
- for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++ ){
+ for (i = 8; i; i--){
ReconPtr[0] = clamp255((((int)RefPtr1[0] + (int)RefPtr2[0]) >> 1) + ChangePtr[0] );
ReconPtr[1] = clamp255((((int)RefPtr1[1] + (int)RefPtr2[1]) >> 1) + ChangePtr[1] );
ReconPtr[2] = clamp255((((int)RefPtr1[2] + (int)RefPtr2[2]) >> 1) + ChangePtr[2] );
@@ -76,10 +91,22 @@
ReconPtr[6] = clamp255((((int)RefPtr1[6] + (int)RefPtr2[6]) >> 1) + ChangePtr[6] );
ReconPtr[7] = clamp255((((int)RefPtr1[7] + (int)RefPtr2[7]) >> 1) + ChangePtr[7] );
- ChangePtr += BLOCK_HEIGHT_WIDTH;
+ ChangePtr += 8;
ReconPtr += LineStep;
RefPtr1 += LineStep;
RefPtr2 += LineStep;
}
+}
+void dsp_recon_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
+{
+ funcs->copy8x8 = copy8x8__c;
+ funcs->recon_intra8x8 = recon_intra8x8__c;
+ funcs->recon_inter8x8 = recon_inter8x8__c;
+ funcs->recon_inter8x8_half = recon_inter8x8_half__c;
+#if (defined(__i386__) || defined(__x86_64__))
+ if (cpu_flags & CPU_X86_MMX) {
+ dsp_mmx_recon_init(funcs);
+ }
+#endif
}
Modified: trunk/theora/lib/scan.c
===================================================================
--- trunk/theora/lib/scan.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/scan.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -19,9 +19,20 @@
#include <math.h>
#include <string.h>
#include "codec_internal.h"
+#include "dsp.h"
#define MAX_SEARCH_LINE_LEN 7
+#define SET8_0(ptr) \
+ ((ogg_uint32_t *)ptr)[0] = 0x00000000; \
+ ((ogg_uint32_t *)ptr)[1] = 0x00000000;
+#define SET8_1(ptr) \
+ ((ogg_uint32_t *)ptr)[0] = 0x01010101; \
+ ((ogg_uint32_t *)ptr)[1] = 0x01010101;
+#define SET8_8(ptr) \
+ ((ogg_uint32_t *)ptr)[0] = 0x08080808; \
+ ((ogg_uint32_t *)ptr)[1] = 0x08080808;
+
static ogg_uint32_t LineLengthScores[ MAX_SEARCH_LINE_LEN + 1 ] = {
0, 0, 0, 0, 2, 4, 12, 24
};
@@ -384,69 +395,6 @@
ppi->KFIndicator = ((ppi->KFIndicator*100)/((ppi->ScanYPlaneFragments*3)/4));
}
-static ogg_uint32_t ScalarRowSAD( unsigned char * Src1,
- unsigned char * Src2 ){
- ogg_uint32_t SadValue;
- ogg_uint32_t SadValue1;
-
- SadValue = abs( Src1[0] - Src2[0] ) + abs( Src1[1] - Src2[1] ) +
- abs( Src1[2] - Src2[2] ) + abs( Src1[3] - Src2[3] );
-
- SadValue1 = abs( Src1[4] - Src2[4] ) + abs( Src1[5] - Src2[5] ) +
- abs( Src1[6] - Src2[6] ) + abs( Src1[7] - Src2[7] );
-
- SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1;
-
- return SadValue;
-}
-
-static ogg_uint32_t ScalarColSAD( PP_INSTANCE *ppi,
- unsigned char * Src1,
- unsigned char * Src2 ){
- ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0};
- ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0};
- ogg_uint32_t MaxSad = 0;
- ogg_uint32_t i;
-
- for ( i = 0; i < 4; i++ ){
- SadValue[0] += abs(Src1[0] - Src2[0]);
- SadValue[1] += abs(Src1[1] - Src2[1]);
- SadValue[2] += abs(Src1[2] - Src2[2]);
- SadValue[3] += abs(Src1[3] - Src2[3]);
- SadValue[4] += abs(Src1[4] - Src2[4]);
- SadValue[5] += abs(Src1[5] - Src2[5]);
- SadValue[6] += abs(Src1[6] - Src2[6]);
- SadValue[7] += abs(Src1[7] - Src2[7]);
-
- Src1 += ppi->PlaneStride;
- Src2 += ppi->PlaneStride;
- }
-
- for ( i = 0; i < 4; i++ ){
- SadValue2[0] += abs(Src1[0] - Src2[0]);
- SadValue2[1] += abs(Src1[1] - Src2[1]);
- SadValue2[2] += abs(Src1[2] - Src2[2]);
- SadValue2[3] += abs(Src1[3] - Src2[3]);
- SadValue2[4] += abs(Src1[4] - Src2[4]);
- SadValue2[5] += abs(Src1[5] - Src2[5]);
- SadValue2[6] += abs(Src1[6] - Src2[6]);
- SadValue2[7] += abs(Src1[7] - Src2[7]);
-
- Src1 += ppi->PlaneStride;
- Src2 += ppi->PlaneStride;
- }
-
- for ( i = 0; i < 8; i++ ){
- if ( SadValue[i] > MaxSad )
- MaxSad = SadValue[i];
- if ( SadValue2[i] > MaxSad )
- MaxSad = SadValue2[i];
- }
-
- return MaxSad;
-}
-
-
static int RowSadScan( PP_INSTANCE *ppi,
unsigned char * YuvPtr1,
unsigned char * YuvPtr2,
@@ -475,7 +423,7 @@
for ( i = 0; i < ppi->PlaneHFragments; i ++ ){
if ( *LocalDispFragPtr <= BLOCK_NOT_CODED ){
/* Calculate the SAD score for the block row */
- GrpSad = ScalarRowSAD(LocalYuvPtr1,LocalYuvPtr2);
+ GrpSad = dsp_row_sad8(ppi->dsp, LocalYuvPtr1,LocalYuvPtr2);
/* Now test the group SAD score */
if ( GrpSad > LocalGrpLowSadThresh ){
@@ -532,7 +480,7 @@
/* Skip if block already marked to be coded. */
if ( *LocalDispFragPtr <= BLOCK_NOT_CODED ){
/* Calculate the SAD score for the block column */
- MaxSad = ScalarColSAD( ppi, LocalYuvPtr1, LocalYuvPtr2 );
+ MaxSad = dsp_col_sad8x8(ppi->dsp, LocalYuvPtr1, LocalYuvPtr2, ppi->PlaneStride );
/* Now test the group SAD score */
if ( MaxSad > LocalGrpLowSadThresh ){
@@ -758,7 +706,7 @@
if (*DispFragPtr == CANDIDATE_BLOCK){
/* Clear down entries in changed locals array */
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
for ( j = 0; j < HFRAGPIXELS; j++ ){
/* Take a local copy of the measured difference. */
@@ -777,10 +725,10 @@
}else{
/* If we are breaking out here mark all pixels as changed. */
if ( *DispFragPtr > BLOCK_NOT_CODED ){
- memset(bits_map_ptr,1,8);
- memset(ChLocalsPtr,8,8);
+ SET8_1(bits_map_ptr);
+ SET8_8(ChLocalsPtr);
}else{
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
}
}
@@ -816,7 +764,7 @@
/* Test for break out conditions to save time. */
if (*DispFragPtr == CANDIDATE_BLOCK){
/* Clear down entries in changed locals array */
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
for ( j = 0; j < HFRAGPIXELS; j++ ){
/* Take a local copy of the measured difference. */
@@ -839,10 +787,10 @@
}else{
/* If we are breaking out here mark all pixels as changed. */
if ( *DispFragPtr > BLOCK_NOT_CODED ){
- memset(bits_map_ptr,1,8);
- memset(ChLocalsPtr,8,8);
+ SET8_1(bits_map_ptr);
+ SET8_8(ChLocalsPtr);
}else{
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
}
}
@@ -876,7 +824,7 @@
/* Test for break out conditions to save time. */
if (*DispFragPtr == CANDIDATE_BLOCK){
/* Clear down entries in changed locals array */
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
for ( j = 0; j < HFRAGPIXELS; j++ ){
/* Take a local copy of the measured difference. */
Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
@@ -899,10 +847,10 @@
}else{
/* If we are breaking out here mark all pixels as changed. */
if ( *DispFragPtr > BLOCK_NOT_CODED ){
- memset(bits_map_ptr,1,8);
- memset(ChLocalsPtr,8,8);
+ SET8_1(bits_map_ptr);
+ SET8_8(ChLocalsPtr);
}else{
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
}
}
@@ -935,7 +883,7 @@
/* Test for break out conditions to save time. */
if (*DispFragPtr == CANDIDATE_BLOCK){
/* Clear down entries in changed locals array */
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
for ( j = 0; j < HFRAGPIXELS; j++ ){
/* Take a local copy of the measured difference. */
@@ -959,10 +907,10 @@
}else{
/* If we are breaking out here mark all pixels as changed.*/
if ( *DispFragPtr > BLOCK_NOT_CODED ) {
- memset(bits_map_ptr,1,8);
- memset(ChLocalsPtr,8,8);
+ SET8_1(bits_map_ptr);
+ SET8_8(ChLocalsPtr);
}else{
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
}
}
/* If we have a lot of changed pixels for this fragment on this
@@ -1071,7 +1019,7 @@
}
}else{
if ( *DispFragPtr > BLOCK_NOT_CODED )
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
/* Step pointers */
ChLocalsPtr += HFRAGPIXELS;
@@ -1133,7 +1081,7 @@
}
}else{
if ( *DispFragPtr > BLOCK_NOT_CODED )
- memset(ChLocalsPtr,0,8);
+ SET8_0(ChLocalsPtr);
/* Step pointers */
ChLocalsPtr += HFRAGPIXELS;
@@ -2126,10 +2074,12 @@
/* Fast break out test for obvious yes and no cases in this row of
blocks */
if ( i < ppi->PlaneVFragments ){
+ dsp_save_fpu (ppi->dsp);
UpdatedOrCandidateBlocks =
RowSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 );
- if( ColSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 ) )
- UpdatedOrCandidateBlocks = 1;
+ UpdatedOrCandidateBlocks |=
+ ColSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 );
+ dsp_restore_fpu (ppi->dsp);
}else{
/* Make sure we still call other functions if RowSadScan() disabled */
UpdatedOrCandidateBlocks = 1;
Modified: trunk/theora/lib/toplevel.c
===================================================================
--- trunk/theora/lib/toplevel.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/toplevel.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -23,6 +23,7 @@
#include <string.h>
#include "theora/theora.h"
#include "toplevel.h"
+#include "dsp.h"
static int _ilog(unsigned int v){
int ret=0;
@@ -309,6 +310,9 @@
th->internal_encode=NULL;
InitPBInstance(pbi);
+
+ dsp_static_init (&pbi->dsp);
+
memcpy(&pbi->info,c,sizeof(*c));
pbi->info.codec_setup=NULL;
th->i=&pbi->info;
Copied: trunk/theora/lib/x86_32 (from rev 11426, branches/theora-mmx/lib/x86_32)
Deleted: trunk/theora/lib/x86_32/dsp_mmx.c
===================================================================
--- branches/theora-mmx/lib/x86_32/dsp_mmx.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/x86_32/dsp_mmx.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -1,644 +0,0 @@
-/********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
- * *
- ********************************************************************
-
- function:
- last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "dsp.h"
-
-static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x0080008000800080LL;
-
-#if defined(__MINGW32__) || defined(__CYGWIN__) || \
- defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
-# define M(a) "_" #a
-#else
-# define M(a) #a
-#endif
-
-#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
-#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
-#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
-
-static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
- ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
-{
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm7, %%mm7 \n\t"
-
- ".rept 8 \n\t"
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */
- /* start calculation */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */
- " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */
- " movq %%mm0, (%2) \n\t" /* write answer out */
- " movq %%mm2, 8(%2) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %2 \n\t"
- " add %3, %0 \n\t"
- " add %4, %1 \n\t"
- ".endr \n\t"
-
- : "+r" (FiltPtr),
- "+r" (ReconPtr),
- "+r" (DctInputPtr)
- : "m" (PixelsPerLine),
- "m" (ReconPixelsPerLine)
- : "memory"
- );
-}
-
-static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
- ogg_uint32_t PixelsPerLine)
-{
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm7, %%mm7 \n\t"
- " movq "M(V128)", %%mm1 \n\t"
-
- ".rept 8 \n\t"
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */
- " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */
- " movq %%mm0, (%1) \n\t" /* write answer out */
- " movq %%mm2, 8(%1) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %1 \n\t"
- " add %2, %0 \n\t"
- ".endr \n\t"
-
- : "+r" (FiltPtr),
- "+r" (DctInputPtr)
- : "m" (PixelsPerLine)
- : "memory"
- );
-}
-
-static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
- unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
- ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
-{
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm7, %%mm7 \n\t"
-
- ".rept 8 \n\t"
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr1 */
- " movq (%2), %%mm4 \n\t" /* mm1 = ReconPtr2 */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
- " movq %%mm4, %%mm5 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr1) */
- " punpcklbw %%mm7, %%mm4 \n\t" /* mm1 = INT16(ReconPtr2) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr1) */
- " punpckhbw %%mm7, %%mm5 \n\t" /* mm3 = INT16(ReconPtr2) */
- /* average ReconPtr1 and ReconPtr2 */
- " paddw %%mm4, %%mm1 \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */
- " paddw %%mm5, %%mm3 \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */
- " psrlw $1, %%mm1 \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- " psrlw $1, %%mm3 \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- " movq %%mm0, (%3) \n\t" /* write answer out */
- " movq %%mm2, 8(%3) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %3 \n\t"
- " add %4, %0 \n\t"
- " add %5, %1 \n\t"
- " add %5, %2 \n\t"
- ".endr \n\t"
-
- : "+r" (FiltPtr),
- "+r" (ReconPtr1),
- "+r" (ReconPtr2),
- "+r" (DctInputPtr)
- : "m" (PixelsPerLine),
- "m" (ReconPixelsPerLine)
- : "memory"
- );
-}
-
-static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
-{
- ogg_uint32_t MaxSad;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
- " pxor %%mm7, %%mm7 \n\t" /* zero out mm7 for unpack */
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
-
- " movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
-
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t" /* ; unpack low four bytes to higher precision */
- " punpckhbw %%mm7, %%mm1 \n\t" /* ; unpack high four bytes to higher precision */
-
- " movq %%mm0, %%mm2 \n\t"
- " movq %%mm1, %%mm3 \n\t"
- " psrlq $32, %%mm2 \n\t" /* fold and add */
- " psrlq $32, %%mm3 \n\t"
- " paddw %%mm2, %%mm0 \n\t"
- " paddw %%mm3, %%mm1 \n\t"
- " movq %%mm0, %%mm2 \n\t"
- " movq %%mm1, %%mm3 \n\t"
- " psrlq $16, %%mm2 \n\t"
- " psrlq $16, %%mm3 \n\t"
- " paddw %%mm2, %%mm0 \n\t"
- " paddw %%mm3, %%mm1 \n\t"
-
- " psubusw %%mm0, %%mm1 \n\t"
- " paddw %%mm0, %%mm1 \n\t" /* mm1 = max(mm1, mm0) */
- " movd %%mm1, %0 \n\t"
- " andl $0xffff, %0 \n\t"
-
- : "=m" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
- :
- : "memory"
- );
- return MaxSad;
-}
-
-static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
- ogg_uint32_t stride)
-{
- ogg_uint32_t MaxSad;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
- " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
- " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
- " mov $4, %%edi \n\t" /* 4 rows */
- "1: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
-
- " movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
-
- " dec %%edi \n\t"
- " jnz 1b \n\t"
-
- " mov $4, %%edi \n\t" /* 4 rows */
- "2: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
-
- " movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
-
- " dec %%edi \n\t"
- " jnz 2b \n\t"
-
- " psubusw %%mm6, %%mm7 \n\t"
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm7, mm6) */
- " psubusw %%mm4, %%mm5 \n\t"
- " paddw %%mm4, %%mm5 \n\t" /* mm5 = max(mm5, mm4) */
- " psubusw %%mm5, %%mm7 \n\t"
- " paddw %%mm5, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
- " movq %%mm7, %%mm6 \n\t"
- " psrlq $32, %%mm6 \n\t"
- " psubusw %%mm6, %%mm7 \n\t"
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
- " movq %%mm7, %%mm6 \n\t"
- " psrlq $16, %%mm6 \n\t"
- " psubusw %%mm6, %%mm7 \n\t"
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
- " movd %%mm7, %0 \n\t"
- " andl $0xffff, %0 \n\t"
-
- : "=r" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
- : "r" (stride)
- : "memory", "edi"
- );
-
- return MaxSad;
-}
-/*
- * sad8x8__mmx: sum of absolute differences (SAD) over an 8x8 block of bytes.
- *
- * ptr1, ptr2        first row of each block
- * stride1, stride2  added to ptr1 / ptr2 after every row
- *
- * Each row's eight |a - b| bytes are widened to 16 bits and accumulated in
- * the four word lanes of mm7; the lanes are then added together and the
- * low 16 bits of the total are returned.
- *
- * No emms is executed here, so MMX state is still live on return.
- * NOTE(review): mm0-mm2, mm6 and mm7 are written but no MMX register is
- * named in the clobber list -- confirm this is acceptable.
- */
-static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2)
-{
- ogg_uint32_t DiffVal;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
- " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- ".rept 8 \n\t" /* assembler-level repeat: one copy of the body per row */
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " movq %%mm0, %%mm2 \n\t"
-
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
- /* fold the four 16-bit partial sums of mm7 into its low word */
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $32, %%mm7 \n\t"
- " paddw %%mm0, %%mm7 \n\t"
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $16, %%mm7 \n\t"
- " paddw %%mm0, %%mm7 \n\t"
- " movd %%mm7, %0 \n\t"
- " andl $0xffff, %0 \n\t" /* keep only the 16-bit total */
-
- : "=m" (DiffVal),
- "+r" (ptr1),
- "+r" (ptr2)
- : "r" (stride1),
- "r" (stride2)
- : "memory"
- );
-
- return DiffVal;
-}
-
-/*
- * sad8x8_thres__mmx: threshold-taking variant of the 8x8 SAD.
- *
- * The plain-MMX version has no early exit: `thres` is not consulted and the
- * complete SAD computed by sad8x8__mmx is returned.
- */
-static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
-                                       unsigned char *ptr2, ogg_uint32_t stride2,
-                                       ogg_uint32_t thres)
-{
-  ogg_uint32_t full_sad;
-
-  full_sad = sad8x8__mmx (ptr1, stride1, ptr2, stride2);
-  return full_sad;
-}
-/*
- * sad8x8_xy2_thres__mmx: 8x8 SAD between a source block and the per-byte
- * average of two reference blocks.
- *
- * SrcData advances by SrcStride per row; RefDataPtr1 and RefDataPtr2 both
- * advance by RefStride. The average is formed as
- * (a & b) + (((a ^ b) & 0xfe) >> 1), which rounds down. `thres` is accepted
- * but never read. Returns the low 16 bits of the accumulated SAD.
- *
- * edi is the row counter (declared clobbered); no emms is executed here.
- */
-static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
- ogg_uint32_t thres)
-{
- ogg_uint32_t DiffVal;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pcmpeqd %%mm5, %%mm5 \n\t" /* fefefefefefefefe in mm5 */
- " paddb %%mm5, %%mm5 \n\t"
-
- " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- " mov $8, %%edi \n\t" /* 8 rows */
- "1: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
-
- " movq (%2), %%mm2 \n\t"
- " movq (%3), %%mm3 \n\t" /* take average of mm2 and mm3 */
- " movq %%mm2, %%mm1 \n\t"
- " pand %%mm3, %%mm1 \n\t" /* bits common to both */
- " pxor %%mm2, %%mm3 \n\t" /* bits that differ */
- " pand %%mm5, %%mm3 \n\t" /* clear bit 0 of each byte so the 64-bit shift cannot leak across bytes */
- " psrlq $1, %%mm3 \n\t"
- " paddb %%mm3, %%mm1 \n\t" /* mm1 = average, rounded down */
-
- " movq %%mm0, %%mm2 \n\t"
-
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " add %4, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %5, %2 \n\t" /* Inc pointer into ref data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
-
- " dec %%edi \n\t"
- " jnz 1b \n\t"
- /* fold the four 16-bit partial sums of mm7 into its low word */
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $32, %%mm7 \n\t"
- " paddw %%mm0, %%mm7 \n\t"
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $16, %%mm7 \n\t"
- " paddw %%mm0, %%mm7 \n\t"
- " movd %%mm7, %0 \n\t"
- " andl $0xffff, %0 \n\t"
-
- : "=m" (DiffVal),
- "+r" (SrcData),
- "+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
- : "m" (SrcStride),
- "m" (RefStride)
- : "edi", "memory"
- );
-
- return DiffVal;
-}
-/*
- * intra8x8_err__mmx: spread of the pixel values inside one 8x8 block.
- *
- * XSum is the sum of the 64 bytes and XXSum the sum of their squares. The
- * return value is 64*XXSum - XSum*XSum, i.e. the population variance
- * multiplied by 64*64 (no division is performed).
- *
- * DataPtr advances by Stride per row. edi is the row counter and is also
- * used to sign-extend the 16-bit sum; no emms is executed here.
- */
-static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
-{
- ogg_uint32_t XSum;
- ogg_uint32_t XXSum;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm5, %%mm5 \n\t" /* mm5 = running sum, four 16-bit lanes */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 = zero for unpack */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 = running sum of squares, two 32-bit lanes */
- " mov $8, %%edi \n\t"
- "1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
- " movq %%mm0, %%mm2 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t"
- " punpckhbw %%mm6, %%mm2 \n\t"
-
- " paddw %%mm0, %%mm5 \n\t"
- " paddw %%mm2, %%mm5 \n\t"
-
- " pmaddwd %%mm0, %%mm0 \n\t" /* square each word and add adjacent pairs */
- " pmaddwd %%mm2, %%mm2 \n\t"
-
- " paddd %%mm0, %%mm7 \n\t"
- " paddd %%mm2, %%mm7 \n\t"
-
- " add %3, %2 \n\t" /* Inc pointer into src data */
-
- " dec %%edi \n\t"
- " jnz 1b \n\t"
- /* horizontal add of the four word lanes of mm5 */
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $32, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $16, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movd %%mm5, %%edi \n\t"
- " movsx %%di, %%edi \n\t" /* keep the low word, sign-extended */
- " movl %%edi, %0 \n\t"
- /* horizontal add of the two dword lanes of mm7 */
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $32, %%mm7 \n\t"
- " paddd %%mm0, %%mm7 \n\t"
- " movd %%mm7, %1 \n\t"
-
- : "=r" (XSum),
- "=r" (XXSum),
- "+r" (DataPtr)
- : "r" (Stride)
- : "edi", "memory"
- );
-
- /* Compute population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ) );
-}
-/*
- * inter8x8_err__mmx: spread of the difference between a source block and a
- * reference block, both 8x8 bytes.
- *
- * For every pixel d = src - ref is formed as a signed 16-bit value. XSum is
- * the sum of the 64 differences (sign-extended from 16 bits before being
- * stored) and XXSum the sum of their squares. The return value is
- * 64*XXSum - XSum*XSum in unsigned 32-bit arithmetic, i.e. the population
- * variance of d multiplied by 64*64.
- *
- * SrcData advances by SrcStride and RefDataPtr by RefStride per row.
- * edi is the row counter; no emms is executed here.
- */
-static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr, ogg_uint32_t RefStride)
-{
- ogg_uint32_t XSum;
- ogg_uint32_t XXSum;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm5, %%mm5 \n\t" /* mm5 = running sum, four 16-bit lanes */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 = zero for unpack */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 = running sum of squares, two 32-bit lanes */
- " mov $8, %%edi \n\t"
- "1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
- " movq (%3), %%mm1 \n\t"
- " movq %%mm0, %%mm2 \n\t"
- " movq %%mm1, %%mm3 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t"
- " punpcklbw %%mm6, %%mm1 \n\t"
- " punpckhbw %%mm6, %%mm2 \n\t"
- " punpckhbw %%mm6, %%mm3 \n\t"
-
- " psubsw %%mm1, %%mm0 \n\t" /* src - ref, low four pixels */
- " psubsw %%mm3, %%mm2 \n\t" /* src - ref, high four pixels */
-
- " paddw %%mm0, %%mm5 \n\t"
- " paddw %%mm2, %%mm5 \n\t"
-
- " pmaddwd %%mm0, %%mm0 \n\t"
- " pmaddwd %%mm2, %%mm2 \n\t"
-
- " paddd %%mm0, %%mm7 \n\t"
- " paddd %%mm2, %%mm7 \n\t"
-
- " add %4, %2 \n\t" /* Inc pointer into src data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
-
- " dec %%edi \n\t"
- " jnz 1b \n\t"
- /* horizontal add of the four word lanes of mm5 */
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $32, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $16, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movd %%mm5, %%edi \n\t"
- " movsx %%di, %%edi \n\t" /* the sum may be negative: sign-extend the low word */
- " movl %%edi, %0 \n\t"
- /* horizontal add of the two dword lanes of mm7 */
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $32, %%mm7 \n\t"
- " paddd %%mm0, %%mm7 \n\t"
- " movd %%mm7, %1 \n\t"
-
- : "=m" (XSum),
- "=m" (XXSum),
- "+r" (SrcData),
- "+r" (RefDataPtr)
- : "m" (SrcStride),
- "m" (RefStride)
- : "edi", "memory"
- );
-
- /* Compute and return population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ));
-}
-/*
- * inter8x8_err_xy2__mmx: like inter8x8_err__mmx, but the reference is the
- * per-byte average of two reference blocks.
- *
- * The average is (a & b) + (((a ^ b) & 0xfe) >> 1), which rounds down.
- * With d = src - average for each of the 64 pixels, XSum is the sum of d
- * (sign-extended from 16 bits) and XXSum the sum of d*d; the return value
- * is 64*XXSum - XSum*XSum in unsigned 32-bit arithmetic.
- *
- * SrcData advances by SrcStride per row; RefDataPtr1 and RefDataPtr2 both
- * advance by RefStride. edi is the row counter; no emms is executed here.
- */
-static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
-{
- ogg_uint32_t XSum;
- ogg_uint32_t XXSum;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4 */
- " paddb %%mm4, %%mm4 \n\t"
- " pxor %%mm5, %%mm5 \n\t" /* mm5 = running sum, four 16-bit lanes */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 = zero for unpack */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 = running sum of squares, two 32-bit lanes */
- " mov $8, %%edi \n\t"
- "1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
-
- " movq (%3), %%mm2 \n\t"
- " movq (%4), %%mm3 \n\t" /* take average of mm2 and mm3 */
- " movq %%mm2, %%mm1 \n\t"
- " pand %%mm3, %%mm1 \n\t"
- " pxor %%mm2, %%mm3 \n\t"
- " pand %%mm4, %%mm3 \n\t" /* clear bit 0 of each byte before the 64-bit shift */
- " psrlq $1, %%mm3 \n\t"
- " paddb %%mm3, %%mm1 \n\t" /* mm1 = average, rounded down */
-
- " movq %%mm0, %%mm2 \n\t"
- " movq %%mm1, %%mm3 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t"
- " punpcklbw %%mm6, %%mm1 \n\t"
- " punpckhbw %%mm6, %%mm2 \n\t"
- " punpckhbw %%mm6, %%mm3 \n\t"
-
- " psubsw %%mm1, %%mm0 \n\t"
- " psubsw %%mm3, %%mm2 \n\t"
-
- " paddw %%mm0, %%mm5 \n\t"
- " paddw %%mm2, %%mm5 \n\t"
-
- " pmaddwd %%mm0, %%mm0 \n\t"
- " pmaddwd %%mm2, %%mm2 \n\t"
-
- " paddd %%mm0, %%mm7 \n\t"
- " paddd %%mm2, %%mm7 \n\t"
-
- " add %5, %2 \n\t" /* Inc pointer into src data */
- " add %6, %3 \n\t" /* Inc pointer into ref data */
- " add %6, %4 \n\t" /* Inc pointer into ref data */
-
- " dec %%edi \n\t"
- " jnz 1b \n\t"
- /* horizontal add of the four word lanes of mm5 */
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $32, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $16, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movd %%mm5, %%edi \n\t"
- " movsx %%di, %%edi \n\t"
- " movl %%edi, %0 \n\t"
- /* horizontal add of the two dword lanes of mm7 */
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $32, %%mm7 \n\t"
- " paddd %%mm0, %%mm7 \n\t"
- " movd %%mm7, %1 \n\t"
-
- : "=m" (XSum),
- "=m" (XXSum),
- "+r" (SrcData),
- "+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
- : "m" (SrcStride),
- "m" (RefStride)
- : "edi", "memory"
- );
-
- /* Compute and return population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ));
-}
-
-/*
- * restore_fpu: execute a single emms instruction.
- *
- * The SAD/error routines in this file leave MMX state live; emms marks the
- * x87 register tags empty again so floating-point code can follow.
- */
-static void restore_fpu (void)
-{
-  __asm__ __volatile__ (" emms \n\t");
-}
-
-/*
- * dsp_mmx_init: point the entries of *funcs at the plain-MMX routines.
- *
- * A one-line notice is written to stderr, then the twelve slots listed
- * below are overwritten; any other member of *funcs is left untouched.
- */
-void dsp_mmx_init(DspFunctions *funcs)
-{
-  fprintf(stderr, "enabling accelerated x86_32 mmx dsp functions.\n");
-
-  /* variance-style error metrics */
-  funcs->intra8x8_err     = intra8x8_err__mmx;
-  funcs->inter8x8_err     = inter8x8_err__mmx;
-  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
-
-  /* sums of absolute differences */
-  funcs->row_sad8         = row_sad8__mmx;
-  funcs->col_sad8x8       = col_sad8x8__mmx;
-  funcs->sad8x8           = sad8x8__mmx;
-  funcs->sad8x8_thres     = sad8x8_thres__mmx;
-  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx;
-
-  /* sub8x8 family */
-  funcs->sub8x8           = sub8x8__mmx;
-  funcs->sub8x8_128       = sub8x8_128__mmx;
-  funcs->sub8x8avg2       = sub8x8avg2__mmx;
-
-  /* emms wrapper */
-  funcs->restore_fpu      = restore_fpu;
-}
-
Copied: trunk/theora/lib/x86_32/dsp_mmx.c (from rev 11426, branches/theora-mmx/lib/x86_32/dsp_mmx.c)
Deleted: trunk/theora/lib/x86_32/dsp_mmxext.c
===================================================================
--- branches/theora-mmx/lib/x86_32/dsp_mmxext.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/x86_32/dsp_mmxext.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -1,318 +0,0 @@
-/********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
- * *
- ********************************************************************
-
- function:
- last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "dsp.h"
-/*
- * sad8x8__mmxext: 8x8 sum of absolute differences using psadbw.
- *
- * ptr1 advances by stride1 and ptr2 by stride2 after each of the first
- * seven rows; the eighth row is handled separately so the pointers are not
- * advanced a final time. psadbw produces one row's SAD per instruction and
- * the row results are accumulated in mm7, whose low dword is returned.
- *
- * No emms is executed here, so MMX state is still live on return.
- */
-static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2)
-{
- ogg_uint32_t DiffVal;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
-
- ".rept 7 \n\t" /* rows 0..6, each followed by a pointer step */
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
- /* row 7: same work without the pointer increments */
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " movd %%mm7, %0 \n\t"
-
- : "=r" (DiffVal),
- "+r" (ptr1),
- "+r" (ptr2)
- : "r" (stride1),
- "r" (stride2)
- : "memory"
- );
-
- return DiffVal;
-}
-/*
- * sad8x8_thres__mmxext: threshold-taking variant of the 8x8 SAD.
- *
- * `thres` is accepted but never read: all eight rows are always processed
- * and the complete SAD is returned. ptr1 advances by stride1 and ptr2 by
- * stride2 after every row, including the last.
- *
- * No emms is executed here, so MMX state is still live on return.
- */
-static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
- ogg_uint32_t thres)
-{
- ogg_uint32_t DiffVal;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
-
- ".rept 8 \n\t" /* assembler-level repeat: one copy of the body per row */
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
-
- " movd %%mm7, %0 \n\t"
-
- : "=r" (DiffVal),
- "+r" (ptr1),
- "+r" (ptr2)
- : "r" (stride1),
- "r" (stride2)
- : "memory"
- );
-
- return DiffVal;
-}
-/*
- * sad8x8_xy2_thres__mmxext: 8x8 SAD between a source block and the pavgb
- * average of two reference blocks.
- *
- * SrcData advances by SrcStride per row; RefDataPtr1 and RefDataPtr2 both
- * advance by RefStride. `thres` is accepted but never read.
- *
- * NOTE(review): pavgb computes (a + b + 1) >> 1, i.e. it rounds up, whereas
- * sad8x8_xy2_thres__mmx in dsp_mmx.c forms an average that rounds down.
- * The two can therefore return different values for the same input --
- * confirm which rounding callers expect.
- *
- * No emms is executed here, so MMX state is still live on return.
- */
-static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
- ogg_uint32_t thres)
-{
- ogg_uint32_t DiffVal;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- ".rept 8 \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " movq (%3), %%mm2 \n\t"
- " pavgb %%mm2, %%mm1 \n\t" /* mm1 = rounded-up average of the two references */
- " psadbw %%mm1, %%mm0 \n\t"
-
- " add %4, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %5, %2 \n\t" /* Inc pointer into ref data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
-
- " movd %%mm7, %0 \n\t"
- : "=m" (DiffVal),
- "+r" (SrcData),
- "+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
- : "m" (SrcStride),
- "m" (RefStride)
- : "memory"
- );
-
- return DiffVal;
-}
-/*
- * row_sad8__mmxext: compare one row of eight bytes from Src1 and Src2.
- *
- * The row is split in two halves of four bytes. A SAD is computed for
- * bytes 0..3 and another for bytes 4..7, and the larger of the two is
- * returned (masked to 16 bits).
- *
- * Src1 and Src2 are declared read/write operands but are not changed by
- * the asm. No emms is executed here.
- */
-static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
-{
- ogg_uint32_t MaxSad;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " movd (%1), %%mm0 \n\t" /* bytes 0..3; movd zeroes the upper half */
- " movd (%2), %%mm1 \n\t"
- " psadbw %%mm0, %%mm1 \n\t" /* mm1 = SAD of the first half */
- " movd 4(%1), %%mm2 \n\t" /* bytes 4..7 */
- " movd 4(%2), %%mm3 \n\t"
- " psadbw %%mm2, %%mm3 \n\t" /* mm3 = SAD of the second half */
-
- " pmaxsw %%mm1, %%mm3 \n\t" /* mm3 = larger of the two */
- " movd %%mm3, %0 \n\t"
- " andl $0xffff, %0 \n\t"
-
- : "=m" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
- :
- : "memory"
- );
-
- return MaxSad;
-}
-/*
- * col_sad8x8__mmxext: largest per-column SAD over the two halves of an
- * 8x8 block.
- *
- * For each of the eight columns the absolute differences between Src1 and
- * Src2 are summed over rows 0..3 (mm4 = columns 0..3, mm5 = columns 4..7)
- * and, separately, over rows 4..7 (mm6, mm7). The maximum of those sixteen
- * 16-bit sums is returned.
- *
- * Both pointers advance by the same `stride` after every row. edi is the
- * row counter (declared clobbered); no emms is executed here.
- */
-static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
- ogg_uint32_t stride)
-{
- ogg_uint32_t MaxSad;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
- " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
- " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
- " mov $4, %%edi \n\t" /* 4 rows */
- "1: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
-
- " movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
-
- " dec %%edi \n\t"
- " jnz 1b \n\t"
-
- " mov $4, %%edi \n\t" /* 4 rows */
- "2: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
-
- " movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
-
- " dec %%edi \n\t"
- " jnz 2b \n\t"
- /* reduce the sixteen word sums to a single maximum in the low word of mm7 */
- " pmaxsw %%mm6, %%mm7 \n\t"
- " pmaxsw %%mm4, %%mm5 \n\t"
- " pmaxsw %%mm5, %%mm7 \n\t"
- " movq %%mm7, %%mm6 \n\t"
- " psrlq $32, %%mm6 \n\t"
- " pmaxsw %%mm6, %%mm7 \n\t"
- " movq %%mm7, %%mm6 \n\t"
- " psrlq $16, %%mm6 \n\t"
- " pmaxsw %%mm6, %%mm7 \n\t"
- " movd %%mm7, %0 \n\t"
- " andl $0xffff, %0 \n\t"
-
- : "=r" (MaxSad),
- "+r" (Src1),
- "+r" (Src2)
- : "r" (stride)
- : "memory", "edi"
- );
-
- return MaxSad;
-}
-/*
- * inter8x8_err_xy2__mmxext: spread of the difference between a source
- * block and the pavgb average of two reference blocks, all 8x8 bytes.
- *
- * With d = src - average for each of the 64 pixels, XSum is the sum of d
- * (sign-extended from 16 bits) and XXSum the sum of d*d; the return value
- * is 64*XXSum - XSum*XSum in unsigned 32-bit arithmetic.
- *
- * NOTE(review): pavgb rounds up ((a + b + 1) >> 1) whereas
- * inter8x8_err_xy2__mmx in dsp_mmx.c averages rounding down -- confirm
- * which rounding callers expect.
- *
- * SrcData advances by SrcStride per row; RefDataPtr1 and RefDataPtr2 both
- * advance by RefStride. edi is the row counter; no emms is executed here.
- */
-static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
-{
- ogg_uint32_t XSum;
- ogg_uint32_t XXSum;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm4, %%mm4 \n\t" /* mm4 = zero, used to unpack the averaged reference */
- " pxor %%mm5, %%mm5 \n\t" /* mm5 = running sum, four 16-bit lanes */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 = zero, used to unpack the source */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 = running sum of squares, two 32-bit lanes */
- " mov $8, %%edi \n\t"
- "1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
-
- " movq (%3), %%mm2 \n\t"
- " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */
- " pavgb %%mm2, %%mm1 \n\t"
-
- " movq %%mm0, %%mm2 \n\t"
- " movq %%mm1, %%mm3 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t"
- " punpcklbw %%mm4, %%mm1 \n\t"
- " punpckhbw %%mm6, %%mm2 \n\t"
- " punpckhbw %%mm4, %%mm3 \n\t"
-
- " psubsw %%mm1, %%mm0 \n\t"
- " psubsw %%mm3, %%mm2 \n\t"
-
- " paddw %%mm0, %%mm5 \n\t"
- " paddw %%mm2, %%mm5 \n\t"
-
- " pmaddwd %%mm0, %%mm0 \n\t"
- " pmaddwd %%mm2, %%mm2 \n\t"
-
- " paddd %%mm0, %%mm7 \n\t"
- " paddd %%mm2, %%mm7 \n\t"
-
- " add %5, %2 \n\t" /* Inc pointer into src data */
- " add %6, %3 \n\t" /* Inc pointer into ref data */
- " add %6, %4 \n\t" /* Inc pointer into ref data */
-
- " dec %%edi \n\t"
- " jnz 1b \n\t"
- /* horizontal add of the four word lanes of mm5 */
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $32, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $16, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movd %%mm5, %%edi \n\t"
- " movsx %%di, %%edi \n\t"
- " movl %%edi, %0 \n\t"
- /* horizontal add of the two dword lanes of mm7 */
- " movq %%mm7, %%mm0 \n\t"
- " psrlq $32, %%mm7 \n\t"
- " paddd %%mm0, %%mm7 \n\t"
- " movd %%mm7, %1 \n\t"
-
- : "=m" (XSum),
- "=m" (XXSum),
- "+r" (SrcData),
- "+r" (RefDataPtr1),
- "+r" (RefDataPtr2)
- : "m" (SrcStride),
- "m" (RefStride)
- : "edi", "memory"
- );
-
- /* Compute and return population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ));
-}
-
-/*
- * dsp_mmxext_init: point six entries of *funcs at the MMXEXT routines
- * defined in this file.
- *
- * A one-line notice is written to stderr first. Only the slots listed
- * below are overwritten; every other member of *funcs keeps its value.
- */
-void dsp_mmxext_init(DspFunctions *funcs)
-{
-  fprintf(stderr, "enabling accelerated x86_32 mmxext dsp functions.\n");
-
-  /* error metric against an averaged reference */
-  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext;
-
-  /* 8x8 block SADs */
-  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;
-  funcs->sad8x8_thres     = sad8x8_thres__mmxext;
-  funcs->sad8x8           = sad8x8__mmxext;
-
-  /* row / column SADs */
-  funcs->col_sad8x8       = col_sad8x8__mmxext;
-  funcs->row_sad8         = row_sad8__mmxext;
-}
-
Copied: trunk/theora/lib/x86_32/dsp_mmxext.c (from rev 11426, branches/theora-mmx/lib/x86_32/dsp_mmxext.c)
Deleted: trunk/theora/lib/x86_32/fdct_mmx.c
===================================================================
--- branches/theora-mmx/lib/x86_32/fdct_mmx.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/x86_32/fdct_mmx.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -1,342 +0,0 @@
-;//==========================================================================
-;//
-;// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
-;// KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-;// IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR
-;// PURPOSE.
-;//
-;// Copyright (c) 1999 - 2001 On2 Technologies Inc. All Rights Reserved.
-;//
-;//--------------------------------------------------------------------------
-
-#include <stdio.h>
-#include <theora/theora.h>
-#include "dsp.h"
-/*
- * Multipliers for the forward DCT below. Each constant repeats a single
- * 16-bit value in all four word lanes so that one pmulhw scales four
- * coefficients at once. The 16-bit values match cos(k*pi/16) * 65536 for
- * k = 1..7 (for example 0xb505 = 46341, about 0.7071 * 65536).
- *
- * xC1S7..xC5S3 have the top bit set, so pmulhw (a signed multiply) sees
- * them as c - 65536; that is why Fdct_mmx adds the multiplicand back after
- * multiplying by those constants.
- *
- * They are referenced only by symbol name from inside asm strings (via the
- * M() macro), hence the `used` attribute.
- */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC1S7 = 0x0fb15fb15fb15fb15LL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC2S6 = 0x0ec83ec83ec83ec83LL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC4S4 = 0x0b505b505b505b505LL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC6S2 = 0x061f861f861f861f8LL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC7S1 = 0x031f131f131f131f1LL;
-/*
- * M(a): the assembler-level name of C symbol `a`, as a string literal for
- * pasting into an asm template. On MinGW, Cygwin, OS/2 and non-ELF OpenBSD
- * an underscore is prepended; everywhere else the name is used unchanged.
- */
-#if defined(__MINGW32__) || defined(__CYGWIN__) || \
- defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
-# define M(a) "_" #a
-#else
-# define M(a) #a
-#endif
-
-/***********************************************************************
- * File: fdct_m.asm
- *
- * Description:
- * This function perform 2-D Forward DCT on a 8x8 block
- *
- *
- * Input: Pointers to input source data buffer and destination
- * buffer.
- *
- * Note: none
- *
- * Special Notes: We try to do the truncation right to match the result
- * of the c version.
- *
- ************************************************************************/
-
-/*
- * Fdct_mmx: execute stage 1 of forward DCT.
- *
- * Expands to asm text for one 8-point pass. ip0..ip7 are memory operands,
- * each read with movq and treated as four 16-bit values, so four
- * independent transforms run side by side. The results are written back
- * to ip0..ip7 in place. `temp` is an 8-byte scratch location. All of
- * mm0..mm7 are overwritten.
- *
- * Multiplications use pmulhw against the xC?S? constants. Where the inline
- * comments show "c * x - x", the constant is negative as a signed word and
- * x is added back afterwards. The "Truncated" steps add the sign bit of the
- * multiplicand (psrlw $15) to the product.
- */
-#define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp) \
- " movq " #ip0 ", %%mm0 \n\t" \
- " movq " #ip1 ", %%mm1 \n\t" \
- " movq " #ip3 ", %%mm2 \n\t" \
- " movq " #ip5 ", %%mm3 \n\t" \
- " movq %%mm0, %%mm4 \n\t" \
- " movq %%mm1, %%mm5 \n\t" \
- " movq %%mm2, %%mm6 \n\t" \
- " movq %%mm3, %%mm7 \n\t" \
- \
- " paddsw " #ip7 ", %%mm0 \n\t" /* mm0 = ip0 + ip7 = is07 */ \
- " paddsw " #ip2 ", %%mm1 \n\t" /* mm1 = ip1 + ip2 = is12 */ \
- " paddsw " #ip4 ", %%mm2 \n\t" /* mm2 = ip3 + ip4 = is34 */ \
- " paddsw " #ip6 ", %%mm3 \n\t" /* mm3 = ip5 + ip6 = is56 */ \
- " psubsw " #ip7 ", %%mm4 \n\t" /* mm4 = ip0 - ip7 = id07 */ \
- " psubsw " #ip2 ", %%mm5 \n\t" /* mm5 = ip1 - ip2 = id12 */ \
- \
- " psubsw %%mm2, %%mm0 \n\t" /* mm0 = is07 - is34 */ \
- \
- " paddsw %%mm2, %%mm2 \n\t" \
- \
- " psubsw " #ip4 ", %%mm6 \n\t" /* mm6 = ip3 - ip4 = id34 */ \
- \
- " paddsw %%mm0, %%mm2 \n\t" /* mm2 = is07 + is34 = is0734 */ \
- " psubsw %%mm3, %%mm1 \n\t" /* mm1 = is12 - is56 */ \
- " movq %%mm0," #temp " \n\t" /* Save is07 - is34 to free mm0; */ \
- " paddsw %%mm3, %%mm3 \n\t" \
- " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + is56 = is1256 */ \
- \
- " psubsw " #ip6 ", %%mm7 \n\t" /* mm7 = ip5 - ip6 = id56 */ \
- /* ------------------------------------------------------------------- */ \
- " psubsw %%mm7, %%mm5 \n\t" /* mm5 = id12 - id56 */ \
- " paddsw %%mm7, %%mm7 \n\t" \
- " paddsw %%mm5, %%mm7 \n\t" /* mm7 = id12 + id56 */ \
- /* ------------------------------------------------------------------- */ \
- " psubsw %%mm3, %%mm2 \n\t" /* mm2 = is0734 - is1256 */ \
- " paddsw %%mm3, %%mm3 \n\t" \
- \
- " movq %%mm2, %%mm0 \n\t" /* make a copy */ \
- " paddsw %%mm2, %%mm3 \n\t" /* mm3 = is0734 + is1256 */ \
- \
- " pmulhw "M(xC4S4)", %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \
- " paddw %%mm2, %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \
- " psrlw $15, %%mm2 \n\t" \
- " paddw %%mm2, %%mm0 \n\t" /* Truncate mm0, now it is op[4] */ \
- \
- " movq %%mm3, %%mm2 \n\t" \
- " movq %%mm0," #ip4 " \n\t" /* save ip4, now mm0,mm2 are free */ \
- \
- " movq %%mm3, %%mm0 \n\t" \
- " pmulhw "M(xC4S4)", %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
- \
- " psrlw $15, %%mm2 \n\t" \
- " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \
- " paddw %%mm2, %%mm3 \n\t" /* Truncate mm3, now it is op[0] */ \
- \
- " movq %%mm3," #ip0 " \n\t" \
- /* ------------------------------------------------------------------- */ \
- " movq " #temp ", %%mm3 \n\t" /* mm3 = irot_input_y */ \
- " pmulhw "M(xC2S6)", %%mm3 \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \
- \
- " movq " #temp ", %%mm2 \n\t" \
- " movq %%mm2, %%mm0 \n\t" \
- \
- " psrlw $15, %%mm2 \n\t" \
- " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC2S6 * irot_input_y */ \
- \
- " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
- " movq %%mm5, %%mm0 \n\t" \
- \
- " movq %%mm5, %%mm2 \n\t" \
- " pmulhw "M(xC6S2)", %%mm0 \n\t" /* mm0 = xC6S2 * irot_input_x */ \
- \
- " psrlw $15, %%mm2 \n\t" \
- " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \
- \
- " paddsw %%mm0, %%mm3 \n\t" /* ip[2] */ \
- " movq %%mm3," #ip2 " \n\t" /* Save ip2 */ \
- \
- " movq %%mm5, %%mm0 \n\t" \
- " movq %%mm5, %%mm2 \n\t" \
- \
- " pmulhw "M(xC2S6)", %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \
- " psrlw $15, %%mm2 \n\t" \
- \
- " movq " #temp ", %%mm3 \n\t" \
- " paddw %%mm0, %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x */ \
- \
- " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \
- " movq %%mm3, %%mm2 \n\t" \
- \
- " pmulhw "M(xC6S2)", %%mm3 \n\t" /* mm3 = xC6S2 * irot_input_y */ \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
- " psubsw %%mm5, %%mm3 \n\t" \
- \
- " movq %%mm3," #ip6 " \n\t" \
- /* ------------------------------------------------------------------- */ \
- " movq "M(xC4S4)", %%mm0 \n\t" \
- " movq %%mm1, %%mm2 \n\t" \
- " movq %%mm1, %%mm3 \n\t" \
- \
- " pmulhw %%mm0, %%mm1 \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) */ \
- " paddw %%mm2, %%mm1 \n\t" /* Truncate mm1, now it is icommon_product1 */ \
- \
- " movq %%mm7, %%mm2 \n\t" \
- " movq %%mm7, %%mm3 \n\t" \
- \
- " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm3, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
- " paddw %%mm2, %%mm7 \n\t" /* Truncate mm7, now it is icommon_product2 */ \
- /* ------------------------------------------------------------------- */ \
- " pxor %%mm0, %%mm0 \n\t" /* Clear mm0 */ \
- " psubsw %%mm6, %%mm0 \n\t" /* mm0 = - id34 */ \
- \
- " psubsw %%mm7, %%mm0 \n\t" /* mm0 = - ( id34 + idcommon_product2 ) */ \
- " paddsw %%mm6, %%mm6 \n\t" \
- " paddsw %%mm0, %%mm6 \n\t" /* mm6 = id34 - icommon_product2 */ \
- \
- " psubsw %%mm1, %%mm4 \n\t" /* mm4 = id07 - icommon_product1 */ \
- " paddsw %%mm1, %%mm1 \n\t" \
- " paddsw %%mm4, %%mm1 \n\t" /* mm1 = id07 + icommon_product1 */ \
- /* ------------------------------------------------------------------- */ \
- " movq "M(xC1S7)", %%mm7 \n\t" \
- " movq %%mm1, %%mm2 \n\t" \
- \
- " movq %%mm1, %%mm3 \n\t" \
- " pmulhw %%mm7, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \
- \
- " movq "M(xC7S1)", %%mm7 \n\t" \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x */ \
- " paddw %%mm2, %%mm1 \n\t" /* Truncated */ \
- \
- " pmulhw %%mm7, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x */ \
- " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
- \
- " movq %%mm0, %%mm5 \n\t" \
- " movq %%mm0, %%mm2 \n\t" \
- \
- " movq "M(xC1S7)", %%mm7 \n\t" \
- " pmulhw %%mm7, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \
- \
- " movq "M(xC7S1)", %%mm7 \n\t" \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm5, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y */ \
- " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \
- \
- " pmulhw %%mm7, %%mm5 \n\t" /* mm5 = xC7S1 * irot_input_y */ \
- " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \
- \
- " psubsw %%mm5, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \
- " paddsw %%mm0, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = ip7 */ \
- \
- " movq %%mm1," #ip1 " \n\t" \
- " movq %%mm3," #ip7 " \n\t" \
- /* ------------------------------------------------------------------- */ \
- " movq "M(xC3S5)", %%mm0 \n\t" \
- " movq "M(xC5S3)", %%mm1 \n\t" \
- \
- " movq %%mm6, %%mm5 \n\t" \
- " movq %%mm6, %%mm7 \n\t" \
- \
- " movq %%mm4, %%mm2 \n\t" \
- " movq %%mm4, %%mm3 \n\t" \
- \
- " pmulhw %%mm0, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \
- " pmulhw %%mm1, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \
- \
- " psrlw $15, %%mm2 \n\t" \
- " psrlw $15, %%mm5 \n\t" \
- \
- " paddw %%mm3, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x */ \
- " paddw %%mm7, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y */ \
- \
- " paddw %%mm2, %%mm4 \n\t" /* Truncated */ \
- " paddw %%mm5, %%mm6 \n\t" /* Truncated */ \
- \
- " psubsw %%mm6, %%mm4 \n\t" /* ip3 */ \
- " movq %%mm4," #ip3 " \n\t" \
- \
- " movq %%mm3, %%mm4 \n\t" \
- " movq %%mm7, %%mm6 \n\t" \
- \
- " pmulhw %%mm1, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \
- " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \
- \
- " paddw %%mm2, %%mm4 \n\t" \
- " paddw %%mm5, %%mm6 \n\t" \
- \
- " paddw %%mm4, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x */ \
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y */ \
- \
- " paddw %%mm7, %%mm3 \n\t" /* ip5 */ \
- " movq %%mm3," #ip5 " \n\t"
-/*
- * Transpose_mmx: expands to asm text that transposes two 4x4 tiles of
- * 16-bit words. Rows ip0..ip3 (a, b, c, d in the comments) are transposed
- * into op0..op3, and rows ip4..ip7 (e, f, g, h) into op4..op7. Every
- * operand is an 8-byte memory reference; all of mm0..mm7 are overwritten.
- *
- * op1 is written (as a spill slot) before ip7 is read, and op0 is used as
- * a spill slot too, so op1 must not refer to the same memory as ip7.
- * Apart from that, all inputs are loaded before any output row is stored.
- */
-#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7, \
- op0,op1,op2,op3,op4,op5,op6,op7) \
- " movq " #ip0 ", %%mm0 \n\t" /* mm0 = a0 a1 a2 a3 */ \
- " movq " #ip4 ", %%mm4 \n\t" /* mm4 = e4 e5 e6 e7 */ \
- " movq " #ip1 ", %%mm1 \n\t" /* mm1 = b0 b1 b2 b3 */ \
- " movq " #ip5 ", %%mm5 \n\t" /* mm5 = f4 f5 f6 f7 */ \
- " movq " #ip2 ", %%mm2 \n\t" /* mm2 = c0 c1 c2 c3 */ \
- " movq " #ip6 ", %%mm6 \n\t" /* mm6 = g4 g5 g6 g7 */ \
- " movq " #ip3 ", %%mm3 \n\t" /* mm3 = d0 d1 d2 d3 */ \
- " movq %%mm1," #op1 " \n\t" /* save b0 b1 b2 b3 */ \
- " movq " #ip7 ", %%mm7 \n\t" /* mm7 = h0 h1 h2 h3 */ \
- /* Transpose 2x8 block */ \
- " movq %%mm4, %%mm1 \n\t" /* mm1 = e3 e2 e1 e0 */ \
- " punpcklwd %%mm5, %%mm4 \n\t" /* mm4 = f1 e1 f0 e0 */ \
- " movq %%mm0," #op0 " \n\t" /* save a3 a2 a1 a0 */ \
- " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \
- " movq %%mm6, %%mm0 \n\t" /* mm0 = g3 g2 g1 g0 */ \
- " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \
- " movq %%mm4, %%mm5 \n\t" /* mm5 = f1 e1 f0 e0 */ \
- " punpckldq %%mm6, %%mm4 \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \
- " punpckhdq %%mm6, %%mm5 \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \
- " movq %%mm1, %%mm6 \n\t" /* mm6 = f3 e3 f2 e2 */ \
- " movq %%mm4," #op4 " \n\t" \
- " punpckhwd %%mm7, %%mm0 \n\t" /* mm0 = h3 g3 h2 g2 */ \
- " movq %%mm5," #op5 " \n\t" \
- " punpckhdq %%mm0, %%mm6 \n\t" /* mm6 = h3 g3 f3 e3 = MM7 */ \
- " movq " #op0 ", %%mm4 \n\t" /* mm4 = a3 a2 a1 a0 */ \
- " punpckldq %%mm0, %%mm1 \n\t" /* mm1 = h2 g2 f2 e2 = MM6 */ \
- " movq " #op1 ", %%mm5 \n\t" /* mm5 = b3 b2 b1 b0 */ \
- " movq %%mm4, %%mm0 \n\t" /* mm0 = a3 a2 a1 a0 */ \
- " movq %%mm6," #op7 " \n\t" \
- " punpcklwd %%mm5, %%mm0 \n\t" /* mm0 = b1 a1 b0 a0 */ \
- " movq %%mm1," #op6 " \n\t" \
- " punpckhwd %%mm5, %%mm4 \n\t" /* mm4 = b3 a3 b2 a2 */ \
- " movq %%mm2, %%mm5 \n\t" /* mm5 = c3 c2 c1 c0 */ \
- " punpcklwd %%mm3, %%mm2 \n\t" /* mm2 = d1 c1 d0 c0 */ \
- " movq %%mm0, %%mm1 \n\t" /* mm1 = b1 a1 b0 a0 */ \
- " punpckldq %%mm2, %%mm0 \n\t" /* mm0 = d0 c0 b0 a0 = MM0 */ \
- " punpckhdq %%mm2, %%mm1 \n\t" /* mm1 = d1 c1 b1 a1 = MM1 */ \
- " movq %%mm4, %%mm2 \n\t" /* mm2 = b3 a3 b2 a2 */ \
- " movq %%mm0," #op0 " \n\t" \
- " punpckhwd %%mm3, %%mm5 \n\t" /* mm5 = d3 c3 d2 c2 */ \
- " movq %%mm1," #op1 " \n\t" \
- " punpckhdq %%mm5, %%mm4 \n\t" /* mm4 = d3 c3 b3 a3 = MM3 */ \
- " punpckldq %%mm5, %%mm2 \n\t" /* mm2 = d2 c2 b2 a2 = MM2 */ \
- " movq %%mm4," #op3 " \n\t" \
- " movq %%mm2," #op2 " \n\t"
-
-/*
- * fdct_short__mmx: 2-D forward DCT of one 8x8 block of 16-bit values.
- *
- * InputData  64 ogg_int16_t values, addressed as 128 bytes (16 bytes/row)
- * OutputData 64 ogg_int16_t values receiving the result
- *
- * Four Transpose_mmx + Fdct_mmx pairs are run. The first two read rows
- * 0..3 and rows 4..7 of InputData and leave their results in the same
- * positions of OutputData. The last two work in place on OutputData, first
- * on byte offsets 0/16/.../112 and then on 8/24/.../120 (the left and
- * right four words of every row).
- *
- * align_tmp supplies the scratch slot that Fdct_mmx calls `temp`; only
- * its first 8 bytes are addressed. emms is executed before returning.
- */
-static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData)
-{
- ogg_int64_t __attribute__((aligned(8))) align_tmp[16];
- ogg_int16_t *const temp= (ogg_int16_t*)align_tmp;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
- /*
- * Input data is an 8x8 block. To make processing of the data more efficient
- * we will transpose the block of data to two 4x8 blocks???
- */
- Transpose_mmx ( (%0), 16(%0), 32(%0), 48(%0), 8(%0), 24(%0), 40(%0), 56(%0),
- (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1))
- Fdct_mmx ( (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1), (%2))
-
- Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
- 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
- Fdct_mmx (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
-
- Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
- 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
- Fdct_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
-
- Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
- 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
- Fdct_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
-
- " emms \n\t"
-
- : "+r" (InputData),
- "+r" (OutputData)
- : "r" (temp)
- : "memory"
- );
-}
-
-/*
- * dsp_mmx_fdct_init: write a notice to stderr, then point the fdct_short
- * entry of *funcs at fdct_short__mmx. No other member is touched.
- */
-void dsp_mmx_fdct_init(DspFunctions *funcs)
-{
-  static const char notice[] = "enabling accelerated x86_32 mmx fdct function.\n";
-
-  fprintf(stderr, notice);
-  funcs->fdct_short = fdct_short__mmx;
-}
Copied: trunk/theora/lib/x86_32/fdct_mmx.c (from rev 11426, branches/theora-mmx/lib/x86_32/fdct_mmx.c)
Deleted: trunk/theora/lib/x86_32/recon_mmx.c
===================================================================
--- branches/theora-mmx/lib/x86_32/recon_mmx.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/x86_32/recon_mmx.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -1,187 +0,0 @@
-/********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
- * *
- ********************************************************************
-
- function:
- last mod: $Id: reconstruct.c,v 1.6 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include <stdio.h>
-#include "codec_internal.h"
-
-static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x8080808080808080LL; /* 0x80 in each of the 8 bytes; referenced only by symbol name from inline asm via M(V128) */
-
-#if defined(__MINGW32__) || defined(__CYGWIN__) || \
- defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
-# define M(a) "_" #a /* asm-level symbol name of a, with a leading underscore on these targets */
-#else
-# define M(a) #a /* asm-level symbol name of a */
-#endif
-
-static void copy8x8__mmx (unsigned char *src,
- unsigned char *dest,
- unsigned int stride)
-{ /* Copies 8 rows of 8 bytes from src to dest; the same row pitch `stride` (in bytes) is applied to both. */
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " lea (%2, %2, 2), %%edi \n\t" /* edi = 3 * stride */
-
- " movq (%1), %%mm0 \n\t" /* load source rows 0..3 */
- " movq (%1, %2), %%mm1 \n\t"
- " movq (%1, %2, 2), %%mm2 \n\t"
- " movq (%1, %%edi), %%mm3 \n\t"
-
- " lea (%1, %2, 4), %1 \n\t" /* src += 4 * stride */
-
- " movq %%mm0, (%0) \n\t" /* store destination rows 0..3 */
- " movq %%mm1, (%0, %2) \n\t"
- " movq %%mm2, (%0, %2, 2) \n\t"
- " movq %%mm3, (%0, %%edi) \n\t"
-
- " lea (%0, %2, 4), %0 \n\t" /* dest += 4 * stride */
-
- " movq (%1), %%mm0 \n\t" /* load source rows 4..7 */
- " movq (%1, %2), %%mm1 \n\t"
- " movq (%1, %2, 2), %%mm2 \n\t"
- " movq (%1, %%edi), %%mm3 \n\t"
-
- " movq %%mm0, (%0) \n\t" /* store destination rows 4..7 */
- " movq %%mm1, (%0, %2) \n\t"
- " movq %%mm2, (%0, %2, 2) \n\t"
- " movq %%mm3, (%0, %%edi) \n\t"
- : "+a" (dest) /* %0, read-write */
- : "c" (src), /* %1; NOTE(review): modified by the lea above although declared input-only */
- "d" (stride) /* %2 */
- : "memory", "edi"
- );
-}
-
-static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
- ogg_uint32_t LineStep)
-{ /* For each of 8 rows: packs 8 16-bit values from ChangePtr to signed-saturated bytes, flips bit 7 of each, and stores them at ReconPtr (row pitch LineStep). */
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " movq "M(V128)", %%mm0 \n\t" /* Set mm0 to 0x8080808080808080 */
-
- " lea 128(%1), %%edi \n\t" /* Endpoint in input buffer */
- "1: \n\t"
- " movq (%1), %%mm2 \n\t" /* First four input values */
-
- " packsswb 8(%1), %%mm2 \n\t" /* pack with next(high) four values */
- " por %%mm0, %%mm0 \n\t" /* ORs mm0 with itself, leaving it unchanged */
- " pxor %%mm0, %%mm2 \n\t" /* Convert result to unsigned (same as add 128) */
- " lea 16(%1), %1 \n\t" /* Step source buffer */
- " cmp %%edi, %1 \n\t" /* are we done */
-
- " movq %%mm2, (%0) \n\t" /* store results */
-
- " lea (%0, %2), %0 \n\t" /* Step output buffer */
- " jc 1b \n\t" /* Loop back if we are not done */
- : "+r" (ReconPtr) /* %0 */
- : "r" (ChangePtr), /* %1; NOTE(review): advanced inside the loop although declared input-only */
- "r" (LineStep) /* %2 */
- : "memory", "edi"
- );
-}
-
-static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
- ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
-{ /* For each of 8 rows: adds 8 16-bit changes to 8 reference bytes and stores the unsigned-saturated bytes at ReconPtr; LineStep is the pitch of both RefPtr and ReconPtr. */
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm0, %%mm0 \n\t" /* mm0 = 0, used to zero-extend bytes */
- " lea 128(%1), %%edi \n\t" /* edi = end of the 128-byte change buffer */
-
- "1: \n\t"
- " movq (%2), %%mm2 \n\t" /* (+3 misaligned) 8 reference pixels */
-
- " movq (%1), %%mm4 \n\t" /* first 4 changes */
- " movq %%mm2, %%mm3 \n\t"
- " movq 8(%1), %%mm5 \n\t" /* last 4 changes */
- " punpcklbw %%mm0, %%mm2 \n\t" /* turn first 4 refs into positive 16-bit #s */
- " paddsw %%mm4, %%mm2 \n\t" /* add in first 4 changes */
- " punpckhbw %%mm0, %%mm3 \n\t" /* turn last 4 refs into positive 16-bit #s */
- " paddsw %%mm5, %%mm3 \n\t" /* add in last 4 changes */
- " add %3, %2 \n\t" /* next row of reference pixels */
- " packuswb %%mm3, %%mm2 \n\t" /* pack result to unsigned 8-bit values */
- " lea 16(%1), %1 \n\t" /* next row of changes */
- " cmp %%edi, %1 \n\t" /* are we done? */
-
- " movq %%mm2, (%0) \n\t" /* store result */
-
- " lea (%0, %3), %0 \n\t" /* next row of output */
- " jc 1b \n\t" /* loop while the change pointer is below edi (flags from the cmp above) */
- : "+r" (ReconPtr) /* %0 */
- : "r" (ChangePtr), /* %1; NOTE(review): modified in the loop although declared input-only */
- "r" (RefPtr), /* %2; NOTE(review): modified in the loop although declared input-only */
- "r" (LineStep) /* %3 */
- : "memory", "edi"
- );
-}
-
-static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
- unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
- ogg_uint32_t LineStep)
-{ /* For each of 8 rows: averages two reference rows as (ref1 + ref2) >> 1, adds 8 16-bit changes, and stores unsigned-saturated bytes at ReconPtr. */
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm0, %%mm0 \n\t" /* mm0 = 0, used to zero-extend bytes */
- " lea 128(%1), %%edi \n\t" /* edi = end of the 128-byte change buffer */
-
- "1: \n\t"
- " movq (%2), %%mm2 \n\t" /* (+3 misaligned) 8 reference pixels */
- " movq (%3), %%mm4 \n\t" /* (+3 misaligned) 8 reference pixels */
-
- " movq %%mm2, %%mm3 \n\t"
- " punpcklbw %%mm0, %%mm2 \n\t" /* mm2 = start ref1 as positive 16-bit #s */
- " movq %%mm4, %%mm5 \n\t"
- " movq (%1), %%mm6 \n\t" /* first 4 changes */
- " punpckhbw %%mm0, %%mm3 \n\t" /* mm3 = end ref1 as positive 16-bit #s */
- " movq 8(%1), %%mm7 \n\t" /* last 4 changes */
- " punpcklbw %%mm0, %%mm4 \n\t" /* mm4 = start ref2 as positive 16-bit #s */
- " punpckhbw %%mm0, %%mm5 \n\t" /* mm5 = end ref2 as positive 16-bit #s */
- " paddw %%mm4, %%mm2 \n\t" /* mm2 = start (ref1 + ref2) */
- " paddw %%mm5, %%mm3 \n\t" /* mm3 = end (ref1 + ref2) */
- " psrlw $1, %%mm2 \n\t" /* mm2 = start (ref1 + ref2)/2 */
- " psrlw $1, %%mm3 \n\t" /* mm3 = end (ref1 + ref2)/2 */
- " paddw %%mm6, %%mm2 \n\t" /* add changes to start */
- " paddw %%mm7, %%mm3 \n\t" /* add changes to end */
- " lea 16(%1), %1 \n\t" /* next row of changes */
- " packuswb %%mm3, %%mm2 \n\t" /* pack start|end to unsigned 8-bit */
- " add %4, %2 \n\t" /* next row of reference pixels */
- " add %4, %3 \n\t" /* next row of reference pixels */
- " movq %%mm2, (%0) \n\t" /* store result */
- " add %4, %0 \n\t" /* next row of output */
- " cmp %%edi, %1 \n\t" /* are we done? */
- " jc 1b \n\t" /* loop while the change pointer is below edi */
- : "+r" (ReconPtr) /* %0 */
- : "r" (ChangePtr), /* %1; NOTE(review): modified in the loop although declared input-only */
- "r" (RefPtr1), /* %2; NOTE(review): modified in the loop although declared input-only */
- "r" (RefPtr2), /* %3; NOTE(review): modified in the loop although declared input-only */
- "m" (LineStep) /* %4, used as a memory source operand of add */
- : "memory", "edi"
- );
-}
-
-void dsp_mmx_recon_init(DspFunctions *funcs)
-{ /* Points the four copy/reconstruction slots of funcs at the MMX implementations in this file. */
- fprintf(stderr, "enabling accelerated x86_32 mmx recon functions.\n"); /* printed unconditionally on every call */
- funcs->copy8x8 = copy8x8__mmx;
- funcs->recon_intra8x8 = recon_intra8x8__mmx;
- funcs->recon_inter8x8 = recon_inter8x8__mmx;
- funcs->recon_inter8x8_half = recon_inter8x8_half__mmx;
-}
-
Copied: trunk/theora/lib/x86_32/recon_mmx.c (from rev 11426, branches/theora-mmx/lib/x86_32/recon_mmx.c)
Copied: trunk/theora/lib/x86_64 (from rev 11426, branches/theora-mmx/lib/x86_64)
Deleted: trunk/theora/lib/x86_64/dsp_mmx.c
===================================================================
--- branches/theora-mmx/lib/x86_64/dsp_mmx.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/x86_64/dsp_mmx.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -1,298 +0,0 @@
-/********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
- * *
- ********************************************************************
-
- function:
- last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "dsp.h"
-
-static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x0080008000800080LL; /* the value 128 in each of four 16-bit lanes */
-
-#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2) /* integer average of a and b */
-#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b))) /* a - b as int */
-#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b)))) /* |a - b| as int */
-
-static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
- ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
-{ /* Writes 8 rows of 8 16-bit differences FiltPtr[x] - ReconPtr[x] to DctInputPtr (16 bytes per output row). */
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, used to zero-extend bytes */
-
- ".rept 8 \n\t" /* assembler repeats the body once per row */
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */
- /* start calculation */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */
- " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */
- " movq %%mm0, (%2) \n\t" /* write answer out */
- " movq %%mm2, 8(%2) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %2 \n\t"
- " add %3, %0 \n\t"
- " add %4, %1 \n\t"
- ".endr \n\t"
-
- : "+r" (FiltPtr), /* %0 */
- "+r" (ReconPtr), /* %1 */
- "+r" (DctInputPtr) /* %2 */
- : "r" ((ogg_uint64_t)PixelsPerLine), /* %3, widened to 64 bits for the pointer add */
- "r" ((ogg_uint64_t)ReconPixelsPerLine) /* %4, widened to 64 bits for the pointer add */
- : "memory"
- );
-}
-
-static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
- ogg_uint32_t PixelsPerLine)
-{ /* Writes 8 rows of 8 16-bit values FiltPtr[x] - 128 to DctInputPtr (16 bytes per output row). */
- ogg_uint64_t ppl = PixelsPerLine; /* 64-bit copy of the pitch for the pointer add */
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, used to zero-extend bytes */
- " movq %[V128], %%mm1 \n\t" /* mm1 = 128 in each 16-bit lane */
-
- ".rept 8 \n\t" /* assembler repeats the body once per row */
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- /* start calculation */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */
- " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */
- " movq %%mm0, (%1) \n\t" /* write answer out */
- " movq %%mm2, 8(%1) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %1 \n\t"
- " add %2, %0 \n\t"
- ".endr \n\t"
-
- : "+r" (FiltPtr), /* %0 */
- "+r" (DctInputPtr) /* %1 */
- : "r" (ppl), /* gcc bug? a cast won't work here, e.g. (ogg_uint64_t)PixelsPerLine */
- [V128] "m" (V128)
- : "memory"
- );
-}
-
-static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
- unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
- ogg_uint32_t PixelsPerLine,
- ogg_uint32_t ReconPixelsPerLine)
-{ /* Writes 8 rows of 8 16-bit values FiltPtr[x] - ((ReconPtr1[x] + ReconPtr2[x]) >> 1) to DctInputPtr (16 bytes per output row). */
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, used to zero-extend bytes */
-
- ".rept 8 \n\t" /* assembler repeats the body once per row */
- " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
- " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr1 */
- " movq (%2), %%mm4 \n\t" /* mm4 = ReconPtr2 */
- " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
- " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
- " movq %%mm4, %%mm5 \n\t" /* dup to prepare for up conversion */
- /* convert from UINT8 to INT16 */
- " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
- " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr1) */
- " punpcklbw %%mm7, %%mm4 \n\t" /* mm4 = INT16(ReconPtr2) */
- " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
- " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr1) */
- " punpckhbw %%mm7, %%mm5 \n\t" /* mm5 = INT16(ReconPtr2) */
- /* average ReconPtr1 and ReconPtr2 */
- " paddw %%mm4, %%mm1 \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */
- " paddw %%mm5, %%mm3 \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */
- " psrlw $1, %%mm1 \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
- " psrlw $1, %%mm3 \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
- " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
- " movq %%mm0, (%3) \n\t" /* write answer out */
- " movq %%mm2, 8(%3) \n\t" /* write answer out */
- /* Increment pointers */
- " add $16, %3 \n\t"
- " add %4, %0 \n\t"
- " add %5, %1 \n\t"
- " add %5, %2 \n\t"
- ".endr \n\t"
-
- : "+r" (FiltPtr), /* %0 */
- "+r" (ReconPtr1), /* %1 */
- "+r" (ReconPtr2), /* %2 */
- "+r" (DctInputPtr) /* %3 */
- : "r" ((ogg_uint64_t)PixelsPerLine), /* %4, pitch of FiltPtr */
- "r" ((ogg_uint64_t)ReconPixelsPerLine) /* %5, pitch shared by both recon pointers */
- : "memory"
- );
-}
-
-static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
-{ /* Returns 64*sum(x*x) - sum(x)*sum(x) over the 8x8 bytes at DataPtr (row pitch Stride). */
- ogg_uint64_t XSum; /* receives the sum of the 64 bytes */
- ogg_uint64_t XXSum; /* receives the sum of their squares */
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm5, %%mm5 \n\t" /* mm5 = running sum, four 16-bit lanes */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 = 0, used to zero-extend bytes */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 = running sum of squares, two 32-bit lanes */
- " mov $8, %%rdi \n\t" /* row counter */
- "1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
- " movq %%mm0, %%mm2 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t" /* low 4 bytes as 16-bit words */
- " punpckhbw %%mm6, %%mm2 \n\t" /* high 4 bytes as 16-bit words */
-
- " paddw %%mm0, %%mm5 \n\t"
- " paddw %%mm2, %%mm5 \n\t"
-
- " pmaddwd %%mm0, %%mm0 \n\t" /* square each word and add adjacent pairs */
- " pmaddwd %%mm2, %%mm2 \n\t"
-
- " paddd %%mm0, %%mm7 \n\t"
- " paddd %%mm2, %%mm7 \n\t"
-
- " add %3, %2 \n\t" /* Inc pointer into src data */
-
- " dec %%rdi \n\t"
- " jnz 1b \n\t"
-
- " movq %%mm5, %%mm0 \n\t" /* fold the four word lanes of mm5 into its low word */
- " psrlq $32, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $16, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movd %%mm5, %%rdi \n\t"
- " movsx %%di, %%rdi \n\t" /* keep only the low 16 bits, sign-extended */
- " mov %%rdi, %0 \n\t"
-
- " movq %%mm7, %%mm0 \n\t" /* fold the two dword lanes of mm7 into its low dword */
- " psrlq $32, %%mm7 \n\t"
- " paddd %%mm0, %%mm7 \n\t"
- " movd %%mm7, %1 \n\t"
-
- : "=r" (XSum), /* %0 */
- "=r" (XXSum), /* %1 */
- "+r" (DataPtr) /* %2 */
- : "r" ((ogg_uint64_t)Stride) /* %3 */
- : "rdi", "memory"
- );
-
- /* Compute population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum ) ); /* 64-bit arithmetic, narrowed to 32 bits by the return type */
-}
-
-static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr, ogg_uint32_t RefStride)
-{ /* Returns 64*sum(d*d) - sum(d)*sum(d), where d = SrcData[x] - RefDataPtr[x] over an 8x8 block. */
- ogg_uint64_t XSum; /* receives the sum of the 64 differences */
- ogg_uint64_t XXSum; /* receives the sum of their squares */
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm5, %%mm5 \n\t" /* mm5 = running sum, four 16-bit lanes */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 = 0, used to zero-extend bytes */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 = running sum of squares, two 32-bit lanes */
- " mov $8, %%rdi \n\t" /* row counter */
- "1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
- " movq (%3), %%mm1 \n\t"
- " movq %%mm0, %%mm2 \n\t"
- " movq %%mm1, %%mm3 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t"
- " punpcklbw %%mm6, %%mm1 \n\t"
- " punpckhbw %%mm6, %%mm2 \n\t"
- " punpckhbw %%mm6, %%mm3 \n\t"
-
- " psubsw %%mm1, %%mm0 \n\t" /* src - ref, low 4 pixels */
- " psubsw %%mm3, %%mm2 \n\t" /* src - ref, high 4 pixels */
-
- " paddw %%mm0, %%mm5 \n\t"
- " paddw %%mm2, %%mm5 \n\t"
-
- " pmaddwd %%mm0, %%mm0 \n\t" /* square each word and add adjacent pairs */
- " pmaddwd %%mm2, %%mm2 \n\t"
-
- " paddd %%mm0, %%mm7 \n\t"
- " paddd %%mm2, %%mm7 \n\t"
-
- " add %4, %2 \n\t" /* Inc pointer into src data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
-
- " dec %%rdi \n\t"
- " jnz 1b \n\t"
-
- " movq %%mm5, %%mm0 \n\t" /* fold the four word lanes of mm5 into its low word */
- " psrlq $32, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $16, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movd %%mm5, %%rdi \n\t"
- " movsx %%di, %%rdi \n\t" /* keep only the low 16 bits, sign-extended */
- " mov %%rdi, %0 \n\t"
-
- " movq %%mm7, %%mm0 \n\t" /* fold the two dword lanes of mm7 into its low dword */
- " psrlq $32, %%mm7 \n\t"
- " paddd %%mm0, %%mm7 \n\t"
- " movd %%mm7, %1 \n\t" /* NOTE(review): with a memory operand this looks like a 32-bit store into the 64-bit XXSum - confirm */
-
- : "=m" (XSum), /* %0 */
- "=m" (XXSum), /* %1 */
- "+r" (SrcData), /* %2 */
- "+r" (RefDataPtr) /* %3 */
- : "r" ((ogg_uint64_t)SrcStride), /* %4 */
- "r" ((ogg_uint64_t)RefStride) /* %5 */
- : "rdi", "memory"
- );
-
- /* Compute and return population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum )); /* 64-bit arithmetic, narrowed to 32 bits by the return type */
-}
-
-static void restore_fpu (void)
-{ /* Executes a single emms instruction, which resets the x87 tag word after MMX use. */
- __asm__ __volatile__ (
- " emms \n\t"
- );
-}
-
-void dsp_mmx_init(DspFunctions *funcs)
-{ /* Points six slots of funcs at the MMX implementations defined in this file. */
- fprintf(stderr, "setting accelerated x86_64 mmx dsp functions.\n"); /* printed unconditionally on every call */
- funcs->restore_fpu = restore_fpu;
- funcs->sub8x8 = sub8x8__mmx;
- funcs->sub8x8_128 = sub8x8_128__mmx;
- funcs->sub8x8avg2 = sub8x8avg2__mmx;
- funcs->intra8x8_err = intra8x8_err__mmx;
- funcs->inter8x8_err = inter8x8_err__mmx;
-}
-
Copied: trunk/theora/lib/x86_64/dsp_mmx.c (from rev 11426, branches/theora-mmx/lib/x86_64/dsp_mmx.c)
Deleted: trunk/theora/lib/x86_64/dsp_mmxext.c
===================================================================
--- branches/theora-mmx/lib/x86_64/dsp_mmxext.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/x86_64/dsp_mmxext.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -1,317 +0,0 @@
-/********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
- * *
- ********************************************************************
-
- function:
- last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "dsp.h"
-
-static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2)
-{ /* Returns the sum of absolute byte differences between two 8x8 blocks (row pitches stride1 and stride2). */
- ogg_uint32_t DiffVal;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
-
- ".rept 7 \n\t" /* rows 0..6; the last row follows without the pointer increments */
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t" /* mm0 = SAD of the 8 byte pairs */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
-
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t"
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " movd %%mm7, %0 \n\t"
-
- : "=r" (DiffVal), /* %0 */
- "+r" (ptr1), /* %1 */
- "+r" (ptr2) /* %2 */
- : "r" ((ogg_uint64_t)stride1), /* %3 */
- "r" ((ogg_uint64_t)stride2) /* %4 */
- : "memory"
- );
-
- return DiffVal;
-}
-
-static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
- unsigned char *ptr2, ogg_uint32_t stride2,
- ogg_uint32_t thres)
-{ /* Returns the full 8x8 sum of absolute byte differences; thres is not referenced anywhere in the body. */
- ogg_uint32_t DiffVal;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
-
- ".rept 8 \n\t" /* assembler repeats the body once per row */
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " psadbw %%mm1, %%mm0 \n\t" /* mm0 = SAD of the 8 byte pairs */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %4, %2 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
-
- " movd %%mm7, %0 \n\t"
-
- : "=r" (DiffVal), /* %0 */
- "+r" (ptr1), /* %1 */
- "+r" (ptr2) /* %2 */
- : "r" ((ogg_uint64_t)stride1), /* %3 */
- "r" ((ogg_uint64_t)stride2) /* %4 */
- : "memory"
- );
-
- return DiffVal;
-}
-
-static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
- ogg_uint32_t thres)
-{ /* Returns the 8x8 SAD between SrcData and the pavgb average of the two references; thres is not referenced in the body. */
- ogg_uint32_t DiffVal;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
- " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
- ".rept 8 \n\t" /* assembler repeats the body once per row */
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t"
- " movq (%3), %%mm2 \n\t"
- " pavgb %%mm2, %%mm1 \n\t" /* mm1 = bytewise average of the two reference rows */
- " psadbw %%mm1, %%mm0 \n\t" /* mm0 = SAD of source row against that average */
-
- " add %4, %1 \n\t" /* Inc pointer into the new data */
- " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
- " add %5, %2 \n\t" /* Inc pointer into ref data */
- " add %5, %3 \n\t" /* Inc pointer into ref data */
- ".endr \n\t"
-
- " movd %%mm7, %0 \n\t"
- : "=m" (DiffVal), /* %0, stored directly to memory */
- "+r" (SrcData), /* %1 */
- "+r" (RefDataPtr1), /* %2 */
- "+r" (RefDataPtr2) /* %3 */
- : "r" ((ogg_uint64_t)SrcStride), /* %4 */
- "r" ((ogg_uint64_t)RefStride) /* %5, pitch shared by both references */
- : "memory"
- );
-
- return DiffVal;
-}
-
-static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
-{ /* Returns the larger of two SADs: bytes 0..3 of Src1 vs Src2, and bytes 4..7 of Src1 vs Src2. */
- ogg_uint32_t MaxSad;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " movd (%1), %%mm0 \n\t" /* bytes 0..3 of each source */
- " movd (%2), %%mm1 \n\t"
- " psadbw %%mm0, %%mm1 \n\t" /* mm1 = SAD of the first four bytes */
- " movd 4(%1), %%mm2 \n\t" /* bytes 4..7 of each source */
- " movd 4(%2), %%mm3 \n\t"
- " psadbw %%mm2, %%mm3 \n\t" /* mm3 = SAD of the last four bytes */
-
- " pmaxsw %%mm1, %%mm3 \n\t" /* keep the larger of the two */
- " movd %%mm3, %0 \n\t"
- " andl $0xffff, %0 \n\t" /* keep only the low 16 bits */
-
- : "=m" (MaxSad), /* %0 */
- "+r" (Src1), /* %1 */
- "+r" (Src2) /* %2 */
- :
- : "memory"
- );
-
- return MaxSad;
-}
-
-static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
- ogg_uint32_t stride)
-{ /* Sums |Src1 - Src2| per column over rows 0..3 and separately over rows 4..7, and returns the largest of those 16 column sums. */
- ogg_uint32_t MaxSad;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
- " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
- " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
- " mov $4, %%rdi \n\t" /* 4 rows */
- "1: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
-
- " movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* OR of the two saturated differences gives the abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
-
- " dec %%rdi \n\t"
- " jnz 1b \n\t"
-
- " mov $4, %%rdi \n\t" /* 4 rows */
- "2: \n\t"
- " movq (%1), %%mm0 \n\t" /* take 8 bytes */
- " movq (%2), %%mm1 \n\t" /* take 8 bytes */
-
- " movq %%mm0, %%mm2 \n\t"
- " psubusb %%mm1, %%mm0 \n\t" /* A - B */
- " psubusb %%mm2, %%mm1 \n\t" /* B - A */
- " por %%mm1, %%mm0 \n\t" /* OR of the two saturated differences gives the abs difference */
- " movq %%mm0, %%mm1 \n\t"
-
- " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
- " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
- " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
- " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
- " add %3, %1 \n\t" /* Inc pointer into the new data */
- " add %3, %2 \n\t" /* Inc pointer into the new data */
-
- " dec %%rdi \n\t"
- " jnz 2b \n\t"
-
- " pmaxsw %%mm6, %%mm7 \n\t" /* lane-wise max of the four accumulators ends up in mm7 */
- " pmaxsw %%mm4, %%mm5 \n\t"
- " pmaxsw %%mm5, %%mm7 \n\t"
- " movq %%mm7, %%mm6 \n\t" /* fold the four word lanes of mm7 into its low word */
- " psrlq $32, %%mm6 \n\t"
- " pmaxsw %%mm6, %%mm7 \n\t"
- " movq %%mm7, %%mm6 \n\t"
- " psrlq $16, %%mm6 \n\t"
- " pmaxsw %%mm6, %%mm7 \n\t"
- " movd %%mm7, %0 \n\t"
- " andl $0xffff, %0 \n\t" /* keep only the low 16 bits */
-
- : "=r" (MaxSad), /* %0 */
- "+r" (Src1), /* %1 */
- "+r" (Src2) /* %2 */
- : "r" ((ogg_uint64_t)stride) /* %3, pitch shared by both sources */
- : "memory", "rdi"
- );
-
- return MaxSad;
-}
-
-static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
- unsigned char *RefDataPtr1,
- unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
-{ /* Returns 64*sum(d*d) - sum(d)*sum(d), where d = SrcData[x] - pavgb(RefDataPtr1[x], RefDataPtr2[x]) over an 8x8 block. */
- ogg_uint64_t XSum; /* receives the sum of the 64 differences */
- ogg_uint64_t XXSum; /* receives the sum of their squares */
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm4, %%mm4 \n\t" /* mm4 = 0, zero-extends the averaged reference */
- " pxor %%mm5, %%mm5 \n\t" /* mm5 = running sum, four 16-bit lanes */
- " pxor %%mm6, %%mm6 \n\t" /* mm6 = 0, zero-extends the source */
- " pxor %%mm7, %%mm7 \n\t" /* mm7 = running sum of squares, two 32-bit lanes */
- " mov $8, %%rdi \n\t" /* row counter */
- "1: \n\t"
- " movq (%2), %%mm0 \n\t" /* take 8 bytes */
-
- " movq (%3), %%mm2 \n\t"
- " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */
- " pavgb %%mm2, %%mm1 \n\t"
-
- " movq %%mm0, %%mm2 \n\t"
- " movq %%mm1, %%mm3 \n\t"
-
- " punpcklbw %%mm6, %%mm0 \n\t"
- " punpcklbw %%mm4, %%mm1 \n\t"
- " punpckhbw %%mm6, %%mm2 \n\t"
- " punpckhbw %%mm4, %%mm3 \n\t"
-
- " psubsw %%mm1, %%mm0 \n\t" /* src - averaged ref, low 4 pixels */
- " psubsw %%mm3, %%mm2 \n\t" /* src - averaged ref, high 4 pixels */
-
- " paddw %%mm0, %%mm5 \n\t"
- " paddw %%mm2, %%mm5 \n\t"
-
- " pmaddwd %%mm0, %%mm0 \n\t" /* square each word and add adjacent pairs */
- " pmaddwd %%mm2, %%mm2 \n\t"
-
- " paddd %%mm0, %%mm7 \n\t"
- " paddd %%mm2, %%mm7 \n\t"
-
- " add %5, %2 \n\t" /* Inc pointer into src data */
- " add %6, %3 \n\t" /* Inc pointer into ref data */
- " add %6, %4 \n\t" /* Inc pointer into ref data */
-
- " dec %%rdi \n\t"
- " jnz 1b \n\t"
-
- " movq %%mm5, %%mm0 \n\t" /* fold the four word lanes of mm5 into its low word */
- " psrlq $32, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movq %%mm5, %%mm0 \n\t"
- " psrlq $16, %%mm5 \n\t"
- " paddw %%mm0, %%mm5 \n\t"
- " movd %%mm5, %%edi \n\t"
- " movsx %%di, %%edi \n\t" /* keep only the low 16 bits, sign-extended to 32 */
- " movl %%edi, %0 \n\t" /* NOTE(review): 32-bit store into the 64-bit XSum - confirm the upper half is irrelevant */
-
- " movq %%mm7, %%mm0 \n\t" /* fold the two dword lanes of mm7 into its low dword */
- " psrlq $32, %%mm7 \n\t"
- " paddd %%mm0, %%mm7 \n\t"
- " movd %%mm7, %1 \n\t" /* NOTE(review): with a memory operand this looks like a 32-bit store into the 64-bit XXSum - confirm */
-
- : "=m" (XSum), /* %0 */
- "=m" (XXSum), /* %1 */
- "+r" (SrcData), /* %2 */
- "+r" (RefDataPtr1), /* %3 */
- "+r" (RefDataPtr2) /* %4 */
- : "r" ((ogg_uint64_t)SrcStride), /* %5 */
- "r" ((ogg_uint64_t)RefStride) /* %6, pitch shared by both references */
- : "rdi", "memory"
- );
-
- /* Compute and return population variance as mis-match metric. */
- return (( (XXSum<<6) - XSum*XSum )); /* 64-bit arithmetic, narrowed to 32 bits by the return type */
-}
-
-void dsp_mmxext_init(DspFunctions *funcs)
-{ /* Points six slots of funcs at the MMXEXT implementations defined in this file. */
- fprintf(stderr, "enabling accerated x86_64 mmxext dsp functions.\n"); /* NOTE(review): "accerated" is misspelled in the emitted message; left untouched here */
- funcs->row_sad8 = row_sad8__mmxext;
- funcs->col_sad8x8 = col_sad8x8__mmxext;
- funcs->sad8x8 = sad8x8__mmxext;
- funcs->sad8x8_thres = sad8x8_thres__mmxext;
- funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;
- funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext;
-}
Copied: trunk/theora/lib/x86_64/dsp_mmxext.c (from rev 11426, branches/theora-mmx/lib/x86_64/dsp_mmxext.c)
Deleted: trunk/theora/lib/x86_64/fdct_mmx.c
===================================================================
--- branches/theora-mmx/lib/x86_64/fdct_mmx.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/x86_64/fdct_mmx.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -1,349 +0,0 @@
-;//==========================================================================
-;//
-;// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
-;// KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-;// IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR
-;// PURPOSE.
-;//
-;// Copyright (c) 1999 - 2001 On2 Technologies Inc. All Rights Reserved.
-;//
-;//--------------------------------------------------------------------------
-
-#include <stdio.h>
-#include <theora/theora.h>
-#include "dsp.h"
-
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC1S7 = 0x0fb15fb15fb15fb15LL; /* 0xfb15 in each of four 16-bit lanes */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC2S6 = 0x0ec83ec83ec83ec83LL; /* 0xec83 in each lane */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL; /* 0xd4db in each lane */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC4S4 = 0x0b505b505b505b505LL; /* 0xb505 in each lane */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL; /* 0x8e3a in each lane */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC6S2 = 0x061f861f861f861f8LL; /* 0x61f8 in each lane */
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC7S1 = 0x031f131f131f131f1LL; /* 0x31f1 in each lane */
-
-#if defined(__MINGW32__) || defined(__CYGWIN__) || \
- defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
-# define M(a) "_" #a /* asm-level symbol name of a, with a leading underscore on these targets */
-#else
-# define M(a) #a /* asm-level symbol name of a */
-#endif
-
-/***********************************************************************
- * File: fdct_m.asm
- *
- * Description:
- * This function performs a 2-D forward DCT on an 8x8 block
- *
- *
- * Input: Pointers to input source data buffer and destination
- * buffer.
- *
- * Note: none
- *
- * Special Notes: We try to do the truncation right to match the result
- * of the c version.
- *
- ************************************************************************/
-
-/* execute stage 1 of forward DCT: expands to asm text that reads the memory operands ip0..ip7 (four 16-bit lanes each), overwrites each with an output term, uses temp as a 64-bit scratch slot, clobbers mm0-mm7, and needs the named asm operands xC1S7..xC7S1 */
-#define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp) \
- " movq " #ip0 ", %%mm0 \n\t" \
- " movq " #ip1 ", %%mm1 \n\t" \
- " movq " #ip3 ", %%mm2 \n\t" \
- " movq " #ip5 ", %%mm3 \n\t" \
- " movq %%mm0, %%mm4 \n\t" \
- " movq %%mm1, %%mm5 \n\t" \
- " movq %%mm2, %%mm6 \n\t" \
- " movq %%mm3, %%mm7 \n\t" \
- \
- " paddsw " #ip7 ", %%mm0 \n\t" /* mm0 = ip0 + ip7 = is07 */ \
- " paddsw " #ip2 ", %%mm1 \n\t" /* mm1 = ip1 + ip2 = is12 */ \
- " paddsw " #ip4 ", %%mm2 \n\t" /* mm2 = ip3 + ip4 = is34 */ \
- " paddsw " #ip6 ", %%mm3 \n\t" /* mm3 = ip5 + ip6 = is56 */ \
- " psubsw " #ip7 ", %%mm4 \n\t" /* mm4 = ip0 - ip7 = id07 */ \
- " psubsw " #ip2 ", %%mm5 \n\t" /* mm5 = ip1 - ip2 = id12 */ \
- \
- " psubsw %%mm2, %%mm0 \n\t" /* mm0 = is07 - is34 */ \
- \
- " paddsw %%mm2, %%mm2 \n\t" \
- \
- " psubsw " #ip4 ", %%mm6 \n\t" /* mm6 = ip3 - ip4 = id34 */ \
- \
- " paddsw %%mm0, %%mm2 \n\t" /* mm2 = is07 + is34 = is0734 */ \
- " psubsw %%mm3, %%mm1 \n\t" /* mm1 = is12 - is56 */ \
- " movq %%mm0," #temp " \n\t" /* Save is07 - is34 to free mm0; */ \
- " paddsw %%mm3, %%mm3 \n\t" \
- " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + is56 = is1256 */ \
- \
- " psubsw " #ip6 ", %%mm7 \n\t" /* mm7 = ip5 - ip6 = id56 */ \
- /* ------------------------------------------------------------------- */ \
- " psubsw %%mm7, %%mm5 \n\t" /* mm5 = id12 - id56 */ \
- " paddsw %%mm7, %%mm7 \n\t" \
- " paddsw %%mm5, %%mm7 \n\t" /* mm7 = id12 + id56 */ \
- /* ------------------------------------------------------------------- */ \
- " psubsw %%mm3, %%mm2 \n\t" /* mm2 = is0734 - is1256 */ \
- " paddsw %%mm3, %%mm3 \n\t" \
- \
- " movq %%mm2, %%mm0 \n\t" /* make a copy */ \
- " paddsw %%mm2, %%mm3 \n\t" /* mm3 = is0734 + is1256 */ \
- \
- " pmulhw %[xC4S4], %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \
- " paddw %%mm2, %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \
- " psrlw $15, %%mm2 \n\t" /* mm2 = sign bit of each lane */ \
- " paddw %%mm2, %%mm0 \n\t" /* Truncate mm0, now it is op[4] */ \
- \
- " movq %%mm3, %%mm2 \n\t" \
- " movq %%mm0," #ip4 " \n\t" /* save ip4, now mm0,mm2 are free */ \
- \
- " movq %%mm3, %%mm0 \n\t" \
- " pmulhw %[xC4S4], %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
- \
- " psrlw $15, %%mm2 \n\t" \
- " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \
- " paddw %%mm2, %%mm3 \n\t" /* Truncate mm3, now it is op[0] */ \
- \
- " movq %%mm3," #ip0 " \n\t" \
- /* ------------------------------------------------------------------- */ \
- " movq " #temp ", %%mm3 \n\t" /* mm3 = irot_input_y */ \
- " pmulhw %[xC2S6], %%mm3 \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \
- \
- " movq " #temp ", %%mm2 \n\t" \
- " movq %%mm2, %%mm0 \n\t" \
- \
- " psrlw $15, %%mm2 \n\t" /* mm2 = sign bit of irot_input_y */ \
- " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC2S6 * irot_input_y */ \
- \
- " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
- " movq %%mm5, %%mm0 \n\t" \
- \
- " movq %%mm5, %%mm2 \n\t" \
- " pmulhw %[xC6S2], %%mm0 \n\t" /* mm0 = xC6S2 * irot_input_x */ \
- \
- " psrlw $15, %%mm2 \n\t" \
- " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \
- \
- " paddsw %%mm0, %%mm3 \n\t" /* ip[2] */ \
- " movq %%mm3," #ip2 " \n\t" /* Save ip2 */ \
- \
- " movq %%mm5, %%mm0 \n\t" \
- " movq %%mm5, %%mm2 \n\t" \
- \
- " pmulhw %[xC2S6], %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \
- " psrlw $15, %%mm2 \n\t" \
- \
- " movq " #temp ", %%mm3 \n\t" \
- " paddw %%mm0, %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x */ \
- \
- " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \
- " movq %%mm3, %%mm2 \n\t" \
- \
- " pmulhw %[xC6S2], %%mm3 \n\t" /* mm3 = xC6S2 * irot_input_y */ \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
- " psubsw %%mm5, %%mm3 \n\t" \
- \
- " movq %%mm3," #ip6 " \n\t" \
- /* ------------------------------------------------------------------- */ \
- " movq %[xC4S4], %%mm0 \n\t" \
- " movq %%mm1, %%mm2 \n\t" \
- " movq %%mm1, %%mm3 \n\t" \
- \
- " pmulhw %%mm0, %%mm1 \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) */ \
- " paddw %%mm2, %%mm1 \n\t" /* Truncate mm1, now it is icommon_product1 */ \
- \
- " movq %%mm7, %%mm2 \n\t" \
- " movq %%mm7, %%mm3 \n\t" \
- \
- " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm3, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
- " paddw %%mm2, %%mm7 \n\t" /* Truncate mm7, now it is icommon_product2 */ \
- /* ------------------------------------------------------------------- */ \
- " pxor %%mm0, %%mm0 \n\t" /* Clear mm0 */ \
- " psubsw %%mm6, %%mm0 \n\t" /* mm0 = - id34 */ \
- \
- " psubsw %%mm7, %%mm0 \n\t" /* mm0 = - ( id34 + idcommon_product2 ) */ \
- " paddsw %%mm6, %%mm6 \n\t" \
- " paddsw %%mm0, %%mm6 \n\t" /* mm6 = id34 - icommon_product2 */ \
- \
- " psubsw %%mm1, %%mm4 \n\t" /* mm4 = id07 - icommon_product1 */ \
- " paddsw %%mm1, %%mm1 \n\t" \
- " paddsw %%mm4, %%mm1 \n\t" /* mm1 = id07 + icommon_product1 */ \
- /* ------------------------------------------------------------------- */ \
- " movq %[xC1S7], %%mm7 \n\t" \
- " movq %%mm1, %%mm2 \n\t" \
- \
- " movq %%mm1, %%mm3 \n\t" \
- " pmulhw %%mm7, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \
- \
- " movq %[xC7S1], %%mm7 \n\t" \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x */ \
- " paddw %%mm2, %%mm1 \n\t" /* Truncated */ \
- \
- " pmulhw %%mm7, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x */ \
- " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
- \
- " movq %%mm0, %%mm5 \n\t" \
- " movq %%mm0, %%mm2 \n\t" \
- \
- " movq %[xC1S7], %%mm7 \n\t" \
- " pmulhw %%mm7, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \
- \
- " movq %[xC7S1], %%mm7 \n\t" \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm5, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y */ \
- " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \
- \
- " pmulhw %%mm7, %%mm5 \n\t" /* mm5 = xC7S1 * irot_input_y */ \
- " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \
- \
- " psubsw %%mm5, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \
- " paddsw %%mm0, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x + xC1S7 * irot_input_y = ip7 */ \
- \
- " movq %%mm1," #ip1 " \n\t" \
- " movq %%mm3," #ip7 " \n\t" \
- /* ------------------------------------------------------------------- */ \
- " movq %[xC3S5], %%mm0 \n\t" \
- " movq %[xC5S3], %%mm1 \n\t" \
- \
- " movq %%mm6, %%mm5 \n\t" \
- " movq %%mm6, %%mm7 \n\t" \
- \
- " movq %%mm4, %%mm2 \n\t" \
- " movq %%mm4, %%mm3 \n\t" \
- \
- " pmulhw %%mm0, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \
- " pmulhw %%mm1, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \
- \
- " psrlw $15, %%mm2 \n\t" \
- " psrlw $15, %%mm5 \n\t" \
- \
- " paddw %%mm3, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x */ \
- " paddw %%mm7, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y */ \
- \
- " paddw %%mm2, %%mm4 \n\t" /* Truncated */ \
- " paddw %%mm5, %%mm6 \n\t" /* Truncated */ \
- \
- " psubsw %%mm6, %%mm4 \n\t" /* ip3 */ \
- " movq %%mm4," #ip3 " \n\t" \
- \
- " movq %%mm3, %%mm4 \n\t" \
- " movq %%mm7, %%mm6 \n\t" \
- \
- " pmulhw %%mm1, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \
- " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \
- \
- " paddw %%mm2, %%mm4 \n\t" /* fold the sign-bit correction into the value added back */ \
- " paddw %%mm5, %%mm6 \n\t" \
- \
- " paddw %%mm4, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x */ \
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y */ \
- \
- " paddw %%mm7, %%mm3 \n\t" /* ip5 */ \
- " movq %%mm3," #ip5 " \n\t"
-
-#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7, /* ip0-ip7: eight 64-bit source operands, four 16-bit words each */ \
- op0,op1,op2,op3,op4,op5,op6,op7) /* op0-op7: destinations; a-d (ip0-ip3) and e-h (ip4-ip7) are transposed as two separate 4x4 word blocks */ \
- " movq " #ip0 ", %%mm0 \n\t" /* mm0 = a3 a2 a1 a0 (high word first, here and below) */ \
- " movq " #ip4 ", %%mm4 \n\t" /* mm4 = e3 e2 e1 e0 */ \
- " movq " #ip1 ", %%mm1 \n\t" /* mm1 = b3 b2 b1 b0 */ \
- " movq " #ip5 ", %%mm5 \n\t" /* mm5 = f3 f2 f1 f0 */ \
- " movq " #ip2 ", %%mm2 \n\t" /* mm2 = c3 c2 c1 c0 */ \
- " movq " #ip6 ", %%mm6 \n\t" /* mm6 = g3 g2 g1 g0 */ \
- " movq " #ip3 ", %%mm3 \n\t" /* mm3 = d3 d2 d1 d0 */ \
- " movq %%mm1," #op1 " \n\t" /* save b3 b2 b1 b0 (op1 doubles as scratch until its final store) */ \
- " movq " #ip7 ", %%mm7 \n\t" /* mm7 = h3 h2 h1 h0 */ \
- /* Transpose the e-h block; its results go to op4-op7 */ \
- " movq %%mm4, %%mm1 \n\t" /* mm1 = e3 e2 e1 e0 */ \
- " punpcklwd %%mm5, %%mm4 \n\t" /* mm4 = f1 e1 f0 e0 */ \
- " movq %%mm0," #op0 " \n\t" /* save a3 a2 a1 a0 (op0 doubles as scratch until its final store) */ \
- " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \
- " movq %%mm6, %%mm0 \n\t" /* mm0 = g3 g2 g1 g0 */ \
- " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \
- " movq %%mm4, %%mm5 \n\t" /* mm5 = f1 e1 f0 e0 */ \
- " punpckldq %%mm6, %%mm4 \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \
- " punpckhdq %%mm6, %%mm5 \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \
- " movq %%mm1, %%mm6 \n\t" /* mm6 = f3 e3 f2 e2 */ \
- " movq %%mm4," #op4 " \n\t" \
- " punpckhwd %%mm7, %%mm0 \n\t" /* mm0 = h3 g3 h2 g2 */ \
- " movq %%mm5," #op5 " \n\t" \
- " punpckhdq %%mm0, %%mm6 \n\t" /* mm6 = h3 g3 f3 e3 = MM7 */ \
- " movq " #op0 ", %%mm4 \n\t" /* mm4 = a3 a2 a1 a0 (reloaded; the a-d block starts here) */ \
- " punpckldq %%mm0, %%mm1 \n\t" /* mm1 = h2 g2 f2 e2 = MM6 */ \
- " movq " #op1 ", %%mm5 \n\t" /* mm5 = b3 b2 b1 b0 (reloaded) */ \
- " movq %%mm4, %%mm0 \n\t" /* mm0 = a3 a2 a1 a0 */ \
- " movq %%mm6," #op7 " \n\t" \
- " punpcklwd %%mm5, %%mm0 \n\t" /* mm0 = b1 a1 b0 a0 */ \
- " movq %%mm1," #op6 " \n\t" \
- " punpckhwd %%mm5, %%mm4 \n\t" /* mm4 = b3 a3 b2 a2 */ \
- " movq %%mm2, %%mm5 \n\t" /* mm5 = c3 c2 c1 c0 */ \
- " punpcklwd %%mm3, %%mm2 \n\t" /* mm2 = d1 c1 d0 c0 */ \
- " movq %%mm0, %%mm1 \n\t" /* mm1 = b1 a1 b0 a0 */ \
- " punpckldq %%mm2, %%mm0 \n\t" /* mm0 = d0 c0 b0 a0 = MM0 */ \
- " punpckhdq %%mm2, %%mm1 \n\t" /* mm1 = d1 c1 b1 a1 = MM1 */ \
- " movq %%mm4, %%mm2 \n\t" /* mm2 = b3 a3 b2 a2 */ \
- " movq %%mm0," #op0 " \n\t" \
- " punpckhwd %%mm3, %%mm5 \n\t" /* mm5 = d3 c3 d2 c2 */ \
- " movq %%mm1," #op1 " \n\t" \
- " punpckhdq %%mm5, %%mm4 \n\t" /* mm4 = d3 c3 b3 a3 = MM3 */ \
- " punpckldq %%mm5, %%mm2 \n\t" /* mm2 = d2 c2 b2 a2 = MM2 */ \
- " movq %%mm4," #op3 " \n\t" \
- " movq %%mm2," #op2 " \n\t"
-
-
-static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData)
-{
- ogg_int64_t __attribute__((aligned(8))) align_tmp[16]; /* 16 elements, 8-byte aligned */
- ogg_int16_t *const temp= (ogg_int16_t*)align_tmp; /* passed to the asm as operand %2 */
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
- /*
- * Forward DCT of one 8x8 block of 16-bit values, read from InputData
- * (%0) and written to OutputData (%1), using MMX.
- *
- * The block is processed as four groups of eight quadwords. Each group
- * is transposed by Transpose_mmx and then run through Fdct_mmx:
- *   1. InputData byte offsets 0-56    -> OutputData offsets 0-56
- *   2. InputData byte offsets 64-120  -> OutputData offsets 64-120
- *   3. OutputData offsets 0,16,...,112, in place
- *   4. OutputData offsets 8,24,...,120, in place
- *
- * temp (%2) is handed to Fdct_mmx as its last argument.
- * NOTE(review): presumably scratch storage -- confirm against the
- * Fdct_mmx definition. MMX state is cleared with emms before returning.
- */
- Transpose_mmx ( (%0), 16(%0), 32(%0), 48(%0), 8(%0), 24(%0), 40(%0), 56(%0),
- (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1))
- Fdct_mmx ( (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1), (%2))
-
- Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
- 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
- Fdct_mmx (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
-
- Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
- 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
- Fdct_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
-
- Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
- 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
- Fdct_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
-
- " emms \n\t"
-
- : "+r" (InputData),
- "+r" (OutputData)
- : "r" (temp),
- [xC1S7] "m" (xC1S7), /* gcc 3.1+ allows named asm parameters */
- [xC2S6] "m" (xC2S6),
- [xC3S5] "m" (xC3S5),
- [xC4S4] "m" (xC4S4),
- [xC5S3] "m" (xC5S3),
- [xC6S2] "m" (xC6S2),
- [xC7S1] "m" (xC7S1)
- : "memory"
- );
-}
-
-/*
- * Point the fdct_short slot of the DSP function table at the MMX
- * implementation, after announcing the switch on stderr.
- */
-void dsp_mmx_fdct_init(DspFunctions *funcs)
-{
-  fputs("enabling accelerated x86_64 mmx fdct function.\n", stderr);
-  funcs->fdct_short = fdct_short__mmx;
-}
Copied: trunk/theora/lib/x86_64/fdct_mmx.c (from rev 11426, branches/theora-mmx/lib/x86_64/fdct_mmx.c)
Deleted: trunk/theora/lib/x86_64/recon_mmx.c
===================================================================
--- branches/theora-mmx/lib/x86_64/recon_mmx.c 2006-05-26 18:28:02 UTC (rev 11426)
+++ trunk/theora/lib/x86_64/recon_mmx.c 2006-05-26 18:51:09 UTC (rev 11427)
@@ -1,181 +0,0 @@
-/********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
- * by the Xiph.Org Foundation http://www.xiph.org/ *
- * *
- ********************************************************************
-
- function:
- last mod: $Id: reconstruct.c,v 1.6 2003/12/03 08:59:41 arc Exp $
-
- ********************************************************************/
-
-#include <stdio.h>
-#include "codec_internal.h"
-
-static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x8080808080808080LL; /* 0x80 in each of the 8 bytes; XORed into packed signed bytes by recon_intra8x8__mmx */
-
-/*
- * Copy an 8x8 block of bytes from src to dest using MMX.
- *
- * src    - first byte of the source block
- * dest   - first byte of the destination block
- * stride - distance in bytes between consecutive rows; the same value is
- *          used for both src and dest
- *
- * The asm advances both pointers by four rows halfway through, so src is
- * declared read-write ("+c") alongside dest: GCC assumes an input-only
- * operand still holds its original value after the asm.  Operand numbers
- * (%0 = dest, %1 = src, %2 = stride) are unchanged.  No emms is issued here.
- */
-static void copy8x8__mmx (unsigned char *src,
- unsigned char *dest,
- ogg_uint32_t stride)
-{
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " lea (%2, %2, 2), %%rdi \n\t" /* rdi = 3 * stride */
-
- " movq (%1), %%mm0 \n\t" /* load rows 0-3 */
- " movq (%1, %2), %%mm1 \n\t"
- " movq (%1, %2, 2), %%mm2 \n\t"
- " movq (%1, %%rdi), %%mm3 \n\t"
-
- " lea (%1, %2, 4), %1 \n\t" /* src += 4 * stride */
-
- " movq %%mm0, (%0) \n\t" /* store rows 0-3 */
- " movq %%mm1, (%0, %2) \n\t"
- " movq %%mm2, (%0, %2, 2) \n\t"
- " movq %%mm3, (%0, %%rdi) \n\t"
-
- " lea (%0, %2, 4), %0 \n\t" /* dest += 4 * stride */
-
- " movq (%1), %%mm0 \n\t" /* load rows 4-7 */
- " movq (%1, %2), %%mm1 \n\t"
- " movq (%1, %2, 2), %%mm2 \n\t"
- " movq (%1, %%rdi), %%mm3 \n\t"
-
- " movq %%mm0, (%0) \n\t" /* store rows 4-7 */
- " movq %%mm1, (%0, %2) \n\t"
- " movq %%mm2, (%0, %2, 2) \n\t"
- " movq %%mm3, (%0, %%rdi) \n\t"
- : "+a" (dest),
- "+c" (src)
- : "d" ((ogg_uint64_t)stride)
- : "memory", "rdi"
- );
-}
-
-/*
- * Reconstruct an intra-coded 8x8 block using MMX.
- *
- * ReconPtr  - destination; one 8-byte row is stored per iteration and the
- *             pointer then advances by LineStep bytes
- * ChangePtr - 64 signed 16-bit values (128 bytes), consumed 16 bytes per row
- * LineStep  - destination row pitch in bytes
- *
- * Each row is packed to bytes with signed saturation and then XORed with
- * 0x80 per byte, which is the same as adding 128.
- *
- * ChangePtr is stepped inside the loop, so it is declared read-write ("+r"):
- * GCC assumes an input-only operand still holds its original value after
- * the asm.  Operand numbers (%0, %1, %2) are unchanged.  No emms is issued.
- */
-static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
- ogg_uint32_t LineStep)
-{
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " movq %[V128], %%mm0 \n\t" /* Set mm0 to 0x8080808080808080 */
-
- " lea 128(%1), %%rdi \n\t" /* Endpoint in input buffer */
- "1: \n\t"
- " movq (%1), %%mm2 \n\t" /* First four input values */
-
- " packsswb 8(%1), %%mm2 \n\t" /* pack with next(high) four values */
- " por %%mm0, %%mm0 \n\t" /* leaves mm0 unchanged */
- " pxor %%mm0, %%mm2 \n\t" /* Convert result to unsigned (same as add 128) */
- " lea 16(%1), %1 \n\t" /* Step source buffer */
- " cmp %%rdi, %1 \n\t" /* are we done */
-
- " movq %%mm2, (%0) \n\t" /* store results */
-
- " lea (%0, %2), %0 \n\t" /* Step output buffer (lea keeps the cmp flags) */
- " jc 1b \n\t" /* Loop back if we are not done */
- : "+r" (ReconPtr),
- "+r" (ChangePtr)
- : "r" ((ogg_uint64_t)LineStep),
- [V128] "m" (V128)
- : "memory", "rdi"
- );
-}
-
-/*
- * Reconstruct an inter-coded 8x8 block using MMX: reference pixels plus
- * the 16-bit changes, clamped to unsigned 8-bit.
- *
- * ReconPtr  - destination; one 8-byte row is stored per iteration
- * RefPtr    - reference pixels, 8 bytes read per row
- * ChangePtr - 64 signed 16-bit values (128 bytes), consumed 16 bytes per row
- * LineStep  - row pitch in bytes, applied to both ReconPtr and RefPtr
- *
- * The additions use signed saturation (paddsw) and the final pack uses
- * unsigned saturation (packuswb).
- *
- * ChangePtr and RefPtr are stepped inside the loop, so they are declared
- * read-write ("+r"): GCC assumes an input-only operand still holds its
- * original value after the asm.  Operand numbers (%0 = ReconPtr,
- * %1 = ChangePtr, %2 = RefPtr, %3 = LineStep) are unchanged.  No emms is
- * issued here.
- */
-static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
- ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
-{
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm0, %%mm0 \n\t" /* mm0 = 0, used to zero-extend bytes */
- " lea 128(%1), %%rdi \n\t" /* end of the change buffer */
-
- "1: \n\t"
- " movq (%2), %%mm2 \n\t" /* (+3 misaligned) 8 reference pixels */
-
- " movq (%1), %%mm4 \n\t" /* first 4 changes */
- " movq %%mm2, %%mm3 \n\t"
- " movq 8(%1), %%mm5 \n\t" /* last 4 changes */
- " punpcklbw %%mm0, %%mm2 \n\t" /* turn first 4 refs into positive 16-bit #s */
- " paddsw %%mm4, %%mm2 \n\t" /* add in first 4 changes */
- " punpckhbw %%mm0, %%mm3 \n\t" /* turn last 4 refs into positive 16-bit #s */
- " paddsw %%mm5, %%mm3 \n\t" /* add in last 4 changes */
- " add %3, %2 \n\t" /* next row of reference pixels */
- " packuswb %%mm3, %%mm2 \n\t" /* pack result to unsigned 8-bit values */
- " lea 16(%1), %1 \n\t" /* next row of changes */
- " cmp %%rdi, %1 \n\t" /* are we done? */
-
- " movq %%mm2, (%0) \n\t" /* store result */
-
- " lea (%0, %3), %0 \n\t" /* next row of output (lea keeps the cmp flags) */
- " jc 1b \n\t"
- : "+r" (ReconPtr),
- "+r" (ChangePtr),
- "+r" (RefPtr)
- : "r" ((ogg_uint64_t)LineStep)
- : "memory", "rdi"
- );
-}
-
-/*
- * Reconstruct an inter-coded 8x8 block from the average of two references
- * using MMX: ((ref1 + ref2) >> 1) plus the 16-bit changes, packed to
- * unsigned 8-bit with saturation.
- *
- * ReconPtr  - destination; one 8-byte row is stored per iteration
- * RefPtr1   - first reference, 8 bytes read per row
- * RefPtr2   - second reference, 8 bytes read per row
- * ChangePtr - 64 signed 16-bit values (128 bytes), consumed 16 bytes per row
- * LineStep  - row pitch in bytes, applied to ReconPtr, RefPtr1 and RefPtr2
- *
- * Unlike recon_inter8x8__mmx, the changes are added with wrapping paddw
- * rather than saturating paddsw.
- *
- * ChangePtr, RefPtr1 and RefPtr2 are stepped inside the loop, so they are
- * declared read-write ("+r"): GCC assumes an input-only operand still holds
- * its original value after the asm.  Operand numbers (%0 = ReconPtr,
- * %1 = ChangePtr, %2 = RefPtr1, %3 = RefPtr2, %4 = LineStep) are unchanged.
- * No emms is issued here.
- */
-static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
- unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
- ogg_uint32_t LineStep)
-{
- __asm__ __volatile__ (
- " .balign 16 \n\t"
-
- " pxor %%mm0, %%mm0 \n\t" /* mm0 = 0, used to zero-extend bytes */
- " lea 128(%1), %%rdi \n\t" /* end of the change buffer */
-
- "1: \n\t"
- " movq (%2), %%mm2 \n\t" /* (+3 misaligned) 8 reference pixels */
- " movq (%3), %%mm4 \n\t" /* (+3 misaligned) 8 reference pixels */
-
- " movq %%mm2, %%mm3 \n\t"
- " punpcklbw %%mm0, %%mm2 \n\t" /* mm2 = start ref1 as positive 16-bit #s */
- " movq %%mm4, %%mm5 \n\t"
- " movq (%1), %%mm6 \n\t" /* first 4 changes */
- " punpckhbw %%mm0, %%mm3 \n\t" /* mm3 = end ref1 as positive 16-bit #s */
- " movq 8(%1), %%mm7 \n\t" /* last 4 changes */
- " punpcklbw %%mm0, %%mm4 \n\t" /* mm4 = start ref2 as positive 16-bit #s */
- " punpckhbw %%mm0, %%mm5 \n\t" /* mm5 = end ref2 as positive 16-bit #s */
- " paddw %%mm4, %%mm2 \n\t" /* mm2 = start (ref1 + ref2) */
- " paddw %%mm5, %%mm3 \n\t" /* mm3 = end (ref1 + ref2) */
- " psrlw $1, %%mm2 \n\t" /* mm2 = start (ref1 + ref2)/2 */
- " psrlw $1, %%mm3 \n\t" /* mm3 = end (ref1 + ref2)/2 */
- " paddw %%mm6, %%mm2 \n\t" /* add changes to start */
- " paddw %%mm7, %%mm3 \n\t" /* add changes to end */
- " lea 16(%1), %1 \n\t" /* next row of changes */
- " packuswb %%mm3, %%mm2 \n\t" /* pack start|end to unsigned 8-bit */
- " add %4, %2 \n\t" /* next row of reference pixels */
- " add %4, %3 \n\t" /* next row of reference pixels */
- " movq %%mm2, (%0) \n\t" /* store result */
- " add %4, %0 \n\t" /* next row of output */
- " cmp %%rdi, %1 \n\t" /* are we done? */
- " jc 1b \n\t"
- : "+r" (ReconPtr),
- "+r" (ChangePtr),
- "+r" (RefPtr1),
- "+r" (RefPtr2)
- : "r" ((ogg_uint64_t)LineStep)
- : "memory", "rdi"
- );
-}
-
-/*
- * Announce on stderr that the MMX reconstruction routines are in use, then
- * install them in the block-copy and reconstruction slots of the DSP
- * function table.
- */
-void dsp_mmx_recon_init(DspFunctions *funcs)
-{
-  fputs("enabling accelerated x86_64 mmx recon functions.\n", stderr);
-
-  /* plain block copy */
-  funcs->copy8x8 = copy8x8__mmx;
-
-  /* intra / inter / half-pel inter reconstruction */
-  funcs->recon_intra8x8 = recon_intra8x8__mmx;
-  funcs->recon_inter8x8 = recon_inter8x8__mmx;
-  funcs->recon_inter8x8_half = recon_inter8x8_half__mmx;
-}
-
Copied: trunk/theora/lib/x86_64/recon_mmx.c (from rev 11426, branches/theora-mmx/lib/x86_64/recon_mmx.c)
More information about the commits
mailing list