[xiph-commits] r3850 - in liboggplay/trunk: . include/oggplay src/examples src/liboggplay
wiking at svn.annodex.net
wiking at svn.annodex.net
Fri Feb 20 01:19:46 PST 2009
Author: wiking
Date: 2009-02-20 01:19:45 -0800 (Fri, 20 Feb 2009)
New Revision: 3850
Added:
liboggplay/trunk/src/liboggplay/cpu.c
liboggplay/trunk/src/liboggplay/cpu.h
liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_altivec.c
liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_template.h
liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_x86.c
liboggplay/trunk/src/liboggplay/yuv2rgb_x86.h
liboggplay/trunk/src/liboggplay/yuv2rgb_x86_vs.h
Modified:
liboggplay/trunk/configure.ac
liboggplay/trunk/include/oggplay/oggplay_tools.h
liboggplay/trunk/src/examples/glut-player.c
liboggplay/trunk/src/examples/mac-player.c
liboggplay/trunk/src/examples/win32-player.c
liboggplay/trunk/src/liboggplay/Makefile.am
liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb.c
Log:
yuv to rgb conversion enhancements:
- runtime cpu extension detection (from theora) + altivec detection support
- altivec implementation of yuv2rgb functions
- sse2 implementation of yuv2rgb functions
!!!WARNING!!!
The API for the conversion has been changed, in order to avoid confusion:
oggplay_yuv2bgr -> oggplay_yuv2bgra
oggplay_yuv2rgb -> oggplay_yuv2rgba
A new function, 'oggplay_yuv2argb', has been introduced, as e.g. DirectX uses ARGB packing.
Modified: liboggplay/trunk/configure.ac
===================================================================
--- liboggplay/trunk/configure.ac 2009-02-19 05:46:08 UTC (rev 3849)
+++ liboggplay/trunk/configure.ac 2009-02-20 09:19:45 UTC (rev 3850)
@@ -277,41 +277,6 @@
fi
dnl changequote([,])dnl
-dnl Define __SSE2__ to include MMX intrinsics (SSE2) code.
-dnl Sets also SSE2_CFLAGS for the compile time.
-
-SSE2_CFLAGS="-msse2 -march=pentium3"
-
-have_mmx_intrinsics=no
-AC_MSG_CHECKING(For MMX/SSE intrinsics in the compiler)
-liboggplay_save_CFLAGS=$CFLAGS
-CFLAGS="$CFLAGS $SSE2_CFLAGS"
-AC_COMPILE_IFELSE([
-#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
-#error "Need GCC >= 3.4 for MMX SSE2 intrinsics"
-#endif
-#include <xmmintrin.h>
-int main () {
- __m64 zero =_mm_setzero_si64();
- __m64 ut = _m_from_int(0);
- __m64 vt = _m_from_int(0);
- ut = _m_punpcklbw(ut, zero);
- vt = _m_punpcklbw(vt, zero);
- vt = _mm_or_si64 (ut, vt);
- return _mm_cvtsi64_si32 (vt);
-}], have_mmx_intrinsics=yes)
-CFLAGS=$liboggplay_save_CFLAGS
-AC_MSG_RESULT($have_mmx_intrinsics)
-
-if test $have_mmx_intrinsics = yes ; then
- AC_DEFINE(__SSE2__, 1, [use MMX SSE2 compiler intrinsics])
-else
- SSE2_CFLAGS=
-fi
-AC_SUBST(SSE2_CFLAGS)
-
-AM_CONDITIONAL(__SSE2__, test $have_mmx_intrinsics = yes)
-
dnl
dnl Configuration tests complete -- provide summary of results.
dnl
Modified: liboggplay/trunk/include/oggplay/oggplay_tools.h
===================================================================
--- liboggplay/trunk/include/oggplay/oggplay_tools.h 2009-02-19 05:46:08 UTC (rev 3849)
+++ liboggplay/trunk/include/oggplay/oggplay_tools.h 2009-02-20 09:19:45 UTC (rev 3850)
@@ -72,11 +72,14 @@
} OggPlayRGBChannels;
void
-oggplay_yuv2rgb(OggPlayYUVChannels* yuv, OggPlayRGBChannels * rgb);
+oggplay_yuv2rgba(const OggPlayYUVChannels *yuv, OggPlayRGBChannels * rgb);
void
-oggplay_yuv2bgr(OggPlayYUVChannels* yuv, OggPlayRGBChannels * rgb);
+oggplay_yuv2bgra(const OggPlayYUVChannels* yuv, OggPlayRGBChannels * rgb);
+void
+oggplay_yuv2argb(const OggPlayYUVChannels *yuv, OggPlayRGBChannels * rgb);
+
ogg_int64_t
oggplay_sys_time_in_ms(void);
Modified: liboggplay/trunk/src/examples/glut-player.c
===================================================================
--- liboggplay/trunk/src/examples/glut-player.c 2009-02-19 05:46:08 UTC (rev 3849)
+++ liboggplay/trunk/src/examples/glut-player.c 2009-02-20 09:19:45 UTC (rev 3850)
@@ -141,7 +141,7 @@
rgb.rgb_width = texture_width;
rgb.rgb_height = texture_height;
- oggplay_yuv2rgb(&yuv, &rgb);
+ oggplay_yuv2rgba(&yuv, &rgb);
}
Modified: liboggplay/trunk/src/examples/mac-player.c
===================================================================
--- liboggplay/trunk/src/examples/mac-player.c 2009-02-19 05:46:08 UTC (rev 3849)
+++ liboggplay/trunk/src/examples/mac-player.c 2009-02-20 09:19:45 UTC (rev 3850)
@@ -170,7 +170,7 @@
rgb.rgb_width = texture_width;
rgb.rgb_height = texture_height;
- oggplay_yuv2rgb(&yuv, &rgb);
+ oggplay_yuv2rgba(&yuv, &rgb);
}
Modified: liboggplay/trunk/src/examples/win32-player.c
===================================================================
--- liboggplay/trunk/src/examples/win32-player.c 2009-02-19 05:46:08 UTC (rev 3849)
+++ liboggplay/trunk/src/examples/win32-player.c 2009-02-20 09:19:45 UTC (rev 3850)
@@ -178,58 +178,11 @@
rgb.rgb_height = texture_height;
#if OPENGL
- oggplay_yuv2rgb(&yuv, &rgb);
+ oggplay_yuv2rgba(&yuv, &rgb);
#else
- oggplay_yuv2bgr(&yuv, &rgb);
+ oggplay_yuv2bgra(&yuv, &rgb);
#endif
-
-#else
- ptry = video_data->y;
- ptru = video_data->u;
- ptrv = video_data->v;
- ptro = texture_bits;
-
- for (i = 0; i < y_height; i++) {
- int j;
- ptro2 = ptro;
- for (j = 0; j < y_width; j += 2) {
-
- short pr, pg, pb;
- short r, g, b;
-
- //pr = ((128 + (ptrv[j/2] - 128) * 292) >> 8) - 16; /* 1.14 * 256 */
- pr = (-41344 + ptrv[j/2] * 292) >> 8;
- //pg = ((128 - (ptru[j/2] - 128) * 101 - (ptrv[j/2] - 128) * 149) >> 8)-16;
- // /* 0.395 & 0.581 */
- pg = (28032 - ptru[j/2] * 101 - ptrv[j/2] * 149) >> 8;
- //pb = ((128 + (ptru[j/2] - 128) * 520) >> 8) - 16; /* 2.032 */
- pb = (-70528 + ptru[j/2] * 520) >> 8;
-
- r = ptry[j] + pr;
- g = ptry[j] + pg;
- b = ptry[j] + pb;
-
- *ptro2++ = CLAMP(r);
- *ptro2++ = CLAMP(g);
- *ptro2++ = CLAMP(b);
-
- r = ptry[j + 1] + pr;
- g = ptry[j + 1] + pg;
- b = ptry[j + 1] + pb;
-
- *ptro2++ = CLAMP(b);
- *ptro2++ = CLAMP(g);
- *ptro2++ = CLAMP(r);
- }
- ptry += y_width;
- if (i & 1) {
- ptru += uv_width;
- ptrv += uv_width;
- }
- ptro += po2_width * 3;
- }
-
#endif
}
Modified: liboggplay/trunk/src/liboggplay/Makefile.am
===================================================================
--- liboggplay/trunk/src/liboggplay/Makefile.am 2009-02-19 05:46:08 UTC (rev 3849)
+++ liboggplay/trunk/src/liboggplay/Makefile.am 2009-02-20 09:19:45 UTC (rev 3850)
@@ -1,6 +1,6 @@
## Process this file with automake to produce Makefile.in
-AM_CFLAGS = -Wall $(SSE2_CFLAGS) -Wdeclaration-after-statement
+AM_CFLAGS = -Wall -Wdeclaration-after-statement
INCLUDES = $(INCLTDL) -I$(top_srcdir)/include
@@ -9,13 +9,17 @@
# Libraries to build
lib_LTLIBRARIES = liboggplay.la
-noinst_HEADERS = \
- oggplay_buffer.h \
- oggplay_callback.h \
- oggplay_data.h \
- oggplay_file_reader.h \
- oggplay_private.h \
- oggplay_tcp_reader.h \
+noinst_HEADERS = \
+ cpu.h \
+ oggplay_buffer.h \
+ oggplay_callback.h \
+ oggplay_data.h \
+ oggplay_file_reader.h \
+ oggplay_private.h \
+ oggplay_tcp_reader.h \
+ oggplay_yuv2rgb_template.h \
+ yuv2rgb_x86.h \
+ yuv2rgb_x86_vs.h \
std_semaphore.h
liboggplay_la_SOURCES = \
@@ -27,10 +31,10 @@
oggplay_data.c \
oggplay_callback_info.c \
oggplay_buffer.c \
- oggplay_yuv2rgb.c \
+ oggplay_yuv2rgb.c \
oggplay_seek.c \
oggplay_tools.c
-liboggplay_la_CFLAGS = $(AM_CFLAGS) $(OGGZ_CFLAGS) $(FISHSOUND_CFLAGS)
+liboggplay_la_CFLAGS = $(AM_CFLAGS) $(OGGZ_CFLAGS) $(FISHSOUND_CFLAGS) $(ALTIVEC_CFLAGS)
liboggplay_la_LDFLAGS = -version-info @SHARED_VERSION_INFO@ @SHLIB_VERSION_ARG@
liboggplay_la_LIBADD = @SEMAPHORE_LIBS@ @OGGZ_LIBS@ @FISHSOUND_LIBS@ @THEORA_LIBS@ @KATE_LIBS@
Added: liboggplay/trunk/src/liboggplay/cpu.c
===================================================================
--- liboggplay/trunk/src/liboggplay/cpu.c (rev 0)
+++ liboggplay/trunk/src/liboggplay/cpu.c 2009-02-20 09:19:45 UTC (rev 3850)
@@ -0,0 +1,270 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ CPU capability detection for x86 processors.
+ Originally written by Rudolf Marek.
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#include "cpu.h"
+
+/* for detecting AltiVec support */
+# if (defined(__ppc__) || defined(__ppc64__))
+# if defined(__APPLE__) || defined(__MACOSX__)
+#include <sys/sysctl.h>
+# else
+#include <signal.h>
+#include <setjmp.h>
+# endif
+# endif
+
+# if (defined(__ppc__) || defined(__ppc64__)) && !(defined(__APPLE__) || defined(__MACOSX__))
+static jmp_buf jmpbuf;
+
+static void illegal_instruction(int sig)
+{
+ longjmp(jmpbuf, 1);
+}
+# endif
+
+
+# if !defined(_MSC_VER)
+# if defined(__amd64__)||defined(__x86_64__)
+/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
+ compiling with -fPIC.*/
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+ __asm__ __volatile__( \
+ "cpuid\n\t" \
+ :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+ :"a"(_op) \
+ :"cc" \
+ )
+# else
+/*On x86-32, not so much.*/
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+ __asm__ __volatile__( \
+ "xchgl %%ebx,%[ebx]\n\t" \
+ "cpuid\n\t" \
+ "xchgl %%ebx,%[ebx]\n\t" \
+ :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+ :"a"(_op) \
+ :"cc" \
+ )
+# endif
+# else
+/*Why does MSVC need this complicated rigamarole?
+ At this point I honestly do not care.*/
+
+/*Visual C cpuid helper function.
+ For VS2005 we could as well use the _cpuid builtin, but that wouldn't work
+ for VS2003 users, so we do it in inline assembler.*/
+static void oc_cpuid_helper(ogg_uint32_t _cpu_info[4],ogg_uint32_t _op){
+ _asm {
+ mov eax,[_op]
+ mov esi,_cpu_info
+ cpuid
+ mov [esi+0],eax
+ mov [esi+4],ebx
+ mov [esi+8],ecx
+ mov [esi+12],edx
+ }
+}
+
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+ do{ \
+ ogg_uint32_t cpu_info[4]; \
+ oc_cpuid_helper(cpu_info,_op); \
+ (_eax)=cpu_info[0]; \
+ (_ebx)=cpu_info[1]; \
+ (_ecx)=cpu_info[2]; \
+ (_edx)=cpu_info[3]; \
+ }while(0)
+
+static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){
+ _asm{
+ pushfd
+ pushfd
+ pop eax
+ mov ebx,eax
+ xor eax,200000h
+ push eax
+ popfd
+ pushfd
+ pop eax
+ popfd
+ mov ecx,_eax
+ mov [ecx],eax
+ mov ecx,_ebx
+ mov [ecx],ebx
+ }
+}
+# endif
+
+static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+ ogg_uint32_t flags; /*OR of OC_CPU_X86_* bits; callers pass the EDX/ECX results of cpuid(1).*/
+ /*If there isn't even MMX (EDX bit 23), give up and report no features.*/
+ if(!(_edx&0x00800000))return 0;
+ flags=OC_CPU_X86_MMX;
+ if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE; /*EDX bit 25*/
+ if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2; /*EDX bit 26*/
+ if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI; /*ECX bit 0*/
+ if(_ecx&0x00000200)flags|=OC_CPU_X86_SSSE3; /*ECX bit 9; bit 8 (0x100) is Thermal Monitor 2, not SSSE3*/
+ if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1; /*ECX bit 19*/
+ if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2; /*ECX bit 20*/
+ return flags;
+}
+
+static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+ ogg_uint32_t flags; /*OR of OC_CPU_X86_* bits; callers pass the EDX/ECX results of cpuid(0x80000001).*/
+ /*If there isn't even MMX (EDX bit 23), give up and report no features.*/
+ if(!(_edx&0x00800000))return 0;
+ flags=OC_CPU_X86_MMX;
+ if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT; /*EDX bit 22*/
+ if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW; /*EDX bit 31*/
+ if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT; /*EDX bit 30*/
+ if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A; /*ECX bit 6*/
+ if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5; /*ECX bit 11*/
+ return flags;
+}
+
+static ogg_uint32_t oc_cpu_flags_get(void){
+ ogg_uint32_t flags = 0;
+# if defined(__ppc__) || defined(__ppc64__)
+/* detect AltiVec extension if compiling it for ppc */
+# if defined(__APPLE__) || defined(__MACOSX__) || defined(__DARWIN__)
+ int selectors[2] = { CTL_HW, HW_VECTORUNIT };
+ int i_has_altivec = 0;
+ size_t i_length = sizeof( i_has_altivec );
+ int i_error = sysctl( selectors, 2, &i_has_altivec, &i_length, NULL, 0);
+
+ if( i_error == 0 && i_has_altivec != 0 )
+ flags |= OC_CPU_PPC_ALTIVEC;
+# else
+ void (*handler) (int sig);
+ handler = signal(SIGILL, illegal_instruction);
+ if (setjmp(jmpbuf) == 0)
+ {
+ __asm__ __volatile__ (
+ "mtspr 256, %0\n\t"
+ "vand %%v0, %%v0, %%v0"
+ : : "r"(-1) );
+
+ flags |= OC_CPU_PPC_ALTIVEC;
+ }
+ signal(SIGILL, handler);
+# endif
+/* detect x86 CPU extensions */
+# elif defined(i386) || defined(__x86_64__) || defined(_M_IX86)
+ ogg_uint32_t eax;
+ ogg_uint32_t ebx;
+ ogg_uint32_t ecx;
+ ogg_uint32_t edx;
+# if !defined(__amd64__)&&!defined(__x86_64__)
+ /*Not all x86-32 chips support cpuid, so we have to check.*/
+# if !defined(_MSC_VER)
+ __asm__ __volatile__(
+ "pushfl\n\t"
+ "pushfl\n\t"
+ "popl %[a]\n\t"
+ "movl %[a],%[b]\n\t"
+ "xorl $0x200000,%[a]\n\t"
+ "pushl %[a]\n\t"
+ "popfl\n\t"
+ "pushfl\n\t"
+ "popl %[a]\n\t"
+ "popfl\n\t"
+ :[a]"=r"(eax),[b]"=r"(ebx)
+ :
+ :"cc"
+ );
+# else
+ oc_detect_cpuid_helper(&eax,&ebx);
+# endif
+ /*No cpuid.*/
+ if(eax==ebx)return 0;
+# endif
+ cpuid(0,eax,ebx,ecx,edx);
+ /* l e t n I e n i u n e G*/
+ if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
+ /* 6 8 x M T e n i u n e G*/
+ ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
+ /*Intel, Transmeta (tested with Crusoe TM5800):*/
+ cpuid(1,eax,ebx,ecx,edx);
+ flags=oc_parse_intel_flags(edx,ecx);
+ }
+ /* D M A c i t n e h t u A*/
+ else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
+ /* C S N y b e d o e G*/
+ ecx==0x43534E20&&edx==0x79622065&&ebx==0x646F6547){
+ /*AMD, Geode:*/
+ cpuid(0x80000000,eax,ebx,ecx,edx);
+ if(eax<0x80000001)flags=0;
+ else{
+ cpuid(0x80000001,eax,ebx,ecx,edx);
+ flags=oc_parse_amd_flags(edx,ecx);
+ }
+ /*Also check for SSE.*/
+ cpuid(1,eax,ebx,ecx,edx);
+ flags|=oc_parse_intel_flags(edx,ecx);
+ }
+ /*Technically some VIA chips can be configured in the BIOS to return any
+ string here the user wants.
+ There is a special detection method that can be used to identify such
+ processors, but in my opinion, if the user really wants to change it, they
+ deserve what they get.*/
+ /* s l u a H r u a t n e C*/
+ else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
+ /*VIA:*/
+ /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
+ chips (thanks to the engineers from Centaur Technology who provided it).
+ These chips support Intel-like cpuid info.
+ The C3-2 (Nehemiah) cores appear to, as well.*/
+ cpuid(1,eax,ebx,ecx,edx);
+ flags=oc_parse_intel_flags(edx,ecx);
+ cpuid(0x80000000,eax,ebx,ecx,edx);
+ if(eax>=0x80000001){
+ /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
+ We need to check this even if the Intel test succeeds to pick up 3DNow!
+ support on these processors.
+ Unlike actual AMD processors, we cannot _rely_ on this info, since
+ some cores (e.g., the 693 stepping of the Nehemiah) claim to support
+ this function, yet return edx=0, despite the Intel test indicating
+ MMX support.
+ Therefore the features detected here are strictly added to those
+ detected by the Intel test.*/
+ /*TODO: How about earlier chips?*/
+ cpuid(0x80000001,eax,ebx,ecx,edx);
+ /*Note: As of the C7, this function returns Intel-style extended feature
+ flags, not AMD-style.
+ Currently, this only defines bits 11, 20, and 29 (0x20100800), which
+ do not conflict with any of the AMD flags we inspect.
+ For the remaining bits, Intel tells us, "Do not count on their value",
+ but VIA assures us that they will all be zero (at least on the C7 and
+ Isaiah chips).
+ In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
+ (0xC0C00000) for something else, we will have to add code to detect
+ the model to decide when it is appropriate to inspect them.*/
+ flags|=oc_parse_amd_flags(edx,ecx);
+ }
+ }
+ else{
+ /*Implement me.*/
+ flags=0;
+ }
+# else
+ /* not x86 or ppc */
+# endif
+ return flags;
+}
Added: liboggplay/trunk/src/liboggplay/cpu.h
===================================================================
--- liboggplay/trunk/src/liboggplay/cpu.h (rev 0)
+++ liboggplay/trunk/src/liboggplay/cpu.h 2009-02-20 09:19:45 UTC (rev 3850)
@@ -0,0 +1,34 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#ifndef __CPU_H__
+#define __CPU_H__
+
+#define OC_CPU_X86_MMX (1<<0)
+#define OC_CPU_X86_3DNOW (1<<1)
+#define OC_CPU_X86_3DNOWEXT (1<<2)
+#define OC_CPU_X86_MMXEXT (1<<3)
+#define OC_CPU_X86_SSE (1<<4)
+#define OC_CPU_X86_SSE2 (1<<5)
+#define OC_CPU_X86_PNI (1<<6)
+#define OC_CPU_X86_SSSE3 (1<<7)
+#define OC_CPU_X86_SSE4_1 (1<<8)
+#define OC_CPU_X86_SSE4_2 (1<<9)
+#define OC_CPU_X86_SSE4A (1<<10)
+#define OC_CPU_X86_SSE5 (1<<11)
+#define OC_CPU_PPC_ALTIVEC (1<<12)
+
+#endif
Modified: liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb.c
===================================================================
--- liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb.c 2009-02-19 05:46:08 UTC (rev 3849)
+++ liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb.c 2009-02-20 09:19:45 UTC (rev 3850)
@@ -38,413 +38,209 @@
* Shane Stephens <shane.stephens at annodex.net>
* Michael Martin
* Marcin Lubonski
+ * Viktor Gal
*/
#include "oggplay_private.h"
+#include "oggplay_yuv2rgb_template.h"
-/*
- * YUV -> RGB conversion
- * R = Y + 1.140V
- * G = Y - 0.395U - 0.581V
- * B = Y + 2.032U
- *
- * RGB -> YUV conversion
- * Y = 0.299 R + 0.587 G + 0.114 B
- * U = 0.147 R - 0.289 G + 0.436 B
- * V = 0.615 R - 0.515 G - 0.100 B
- */
+/* cpu extension detection */
+#include "cpu.c"
-#if defined(__MMX__) || defined(__SSE__) || defined(__SSE2__) || defined(__SSE3__)
-
-#if defined(WIN32)
-#define restrict
-#include <emmintrin.h>
-#else
-#include <xmmintrin.h>
-#ifndef restrict
-#define restrict __restrict__
+/* although we use runtime cpu detection, we still need these
+ * macros, as there is no way to compile e.g. x86 asm code
+ * on a ppc machine and vice versa
+ */
+#if defined(i386) || defined(__x86__) || defined(__x86_64__) || defined(_M_IX86)
+#include "oggplay_yuv2rgb_x86.c"
+#elif defined(__ppc__) || defined(__ppc64__)
+//altivec intrinsics only work with the -maltivec gcc flag,
+//but we want runtime altivec detection, hence this has to be
+//fixed!
+//#include "oggplay_yuv2rgb_altivec.c"
#endif
-#endif
-/* YUV -> RGB Intel MMX implementation */
-void oggplay_yuv2rgb(OggPlayYUVChannels * yuv, OggPlayRGBChannels * rgb) {
+/**
+ * yuv_convert_fptr type is a function pointer type for
+ * the various yuv-rgb converters
+ */
+typedef void (*yuv_convert_fptr) (const OggPlayYUVChannels *yuv,
+ OggPlayRGBChannels *rgb);
- int i;
- unsigned char * restrict ptry;
- unsigned char * restrict ptru;
- unsigned char * restrict ptrv;
- unsigned char * ptro;
+/* it is wasteful to detect the cpu type/features on every
+ * YUV conversion run, thus we save the conversion function
+ * pointers
+ */
+static struct OggPlayYUVConverters {
+ yuv_convert_fptr yuv2rgba; /**< YUV420 to RGBA */
+ yuv_convert_fptr yuv2bgra; /**< YUV420 to BGRA */
+ yuv_convert_fptr yuv2argb; /**< YUV420 to ARGB */
+} yuv_conv = {NULL, NULL, NULL};
- register __m64 *y, *o;
- register __m64 zero, ut, vt, imm, imm2;
- register __m64 r, g, b;
- register __m64 tmp, tmp2;
+/**
+ * vanilla implementation of YUV-to-RGB conversion.
+ *
+ * - using table-lookups instead of multiplication
+ * - avoid CLAMPing by incorporating
+ *
+ */
- zero = _mm_setzero_si64();
+#define CLAMP(v) ((v) > 255 ? 255 : (v) < 0 ? 0 : (v))
- ptro = rgb->ptro;
- ptry = yuv->ptry;
- ptru = yuv->ptru;
- ptrv = yuv->ptrv;
+#define prec 15
+static const int CoY = (int)(1.164 * (1 << prec) + 0.5);
+static const int CoRV = (int)(1.596 * (1 << prec) + 0.5);
+static const int CoGU = (int)(0.391 * (1 << prec) + 0.5);
+static const int CoGV = (int)(0.813 * (1 << prec) + 0.5);
+static const int CoBU = (int)(2.018 * (1 << prec) + 0.5);
- for (i = 0; i < yuv->y_height; i++) {
- int j;
- o = (__m64*)ptro;
- ptro += rgb->rgb_width * 4;
- for (j = 0; j < yuv->y_width; j += 8) {
+static int CoefsGU[256] = {0};
+static int CoefsGV[256];
+static int CoefsBU[256];
+static int CoefsRV[256];
+static int CoefsY[256];
- y = (__m64*)&ptry[j];
+/**
+ * Initialize the lookup-table for vanilla yuv to rgb conversion.
+ */
+static void
+init_tables()
+{
+ int i;
- ut = _m_from_int(*(int *)(ptru + j/2));
- vt = _m_from_int(*(int *)(ptrv + j/2));
-
- //ut = _m_from_int(0);
- //vt = _m_from_int(0);
-
- ut = _m_punpcklbw(ut, zero);
- vt = _m_punpcklbw(vt, zero);
-
- /* subtract 128 from u and v */
- imm = _mm_set1_pi16(128);
- ut = _m_psubw(ut, imm);
- vt = _m_psubw(vt, imm);
-
- /* transfer and multiply into r, g, b registers */
- imm = _mm_set1_pi16(-51);
- g = _m_pmullw(ut, imm);
- imm = _mm_set1_pi16(130);
- b = _m_pmullw(ut, imm);
- imm = _mm_set1_pi16(146);
- r = _m_pmullw(vt, imm);
- imm = _mm_set1_pi16(-74);
- imm = _m_pmullw(vt, imm);
- g = _m_paddsw(g, imm);
-
- /* add 64 to r, g and b registers */
- imm = _mm_set1_pi16(64);
- r = _m_paddsw(r, imm);
- g = _m_paddsw(g, imm);
- imm = _mm_set1_pi16(32);
- b = _m_paddsw(b, imm);
-
- /* shift r, g and b registers to the right */
- r = _m_psrawi(r, 7);
- g = _m_psrawi(g, 7);
- b = _m_psrawi(b, 6);
-
- /* subtract 16 from r, g and b registers */
- imm = _mm_set1_pi16(16);
- r = _m_psubsw(r, imm);
- g = _m_psubsw(g, imm);
- b = _m_psubsw(b, imm);
-
- y = (__m64*)&ptry[j];
-
- /* duplicate u and v channels and add y
- * each of r,g, b in the form [s1(16), s2(16), s3(16), s4(16)]
- * first interleave, so tmp is [s1(16), s1(16), s2(16), s2(16)]
- * then add y, then interleave again
- * then pack with saturation, to get the desired output of
- * [s1(8), s1(8), s2(8), s2(8), s3(8), s3(8), s4(8), s4(8)]
- */
- tmp = _m_punpckhwd(r, r);
- imm = _m_punpckhbw(*y, zero);
- //printf("tmp: %llx imm: %llx\n", tmp, imm);
- tmp = _m_paddsw(tmp, imm);
- tmp2 = _m_punpcklwd(r, r);
- imm2 = _m_punpcklbw(*y, zero);
- tmp2 = _m_paddsw(tmp2, imm2);
- r = _m_packuswb(tmp2, tmp);
-
- tmp = _m_punpckhwd(g, g);
- tmp2 = _m_punpcklwd(g, g);
- tmp = _m_paddsw(tmp, imm);
- tmp2 = _m_paddsw(tmp2, imm2);
- g = _m_packuswb(tmp2, tmp);
-
- tmp = _m_punpckhwd(b, b);
- tmp2 = _m_punpcklwd(b, b);
- tmp = _m_paddsw(tmp, imm);
- tmp2 = _m_paddsw(tmp2, imm2);
- b = _m_packuswb(tmp2, tmp);
- //printf("duplicated r g and b: %llx %llx %llx\n", r, g, b);
-
- /* now we have 8 8-bit r, g and b samples. we want these to be packed
- * into 32-bit values.
- */
- //r = _m_from_int(0);
- //b = _m_from_int(0);
- imm = _mm_set1_pi32(0xFFFFFFFF);
- tmp = _m_punpcklbw(r, b);
- tmp2 = _m_punpcklbw(g, imm);
- *o++ = _m_punpcklbw(tmp, tmp2);
- *o++ = _m_punpckhbw(tmp, tmp2);
- //printf("tmp, tmp2, write1, write2: %llx %llx %llx %llx\n", tmp, tmp2,
- // _m_punpcklbw(tmp, tmp2), _m_punpckhbw(tmp, tmp2));
- tmp = _m_punpckhbw(r, b);
- tmp2 = _m_punpckhbw(g, imm);
- *o++ = _m_punpcklbw(tmp, tmp2);
- *o++ = _m_punpckhbw(tmp, tmp2);
-
- //exit(1);
- }
- if (i & 0x1) {
- ptru += yuv->uv_width;
- ptrv += yuv->uv_width;
- }
- ptry += yuv->y_width;
- }
- _m_empty();
-
+ for(i = 0; i < 256; ++i)
+ {
+ CoefsGU[i] = -CoGU * (i - 128);
+ CoefsGV[i] = -CoGV * (i - 128);
+ CoefsBU[i] = CoBU * (i - 128);
+ CoefsRV[i] = CoRV * (i - 128);
+ CoefsY[i] = CoY * (i - 16) + (prec/2);
+ }
}
-/* YUV -> BGR Intel MMX implementation */
-void oggplay_yuv2bgr(OggPlayYUVChannels * yuv, OggPlayRGBChannels * rgb) {
+#define VANILLA_YUV2RGB_PIXEL(y, ruv, guv, buv) \
+r = (CoefsY[y] + ruv) >> prec; \
+g = (CoefsY[y] + guv) >> prec; \
+b = (CoefsY[y] + buv) >> prec; \
- int i;
- unsigned char * restrict ptry;
- unsigned char * restrict ptru;
- unsigned char * restrict ptrv;
- unsigned char * ptro;
+#define VANILLA_RGBA_OUT(out, r, g, b) \
+out[0] = CLAMP(r); \
+out[1] = CLAMP(g); \
+out[2] = CLAMP(b); \
+out[3] = 255;
- register __m64 *y, *o;
- register __m64 zero, ut, vt, imm, imm2;
- register __m64 r, g, b;
- register __m64 tmp, tmp2;
+#define VANILLA_BGRA_OUT(out, r, g, b) \
+out[0] = CLAMP(b); \
+out[1] = CLAMP(g); \
+out[2] = CLAMP(r); \
+out[3] = 255;
- zero = _mm_setzero_si64();
+#define VANILLA_ARGB_OUT(out, r, g, b) \
+out[0] = 255; \
+out[1] = CLAMP(r); \
+out[2] = CLAMP(g); \
+out[3] = CLAMP(b);
- ptry = yuv->ptry;
- ptru = yuv->ptru;
- ptrv = yuv->ptrv;
- ptro = rgb->ptro;
+/* yuv420p -> */
+#define LOOKUP_COEFFS int ruv = CoefsRV[*pv]; \
+ int guv = CoefsGU[*pu] + CoefsGV[*pv]; \
+ int buv = CoefsBU[*pu]; \
+ int r, g, b;
- for (i = 0; i < yuv->y_height; i++) {
- int j;
- o = (__m64*)ptro;
- ptro += rgb->rgb_width * 4;
- for (j = 0; j < yuv->y_width; j += 8) {
+#define CONVERT(OUTPUT_FUNC) LOOKUP_COEFFS \
+ VANILLA_YUV2RGB_PIXEL(py[0], ruv, guv, buv);\
+ OUTPUT_FUNC(dst, r, g, b); \
+ VANILLA_YUV2RGB_PIXEL(py[1], ruv, guv, buv);\
+ OUTPUT_FUNC((dst+4), r, g, b);
- y = (__m64*)&ptry[j];
+#define CLEANUP
- ut = _m_from_int(*(int *)(ptru + j/2));
- vt = _m_from_int(*(int *)(ptrv + j/2));
+YUV_CONVERT(yuv420_to_rgba_vanilla, CONVERT(VANILLA_RGBA_OUT), 2, 8, 2, 1)
+YUV_CONVERT(yuv420_to_bgra_vanilla, CONVERT(VANILLA_BGRA_OUT), 2, 8, 2, 1)
+YUV_CONVERT(yuv420_to_argb_vanilla, CONVERT(VANILLA_ARGB_OUT), 2, 8, 2, 1)
- //ut = _m_from_int(0);
- //vt = _m_from_int(0);
+#undef CONVERT
+#undef CLEANUP
- ut = _m_punpcklbw(ut, zero);
- vt = _m_punpcklbw(vt, zero);
-
- /* subtract 128 from u and v */
- imm = _mm_set1_pi16(128);
- ut = _m_psubw(ut, imm);
- vt = _m_psubw(vt, imm);
-
- /* transfer and multiply into r, g, b registers */
- imm = _mm_set1_pi16(-51);
- g = _m_pmullw(ut, imm);
- imm = _mm_set1_pi16(130);
- b = _m_pmullw(ut, imm);
- imm = _mm_set1_pi16(146);
- r = _m_pmullw(vt, imm);
- imm = _mm_set1_pi16(-74);
- imm = _m_pmullw(vt, imm);
- g = _m_paddsw(g, imm);
-
- /* add 64 to r, g and b registers */
- imm = _mm_set1_pi16(64);
- r = _m_paddsw(r, imm);
- g = _m_paddsw(g, imm);
- imm = _mm_set1_pi16(32);
- b = _m_paddsw(b, imm);
-
- /* shift r, g and b registers to the right */
- r = _m_psrawi(r, 7);
- g = _m_psrawi(g, 7);
- b = _m_psrawi(b, 6);
-
- /* subtract 16 from r, g and b registers */
- imm = _mm_set1_pi16(16);
- r = _m_psubsw(r, imm);
- g = _m_psubsw(g, imm);
- b = _m_psubsw(b, imm);
-
- y = (__m64*)&ptry[j];
-
- /* duplicate u and v channels and add y
- * each of r,g, b in the form [s1(16), s2(16), s3(16), s4(16)]
- * first interleave, so tmp is [s1(16), s1(16), s2(16), s2(16)]
- * then add y, then interleave again
- * then pack with saturation, to get the desired output of
- * [s1(8), s1(8), s2(8), s2(8), s3(8), s3(8), s4(8), s4(8)]
- */
- tmp = _m_punpckhwd(r, r);
- imm = _m_punpckhbw(*y, zero);
- //printf("tmp: %llx imm: %llx\n", tmp, imm);
- tmp = _m_paddsw(tmp, imm);
- tmp2 = _m_punpcklwd(r, r);
- imm2 = _m_punpcklbw(*y, zero);
- tmp2 = _m_paddsw(tmp2, imm2);
- r = _m_packuswb(tmp2, tmp);
-
- tmp = _m_punpckhwd(g, g);
- tmp2 = _m_punpcklwd(g, g);
- tmp = _m_paddsw(tmp, imm);
- tmp2 = _m_paddsw(tmp2, imm2);
- g = _m_packuswb(tmp2, tmp);
-
- tmp = _m_punpckhwd(b, b);
- tmp2 = _m_punpcklwd(b, b);
- tmp = _m_paddsw(tmp, imm);
- tmp2 = _m_paddsw(tmp2, imm2);
- b = _m_packuswb(tmp2, tmp);
- //printf("duplicated r g and b: %llx %llx %llx\n", r, g, b);
-
- /* now we have 8 8-bit r, g and b samples. we want these to be packed
- * into 32-bit values.
- */
- //r = _m_from_int(0);
- //b = _m_from_int(0);
- imm = _mm_set1_pi32(0xFFFFFFFF);
- tmp = _m_punpcklbw(b, r);
- tmp2 = _m_punpcklbw(g, imm);
- *o++ = _m_punpcklbw(tmp, tmp2);
- *o++ = _m_punpckhbw(tmp, tmp2);
- //printf("tmp, tmp2, write1, write2: %llx %llx %llx %llx\n", tmp, tmp2,
- // _m_punpcklbw(tmp, tmp2), _m_punpckhbw(tmp, tmp2));
- tmp = _m_punpckhbw(b, r);
- tmp2 = _m_punpckhbw(g, imm);
- *o++ = _m_punpcklbw(tmp, tmp2);
- *o++ = _m_punpckhbw(tmp, tmp2);
-
- //exit(1);
- }
- if (i & 0x1) {
- ptru += yuv->uv_width;
- ptrv += yuv->uv_width;
- }
- ptry += yuv->y_width;
- }
- _m_empty();
-
-}
-
-#elif defined(__xxAPPLExx__)
-/*
- * TODO: implement the SIMD method above using Apple's AltiVec code;
- * for now, we'll use the vanilla implementation for Macs.
+/**
+ * Initialize the function pointers in yuv_conv.
*
- * Also, there's probably a better preprocessor macro for detecting
- * the presence of AltiVec than __APPLE__.
+ * Initialize the function pointers in yuv_conv, based on the
+ * the available CPU extensions.
*/
+static void
+init_yuv_converters(void)
+{
+ ogg_uint32_t features = 0;
-/* Macintosh AltiVec implementation */
-void oggplay_yuv2rgb(OggPlayYUVChannels * yuv, OggPlayRGBChannels * rgb) {
+ if ( yuv_conv.yuv2rgba == NULL ) /* detect only once; later calls leave yuv_conv untouched */
+ {
+ features = oc_cpu_flags_get();
+#if defined(i386) || defined(__x86__) || defined(__x86_64__) || defined(_M_IX86)
+ if (features & OC_CPU_X86_SSE2) /* test SSE2 alone: MMX/SSE-only CPUs must not get the SSE2 code */
+ {
+ yuv_conv.yuv2rgba = yuv420_to_rgba_sse2;
+ yuv_conv.yuv2bgra = yuv420_to_bgra_sse2;
+ yuv_conv.yuv2argb = yuv420_to_argb_sse2;
+ return;
+ }
+ else if (features & (OC_CPU_X86_MMX|OC_CPU_X86_SSE))
+ {
+ yuv_conv.yuv2rgba = yuv420_to_rgba_mmx;
+ yuv_conv.yuv2bgra = yuv420_to_bgra_mmx;
+ yuv_conv.yuv2argb = yuv420_to_argb_mmx;
+ return;
+ }
+ else if (features & OC_CPU_X86_MMX)
+ {
+ yuv_conv.yuv2rgba = yuv420_to_rgba_mmx;
+ yuv_conv.yuv2bgra = yuv420_to_bgra_mmx;
+ yuv_conv.yuv2argb = yuv420_to_argb_mmx;
+ return;
+ }
+#elif defined(__ppc__) || defined(__ppc64__)
+ if (features & OC_CPU_PPC_ALTIVEC)
+ {
+ init_tables(); /* the AltiVec source is not included in this build; the vanilla converters need their tables */
+ yuv_conv.yuv2rgba = yuv420_to_rgba_vanilla;
+ yuv_conv.yuv2bgra = yuv420_to_bgra_vanilla;
+ yuv_conv.yuv2argb = yuv420_to_argb_vanilla;
+ return;
+ }
+#endif
+ /* no CPU extension was found... using vanilla converter */
+ init_tables();
+ yuv_conv.yuv2rgba = yuv420_to_rgba_vanilla;
+ yuv_conv.yuv2bgra = yuv420_to_bgra_vanilla;
+ yuv_conv.yuv2argb = yuv420_to_argb_vanilla;
+ }
}
-#else
-#define CLAMP(v) ((v) > 255 ? 255 : (v) < 0 ? 0 : (v))
+void
+oggplay_yuv2rgba(const OggPlayYUVChannels* yuv, OggPlayRGBChannels* rgb)
+{
+ if (yuv_conv.yuv2rgba == NULL)
+ init_yuv_converters();
-/* Vanilla implementation if YUV->RGB conversion */
-void oggplay_yuv2rgb(OggPlayYUVChannels * yuv, OggPlayRGBChannels * rgb) {
+ yuv_conv.yuv2rgba(yuv, rgb);
+}
- unsigned char * ptry = yuv->ptry;
- unsigned char * ptru = yuv->ptru;
- unsigned char * ptrv = yuv->ptrv;
- unsigned char * ptro = rgb->ptro;
- unsigned char * ptro2;
- int i, j;
+void
+oggplay_yuv2bgra(const OggPlayYUVChannels* yuv, OggPlayRGBChannels * rgb)
+{
+ if (yuv_conv.yuv2bgra == NULL)
+ init_yuv_converters();
- for (i = 0; i < yuv->y_height; i++) {
- ptro2 = ptro;
- for (j = 0; j < yuv->y_width; j += 2) {
-
- short pr, pg, pb, y;
- short r, g, b;
-
- pr = (-56992 + ptrv[j/2] * 409) >> 8;
- pg = (34784 - ptru[j/2] * 100 - ptrv[j/2] * 208) >> 8;
- pb = (-70688 + ptru[j/2] * 516) >> 8;
-
- y = 298*ptry[j] >> 8;
- r = y + pr;
- g = y + pg;
- b = y + pb;
-
- *ptro2++ = CLAMP(r);
- *ptro2++ = CLAMP(g);
- *ptro2++ = CLAMP(b);
- *ptro2++ = 255;
-
- y = 298*ptry[j + 1] >> 8;
- r = y + pr;
- g = y + pg;
- b = y + pb;
-
- *ptro2++ = CLAMP(r);
- *ptro2++ = CLAMP(g);
- *ptro2++ = CLAMP(b);
- *ptro2++ = 255;
- }
- ptry += yuv->y_width;
- if (i & 1) {
- ptru += yuv->uv_width;
- ptrv += yuv->uv_width;
- }
- ptro += rgb->rgb_width * 4;
- }
+ yuv_conv.yuv2bgra(yuv, rgb);
}
-/* Vanilla implementation of YUV->BGR conversion*/
-void oggplay_yuv2bgr(OggPlayYUVChannels * yuv, OggPlayRGBChannels * rgb) {
+void
+oggplay_yuv2argb(const OggPlayYUVChannels* yuv, OggPlayRGBChannels * rgb)
+{
+ if (yuv_conv.yuv2argb == NULL)
+ init_yuv_converters();
- unsigned char * ptry = yuv->ptry;
- unsigned char * ptru = yuv->ptru;
- unsigned char * ptrv = yuv->ptrv;
- unsigned char * ptro = rgb->ptro;
- unsigned char * ptro2;
- int i, j;
-
- for (i = 0; i < yuv->y_height; i++) {
- ptro2 = ptro;
- for (j = 0; j < yuv->y_width; j += 2) {
-
- short pr, pg, pb, y;
- short r, g, b;
-
- pr = (-56992 + ptrv[j/2] * 409) >> 8;
- pg = (34784 - ptru[j/2] * 100 - ptrv[j/2] * 208) >> 8;
- pb = (-70688 + ptru[j/2] * 516) >> 8;
-
- y = 298*ptry[j] >> 8;
- r = y + pr;
- g = y + pg;
- b = y + pb;
-
- *ptro2++ = CLAMP(b);
- *ptro2++ = CLAMP(g);
- *ptro2++ = CLAMP(r);
- *ptro2++ = 255;
-
- y = 298*ptry[j + 1] >> 8;
- r = y + pr;
- g = y + pg;
- b = y + pb;
-
- *ptro2++ = CLAMP(b);
- *ptro2++ = CLAMP(g);
- *ptro2++ = CLAMP(r);
- *ptro2++ = 255;
- }
- ptry += yuv->y_width;
- if (i & 1) {
- ptru += yuv->uv_width;
- ptrv += yuv->uv_width;
- }
- ptro += rgb->rgb_width * 4;
- }
+ yuv_conv.yuv2argb(yuv, rgb);
}
-#endif
Added: liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_altivec.c
===================================================================
--- liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_altivec.c (rev 0)
+++ liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_altivec.c 2009-02-20 09:19:45 UTC (rev 3850)
@@ -0,0 +1,235 @@
+#include <altivec.h>
+
+/* coefficients for AltiVec YUV->RGB conversion */
+static vector signed short CY;
+static vector signed short CRV;
+static vector signed short CBU;
+static vector signed short CGU;
+static vector signed short CGV;
+static vector unsigned short CSHIFT;
+
+
+/**
+ * Initialize the static coefficient vectors.
+ *
+ * Each constant is written into element 0 of a vector literal and then
+ * replicated to every lane with vec_splat().
+ * The multipliers are fixed-point values scaled by 8192: 0x253f / 8192 is
+ * roughly 1.164, 0x3312 / 8192 roughly 1.596 and 0x4093 / 8192 roughly 2.018.
+ * The two green terms are negative: -0x0c83 / 8192 is roughly -0.391 and
+ * -0x1a04 / 8192 roughly -0.813.
+ */
+static void
+init_altivec(void)
+{
+ CY = vec_splat ((vector signed short){0x253f}, 0);
+ CRV = vec_splat ((vector signed short){0x3312}, 0);
+ CBU = vec_splat ((vector signed short){0x4093}, 0);
+ /* same 16-bit patterns as 0xf37d and 0xe5fc, written as negative values
+ * so that the initializers fit in a signed short */
+ CGU = vec_splat ((vector signed short){-0x0c83}, 0);
+ CGV = vec_splat ((vector signed short){-0x1a04}, 0);
+ CSHIFT = vec_splat ((vector unsigned short){0x2}, 0);
+}
+
+
+#define vec_unh(x) /* widen bytes 0-7 of x: each result short is the byte pair {0x00, x[n]} */ \
+ (vector signed short) \
+ vec_perm(x,(__typeof__(x)){0}, /* selector 0x10 picks a byte of the all-zero second operand */ \
+ ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
+ 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
+#define vec_unl(x) /* same widening as vec_unh, applied to bytes 8-15 of x */ \
+ (vector signed short) \
+ vec_perm(x,(__typeof__(x)){0}, \
+ ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
+ 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))
+
+#define vec_packclp(x,y) /* replace negative shorts by 0 (vec_max), then narrow x and y into one byte vector (vec_packs) */ \
+ (vector unsigned char)vec_packs \
+ ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
+ (vector unsigned short)vec_max (y,((vector signed short) {0})))
+
+/**
+ * Interleave four byte vectors and store them as 16 four-byte pixels.
+ *
+ * x0..x3 supply byte 0..3 of every pixel. The merge steps build four
+ * vectors of four pixels each, which are stored with vec_st at byte offsets
+ * 0, 16, 32 and 48 from ptr; ptr is then advanced by four elements of its
+ * own type.
+ *
+ * The expansion deliberately ends without a semicolon so that it behaves
+ * like a single statement: write vec_mstrgb32(...); at the call site.
+ *
+ * @param T vector type of x0..x3 and of the temporaries
+ * @param x0,x1,x2,x3 byte vectors in output byte order
+ * @param ptr destination pointer; must be an lvalue, it is modified
+ */
+#define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
+do { \
+ T _0,_1,_2,_3; \
+ _0 = vec_mergeh (x0,x1); \
+ _1 = vec_mergeh (x2,x3); \
+ _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
+ _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
+ vec_st (_2, 0*16, (T *)ptr); \
+ vec_st (_3, 1*16, (T *)ptr); \
+ _0 = vec_mergel (x0,x1); \
+ _1 = vec_mergel (x2,x3); \
+ _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
+ _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
+ vec_st (_2, 2*16, (T *)ptr); \
+ vec_st (_3, 3*16, (T *)ptr); \
+ ptr += 4; \
+} while (0)
+
+/**
+ * macros for various output
+ *
+ * a, b and c are the R, G and B byte vectors; ptr is the destination that
+ * is handed on to vec_mstrgb32. The alpha byte is 0xFF for every pixel:
+ * a one-element brace literal only sets element 0, so it is replicated to
+ * all lanes with vec_splat, as is done for the other constants in this file.
+ */
+#define OUTPUT_BGRA(a,b,c,ptr) vec_mstrgb32(__typeof__(a),c,b,a,vec_splat(((__typeof__ (a)){0xFF}),0),ptr)
+#define OUTPUT_RGBA(a,b,c,ptr) vec_mstrgb32(__typeof__(a),a,b,c,vec_splat(((__typeof__ (a)){0xFF}),0),ptr)
+#define OUTPUT_ARGB(a,b,c,ptr) vec_mstrgb32(__typeof__(a),vec_splat(((__typeof__ (a)){0xFF}),0),a,b,c,ptr)
+
+/**
+ * Function template for YUV420 -> packed 32-bit RGB conversion with AltiVec.
+ *
+ * Expands to a static function FUNC(yuv, rgb). Each outer iteration handles
+ * two luma rows (h = y_height / 2) that share one row of U and V; each inner
+ * iteration converts 16 pixels of both rows (w = y_width / 16), advancing
+ * the U and V pointers by 8 bytes and both output pointers by 64 bytes.
+ * Because of the integer divisions, a trailing odd row and the last
+ * y_width % 16 columns are left unconverted.
+ *
+ * NOTE(review): the input vectors are realigned with vec_lvsl/vec_perm, but
+ * the output is written by OUTPUT_FUNC with no such step -- confirm that
+ * rgb->ptro and the output stride meet the alignment the store requires.
+ *
+ * @param FUNC name of the generated function
+ * @param OUTPUT_FUNC macro taking (R, G, B, ptr) that stores 16 pixels
+ */
+#define YUV420_CONVERT_ALTIVEC(FUNC, OUTPUT_FUNC) \
+static void \
+(FUNC)(const OggPlayYUVChannels* yuv, OggPlayRGBChannels* rgb) \
+{ \
+ int i,j, w, h; \
+ unsigned char *py1, *py2, *pu, *pv, *po0, *po1, *dst0, *dst1; \
+ unsigned char *ptry0, *ptry1, *ptru, *ptrv; \
+ vector unsigned char y0,y1; \
+ \
+ vector signed char u,v; \
+ \
+ vector signed short Y0,Y1,Y2,Y3; \
+ vector signed short U,V; \
+ vector signed short vx,ux,uvx; \
+ vector signed short vx0,ux0,uvx0; \
+ vector signed short vx1,ux1,uvx1; \
+ vector signed short R0,G0,B0; \
+ vector signed short R1,G1,B1; \
+ vector unsigned char R,G,B; \
+ \
+ vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP, *out0, *out1; \
+ vector unsigned char align_perm; \
+ \
+ ptry0 = yuv->ptry; \
+ ptry1 = yuv->ptry+yuv->y_width; \
+ ptru = yuv->ptru; \
+ ptrv = yuv->ptrv; \
+ \
+ /* po0 even, po1 odd lines */ \
+ po0 = rgb->ptro; \
+ po1 = rgb->ptro+(rgb->rgb_width*4); \
+ \
+ w = yuv->y_width/16; \
+ h = yuv->y_height/2; \
+ for (i = 0; i < h; ++i) \
+ { \
+ dst0 = po0; \
+ dst1 = po1; \
+ pu = ptru; \
+ pv = ptrv; \
+ py1 = ptry0; \
+ py2 = ptry1; \
+ for (j = 0; j < w; ++j, \
+ dst0 += 64, dst1 +=64, \
+ py1 += 16, py2 += 16, \
+ pu += 8, pv += 8) \
+ { \
+ out0 = (vector unsigned char *) dst0; \
+ out1 = (vector unsigned char *) dst1; \
+ y1ivP = (vector unsigned char *) py1; \
+ y2ivP = (vector unsigned char *) py2; \
+ uivP = (vector unsigned char *) pu; \
+ vivP = (vector unsigned char *) pv; \
+ \
+ align_perm = vec_lvsl (0, py1); /* realign: pick 16 bytes out of the two vectors around py1 */ \
+ y0 = (vector unsigned char) \
+ vec_perm (y1ivP[0], y1ivP[1], align_perm); \
+ \
+ align_perm = vec_lvsl (0, py2); \
+ y1 = (vector unsigned char) \
+ vec_perm (y2ivP[0], y2ivP[1], align_perm); \
+ \
+ align_perm = vec_lvsl (0, pu); \
+ u = (vector signed char) \
+ vec_perm (uivP[0], uivP[1], align_perm); \
+ \
+ align_perm = vec_lvsl (0, pv); \
+ v = (vector signed char) \
+ vec_perm (vivP[0], vivP[1], align_perm); \
+ \
+ /* U -= 128, V -=128 */ \
+ u = (vector signed char) \
+ vec_sub (u,(vector signed char) \
+ vec_splat((vector signed char){128},0));\
+ v = (vector signed char) \
+ vec_sub (v,(vector signed char) \
+ vec_splat((vector signed char){128},0));\
+ \
+ U = vec_unpackh (u); /* only the first 8 chroma bytes are used */ \
+ V = vec_unpackh (v); \
+ U = vec_sl (U, CSHIFT); \
+ V = vec_sl (V, CSHIFT); \
+ \
+ /* Y -= 16 */ \
+ y0 = (vector unsigned char) \
+ vec_sub (y0, (vector unsigned char) \
+ vec_splat((vector unsigned char){16},0));\
+ y1 = (vector unsigned char) \
+ vec_sub (y1, (vector unsigned char) \
+ vec_splat((vector unsigned char){16},0));\
+ \
+ Y0 = vec_unh (y0); \
+ Y1 = vec_unl (y0); \
+ Y2 = vec_unh (y1); \
+ Y3 = vec_unl (y1); \
+ \
+ Y0 = vec_sl (Y0, CSHIFT); \
+ Y1 = vec_sl (Y1, CSHIFT); \
+ Y2 = vec_sl (Y2, CSHIFT); \
+ Y3 = vec_sl (Y3, CSHIFT); \
+ \
+ /* Y *= CY */ \
+ Y0 = vec_mradds (Y0, CY, (vector signed short){0}); \
+ Y1 = vec_mradds (Y1, CY, (vector signed short){0}); \
+ Y2 = vec_mradds (Y2, CY, (vector signed short){0}); \
+ Y3 = vec_mradds (Y3, CY, (vector signed short){0}); \
+ \
+ /* ux = CBU*U, scaled down by vec_mradds; merged so each value covers two pixels */ \
+ ux = vec_mradds (U, CBU, (vector signed short){0}); \
+ ux0 = vec_mergeh (ux,ux); \
+ ux1 = vec_mergel (ux,ux); \
+ \
+ /* vx = CRV*V, scaled down by vec_mradds */ \
+ vx = vec_mradds (V, CRV, (vector signed short){0}); \
+ vx0 = vec_mergeh (vx,vx); \
+ vx1 = vec_mergel (vx,vx); \
+ \
+ /* uvx = CGU*U + CGV*V, each product scaled down by vec_mradds */ \
+ uvx = vec_mradds (U, CGU, (vector signed short){0}); \
+ uvx = vec_mradds (V, CGV, uvx); \
+ uvx0 = vec_mergeh (uvx,uvx); \
+ uvx1 = vec_mergel (uvx,uvx); \
+ \
+ R0 = vec_add (Y0,vx0); \
+ G0 = vec_add (Y0,uvx0); \
+ B0 = vec_add (Y0,ux0); \
+ R1 = vec_add (Y1,vx1); \
+ G1 = vec_add (Y1,uvx1); \
+ B1 = vec_add (Y1,ux1); \
+ \
+ R = vec_packclp (R0,R1); \
+ G = vec_packclp (G0,G1); \
+ B = vec_packclp (B0,B1); \
+ \
+ OUTPUT_FUNC(R,G,B,out0); \
+ \
+ R0 = vec_add (Y2,vx0); /* second row reuses the chroma terms of the first */ \
+ G0 = vec_add (Y2,uvx0); \
+ B0 = vec_add (Y2,ux0); \
+ R1 = vec_add (Y3,vx1); \
+ G1 = vec_add (Y3,uvx1); \
+ B1 = vec_add (Y3,ux1); \
+ R = vec_packclp (R0,R1); \
+ G = vec_packclp (G0,G1); \
+ B = vec_packclp (B0,B1); \
+ \
+ OUTPUT_FUNC(R,G,B,out1); \
+ } \
+ po0 += (rgb->rgb_width*8); \
+ po1 += (rgb->rgb_width*8); \
+ \
+ ptry0 += yuv->y_width * 2; \
+ ptry1 += yuv->y_width * 2; \
+ ptru += yuv->uv_width; \
+ ptrv += yuv->uv_width; \
+ } \
+}
+
+/**
+ * yuv420 -> packed 32-bit RGBA, BGRA and ARGB converters, generated from
+ * YUV420_CONVERT_ALTIVEC with the matching OUTPUT_* macro
+ */
+YUV420_CONVERT_ALTIVEC(yuv420_to_rgba_altivec, OUTPUT_RGBA)
+YUV420_CONVERT_ALTIVEC(yuv420_to_bgra_altivec, OUTPUT_BGRA)
+YUV420_CONVERT_ALTIVEC(yuv420_to_argb_altivec, OUTPUT_ARGB)
+
Added: liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_template.h
===================================================================
--- liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_template.h (rev 0)
+++ liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_template.h 2009-02-20 09:19:45 UTC (rev 3850)
@@ -0,0 +1,68 @@
+#ifndef __OGGPLAY_YUV2RGB_TEMPLATE_H__
+#define __OGGPLAY_YUV2RGB_TEMPLATE_H__
+
+#if defined(WIN32)
+#define restrict
+#else
+#ifndef restrict
+#define restrict __restrict__
+#endif
+#endif
+
+/**
+ * Template for YUV to RGB conversion
+ *
+ * Expands to a static function FUNC(yuv, rgb) that walks the frame row by
+ * row and runs CONVERT once per group of NUM_PIXELS pixels. Inside CONVERT
+ * the locals py, pu, pv and dst point at the current Y, U, V and output
+ * positions. The U and V row pointers advance only after odd rows, so two
+ * consecutive Y rows share one chroma row. Since w is y_width / NUM_PIXELS,
+ * the last y_width % NUM_PIXELS columns of each row are not converted.
+ *
+ * A macro named CLEANUP must be defined (it may be empty) wherever the
+ * template is expanded; it is placed at the end of the generated function.
+ *
+ * @param FUNC function name
+ * @param CONVERT code that converts one group of pixels
+ * @param NUM_PIXELS number of pixels processed in one iteration
+ * @param OUT_SHIFT number of bytes to advance after one iteration in the rgb data stream
+ * @param Y_SHIFT number of bytes to advance after one iteration in the Y data stream
+ * @param UV_SHIFT number of bytes to advance after one iteration in the U and V data streams
+ */
+#define YUV_CONVERT(FUNC, CONVERT, NUM_PIXELS, OUT_SHIFT, Y_SHIFT, UV_SHIFT)\
+static void \
+(FUNC)(const OggPlayYUVChannels* yuv, OggPlayRGBChannels* rgb) \
+{ \
+ int i,j, w, h; \
+ unsigned char* restrict ptry; \
+ unsigned char* restrict ptru; \
+ unsigned char* restrict ptrv; \
+ unsigned char* restrict ptro; \
+ unsigned char *dst, *py, *pu, *pv; \
+ \
+ ptro = rgb->ptro; \
+ ptry = yuv->ptry; \
+ ptru = yuv->ptru; \
+ ptrv = yuv->ptrv; \
+ \
+ w = yuv->y_width/NUM_PIXELS; \
+ h = yuv->y_height; \
+ for (i = 0; i < h; ++i) \
+ { \
+ py = ptry; \
+ pu = ptru; \
+ pv = ptrv; \
+ dst = ptro; \
+ for (j = 0; j < w; ++j, \
+ dst += OUT_SHIFT, \
+ py += Y_SHIFT, \
+ pu += UV_SHIFT, \
+ pv += UV_SHIFT) \
+ { \
+ /* use the given conversion function */ \
+ CONVERT \
+ } \
+ ptro += rgb->rgb_width * 4; \
+ ptry += yuv->y_width; \
+ \
+ if (i & 0x1) /* chroma rows advance once per two luma rows */ \
+ { \
+ ptru += yuv->uv_width; \
+ ptrv += yuv->uv_width; \
+ } \
+ } \
+ CLEANUP \
+}
+
+#endif
+
Added: liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_x86.c
===================================================================
--- liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_x86.c (rev 0)
+++ liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_x86.c 2009-02-20 09:19:45 UTC (rev 3850)
@@ -0,0 +1,143 @@
+/*
+ Copyright (C) 2003 Commonwealth Scientific and Industrial Research
+ Organisation (CSIRO) Australia
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ - Neither the name of CSIRO Australia nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ORGANISATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/**
+ * YUV to RGB conversion using x86 CPU extensions
+ */
+
+#if defined(_MSC_VER)
+#include "yuv2rgb_x86_vs.h"
+#elif defined(__GNUC__)
+#include "yuv2rgb_x86.h"
+#endif
+
+#ifdef ATTRIBUTE_ALIGNED_MAX
+#define ATTR_ALIGN(align) __attribute__ ((__aligned__ ((ATTRIBUTE_ALIGNED_MAX < align) ? ATTRIBUTE_ALIGNED_MAX : align))) /* requested alignment, capped at ATTRIBUTE_ALIGNED_MAX */
+#else
+#define ATTR_ALIGN(align) /* expands to nothing: no alignment is requested */
+#endif
+
+typedef union /* 16 bytes viewable as 64-, 32-, 16- or 8-bit lanes, or as floats */
+{
+ long long q[2];
+ unsigned long long uq[2];
+ int d[4];
+ unsigned int ud[4];
+ short w[8];
+ unsigned short uw[8];
+ char b[16];
+ unsigned char ub[16];
+ float s[4];
+#if defined(__GNUC__)
+ long long __attribute__ ((__vector_size__ (16), __may_alias__)) int128;
+#endif
+} ATTR_ALIGN(16) simd_t; /* NOTE(review): with the empty ATTR_ALIGN fallback this type is not forced to 16-byte alignment -- confirm the SSE2 code path never sees that configuration */
+
+#define UV_128 0x0080008000800080LL /* 128 in every 16-bit lane */
+#define Y_16 0x1010101010101010LL /* 16 in every byte */
+#define Y_Co 0x253f253f253f253fLL /* per-lane 16-bit multiplier for Y */
+#define GU_Co 0xf37df37df37df37dLL /* per-lane 16-bit multiplier: U contribution to green */
+#define GV_Co 0xe5fce5fce5fce5fcLL /* per-lane 16-bit multiplier: V contribution to green */
+#define BU_Co 0x4093409340934093LL /* per-lane 16-bit multiplier: U contribution to blue */
+#define RV_Co 0x3312331233123312LL /* per-lane 16-bit multiplier: V contribution to red */
+#define Y_MASK 0x00ff00ff00ff00ffLL /* keeps the low byte of every 16-bit lane */
+#define ALFA 0xffffffffffffffffLL /* all bits set: alpha 0xFF in every byte */
+
+/**
+ * coefficients and constants for yuv to rgb SIMD conversion
+ *
+ * Each 64-bit pattern is stored twice, filling both elements of the q[2]
+ * member, so that the same object can be used as a 64-bit or as a 128-bit
+ * memory operand.
+ */
+static const simd_t simd_80w = {{UV_128, UV_128}};
+static const simd_t simd_U_green = {{GU_Co, GU_Co}};
+static const simd_t simd_U_blue = {{BU_Co, BU_Co}};
+static const simd_t simd_V_red = {{RV_Co, RV_Co}};
+static const simd_t simd_V_green = {{GV_Co, GV_Co}};
+static const simd_t simd_Y_coeff = {{Y_Co, Y_Co}};
+static const simd_t simd_10w = {{Y_16, Y_16}};
+static const simd_t simd_00ffw = {{Y_MASK, Y_MASK}};
+static const simd_t simd_alpha = {{ALFA, ALFA}};
+
+/**
+ * the conversion functions using MMX instructions
+ *
+ * Three static converters (RGBA, BGRA and ARGB output) are generated from
+ * YUV_CONVERT. The helper macros defined here are #undef'd again at the end
+ * of the section.
+ */
+
+/* template for the MMX conversion functions: per iteration 8 pixels,
+ * 32 output bytes, 8 Y bytes and 4 U/V bytes; CLEANUP runs emms() at the
+ * end of every generated function */
+#define YUV_CONVERT_MMX(FUNC, CONVERT) YUV_CONVERT(FUNC, CONVERT, 8, 32, 8, 4)
+#define CLEANUP emms()
+#define OUT_RGBA_32 OUTPUT_RGBA_32(movq, mm, 8, 16, 24)
+#define OUT_ARGB_32 OUTPUT_ARGB_32(movq, mm, 8, 16, 24)
+#define OUT_BGRA_32 OUTPUT_BGRA_32(movq, mm, 8, 16, 24)
+#define MOVNTQ MMX_MOVNTQ
+
+/* yuv420 -> 32-bit RGB: load one group, convert it, then store it */
+#define CONVERT(OUTPUT_FUNC) LOAD_YUV_PLANAR_2(movq, mm) \
+ YUV_2_RGB(movq, mm) \
+ OUTPUT_FUNC
+
+YUV_CONVERT_MMX(yuv420_to_rgba_mmx, CONVERT(OUT_RGBA_32))
+YUV_CONVERT_MMX(yuv420_to_bgra_mmx, CONVERT(OUT_BGRA_32))
+YUV_CONVERT_MMX(yuv420_to_argb_mmx, CONVERT(OUT_ARGB_32))
+#undef CONVERT
+
+#undef CLEANUP
+#undef OUT_RGBA_32
+#undef OUT_ARGB_32
+#undef OUT_BGRA_32
+#undef MOVNTQ
+
+/**
+ * the conversion functions using SSE2 instructions
+ *
+ * Same structure as the MMX section, using the xmm registers. Input is
+ * loaded with movdqu; the OUT_* macros use movdqa.
+ * NOTE(review): the OUT_* macros read simd_alpha with movdqa, which needs a
+ * 16-byte aligned operand -- confirm ATTR_ALIGN(16) is effective here.
+ */
+
+/* template for the SSE2 conversion functions: per iteration 16 pixels,
+ * 64 output bytes, 16 Y bytes and 8 U/V bytes; CLEANUP is empty */
+#define YUV_CONVERT_SSE2(FUNC, CONVERT) YUV_CONVERT(FUNC, CONVERT, 16, 64, 16, 8)
+#define OUT_RGBA_32 OUTPUT_RGBA_32(movdqa, xmm, 16, 32, 48)
+#define OUT_ARGB_32 OUTPUT_ARGB_32(movdqa, xmm, 16, 32, 48)
+#define OUT_BGRA_32 OUTPUT_BGRA_32(movdqa, xmm, 16, 32, 48)
+#define MOVNTQ SSE2_MOVNTQ
+#define CLEANUP
+
+/* yuv420 -> 32-bit RGB: load one group, convert it, then store it */
+#define CONVERT(OUTPUT_FUNC) LOAD_YUV_PLANAR_2(movdqu, xmm) \
+ YUV_2_RGB(movdqa, xmm) \
+ OUTPUT_FUNC
+
+YUV_CONVERT_SSE2(yuv420_to_rgba_sse2, CONVERT(OUT_RGBA_32))
+YUV_CONVERT_SSE2(yuv420_to_bgra_sse2, CONVERT(OUT_BGRA_32))
+YUV_CONVERT_SSE2(yuv420_to_argb_sse2, CONVERT(OUT_ARGB_32))
+#undef CONVERT
+
+#undef OUT_RGBA_32
+#undef OUT_ARGB_32
+#undef OUT_BGRA_32
+#undef MOVNTQ
+#undef CLEANUP
+
Added: liboggplay/trunk/src/liboggplay/yuv2rgb_x86.h
===================================================================
--- liboggplay/trunk/src/liboggplay/yuv2rgb_x86.h (rev 0)
+++ liboggplay/trunk/src/liboggplay/yuv2rgb_x86.h 2009-02-20 09:19:45 UTC (rev 3850)
@@ -0,0 +1,135 @@
+#ifndef __YUV2RGB_X86_H__
+#define __YUV2RGB_X86_H__
+
+#define emms() __asm__ __volatile ( "emms;" ); /* the expansion already ends in a semicolon */
+#define MMX_MOVNTQ "movntq"
+#define SSE2_MOVNTQ "movdqu" /* an ordinary unaligned store, despite the MOVNTQ name */
+
+#define YUV_2_RGB(mov_instr, reg_type) /* in: r0 = U bytes, r1 = V bytes, r4 = 0, r6 = Y bytes; out: r0 = B, r1 = R, r2 = G as packed bytes */ \
+ __asm__ __volatile__ ( \
+ "punpcklbw %%"#reg_type"4, %%"#reg_type"0;" /* mm0 = u3 u2 u1 u0 */\
+ "punpcklbw %%"#reg_type"4, %%"#reg_type"1;" /* mm1 = v3 v2 v1 v0 */\
+ "psubsw simd_80w, %%"#reg_type"0;" /* u -= 128 */\
+ "psubsw simd_80w, %%"#reg_type"1;" /* v -= 128 */\
+ "psllw $3, %%"#reg_type"0;" /* promote precision */\
+ "psllw $3, %%"#reg_type"1;" /* promote precision */\
+ #mov_instr " %%"#reg_type"0, %%"#reg_type"2;" /* mm2 = u3 u2 u1 u0 */\
+ #mov_instr " %%"#reg_type"1, %%"#reg_type"3;" /* mm3 = v3 v2 v1 v0 */\
+ "pmulhw simd_U_green, %%"#reg_type"2;" /* mm2 = u * u_green */\
+ "pmulhw simd_V_green, %%"#reg_type"3;" /* mm3 = v * v_green */\
+ "pmulhw simd_U_blue, %%"#reg_type"0;" /* mm0 = chroma_b */\
+ "pmulhw simd_V_red, %%"#reg_type"1;" /* mm1 = chroma_r */\
+ "paddsw %%"#reg_type"3, %%"#reg_type"2;" /* mm2 = chroma_g */\
+ "psubusb simd_10w, %%"#reg_type"6;" /* Y -= 16 */\
+ #mov_instr " %%"#reg_type"6, %%"#reg_type"7;" /* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\
+ "pand simd_00ffw, %%"#reg_type"6;" /* mm6 = Y6 Y4 Y2 Y0 */\
+ "psrlw $8, %%"#reg_type"7;" /* mm7 = Y7 Y5 Y3 Y1 */\
+ "psllw $3, %%"#reg_type"6;" /* promote precision */\
+ "psllw $3, %%"#reg_type"7;" /* promote precision */\
+ "pmulhw simd_Y_coeff, %%"#reg_type"6;" /* mm6 = luma_rgb even */\
+ "pmulhw simd_Y_coeff, %%"#reg_type"7;" /* mm7 = luma_rgb odd */\
+ #mov_instr " %%"#reg_type"0, %%"#reg_type"3;" /* mm3 = chroma_b */\
+ #mov_instr " %%"#reg_type"1, %%"#reg_type"4;" /* mm4 = chroma_r */\
+ #mov_instr " %%"#reg_type"2, %%"#reg_type"5;" /* mm5 = chroma_g */\
+ "paddsw %%"#reg_type"6, %%"#reg_type"0;" /* mm0 = B6 B4 B2 B0 */\
+ "paddsw %%"#reg_type"7, %%"#reg_type"3;" /* mm3 = B7 B5 B3 B1 */\
+ "paddsw %%"#reg_type"6, %%"#reg_type"1;" /* mm1 = R6 R4 R2 R0 */\
+ "paddsw %%"#reg_type"7, %%"#reg_type"4;" /* mm4 = R7 R5 R3 R1 */\
+ "paddsw %%"#reg_type"6, %%"#reg_type"2;" /* mm2 = G6 G4 G2 G0 */\
+ "paddsw %%"#reg_type"7, %%"#reg_type"5;" /* mm5 = G7 G5 G3 G1 */\
+ "packuswb %%"#reg_type"0, %%"#reg_type"0;" /* saturate to 0-255 */\
+ "packuswb %%"#reg_type"1, %%"#reg_type"1;" /* saturate to 0-255 */\
+ "packuswb %%"#reg_type"2, %%"#reg_type"2;" /* saturate to 0-255 */\
+ "packuswb %%"#reg_type"3, %%"#reg_type"3;" /* saturate to 0-255 */\
+ "packuswb %%"#reg_type"4, %%"#reg_type"4;" /* saturate to 0-255 */\
+ "packuswb %%"#reg_type"5, %%"#reg_type"5;" /* saturate to 0-255 */\
+ "punpcklbw %%"#reg_type"3, %%"#reg_type"0;" /* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */\
+ "punpcklbw %%"#reg_type"4, %%"#reg_type"1;" /* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */\
+ "punpcklbw %%"#reg_type"5, %%"#reg_type"2;" /* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */\
+ ::"m" (simd_80w), /* NOTE(review): listed as inputs, but the asm text names the simd_* symbols directly instead of %N operands, and no clobber list is given */ \
+ "m" (simd_U_green), \
+ "m" (simd_V_green), \
+ "m" (simd_U_blue), \
+ "m" (simd_V_red), \
+ "m" (simd_10w), \
+ "m" (simd_00ffw), \
+ "m" (simd_Y_coeff));
+
+#define OUTPUT_BGRA_32(mov_instr, reg_type, offset0, offset1, offset2) \
+ __asm__ __volatile__ ( \
+ /* in: r0=B, r1=R, r2=G; r3 is loaded with alpha (F). Lane comments list bytes most-significant first; four registers are stored at dst, dst+offset0, dst+offset1 and dst+offset2 using the instruction named by MOVNTQ */ \
+ #mov_instr " simd_alpha, %%"#reg_type"3;\n\t"\
+ #mov_instr " %%"#reg_type"0, %%"#reg_type"4;\n\t"\
+ #mov_instr " %%"#reg_type"1, %%"#reg_type"5;\n\t"\
+ "punpcklbw %%"#reg_type"2, %%"#reg_type"0;\n\t" /* GB GB GB GB low */\
+ "punpcklbw %%"#reg_type"3, %%"#reg_type"1;\n\t" /* FR FR FR FR low */\
+ "punpckhbw %%"#reg_type"2, %%"#reg_type"4;\n\t" /* GB GB GB GB high */\
+ "punpckhbw %%"#reg_type"3, %%"#reg_type"5;\n\t" /* FR FR FR FR high */\
+ #mov_instr " %%"#reg_type"0, %%"#reg_type"6;\n\t"\
+ #mov_instr " %%"#reg_type"4, %%"#reg_type"7;\n\t"\
+ "punpcklwd %%"#reg_type"1, %%"#reg_type"0;\n\t" /* FRGB FRGB 0 */\
+ "punpckhwd %%"#reg_type"1, %%"#reg_type"6;\n\t" /* FRGB FRGB 1 */\
+ "punpcklwd %%"#reg_type"5, %%"#reg_type"4;\n\t" /* FRGB FRGB 2 */\
+ "punpckhwd %%"#reg_type"5, %%"#reg_type"7;\n\t" /* FRGB FRGB 3 */\
+ MOVNTQ " %%"#reg_type"0, (%0);\n\t"\
+ MOVNTQ " %%"#reg_type"6, "#offset0"(%0);\n\t"\
+ MOVNTQ " %%"#reg_type"4, "#offset1"(%0);\n\t"\
+ MOVNTQ " %%"#reg_type"7, "#offset2"(%0);\n\t"\
+ :: "r" (dst), "m" (simd_alpha));
+
+
+#define OUTPUT_ARGB_32(mov_instr, reg_type, offset0, offset1, offset2) \
+ __asm__ __volatile__ ( \
+ /* in: r0=B, r1=R, r2=G; r3 is loaded with alpha (F). Lane comments list bytes most-significant first; four registers are stored at dst, dst+offset0, dst+offset1 and dst+offset2 using the instruction named by MOVNTQ */ \
+ #mov_instr " simd_alpha, %%"#reg_type"3;\n\t"\
+ #mov_instr " %%"#reg_type"3, %%"#reg_type"4;\n\t"\
+ #mov_instr " %%"#reg_type"2, %%"#reg_type"5;\n\t"\
+ "punpcklbw %%"#reg_type"0, %%"#reg_type"2;\n\t" /* BG BG BG BG low */\
+ "punpcklbw %%"#reg_type"1, %%"#reg_type"3;\n\t" /* RF RF RF RF low */\
+ "punpckhbw %%"#reg_type"0, %%"#reg_type"5;\n\t" /* BG BG BG BG high */\
+ "punpckhbw %%"#reg_type"1, %%"#reg_type"4;\n\t" /* RF RF RF RF high */\
+ #mov_instr " %%"#reg_type"3, %%"#reg_type"0;\n\t"\
+ #mov_instr " %%"#reg_type"4, %%"#reg_type"1;\n\t"\
+ "punpcklwd %%"#reg_type"2, %%"#reg_type"3;\n\t" /* BGRF BGRF 0 */\
+ "punpckhwd %%"#reg_type"2, %%"#reg_type"0;\n\t" /* BGRF BGRF 1 */\
+ "punpcklwd %%"#reg_type"5, %%"#reg_type"4;\n\t" /* BGRF BGRF 2 */\
+ "punpckhwd %%"#reg_type"5, %%"#reg_type"1;\n\t" /* BGRF BGRF 3 */\
+ MOVNTQ " %%"#reg_type"3, (%0);\n\t"\
+ MOVNTQ " %%"#reg_type"0, "#offset0"(%0);\n\t"\
+ MOVNTQ " %%"#reg_type"4, "#offset1"(%0);\n\t"\
+ MOVNTQ " %%"#reg_type"1, "#offset2"(%0);\n\t"\
+ :: "r" (dst), "m" (simd_alpha));
+
+#define OUTPUT_RGBA_32(mov_instr, reg_type, offset0, offset1, offset2) \
+ __asm__ __volatile__ ( \
+ /* in: r0=B, r1=R, r2=G; r3 is loaded with alpha (F). Lane comments list bytes most-significant first; four registers are stored at dst, dst+offset0, dst+offset1 and dst+offset2 using the instruction named by MOVNTQ */ \
+ #mov_instr " simd_alpha, %%"#reg_type"3;\n\t"\
+ #mov_instr " %%"#reg_type"1, %%"#reg_type"4;\n\t"\
+ #mov_instr " %%"#reg_type"0, %%"#reg_type"5;\n\t"\
+ "punpcklbw %%"#reg_type"2, %%"#reg_type"1;\n\t" /* GR GR GR GR low */\
+ "punpcklbw %%"#reg_type"3, %%"#reg_type"0;\n\t" /* FB FB FB FB low */\
+ "punpckhbw %%"#reg_type"2, %%"#reg_type"4;\n\t" /* GR GR GR GR high */\
+ "punpckhbw %%"#reg_type"3, %%"#reg_type"5;\n\t" /* FB FB FB FB high */\
+ #mov_instr " %%"#reg_type"1, %%"#reg_type"6;\n\t"\
+ #mov_instr " %%"#reg_type"4, %%"#reg_type"7;\n\t"\
+ "punpcklwd %%"#reg_type"0, %%"#reg_type"1;\n\t" /* FBGR FBGR 0 */\
+ "punpckhwd %%"#reg_type"0, %%"#reg_type"6;\n\t" /* FBGR FBGR 1 */\
+ "punpcklwd %%"#reg_type"5, %%"#reg_type"4;\n\t" /* FBGR FBGR 2 */\
+ "punpckhwd %%"#reg_type"5, %%"#reg_type"7;\n\t" /* FBGR FBGR 3 */\
+ MOVNTQ " %%"#reg_type"1, (%0);\n\t"\
+ MOVNTQ " %%"#reg_type"6, "#offset0"(%0);\n\t"\
+ MOVNTQ " %%"#reg_type"4, "#offset1"(%0);\n\t"\
+ MOVNTQ " %%"#reg_type"7, "#offset2"(%0);\n\t"\
+ :: "r" (dst), "m" (simd_alpha));
+
+#define LOAD_YUV_PLANAR_2(mov_instr, reg_type) /* loads r6 from *py, r0 from *pu, r1 from *pv and clears r4 */ \
+ __asm__ __volatile__ ( \
+ #mov_instr " %0, %%"#reg_type"6;\n\t" \
+ #mov_instr " %1, %%"#reg_type"0;\n\t" \
+ #mov_instr " %2, %%"#reg_type"1;\n\t" \
+ "pxor %%"#reg_type"4, %%"#reg_type"4;\n\t" \
+ :: "m" (*py), "m" (*pu), "m" (*pv));
+
+
+#endif /* __YUV2RGB_X86_H__ */
+
Added: liboggplay/trunk/src/liboggplay/yuv2rgb_x86_vs.h
===================================================================
--- liboggplay/trunk/src/liboggplay/yuv2rgb_x86_vs.h (rev 0)
+++ liboggplay/trunk/src/liboggplay/yuv2rgb_x86_vs.h 2009-02-20 09:19:45 UTC (rev 3850)
@@ -0,0 +1,129 @@
+#ifndef __OGGPLAY_YUV2RGB_VS_H__
+#define __OGGPLAY_YUV2RGB_VS_H__
+
+#define emms() __asm emms
+#define MMX_MOVNTQ movntq
+#define SSE2_MOVNTQ movdqu /* an ordinary unaligned store, despite the MOVNTQ name */
+
+#define LOAD_YUV_PLANAR_2(mov_instr, reg_type) /* loads r6 from [py], r0 from [pu], r1 from [pv] and clears r4; uses eax and edx as scratch */ \
+ __asm { \
+ __asm mov eax, py \
+ __asm mov edx, pu \
+ __asm mov_instr reg_type##6, [eax] \
+ __asm mov_instr reg_type##0, [edx] \
+ __asm mov eax, pv \
+ __asm mov_instr reg_type##1, [eax] \
+ __asm pxor reg_type##4, reg_type##4 \
+ }
+
+#define OUTPUT_RGBA_32(mov_instr, reg_type, offset0, offset1, offset2) /* in: r0=B, r1=R, r2=G; r3 is loaded with alpha; stores four registers at dst, dst+offset0, dst+offset1, dst+offset2 */ \
+ __asm { \
+ __asm mov eax, dst \
+ __asm mov_instr reg_type##3, simd_alpha \
+ __asm mov_instr reg_type##4, reg_type##1 \
+ __asm mov_instr reg_type##5, reg_type##0 \
+ __asm punpcklbw reg_type##1, reg_type##2 \
+ __asm punpcklbw reg_type##0, reg_type##3 \
+ __asm punpckhbw reg_type##4, reg_type##2 \
+ __asm punpckhbw reg_type##5, reg_type##3 \
+ __asm mov_instr reg_type##6, reg_type##1 \
+ __asm mov_instr reg_type##7, reg_type##4 \
+ __asm punpcklwd reg_type##1, reg_type##0 \
+ __asm punpckhwd reg_type##6, reg_type##0 \
+ __asm punpcklwd reg_type##4, reg_type##5 \
+ __asm punpckhwd reg_type##7, reg_type##5 \
+ __asm MOVNTQ [eax], reg_type##1 \
+ __asm MOVNTQ [eax+offset0], reg_type##6 \
+ __asm MOVNTQ [eax+offset1], reg_type##4 \
+ __asm MOVNTQ [eax+offset2], reg_type##7 \
+ }
+
+#define OUTPUT_ARGB_32(mov_instr, reg_type, offset0, offset1, offset2) /* in: r0=B, r1=R, r2=G; r3 is loaded with alpha; stores four registers at dst, dst+offset0, dst+offset1, dst+offset2 */ \
+ __asm { \
+ __asm mov eax, dst \
+ __asm mov_instr reg_type##3, simd_alpha \
+ __asm mov_instr reg_type##4, reg_type##3 \
+ __asm mov_instr reg_type##5, reg_type##2 \
+ __asm punpcklbw reg_type##2, reg_type##0 \
+ __asm punpcklbw reg_type##3, reg_type##1 \
+ __asm punpckhbw reg_type##5, reg_type##0 \
+ __asm punpckhbw reg_type##4, reg_type##1 \
+ __asm mov_instr reg_type##0, reg_type##3 \
+ __asm mov_instr reg_type##1, reg_type##4 \
+ __asm punpcklwd reg_type##3, reg_type##2 \
+ __asm punpckhwd reg_type##0, reg_type##2 \
+ __asm punpcklwd reg_type##4, reg_type##5 \
+ __asm punpckhwd reg_type##1, reg_type##5 \
+ __asm MOVNTQ [eax], reg_type##3 \
+ __asm MOVNTQ [eax+offset0], reg_type##0 \
+ __asm MOVNTQ [eax+offset1], reg_type##4 \
+ __asm MOVNTQ [eax+offset2], reg_type##1 \
+ }
+
+#define OUTPUT_BGRA_32(mov_instr, reg_type, offset0, offset1, offset2) /* in: r0=B, r1=R, r2=G; r3 is loaded with alpha; stores four registers at dst, dst+offset0, dst+offset1, dst+offset2 */ \
+ __asm { \
+ __asm mov eax, dst \
+ __asm mov_instr reg_type##3, simd_alpha \
+ __asm mov_instr reg_type##4, reg_type##0 \
+ __asm mov_instr reg_type##5, reg_type##1 \
+ __asm punpcklbw reg_type##0, reg_type##2 \
+ __asm punpcklbw reg_type##1, reg_type##3 \
+ __asm punpckhbw reg_type##4, reg_type##2 \
+ __asm punpckhbw reg_type##5, reg_type##3 \
+ __asm mov_instr reg_type##6, reg_type##0 \
+ __asm mov_instr reg_type##7, reg_type##4 \
+ __asm punpcklwd reg_type##0, reg_type##1 \
+ __asm punpckhwd reg_type##6, reg_type##1 \
+ __asm punpcklwd reg_type##4, reg_type##5 \
+ __asm punpckhwd reg_type##7, reg_type##5 \
+ __asm MOVNTQ [eax], reg_type##0 \
+ __asm MOVNTQ [eax+offset0], reg_type##6 \
+ __asm MOVNTQ [eax+offset1], reg_type##4 \
+ __asm MOVNTQ [eax+offset2], reg_type##7 \
+ }
+
+#define YUV_2_RGB(mov_instr, reg_type) /* in: r0 = U bytes, r1 = V bytes, r4 = 0, r6 = Y bytes; out: r0 = B, r1 = R, r2 = G as packed bytes */ \
+ __asm { \
+ __asm punpcklbw reg_type##0, reg_type##4 /* mm0 = u3 u2 u1 u0 */\
+ __asm punpcklbw reg_type##1, reg_type##4 /* mm1 = v3 v2 v1 v0 */\
+ __asm psubsw reg_type##0, simd_80w /* u -= 128 */\
+ __asm psubsw reg_type##1, simd_80w /* v -= 128 */\
+ __asm psllw reg_type##0, 3 /* promote precision */\
+ __asm psllw reg_type##1, 3 /* promote precision */\
+ __asm mov_instr reg_type##2, reg_type##0 /* mm2 = u3 u2 u1 u0 */\
+ __asm mov_instr reg_type##3, reg_type##1 /* mm3 = v3 v2 v1 v0 */\
+ __asm pmulhw reg_type##2, simd_U_green /* mm2 = u * u_green */\
+ __asm pmulhw reg_type##3, simd_V_green /* mm3 = v * v_green */\
+ __asm pmulhw reg_type##0, simd_U_blue /* mm0 = chroma_b */\
+ __asm pmulhw reg_type##1, simd_V_red /* mm1 = chroma_r */\
+ __asm paddsw reg_type##2, reg_type##3 /* mm2 = chroma_g */\
+ __asm psubusb reg_type##6, simd_10w /* Y -= 16 */\
+ __asm mov_instr reg_type##7, reg_type##6 /* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\
+ __asm pand reg_type##6, simd_00ffw /* mm6 = Y6 Y4 Y2 Y0 */\
+ __asm psrlw reg_type##7, 8 /* mm7 = Y7 Y5 Y3 Y1 */\
+ __asm psllw reg_type##6, 3 /* promote precision */\
+ __asm psllw reg_type##7, 3 /* promote precision */\
+ __asm pmulhw reg_type##6, simd_Y_coeff /* mm6 = luma_rgb even */\
+ __asm pmulhw reg_type##7, simd_Y_coeff /* mm7 = luma_rgb odd */\
+ __asm mov_instr reg_type##3, reg_type##0 /* mm3 = chroma_b */\
+ __asm mov_instr reg_type##4, reg_type##1 /* mm4 = chroma_r */\
+ __asm mov_instr reg_type##5, reg_type##2 /* mm5 = chroma_g */\
+ __asm paddsw reg_type##0, reg_type##6 /* mm0 = B6 B4 B2 B0 */\
+ __asm paddsw reg_type##3, reg_type##7 /* mm3 = B7 B5 B3 B1 */\
+ __asm paddsw reg_type##1, reg_type##6 /* mm1 = R6 R4 R2 R0 */\
+ __asm paddsw reg_type##4, reg_type##7 /* mm4 = R7 R5 R3 R1 */\
+ __asm paddsw reg_type##2, reg_type##6 /* mm2 = G6 G4 G2 G0 */\
+ __asm paddsw reg_type##5, reg_type##7 /* mm5 = G7 G5 G3 G1 */\
+ __asm packuswb reg_type##0, reg_type##0 /* saturate to 0-255 */\
+ __asm packuswb reg_type##1, reg_type##1 /* saturate to 0-255 */\
+ __asm packuswb reg_type##2, reg_type##2 /* saturate to 0-255 */\
+ __asm packuswb reg_type##3, reg_type##3 /* saturate to 0-255 */\
+ __asm packuswb reg_type##4, reg_type##4 /* saturate to 0-255 */\
+ __asm packuswb reg_type##5, reg_type##5 /* saturate to 0-255 */\
+ __asm punpcklbw reg_type##0, reg_type##3 /* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */\
+ __asm punpcklbw reg_type##1, reg_type##4 /* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */\
+ __asm punpcklbw reg_type##2, reg_type##5 /* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */\
+ }
+
+#endif
+
More information about the commits
mailing list