[xiph-commits] r3850 - in liboggplay/trunk: . include/oggplay src/examples src/liboggplay

Fri Feb 20 01:19:46 PST 2009

Author: wiking
Date: 2009-02-20 01:19:45 -0800 (Fri, 20 Feb 2009)
New Revision: 3850

Added:
   liboggplay/trunk/src/liboggplay/cpu.c
   liboggplay/trunk/src/liboggplay/cpu.h
   liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_altivec.c
   liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_template.h
   liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_x86.c
   liboggplay/trunk/src/liboggplay/yuv2rgb_x86.h
   liboggplay/trunk/src/liboggplay/yuv2rgb_x86_vs.h
Modified:
   liboggplay/trunk/configure.ac
   liboggplay/trunk/include/oggplay/oggplay_tools.h
   liboggplay/trunk/src/examples/glut-player.c
   liboggplay/trunk/src/examples/mac-player.c
   liboggplay/trunk/src/examples/win32-player.c
   liboggplay/trunk/src/liboggplay/Makefile.am
   liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb.c
Log:
yuv to rgb conversion enhancements:
 - runtime cpu extension detection (from theora) + altivec detection support
 - altivec implementation of yuv2rgb functions
 - sse2 implementation of yuv2rgb functions

!!!WARNING!!!
The API for the conversion has been changed, in order to avoid confusion:
    oggplay_yuv2bgr -> oggplay_yuv2bgra
    oggplay_yuv2rgb -> oggplay_yuv2rgba
 
A new function, 'oggplay_yuv2argb', has been introduced, as e.g. DirectX uses ARGB packing.



Modified: liboggplay/trunk/configure.ac
===================================================================

--- liboggplay/trunk/configure.ac	2009-02-19 05:46:08 UTC (rev 3849)
+++ liboggplay/trunk/configure.ac	2009-02-20 09:19:45 UTC (rev 3850)
@@ -277,41 +277,6 @@
 fi
 dnl changequote([,])dnl
 
-dnl Define __SSE2__ to include MMX intrinsics (SSE2) code.
-dnl Sets also SSE2_CFLAGS for the compile time.
-
-SSE2_CFLAGS="-msse2 -march=pentium3"
-
-have_mmx_intrinsics=no
-AC_MSG_CHECKING(For MMX/SSE intrinsics in the compiler)
-liboggplay_save_CFLAGS=$CFLAGS
-CFLAGS="$CFLAGS $SSE2_CFLAGS"
-AC_COMPILE_IFELSE([
-#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
-#error "Need GCC >= 3.4 for MMX SSE2 intrinsics"
-#endif
-#include <xmmintrin.h>
-int main () {
- 	__m64 zero =_mm_setzero_si64(); 
-        __m64 ut = _m_from_int(0);
-        __m64 vt = _m_from_int(0);
-        ut = _m_punpcklbw(ut, zero);
-        vt = _m_punpcklbw(vt, zero);
-        vt = _mm_or_si64 (ut, vt);
-        return _mm_cvtsi64_si32 (vt);
-}], have_mmx_intrinsics=yes)
-CFLAGS=$liboggplay_save_CFLAGS
-AC_MSG_RESULT($have_mmx_intrinsics)
-
-if test $have_mmx_intrinsics = yes ; then
-   AC_DEFINE(__SSE2__, 1, [use MMX SSE2 compiler intrinsics])
-else
-   SSE2_CFLAGS=
-fi
-AC_SUBST(SSE2_CFLAGS)
-
-AM_CONDITIONAL(__SSE2__, test $have_mmx_intrinsics = yes)
-
 dnl
 dnl  Configuration tests complete -- provide summary of results.
 dnl

Modified: liboggplay/trunk/include/oggplay/oggplay_tools.h
===================================================================
--- liboggplay/trunk/include/oggplay/oggplay_tools.h	2009-02-19 05:46:08 UTC (rev 3849)
+++ liboggplay/trunk/include/oggplay/oggplay_tools.h	2009-02-20 09:19:45 UTC (rev 3850)
@@ -72,11 +72,14 @@
 } OggPlayRGBChannels;
 
 void 
-oggplay_yuv2rgb(OggPlayYUVChannels* yuv, OggPlayRGBChannels * rgb);
+oggplay_yuv2rgba(const OggPlayYUVChannels *yuv, OggPlayRGBChannels * rgb);
 
 void 
-oggplay_yuv2bgr(OggPlayYUVChannels* yuv, OggPlayRGBChannels * rgb);
+oggplay_yuv2bgra(const OggPlayYUVChannels* yuv, OggPlayRGBChannels * rgb);
 
+void 
+oggplay_yuv2argb(const OggPlayYUVChannels *yuv, OggPlayRGBChannels * rgb);
+
 ogg_int64_t
 oggplay_sys_time_in_ms(void);
 

Modified: liboggplay/trunk/src/examples/glut-player.c
===================================================================
--- liboggplay/trunk/src/examples/glut-player.c	2009-02-19 05:46:08 UTC (rev 3849)
+++ liboggplay/trunk/src/examples/glut-player.c	2009-02-20 09:19:45 UTC (rev 3850)
@@ -141,7 +141,7 @@
   rgb.rgb_width = texture_width;
   rgb.rgb_height = texture_height;  
 
-  oggplay_yuv2rgb(&yuv, &rgb);
+  oggplay_yuv2rgba(&yuv, &rgb);
 
 }
 

Modified: liboggplay/trunk/src/examples/mac-player.c
===================================================================
--- liboggplay/trunk/src/examples/mac-player.c	2009-02-19 05:46:08 UTC (rev 3849)
+++ liboggplay/trunk/src/examples/mac-player.c	2009-02-20 09:19:45 UTC (rev 3850)
@@ -170,7 +170,7 @@
   rgb.rgb_width = texture_width;
   rgb.rgb_height = texture_height;  
   
-  oggplay_yuv2rgb(&yuv, &rgb);
+  oggplay_yuv2rgba(&yuv, &rgb);
 }
 
 

Modified: liboggplay/trunk/src/examples/win32-player.c
===================================================================
--- liboggplay/trunk/src/examples/win32-player.c	2009-02-19 05:46:08 UTC (rev 3849)
+++ liboggplay/trunk/src/examples/win32-player.c	2009-02-20 09:19:45 UTC (rev 3850)
@@ -178,58 +178,11 @@
   rgb.rgb_height = texture_height;  
 
 #if OPENGL
-  oggplay_yuv2rgb(&yuv, &rgb);
+  oggplay_yuv2rgba(&yuv, &rgb);
 #else
-  oggplay_yuv2bgr(&yuv, &rgb);
+  oggplay_yuv2bgra(&yuv, &rgb);
 #endif 
-
-#else
-  ptry = video_data->y;
-  ptru = video_data->u;
-  ptrv = video_data->v;
-  ptro = texture_bits;
-
-  for (i = 0; i < y_height; i++) {
-    int j;
-    ptro2 = ptro;
-    for (j = 0; j < y_width; j += 2) {
-
-      short pr, pg, pb;
-      short r, g, b;
-      
-    //pr = ((128 + (ptrv[j/2] - 128) * 292) >> 8) - 16; /* 1.14 * 256 */
-      pr = (-41344 + ptrv[j/2] * 292) >> 8;
-    //pg = ((128 - (ptru[j/2] - 128) * 101 - (ptrv[j/2] - 128) * 149) >> 8)-16; 
-    //                                /* 0.395 & 0.581 */
-      pg = (28032 - ptru[j/2] * 101 - ptrv[j/2] * 149) >> 8;
-    //pb = ((128 + (ptru[j/2] - 128) * 520) >> 8) - 16; /* 2.032 */
-      pb = (-70528 + ptru[j/2] * 520) >> 8;
-
-      r = ptry[j] + pr;
-      g = ptry[j] + pg;
-      b = ptry[j] + pb;
-
-      *ptro2++ = CLAMP(r);
-      *ptro2++ = CLAMP(g);
-      *ptro2++ = CLAMP(b);
-      
-      r = ptry[j + 1] + pr;
-      g = ptry[j + 1] + pg;
-      b = ptry[j + 1] + pb;
-      
-      *ptro2++ = CLAMP(b);
-      *ptro2++ = CLAMP(g);
-      *ptro2++ = CLAMP(r);
-    }
-    ptry += y_width;
-    if (i & 1) {
-      ptru += uv_width;
-      ptrv += uv_width;
-    }
-    ptro += po2_width * 3;
-  }
   
-  
 #endif
  
 }

Modified: liboggplay/trunk/src/liboggplay/Makefile.am
===================================================================
--- liboggplay/trunk/src/liboggplay/Makefile.am	2009-02-19 05:46:08 UTC (rev 3849)
+++ liboggplay/trunk/src/liboggplay/Makefile.am	2009-02-20 09:19:45 UTC (rev 3850)
@@ -1,6 +1,6 @@
 ## Process this file with automake to produce Makefile.in
 
-AM_CFLAGS = -Wall $(SSE2_CFLAGS) -Wdeclaration-after-statement
+AM_CFLAGS = -Wall -Wdeclaration-after-statement
 
 INCLUDES = $(INCLTDL) -I$(top_srcdir)/include
 
@@ -9,13 +9,17 @@
 # Libraries to build
 lib_LTLIBRARIES = liboggplay.la
 
-noinst_HEADERS =            \
-	oggplay_buffer.h	\
-	oggplay_callback.h	\
-	oggplay_data.h		\
-	oggplay_file_reader.h	\
-	oggplay_private.h	\
-	oggplay_tcp_reader.h \
+noinst_HEADERS =			\
+	cpu.h				\
+	oggplay_buffer.h		\
+	oggplay_callback.h		\
+	oggplay_data.h			\
+	oggplay_file_reader.h		\
+	oggplay_private.h		\
+	oggplay_tcp_reader.h 		\
+	oggplay_yuv2rgb_template.h 	\
+	yuv2rgb_x86.h 			\
+	yuv2rgb_x86_vs.h 		\
 	std_semaphore.h  
 
 liboggplay_la_SOURCES =     \
@@ -27,10 +31,10 @@
 	oggplay_data.c		\
 	oggplay_callback_info.c	\
 	oggplay_buffer.c	\
-	oggplay_yuv2rgb.c \
+	oggplay_yuv2rgb.c	\
 	oggplay_seek.c		\
 	oggplay_tools.c
 
-liboggplay_la_CFLAGS = $(AM_CFLAGS) $(OGGZ_CFLAGS) $(FISHSOUND_CFLAGS)
+liboggplay_la_CFLAGS = $(AM_CFLAGS) $(OGGZ_CFLAGS) $(FISHSOUND_CFLAGS) $(ALTIVEC_CFLAGS)
 liboggplay_la_LDFLAGS = -version-info @SHARED_VERSION_INFO@ @SHLIB_VERSION_ARG@ 
 liboggplay_la_LIBADD = @SEMAPHORE_LIBS@ @OGGZ_LIBS@ @FISHSOUND_LIBS@ @THEORA_LIBS@ @KATE_LIBS@

Added: liboggplay/trunk/src/liboggplay/cpu.c
===================================================================
--- liboggplay/trunk/src/liboggplay/cpu.c	                        (rev 0)
+++ liboggplay/trunk/src/liboggplay/cpu.c	2009-02-20 09:19:45 UTC (rev 3850)
@@ -0,0 +1,270 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+ CPU capability detection for x86 processors.
+  Originally written by Rudolf Marek.
+
+ function:
+  last mod: $Id$
+
+ ********************************************************************/
+
+#include "cpu.h"
+
+/* for detecting AltiVec support */
+# if (defined(__ppc__) || defined(__ppc64__))
+#   if defined(__APPLE__) || defined(__MACOSX__)
+#include <sys/sysctl.h>
+#   else
+#include <signal.h>
+#include <setjmp.h>
+#   endif
+# endif
+
+# if (defined(__ppc__) || defined(__ppc64__)) && !(defined(__APPLE__) || defined(__MACOSX__))
+static jmp_buf jmpbuf;
+
+static void illegal_instruction(int sig)
+{
+        longjmp(jmpbuf, 1);
+}
+# endif
+
+
+# if !defined(_MSC_VER)
+#  if defined(__amd64__)||defined(__x86_64__)
+/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
+   compiling with -fPIC.*/
+#   define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+  __asm__ __volatile__( \
+   "cpuid\n\t" \
+   :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+   :"a"(_op) \
+   :"cc" \
+  )
+#  else
+/*On x86-32, not so much.*/
+#   define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+  __asm__ __volatile__( \
+   "xchgl %%ebx,%[ebx]\n\t" \
+   "cpuid\n\t" \
+   "xchgl %%ebx,%[ebx]\n\t" \
+   :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+   :"a"(_op) \
+   :"cc" \
+  )
+#  endif
+# else
+/*Why does MSVC need this complicated rigamarole?
+  At this point I honestly do not care.*/
+
+/*Visual C cpuid helper function.
+  For VS2005 we could as well use the _cpuid builtin, but that wouldn't work
+   for VS2003 users, so we do it in inline assembler.*/
+static void oc_cpuid_helper(ogg_uint32_t _cpu_info[4],ogg_uint32_t _op){
+  _asm {
+    mov eax,[_op]
+    mov esi,_cpu_info
+    cpuid
+    mov [esi+0],eax
+    mov [esi+4],ebx
+    mov [esi+8],ecx
+    mov [esi+12],edx
+  }
+}
+
+#  define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+  do{ \
+    ogg_uint32_t cpu_info[4]; \
+    oc_cpuid_helper(cpu_info,_op); \
+    (_eax)=cpu_info[0]; \
+    (_ebx)=cpu_info[1]; \
+    (_ecx)=cpu_info[2]; \
+    (_edx)=cpu_info[3]; \
+  }while(0)
+
+static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){
+  _asm{
+    pushfd
+    pushfd
+    pop eax
+    mov ebx,eax
+    xor eax,200000h
+    push eax
+    popfd
+    pushfd
+    pop eax
+    popfd
+    mov ecx,_eax
+    mov [ecx],eax
+    mov ecx,_ebx
+    mov [ecx],ebx
+  }
+}
+# endif
+
+/*Maps CPUID function 1 feature bits (_edx,_ecx) to OC_CPU_X86_* flags.*/
+static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+  ogg_uint32_t flags;
+  if(!(_edx&0x00800000))return 0; /*If there isn't even MMX, give up.*/
+  flags=OC_CPU_X86_MMX;
+  if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
+  if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
+  if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
+  if(_ecx&0x00000200)flags|=OC_CPU_X86_SSSE3; /*ECX bit 9; 0x100 is bit 8 (TM2).*/
+  if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
+  if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
+  return flags;
+}
+
+/*Maps CPUID function 0x80000001 feature bits (_edx,_ecx) to OC_CPU_X86_* flags.*/
+static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+  ogg_uint32_t flags;
+  if(!(_edx&0x00800000))return 0; /*If there isn't even MMX (EDX bit 23), give up.*/
+  flags=OC_CPU_X86_MMX;
+  if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT; /*EDX bit 22*/
+  if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW; /*EDX bit 31*/
+  if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT; /*EDX bit 30*/
+  if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A; /*ECX bit 6*/
+  if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5; /*ECX bit 11*/
+  return flags;
+}
+
+static ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t flags = 0;
+# if defined(__ppc__) || defined(__ppc64__) 
+/* detect AltiVec extension if compiling it for ppc */
+#  if defined(__APPLE__) || defined(__MACOSX__)  || defined(__DARWIN__)
+	int selectors[2] = { CTL_HW, HW_VECTORUNIT };
+	int i_has_altivec = 0;
+	size_t i_length = sizeof( i_has_altivec );
+	int i_error = sysctl( selectors, 2, &i_has_altivec, &i_length, NULL, 0);
+
+	if( i_error == 0 && i_has_altivec != 0 )
+		flags |= OC_CPU_PPC_ALTIVEC;
+#  else
+	void (*handler) (int sig);
+	handler = signal(SIGILL, illegal_instruction);
+	if (setjmp(jmpbuf) == 0) 
+	{
+		__asm__ __volatile__ (
+			"mtspr 256, %0\n\t" 
+			"vand %%v0, %%v0, %%v0" 
+			: : "r"(-1) );
+
+		flags |= OC_CPU_PPC_ALTIVEC;
+	}
+	signal(SIGILL, handler);
+#  endif	
+/* detect x86 CPU extensions */
+# elif defined(i386) || defined(__x86_64__) || defined(_M_IX86)
+  ogg_uint32_t eax;
+  ogg_uint32_t ebx;
+  ogg_uint32_t ecx;
+  ogg_uint32_t edx;
+# if !defined(__amd64__)&&!defined(__x86_64__)
+  /*Not all x86-32 chips support cpuid, so we have to check.*/
+#  if !defined(_MSC_VER)
+  __asm__ __volatile__(
+   "pushfl\n\t"
+   "pushfl\n\t"
+   "popl %[a]\n\t"
+   "movl %[a],%[b]\n\t"
+   "xorl $0x200000,%[a]\n\t"
+   "pushl %[a]\n\t"
+   "popfl\n\t"
+   "pushfl\n\t"
+   "popl %[a]\n\t"
+   "popfl\n\t"
+   :[a]"=r"(eax),[b]"=r"(ebx)
+   :
+   :"cc"
+  );
+#  else
+  oc_detect_cpuid_helper(&eax,&ebx);
+#  endif
+  /*No cpuid.*/
+  if(eax==ebx)return 0;
+# endif
+  cpuid(0,eax,ebx,ecx,edx);
+  /*         l e t n          I e n i          u n e G*/
+  if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
+   /*      6 8 x M          T e n i          u n e G*/
+   ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
+    /*Intel, Transmeta (tested with Crusoe TM5800):*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags=oc_parse_intel_flags(edx,ecx);
+  }
+  /*              D M A c          i t n e          h t u A*/
+  else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
+   /*      C S N            y b   e          d o e G*/
+   ecx==0x43534E20&&edx==0x79622065&&ebx==0x646F6547){
+    /*AMD, Geode:*/
+    cpuid(0x80000000,eax,ebx,ecx,edx);
+    if(eax<0x80000001)flags=0;
+    else{
+      cpuid(0x80000001,eax,ebx,ecx,edx);
+      flags=oc_parse_amd_flags(edx,ecx);
+    }
+    /*Also check for SSE.*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags|=oc_parse_intel_flags(edx,ecx);
+  }
+  /*Technically some VIA chips can be configured in the BIOS to return any
+     string here the user wants.
+    There is a special detection method that can be used to identify such
+     processors, but in my opinion, if the user really wants to change it, they
+     deserve what they get.*/
+  /*              s l u a          H r u a          t n e C*/
+  else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
+    /*VIA:*/
+    /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
+       chips (thanks to the engineers from Centaur Technology who provided it).
+      These chips support Intel-like cpuid info.
+      The C3-2 (Nehemiah) cores appear to, as well.*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags=oc_parse_intel_flags(edx,ecx);
+    cpuid(0x80000000,eax,ebx,ecx,edx);
+    if(eax>=0x80000001){
+      /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
+        We need to check this even if the Intel test succeeds to pick up 3DNow!
+         support on these processors.
+        Unlike actual AMD processors, we cannot _rely_ on this info, since
+         some cores (e.g., the 693 stepping of the Nehemiah) claim to support
+         this function, yet return edx=0, despite the Intel test indicating
+         MMX support.
+        Therefore the features detected here are strictly added to those
+         detected by the Intel test.*/
+      /*TODO: How about earlier chips?*/
+      cpuid(0x80000001,eax,ebx,ecx,edx);
+      /*Note: As of the C7, this function returns Intel-style extended feature
+         flags, not AMD-style.
+        Currently, this only defines bits 11, 20, and 29 (0x20100800), which
+         do not conflict with any of the AMD flags we inspect.
+        For the remaining bits, Intel tells us, "Do not count on their value",
+         but VIA assures us that they will all be zero (at least on the C7 and
+         Isaiah chips).
+        In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
+         (0xC0C00000) for something else, we will have to add code to detect
+         the model to decide when it is appropriate to inspect them.*/
+      flags|=oc_parse_amd_flags(edx,ecx);
+    }
+  }
+  else{
+    /*Implement me.*/
+    flags=0;
+  }
+# else
+  /* not x86 or ppc */ 
+# endif
+  return flags;
+}

Added: liboggplay/trunk/src/liboggplay/cpu.h
===================================================================
--- liboggplay/trunk/src/liboggplay/cpu.h	                        (rev 0)
+++ liboggplay/trunk/src/liboggplay/cpu.h	2009-02-20 09:19:45 UTC (rev 3850)
@@ -0,0 +1,34 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+ function:
+    last mod: $Id$
+
+ ********************************************************************/
+
+#ifndef __CPU_H__
+#define __CPU_H__
+
+#define OC_CPU_X86_MMX      (1<<0)
+#define OC_CPU_X86_3DNOW    (1<<1)
+#define OC_CPU_X86_3DNOWEXT (1<<2)
+#define OC_CPU_X86_MMXEXT   (1<<3)
+#define OC_CPU_X86_SSE      (1<<4)
+#define OC_CPU_X86_SSE2     (1<<5)
+#define OC_CPU_X86_PNI      (1<<6)
+#define OC_CPU_X86_SSSE3    (1<<7)
+#define OC_CPU_X86_SSE4_1   (1<<8)
+#define OC_CPU_X86_SSE4_2   (1<<9)
+#define OC_CPU_X86_SSE4A    (1<<10)
+#define OC_CPU_X86_SSE5     (1<<11)
+#define OC_CPU_PPC_ALTIVEC  (1<<12)
+
+#endif

Modified: liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb.c
===================================================================
--- liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb.c	2009-02-19 05:46:08 UTC (rev 3849)
+++ liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb.c	2009-02-20 09:19:45 UTC (rev 3850)
@@ -38,413 +38,209 @@
  * Shane Stephens <shane.stephens at annodex.net>
  * Michael Martin
  * Marcin Lubonski
+ * Viktor Gal
  */
 
 #include "oggplay_private.h"
+#include "oggplay_yuv2rgb_template.h"
 
-/*
- * YUV -> RGB conversion
- *  R = Y + 1.140V
- *  G = Y - 0.395U - 0.581V
- *  B = Y + 2.032U
- *
- * RGB -> YUV conversion
- *  Y = 0.299 R + 0.587 G + 0.114 B
- *  U = 0.147 R - 0.289 G + 0.436 B
- *  V = 0.615 R - 0.515 G - 0.100 B
- */
+/* cpu extension detection */
+#include "cpu.c"
 
-#if defined(__MMX__) || defined(__SSE__) || defined(__SSE2__) || defined(__SSE3__)
-
-#if defined(WIN32)
-#define restrict
-#include <emmintrin.h>
-#else
-#include <xmmintrin.h>
-#ifndef restrict
-#define restrict __restrict__
+/* although we use runtime cpu detection, we still need these
+ * macros, as there's no way we could compile e.g. x86 asm code
+ * on a ppc machine and vice versa
+ */
+#if defined(i386) || defined(__x86__) || defined(__x86_64__) || defined(_M_IX86)
+#include "oggplay_yuv2rgb_x86.c"
+#elif defined(__ppc__) || defined(__ppc64__)
+//altivec intrinsics only work with the -maltivec gcc flag,
+//but we want runtime altivec detection, hence this has to be
+//fixed!
+//#include "oggplay_yuv2rgb_altivec.c"
 #endif
-#endif
 
-/* YUV -> RGB Intel MMX implementation */
-void oggplay_yuv2rgb(OggPlayYUVChannels * yuv, OggPlayRGBChannels * rgb) {
+/**
+ * yuv_convert_fptr type is a function pointer type for
+ * the various yuv-rgb converters
+ */
+typedef void (*yuv_convert_fptr) (const OggPlayYUVChannels *yuv, 
+					OggPlayRGBChannels *rgb);
 
-  int               i;
-  unsigned char   * restrict ptry;
-  unsigned char   * restrict ptru;
-  unsigned char   * restrict ptrv;
-  unsigned char   * ptro;
+/* detecting the cpu type/features on every YUV conversion run
+ * would be wasteful, thus we save the conversion function
+ * pointers
+ */
+static struct OggPlayYUVConverters {
+	yuv_convert_fptr yuv2rgba; /**< YUV420 to RGBA */
+	yuv_convert_fptr yuv2bgra; /**< YUV420 to BGRA */
+	yuv_convert_fptr yuv2argb; /**< YUV420 to ARGB */
+} yuv_conv = {NULL, NULL, NULL};
 
-  register __m64    *y, *o;
-  register __m64    zero, ut, vt, imm, imm2;
-  register __m64    r, g, b;
-  register __m64    tmp, tmp2;
+/**
+ * vanilla implementation of YUV-to-RGB conversion.
+ *
+ *  - using table-lookups instead of multiplication
+ *  - avoid CLAMPing by incorporating 
+ *
+ */
 
-  zero = _mm_setzero_si64();
+#define CLAMP(v)    ((v) > 255 ? 255 : (v) < 0 ? 0 : (v))
 
-  ptro = rgb->ptro;
-  ptry = yuv->ptry;
-  ptru = yuv->ptru;
-  ptrv = yuv->ptrv;
+#define prec 15 
+static const int CoY	= (int)(1.164 * (1 << prec) + 0.5);
+static const int CoRV	= (int)(1.596 * (1 << prec) + 0.5);
+static const int CoGU	= (int)(0.391 * (1 << prec) + 0.5);
+static const int CoGV	= (int)(0.813 * (1 << prec) + 0.5);
+static const int CoBU	= (int)(2.018 * (1 << prec) + 0.5);
 
-  for (i = 0; i < yuv->y_height; i++) {
-    int j;
-    o = (__m64*)ptro;
-    ptro += rgb->rgb_width * 4;
-    for (j = 0; j < yuv->y_width; j += 8) {
+static int CoefsGU[256] = {0};
+static int CoefsGV[256]; 
+static int CoefsBU[256]; 
+static int CoefsRV[256];
+static int CoefsY[256];
 
-      y = (__m64*)&ptry[j];
+/**
+ * Initialize the lookup-table for vanilla yuv to rgb conversion.
+ */
+static void
+init_tables()
+{
+	int i;
 
-      ut = _m_from_int(*(int *)(ptru + j/2));
-      vt = _m_from_int(*(int *)(ptrv + j/2));
-
-      //ut = _m_from_int(0);
-      //vt = _m_from_int(0);
-
-      ut = _m_punpcklbw(ut, zero);
-      vt = _m_punpcklbw(vt, zero);
-
-      /* subtract 128 from u and v */
-      imm = _mm_set1_pi16(128);
-      ut = _m_psubw(ut, imm);
-      vt = _m_psubw(vt, imm);
-
-      /* transfer and multiply into r, g, b registers */
-      imm = _mm_set1_pi16(-51);
-      g = _m_pmullw(ut, imm);
-      imm = _mm_set1_pi16(130);
-      b = _m_pmullw(ut, imm);
-      imm = _mm_set1_pi16(146);
-      r = _m_pmullw(vt, imm);
-      imm = _mm_set1_pi16(-74);
-      imm = _m_pmullw(vt, imm);
-      g = _m_paddsw(g, imm);
-
-      /* add 64 to r, g and b registers */
-      imm = _mm_set1_pi16(64);
-      r = _m_paddsw(r, imm);
-      g = _m_paddsw(g, imm);
-      imm = _mm_set1_pi16(32);
-      b = _m_paddsw(b, imm);
-
-      /* shift r, g and b registers to the right */
-      r = _m_psrawi(r, 7);
-      g = _m_psrawi(g, 7);
-      b = _m_psrawi(b, 6);
-
-      /* subtract 16 from r, g and b registers */
-      imm = _mm_set1_pi16(16);
-      r = _m_psubsw(r, imm);
-      g = _m_psubsw(g, imm);
-      b = _m_psubsw(b, imm);
-
-      y = (__m64*)&ptry[j];
-
-      /* duplicate u and v channels and add y
-       * each of r,g, b in the form [s1(16), s2(16), s3(16), s4(16)]
-       * first interleave, so tmp is [s1(16), s1(16), s2(16), s2(16)]
-       * then add y, then interleave again
-       * then pack with saturation, to get the desired output of
-       *   [s1(8), s1(8), s2(8), s2(8), s3(8), s3(8), s4(8), s4(8)]
-       */
-      tmp = _m_punpckhwd(r, r);
-      imm = _m_punpckhbw(*y, zero);
-      //printf("tmp: %llx imm: %llx\n", tmp, imm);
-      tmp = _m_paddsw(tmp, imm);
-      tmp2 = _m_punpcklwd(r, r);
-      imm2 = _m_punpcklbw(*y, zero);
-      tmp2 = _m_paddsw(tmp2, imm2);
-      r = _m_packuswb(tmp2, tmp);
-
-      tmp = _m_punpckhwd(g, g);
-      tmp2 = _m_punpcklwd(g, g);
-      tmp = _m_paddsw(tmp, imm);
-      tmp2 = _m_paddsw(tmp2, imm2);
-      g = _m_packuswb(tmp2, tmp);
-
-      tmp = _m_punpckhwd(b, b);
-      tmp2 = _m_punpcklwd(b, b);
-      tmp = _m_paddsw(tmp, imm);
-      tmp2 = _m_paddsw(tmp2, imm2);
-      b = _m_packuswb(tmp2, tmp);
-      //printf("duplicated r g and b: %llx %llx %llx\n", r, g, b);
-
-      /* now we have 8 8-bit r, g and b samples.  we want these to be packed
-       * into 32-bit values.
-       */
-      //r = _m_from_int(0);
-      //b = _m_from_int(0);
-      imm = _mm_set1_pi32(0xFFFFFFFF);
-      tmp = _m_punpcklbw(r, b);
-      tmp2 = _m_punpcklbw(g, imm);
-      *o++ = _m_punpcklbw(tmp, tmp2);
-      *o++ = _m_punpckhbw(tmp, tmp2);
-      //printf("tmp, tmp2, write1, write2: %llx %llx %llx %llx\n", tmp, tmp2,
-      //                _m_punpcklbw(tmp, tmp2), _m_punpckhbw(tmp, tmp2));
-      tmp = _m_punpckhbw(r, b);
-      tmp2 = _m_punpckhbw(g, imm);
-      *o++ = _m_punpcklbw(tmp, tmp2);
-      *o++ = _m_punpckhbw(tmp, tmp2);
-
-      //exit(1);
-    }
-    if (i & 0x1) {
-      ptru += yuv->uv_width;
-      ptrv += yuv->uv_width;
-    }
-    ptry += yuv->y_width;
-  }
-  _m_empty();
-
+	for(i = 0; i < 256; ++i)
+	{
+		CoefsGU[i] = -CoGU * (i - 128);
+		CoefsGV[i] = -CoGV * (i - 128);
+		CoefsBU[i] = CoBU * (i - 128);
+		CoefsRV[i] = CoRV * (i - 128);
+		CoefsY[i]  = CoY * (i - 16) + (prec/2);
+	}
 }
 
-/* YUV -> BGR Intel MMX implementation */
-void oggplay_yuv2bgr(OggPlayYUVChannels * yuv, OggPlayRGBChannels * rgb) {
+#define VANILLA_YUV2RGB_PIXEL(y, ruv, guv, buv)	\
+r = (CoefsY[y] + ruv) >> prec;	\
+g = (CoefsY[y] + guv) >> prec;	\
+b = (CoefsY[y] + buv) >> prec;	\
 
-  int               i;
-  unsigned char   * restrict ptry;
-  unsigned char   * restrict ptru;
-  unsigned char   * restrict ptrv;
-  unsigned char   * ptro;
+#define VANILLA_RGBA_OUT(out, r, g, b) \
+out[0] = CLAMP(r); \
+out[1] = CLAMP(g); \
+out[2] = CLAMP(b); \
+out[3] = 255;
 
-  register __m64    *y, *o;
-  register __m64    zero, ut, vt, imm, imm2;
-  register __m64    r, g, b;
-  register __m64    tmp, tmp2;
+#define VANILLA_BGRA_OUT(out, r, g, b) \
+out[0] = CLAMP(b); \
+out[1] = CLAMP(g); \
+out[2] = CLAMP(r); \
+out[3] = 255;
 
-  zero = _mm_setzero_si64();
+#define VANILLA_ARGB_OUT(out, r, g, b) \
+out[0] = 255;	   \
+out[1] = CLAMP(r); \
+out[2] = CLAMP(g); \
+out[3] = CLAMP(b);
 
-  ptry = yuv->ptry;
-  ptru = yuv->ptru;
-  ptrv = yuv->ptrv;
-  ptro = rgb->ptro;
+/* yuv420p -> */
+#define LOOKUP_COEFFS int ruv = CoefsRV[*pv]; 			\
+		      int guv = CoefsGU[*pu] + CoefsGV[*pv]; 	\
+		      int buv = CoefsBU[*pu]; 			\
+                      int r, g, b;
 
-  for (i = 0; i < yuv->y_height; i++) {
-    int j;
-    o = (__m64*)ptro;
-    ptro += rgb->rgb_width * 4;
-    for (j = 0; j < yuv->y_width; j += 8) {
+#define CONVERT(OUTPUT_FUNC) LOOKUP_COEFFS				 \
+			     VANILLA_YUV2RGB_PIXEL(py[0], ruv, guv, buv);\
+			     OUTPUT_FUNC(dst, r, g, b);			 \
+			     VANILLA_YUV2RGB_PIXEL(py[1], ruv, guv, buv);\
+			     OUTPUT_FUNC((dst+4), r, g, b);
 
-      y = (__m64*)&ptry[j];
+#define CLEANUP
 
-      ut = _m_from_int(*(int *)(ptru + j/2));
-      vt = _m_from_int(*(int *)(ptrv + j/2));
+YUV_CONVERT(yuv420_to_rgba_vanilla, CONVERT(VANILLA_RGBA_OUT), 2, 8, 2, 1)
+YUV_CONVERT(yuv420_to_bgra_vanilla, CONVERT(VANILLA_BGRA_OUT), 2, 8, 2, 1)
+YUV_CONVERT(yuv420_to_argb_vanilla, CONVERT(VANILLA_ARGB_OUT), 2, 8, 2, 1)
 
-      //ut = _m_from_int(0);
-      //vt = _m_from_int(0);
+#undef CONVERT
+#undef CLEANUP
 
-      ut = _m_punpcklbw(ut, zero);
-      vt = _m_punpcklbw(vt, zero);
-
-      /* subtract 128 from u and v */
-      imm = _mm_set1_pi16(128);
-      ut = _m_psubw(ut, imm);
-      vt = _m_psubw(vt, imm);
-
-      /* transfer and multiply into r, g, b registers */
-      imm = _mm_set1_pi16(-51);
-      g = _m_pmullw(ut, imm);
-      imm = _mm_set1_pi16(130);
-      b = _m_pmullw(ut, imm);
-      imm = _mm_set1_pi16(146);
-      r = _m_pmullw(vt, imm);
-      imm = _mm_set1_pi16(-74);
-      imm = _m_pmullw(vt, imm);
-      g = _m_paddsw(g, imm);
-
-      /* add 64 to r, g and b registers */
-      imm = _mm_set1_pi16(64);
-      r = _m_paddsw(r, imm);
-      g = _m_paddsw(g, imm);
-      imm = _mm_set1_pi16(32);
-      b = _m_paddsw(b, imm);
-
-      /* shift r, g and b registers to the right */
-      r = _m_psrawi(r, 7);
-      g = _m_psrawi(g, 7);
-      b = _m_psrawi(b, 6);
-
-      /* subtract 16 from r, g and b registers */
-      imm = _mm_set1_pi16(16);
-      r = _m_psubsw(r, imm);
-      g = _m_psubsw(g, imm);
-      b = _m_psubsw(b, imm);
-
-      y = (__m64*)&ptry[j];
-
-      /* duplicate u and v channels and add y
-       * each of r,g, b in the form [s1(16), s2(16), s3(16), s4(16)]
-       * first interleave, so tmp is [s1(16), s1(16), s2(16), s2(16)]
-       * then add y, then interleave again
-       * then pack with saturation, to get the desired output of
-       *   [s1(8), s1(8), s2(8), s2(8), s3(8), s3(8), s4(8), s4(8)]
-       */
-      tmp = _m_punpckhwd(r, r);
-      imm = _m_punpckhbw(*y, zero);
-      //printf("tmp: %llx imm: %llx\n", tmp, imm);
-      tmp = _m_paddsw(tmp, imm);
-      tmp2 = _m_punpcklwd(r, r);
-      imm2 = _m_punpcklbw(*y, zero);
-      tmp2 = _m_paddsw(tmp2, imm2);
-      r = _m_packuswb(tmp2, tmp);
-
-      tmp = _m_punpckhwd(g, g);
-      tmp2 = _m_punpcklwd(g, g);
-      tmp = _m_paddsw(tmp, imm);
-      tmp2 = _m_paddsw(tmp2, imm2);
-      g = _m_packuswb(tmp2, tmp);
-
-      tmp = _m_punpckhwd(b, b);
-      tmp2 = _m_punpcklwd(b, b);
-      tmp = _m_paddsw(tmp, imm);
-      tmp2 = _m_paddsw(tmp2, imm2);
-      b = _m_packuswb(tmp2, tmp);
-      //printf("duplicated r g and b: %llx %llx %llx\n", r, g, b);
-
-      /* now we have 8 8-bit r, g and b samples.  we want these to be packed
-       * into 32-bit values.
-       */
-      //r = _m_from_int(0);
-      //b = _m_from_int(0);
-      imm = _mm_set1_pi32(0xFFFFFFFF);
-      tmp = _m_punpcklbw(b, r);
-      tmp2 = _m_punpcklbw(g, imm);
-      *o++ = _m_punpcklbw(tmp, tmp2);
-      *o++ = _m_punpckhbw(tmp, tmp2);
-      //printf("tmp, tmp2, write1, write2: %llx %llx %llx %llx\n", tmp, tmp2,
-      //                _m_punpcklbw(tmp, tmp2), _m_punpckhbw(tmp, tmp2));
-      tmp = _m_punpckhbw(b, r);
-      tmp2 = _m_punpckhbw(g, imm);
-      *o++ = _m_punpcklbw(tmp, tmp2);
-      *o++ = _m_punpckhbw(tmp, tmp2);
-
-      //exit(1);
-    }
-    if (i & 0x1) {
-      ptru += yuv->uv_width;
-      ptrv += yuv->uv_width;
-    }
-    ptry += yuv->y_width;
-  }
-  _m_empty();
-
-}
-
-#elif defined(__xxAPPLExx__)
-/*
- * TODO: implement the SIMD method above using Apple's AltiVec code;
- * for now, we'll use the vanilla implementation for Macs.
+/**
+ * Initialize the function pointers in yuv_conv.
  *
- * Also, there's probably a better preprocessor macro for detecting
- * the presence of AltiVec than __APPLE__.
+ * Initialize the function pointers in yuv_conv, based on the
+ * the available CPU extensions.
  */
+static void
+init_yuv_converters(void)
+{
+	ogg_uint32_t features = 0;
 
-/* Macintosh AltiVec implementation */
-void oggplay_yuv2rgb(OggPlayYUVChannels * yuv, OggPlayRGBChannels * rgb) {
+	if ( yuv_conv.yuv2rgba == NULL )
+	{
+		features = oc_cpu_flags_get();
+#if defined(i386) || defined(__x86__) || defined(__x86_64__) || defined(_M_IX86)
+		/* test only the flag a converter really needs: OR-ing
+		 * MMX and SSE into the SSE2 test would select the *_sse2
+		 * code on every MMX capable cpu, even one without SSE2 */
+		if (features & OC_CPU_X86_SSE2)
+		{
+			yuv_conv.yuv2rgba = yuv420_to_rgba_sse2;
+			yuv_conv.yuv2bgra = yuv420_to_bgra_sse2;
+			yuv_conv.yuv2argb = yuv420_to_argb_sse2;
+			return;
+		}
+		else if (features & OC_CPU_X86_MMX)
+		{
+			/* also taken for MMX+SSE cpus: same *_mmx converters */
+			yuv_conv.yuv2rgba = yuv420_to_rgba_mmx;
+			yuv_conv.yuv2bgra = yuv420_to_bgra_mmx;
+			yuv_conv.yuv2argb = yuv420_to_argb_mmx;
+			return;
+		}
+#elif defined(__ppc__) || defined(__ppc64__)
+		if (features & OC_CPU_PPC_ALTIVEC)
+		{
+			/* oggplay_yuv2rgb_altivec.c is not included yet, so
+			 * the vanilla converters are used; their lookup
+			 * tables have to be filled before the first call */
+			init_tables();
+			yuv_conv.yuv2rgba = yuv420_to_rgba_vanilla;
+			yuv_conv.yuv2bgra = yuv420_to_bgra_vanilla;
+			yuv_conv.yuv2argb = yuv420_to_argb_vanilla;
+			return;
+		}
+#endif
+		/* no CPU extension was found... using vanilla converter */
+		init_tables();
+		yuv_conv.yuv2rgba = yuv420_to_rgba_vanilla;
+		yuv_conv.yuv2bgra = yuv420_to_bgra_vanilla;
+		yuv_conv.yuv2argb = yuv420_to_argb_vanilla;
+	}
 }
 
-#else
 
-#define CLAMP(v)    ((v) > 255 ? 255 : (v) < 0 ? 0 : (v))
+void
+oggplay_yuv2rgba(const OggPlayYUVChannels* yuv, OggPlayRGBChannels* rgb)
+{
+	if (!yuv_conv.yuv2rgba)	/* converters are picked on first use */
+		init_yuv_converters();
 
-/* Vanilla implementation if YUV->RGB conversion */
-void oggplay_yuv2rgb(OggPlayYUVChannels * yuv, OggPlayRGBChannels * rgb) {
+	(*yuv_conv.yuv2rgba)(yuv, rgb);	/* hand the frame to the selected converter */
+}
 
-  unsigned char * ptry = yuv->ptry;
-  unsigned char * ptru = yuv->ptru;
-  unsigned char * ptrv = yuv->ptrv;
-  unsigned char * ptro = rgb->ptro;
-  unsigned char * ptro2;
-  int i, j;
+void
+oggplay_yuv2bgra(const OggPlayYUVChannels* yuv, OggPlayRGBChannels * rgb)
+{
+	if (!yuv_conv.yuv2bgra)	/* converters are picked on first use */
+		init_yuv_converters();
 
-  for (i = 0; i < yuv->y_height; i++) {
-    ptro2 = ptro;
-    for (j = 0; j < yuv->y_width; j += 2) {
-
-      short pr, pg, pb, y;
-      short r, g, b;
-
-      pr = (-56992 + ptrv[j/2] * 409) >> 8;
-      pg = (34784 - ptru[j/2] * 100 - ptrv[j/2] * 208) >> 8;
-      pb = (-70688 + ptru[j/2] * 516) >> 8;
-
-      y = 298*ptry[j] >> 8;
-      r = y + pr;
-      g = y + pg;
-      b = y + pb;
-
-      *ptro2++ = CLAMP(r);
-      *ptro2++ = CLAMP(g);
-      *ptro2++ = CLAMP(b);
-      *ptro2++ = 255;
-
-      y = 298*ptry[j + 1] >> 8;
-      r = y + pr;
-      g = y + pg;
-      b = y + pb;
-
-      *ptro2++ = CLAMP(r);
-      *ptro2++ = CLAMP(g);
-      *ptro2++ = CLAMP(b);
-      *ptro2++ = 255;
-    }
-    ptry += yuv->y_width;
-    if (i & 1) {
-      ptru += yuv->uv_width;
-      ptrv += yuv->uv_width;
-    }
-    ptro += rgb->rgb_width * 4;
-  }
+	(*yuv_conv.yuv2bgra)(yuv, rgb);	/* hand the frame to the selected converter */
 }
 
-/* Vanilla implementation of YUV->BGR conversion*/
-void oggplay_yuv2bgr(OggPlayYUVChannels * yuv, OggPlayRGBChannels * rgb) {
+void
+oggplay_yuv2argb(const OggPlayYUVChannels* yuv, OggPlayRGBChannels * rgb)
+{
+	if (!yuv_conv.yuv2argb)	/* converters are picked on first use */
+		init_yuv_converters();
 
-  unsigned char * ptry = yuv->ptry;
-  unsigned char * ptru = yuv->ptru;
-  unsigned char * ptrv = yuv->ptrv;
-  unsigned char * ptro = rgb->ptro;
-  unsigned char * ptro2;
-  int i, j;
-
-  for (i = 0; i < yuv->y_height; i++) {
-    ptro2 = ptro;
-    for (j = 0; j < yuv->y_width; j += 2) {
-
-      short pr, pg, pb, y;
-      short r, g, b;
-
-      pr = (-56992 + ptrv[j/2] * 409) >> 8;
-      pg = (34784 - ptru[j/2] * 100 - ptrv[j/2] * 208) >> 8;
-      pb = (-70688 + ptru[j/2] * 516) >> 8;
-
-      y = 298*ptry[j] >> 8;
-      r = y + pr;
-      g = y + pg;
-      b = y + pb;
-
-      *ptro2++ = CLAMP(b);
-      *ptro2++ = CLAMP(g);
-      *ptro2++ = CLAMP(r);
-      *ptro2++ = 255;
-
-      y = 298*ptry[j + 1] >> 8;
-      r = y + pr;
-      g = y + pg;
-      b = y + pb;
-
-      *ptro2++ = CLAMP(b);
-      *ptro2++ = CLAMP(g);
-      *ptro2++ = CLAMP(r);
-      *ptro2++ = 255;
-    }
-    ptry += yuv->y_width;
-    if (i & 1) {
-      ptru += yuv->uv_width;
-      ptrv += yuv->uv_width;
-    }
-    ptro += rgb->rgb_width * 4;
-  }
+	(*yuv_conv.yuv2argb)(yuv, rgb);	/* hand the frame to the selected converter */
 }
 
-#endif

Added: liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_altivec.c
===================================================================
--- liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_altivec.c	                        (rev 0)
+++ liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_altivec.c	2009-02-20 09:19:45 UTC (rev 3850)
@@ -0,0 +1,235 @@
+#include <altivec.h>
+
+/* coefficients for AltiVec YUV->RGB conversion */
+static vector signed short   CY;
+static vector signed short   CRV;
+static vector signed short   CBU;
+static vector signed short   CGU;
+static vector signed short   CGV;
+static vector unsigned short CSHIFT;
+
+
+/**
+ * Initialize the static coef vectors
+ */
+static void
+init_altivec(void)	/* (void) makes this a real prototype */
+{	/* {x} fills lane 0 only; vec_splat(.., 0) then copies it into every lane */
+	CY = vec_splat ((vector signed short){0x253f}, 0);	/* Y gain */
+	CRV = vec_splat ((vector signed short){0x3312}, 0);	/* V -> red */
+	CBU = vec_splat ((vector signed short){0x4093}, 0);	/* U -> blue */
+	CGU = vec_splat ((vector signed short){0xf37d}, 0);	/* U -> green (negative as signed short) */
+	CGV = vec_splat ((vector signed short){0xe5fc}, 0);	/* V -> green (negative as signed short) */
+	CSHIFT = vec_splat ((vector unsigned short){0x2}, 0);	/* left shift applied to Y, U, V before vec_mradds */
+}
+
+
+#define vec_unh(x) /* zero-extend bytes 0-7 of x into eight 16-bit lanes */ \
+    ((vector signed short) \
+        vec_perm((x),(__typeof__(x)){0}, \
+                 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
+                                         0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07})))
+#define vec_unl(x) /* zero-extend bytes 8-15 of x into eight 16-bit lanes */ \
+    ((vector signed short) \
+        vec_perm((x),(__typeof__(x)){0}, \
+                 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
+                                         0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F})))
+
+#define vec_packclp(x,y) /* raise negative lanes to 0, then pack x and y into 16 bytes with vec_packs */ \
+    ((vector unsigned char)vec_packs \
+        ((vector unsigned short)vec_max ((x),((vector signed short) {0})), \
+         (vector unsigned short)vec_max ((y),((vector signed short) {0}))))
+
+#define vec_mstrgb32(T,x0,x1,x2,x3,ptr) /* interleave x0..x3 bytewise and store the 4 result vectors at ptr */ \
+do {                                                                          \
+    T _0,_1,_2,_3;                                                            \
+    _0 = vec_mergeh (x0,x1);                                                  \
+    _1 = vec_mergeh (x2,x3);                                                  \
+    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
+    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
+    vec_st (_2, 0*16, (T *)(ptr));                                            \
+    vec_st (_3, 1*16, (T *)(ptr));                                            \
+    _0 = vec_mergel (x0,x1);                                                  \
+    _1 = vec_mergel (x2,x3);                                                  \
+    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
+    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
+    vec_st (_2, 2*16, (T *)(ptr));                                            \
+    vec_st (_3, 3*16, (T *)(ptr));                                            \
+    (ptr) += 4;                                                               \
+}  while (0) /* no trailing ';' here: the statement using the macro supplies it */
+
+/**
+ * macros for various output; {0xFF} fills lane 0 only, so alpha is splatted to give every pixel 0xFF
+ */
+#define OUTPUT_BGRA(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,vec_splat(((__typeof__ (a)){0xFF}),0),ptr)
+#define OUTPUT_RGBA(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,vec_splat(((__typeof__ (a)){0xFF}),0),ptr)
+#define OUTPUT_ARGB(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),vec_splat(((__typeof__ (a)){0xFF}),0),a,b,c,ptr)
+
+/** Function template for YUV420 -> 32-bit RGB conversion with AltiVec.
+ * Each inner iteration converts 16 pixels on two rows; columns beyond a
+ * multiple of 16 and a trailing odd row are not converted. */
+#define YUV420_CONVERT_ALTIVEC(FUNC, OUTPUT_FUNC) \
+static void \
+(FUNC)(const OggPlayYUVChannels* yuv, OggPlayRGBChannels* rgb)	\
+{										\
+	int		i,j, w, h; 						\
+	unsigned char 	*py1, *py2, *pu, *pv, *po0, *po1, *dst0, *dst1; 	\
+	unsigned char 	*ptry0, *ptry1, *ptru, *ptrv; 				\
+  	vector unsigned char y0,y1; 						\
+										\
+  	vector signed char  u,v;						\
+										\
+  	vector signed short Y0,Y1,Y2,Y3;					\
+	vector signed short U,V;						\
+	vector signed short vx,ux,uvx;						\
+	vector signed short vx0,ux0,uvx0;					\
+	vector signed short vx1,ux1,uvx1;					\
+	vector signed short R0,G0,B0;						\
+	vector signed short R1,G1,B1;						\
+	vector unsigned char R,G,B;						\
+										\
+	vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP, *out0, *out1;	\
+	vector unsigned char align_perm;					\
+										\
+	ptry0 = yuv->ptry;							\
+	ptry1 = yuv->ptry+yuv->y_width;						\
+	ptru = yuv->ptru;							\
+	ptrv = yuv->ptrv;							\
+										\
+	/* po0 even, po1 odd lines */						\
+	po0 = rgb->ptro;							\
+	po1 = rgb->ptro+(rgb->rgb_width*4);					\
+										\
+	w = yuv->y_width/16;							\
+	h = yuv->y_height/2;							\
+	for (i = 0; i < h; ++i)							\
+	{									\
+		dst0 = po0;							\
+		dst1 = po1;							\
+		pu   = ptru;							\
+		pv   = ptrv;							\
+		py1  = ptry0;							\
+		py2  = ptry1;							\
+		for (j = 0; j < w; ++j, 					\
+				dst0 += 64, dst1 +=64, 				\
+				py1 += 16, py2 += 16,				\
+				pu += 8, pv += 8)				\
+		{								\
+			out0  = (vector unsigned char *) dst0;			\
+			out1  = (vector unsigned char *) dst1;			\
+			y1ivP = (vector unsigned char *) py1;			\
+			y2ivP = (vector unsigned char *) py2;			\
+			uivP  = (vector unsigned char *) pu;			\
+			vivP  = (vector unsigned char *) pv;			\
+										\
+			align_perm = vec_lvsl (0, py1);				\
+			y0 = (vector unsigned char)				\
+				vec_perm (y1ivP[0], y1ivP[1], align_perm);	\
+										\
+			align_perm = vec_lvsl (0, py2);				\
+			y1 = (vector unsigned char)				\
+				vec_perm (y2ivP[0], y2ivP[1], align_perm);	\
+										\
+			align_perm = vec_lvsl (0, pu);				\
+			u = (vector signed char)				\
+				vec_perm (uivP[0], uivP[1], align_perm);	\
+										\
+			align_perm = vec_lvsl (0, pv);				\
+			v = (vector signed char)				\
+				vec_perm (vivP[0], vivP[1], align_perm);	\
+										\
+			/* U -= 128, V -=128 */					\
+			u  = (vector signed char)				\
+				vec_sub (u,(vector signed char)			\
+			     		vec_splat((vector signed char){128},0));\
+			v  = (vector signed char)				\
+				vec_sub (v,(vector signed char)			\
+			     		vec_splat((vector signed char){128},0));\
+										\
+			U  = vec_unpackh (u);					\
+			V  = vec_unpackh (v);					\
+			U = vec_sl (U, CSHIFT);					\
+			V = vec_sl (V, CSHIFT);					\
+										\
+			/* Y -= 16 */						\
+			y0 = (vector unsigned char)				\
+				vec_sub (y0, (vector unsigned char) 		\
+					vec_splat((vector unsigned char){16},0));\
+			y1 = (vector unsigned char)				\
+				vec_sub (y1, (vector unsigned char) 		\
+					vec_splat((vector unsigned char){16},0));\
+										\
+			Y0 = vec_unh (y0);					\
+			Y1 = vec_unl (y0);					\
+			Y2 = vec_unh (y1);					\
+			Y3 = vec_unl (y1);					\
+										\
+			Y0 = vec_sl (Y0, CSHIFT);				\
+			Y1 = vec_sl (Y1, CSHIFT);				\
+			Y2 = vec_sl (Y2, CSHIFT);				\
+			Y3 = vec_sl (Y3, CSHIFT);				\
+										\
+			/* Y *= CY */						\
+			Y0 = vec_mradds (Y0, CY, (vector signed short){0});	\
+			Y1 = vec_mradds (Y1, CY, (vector signed short){0});	\
+			Y2 = vec_mradds (Y2, CY, (vector signed short){0});     \
+			Y3 = vec_mradds (Y3, CY, (vector signed short){0});     \
+										\
+			/*   ux  = CBU*U >> 8 */				\
+			ux = vec_mradds (U, CBU, (vector signed short){0});	\
+			ux0  = vec_mergeh (ux,ux);				\
+			ux1  = vec_mergel (ux,ux);				\
+										\
+			/* vx  = CRV*V >> 8*/ 					\
+			vx = vec_mradds (V, CRV, (vector signed short){0});	\
+			vx0  = vec_mergeh (vx,vx);				\
+			vx1  = vec_mergel (vx,vx);				\
+										\
+			/* uvx = ((CGU*u) + (CGV*V)) >> 8 */			\
+			uvx = vec_mradds (U, CGU, (vector signed short){0});	\
+			uvx = vec_mradds (V, CGV, uvx);				\
+			uvx0 = vec_mergeh (uvx,uvx);				\
+			uvx1 = vec_mergel (uvx,uvx);				\
+										\
+			R0 = vec_add (Y0,vx0);					\
+			G0 = vec_add (Y0,uvx0);					\
+			B0 = vec_add (Y0,ux0);					\
+			R1 = vec_add (Y1,vx1);					\
+			G1 = vec_add (Y1,uvx1);					\
+			B1 = vec_add (Y1,ux1);					\
+										\
+			R  = vec_packclp (R0,R1);				\
+			G  = vec_packclp (G0,G1);				\
+			B  = vec_packclp (B0,B1);				\
+										\
+			OUTPUT_FUNC(R,G,B,out0);				\
+										\
+			R0 = vec_add (Y2,vx0);                                  \
+			G0 = vec_add (Y2,uvx0);                                 \
+			B0 = vec_add (Y2,ux0);                                  \
+			R1 = vec_add (Y3,vx1);                                  \
+			G1 = vec_add (Y3,uvx1);                                 \
+			B1 = vec_add (Y3,ux1);                                  \
+			R  = vec_packclp (R0,R1);                               \
+			G  = vec_packclp (G0,G1);                               \
+			B  = vec_packclp (B0,B1);                        	\
+										\
+			OUTPUT_FUNC(R,G,B,out1);				\
+		}								\
+		po0 += (rgb->rgb_width*8);					\
+		po1 += (rgb->rgb_width*8);					\
+										\
+		ptry0 += yuv->y_width * 2;					\
+		ptry1 += yuv->y_width * 2;					\
+		ptru  += yuv->uv_width;						\
+		ptrv  += yuv->uv_width;						\
+	}									\
+}
+
+/**
+ * yuv420 -> rgba, bgra and argb converters generated from the AltiVec template
+ */
+YUV420_CONVERT_ALTIVEC(yuv420_to_rgba_altivec, OUTPUT_RGBA)
+YUV420_CONVERT_ALTIVEC(yuv420_to_bgra_altivec, OUTPUT_BGRA)
+YUV420_CONVERT_ALTIVEC(yuv420_to_argb_altivec, OUTPUT_ARGB)
+

Added: liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_template.h
===================================================================
--- liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_template.h	                        (rev 0)
+++ liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_template.h	2009-02-20 09:19:45 UTC (rev 3850)
@@ -0,0 +1,68 @@
+#ifndef __OGGPLAY_YUV2RGB_TEMPLATE_H__
+#define __OGGPLAY_YUV2RGB_TEMPLATE_H__
+
+#if defined(WIN32)
+#define restrict	/* WIN32 builds: the qualifier is compiled away */
+#else
+#ifndef restrict
+#define restrict __restrict__	/* map to the compiler-extension spelling unless already defined */
+#endif
+#endif
+
+/**
+ * Template for YUV to RGB conversion; the U/V row pointers advance after every odd Y row.
+ * The including file must also define CLEANUP, which is expanded once after the row loop.
+ * @param FUNC function name
+ * @param CONVERT a macro that defines the conversion of one pixel group; it is expanded as the inner loop body
+ * @param NUM_PIXELS number of pixels processed in one iteration
+ * @param OUT_SHIFT number of bytes to advance after one iteration in rgb data stream
+ * @param Y_SHIFT number of bytes to advance after one iteration in Y data stream
+ * @param UV_SHIFT number of bytes to advance after one iteration in the U and V data streams
+ */
+#define YUV_CONVERT(FUNC, CONVERT, NUM_PIXELS, OUT_SHIFT, Y_SHIFT, UV_SHIFT)\
+static void                                                     \
+(FUNC)(const OggPlayYUVChannels* yuv, OggPlayRGBChannels* rgb)  \
+{                                                               \
+	int             i,j, w, h;                              \
+	unsigned char*  restrict ptry;                          \
+	unsigned char*  restrict ptru;                          \
+	unsigned char*  restrict ptrv;                          \
+	unsigned char*  restrict ptro;                          \
+	unsigned char   *dst, *py, *pu, *pv;                    \
+								\
+	ptro = rgb->ptro;                                       \
+	ptry = yuv->ptry;                                       \
+	ptru = yuv->ptru;                                       \
+	ptrv = yuv->ptrv;                                       \
+								\
+	w = yuv->y_width/NUM_PIXELS;                            \
+	h = yuv->y_height;                                      \
+	for (i = 0; i < h; ++i)                                 \
+	{                                                       \
+		py  = ptry;                                     \
+		pu  = ptru;                                     \
+		pv  = ptrv;                                     \
+		dst = ptro;                                     \
+		for (j = 0; j < w; ++j,                         \
+				dst += OUT_SHIFT,               \
+				py += Y_SHIFT,                  \
+				pu += UV_SHIFT,                 \
+				pv += UV_SHIFT)                 \
+		{                                               \
+			/* use the given conversion function */ \
+			CONVERT                                 \
+		}                                               \
+		ptro += rgb->rgb_width * 4;                     \
+		ptry += yuv->y_width;                           \
+								\
+		if (i & 0x1)                                    \
+		{                                               \
+			ptru += yuv->uv_width;                  \
+			ptrv += yuv->uv_width;                  \
+		}                                               \
+	}                                                       \
+	CLEANUP                                                 \
+}  
+
+#endif
+

Added: liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_x86.c
===================================================================
--- liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_x86.c	                        (rev 0)
+++ liboggplay/trunk/src/liboggplay/oggplay_yuv2rgb_x86.c	2009-02-20 09:19:45 UTC (rev 3850)
@@ -0,0 +1,143 @@
+/*
+   Copyright (C) 2003 Commonwealth Scientific and Industrial Research
+   Organisation (CSIRO) Australia
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   - Neither the name of CSIRO Australia nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE ORGANISATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/**
+ * YUV to RGB conversion using x86 CPU extensions
+ */
+
+#if defined(_MSC_VER)
+#include "yuv2rgb_x86_vs.h" 
+#elif defined(__GNUC__)
+#include "yuv2rgb_x86.h" 
+#endif
+
+#ifdef ATTRIBUTE_ALIGNED_MAX /* request the given alignment, capped at ATTRIBUTE_ALIGNED_MAX */
+#define ATTR_ALIGN(align) __attribute__ ((__aligned__ ((ATTRIBUTE_ALIGNED_MAX < align) ? ATTRIBUTE_ALIGNED_MAX : align)))
+#else
+#define ATTR_ALIGN(align) /* expands to nothing: no alignment is requested */
+#endif
+
+typedef union /* the same storage viewed as different lane types */
+{
+	long long 		q[2];
+	unsigned long long	uq[2]; 	
+	int                     d[4]; 
+	unsigned int            ud[4];
+	short                   w[8];
+	unsigned short          uw[8];
+	char                    b[16];
+	unsigned char           ub[16];
+	float                   s[4];
+#if defined(__GNUC__)
+	long long __attribute__ ((__vector_size__ (16), __may_alias__)) int128;
+#endif
+} ATTR_ALIGN(16) simd_t;
+
+#define UV_128 0x0080008000800080LL /* 0x0080 in every 16-bit word */
+#define Y_16 0x1010101010101010LL /* 0x10 in every byte */
+#define Y_Co 0x253f253f253f253fLL /* Y coefficient in every 16-bit word */
+#define GU_Co 0xf37df37df37df37dLL /* U -> green coefficient */
+#define GV_Co 0xe5fce5fce5fce5fcLL /* V -> green coefficient */
+#define BU_Co 0x4093409340934093LL /* U -> blue coefficient */
+#define RV_Co 0x3312331233123312LL /* V -> red coefficient */
+#define Y_MASK 0x00ff00ff00ff00ffLL /* keeps the low byte of every 16-bit word */
+#define ALFA 0xffffffffffffffffLL /* all bits set: alpha bytes of 0xff */
+
+/**
+ * coefficients and constants for yuv to rgb SIMD conversion (each 64-bit pattern is stored twice)
+ */
+static const simd_t simd_80w		= {{UV_128, UV_128}};
+static const simd_t simd_U_green	= {{GU_Co, GU_Co}};
+static const simd_t simd_U_blue 	= {{BU_Co, BU_Co}};
+static const simd_t simd_V_red 		= {{RV_Co, RV_Co}};
+static const simd_t simd_V_green	= {{GV_Co, GV_Co}};
+static const simd_t simd_Y_coeff	= {{Y_Co, Y_Co}};
+static const simd_t simd_10w 		= {{Y_16, Y_16}};
+static const simd_t simd_00ffw 		= {{Y_MASK, Y_MASK}};
+static const simd_t simd_alpha 		= {{ALFA, ALFA}};
+
+/**
+ *  the conversion functions using MMX instructions (mm registers, 8 pixels per iteration)
+ */
+
+/* template for the MMX conversion functions: 8 pixels, 32 output bytes, 8 Y bytes and 4 U/V bytes per iteration */
+#define YUV_CONVERT_MMX(FUNC, CONVERT) YUV_CONVERT(FUNC, CONVERT, 8, 32, 8, 4)
+#define CLEANUP emms()
+#define OUT_RGBA_32 OUTPUT_RGBA_32(movq, mm, 8, 16, 24)
+#define OUT_ARGB_32 OUTPUT_ARGB_32(movq, mm, 8, 16, 24)
+#define OUT_BGRA_32 OUTPUT_BGRA_32(movq, mm, 8, 16, 24)
+#define MOVNTQ MMX_MOVNTQ
+
+/* yuv420 -> rgba, bgra, argb: load the planes, convert, then store in the requested byte order */
+#define CONVERT(OUTPUT_FUNC) LOAD_YUV_PLANAR_2(movq, mm) \
+			     YUV_2_RGB(movq, mm) 	\
+			     OUTPUT_FUNC
+
+YUV_CONVERT_MMX(yuv420_to_rgba_mmx, CONVERT(OUT_RGBA_32))
+YUV_CONVERT_MMX(yuv420_to_bgra_mmx, CONVERT(OUT_BGRA_32)) 
+YUV_CONVERT_MMX(yuv420_to_argb_mmx, CONVERT(OUT_ARGB_32)) 
+#undef CONVERT
+
+#undef CLEANUP
+#undef OUT_RGBA_32
+#undef OUT_ARGB_32
+#undef OUT_BGRA_32
+#undef MOVNTQ
+
+/**
+ *  the conversion functions using SSE2 instructions (xmm registers, 16 pixels per iteration)
+ */
+
+/* template for the SSE2 conversion functions: 16 pixels, 64 output bytes, 16 Y bytes and 8 U/V bytes per iteration */
+#define YUV_CONVERT_SSE2(FUNC, CONVERT) YUV_CONVERT(FUNC, CONVERT, 16, 64, 16, 8)
+#define OUT_RGBA_32 OUTPUT_RGBA_32(movdqa, xmm, 16, 32, 48)
+#define OUT_ARGB_32 OUTPUT_ARGB_32(movdqa, xmm, 16, 32, 48)
+#define OUT_BGRA_32 OUTPUT_BGRA_32(movdqa, xmm, 16, 32, 48)
+#define MOVNTQ SSE2_MOVNTQ
+#define CLEANUP
+
+/* yuv420 -> rgba, bgra, argb: planes are loaded with movdqu, the other moves are passed movdqa */
+#define CONVERT(OUTPUT_FUNC) LOAD_YUV_PLANAR_2(movdqu, xmm) \
+       			     YUV_2_RGB(movdqa, xmm)	\
+			     OUTPUT_FUNC
+
+YUV_CONVERT_SSE2(yuv420_to_rgba_sse2, CONVERT(OUT_RGBA_32))
+YUV_CONVERT_SSE2(yuv420_to_bgra_sse2, CONVERT(OUT_BGRA_32))
+YUV_CONVERT_SSE2(yuv420_to_argb_sse2, CONVERT(OUT_ARGB_32)) 
+#undef CONVERT
+
+#undef OUT_RGBA_32
+#undef OUT_ARGB_32
+#undef OUT_BGRA_32
+#undef MOVNTQ
+#undef CLEANUP 
+

Added: liboggplay/trunk/src/liboggplay/yuv2rgb_x86.h
===================================================================
--- liboggplay/trunk/src/liboggplay/yuv2rgb_x86.h	                        (rev 0)
+++ liboggplay/trunk/src/liboggplay/yuv2rgb_x86.h	2009-02-20 09:19:45 UTC (rev 3850)
@@ -0,0 +1,135 @@
+#ifndef __YUV2RGB_X86_H__
+#define __YUV2RGB_X86_H__
+
+#define emms() __asm__ __volatile ( "emms;" ); /* the expansion already ends with ';' */
+#define MMX_MOVNTQ "movntq" /* store mnemonic pasted into the MMX output asm */
+#define SSE2_MOVNTQ "movdqu" /* store mnemonic pasted into the SSE2 output asm */
+
+#define YUV_2_RGB(mov_instr, reg_type) /* in: reg6=Y, reg0=U, reg1=V, reg4=0; out: reg0=B, reg1=R, reg2=G; no clobbers are declared */ \
+	__asm__ __volatile__ (		\
+			"punpcklbw %%"#reg_type"4, %%"#reg_type"0;" 	/* mm0 = u3 u2 u1 u0 */\
+			"punpcklbw %%"#reg_type"4, %%"#reg_type"1;"	/* mm1 = v3 v2 v1 v0 */\
+			"psubsw simd_80w, %%"#reg_type"0;"		/* u -= 128 */\
+			"psubsw simd_80w, %%"#reg_type"1;"		/* v -= 128 */\
+			"psllw $3, %%"#reg_type"0;"			/* promote precision */\
+			"psllw $3, %%"#reg_type"1;"			/* promote precision */\
+			#mov_instr " %%"#reg_type"0, %%"#reg_type"2;"	/* mm2 = u3 u2 u1 u0 */\
+			#mov_instr " %%"#reg_type"1, %%"#reg_type"3;"	/* mm3 = v3 v2 v1 v0 */\
+			"pmulhw simd_U_green, %%"#reg_type"2;"		/* mm2 = u * u_green */\
+			"pmulhw simd_V_green, %%"#reg_type"3;"		/* mm3 = v * v_green */\
+			"pmulhw simd_U_blue, %%"#reg_type"0;"		/* mm0 = chroma_b */\
+			"pmulhw simd_V_red, %%"#reg_type"1;"		/* mm1 = chroma_r */\
+			"paddsw %%"#reg_type"3, %%"#reg_type"2;"	/* mm2 = chroma_g */\
+			"psubusb simd_10w, %%"#reg_type"6;"		/* Y -= 16  */\
+			#mov_instr " %%"#reg_type"6, %%"#reg_type"7;"	/* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\
+			"pand simd_00ffw, %%"#reg_type"6;"		/* mm6 =    Y6    Y4    Y2    Y0 */\
+			"psrlw $8, %%"#reg_type"7;"			/* mm7 =    Y7    Y5    Y3    Y1 */\
+			"psllw $3, %%"#reg_type"6;"			/* promote precision */\
+			"psllw $3, %%"#reg_type"7;"			/* promote precision */\
+			"pmulhw simd_Y_coeff, %%"#reg_type"6;"		/* mm6 = luma_rgb even */\
+			"pmulhw simd_Y_coeff, %%"#reg_type"7;"		/* mm7 = luma_rgb odd */\
+			#mov_instr " %%"#reg_type"0, %%"#reg_type"3;"	/* mm3 = chroma_b */\
+			#mov_instr " %%"#reg_type"1, %%"#reg_type"4;"	/* mm4 = chroma_r */\
+			#mov_instr " %%"#reg_type"2, %%"#reg_type"5;"	/* mm5 = chroma_g */\
+			"paddsw %%"#reg_type"6, %%"#reg_type"0;"	/* mm0 = B6 B4 B2 B0 */\
+			"paddsw %%"#reg_type"7, %%"#reg_type"3;"	/* mm3 = B7 B5 B3 B1 */\
+			"paddsw %%"#reg_type"6, %%"#reg_type"1;"	/* mm1 = R6 R4 R2 R0 */\
+			"paddsw %%"#reg_type"7, %%"#reg_type"4;"	/* mm4 = R7 R5 R3 R1 */\
+			"paddsw %%"#reg_type"6, %%"#reg_type"2;"	/* mm2 = G6 G4 G2 G0 */\
+			"paddsw %%"#reg_type"7, %%"#reg_type"5;"	/* mm5 = G7 G5 G3 G1 */\
+			"packuswb %%"#reg_type"0, %%"#reg_type"0;"	/* saturate to 0-255 */\
+			"packuswb %%"#reg_type"1, %%"#reg_type"1;"	/* saturate to 0-255 */\
+			"packuswb %%"#reg_type"2, %%"#reg_type"2;"	/* saturate to 0-255 */\
+			"packuswb %%"#reg_type"3, %%"#reg_type"3;"	/* saturate to 0-255 */\
+			"packuswb %%"#reg_type"4, %%"#reg_type"4;"	/* saturate to 0-255 */\
+			"packuswb %%"#reg_type"5, %%"#reg_type"5;"	/* saturate to 0-255 */\
+			"punpcklbw %%"#reg_type"3, %%"#reg_type"0;"	/* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */\
+			"punpcklbw %%"#reg_type"4, %%"#reg_type"1;"	/* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */\
+			"punpcklbw %%"#reg_type"5, %%"#reg_type"2;"	/* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */\
+			::"m" (simd_80w), \
+			  "m" (simd_U_green), \
+			  "m" (simd_V_green), \
+			  "m" (simd_U_blue), \
+			  "m" (simd_V_red), \
+			  "m" (simd_10w), \
+			  "m" (simd_00ffw), \
+			  "m" (simd_Y_coeff));
+
+#define OUTPUT_BGRA_32(mov_instr, reg_type, offset0, offset1, offset2) \
+	__asm__ __volatile__ (				\
+			/* r0=B, r1=R, r2=G on entry; r3 gets the alpha bytes; 4 registers are stored at dst */		\
+			#mov_instr " simd_alpha, %%"#reg_type"3;\n\t"\
+			#mov_instr " %%"#reg_type"0, %%"#reg_type"4;\n\t"\
+			#mov_instr " %%"#reg_type"1, %%"#reg_type"5;\n\t"\
+			"punpcklbw %%"#reg_type"2, %%"#reg_type"0;\n\t" /* GB GB GB GB low  */\
+			"punpcklbw %%"#reg_type"3, %%"#reg_type"1;\n\t" /* FR FR FR FR low  */\
+			"punpckhbw %%"#reg_type"2, %%"#reg_type"4;\n\t" /* GB GB GB GB high */\
+			"punpckhbw %%"#reg_type"3, %%"#reg_type"5;\n\t" /* FR FR FR FR high */\
+			#mov_instr " %%"#reg_type"0, %%"#reg_type"6;\n\t"\
+			#mov_instr " %%"#reg_type"4, %%"#reg_type"7;\n\t"\
+			"punpcklwd %%"#reg_type"1, %%"#reg_type"0;\n\t" /* FRGB FRGB 0 */\
+			"punpckhwd %%"#reg_type"1, %%"#reg_type"6;\n\t" /* FRGB FRGB 1 */\
+			"punpcklwd %%"#reg_type"5, %%"#reg_type"4;\n\t" /* FRGB FRGB 2 */\
+			"punpckhwd %%"#reg_type"5, %%"#reg_type"7;\n\t" /* FRGB FRGB 3 */\
+			MOVNTQ " %%"#reg_type"0, (%0);\n\t"\
+			MOVNTQ " %%"#reg_type"6, "#offset0"(%0);\n\t"\
+			MOVNTQ " %%"#reg_type"4, "#offset1"(%0);\n\t"\
+			MOVNTQ " %%"#reg_type"7, "#offset2"(%0);\n\t"\
+			::  "r" (dst), "m" (simd_alpha));
+
+
+#define OUTPUT_ARGB_32(mov_instr, reg_type, offset0, offset1, offset2) \
+	__asm__ __volatile__ (				\
+			/* r0=B, r1=R, r2=G on entry; r3 gets the alpha bytes; 4 registers are stored at dst */		\
+			#mov_instr " simd_alpha, %%"#reg_type"3;\n\t"\
+			#mov_instr " %%"#reg_type"3, %%"#reg_type"4;\n\t"\
+			#mov_instr " %%"#reg_type"2, %%"#reg_type"5;\n\t"\
+			"punpcklbw %%"#reg_type"0, %%"#reg_type"2;\n\t" /* BG BG BG BG low  */\
+			"punpcklbw %%"#reg_type"1, %%"#reg_type"3;\n\t" /* RF RF RF RF low  */\
+			"punpckhbw %%"#reg_type"0, %%"#reg_type"5;\n\t" /* BG BG BG BG high */\
+			"punpckhbw %%"#reg_type"1, %%"#reg_type"4;\n\t" /* RF RF RF RF high */\
+			#mov_instr " %%"#reg_type"3, %%"#reg_type"0;\n\t"\
+			#mov_instr " %%"#reg_type"4, %%"#reg_type"1;\n\t"\
+			"punpcklwd %%"#reg_type"2, %%"#reg_type"3;\n\t" /* BGRF BGRF 0 */\
+			"punpckhwd %%"#reg_type"2, %%"#reg_type"0;\n\t" /* BGRF BGRF 1 */\
+			"punpcklwd %%"#reg_type"5, %%"#reg_type"4;\n\t" /* BGRF BGRF 2 */\
+			"punpckhwd %%"#reg_type"5, %%"#reg_type"1;\n\t" /* BGRF BGRF 3 */\
+			MOVNTQ " %%"#reg_type"3, (%0);\n\t"\
+			MOVNTQ " %%"#reg_type"0, "#offset0"(%0);\n\t"\
+			MOVNTQ " %%"#reg_type"4, "#offset1"(%0);\n\t"\
+			MOVNTQ " %%"#reg_type"1, "#offset2"(%0);\n\t"\
+			::  "r" (dst), "m" (simd_alpha));
+
+#define OUTPUT_RGBA_32(mov_instr, reg_type, offset0, offset1, offset2) \
+	__asm__ __volatile__ (				\
+			/* r0=B, r1=R, r2=G on entry; r3 gets the alpha bytes (F below); 4 registers are stored at dst */		\
+			#mov_instr " simd_alpha, %%"#reg_type"3;\n\t"\
+			#mov_instr " %%"#reg_type"1, %%"#reg_type"4;\n\t"\
+			#mov_instr " %%"#reg_type"0, %%"#reg_type"5;\n\t"\
+			"punpcklbw %%"#reg_type"2, %%"#reg_type"1;\n\t" /* GR GR GR GR low  */\
+			"punpcklbw %%"#reg_type"3, %%"#reg_type"0;\n\t" /* FB FB FB FB low  */\
+			"punpckhbw %%"#reg_type"2, %%"#reg_type"4;\n\t" /* GR GR GR GR high */\
+			"punpckhbw %%"#reg_type"3, %%"#reg_type"5;\n\t" /* FB FB FB FB high */\
+			#mov_instr " %%"#reg_type"1, %%"#reg_type"6;\n\t"\
+			#mov_instr " %%"#reg_type"4, %%"#reg_type"7;\n\t"\
+			"punpcklwd %%"#reg_type"0, %%"#reg_type"1;\n\t" /* FBGR FBGR 0 */\
+			"punpckhwd %%"#reg_type"0, %%"#reg_type"6;\n\t" /* FBGR FBGR 1 */\
+			"punpcklwd %%"#reg_type"5, %%"#reg_type"4;\n\t" /* FBGR FBGR 2 */\
+			"punpckhwd %%"#reg_type"5, %%"#reg_type"7;\n\t" /* FBGR FBGR 3 */\
+			MOVNTQ " %%"#reg_type"1, (%0);\n\t"\
+			MOVNTQ " %%"#reg_type"6, "#offset0"(%0);\n\t"\
+			MOVNTQ " %%"#reg_type"4, "#offset1"(%0);\n\t"\
+			MOVNTQ " %%"#reg_type"7, "#offset2"(%0);\n\t"\
+			::  "r" (dst), "m" (simd_alpha));
+
+#define LOAD_YUV_PLANAR_2(mov_instr, reg_type) /* load *py into reg6, *pu into reg0, *pv into reg1 and zero reg4 */	\
+	__asm__ __volatile__ (						\
+			#mov_instr " %0, %%"#reg_type"6;\n\t"		\
+			#mov_instr " %1, %%"#reg_type"0;\n\t"		\
+			#mov_instr " %2, %%"#reg_type"1;\n\t"		\
+			"pxor %%"#reg_type"4, %%"#reg_type"4;\n\t"	\
+			:: "m" (*py), "m" (*pu), "m" (*pv));
+
+
+#endif /* __YUV2RGB_X86_H__ */
+

Added: liboggplay/trunk/src/liboggplay/yuv2rgb_x86_vs.h
===================================================================
--- liboggplay/trunk/src/liboggplay/yuv2rgb_x86_vs.h	                        (rev 0)
+++ liboggplay/trunk/src/liboggplay/yuv2rgb_x86_vs.h	2009-02-20 09:19:45 UTC (rev 3850)
@@ -0,0 +1,129 @@
+#ifndef __OGGPLAY_YUV2RGB_VS_H__
+#define __OGGPLAY_YUV2RGB_VS_H__
+
+#define emms() __asm emms /* emits the EMMS instruction, which empties the MMX state */
+#define MMX_MOVNTQ movntq /* store mnemonic for the MMX variant: 64-bit non-temporal store */
+#define SSE2_MOVNTQ movdqu /* store mnemonic for the SSE2 variant: 128-bit unaligned move, not a non-temporal store */
+
+#define LOAD_YUV_PLANAR_2(mov_instr, reg_type)		\
+	__asm {								/* load one register each from py, pu and pv, and clear reg 4; overwrites eax and edx */\
+		__asm mov	eax, py					/* eax = py */\
+		__asm mov	edx, pu					/* edx = pu */\
+		__asm mov_instr	reg_type##6, [eax]			/* reg 6 = data at py */\
+		__asm mov_instr	reg_type##0, [edx]			/* reg 0 = data at pu */\
+		__asm mov	eax, pv					/* eax = pv */\
+		__asm mov_instr	reg_type##1, [eax]			/* reg 1 = data at pv */\
+		__asm pxor	reg_type##4, reg_type##4		/* reg 4 = 0 */\
+	}
+
+#define OUTPUT_RGBA_32(mov_instr, reg_type, offset0, offset1, offset2) \
+	__asm {								/* in: reg 0 = B, reg 1 = R, reg 2 = G; stores bytes in R,G,B,A order at dst; overwrites eax and regs 0-7 except reg 2 */\
+		__asm mov	eax, dst				/* eax = dst */\
+		__asm mov_instr	reg_type##3, simd_alpha			/* reg 3 = alpha bytes */\
+		__asm mov_instr reg_type##4, reg_type##1		/* reg 4 = copy of R */\
+		__asm mov_instr reg_type##5, reg_type##0		/* reg 5 = copy of B */\
+		__asm punpcklbw reg_type##1, reg_type##2		/* reg 1 = R0 G0 R1 G1 ... (low half)  */\
+		__asm punpcklbw reg_type##0, reg_type##3		/* reg 0 = B0 A0 B1 A1 ... (low half)  */\
+		__asm punpckhbw reg_type##4, reg_type##2		/* reg 4 = R G pairs (high half) */\
+		__asm punpckhbw reg_type##5, reg_type##3		/* reg 5 = B A pairs (high half) */\
+		__asm mov_instr reg_type##6, reg_type##1		/* reg 6 = copy of low R G pairs  */\
+		__asm mov_instr reg_type##7, reg_type##4		/* reg 7 = copy of high R G pairs */\
+		__asm punpcklwd reg_type##1, reg_type##0                /* reg 1 = R G B A pixels, group 0 */\
+		__asm punpckhwd reg_type##6, reg_type##0                /* reg 6 = R G B A pixels, group 1 */\
+		__asm punpcklwd reg_type##4, reg_type##5                /* reg 4 = R G B A pixels, group 2 */\
+		__asm punpckhwd reg_type##7, reg_type##5                /* reg 7 = R G B A pixels, group 3 */\
+		__asm MOVNTQ	[eax], reg_type##1			/* NOTE(review): MOVNTQ is not defined in this header; presumably the includer maps it to MMX_MOVNTQ or SSE2_MOVNTQ - confirm */\
+		__asm MOVNTQ	[eax+offset0], reg_type##6		/* group 1 -> dst + offset0 */\
+		__asm MOVNTQ	[eax+offset1], reg_type##4		/* group 2 -> dst + offset1 */\
+		__asm MOVNTQ	[eax+offset2], reg_type##7		/* group 3 -> dst + offset2 */\
+	}
+
+#define OUTPUT_ARGB_32(mov_instr, reg_type, offset0, offset1, offset2) \
+	__asm {								/* in: reg 0 = B, reg 1 = R, reg 2 = G; stores bytes in A,R,G,B order at dst; overwrites eax and regs 0-5 */\
+		__asm mov	eax, dst				/* eax = dst */\
+		__asm mov_instr	reg_type##3, simd_alpha			/* reg 3 = alpha bytes */\
+		__asm mov_instr reg_type##4, reg_type##3		/* reg 4 = copy of alpha */\
+		__asm mov_instr reg_type##5, reg_type##2		/* reg 5 = copy of G */\
+		__asm punpcklbw reg_type##2, reg_type##0		/* reg 2 = G0 B0 G1 B1 ... (low half)  */\
+		__asm punpcklbw reg_type##3, reg_type##1		/* reg 3 = A0 R0 A1 R1 ... (low half)  */\
+		__asm punpckhbw reg_type##5, reg_type##0		/* reg 5 = G B pairs (high half) */\
+		__asm punpckhbw reg_type##4, reg_type##1		/* reg 4 = A R pairs (high half) */\
+		__asm mov_instr reg_type##0, reg_type##3		/* reg 0 = copy of low A R pairs; B is no longer needed  */\
+		__asm mov_instr reg_type##1, reg_type##4		/* reg 1 = copy of high A R pairs; R is no longer needed */\
+		__asm punpcklwd reg_type##3, reg_type##2                /* reg 3 = A R G B pixels, group 0 */\
+		__asm punpckhwd reg_type##0, reg_type##2                /* reg 0 = A R G B pixels, group 1 */\
+		__asm punpcklwd reg_type##4, reg_type##5                /* reg 4 = A R G B pixels, group 2 */\
+		__asm punpckhwd reg_type##1, reg_type##5                /* reg 1 = A R G B pixels, group 3 */\
+		__asm MOVNTQ	[eax], reg_type##3			/* NOTE(review): MOVNTQ is not defined in this header; presumably the includer maps it to MMX_MOVNTQ or SSE2_MOVNTQ - confirm */\
+		__asm MOVNTQ	[eax+offset0], reg_type##0		/* group 1 -> dst + offset0 */\
+		__asm MOVNTQ	[eax+offset1], reg_type##4		/* group 2 -> dst + offset1 */\
+		__asm MOVNTQ	[eax+offset2], reg_type##1		/* group 3 -> dst + offset2 */\
+	}
+
+#define OUTPUT_BGRA_32(mov_instr, reg_type, offset0, offset1, offset2) \
+	__asm {								/* in: reg 0 = B, reg 1 = R, reg 2 = G; stores bytes in B,G,R,A order at dst; overwrites eax and regs 0-7 except reg 2 */\
+		__asm mov	eax, dst				/* eax = dst */\
+		__asm mov_instr	reg_type##3, simd_alpha			/* reg 3 = alpha bytes */\
+		__asm mov_instr reg_type##4, reg_type##0		/* reg 4 = copy of B */\
+		__asm mov_instr reg_type##5, reg_type##1		/* reg 5 = copy of R */\
+		__asm punpcklbw reg_type##0, reg_type##2		/* reg 0 = B0 G0 B1 G1 ... (low half)  */\
+		__asm punpcklbw reg_type##1, reg_type##3		/* reg 1 = R0 A0 R1 A1 ... (low half)  */\
+		__asm punpckhbw reg_type##4, reg_type##2		/* reg 4 = B G pairs (high half) */\
+		__asm punpckhbw reg_type##5, reg_type##3		/* reg 5 = R A pairs (high half) */\
+		__asm mov_instr reg_type##6, reg_type##0		/* reg 6 = copy of low B G pairs  */\
+		__asm mov_instr reg_type##7, reg_type##4		/* reg 7 = copy of high B G pairs */\
+		__asm punpcklwd reg_type##0, reg_type##1                /* reg 0 = B G R A pixels, group 0 */\
+		__asm punpckhwd reg_type##6, reg_type##1                /* reg 6 = B G R A pixels, group 1 */\
+		__asm punpcklwd reg_type##4, reg_type##5                /* reg 4 = B G R A pixels, group 2 */\
+		__asm punpckhwd reg_type##7, reg_type##5                /* reg 7 = B G R A pixels, group 3 */\
+		__asm MOVNTQ	[eax], reg_type##0			/* NOTE(review): MOVNTQ is not defined in this header; presumably the includer maps it to MMX_MOVNTQ or SSE2_MOVNTQ - confirm */\
+		__asm MOVNTQ	[eax+offset0], reg_type##6		/* group 1 -> dst + offset0 */\
+		__asm MOVNTQ	[eax+offset1], reg_type##4		/* group 2 -> dst + offset1 */\
+		__asm MOVNTQ	[eax+offset2], reg_type##7		/* group 3 -> dst + offset2 */\
+	}
+
+#define YUV_2_RGB(mov_instr, reg_type) \
+	__asm {											/* in: reg 0 = U bytes, reg 1 = V bytes, reg 6 = Y bytes, reg 4 = 0; out: reg 0 = B, reg 1 = R, reg 2 = G bytes; the "mm" names in the comments below stand for whichever reg_type is passed */\
+		__asm punpcklbw reg_type##0, reg_type##4	/* mm0 = u3 u2 u1 u0 */\
+		__asm punpcklbw reg_type##1, reg_type##4	/* mm1 = v3 v2 v1 v0 */\
+		__asm psubsw	reg_type##0, simd_80w		/* u -= 128 */\
+		__asm psubsw	reg_type##1, simd_80w		/* v -= 128 */\
+		__asm psllw	reg_type##0, 3			/* promote precision */\
+		__asm psllw	reg_type##1, 3			/* promote precision */\
+		__asm mov_instr reg_type##2, reg_type##0	/* mm2 = u3 u2 u1 u0 */\
+		__asm mov_instr reg_type##3, reg_type##1	/* mm3 = v3 v2 v1 v0 */\
+		__asm pmulhw	reg_type##2, simd_U_green	/* mm2 = u * u_green */\
+		__asm pmulhw	reg_type##3, simd_V_green	/* mm3 = v * v_green */\
+		__asm pmulhw	reg_type##0, simd_U_blue	/* mm0 = chroma_b */\
+		__asm pmulhw	reg_type##1, simd_V_red		/* mm1 = chroma_r */\
+		__asm paddsw	reg_type##2, reg_type##3	/* mm2 = chroma_g */\
+		__asm psubusb	reg_type##6, simd_10w		/* Y -= 16  */\
+		__asm mov_instr reg_type##7, reg_type##6	/* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\
+		__asm pand	reg_type##6, simd_00ffw		/* mm6 =    Y6    Y4    Y2    Y0 */\
+		__asm psrlw	reg_type##7, 8			/* mm7 =    Y7    Y5    Y3    Y1 */\
+		__asm psllw	reg_type##6, 3			/* promote precision */\
+		__asm psllw	reg_type##7, 3			/* promote precision */\
+		__asm pmulhw	reg_type##6, simd_Y_coeff	/* mm6 = luma_rgb even */\
+		__asm pmulhw	reg_type##7, simd_Y_coeff	/* mm7 = luma_rgb odd */\
+		__asm mov_instr reg_type##3, reg_type##0	/* mm3 = chroma_b */\
+		__asm mov_instr reg_type##4, reg_type##1	/* mm4 = chroma_r */\
+		__asm mov_instr reg_type##5, reg_type##2	/* mm5 = chroma_g */\
+		__asm paddsw	reg_type##0, reg_type##6	/* mm0 = B6 B4 B2 B0 */\
+		__asm paddsw	reg_type##3, reg_type##7	/* mm3 = B7 B5 B3 B1 */\
+		__asm paddsw	reg_type##1, reg_type##6	/* mm1 = R6 R4 R2 R0 */\
+		__asm paddsw	reg_type##4, reg_type##7	/* mm4 = R7 R5 R3 R1 */\
+		__asm paddsw	reg_type##2, reg_type##6	/* mm2 = G6 G4 G2 G0 */\
+		__asm paddsw	reg_type##5, reg_type##7	/* mm5 = G7 G5 G3 G1 */\
+		__asm packuswb	reg_type##0, reg_type##0	/* saturate to 0-255 */\
+		__asm packuswb	reg_type##1, reg_type##1	/* saturate to 0-255 */\
+		__asm packuswb	reg_type##2, reg_type##2	/* saturate to 0-255 */\
+		__asm packuswb	reg_type##3, reg_type##3	/* saturate to 0-255 */\
+		__asm packuswb	reg_type##4, reg_type##4	/* saturate to 0-255 */\
+		__asm packuswb	reg_type##5, reg_type##5	/* saturate to 0-255 */\
+		__asm punpcklbw	reg_type##0, reg_type##3	/* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */\
+		__asm punpcklbw	reg_type##1, reg_type##4	/* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */\
+		__asm punpcklbw	reg_type##2, reg_type##5	/* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */\
+	}
+
+#endif
+