[xiph-commits] r14839 - in branches/theora-thusnelda/lib: . enc

xiphmont at svn.xiph.org xiphmont at svn.xiph.org
Tue May 6 00:50:30 PDT 2008


Author: xiphmont
Date: 2008-05-06 00:50:29 -0700 (Tue, 06 May 2008)
New Revision: 14839

Modified:
   branches/theora-thusnelda/lib/cpu.c
   branches/theora-thusnelda/lib/cpu.h
   branches/theora-thusnelda/lib/enc/mode.c
Log:
Quit dicking around with the rho optimization; function call overhead 
swamps everything.



Modified: branches/theora-thusnelda/lib/cpu.c
===================================================================
--- branches/theora-thusnelda/lib/cpu.c	2008-05-06 03:04:19 UTC (rev 14838)
+++ branches/theora-thusnelda/lib/cpu.c	2008-05-06 07:50:29 UTC (rev 14839)
@@ -5,14 +5,14 @@
  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
  * by the Xiph.Org Foundation http://www.xiph.org/                  *
  *                                                                  *
  ********************************************************************
- 
+
  CPU capability detection for x86 processors.
   Originally written by Rudolf Marek.
-  
+
  function:
   last mod: $Id$
 
@@ -20,91 +20,181 @@
 
 #include "cpu.h"
 
+#if !defined(USE_ASM)
+ogg_uint32_t oc_cpu_flags_get(void){
+  return 0;
+}
+#else
 
-ogg_uint32_t oc_cpu_flags_get(void){
-  ogg_uint32_t flags = 0;
-#if defined(USE_ASM)
-  ogg_uint32_t eax;
-  ogg_uint32_t ebx;
-  ogg_uint32_t ecx;
-  ogg_uint32_t edx;
-#if (defined(__amd64__) || defined(__x86_64__))
-# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+# if !defined(_MSC_VER)
+#  if defined(__amd64__)||defined(__x86_64__)
+/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
+   compiling with -fPIC.*/
+#   define cpuid(_op,_eax,_ebx,_ecx,_edx) \
   __asm__ __volatile__( \
-   "push %%rbx\n\t" \
    "cpuid\n\t" \
-   "movl %%ebx,%1\n\t" \
-   "pop  %%rbx\n\t" \
-   :"=a" (_eax), \
-    "=r" (_ebx), \
-    "=c" (_ecx), \
-    "=d" (_edx) \
-   :"a" (_op) \
+   :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+   :"a"(_op) \
    :"cc" \
   )
-#else
-# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+#  else
+/*On x86-32, not so much.*/
+#   define cpuid(_op,_eax,_ebx,_ecx,_edx) \
   __asm__ __volatile__( \
-   "pushl %%ebx\n\t" \
+   "xchgl %%ebx,%[ebx]\n\t" \
    "cpuid\n\t" \
-   "movl  %%ebx,%1\n\t" \
-   "popl  %%ebx\n\t" \
-   :"=a" (_eax), \
-    "=r" (_ebx), \
-    "=c" (_ecx), \
-    "=d" (_edx) \
-   :"a" (_op) \
+   "xchgl %%ebx,%[ebx]\n\t" \
+   :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+   :"a"(_op) \
    :"cc" \
   )
+#  endif
+# else
+/*Why does MSVC need this complicated rigamarole?
+  At this point I honestly do not care.*/
+
+/*Visual C cpuid helper function.
+  For VS2005 we could as well use the _cpuid builtin, but that wouldn't work
+   for VS2003 users, so we do it in inline assembler.*/
+static void oc_cpuid_helper(ogg_uint32_t _cpu_info[4],ogg_uint32_t _op){
+  _asm{
+    mov eax,[_op]
+    mov esi,_cpu_info
+    cpuid
+    mov [esi+0],eax
+    mov [esi+4],ebx
+    mov [esi+8],ecx
+    mov [esi+12],edx
+  }
+}
+
+#  define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+  do{ \
+    ogg_uint32_t cpu_info[4]; \
+    oc_cpuid_helper(cpu_info,_op); \
+    (_eax) = cpu_info[0]; \
+    (_ebx) = cpu_info[1]; \
+    (_ecx) = cpu_info[2]; \
+    (_edx) = cpu_info[3]; \
+  }while(0)
+
+static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){
+  _asm{
+    pushfd
+    pushfd
+    pop eax
+    mov ebx,eax
+    xor eax,200000h
+    push eax
+    popfd
+    pushfd
+    pop eax
+    popfd
+    mov [_eax],eax
+    mov [_ebx],ebx
+  }
+}
+# endif
+
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t flags;
+  ogg_uint32_t eax;
+  ogg_uint32_t ebx;
+  ogg_uint32_t ecx;
+  ogg_uint32_t edx;
+# if !defined(__amd64__)&&!defined(__x86_64__)
+  /*Not all x86-32 chips support cpuid, so we have to check.*/
+#  if !defined(_MSC_VER)
   __asm__ __volatile__(
    "pushfl\n\t"
    "pushfl\n\t"
-   "popl          %0\n\t"
-   "movl          %0,%1\n\t"
-   "xorl   $0x200000,%0\n\t"
-   "pushl         %0\n\t"
+   "popl %[a]\n\t"
+   "movl %[a],%[b]\n\t"
+   "xorl $0x200000,%[a]\n\t"
+   "pushl %[a]\n\t"
    "popfl\n\t"
    "pushfl\n\t"
-   "popl          %0\n\t"
+   "popl %[a]\n\t"
    "popfl\n\t"
-   :"=r" (eax),
-    "=r" (ebx)
+   :[a]"=r"(eax),[b]"=r"(ebx)
    :
    :"cc"
   );
+#  else
+  oc_detect_cpuid_helper(&eax,&ebx);
+#  endif
   /*No cpuid.*/
   if(eax==ebx)return 0;
-#endif
+# endif
   cpuid(0,eax,ebx,ecx,edx);
-  if(ebx==0x756e6547&&edx==0x49656e69&&ecx==0x6c65746e){
-    /*Intel:*/
-inteltest:
+  /*         l e t n          I e n i          u n e G*/
+  if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
+   /*      6 8 x M          T e n i          u n e G*/
+   ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
+    /*Intel, Transmeta (tested with Crusoe TM5800):*/
     cpuid(1,eax,ebx,ecx,edx);
-    if((edx&0x00800000)==0)return 0;
+    /*If there isn't even MMX, give up.*/
+    if(!(edx&0x00800000))return 0;
     flags=OC_CPU_X86_MMX;
     if(edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
     if(edx&0x04000000)flags|=OC_CPU_X86_SSE2;
+    if(ecx&0x00000001)flags|=OC_CPU_X86_PNI;
   }
-  else if(ebx==0x68747541&&edx==0x69746e65&&ecx==0x444d4163 ||
-          ebx==0x646f6547&&edx==0x79622065&&ecx==0x43534e20){
-    /*AMD:*/
-    /*Geode:*/
+  /*              D M A c          i t n e          h t u A*/
+  else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
+   /*      C S N            y b   e          d o e G*/
+   ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
+    /*AMD, Geode:*/
     cpuid(0x80000000,eax,ebx,ecx,edx);
-    if(eax<0x80000001)goto inteltest;
+    if(eax<0x80000001){
+      /*No extended functions supported.
+        Use normal cpuid flags.*/
+      cpuid(1,eax,ebx,ecx,edx);
+      /*If there isn't even MMX, give up.*/
+      if(!(edx&0x00800000))return 0;
+      flags=OC_CPU_X86_MMX;
+      if(edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
+    }
+    else{
+      cpuid(0x80000001,eax,ebx,ecx,edx);
+      /*If there isn't even MMX, give up.*/
+      if(!(edx&0x00800000))return 0;
+      flags=OC_CPU_X86_MMX;
+      if(edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
+      if(edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
+      if(edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
+      /*Also check for SSE.*/
+      cpuid(1,eax,ebx,ecx,edx);
+      if(edx&0x02000000)flags|=OC_CPU_X86_SSE;
+    }
+    if(edx&0x04000000)flags|=OC_CPU_X86_SSE2;
+    if(ecx&0x00000001)flags|=OC_CPU_X86_PNI;
+  }
+  /*              s l u a          H r u a          t n e C*/
+  else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
+    /*VIA:*/
+    /*The C7 (and later?) processors support Intel-like cpuid info.*/
+    /*The C3-2 (Nehemiah) cores appear to, as well.*/
+    cpuid(1,eax,ebx,ecx,edx);
+    if(edx&0x00800000){
+      flags=OC_CPU_X86_MMX;
+      if(edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
+      if(edx&0x04000000)flags|=OC_CPU_X86_SSE2;
+      if(ecx&0x00000001)flags|=OC_CPU_X86_PNI;
+    }
+    else flags=0;
+    /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
+      We need to check this even if the Intel test succeeds to pick up 3dnow!
+       support on these processors.*/
+    /*TODO: How about earlier chips?*/
     cpuid(0x80000001,eax,ebx,ecx,edx);
-    if((edx&0x00800000)==0)return 0;
-    flags=OC_CPU_X86_MMX;
+    if(edx&0x00800000)flags|=OC_CPU_X86_MMX;
     if(edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
-    if(edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
-    if(edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
   }
   else{
     /*Implement me.*/
     flags=0;
   }
-  
-#endif
-  
   return flags;
 }
-
+#endif

Modified: branches/theora-thusnelda/lib/cpu.h
===================================================================
--- branches/theora-thusnelda/lib/cpu.h	2008-05-06 03:04:19 UTC (rev 14838)
+++ branches/theora-thusnelda/lib/cpu.h	2008-05-06 07:50:29 UTC (rev 14839)
@@ -24,6 +24,7 @@
 #define OC_CPU_X86_MMXEXT (1<<3)
 #define OC_CPU_X86_SSE    (1<<4)
 #define OC_CPU_X86_SSE2   (1<<5)
+#define OC_CPU_X86_PNI    (1<<6)
 
 ogg_uint32_t oc_cpu_flags_get(void);
 

Modified: branches/theora-thusnelda/lib/enc/mode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/mode.c	2008-05-06 03:04:19 UTC (rev 14838)
+++ branches/theora-thusnelda/lib/enc/mode.c	2008-05-06 07:50:29 UTC (rev 14839)
@@ -488,57 +488,7 @@
 }
 
 #include "quant_lookup.h"
-static int find_nonzero_transition(ogg_int16_t *q, ogg_int16_t in){
-  int i;
-  int val = (abs((int)in)<<1);
-  for(i=63;i>=0;i--)
-    if( val < q[i])break;
-  return i+1;
-}
-/* Don't use it... it is tripping a GCC bug */
-#include<stdio.h>
-int find_nonzero_transition(ogg_int16_t *q, ogg_int16_t in){
-  int ret;
-  __asm__ (
-	   ".balign 16 \n"
-	   "mov       $64,%[ret]\n"
-	   "movd      %[in],%%mm0\n"
-	   "punpcklwd %%mm0,%%mm0\n"
-	   "punpcklwd %%mm0,%%mm0\n"
-	   "jmp       %=2f\n"
-	   
-	   "%=1:\n"
-	   "sub      $8,%[quant]\n"
-	   "sub      $4,%[ret]\n"
-	   "jz       %=3f\n"
-	   
-	   "%=2:\n"
-	   "movq     (%[quant]),%%mm1\n"
-	   "pcmpgtw  %%mm0,%%mm1\n"
-	   "packsswb %%mm1,%%mm1\n"
-	   "movd     %%mm1,%%ecx\n"
-	   "jecxz    %=1b\n"                  
-	   
-	   "not      %%ecx\n"
-	   "jecxz    %=3f\n"                  
-	   "dec      %[ret]\n"
-	   "shl      $8,%%ecx\n"
-	   "jecxz    %=3f\n"                  
-	   "dec      %[ret]\n"
-	   "shl      $8,%%ecx\n"
-	   "jecxz    %=3f\n"                  
-	   "dec      %[ret]\n"              
-	   
-	   "%=3:\n"
-	   "emms\n"
-	   :[ret]"=&r"(ret)
-	   :[quant]"r"(q+60),[in]"r"(abs(in)<<1)
-	   :"%ecx"
-	   );
 
-    return ret;
-}
-*/
 static void TQB (CP_INSTANCE *cpi, int mode, int fi, mv_t mv, int plane, ogg_int16_t re_q[2][3][64], long *rho_count){
   if ( cpi->frag_coded[fi] ) {
     int qi = cpi->BaseQ; /* temporary */;
@@ -606,15 +556,19 @@
 	   NXZ; adds one nonzero token; +1
 	   NXN; replaces a zero run with a nonzero; +0  */
       int i;
-      
+      quant_tables *qq = &(cpi->quant_tables[inter][plane]);
+
       for(i=0;i<64;i++){
 	int ii = dezigzag_index[i];
-	int pos = find_nonzero_transition(cpi->quant_tables[inter][plane][i],buffer[ii]);
+	int pos;
+	int val = abs(buffer[ii])<<1;
+	ogg_int16_t *qqq = (*qq)[i];
+	for(pos=64;pos>0;pos--)
+	  if(val < qqq[pos-1])break;
 	
 	/* rho-domain distribution */
 	rho_count[pos]++;
 
-
 	if(qi<pos){
 	  data[i] = 0;
 	}else{



More information about the commits mailing list