[speex-dev] [PATCH] Make SSE Run Time option. Add Win32 SSE code

Thu Jan 8 21:18:25 PST 2004

All,

 Attached is a patch that does two things. First, it makes the use 
of the current SSE code a run-time option through the use 
of speex_decoder_ctl() and speex_encoder_ctl().
It does this in two steps. First, there is a modification to the configure.in 
script which introduces a check based upon platform. It will compile in the 
SSE assembly if you are on an i?86-based platform by making a special 
define. Second, it adds a new ctl value called SPEEX_SET_ASM_FLAG which 
takes in an integer. The values are defined as:

#define SPEEX_SET_ASM_FLAG 200
#define SPEEX_ASM_MMX_NONE 0
#define SPEEX_ASM_MMX_BASIC 1
#define SPEEX_ASM_MMX_SSE 2
#define SPEEX_ASM_MMX_SSE_FP 4

The current Speex SSE code requires full SSE2 support, which corresponds to 
SPEEX_ASM_MMX_SSE_FP. None of the other defines are actively used, but they 
are included since they represent different Intel/AMD processors. For 
example, an AMD Duron only supports SPEEX_ASM_MMX_BASIC, while Pentium 3s 
and above support full SPEEX_ASM_MMX_SSE_FP.

The second part of the patch adds the equivalent MS Windows assembler for 
the same sections that currently have GCC x86 assembler code.

Notes about implementation: We took the easiest route when hacking in the 
flag support which was to add a global flag for the entire library at 
runtime and extern it in all the various files.
Jean-Marc: We looked at adding the flag into the state structures, however 
they were not passed all the way down into the filters.c files and it would 
have been a massive change to make it pass all the needed data. The 
approach we took should be ok since on a given machine you would have the 
same settings. The decoder_ctl and encoder_ctl set the same global flag 
variable.

The way we set up the asm flags variable should allow you to add the ARM assembly 
in the exact same manner. You would add a check in the configure.in for the 
platform and define a _USE_ARM and place the code in the same functions as 
we did. You would then add a SPEEX_ASM_ARM 8 or something similar and let the 
application decide to turn it on.

Other Notes: This patch obsoletes ltp_sse.h and filters_sse.h. However, 
the patch does not remove them. This is the updated version of the patch 
we sent in November.

Comments are welcome. BTW, we have been shipping our video conferencing 
product, which only uses the Speex codec, for 6 months now and have gotten 
rave reviews (PC Magazine Editors' Choice) for the audio and video quality. 
We use Speex on Windows, Mac OS X, and Linux, as we have clients for each 
platform. Keep up the great work! Check us out at 
http://www.sightspeed.com and please try our beta version (Mac and Windows 
clients available now) at http://www.sightspeed.com/page.php?page=beta

Patch in .gz format attached, plain text below.

-----------------------------------------------------------

Aron Rosenberg
CTO
SightSpeed Inc.

http://www.sightspeed.com

diff -ru speex-1.0.3-orig/configure.in speex-1.0.3/configure.in

--- speex-1.0.3-orig/configure.in       2003-10-30 08:57:25.000000000 -0800
+++ speex-1.0.3/configure.in    2004-01-08 20:44:25.000000000 -0800
@@ -18,11 +18,11 @@
  # For automake.
  VERSION=$SPEEX_VERSION
  PACKAGE=speex
+AC_CANONICAL_SYSTEM

  AM_INIT_AUTOMAKE($PACKAGE, $VERSION, no-define)
  AM_MAINTAINER_MODE

-AC_CANONICAL_HOST
  AM_PROG_LIBTOOL

  AC_C_BIGENDIAN
@@ -52,8 +52,18 @@
  AC_CHECK_LIB(gnugetopt, getopt_long)

  AC_DEFINE_UNQUOTED(VERSION, "${VERSION}")
+AC_DEFINE(GCC_COMPILER)
+echo -n "Checking for Platform ASM Speedups..."
+case "$target" in
+    i?86*)
+        CFLAGS="$CFLAGS -D_USE_SSE"
+        echo "Found SSE and SSE2"
+        ;;
+    *)
+        echo "None Found"
+        ;;
+esac

-AC_ARG_ENABLE(sse, [  --enable-sse             enable SSE support], [if 
test "$enableval" = yes; then AC_DEFINE(_USE_SSE) fi])

  dnl Output the makefiles and version.h.

diff -ru speex-1.0.3-orig/libspeex/filters.c speex-1.0.3/libspeex/filters.c
--- speex-1.0.3-orig/libspeex/filters.c 2003-01-05 21:56:56.000000000 -0800
+++ speex-1.0.3/libspeex/filters.c      2004-01-08 20:44:41.000000000 -0800
@@ -32,8 +32,10 @@

  #include "filters.h"
  #include "stack_alloc.h"
+#include "speex.h"
  #include <math.h>

+extern int global_use_mmx_sse;

  void bw_lpc(float gamma, float *lpc_in, float *lpc_out, int order)
  {
@@ -46,41 +48,548 @@
     }
  }

-#ifdef _USE_SSE
-#include "filters_sse.h"
-#else
-void filter_mem2(float *x, float *num, float *den, float *y, int N, int 
ord, float *mem)
+
+void filter_mem2(float *x, float *_num, float *_den, float *y, int N, int 
ord, float *_mem)
  {
-   int i,j;
-   float xi,yi;
-   for (i=0;i<N;i++)
-   {
-      xi=x[i];
-      y[i] = num[0]*xi + mem[0];
-      yi=y[i];
-      for (j=0;j<ord-1;j++)
-      {
-         mem[j] = mem[j+1] + num[j+1]*xi - den[j+1]*yi;
-      }
-      mem[ord-1] = num[ord]*xi - den[ord]*yi;
-   }
+       if (!(global_use_mmx_sse & SPEEX_ASM_MMX_SSE_FP))
+       {
+          int i,j;
+          float xi,yi;
+          for (i=0;i<N;i++)
+          {
+                 xi=x[i];
+                 y[i] = _num[0]*xi + _mem[0];
+                 yi=y[i];
+                 for (j=0;j<ord-1;j++)
+                 {
+                        _mem[j] = _mem[j+1] + _num[j+1]*xi - _den[j+1]*yi;
+                 }
+                 _mem[ord-1] = _num[ord]*xi - _den[ord]*yi;
+          }
+       }
+       else
+       {
+          float __num[20], __den[20], __mem[20];
+          float *num, *den, *mem;
+          int i;
+
+          num = (float*)(((int)(__num+4))&0xfffffff0)-1;
+          den = (float*)(((int)(__den+4))&0xfffffff0)-1;
+          mem = (float*)(((int)(__mem+4))&0xfffffff0)-1;
+
+          for (i=0;i<=10;i++)
+                 num[i]=den[i]=0;
+
+          for (i=0;i<10;i++)
+                 mem[i]=0;
+
+          for (i=0;i<ord+1;i++)
+          {
+                 num[i]=_num[i];
+                 den[i]=_den[i];
+          }
+
+          for (i=0;i<ord;i++)
+                 mem[i]=_mem[i];
+
+          for (i=0;i<N;i+=4)
+          {
+                  float *in1 = x+i;
+                  float *in2 = y+i;
+
+#ifdef WIN32
+                       /*
+                       Do we need to push???
+                       push eax
+                       push ebx
+                       push ecx
+                       push edx
+                       */
+                  _asm
+                       {
+                       mov eax, num
+                       mov ebx, den
+                       mov ecx, mem
+
+                       mov edx, in1
+                       movss xmm0, [edx]
+
+                       movss xmm1, [ecx]
+                       addss xmm1, xmm0
+
+                       mov edx, in2
+                       movss [edx], xmm1
+
+                       shufps xmm0, xmm0, 0x00
+                       shufps xmm1, xmm1, 0x00
+
+                       movaps xmm2, [eax+4]
+                       movaps xmm3, [ebx+4]
+                       mulps  xmm2, xmm0
+                       mulps  xmm3, xmm1
+                       movaps xmm4, [eax+20]
+                       mulps  xmm4, xmm0
+                       addps  xmm2, [ecx+4]
+                       movaps xmm5, [ebx+20]
+                       mulps  xmm5, xmm1
+                       addps  xmm4, [ecx+20]
+                       subps  xmm2, xmm3
+                       movups [ecx], xmm2
+                       subps  xmm4, xmm5
+                       movups [ecx+16], xmm4
+
+                       movss  xmm2, [eax+36]
+                       mulss  xmm2, xmm0
+                       movss  xmm3, [ebx+36]
+                       mulss  xmm3, xmm1
+                       addss  xmm2, [ecx+36]
+                       movss  xmm4, [eax+40]
+                       mulss  xmm4, xmm0
+                       movss  xmm5, [ebx+40]
+                       mulss  xmm5, xmm1
+                       subss  xmm2, xmm3
+                       movss  [ecx+32], xmm2
+                       subss  xmm4, xmm5
+                       movss  [ecx+36], xmm4
+
+                       mov edx, in1
+                       movss xmm0, [edx+4]
+
+                       movss xmm1, [ecx]
+                       addss xmm1, xmm0
+
+                       mov edx, in2
+                       movss [edx+4], xmm1
+
+                       shufps xmm0, xmm0, 0x00
+                       shufps xmm1, xmm1, 0x00
+
+                       movaps xmm2, [eax+4]
+                       movaps xmm3, [ebx+4]
+                       mulps  xmm2, xmm0
+                       mulps  xmm3, xmm1
+                       movaps xmm4, [eax+20]
+                       mulps  xmm4, xmm0
+                       addps  xmm2, [ecx+4]
+                       movaps xmm5, [ebx+20]
+                       mulps  xmm5, xmm1
+                       addps  xmm4, [ecx+20]
+                       subps  xmm2, xmm3
+                       movups [ecx], xmm2
+                       subps  xmm4, xmm5
+                       movups [ecx+16], xmm4
+
+                       movss  xmm2, [eax+36]
+                       mulss  xmm2, xmm0
+                       movss  xmm3, [ebx+36]
+                       mulss  xmm3, xmm1
+                       addss  xmm2, [ecx+36]
+                       movss  xmm4, [eax+40]
+                       mulss  xmm4, xmm0
+                       movss  xmm5, [ebx+40]
+                       mulss  xmm5, xmm1
+                       subss  xmm2, xmm3
+                       movss  [ecx+32], xmm2
+                       subss  xmm4, xmm5
+                       movss  [ecx+36], xmm4
+
+                       mov edx, in1
+                       movss xmm0, [edx+8]
+
+                       movss xmm1, [ecx]
+                       addss xmm1, xmm0
+
+                       mov edx, in2
+                       movss [edx+8], xmm1
+
+                       shufps xmm0, xmm0, 0x00
+                       shufps xmm1, xmm1, 0x00
+
+                       movaps xmm2, [eax+4]
+                       movaps xmm3, [ebx+4]
+                       mulps  xmm2, xmm0
+                       mulps  xmm3, xmm1
+                       movaps xmm4, [eax+20]
+                       mulps  xmm4, xmm0
+                       addps  xmm2, [ecx+4]
+                       movaps xmm5, [ebx+20]
+                       mulps  xmm5, xmm1
+                       addps  xmm4, [ecx+20]
+                       subps  xmm2, xmm3
+                       movups [ecx], xmm2
+                       subps  xmm4, xmm5
+                       movups [ecx+16], xmm4
+
+                       movss  xmm2, [eax+36]
+                       mulss  xmm2, xmm0
+                       movss  xmm3, [ebx+36]
+                       mulss  xmm3, xmm1
+                       addss  xmm2, [ecx+36]
+                       movss  xmm4, [eax+40]
+                       mulss  xmm4, xmm0
+                       movss  xmm5, [ebx+40]
+                       mulss  xmm5, xmm1
+                       subss  xmm2, xmm3
+                       movss  [ecx+32], xmm2
+                       subss  xmm4, xmm5
+                       movss  [ecx+36], xmm4
+
+                       mov edx, in1
+                       movss xmm0, [edx+12]
+
+                       movss xmm1, [ecx]
+                       addss xmm1, xmm0
+
+                       mov edx, in2
+                       movss [edx+12], xmm1
+
+                       shufps xmm0, xmm0, 0x00
+                       shufps xmm1, xmm1, 0x00
+
+                       movaps xmm2, [eax+4]
+                       movaps xmm3, [ebx+4]
+                       mulps  xmm2, xmm0
+                       mulps  xmm3, xmm1
+                       movaps xmm4, [eax+20]
+                       mulps  xmm4, xmm0
+                       addps  xmm2, [ecx+4]
+                       movaps xmm5, [ebx+20]
+                       mulps  xmm5, xmm1
+                       addps  xmm4, [ecx+20]
+                       subps  xmm2, xmm3
+                       movups [ecx], xmm2
+                       subps  xmm4, xmm5
+                       movups [ecx+16], xmm4
+
+                       movss  xmm2, [eax+36]
+                       mulss  xmm2, xmm0
+                       movss  xmm3, [ebx+36]
+                       mulss  xmm3, xmm1
+                       addss  xmm2, [ecx+36]
+                       movss  xmm4, [eax+40]
+                       mulss  xmm4, xmm0
+                       movss  xmm5, [ebx+40]
+                       mulss  xmm5, xmm1
+                       subss  xmm2, xmm3
+                       movss  [ecx+32], xmm2
+                       subss  xmm4, xmm5
+                       movss  [ecx+36], xmm4
+                  }
+                  /*
+                       pop edx
+                       pop ecx
+                       pop ebx
+                       pop eax
+                       */
+#endif
+#if defined(GCC_COMPILER) && defined(_USE_SSE)
+                 __asm__ __volatile__
+                 (
+                  "\tmovss (%1), %%xmm0\n"
+                  "\tmovss (%0), %%xmm1\n"
+                  "\taddss %%xmm0, %%xmm1\n"
+                  "\tmovss %%xmm1, (%2)\n"
+                  "\tshufps $0x00, %%xmm0, %%xmm0\n"
+                  "\tshufps $0x00, %%xmm1, %%xmm1\n"
+
+                  "\tmovaps 4(%3),  %%xmm2\n"
+                  "\tmovaps 4(%4),  %%xmm3\n"
+                  "\tmulps  %%xmm0, %%xmm2\n"
+                  "\tmulps  %%xmm1, %%xmm3\n"
+                  "\tmovaps 20(%3), %%xmm4\n"
+                  "\tmulps  %%xmm0, %%xmm4\n"
+                  "\taddps  4(%0),  %%xmm2\n"
+                  "\tmovaps 20(%4), %%xmm5\n"
+                  "\tmulps  %%xmm1, %%xmm5\n"
+                  "\taddps  20(%0), %%xmm4\n"
+                  "\tsubps  %%xmm3, %%xmm2\n"
+                  "\tmovups %%xmm2, (%0)\n"
+                  "\tsubps  %%xmm5, %%xmm4\n"
+                  "\tmovups %%xmm4, 16(%0)\n"
+
+                  "\tmovss  36(%3), %%xmm2\n"
+                  "\tmulss  %%xmm0, %%xmm2\n"
+                  "\tmovss  36(%4), %%xmm3\n"
+                  "\tmulss  %%xmm1, %%xmm3\n"
+                  "\taddss  36(%0), %%xmm2\n"
+                  "\tmovss  40(%3), %%xmm4\n"
+                  "\tmulss  %%xmm0, %%xmm4\n"
+                  "\tmovss  40(%4), %%xmm5\n"
+                  "\tmulss  %%xmm1, %%xmm5\n"
+                  "\tsubss  %%xmm3, %%xmm2\n"
+                  "\tmovss  %%xmm2, 32(%0)\n"
+                  "\tsubss  %%xmm5, %%xmm4\n"
+                  "\tmovss  %%xmm4, 36(%0)\n"
+
+                  "\tmovss 4(%1), %%xmm0\n"
+                  "\tmovss (%0), %%xmm1\n"
+                  "\taddss %%xmm0, %%xmm1\n"
+                  "\tmovss %%xmm1, 4(%2)\n"
+                  "\tshufps $0x00, %%xmm0, %%xmm0\n"
+                  "\tshufps $0x00, %%xmm1, %%xmm1\n"
+
+                  "\tmovaps 4(%3),  %%xmm2\n"
+                  "\tmovaps 4(%4),  %%xmm3\n"
+                  "\tmulps  %%xmm0, %%xmm2\n"
+                  "\tmulps  %%xmm1, %%xmm3\n"
+                  "\tmovaps 20(%3), %%xmm4\n"
+                  "\tmulps  %%xmm0, %%xmm4\n"
+                  "\taddps  4(%0),  %%xmm2\n"
+                  "\tmovaps 20(%4), %%xmm5\n"
+                  "\tmulps  %%xmm1, %%xmm5\n"
+                  "\taddps  20(%0), %%xmm4\n"
+                  "\tsubps  %%xmm3, %%xmm2\n"
+                  "\tmovups %%xmm2, (%0)\n"
+                  "\tsubps  %%xmm5, %%xmm4\n"
+                  "\tmovups %%xmm4, 16(%0)\n"
+
+                  "\tmovss  36(%3), %%xmm2\n"
+                  "\tmulss  %%xmm0, %%xmm2\n"
+                  "\tmovss  36(%4), %%xmm3\n"
+                  "\tmulss  %%xmm1, %%xmm3\n"
+                  "\taddss  36(%0), %%xmm2\n"
+                  "\tmovss  40(%3), %%xmm4\n"
+                  "\tmulss  %%xmm0, %%xmm4\n"
+                  "\tmovss  40(%4), %%xmm5\n"
+                  "\tmulss  %%xmm1, %%xmm5\n"
+                  "\tsubss  %%xmm3, %%xmm2\n"
+                  "\tmovss  %%xmm2, 32(%0)\n"
+                  "\tsubss  %%xmm5, %%xmm4\n"
+                  "\tmovss  %%xmm4, 36(%0)\n"
+
+                  "\tmovss 8(%1), %%xmm0\n"
+                  "\tmovss (%0), %%xmm1\n"
+                  "\taddss %%xmm0, %%xmm1\n"
+                  "\tmovss %%xmm1, 8(%2)\n"
+                  "\tshufps $0x00, %%xmm0, %%xmm0\n"
+                  "\tshufps $0x00, %%xmm1, %%xmm1\n"
+
+                  "\tmovaps 4(%3),  %%xmm2\n"
+                  "\tmovaps 4(%4),  %%xmm3\n"
+                  "\tmulps  %%xmm0, %%xmm2\n"
+                  "\tmulps  %%xmm1, %%xmm3\n"
+                  "\tmovaps 20(%3), %%xmm4\n"
+                  "\tmulps  %%xmm0, %%xmm4\n"
+                  "\taddps  4(%0),  %%xmm2\n"
+                  "\tmovaps 20(%4), %%xmm5\n"
+                  "\tmulps  %%xmm1, %%xmm5\n"
+                  "\taddps  20(%0), %%xmm4\n"
+                  "\tsubps  %%xmm3, %%xmm2\n"
+                  "\tmovups %%xmm2, (%0)\n"
+                  "\tsubps  %%xmm5, %%xmm4\n"
+                  "\tmovups %%xmm4, 16(%0)\n"
+
+                  "\tmovss  36(%3), %%xmm2\n"
+                  "\tmulss  %%xmm0, %%xmm2\n"
+                  "\tmovss  36(%4), %%xmm3\n"
+                  "\tmulss  %%xmm1, %%xmm3\n"
+                  "\taddss  36(%0), %%xmm2\n"
+                  "\tmovss  40(%3), %%xmm4\n"
+                  "\tmulss  %%xmm0, %%xmm4\n"
+                  "\tmovss  40(%4), %%xmm5\n"
+                  "\tmulss  %%xmm1, %%xmm5\n"
+                  "\tsubss  %%xmm3, %%xmm2\n"
+                  "\tmovss  %%xmm2, 32(%0)\n"
+                  "\tsubss  %%xmm5, %%xmm4\n"
+                  "\tmovss  %%xmm4, 36(%0)\n"
+
+                  "\tmovss 12(%1), %%xmm0\n"
+                  "\tmovss (%0), %%xmm1\n"
+                  "\taddss %%xmm0, %%xmm1\n"
+                  "\tmovss %%xmm1, 12(%2)\n"
+                  "\tshufps $0x00, %%xmm0, %%xmm0\n"
+                  "\tshufps $0x00, %%xmm1, %%xmm1\n"
+
+                  "\tmovaps 4(%3),  %%xmm2\n"
+                  "\tmovaps 4(%4),  %%xmm3\n"
+                  "\tmulps  %%xmm0, %%xmm2\n"
+                  "\tmulps  %%xmm1, %%xmm3\n"
+                  "\tmovaps 20(%3), %%xmm4\n"
+                  "\tmulps  %%xmm0, %%xmm4\n"
+                  "\taddps  4(%0),  %%xmm2\n"
+                  "\tmovaps 20(%4), %%xmm5\n"
+                  "\tmulps  %%xmm1, %%xmm5\n"
+                  "\taddps  20(%0), %%xmm4\n"
+                  "\tsubps  %%xmm3, %%xmm2\n"
+                  "\tmovups %%xmm2, (%0)\n"
+                  "\tsubps  %%xmm5, %%xmm4\n"
+                  "\tmovups %%xmm4, 16(%0)\n"
+
+                  "\tmovss  36(%3), %%xmm2\n"
+                  "\tmulss  %%xmm0, %%xmm2\n"
+                  "\tmovss  36(%4), %%xmm3\n"
+                  "\tmulss  %%xmm1, %%xmm3\n"
+                  "\taddss  36(%0), %%xmm2\n"
+                  "\tmovss  40(%3), %%xmm4\n"
+                  "\tmulss  %%xmm0, %%xmm4\n"
+                  "\tmovss  40(%4), %%xmm5\n"
+                  "\tmulss  %%xmm1, %%xmm5\n"
+                  "\tsubss  %%xmm3, %%xmm2\n"
+                  "\tmovss  %%xmm2, 32(%0)\n"
+                  "\tsubss  %%xmm5, %%xmm4\n"
+                  "\tmovss  %%xmm4, 36(%0)\n"
+
+                  : : "r" (mem), "r" (x+i), "r" (y+i), "r" (num), "r" (den)
+                  : "memory" );
+#endif
+               }
+               for (i=0;i<ord;i++)
+                       _mem[i]=mem[i];
+       }
  }

<p>-void iir_mem2(float *x, float *den, float *y, int N, int ord, float *mem)
+void iir_mem2(float *x, float *_den, float *y, int N, int ord, float *_mem)
  {
-   int i,j;
-   for (i=0;i<N;i++)
-   {
-      y[i] = x[i] + mem[0];
-      for (j=0;j<ord-1;j++)
-      {
-         mem[j] = mem[j+1] - den[j+1]*y[i];
-      }
-      mem[ord-1] = - den[ord]*y[i];
-   }
-}
+       if (!(global_use_mmx_sse & SPEEX_ASM_MMX_SSE_FP))
+       {
+          int i,j;
+          for (i=0;i<N;i++)
+          {
+                 y[i] = x[i] + _mem[0];
+                 for (j=0;j<ord-1;j++)
+                 {
+                        _mem[j] = _mem[j+1] - _den[j+1]*y[i];
+                 }
+                 _mem[ord-1] = - _den[ord]*y[i];
+          }
+       }
+       else
+       {
+
+               float  __den[20], __mem[20];
+               float *den, *mem;
+               int i;
+
+               den = (float*)(((int)(__den+4))&0xfffffff0)-1;
+               mem = (float*)(((int)(__mem+4))&0xfffffff0)-1;
+               for (i=0;i<=10;i++)
+               {
+                       den[i]=0;
+               }
+               for (i=0;i<10;i++)
+               {
+                       mem[i]=0;
+               }
+               for (i=0;i<ord+1;i++)
+               {
+                       den[i]=_den[i];
+               }
+               for (i=0;i<ord;i++)
+               {
+                       mem[i]=_mem[i];
+               }
+
+               for (i=0;i<N;i++)
+               {
+
+/* The assembler block below is equivalent to this code
+                       y[i] = x[i] + mem[0];
+                       for (j=0;j<ord-1;j++)
+                       {
+                               mem[j] = mem[j+1] - den[j+1]*y[i];
+                       }
+                       mem[ord-1] = - den[ord]*y[i]
+*/
+
+                       float *in1 = x+i;
+                       float *in2 = y+i;
+
+#ifdef WIN32
+                       /*
+                       Do we need to push???
+                       push eax
+                       push ebx
+                       push ecx
+                       push edx
+                       */
+                       _asm
+                       {
+                       mov eax, den
+                       mov ebx, mem
+                       mov ecx, in1
+                       mov edx, in2
+
+                       movss xmm0, [ecx]
+                       movss xmm1, [ebx]
+                       addss xmm1, xmm0
+
+                       movss [edx], xmm1
+
+                       shufps xmm0, xmm0, 0x00
+                       shufps xmm1, xmm1, 0x00
+
+                       movaps xmm2, [eax+4]
+                       movaps xmm3, [eax+20]
+                       mulps  xmm2, xmm1
+                       mulps  xmm3, xmm1
+                       movss  xmm4, [eax+36]
+                       movss  xmm5, [eax+40]
+                       mulss  xmm4, xmm1
+                       mulss  xmm5, xmm1
+                       movaps xmm6, [ebx+4]
+                       subps  xmm6, xmm2
+                       movups [ebx], xmm6
+                       movaps xmm7, [ebx+20]
+                       subps  xmm7, xmm3
+                       movups [ebx+16], xmm7
+
+                       movss  xmm7, [ebx+36]
+                       subss  xmm7, xmm4
+                       movss  [ebx+32], xmm7
+                       xorps  xmm2, xmm2
+                       subss  xmm2, xmm5
+                       movss  [ebx+36], xmm2
+                       }
+                  /*
+                       pop edx
+                       pop ecx
+                       pop ebx
+                       pop eax
+                       */
+#endif
+#if defined(GCC_COMPILER) && defined(_USE_SSE)
+                       __asm__ __volatile__ (
+                       "\tmovss (%1), %%xmm0\n"
+                       "\tmovss (%0), %%xmm1\n"
+                       "\taddss %%xmm0, %%xmm1\n"
+                       "\tmovss %%xmm1, (%2)\n"
+                       "\tshufps $0x00, %%xmm0, %%xmm0\n"
+                       "\tshufps $0x00, %%xmm1, %%xmm1\n"
+
+                       "\tmovaps 4(%3),  %%xmm2\n"
+                       "\tmovaps 20(%3), %%xmm3\n"
+                       "\tmulps  %%xmm1, %%xmm2\n"
+                       "\tmulps  %%xmm1, %%xmm3\n"
+                       "\tmovss  36(%3), %%xmm4\n"
+                       "\tmovss  40(%3), %%xmm5\n"
+                       "\tmulss  %%xmm1, %%xmm4\n"
+                       "\tmulss  %%xmm1, %%xmm5\n"
+                       "\tmovaps 4(%0),  %%xmm6\n"
+                       "\tsubps  %%xmm2, %%xmm6\n"
+                       "\tmovups %%xmm6, (%0)\n"
+                       "\tmovaps 20(%0), %%xmm7\n"
+                       "\tsubps  %%xmm3, %%xmm7\n"
+                       "\tmovups %%xmm7, 16(%0)\n"
+
+
+                       "\tmovss  36(%0), %%xmm7\n"
+                       "\tsubss  %%xmm4, %%xmm7\n"
+                       "\tmovss  %%xmm7, 32(%0)\n"
+                       "\txorps  %%xmm2, %%xmm2\n"
+                       "\tsubss  %%xmm5, %%xmm2\n"
+                       "\tmovss  %%xmm2, 36(%0)\n"
+
+                       : : "r" (mem), "r" (x+i), "r" (y+i), "r" (den) : 
"memory" );
  #endif
+               }
+               for (i=0;i<ord;i++)
+               {
+                       _mem[i]=mem[i];
+               }
+       }
+}
+

  void fir_mem2(float *x, float *num, float *y, int N, int ord, float *mem)
  {
diff -ru speex-1.0.3-orig/libspeex/ltp.c speex-1.0.3/libspeex/ltp.c
--- speex-1.0.3-orig/libspeex/ltp.c     2003-05-05 18:20:26.000000000 -0700
+++ speex-1.0.3/libspeex/ltp.c  2004-01-08 20:44:45.000000000 -0800
@@ -34,37 +34,168 @@
  #include "ltp.h"
  #include "stack_alloc.h"
  #include "filters.h"
+#include "speex.h"
  #include "speex_bits.h"

-#ifdef _USE_SSE
-#include "ltp_sse.h"
-#else
-static float inner_prod(float *x, float *y, int len)
+extern int global_use_mmx_sse;
+
+static float inner_prod(float *a, float *b, int len)
  {
-   int i;
-   float sum1=0,sum2=0,sum3=0,sum4=0;
-   for (i=0;i<len;)
-   {
-      sum1 += x[i]*y[i];
-      sum2 += x[i+1]*y[i+1];
-      sum3 += x[i+2]*y[i+2];
-      sum4 += x[i+3]*y[i+3];
-      i+=4;
-   }
-   return sum1+sum2+sum3+sum4;
-}
+       if (!(global_use_mmx_sse & SPEEX_ASM_MMX_SSE_FP))
+       {
+/*
+//Older, slower version
+               int i;
+               float sum=0;
+               for (i=0;i<len;i++)
+                       sum += a[i]*b[i];
+               return sum;
+*/             int i;
+               float sum1=0,sum2=0,sum3=0,sum4=0;
+               for (i=0;i<len;)
+               {
+                       sum1 += a[i]*b[i];
+                       sum2 += a[i+1]*b[i+1];
+                       sum3 += a[i+2]*b[i+2];
+                       sum4 += a[i+3]*b[i+3];
+                       i+=4;
+               }
+               return sum1+sum2+sum3+sum4;
+       }
+       else
+       {
+               float sum;
+               float* sum1 = &sum;
+
+#ifdef WIN32
+               _asm
+               {
+                 push eax
+                 push ebx
+                 push ecx
+                 push edx
+                 push edi
+
+                 mov eax, a
+                 mov edi, b
+                 mov ecx, len
+                 mov edx, sum1
+
+                 xorps xmm3, xmm3
+                 xorps xmm4, xmm4
+                 sub ecx, 20
+
+               mul20_loop:
+
+                 movups xmm0, [eax]
+                 movups xmm1, [edi]
+                 mulps xmm1, xmm0
+
+                 movups xmm5, [eax+16]
+                 movups xmm6, [edi+16]
+                 mulps xmm6, xmm5
+                 addps xmm3, xmm1
+
+                 movups xmm0, [eax+32]
+                 movups xmm1, [edi+32]
+                 mulps xmm1, xmm0
+                 addps xmm4, xmm6
+
+                 movups xmm5, [eax+48]
+                 movups xmm6, [edi+48]
+                 mulps xmm6, xmm5
+                 addps xmm3, xmm1
+
+                 movups xmm0, [eax+64]
+                 movups xmm1, [edi+64]
+                 mulps xmm1, xmm0
+                 addps xmm4, xmm6
+                 addps xmm3, xmm1
+
+                 add eax, 80
+                 add edi, 80
+                 sub ecx, 20
+
+                 jae mul20_loop
+
+                 addps xmm3, xmm4
+                 movhlps xmm4, xmm3
+                 addps xmm3, xmm4
+                 movaps xmm4, xmm3
+                 shufps xmm4, xmm4, 0x55
+                 addss xmm3, xmm4
+                 movss [edx], xmm3
+
+                 pop edi
+                 pop edx
+                 pop ecx
+                 pop ebx
+                 pop eax
+                 }
+#endif
+#if defined(GCC_COMPILER) && defined(_USE_SSE)
+                 __asm__ __volatile__ (
+                 "\tpush %%eax\n"
+                 "\tpush %%edi\n"
+                 "\tpush %%ecx\n"
+                 "\txorps %%xmm3, %%xmm3\n"
+                 "\txorps %%xmm4, %%xmm4\n"
+
+                 "\tsub $20, %%ecx\n"
+
+               ".mul20_loop%=:\n"
+
+                 "\tmovups (%%eax), %%xmm0\n"
+                 "\tmovups (%%edi), %%xmm1\n"
+                 "\tmulps %%xmm0, %%xmm1\n"
+
+                 "\tmovups 16(%%eax), %%xmm5\n"
+                 "\tmovups 16(%%edi), %%xmm6\n"
+                 "\tmulps %%xmm5, %%xmm6\n"
+                 "\taddps %%xmm1, %%xmm3\n"
+
+                 "\tmovups 32(%%eax), %%xmm0\n"
+                 "\tmovups 32(%%edi), %%xmm1\n"
+                 "\tmulps %%xmm0, %%xmm1\n"
+                 "\taddps %%xmm6, %%xmm4\n"
+
+                 "\tmovups 48(%%eax), %%xmm5\n"
+                 "\tmovups 48(%%edi), %%xmm6\n"
+                 "\tmulps %%xmm5, %%xmm6\n"
+                 "\taddps %%xmm1, %%xmm3\n"
+
+                 "\tmovups 64(%%eax), %%xmm0\n"
+                 "\tmovups 64(%%edi), %%xmm1\n"
+                 "\tmulps %%xmm0, %%xmm1\n"
+                 "\taddps %%xmm6, %%xmm4\n"
+                 "\taddps %%xmm1, %%xmm3\n"
+
+
+                 "\tadd $80, %%eax\n"
+                 "\tadd $80, %%edi\n"
+
+                 "\tsub $20,  %%ecx\n"
+
+                 "\tjae .mul20_loop%=\n"
+
+                 "\taddps %%xmm4, %%xmm3\n"
+
+                 "\tmovhlps %%xmm3, %%xmm4\n"
+                 "\taddps %%xmm4, %%xmm3\n"
+                 "\tmovaps %%xmm3, %%xmm4\n"
+                 "\tshufps $0x55, %%xmm4, %%xmm4\n"
+                 "\taddss %%xmm4, %%xmm3\n"
+                 "\tmovss %%xmm3, (%%edx)\n"
+
+                 "\tpop %%ecx\n"
+                 "\tpop %%edi\n"
+                 "\tpop %%eax\n"
+                 : : "a" (a), "D" (b), "c" (len), "d" (&sum) : "memory");
  #endif
+               return sum;
+       }

-/*Original, non-optimized version*/
-/*static float inner_prod(float *x, float *y, int len)
-{
-   int i;
-   float sum=0;
-   for (i=0;i<len;i++)
-      sum += x[i]*y[i];
-   return sum;
  }
-*/

<p>  void open_loop_nbest_pitch(float *sw, int start, int end, int len, int 
*pitch, float *gain, int N, char *stack)
diff -ru speex-1.0.3-orig/libspeex/modes.c speex-1.0.3/libspeex/modes.c
--- speex-1.0.3-orig/libspeex/modes.c   2003-06-02 22:29:39.000000000 -0700
+++ speex-1.0.3/libspeex/modes.c        2004-01-08 20:46:26.000000000 -0800
@@ -45,6 +45,7 @@
  #define NULL 0
  #endif

+int global_use_mmx_sse = 0;
  SpeexMode *speex_mode_list[SPEEX_NB_MODES] = {&speex_nb_mode, 
&speex_wb_mode, &speex_uwb_mode};

  /* Extern declarations for all codebooks we use here */
@@ -585,16 +586,27 @@

  int speex_encoder_ctl(void *state, int request, void *ptr)
  {
-   return (*((SpeexMode**)state))->enc_ctl(state, request, ptr);
+       if (request == SPEEX_SET_ASM_FLAG)
+       {
+               global_use_mmx_sse = *((int*)ptr);
+               return 0;
+       }
+       else
+               return (*((SpeexMode**)state))->enc_ctl(state, request, ptr);
  }

  int speex_decoder_ctl(void *state, int request, void *ptr)
  {
-   return (*((SpeexMode**)state))->dec_ctl(state, request, ptr);
+       if (request == SPEEX_SET_ASM_FLAG)
+       {
+               global_use_mmx_sse = *((int*)ptr);
+               return 0;
+       }
+       else
+          return (*((SpeexMode**)state))->dec_ctl(state, request, ptr);
  }

<p>-
  static int nb_mode_query(void *mode, int request, void *ptr)
  {
     SpeexNBMode *m = (SpeexNBMode*)mode;
diff -ru speex-1.0.3-orig/libspeex/speex.h speex-1.0.3/libspeex/speex.h
--- speex-1.0.3-orig/libspeex/speex.h   2003-01-22 23:29:39.000000000 -0800
+++ speex-1.0.3/libspeex/speex.h        2004-01-08 20:46:21.000000000 -0800
@@ -156,6 +156,13 @@
  /** Number of defined modes in Speex */
  #define SPEEX_NB_MODES 3

+
+#define SPEEX_SET_ASM_FLAG             200
+#define SPEEX_ASM_MMX_NONE             0
+#define SPEEX_ASM_MMX_BASIC            1
+#define SPEEX_ASM_MMX_SSE              2
+#define SPEEX_ASM_MMX_SSE_FP   4
+
  struct SpeexMode;

<p><p><p>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: speex-sse-patch.gz
Type: application/octet-stream
Size: 3886 bytes
Desc: speex-sse-patch.gz
Url : http://lists.xiph.org/pipermail/speex-dev/attachments/20040108/99cf64fa/speex-sse-patch.obj